diff options
author | Felipe Franciosi <felipe@nutanix.com> | 2019-07-02 14:06:42 +0100 |
---|---|---|
committer | Felipe Franciosi <felipe@nutanix.com> | 2019-09-05 16:45:35 +0100 |
commit | f8ef2771ca6c05dadd3188099eb678e6135e12e2 (patch) | |
tree | 1629283ee553622ce99477c63da4994d4c87bc0f | |
download | libvfio-user-f8ef2771ca6c05dadd3188099eb678e6135e12e2.zip libvfio-user-f8ef2771ca6c05dadd3188099eb678e6135e12e2.tar.gz libvfio-user-f8ef2771ca6c05dadd3188099eb678e6135e12e2.tar.bz2 |
Initial commit
-rw-r--r-- | CMakeLists.txt | 42 | ||||
-rw-r--r-- | LICENSE | 7 | ||||
-rw-r--r-- | Makefile | 74 | ||||
-rw-r--r-- | README.md | 134 | ||||
-rw-r--r-- | kmod/CMakeLists.txt | 47 | ||||
-rw-r--r-- | kmod/muser.c | 1807 | ||||
-rw-r--r-- | kmod/muser.h | 74 | ||||
-rw-r--r-- | lib/.indent.pro | 4 | ||||
-rw-r--r-- | lib/CMakeLists.txt | 46 | ||||
-rw-r--r-- | lib/common.h | 60 | ||||
-rw-r--r-- | lib/dma.c | 331 | ||||
-rw-r--r-- | lib/dma.h | 241 | ||||
-rw-r--r-- | lib/libmuser.c | 1063 | ||||
-rw-r--r-- | lib/libmuser_pci.c | 311 | ||||
-rw-r--r-- | lib/msicap.h | 67 | ||||
-rw-r--r-- | lib/muser.h | 185 | ||||
-rw-r--r-- | lib/pci.h | 276 | ||||
-rw-r--r-- | lib/pmcap.h | 70 | ||||
-rw-r--r-- | lib/pxcap.h | 144 | ||||
-rw-r--r-- | patches/vfio.diff | 192 | ||||
-rw-r--r-- | samples/CMakeLists.txt | 32 | ||||
-rw-r--r-- | samples/test_mmap.c | 199 | ||||
-rw-r--r-- | samples/test_read.c | 233 |
23 files changed, 5639 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..47a8e6f --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,42 @@ +# +# Copyright (c) 2019 Nutanix Inc. All rights reserved. +# +# Authors: Thanos Makatos <thanos@nutanix.com> +# Swapnil Ingle <swapnil.ingle@nutanix.com> +# Felipe Franciosi <felipe@nutanix.com> +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Nutanix nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +cmake_minimum_required (VERSION 2.6) +project(muser) +include(GNUInstallDirs) + +# shared library +add_subdirectory(lib) + +# kernel module +add_subdirectory(kmod) + +# samples +add_subdirectory(samples) @@ -0,0 +1,7 @@ +This project is released under dual license. + +The kernel driver (kmod/muser.[ch]) is released as GPL-2.0 or BSD-3-CLAUSE. + +The remaining source code is released as BSD-3-CLAUSE. + +Each source file in the repository reflects the above. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..11bd3fe --- /dev/null +++ b/Makefile @@ -0,0 +1,74 @@ +# +# Copyright (c) 2019 Nutanix Inc. All rights reserved. +# +# Authors: Thanos Makatos <thanos@nutanix.com> +# Swapnil Ingle <swapnil.ingle@nutanix.com> +# Felipe Franciosi <felipe@nutanix.com> +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Nutanix nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +BUILD_TYPE ?= dbg + +ifeq ($(BUILD_TYPE), dbg) + CMAKE_BUILD_TYPE = Debug + CFLAGS += -DDEBUG +else + CMAKE_BUILD_TYPE = Release + CFLAGS += -DNDEBUG +endif + +ifeq ($(VERBOSE),) + MAKEFLAGS += -s +endif + +BUILD_DIR_BASE = $(CURDIR)/build +BUILD_DIR = $(BUILD_DIR_BASE)/$(BUILD_TYPE) + +KDIR ?= "/lib/modules/$(shell uname -r)/build" + +PHONY_TARGETS := all realclean buildclean force_cmake export install-export tags + +.PHONY: $(PHONY_TARGETS) + +all $(filter-out $(PHONY_TARGETS), $(MAKECMDGOALS)): $(BUILD_DIR)/Makefile + +$(MAKE) -C $(BUILD_DIR) $@ + +realclean: + rm -rf $(BUILD_DIR_BASE) + +buildclean: + rm -rf $(BUILD_DIR) + +force_cmake: $(BUILD_DIR)/Makefile + +$(BUILD_DIR)/Makefile: + mkdir -p $(BUILD_DIR) + cd $(BUILD_DIR); cmake \ + -D "CMAKE_C_FLAGS:STRING=$(CFLAGS)" \ + -D "CMAKE_BUILD_TYPE:STRING=$(CMAKE_BUILD_TYPE)" \ + -D "KDIR=$(KDIR)" \ + $(CURDIR) + +tags: + ctags -R --exclude=$(BUILD_DIR) diff --git a/README.md b/README.md new file mode 100644 index 0000000..b744c26 --- /dev/null +++ b/README.md @@ -0,0 +1,134 @@ +Mediated User space device +========================== + +Overview +-------- + +muser is a framework that allows mediated device drivers to be implemented in +user space. The device driver can by a completely virtual one without driving +an actual device of that type. 
This can greatly simplify the initial +development and prototyping of kernel drivers as no kernel code needs to be +written, and failures result in the user space process crashing in the worst +case. The mediated device can be passed to a virtual machine for proper +testing. Device drivers are typically implemented entirely in kernel space for +various reasons, however in early development stages it's acceptable to do it +in user space. + +muser is implemented by a small kernel module, muser.ko, that registers itself +with mdev. Every request is forwarded to a user space application via a small, +custom ioctl interface on a control device. The application must be externally +provided and needs to contain the actual device implementation by using the API +of libmuser. See src/samples on how to build such an application. Currently +there is a one, single-threaded application instance per device, however the +application can employ any form of concurrency needed. In the future we plan to +make libmuser multi-threaded. The application can be implemented in whatever +way is convenient, e.g. as a Python script using bindings, on the cloud, etc. + + +Memory Mapping the Device +------------------------- + +The device driver can allow parts of the virtual device to be memory mapped by +the virtual machine (e.g. the PCI BARs). The business logic needs to implement +the mmap callback and reply to the request passing the memory address whose +backing pages are then used to satisfy the original mmap call. Currently +reading and writing of the memory mapped memory by the client goes undetected +by libmuser, the business logic needs to poll. In the future we plan to +implement a mechanism in order to provide notifications to libmuser whenever a +page is written to. + + +Interrupts +---------- + +Interrupts are implemented by installing the event file descriptor in libmuser +and then notifying it about it. libmuser can then trigger interrupts simply by +writing to it. 
This can be much more expensive compared to triggering interrupts +from the kernel, however this performance penalty is perfectly acceptable when +prototyping the functional aspect of a device driver. + + +System Architecture +------------------- + +muser.ko and libmuser communicate via ioctl on a control device. This control +device is create when the mediated device is created and appears as +/dev/muser/<UUID>. libmuser opens this device and then executes a "wait +command" ioctl. Whenever a callback of muser.ko is executed, it fills a struct +with the command details and then completes the ioctl, unblocking libmuser. It +then waits to receive another ioctl from libmuser with the result. Currently +there can be only one command pending, we plan to allow multiple commands to be +executed in parallel. + + +Building muser +============== + +vfio/mdev needs to be patched. To generate the patch run: + + git diff 869e3305f23dfeacdaa234717c92ccb237815d90 --diff-filter=M > vfio.patch + +Apply the patch and rebuild the vfio/mdev modules: + + make SUBDIRS=drivers/vfio/ modules + +Reload the relevant kernel modules: + + drivers/vfio/vfio_iommu_type1.ko + drivers/vfio/vfio.ko + drivers/vfio/mdev/mdev.ko + drivers/vfio/mdev/vfio_mdev.ko + +Build the kernel module: + + cd src/kmod + make + +Build the library: + + mkdir build + cd build + cmake .. + make + make install + +Finally build your program and link it to libmuser.so. + +Running QEMU +============ + +To pass the device to QEMU add the following options: + + -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/00000000-0000-0000-0000-000000000000 + -object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=mem,share=yes,size=1073741824 -numa node,nodeid=0,cpus=0,memdev=ram-node0 + +Guest RAM must be shared (share=yes) otherwise libmuser won't be able to do DMA +transfers from/to it. If you're not using QEMU then any memory that must be +accessed by libmuser must be allocate MAP_SHARED. 
Registering memory for DMA +that has not been allocated with MAP_SHARED is ignored and any attempts to +access that memory will result in an error. + + +Future Work +=========== + +Making libmuser Restartable +---------------------------- + +muser can be made restartable so that (a) it can recover from failures, and +(b) upgrades are less disrupting. This is something we plan to implement in the +future. To make it restarable muser needs to reconfigure eventfds and DMA +region mmaps first thing when the device is re-opened by libmuser. After muser +has finished reconfiguring it will send a "ready" command, after which normal +operation will be resumed. This "ready" command will always be sent when the +device is opened, even if this is the first time, as this way we don't need to +differentiate between normal operation and restarted operation. libmuser will +store the PCI BAR on /dev/shm (named after e.g. the device UUID) so that it can +easily find them on restart. + + +Making libmuser Multi-threaded +------------------------------- + +libmuser can be made multi-threaded in order to improve performance. To +implement this we'll have to maintain a private context in struct file. diff --git a/kmod/CMakeLists.txt b/kmod/CMakeLists.txt new file mode 100644 index 0000000..9065611 --- /dev/null +++ b/kmod/CMakeLists.txt @@ -0,0 +1,47 @@ +# +# Copyright (c) 2019 Nutanix Inc. All rights reserved. +# +# Authors: Thanos Makatos <thanos@nutanix.com> +# Swapnil Ingle <swapnil.ingle@nutanix.com> +# Felipe Franciosi <felipe@nutanix.com> +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Nutanix nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# Copy sources to build directory (avoid polluting source directory). +# TODO can we copy all source files with a wildcard? +configure_file(muser.c ${CMAKE_CURRENT_BINARY_DIR}/muser.c COPYONLY) +configure_file(muser.h ${CMAKE_CURRENT_BINARY_DIR}/muser.h COPYONLY) +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/Kbuild "obj-m := muser.o") + +# Build module using kernel's Makefile. 
+set(KBUILD_CMD ${CMAKE_MAKE_PROGRAM} -C ${KDIR} M=${CMAKE_CURRENT_BINARY_DIR} modules) +ADD_CUSTOM_COMMAND(OUTPUT DRIVER_BIN_FILE + COMMAND ${KBUILD_CMD} + DEPENDS ${MODULE_SOURCE_FILES} VERBATIM +) +ADD_CUSTOM_TARGET(driver ALL DEPENDS DRIVER_BIN_FILE) +execute_process(COMMAND uname -r OUTPUT_VARIABLE kver OUTPUT_STRIP_TRAILING_WHITESPACE) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.ko DESTINATION /lib/modules/${kver}/extra/) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/linux) diff --git a/kmod/muser.c b/kmod/muser.c new file mode 100644 index 0000000..8a4ceb0 --- /dev/null +++ b/kmod/muser.c @@ -0,0 +1,1807 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2019, Nutanix Inc. All rights reserved. + * + * Author: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + */ + +#include <linux/cdev.h> +#include <linux/compat.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/idr.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/wait.h> +#include <linux/vfio.h> +#include <linux/mdev.h> +#include <linux/pagemap.h> +#include <asm-generic/mman-common.h> +#include <linux/device.h> +#include <linux/uaccess.h> + +#include "muser.h" + +#define DRIVER_NAME "muser" + +#define NR_PAGES(x) (((x) + (PAGE_SIZE - 1)) >> PAGE_SHIFT) +#define MIN(a, b) ((a) < (b) ? (a):(b)) + +static struct muser { + struct class *class; + struct list_head dev_list; + struct idr dev_idr; + struct cdev muser_cdev; + dev_t muser_devt; + struct device dev; + struct mutex muser_lock; +} muser; + +#define muser_log(func, fmt, ...) \ + func(&muser.dev, "%s: " fmt "\n", __func__, ## __VA_ARGS__) + +#define muser_dbg(fmt, ...) 
muser_log(dev_dbg, fmt, ## __VA_ARGS__) +#define muser_info(fmt, ...) muser_log(dev_info, fmt, ## __VA_ARGS__) +#define muser_warn(fmt, ...) muser_log(dev_warn, fmt, ## __VA_ARGS__) +#define muser_err(fmt, ...) muser_log(dev_err, fmt, ## __VA_ARGS__) +#define muser_alert(fmt, ...) muser_log(dev_alert, fmt, ## __VA_ARGS__) + +/* TODO come up with as better name? */ +/* + * FIXME len and nr_pages are confusing, we user either one or the other however + * they seem to serve the same purpose, fix. + */ +struct page_map { + struct page **pages; + int nr_pages; + size_t len; + int offset; +}; + +struct vfio_dma_mapping { + unsigned long iova; + unsigned long length; + struct page **pages; + struct list_head entry; +}; + +/* + * TODO do we use all members at the same time? Does it make sense to put some + * of them in a union? + */ +struct mudev_cmd { + enum muser_cmd_type type; /* copy of muser_cmd.type */ + struct muser_cmd muser_cmd; + struct page_map pg_map; + struct file **fds; + int *data_fds; + /* + * When libmuser completes an mmap call, we need to know the length + * in order to pass it to do_pin_pages. + */ + unsigned long mmap_len; + struct list_head entry; +}; + +// FIXME: Reorganise the members of this struct. +struct muser_dev { + guid_t uuid; + int minor; + struct device *dev; + struct list_head dlist_entry; + struct list_head cmd_list; + // FIXME: mucmd_pending should be per filep context. 
+ struct mudev_cmd *mucmd_pending; + // FIXME: muser_dev should have a list of filep contexts instead of + // srv_opened + atomic_t srv_opened; + atomic_t mdev_opened; + struct mutex dev_lock; + struct mdev_device *mdev; + wait_queue_head_t user_wait_q; + struct semaphore sem; + struct notifier_block iommu_notifier; + + struct vfio_dma_mapping *dma_map; /* Current DMA operation */ + struct list_head dma_list; /* list of dma mappings */ + + struct radix_tree_root devmem_tree; /* Device memory */ +}; + +/* function prototypes */ +static int dma_unmap_all(struct muser_dev *const mudev, const bool skip_user); + +static inline int muser_copyout(void __user *param, const void *address, + unsigned long size) +{ + int err = copy_to_user(param, address, size) ? -EFAULT : 0; + + if (unlikely(err)) + muser_dbg("failed to copy to user: %d", err); + + return err; +} + +static inline int muser_copyin(void *address, void __user *param, + unsigned long size) +{ + int err = copy_from_user(address, param, size) ? 
-EFAULT : 0; + + if (unlikely(err)) + muser_dbg("failed to copy from user: %d", err); + + return err; +} + +/* called with muser.muser_lock held */ +static struct muser_dev *__muser_search_dev(const guid_t *uuid) +{ + struct muser_dev *mudev; + + list_for_each_entry(mudev, &muser.dev_list, dlist_entry) { + const uuid_le *u = &mudev->uuid; + + if (uuid_le_cmp(*u, *uuid) == 0) + return mudev; + } + + return NULL; +} + +static int muser_create_dev(const guid_t *uuid, struct mdev_device *mdev) +{ + struct muser_dev *mudev; + char uuid_str[UUID_STRING_LEN + 1]; + int minor; + int err = 0; + + mutex_lock(&muser.muser_lock); + mudev = __muser_search_dev(uuid); + if (mudev) { + err = -EEXIST; + goto out; + } + + mudev = kzalloc(sizeof(*mudev), GFP_KERNEL); + if (!mudev) { + err = -ENOMEM; + goto out; + } + + minor = idr_alloc(&muser.dev_idr, mudev, 0, MINORMASK + 1, GFP_KERNEL); + if (minor < 0) { + err = minor; + kfree(mudev); + goto out; + } + + sprintf(uuid_str, "%pUl", uuid); + mudev->dev = device_create(muser.class, NULL, + MKDEV(MAJOR(muser.muser_devt), minor), + mudev, "%s", uuid_str); + if (IS_ERR(mudev->dev)) { + err = PTR_ERR(mudev->dev); + idr_remove(&muser.dev_idr, minor); + kfree(mudev); + goto out; + } + + memcpy(&mudev->uuid, uuid, sizeof(mudev->uuid)); + mudev->minor = minor; + mudev->mdev = mdev; + mutex_init(&mudev->dev_lock); + sema_init(&mudev->sem, 0); + init_waitqueue_head(&mudev->user_wait_q); + INIT_LIST_HEAD(&mudev->cmd_list); + INIT_LIST_HEAD(&mudev->dma_list); + INIT_RADIX_TREE(&mudev->devmem_tree, GFP_KERNEL); + list_add(&mudev->dlist_entry, &muser.dev_list); + mdev_set_drvdata(mdev, mudev); + + muser_info("new device %s", uuid_str); + +out: + mutex_unlock(&muser.muser_lock); + return err; +} + +/* called with muser.muser_lock held */ +static void __muser_deinit_dev(struct muser_dev *mudev) +{ + device_destroy(muser.class, + MKDEV(MAJOR(muser.muser_devt), mudev->minor)); + list_del(&mudev->dlist_entry); + idr_remove(&muser.dev_idr, 
mudev->minor); +} + +/* called with mudev.dev_lock held */ +static void __mudev_page_free(struct muser_dev *mudev, unsigned long pgnr) +{ + struct page *pg; + + pg = radix_tree_delete(&mudev->devmem_tree, pgnr); + if (WARN_ON(!pg)) + return; + + __free_page(pg); +} + +#define NR_INDICES 16 + +/* called with mudev.dev_lock held */ +static void __mudev_free_devmem(struct muser_dev *mudev) +{ + struct radix_tree_iter iter; + struct radix_tree_root *root = &mudev->devmem_tree; + unsigned long indices[NR_INDICES], index = 0; + void __rcu **slot; + int i, nr; + + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == NR_INDICES) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + __mudev_page_free(mudev, index); + } + } while (nr > 0); +} + +static int muser_remove_dev(const uuid_le *uuid) +{ + struct muser_dev *mudev; + char uuid_str[UUID_STRING_LEN + 1]; + int err = 0; + + mutex_lock(&muser.muser_lock); + + mudev = __muser_search_dev(uuid); + if (!mudev) { + err = -ENOENT; + goto out; + } + + if (atomic_read(&mudev->mdev_opened) > 0 || + atomic_read(&mudev->srv_opened) > 0) { + err = -EBUSY; + goto out; + } + + mutex_lock(&mudev->dev_lock); + + WARN_ON(!list_empty(&mudev->cmd_list)); + __mudev_free_devmem(mudev); + __muser_deinit_dev(mudev); + + mutex_unlock(&mudev->dev_lock); + kfree(mudev); + + sprintf(uuid_str, "%pUl", uuid); + muser_info("removed muser device %s", uuid_str); + +out: + mutex_unlock(&muser.muser_lock); + return err; +} + +static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf) +{ + return sprintf(buf, "muser\n"); +} + +MDEV_TYPE_ATTR_RO(name); + +static ssize_t device_api_show(struct kobject *kobj, struct device *dev, + char *buf) +{ + return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); +} + +MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *mdev_types_attrs[] = { + &mdev_type_attr_name.attr, + &mdev_type_attr_device_api.attr, + NULL, +}; + 
+static struct attribute_group mdev_type_group = { + .name = "1", + .attrs = mdev_types_attrs, +}; + +struct attribute_group *mdev_type_groups[] = { + &mdev_type_group, + NULL, +}; + +static int muser_process_cmd(struct muser_dev *mudev, struct mudev_cmd *mucmd) +{ + int err; + + mucmd->type = mucmd->muser_cmd.type; + + /* Add command to mudev list of commands. */ + mutex_lock(&mudev->dev_lock); + list_add_tail(&mucmd->entry, &mudev->cmd_list); + mutex_unlock(&mudev->dev_lock); + + /* Wake up any sleepers */ + wake_up(&mudev->user_wait_q); + + /* + * TODO: decide what to do with timeouts + * Timeouts can happen if: + * 1. No server has attached to mudev + * 2. Processing of cmd takes more time than timeout + */ + /* + * TODO: Maybe use a while loop instead of goto + */ +retry: + err = down_timeout(&mudev->sem, msecs_to_jiffies(5000)); + if (err) { + struct mudev_cmd *pos, *tmp; + bool found = false; + + mutex_lock(&mudev->dev_lock); + list_for_each_entry_safe(pos, tmp, &mudev->cmd_list, entry) { + if (pos == mucmd) { + list_del(&mucmd->entry); + found = true; + break; + } + } + mutex_unlock(&mudev->dev_lock); + if (found) { + muser_err("giving up, no response for cmd %d", + mucmd->type); + } else { + muser_warn("server taking too long for cmd %d, retry", + mucmd->type); + goto retry; + } + } + + return err; +} + +int muser_create(struct kobject *kobj, struct mdev_device *mdev) +{ + const guid_t *uuid = mdev_uuid(mdev); + + return muser_create_dev(uuid, mdev); +} + +int muser_remove(struct mdev_device *mdev) +{ + const guid_t *uuid = mdev_uuid(mdev); + + return muser_remove_dev(uuid); +} + +static int do_pin_pages(char __user *buf, const size_t count, + int const writeable, struct page_map *const pg_map) +{ + unsigned long start; + unsigned long __user lbuf = (unsigned long __user)buf; + int i; + int err; + + BUG_ON(!buf); + BUG_ON(!pg_map); + + start = round_down(lbuf, PAGE_SIZE); + pg_map->nr_pages = (round_up(lbuf + count, PAGE_SIZE) - start) / + PAGE_SIZE; + 
pg_map->offset = lbuf - start; + pg_map->pages = kcalloc(pg_map->nr_pages, sizeof *(pg_map->pages), + GFP_KERNEL); + if (unlikely(!pg_map->pages)) { + muser_dbg("failed to allocate %d pages", pg_map->nr_pages); + return -ENOMEM; + } + err = get_user_pages_fast(start, pg_map->nr_pages, writeable, + pg_map->pages); + if (unlikely(err != pg_map->nr_pages)) { + for (i = 0; i < err; i++) + put_page(pg_map->pages[i]); + kfree(pg_map->pages); + muser_dbg("failed to get user pages: %d", err); + return -ENOMEM; + } + + return 0; +} + +static void unpin_pages(struct page_map *const pg_map) +{ + int i; + + if (!pg_map) + return; + + for (i = 0; i < pg_map->nr_pages; i++) + put_page(pg_map->pages[i]); + kfree(pg_map->pages); + pg_map->pages = NULL; +} + + +static int vm_insert_pages(struct vm_area_struct *const vma, + struct page *const pages[], const int nr_pages) +{ + int err = 0, i; + + for (i = 0; i < nr_pages; i++) { + BUG_ON(!pages[i]); + err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, + pages[i]); + if (unlikely(err)) { + muser_dbg("count=%d, anon=%d, slab=%d, type=%d", + page_count(pages[i]), PageAnon(pages[i]), + PageSlab(pages[i]), page_has_type(pages[i])); + muser_dbg("failed to insert page at %lx: %d", + vma->vm_start + i * PAGE_SIZE, err); + unmap_kernel_range((unsigned long)vma->vm_start, + PAGE_SIZE); + break; + } + } + return err; +} + +static struct page *mudev_page_alloc(struct muser_dev *mudev, + unsigned long pgnr) +{ + struct page *pg; + int ret; + + pg = alloc_page(GFP_KERNEL); + if (unlikely(!pg)) + return NULL; + + ret = radix_tree_insert(&mudev->devmem_tree, pgnr, pg); + if (ret) { + __free_page(pg); + return NULL; + } + + return pg; +} + +static int libmuser_mmap_dev(struct file *fp, struct vm_area_struct *vma) +{ + struct muser_dev *mudev = fp->private_data; + struct page *pg; + unsigned int nr_pages; + unsigned long cur_pgidx, end_pgidx; + unsigned long addr, *new_pgs; + int ret, i; + + WARN_ON(mudev == NULL); + nr_pages = (vma->vm_end - 
vma->vm_start) >> PAGE_SHIFT; + + /* array to track new allocated pages, to be free'd + * in case of failure */ + new_pgs = kmalloc(nr_pages * sizeof(*new_pgs), GFP_KERNEL); + if (!new_pgs) + return -ENOMEM; + + cur_pgidx = vma->vm_pgoff & ~(BIT(63 - PAGE_SHIFT)); + end_pgidx = cur_pgidx + nr_pages; + + muser_info("mmap_dev: end 0x%lX - start 0x%lX (%lX), off = 0x%lX", + vma->vm_end, vma->vm_start, vma->vm_end - vma->vm_start, + cur_pgidx); + + mutex_lock(&mudev->dev_lock); + for (i = 0; cur_pgidx < end_pgidx; cur_pgidx++, i++) { + pg = radix_tree_lookup(&mudev->devmem_tree, cur_pgidx); + if (!pg) { + pg = mudev_page_alloc(mudev, cur_pgidx); + if (!pg) { + i--; + ret = -ENOMEM; + goto free_pg; + } + } + + addr = vma->vm_start + (cur_pgidx << PAGE_SHIFT); + ret = vm_insert_page(vma, addr, pg); + if (unlikely(ret)) + goto free_pg; + } + + mutex_unlock(&mudev->dev_lock); + kfree(new_pgs); + return 0; + +free_pg: + for ( ;i >= 0; i--) + __mudev_page_free(mudev, new_pgs[i]); + mutex_unlock(&mudev->dev_lock); + kfree(new_pgs); + return ret; +} + +static int libmuser_mmap_dma(struct file *f, struct vm_area_struct *vma) +{ + int err; + unsigned long length; + struct vfio_dma_mapping *dma_map; + struct muser_dev *mudev = f->private_data; + + BUG_ON(!mudev); + + muser_info("mmap_dma: end 0x%lX - start 0x%lX (%lX), off = 0x%lX", + vma->vm_end, vma->vm_start, vma->vm_end - vma->vm_start, + vma->vm_pgoff); + + if (unlikely(!mudev->dma_map)) { + muser_dbg("no pending DMA map operation"); + return -EINVAL; + } + + dma_map = mudev->dma_map; + length = round_up(dma_map->length, PAGE_SIZE); + if (unlikely(vma->vm_end - vma->vm_start != length)) { + muser_dbg("expected mmap of %lx bytes, got %lx instead", + vma->vm_end - vma->vm_start, length); + return -EINVAL; + } + + err = vm_insert_pages(vma, dma_map->pages, NR_PAGES(dma_map->length)); + if (unlikely(err)) { + muser_dbg("DMA region insert failed (%lu pages: %lx-%lx): %d", + NR_PAGES(dma_map->length), vma->vm_start, + vma->vm_end, 
err); + return err; + } + + return 0; +} + +static int libmuser_mmap(struct file *f, struct vm_area_struct *vma) +{ + if (vma->vm_pgoff & BIT(63 - PAGE_SHIFT)) { + muser_info("offset: 0x%lX (top bit set)", vma->vm_pgoff); + return libmuser_mmap_dev(f, vma); + } + + muser_dbg("offset: 0x%lX", vma->vm_pgoff); + return libmuser_mmap_dma(f, vma); +} + +static int muser_process_dma_request(struct muser_dev *mudev, + struct vfio_dma_mapping *dma_map, + int flags, int type) +{ + int err; + struct mudev_cmd mucmd = { + .type = type, + .muser_cmd = { + .type = type, + .mmap = { + .request = { + .start = dma_map->iova, + .end = dma_map->iova + dma_map->length, + .flags = flags} + } + } + }; + + err = muser_process_cmd(mudev, &mucmd); + if (unlikely(err)) + return err; + + return mucmd.muser_cmd.mmap.response.addr; +} + +static int muser_process_dma_map(struct muser_dev *mudev, int flags) +{ + return muser_process_dma_request(mudev, mudev->dma_map, flags, + MUSER_DMA_MMAP); +} + +static int muser_process_dma_unmap(struct muser_dev *mudev, + struct vfio_dma_mapping *dma_map) +{ + return muser_process_dma_request(mudev, dma_map, 0, MUSER_DMA_MUNMAP); +} + +static int put_dma_map(struct muser_dev *mudev, + struct vfio_dma_mapping *dma_map, int nr_pages) +{ + unsigned long off, iova_pfn; + int i, ret; + + for (i = 0, off = 0; i < nr_pages; i++, off += PAGE_SIZE) { + iova_pfn = (dma_map->iova + off) >> PAGE_SHIFT; + ret = vfio_unpin_pages(mdev_dev(mudev->mdev), &iova_pfn, 1); + if (WARN_ON(ret != 1)) + return -EINVAL; + + put_page(dma_map->pages[i]); + } + + kfree(dma_map->pages); + return 0; +} + +static int +get_dma_map(struct muser_dev *mudev, struct vfio_dma_mapping *dma_map, + struct vfio_iommu_type1_dma_map *map) +{ + unsigned long iova, vaddr; + unsigned long iova_pfn, phys_pfn; + unsigned long length, off; + int pgflag, ret, nr_pages = 0; + struct page **pages; + + length = map->size; + pages = kmalloc_array(NR_PAGES(length), sizeof(*pages), GFP_KERNEL); + if (!pages) + 
return -ENOMEM; + + pgflag = map->flags & VFIO_DMA_MAP_FLAG_WRITE ? FOLL_WRITE : 0; + dma_map->pages = pages; + dma_map->iova = map->iova; + dma_map->length = map->size; + + iova = map->iova; + vaddr = map->vaddr; + + /* + * XXX: for now the for loop is for each page, vfio_pin_pages() has + * limit of 512 pages. + */ + for (off = 0; off < length; off += PAGE_SIZE, vaddr += PAGE_SIZE) { + iova_pfn = (iova + off) >> PAGE_SHIFT; + ret = vfio_pin_pages(mdev_dev(mudev->mdev), &iova_pfn, 1, + map->flags, &phys_pfn); + if (ret != 1) + goto err; + + ret = get_user_pages_fast(vaddr, 1, pgflag, pages + nr_pages); + if (ret != 1) { + vfio_unpin_pages(mdev_dev(mudev->mdev), &iova_pfn, 1); + goto err; + } + + nr_pages++; + } + + return 0; + +err: + put_dma_map(mudev, dma_map, nr_pages); + return ret; +} + +static int has_anonymous_pages(struct vfio_dma_mapping *dma_map) +{ + int i, nr_pages = NR_PAGES(dma_map->length); + + for (i = 0; i < nr_pages; i++) { + if (PageAnon(dma_map->pages[i])) { + muser_dbg("ignore IOVA=%lx, page(s) not shared", + dma_map->iova); + return 1; + } + } + + return 0; +} + +static int muser_iommu_dma_map(struct muser_dev *mudev, + struct vfio_iommu_type1_dma_map *map) +{ + struct vfio_dma_mapping *dma_map; + int ret; + + /* TODO: support multiple DMA map operations in parallel */ + mutex_lock(&mudev->dev_lock); + if (mudev->dma_map) { + mutex_unlock(&mudev->dev_lock); + muser_dbg("another DMA map operation is ongoing"); + return -EBUSY; + } + + dma_map = kmalloc(sizeof(struct vfio_dma_mapping), GFP_KERNEL); + if (!dma_map) { + mutex_unlock(&mudev->dev_lock); + return -ENOMEM; + } + mudev->dma_map = dma_map; + mutex_unlock(&mudev->dev_lock); + + /* get vfio client pages to be used for DMA map */ + ret = get_dma_map(mudev, dma_map, map); + if (ret) + goto out; + + /* skip anonymous pages */ + if (has_anonymous_pages(mudev->dma_map)) + goto put_pages; + + ret = muser_process_dma_map(mudev, map->flags); + if (ret) + goto put_pages; + + /* add to the 
dma_list */ + mutex_lock(&mudev->dev_lock); + list_add_tail(&dma_map->entry, &mudev->dma_list); + mudev->dma_map = NULL; + mutex_unlock(&mudev->dev_lock); + return 0; + +put_pages: + put_dma_map(mudev, dma_map, NR_PAGES(dma_map->length)); + +out: + kfree(dma_map); + mutex_lock(&mudev->dev_lock); + mudev->dma_map = NULL; + mutex_unlock(&mudev->dev_lock); + return ret; +} + +/* called with mudev.dev_lock held */ +static struct vfio_dma_mapping *__find_dma_map(struct muser_dev *mudev, + unsigned long iova) +{ + struct vfio_dma_mapping *dma_map; + + list_for_each_entry(dma_map, &mudev->dma_list, entry) { + if (dma_map->iova == iova) + return dma_map; + } + return NULL; +} + +static int muser_iommu_dma_unmap(struct muser_dev *const mudev, + struct vfio_iommu_type1_dma_unmap *const unmap) +{ + int err; + int len; + struct vfio_dma_mapping *dma_map; + + mutex_lock(&mudev->dev_lock); + dma_map = __find_dma_map(mudev, unmap->iova); + if (!dma_map) { + mutex_unlock(&mudev->dev_lock); + muser_dbg("failed to find dma map for iova:%llu\n", unmap->iova); + return -EINVAL; + } + list_del(&dma_map->entry); + mutex_unlock(&mudev->dev_lock); + + len = dma_map->length; + err = muser_process_dma_unmap(mudev, dma_map); + if (unlikely(err)) + muser_dbg("failed to request PCI server to munmap: %d", err); + + err = put_dma_map(mudev, dma_map, NR_PAGES(len)); + if (unlikely(err)) { + muser_dbg("failed to tear down DMA map: %d", err); + goto out; + } + + /* XXX: Do we need this? */ + unmap->size = len; +out: + return err; +} + +/* + * FIXME There can be multiple DMA map calls per device. If each of these calls + * are serialised (this can be enforced by muser), then we tell PCI server to + * mmap the control device. Do we need to distinguish between the different + * DMA map calls at this stage if we can enforce only one outstanding DMA map + * call? What happens when the DMA map happens too early, before GET_DEVICE_FD + * is called? 
+ */ +static int muser_iommu_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct muser_dev *mudev; + int err; + + BUG_ON(!nb); + BUG_ON(!data); + + mudev = container_of(nb, struct muser_dev, iommu_notifier); + switch (action) { + case VFIO_IOMMU_NOTIFY_DMA_MAP: + err = muser_iommu_dma_map(mudev, + (struct vfio_iommu_type1_dma_map *) + data); + break; + case VFIO_IOMMU_NOTIFY_DMA_UNMAP: + err = muser_iommu_dma_unmap(mudev, + (struct vfio_iommu_type1_dma_unmap + *)data); + break; + default: + muser_dbg("bad action=%lx", action); + err = -EINVAL; + } + + if (unlikely(err)) + return NOTIFY_BAD; + return NOTIFY_OK; +} + +static int register_notifier(struct mdev_device *const mdev) +{ + unsigned long events = + VFIO_IOMMU_NOTIFY_DMA_MAP | VFIO_IOMMU_NOTIFY_DMA_UNMAP; + struct muser_dev *const mudev = mdev_get_drvdata(mdev); + + memset(&mudev->iommu_notifier, 0, sizeof(mudev->iommu_notifier)); + mudev->iommu_notifier.notifier_call = muser_iommu_notifier; + return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &events, &mudev->iommu_notifier); +} + +int muser_open(struct mdev_device *mdev) +{ + int err; + struct muser_dev *mudev = mdev_get_drvdata(mdev); + + WARN_ON(mudev == NULL); + + if (atomic_cmpxchg(&mudev->mdev_opened, 0, 1) != 0) { + muser_dbg("device already open"); + return -EBUSY; + } + + err = register_notifier(mdev); + if (unlikely(err)) { + int err2; + /* + * TODO we might have triggered some notifiers which will have + * caused PCI server to mmap. If open fails then PCI server dies + * therefore things get automatically cleaned up (e.g. + * vfio_unpin etc.)? 
+ */ + atomic_dec(&mudev->mdev_opened); + muser_dbg("failed to register notifier: %d", err); + err2 = dma_unmap_all(mudev, false); + if (unlikely(err2)) + muser_dbg("failed to DMA unmap all regions: %d", + err2); + err2 = + vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &mudev->iommu_notifier); + if (unlikely(err2)) + muser_info("failed to unregister notifier: %d", err); + + } + + return err; +} + +static int dma_unmap_all(struct muser_dev *mudev, bool skip_user) +{ + struct vfio_dma_mapping *dma_map; + unsigned long length; + LIST_HEAD(head); + + mutex_lock(&mudev->dev_lock); + while (!list_empty(&mudev->dma_list)) { + dma_map = list_first_entry(&mudev->dma_list, + struct vfio_dma_mapping, entry); + list_move(&dma_map->entry, &head); + } + mutex_unlock(&mudev->dev_lock); + + while (!list_empty(&head)) { + int err; + + dma_map = list_first_entry(&head, struct vfio_dma_mapping, + entry); + list_del(&dma_map->entry); + if (!skip_user) { + err = muser_process_dma_unmap(mudev, dma_map); + if (unlikely(err)) { + muser_alert("unmap request failed IOVA=%lx: %d", + dma_map->iova, err); + continue; + } + } + + length = dma_map->length; + err = put_dma_map(mudev, dma_map, NR_PAGES(length)); + if (unlikely(err)) + muser_alert("failed to unmap DMA IOVA=%lx: %d", + dma_map->iova, err); + } + return 0; +} + +void muser_close(struct mdev_device *mdev) +{ + struct muser_dev *mudev = mdev_get_drvdata(mdev); + int err; + + err = dma_unmap_all(mudev, false); + if (unlikely(err)) + muser_alert("failed to remove one or more DMA maps"); + + err = vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &mudev->iommu_notifier); + if (unlikely(err)) + muser_info("failed to unregister notifier: %d", err); + + WARN_ON(atomic_read(&mudev->mdev_opened) == 0); + atomic_dec(&mudev->mdev_opened); + + /* TODO: Replace any pending mucmd back in cmd_list. 
*/ +} + +static int +pin_pages(struct mudev_cmd *mucmd, char __user *buf, size_t count, + int writeable) +{ + mucmd->pg_map.len = count; + return do_pin_pages(buf, count, writeable, &mucmd->pg_map); +} + +void dump_buffer(unsigned char const *const buf, uint32_t count) +{ +#if defined(DEBUG) + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 4, 1, buf, count, + false); +#endif +} + +ssize_t muser_read(struct mdev_device *mdev, char __user *buf, size_t count, + loff_t *ppos) +{ + struct muser_dev *mudev = mdev_get_drvdata(mdev); + struct mudev_cmd mucmd = { 0 }; + int err; + + WARN_ON(mudev == NULL); + + /* Setup mucmd and ping pages of the calling context. */ + mucmd.type = MUSER_READ; + err = pin_pages(&mucmd, buf, count, 1); + if (err != 0) + return err; + + /* Setup muser_cmd for server context. */ + mucmd.muser_cmd.type = MUSER_READ; + mucmd.muser_cmd.rw.count = count; + mucmd.muser_cmd.rw.pos = *ppos; + + muser_dbg("R %lx@%llx", mucmd.muser_cmd.rw.count, + mucmd.muser_cmd.rw.pos); + + /* Process mudev_cmd in libmuser context. */ + err = muser_process_cmd(mudev, &mucmd); + if (err != 0) + count = -1; + *ppos = mucmd.muser_cmd.rw.pos; + + unpin_pages(&mucmd.pg_map); + + dump_buffer(buf, count); + return count; +} + +ssize_t muser_write(struct mdev_device *mdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct muser_dev *mudev = mdev_get_drvdata(mdev); + struct mudev_cmd mucmd = { 0 }; + int err; + size_t _count = count; + loff_t _pos = *ppos; + + muser_dbg("W %lx@%llx", count, *ppos); + dump_buffer(buf, count); + + /* Setup mucmd and pin pages of the calling context. */ + mucmd.type = MUSER_WRITE; + err = pin_pages(&mucmd, (char __user *)buf, count, 0); + if (err != 0) + return err; + + /* Setup muser_cmd for libmuser context. */ + mucmd.muser_cmd.type = MUSER_WRITE; + mucmd.muser_cmd.rw.count = count; + mucmd.muser_cmd.rw.pos = *ppos; + + /* Process mudev_cmd in server context. 
*/ + err = muser_process_cmd(mudev, &mucmd); + if (err != 0) + count = -1; + *ppos = mucmd.muser_cmd.rw.pos; + + unpin_pages(&mucmd.pg_map); + + if (mucmd.muser_cmd.err) + muser_info("PCI config write %ld@0x%llx not handled: %d", + _count, _pos, mucmd.muser_cmd.err); + + return count; +} + +static int +bounce_fds(struct mudev_cmd *mucmd, void __user *data, int user_data_size) +{ + int count = mucmd->muser_cmd.ioctl.data.irq_set.count; + int data_size = count * sizeof(int32_t); + int *user_fds; + int i; + int ret = 0; + + if (user_data_size < data_size) + return -EINVAL; + + mucmd->fds = kcalloc(count, sizeof(*mucmd->fds), GFP_KERNEL); + if (mucmd->fds == NULL) + return -ENOMEM; + + user_fds = memdup_user(data, data_size); + if (IS_ERR(user_fds)) { + kfree(mucmd->fds); + mucmd->fds = NULL; + return PTR_ERR(user_fds); + } + + for (i = 0; i < count; i++) { + if (user_fds[i] == -1) + continue; + mucmd->fds[i] = fget(user_fds[i]); + if (mucmd->fds[i] == NULL) { + ret = -EBADF; + goto err; + } + } + + kfree(user_fds); + + return 0; + +err: + for (i--; i >= 0; i--) + fput(mucmd->fds[i]); + kfree(user_fds); + kfree(mucmd->fds); + mucmd->fds = NULL; + + return ret; +} + +static unsigned int get_minsz(unsigned int cmd) +{ + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return offsetofend(struct vfio_device_info, num_irqs); + case VFIO_DEVICE_GET_REGION_INFO: + return offsetofend(struct vfio_region_info, offset); + case VFIO_DEVICE_GET_IRQ_INFO: + return offsetofend(struct vfio_irq_info, count); + case VFIO_DEVICE_SET_IRQS: + return offsetofend(struct vfio_irq_set, count); + } + return -1; +} + +static unsigned int get_argsz(unsigned int cmd, struct mudev_cmd *mucmd) +{ + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return mucmd->muser_cmd.ioctl.data.dev_info.argsz; + case VFIO_DEVICE_GET_REGION_INFO: + return mucmd->muser_cmd.ioctl.data.reg_info.argsz; + case VFIO_DEVICE_GET_IRQ_INFO: + return mucmd->muser_cmd.ioctl.data.irq_info.argsz; + case VFIO_DEVICE_SET_IRQS: + return 
mucmd->muser_cmd.ioctl.data.irq_set.argsz; + } + return -1; +} + +static int muser_ioctl_setup_cmd(struct mudev_cmd *mucmd, unsigned int cmd, + unsigned long arg) +{ + unsigned int minsz; + unsigned int argsz; + int err; + + /* Determine smallest argsz we need for this command. */ + minsz = get_minsz(cmd); + if (minsz == -1) + return -EOPNOTSUPP; + + /* Copy caller-provided arg. */ + err = muser_copyin(&mucmd->muser_cmd.ioctl.data, (void __user *)arg, + minsz); + if (unlikely(err)) + return err; + + /* Fetch argsz provided by caller. */ + argsz = get_argsz(cmd, mucmd); + if (argsz == -1) + return -EINVAL; + + /* Ensure provided size is at least the minimum required. */ + if (argsz < minsz) + return -EINVAL; + + /* Fetch potential data provided on SET_IRQS. */ + if (cmd == VFIO_DEVICE_SET_IRQS) { + unsigned int flags = mucmd->muser_cmd.ioctl.data.irq_set.flags; + + switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) { + case VFIO_IRQ_SET_DATA_EVENTFD: + /* Lookup eventfds and bounce references to mucmd. */ + err = bounce_fds(mucmd, (void __user *) (arg + minsz), + argsz - minsz); + if (err) { + muser_dbg("failed to bounce fds: %d\n", err); + return err; + } + break; + } + } + + /* Pin pages of the calling context. */ + err = pin_pages(mucmd, (char __user *)arg, argsz, 1); + if (unlikely(err)) { + muser_dbg("failed to pin pages: %d\n", err); + return err; + } + + return err; +} + +static long muser_ioctl(struct mdev_device *mdev, unsigned int cmd, + unsigned long arg) +{ + struct muser_dev *mudev = mdev_get_drvdata(mdev); + struct mudev_cmd mucmd = { 0 }; + int err; + + muser_dbg("mdev=%p, cmd=%u, arg=0x%lX\n", mdev, cmd, arg); + + if (cmd == VFIO_DEVICE_RESET) { + /* + * Qemu-vfio(check vfio_pci_reset()) takes care of + * enabling/disabling Interrupts. + * + * FIXME: + * No need to block pci config access as only one + * mdev_parent_ops is allowed to execute at a time. + * + * Returning -EAGAIN if client tries to send multiple resets. 
+ */ + if (!device_trylock(mudev->dev)) + return -EAGAIN; + } else { + err = muser_ioctl_setup_cmd(&mucmd, cmd, arg); + if (err) + return err; + } + + /* Setup common mucmd records. */ + mucmd.type = MUSER_IOCTL; + mucmd.muser_cmd.type = MUSER_IOCTL; + mucmd.muser_cmd.ioctl.vfio_cmd = cmd; + + /* Process mudev_cmd in server context. */ + err = muser_process_cmd(mudev, &mucmd); + if (err != 0) { + muser_dbg("failed to process command: %d\n", err); + err = -1; + } + + if (cmd == VFIO_DEVICE_RESET) { + device_unlock(mudev->dev); + } else { + /* Release resources. */ + unpin_pages(&mucmd.pg_map); + + /* maybe allocated for VFIO_IRQ_SET_DATA_EVENTFD */ + kfree(mucmd.fds); + kfree(mucmd.data_fds); + } + + return err; +} + +static int muser_mmap(struct mdev_device *const mdev, + struct vm_area_struct *const vma) +{ + struct muser_dev *mudev = mdev_get_drvdata(mdev); + struct mudev_cmd mucmd = { 0 }; + int err; + + BUG_ON(!mudev); + BUG_ON(!vma); + + /* + * Checking vm_flags cannot be easily done in user space as we can't + * access mm.h, so we have to do it here. Maybe implement the reverse + * of calc_vm_prot_bits/calc_vm_flag_bits? + */ + if ((vma->vm_flags & ~(VM_READ | VM_WRITE | VM_SHARED | VM_MAYREAD | + VM_MAYWRITE | VM_MAYEXEC | VM_MAYSHARE))) { + muser_dbg("bag flags=0x%lx", vma->vm_flags); + return -EINVAL; + } + + mucmd.type = MUSER_MMAP; + mucmd.muser_cmd.type = MUSER_MMAP; + mucmd.muser_cmd.mmap.request.start = vma->vm_start; + mucmd.muser_cmd.mmap.request.end = vma->vm_end; + mucmd.muser_cmd.mmap.request.pgoff = vma->vm_pgoff; + mucmd.mmap_len = vma->vm_end - vma->vm_start; + + /* Process mudev_cmd in server context. 
*/ + err = muser_process_cmd(mudev, &mucmd); + if (unlikely(err)) { + muser_info("failed to mmap: %d", err); + return err; + } + + return vm_insert_pages(vma, mucmd.pg_map.pages, mucmd.pg_map.nr_pages); +} + +struct mdev_parent_ops muser_mdev_fops = { + .owner = THIS_MODULE, + .supported_type_groups = mdev_type_groups, + .create = muser_create, + .remove = muser_remove, + .open = muser_open, + .release = muser_close, + .read = muser_read, + .write = muser_write, + .ioctl = muser_ioctl, + .mmap = muser_mmap, +}; + +/* copy vfio-client pages(mucmd.pg_map) to server(arg) */ +static int bounce_out(void __user *arg, size_t argsz, struct mudev_cmd *mucmd) +{ + unsigned long to_copy, left; + void __user *to; + void *from; + unsigned int offset; + int i, ret = 0; + + left = mucmd->pg_map.len; + if (argsz < left) + return -EINVAL; + + offset = mucmd->pg_map.offset; + + for (i = 0; i < mucmd->pg_map.nr_pages && ret == 0; i++) { + to_copy = min(left, PAGE_SIZE - offset); + to = arg + (mucmd->pg_map.len - left); + from = page_to_virt(mucmd->pg_map.pages[i]) + offset; + + ret = muser_copyout(to, from, to_copy); + if (ret) + return ret; + + left -= to_copy; + + /* Must be zero after first iteration. */ + offset = 0; + } + WARN_ON(left != 0); + + return 0; +} + +/* copy from server(uaddr) to vfio-client pages(mucmd.pg_map) */ +static int bounce_in(struct mudev_cmd *mucmd, void __user *uaddr) +{ + unsigned long to_copy, left; + void __user *from; + void *to; + unsigned int offset; + int i, ret; + + left = mucmd->pg_map.len; + offset = mucmd->pg_map.offset; + + for (i = 0; i < mucmd->pg_map.nr_pages; i++) { + to_copy = min(left, PAGE_SIZE - offset); + from = uaddr + (mucmd->pg_map.len - left); + to = page_to_virt(mucmd->pg_map.pages[i]) + offset; + + ret = muser_copyin(to, from, to_copy); + if (ret) + return ret; + + left -= to_copy; + + /* Must be zero after first iteration. 
*/ + offset = 0; + } + WARN_ON(left != 0); + + return 0; +} + +static long install_fds(struct mudev_cmd *mucmd) +{ + int count = mucmd->muser_cmd.ioctl.data.irq_set.count; + int i; + long ret; + + mucmd->data_fds = kcalloc(count, sizeof(int32_t), GFP_KERNEL); + if (mucmd->data_fds == NULL) + return -ENOMEM; + + for (i = 0; i < count; i++) { + if (mucmd->fds[i] == NULL) { + mucmd->data_fds[i] = -1; + continue; + } + mucmd->data_fds[i] = get_unused_fd_flags(0); + if (mucmd->data_fds[i] < 0) { + ret = mucmd->data_fds[i]; + muser_err("unable to get unused fd: %ld", ret); + goto err; + } + fd_install(mucmd->data_fds[i], mucmd->fds[i]); + } + + return 0; + +err: + for (i--; i >= 0; i--) + put_unused_fd(mucmd->data_fds[i]); + kfree(mucmd->data_fds); + + return ret; +} + +static inline int maybe_install_fds(struct mudev_cmd *mucmd) +{ + unsigned int flags = mucmd->muser_cmd.ioctl.data.irq_set.flags; + long ret = 0; + + if ((mucmd->muser_cmd.type == MUSER_IOCTL) && + (mucmd->muser_cmd.ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS)) { + switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) { + case VFIO_IRQ_SET_DATA_EVENTFD: + ret = install_fds(mucmd); + if (unlikely(ret)) + muser_dbg("failed to install fds: %ld", ret); + break; + /* TODO: SET_DATA_BOOL */ + } + } + + return ret; +} + +static inline int mmap_done(struct mudev_cmd * const mucmd) +{ + struct muser_cmd *cmd = &mucmd->muser_cmd; + char __user *addr = (char __user *) cmd->mmap.response.addr; + int ret; + + if (cmd->err < 0) + return -1; + ret = do_pin_pages(addr, mucmd->mmap_len, 1, &mucmd->pg_map); + if (ret) { + muser_alert("failed to pin pages: %d", ret); + mucmd->pg_map.pages = NULL; + mucmd->pg_map.nr_pages = 0; + } + return ret; +} + +static long libmuser_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct muser_dev *mudev = filep->private_data; + struct mudev_cmd *mucmd; + unsigned long offset; + long ret = -EINVAL; + + WARN_ON(mudev == NULL); + switch (cmd) { + case MUSER_DEV_CMD_WAIT: + /* 
Block until a request come from vfio. */ + ret = wait_event_interruptible(mudev->user_wait_q, + !list_empty(&mudev->cmd_list)); + if (unlikely(ret)) { + muser_dbg("failed to wait for user space: %ld", ret); + goto out; + } + + /* Pick and remove the mucmd from the cmd_list. */ + mutex_lock(&mudev->dev_lock); + WARN_ON(list_empty(&mudev->cmd_list)); + mucmd = list_first_entry(&mudev->cmd_list, struct mudev_cmd, + entry); + list_del(&mucmd->entry); + mutex_unlock(&mudev->dev_lock); + + /* Keep a reference to mudev_cmd in mudev. */ + WARN_ON(mudev->mucmd_pending != NULL); + mudev->mucmd_pending = mucmd; + /* TODO: These WARN_ON()s should really just detach mudev. */ + + /* Populate userspace with mucmd. */ + ret = muser_copyout((void __user *)arg, &mucmd->muser_cmd, + sizeof(struct muser_cmd)); + if (ret) + return -EFAULT; + + /* Install FDs on VFIO_SET_IRQS */ + ret = maybe_install_fds(mucmd); + if (ret) + return ret; + + break; + case MUSER_DEV_CMD_DONE: + /* This is only called when a command is pending. */ + if (mudev->mucmd_pending == NULL) { + muser_dbg("done but no command pending"); + return -1; + } + + /* Fetch (and clear) the pending command. */ + mucmd = mudev->mucmd_pending; + mudev->mucmd_pending = NULL; + + /* Fetch response from userspace. */ + ret = muser_copyin(&mucmd->muser_cmd, (void __user *)arg, + sizeof(struct muser_cmd)); + if (ret) + goto out; + + switch (mucmd->type) { + case MUSER_IOCTL: + offset = offsetof(struct muser_cmd, ioctl); + offset += offsetof(struct muser_cmd_ioctl, data); + ret = bounce_in(mucmd, (void __user *)(arg + offset)); + break; + case MUSER_MMAP: + ret = mmap_done(mucmd); + break; + case MUSER_WRITE: + case MUSER_READ: + case MUSER_DMA_MMAP: + case MUSER_DMA_MUNMAP: + break; + default: + muser_alert("bad command %d", mucmd->type); + ret = -EINVAL; + break; + } + + /* Wake up vfio client. 
*/ + up(&mudev->sem); + break; + + default: + muser_info("bad ioctl 0x%x", cmd); + return -1; + } + +out: + return ret; +} + +#ifdef CONFIG_COMPAT +static long libmuser_compat_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)compat_ptr(arg); + return libmuser_unl_ioctl(filep, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static struct muser_dev *muser_get_dev_from_minor(int minor) +{ + struct muser_dev *mudev; + + /* Locate mudev using idr. */ + mutex_lock(&muser.muser_lock); + mudev = idr_find(&muser.dev_idr, minor); + mutex_unlock(&muser.muser_lock); + + return mudev; +} + +static int libmuser_open(struct inode *inode, struct file *filep) +{ + struct muser_dev *mudev; + int opened; + + /* Fetch corresponding mudev. */ + mudev = muser_get_dev_from_minor(iminor(inode)); + if (!mudev) + return -ENOENT; + + /* Allow only one server for each mudev. */ + opened = atomic_cmpxchg(&mudev->srv_opened, 0, 1); + if (opened) + return -EBUSY; + + WARN_ON(filep->private_data != NULL); + filep->private_data = mudev; + + return 0; +} + +static int libmuser_release(struct inode *inode, struct file *filep) +{ + struct muser_dev *mudev = filep->private_data; + int err; + + WARN_ON(mudev == NULL); + mutex_lock(&mudev->dev_lock); + /* + * FIXME must be per filep + */ + if (mudev->mucmd_pending) { + muser_info("moving command back in list"); + list_add_tail(&mudev->mucmd_pending->entry, &mudev->cmd_list); + mudev->mucmd_pending = NULL; + } + mutex_unlock(&mudev->dev_lock); + + err = dma_unmap_all(mudev, true); + if (unlikely(err)) + muser_alert("failed to remove DMA maps"); + + filep->private_data = NULL; + atomic_dec(&mudev->srv_opened); + + return 0; +} + +static inline int irq_set_data_eventfd(void __user * const buf, + struct mudev_cmd * const mucmd) +{ + return muser_copyout((void __user *)buf, mucmd->data_fds, + sizeof(__s32) * mucmd->muser_cmd.ioctl.data.irq_set.count); +} + +static inline int irq_set_data_bool(void __user * const buf, + 
struct mudev_cmd * const mucmd) +{ + return muser_copyout((void __user *)buf, mucmd->data_fds, + sizeof(__u8) * mucmd->muser_cmd.ioctl.data.irq_set.count); +} + +/* + * Called by libmuser for kernel->user transfers. + */ +static ssize_t libmuser_read(struct file *filp, char __user *buf, + size_t bufsz, loff_t *ppos) +{ + struct muser_dev *mudev = filp->private_data; + struct mudev_cmd *mucmd = mudev->mucmd_pending; + int ret = -EINVAL; + uint32_t irq_set_flags; + + if (!mucmd || !mudev) { + muser_dbg("bad arguments"); + return -EINVAL; + } + + if (!access_ok(buf, bufsz)) { + muser_dbg("bad permissions"); + return -EFAULT; + } + + switch (mucmd->type) { + case MUSER_WRITE: + ret = bounce_out(buf, bufsz, mucmd); + if (ret) { + muser_dbg("failed to copy to user: %d", ret); + goto err; + } + break; + case MUSER_IOCTL: + /* FIXME move case into separate function */ + if (mucmd->muser_cmd.ioctl.vfio_cmd != VFIO_DEVICE_SET_IRQS) { + muser_dbg("expected VFIO command %d, got %d instead", + VFIO_DEVICE_SET_IRQS, + mucmd->muser_cmd.ioctl.vfio_cmd); + goto err; + } + irq_set_flags = mucmd->muser_cmd.ioctl.data.irq_set.flags & + VFIO_IRQ_SET_DATA_TYPE_MASK; + switch (irq_set_flags) { + case VFIO_IRQ_SET_DATA_EVENTFD: + ret = irq_set_data_eventfd((void __user *)buf, mucmd); + if (unlikely(ret)) { + muser_dbg("failed to set data eventfd: %d", + ret); + goto err; + } + break; + case VFIO_IRQ_SET_DATA_BOOL: + ret = irq_set_data_bool((void __user *)buf, mucmd); + if (unlikely(ret)) + goto err; + break; + default: + muser_dbg("bad VFIO set IRQ flags %d", irq_set_flags); + goto err; + } + break; + default: + muser_dbg("bad muser command %d", mucmd->type); + goto err; + } + return bufsz; + +err: + return ret; +} + +/* + * Called by libmuser for user->kernel transfers. 
+ */ +static ssize_t libmuser_write(struct file *filp, const char __user *buf, + size_t bufsz, loff_t *ppos) +{ + struct muser_dev *mudev = filp->private_data; + struct mudev_cmd *mucmd = mudev->mucmd_pending; + struct muser_cmd muser_cmd; + int ret; + + if (!mucmd || !mudev) { + muser_dbg("bad arguments"); + return -EINVAL; + } + + if (!access_ok(buf, bufsz)) { + muser_dbg("bad permissions"); + return -EFAULT; + } + + ret = muser_copyin(&muser_cmd, (void __user *)buf, + sizeof(struct muser_cmd)); + if (ret) + return ret; + + if (mucmd->type != muser_cmd.type) { + muser_dbg("bad command %d", muser_cmd.type); + return -EINVAL; + } + + WARN_ON(muser_cmd.type != MUSER_READ); + ret = bounce_in(mucmd, muser_cmd.rw.buf); + if (ret) + return ret; + + return bufsz; +} + +static const struct file_operations libmuser_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = libmuser_unl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = libmuser_compat_ioctl, +#endif + .open = libmuser_open, + .release = libmuser_release, + .mmap = libmuser_mmap, + .read = libmuser_read, + .write = libmuser_write, +}; + +static void muser_device_release(struct device *dev) +{ + muser_info("muser dev released\n"); +} + +static char *muser_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, DRIVER_NAME "/%s", dev_name(dev)); +} + +static int __init muser_init(void) +{ + int ret; + + /* Initialise idr. */ + idr_init(&muser.dev_idr); + mutex_init(&muser.muser_lock); + INIT_LIST_HEAD(&muser.dev_list); + + /* Initialise class. */ + muser.class = class_create(THIS_MODULE, DRIVER_NAME); + if (IS_ERR(muser.class)) + return PTR_ERR(muser.class); + muser.class->devnode = muser_devnode; + + /* Allocate and register a chardev for muser devices. 
*/ + ret = alloc_chrdev_region(&muser.muser_devt, 0, MINORMASK + 1, + DRIVER_NAME); + if (ret) + goto err_alloc_chrdev; + + cdev_init(&muser.muser_cdev, &libmuser_fops); + ret = cdev_add(&muser.muser_cdev, muser.muser_devt, MINORMASK + 1); + if (ret) + goto err_cdev_add; + + muser.dev.class = muser.class; + muser.dev.release = muser_device_release; + dev_set_name(&muser.dev, "%s", DRIVER_NAME); + + ret = device_register(&muser.dev); + if (ret) + goto err_device_register; + + /* Register ourselves with mdev. */ + ret = mdev_register_device(&muser.dev, &muser_mdev_fops); + if (ret) + goto err_mdev_register_device; + + return 0; + +err_mdev_register_device: + device_unregister(&muser.dev); +err_device_register: + cdev_del(&muser.muser_cdev); +err_cdev_add: + unregister_chrdev_region(muser.muser_devt, MINORMASK + 1); +err_alloc_chrdev: + class_destroy(muser.class); + muser.class = NULL; + return ret; +} + +static void __exit muser_cleanup(void) +{ + struct muser_dev *mudev, *tmp; + + /* Remove all devices. */ + mutex_lock(&muser.muser_lock); + list_for_each_entry_safe(mudev, tmp, &muser.dev_list, dlist_entry) { + __muser_deinit_dev(mudev); + kfree(mudev); + } + mutex_unlock(&muser.muser_lock); + + /* Unregister with mdev. */ + muser.dev.bus = NULL; + mdev_unregister_device(&muser.dev); + + /* Cleanup everything else. */ + device_unregister(&muser.dev); + idr_destroy(&muser.dev_idr); + cdev_del(&muser.muser_cdev); + unregister_chrdev_region(muser.muser_devt, MINORMASK + 1); + class_destroy(muser.class); + muser.class = NULL; +} + +module_init(muser_init); +module_exit(muser_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/kmod/muser.h b/kmod/muser.h new file mode 100644 index 0000000..14fecd6 --- /dev/null +++ b/kmod/muser.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2019, Nutanix Inc. All rights reserved. 
+ * + * Author: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + */ + +#ifndef _LINUX_MUSER_H +#define _LINUX_MUSER_H + +#ifndef __KERNEL__ +#include <sys/types.h> +#endif + +#include <linux/ioctl.h> +#include <linux/vfio.h> + +#define MUSER_DEVNODE "muser" + +enum muser_cmd_type { + MUSER_IOCTL = 1, + MUSER_READ, + MUSER_WRITE, + MUSER_MMAP, + MUSER_DMA_MMAP, + MUSER_DMA_MUNMAP, +}; + +struct muser_cmd_rw { + size_t count; + loff_t pos; + char *buf; /* only used for write */ +}; + +struct muser_cmd_ioctl { + int vfio_cmd; + union { + struct vfio_device_info dev_info; + struct vfio_region_info reg_info; + struct vfio_irq_info irq_info; + struct vfio_irq_set irq_set; + } data; +}; + +union muser_cmd_mmap { + struct { + unsigned long start; + unsigned long end; + unsigned long flags; + unsigned long pgoff; + } request; + struct { + unsigned long addr; + } response; +}; + +struct muser_cmd { + enum muser_cmd_type type; + union { + struct muser_cmd_rw rw; + struct muser_cmd_ioctl ioctl; + union muser_cmd_mmap mmap; + }; + int err; +}; + +/* ioctl cmds valid for /dev/muser/<uuid> */ +#define MUSER_DEV_CMD_WAIT _IOW('M', 1, struct muser_cmd) +#define MUSER_DEV_CMD_DONE _IOW('M', 2, struct muser_cmd) + +#endif /* _LINUX_MUSER_H */ diff --git a/lib/.indent.pro b/lib/.indent.pro new file mode 100644 index 0000000..52ef8f2 --- /dev/null +++ b/lib/.indent.pro @@ -0,0 +1,4 @@ +-nbad -bap -nbc -bbo -hnl -br -brs -c33 -cd33 -ncdb -ce -ci4 +-cli0 -d0 -di1 -nfc1 -i4 -ip0 -l80 -lp -npcs -nprs -psl -sai +-saf -saw -ncs -nsc -nsob -nfca -cp33 -ss -ts8 -il0 +-nut -blf diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt new file mode 100644 index 0000000..6d3d0ae --- /dev/null +++ b/lib/CMakeLists.txt @@ -0,0 +1,46 @@ +# +# Copyright (c) 2019 Nutanix Inc. All rights reserved. 
+# +# Authors: Thanos Makatos <thanos@nutanix.com> +# Swapnil Ingle <swapnil.ingle@nutanix.com> +# Felipe Franciosi <felipe@nutanix.com> +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Nutanix nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +add_library(muser SHARED + ../kmod/muser.h + muser.h + pci.h + pmcap.h + msicap.h + pxcap.h + common.h + dma.h + dma.c + libmuser.c + libmuser_pci.c) +set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h;pmcap.h;msicap.h;pxcap.h") +install(TARGETS muser + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/muser) diff --git a/lib/common.h b/lib/common.h new file mode 100644 index 0000000..4fbc048 --- /dev/null +++ b/lib/common.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#include <stdint.h> + +#define PAGE_SIZE sysconf(_SC_PAGE_SIZE) +#define PAGE_ALIGNED(x) (((x) & ((typeof(x))(PAGE_SIZE) - 1)) == 0) + +#define BIT(nr) (1UL << (nr)) + +#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0])) + +#define likely(e) __builtin_expect(!!(e), 1) +#define unlikely(e) __builtin_expect(e, 0) + +#define ROUND_DOWN(x, a) ((x) & ~((a)-1)) +#define ROUND_UP(x,a) ROUND_DOWN((x)+(a)-1, a) + +void lm_log(lm_ctx_t const *const lm_ctx, const lm_log_lvl_t lvl, + char const *const fmt, ...); + +void dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix, + unsigned char const *const buf, uint32_t count); + + +#endif /* __COMMON_H__ */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/dma.c b/lib/dma.c new file mode 100644 index 0000000..5c9455f --- /dev/null +++ b/lib/dma.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Mike Cui <cui@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <stdio.h> +#include <sys/param.h> + +#include <stddef.h> +#include <stdbool.h> +#include <string.h> +#include <stdlib.h> + +#include <errno.h> + +#include "dma.h" + +static inline ssize_t +fd_get_blocksize(int fd) +{ + struct stat st; + + if (fstat(fd, &st) != 0) + return -1; + + return st.st_blksize; +} + +/* Returns true if 2 fds refer to the same file. + If any fd is invalid, return false. 
*/ +static inline bool +fds_are_same_file(int fd1, int fd2) +{ + struct stat st1, st2; + + return (fstat(fd1, &st1) == 0 && fstat(fd2, &st2) == 0 && + st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino); +} + +dma_controller_t * +dma_controller_create(int max_regions) +{ + dma_controller_t *dma; + + dma = malloc(offsetof(dma_controller_t, regions) + + max_regions * sizeof(dma->regions[0])); + + if (dma == NULL) { + return dma; + } + + dma->max_regions = max_regions; + dma->nregions = 0; + memset(dma->regions, 0, max_regions * sizeof(dma->regions[0])); + + return dma; +} + +static void +_dma_controller_do_remove_region(dma_memory_region_t * const region) +{ + assert(region); +#if DMA_MAP_FAST_IMPL + dma_unmap_region(region, region->virt_addr, region->size); +#endif + (void)close(region->fd); +} + +/* FIXME not thread safe */ +int +dma_controller_remove_region(dma_controller_t * dma, dma_addr_t dma_addr, + size_t size, int fd) +{ + int idx; + dma_memory_region_t *region; + + assert(dma); + + for (idx = 0; idx < dma->nregions; idx++) { + region = &dma->regions[idx]; + if (region->dma_addr == dma_addr && region->size == size && + fds_are_same_file(region->fd, fd)) { + _dma_controller_do_remove_region(region); + if (dma->nregions > 1) + memcpy(region, &dma->regions[dma->nregions - 1], + sizeof *region); + dma->nregions--; + return 0; + } + } + return -ENOENT; +} + +static inline void +dma_controller_remove_regions(lm_ctx_t * const ctx, + dma_controller_t * const dma) +{ + int i; + + assert(dma); + + for (i = 0; i < dma->nregions; i++) { + dma_memory_region_t *region = &dma->regions[i]; + + lm_log(ctx, LM_INF, "unmap vaddr=%lx IOVA=%lx\n", + region->virt_addr, region->dma_addr); + + _dma_controller_do_remove_region(region); + } +} + +void +dma_controller_destroy(lm_ctx_t * const ctx, dma_controller_t * dma) +{ + dma_controller_remove_regions(ctx, dma); + free(dma); +} + +int +dma_controller_add_region(lm_ctx_t * const lm_ctx, dma_controller_t * dma, + dma_addr_t 
dma_addr, size_t size, + int fd, off_t offset) +{ + int idx; + dma_memory_region_t *region; + int page_size; + + for (idx = 0; idx < dma->nregions; idx++) { + region = &dma->regions[idx]; + + /* First check if this is the same exact region. */ + if (region->dma_addr == dma_addr && region->size == size) { + if (offset != region->offset) { + lm_log(lm_ctx, LM_ERR, "bad offset for new DMA region %lx+%lx, " + "want=%d, existing=%d\n", + dma_addr, size, offset, region->offset); + goto err; + } + if (!fds_are_same_file(region->fd, fd)) { + /* + * Printing the file descriptors here doesn't really make + * sense as they can be different but actually pointing to + * the same file, however in the majority of cases we'll be + * using a single fd. + */ + lm_log(lm_ctx, LM_ERR, "bad fd=%d for new DMA region %lx-%lx, " + "existing fd=%d\n", fd, region->fd); + goto err; + } + return idx; + } + + /* Check for overlap, i.e. start of one region is within another. */ + if ((dma_addr >= region->dma_addr && + dma_addr < region->dma_addr + region->size) || + (region->dma_addr >= dma_addr && + region->dma_addr < dma_addr + size)) { + lm_log(lm_ctx, LM_INF, "new DMA region %lx+%lx overlaps with DMA " + "region %lx-%lx\n", dma_addr, size, region->dma_addr, + region->size); + goto err; + } + } + + if (dma->nregions == dma->max_regions) { + idx = dma->max_regions; + lm_log(lm_ctx, LM_ERR, "reached maxed regions\n"); + goto err; + } + + idx = dma->nregions; + region = &dma->regions[idx]; + + page_size = fd_get_blocksize(fd); + if (page_size < 0) { + lm_log(lm_ctx, LM_ERR, "bad page size %d\n", page_size); + goto err; + } + page_size = MAX(page_size, getpagesize()); + + region->dma_addr = dma_addr; + region->size = size; + region->page_size = page_size; + region->offset = offset; + + region->fd = dup(fd); // dup the fd to get our own private copy + if (region->fd < 0) { + lm_log(lm_ctx, LM_ERR, "failed to duplicate file descriptor: %s\n", + strerror(errno)); + goto err; + } +#if 
DMA_MAP_FAST_IMPL + region->virt_addr = dma_map_region(region, PROT_READ | PROT_WRITE, + 0, region->size); + if (region->virt_addr == MAP_FAILED) { + lm_log(lm_ctx, LM_ERR, "failed to memory map DMA region %lx-%lx: %s\n", + dma_addr, dma_addr + size, strerror(errno)); + close(region->fd); + goto err; + } +#endif + + dma->nregions++; + + return idx; + +err: + return -idx - 1; +} + +static inline void +mmap_round(size_t * offset, size_t * size, size_t page_size) +{ + size_t offset_orig = *offset; + *offset = ROUND_DOWN(offset_orig, page_size); + *size = ROUND_UP(offset_orig + *size, page_size) - *offset; +} + +void * +dma_map_region(dma_memory_region_t * region, int prot, + size_t offset, size_t len) +{ + size_t mmap_offset, mmap_size = len; + char *mmap_base; + + if (offset >= region->size || offset + len > region->size) { + return MAP_FAILED; + } + + offset += region->offset; + mmap_offset = offset; + mmap_round(&mmap_offset, &mmap_size, region->page_size); + + // Do the mmap. + mmap_base = mmap(NULL, mmap_size, prot, MAP_SHARED, + region->fd, mmap_offset); + if (mmap_base == MAP_FAILED) { + return mmap_base; + } + // Do not dump. + madvise(mmap_base, mmap_size, MADV_DONTDUMP); + + return mmap_base + (offset - mmap_offset); +} + +void +dma_unmap_region(dma_memory_region_t * region, void *virt_addr, size_t len) +{ + mmap_round((size_t *) & virt_addr, &len, region->page_size); + munmap(virt_addr, len); +} + +int +_dma_addr_sg_split(lm_ctx_t * const ctx, const dma_controller_t * dma, + dma_addr_t dma_addr, uint32_t len, + dma_scattergather_t * sg, int max_sg) +{ + int idx; + int cnt = 0; + bool found = true; // Whether the current region is found. 
+ + while (found && len > 0) { + found = false; + for (idx = 0; idx < dma->nregions; idx++) { + const dma_memory_region_t *const region = &dma->regions[idx]; + const dma_addr_t region_end = region->dma_addr + region->size; + + while (dma_addr >= region->dma_addr && dma_addr < region_end) { + size_t region_len = MIN(region_end - dma_addr, len); + + if (cnt < max_sg) { + sg[cnt].region = idx; + sg[cnt].offset = dma_addr - region->dma_addr; + sg[cnt].length = region_len; + } + + cnt++; + + // dma_addr found, may need to start from the top for the + // next dma_addr. + found = true; + dma_addr += region_len; + len -= region_len; + + if (len == 0) { + goto out; + } + } + } + } + +out: + if (!found) { + // There is still a region which was not found. + assert(len > 0); + cnt = -1; + } else if (cnt > max_sg) { + cnt = -cnt - 1; + } + return cnt; +} + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/dma.h b/lib/dma.h new file mode 100644 index 0000000..80afaec --- /dev/null +++ b/lib/dma.h @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Mike Cui <cui@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#ifndef DMA_DMA_H +#define DMA_DMA_H + +/* + * This library emulates a DMA controller for a device emulation application to + * perform DMA operations on a foreign memory space. + * + * Concepts: + * - A DMA controller has its own 64-bit DMA address space. + * - Foreign memory is made available to the DMA controller in linear chunks + * called memory regions. + * - Each memory region is backed by a file descriptor and + * is registered with the DMA controllers at a unique, non-overlapping + * linear span of the DMA address space. + * - To perform DMA, the application should first build a scatter-gather + * list (sglist) of dma_scattergather_t from DMA addresses. Then the sglist + * can be mapped using dma_map_sg() into the process's virtual address space + * as an iovec for direct access, and unmapped using dma_unmap_sg() when done. + * - dma_map_addr() and dma_unmap_addr() helper functions are provided + * for mapping DMA regions that can fit into one scatter-gather entry. + * + * This library can be compiled to function in two modes as defined by the + * following macros. 
+ * - DMA_MAP_FAST (default): Every region is mapped into the application's + * virtual address space at registration time with R/W permissions. + * dma_map_sg() ignores all protection bits and only does lookups and + * returns pointers to the previously mapped regions. dma_unmap_sg() is + * effectively a no-op. + * - DMA_MAP_PROTECTED: Every call to dma_map_sg() does mmap()s and + * dma_unmap_sg() does munmap()s. All permission bits are honored. This mode + * is obviously much slower if used in the fast path. It may be useful to + * have the exta protection if the fast path does not need direct virtual + * memory access to foreign memory and data is accessed using a different + * method (e.g. RDMA, vfio-iommu). It can also be useful in debugging to + * make sure we are not writing to guest memory that's readonly for the + * device. + */ + +#ifdef DMA_MAP_PROTECTED +#undef DMA_MAP_FAST +#define DMA_MAP_FAST_IMPL 0 +#else +#define DMA_MAP_FAST_IMPL 1 +#endif + +#include <assert.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/mman.h> +#include <stdint.h> +#include <stdlib.h> + +#include "muser.h" +#include "common.h" + +typedef struct { + dma_addr_t dma_addr; // DMA address of this region + size_t size; // Size of this region + int fd; // File descriptor to mmap + int page_size; // Page size of this fd + off_t offset; // File offset +#if DMA_MAP_FAST_IMPL + void *virt_addr; // Virtual address of this region +#endif +} dma_memory_region_t; + +typedef struct { + int max_regions; + int nregions; + dma_memory_region_t regions[0]; +} dma_controller_t; + +dma_controller_t *dma_controller_create(int max_regions); +void dma_controller_destroy(lm_ctx_t * const ctx, dma_controller_t * dma); + +/* Registers a new memory region. + * Returns: + * - On success, a non-negative region number + * - On failure, a negative integer (-x - 1) where x is the region number + * where this region would have been mapped to if the call could succeed + * (e.g. 
due to conflict with existing region). + */ +int dma_controller_add_region(lm_ctx_t * const ctx, dma_controller_t * dma, + dma_addr_t dma_addr, size_t size, + int fd, off_t offset); + +int dma_controller_remove_region(dma_controller_t * dma, dma_addr_t dma_addr, + size_t size, int fd); + +// Helper for dma_addr_to_sg() slow path. +int _dma_addr_sg_split(lm_ctx_t * const ctx, const dma_controller_t * dma, + dma_addr_t dma_addr, uint32_t len, + dma_scattergather_t * sg, int max_sg); + +/* Takes a linear dma address span and returns a sg list suitable for DMA. + * A single linear dma address span may need to be split into multiple + * scatter gather regions due to limitations of how memory can be mapped. + * + * Returns: + * - On success, number of scatter gather entries created. + * - On failure: + * -1 if the dma address span is invalid + * (-x - 1) if @max_sg is too small, where x is the number of sg entries + * necessary to complete this request. + */ +static inline int +dma_addr_to_sg(lm_ctx_t * const ctx, const dma_controller_t * dma, + dma_addr_t dma_addr, uint32_t len, + dma_scattergather_t * sg, int max_sg) +{ + static __thread int region_hint; + int cnt; + + const dma_memory_region_t *const region = &dma->regions[region_hint]; + const dma_addr_t region_end = region->dma_addr + region->size; + + // Fast path: single region. + if (likely(max_sg > 0 && len > 0 && + dma_addr >= region->dma_addr && dma_addr + len <= region_end)) { + sg->region = region_hint; + sg->offset = dma_addr - region->dma_addr; + sg->length = len; + return 1; + } + // Slow path: search through regions. 
+ cnt = _dma_addr_sg_split(ctx, dma, dma_addr, len, sg, max_sg); + if (likely(cnt > 0)) { + region_hint = sg->region; + } + return cnt; +} + +void *dma_map_region(dma_memory_region_t * region, int prot, + size_t offset, size_t len); + +void dma_unmap_region(dma_memory_region_t * region, + void *virt_addr, size_t len); + +static inline int +dma_map_sg(dma_controller_t * dma, int prot, + const dma_scattergather_t * sg, struct iovec *iov, int cnt) +{ + int i; + + for (i = 0; i < cnt; i++) { + dma_memory_region_t *const region = &dma->regions[sg[i].region]; + +#if DMA_MAP_FAST_IMPL + iov[i].iov_base = (char *)region->virt_addr + sg[i].offset; +#else + iov[i].iov_base = dma_map_region(region, prot, + sg[i].offset, sg[i].length); + if (iov[i].iov_base == MAP_FAILED) { + return -1; + } +#endif + iov[i].iov_len = sg[i].length; + } + + return 0; +} + +static inline void +dma_unmap_sg(dma_controller_t * dma, + const dma_scattergather_t * sg, struct iovec *iov, int cnt) +{ + int i; + + for (i = 0; i < cnt; i++) { + dma_memory_region_t *const region = &dma->regions[sg[i].region]; + if (!DMA_MAP_FAST_IMPL) { + dma_unmap_region(region, iov[i].iov_base, iov[i].iov_len); + } + } +} + +static inline void * +dma_map_addr(lm_ctx_t * const ctx, dma_controller_t * dma, int prot, + dma_addr_t dma_addr, uint32_t len) +{ + dma_scattergather_t sg; + struct iovec iov; + + if (dma_addr_to_sg(ctx, dma, dma_addr, len, &sg, 1) == 1 && + dma_map_sg(dma, prot, &sg, &iov, 1) == 0) { + return iov.iov_base; + } + + return NULL; +} + +static inline void +dma_unmap_addr(lm_ctx_t * const ctx, dma_controller_t * dma, + dma_addr_t dma_addr, uint32_t len, void *addr) +{ + dma_scattergather_t sg; + struct iovec iov = { + .iov_base = addr, + .iov_len = len, + }; + int r; + + r = dma_addr_to_sg(ctx, dma, dma_addr, len, &sg, 1); + assert(r == 1); + + dma_unmap_sg(dma, &sg, &iov, 1); +} + +#endif /* DMA_DMA_H */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/libmuser.c 
b/lib/libmuser.c new file mode 100644 index 0000000..ba016fe --- /dev/null +++ b/lib/libmuser.c @@ -0,0 +1,1063 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + */ + +#define _GNU_SOURCE +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <stddef.h> +#include <sys/mman.h> +#include <stdarg.h> + +#include "../kmod/muser.h" +#include "muser.h" +#include "dma.h" + +typedef enum { + IRQ_NONE = 0, + IRQ_INTX, + IRQ_MSI, + IRQ_MSIX, +} irq_type_t; + +typedef struct { + irq_type_t type; /* irq type this device is using */ + int err_efd; /* eventfd for irq err */ + int req_efd; /* eventfd for irq req */ + uint32_t max_ivs; /* maximum number of ivs supported */ + int efds[0]; /* XXX must be last */ +} lm_irqs_t; + +/* + * Macro that ensures that a particular struct member is last. Doesn't work for + * flexible array members. + */ +#define MUST_BE_LAST(s, m, t) \ + _Static_assert(sizeof(s) - offsetof(s, m) == sizeof(t), \ + #t " " #m " must be last member in " #s) + +struct lm_ctx { + void *pvt; + dma_controller_t *dma; + int fd; + bool extended; + lm_fops_t fops; + lm_log_lvl_t log_lvl; + lm_log_fn_t *log; + lm_pci_info_t pci_info; + lm_pci_config_space_t *pci_config_space; + lm_irqs_t irqs; /* XXX must be last */ +}; +MUST_BE_LAST(struct lm_ctx, irqs, lm_irqs_t); + +#define LM_CTX_SIZE(irqs) (sizeof(lm_ctx_t) + sizeof(int) * irqs) +#define LM2VFIO_IRQT(type) (type - 1) + +void lm_log(const lm_ctx_t * const ctx, const lm_log_lvl_t lvl, + const char *const fmt, ...) 
+{ + va_list ap; + char buf[BUFSIZ]; + + assert(ctx); + + if (!ctx->log || lvl > ctx->log_lvl || !fmt) { + return; + } + + va_start(ap, fmt); + vsnprintf(buf, sizeof buf, fmt, ap); + va_end(ap); + ctx->log(ctx->pvt, buf); +} + +static long irqs_disable(lm_ctx_t * lm_ctx, uint32_t index) +{ + int *irq_efd = NULL; + uint32_t i; + + assert(lm_ctx != NULL); + assert(index < LM_DEV_NUM_IRQS); + + switch (index) { + case VFIO_PCI_INTX_IRQ_INDEX: + case VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_MSIX_IRQ_INDEX: + lm_ctx->irqs.type = IRQ_NONE; + for (i = 0; i < lm_ctx->irqs.max_ivs; i++) { + if (lm_ctx->irqs.efds[i] >= 0) { + (void) close(lm_ctx->irqs.efds[i]); + lm_ctx->irqs.efds[i] = -1; + } + } + return 0; + case VFIO_PCI_ERR_IRQ_INDEX: + irq_efd = &lm_ctx->irqs.err_efd; + break; + case VFIO_PCI_REQ_IRQ_INDEX: + irq_efd = &lm_ctx->irqs.req_efd; + break; + } + + if (irq_efd != NULL) { + (void)close(*irq_efd); + *irq_efd = -1; + return 0; + } + + return -EINVAL; +} + +static int irqs_set_data_none(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set) +{ + int efd, i; + long ret; + eventfd_t val; + + for (i = irq_set->start; i < irq_set->start + irq_set->count; i++) { + efd = lm_ctx->irqs.efds[i]; + if (efd >= 0) { + val = 1; + ret = eventfd_write(efd, val); + if (ret == -1) { + return -errno; + } + } + } + + return 0; +} + +static int +irqs_set_data_bool(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) +{ + uint8_t *d8; + int efd, i; + long ret; + eventfd_t val; + + assert(data != NULL); + for (i = irq_set->start, d8 = data; i < irq_set->start + irq_set->count; + i++, d8++) { + efd = lm_ctx->irqs.efds[i]; + if (efd >= 0 && *d8 == 1) { + val = 1; + ret = eventfd_write(efd, val); + if (ret == -1) { + return -errno; + } + } + } + + return 0; +} + +static int +irqs_set_data_eventfd(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) +{ + int32_t *d32; + int efd, i; + + assert(data != NULL); + for (i = irq_set->start, d32 = data; i < irq_set->start + irq_set->count; 
+ i++, d32++) { + efd = lm_ctx->irqs.efds[i]; + if (efd >= 0) { + (void) close(efd); + lm_ctx->irqs.efds[i] = -1; + } + if (*d32 >= 0) { + lm_ctx->irqs.efds[i] = *d32; + } + } + + return 0; +} + +static long +irqs_trigger(lm_ctx_t * lm_ctx, struct vfio_irq_set *irq_set, void *data) +{ + int err = 0; + + assert(lm_ctx != NULL); + assert(irq_set != NULL); + + if (irq_set->count == 0) { + return irqs_disable(lm_ctx, irq_set->index); + } + + switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { + case VFIO_IRQ_SET_DATA_NONE: + err = irqs_set_data_none(lm_ctx, irq_set); + break; + case VFIO_IRQ_SET_DATA_BOOL: + err = irqs_set_data_bool(lm_ctx, irq_set, data); + break; + case VFIO_IRQ_SET_DATA_EVENTFD: + err = irqs_set_data_eventfd(lm_ctx, irq_set, data); + break; + } + + return err; +} + +static long +dev_set_irqs_validate(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set) +{ + lm_pci_info_t *pci_info = &lm_ctx->pci_info; + uint32_t a_type, d_type; + + assert(lm_ctx != NULL); + assert(irq_set != NULL); + + // Separate action and data types from flags. + a_type = (irq_set->flags & VFIO_IRQ_SET_ACTION_TYPE_MASK); + d_type = (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK); + + // Ensure index is within bounds. + if (irq_set->index >= LM_DEV_NUM_IRQS) { + return -EINVAL; + } + + /* TODO make each condition a function */ + + // Only one of MASK/UNMASK/TRIGGER is valid. + if ((a_type != VFIO_IRQ_SET_ACTION_MASK) && + (a_type != VFIO_IRQ_SET_ACTION_UNMASK) && + (a_type != VFIO_IRQ_SET_ACTION_TRIGGER)) { + return -EINVAL; + } + // Only one of NONE/BOOL/EVENTFD is valid. + if ((d_type != VFIO_IRQ_SET_DATA_NONE) && + (d_type != VFIO_IRQ_SET_DATA_BOOL) && + (d_type != VFIO_IRQ_SET_DATA_EVENTFD)) { + return -EINVAL; + } + // Ensure irq_set's start and count are within bounds. + if ((irq_set->start >= pci_info->irq_count[irq_set->index]) || + (irq_set->start + irq_set->count > pci_info->irq_count[irq_set->index])) { + return -EINVAL; + } + // Only TRIGGER is valid for ERR/REQ. 
+ if (((irq_set->index == VFIO_PCI_ERR_IRQ_INDEX) || + (irq_set->index == VFIO_PCI_REQ_IRQ_INDEX)) && + (a_type != VFIO_IRQ_SET_ACTION_TRIGGER)) { + return -EINVAL; + } + // count == 0 is only valid with ACTION_TRIGGER and DATA_NONE. + if ((irq_set->count == 0) && ((a_type != VFIO_IRQ_SET_ACTION_TRIGGER) || + (d_type != VFIO_IRQ_SET_DATA_NONE))) { + return -EINVAL; + } + // If IRQs are set, ensure index matches what's enabled for the device. + if ((irq_set->count != 0) && (lm_ctx->irqs.type != IRQ_NONE) && + (irq_set->index != LM2VFIO_IRQT(lm_ctx->irqs.type))) { + return -EINVAL; + } + + return 0; +} + +static long +dev_set_irqs(lm_ctx_t * lm_ctx, struct vfio_irq_set *irq_set, void *data) +{ + long ret; + + assert(lm_ctx != NULL); + assert(irq_set != NULL); + + // Ensure irq_set is valid. + ret = dev_set_irqs_validate(lm_ctx, irq_set); + if (ret != 0) { + return ret; + } + + switch (irq_set->flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: // fallthrough + case VFIO_IRQ_SET_ACTION_UNMASK: + // We're always edge-triggered without un/mask support. + return 0; + } + + return irqs_trigger(lm_ctx, irq_set, data); +} + +static long dev_get_irqinfo(lm_ctx_t * lm_ctx, struct vfio_irq_info *irq_info) +{ + assert(lm_ctx != NULL); + assert(irq_info != NULL); + lm_pci_info_t *pci_info = &lm_ctx->pci_info; + + // Ensure provided argsz is sufficiently big and index is within bounds. + if ((irq_info->argsz < sizeof(struct vfio_irq_info)) || + (irq_info->index >= LM_DEV_NUM_IRQS)) { + return -EINVAL; + } + + irq_info->count = pci_info->irq_count[irq_info->index]; + irq_info->flags = VFIO_IRQ_INFO_EVENTFD; + + return 0; +} + +static long +dev_get_reginfo(lm_ctx_t * lm_ctx, struct vfio_region_info *reg_info) +{ + assert(lm_ctx != NULL); + assert(reg_info != NULL); + lm_pci_info_t *pci_info = &lm_ctx->pci_info; + + // Ensure provided argsz is sufficiently big and index is within bounds. 
+ if ((reg_info->argsz < sizeof(struct vfio_region_info)) || + (reg_info->index >= LM_DEV_NUM_REGS)) { + return -EINVAL; + } + + reg_info->offset = pci_info->reg_info[reg_info->index].offset; + reg_info->flags = pci_info->reg_info[reg_info->index].flags; + reg_info->size = pci_info->reg_info[reg_info->index].size; + + lm_log(lm_ctx, LM_DBG, "region_info[%d]\n", reg_info->index); + dump_buffer(lm_ctx, "", (unsigned char *)reg_info, sizeof *reg_info); + + return 0; +} + +static long dev_get_info(struct vfio_device_info *dev_info) +{ + assert(dev_info != NULL); + + // Ensure provided argsz is sufficiently big. + if (dev_info->argsz < sizeof(struct vfio_device_info)) { + return -EINVAL; + } + + dev_info->flags = VFIO_DEVICE_FLAGS_PCI | VFIO_DEVICE_FLAGS_RESET; + dev_info->num_regions = LM_DEV_NUM_REGS; + dev_info->num_irqs = LM_DEV_NUM_IRQS; + + return 0; +} + +static long +do_muser_ioctl(lm_ctx_t * lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data) +{ + int err = -ENOTSUP; + + assert(lm_ctx != NULL); + switch (cmd_ioctl->vfio_cmd) { + case VFIO_DEVICE_GET_INFO: + err = dev_get_info(&cmd_ioctl->data.dev_info); + break; + case VFIO_DEVICE_GET_REGION_INFO: + err = dev_get_reginfo(lm_ctx, &cmd_ioctl->data.reg_info); + break; + case VFIO_DEVICE_GET_IRQ_INFO: + err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info); + break; + case VFIO_DEVICE_SET_IRQS: + err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data); + break; + case VFIO_DEVICE_RESET: + if (lm_ctx->fops.reset) { + return lm_ctx->fops.reset(lm_ctx->pvt); + } + } + + return err; +} + +static int muser_dma_unmap(lm_ctx_t * lm_ctx, struct muser_cmd *cmd) +{ + int err; + + lm_log(lm_ctx, LM_INF, "removing DMA region %lx-%lx\n", + cmd->mmap.request.start, cmd->mmap.request.end); + + if (lm_ctx->dma == NULL) { + lm_log(lm_ctx, LM_ERR, "DMA not initialized\n"); + cmd->mmap.response.addr = -1; + return -1; + } + + err = dma_controller_remove_region(lm_ctx->dma, + cmd->mmap.request.start, + cmd->mmap.request.end - 
+ cmd->mmap.request.start, lm_ctx->fd); + if (err != 0) { + lm_log(lm_ctx, LM_ERR, "failed to remove DMA region %lx-%lx: %s\n", + cmd->mmap.request.start, cmd->mmap.request.end, strerror(err)); + } + + cmd->mmap.response.addr = err; + + return err; +} + +static int muser_dma_map(lm_ctx_t * lm_ctx, struct muser_cmd *cmd) +{ + int err; + + lm_log(lm_ctx, LM_INF, "adding DMA region %lx-%lx\n", + cmd->mmap.request.start, cmd->mmap.request.end); + + if (lm_ctx->dma == NULL) { + lm_log(lm_ctx, LM_ERR, "DMA not initialized\n"); + cmd->mmap.response.addr = -1; + return -1; + } + + if (cmd->mmap.request.start >= cmd->mmap.request.end) { + lm_log(lm_ctx, LM_ERR, "bad DMA region %lx-%lx\n", + cmd->mmap.request.start, cmd->mmap.request.end); + cmd->mmap.response.addr = -1; + return -1; + } + err = dma_controller_add_region(lm_ctx, lm_ctx->dma, + cmd->mmap.request.start, + cmd->mmap.request.end - + cmd->mmap.request.start, lm_ctx->fd, 0); + if (err < 0) { + lm_log(lm_ctx, LM_ERR, "failed to add DMA region %lx-%lx: %d\n", + cmd->mmap.request.start, cmd->mmap.request.end, err); + cmd->mmap.response.addr = -1; + return -1; + } + + // TODO: Are we just abusing response.addr as a rc? + cmd->mmap.response.addr = 0; + + return 0; +} + +static int muser_mmap(lm_ctx_t * lm_ctx, struct muser_cmd *cmd) +{ + unsigned long addr; + unsigned long start = cmd->mmap.request.start; + unsigned long end = cmd->mmap.request.end; + unsigned long pgoff = cmd->mmap.request.pgoff; + + addr = lm_ctx->fops.mmap(lm_ctx->pvt, pgoff); + cmd->mmap.response.addr = addr; + + if ((void *)addr == MAP_FAILED) { + cmd->err = -1; + return -1; + } + + return 0; +} + +static int +post_read(lm_ctx_t * const lm_ctx, struct muser_cmd *const cmd, + char *const data, const size_t offset, ssize_t ret) +{ + if (ret != cmd->rw.count) { + /* FIXME shouldn't we still reply to the kernel in case of error? 
*/ + lm_log(lm_ctx, LM_ERR, "%s: bad fops read: %d/%d, %s\n", + __func__, ret, cmd->rw.count, strerror(errno)); + return ret; + } + + /* + * TODO the kernel will first copy the command and then will use the .buf + * pointer to copy the data. Does it make sense to use writev in order to + * get rid of the .buf member? THe 1st element of the iovec will be the + * command and the 2nd the data. + */ + cmd->rw.buf = data; + ret = write(lm_ctx->fd, cmd, sizeof(*cmd)); + if ((int)ret != sizeof(*cmd)) { + lm_log(lm_ctx, LM_ERR, "%s: bad muser write: %d/%d, %s\n", + __func__, ret, sizeof(*cmd), strerror(errno)); + } + return ret; +} + +int +lm_get_region(lm_ctx_t * const lm_ctx, const loff_t pos, const size_t count, + loff_t * const off) +{ + assert(lm_ctx); + assert(off); + lm_pci_info_t *pci_info = &lm_ctx->pci_info; + + int i; + + for (i = 0; i < LM_DEV_NUM_REGS; i++) { + const lm_reg_info_t * const reg_info = &pci_info->reg_info[i]; + if (pos >= reg_info->offset) { + if (pos - reg_info->offset + count <= reg_info->size) { + *off = pos - reg_info->offset; + return i; + } + } + } + return -ENOENT; +} + +static ssize_t +do_access(lm_ctx_t * const lm_ctx, char * const buf, size_t count, loff_t pos, + const bool is_write) +{ + int idx; + loff_t offset; + int ret = -EINVAL; + lm_pci_info_t *pci_info; + + assert(lm_ctx != NULL); + assert(buf != NULL); + assert(count > 0); + + pci_info = &lm_ctx->pci_info; + idx = lm_get_region(lm_ctx, pos, count, &offset); + if (idx < 0) { + lm_log(lm_ctx, LM_ERR, "invalid region %d\n", idx); + return idx; + } + + /* + * TODO we should check at device registration time that all necessary + * callbacks are there in order to avoid having to check at runtime + */ + switch (idx) { + case LM_DEV_BAR0_REG_IDX ... 
LM_DEV_BAR5_REG_IDX: + if (pci_info->bar_fn) + return pci_info->bar_fn(lm_ctx->pvt, idx, buf, count, offset, is_write); + break; + case LM_DEV_ROM_REG_IDX: + if (pci_info->rom_fn) + return pci_info->rom_fn(lm_ctx->pvt, buf, count, offset, is_write); + break; + case LM_DEV_CFG_REG_IDX: + if (pci_info->pci_config_fn) + return pci_info->pci_config_fn(lm_ctx->pvt, buf, count, offset, + is_write); + break; + case LM_DEV_VGA_REG_IDX: + if (pci_info->vga_fn) + return pci_info->vga_fn(lm_ctx->pvt, buf, count, offset, is_write); + break; + default: + lm_log(lm_ctx, LM_ERR, "bad region %d\n", idx); + return ret; + } + + if (is_write && lm_ctx->fops.write) { + ret = lm_ctx->fops.write(lm_ctx->pvt, idx, buf, count, pos); + } else if (lm_ctx->fops.read) { + ret = lm_ctx->fops.read(lm_ctx->pvt, idx, buf, count, pos); + } else { + lm_log(lm_ctx, LM_ERR, "no R/W callback, region %d, %x@%lx\n", + idx, count, pos); + } + + return ret; +} + +/* + * TODO function name same lm_access_t, fix + */ +ssize_t +lm_access(lm_ctx_t * const lm_ctx, char *buf, size_t count, + loff_t * const ppos, const bool is_write) +{ + unsigned int done = 0; + int ret; + + assert(lm_ctx != NULL); + /* buf and ppos can be NULL if count is 0 */ + + while (count) { + size_t size; + if (count >= 8 && !(*ppos % 8)) { + size = 8; + } else if (count >= 4 && !(*ppos % 4)) { + size = 4; + } else if (count >= 2 && !(*ppos % 2)) { + size = 2; + } else { + size = 1; + } + ret = do_access(lm_ctx, buf, size, *ppos, is_write); + if (ret <= 0) { + lm_log(lm_ctx, LM_ERR, "failed to %s %lx@%llx: %s\n", + is_write ? "write" : "read", *ppos, size, strerror(-ret)); + return -EFAULT; + } + count -= size; + done += size; + *ppos += size; + buf += size; + } + return done; +} + + +static inline int +muser_access(lm_ctx_t * const lm_ctx, struct muser_cmd *const cmd, + const bool is_write) +{ + char *data; + int err; + unsigned int i; + size_t count = 0; + ssize_t ret; + + /* TODO how big do we expect count to be? 
Can we use alloca(3) instead? */ + data = calloc(1, cmd->rw.count); + if (data == NULL) { + lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n"); + return -1; + } + + lm_log(lm_ctx, LM_DBG, "%s %x@%lx\n", is_write ? "W" : "R", cmd->rw.count, + cmd->rw.pos); + + /* copy data to be written from kernel to user space */ + if (is_write) { + err = read(lm_ctx->fd, data, cmd->rw.count); + /* + * FIXME this is wrong, we should be checking for + * err != cmd->rw.count + */ + if (err < 0) { + lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s\n", + strerror(errno)); + goto out; + } + err = 0; + dump_buffer(lm_ctx, "buffer write", data, cmd->rw.count); + } + + count = cmd->rw.count; + cmd->err = muser_pci_hdr_access(lm_ctx, &cmd->rw.count, &cmd->rw.pos, + is_write, data); + if (cmd->err) { + lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %d\n", cmd->err); + } + count -= cmd->rw.count; + ret = lm_access(lm_ctx, data + count, cmd->rw.count, &cmd->rw.pos, + is_write); + if (!is_write) { + err = post_read(lm_ctx, cmd, data, count, ret); + dump_buffer(lm_ctx, "buffer read", data, cmd->rw.count); + } + +out: + free(data); + + return err; +} + +static int +muser_ioctl(lm_ctx_t * lm_ctx, struct muser_cmd *cmd) +{ + void *data = NULL; + size_t size = 0; + int ret; + + /* TODO make this a function that returns the size */ + if (cmd->ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS) { + uint32_t flags = cmd->ioctl.data.irq_set.flags; + switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) { + case VFIO_IRQ_SET_DATA_EVENTFD: + size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count; + break; + case VFIO_IRQ_SET_DATA_BOOL: + size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count; + break; + } + } + + if (size != 0) { + data = calloc(1, size); + if (data == NULL) { +#ifdef DEBUG + perror("calloc"); +#endif + return -1; + } + + ret = read(lm_ctx->fd, data, size); + if (ret < 0) { +#ifdef DEBUG + perror("read failed"); +#endif + goto out; + } + } + + ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, 
data); + +out: + + free(data); + return ret; +} + +static int drive_loop(lm_ctx_t *lm_ctx) +{ + struct muser_cmd cmd = { 0 }; + int err; + size_t size; + unsigned int i; + + do { + err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd); + if (err < 0) { + return err; + } + + switch (cmd.type) { + case MUSER_IOCTL: + err = muser_ioctl(lm_ctx, &cmd); + break; + case MUSER_READ: + case MUSER_WRITE: + err = muser_access(lm_ctx, &cmd, cmd.type == MUSER_WRITE); + break; + case MUSER_MMAP: + err = muser_mmap(lm_ctx, &cmd); + break; + case MUSER_DMA_MMAP: + err = muser_dma_map(lm_ctx, &cmd); + break; + case MUSER_DMA_MUNMAP: + err = muser_dma_unmap(lm_ctx, &cmd); + break; + default: + lm_log(lm_ctx, LM_ERR, "bad command %d\n", cmd.type); + continue; + } + cmd.err = err; + err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_DONE, &cmd); + if (err < 0) { + lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n", + strerror(errno)); + } + // TODO: Figure out a clean way to get out of the loop. + } while (1); + + return err; +} + +int +lm_ctx_drive(lm_ctx_t * lm_ctx) +{ + + if (lm_ctx == NULL) { + errno = EINVAL; + return -1; + } + + return drive_loop(lm_ctx); +} + +static int +dev_detach(int dev_fd) +{ + return close(dev_fd); +} + +static int +dev_attach(const char *uuid) +{ + char *path; + int dev_fd; + int err; + + err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", uuid); + if (err != (int)(strlen(MUSER_DEVNODE) + strlen(uuid) + 6)) { + return -1; + } + + dev_fd = open(path, O_RDWR); + + free(path); + + return dev_fd; +} + +void * +lm_mmap(lm_ctx_t * lm_ctx, size_t length, off_t offset) +{ + off_t lm_off; + + if ((lm_ctx == NULL) || (length == 0) || !PAGE_ALIGNED(offset)) { + errno = EINVAL; + return MAP_FAILED; + } + + lm_off = offset | BIT(63); + return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, + lm_ctx->fd, lm_off); +} + +int +lm_irq_trigger(lm_ctx_t * lm_ctx, uint32_t vector) +{ + eventfd_t val = 1; + + if ((lm_ctx == NULL) || (vector >= lm_ctx->irqs.max_ivs)) { + errno = 
EINVAL; + return -1; + } + + if (lm_ctx->irqs.efds[vector] == -1) { + errno = ENOENT; + return -1; + } + + return eventfd_write(lm_ctx->irqs.efds[vector], val); +} + +void +lm_ctx_destroy(lm_ctx_t * lm_ctx) +{ + if (lm_ctx == NULL) { + return; + } + + free(lm_ctx->pci_config_space); + dev_detach(lm_ctx->fd); + if (lm_ctx->dma != NULL) { + dma_controller_destroy(lm_ctx, lm_ctx->dma); + } + free(lm_ctx); + // FIXME: Maybe close any open irq efds? Unmap stuff? +} + +static void +init_pci_hdr(lm_pci_hdr_t * const hdr, const lm_pci_hdr_id_t * const id, + const lm_pci_hdr_cc_t * const cc) +{ + assert(hdr); + assert(id); + assert(cc); + + hdr->id = *id; + hdr->cc = *cc; + + hdr->ss.vid = hdr->id.vid; + hdr->ss.sid = hdr->id.did; +} + +lm_ctx_t * +lm_ctx_create(lm_dev_info_t * const dev_info) +{ + lm_ctx_t *lm_ctx; + uint32_t max_ivs = 0; + uint32_t i; + int err = 0; + size_t size; + + if (dev_info == NULL) { + err = EINVAL; + goto out; + } + + for (i = 0; i < LM_DEV_NUM_IRQS; i++) { + if (max_ivs < dev_info->pci_info.irq_count[i]) { + max_ivs = dev_info->pci_info.irq_count[i]; + } + } + + lm_ctx = calloc(1, LM_CTX_SIZE(max_ivs)); + if (lm_ctx == NULL) { + err = errno; + goto out; + } + + memcpy(&lm_ctx->pci_info, &dev_info->pci_info, sizeof(lm_pci_info_t)); + + lm_ctx->fd = dev_attach(dev_info->uuid); + if (lm_ctx->fd == -1) { + err = errno; + goto out; + } + + if (dev_info->nr_dma_regions > 0) { + lm_ctx->dma = dma_controller_create(dev_info->nr_dma_regions); + if (lm_ctx->dma == NULL) { + err = errno; + goto out; + } + } + + lm_ctx->pci_info.irq_count[LM_DEV_ERR_IRQ_IDX] = 1; + lm_ctx->pci_info.irq_count[LM_DEV_REQ_IRQ_IDX] = 1; + + lm_ctx->extended = dev_info->extended; + if (lm_ctx->extended) { + size = PCI_EXTENDED_CONFIG_SPACE_SIZEOF; + } else { + size = PCI_CONFIG_SPACE_SIZEOF; + } + lm_ctx->pci_config_space = calloc(PCI_EXTENDED_CONFIG_SPACE_SIZEOF, 1); + if (!lm_ctx->pci_config_space) { + err = errno; + goto out; + } + + 
init_pci_hdr(&lm_ctx->pci_config_space->hdr, &dev_info->id, &dev_info->cc); + for (i = 0; i < ARRAY_SIZE(lm_ctx->pci_config_space->hdr.bars); i++) { + if ((dev_info->pci_info.reg_info[i].flags & LM_REG_FLAG_MEM) == 0) { + lm_ctx->pci_config_space->hdr.bars[i].io.region_type |= 0x1; + } + } + + lm_ctx->fops = dev_info->fops; + lm_ctx->pvt = dev_info->pvt; + + for (i = 0; i < max_ivs; i++) { + lm_ctx->irqs.efds[i] = -1; + } + lm_ctx->irqs.err_efd = -1; + lm_ctx->irqs.req_efd = -1; + lm_ctx->irqs.type = IRQ_NONE; + lm_ctx->irqs.max_ivs = max_ivs; + + lm_ctx->log = dev_info->log; + lm_ctx->log_lvl = dev_info->log_lvl; + + lm_ctx->pci_info.bar_fn = dev_info->pci_info.bar_fn; + lm_ctx->pci_info.rom_fn = dev_info->pci_info.rom_fn; + lm_ctx->pci_info.pci_config_fn = dev_info->pci_info.pci_config_fn; + lm_ctx->pci_info.vga_fn = dev_info->pci_info.vga_fn; + +out: + if (err) { + if (lm_ctx) { + dev_detach(lm_ctx->fd); + free(lm_ctx->pci_config_space); + free(lm_ctx); + lm_ctx = NULL; + } + errno = err; + } + return lm_ctx; +} + +void +dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix, + unsigned char const *const buf, const uint32_t count) +{ +#ifdef DEBUG + int i; + const size_t bytes_per_line = 0x8; + + if (strcmp(prefix, "")) { + lm_log(lm_ctx, LM_DBG, "%s\n", prefix); + } + for (i = 0; i < (int)count; i++) { + if (i % bytes_per_line != 0) { + lm_log(lm_ctx, LM_DBG, " "); + } + /* TODO valgrind emits a warning if count is 1 */ + lm_log(lm_ctx, LM_DBG, "0x%02x", *(buf + i)); + if ((i + 1) % bytes_per_line == 0) { + lm_log(lm_ctx, LM_DBG, "\n"); + } + } + if (i % bytes_per_line != 0) { + lm_log(lm_ctx, LM_DBG, "\n"); + } +#endif +} + +/* + * Returns a pointer to the standard part of the PCI configuration space. + */ +inline lm_pci_config_space_t * +lm_get_pci_config_space(lm_ctx_t * const lm_ctx) +{ + assert(lm_ctx != NULL); + return lm_ctx->pci_config_space; +} + +/* + * Returns a pointer to the non-standard part of the PCI configuration space. 
+ */ +inline uint8_t * +lm_get_pci_non_std_config_space(lm_ctx_t * const lm_ctx) +{ + assert(lm_ctx != NULL); + return (uint8_t *) & lm_ctx->pci_config_space->non_std; +} + +inline lm_reg_info_t * +lm_get_region_info(lm_ctx_t * const lm_ctx) +{ + assert(lm_ctx != NULL); + return lm_ctx->pci_info.reg_info; +} + +inline int +lm_addr_to_sg(lm_ctx_t * const lm_ctx, dma_addr_t dma_addr, + uint32_t len, dma_scattergather_t * sg, int max_sg) +{ + return dma_addr_to_sg(lm_ctx, lm_ctx->dma, dma_addr, len, sg, max_sg); +} + +inline int +lm_map_sg(lm_ctx_t * const lm_ctx, int prot, + const dma_scattergather_t * sg, struct iovec *iov, int cnt) +{ + return dma_map_sg(lm_ctx->dma, prot, sg, iov, cnt); +} + +inline void +lm_unmap_sg(lm_ctx_t * const lm_ctx, const dma_scattergather_t * sg, + struct iovec *iov, int cnt) +{ + return dma_unmap_sg(lm_ctx->dma, sg, iov, cnt); +} + +int +lm_ctx_run(lm_ctx_t * const lm_ctx) +{ + int ret = lm_ctx_drive(lm_ctx); + + lm_ctx_destroy(lm_ctx); + return ret; +} + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/libmuser_pci.c b/lib/libmuser_pci.c new file mode 100644 index 0000000..df45336 --- /dev/null +++ b/lib/libmuser_pci.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#include <stdio.h> +#include <assert.h> +#include <string.h> +#include <sys/param.h> +#include <errno.h> + +#include <linux/pci_regs.h> +#include <linux/vfio.h> + +#include "muser.h" +#include "pci.h" +#include "common.h" + +static inline void +muser_pci_hdr_write_bar(lm_ctx_t * const lm_ctx, const uint16_t bar_index, + const char *const buf) +{ + uint32_t cfg_addr; + uint32_t *bar; + unsigned long mask; + lm_reg_info_t *reg_info = lm_get_region_info(lm_ctx); + + assert(lm_ctx); + + if (reg_info[bar_index].size == 0) { + return; + } + + bar = (uint32_t *) & lm_get_pci_config_space(lm_ctx)->hdr.bars[bar_index]; + cfg_addr = *(uint32_t *) buf; + + lm_log(lm_ctx, LM_DBG, "BAR%d addr 0x%x\n", bar_index, cfg_addr); + + if (cfg_addr == 0xffffffff) { + cfg_addr = ~(reg_info[bar_index].size) + 1; + } + + if ((reg_info[bar_index].flags & LM_REG_FLAG_MEM)) { + mask = PCI_BASE_ADDRESS_MEM_MASK; + } else { + mask = PCI_BASE_ADDRESS_IO_MASK; + } + cfg_addr |= (*bar & ~mask); + + *bar = htole32(cfg_addr); +} + 
+#define BAR_INDEX(offset) ((offset - PCI_BASE_ADDRESS_0) >> 2) + +static int +handle_command_write(lm_ctx_t * const ctx, lm_pci_config_space_t * const pci, + const char * const buf, const size_t count) +{ + uint16_t v; + + assert(ctx); + + if (count != 2) { + lm_log(ctx, LM_ERR, "bad write command size %d\n", count); + return -EINVAL; + } + + assert(pci); + assert(buf); + + v = *(uint16_t*)buf; + + if ((v & PCI_COMMAND_IO) == PCI_COMMAND_IO) { + if (!pci->hdr.cmd.iose) { + pci->hdr.cmd.iose = 0x1; + lm_log(ctx, LM_INF, "I/O space enabled\n"); + } + v &= ~PCI_COMMAND_IO; + } else { + if (pci->hdr.cmd.iose) { + pci->hdr.cmd.iose = 0x0; + lm_log(ctx, LM_INF, "I/O space disabled\n"); + } + } + + if ((v & PCI_COMMAND_MEMORY) == PCI_COMMAND_MEMORY) { + if (!pci->hdr.cmd.mse) { + pci->hdr.cmd.mse = 0x1; + lm_log(ctx, LM_INF, "memory space enabled\n"); + } + v &= ~PCI_COMMAND_MEMORY; + } else { + if (pci->hdr.cmd.mse) { + pci->hdr.cmd.mse = 0x0; + lm_log(ctx, LM_INF, "memory space disabled\n"); + } + } + + if ((v & PCI_COMMAND_MASTER) == PCI_COMMAND_MASTER) { + if (!pci->hdr.cmd.bme) { + pci->hdr.cmd.bme = 0x1; + lm_log(ctx, LM_INF, "bus master enabled\n"); + } + v &= ~PCI_COMMAND_MASTER; + } else { + if (pci->hdr.cmd.bme) { + pci->hdr.cmd.bme = 0x0; + lm_log(ctx, LM_INF, "bus master disabled\n"); + } + } + + if ((v & PCI_COMMAND_SERR) == PCI_COMMAND_SERR) { + if (!pci->hdr.cmd.see) { + pci->hdr.cmd.see = 0x1; + lm_log(ctx, LM_INF, "SERR# enabled\n"); + } + v &= ~PCI_COMMAND_SERR; + } else { + if (pci->hdr.cmd.see) { + pci->hdr.cmd.see = 0x0; + lm_log(ctx, LM_INF, "SERR# disabled\n"); + } + } + + if ((v & PCI_COMMAND_INTX_DISABLE) == PCI_COMMAND_INTX_DISABLE) { + if (!pci->hdr.cmd.id) { + pci->hdr.cmd.id = 0x1; + lm_log(ctx, LM_INF, "INTx emulation enabled\n"); + } + v &= ~PCI_COMMAND_INTX_DISABLE; + } else { + if (pci->hdr.cmd.id) { + pci->hdr.cmd.id = 0x0; + lm_log(ctx, LM_INF, "INTx emulation disabled\n"); + } + } + + if (v) { + lm_log(ctx, LM_ERR, "unconsumed command 
flags %x\n", v); + return -EINVAL; + } + + return 0; +} + +static int +handle_erom_write(lm_ctx_t * const ctx, lm_pci_config_space_t * const pci, + const char *const buf, const size_t count) +{ + uint32_t v; + + assert(ctx); + assert(pci); + + if (count != 0x4) { + lm_log(ctx, LM_ERR, "bad EROM count %d\n", count); + return -EINVAL; + } + v = *(uint32_t*)buf; + + if (v == (uint32_t)PCI_ROM_ADDRESS_MASK) { + lm_log(ctx, LM_INF, "write mask to EROM ignored\n"); + } else if (v == 0) { + lm_log(ctx, LM_INF, "cleared EROM\n"); + pci->hdr.erom = 0; + } else if (v == ~PCI_ROM_ADDRESS_ENABLE) { + lm_log(ctx, LM_INF, "EROM disable ignored\n"); + } else { + lm_log(ctx, LM_ERR, "bad write to EROM 0x%x bytes\n", v); + return -EINVAL; + } + return 0; +} + +static inline int +muser_pci_hdr_write(lm_ctx_t * const lm_ctx, const uint16_t offset, + const char *const buf, const size_t count) +{ + uint32_t *bar; + lm_pci_config_space_t *pci; + int ret = 0; + + assert(lm_ctx); + assert(buf); + + pci = lm_get_pci_config_space(lm_ctx); + + switch (offset) { + case PCI_COMMAND: + ret = handle_command_write(lm_ctx, pci, buf, count); + break; + case PCI_STATUS: + lm_log(lm_ctx, LM_INF, "write to status ignored\n"); + break; + case PCI_INTERRUPT_PIN: + lm_log(lm_ctx, LM_ERR, "attempt to write read-only field IPIN\n"); + ret = -EINVAL; + break; + case PCI_INTERRUPT_LINE: + pci->hdr.intr.iline = buf[0]; + break; + case PCI_LATENCY_TIMER: + pci->hdr.mlt = (uint8_t)buf[0]; + lm_log(lm_ctx, LM_INF, "set to latency timer to %hhx\n", pci->hdr.mlt); + break; + case PCI_BASE_ADDRESS_0: + case PCI_BASE_ADDRESS_1: + case PCI_BASE_ADDRESS_2: + case PCI_BASE_ADDRESS_3: + case PCI_BASE_ADDRESS_4: + case PCI_BASE_ADDRESS_5: + muser_pci_hdr_write_bar(lm_ctx, BAR_INDEX(offset), buf); + break; + case PCI_ROM_ADDRESS: + ret = handle_erom_write(lm_ctx, pci, buf, count); + break; + default: + lm_log(lm_ctx, LM_INF, "PCI config write %x@%x not handled\n", + count, offset); + ret = -EINVAL; + } + + 
dump_buffer(lm_ctx, "PCI header", pci->hdr.raw, 0xff); + + return ret; +} + +/* + * @pci_hdr: the PCI header + * @reg_info: region info + * @rw: the command + * @write: whether this is a PCI header write + * @count: output parameter that receives the number of bytes read/written + */ +static inline int +muser_do_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count, + loff_t * const pos, const bool is_write, + unsigned char *const buf) +{ + size_t _count; + loff_t _pos; + int err = 0; + + assert(lm_ctx); + assert(count); + assert(pos); + assert(buf); + + _pos = *pos - lm_get_region_info(lm_ctx)[LM_DEV_CFG_REG_IDX].offset; + _count = MIN(*count, PCI_STD_HEADER_SIZEOF - _pos); + + if (is_write) { + err = muser_pci_hdr_write(lm_ctx, _pos, buf, _count); + } else { + memcpy(buf, lm_get_pci_config_space(lm_ctx)->hdr.raw + _pos, _count); + } + *pos += _count; + *count -= _count; + return err; +} + +static inline bool +muser_is_pci_hdr_access(const lm_reg_info_t * const reg_info, const loff_t pos) +{ + const off_t off = (loff_t) reg_info[LM_DEV_CFG_REG_IDX].offset; + return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF; +} + +int +muser_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count, + loff_t * const pos, const bool is_write, + unsigned char *const buf) +{ + assert(lm_ctx); + assert(count); + assert(pos); + + if (!muser_is_pci_hdr_access(lm_get_region_info(lm_ctx), *pos)) { + return 0; + } + return muser_do_pci_hdr_access(lm_ctx, count, pos, is_write, buf); +} + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/msicap.h b/lib/msicap.h new file mode 100644 index 0000000..bfcf1cd --- /dev/null +++ b/lib/msicap.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. 
+ * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + */ + +struct mid { + unsigned int next:8; + unsigned int cid:8; +} __attribute__ ((packed)); +_Static_assert(sizeof(struct mid) == 0x2, "bad MID size"); + +struct mc { + unsigned int msie:1; + unsigned int mmc:3; + unsigned int mme:3; + unsigned int c64:1; + unsigned int pvm:1; + unsigned int res1:7; +} __attribute__ ((packed)); +_Static_assert(sizeof(struct mc) == 0x2, "bad MC size"); + +struct ma { + unsigned int res1:2; + unsigned int addr:30; +} __attribute__ ((packed)); +_Static_assert(sizeof(struct ma) == 0x4, "bad MA size"); + +struct msicap { + struct mid mid; + struct mc mc; + struct ma ma; + uint32_t mua; + uint16_t md; + uint16_t padding; + uint32_t mmask; + uint32_t mpend; +} __attribute__ ((packed)); +_Static_assert(sizeof(struct msicap) == 0x18, "bad MSICAP size"); + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/muser.h b/lib/muser.h new file mode 100644 index 0000000..a844f5c --- /dev/null +++ b/lib/muser.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#ifndef LIB_MUSER_H +#define LIB_MUSER_H + +#include <stdint.h> +#include <sys/uio.h> +#include <unistd.h> + +#include "pci.h" + +/** + * lm_fops_t - driver callbacks + * + * @read: read device configuration space + * @write: write device configuration space + * @mmap: mmap device configuration space + * @reset: reset the device + */ +typedef struct { + ssize_t (*read) (void *pvt, const int index, char *buf, size_t count, + loff_t pos); + ssize_t (*write) (void *pvt, const int index, char *buf, size_t count, + loff_t pos); + unsigned long (*mmap) (void *pvt, unsigned long pgoff); + int (*reset) (void *pvt); +} lm_fops_t; + + +/** + * Callback function signatures for each regions. + * + * @lm_bar_access_t: typedef for BAR access function. + * @lm_non_bar_access_t: typedef for non-BAR(rom, pci config, + * vga) access functions. 
+ */ +typedef ssize_t (lm_bar_access_t) (void *pvt, const int region_index, + char * const buf, size_t count, + loff_t offset, const bool is_write); +typedef ssize_t (lm_non_bar_access_t) (void *pvt, char * const buf, + size_t count, loff_t offset, + const bool is_write); +typedef struct { + uint32_t irq_count[LM_DEV_NUM_IRQS]; + lm_reg_info_t reg_info[LM_DEV_NUM_REGS]; + + /* Optional PCI region access callbacks. */ + lm_bar_access_t *bar_fn; + lm_non_bar_access_t *rom_fn; + lm_non_bar_access_t *pci_config_fn; + lm_non_bar_access_t *vga_fn; +} lm_pci_info_t; + +/** + * Callback function signature for log function + * + * @lm_log_fn_t: typedef for log function. + */ +typedef void (lm_log_fn_t) (void *pvt, const char *const msg); + +/** + * Device information structure, used to create the lm_ctx. + * To be filled and passed to lm_ctx_run() + */ +typedef struct { + char *uuid; + void *pvt; + /* + * whether an extended PCI configuration space should be created + */ + bool extended; + int nr_dma_regions; + lm_log_fn_t *log; + lm_log_lvl_t log_lvl; + lm_fops_t fops; + lm_pci_hdr_id_t id; + lm_pci_hdr_cc_t cc; + lm_pci_info_t pci_info; +} lm_dev_info_t; + +/** + * Creates libmuser context. + * + * Arguments: + * @dev_info: device information used to create the context. + */ +lm_ctx_t *lm_ctx_create(lm_dev_info_t * dev_info); + +/** + * Destroys libmuser context. + * + * Arguments: + * @lm_ctx: libmuser context to destroy. + */ +void lm_ctx_destroy(lm_ctx_t * lm_ctx); + +/** + * Once the lm_ctx is configured lm_ctx_drive() drives it. This function waits + * for commands comming from muser.ko and then processes it.. + * + * Arguments: + * @lm_ctx: libmuser context to drive. + */ + +int lm_ctx_drive(lm_ctx_t * lm_ctx); + + +/** + * Creates mapping of BAR's into the callers vmem. It should be called from + * lm_fops_t->mmap. + * + * Arguments: + * @lm_ctx: libmuser context to create mapping from. 
+ */ +void *lm_mmap(lm_ctx_t * lm_ctx, size_t length, off_t offset); + +/** + * Trigger interrupt. + * + * Arguments: + * @lm_ctx: libmuser context to trigger interrupt. + * @vector: vector to tirgger interrupt on. + */ +int lm_irq_trigger(lm_ctx_t * lm_ctx, uint32_t vector); + +/* Helper functions */ + +int lm_ctx_run(lm_ctx_t * const ctx); + +uint8_t *lm_get_pci_non_std_config_space(lm_ctx_t * const lm_ctx); + +int lm_addr_to_sg(lm_ctx_t * const ctx, dma_addr_t dma_addr, uint32_t len, + dma_scattergather_t * sg, int max_sg); + +int +lm_map_sg(lm_ctx_t * const ctx, int prot, const dma_scattergather_t * sg, + struct iovec *iov, int cnt); + +void +lm_unmap_sg(lm_ctx_t * const ctx, const dma_scattergather_t * sg, + struct iovec *iov, int cnt); + +int +lm_get_region(lm_ctx_t * const ctx, const loff_t pos, + const size_t count, loff_t * const off); + +#ifdef DEBUG +void +dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix, + unsigned char const *const buf, const uint32_t count); +#endif + +#endif /* LIB_MUSER_H */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/pci.h b/lib/pci.h new file mode 100644 index 0000000..4b7132a --- /dev/null +++ b/lib/pci.h @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#ifndef LIBMUSER_PCI_H +#define LIBMUSER_PCI_H + +#include <stdint.h> +#include <stdbool.h> + +#include <linux/pci_regs.h> + +struct lm_ctx; +typedef struct lm_ctx lm_ctx_t; + +typedef uint64_t dma_addr_t; + +typedef struct { + int region; + int length; + uint64_t offset; +} dma_scattergather_t; + +typedef struct lm_ctx lm_ctx_t; +typedef struct lm_reg_info lm_reg_info_t; +typedef struct lm_pci_config_space lm_pci_config_space_t; + +typedef enum { + LM_ERR, + LM_INF, + LM_DBG +} lm_log_lvl_t; + +#define PCI_CONFIG_SPACE_SIZEOF 0x100 +#define PCI_EXTENDED_CONFIG_SPACE_SIZEOF 0x1000 + +enum { + LM_DEV_BAR0_REG_IDX, + LM_DEV_BAR1_REG_IDX, + LM_DEV_BAR2_REG_IDX, + LM_DEV_BAR3_REG_IDX, + LM_DEV_BAR4_REG_IDX, + LM_DEV_BAR5_REG_IDX, + LM_DEV_ROM_REG_IDX, + LM_DEV_CFG_REG_IDX, + LM_DEV_VGA_REG_IDX, + LM_DEV_NUM_REGS = 9 +}; + +/* + * TODO lots of the sizes of each member are defined in pci_regs.h, use those + * instead? 
+ */ + +typedef union { + uint32_t raw; + struct { + uint16_t vid; + uint16_t sid; + } __attribute__ ((packed)); +} __attribute__ ((packed)) lm_pci_hdr_ss_t; +_Static_assert(sizeof(lm_pci_hdr_ss_t) == 0x4, "bad SS size"); + +typedef union { + uint8_t raw; +} __attribute__ ((packed)) lm_pci_hdr_bist_t; +_Static_assert(sizeof(lm_pci_hdr_bist_t) == 0x1, "bad BIST size"); + +typedef union { + uint32_t raw; + union { + struct { + unsigned int region_type:1; + unsigned int locatable:2; + unsigned int prefetchable:1; + unsigned int base_address:28; + } __attribute__ ((packed)) mem; + struct { + unsigned int region_type:1; + unsigned int reserved:1; + unsigned int base_address:30; + } __attribute__ ((packed)) io; + } __attribute__ ((packed)); +} __attribute__ ((packed)) lm_bar_t; +_Static_assert(sizeof(lm_bar_t) == 0x4, "bad BAR size"); + +typedef union { + uint8_t raw; +} __attribute__ ((packed)) lm_pci_hdr_htype_t; +_Static_assert(sizeof(lm_pci_hdr_htype_t) == 0x1, "bad HTYPE size"); + +typedef union { + uint8_t raw[3]; + struct { + uint8_t pi; + uint8_t scc; + uint8_t bcc; + } __attribute__ ((packed)); +} __attribute__ ((packed)) lm_pci_hdr_cc_t; +_Static_assert(sizeof(lm_pci_hdr_cc_t) == 0x3, "bad CC size"); + +/* device status */ +typedef union { + uint16_t raw; + struct { + unsigned int res1:3; + unsigned int is:1; + unsigned int cl:1; + unsigned int c66:1; + unsigned int res2:1; + unsigned int fbc:1; + unsigned int dpd:1; + unsigned int devt:2; + unsigned int sta:1; + unsigned int rta:1; + unsigned int rma:1; + unsigned int sse:1; + unsigned int dpe:1; + } __attribute__ ((packed)); +} __attribute__ ((packed)) lm_pci_hdr_sts_t; +_Static_assert(sizeof(lm_pci_hdr_sts_t) == 0x2, "bad STS size"); + +typedef union { + uint16_t raw; + struct { + uint8_t iose:1; + uint8_t mse:1; + uint8_t bme:1; + uint8_t sce:1; + uint8_t mwie:1; + uint8_t vga:1; + uint8_t pee:1; + uint8_t zero:1; + uint8_t see:1; + uint8_t fbe:1; + uint8_t id:1; + uint8_t res1:5; + } __attribute__ 
((packed)); +} __attribute__ ((packed)) lm_pci_hdr_cmd_t; +_Static_assert(sizeof(lm_pci_hdr_cmd_t) == 0x2, "bad CMD size"); + +typedef union { + uint32_t raw; + struct { + uint16_t vid; + uint16_t did; + } __attribute__ ((packed)); +} __attribute__ ((packed)) lm_pci_hdr_id_t; +_Static_assert(sizeof(lm_pci_hdr_id_t) == 0x4, "bad ID size"); + +typedef union { + uint16_t raw; + struct { + uint8_t iline; + uint8_t ipin; + } __attribute__ ((packed)); +} __attribute__ ((packed)) lm_pci_hdr_intr_t; +_Static_assert(sizeof(lm_pci_hdr_intr_t) == 0x2, "bad INTR size"); + +typedef union { + uint8_t raw[PCI_STD_HEADER_SIZEOF]; + struct { + lm_pci_hdr_id_t id; + lm_pci_hdr_cmd_t cmd; + lm_pci_hdr_sts_t sts; + uint8_t rid; + lm_pci_hdr_cc_t cc; + uint8_t cls; + uint8_t mlt; + lm_pci_hdr_htype_t htype; + lm_pci_hdr_bist_t bist; +#define PCI_BARS_NR 6 + lm_bar_t bars[PCI_BARS_NR]; + uint32_t ccptr; + lm_pci_hdr_ss_t ss; + uint32_t erom; + uint8_t cap; + uint8_t res1[7]; + lm_pci_hdr_intr_t intr; + uint8_t mgnt; + uint8_t mlat; + } __attribute__ ((packed)); +} __attribute__ ((packed)) lm_pci_hdr_t; +_Static_assert(sizeof(lm_pci_hdr_t) == 0x40, "bad PCI header size"); + +typedef struct { + uint8_t raw[PCI_CONFIG_SPACE_SIZEOF - PCI_STD_HEADER_SIZEOF]; +} __attribute__ ((packed)) lm_pci_non_std_config_space_t; +_Static_assert(sizeof(lm_pci_non_std_config_space_t) == 0xc0, + "bad non-standard PCI configuration space size"); + +struct lm_pci_config_space { + union { + uint8_t raw[PCI_CONFIG_SPACE_SIZEOF]; + struct { + lm_pci_hdr_t hdr; + lm_pci_non_std_config_space_t non_std; + } __attribute__ ((packed)); + } __attribute__ ((packed)); + uint8_t extended[]; +} __attribute__ ((packed)); +_Static_assert(sizeof(struct lm_pci_config_space) == 0x100, + "bad PCI configuration space size"); + +// Region flags. +#define LM_REG_FLAG_READ (1 << 0) +#define LM_REG_FLAG_WRITE (1 << 1) +#define LM_REG_FLAG_MMAP (1 << 2) // TODO: how this relates to IO bar? 
+#define LM_REG_FLAG_RW (LM_REG_FLAG_READ | LM_REG_FLAG_WRITE) +#define LM_REG_FLAG_MEM (1 << 3) // if unset, bar is IO + +struct lm_reg_info { + uint32_t flags; + uint32_t size; + uint64_t offset; +}; + +enum { + LM_DEV_INTX_IRQ_IDX, + LM_DEV_MSI_IRQ_IDX, + LM_DEV_MSIX_IRQ_IDX, + LM_DEV_ERR_IRQ_IDX, + LM_DEV_REQ_IRQ_IDX, + LM_DEV_NUM_IRQS = 5 +}; + +/* + * Returns a pointer to the non-standard part of the PCI configuration space. + */ +lm_pci_config_space_t *lm_get_pci_config_space(lm_ctx_t * const lm_ctx); + +lm_reg_info_t *lm_get_region_info(lm_ctx_t * const lm_ctx); + +/* + * TODO the rest of these functions don't need to be public, put them in a + * private header file so libmuser.c can use them. + * TODO replace the "muser" prefix + */ +int +muser_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count, + loff_t * const pos, const bool write, + unsigned char *const buf); + + + +#endif /* LIBMUSER_PCI_H */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/pmcap.h b/lib/pmcap.h new file mode 100644 index 0000000..2757a3e --- /dev/null +++ b/lib/pmcap.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +struct pid { + unsigned int cid:8; + unsigned int next:8; +} __attribute__((packed)); +_Static_assert(sizeof(struct pid) == 0x2, "bad PID size"); + +struct pc { + unsigned int vs:3; + unsigned int pmec:1; + unsigned int res:1; + unsigned int dsi:1; + unsigned int auxc:3; + unsigned int d1s:1; + unsigned int d2s:1; + unsigned int psup:5; +} __attribute__((packed)); +_Static_assert(sizeof(struct pc) == 0x2, "bad PC size"); + +struct pmcs { + unsigned int ps:2; + unsigned int res1:1; + unsigned int nsfrst:1; + unsigned int res2:4; + unsigned int pmee:1; + unsigned int dse:4; + unsigned int dsc:2; + unsigned int pmes:1; +} __attribute__((packed)); +_Static_assert(sizeof(struct pmcs) == 0x2, "bad PMCS size"); + +struct pmcap { + struct pid pid; + struct pc pc; + struct pmcs pmcs; +} __attribute__((packed)) __attribute__ ((aligned(8))); +_Static_assert(sizeof(struct pmcap) == 0x8, "bad PMCAP size"); + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/pxcap.h b/lib/pxcap.h new file mode 100644 index
0000000..fbea685 --- /dev/null +++ b/lib/pxcap.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2019 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + */ + +struct pxid { + unsigned int cid:8; + unsigned int next:8; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxid) == 0x2, "bad PXID size"); + +struct pxcap { + unsigned int ver:4; + unsigned int dpt:4; + unsigned int si:1; + unsigned int imn:5; + unsigned int res1:2; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxcap) == 0x2, "bad PXCAP size"); + +struct pxdcap { + unsigned int mps:3; + unsigned int pfs:2; + unsigned int etfs:1; + unsigned int l0sl:3; + unsigned int l1l:3; + unsigned int per:1; + unsigned int res1:2; + unsigned int csplv:8; + unsigned int cspls:2; + unsigned int flrc:1; + unsigned int res2:3; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxdcap) == 0x4, "bad PXDCAP size"); + +union pxdc { + uint16_t raw; + struct { + unsigned int cere:1; + unsigned int nfere:1; + unsigned int fere:1; + unsigned int urre:1; + unsigned int ero:1; + unsigned int mps:3; + unsigned int ete:1; + unsigned int pfe:1; + unsigned int appme:1; + unsigned int ens:1; + unsigned int mrrs:3; + unsigned int iflr:1; + } __attribute__((packed)); +} __attribute__((packed)); +_Static_assert(sizeof(union pxdc) == 0x2, "bad PXDC size"); + +/* TODO not defining for now since all values are 0 for reset */ +struct pxds { + unsigned int stuff:16; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxds) == 0x2, "bad PXDS size"); + +struct pxlcap { + unsigned int stuff:32; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxlcap) == 0x4, "bad PXLCAP size"); + +struct pxlc { + unsigned int stuff:16; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxlc) == 0x2, "bad PXLC size"); + +struct pxls { + unsigned int stuff:16; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxls) == 0x2, "bad PXLS size"); + +struct pxdcap2 { + unsigned int ctrs:4; + unsigned int ctds:1; + unsigned int arifs:1; + unsigned int aors:1; + unsigned int aocs32:1; + unsigned int aocs64:1; + unsigned int ccs128:1; + unsigned int nprpr:1; + 
unsigned int ltrs:1; + unsigned int tphcs:2; + unsigned int obffs:2; + unsigned int effs:1; + unsigned int eetps:1; + unsigned int meetp:2; + unsigned int res1:8; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxdcap2) == 0x4, "bad PXDCAP2 size"); + +struct pxdc2 { + unsigned int stuff:16; +} __attribute__((packed)); +_Static_assert(sizeof(struct pxdc2) == 0x2, "bad PXDC2 size"); + +/* TODO name conflicts with PXCAP */ +struct PCI_Express_Capability { + struct pxid pxid; + struct pxcap pxcap; + struct pxdcap pxdcap; + union pxdc pxdc; + struct pxds pxds; + struct pxlcap pxlcap; + struct pxlc pxlc; + struct pxls pxls; + uint8_t pad[0x10]; + struct pxdcap2 pxdcap2; + struct pxdc2 pxdc2; +} __attribute__((packed)); +_Static_assert(sizeof(struct PCI_Express_Capability) == 0x2a, + "bad PCI Express Capability size"); + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/patches/vfio.diff b/patches/vfio.diff new file mode 100644 index 0000000..d19da2e --- /dev/null +++ b/patches/vfio.diff @@ -0,0 +1,192 @@ +diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c +index a3030cd..ab1b82c 100644 +--- a/drivers/vfio/vfio.c ++++ b/drivers/vfio/vfio.c +@@ -2019,15 +2019,24 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, + int ret; + + ret = vfio_group_add_container_user(group); +- if (ret) ++ if (ret) { ++ pr_info("vfio_group_add_container_user failed with %d\n", ret); + return -EINVAL; ++ } + + container = group->container; + driver = container->iommu_driver; +- if (likely(driver && driver->ops->register_notifier)) ++ if (likely(driver && driver->ops->register_notifier)) { + ret = driver->ops->register_notifier(container->iommu_data, +- events, nb); +- else ++ events, nb); ++ if (unlikely(!ret) && driver->ops->retro_notify) { ++ ret = driver->ops->retro_notify(container->iommu_data); ++ if (unlikely((ret & NOTIFY_BAD) == NOTIFY_BAD)) ++ ret = -ENOTTY; ++ else ++ ret = 0; ++ } ++ } else + ret = -ENOTTY; + + 
vfio_group_try_dissolve_container(group); +@@ -2140,6 +2149,7 @@ int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, + ret = vfio_register_group_notifier(group, events, nb); + break; + default: ++ pr_info("bad notification type %d\n", type); + ret = -EINVAL; + } + +diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c +index d0f731c..b47b8f96 100644 +--- a/drivers/vfio/vfio_iommu_type1.c ++++ b/drivers/vfio/vfio_iommu_type1.c +@@ -558,8 +558,10 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, + return -EINVAL; + + /* Supported for v2 version only */ +- if (!iommu->v2) ++ if (!iommu->v2) { ++ pr_debug("non v2 IOMMU\n"); + return -EACCES; ++ } + + mutex_lock(&iommu->lock); + +@@ -1050,6 +1052,30 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, + return ret; + } + ++static int vfio_dma_map_trigger_notifiers(struct vfio_iommu * const iommu, ++ struct vfio_dma const * const dma) ++ ++{ ++ struct vfio_iommu_type1_dma_map nb_map = {0}; ++ ++ BUG_ON(!iommu); ++ BUG_ON(!dma); ++ ++ nb_map.flags = dma->prot; ++ ++ if ((dma->prot & IOMMU_READ) == IOMMU_READ) ++ nb_map.flags |= VFIO_DMA_MAP_FLAG_READ; ++ if ((dma->prot & IOMMU_WRITE) == IOMMU_WRITE) ++ nb_map.flags |= VFIO_DMA_MAP_FLAG_WRITE; ++ nb_map.vaddr = dma->vaddr; ++ nb_map.iova = dma->iova; ++ nb_map.size = dma->size; ++ ++ return blocking_notifier_call_chain(&iommu->notifier, ++ VFIO_IOMMU_NOTIFY_DMA_MAP, ++ &nb_map); ++} ++ + static int vfio_dma_do_map(struct vfio_iommu *iommu, + struct vfio_iommu_type1_dma_map *map) + { +@@ -1139,13 +1165,25 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, + vfio_link_dma(iommu, dma); + + /* Don't pin and map if container doesn't contain IOMMU capable domain*/ +- if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) ++ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { + dma->size = size; +- else ++ ret = 0; ++ } else + ret = vfio_pin_map_dma(iommu, dma, size); + + out_unlock: + 
mutex_unlock(&iommu->lock); ++ /* FIXME is the following safe without having acquired the mutex? */ ++ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) && !ret) { ++ ret = vfio_dma_map_trigger_notifiers(iommu, dma); ++ /* FIXME proceed or clean up and fail? */ ++ if ((ret & NOTIFY_BAD) == NOTIFY_BAD) { ++ pr_debug("failed to trigger notifier(s): %d\n", ret); ++ ret = -EINVAL; ++ } else ++ ret = 0; ++ } ++ + return ret; + } + +@@ -1504,8 +1542,11 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu) + + dma = rb_entry(n, struct vfio_dma, node); + +- if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list))) ++ if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list))) { ++ pr_debug("DMA region %llx-%llx still pinned\n", ++ dma->iova, dma->iova + dma->size); + break; ++ } + } + /* mdev vendor driver must unregister notifier */ + WARN_ON(iommu->notifier.head); +@@ -1740,7 +1781,7 @@ static int vfio_iommu_type1_register_notifier(void *iommu_data, + struct vfio_iommu *iommu = iommu_data; + + /* clear known events */ +- *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP; ++ *events &= ~(VFIO_IOMMU_NOTIFY_DMA_MAP | VFIO_IOMMU_NOTIFY_DMA_UNMAP); + + /* refuse to register if still events remaining */ + if (*events) +@@ -1749,6 +1790,25 @@ static int vfio_iommu_type1_register_notifier(void *iommu_data, + return blocking_notifier_chain_register(&iommu->notifier, nb); + } + ++static int vfio_iommu_type1_retro_notify(void *iommu_data) ++{ ++ int err = NOTIFY_OK; ++ struct vfio_iommu *iommu; ++ struct vfio_dma *pos, *n; ++ ++ BUG_ON(!iommu_data); ++ ++ iommu = (struct vfio_iommu*)iommu_data; ++ ++ rbtree_postorder_for_each_entry_safe(pos, n, &iommu->dma_list, node) { ++ err = vfio_dma_map_trigger_notifiers(iommu, pos); ++ if ((err & NOTIFY_BAD) == NOTIFY_BAD) ++ break; ++ } ++ ++ return err; ++} ++ + static int vfio_iommu_type1_unregister_notifier(void *iommu_data, + struct notifier_block *nb) + { +@@ -1769,6 +1829,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { + .unpin_pages 
= vfio_iommu_type1_unpin_pages, + .register_notifier = vfio_iommu_type1_register_notifier, + .unregister_notifier = vfio_iommu_type1_unregister_notifier, ++ .retro_notify = vfio_iommu_type1_retro_notify, + }; + + static int __init vfio_iommu_type1_init(void) +diff --git a/include/linux/vfio.h b/include/linux/vfio.h +index 66741ab0..10ee80b 100644 +--- a/include/linux/vfio.h ++++ b/include/linux/vfio.h +@@ -85,6 +85,7 @@ struct vfio_iommu_driver_ops { + struct notifier_block *nb); + int (*unregister_notifier)(void *iommu_data, + struct notifier_block *nb); ++ int (*retro_notify)(void *iommu_data); + }; + + extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); +@@ -118,6 +119,7 @@ enum vfio_notify_type { + + /* events for VFIO_IOMMU_NOTIFY */ + #define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0) ++#define VFIO_IOMMU_NOTIFY_DMA_MAP BIT(1) + + /* events for VFIO_GROUP_NOTIFY */ + #define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt new file mode 100644 index 0000000..d12a813 --- /dev/null +++ b/samples/CMakeLists.txt @@ -0,0 +1,32 @@ +# +# Copyright (c) 2019 Nutanix Inc. All rights reserved. +# +# Authors: Thanos Makatos <thanos@nutanix.com> +# Swapnil Ingle <swapnil.ingle@nutanix.com> +# Felipe Franciosi <felipe@nutanix.com> +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Nutanix nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +add_executable(test_read test_read.c) +add_executable(test_mmap test_mmap.c) diff --git a/samples/test_mmap.c b/samples/test_mmap.c new file mode 100644 index 0000000..02c32f1 --- /dev/null +++ b/samples/test_mmap.c @@ -0,0 +1,199 @@ +/* + * Userspace mediated device sample application + * + * Copyright (c) 2019, Nutanix Inc. All rights reserved. + * Author: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <stdbool.h> +#include <string.h> +#include <linux/vfio.h> +#include <limits.h> +#include <assert.h> +#include <sys/ioctl.h> +#include <inttypes.h> + +#define VFIO_PATH "/dev/vfio/" +#define VFIO_CTR_PATH VFIO_PATH "vfio" +#define SYSFS_PCI_DEV_PATH "/sys/bus/pci/devices/" +#define SYSFS_IOMMU_GROUP "/iommu_group" + +static int +pci_group_id(const char *bdf) +{ + char *dev_path; + char group_path[PATH_MAX]; + int group_id; + + assert(bdf); + + asprintf(&dev_path, SYSFS_PCI_DEV_PATH "%s" SYSFS_IOMMU_GROUP, bdf); + memset(group_path, 0, sizeof(group_path)); + readlink(dev_path, group_path, sizeof(group_path)); + free(dev_path); + sscanf(basename(group_path), "%d", &group_id); + return group_id; +} + +static inline void* +test_map_dma(const int fd, const unsigned long size, const unsigned long iova) +{ + int err; + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .size = size, + .iova = iova, + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + }; + + /* 
Allocate some space and setup a DMA mapping */ + /* FIXME it *must* be MAP_SHARED */ + dma_map.vaddr = (unsigned long long)mmap(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (dma_map.vaddr == (unsigned long)MAP_FAILED) { + perror("failed to map DMA"); + return NULL; + } + printf("%llx\n", dma_map.vaddr); + strcpy((char*)dma_map.vaddr, "foo"); + + fprintf(stderr, "attempting to MAP_DMA IOVA=%llx\n", dma_map.iova); + + err = ioctl(fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (err) { + fprintf(stderr, "failed to MAP_DMA: %d (errno=%d)", err, errno); + return NULL; + } + printf("[%s]\n", (char*)dma_map.vaddr); + + return (void*)dma_map.vaddr; +} + +static inline void +test_unmap_dma(const int fd, const unsigned long size, const unsigned long iova) +{ + int err; + struct vfio_iommu_type1_dma_unmap dma_unmap = { + .argsz = sizeof dma_unmap, + .size = size, + .iova = iova, + .flags = 0 + }; + + err = ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap); + if (err) { + perror("UNMAP_DMA\n"); + return; + } + printf("unmapped IOVA=%llx\n", dma_unmap.iova); +} + +int main(int argc, char * argv[]) +{ + int err, vfio_ctr_fd, vfio_grp_fd, vfio_dev_fd; + char *grp_path; +#ifdef DEBUG + struct vfio_group_status grp_status; +#endif + struct vfio_iommu_type1_info iommu_info; + void *dma_map_addr = NULL; + + if (argc != 2) { + printf("Usage: %s <device bdf in full>\n", argv[0]); + printf(" ex: %s 0000:82:00.0\n", argv[0]); + return EXIT_FAILURE; + } + + vfio_ctr_fd = open(VFIO_CTR_PATH, O_RDWR); + assert(vfio_ctr_fd >= 0); + +#ifdef DEBUG + err = ioctl(vfio_ctr_fd, VFIO_GET_API_VERSION); + assert(err == VFIO_API_VERSION); + err = ioctl(vfio_ctr_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU); + assert(err == 1); +#endif + + // Open the VFIO entry for this device's IOMMU GROUP. 
+ err = asprintf(&grp_path, VFIO_PATH "%d", pci_group_id(argv[1])); + assert(err > 0); + vfio_grp_fd = open(grp_path, O_RDWR); + assert(vfio_grp_fd >= 0); + free(grp_path); + +#ifdef DEBUG + // Ensure group is viable. + memset(&grp_status, 0, sizeof(grp_status)); + grp_status.argsz = sizeof(grp_status); + err = ioctl(vfio_grp_fd, VFIO_GROUP_GET_STATUS, &grp_status); + assert(!err); + assert((grp_status.flags & VFIO_GROUP_FLAGS_VIABLE) == 1); +#endif + + // Add the group to the container. + err = ioctl(vfio_grp_fd, VFIO_GROUP_SET_CONTAINER, &vfio_ctr_fd); + assert(!err); + + // Enable IOMMU type 1 on container. + err = ioctl(vfio_ctr_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU); + assert(!err); + + // Fetch IOMMU information from VFIO. + memset(&iommu_info, 0, sizeof(iommu_info)); + iommu_info.argsz = sizeof(iommu_info); + err = ioctl(vfio_ctr_fd, VFIO_IOMMU_GET_INFO, &iommu_info); + assert(!err); + + // Get a device fd from VFIO. + vfio_dev_fd = ioctl(vfio_grp_fd, VFIO_GROUP_GET_DEVICE_FD, argv[1]); + assert(vfio_dev_fd >= 0); + + void *p; + p = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ | PROT_WRITE, + MAP_SHARED, vfio_dev_fd, 0); + assert(p != MAP_FAILED); + printf("%p\n", p); + printf("%s\n", (char*)p); + + dma_map_addr = test_map_dma(vfio_ctr_fd, 4096, 0xdeadbeef000); + if (!dma_map_addr) + exit(EXIT_FAILURE); + test_unmap_dma(vfio_ctr_fd, 4096, 0xdeadbeef000); + + return 0; +} diff --git a/samples/test_read.c b/samples/test_read.c new file mode 100644 index 0000000..24af454 --- /dev/null +++ b/samples/test_read.c @@ -0,0 +1,233 @@ +/* + * Userspace mediated device sample application + * + * Copyright (c) 2019, Nutanix Inc. All rights reserved. 
+ * Author: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <stdbool.h> +#include <string.h> +#include <linux/vfio.h> +#include <limits.h> +#include <assert.h> +#include <sys/ioctl.h> +#include <inttypes.h> + +#define VFIO_PATH "/dev/vfio/" +#define VFIO_CTR_PATH VFIO_PATH "vfio" +#define SYSFS_MUSER_DEV_PATH "/sys/class/muser/muser/" +#define SYSFS_IOMMU_GROUP "/iommu_group" + +static int +test_read(int vfio_dev_fd, off_t offset) +{ + size_t bytes; + char buf[256]; + int i; + + memset(buf, 0, sizeof(buf)); + printf("* Reading %zd bytes\n", sizeof(buf)); + bytes = pread(vfio_dev_fd, buf, sizeof(buf), offset); + assert(bytes == sizeof(buf)); + printf("** Read %zd bytes\n", bytes); + + for (i = 0; i < sizeof(buf); i++) { + if (i % 16 == 0) { + printf("%04X:", i); + } + printf(" %02hhX", buf[i]); + if (i % 16 == 15) { + printf("\n"); + } + } + if (i % 16 != 0) { + printf("\n"); + } + + return 0; +} + +static int +pci_group_id(const char *uuid) +{ + char *dev_path; + char group_path[PATH_MAX]; + int group_id; + + assert(uuid != NULL); + + asprintf(&dev_path, SYSFS_MUSER_DEV_PATH "%s" SYSFS_IOMMU_GROUP, uuid); + memset(group_path, 0, sizeof(group_path)); + readlink(dev_path, group_path, sizeof(group_path)); + free(dev_path); + sscanf(basename(group_path), "%d", &group_id); + return group_id; +} + +int +main(int argc, char * argv[]) +{ + int vfio_ctr_fd, vfio_grp_fd, vfio_dev_fd; + char *grp_path; + int i; + int err; + + if (argc != 2) { + printf("Usage: %s <muser_dev_uuid>\n", argv[0]); + return EXIT_FAILURE; + } + + // Create a new VFIO container. + printf("* Creating new VFIO container...\n"); + vfio_ctr_fd = open(VFIO_CTR_PATH, O_RDWR); + assert(vfio_ctr_fd >= 0); + printf("** vfio_ctr_fd = %d\n", vfio_ctr_fd); + + // Ensure kernel VFIO is compatible. 
+ printf("* Fetching VFIO API version...\n"); + err = ioctl(vfio_ctr_fd, VFIO_GET_API_VERSION); + assert(err == VFIO_API_VERSION); + + // Ensure VFIO supports TYPE1 IOMMU. + printf("* Checking for IOMMU TYPE1 extension in VFIO...\n"); + err = ioctl(vfio_ctr_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU); + assert(err == 1); + + // Open the VFIO entry for this device's IOMMU GROUP. + err = asprintf(&grp_path, VFIO_PATH "%d", pci_group_id(argv[1])); + assert(err > 0); + printf("* Opening the VFIO group (%s)...\n", grp_path); + vfio_grp_fd = open(grp_path, O_RDWR); + assert(vfio_grp_fd >= 0); + printf("** vfio_grp_fd = %d\n", vfio_grp_fd); + free(grp_path); + + // Ensure group is viable. + struct vfio_group_status grp_status; + printf("* Ensuring all devices in this group are bound to VFIO...\n"); + memset(&grp_status, 0, sizeof(grp_status)); + grp_status.argsz = sizeof(grp_status); + err = ioctl(vfio_grp_fd, VFIO_GROUP_GET_STATUS, &grp_status); + assert(!err); + assert((grp_status.flags & VFIO_GROUP_FLAGS_VIABLE) == 1); + + // Add the group to the container. + printf("* Adding group to container...\n"); + err = ioctl(vfio_grp_fd, VFIO_GROUP_SET_CONTAINER, &vfio_ctr_fd); + assert(!err); + + // Enable IOMMU type 1 on container. + printf("* Setting IOMMU Type 1 on container...\n"); + err = ioctl(vfio_ctr_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU); + assert(!err); + + // Fetch IOMMU information from VFIO. + struct vfio_iommu_type1_info iommu_info; + printf("* Fetching IOMMU information...\n"); + memset(&iommu_info, 0, sizeof(iommu_info)); + iommu_info.argsz = sizeof(iommu_info); + err = ioctl(vfio_ctr_fd, VFIO_IOMMU_GET_INFO, &iommu_info); + assert(!err); + + // Get a device fd from VFIO. + printf("* Getting a device (%s) fd from group...\n", argv[1]); + vfio_dev_fd = ioctl(vfio_grp_fd, VFIO_GROUP_GET_DEVICE_FD, argv[1]); + assert(vfio_dev_fd >= 0); + printf("** vfio_dev_fd = %d\n", vfio_dev_fd); + + // Fetch device information. 
+ printf("* Fetching device information...\n"); + struct vfio_device_info dev_info; + memset(&dev_info, 0, sizeof(dev_info)); + dev_info.argsz = sizeof(dev_info); + err = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_INFO, &dev_info); + assert(err == 0); + assert(dev_info.num_regions <= VFIO_PCI_NUM_REGIONS); + + // Fetch region information for this device. + struct vfio_region_info reg_info[VFIO_PCI_NUM_REGIONS]; + printf("* Fetching information for %u regions\n", dev_info.num_regions); + for (i = 0; i < (int)dev_info.num_regions; i++) { + memset(®_info[i], 0, sizeof(reg_info[i])); + reg_info[i].argsz = sizeof(reg_info[i]); + reg_info[i].index = i; + err = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®_info[i]); + if (err != 0) { + // This region doesn't exist or isn't accessible. + printf("** %d: Region info unavailable\n", i); + memset(®_info[i], 0, sizeof(reg_info[i])); + } else { + printf("** %d: argsz=0x%X, flags=0x%X, index=0x%X, " + "size=0x%llX, offset=0x%llX\n", + i, + reg_info[i].argsz, + reg_info[i].flags, + reg_info[i].index, + reg_info[i].size, + reg_info[i].offset); + } + } + + // Fetch irq information for this device. + struct vfio_irq_info irq_info[VFIO_PCI_NUM_IRQS]; + printf("* Fetching information for %u irqs\n", dev_info.num_irqs); + for (i = 0; i < (int)dev_info.num_irqs; i++) { + memset(&irq_info[i], 0, sizeof(irq_info[i])); + irq_info[i].argsz = sizeof(irq_info[i]); + irq_info[i].index = i; + err = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info[i]); + if (err != 0) { + // This irq doesn't exist or isn't accessible. + printf("** %d: Irq info unavailable\n", i); + memset(&irq_info[i], 0, sizeof(irq_info[i])); + } else { + printf("** %d: argsz=0x%X, flags=0x%X, index=0x%X, count=%u\n", + i, + irq_info[i].argsz, + irq_info[i].flags, + irq_info[i].index, + irq_info[i].count); + } + } + + // Test. + err = test_read(vfio_dev_fd, reg_info[VFIO_PCI_CONFIG_REGION_INDEX].offset); + assert(!err); + + return 0; +} |