aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt3
-rw-r--r--README.md134
-rw-r--r--kmod/CMakeLists.txt52
-rw-r--r--kmod/muser.c1841
-rw-r--r--kmod/muser.h167
-rw-r--r--lib/CMakeLists.txt1
-rw-r--r--lib/libmuser.c546
-rw-r--r--lib/libmuser_pci.c16
-rw-r--r--lib/muser_priv.h70
-rw-r--r--patches/vfio.diff192
10 files changed, 163 insertions, 2859 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74695f4..af97768 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,9 +35,6 @@ include(GNUInstallDirs)
# shared libraries
add_subdirectory(lib)
-# kernel module
-#add_subdirectory(kmod)
-
# samples
add_subdirectory(samples)
diff --git a/README.md b/README.md
index 3bb649c..887c51a 100644
--- a/README.md
+++ b/README.md
@@ -4,30 +4,10 @@ Mediated Userspace Device
Overview
--------
-MUSER is a framework that allows PCI devices to be implemented in userspace. It
-leverages the Linux kernel VFIO/MDEV infrastructure, allowing such devices to
-be easily accessed via standard VFIO interfaces and subsequently virtual
-machines. These can be completely virtual and not backed by any real hardware.
-This provides interesting benefits, including:
-
-* Simplification of the initial development of kernel drivers for new devices
-* Easy plumbing to hypervisors that support VFIO device pass-through
-* Performance benefits as a single process can poll multiple drivers
-
-In this fork we focus on making QEMU and MUSER work without the need of the
-MUSER kernel module. This has been demonstrated as a PoC in
-https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07900.html. In the PoC
-we use a library to intercept QEMU's syscalls to VFIO (libpathtrap) and convert
-theme into messages that we send to the process where device emulation is
-implemented (libvfio). Any QEMU version can be used, unpatched.
-
-The PoC is merely a step towards defining a device offloading protocol that
-will hopefully be officially suported by QEMU so we won't need to do tricks with
-intercepting syscalls etc. This protocol will be called VFIO-over-socket (or
-vfio-user) and is based on the existing VFIO framework (it reuses structs,
-defines, concepts, etc). Hopefully the protocol won't be too different from the
-one in the PoC. You can follow/participate in the discussion here:
-https://www.mail-archive.com/qemu-devel@nongnu.org/msg723773.html
+MUSER is a framework for implementing PCI devices using the [vfio-user
+protocol](https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg02458.html).
+MUSER is the _backend_ part of the vfio-user protocol; the frontend part is
+implemented by Oracle in https://github.com/oracle/qemu/tree/vfio-user-v0.1.
The library abstracts most of the complexity around representing the device.
Applications using libmuser provide a description of the device (eg. region and
@@ -41,17 +21,6 @@ future we plan to make libmuser multi-threaded. The application can be
implemented in whatever way is convenient, e.g. as a Python script using
bindings, on the cloud, etc. There's also experimental support for polling.
-There is also an ongoing effort to define a protocol based on VFIO that will be
-officially supported by QEMU so the kernel module won't be necessary. This
-protocol (tentatively named VFIO-over-socket and soon to be renamed to
-vfio-user) has been discussed as an RFC in qemu-devel:
-https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07900.html,
-and is now in the process of being reviewed:
-https://www.mail-archive.com/qemu-devel@nongnu.org/msg723773.html.
-In the RFC email thread it is explained how to run the GPIO sample without the
-MUSER kernel module, where to get sources etc. Please refer to the RFC email
-thread for more information.
-
Memory Mapping the Device
-------------------------
@@ -76,110 +45,17 @@ from the kernel, however this performance penalty is perfectly acceptable when
prototyping the functional aspect of a device driver.
-System Architecture
--------------------
-
-QEMU (with the "help" of libpathtrap and livfio) and libmuser communicate via a
-UNIX domain socket (in the future it can be anything, e.g. UDP). Whenever QEMU
-executes an ioctl to the VFIO device, libpathtrap/libvfio convert the operation
-into a message and send it to libmuser, unblocking it. libmuser executed the
-request and sends back the response. Currently there can be only one command
-pending, we plan to allow multiple commands to be executed in parallel.
-
-
Building muser
==============
Just do:
- git submodule update --init
make && make install
-The kernel headers are necessary because VFIO structs and defines are resused.
+The kernel headers are necessary because VFIO structs and defines are reused.
To specify an alternative kernel directory set the KDIR environment variable
accordingly.
To enable Python bindings set the PYTHON_BINDINGS environment variable to a
non-empty string.
Finally build your program and link it to libmuser.so.
-
-Running QEMU
-============
-
-Use the following snippet to create the directory structure, this is required
-because QEMU still thinks it's talking to VFIO. "muser" can really by anything
-or even omitted. "foo" is typically the guest name/UUID. "0" is the IOMMU
-group, this must be an integer and must not exist under /dev/vfio. SELinux and
-cgroups can be tricky to set up correctly, so try and keep it simple for now
-(e.g. disable SELinux, use world-accessible paths such as /var/run etc.).
-
- mkdir -p /var/run/muser/iommu_group /var/run/muser/foo/0
- cd /var/run/muser/foo/0 && ln -sf ../0 iommu_group
- ln -s /var/run/muser/foo/0 /var/run/muser/iommu_group/0
-
-Create your libmuser context setting /var/run/muser/foo/0 as the UUID.
-
-Run QEMU as follows:
-
- LD_PRELOAD=muser/build/dbg/libvfio/libvfio.so qemu-system-x86_64 \
- ... \
- -device vfio-pci,sysfsdev=/var/run/muser/foo/0
- -object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=mem,share=yes,size=1073741824 -numa node,nodeid=0,cpus=0,memdev=ram-node0
-
-Guest RAM must be shared (share=yes) otherwise libmuser won't be able to do DMA
-transfers from/to it. If you're not using QEMU then any memory that must be
-accessed by libmuser must be allocate MAP_SHARED. Registering memory for DMA
-that has not been allocated with MAP_SHARED is ignored and any attempts to
-access that memory will result in an error.
-
-Example
-=======
-
-samples/gpio-pci-idio-16.c implements a tiny part of the PCI-IDIO-16 GPIO
-(https://www.accesio.com/?p=/pci/pci_idio_16.html). In this sample it's a simple
-device that toggles the input every 3 times it's read.
-
-Running gpio-pci-idio-16
-------------------------
-
-1. First, follow the instructions to build and load muser.
-2. Then, start the gpio-pci-idio-16 device emulation:
-
- # build/dbg/samples/gpio-pci-idio-16 -s /var/run/muser/foo/0
-
-3. Finally, start the VM adding the command line explained earlier and then
-execute:
-
- # insmod gpio-pci-idio-16.ko
- # cat /sys/class/gpio/gpiochip480/base > /sys/class/gpio/export
- # for ((i=0;i<12;i++)); do cat /sys/class/gpio/OUT0/value; done
- 0
- 0
- 0
- 1
- 1
- 1
- 0
- 0
- 0
- 1
- 1
- 1
-
-Future Work
-===========
-
-See official fork for more details.
-
-Troubleshooting
----------------
-
-It's easy to mess things up as this is a PoC. libvfio stores logs under
-`/tmp/libvfio`. When things fail it's usually because the directory hasn't been
-correctly set up or cleaned up from the previous run, use `strace` and check
-which syscalls fail and why.
-
-To debug accesses to your PCI device from QEMU add the following to the QEMU
-command line:
-
- -trace enable=vfio*,file=qemu-vfio.trace
diff --git a/kmod/CMakeLists.txt b/kmod/CMakeLists.txt
deleted file mode 100644
index 07e90e7..0000000
--- a/kmod/CMakeLists.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-#
-# Copyright (c) 2019 Nutanix Inc. All rights reserved.
-#
-# Authors: Thanos Makatos <thanos@nutanix.com>
-# Swapnil Ingle <swapnil.ingle@nutanix.com>
-# Felipe Franciosi <felipe@nutanix.com>
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of Nutanix nor the names of its contributors may be
-# used to endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
-# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-
-# Copy sources to build directory (avoid polluting source directory).
-# TODO can we copy all source files with a wildcard?
-configure_file(muser.c ${CMAKE_CURRENT_BINARY_DIR}/muser.c COPYONLY)
-configure_file(muser.h ${CMAKE_CURRENT_BINARY_DIR}/muser.h COPYONLY)
-# FIXME need to pass "CFLAGS_muser.o := -DDEBUG" for debug builds
-set(KMOD_MAKEFILE_CONTENT "obj-m := muser.o")
-IF(CMAKE_BUILD_TYPE MATCHES Debug)
- set(KMOD_MAKEFILE_CONTENT "CFLAGS_muser.o := -DDEBUG\n${KMOD_MAKEFILE_CONTENT}")
-ENDIF(CMAKE_BUILD_TYPE MATCHES Debug)
-file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/Kbuild ${KMOD_MAKEFILE_CONTENT})
-
-# Build module using kernel's Makefile.
-set(KBUILD_CMD ${CMAKE_MAKE_PROGRAM} -C ${KDIR} M=${CMAKE_CURRENT_BINARY_DIR} modules)
-ADD_CUSTOM_COMMAND(OUTPUT DRIVER_BIN_FILE
- COMMAND ${KBUILD_CMD}
- DEPENDS ${MODULE_SOURCE_FILES} VERBATIM
-)
-ADD_CUSTOM_TARGET(driver ALL DEPENDS DRIVER_BIN_FILE)
-execute_process(COMMAND uname -r OUTPUT_VARIABLE kver OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.ko DESTINATION /lib/modules/${kver}/extra/)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/muser.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/linux)
diff --git a/kmod/muser.c b/kmod/muser.c
deleted file mode 100644
index 53cc3d8..0000000
--- a/kmod/muser.c
+++ /dev/null
@@ -1,1841 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-/*
- * Copyright (c) 2019, Nutanix Inc. All rights reserved.
- *
- * Author: Thanos Makatos <thanos@nutanix.com>
- * Swapnil Ingle <swapnil.ingle@nutanix.com>
- * Felipe Franciosi <felipe@nutanix.com>
- *
- */
-
-#include <linux/cdev.h>
-#include <linux/compat.h>
-#include <linux/device.h>
-#include <linux/file.h>
-#include <linux/idr.h>
-#include <linux/list.h>
-#include <linux/miscdevice.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-#include <linux/vfio.h>
-#include <linux/mdev.h>
-#include <linux/pagemap.h>
-#include <asm-generic/mman-common.h>
-#include <linux/device.h>
-#include <linux/version.h>
-#include <linux/sched/mm.h>
-
-#include "muser.h"
-
-#define DRIVER_NAME "muser"
-
-#define NR_PAGES(x) (((x) + (PAGE_SIZE - 1)) >> PAGE_SHIFT)
-#define MIN(a, b) ((a) < (b) ? (a):(b))
-
-static struct muser {
- struct class *class;
- struct list_head mudev_list;
- struct idr dev_idr;
- struct cdev muser_cdev;
- dev_t muser_devt;
- struct device dev;
- struct mutex muser_lock;
-} muser;
-
-#define muser_log(func, fmt, ...) \
- func(&muser.dev, "%s: " fmt "\n", __func__, ## __VA_ARGS__)
-
-#define muser_dbg(fmt, ...) muser_log(dev_dbg, fmt, ## __VA_ARGS__)
-#define muser_info(fmt, ...) muser_log(dev_info, fmt, ## __VA_ARGS__)
-#define muser_warn(fmt, ...) muser_log(dev_warn, fmt, ## __VA_ARGS__)
-#define muser_err(fmt, ...) muser_log(dev_err, fmt, ## __VA_ARGS__)
-#define muser_alert(fmt, ...) muser_log(dev_alert, fmt, ## __VA_ARGS__)
-
-/* TODO come up with as better name? */
-/*
- * FIXME len and nr_pages are confusing, we user either one or the other however
- * they seem to serve the same purpose, fix.
- */
-struct page_map {
- struct page **pages;
- int nr_pages;
- size_t len;
- int offset;
-};
-
-struct muser_dma_mapping {
- unsigned long iova;
- unsigned long length;
- unsigned long offset;
- struct file *file;
- int fd;
- struct list_head entry;
-};
-
-/*
- * TODO do we use all members at the same time? Does it make sense to put some
- * of them in a union?
- */
-struct mudev_cmd {
- enum muser_cmd_type type; /* copy of muser_cmd.type */
- struct muser_cmd muser_cmd;
- struct page_map pg_map;
- struct file **fds;
- int *data_fds;
- /*
- * When libmuser completes an mmap call, we need to know the length
- * in order to pass it to do_pin_pages.
- */
- unsigned long mmap_len;
- struct list_head entry;
-};
-
-/*
- * TODO:
- * Reorganise the members of this struct muser_dev
- * mucmd_pending should be per filep context
- * muser_dev should have a list of filep contexts instead of srv_opened
- */
-struct muser_dev {
- guid_t uuid;
- int minor;
- struct device *dev;
- struct list_head dlist_entry;
- struct list_head cmd_list;
- struct mudev_cmd *mucmd_pending;
- atomic_t srv_opened;
- atomic_t mdev_opened;
- struct mutex dev_lock;
- struct mdev_device *mdev;
- wait_queue_head_t user_wait_q;
- struct semaphore sem;
- struct notifier_block iommu_notifier;
- struct muser_dma_mapping *dma_map; /* Current DMA operation */
- struct list_head dma_list; /* list of dma mappings */
- struct radix_tree_root devmem_tree; /* Device memory */
-};
-
-static inline int muser_copyout(void __user *param, const void *address,
- unsigned long size)
-{
- int err = copy_to_user(param, address, size) ? -EFAULT : 0;
-
- if (unlikely(err))
- muser_dbg("failed to copy to user: %d", err);
-
- return err;
-}
-
-static inline int muser_copyin(void *address, void __user *param,
- unsigned long size)
-{
- int err = copy_from_user(address, param, size) ? -EFAULT : 0;
-
- if (unlikely(err))
- muser_dbg("failed to copy from user: %d", err);
-
- return err;
-}
-
-/* called with muser.muser_lock held */
-static struct muser_dev *__muser_search_dev(const guid_t *uuid)
-{
- struct muser_dev *mudev;
-
- list_for_each_entry(mudev, &muser.mudev_list, dlist_entry) {
- const uuid_le *u = &mudev->uuid;
-
- if (uuid_le_cmp(*u, *uuid) == 0)
- return mudev;
- }
-
- return NULL;
-}
-
-static int muser_create_dev(const guid_t *uuid, struct mdev_device *mdev)
-{
- struct muser_dev *mudev;
- char uuid_str[UUID_STRING_LEN + 1];
- int minor;
- int err = 0;
-
- mutex_lock(&muser.muser_lock);
- mudev = __muser_search_dev(uuid);
- if (mudev) {
- err = -EEXIST;
- goto out;
- }
-
- mudev = kzalloc(sizeof(*mudev), GFP_KERNEL);
- if (!mudev) {
- err = -ENOMEM;
- goto out;
- }
-
- minor = idr_alloc(&muser.dev_idr, mudev, 0, MINORMASK + 1, GFP_KERNEL);
- if (minor < 0) {
- err = minor;
- kfree(mudev);
- goto out;
- }
-
- sprintf(uuid_str, "%pUl", uuid);
- mudev->dev = device_create(muser.class, NULL,
- MKDEV(MAJOR(muser.muser_devt), minor),
- mudev, "%s", uuid_str);
- if (IS_ERR(mudev->dev)) {
- err = PTR_ERR(mudev->dev);
- idr_remove(&muser.dev_idr, minor);
- kfree(mudev);
- goto out;
- }
-
- memcpy(&mudev->uuid, uuid, sizeof(mudev->uuid));
- mudev->minor = minor;
- mudev->mdev = mdev;
- mutex_init(&mudev->dev_lock);
- sema_init(&mudev->sem, 0);
- init_waitqueue_head(&mudev->user_wait_q);
- INIT_LIST_HEAD(&mudev->cmd_list);
- INIT_LIST_HEAD(&mudev->dma_list);
- INIT_RADIX_TREE(&mudev->devmem_tree, GFP_KERNEL);
- list_add(&mudev->dlist_entry, &muser.mudev_list);
- mdev_set_drvdata(mdev, mudev);
-
- muser_info("new device %s", uuid_str);
-
-out:
- mutex_unlock(&muser.muser_lock);
- return err;
-}
-
-/* called with muser.muser_lock held */
-static void __muser_deinit_dev(struct muser_dev *mudev)
-{
- device_destroy(muser.class,
- MKDEV(MAJOR(muser.muser_devt), mudev->minor));
- list_del(&mudev->dlist_entry);
- idr_remove(&muser.dev_idr, mudev->minor);
-}
-
-/* called with mudev.dev_lock held */
-static void __mudev_page_free(struct muser_dev *mudev, unsigned long pgnr)
-{
- struct page *pg;
-
- pg = radix_tree_delete(&mudev->devmem_tree, pgnr);
- if (WARN_ON(!pg))
- return;
-
- __free_page(pg);
-}
-
-#define NR_INDICES 16
-
-/* called with mudev.dev_lock held */
-static void __mudev_free_devmem(struct muser_dev *mudev)
-{
- struct radix_tree_iter iter;
- struct radix_tree_root *root = &mudev->devmem_tree;
- unsigned long indices[NR_INDICES], index = 0;
- void __rcu **slot;
- int i, nr;
-
- do {
- nr = 0;
- radix_tree_for_each_slot(slot, root, &iter, index) {
- indices[nr] = iter.index;
- if (++nr == NR_INDICES)
- break;
- }
- for (i = 0; i < nr; i++) {
- index = indices[i];
- __mudev_page_free(mudev, index);
- }
- } while (nr > 0);
-}
-
-static int muser_remove_dev(const uuid_le *uuid)
-{
- struct muser_dev *mudev;
- char uuid_str[UUID_STRING_LEN + 1];
- int err = 0;
-
- mutex_lock(&muser.muser_lock);
-
- mudev = __muser_search_dev(uuid);
- if (!mudev) {
- err = -ENOENT;
- goto out;
- }
-
- if (atomic_read(&mudev->mdev_opened) > 0 ||
- atomic_read(&mudev->srv_opened) > 0) {
- err = -EBUSY;
- goto out;
- }
-
- mutex_lock(&mudev->dev_lock);
-
- WARN_ON(!list_empty(&mudev->cmd_list));
- __mudev_free_devmem(mudev);
- __muser_deinit_dev(mudev);
-
- mutex_unlock(&mudev->dev_lock);
- kfree(mudev);
-
- sprintf(uuid_str, "%pUl", uuid);
- muser_info("removed muser device %s", uuid_str);
-
-out:
- mutex_unlock(&muser.muser_lock);
- return err;
-}
-
-static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
-{
- return sprintf(buf, "muser\n");
-}
-MDEV_TYPE_ATTR_RO(name);
-
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
- char *buf)
-{
- return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
-}
-MDEV_TYPE_ATTR_RO(device_api);
-
-static struct attribute *mdev_types_attrs[] = {
- &mdev_type_attr_name.attr,
- &mdev_type_attr_device_api.attr,
- NULL,
-};
-
-static struct attribute_group mdev_type_group = {
- .name = "1",
- .attrs = mdev_types_attrs,
-};
-
-struct attribute_group *mdev_type_groups[] = {
- &mdev_type_group,
- NULL,
-};
-
-static int muser_process_cmd(struct muser_dev *mudev, struct mudev_cmd *mucmd)
-{
- int err;
-
- mucmd->type = mucmd->muser_cmd.type;
-
- /* Add command to mudev list of commands. */
- mutex_lock(&mudev->dev_lock);
- list_add_tail(&mucmd->entry, &mudev->cmd_list);
- mutex_unlock(&mudev->dev_lock);
-
- /* Wake up any sleepers */
- wake_up(&mudev->user_wait_q);
-
- /*
- * TODO: decide what to do with timeouts
- * Timeouts can happen if:
- * 1. No server has attached to mudev
- * 2. Processing of cmd takes more time than timeout
- *
- * Maybe use a while loop instead of goto
- */
-retry:
- err = down_timeout(&mudev->sem, msecs_to_jiffies(5000));
- if (err) {
- struct mudev_cmd *pos, *tmp;
- bool found = false;
-
- mutex_lock(&mudev->dev_lock);
- list_for_each_entry_safe(pos, tmp, &mudev->cmd_list, entry) {
- if (pos == mucmd) {
- list_del(&mucmd->entry);
- found = true;
- break;
- }
- }
- mutex_unlock(&mudev->dev_lock);
- if (found) {
- muser_err("giving up, no response for cmd %d",
- mucmd->type);
- } else {
- muser_warn("server taking too long for cmd %d, retry",
- mucmd->type);
- goto retry;
- }
- }
-
- return err;
-}
-
-int muser_create(struct kobject *kobj, struct mdev_device *mdev)
-{
- /* XXX this should be taken out when upstreaming */
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,19,67)
- const uuid_le uuid = mdev_uuid(mdev);
- return muser_create_dev(&uuid, mdev);
-#else
- return muser_create_dev(mdev_uuid(mdev), mdev);
-#endif
-}
-
-int muser_remove(struct mdev_device *mdev)
-{
- /* XXX this should be taken out when upstreaming */
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,19,67)
- const uuid_le uuid = mdev_uuid(mdev);
- return muser_remove_dev(&uuid);
-#else
- return muser_remove_dev(mdev_uuid(mdev));
-#endif
-}
-
-static int do_pin_pages(char __user *buf, const size_t count,
- int const writeable, struct page_map *const pg_map)
-{
- unsigned long start;
- unsigned long __user lbuf = (unsigned long __user)buf;
- int i;
- int err;
-
- BUG_ON(!buf);
- BUG_ON(!pg_map);
-
- start = round_down(lbuf, PAGE_SIZE);
- pg_map->nr_pages = (round_up(lbuf + count, PAGE_SIZE) - start) /
- PAGE_SIZE;
- pg_map->offset = lbuf - start;
- pg_map->pages = kcalloc(pg_map->nr_pages, sizeof *(pg_map->pages),
- GFP_KERNEL);
- if (unlikely(!pg_map->pages)) {
- muser_dbg("failed to allocate %d pages", pg_map->nr_pages);
- return -ENOMEM;
- }
- err = get_user_pages_fast(start, pg_map->nr_pages, writeable,
- pg_map->pages);
- if (unlikely(err != pg_map->nr_pages)) {
- for (i = 0; i < err; i++)
- put_page(pg_map->pages[i]);
- kfree(pg_map->pages);
- muser_dbg("failed to get user pages: %d", err);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static void unpin_pages(struct page_map *const pg_map)
-{
- int i;
-
- if (!pg_map)
- return;
-
- for (i = 0; i < pg_map->nr_pages; i++)
- put_page(pg_map->pages[i]);
- kfree(pg_map->pages);
- pg_map->pages = NULL;
-}
-
-static int vm_insert_pages(struct vm_area_struct *const vma,
- struct page *const pages[], const int nr_pages)
-{
- int err = 0, i;
-
- for (i = 0; i < nr_pages; i++) {
- BUG_ON(!pages[i]);
- err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
- pages[i]);
- if (unlikely(err)) {
- muser_dbg("count=%d, anon=%d, slab=%d",
- page_count(pages[i]), PageAnon(pages[i]),
- PageSlab(pages[i]));
- muser_dbg("failed to insert page at %lx: %d",
- vma->vm_start + i * PAGE_SIZE, err);
- unmap_kernel_range((unsigned long)vma->vm_start,
- PAGE_SIZE);
- break;
- }
- }
- return err;
-}
-
-static struct page *mudev_page_alloc(struct muser_dev *mudev,
- unsigned long pgnr)
-{
- struct page *pg;
- int ret;
-
- pg = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
- if (unlikely(!pg))
- return NULL;
-
- ret = radix_tree_insert(&mudev->devmem_tree, pgnr, pg);
- if (ret) {
- __free_page(pg);
- return NULL;
- }
-
- return pg;
-}
-
-static int libmuser_mmap_dev(struct file *fp, struct vm_area_struct *vma)
-{
- struct muser_dev *mudev = fp->private_data;
- struct page *pg;
- unsigned int nr_pages;
- unsigned long cur_pgidx, end_pgidx;
- unsigned long addr, *new_pgs;
- int ret, i;
-
- WARN_ON(mudev == NULL);
- nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-
- /* array to track new alloc'd pages, to be free'd in case of failure */
- new_pgs = kmalloc_array(nr_pages, sizeof(*new_pgs), GFP_KERNEL);
- if (new_pgs == NULL)
- return -ENOMEM;
-
- cur_pgidx = vma->vm_pgoff;
- end_pgidx = cur_pgidx + nr_pages;
-
- muser_dbg("mmap_dev: end 0x%lX - start 0x%lX (0x%lX), off = 0x%lX",
- vma->vm_end, vma->vm_start, vma->vm_end - vma->vm_start,
- cur_pgidx);
-
- mutex_lock(&mudev->dev_lock);
- for (i = 0; cur_pgidx < end_pgidx; cur_pgidx++, i++) {
- pg = radix_tree_lookup(&mudev->devmem_tree, cur_pgidx);
- if (pg == NULL) {
- pg = mudev_page_alloc(mudev, cur_pgidx);
- if (pg == NULL) {
- muser_dbg("failed to alloc mudev page for index %ld", cur_pgidx);
- i--;
- ret = -ENOMEM;
- goto free_pg;
- }
- new_pgs[i] = cur_pgidx;
- }
-
- addr = vma->vm_start + (i << PAGE_SHIFT);
- ret = vm_insert_page(vma, addr, pg);
- if (unlikely(ret != 0)) {
- muser_dbg("failed to insert page vma=%p addr=%ld page=%p: %d",
- vma, addr, pg, ret);
- goto free_pg;
- }
- }
- mutex_unlock(&mudev->dev_lock);
-
- kfree(new_pgs);
- return 0;
-
-free_pg:
- for ( ; i >= 0; i--)
- __mudev_page_free(mudev, new_pgs[i]);
- mutex_unlock(&mudev->dev_lock);
- kfree(new_pgs);
- return ret;
-}
-
-static int muser_process_dma_request(struct muser_dev *mudev,
- struct muser_dma_mapping *dma_map,
- int flags, int type)
-{
- int err;
- struct mudev_cmd mucmd = {
- .type = type,
- .muser_cmd = {
- .type = type,
- .mmap = {
- .request = {
- .addr = dma_map->iova,
- .len = dma_map->length,
- .offset = dma_map->offset,
- .flags = flags}
- }
- }
- };
-
- if (type == MUSER_DMA_MMAP)
- mucmd.muser_cmd.mmap.request.file = dma_map->file;
- else if (type == MUSER_DMA_MUNMAP)
- mucmd.muser_cmd.mmap.request.fd = dma_map->fd;
-
- err = muser_process_cmd(mudev, &mucmd);
- if (unlikely(err))
- return err;
-
- return mucmd.muser_cmd.err;
-}
-
-static int muser_process_dma_map(struct muser_dev *mudev, int flags)
-{
- return muser_process_dma_request(mudev, mudev->dma_map, flags,
- MUSER_DMA_MMAP);
-}
-
-static int muser_process_dma_unmap(struct muser_dev *mudev,
- struct muser_dma_mapping *dma_map)
-{
- return muser_process_dma_request(mudev, dma_map, 0, MUSER_DMA_MUNMAP);
-}
-
-/*
- * Returns the struct file backing this VMA and sets the offset.
- */
-static struct file* find_file_for_vaddr(unsigned long long vaddr,
- unsigned long *offset)
-{
- struct vm_area_struct *vma = NULL;
- struct mm_struct *mm = get_task_mm(current);
- if (unlikely(!mm))
- goto out;
- vma = find_vma(mm, vaddr);
- if (!vma) {
- muser_dbg("no VMA for vaddr=%#llx", vaddr);
- goto out;
- }
- if (!vma->vm_file) {
- muser_dbg("no file for vaddr=%#llx", vaddr);
- vma = NULL;
- goto out;
- }
- if (IS_PRIVATE(vma->vm_file->f_inode)) {
- muser_info("ignore private %pD\n", vma->vm_file);
- vma = NULL;
- goto out;
- }
- *offset = vaddr - vma->vm_start;
-out:
- if (mm)
- mmput(mm);
- if (vma)
- return vma->vm_file;
- return NULL;
-}
-
-static int muser_iommu_dma_map(struct muser_dev *mudev,
- struct vfio_iommu_type1_dma_map *map)
-{
- struct muser_dma_mapping *dma_map;
- int ret;
- struct file *file;
- unsigned long offset;
-
- muser_dbg("DMA map vaddr=%#llx iova=%#llx-%#llx", map->vaddr, map->iova,
- map->iova + map->size);
-
- /* TODO: support multiple DMA map operations in parallel */
- /*
- * TODO I think it's not necessary anymore to block concurrent DMA map
- * operations as libmuser will mmap the file and then return in one go.
- * Therefore mudev->dma_map can be removed.
- */
- mutex_lock(&mudev->dev_lock);
- if (mudev->dma_map != NULL) {
- mutex_unlock(&mudev->dev_lock);
- muser_dbg("another DMA map operation is ongoing");
- return -EBUSY;
- }
-
- file = find_file_for_vaddr(map->vaddr, &offset);
- if (file == NULL) {
- mutex_unlock(&mudev->dev_lock);
- return 0;
- }
-
- dma_map = kmalloc(sizeof(struct muser_dma_mapping), GFP_KERNEL);
- if (dma_map == NULL) {
- mutex_unlock(&mudev->dev_lock);
- return -ENOMEM;
- }
- mudev->dma_map = dma_map;
- mutex_unlock(&mudev->dev_lock);
-
- dma_map->iova = map->iova;
- dma_map->length = map->size;
- dma_map->file = file;
- dma_map->offset = offset;
-
- ret = muser_process_dma_map(mudev, map->flags);
- if (ret) {
- muser_dbg("libmuser failed to DMA map iova=%llx vaddr=%llx size=%llx: %d",
- map->iova, map->vaddr, map->size, ret);
- goto out;
- }
-
- /* add to the dma_list */
- mutex_lock(&mudev->dev_lock);
- list_add_tail(&dma_map->entry, &mudev->dma_list);
- mudev->dma_map = NULL;
- mutex_unlock(&mudev->dev_lock);
- return 0;
-
-out:
- kfree(dma_map);
- mutex_lock(&mudev->dev_lock);
- mudev->dma_map = NULL;
- mutex_unlock(&mudev->dev_lock);
- return ret;
-}
-
-/* called with mudev.dev_lock held */
-static struct muser_dma_mapping *__find_dma_map(struct muser_dev *mudev,
- unsigned long iova)
-{
- struct muser_dma_mapping *dma_map;
-
- list_for_each_entry(dma_map, &mudev->dma_list, entry) {
- if (dma_map->iova == iova)
- return dma_map;
- }
- return NULL;
-}
-
-static int muser_iommu_dma_unmap(struct muser_dev *const mudev,
- struct vfio_iommu_type1_dma_unmap *const unmap)
-{
- int err;
- unsigned long len;
- struct muser_dma_mapping *dma_map;
-
- mutex_lock(&mudev->dev_lock);
- dma_map = __find_dma_map(mudev, unmap->iova);
- if (!dma_map) {
- /*
- * XXX We no longer use vfio_pin_pages() so VFIO doesn't send
- * DMA unmap events at all. We've patched vfio_iommu_type1 to
- * send DMA unmap events even if we haven't pinned any of the
- * pages of a particular region (e.g. the VMA is not
- * shareable), so we have to ignore requests for such regions.
- * This behaviormight be temprorary, depending on whether or
- * not this solution gets accepted. For more information see:
- * https://www.redhat.com/archives/vfio-users/2020-February/msg00016.html.
- */
- mutex_unlock(&mudev->dev_lock);
- return 0;
- }
- list_del(&dma_map->entry);
- mutex_unlock(&mudev->dev_lock);
-
- len = dma_map->length;
- err = muser_process_dma_unmap(mudev, dma_map);
- if (unlikely(err))
- muser_dbg("failed to request libmuser to munmap iova=%#llx-%#llx: %d",
- unmap->iova, unmap->size, err);
-
- kfree(dma_map);
-
- /* XXX: Do we need this? */
- unmap->size = len;
- return err;
-}
-
-/*
- * FIXME There can be multiple DMA map calls per device. If each of these calls
- * are serialised (this can be enforced by muser), then we tell libmuser to
- * mmap the control device. Do we need to distinguish between the different
- * DMA map calls at this stage if we can enforce only one outstanding DMA map
- * call?
- */
-static int muser_iommu_notifier(struct notifier_block *nb, unsigned long action,
- void *data)
-{
- struct muser_dev *mudev;
- int err;
-
- BUG_ON(!nb);
- BUG_ON(!data);
-
- mudev = container_of(nb, struct muser_dev, iommu_notifier);
- switch (action) {
- case VFIO_IOMMU_NOTIFY_DMA_MAP:
- err = muser_iommu_dma_map(mudev,
- (struct vfio_iommu_type1_dma_map *)
- data);
- break;
- case VFIO_IOMMU_NOTIFY_DMA_UNMAP:
- err = muser_iommu_dma_unmap(mudev,
- (struct vfio_iommu_type1_dma_unmap
- *)data);
- break;
- default:
- muser_dbg("bad action=%lx", action);
- err = -EINVAL;
- }
-
- if (unlikely(err))
- return NOTIFY_BAD;
- return NOTIFY_OK;
-}
-
-static int register_notifier(struct mdev_device *const mdev)
-{
- unsigned long events =
- VFIO_IOMMU_NOTIFY_DMA_MAP | VFIO_IOMMU_NOTIFY_DMA_UNMAP;
- struct muser_dev *const mudev = mdev_get_drvdata(mdev);
-
- memset(&mudev->iommu_notifier, 0, sizeof(mudev->iommu_notifier));
- mudev->iommu_notifier.notifier_call = muser_iommu_notifier;
- return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
- &events, &mudev->iommu_notifier);
-}
-
-static int dma_unmap_all(struct muser_dev *mudev)
-{
- struct muser_dma_mapping *dma_map;
- unsigned long length;
- LIST_HEAD(head);
-
- /*
- * TODO: Cleanup
- * Use better list functions like:
- * list_replace()/list_replace_init()
- * list_for_each_entry_safe()
- */
-
- mutex_lock(&mudev->dev_lock);
- while (!list_empty(&mudev->dma_list)) {
- dma_map = list_first_entry(&mudev->dma_list,
- struct muser_dma_mapping, entry);
- list_move(&dma_map->entry, &head);
- }
- mutex_unlock(&mudev->dev_lock);
-
- while (!list_empty(&head)) {
- dma_map = list_first_entry(&head, struct muser_dma_mapping,
- entry);
- list_del(&dma_map->entry);
- length = dma_map->length;
- kfree(dma_map);
- }
- return 0;
-}
-
-int muser_open(struct mdev_device *mdev)
-{
- int err;
- struct muser_dev *mudev = mdev_get_drvdata(mdev);
-
- WARN_ON(mudev == NULL);
-
- if (atomic_cmpxchg(&mudev->mdev_opened, 0, 1) != 0) {
- muser_dbg("device already open");
- return -EBUSY;
- }
-
- if (!try_module_get(THIS_MODULE)) {
- atomic_dec(&mudev->mdev_opened);
- return -ENODEV;
- }
-
- err = register_notifier(mdev);
- if (unlikely(err)) {
- int err2;
- /*
- * TODO we might have triggered some notifiers which will have
- * caused libmuser to mmap. If open fails then libmuser dies
- * therefore things get automatically cleaned up (e.g.
- * vfio_unpin etc.)?
- */
- atomic_dec(&mudev->mdev_opened);
- module_put(THIS_MODULE);
-
- muser_dbg("failed to register notifier: %d", err);
- err2 = dma_unmap_all(mudev);
- if (unlikely(err2))
- muser_dbg("failed to DMA unmap all regions: %d", err2);
- err2 = vfio_unregister_notifier(mdev_dev(mdev),
- VFIO_IOMMU_NOTIFY,
- &mudev->iommu_notifier);
- if (unlikely(err2))
- muser_info("failed to unregister notifier: %d", err);
- }
-
-
- return err;
-}
-
-void muser_close(struct mdev_device *mdev)
-{
- struct muser_dev *mudev = mdev_get_drvdata(mdev);
- int err;
-
- muser_dbg("release %pUl", &mudev->uuid);
-
- err = dma_unmap_all(mudev);
- if (unlikely(err))
- muser_alert("failed to remove one or more DMA maps");
-
- err = vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
- &mudev->iommu_notifier);
- if (unlikely(err))
- muser_info("failed to unregister notifier: %d", err);
-
- WARN_ON(atomic_read(&mudev->mdev_opened) == 0);
- atomic_dec(&mudev->mdev_opened);
-
- /* TODO: Replace any pending mucmd back in cmd_list. */
- module_put(THIS_MODULE);
-}
-
-static int
-pin_pages(struct mudev_cmd *mucmd, char __user *buf, size_t count,
- int writeable)
-{
- mucmd->pg_map.len = count;
- return do_pin_pages(buf, count, writeable, &mucmd->pg_map);
-}
-
-void dump_buffer(unsigned char const *const buf, uint32_t count)
-{
-#if defined(DEBUG)
- /*
- * TODO would be nice to add an option to print_hex_dump to hide
- * repeated lines, e.g. like od(1)
- */
- print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 4, 1, buf, count,
- false);
-#endif
-}
-
-static ssize_t muser_read(struct mdev_device *mdev, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct muser_dev *mudev = mdev_get_drvdata(mdev);
- struct mudev_cmd mucmd = { 0 };
- int err;
- ssize_t _count;
-
- WARN_ON(mudev == NULL);
-
- /* Setup mucmd and pin pages of the calling context. */
- mucmd.type = MUSER_READ;
- err = pin_pages(&mucmd, buf, count, 1);
- if (err != 0) {
- muser_dbg("failed to pin pages: %d", err);
- return err;
- }
-
- /* Setup muser_cmd for server context. */
- mucmd.muser_cmd.type = MUSER_READ;
- mucmd.muser_cmd.rw.count = count;
- mucmd.muser_cmd.rw.pos = *ppos;
-
- muser_dbg("R %lx@%llx", mucmd.muser_cmd.rw.count,
- mucmd.muser_cmd.rw.pos);
-
- /* TODO: move following into function */
-
- /* Process mudev_cmd in libmuser context */
- err = muser_process_cmd(mudev, &mucmd);
- if (unlikely(err != 0))
- _count = err;
- else
- _count = mucmd.muser_cmd.err;
-
- if (_count < 0)
- muser_dbg("failed to process read: %d, %d", err,
- mucmd.muser_cmd.err);
-
- *ppos = mucmd.muser_cmd.rw.pos;
-
- if (_count > 0) {
- muser_dbg("received 0x%lx bytes from user space (0x%lx)",
- _count, mucmd.muser_cmd.rw.count);
- dump_buffer(buf, _count);
- }
-
- unpin_pages(&mucmd.pg_map);
-
- return _count;
-}
-
-ssize_t muser_write(struct mdev_device *mdev, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct muser_dev *mudev = mdev_get_drvdata(mdev);
- struct mudev_cmd mucmd = { 0 };
- int err;
- size_t _count = count;
- loff_t _pos = *ppos;
-
- muser_dbg("W %lx@%llx", count, *ppos);
- dump_buffer(buf, count);
-
- /* Setup mucmd and pin pages of the calling context. */
- mucmd.type = MUSER_WRITE;
- err = pin_pages(&mucmd, (char __user *)buf, count, 0);
- if (err != 0)
- return err;
-
- /* Setup muser_cmd for libmuser context. */
- mucmd.muser_cmd.type = MUSER_WRITE;
- mucmd.muser_cmd.rw.count = count;
- mucmd.muser_cmd.rw.pos = *ppos;
-
- /* Process mudev_cmd in server context. */
- err = muser_process_cmd(mudev, &mucmd);
- if (err != 0)
- count = -1;
- *ppos = mucmd.muser_cmd.rw.pos;
-
- unpin_pages(&mucmd.pg_map);
-
- if (mucmd.muser_cmd.err)
- muser_info("PCI config write %ld@0x%llx not handled: %d",
- _count, _pos, mucmd.muser_cmd.err);
-
- return count;
-}
-
-static int bounce_fds(struct mudev_cmd *mucmd, void __user *data,
- int user_data_size)
-{
- int count = mucmd->muser_cmd.ioctl.data.irq_set.count;
- int data_size = count * sizeof(int32_t);
- int *user_fds;
- int i;
- int ret = 0;
-
- if (user_data_size < data_size)
- return -EINVAL;
-
- mucmd->fds = kcalloc(count, sizeof(*mucmd->fds), GFP_KERNEL);
- if (mucmd->fds == NULL)
- return -ENOMEM;
-
- user_fds = memdup_user(data, data_size);
- if (IS_ERR(user_fds)) {
- kfree(mucmd->fds);
- mucmd->fds = NULL;
- return PTR_ERR(user_fds);
- }
-
- for (i = 0; i < count; i++) {
- if (user_fds[i] == -1)
- continue;
- mucmd->fds[i] = fget(user_fds[i]);
- if (mucmd->fds[i] == NULL) {
- ret = -EBADF;
- goto err;
- }
- }
-
- kfree(user_fds);
-
- return 0;
-
-err:
- for (i--; i >= 0; i--)
- fput(mucmd->fds[i]);
- kfree(user_fds);
- kfree(mucmd->fds);
- mucmd->fds = NULL;
-
- return ret;
-}
-
-static int muser_ioctl_setup_cmd(struct mudev_cmd *mucmd, unsigned int cmd,
- unsigned long arg)
-{
- ssize_t argsz, minsz;
- int err;
-
- /* Determine smallest argsz we need for this command. */
- minsz = get_minsz(cmd);
- if (minsz < 0)
- return minsz;
-
- /* Copy caller-provided arg. */
- err = muser_copyin(&mucmd->muser_cmd.ioctl.data, (void __user *)arg,
- minsz);
- if (unlikely(err))
- return err;
-
- /* Fetch argsz provided by caller. */
- argsz = get_argsz(cmd, &mucmd->muser_cmd);
- if (argsz < 0)
- return argsz;
-
- /* Ensure provided size is at least the minimum required. */
- if (argsz < minsz)
- return -EINVAL;
-
- /* Fetch potential data provided on SET_IRQS. */
- if (cmd == VFIO_DEVICE_SET_IRQS) {
- unsigned int flags = mucmd->muser_cmd.ioctl.data.irq_set.flags;
-
- switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
- case VFIO_IRQ_SET_DATA_NONE:
- /* FIXME */
- muser_warn("ignore DATA_NONE index=%d start=%d count=%d",
- mucmd->muser_cmd.ioctl.data.irq_set.index,
- mucmd->muser_cmd.ioctl.data.irq_set.start,
- mucmd->muser_cmd.ioctl.data.irq_set.count);
- break;
- case VFIO_IRQ_SET_DATA_EVENTFD:
- /* Lookup eventfds and bounce references to mucmd. */
- err = bounce_fds(mucmd, (void __user *) (arg + minsz),
- argsz - minsz);
- if (err) {
- muser_dbg("failed to bounce fds: %d", err);
- return err;
- }
- break;
- default:
- muser_warn("ignore flags=0x%x", flags);
- }
- }
-
- /* Pin pages of the calling context. */
- err = pin_pages(mucmd, (char __user *)arg, argsz, 1);
- if (unlikely(err)) {
- muser_dbg("failed to pin pages: %d", err);
- return err;
- }
-
- return err;
-}
-
-static long muser_ioctl(struct mdev_device *mdev, unsigned int cmd,
- unsigned long arg)
-{
- struct muser_dev *mudev = mdev_get_drvdata(mdev);
- struct mudev_cmd mucmd = { 0 };
- int err;
-
- muser_dbg("mdev=%p, cmd=%u, arg=0x%lX", mdev, cmd, arg);
-
- if (cmd == VFIO_DEVICE_RESET) {
- if (!device_trylock(mudev->dev))
- return -EAGAIN;
- } else {
- err = muser_ioctl_setup_cmd(&mucmd, cmd, arg);
- if (err)
- return err;
- }
-
- /* Setup common mucmd records. */
- mucmd.type = MUSER_IOCTL;
- mucmd.muser_cmd.type = MUSER_IOCTL;
- mucmd.muser_cmd.ioctl.vfio_cmd = cmd;
-
- /* Process mudev_cmd in server context. */
- err = muser_process_cmd(mudev, &mucmd);
- if (err != 0) {
- muser_dbg("failed to process command: %d", err);
- err = -1;
- }
-
- if (cmd == VFIO_DEVICE_RESET) {
- device_unlock(mudev->dev);
- } else {
- /* Release resources. */
- unpin_pages(&mucmd.pg_map);
-
- /* maybe allocated for VFIO_IRQ_SET_DATA_EVENTFD */
- kfree(mucmd.fds);
- kfree(mucmd.data_fds);
- }
-
- return err;
-}
-
-static int muser_mmap(struct mdev_device *const mdev,
- struct vm_area_struct *const vma)
-{
- struct muser_dev *mudev = mdev_get_drvdata(mdev);
- struct mudev_cmd mucmd = { 0 };
- int err;
-
- BUG_ON(!mudev);
- BUG_ON(!vma);
-
- /*
- * Checking vm_flags cannot be easily done in user space as we can't
- * access mm.h, so we have to do it here. Maybe implement the reverse
- * of calc_vm_prot_bits/calc_vm_flag_bits?
- */
- if ((vma->vm_flags & ~(VM_READ | VM_WRITE | VM_SHARED | VM_MAYREAD |
- VM_MAYWRITE | VM_MAYEXEC | VM_MAYSHARE))) {
- muser_dbg("bag flags=%#lx", vma->vm_flags);
- return -EINVAL;
- }
-
- mucmd.type = MUSER_MMAP;
- mucmd.mmap_len = vma->vm_end - vma->vm_start;
-
- mucmd.muser_cmd.type = MUSER_MMAP;
- mucmd.muser_cmd.mmap.request.addr = vma->vm_pgoff << PAGE_SHIFT;
- mucmd.muser_cmd.mmap.request.len = vma->vm_end - vma->vm_start;
-
- /* Process mudev_cmd in server context. */
- err = muser_process_cmd(mudev, &mucmd);
- if (likely(err == 0)) {
- err = mucmd.muser_cmd.err;
- }
- if (unlikely(err != 0)) {
- muser_info("failed to mmap %#lx-%#lx: %d",
- mucmd.muser_cmd.mmap.request.addr,
- mucmd.muser_cmd.mmap.request.addr + mucmd.muser_cmd.mmap.request.len,
- err);
- return err;
- }
-
- return vm_insert_pages(vma, mucmd.pg_map.pages, mucmd.pg_map.nr_pages);
-}
-
-struct mdev_parent_ops muser_mdev_fops = {
- .owner = THIS_MODULE,
- .supported_type_groups = mdev_type_groups,
- .create = muser_create,
- .remove = muser_remove,
- .open = muser_open,
- .release = muser_close,
- .read = muser_read,
- .write = muser_write,
- .ioctl = muser_ioctl,
- .mmap = muser_mmap,
-};
-
-/* copy vfio-client pages(mucmd.pg_map) to server(arg) */
-static int bounce_out(void __user *arg, size_t argsz, struct mudev_cmd *mucmd)
-{
- unsigned long to_copy, left;
- void __user *to;
- void *from;
- unsigned int offset;
- int i, ret = 0;
-
- left = mucmd->pg_map.len;
- if (argsz < left)
- return -EINVAL;
-
- offset = mucmd->pg_map.offset;
-
- for (i = 0; i < mucmd->pg_map.nr_pages && ret == 0; i++) {
- to_copy = min(left, PAGE_SIZE - offset);
- to = arg + (mucmd->pg_map.len - left);
- from = page_to_virt(mucmd->pg_map.pages[i]) + offset;
-
- ret = muser_copyout(to, from, to_copy);
- if (ret)
- return ret;
-
- left -= to_copy;
-
- /* Must be zero after first iteration. */
- offset = 0;
- }
- WARN_ON(left != 0);
-
- return 0;
-}
-
-/*
- * copy from server(ubuf) to vfio-client pages(mucmd.pg_map)
- * skip seek bytes from destination before copying.
- *
- * @page_map: map representing vfio-client pages
- * @ubuf : user buffer to copy from
- * @bufsz : size of ubuf
- * @seek : bytes to be skip from page_map before copy
- */
-int bounce_in_seek(struct page_map *page_map, void __user *ubuf, size_t bufsz,
- size_t seek)
-{
- unsigned long to_copy = 0;
- void __user *from = ubuf;
- void *to;
- size_t total, offset, pgoff;
- int pgnr, i, ret;
-
- if (page_map->len < bufsz)
- return -ENOSPC;
-
- pgnr = NR_PAGES(seek) - 1;
- pgoff = seek & ~PAGE_SIZE;
- offset = page_map->offset;
-
- if (!pgnr)
- offset += pgoff;
- else
- offset = pgoff;
-
- total = bufsz;
- for (i = pgnr; i < page_map->nr_pages; i++) {
- to = page_to_virt(page_map->pages[i]) + offset;
- from += to_copy;
- to_copy = min(total, PAGE_SIZE - offset);
-
- ret = muser_copyin(to, from, to_copy);
- if (ret)
- return ret;
-
- total -= to_copy;
- offset = 0;
- }
-
- return 0;
-}
-
-/* copy from server(uaddr) to vfio-client pages(mucmd.pg_map) */
-static int bounce_in(struct mudev_cmd *mucmd, void __user *uaddr)
-{
- unsigned long to_copy, left;
- void __user *from;
- void *to;
- unsigned int offset;
- int i, ret;
-
- left = mucmd->pg_map.len;
- offset = mucmd->pg_map.offset;
-
- for (i = 0; i < mucmd->pg_map.nr_pages; i++) {
- to_copy = min(left, PAGE_SIZE - offset);
- from = uaddr + (mucmd->pg_map.len - left);
- to = page_to_virt(mucmd->pg_map.pages[i]) + offset;
-
- ret = muser_copyin(to, from, to_copy);
- if (ret)
- return ret;
-
- left -= to_copy;
-
- /* Must be zero after first iteration. */
- offset = 0;
- }
- WARN_ON(left != 0);
-
- return 0;
-}
-
-static long install_fds(struct mudev_cmd *mucmd)
-{
- int count = mucmd->muser_cmd.ioctl.data.irq_set.count;
- int i;
- long ret;
-
- mucmd->data_fds = kcalloc(count, sizeof(int32_t), GFP_KERNEL);
- if (mucmd->data_fds == NULL)
- return -ENOMEM;
-
- for (i = 0; i < count; i++) {
- if (mucmd->fds[i] == NULL) {
- mucmd->data_fds[i] = -1;
- continue;
- }
- mucmd->data_fds[i] = get_unused_fd_flags(0);
- if (mucmd->data_fds[i] < 0) {
- ret = mucmd->data_fds[i];
- muser_err("unable to get unused fd: %ld", ret);
- goto err;
- }
- fd_install(mucmd->data_fds[i], mucmd->fds[i]);
- }
-
- return 0;
-
-err:
- for (i--; i >= 0; i--)
- put_unused_fd(mucmd->data_fds[i]);
- kfree(mucmd->data_fds);
-
- return ret;
-}
-
-static inline int maybe_install_fds(struct mudev_cmd *mucmd)
-{
- unsigned int flags = mucmd->muser_cmd.ioctl.data.irq_set.flags;
- long ret = 0;
-
- if ((mucmd->muser_cmd.type == MUSER_IOCTL) &&
- (mucmd->muser_cmd.ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS)) {
- ret = -EINVAL;
- switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
- case VFIO_IRQ_SET_DATA_NONE:
- /* FIXME */
- muser_warn("ignore DATA_NONE index=%d start=%d count=%d",
- mucmd->muser_cmd.ioctl.data.irq_set.index,
- mucmd->muser_cmd.ioctl.data.irq_set.start,
- mucmd->muser_cmd.ioctl.data.irq_set.count);
- ret = 0;
- break;
- case VFIO_IRQ_SET_DATA_EVENTFD:
- ret = install_fds(mucmd);
- if (unlikely(ret))
- muser_dbg("failed to install fds: %ld", ret);
- break;
- default:
- muser_warn("bad flags=0x%x", flags);
- /* TODO: SET_DATA_BOOL */
- }
- }
-
- return ret;
-}
-
-static inline int mmap_done(struct mudev_cmd * const mucmd)
-{
- struct muser_cmd *cmd = &mucmd->muser_cmd;
- char __user *addr = (char __user *) cmd->mmap.response;
- int ret;
-
- ret = do_pin_pages(addr, mucmd->mmap_len, 1, &mucmd->pg_map);
- if (ret) {
- muser_alert("failed to pin pages: %d", ret);
- mucmd->pg_map.pages = NULL;
- mucmd->pg_map.nr_pages = 0;
- }
-
- return ret;
-}
-
-static int fd_install_dma_map(struct file *file)
-{
- int fd;
- fd = get_unused_fd_flags(0);
- if (fd < 0) {
- muser_err("unable to get unused fd: %d", fd);
- return fd;
- }
- get_file(file);
- fd_install(fd, file);
- return fd;
-}
-
-static int do_dev_cmd_wait(struct muser_dev *mudev, unsigned long arg)
-{
- int ret;
- struct mudev_cmd *mucmd;
-
- /* Block until a request come from vfio. */
- ret = wait_event_interruptible(mudev->user_wait_q,
- !list_empty(&mudev->cmd_list));
- if (unlikely(ret)) {
- muser_dbg("failed to wait for user space: %d", ret);
- return ret;
- }
-
- /* Pick and remove the mucmd from the cmd_list. */
- mutex_lock(&mudev->dev_lock);
- WARN_ON(list_empty(&mudev->cmd_list));
- mucmd = list_first_entry(&mudev->cmd_list, struct mudev_cmd, entry);
- list_del(&mucmd->entry);
- mutex_unlock(&mudev->dev_lock);
-
- /* Keep a reference to mudev_cmd in mudev. */
- WARN_ON(mudev->mucmd_pending != NULL);
- mudev->mucmd_pending = mucmd;
- /* TODO: These WARN_ON()s should really just detach mudev. */
-
- if (mucmd->muser_cmd.type == MUSER_DMA_MMAP) {
- ret = fd_install_dma_map(mucmd->muser_cmd.mmap.request.file);
- if (unlikely(ret < 0))
- return ret;
- mudev->dma_map->fd = mucmd->muser_cmd.mmap.request.fd = ret;
- }
-
- /* Populate userspace with mucmd. */
- ret = muser_copyout((void __user *)arg, &mucmd->muser_cmd,
- sizeof(struct muser_cmd));
- if (unlikely(ret))
- return -EFAULT;
-
- /* Install FDs on VFIO_SET_IRQS */
- return maybe_install_fds(mucmd);
-}
-
-static int do_dev_cmd_done(struct muser_dev *mudev, unsigned long arg)
-{
- int ret;
- struct mudev_cmd *mucmd;
- unsigned long offset;
- int mucmd_err;
-
- /* This is only called when a command is pending. */
- if (unlikely(mudev->mucmd_pending == NULL)) {
- muser_dbg("done but no command pending");
- return -EINVAL;
- }
-
- /* Fetch (and clear) the pending command. */
- mucmd = mudev->mucmd_pending;
- mudev->mucmd_pending = NULL;
-
- /* Fetch response from userspace. */
- ret = muser_copyin(&mucmd->muser_cmd, (void __user *)arg,
- sizeof(struct muser_cmd));
- if (unlikely(ret))
- return ret;
-
- mucmd_err = mucmd->muser_cmd.err;
- switch (mucmd->type) {
- case MUSER_IOCTL:
- offset = offsetof(struct muser_cmd, ioctl);
- offset += offsetof(struct muser_cmd_ioctl, data);
- ret = bounce_in(mucmd, (void __user *)(arg + offset));
- break;
- case MUSER_MMAP:
- if (!mucmd_err)
- ret = mmap_done(mucmd);
- break;
- case MUSER_READ:
- if (unlikely(mucmd_err < 0))
- muser_alert("read failed: %d", mucmd_err);
- break;
- case MUSER_WRITE:
- case MUSER_DMA_MMAP:
- case MUSER_DMA_MUNMAP:
- break;
- default:
- muser_alert("bad command %d", mucmd->type);
- ret = -EINVAL;
- break;
- }
-
- /* Wake up vfio client. */
- up(&mudev->sem);
-
- return ret;
-}
-
-static long libmuser_unl_ioctl(struct file *filep, unsigned int cmd,
- unsigned long arg)
-{
- struct muser_dev *mudev = filep->private_data;
-
- WARN_ON(mudev == NULL);
- switch (cmd) {
- case MUSER_DEV_CMD_WAIT:
- return do_dev_cmd_wait(mudev, arg);
- case MUSER_DEV_CMD_DONE:
- return do_dev_cmd_done(mudev, arg);
- }
- muser_info("bad ioctl 0x%x", cmd);
- return -EINVAL;
-}
-
-#ifdef CONFIG_COMPAT
-static long libmuser_compat_ioctl(struct file *filep,
- unsigned int cmd, unsigned long arg)
-{
- arg = (unsigned long)compat_ptr(arg);
- return libmuser_unl_ioctl(filep, cmd, arg);
-}
-#endif /* CONFIG_COMPAT */
-
-static struct muser_dev *muser_get_dev_from_minor(int minor)
-{
- struct muser_dev *mudev;
-
- /* Locate mudev using idr. */
- mutex_lock(&muser.muser_lock);
- mudev = idr_find(&muser.dev_idr, minor);
- mutex_unlock(&muser.muser_lock);
-
- return mudev;
-}
-
-static int libmuser_open(struct inode *inode, struct file *filep)
-{
- struct muser_dev *mudev;
- int opened;
-
- /* Fetch corresponding mudev. */
- mudev = muser_get_dev_from_minor(iminor(inode));
- if (!mudev)
- return -ENOENT;
-
- /* Allow only one server for each mudev. */
- opened = atomic_cmpxchg(&mudev->srv_opened, 0, 1);
- if (opened)
- return -EBUSY;
-
- WARN_ON(filep->private_data != NULL);
- filep->private_data = mudev;
-
- return 0;
-}
-
-static int libmuser_release(struct inode *inode, struct file *filep)
-{
- struct muser_dev *mudev = filep->private_data;
-
- WARN_ON(mudev == NULL);
- mutex_lock(&mudev->dev_lock);
- /*
- * FIXME must be per filep
- */
- if (mudev->mucmd_pending) {
- muser_info("moving command back in list");
- list_add_tail(&mudev->mucmd_pending->entry, &mudev->cmd_list);
- mudev->mucmd_pending = NULL;
- }
- mutex_unlock(&mudev->dev_lock);
-
- filep->private_data = NULL;
- atomic_dec(&mudev->srv_opened);
-
- return 0;
-}
-
-static inline int irq_set_data_eventfd(void __user * const buf,
- struct mudev_cmd * const mucmd)
-{
- return muser_copyout((void __user *)buf, mucmd->data_fds,
- sizeof(__s32) * mucmd->muser_cmd.ioctl.data.irq_set.count);
-}
-
-static inline int irq_set_data_bool(void __user * const buf,
- struct mudev_cmd * const mucmd)
-{
- return muser_copyout((void __user *)buf, mucmd->data_fds,
- sizeof(__u8) * mucmd->muser_cmd.ioctl.data.irq_set.count);
-}
-
-/*
- * Called by libmuser for kernel->user transfers.
- */
-static ssize_t libmuser_read(struct file *filp, char __user *buf,
- size_t bufsz, loff_t *ppos)
-{
- struct muser_dev *mudev = filp->private_data;
- struct mudev_cmd *mucmd = mudev->mucmd_pending;
- int ret = -EINVAL;
- uint32_t irq_set_flags;
-
- if (!mucmd || !mudev) {
- muser_dbg("bad arguments");
- return -EINVAL;
- }
-
- /* XXX this should be taken out when upstreaming */
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,19,67)
- if (!access_ok(VERIFY_WRITE, buf, bufsz)) {
-#else
- if (!access_ok(buf, bufsz)) {
-#endif
- muser_dbg("bad permissions");
- return -EFAULT;
- }
-
- switch (mucmd->type) {
- case MUSER_WRITE:
- ret = bounce_out(buf, bufsz, mucmd);
- if (ret) {
- muser_dbg("failed to copy to user: %d", ret);
- goto err;
- }
- break;
- case MUSER_IOCTL:
- /* FIXME move case into separate function */
- if (mucmd->muser_cmd.ioctl.vfio_cmd != VFIO_DEVICE_SET_IRQS) {
- muser_dbg("expected VFIO command %d, got %d instead",
- VFIO_DEVICE_SET_IRQS,
- mucmd->muser_cmd.ioctl.vfio_cmd);
- goto err;
- }
- irq_set_flags = mucmd->muser_cmd.ioctl.data.irq_set.flags &
- VFIO_IRQ_SET_DATA_TYPE_MASK;
- switch (irq_set_flags) {
- case VFIO_IRQ_SET_DATA_EVENTFD:
- ret = irq_set_data_eventfd((void __user *)buf, mucmd);
- if (unlikely(ret)) {
- muser_dbg("failed to set data eventfd: %d",
- ret);
- goto err;
- }
- break;
- case VFIO_IRQ_SET_DATA_BOOL:
- ret = irq_set_data_bool((void __user *)buf, mucmd);
- if (unlikely(ret))
- goto err;
- break;
- default:
- muser_dbg("bad VFIO set IRQ flags %d", irq_set_flags);
- goto err;
- }
- break;
- default:
- muser_dbg("bad muser command %d", mucmd->type);
- goto err;
- }
- return bufsz;
-
-err:
- return ret;
-}
-
-/*
- * Called by libmuser for user->kernel transfers.
- */
-static ssize_t libmuser_write(struct file *filp, const char __user *buf,
- size_t bufsz, loff_t *ppos)
-{
- struct muser_dev *mudev = filp->private_data;
- struct mudev_cmd *mucmd = mudev->mucmd_pending;
- unsigned int seek;
- int ret;
-
- if (!mucmd || !mudev) {
- muser_dbg("bad arguments");
- return -EINVAL;
- }
- /* XXX this should be taken out when upstreaming */
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,19,67)
- if (!access_ok(VERIFY_READ, buf, bufsz)) {
-#else
- if (!access_ok(buf, bufsz)) {
-#endif
- muser_dbg("bad permissions");
- return -EFAULT;
- }
-
- switch (mucmd->type) {
- case MUSER_READ:
- muser_dbg("received data from libmuser");
- dump_buffer(buf, bufsz);
- ret = bounce_in(mucmd, (void __user *)buf);
- if (ret)
- return ret;
- break;
- case MUSER_IOCTL:
- muser_dbg("received sparse mmap from libmuser");
- /*
- * copy the sparse mmap cap information after the
- * struct vfio_region_info.
- */
- seek = sizeof(struct vfio_region_info);
- ret = bounce_in_seek(&mucmd->pg_map, (void __user *)buf, bufsz,
- seek);
- if (ret)
- return ret;
- mucmd->pg_map.len -= seek;
- break;
- default:
- muser_dbg("bad command 0x%x", mucmd->type);
- return -EINVAL;
- }
-
- return bufsz;
-}
-
-static const struct file_operations libmuser_fops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = libmuser_unl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = libmuser_compat_ioctl,
-#endif
- .open = libmuser_open,
- .release = libmuser_release,
- .mmap = libmuser_mmap_dev,
- .read = libmuser_read,
- .write = libmuser_write,
-};
-
-static void muser_device_release(struct device *dev)
-{
- muser_info("muser dev released");
-}
-
-static char *muser_devnode(struct device *dev, umode_t *mode)
-{
- return kasprintf(GFP_KERNEL, DRIVER_NAME "/%s", dev_name(dev));
-}
-
-static int __init muser_init(void)
-{
- int ret;
-
- /* Initialise idr. */
- idr_init(&muser.dev_idr);
- mutex_init(&muser.muser_lock);
- INIT_LIST_HEAD(&muser.mudev_list);
-
- /* Initialise class. */
- muser.class = class_create(THIS_MODULE, DRIVER_NAME);
- if (IS_ERR(muser.class))
- return PTR_ERR(muser.class);
- muser.class->devnode = muser_devnode;
-
- /* Allocate and register a chardev for muser devices. */
- ret = alloc_chrdev_region(&muser.muser_devt, 0, MINORMASK + 1,
- DRIVER_NAME);
- if (ret)
- goto err_alloc_chrdev;
-
- cdev_init(&muser.muser_cdev, &libmuser_fops);
- ret = cdev_add(&muser.muser_cdev, muser.muser_devt, MINORMASK + 1);
- if (ret)
- goto err_cdev_add;
-
- muser.dev.class = muser.class;
- muser.dev.release = muser_device_release;
- dev_set_name(&muser.dev, "%s", DRIVER_NAME);
-
- ret = device_register(&muser.dev);
- if (ret)
- goto err_device_register;
-
- /* Register ourselves with mdev. */
- ret = mdev_register_device(&muser.dev, &muser_mdev_fops);
- if (ret)
- goto err_mdev_register_device;
-
- return 0;
-
-err_mdev_register_device:
- device_unregister(&muser.dev);
-err_device_register:
- cdev_del(&muser.muser_cdev);
-err_cdev_add:
- unregister_chrdev_region(muser.muser_devt, MINORMASK + 1);
-err_alloc_chrdev:
- class_destroy(muser.class);
- muser.class = NULL;
- return ret;
-}
-
-static void __exit muser_cleanup(void)
-{
- struct muser_dev *mudev, *tmp;
-
- /* Remove all devices. */
- mutex_lock(&muser.muser_lock);
- list_for_each_entry_safe(mudev, tmp, &muser.mudev_list, dlist_entry) {
- WARN_ON(atomic_read(&mudev->mdev_opened) ||
- atomic_read(&mudev->srv_opened));
- __muser_deinit_dev(mudev);
- kfree(mudev);
- }
- mutex_unlock(&muser.muser_lock);
-
- /* Unregister with mdev. */
- muser.dev.bus = NULL;
- mdev_unregister_device(&muser.dev);
-
- /* Cleanup everything else. */
- device_unregister(&muser.dev);
- idr_destroy(&muser.dev_idr);
- cdev_del(&muser.muser_cdev);
- unregister_chrdev_region(muser.muser_devt, MINORMASK + 1);
- class_destroy(muser.class);
- muser.class = NULL;
-}
-
-module_init(muser_init);
-module_exit(muser_cleanup);
-
-MODULE_LICENSE("Dual BSD/GPL");
diff --git a/kmod/muser.h b/kmod/muser.h
deleted file mode 100644
index 9791736..0000000
--- a/kmod/muser.h
+++ /dev/null
@@ -1,167 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-/*
- * Copyright (c) 2019, Nutanix Inc. All rights reserved.
- *
- * Author: Thanos Makatos <thanos@nutanix.com>
- * Swapnil Ingle <swapnil.ingle@nutanix.com>
- * Felipe Franciosi <felipe@nutanix.com>
- *
- */
-
-#ifndef _UAPI_LINUX_MUSER_H
-#define _UAPI_LINUX_MUSER_H
-
-#ifndef __KERNEL__
-#include <sys/types.h>
-#include <stddef.h>
-#include <errno.h>
-
-/* FIXME copied from include/linux/stddef.h, is this OK license-wise? */
-#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
-#define offsetofend(TYPE, MEMBER) \
- (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
-
-#endif
-
-#include <linux/ioctl.h>
-#include <linux/vfio.h>
-
-#define MUSER_DEVNODE "muser"
-
-enum muser_cmd_type {
- MUSER_IOCTL = 1,
- MUSER_READ,
- MUSER_WRITE,
- MUSER_MMAP,
- MUSER_DMA_MMAP,
- MUSER_DMA_MUNMAP,
-};
-
-struct muser_cmd_rw {
- size_t count;
- loff_t pos;
-};
-
-struct muser_cmd_ioctl {
- int vfio_cmd;
- union {
- struct vfio_device_info dev_info;
- struct vfio_region_info reg_info;
- struct vfio_irq_info irq_info;
- struct vfio_irq_set irq_set;
- struct vfio_group_status group_status;
- int vfio_api_version;
- int vfio_extension;
- int container_fd;
- int device_fd;
- int iommu_type;
- struct vfio_iommu_type1_info iommu_type1_info;
- struct vfio_iommu_type1_dma_map dma_map;
- struct vfio_iommu_type1_dma_unmap dma_unmap;
- } data;
-};
-
-union muser_cmd_mmap {
- struct {
- unsigned long addr; /* iova for DMA_MAP, offset for MMAP */
- unsigned long len;
- unsigned long offset;
- unsigned long flags;
- struct file *file;
- int fd;
- } request;
- unsigned long response;
-};
-
-struct muser_cmd {
- enum muser_cmd_type type;
- union {
- struct muser_cmd_rw rw;
- struct muser_cmd_ioctl ioctl;
- union muser_cmd_mmap mmap;
- };
- int err;
-};
-
-/* ioctl cmds valid for /dev/muser/<uuid> */
-#define MUSER_DEV_CMD_WAIT _IOR('M', 1, struct muser_cmd)
-#define MUSER_DEV_CMD_DONE _IOW('M', 2, struct muser_cmd)
-
-static inline ssize_t get_minsz(unsigned int cmd)
-{
- switch (cmd) {
- case VFIO_DEVICE_GET_INFO:
- return offsetofend(struct vfio_device_info, num_irqs);
- case VFIO_DEVICE_GET_REGION_INFO:
- return offsetofend(struct vfio_region_info, offset);
- case VFIO_DEVICE_GET_IRQ_INFO:
- return offsetofend(struct vfio_irq_info, count);
- case VFIO_DEVICE_SET_IRQS:
- return offsetofend(struct vfio_irq_set, count);
- case VFIO_GROUP_GET_STATUS:
- return offsetofend(struct vfio_group_status, flags);
- case VFIO_GET_API_VERSION:
- return 0;
- case VFIO_CHECK_EXTENSION:
- case VFIO_GROUP_SET_CONTAINER:
- case VFIO_GROUP_UNSET_CONTAINER:
- case VFIO_SET_IOMMU:
- return sizeof(int);
- case VFIO_IOMMU_GET_INFO:
- return offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
- case VFIO_IOMMU_MAP_DMA:
- return offsetofend(struct vfio_iommu_type1_dma_map, size);
- case VFIO_IOMMU_UNMAP_DMA:
- return offsetofend(struct vfio_iommu_type1_dma_unmap, size);
- case VFIO_GROUP_GET_DEVICE_FD:
- case VFIO_DEVICE_RESET:
- return 0;
- }
- return -EOPNOTSUPP;
-}
-
-static inline ssize_t get_argsz(unsigned int cmd, struct muser_cmd *muser_cmd)
-{
- switch (cmd) {
- case VFIO_DEVICE_GET_INFO:
- return muser_cmd->ioctl.data.dev_info.argsz;
- case VFIO_DEVICE_GET_REGION_INFO:
- return muser_cmd->ioctl.data.reg_info.argsz;
- case VFIO_DEVICE_GET_IRQ_INFO:
- return muser_cmd->ioctl.data.irq_info.argsz;
- case VFIO_DEVICE_SET_IRQS:
- return muser_cmd->ioctl.data.irq_set.argsz;
- }
-
- return -EOPNOTSUPP;
-}
-
-static inline const char* vfio_cmd_to_str(int cmd) {
- switch (cmd) {
- case VFIO_GET_API_VERSION: return "VFIO_GET_API_VERSION";
- case VFIO_CHECK_EXTENSION: return "VFIO_CHECK_EXTENSION";
- case VFIO_SET_IOMMU: return "VFIO_SET_IOMMU";
- case VFIO_GROUP_GET_STATUS: return "VFIO_GROUP_GET_STATUS";
- case VFIO_GROUP_SET_CONTAINER: return "VFIO_GROUP_SET_CONTAINER";
- case VFIO_GROUP_UNSET_CONTAINER: return "VFIO_GROUP_UNSET_CONTAINER";
- case VFIO_GROUP_GET_DEVICE_FD: return "VFIO_GROUP_GET_DEVICE_FD";
- case VFIO_DEVICE_GET_INFO: return "VFIO_DEVICE_GET_INFO";
- case VFIO_DEVICE_GET_REGION_INFO: return "VFIO_DEVICE_GET_REGION_INFO";
- case VFIO_DEVICE_GET_IRQ_INFO: return "VFIO_DEVICE_GET_IRQ_INFO";
- case VFIO_DEVICE_SET_IRQS: return "VFIO_DEVICE_SET_IRQS";
- case VFIO_DEVICE_RESET: return "VFIO_DEVICE_RESET";
- case VFIO_IOMMU_GET_INFO: return "VFIO_IOMMU_GET_INFO/VFIO_DEVICE_GET_PCI_HOT_RESET_INFO/VFIO_IOMMU_SPAPR_TCE_GET_INFO";
- case VFIO_IOMMU_MAP_DMA: return "VFIO_IOMMU_MAP_DMA/VFIO_DEVICE_PCI_HOT_RESET";
- case VFIO_IOMMU_UNMAP_DMA: return "VFIO_IOMMU_UNMAP_DMA";
- case VFIO_IOMMU_ENABLE: return "VFIO_IOMMU_ENABLE";
- case VFIO_IOMMU_DISABLE: return "VFIO_IOMMU_DISABLE";
- case VFIO_EEH_PE_OP: return "VFIO_EEH_PE_OP";
- case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_REGISTER_MEMORY";
- case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY";
- case VFIO_IOMMU_SPAPR_TCE_CREATE: return "VFIO_IOMMU_SPAPR_TCE_CREATE";
- case VFIO_IOMMU_SPAPR_TCE_REMOVE: return "VFIO_IOMMU_SPAPR_TCE_REMOVE";
- }
- return NULL;
-}
-
-#endif /* _UAPI_LINUX_MUSER_H */
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 506fb9b..44e9301 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -29,7 +29,6 @@
#
add_library(muser SHARED
- ../kmod/muser.h
vfio_user.h
muser.h
muser_priv.h
diff --git a/lib/libmuser.c b/lib/libmuser.c
index 29022af..aa1c26c 100644
--- a/lib/libmuser.c
+++ b/lib/libmuser.c
@@ -54,7 +54,6 @@
#include <time.h>
#include <sys/select.h>
-#include "../kmod/muser.h"
#include "muser.h"
#include "muser_priv.h"
#include "dma.h"
@@ -135,12 +134,6 @@ struct lm_ctx {
/* function prototypes */
-static int
-muser_dma_map(lm_ctx_t*, struct muser_cmd*);
-
-static int
-muser_dma_unmap(lm_ctx_t*, struct muser_cmd*);
-
static void
free_sparse_mmap_areas(lm_reg_info_t*);
@@ -161,53 +154,6 @@ static inline int recv_blocking(int sock, void *buf, size_t len, int flags)
}
static int
-dev_detach(lm_ctx_t *lm_ctx)
-{
- int ret = 0;
-
- if (lm_ctx->fd != -1) {
- ret = close(lm_ctx->fd);
- }
- return ret;
-}
-
-static int
-dev_attach(lm_ctx_t *lm_ctx)
-{
- char *path;
- int dev_fd;
- int err;
-
- assert(lm_ctx != NULL);
-
- err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", lm_ctx->uuid);
- if (err != (int)(strlen(MUSER_DEVNODE) + strlen(lm_ctx->uuid) + 6)) {
- return -1;
- }
-
- dev_fd = open(path, O_RDWR);
-
- free(path);
-
- return dev_fd;
-}
-
-static ssize_t
-recv_fds_kernel(lm_ctx_t *lm_ctx, void *buf, size_t size)
-{
- return read(lm_ctx->fd, buf, size);
-}
-
-static int
-get_request_kernel(lm_ctx_t *lm_ctx, struct vfio_user_header *cmd,
- int *fds __attribute__((unused)),
- int *nr_fds __attribute__((unused)))
-{
- assert(false);
- return ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd);
-}
-
-static int
init_sock(lm_ctx_t *lm_ctx)
{
struct sockaddr_un addr = { .sun_family = AF_UNIX };
@@ -626,26 +572,6 @@ get_request_sock(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
return ret;
}
-static void
-get_path_from_fd(int fd, char *buf)
-{
- int err;
- ssize_t ret;
- char pathname[PATH_MAX];
-
- err = snprintf(pathname, PATH_MAX, "/proc/self/fd/%d", fd);
- if (err >= PATH_MAX || err == -1) {
- buf[0] = '\0';
- }
- ret = readlink(pathname, buf, PATH_MAX);
- if (ret == -1) {
- ret = 0;
- } else if (ret == PATH_MAX) {
- ret -= 1;
- }
- buf[ret] = '\0';
-}
-
static ssize_t
recv_fds_sock(lm_ctx_t *lm_ctx, void *buf, size_t size)
{
@@ -663,13 +589,6 @@ static struct transport_ops {
int (*get_request)(lm_ctx_t*, struct vfio_user_header*, int *fds, int *nr_fds);
ssize_t (*recv_fds)(lm_ctx_t*, void *buf, size_t size);
} transports_ops[] = {
- [LM_TRANS_KERNEL] = {
- .init = NULL,
- .attach = dev_attach,
- .detach = dev_detach,
- .recv_fds = recv_fds_kernel,
- .get_request = get_request_kernel,
- },
[LM_TRANS_SOCK] = {
.init = init_sock,
.attach = open_sock,
@@ -996,7 +915,7 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info)
*/
static int
dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index,
- struct vfio_region_info **vfio_reg, bool is_kernel)
+ struct vfio_region_info **vfio_reg)
{
struct vfio_info_cap_header *header;
struct vfio_region_info_cap_type *type = NULL;
@@ -1006,7 +925,6 @@ dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index,
size_t type_size = 0;
size_t sparse_size = 0;
size_t cap_size;
- ssize_t ret;
void *cap_ptr;
if (reg_index == LM_DEV_MIGRATION_REG_IDX) {
@@ -1023,19 +941,6 @@ dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index,
return 0;
}
- /*
- * If vfio_reg does not have enough space to accommodate sparse info then
- * set the argsz with the expected size and return. This behaviour
- * is only for kernel/muser.ko, where the request comes from kernel/vfio.
- */
-
- if ((*vfio_reg)->argsz < cap_size + sizeof(**vfio_reg) && is_kernel) {
- lm_log(lm_ctx, LM_DBG, "vfio_reg too small=%d\n", (*vfio_reg)->argsz);
- (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg);
- (*vfio_reg)->cap_offset = 0;
- return 0;
- }
-
/* TODO doesn't need to be calloc, we overwrite it entirely */
header = calloc(1, cap_size);
if (header == NULL) {
@@ -1082,25 +987,16 @@ dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index,
*/
(*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS;
- if (is_kernel) {
- /* write the sparse mmap cap info to vfio-client user pages */
- ret = write(lm_ctx->conn_fd, header, cap_size);
- if (ret != (ssize_t)cap_size) {
- free(header);
- return -EIO;
- }
- } else {
- (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg);
- *vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz);
- if (*vfio_reg == NULL) {
- free(header);
- return -ENOMEM;
- }
-
- cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset;
- memcpy(cap_ptr, header, cap_size);
+ (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg);
+ *vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz);
+ if (*vfio_reg == NULL) {
+ free(header);
+ return -ENOMEM;
}
+ cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset;
+ memcpy(cap_ptr, header, cap_size);
+
free(header);
return 0;
}
@@ -1149,8 +1045,7 @@ dump_buffer(const char *prefix, const char *buf, uint32_t count)
#endif
static long
-dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg,
- bool is_kernel)
+dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg)
{
lm_reg_info_t *lm_reg;
int err;
@@ -1171,8 +1066,7 @@ dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg,
(*vfio_reg)->flags = lm_reg->flags;
(*vfio_reg)->size = lm_reg->size;
- err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg,
- is_kernel);
+ err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg);
if (err) {
return err;
}
@@ -1203,159 +1097,6 @@ dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info)
return 0;
}
-static long
-do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data)
-{
- struct vfio_region_info *reg_info;
- int err = -ENOTSUP;
-
- assert(lm_ctx != NULL);
- switch (cmd_ioctl->vfio_cmd) {
- case VFIO_DEVICE_GET_INFO:
- err = dev_get_info(lm_ctx, &cmd_ioctl->data.dev_info);
- break;
- case VFIO_DEVICE_GET_REGION_INFO:
- reg_info = &cmd_ioctl->data.reg_info;
- err = dev_get_reginfo(lm_ctx, &reg_info, true);
- break;
- case VFIO_DEVICE_GET_IRQ_INFO:
- err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info);
- break;
- case VFIO_DEVICE_SET_IRQS:
- err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data);
- break;
- case VFIO_DEVICE_RESET:
- err = device_reset(lm_ctx);
- break;
- case VFIO_GROUP_GET_STATUS:
- cmd_ioctl->data.group_status.flags = VFIO_GROUP_FLAGS_VIABLE;
- err = 0;
- break;
- case VFIO_GET_API_VERSION:
- cmd_ioctl->data.vfio_api_version = VFIO_API_VERSION;
- err = 0;
- break;
- case VFIO_CHECK_EXTENSION:
- if (cmd_ioctl->data.vfio_extension == VFIO_TYPE1v2_IOMMU) {
- err = 0;
- }
- break;
- case VFIO_IOMMU_GET_INFO:
- cmd_ioctl->data.iommu_type1_info.flags = VFIO_IOMMU_INFO_PGSIZES;
- cmd_ioctl->data.iommu_type1_info.iova_pgsizes = sysconf(_SC_PAGESIZE);
- err = 0;
- break;
- case VFIO_IOMMU_MAP_DMA:
- {
- struct muser_cmd muser_cmd = {
- .type = MUSER_DMA_MMAP,
- .mmap.request.fd = *((int*)data),
- .mmap.request.addr = cmd_ioctl->data.dma_map.iova,
- .mmap.request.len = cmd_ioctl->data.dma_map.size,
- .mmap.request.offset = cmd_ioctl->data.dma_map.vaddr
- };
- err = muser_dma_map(lm_ctx, &muser_cmd);
- }
- break;
- case VFIO_IOMMU_UNMAP_DMA:
- {
- struct muser_cmd muser_cmd = {
- .type = MUSER_DMA_MUNMAP,
- .mmap.request.addr = cmd_ioctl->data.dma_unmap.iova,
- .mmap.request.len = cmd_ioctl->data.dma_unmap.size
- };
- err = muser_dma_unmap(lm_ctx, &muser_cmd);
- }
- break;
- /* FIXME */
- case VFIO_GROUP_SET_CONTAINER:
- case VFIO_GROUP_UNSET_CONTAINER:
- case VFIO_SET_IOMMU:
- err = 0;
- break;
- default:
- lm_log(lm_ctx, LM_ERR, "bad comamnd %d", cmd_ioctl->vfio_cmd);
- }
-
- return err;
-}
-
-static int
-muser_dma_unmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- int err;
-
- lm_log(lm_ctx, LM_INF, "removing DMA region iova=%#lx-%#lx\n",
- cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len);
-
- if (lm_ctx->unmap_dma == NULL) {
- return 0;
- }
-
- if (lm_ctx->dma == NULL) {
- lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
- return -EINVAL;
- }
-
- err = dma_controller_remove_region(lm_ctx->dma,
- cmd->mmap.request.addr,
- cmd->mmap.request.len,
- lm_ctx->unmap_dma, lm_ctx->pvt);
- if (err != 0 && err != -ENOENT) {
- lm_log(lm_ctx, LM_ERR, "failed to remove DMA region %#lx-%#lx: %s\n",
- cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len,
- strerror(-err));
- }
-
- return err;
-}
-
-static int
-muser_dma_map(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- int err;
- char buf[PATH_MAX];
-
- get_path_from_fd(cmd->mmap.request.fd, buf);
-
- lm_log(lm_ctx, LM_INF, "%s DMA region fd=%d path=%s iova=%#lx-%#lx "
- "offset=%#lx\n", lm_ctx->unmap_dma == NULL ? "ignoring" : "adding",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len,
- cmd->mmap.request.offset);
-
- if (lm_ctx->unmap_dma == NULL) {
- return 0;
- }
-
- if (lm_ctx->dma == NULL) {
- lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
- return -EINVAL;
- }
-
- err = dma_controller_add_region(lm_ctx->dma,
- cmd->mmap.request.addr,
- cmd->mmap.request.len,
- cmd->mmap.request.fd,
- cmd->mmap.request.offset);
- if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to add DMA region fd=%d path=%s %#lx-%#lx: "
- "%d\n", cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len, err);
- } else {
- err = 0;
- }
-
- if (lm_ctx->map_dma != NULL) {
- lm_ctx->map_dma(lm_ctx->pvt, cmd->mmap.request.addr,
- cmd->mmap.request.len);
- }
-
- return err;
-}
-
int
muser_send_fds(int sock, int *fds, size_t count) {
struct msghdr msg = { 0 };
@@ -1422,77 +1163,8 @@ muser_recv_fds(int sock, int *fds, size_t count)
}
/*
- * Callback that is executed when device memory is to be mmap'd.
- *
- * TODO vfio-over-socket: each PCI region can be sparsely memory mapped, so
- * there can be multiple mapped regions per PCI region. We need to make these
- * mapped regions persistent. One way would be to store each sparse region as
- * an individual file named after the memory range, e.g.
- * /dev/shm/muser/<UUID>/<region>/<offset>-<length> (the <region> can be <bar0>,
- * <rom> etc.).
- *
- * Another way would be to create one file per PCI region and then
- * tell libvfio which offset of each file corresponds to each region. The
- * mapping between sparse regions and file offsets can be 1:1, so there can be
- * large gaps in file which should be fine since it will be sparsely allocated.
- * Alternatively, each sparse region can be put right next to each other so
- * we'll need some kind of translation.
- *
- * However this functionality is implemented, it must be provided by libmuser.
- * For now we don't do anything (except for receiving the file descriptors)
- * and leave it to the device implementation to handle.
- */
-static int
-__attribute__((unused)) muser_mmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- int region, err = 0;
- unsigned long addr;
- unsigned long len = cmd->mmap.request.len;
- loff_t offset = cmd->mmap.request.addr;
-
- region = lm_get_region(offset, len, &offset);
- if (region < 0) {
- lm_log(lm_ctx, LM_ERR, "bad region %d\n", region);
- err = EINVAL;
- goto out;
- }
-
- if (lm_ctx->pci_info.reg_info[region].map == NULL) {
- lm_log(lm_ctx, LM_ERR, "region not mmapable\n");
- err = ENOTSUP;
- goto out;
- }
-
- addr = lm_ctx->pci_info.reg_info[region].map(lm_ctx->pvt, offset, len);
- if ((void *)addr == MAP_FAILED) {
- err = errno;
- lm_log(lm_ctx, LM_ERR, "failed to mmap: %m\n");
- goto out;
- }
- cmd->mmap.response = addr;
-
- /* FIXME */
- if (lm_ctx->trans == LM_TRANS_SOCK) {
- err = muser_send_fds(lm_ctx->conn_fd, (int*)&addr, 1);
- if (err == -1) {
- lm_log(lm_ctx, LM_ERR, "failed to send fd=%d: %d, %m\n",
- *((int*)&addr), err);
- }
- err = 0;
- }
-
-out:
- if (err != 0) {
- lm_log(lm_ctx, LM_ERR, "failed to mmap device memory %#x-%#lx: %s\n",
- offset, offset + len, strerror(err));
- }
-
- return -err;
-}
-
-/*
- * Returns the number of bytes communicated to the kernel (may be less than
- * ret), or a negative number on error.
+ * Returns the number of bytes sent (may be less than ret), or a negative
+ * number on error.
*/
static int
post_read(lm_ctx_t *lm_ctx, char *rwbuf, ssize_t count)
@@ -1811,7 +1483,7 @@ handle_migration_region_access(lm_ctx_t *lm_ctx, char *buf, size_t count,
}
static ssize_t
-do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
+do_access(lm_ctx_t *lm_ctx, char *buf, uint8_t count, uint64_t pos, bool is_write)
{
int idx;
loff_t offset;
@@ -1819,7 +1491,7 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
assert(lm_ctx != NULL);
assert(buf != NULL);
- assert(count > 0);
+ assert(count == 1 || count == 2 || count == 4 || count == 8);
pci_info = &lm_ctx->pci_info;
idx = lm_get_region(pos, count, &offset);
@@ -1864,12 +1536,15 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
* error.
*
* TODO function name same lm_access_t, fix
+ * FIXME we must be able to return values spanning the full uint32_t range, or
+ * a negative value on error. Better to make the return value an int and
+ * return the number of bytes processed via an argument.
*/
ssize_t
-lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
+lm_access(lm_ctx_t *lm_ctx, char *buf, uint32_t count, uint64_t *ppos,
bool is_write)
{
- unsigned int done = 0;
+ uint32_t done = 0;
int ret;
assert(lm_ctx != NULL);
@@ -1879,7 +1554,10 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
size_t size;
/*
* Limit accesses to qword and enforce alignment. Figure out whether
- * the PCI spec requires this.
+ * the PCI spec requires this
+ * FIXME while this makes sense for registers, we might be able to relax
+ * this requirement and make some transfers more efficient. Maybe make
+ * this a per-region option that can be set by the user?
*/
if (count >= 8 && !(*ppos % 8)) {
size = 8;
@@ -1912,50 +1590,52 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
}
static inline int
-muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write,
- void **data)
+muser_access(lm_ctx_t *lm_ctx, bool is_write, void **data, uint32_t count,
+ uint64_t *pos)
{
struct vfio_user_region_access *region_access;
char *rwbuf;
int err;
- size_t count = 0, _count;
- ssize_t ret;
+ uint32_t processed = 0, _count;
+ int ret;
+
+ assert(pos != NULL);
/* TODO how big do we expect count to be? Can we use alloca(3) instead? */
- region_access = calloc(1, sizeof(*region_access) + cmd->rw.count);
+ region_access = calloc(1, sizeof(*region_access) + count);
if (region_access == NULL) {
lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n");
return -1;
}
rwbuf = (char*)(region_access + 1);
- lm_log(lm_ctx, LM_DBG, "%s %#lx-%#lx", is_write ? "W" : "R", cmd->rw.pos,
- cmd->rw.pos + cmd->rw.count - 1);
+ lm_log(lm_ctx, LM_DBG, "%s %#lx-%#lx", is_write ? "W" : "R", *pos,
+ *pos + count - 1);
- /* copy data to be written from kernel to user space */
+ /* receive data to be written */
if (is_write) {
- err = read(lm_ctx->conn_fd, rwbuf, cmd->rw.count);
+ err = read(lm_ctx->conn_fd, rwbuf, count);
/*
* FIXME this is wrong, we should be checking for
- * err != cmd->rw.count
+ * err != count
*/
if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s",
+ lm_log(lm_ctx, LM_ERR, "failed to receive write payload: %s",
strerror(errno));
goto out;
}
err = 0;
#ifdef LM_VERBOSE_LOGGING
- dump_buffer("buffer write", rwbuf, cmd->rw.count);
+ dump_buffer("buffer write", rwbuf, count);
#endif
}
- count = _count = cmd->rw.count;
- cmd->err = muser_pci_hdr_access(lm_ctx, &_count, &cmd->rw.pos,
- is_write, rwbuf);
- if (cmd->err) {
+ _count = count;
+ ret = muser_pci_hdr_access(lm_ctx, &_count, pos, is_write, rwbuf);
+ if (ret != 0) {
+ /* FIXME shouldn't we fail here? */
lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %s",
- strerror(-cmd->err));
+ strerror(-ret));
#ifdef LM_VERBOSE_LOGGING
dump_buffer("buffer write", rwbuf, _count);
#endif
@@ -1965,11 +1645,10 @@ muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write,
* processed is how much has been handled by muser_pci_hdr_access,
* _count is how much there's left to be processed by lm_access
*/
- count -= _count;
- ret = lm_access(lm_ctx, rwbuf + count, _count, &cmd->rw.pos,
- is_write);
+ processed = count - _count;
+ ret = lm_access(lm_ctx, rwbuf + processed, _count, pos, is_write);
if (ret >= 0) {
- ret += count;
+ ret += processed;
if (data != NULL) {
/*
* FIXME the spec doesn't specify whether the reset of the
@@ -1994,58 +1673,6 @@ out:
return ret;
}
-static int
-__attribute__((unused)) muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- void *data = NULL;
- size_t size = 0;
- int ret;
- uint32_t flags;
-
- /* TODO make this a function that returns the size */
- switch (cmd->ioctl.vfio_cmd) {
- case VFIO_DEVICE_SET_IRQS:
- flags = cmd->ioctl.data.irq_set.flags;
- switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
- case VFIO_IRQ_SET_DATA_EVENTFD:
- size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count;
- break;
- case VFIO_IRQ_SET_DATA_BOOL:
- size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count;
- break;
- }
- break;
- case VFIO_IOMMU_MAP_DMA:
- size = sizeof(int);
- break;
- }
-
- if (size != 0) {
- data = calloc(1, size); /* TODO use alloca */
- if (data == NULL) {
-#ifdef DEBUG
- perror("calloc");
-#endif
- return -1;
- }
- ret = transports_ops[lm_ctx->trans].recv_fds(lm_ctx, data, size);
- if (ret < 0) {
- goto out;
- }
- if (ret != (int)size) {
- lm_log(lm_ctx, LM_ERR, "short read for fds\n");
- return -EINVAL;
- }
- }
-
- ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, data);
-
-out:
-
- free(data);
- return ret;
-}
-
static int handle_device_get_region_info(lm_ctx_t *lm_ctx,
struct vfio_user_header *hdr,
struct vfio_region_info **dev_reg_info)
@@ -2069,7 +1696,7 @@ static int handle_device_get_region_info(lm_ctx_t *lm_ctx,
return -errno;
}
- ret = dev_get_reginfo(lm_ctx, &reg_info, false);
+ ret = dev_get_reginfo(lm_ctx, &reg_info);
if (ret < 0) {
free(reg_info);
return ret;
@@ -2279,7 +1906,7 @@ handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
void **data, size_t *len)
{
struct vfio_user_region_access region_access;
- struct muser_cmd muser_cmd = { 0, };
+ uint64_t count, offset;
int ret;
assert(lm_ctx != NULL);
@@ -2287,14 +1914,13 @@ handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
assert(data != NULL);
/*
- * TODO if muser_access doesn't need to handle the kernel case, then we can
- * avoid having to do an additional read/recv inside muser_access (one recv
- * for struct region_access and another for the write data) by doing a
- * single recvmsg here with an iovec where the first element of the array
- * will be struct vfio_user_region_access and the second a buffer if it's a
- * write. The size of the write buffer is:
- * hdr->msg_size - sizeof *hdr - sizeof region_access,
- * and should be equal to region_access.count.
+ * TODO Since muser_access doesn't have to handle the kernel case any more,
+ * we can avoid having to do an additional read/recv inside muser_access
+ * (one recv for struct region_access and another for the write data) by
+ * doing a single recvmsg here with an iovec where the first element of the
+ * array will be struct vfio_user_region_access and the second a buffer if
+ * it's a write. The size of the write buffer is: hdr->msg_size - sizeof
+ * *hdr - sizeof region_access, and should be equal to region_access.count.
*/
hdr->msg_size -= sizeof *hdr;
@@ -2317,11 +1943,11 @@ handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
region_access.region, region_access.count);
return -EINVAL;
}
- muser_cmd.rw.count = region_access.count;
- muser_cmd.rw.pos = region_to_offset(region_access.region) + region_access.offset;
+ count = region_access.count;
+ offset = region_to_offset(region_access.region) + region_access.offset;
- ret = muser_access(lm_ctx, &muser_cmd, hdr->cmd == VFIO_USER_REGION_WRITE,
- data);
+ ret = muser_access(lm_ctx, hdr->cmd == VFIO_USER_REGION_WRITE,
+ data, count, &offset);
if (ret != (int)region_access.count) {
lm_log(lm_ctx, LM_ERR, "bad region access acount, expected=%d, actual=%d",
region_access.count, ret);
@@ -2331,7 +1957,6 @@ handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
}
return ret;
}
- assert(muser_cmd.err == 0);
*len = sizeof(region_access);
if (hdr->cmd == VFIO_USER_REGION_READ) {
@@ -2711,6 +2336,8 @@ lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex)
void
lm_ctx_destroy(lm_ctx_t *lm_ctx)
{
+ int ret;
+
if (lm_ctx == NULL) {
return;
}
@@ -2722,32 +2349,29 @@ lm_ctx_destroy(lm_ctx_t *lm_ctx)
* is called since it might delete files it did not create. Improve by
* acquiring a lock on the directory.
*/
- if (lm_ctx->trans == LM_TRANS_SOCK) {
- int ret;
-
- if (lm_ctx->iommu_dir_fd != -1) {
- if ((ret = unlinkat(lm_ctx->iommu_dir_fd, IOMMU_GRP_NAME, 0)) == -1
- && errno != ENOENT) {
- lm_log(lm_ctx, LM_DBG, "failed to remove " IOMMU_GRP_NAME ": "
- "%m\n");
- }
- if ((ret = unlinkat(lm_ctx->iommu_dir_fd, MUSER_SOCK, 0)) == -1 &&
- errno != ENOENT) {
- lm_log(lm_ctx, LM_DBG, "failed to remove " MUSER_SOCK ": %m\n");
- }
- if (close(lm_ctx->iommu_dir_fd) == -1) {
- lm_log(lm_ctx, LM_DBG, "failed to close IOMMU dir fd %d: %m\n",
- lm_ctx->iommu_dir_fd);
- }
+
+ if (lm_ctx->iommu_dir_fd != -1) {
+ if ((ret = unlinkat(lm_ctx->iommu_dir_fd, IOMMU_GRP_NAME, 0)) == -1
+ && errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove " IOMMU_GRP_NAME ": "
+ "%m\n");
}
- if (lm_ctx->iommu_dir != NULL) {
- if ((ret = rmdir(lm_ctx->iommu_dir)) == -1 && errno != ENOENT) {
- lm_log(lm_ctx, LM_DBG, "failed to remove %s: %m\n",
- lm_ctx->iommu_dir);
- }
- free(lm_ctx->iommu_dir);
+ if ((ret = unlinkat(lm_ctx->iommu_dir_fd, MUSER_SOCK, 0)) == -1 &&
+ errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove " MUSER_SOCK ": %m\n");
+ }
+ if (close(lm_ctx->iommu_dir_fd) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IOMMU dir fd %d: %m\n",
+ lm_ctx->iommu_dir_fd);
}
}
+ if (lm_ctx->iommu_dir != NULL) {
+ if ((ret = rmdir(lm_ctx->iommu_dir)) == -1 && errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove %s: %m\n",
+ lm_ctx->iommu_dir);
+ }
+ free(lm_ctx->iommu_dir);
+ }
free(lm_ctx->pci_config_space);
transports_ops[lm_ctx->trans].detach(lm_ctx);
@@ -2913,17 +2537,11 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
return NULL;
}
- if (dev_info->trans < 0 || dev_info->trans >= LM_TRANS_MAX) {
+ if (dev_info->trans != LM_TRANS_SOCK) {
errno = EINVAL;
return NULL;
}
- if ((dev_info->flags & LM_FLAG_ATTACH_NB) != 0 &&
- dev_info->trans != LM_TRANS_SOCK) {
- errno = EINVAL;
- return NULL;
- }
-
/*
* FIXME need to check that the number of MSI and MSI-X IRQs are valid
* (1, 2, 4, 8, 16 or 32 for MSI and up to 2048 for MSI-X).
diff --git a/lib/libmuser_pci.c b/lib/libmuser_pci.c
index 711acc4..2846301 100644
--- a/lib/libmuser_pci.c
+++ b/lib/libmuser_pci.c
@@ -281,11 +281,11 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset,
* @count: output parameter that receives the number of bytes read/written
*/
static inline int
-muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool is_write,
+muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool is_write,
char *buf)
{
- size_t _count;
+ uint32_t _count;
loff_t _pos;
int err = 0;
@@ -308,16 +308,16 @@ muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
}
static inline bool
-muser_is_pci_hdr_access(loff_t pos)
+muser_is_pci_hdr_access(uint64_t pos)
{
- const off_t off = (loff_t) region_to_offset(LM_DEV_CFG_REG_IDX);
- return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF;
+ const uint64_t off = region_to_offset(LM_DEV_CFG_REG_IDX);
+ return pos >= off && pos - off < PCI_STD_HEADER_SIZEOF;
}
/* FIXME this function is misleading, remove it */
int
-muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool is_write,
+muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool is_write,
char *buf)
{
assert(lm_ctx != NULL);
diff --git a/lib/muser_priv.h b/lib/muser_priv.h
index c45a8f3..097874a 100644
--- a/lib/muser_priv.h
+++ b/lib/muser_priv.h
@@ -38,8 +38,8 @@
extern char *irq_to_str[];
int
-muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool write, char *buf);
+muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool write, char *buf);
lm_reg_info_t *
lm_get_region_info(lm_ctx_t *lm_ctx);
@@ -86,6 +86,72 @@ send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
struct vfio_user_header *hdr,
void *recv_data, size_t recv_len);
+/* FIXME copied from include/linux/stddef.h, is this OK license-wise? */
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
+
+static inline ssize_t get_minsz(unsigned int cmd)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ return offsetofend(struct vfio_device_info, num_irqs);
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return offsetofend(struct vfio_region_info, offset);
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ return offsetofend(struct vfio_irq_info, count);
+ case VFIO_DEVICE_SET_IRQS:
+ return offsetofend(struct vfio_irq_set, count);
+ case VFIO_GROUP_GET_STATUS:
+ return offsetofend(struct vfio_group_status, flags);
+ case VFIO_GET_API_VERSION:
+ return 0;
+ case VFIO_CHECK_EXTENSION:
+ case VFIO_GROUP_SET_CONTAINER:
+ case VFIO_GROUP_UNSET_CONTAINER:
+ case VFIO_SET_IOMMU:
+ return sizeof(int);
+ case VFIO_IOMMU_GET_INFO:
+ return offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+ case VFIO_IOMMU_MAP_DMA:
+ return offsetofend(struct vfio_iommu_type1_dma_map, size);
+ case VFIO_IOMMU_UNMAP_DMA:
+ return offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+ case VFIO_GROUP_GET_DEVICE_FD:
+ case VFIO_DEVICE_RESET:
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
+static inline const char* vfio_cmd_to_str(int cmd) {
+ switch (cmd) {
+ case VFIO_GET_API_VERSION: return "VFIO_GET_API_VERSION";
+ case VFIO_CHECK_EXTENSION: return "VFIO_CHECK_EXTENSION";
+ case VFIO_SET_IOMMU: return "VFIO_SET_IOMMU";
+ case VFIO_GROUP_GET_STATUS: return "VFIO_GROUP_GET_STATUS";
+ case VFIO_GROUP_SET_CONTAINER: return "VFIO_GROUP_SET_CONTAINER";
+ case VFIO_GROUP_UNSET_CONTAINER: return "VFIO_GROUP_UNSET_CONTAINER";
+ case VFIO_GROUP_GET_DEVICE_FD: return "VFIO_GROUP_GET_DEVICE_FD";
+ case VFIO_DEVICE_GET_INFO: return "VFIO_DEVICE_GET_INFO";
+ case VFIO_DEVICE_GET_REGION_INFO: return "VFIO_DEVICE_GET_REGION_INFO";
+ case VFIO_DEVICE_GET_IRQ_INFO: return "VFIO_DEVICE_GET_IRQ_INFO";
+ case VFIO_DEVICE_SET_IRQS: return "VFIO_DEVICE_SET_IRQS";
+ case VFIO_DEVICE_RESET: return "VFIO_DEVICE_RESET";
+ case VFIO_IOMMU_GET_INFO: return "VFIO_IOMMU_GET_INFO/VFIO_DEVICE_GET_PCI_HOT_RESET_INFO/VFIO_IOMMU_SPAPR_TCE_GET_INFO";
+ case VFIO_IOMMU_MAP_DMA: return "VFIO_IOMMU_MAP_DMA/VFIO_DEVICE_PCI_HOT_RESET";
+ case VFIO_IOMMU_UNMAP_DMA: return "VFIO_IOMMU_UNMAP_DMA";
+ case VFIO_IOMMU_ENABLE: return "VFIO_IOMMU_ENABLE";
+ case VFIO_IOMMU_DISABLE: return "VFIO_IOMMU_DISABLE";
+ case VFIO_EEH_PE_OP: return "VFIO_EEH_PE_OP";
+ case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_REGISTER_MEMORY";
+ case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY";
+ case VFIO_IOMMU_SPAPR_TCE_CREATE: return "VFIO_IOMMU_SPAPR_TCE_CREATE";
+ case VFIO_IOMMU_SPAPR_TCE_REMOVE: return "VFIO_IOMMU_SPAPR_TCE_REMOVE";
+ }
+ return NULL;
+}
+
#endif /* MUSER_PRIV_H */
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/patches/vfio.diff b/patches/vfio.diff
deleted file mode 100644
index d19da2e..0000000
--- a/patches/vfio.diff
+++ /dev/null
@@ -1,192 +0,0 @@
-diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
-index a3030cd..ab1b82c 100644
---- a/drivers/vfio/vfio.c
-+++ b/drivers/vfio/vfio.c
-@@ -2019,15 +2019,24 @@ static int vfio_register_iommu_notifier(struct vfio_group *group,
- int ret;
-
- ret = vfio_group_add_container_user(group);
-- if (ret)
-+ if (ret) {
-+ pr_info("vfio_group_add_container_user failed with %d\n", ret);
- return -EINVAL;
-+ }
-
- container = group->container;
- driver = container->iommu_driver;
-- if (likely(driver && driver->ops->register_notifier))
-+ if (likely(driver && driver->ops->register_notifier)) {
- ret = driver->ops->register_notifier(container->iommu_data,
-- events, nb);
-- else
-+ events, nb);
-+ if (unlikely(!ret) && driver->ops->retro_notify) {
-+ ret = driver->ops->retro_notify(container->iommu_data);
-+ if (unlikely((ret & NOTIFY_BAD) == NOTIFY_BAD))
-+ ret = -ENOTTY;
-+ else
-+ ret = 0;
-+ }
-+ } else
- ret = -ENOTTY;
-
- vfio_group_try_dissolve_container(group);
-@@ -2140,6 +2149,7 @@ int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
- ret = vfio_register_group_notifier(group, events, nb);
- break;
- default:
-+ pr_info("bad notification type %d\n", type);
- ret = -EINVAL;
- }
-
-diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
-index d0f731c..b47b8f96 100644
---- a/drivers/vfio/vfio_iommu_type1.c
-+++ b/drivers/vfio/vfio_iommu_type1.c
-@@ -558,8 +558,10 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
- return -EINVAL;
-
- /* Supported for v2 version only */
-- if (!iommu->v2)
-+ if (!iommu->v2) {
-+ pr_debug("non v2 IOMMU\n");
- return -EACCES;
-+ }
-
- mutex_lock(&iommu->lock);
-
-@@ -1050,6 +1052,30 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
- return ret;
- }
-
-+static int vfio_dma_map_trigger_notifiers(struct vfio_iommu * const iommu,
-+ struct vfio_dma const * const dma)
-+
-+{
-+ struct vfio_iommu_type1_dma_map nb_map = {0};
-+
-+ BUG_ON(!iommu);
-+ BUG_ON(!dma);
-+
-+ nb_map.flags = dma->prot;
-+
-+ if ((dma->prot & IOMMU_READ) == IOMMU_READ)
-+ nb_map.flags |= VFIO_DMA_MAP_FLAG_READ;
-+ if ((dma->prot & IOMMU_WRITE) == IOMMU_WRITE)
-+ nb_map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
-+ nb_map.vaddr = dma->vaddr;
-+ nb_map.iova = dma->iova;
-+ nb_map.size = dma->size;
-+
-+ return blocking_notifier_call_chain(&iommu->notifier,
-+ VFIO_IOMMU_NOTIFY_DMA_MAP,
-+ &nb_map);
-+}
-+
- static int vfio_dma_do_map(struct vfio_iommu *iommu,
- struct vfio_iommu_type1_dma_map *map)
- {
-@@ -1139,13 +1165,25 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
- vfio_link_dma(iommu, dma);
-
- /* Don't pin and map if container doesn't contain IOMMU capable domain*/
-- if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
-+ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
- dma->size = size;
-- else
-+ ret = 0;
-+ } else
- ret = vfio_pin_map_dma(iommu, dma, size);
-
- out_unlock:
- mutex_unlock(&iommu->lock);
-+ /* FIXME is the following safe without having acquired the mutex? */
-+ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) && !ret) {
-+ ret = vfio_dma_map_trigger_notifiers(iommu, dma);
-+ /* FIXME proceed or clean up and fail? */
-+ if ((ret & NOTIFY_BAD) == NOTIFY_BAD) {
-+ pr_debug("failed to trigger notifier(s): %d\n", ret);
-+ ret = -EINVAL;
-+ } else
-+ ret = 0;
-+ }
-+
- return ret;
- }
-
-@@ -1504,8 +1542,11 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
-
- dma = rb_entry(n, struct vfio_dma, node);
-
-- if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
-+ if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list))) {
-+ pr_debug("DMA region %llx-%llx still pinned\n",
-+ dma->iova, dma->iova + dma->size);
- break;
-+ }
- }
- /* mdev vendor driver must unregister notifier */
- WARN_ON(iommu->notifier.head);
-@@ -1740,7 +1781,7 @@ static int vfio_iommu_type1_register_notifier(void *iommu_data,
- struct vfio_iommu *iommu = iommu_data;
-
- /* clear known events */
-- *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-+ *events &= ~(VFIO_IOMMU_NOTIFY_DMA_MAP | VFIO_IOMMU_NOTIFY_DMA_UNMAP);
-
- /* refuse to register if still events remaining */
- if (*events)
-@@ -1749,6 +1790,25 @@ static int vfio_iommu_type1_register_notifier(void *iommu_data,
- return blocking_notifier_chain_register(&iommu->notifier, nb);
- }
-
-+static int vfio_iommu_type1_retro_notify(void *iommu_data)
-+{
-+ int err = NOTIFY_OK;
-+ struct vfio_iommu *iommu;
-+ struct vfio_dma *pos, *n;
-+
-+ BUG_ON(!iommu_data);
-+
-+ iommu = (struct vfio_iommu*)iommu_data;
-+
-+ rbtree_postorder_for_each_entry_safe(pos, n, &iommu->dma_list, node) {
-+ err = vfio_dma_map_trigger_notifiers(iommu, pos);
-+ if ((err & NOTIFY_BAD) == NOTIFY_BAD)
-+ break;
-+ }
-+
-+ return err;
-+}
-+
- static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
- struct notifier_block *nb)
- {
-@@ -1769,6 +1829,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
- .unpin_pages = vfio_iommu_type1_unpin_pages,
- .register_notifier = vfio_iommu_type1_register_notifier,
- .unregister_notifier = vfio_iommu_type1_unregister_notifier,
-+ .retro_notify = vfio_iommu_type1_retro_notify,
- };
-
- static int __init vfio_iommu_type1_init(void)
-diff --git a/include/linux/vfio.h b/include/linux/vfio.h
-index 66741ab0..10ee80b 100644
---- a/include/linux/vfio.h
-+++ b/include/linux/vfio.h
-@@ -85,6 +85,7 @@ struct vfio_iommu_driver_ops {
- struct notifier_block *nb);
- int (*unregister_notifier)(void *iommu_data,
- struct notifier_block *nb);
-+ int (*retro_notify)(void *iommu_data);
- };
-
- extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
-@@ -118,6 +119,7 @@ enum vfio_notify_type {
-
- /* events for VFIO_IOMMU_NOTIFY */
- #define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0)
-+#define VFIO_IOMMU_NOTIFY_DMA_MAP BIT(1)
-
- /* events for VFIO_GROUP_NOTIFY */
- #define VFIO_GROUP_NOTIFY_SET_KVM BIT(0)