aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorFelipe Franciosi <felipe@nutanix.com>2019-07-02 14:06:42 +0100
committerFelipe Franciosi <felipe@nutanix.com>2019-09-05 16:45:35 +0100
commitf8ef2771ca6c05dadd3188099eb678e6135e12e2 (patch)
tree1629283ee553622ce99477c63da4994d4c87bc0f /lib
downloadlibvfio-user-f8ef2771ca6c05dadd3188099eb678e6135e12e2.zip
libvfio-user-f8ef2771ca6c05dadd3188099eb678e6135e12e2.tar.gz
libvfio-user-f8ef2771ca6c05dadd3188099eb678e6135e12e2.tar.bz2
Initial commit
Diffstat (limited to 'lib')
-rw-r--r--lib/.indent.pro4
-rw-r--r--lib/CMakeLists.txt46
-rw-r--r--lib/common.h60
-rw-r--r--lib/dma.c331
-rw-r--r--lib/dma.h241
-rw-r--r--lib/libmuser.c1063
-rw-r--r--lib/libmuser_pci.c311
-rw-r--r--lib/msicap.h67
-rw-r--r--lib/muser.h185
-rw-r--r--lib/pci.h276
-rw-r--r--lib/pmcap.h70
-rw-r--r--lib/pxcap.h144
12 files changed, 2798 insertions, 0 deletions
diff --git a/lib/.indent.pro b/lib/.indent.pro
new file mode 100644
index 0000000..52ef8f2
--- /dev/null
+++ b/lib/.indent.pro
@@ -0,0 +1,4 @@
+-nbad -bap -nbc -bbo -hnl -br -brs -c33 -cd33 -ncdb -ce -ci4
+-cli0 -d0 -di1 -nfc1 -i4 -ip0 -l80 -lp -npcs -nprs -psl -sai
+-saf -saw -ncs -nsc -nsob -nfca -cp33 -ss -ts8 -il0
+-nut -blf
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
new file mode 100644
index 0000000..6d3d0ae
--- /dev/null
+++ b/lib/CMakeLists.txt
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2019 Nutanix Inc. All rights reserved.
+#
+# Authors: Thanos Makatos <thanos@nutanix.com>
+# Swapnil Ingle <swapnil.ingle@nutanix.com>
+# Felipe Franciosi <felipe@nutanix.com>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Nutanix nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
# Build the muser shared library.  Headers are listed alongside the .c files
# so generators/IDEs track them as dependencies of the target.
add_library(muser SHARED
            ../kmod/muser.h
            muser.h
            pci.h
            pmcap.h
            msicap.h
            pxcap.h
            common.h
            dma.h
            dma.c
            libmuser.c
            libmuser_pci.c)
# Public headers installed for library consumers (see install() below).
set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h;pmcap.h;msicap.h;pxcap.h")
# Install the shared object and its public headers under <prefix>/include/muser.
install(TARGETS muser
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/muser)
diff --git a/lib/common.h b/lib/common.h
new file mode 100644
index 0000000..4fbc048
--- /dev/null
+++ b/lib/common.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include <stdint.h>
+
+#define PAGE_SIZE sysconf(_SC_PAGE_SIZE)
+#define PAGE_ALIGNED(x) (((x) & ((typeof(x))(PAGE_SIZE) - 1)) == 0)
+
+#define BIT(nr) (1UL << (nr))
+
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+#define likely(e) __builtin_expect(!!(e), 1)
+#define unlikely(e) __builtin_expect(e, 0)
+
+#define ROUND_DOWN(x, a) ((x) & ~((a)-1))
+#define ROUND_UP(x,a) ROUND_DOWN((x)+(a)-1, a)
+
+void lm_log(lm_ctx_t const *const lm_ctx, const lm_log_lvl_t lvl,
+ char const *const fmt, ...);
+
+void dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix,
+ unsigned char const *const buf, uint32_t count);
+
+
+#endif /* __COMMON_H__ */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/dma.c b/lib/dma.c
new file mode 100644
index 0000000..5c9455f
--- /dev/null
+++ b/lib/dma.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Mike Cui <cui@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/param.h>
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <errno.h>
+
+#include "dma.h"
+
/*
 * Return the preferred I/O block size of the file behind fd, or -1 if the
 * descriptor cannot be stat'ed.
 */
static inline ssize_t
fd_get_blocksize(int fd)
{
    struct stat st;

    return fstat(fd, &st) == 0 ? st.st_blksize : -1;
}
+
/*
 * Check whether two file descriptors refer to the same underlying file,
 * i.e. the same device and inode.  Returns false if either fd is invalid.
 */
static inline bool
fds_are_same_file(int fd1, int fd2)
{
    struct stat st1, st2;

    if (fstat(fd1, &st1) != 0 || fstat(fd2, &st2) != 0) {
        return false;
    }
    return st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino;
}
+
+dma_controller_t *
+dma_controller_create(int max_regions)
+{
+ dma_controller_t *dma;
+
+ dma = malloc(offsetof(dma_controller_t, regions) +
+ max_regions * sizeof(dma->regions[0]));
+
+ if (dma == NULL) {
+ return dma;
+ }
+
+ dma->max_regions = max_regions;
+ dma->nregions = 0;
+ memset(dma->regions, 0, max_regions * sizeof(dma->regions[0]));
+
+ return dma;
+}
+
/*
 * Release the resources held by a single region: tear down the eagerly
 * created mapping (fast-map builds only) and close our private dup'ed fd.
 * The region slot itself is recycled by the caller.
 */
static void
_dma_controller_do_remove_region(dma_memory_region_t * const region)
{
    assert(region);
#if DMA_MAP_FAST_IMPL
    /* In fast mode the whole region was mmap'ed at registration time. */
    dma_unmap_region(region, region->virt_addr, region->size);
#endif
    /* close() failure is deliberately ignored; there is no recovery here. */
    (void)close(region->fd);
}
+
+/* FIXME not thread safe */
+int
+dma_controller_remove_region(dma_controller_t * dma, dma_addr_t dma_addr,
+ size_t size, int fd)
+{
+ int idx;
+ dma_memory_region_t *region;
+
+ assert(dma);
+
+ for (idx = 0; idx < dma->nregions; idx++) {
+ region = &dma->regions[idx];
+ if (region->dma_addr == dma_addr && region->size == size &&
+ fds_are_same_file(region->fd, fd)) {
+ _dma_controller_do_remove_region(region);
+ if (dma->nregions > 1)
+ memcpy(region, &dma->regions[dma->nregions - 1],
+ sizeof *region);
+ dma->nregions--;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static inline void
+dma_controller_remove_regions(lm_ctx_t * const ctx,
+ dma_controller_t * const dma)
+{
+ int i;
+
+ assert(dma);
+
+ for (i = 0; i < dma->nregions; i++) {
+ dma_memory_region_t *region = &dma->regions[i];
+
+ lm_log(ctx, LM_INF, "unmap vaddr=%lx IOVA=%lx\n",
+ region->virt_addr, region->dma_addr);
+
+ _dma_controller_do_remove_region(region);
+ }
+}
+
+void
+dma_controller_destroy(lm_ctx_t * const ctx, dma_controller_t * dma)
+{
+ dma_controller_remove_regions(ctx, dma);
+ free(dma);
+}
+
+int
+dma_controller_add_region(lm_ctx_t * const lm_ctx, dma_controller_t * dma,
+ dma_addr_t dma_addr, size_t size,
+ int fd, off_t offset)
+{
+ int idx;
+ dma_memory_region_t *region;
+ int page_size;
+
+ for (idx = 0; idx < dma->nregions; idx++) {
+ region = &dma->regions[idx];
+
+ /* First check if this is the same exact region. */
+ if (region->dma_addr == dma_addr && region->size == size) {
+ if (offset != region->offset) {
+ lm_log(lm_ctx, LM_ERR, "bad offset for new DMA region %lx+%lx, "
+ "want=%d, existing=%d\n",
+ dma_addr, size, offset, region->offset);
+ goto err;
+ }
+ if (!fds_are_same_file(region->fd, fd)) {
+ /*
+ * Printing the file descriptors here doesn't really make
+ * sense as they can be different but actually pointing to
+ * the same file, however in the majority of cases we'll be
+ * using a single fd.
+ */
+ lm_log(lm_ctx, LM_ERR, "bad fd=%d for new DMA region %lx-%lx, "
+ "existing fd=%d\n", fd, region->fd);
+ goto err;
+ }
+ return idx;
+ }
+
+ /* Check for overlap, i.e. start of one region is within another. */
+ if ((dma_addr >= region->dma_addr &&
+ dma_addr < region->dma_addr + region->size) ||
+ (region->dma_addr >= dma_addr &&
+ region->dma_addr < dma_addr + size)) {
+ lm_log(lm_ctx, LM_INF, "new DMA region %lx+%lx overlaps with DMA "
+ "region %lx-%lx\n", dma_addr, size, region->dma_addr,
+ region->size);
+ goto err;
+ }
+ }
+
+ if (dma->nregions == dma->max_regions) {
+ idx = dma->max_regions;
+ lm_log(lm_ctx, LM_ERR, "reached maxed regions\n");
+ goto err;
+ }
+
+ idx = dma->nregions;
+ region = &dma->regions[idx];
+
+ page_size = fd_get_blocksize(fd);
+ if (page_size < 0) {
+ lm_log(lm_ctx, LM_ERR, "bad page size %d\n", page_size);
+ goto err;
+ }
+ page_size = MAX(page_size, getpagesize());
+
+ region->dma_addr = dma_addr;
+ region->size = size;
+ region->page_size = page_size;
+ region->offset = offset;
+
+ region->fd = dup(fd); // dup the fd to get our own private copy
+ if (region->fd < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to duplicate file descriptor: %s\n",
+ strerror(errno));
+ goto err;
+ }
+#if DMA_MAP_FAST_IMPL
+ region->virt_addr = dma_map_region(region, PROT_READ | PROT_WRITE,
+ 0, region->size);
+ if (region->virt_addr == MAP_FAILED) {
+ lm_log(lm_ctx, LM_ERR, "failed to memory map DMA region %lx-%lx: %s\n",
+ dma_addr, dma_addr + size, strerror(errno));
+ close(region->fd);
+ goto err;
+ }
+#endif
+
+ dma->nregions++;
+
+ return idx;
+
+err:
+ return -idx - 1;
+}
+
/*
 * Widen (*offset, *size) to page_size granularity: the offset is rounded
 * down and the end of the span rounded up, so the result fully covers the
 * original range.  page_size must be a power of two.
 */
static inline void
mmap_round(size_t * offset, size_t * size, size_t page_size)
{
    size_t end = *offset + *size;

    *offset = ROUND_DOWN(*offset, page_size);
    *size = ROUND_UP(end, page_size) - *offset;
}
+
+void *
+dma_map_region(dma_memory_region_t * region, int prot,
+ size_t offset, size_t len)
+{
+ size_t mmap_offset, mmap_size = len;
+ char *mmap_base;
+
+ if (offset >= region->size || offset + len > region->size) {
+ return MAP_FAILED;
+ }
+
+ offset += region->offset;
+ mmap_offset = offset;
+ mmap_round(&mmap_offset, &mmap_size, region->page_size);
+
+ // Do the mmap.
+ mmap_base = mmap(NULL, mmap_size, prot, MAP_SHARED,
+ region->fd, mmap_offset);
+ if (mmap_base == MAP_FAILED) {
+ return mmap_base;
+ }
+ // Do not dump.
+ madvise(mmap_base, mmap_size, MADV_DONTDUMP);
+
+ return mmap_base + (offset - mmap_offset);
+}
+
+void
+dma_unmap_region(dma_memory_region_t * region, void *virt_addr, size_t len)
+{
+ mmap_round((size_t *) & virt_addr, &len, region->page_size);
+ munmap(virt_addr, len);
+}
+
/*
 * Split the linear DMA span [dma_addr, dma_addr + len) into per-region
 * scatter-gather entries.  Entries are written only while cnt < max_sg, but
 * counting always continues so the caller can learn the required size.
 *
 * Returns the number of entries needed; -1 if part of the span is not
 * covered by any registered region; (-cnt - 1) if max_sg was too small.
 * (ctx is currently unused here.)
 */
int
_dma_addr_sg_split(lm_ctx_t * const ctx, const dma_controller_t * dma,
                   dma_addr_t dma_addr, uint32_t len,
                   dma_scattergather_t * sg, int max_sg)
{
    int idx;
    int cnt = 0;
    bool found = true;          // Whether the current region is found.

    /* Regions are not sorted, so every time the span crosses a region
     * boundary the whole array is rescanned for the region containing the
     * advanced dma_addr. */
    while (found && len > 0) {
        found = false;
        for (idx = 0; idx < dma->nregions; idx++) {
            const dma_memory_region_t *const region = &dma->regions[idx];
            const dma_addr_t region_end = region->dma_addr + region->size;

            while (dma_addr >= region->dma_addr && dma_addr < region_end) {
                size_t region_len = MIN(region_end - dma_addr, len);

                if (cnt < max_sg) {
                    sg[cnt].region = idx;
                    sg[cnt].offset = dma_addr - region->dma_addr;
                    sg[cnt].length = region_len;
                }

                cnt++;

                // dma_addr found, may need to start from the top for the
                // next dma_addr.
                found = true;
                dma_addr += region_len;
                len -= region_len;

                if (len == 0) {
                    goto out;
                }
            }
        }
    }

out:
    if (!found) {
        // There is still a region which was not found.
        assert(len > 0);
        cnt = -1;
    } else if (cnt > max_sg) {
        cnt = -cnt - 1;
    }
    return cnt;
}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/dma.h b/lib/dma.h
new file mode 100644
index 0000000..80afaec
--- /dev/null
+++ b/lib/dma.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Mike Cui <cui@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef DMA_DMA_H
+#define DMA_DMA_H
+
+/*
+ * This library emulates a DMA controller for a device emulation application to
+ * perform DMA operations on a foreign memory space.
+ *
+ * Concepts:
+ * - A DMA controller has its own 64-bit DMA address space.
+ * - Foreign memory is made available to the DMA controller in linear chunks
+ * called memory regions.
+ * - Each memory region is backed by a file descriptor and
+ * is registered with the DMA controllers at a unique, non-overlapping
+ * linear span of the DMA address space.
+ * - To perform DMA, the application should first build a scatter-gather
+ * list (sglist) of dma_scattergather_t from DMA addresses. Then the sglist
+ * can be mapped using dma_map_sg() into the process's virtual address space
+ * as an iovec for direct access, and unmapped using dma_unmap_sg() when done.
+ * - dma_map_addr() and dma_unmap_addr() helper functions are provided
+ * for mapping DMA regions that can fit into one scatter-gather entry.
+ *
+ * This library can be compiled to function in two modes as defined by the
+ * following macros.
+ * - DMA_MAP_FAST (default): Every region is mapped into the application's
+ * virtual address space at registration time with R/W permissions.
+ * dma_map_sg() ignores all protection bits and only does lookups and
+ * returns pointers to the previously mapped regions. dma_unmap_sg() is
+ * effectively a no-op.
+ * - DMA_MAP_PROTECTED: Every call to dma_map_sg() does mmap()s and
+ * dma_unmap_sg() does munmap()s. All permission bits are honored. This mode
+ * is obviously much slower if used in the fast path. It may be useful to
+ *   have the extra protection if the fast path does not need direct virtual
+ * memory access to foreign memory and data is accessed using a different
+ * method (e.g. RDMA, vfio-iommu). It can also be useful in debugging to
+ * make sure we are not writing to guest memory that's readonly for the
+ * device.
+ */
+
+#ifdef DMA_MAP_PROTECTED
+#undef DMA_MAP_FAST
+#define DMA_MAP_FAST_IMPL 0
+#else
+#define DMA_MAP_FAST_IMPL 1
+#endif
+
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "muser.h"
+#include "common.h"
+
+typedef struct {
+ dma_addr_t dma_addr; // DMA address of this region
+ size_t size; // Size of this region
+ int fd; // File descriptor to mmap
+ int page_size; // Page size of this fd
+ off_t offset; // File offset
+#if DMA_MAP_FAST_IMPL
+ void *virt_addr; // Virtual address of this region
+#endif
+} dma_memory_region_t;
+
+typedef struct {
+ int max_regions;
+ int nregions;
+ dma_memory_region_t regions[0];
+} dma_controller_t;
+
/* Allocate a controller able to hold up to max_regions regions; NULL on
 * failure. */
dma_controller_t *dma_controller_create(int max_regions);
/* Release every region and free the controller itself. */
void dma_controller_destroy(lm_ctx_t * const ctx, dma_controller_t * dma);

/* Registers a new memory region.
 * Returns:
 * - On success, a non-negative region number
 * - On failure, a negative integer (-x - 1) where x is the region number
 *   where this region would have been mapped to if the call could succeed
 *   (e.g. due to conflict with existing region).
 */
int dma_controller_add_region(lm_ctx_t * const ctx, dma_controller_t * dma,
                              dma_addr_t dma_addr, size_t size,
                              int fd, off_t offset);

/* Unregisters the region matching (dma_addr, size, fd) exactly.
 * Returns 0 on success, -ENOENT if no such region is registered. */
int dma_controller_remove_region(dma_controller_t * dma, dma_addr_t dma_addr,
                                 size_t size, int fd);

// Helper for dma_addr_to_sg() slow path.
int _dma_addr_sg_split(lm_ctx_t * const ctx, const dma_controller_t * dma,
                       dma_addr_t dma_addr, uint32_t len,
                       dma_scattergather_t * sg, int max_sg);
+
+/* Takes a linear dma address span and returns a sg list suitable for DMA.
+ * A single linear dma address span may need to be split into multiple
+ * scatter gather regions due to limitations of how memory can be mapped.
+ *
+ * Returns:
+ * - On success, number of scatter gather entries created.
+ * - On failure:
+ * -1 if the dma address span is invalid
+ * (-x - 1) if @max_sg is too small, where x is the number of sg entries
+ * necessary to complete this request.
+ */
+static inline int
+dma_addr_to_sg(lm_ctx_t * const ctx, const dma_controller_t * dma,
+ dma_addr_t dma_addr, uint32_t len,
+ dma_scattergather_t * sg, int max_sg)
+{
+ static __thread int region_hint;
+ int cnt;
+
+ const dma_memory_region_t *const region = &dma->regions[region_hint];
+ const dma_addr_t region_end = region->dma_addr + region->size;
+
+ // Fast path: single region.
+ if (likely(max_sg > 0 && len > 0 &&
+ dma_addr >= region->dma_addr && dma_addr + len <= region_end)) {
+ sg->region = region_hint;
+ sg->offset = dma_addr - region->dma_addr;
+ sg->length = len;
+ return 1;
+ }
+ // Slow path: search through regions.
+ cnt = _dma_addr_sg_split(ctx, dma, dma_addr, len, sg, max_sg);
+ if (likely(cnt > 0)) {
+ region_hint = sg->region;
+ }
+ return cnt;
+}
+
+void *dma_map_region(dma_memory_region_t * region, int prot,
+ size_t offset, size_t len);
+
+void dma_unmap_region(dma_memory_region_t * region,
+ void *virt_addr, size_t len);
+
/*
 * Resolve a scatter-gather list into iovecs for direct access.  In fast-map
 * builds this is a pure lookup into the mappings created at registration
 * time and prot is ignored; in protected builds each entry is mmap'ed here
 * and prot is honored.  Returns 0 on success, -1 if any mmap fails (already
 * mapped entries are not unwound — caller beware).
 */
static inline int
dma_map_sg(dma_controller_t * dma, int prot,
           const dma_scattergather_t * sg, struct iovec *iov, int cnt)
{
    int i;

    for (i = 0; i < cnt; i++) {
        dma_memory_region_t *const region = &dma->regions[sg[i].region];

#if DMA_MAP_FAST_IMPL
        /* Persistent mapping exists; just offset into it. */
        iov[i].iov_base = (char *)region->virt_addr + sg[i].offset;
#else
        iov[i].iov_base = dma_map_region(region, prot,
                                         sg[i].offset, sg[i].length);
        if (iov[i].iov_base == MAP_FAILED) {
            return -1;
        }
#endif
        iov[i].iov_len = sg[i].length;
    }

    return 0;
}
+
+static inline void
+dma_unmap_sg(dma_controller_t * dma,
+ const dma_scattergather_t * sg, struct iovec *iov, int cnt)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ dma_memory_region_t *const region = &dma->regions[sg[i].region];
+ if (!DMA_MAP_FAST_IMPL) {
+ dma_unmap_region(region, iov[i].iov_base, iov[i].iov_len);
+ }
+ }
+}
+
+static inline void *
+dma_map_addr(lm_ctx_t * const ctx, dma_controller_t * dma, int prot,
+ dma_addr_t dma_addr, uint32_t len)
+{
+ dma_scattergather_t sg;
+ struct iovec iov;
+
+ if (dma_addr_to_sg(ctx, dma, dma_addr, len, &sg, 1) == 1 &&
+ dma_map_sg(dma, prot, &sg, &iov, 1) == 0) {
+ return iov.iov_base;
+ }
+
+ return NULL;
+}
+
+static inline void
+dma_unmap_addr(lm_ctx_t * const ctx, dma_controller_t * dma,
+ dma_addr_t dma_addr, uint32_t len, void *addr)
+{
+ dma_scattergather_t sg;
+ struct iovec iov = {
+ .iov_base = addr,
+ .iov_len = len,
+ };
+ int r;
+
+ r = dma_addr_to_sg(ctx, dma, dma_addr, len, &sg, 1);
+ assert(r == 1);
+
+ dma_unmap_sg(dma, &sg, &iov, 1);
+}
+
+#endif /* DMA_DMA_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/libmuser.c b/lib/libmuser.c
new file mode 100644
index 0000000..ba016fe
--- /dev/null
+++ b/lib/libmuser.c
@@ -0,0 +1,1063 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <sys/mman.h>
+#include <stdarg.h>
+
+#include "../kmod/muser.h"
+#include "muser.h"
+#include "dma.h"
+
+typedef enum {
+ IRQ_NONE = 0,
+ IRQ_INTX,
+ IRQ_MSI,
+ IRQ_MSIX,
+} irq_type_t;
+
+typedef struct {
+ irq_type_t type; /* irq type this device is using */
+ int err_efd; /* eventfd for irq err */
+ int req_efd; /* eventfd for irq req */
+ uint32_t max_ivs; /* maximum number of ivs supported */
+ int efds[0]; /* XXX must be last */
+} lm_irqs_t;
+
+/*
+ * Macro that ensures that a particular struct member is last. Doesn't work for
+ * flexible array members.
+ */
+#define MUST_BE_LAST(s, m, t) \
+ _Static_assert(sizeof(s) - offsetof(s, m) == sizeof(t), \
+ #t " " #m " must be last member in " #s)
+
+struct lm_ctx {
+ void *pvt;
+ dma_controller_t *dma;
+ int fd;
+ bool extended;
+ lm_fops_t fops;
+ lm_log_lvl_t log_lvl;
+ lm_log_fn_t *log;
+ lm_pci_info_t pci_info;
+ lm_pci_config_space_t *pci_config_space;
+ lm_irqs_t irqs; /* XXX must be last */
+};
+MUST_BE_LAST(struct lm_ctx, irqs, lm_irqs_t);
+
+#define LM_CTX_SIZE(irqs) (sizeof(lm_ctx_t) + sizeof(int) * irqs)
+#define LM2VFIO_IRQT(type) (type - 1)
+
+void lm_log(const lm_ctx_t * const ctx, const lm_log_lvl_t lvl,
+ const char *const fmt, ...)
+{
+ va_list ap;
+ char buf[BUFSIZ];
+
+ assert(ctx);
+
+ if (!ctx->log || lvl > ctx->log_lvl || !fmt) {
+ return;
+ }
+
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof buf, fmt, ap);
+ va_end(ap);
+ ctx->log(ctx->pvt, buf);
+}
+
+static long irqs_disable(lm_ctx_t * lm_ctx, uint32_t index)
+{
+ int *irq_efd = NULL;
+ uint32_t i;
+
+ assert(lm_ctx != NULL);
+ assert(index < LM_DEV_NUM_IRQS);
+
+ switch (index) {
+ case VFIO_PCI_INTX_IRQ_INDEX:
+ case VFIO_PCI_MSI_IRQ_INDEX:
+ case VFIO_PCI_MSIX_IRQ_INDEX:
+ lm_ctx->irqs.type = IRQ_NONE;
+ for (i = 0; i < lm_ctx->irqs.max_ivs; i++) {
+ if (lm_ctx->irqs.efds[i] >= 0) {
+ (void) close(lm_ctx->irqs.efds[i]);
+ lm_ctx->irqs.efds[i] = -1;
+ }
+ }
+ return 0;
+ case VFIO_PCI_ERR_IRQ_INDEX:
+ irq_efd = &lm_ctx->irqs.err_efd;
+ break;
+ case VFIO_PCI_REQ_IRQ_INDEX:
+ irq_efd = &lm_ctx->irqs.req_efd;
+ break;
+ }
+
+ if (irq_efd != NULL) {
+ (void)close(*irq_efd);
+ *irq_efd = -1;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static int irqs_set_data_none(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
+{
+ int efd, i;
+ long ret;
+ eventfd_t val;
+
+ for (i = irq_set->start; i < irq_set->start + irq_set->count; i++) {
+ efd = lm_ctx->irqs.efds[i];
+ if (efd >= 0) {
+ val = 1;
+ ret = eventfd_write(efd, val);
+ if (ret == -1) {
+ return -errno;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+irqs_set_data_bool(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ uint8_t *d8;
+ int efd, i;
+ long ret;
+ eventfd_t val;
+
+ assert(data != NULL);
+ for (i = irq_set->start, d8 = data; i < irq_set->start + irq_set->count;
+ i++, d8++) {
+ efd = lm_ctx->irqs.efds[i];
+ if (efd >= 0 && *d8 == 1) {
+ val = 1;
+ ret = eventfd_write(efd, val);
+ if (ret == -1) {
+ return -errno;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+irqs_set_data_eventfd(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ int32_t *d32;
+ int efd, i;
+
+ assert(data != NULL);
+ for (i = irq_set->start, d32 = data; i < irq_set->start + irq_set->count;
+ i++, d32++) {
+ efd = lm_ctx->irqs.efds[i];
+ if (efd >= 0) {
+ (void) close(efd);
+ lm_ctx->irqs.efds[i] = -1;
+ }
+ if (*d32 >= 0) {
+ lm_ctx->irqs.efds[i] = *d32;
+ }
+ }
+
+ return 0;
+}
+
+static long
+irqs_trigger(lm_ctx_t * lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ int err = 0;
+
+ assert(lm_ctx != NULL);
+ assert(irq_set != NULL);
+
+ if (irq_set->count == 0) {
+ return irqs_disable(lm_ctx, irq_set->index);
+ }
+
+ switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
+ case VFIO_IRQ_SET_DATA_NONE:
+ err = irqs_set_data_none(lm_ctx, irq_set);
+ break;
+ case VFIO_IRQ_SET_DATA_BOOL:
+ err = irqs_set_data_bool(lm_ctx, irq_set, data);
+ break;
+ case VFIO_IRQ_SET_DATA_EVENTFD:
+ err = irqs_set_data_eventfd(lm_ctx, irq_set, data);
+ break;
+ }
+
+ return err;
+}
+
/*
 * Sanity-check a VFIO_DEVICE_SET_IRQS request before it is acted upon.
 * Returns 0 when irq_set is coherent, -EINVAL otherwise.
 */
static long
dev_set_irqs_validate(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
{
    lm_pci_info_t *pci_info = &lm_ctx->pci_info;
    uint32_t a_type, d_type;

    assert(lm_ctx != NULL);
    assert(irq_set != NULL);

    // Separate action and data types from flags.
    a_type = (irq_set->flags & VFIO_IRQ_SET_ACTION_TYPE_MASK);
    d_type = (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK);

    // Ensure index is within bounds.
    if (irq_set->index >= LM_DEV_NUM_IRQS) {
        return -EINVAL;
    }

    /* TODO make each condition a function */

    // Only one of MASK/UNMASK/TRIGGER is valid.
    if ((a_type != VFIO_IRQ_SET_ACTION_MASK) &&
        (a_type != VFIO_IRQ_SET_ACTION_UNMASK) &&
        (a_type != VFIO_IRQ_SET_ACTION_TRIGGER)) {
        return -EINVAL;
    }
    // Only one of NONE/BOOL/EVENTFD is valid.
    if ((d_type != VFIO_IRQ_SET_DATA_NONE) &&
        (d_type != VFIO_IRQ_SET_DATA_BOOL) &&
        (d_type != VFIO_IRQ_SET_DATA_EVENTFD)) {
        return -EINVAL;
    }
    // Ensure irq_set's start and count are within bounds.
    // NOTE(review): when irq_count[index] == 0, even a count == 0 disable
    // request is rejected by the start bound — confirm this is intended.
    if ((irq_set->start >= pci_info->irq_count[irq_set->index]) ||
        (irq_set->start + irq_set->count > pci_info->irq_count[irq_set->index])) {
        return -EINVAL;
    }
    // Only TRIGGER is valid for ERR/REQ.
    if (((irq_set->index == VFIO_PCI_ERR_IRQ_INDEX) ||
         (irq_set->index == VFIO_PCI_REQ_IRQ_INDEX)) &&
        (a_type != VFIO_IRQ_SET_ACTION_TRIGGER)) {
        return -EINVAL;
    }
    // count == 0 is only valid with ACTION_TRIGGER and DATA_NONE.
    if ((irq_set->count == 0) && ((a_type != VFIO_IRQ_SET_ACTION_TRIGGER) ||
                                  (d_type != VFIO_IRQ_SET_DATA_NONE))) {
        return -EINVAL;
    }
    // If IRQs are set, ensure index matches what's enabled for the device.
    if ((irq_set->count != 0) && (lm_ctx->irqs.type != IRQ_NONE) &&
        (irq_set->index != LM2VFIO_IRQT(lm_ctx->irqs.type))) {
        return -EINVAL;
    }

    return 0;
}
+
+static long
+dev_set_irqs(lm_ctx_t * lm_ctx, struct vfio_irq_set *irq_set, void *data)
+{
+ long ret;
+
+ assert(lm_ctx != NULL);
+ assert(irq_set != NULL);
+
+ // Ensure irq_set is valid.
+ ret = dev_set_irqs_validate(lm_ctx, irq_set);
+ if (ret != 0) {
+ return ret;
+ }
+
+ switch (irq_set->flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_MASK: // fallthrough
+ case VFIO_IRQ_SET_ACTION_UNMASK:
+ // We're always edge-triggered without un/mask support.
+ return 0;
+ }
+
+ return irqs_trigger(lm_ctx, irq_set, data);
+}
+
+/*
+ * Handles VFIO_DEVICE_GET_IRQ_INFO: reports the vector count for the
+ * requested IRQ index; all IRQs are advertised as eventfd-capable.
+ * Returns 0 on success, -EINVAL on bad argsz or out-of-range index.
+ */
+static long dev_get_irqinfo(lm_ctx_t * lm_ctx, struct vfio_irq_info *irq_info)
+{
+    assert(lm_ctx != NULL);
+    assert(irq_info != NULL);
+    lm_pci_info_t *pci_info = &lm_ctx->pci_info;
+
+    // Ensure provided argsz is sufficiently big and index is within bounds.
+    if ((irq_info->argsz < sizeof(struct vfio_irq_info)) ||
+        (irq_info->index >= LM_DEV_NUM_IRQS)) {
+        return -EINVAL;
+    }
+
+    irq_info->count = pci_info->irq_count[irq_info->index];
+    irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
+
+    return 0;
+}
+
+/*
+ * Handles VFIO_DEVICE_GET_REGION_INFO: fills in the offset, flags and size
+ * of the requested region from the context's region table.
+ * Returns 0 on success, -EINVAL on bad argsz or out-of-range index.
+ */
+static long
+dev_get_reginfo(lm_ctx_t * lm_ctx, struct vfio_region_info *reg_info)
+{
+    assert(lm_ctx != NULL);
+    assert(reg_info != NULL);
+    lm_pci_info_t *pci_info = &lm_ctx->pci_info;
+
+    // Ensure provided argsz is sufficiently big and index is within bounds.
+    if ((reg_info->argsz < sizeof(struct vfio_region_info)) ||
+        (reg_info->index >= LM_DEV_NUM_REGS)) {
+        return -EINVAL;
+    }
+
+    reg_info->offset = pci_info->reg_info[reg_info->index].offset;
+    reg_info->flags = pci_info->reg_info[reg_info->index].flags;
+    reg_info->size = pci_info->reg_info[reg_info->index].size;
+
+    lm_log(lm_ctx, LM_DBG, "region_info[%d]\n", reg_info->index);
+    dump_buffer(lm_ctx, "", (unsigned char *)reg_info, sizeof *reg_info);
+
+    return 0;
+}
+
+/*
+ * Handles VFIO_DEVICE_GET_INFO: advertises a resettable PCI device with the
+ * fixed region and IRQ counts the library supports.
+ * Returns 0 on success, -EINVAL if argsz is too small.
+ */
+static long dev_get_info(struct vfio_device_info *dev_info)
+{
+    assert(dev_info != NULL);
+
+    // Ensure provided argsz is sufficiently big.
+    if (dev_info->argsz < sizeof(struct vfio_device_info)) {
+        return -EINVAL;
+    }
+
+    dev_info->flags = VFIO_DEVICE_FLAGS_PCI | VFIO_DEVICE_FLAGS_RESET;
+    dev_info->num_regions = LM_DEV_NUM_REGS;
+    dev_info->num_irqs = LM_DEV_NUM_IRQS;
+
+    return 0;
+}
+
+/*
+ * Dispatches a VFIO ioctl forwarded by muser.ko to the appropriate handler.
+ * @data carries any trailing payload (e.g. eventfds for SET_IRQS).
+ * Unknown commands, and RESET without a registered callback, yield -ENOTSUP.
+ */
+static long
+do_muser_ioctl(lm_ctx_t * lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data)
+{
+    int err = -ENOTSUP;
+
+    assert(lm_ctx != NULL);
+    switch (cmd_ioctl->vfio_cmd) {
+    case VFIO_DEVICE_GET_INFO:
+        err = dev_get_info(&cmd_ioctl->data.dev_info);
+        break;
+    case VFIO_DEVICE_GET_REGION_INFO:
+        err = dev_get_reginfo(lm_ctx, &cmd_ioctl->data.reg_info);
+        break;
+    case VFIO_DEVICE_GET_IRQ_INFO:
+        err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info);
+        break;
+    case VFIO_DEVICE_SET_IRQS:
+        err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data);
+        break;
+    case VFIO_DEVICE_RESET:
+        // Reset is optional; fall through to -ENOTSUP if not provided.
+        if (lm_ctx->fops.reset) {
+            return lm_ctx->fops.reset(lm_ctx->pvt);
+        }
+    }
+
+    return err;
+}
+
+/*
+ * Handles a DMA-unmap request from the kernel by removing the region
+ * [start, end) from the DMA controller. The outcome is also reported back
+ * through cmd->mmap.response.addr (0 on success, non-zero on failure).
+ */
+static int muser_dma_unmap(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+    int err;
+
+    lm_log(lm_ctx, LM_INF, "removing DMA region %lx-%lx\n",
+           cmd->mmap.request.start, cmd->mmap.request.end);
+
+    if (lm_ctx->dma == NULL) {
+        lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
+        cmd->mmap.response.addr = -1;
+        return -1;
+    }
+
+    err = dma_controller_remove_region(lm_ctx->dma,
+                                       cmd->mmap.request.start,
+                                       cmd->mmap.request.end -
+                                       cmd->mmap.request.start, lm_ctx->fd);
+    if (err != 0) {
+        lm_log(lm_ctx, LM_ERR, "failed to remove DMA region %lx-%lx: %s\n",
+               cmd->mmap.request.start, cmd->mmap.request.end, strerror(err));
+    }
+
+    // NOTE(review): response.addr doubles as a return code here.
+    cmd->mmap.response.addr = err;
+
+    return err;
+}
+
+/*
+ * Handles a DMA-map request from the kernel by registering the region
+ * [start, end) with the DMA controller. cmd->mmap.response.addr is set to
+ * -1 on failure and 0 on success.
+ */
+static int muser_dma_map(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+    int err;
+
+    lm_log(lm_ctx, LM_INF, "adding DMA region %lx-%lx\n",
+           cmd->mmap.request.start, cmd->mmap.request.end);
+
+    if (lm_ctx->dma == NULL) {
+        lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
+        cmd->mmap.response.addr = -1;
+        return -1;
+    }
+
+    // Reject empty or inverted ranges.
+    if (cmd->mmap.request.start >= cmd->mmap.request.end) {
+        lm_log(lm_ctx, LM_ERR, "bad DMA region %lx-%lx\n",
+               cmd->mmap.request.start, cmd->mmap.request.end);
+        cmd->mmap.response.addr = -1;
+        return -1;
+    }
+    err = dma_controller_add_region(lm_ctx, lm_ctx->dma,
+                                    cmd->mmap.request.start,
+                                    cmd->mmap.request.end -
+                                    cmd->mmap.request.start, lm_ctx->fd, 0);
+    if (err < 0) {
+        lm_log(lm_ctx, LM_ERR, "failed to add DMA region %lx-%lx: %d\n",
+               cmd->mmap.request.start, cmd->mmap.request.end, err);
+        cmd->mmap.response.addr = -1;
+        return -1;
+    }
+
+    // TODO: Are we just abusing response.addr as a rc?
+    cmd->mmap.response.addr = 0;
+
+    return 0;
+}
+
+/*
+ * Handles an mmap request from the kernel by asking the device-provided
+ * fops.mmap callback to map the given page offset. The result (or MAP_FAILED)
+ * is reported back through cmd->mmap.response.addr; cmd->err is set to -1 on
+ * failure.
+ */
+static int muser_mmap(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+    unsigned long addr;
+    unsigned long pgoff = cmd->mmap.request.pgoff;
+
+    /* TODO validate fops.mmap != NULL at registration time */
+    addr = lm_ctx->fops.mmap(lm_ctx->pvt, pgoff);
+    cmd->mmap.response.addr = addr;
+
+    if ((void *)addr == MAP_FAILED) {
+        cmd->err = -1;
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Completes a read command: replies to the kernel by writing the command
+ * back to the muser fd with .buf pointing at the freshly read data.
+ *
+ * @offset is currently unused; @ret is the byte count the device callback
+ * actually produced. Returns the number of bytes written to the fd, or the
+ * short/failed count on error.
+ */
+static int
+post_read(lm_ctx_t * const lm_ctx, struct muser_cmd *const cmd,
+          char *const data, const size_t offset, ssize_t ret)
+{
+    if (ret != cmd->rw.count) {
+        /* FIXME shouldn't we still reply to the kernel in case of error? */
+        // %zd matches ssize_t; the old %d was a format/type mismatch.
+        lm_log(lm_ctx, LM_ERR, "%s: bad fops read: %zd/%d, %s\n",
+               __func__, ret, cmd->rw.count, strerror(errno));
+        return ret;
+    }
+
+    /*
+     * TODO the kernel will first copy the command and then will use the .buf
+     * pointer to copy the data. Does it make sense to use writev in order to
+     * get rid of the .buf member? THe 1st element of the iovec will be the
+     * command and the 2nd the data.
+     */
+    cmd->rw.buf = data;
+    ret = write(lm_ctx->fd, cmd, sizeof(*cmd));
+    if ((int)ret != sizeof(*cmd)) {
+        // %zd for ssize_t, %zu for sizeof (size_t); %d was undefined here.
+        lm_log(lm_ctx, LM_ERR, "%s: bad muser write: %zd/%zu, %s\n",
+               __func__, ret, sizeof(*cmd), strerror(errno));
+    }
+    return ret;
+}
+
+/*
+ * Maps an absolute file position to the region containing it.
+ * On success returns the region index and stores the region-relative offset
+ * in *off; returns -ENOENT if [pos, pos+count) fits in no region.
+ */
+int
+lm_get_region(lm_ctx_t * const lm_ctx, const loff_t pos, const size_t count,
+              loff_t * const off)
+{
+    assert(lm_ctx);
+    assert(off);
+    lm_pci_info_t *pci_info = &lm_ctx->pci_info;
+
+    int i;
+
+    for (i = 0; i < LM_DEV_NUM_REGS; i++) {
+        const lm_reg_info_t * const reg_info = &pci_info->reg_info[i];
+        // The whole [pos, pos+count) span must lie inside the region.
+        if (pos >= reg_info->offset) {
+            if (pos - reg_info->offset + count <= reg_info->size) {
+                *off = pos - reg_info->offset;
+                return i;
+            }
+        }
+    }
+    return -ENOENT;
+}
+
+/*
+ * Performs a single aligned access of @count bytes at absolute position
+ * @pos, dispatching to the per-region callback (BAR/ROM/config/VGA) when
+ * one is registered, otherwise falling back to the generic fops read/write.
+ * Returns the byte count on success or a negative errno.
+ */
+static ssize_t
+do_access(lm_ctx_t * const lm_ctx, char * const buf, size_t count, loff_t pos,
+          const bool is_write)
+{
+    int idx;
+    loff_t offset;
+    int ret = -EINVAL;
+    lm_pci_info_t *pci_info;
+
+    assert(lm_ctx != NULL);
+    assert(buf != NULL);
+    assert(count > 0);
+
+    pci_info = &lm_ctx->pci_info;
+    idx = lm_get_region(lm_ctx, pos, count, &offset);
+    if (idx < 0) {
+        lm_log(lm_ctx, LM_ERR, "invalid region %d\n", idx);
+        return idx;
+    }
+
+    /*
+     * TODO we should check at device registration time that all necessary
+     * callbacks are there in order to avoid having to check at runtime
+     */
+    switch (idx) {
+    case LM_DEV_BAR0_REG_IDX ... LM_DEV_BAR5_REG_IDX:
+        if (pci_info->bar_fn)
+            return pci_info->bar_fn(lm_ctx->pvt, idx, buf, count, offset, is_write);
+        break;
+    case LM_DEV_ROM_REG_IDX:
+        if (pci_info->rom_fn)
+            return pci_info->rom_fn(lm_ctx->pvt, buf, count, offset, is_write);
+        break;
+    case LM_DEV_CFG_REG_IDX:
+        if (pci_info->pci_config_fn)
+            return pci_info->pci_config_fn(lm_ctx->pvt, buf, count, offset,
+                                           is_write);
+        break;
+    case LM_DEV_VGA_REG_IDX:
+        if (pci_info->vga_fn)
+            return pci_info->vga_fn(lm_ctx->pvt, buf, count, offset, is_write);
+        break;
+    default:
+        lm_log(lm_ctx, LM_ERR, "bad region %d\n", idx);
+        return ret;
+    }
+
+    // No region-specific callback registered: use the generic fops.
+    // Note the fops receive the absolute position, not the region offset.
+    if (is_write && lm_ctx->fops.write) {
+        ret = lm_ctx->fops.write(lm_ctx->pvt, idx, buf, count, pos);
+    } else if (lm_ctx->fops.read) {
+        ret = lm_ctx->fops.read(lm_ctx->pvt, idx, buf, count, pos);
+    } else {
+        lm_log(lm_ctx, LM_ERR, "no R/W callback, region %d, %x@%lx\n",
+               idx, count, pos);
+    }
+
+    return ret;
+}
+
+/*
+ * TODO function name same lm_access_t, fix
+ */
+/*
+ * TODO function name same lm_access_t, fix
+ */
+/*
+ * Splits an arbitrary access into the largest naturally-aligned chunks
+ * (8/4/2/1 bytes) and performs them via do_access(), advancing *ppos and
+ * buf as it goes. Returns the total byte count, or -EFAULT on the first
+ * failed chunk.
+ */
+ssize_t
+lm_access(lm_ctx_t * const lm_ctx, char *buf, size_t count,
+          loff_t * const ppos, const bool is_write)
+{
+    unsigned int done = 0;
+    int ret;
+
+    assert(lm_ctx != NULL);
+    /* buf and ppos can be NULL if count is 0 */
+
+    while (count) {
+        size_t size;
+        // Pick the widest access that is both available and aligned.
+        if (count >= 8 && !(*ppos % 8)) {
+            size = 8;
+        } else if (count >= 4 && !(*ppos % 4)) {
+            size = 4;
+        } else if (count >= 2 && !(*ppos % 2)) {
+            size = 2;
+        } else {
+            size = 1;
+        }
+        ret = do_access(lm_ctx, buf, size, *ppos, is_write);
+        if (ret <= 0) {
+            // Log as count@pos (matching the file's convention) with
+            // specifiers that fit size_t and loff_t; the original passed
+            // pos where the size belonged and vice versa.
+            lm_log(lm_ctx, LM_ERR, "failed to %s %zx@%llx: %s\n",
+                   is_write ? "write" : "read", size, (long long)*ppos,
+                   strerror(-ret));
+            return -EFAULT;
+        }
+        count -= size;
+        done += size;
+        *ppos += size;
+        buf += size;
+    }
+    return done;
+}
+
+
+/*
+ * Services a read or write command from the kernel: stages the data in a
+ * temporary buffer, lets the PCI-header emulation consume the part of the
+ * access that falls in the standard header, forwards the remainder to
+ * lm_access(), and (for reads) replies to the kernel via post_read().
+ */
+static inline int
+muser_access(lm_ctx_t * const lm_ctx, struct muser_cmd *const cmd,
+             const bool is_write)
+{
+    char *data;
+    int err;
+    size_t count = 0;
+    ssize_t ret;
+
+    /* TODO how big do we expect count to be? Can we use alloca(3) instead? */
+    data = calloc(1, cmd->rw.count);
+    if (data == NULL) {
+        lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n");
+        return -1;
+    }
+
+    lm_log(lm_ctx, LM_DBG, "%s %x@%lx\n", is_write ? "W" : "R", cmd->rw.count,
+           cmd->rw.pos);
+
+    /* copy data to be written from kernel to user space */
+    if (is_write) {
+        err = read(lm_ctx->fd, data, cmd->rw.count);
+        /*
+         * FIXME this is wrong, we should be checking for
+         * err != cmd->rw.count
+         */
+        if (err < 0) {
+            lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s\n",
+                   strerror(errno));
+            goto out;
+        }
+        err = 0;
+        dump_buffer(lm_ctx, "buffer write", data, cmd->rw.count);
+    }
+
+    // The header access updates rw.count/rw.pos to describe what is left.
+    count = cmd->rw.count;
+    cmd->err = muser_pci_hdr_access(lm_ctx, &cmd->rw.count, &cmd->rw.pos,
+                                    is_write, data);
+    if (cmd->err) {
+        lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %d\n", cmd->err);
+    }
+    count -= cmd->rw.count;    // bytes consumed by the header emulation
+    ret = lm_access(lm_ctx, data + count, cmd->rw.count, &cmd->rw.pos,
+                    is_write);
+    if (!is_write) {
+        err = post_read(lm_ctx, cmd, data, count, ret);
+        dump_buffer(lm_ctx, "buffer read", data, cmd->rw.count);
+    }
+
+out:
+    free(data);
+
+    return err;
+}
+
+/*
+ * Services a forwarded ioctl: for SET_IRQS, first reads the trailing
+ * payload (eventfds or bools) from the muser fd, then dispatches to
+ * do_muser_ioctl(). Returns the handler's result, or -1/negative on
+ * allocation or read failure.
+ */
+static int
+muser_ioctl(lm_ctx_t * lm_ctx, struct muser_cmd *cmd)
+{
+    void *data = NULL;
+    size_t size = 0;
+    int ret;
+
+    /* TODO make this a function that returns the size */
+    if (cmd->ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS) {
+        uint32_t flags = cmd->ioctl.data.irq_set.flags;
+        switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
+        case VFIO_IRQ_SET_DATA_EVENTFD:
+            // One eventfd (int32_t) per vector.
+            size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count;
+            break;
+        case VFIO_IRQ_SET_DATA_BOOL:
+            // One bool (uint8_t) per vector.
+            size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count;
+            break;
+        }
+    }
+
+    if (size != 0) {
+        data = calloc(1, size);
+        if (data == NULL) {
+#ifdef DEBUG
+            perror("calloc");
+#endif
+            return -1;
+        }
+
+        ret = read(lm_ctx->fd, data, size);
+        if (ret < 0) {
+#ifdef DEBUG
+            perror("read failed");
+#endif
+            goto out;
+        }
+    }
+
+    ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, data);
+
+out:
+
+    free(data);
+    return ret;
+}
+
+/*
+ * Main command loop: blocks in MUSER_DEV_CMD_WAIT for the next command from
+ * muser.ko, dispatches it, and reports completion via MUSER_DEV_CMD_DONE.
+ * Only returns if the wait ioctl itself fails.
+ */
+static int drive_loop(lm_ctx_t *lm_ctx)
+{
+    struct muser_cmd cmd = { 0 };
+    int err;
+
+    do {
+        err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd);
+        if (err < 0) {
+            return err;
+        }
+
+        switch (cmd.type) {
+        case MUSER_IOCTL:
+            err = muser_ioctl(lm_ctx, &cmd);
+            break;
+        case MUSER_READ:
+        case MUSER_WRITE:
+            err = muser_access(lm_ctx, &cmd, cmd.type == MUSER_WRITE);
+            break;
+        case MUSER_MMAP:
+            err = muser_mmap(lm_ctx, &cmd);
+            break;
+        case MUSER_DMA_MMAP:
+            err = muser_dma_map(lm_ctx, &cmd);
+            break;
+        case MUSER_DMA_MUNMAP:
+            err = muser_dma_unmap(lm_ctx, &cmd);
+            break;
+        default:
+            // Unknown commands are dropped without completion.
+            lm_log(lm_ctx, LM_ERR, "bad command %d\n", cmd.type);
+            continue;
+        }
+        cmd.err = err;
+        err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_DONE, &cmd);
+        if (err < 0) {
+            lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n",
+                   strerror(errno));
+        }
+        // TODO: Figure out a clean way to get out of the loop.
+    } while (1);
+
+    return err;
+}
+
+/*
+ * Public entry point for the command loop: validates the context and hands
+ * control to drive_loop(). Returns -1 with errno = EINVAL on a NULL context.
+ */
+int
+lm_ctx_drive(lm_ctx_t * lm_ctx)
+{
+    int ret = -1;
+
+    if (lm_ctx != NULL) {
+        ret = drive_loop(lm_ctx);
+    } else {
+        errno = EINVAL;
+    }
+
+    return ret;
+}
+
+/* Closes the muser device node file descriptor. */
+static int
+dev_detach(int dev_fd)
+{
+    return close(dev_fd);
+}
+
+/*
+ * Opens the muser device node /dev/MUSER_DEVNODE/<uuid>.
+ * Returns the file descriptor, or -1 on failure.
+ */
+static int
+dev_attach(const char *uuid)
+{
+    char *path;
+    int dev_fd;
+    int err;
+
+    err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", uuid);
+    // The 6 accounts for "/dev/" plus the '/' separator; any other length
+    // (including asprintf's -1 failure return) is treated as an error.
+    if (err != (int)(strlen(MUSER_DEVNODE) + strlen(uuid) + 6)) {
+        return -1;
+    }
+
+    dev_fd = open(path, O_RDWR);
+
+    free(path);
+
+    return dev_fd;
+}
+
+/*
+ * Maps device memory into the caller's address space via the muser fd.
+ * @offset must be page-aligned; bit 63 is set to mark the offset as a
+ * libmuser mapping request (NOTE(review): presumably consumed by muser.ko —
+ * confirm against the kernel module).
+ * Returns the mapping, or MAP_FAILED with errno set.
+ */
+void *
+lm_mmap(lm_ctx_t * lm_ctx, size_t length, off_t offset)
+{
+    off_t lm_off;
+
+    if ((lm_ctx == NULL) || (length == 0) || !PAGE_ALIGNED(offset)) {
+        errno = EINVAL;
+        return MAP_FAILED;
+    }
+
+    lm_off = offset | BIT(63);
+    return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED,
+                lm_ctx->fd, lm_off);
+}
+
+/*
+ * Triggers interrupt @vector by writing to its eventfd.
+ * Returns -1 with errno = EINVAL for a bad context/vector, or ENOENT if no
+ * eventfd has been registered for the vector.
+ */
+int
+lm_irq_trigger(lm_ctx_t * lm_ctx, uint32_t vector)
+{
+    eventfd_t val = 1;
+
+    if ((lm_ctx == NULL) || (vector >= lm_ctx->irqs.max_ivs)) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    if (lm_ctx->irqs.efds[vector] == -1) {
+        errno = ENOENT;
+        return -1;
+    }
+
+    return eventfd_write(lm_ctx->irqs.efds[vector], val);
+}
+
+/*
+ * Tears down a libmuser context: frees the PCI config space, detaches from
+ * the device node, destroys the DMA controller (if any) and frees the
+ * context itself. Safe to call with NULL.
+ */
+void
+lm_ctx_destroy(lm_ctx_t * lm_ctx)
+{
+    if (lm_ctx == NULL) {
+        return;
+    }
+
+    free(lm_ctx->pci_config_space);
+    dev_detach(lm_ctx->fd);
+    if (lm_ctx->dma != NULL) {
+        dma_controller_destroy(lm_ctx, lm_ctx->dma);
+    }
+    free(lm_ctx);
+    // FIXME: Maybe close any open irq efds? Unmap stuff?
+}
+
+/*
+ * Seeds a PCI header with the given vendor/device IDs and class code.
+ * The subsystem vendor/device IDs default to the primary IDs.
+ */
+static void
+init_pci_hdr(lm_pci_hdr_t * const hdr, const lm_pci_hdr_id_t * const id,
+             const lm_pci_hdr_cc_t * const cc)
+{
+    assert(hdr != NULL);
+    assert(id != NULL);
+    assert(cc != NULL);
+
+    hdr->cc = *cc;
+    hdr->id = *id;
+
+    /* Mirror the primary IDs into the subsystem IDs. */
+    hdr->ss.vid = id->vid;
+    hdr->ss.sid = id->did;
+}
+
+/*
+ * Allocates and initializes a libmuser context from @dev_info: attaches to
+ * the muser device node, optionally creates the DMA controller, allocates
+ * the PCI configuration space, and initializes the PCI header and IRQ
+ * bookkeeping. Returns the new context, or NULL with errno set on failure.
+ */
+lm_ctx_t *
+lm_ctx_create(lm_dev_info_t * const dev_info)
+{
+    lm_ctx_t *lm_ctx;
+    uint32_t max_ivs = 0;
+    uint32_t i;
+    int err = 0;
+    size_t size;
+
+    if (dev_info == NULL) {
+        err = EINVAL;
+        goto out;
+    }
+
+    // Size the context for the largest vector count of any IRQ index.
+    for (i = 0; i < LM_DEV_NUM_IRQS; i++) {
+        if (max_ivs < dev_info->pci_info.irq_count[i]) {
+            max_ivs = dev_info->pci_info.irq_count[i];
+        }
+    }
+
+    lm_ctx = calloc(1, LM_CTX_SIZE(max_ivs));
+    if (lm_ctx == NULL) {
+        err = errno;
+        goto out;
+    }
+
+    memcpy(&lm_ctx->pci_info, &dev_info->pci_info, sizeof(lm_pci_info_t));
+
+    lm_ctx->fd = dev_attach(dev_info->uuid);
+    if (lm_ctx->fd == -1) {
+        err = errno;
+        goto out;
+    }
+
+    if (dev_info->nr_dma_regions > 0) {
+        lm_ctx->dma = dma_controller_create(dev_info->nr_dma_regions);
+        if (lm_ctx->dma == NULL) {
+            err = errno;
+            goto out;
+        }
+    }
+
+    // ERR and REQ IRQs always expose exactly one vector each.
+    lm_ctx->pci_info.irq_count[LM_DEV_ERR_IRQ_IDX] = 1;
+    lm_ctx->pci_info.irq_count[LM_DEV_REQ_IRQ_IDX] = 1;
+
+    lm_ctx->extended = dev_info->extended;
+    if (lm_ctx->extended) {
+        size = PCI_EXTENDED_CONFIG_SPACE_SIZEOF;
+    } else {
+        size = PCI_CONFIG_SPACE_SIZEOF;
+    }
+    // Allocate the size the device asked for; previously the computed size
+    // was ignored and the extended size was always allocated.
+    lm_ctx->pci_config_space = calloc(size, 1);
+    if (!lm_ctx->pci_config_space) {
+        err = errno;
+        goto out;
+    }
+
+    init_pci_hdr(&lm_ctx->pci_config_space->hdr, &dev_info->id, &dev_info->cc);
+    // BARs without LM_REG_FLAG_MEM are I/O BARs: set the I/O space bit.
+    for (i = 0; i < ARRAY_SIZE(lm_ctx->pci_config_space->hdr.bars); i++) {
+        if ((dev_info->pci_info.reg_info[i].flags & LM_REG_FLAG_MEM) == 0) {
+            lm_ctx->pci_config_space->hdr.bars[i].io.region_type |= 0x1;
+        }
+    }
+
+    lm_ctx->fops = dev_info->fops;
+    lm_ctx->pvt = dev_info->pvt;
+
+    // All eventfds start closed; they are installed later via SET_IRQS.
+    for (i = 0; i < max_ivs; i++) {
+        lm_ctx->irqs.efds[i] = -1;
+    }
+    lm_ctx->irqs.err_efd = -1;
+    lm_ctx->irqs.req_efd = -1;
+    lm_ctx->irqs.type = IRQ_NONE;
+    lm_ctx->irqs.max_ivs = max_ivs;
+
+    lm_ctx->log = dev_info->log;
+    lm_ctx->log_lvl = dev_info->log_lvl;
+
+    lm_ctx->pci_info.bar_fn = dev_info->pci_info.bar_fn;
+    lm_ctx->pci_info.rom_fn = dev_info->pci_info.rom_fn;
+    lm_ctx->pci_info.pci_config_fn = dev_info->pci_info.pci_config_fn;
+    lm_ctx->pci_info.vga_fn = dev_info->pci_info.vga_fn;
+
+out:
+    if (err) {
+        if (lm_ctx) {
+            // Release everything acquired so far. The DMA controller was
+            // previously leaked here, and dev_detach() was called even when
+            // attach had failed (fd == -1).
+            if (lm_ctx->dma != NULL) {
+                dma_controller_destroy(lm_ctx, lm_ctx->dma);
+            }
+            if (lm_ctx->fd >= 0) {
+                dev_detach(lm_ctx->fd);
+            }
+            free(lm_ctx->pci_config_space);
+            free(lm_ctx);
+            lm_ctx = NULL;
+        }
+        errno = err;
+    }
+    return lm_ctx;
+}
+
+/*
+ * Hex-dumps @count bytes of @buf to the debug log, 8 bytes per line, with
+ * an optional @prefix line. Compiles to a no-op unless DEBUG is defined.
+ */
+void
+dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix,
+            unsigned char const *const buf, const uint32_t count)
+{
+#ifdef DEBUG
+    int i;
+    const size_t bytes_per_line = 0x8;
+
+    if (strcmp(prefix, "")) {
+        lm_log(lm_ctx, LM_DBG, "%s\n", prefix);
+    }
+    for (i = 0; i < (int)count; i++) {
+        if (i % bytes_per_line != 0) {
+            lm_log(lm_ctx, LM_DBG, " ");
+        }
+        /* TODO valgrind emits a warning if count is 1 */
+        lm_log(lm_ctx, LM_DBG, "0x%02x", *(buf + i));
+        if ((i + 1) % bytes_per_line == 0) {
+            lm_log(lm_ctx, LM_DBG, "\n");
+        }
+    }
+    // Terminate a trailing partial line.
+    if (i % bytes_per_line != 0) {
+        lm_log(lm_ctx, LM_DBG, "\n");
+    }
+#endif
+}
+
+/*
+ * Returns a pointer to the standard part of the PCI configuration space.
+ */
+inline lm_pci_config_space_t *
+lm_get_pci_config_space(lm_ctx_t * const lm_ctx)
+{
+    // Accessor for the context's emulated PCI configuration space.
+    assert(lm_ctx != NULL);
+    return lm_ctx->pci_config_space;
+}
+
+/*
+ * Returns a pointer to the non-standard part of the PCI configuration space.
+ */
+inline uint8_t *
+lm_get_pci_non_std_config_space(lm_ctx_t * const lm_ctx)
+{
+    // Accessor for the config space beyond the standard PCI header.
+    assert(lm_ctx != NULL);
+    return (uint8_t *) & lm_ctx->pci_config_space->non_std;
+}
+
+/* Accessor for the context's region-info table. */
+inline lm_reg_info_t *
+lm_get_region_info(lm_ctx_t * const lm_ctx)
+{
+    assert(lm_ctx != NULL);
+    return lm_ctx->pci_info.reg_info;
+}
+
+/* Thin wrapper: translates a DMA address range into scatter/gather entries. */
+inline int
+lm_addr_to_sg(lm_ctx_t * const lm_ctx, dma_addr_t dma_addr,
+              uint32_t len, dma_scattergather_t * sg, int max_sg)
+{
+    return dma_addr_to_sg(lm_ctx, lm_ctx->dma, dma_addr, len, sg, max_sg);
+}
+
+/* Thin wrapper: maps scatter/gather entries into iovecs. */
+inline int
+lm_map_sg(lm_ctx_t * const lm_ctx, int prot,
+          const dma_scattergather_t * sg, struct iovec *iov, int cnt)
+{
+    return dma_map_sg(lm_ctx->dma, prot, sg, iov, cnt);
+}
+
+/* Thin wrapper: unmaps iovecs previously mapped via lm_map_sg(). */
+inline void
+lm_unmap_sg(lm_ctx_t * const lm_ctx, const dma_scattergather_t * sg,
+            struct iovec *iov, int cnt)
+{
+    return dma_unmap_sg(lm_ctx->dma, sg, iov, cnt);
+}
+
+/*
+ * Convenience helper: drives the context until the loop exits, then
+ * destroys it. The context is no longer valid after this returns.
+ */
+int
+lm_ctx_run(lm_ctx_t * const lm_ctx)
+{
+    int ret = lm_ctx_drive(lm_ctx);
+
+    lm_ctx_destroy(lm_ctx);
+    return ret;
+}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/libmuser_pci.c b/lib/libmuser_pci.c
new file mode 100644
index 0000000..df45336
--- /dev/null
+++ b/lib/libmuser_pci.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <sys/param.h>
+#include <errno.h>
+
+#include <linux/pci_regs.h>
+#include <linux/vfio.h>
+
+#include "muser.h"
+#include "pci.h"
+#include "common.h"
+
+/*
+ * Emulates a 32-bit write to a BAR register: handles the all-ones sizing
+ * probe by storing the size mask, and preserves the read-only low bits
+ * (memory/I/O type) on normal address writes. Writes to zero-sized BARs are
+ * ignored.
+ */
+static inline void
+muser_pci_hdr_write_bar(lm_ctx_t * const lm_ctx, const uint16_t bar_index,
+                        const char *const buf)
+{
+    uint32_t cfg_addr;
+    uint32_t *bar;
+    unsigned long mask;
+    lm_reg_info_t *reg_info;
+
+    // Assert before the first use of lm_ctx (the original dereferenced it
+    // via lm_get_region_info() before asserting).
+    assert(lm_ctx);
+
+    reg_info = lm_get_region_info(lm_ctx);
+
+    if (reg_info[bar_index].size == 0) {
+        return;
+    }
+
+    bar = (uint32_t *) & lm_get_pci_config_space(lm_ctx)->hdr.bars[bar_index];
+    cfg_addr = *(uint32_t *) buf;
+
+    lm_log(lm_ctx, LM_DBG, "BAR%d addr 0x%x\n", bar_index, cfg_addr);
+
+    // All-ones write is the standard BAR sizing probe: answer with the
+    // two's-complement size mask.
+    if (cfg_addr == 0xffffffff) {
+        cfg_addr = ~(reg_info[bar_index].size) + 1;
+    }
+
+    if ((reg_info[bar_index].flags & LM_REG_FLAG_MEM)) {
+        mask = PCI_BASE_ADDRESS_MEM_MASK;
+    } else {
+        mask = PCI_BASE_ADDRESS_IO_MASK;
+    }
+    // Keep the read-only type bits from the current BAR value.
+    cfg_addr |= (*bar & ~mask);
+
+    *bar = htole32(cfg_addr);
+}
+
+#define BAR_INDEX(offset) ((offset - PCI_BASE_ADDRESS_0) >> 2)
+
+/*
+ * Emulates a 16-bit write to the PCI command register: applies the
+ * supported bits (I/O space, memory space, bus master, SERR#, INTx
+ * disable) to the emulated header, logging transitions, and rejects the
+ * write if any unsupported bit remains set.
+ * Returns 0 on success, -EINVAL on bad size or unconsumed flags.
+ */
+static int
+handle_command_write(lm_ctx_t * const ctx, lm_pci_config_space_t * const pci,
+                     const char * const buf, const size_t count)
+{
+    uint16_t v;
+
+    assert(ctx);
+
+    // The command register is 16 bits wide; only full-width writes are
+    // emulated.
+    if (count != 2) {
+        lm_log(ctx, LM_ERR, "bad write command size %d\n", count);
+        return -EINVAL;
+    }
+
+    assert(pci);
+    assert(buf);
+
+    v = *(uint16_t*)buf;
+
+    // Each supported bit is consumed (cleared from v) as it is handled.
+    if ((v & PCI_COMMAND_IO) == PCI_COMMAND_IO) {
+        if (!pci->hdr.cmd.iose) {
+            pci->hdr.cmd.iose = 0x1;
+            lm_log(ctx, LM_INF, "I/O space enabled\n");
+        }
+        v &= ~PCI_COMMAND_IO;
+    } else {
+        if (pci->hdr.cmd.iose) {
+            pci->hdr.cmd.iose = 0x0;
+            lm_log(ctx, LM_INF, "I/O space disabled\n");
+        }
+    }
+
+    if ((v & PCI_COMMAND_MEMORY) == PCI_COMMAND_MEMORY) {
+        if (!pci->hdr.cmd.mse) {
+            pci->hdr.cmd.mse = 0x1;
+            lm_log(ctx, LM_INF, "memory space enabled\n");
+        }
+        v &= ~PCI_COMMAND_MEMORY;
+    } else {
+        if (pci->hdr.cmd.mse) {
+            pci->hdr.cmd.mse = 0x0;
+            lm_log(ctx, LM_INF, "memory space disabled\n");
+        }
+    }
+
+    if ((v & PCI_COMMAND_MASTER) == PCI_COMMAND_MASTER) {
+        if (!pci->hdr.cmd.bme) {
+            pci->hdr.cmd.bme = 0x1;
+            lm_log(ctx, LM_INF, "bus master enabled\n");
+        }
+        v &= ~PCI_COMMAND_MASTER;
+    } else {
+        if (pci->hdr.cmd.bme) {
+            pci->hdr.cmd.bme = 0x0;
+            lm_log(ctx, LM_INF, "bus master disabled\n");
+        }
+    }
+
+    if ((v & PCI_COMMAND_SERR) == PCI_COMMAND_SERR) {
+        if (!pci->hdr.cmd.see) {
+            pci->hdr.cmd.see = 0x1;
+            lm_log(ctx, LM_INF, "SERR# enabled\n");
+        }
+        v &= ~PCI_COMMAND_SERR;
+    } else {
+        if (pci->hdr.cmd.see) {
+            pci->hdr.cmd.see = 0x0;
+            lm_log(ctx, LM_INF, "SERR# disabled\n");
+        }
+    }
+
+    if ((v & PCI_COMMAND_INTX_DISABLE) == PCI_COMMAND_INTX_DISABLE) {
+        if (!pci->hdr.cmd.id) {
+            pci->hdr.cmd.id = 0x1;
+            lm_log(ctx, LM_INF, "INTx emulation enabled\n");
+        }
+        v &= ~PCI_COMMAND_INTX_DISABLE;
+    } else {
+        if (pci->hdr.cmd.id) {
+            pci->hdr.cmd.id = 0x0;
+            lm_log(ctx, LM_INF, "INTx emulation disabled\n");
+        }
+    }
+
+    // Any bit still set is one we do not emulate.
+    if (v) {
+        lm_log(ctx, LM_ERR, "unconsumed command flags %x\n", v);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+/*
+ * Emulates a 32-bit write to the expansion ROM base address register.
+ * Only the sizing mask, a zero (clear), and a disable write are accepted;
+ * everything else is rejected with -EINVAL.
+ */
+static int
+handle_erom_write(lm_ctx_t * const ctx, lm_pci_config_space_t * const pci,
+                  const char *const buf, const size_t count)
+{
+    uint32_t v;
+
+    assert(ctx);
+    assert(pci);
+
+    if (count != 0x4) {
+        lm_log(ctx, LM_ERR, "bad EROM count %d\n", count);
+        return -EINVAL;
+    }
+    v = *(uint32_t*)buf;
+
+    if (v == (uint32_t)PCI_ROM_ADDRESS_MASK) {
+        // Sizing probe; no ROM is exposed, so nothing to report.
+        lm_log(ctx, LM_INF, "write mask to EROM ignored\n");
+    } else if (v == 0) {
+        lm_log(ctx, LM_INF, "cleared EROM\n");
+        pci->hdr.erom = 0;
+    } else if (v == ~PCI_ROM_ADDRESS_ENABLE) {
+        lm_log(ctx, LM_INF, "EROM disable ignored\n");
+    } else {
+        lm_log(ctx, LM_ERR, "bad write to EROM 0x%x bytes\n", v);
+        return -EINVAL;
+    }
+    return 0;
+}
+
+/*
+ * Dispatches a write into the standard PCI header by register offset:
+ * command, status, interrupt line/pin, latency timer, BARs and expansion
+ * ROM each get dedicated handling; anything else is rejected.
+ * Returns 0 on success or a negative errno.
+ */
+static inline int
+muser_pci_hdr_write(lm_ctx_t * const lm_ctx, const uint16_t offset,
+                    const char *const buf, const size_t count)
+{
+    lm_pci_config_space_t *pci;
+    int ret = 0;
+
+    assert(lm_ctx);
+    assert(buf);
+
+    pci = lm_get_pci_config_space(lm_ctx);
+
+    switch (offset) {
+    case PCI_COMMAND:
+        ret = handle_command_write(lm_ctx, pci, buf, count);
+        break;
+    case PCI_STATUS:
+        lm_log(lm_ctx, LM_INF, "write to status ignored\n");
+        break;
+    case PCI_INTERRUPT_PIN:
+        lm_log(lm_ctx, LM_ERR, "attempt to write read-only field IPIN\n");
+        ret = -EINVAL;
+        break;
+    case PCI_INTERRUPT_LINE:
+        pci->hdr.intr.iline = buf[0];
+        break;
+    case PCI_LATENCY_TIMER:
+        pci->hdr.mlt = (uint8_t)buf[0];
+        lm_log(lm_ctx, LM_INF, "set to latency timer to %hhx\n", pci->hdr.mlt);
+        break;
+    case PCI_BASE_ADDRESS_0:
+    case PCI_BASE_ADDRESS_1:
+    case PCI_BASE_ADDRESS_2:
+    case PCI_BASE_ADDRESS_3:
+    case PCI_BASE_ADDRESS_4:
+    case PCI_BASE_ADDRESS_5:
+        muser_pci_hdr_write_bar(lm_ctx, BAR_INDEX(offset), buf);
+        break;
+    case PCI_ROM_ADDRESS:
+        ret = handle_erom_write(lm_ctx, pci, buf, count);
+        break;
+    default:
+        lm_log(lm_ctx, LM_INF, "PCI config write %x@%x not handled\n",
+               count, offset);
+        ret = -EINVAL;
+    }
+
+    dump_buffer(lm_ctx, "PCI header", pci->hdr.raw, 0xff);
+
+    return ret;
+}
+
+/*
+ * @pci_hdr: the PCI header
+ * @reg_info: region info
+ * @rw: the command
+ * @write: whether this is a PCI header write
+ * @count: output parameter that receives the number of bytes read/written
+ */
+/*
+ * @pci_hdr: the PCI header
+ * @reg_info: region info
+ * @rw: the command
+ * @write: whether this is a PCI header write
+ * @count: output parameter that receives the number of bytes read/written
+ */
+/*
+ * Performs the part of a config access that falls inside the standard PCI
+ * header: writes go through the header emulation, reads copy straight from
+ * the emulated header. On return *pos/*count are advanced/reduced by the
+ * consumed byte count so the caller can handle any remainder.
+ */
+static inline int
+muser_do_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count,
+                        loff_t * const pos, const bool is_write,
+                        unsigned char *const buf)
+{
+    size_t _count;
+    loff_t _pos;
+    int err = 0;
+
+    assert(lm_ctx);
+    assert(count);
+    assert(pos);
+    assert(buf);
+
+    // Translate to a header-relative offset and clamp to the header end.
+    _pos = *pos - lm_get_region_info(lm_ctx)[LM_DEV_CFG_REG_IDX].offset;
+    _count = MIN(*count, PCI_STD_HEADER_SIZEOF - _pos);
+
+    if (is_write) {
+        err = muser_pci_hdr_write(lm_ctx, _pos, buf, _count);
+    } else {
+        memcpy(buf, lm_get_pci_config_space(lm_ctx)->hdr.raw + _pos, _count);
+    }
+    *pos += _count;
+    *count -= _count;
+    return err;
+}
+
+/*
+ * True iff @pos falls within the standard PCI header portion of the
+ * configuration space region.
+ */
+static inline bool
+muser_is_pci_hdr_access(const lm_reg_info_t * const reg_info, const loff_t pos)
+{
+    const loff_t cfg_start = (loff_t) reg_info[LM_DEV_CFG_REG_IDX].offset;
+
+    return pos >= cfg_start && pos < cfg_start + PCI_STD_HEADER_SIZEOF;
+}
+
+/*
+ * Entry point for config-space accesses: if *pos lies inside the standard
+ * PCI header, performs (and consumes) that portion of the access via
+ * muser_do_pci_hdr_access(); otherwise leaves *pos/*count untouched and
+ * returns 0 so the caller handles the access itself.
+ */
+int
+muser_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count,
+                     loff_t * const pos, const bool is_write,
+                     unsigned char *const buf)
+{
+    assert(lm_ctx);
+    assert(count);
+    assert(pos);
+
+    if (!muser_is_pci_hdr_access(lm_get_region_info(lm_ctx), *pos)) {
+        return 0;
+    }
+    return muser_do_pci_hdr_access(lm_ctx, count, pos, is_write, buf);
+}
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/msicap.h b/lib/msicap.h
new file mode 100644
index 0000000..bfcf1cd
--- /dev/null
+++ b/lib/msicap.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+/* MSI capability header: next-capability pointer and capability ID. */
+struct mid {
+    unsigned int next:8;
+    unsigned int cid:8;
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct mid) == 0x2, "bad MID size");
+
+/* MSI Message Control: enable, multi-message capable/enable, 64-bit,
+ * per-vector masking. */
+struct mc {
+    unsigned int msie:1;
+    unsigned int mmc:3;
+    unsigned int mme:3;
+    unsigned int c64:1;
+    unsigned int pvm:1;
+    unsigned int res1:7;
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct mc) == 0x2, "bad MC size");
+
+/* MSI Message Address (low 32 bits, DWORD-aligned). */
+struct ma {
+    unsigned int res1:2;
+    unsigned int addr:30;
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct ma) == 0x4, "bad MA size");
+
+/* Full MSI capability structure (64-bit, per-vector masking layout). */
+struct msicap {
+    struct mid mid;
+    struct mc mc;
+    struct ma ma;
+    uint32_t mua;       // message upper address
+    uint16_t md;        // message data
+    uint16_t padding;
+    uint32_t mmask;     // mask bits
+    uint32_t mpend;     // pending bits
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct msicap) == 0x18, "bad MSICAP size");
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/muser.h b/lib/muser.h
new file mode 100644
index 0000000..a844f5c
--- /dev/null
+++ b/lib/muser.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef LIB_MUSER_H
+#define LIB_MUSER_H
+
+#include <stdint.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "pci.h"
+
+/**
+ * lm_fops_t - driver callbacks
+ *
+ * @read: read device configuration space; returns bytes read or negative
+ * @write: write device configuration space; returns bytes written or negative
+ * @mmap: mmap device configuration space; returns the mapped address
+ * @reset: reset the device; returns 0 on success
+ *
+ * All callbacks receive the @pvt pointer supplied in lm_dev_info_t.
+ */
+typedef struct {
+    ssize_t (*read) (void *pvt, const int index, char *buf, size_t count,
+                     loff_t pos);
+    ssize_t (*write) (void *pvt, const int index, char *buf, size_t count,
+                      loff_t pos);
+    unsigned long (*mmap) (void *pvt, unsigned long pgoff);
+    int (*reset) (void *pvt);
+} lm_fops_t;
+
+
+/**
+ * Callback function signatures for each regions.
+ *
+ * @lm_bar_access_t: typedef for BAR access function.
+ * @lm_non_bar_access_t: typedef for non-BAR(rom, pci config,
+ * vga) access functions.
+ */
+typedef ssize_t (lm_bar_access_t) (void *pvt, const int region_index,
+                                   char * const buf, size_t count,
+                                   loff_t offset, const bool is_write);
+typedef ssize_t (lm_non_bar_access_t) (void *pvt, char * const buf,
+                                       size_t count, loff_t offset,
+                                       const bool is_write);
+/*
+ * Per-device PCI description: IRQ vector counts per index, region layout,
+ * and optional per-region access callbacks. Regions without a callback fall
+ * back to the generic lm_fops_t read/write.
+ */
+typedef struct {
+    uint32_t irq_count[LM_DEV_NUM_IRQS];
+    lm_reg_info_t reg_info[LM_DEV_NUM_REGS];
+
+    /* Optional PCI region access callbacks. */
+    lm_bar_access_t *bar_fn;
+    lm_non_bar_access_t *rom_fn;
+    lm_non_bar_access_t *pci_config_fn;
+    lm_non_bar_access_t *vga_fn;
+} lm_pci_info_t;
+
+/**
+ * Callback function signature for log function
+ *
+ * @lm_log_fn_t: typedef for log function.
+ */
+typedef void (lm_log_fn_t) (void *pvt, const char *const msg);
+
+/**
+ * Device information structure, used to create the lm_ctx.
+ * To be filled and passed to lm_ctx_run()
+ */
+typedef struct {
+    char *uuid;                 // device UUID (names the muser device node)
+    void *pvt;                  // opaque pointer passed to all callbacks
+    /*
+     * whether an extended PCI configuration space should be created
+     */
+    bool extended;
+    int nr_dma_regions;         // DMA regions to support; 0 disables DMA
+    lm_log_fn_t *log;           // optional log sink
+    lm_log_lvl_t log_lvl;
+    lm_fops_t fops;             // generic device callbacks
+    lm_pci_hdr_id_t id;         // PCI vendor/device IDs
+    lm_pci_hdr_cc_t cc;         // PCI class code
+    lm_pci_info_t pci_info;     // region/IRQ layout and region callbacks
+} lm_dev_info_t;
+
+/**
+ * Creates libmuser context.
+ *
+ * Arguments:
+ * @dev_info: device information used to create the context.
+ */
+lm_ctx_t *lm_ctx_create(lm_dev_info_t * dev_info);
+
+/**
+ * Destroys libmuser context.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to destroy.
+ */
+void lm_ctx_destroy(lm_ctx_t * lm_ctx);
+
+/**
+ * Once the lm_ctx is configured, lm_ctx_drive() drives it: it waits for
+ * commands coming from muser.ko and processes them.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to drive.
+ */
+
+int lm_ctx_drive(lm_ctx_t * lm_ctx);
+
+
+/**
+ * Creates a mapping of the BARs into the caller's virtual memory. It should
+ * be called from lm_fops_t->mmap.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to create mapping from.
+ */
+void *lm_mmap(lm_ctx_t * lm_ctx, size_t length, off_t offset);
+
+/**
+ * Trigger interrupt.
+ *
+ * Arguments:
+ * @lm_ctx: libmuser context to trigger interrupt.
+ * @vector: vector to trigger the interrupt on.
+ */
+int lm_irq_trigger(lm_ctx_t * lm_ctx, uint32_t vector);
+
+/* Helper functions */
+
+/* Drives the context until the loop exits, then destroys it. */
+int lm_ctx_run(lm_ctx_t * const ctx);
+
+/* Returns the config space beyond the standard PCI header. */
+uint8_t *lm_get_pci_non_std_config_space(lm_ctx_t * const lm_ctx);
+
+/* Translates a DMA address range into scatter/gather entries. */
+int lm_addr_to_sg(lm_ctx_t * const ctx, dma_addr_t dma_addr, uint32_t len,
+                  dma_scattergather_t * sg, int max_sg);
+
+/* Maps scatter/gather entries into iovecs. */
+int
+lm_map_sg(lm_ctx_t * const ctx, int prot, const dma_scattergather_t * sg,
+          struct iovec *iov, int cnt);
+
+/* Unmaps iovecs previously mapped via lm_map_sg(). */
+void
+lm_unmap_sg(lm_ctx_t * const ctx, const dma_scattergather_t * sg,
+            struct iovec *iov, int cnt);
+
+/* Maps a file position to a region index; region-relative offset in *off. */
+int
+lm_get_region(lm_ctx_t * const ctx, const loff_t pos,
+              const size_t count, loff_t * const off);
+
+#ifdef DEBUG
+void
+dump_buffer(lm_ctx_t const *const lm_ctx, char const *const prefix,
+            unsigned char const *const buf, const uint32_t count);
+#endif
+
+#endif /* LIB_MUSER_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/pci.h b/lib/pci.h
new file mode 100644
index 0000000..4b7132a
--- /dev/null
+++ b/lib/pci.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef LIBMUSER_PCI_H
+#define LIBMUSER_PCI_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <linux/pci_regs.h>
+
+struct lm_ctx;
+typedef struct lm_ctx lm_ctx_t;
+
+typedef uint64_t dma_addr_t;
+
+typedef struct {
+ int region;
+ int length;
+ uint64_t offset;
+} dma_scattergather_t;
+
+typedef struct lm_ctx lm_ctx_t;
+typedef struct lm_reg_info lm_reg_info_t;
+typedef struct lm_pci_config_space lm_pci_config_space_t;
+
+typedef enum {
+ LM_ERR,
+ LM_INF,
+ LM_DBG
+} lm_log_lvl_t;
+
+#define PCI_CONFIG_SPACE_SIZEOF 0x100
+#define PCI_EXTENDED_CONFIG_SPACE_SIZEOF 0x1000
+
+enum {
+ LM_DEV_BAR0_REG_IDX,
+ LM_DEV_BAR1_REG_IDX,
+ LM_DEV_BAR2_REG_IDX,
+ LM_DEV_BAR3_REG_IDX,
+ LM_DEV_BAR4_REG_IDX,
+ LM_DEV_BAR5_REG_IDX,
+ LM_DEV_ROM_REG_IDX,
+ LM_DEV_CFG_REG_IDX,
+ LM_DEV_VGA_REG_IDX,
+ LM_DEV_NUM_REGS = 9
+};
+
+/*
+ * TODO lots of the sizes of each member are defined in pci_regs.h, use those
+ * instead?
+ */
+
+typedef union {
+ uint32_t raw;
+ struct {
+ uint16_t vid;
+ uint16_t sid;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_ss_t;
+_Static_assert(sizeof(lm_pci_hdr_ss_t) == 0x4, "bad SS size");
+
+typedef union {
+ uint8_t raw;
+} __attribute__ ((packed)) lm_pci_hdr_bist_t;
+_Static_assert(sizeof(lm_pci_hdr_bist_t) == 0x1, "bad BIST size");
+
+typedef union {
+ uint32_t raw;
+ union {
+ struct {
+ unsigned int region_type:1;
+ unsigned int locatable:2;
+ unsigned int prefetchable:1;
+ unsigned int base_address:28;
+ } __attribute__ ((packed)) mem;
+ struct {
+ unsigned int region_type:1;
+ unsigned int reserved:1;
+ unsigned int base_address:30;
+ } __attribute__ ((packed)) io;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_bar_t;
+_Static_assert(sizeof(lm_bar_t) == 0x4, "bad BAR size");
+
+typedef union {
+ uint8_t raw;
+} __attribute__ ((packed)) lm_pci_hdr_htype_t;
+_Static_assert(sizeof(lm_pci_hdr_htype_t) == 0x1, "bad HTYPE size");
+
+typedef union {
+ uint8_t raw[3];
+ struct {
+ uint8_t pi;
+ uint8_t scc;
+ uint8_t bcc;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_cc_t;
+_Static_assert(sizeof(lm_pci_hdr_cc_t) == 0x3, "bad CC size");
+
+/* device status */
+typedef union {
+ uint16_t raw;
+ struct {
+ unsigned int res1:3;
+ unsigned int is:1;
+ unsigned int cl:1;
+ unsigned int c66:1;
+ unsigned int res2:1;
+ unsigned int fbc:1;
+ unsigned int dpd:1;
+ unsigned int devt:2;
+ unsigned int sta:1;
+ unsigned int rta:1;
+ unsigned int rma:1;
+ unsigned int sse:1;
+ unsigned int dpe:1;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_sts_t;
+_Static_assert(sizeof(lm_pci_hdr_sts_t) == 0x2, "bad STS size");
+
+typedef union {
+ uint16_t raw;
+ struct {
+ uint8_t iose:1;
+ uint8_t mse:1;
+ uint8_t bme:1;
+ uint8_t sce:1;
+ uint8_t mwie:1;
+ uint8_t vga:1;
+ uint8_t pee:1;
+ uint8_t zero:1;
+ uint8_t see:1;
+ uint8_t fbe:1;
+ uint8_t id:1;
+ uint8_t res1:5;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_cmd_t;
+_Static_assert(sizeof(lm_pci_hdr_cmd_t) == 0x2, "bad CMD size");
+
+typedef union {
+ uint32_t raw;
+ struct {
+ uint16_t vid;
+ uint16_t did;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_id_t;
+_Static_assert(sizeof(lm_pci_hdr_id_t) == 0x4, "bad ID size");
+
+typedef union {
+ uint16_t raw;
+ struct {
+ uint8_t iline;
+ uint8_t ipin;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_intr_t;
+_Static_assert(sizeof(lm_pci_hdr_intr_t) == 0x2, "bad INTR size");
+
+typedef union {
+ uint8_t raw[PCI_STD_HEADER_SIZEOF];
+ struct {
+ lm_pci_hdr_id_t id;
+ lm_pci_hdr_cmd_t cmd;
+ lm_pci_hdr_sts_t sts;
+ uint8_t rid;
+ lm_pci_hdr_cc_t cc;
+ uint8_t cls;
+ uint8_t mlt;
+ lm_pci_hdr_htype_t htype;
+ lm_pci_hdr_bist_t bist;
+#define PCI_BARS_NR 6
+ lm_bar_t bars[PCI_BARS_NR];
+ uint32_t ccptr;
+ lm_pci_hdr_ss_t ss;
+ uint32_t erom;
+ uint8_t cap;
+ uint8_t res1[7];
+ lm_pci_hdr_intr_t intr;
+ uint8_t mgnt;
+ uint8_t mlat;
+ } __attribute__ ((packed));
+} __attribute__ ((packed)) lm_pci_hdr_t;
+_Static_assert(sizeof(lm_pci_hdr_t) == 0x40, "bad PCI header size");
+
+typedef struct {
+ uint8_t raw[PCI_CONFIG_SPACE_SIZEOF - PCI_STD_HEADER_SIZEOF];
+} __attribute__ ((packed)) lm_pci_non_std_config_space_t;
+_Static_assert(sizeof(lm_pci_non_std_config_space_t) == 0xc0,
+ "bad non-standard PCI configuration space size");
+
+struct lm_pci_config_space {
+ union {
+ uint8_t raw[PCI_CONFIG_SPACE_SIZEOF];
+ struct {
+ lm_pci_hdr_t hdr;
+ lm_pci_non_std_config_space_t non_std;
+ } __attribute__ ((packed));
+ } __attribute__ ((packed));
+ uint8_t extended[];
+} __attribute__ ((packed));
+_Static_assert(sizeof(struct lm_pci_config_space) == 0x100,
+ "bad PCI configuration space size");
+
+// Region flags.
+#define LM_REG_FLAG_READ (1 << 0)
+#define LM_REG_FLAG_WRITE (1 << 1)
+#define LM_REG_FLAG_MMAP (1 << 2) // TODO: how this relates to IO bar?
+#define LM_REG_FLAG_RW (LM_REG_FLAG_READ | LM_REG_FLAG_WRITE)
+#define LM_REG_FLAG_MEM (1 << 3) // if unset, bar is IO
+
+struct lm_reg_info {
+ uint32_t flags;
+ uint32_t size;
+ uint64_t offset;
+};
+
+enum {
+ LM_DEV_INTX_IRQ_IDX,
+ LM_DEV_MSI_IRQ_IDX,
+ LM_DEV_MSIX_IRQ_IDX,
+ LM_DEV_ERR_IRQ_IDX,
+ LM_DEV_REQ_IRQ_IDX,
+ LM_DEV_NUM_IRQS = 5
+};
+
+/*
+ * Returns a pointer to the PCI configuration space.
+ */
+lm_pci_config_space_t *lm_get_pci_config_space(lm_ctx_t * const lm_ctx);
+
+lm_reg_info_t *lm_get_region_info(lm_ctx_t * const lm_ctx);
+
+/*
+ * TODO the rest of these functions don't need to be public, put them in a
+ * private header file so libmuser.c can use them.
+ * TODO replace the "muser" prefix
+ */
+int
+muser_pci_hdr_access(lm_ctx_t * const lm_ctx, size_t * const count,
+ loff_t * const pos, const bool write,
+ unsigned char *const buf);
+
+
+
+#endif /* LIBMUSER_PCI_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/pmcap.h b/lib/pmcap.h
new file mode 100644
index 0000000..2757a3e
--- /dev/null
+++ b/lib/pmcap.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+struct pid {
+ unsigned int cid:8;
+ unsigned int next:8;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pid) == 0x2, "bad PID size");
+
+struct pc {
+ unsigned int vs:3;
+ unsigned int pmec:1;
+ unsigned int res:1;
+ unsigned int dsi:1;
+ unsigned int auxc:3;
+ unsigned int d1s:1;
+ unsigned int d2s:1;
+ unsigned int psup:5;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pc) == 0x2, "bad PC size");
+
+struct pmcs {
+ unsigned int ps:2;
+ unsigned int res1:1;
+ unsigned int nsfrst:1;
+ unsigned int res2:4;
+ unsigned int pmee:1;
+ unsigned int dse:4;
+ unsigned int dsc:2;
+ unsigned int pmes:1;
+};
+_Static_assert(sizeof(struct pc) == 0x2, "bad PC size");
+
+/* PCI Power Management Capability structure, padded/aligned to 8 bytes. */
+struct pmcap {
+    struct pid pid;
+    struct pc pc;
+    struct pmcs pmcs;
+} __attribute__((packed)) __attribute__ ((aligned(8)));
+_Static_assert(sizeof(struct pmcap) == 0x8, "bad PMCAP size");
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lib/pxcap.h b/lib/pxcap.h
new file mode 100644
index 0000000..fbea685
--- /dev/null
+++ b/lib/pxcap.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+struct pxid {
+ unsigned int cid:8;
+ unsigned int next:8;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxid) == 0x2, "bad PXID size");
+
+struct pxcap {
+ unsigned int ver:4;
+ unsigned int dpt:4;
+ unsigned int si:1;
+ unsigned int imn:5;
+ unsigned int res1:2;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxcap) == 0x2, "bad PXCAP size");
+
+struct pxdcap {
+ unsigned int mps:3;
+ unsigned int pfs:2;
+ unsigned int etfs:1;
+ unsigned int l0sl:3;
+ unsigned int l1l:3;
+ unsigned int per:1;
+ unsigned int res1:2;
+ unsigned int csplv:8;
+ unsigned int cspls:2;
+ unsigned int flrc:1;
+ unsigned int res2:3;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxdcap) == 0x4, "bad PXDCAP size");
+
+union pxdc {
+ uint16_t raw;
+ struct {
+ unsigned int cere:1;
+ unsigned int nfere:1;
+ unsigned int fere:1;
+ unsigned int urre:1;
+ unsigned int ero:1;
+ unsigned int mps:3;
+ unsigned int ete:1;
+ unsigned int pfe:1;
+ unsigned int appme:1;
+ unsigned int ens:1;
+ unsigned int mrrs:3;
+ unsigned int iflr:1;
+ } __attribute__((packed));
+} __attribute__((packed));
+_Static_assert(sizeof(union pxdc) == 0x2, "bad PXDC size");
+
+/* TODO not defining for now since all values are 0 for reset */
+struct pxds {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxds) == 0x2, "bad PXDS size");
+
+struct pxlcap {
+ unsigned int stuff:32;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxlcap) == 0x4, "bad PXLCAP size");
+
+struct pxlc {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxlc) == 0x2, "bad PXLC size");
+
+struct pxls {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxls) == 0x2, "bad PXLS size");
+
+struct pxdcap2 {
+ unsigned int ctrs:4;
+ unsigned int ctds:1;
+ unsigned int arifs:1;
+ unsigned int aors:1;
+ unsigned int aocs32:1;
+ unsigned int aocs64:1;
+ unsigned int ccs128:1;
+ unsigned int nprpr:1;
+ unsigned int ltrs:1;
+ unsigned int tphcs:2;
+ unsigned int obffs:2;
+ unsigned int effs:1;
+ unsigned int eetps:1;
+ unsigned int meetp:2;
+ unsigned int res1:8;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxdcap2) == 0x4, "bad PXDCAP2 size");
+
+struct pxdc2 {
+ unsigned int stuff:16;
+} __attribute__((packed));
+_Static_assert(sizeof(struct pxdc2) == 0x2, "bad PXDC2 size");
+
+/* TODO name conflicts with PXCAP */
+struct PCI_Express_Capability {
+ struct pxid pxid;
+ struct pxcap pxcap;
+ struct pxdcap pxdcap;
+ union pxdc pxdc;
+ struct pxds pxds;
+ struct pxlcap pxlcap;
+ struct pxlc pxlc;
+ struct pxls pxls;
+ uint8_t pad[0x10];
+ struct pxdcap2 pxdcap2;
+ struct pxdc2 pxdc2;
+} __attribute__((packed));
+_Static_assert(sizeof(struct PCI_Express_Capability) == 0x2a,
+ "bad PCI Express Capability size");
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */