path: root/lib
author     Thanos Makatos <thanos.makatos@nutanix.com>   2020-11-11 07:35:10 -0500
committer  Thanos Makatos <thanos.makatos@nutanix.com>   2020-11-11 07:35:10 -0500
commit     b6e8c7b39456e7c0a164f87c48eac2bbfd6d85f3 (patch)
tree       c94839c02cde83bca416221bd906e4952fbc8c53 /lib
parent     b9a2e75360e14e59db651d6081894e0cf20e7c2d (diff)
parent     985940e6539eaf8f41e0b6421938b5bf5c1db22c (diff)
Merge branch 'vfio-user'
Diffstat (limited to 'lib')
-rw-r--r--  lib/CMakeLists.txt |    4
-rw-r--r--  lib/cap.c          |  444
-rw-r--r--  lib/cap.h          |    9
-rw-r--r--  lib/caps/common.h  |   46
-rw-r--r--  lib/caps/msi.h     |    9
-rw-r--r--  lib/caps/msix.h    |    9
-rw-r--r--  lib/caps/pm.h      |   15
-rw-r--r--  lib/caps/px.h      |    9
-rw-r--r--  lib/common.h       |    8
-rw-r--r--  lib/dma.c          |  248
-rw-r--r--  lib/dma.h          |  137
-rw-r--r--  lib/muser.h        |  232
-rw-r--r--  lib/muser_ctx.c    | 2242
-rw-r--r--  lib/muser_pci.c    |   75
-rw-r--r--  lib/muser_priv.h   |  113
-rw-r--r--  lib/vfio_user.h    |  167
16 files changed, 3093 insertions(+), 674 deletions(-)
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index e2084fe..bc9e4b8 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -32,14 +32,14 @@ set(CMAKE_C_FLAGS "-Wall -Wextra -Werror -fPIC")
set(CMAKE_C_FLAGS_DEBUG "-O0 -ggdb")
add_library(muser SHARED
- ../kmod/muser.h
+ vfio_user.h
muser.h
muser_priv.h
common.h)
target_link_libraries(muser muser_ctx muser_pci dma cap)
set_target_properties(muser PROPERTIES LINKER_LANGUAGE C)
-set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h")
+set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h;vfio_user.h")
set(UT_CFLAGS "-O0 -ggdb --coverage")
set(UT_LFLAGS "--coverage")
diff --git a/lib/cap.c b/lib/cap.c
index ca2235a..451c85a 100644
--- a/lib/cap.c
+++ b/lib/cap.c
@@ -34,56 +34,60 @@
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
#include "muser.h"
#include "cap.h"
struct cap {
- uint8_t start;
- uint8_t end;
- uint8_t id;
- lm_cap_access_t *fn;
+ uint8_t start;
+ uint8_t end;
};
struct caps {
- struct cap caps[LM_MAX_CAPS];
- int nr_caps;
+ struct cap caps[LM_MAX_CAPS]; /* FIXME only needs to be as big as nr_caps */
+ unsigned int nr_caps;
};
/*
* Tells whether a capability is being accessed.
*/
static bool
-cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset)
+cap_is_accessed(struct cap *caps, int nr_caps, size_t count, loff_t offset)
{
- /*
- * Ignore if it's at the standard PCI header. The first capability starts
- * right after that.
- */
- if (offset < PCI_STD_HEADER_SIZEOF) {
- return false;
- }
-
- /* ignore if there are no capabilities */
- if (!nr_caps) {
+ if (nr_caps == 0) {
return false;
}
- assert(caps);
+ assert(caps != NULL);
- /*
- * Ignore if it's before the first capability. This check is probably
- * redundant since we assume that the first capability starts right after
- * the standard PCI header.
- * TODO should we check that it doesn't cross into the first capability?
- */
if (offset < caps[0].start) {
+ /* write starts before first capability */
+
+ if (offset + count <= caps[0].start) {
+ /* write ends before first capability */
+ return false;
+ }
+
+ /*
+ * FIXME write starts before capabilities but extends into them. I don't
+ * think that the while loop in lm_access will allow this in the first
+ * place.
+ */
+ assert(false);
+ } else if (offset > caps[nr_caps - 1].end) {
+ /* write starts after last capability */
return false;
}
- /* ignore if it's past the last capability */
- if (offset > caps[nr_caps - 1].end) {
- return false;
+ if (offset + count > (size_t)(caps[nr_caps - 1].end + 1)) {
+ /*
+ * FIXME write starts within capabilities but extends past them, I think
+ * that this _is_ possible, e.g. MSI-X is 12 bytes (PCI_CAP_MSIX_SIZEOF)
+ * and the host writes to first 8 bytes and then writes 8 more.
+ */
+ assert(false);
}
return true;
}
@@ -92,151 +96,369 @@ cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset)
* Returns the PCI capability that is contained within the specified region
* (offset + count).
*/
-static struct cap *
-cap_find(struct cap *caps, int nr_caps, loff_t offset, size_t count)
+static uint8_t *
+cap_find(lm_pci_config_space_t *config_space, struct caps *caps, loff_t offset,
+ size_t count)
{
struct cap *cap;
- cap = caps;
- while (cap < caps + nr_caps) {
+ assert(config_space != NULL);
+ assert(caps != NULL);
+
+ cap = caps->caps;
+ while (cap < caps->caps + caps->nr_caps) {
/*
- * TODO this assumes that at most one capability is read. It might be
- * legitimate to read an arbitrary number of bytes, which we could
- * support. For now lets explicitly fail such cases.
+ * FIXME ensure that at most one capability is written to. It might be
+ * legitimate to write to two capabilities at the same time.
*/
- if (offset >= cap->start && offset + count - 1 <= cap->end) {
- return cap;
+ if (offset >= cap->start && offset <= cap->end) {
+ if (offset + count - 1 > cap->end) {
+ assert(false);
+ }
+ return config_space->raw + cap->start;
}
cap++;
}
- /* this means that the access spans more than a capability */
return NULL;
}
-/*
- * Tells whether the header of a PCI capability is accessed.
- */
static bool
-cap_header_is_accessed(struct cap *cap, loff_t offset)
+cap_is_valid(uint8_t id)
{
- assert(cap);
- return offset - cap->start <= 1;
+ /* TODO 0 is a valid capability ID (Null Capability), check
+ * https://pcisig.com/sites/default/files/files/PCI_Code-ID_r_1_11__v24_Jan_2019.pdf:
+ *
+ */
+ return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX;
}
-/*
- * Reads the header of a PCI capability.
- */
-static int
-cap_header_access(struct caps *caps, struct cap *cap, char *buf,
- loff_t offset, size_t count, bool is_write)
+uint8_t *
+cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id)
{
- int n;
+ uint8_t *pos;
+ lm_pci_config_space_t *config_space;
- /*
- * We don't allow ID and next to be written. TODO not sure what the PCI
- * spec says about this, need to check.
- */
- if (is_write) {
- return -EINVAL;
+ if (!cap_is_valid(id)) {
+ errno = EINVAL;
+ return NULL;
}
- assert(caps);
- assert(cap);
- n = 0;
- /*
- * We handle reads to ID and next, the rest is handled by the callback.
- */
- if (offset == cap->start && count > 0) { /* ID */
- buf[n++] = cap->id;
- offset++;
- count--;
+ config_space = lm_get_pci_config_space(lm_ctx);
+
+ if (config_space->hdr.cap == 0) {
+ errno = ENOENT;
+ return NULL;
}
- if (offset == cap->start + 1 && count > 0) { /* next */
- if ((cap - caps->caps) / sizeof *cap == (size_t)(caps->nr_caps - 1)) {
- buf[n++] = 0;
- } else {
- buf[n++] = (cap + 1)->start;
+ pos = config_space->raw + config_space->hdr.cap;
+ while (true) {
+ if (*(pos + PCI_CAP_LIST_ID) == id) {
+ return pos;
}
-
- offset++;
- count--;
+ if (*(pos + PCI_CAP_LIST_NEXT) == 0) {
+ break;
+ }
+ pos = config_space->raw + *(pos + PCI_CAP_LIST_NEXT);
}
- return n;
+ errno = ENOENT;
+ return NULL;
}
+/*
+ * Tells whether the header of a PCI capability is accessed.
+ */
+static bool
+cap_header_is_accessed(uint8_t cap_offset, loff_t offset)
+{
+ return offset - cap_offset <= 1;
+}
+
+typedef ssize_t (cap_access) (lm_ctx_t *lm_ctx, uint8_t *cap, char *buf,
+ size_t count, loff_t offset);
+
+static ssize_t
+handle_pmcs_write(lm_ctx_t *lm_ctx, struct pmcap *pm,
+ const struct pmcs *const pmcs)
+{
+
+ if (pm->pmcs.ps != pmcs->ps) {
+ lm_log(lm_ctx, LM_DBG, "power state set to %#x\n", pmcs->ps);
+ }
+ if (pm->pmcs.pmee != pmcs->pmee) {
+ lm_log(lm_ctx, LM_DBG, "PME enable set to %#x\n", pmcs->pmee);
+ }
+ if (pm->pmcs.dse != pmcs->dse) {
+ lm_log(lm_ctx, LM_DBG, "data select set to %#x\n", pmcs->dse);
+ }
+ if (pm->pmcs.pmes != pmcs->pmes) {
+ lm_log(lm_ctx, LM_DBG, "PME status set to %#x\n", pmcs->pmes);
+ }
+ pm->pmcs = *pmcs;
+ return 0;
+}
+
+static ssize_t
+handle_pm_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf,
+ const size_t count, const loff_t offset)
+{
+ struct pmcap *pm = (struct pmcap *)cap;
+
+ switch (offset) {
+ case offsetof(struct pmcap, pc):
+ if (count != sizeof(struct pc)) {
+ return -EINVAL;
+ }
+ assert(false); /* FIXME implement */
+ case offsetof(struct pmcap, pmcs):
+ if (count != sizeof(struct pmcs)) {
+ return -EINVAL;
+ }
+ return handle_pmcs_write(lm_ctx, pm, (struct pmcs *)buf);
+ }
+ return -EINVAL;
+}
+
+static ssize_t
+handle_mxc_write(lm_ctx_t *lm_ctx, struct msixcap *msix,
+ const struct mxc *const mxc)
+{
+ assert(msix != NULL);
+ assert(mxc != NULL);
+
+ if (mxc->mxe != msix->mxc.mxe) {
+ lm_log(lm_ctx, LM_DBG, "%s MSI-X\n", mxc->mxe ? "enable" : "disable");
+ msix->mxc.mxe = mxc->mxe;
+ }
+
+ if (mxc->fm != msix->mxc.fm) {
+ if (mxc->fm) {
+ lm_log(lm_ctx, LM_DBG, "all MSI-X vectors masked\n");
+ } else {
+ lm_log(lm_ctx, LM_DBG,
+ "vector's mask bit determines whether vector is masked\n");
+ }
+ msix->mxc.fm = mxc->fm;
+ }
+
+ return sizeof(struct mxc);
+}
+
+static ssize_t
+handle_msix_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf,
+ const size_t count, const loff_t offset)
+{
+ struct msixcap *msix = (struct msixcap *)cap;
+
+ if (count == sizeof(struct mxc)) {
+ switch (offset) {
+ case offsetof(struct msixcap, mxc):
+ return handle_mxc_write(lm_ctx, msix, (struct mxc *)buf);
+ default:
+ lm_log(lm_ctx, LM_ERR, "invalid MSI-X write offset %ld\n", offset);
+ return -EINVAL;
+ }
+ }
+ lm_log(lm_ctx, LM_ERR, "invalid MSI-X write size %lu\n", count);
+ return -EINVAL;
+}
+
+static int
+handle_px_pxdc_write(lm_ctx_t *lm_ctx, struct pxcap *px, const union pxdc *const p)
+{
+ assert(px != NULL);
+ assert(p != NULL);
+
+ if (p->cere != px->pxdc.cere) {
+ px->pxdc.cere = p->cere;
+ lm_log(lm_ctx, LM_DBG, "CERE %s\n", p->cere ? "enable" : "disable");
+ }
+
+ if (p->nfere != px->pxdc.nfere) {
+ px->pxdc.nfere = p->nfere;
+ lm_log(lm_ctx, LM_DBG, "NFERE %s\n", p->nfere ? "enable" : "disable");
+ }
+
+ if (p->fere != px->pxdc.fere) {
+ px->pxdc.fere = p->fere;
+ lm_log(lm_ctx, LM_DBG, "FERE %s\n", p->fere ? "enable" : "disable");
+ }
+
+ if (p->urre != px->pxdc.urre) {
+ px->pxdc.urre = p->urre;
+ lm_log(lm_ctx, LM_DBG, "URRE %s\n", p->urre ? "enable" : "disable");
+ }
+
+ if (p->ero != px->pxdc.ero) {
+ px->pxdc.ero = p->ero;
+ lm_log(lm_ctx, LM_DBG, "ERO %s\n", p->ero ? "enable" : "disable");
+ }
+
+ if (p->mps != px->pxdc.mps) {
+ px->pxdc.mps = p->mps;
+ lm_log(lm_ctx, LM_DBG, "MPS set to %d\n", p->mps);
+ }
+
+ if (p->ete != px->pxdc.ete) {
+ px->pxdc.ete = p->ete;
+ lm_log(lm_ctx, LM_DBG, "ETE %s\n", p->ete ? "enable" : "disable");
+ }
+
+ if (p->pfe != px->pxdc.pfe) {
+ px->pxdc.pfe = p->pfe;
+ lm_log(lm_ctx, LM_DBG, "PFE %s\n", p->pfe ? "enable" : "disable");
+ }
+
+ if (p->appme != px->pxdc.appme) {
+ px->pxdc.appme = p->appme;
+ lm_log(lm_ctx, LM_DBG, "APPME %s\n", p->appme ? "enable" : "disable");
+ }
+
+ if (p->ens != px->pxdc.ens) {
+ px->pxdc.ens = p->ens;
+ lm_log(lm_ctx, LM_DBG, "ENS %s\n", p->ens ? "enable" : "disable");
+ }
+
+ if (p->mrrs != px->pxdc.mrrs) {
+ px->pxdc.mrrs = p->mrrs;
+ lm_log(lm_ctx, LM_DBG, "MRRS set to %d\n", p->mrrs);
+ }
+
+ if (p->iflr) {
+ lm_log(lm_ctx, LM_DBG,
+ "initiate function level reset\n");
+ }
+
+ return 0;
+}
+
+static int
+handle_px_write_2_bytes(lm_ctx_t *lm_ctx, struct pxcap *px, char *const buf,
+ loff_t off)
+{
+ switch (off) {
+ case offsetof(struct pxcap, pxdc):
+ return handle_px_pxdc_write(lm_ctx, px, (union pxdc *)buf);
+ }
+ return -EINVAL;
+}
+
+static ssize_t
+handle_px_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf,
+ size_t count, loff_t offset)
+{
+ struct pxcap *px = (struct pxcap *)cap;
+
+ int err = -EINVAL;
+ switch (count) {
+ case 2:
+ err = handle_px_write_2_bytes(lm_ctx, px, buf, offset);
+ break;
+ }
+ if (err != 0) {
+ return err;
+ }
+ return count;
+}
+
+static const struct cap_handler {
+ char *name;
+ size_t size;
+ cap_access *fn;
+} cap_handlers[PCI_CAP_ID_MAX + 1] = {
+ [PCI_CAP_ID_PM] = {"PM", PCI_PM_SIZEOF, handle_pm_write},
+ [PCI_CAP_ID_EXP] = {"PCI Express", PCI_CAP_EXP_ENDPOINT_SIZEOF_V2,
+ handle_px_write},
+ [PCI_CAP_ID_MSIX] = {"MSI-X", PCI_CAP_MSIX_SIZEOF, handle_msix_write},
+};
+
ssize_t
-cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count,
- loff_t offset, bool is_write)
+cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count,
+ loff_t offset)
{
- struct cap *cap;
+ lm_pci_config_space_t *config_space;
+ uint8_t *cap;
- if (!caps) {
+ if (caps == NULL) {
return 0;
}
- if (!count) {
+ if (count == 0) {
return 0;
}
- if (!cap_is_accessed(caps->caps, caps->nr_caps, offset)) {
+ if (!cap_is_accessed(caps->caps, caps->nr_caps, count, offset)) {
return 0;
}
/* we're now guaranteed that the access is within some capability */
- cap = cap_find(caps->caps, caps->nr_caps, offset, count);
+ config_space = lm_get_pci_config_space(lm_ctx);
+ cap = cap_find(config_space, caps, offset, count);
+ assert(cap != NULL); /* FIXME */
- if (!cap) {
- return 0;
- }
-
- if (cap_header_is_accessed(cap, offset)) {
- return cap_header_access(caps, cap, buf, offset, count, is_write);
- }
- if (count > 0) {
- return cap->fn(pvt, cap->id, buf, count, offset - cap->start, is_write);
+ if (cap_header_is_accessed(cap - config_space->raw, offset)) {
+ /* FIXME how to deal with writes to capability header? */
+ assert(false);
}
- return 0;
-}
-
-static bool
-cap_is_valid(uint8_t id)
-{
- return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX;
+ return cap_handlers[cap[PCI_CAP_LIST_ID]].fn(lm_ctx, cap, buf, count,
+ offset - (loff_t)(cap - config_space->raw));
}
struct caps *
-caps_create(const lm_cap_t *lm_caps, int nr_caps)
+caps_create(lm_ctx_t *lm_ctx, lm_cap_t **lm_caps, int nr_caps)
{
- uint8_t prev_end;
int i, err = 0;
- struct caps *caps = NULL;
+ uint8_t *prev;
+ uint8_t next;
+ lm_pci_config_space_t *config_space;
+ struct caps *caps;
if (nr_caps <= 0 || nr_caps >= LM_MAX_CAPS) {
err = EINVAL;
goto out;
}
- assert(lm_caps);
+ assert(lm_caps != NULL);
caps = calloc(1, sizeof *caps);
- if (!caps) {
- err = errno;
+ if (caps == NULL) {
goto out;
}
- prev_end = PCI_STD_HEADER_SIZEOF - 1;
+ config_space = lm_get_pci_config_space(lm_ctx);
+ /* points to the next field of the previous capability */
+ prev = &config_space->hdr.cap;
+
+ /* relative offset that points where the next capability should be placed */
+ next = PCI_STD_HEADER_SIZEOF;
+
for (i = 0; i < nr_caps; i++) {
- if (!cap_is_valid(lm_caps[i].id) || !lm_caps[i].fn || !lm_caps[i].size) {
+ uint8_t *cap = (uint8_t*)lm_caps[i];
+ uint8_t id = cap[PCI_CAP_LIST_ID];
+ size_t size;
+
+ if (!cap_is_valid(id)) {
+ err = EINVAL;
+ goto out;
+ }
+
+ size = cap_handlers[id].size;
+ if (size == 0) {
err = EINVAL;
goto out;
}
- caps->caps[i].id = lm_caps[i].id;
- caps->caps[i].fn = lm_caps[i].fn;
- /* FIXME PCI capabilities must be dword aligned. */
- caps->caps[i].start = prev_end + 1;
- caps->caps[i].end = prev_end = caps->caps[i].start + lm_caps[i].size - 1;
+ caps->caps[i].start = next;
+ caps->caps[i].end = next + size - 1;
+
+ memcpy(&config_space->hdr.raw[next], cap, size);
+ *prev = next;
+ prev = &config_space->hdr.raw[next + PCI_CAP_LIST_NEXT];
+ *prev = 0;
+ next += size;
+ assert(next % 4 == 0); /* FIXME */
+
+ lm_log(lm_ctx, LM_DBG, "initialized capability %s %#x-%#x\n",
+ cap_handlers[id].name, caps->caps[i].start, caps->caps[i].end);
}
caps->nr_caps = nr_caps;
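
The reworked caps_create() copies fully-formed capability structures straight
into PCI config space and wires up the next pointers itself, instead of
dispatching accesses through per-capability callbacks. A minimal sketch of how
a device would now register a Power Management capability, assuming the
lm_cap_t union and the caps/nr_caps fields added to lm_dev_info_t in the
lib/muser.h hunk further below (setup_caps and my_caps are hypothetical names):

    static lm_cap_t pm_cap = {
        .pm = {
            .hdr = { .id = PCI_CAP_ID_PM }, /* .next is filled in by caps_create() */
        }
    };

    static lm_cap_t *my_caps[] = { &pm_cap };

    static void setup_caps(lm_dev_info_t *dev_info)
    {
        dev_info->caps = my_caps;
        dev_info->nr_caps = 1;  /* PCI_PM_SIZEOF bytes are copied into config space */
    }
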
diff --git a/lib/cap.h b/lib/cap.h
index e814d6c..1f72247 100644
--- a/lib/cap.h
+++ b/lib/cap.h
@@ -44,7 +44,7 @@ struct caps;
* capabilities have been added.
*/
struct caps *
-caps_create(const lm_cap_t *caps, int nr_caps);
+caps_create(lm_ctx_t *lm_ctx, lm_cap_t **caps, int nr_caps);
/*
* Conditionally accesses the PCI capabilities. Returns:
@@ -54,8 +54,11 @@ caps_create(const lm_cap_t *caps, int nr_caps);
* <0: negative error code on error.
*/
ssize_t
-cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count,
- loff_t offset, bool is_write);
+cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count,
+ loff_t offset);
+
+uint8_t *
+cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id);
#endif /* __CAP_H__ */
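
With capabilities now living in config space itself, cap_find_by_id() hands
callers a pointer into that space rather than into a shadow structure. An
illustrative lookup (a sketch, not part of the patch):

    struct msixcap *msix =
        (struct msixcap *)cap_find_by_id(lm_ctx, PCI_CAP_ID_MSIX);
    if (msix == NULL) {
        /* errno: EINVAL for an invalid ID, ENOENT if the capability is absent */
    }
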
diff --git a/lib/caps/common.h b/lib/caps/common.h
new file mode 100644
index 0000000..2181a3b
--- /dev/null
+++ b/lib/caps/common.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#ifndef LM_PCI_CAP_COMMON_H
+#define LM_PCI_CAP_COMMON_H
+
+#include <stddef.h>
+
+struct cap_hdr {
+ uint8_t id;
+ uint8_t next;
+} __attribute__((packed));
+_Static_assert(sizeof(struct cap_hdr) == 0x2, "bad PCI capability header size");
+_Static_assert(offsetof(struct cap_hdr, id) == PCI_CAP_LIST_ID, "bad offset");
+_Static_assert(offsetof(struct cap_hdr, next) == PCI_CAP_LIST_NEXT, "bad offset");
+
+#endif /* LM_PCI_CAP_COMMON_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
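
Because every capability struct now begins with struct cap_hdr, the capability
list can be walked generically; the per-capability mid/pid/pxid/mxid header
types deleted in the following hunks are no longer needed. A sketch of such a
walk (find_cap is a hypothetical helper, not part of the patch):

    static uint8_t *
    find_cap(lm_pci_config_space_t *cfg, uint8_t id)
    {
        uint8_t off = cfg->hdr.cap;                 /* first capability pointer */

        while (off != 0) {
            struct cap_hdr *hdr = (struct cap_hdr *)(cfg->raw + off);
            if (hdr->id == id) {
                return (uint8_t *)hdr;
            }
            off = hdr->next;                        /* 0 terminates the list */
        }
        return NULL;
    }
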
diff --git a/lib/caps/msi.h b/lib/caps/msi.h
index b310ae9..5933006 100644
--- a/lib/caps/msi.h
+++ b/lib/caps/msi.h
@@ -33,11 +33,7 @@
#ifndef LM_PCI_CAP_MSI_H
#define LM_PCI_CAP_MSI_H
-struct mid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__ ((packed));
-_Static_assert(sizeof(struct mid) == 0x2, "bad MID size");
+#include "common.h"
struct mc {
unsigned int msie:1;
@@ -56,7 +52,7 @@ struct ma {
_Static_assert(sizeof(struct ma) == 0x4, "bad MA size");
struct msicap {
- struct mid mid;
+ struct cap_hdr hdr;
struct mc mc;
struct ma ma;
uint32_t mua;
@@ -66,6 +62,7 @@ struct msicap {
uint32_t mpend;
} __attribute__ ((packed));
_Static_assert(sizeof(struct msicap) == 0x18, "bad MSICAP size");
+_Static_assert(offsetof(struct msicap, hdr) == 0, "bad offset");
#endif /* LM_CAP_MSI_H */
diff --git a/lib/caps/msix.h b/lib/caps/msix.h
index b13c1c8..b0bc1a5 100644
--- a/lib/caps/msix.h
+++ b/lib/caps/msix.h
@@ -35,12 +35,6 @@
#include <linux/pci_regs.h>
-struct mxid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__ ((packed));
-_Static_assert(sizeof(struct mxid) == 0x2, "bad MXID size");
-
struct mxc {
unsigned int ts:11;
unsigned int reserved:3;
@@ -63,12 +57,13 @@ _Static_assert(sizeof(struct mtab) == PCI_MSIX_PBA - PCI_MSIX_TABLE,
"bad MPBA size");
struct msixcap {
- struct mxid mxid;
+ struct cap_hdr hdr;
struct mxc mxc;
struct mtab mtab;
struct mpba mpba;
} __attribute__ ((packed)) __attribute__ ((aligned(4)));
_Static_assert(sizeof(struct msixcap) == PCI_CAP_MSIX_SIZEOF, "bad MSI-X size");
+_Static_assert(offsetof(struct msixcap, hdr) == 0, "bad offset");
#endif /* LM_CAP_MSIX_H */
diff --git a/lib/caps/pm.h b/lib/caps/pm.h
index ddae2e6..e976d95 100644
--- a/lib/caps/pm.h
+++ b/lib/caps/pm.h
@@ -33,11 +33,7 @@
#ifndef LM_PCI_CAP_PM_H
#define LM_PCI_CAP_PM_H
-struct pid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__((packed));
-_Static_assert(sizeof(struct pid) == 0x2, "bad PID size");
+#include "common.h"
struct pc {
unsigned int vs:3;
@@ -60,15 +56,16 @@ struct pmcs {
unsigned int dse:4;
unsigned int dsc:2;
unsigned int pmes:1;
-};
-_Static_assert(sizeof(struct pc) == 0x2, "bad PC size");
+} __attribute__((packed));
+_Static_assert(sizeof(struct pmcs) == 0x2, "bad PMCS size");
struct pmcap {
- struct pid pid;
+ struct cap_hdr hdr;
struct pc pc;
struct pmcs pmcs;
-} __attribute__((packed)) __attribute__ ((aligned(8)));
+} __attribute__((packed)) __attribute__ ((aligned(8))); /* FIXME why does it need to be aligned? */
_Static_assert(sizeof(struct pmcap) == PCI_PM_SIZEOF, "bad PC size");
+_Static_assert(offsetof(struct pmcap, hdr) == 0, "bad offset");
#endif /* LM_CAP_PM_H */
diff --git a/lib/caps/px.h b/lib/caps/px.h
index ce17cfe..28a04d5 100644
--- a/lib/caps/px.h
+++ b/lib/caps/px.h
@@ -33,11 +33,7 @@
#ifndef LM_PCI_CAP_PX_H
#define LM_PCI_CAP_PX_H
-struct pxid {
- unsigned int cid:8;
- unsigned int next:8;
-} __attribute__((packed));
-_Static_assert(sizeof(struct pxid) == 0x2, "bad PXID size");
+#include "common.h"
struct pxcaps {
unsigned int ver:4;
@@ -133,7 +129,7 @@ _Static_assert(sizeof(struct pxdc2) == 0x2, "bad PXDC2 size");
* the whole struct.
*/
struct pxcap {
- struct pxid pxid;
+ struct cap_hdr hdr;
struct pxcaps pxcaps;
struct pxdcap pxdcap;
union pxdc pxdc;
@@ -147,6 +143,7 @@ struct pxcap {
} __attribute__((packed));
_Static_assert(sizeof(struct pxcap) == 0x2a,
"bad PCI Express Capability size");
+_Static_assert(offsetof(struct pxcap, hdr) == 0, "bad offset");
#endif /* LM_PCI_CAP_PX_H */
diff --git a/lib/common.h b/lib/common.h
index 27d6735..f5de4d8 100644
--- a/lib/common.h
+++ b/lib/common.h
@@ -45,18 +45,18 @@
#define likely(e) __builtin_expect(!!(e), 1)
#define unlikely(e) __builtin_expect(e, 0)
+/* XXX NB 2nd argument must be power of two */
#define ROUND_DOWN(x, a) ((x) & ~((a)-1))
#define ROUND_UP(x,a) ROUND_DOWN((x)+(a)-1, a)
void
lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...);
-#ifdef DEBUG
+#ifdef LM_VERBOSE_LOGGING
void
-dump_buffer(lm_ctx_t *lm_ctx, const char *prefix,
- const char *buf, uint32_t count);
+dump_buffer(const char *prefix, const char *buf, uint32_t count);
#else
-#define dump_buffer(lm_ctx, prefix, buf, count)
+#define dump_buffer(prefix, buf, count)
#endif
#endif /* __COMMON_H__ */
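
A worked example for the power-of-two note added above: the macros rely on
~((a)-1) being a contiguous bit mask, which only holds when the alignment is a
power of two (illustrative values, not taken from the patch):

    ROUND_DOWN(0x1234, 0x1000) == (0x1234 & ~0xFFF)          == 0x1000
    ROUND_UP(0x1234, 0x1000)   == ROUND_DOWN(0x2233, 0x1000) == 0x2000
    ROUND_UP(10, 24)           == (33 & ~23)                 == 32  /* not a multiple of 24 */
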
diff --git a/lib/dma.c b/lib/dma.c
index eb4b9d4..b6d365e 100644
--- a/lib/dma.c
+++ b/lib/dma.c
@@ -66,7 +66,7 @@ fds_are_same_file(int fd1, int fd2)
}
dma_controller_t *
-dma_controller_create(int max_regions)
+dma_controller_create(lm_ctx_t *lm_ctx, int max_regions)
{
dma_controller_t *dma;
@@ -77,37 +77,89 @@ dma_controller_create(int max_regions)
return dma;
}
+ dma->lm_ctx = lm_ctx;
dma->max_regions = max_regions;
dma->nregions = 0;
memset(dma->regions, 0, max_regions * sizeof(dma->regions[0]));
+ dma->dirty_pgsize = 0;
return dma;
}
static void
-_dma_controller_do_remove_region(dma_memory_region_t *region)
+_dma_controller_do_remove_region(dma_controller_t *dma,
+ dma_memory_region_t *region)
{
- assert(region);
- dma_unmap_region(region, region->virt_addr, region->size);
- (void)close(region->fd);
+ int err;
+
+ assert(dma != NULL);
+ assert(region != NULL);
+
+ err = dma_unmap_region(region, region->virt_addr, region->size);
+ if (err != 0) {
+ lm_log(dma->lm_ctx, LM_DBG, "failed to unmap fd=%d vaddr=%#lx-%#lx\n",
+ region->fd, region->virt_addr, region->size);
+ }
+ if (region->fd != -1) {
+ if (close(region->fd) == -1) {
+ lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n", region->fd);
+ }
+ }
+}
+
+/*
+ * FIXME no longer used. Also, it doesn't work for addresses that span two
+ * DMA regions.
+ */
+bool
+dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr,
+ size_t size)
+{
+ dma_memory_region_t *region;
+ int i;
+
+ for (i = 0; i < dma->nregions; i++) {
+ region = &dma->regions[i];
+ if (dma_addr == region->dma_addr && size <= region->size) {
+ return true;
+ }
+ }
+
+ return false;
}
/* FIXME not thread safe */
int
-dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr,
- size_t size, int fd)
+dma_controller_remove_region(dma_controller_t *dma,
+ dma_addr_t dma_addr, size_t size,
+ int (*unmap_dma) (void*, uint64_t), void *data)
{
int idx;
dma_memory_region_t *region;
+ int err;
- assert(dma);
+ assert(dma != NULL);
for (idx = 0; idx < dma->nregions; idx++) {
region = &dma->regions[idx];
- if (region->dma_addr == dma_addr && region->size == size &&
- fds_are_same_file(region->fd, fd)) {
- _dma_controller_do_remove_region(region);
+ if (region->dma_addr == dma_addr && region->size == size) {
+ if (region->refcnt > 0) {
+ err = unmap_dma(data, region->dma_addr);
+ if (err != 0) {
+ lm_log(dma->lm_ctx, LM_ERR,
+ "failed to notify of removal of DMA region %#lx-%#lx: %s\n",
+ region->dma_addr, region->dma_addr + region->size,
+ strerror(-err));
+ return err;
+ }
+ assert(region->refcnt == 0);
+ }
+ _dma_controller_do_remove_region(dma, region);
if (dma->nregions > 1)
+ /*
+ * FIXME valgrind complains with 'Source and destination overlap in memcpy',
+ * check whether memmove eliminates this warning.
+ */
memcpy(region, &dma->regions[dma->nregions - 1],
sizeof *region);
dma->nregions--;
@@ -118,7 +170,7 @@ dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr,
}
static inline void
-dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma)
+dma_controller_remove_regions(dma_controller_t *dma)
{
int i;
@@ -127,26 +179,26 @@ dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma)
for (i = 0; i < dma->nregions; i++) {
dma_memory_region_t *region = &dma->regions[i];
- lm_log(ctx, LM_INF, "unmap vaddr=%lx IOVA=%lx\n",
+ lm_log(dma->lm_ctx, LM_INF, "unmap vaddr=%#lx IOVA=%#lx",
region->virt_addr, region->dma_addr);
- _dma_controller_do_remove_region(region);
+ _dma_controller_do_remove_region(dma, region);
}
}
void
-dma_controller_destroy(lm_ctx_t *lm_ctx, dma_controller_t *dma)
+dma_controller_destroy(dma_controller_t *dma)
{
if (dma == NULL) {
return;
}
- dma_controller_remove_regions(lm_ctx, dma);
+ dma_controller_remove_regions(dma);
free(dma);
}
int
-dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
+dma_controller_add_region(dma_controller_t *dma,
dma_addr_t dma_addr, size_t size,
int fd, off_t offset)
{
@@ -160,8 +212,8 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
/* First check if this is the same exact region. */
if (region->dma_addr == dma_addr && region->size == size) {
if (offset != region->offset) {
- lm_log(lm_ctx, LM_ERR, "bad offset for new DMA region %lx+%lx, "
- "want=%d, existing=%d\n",
+ lm_log(dma->lm_ctx, LM_ERR,
+ "bad offset for new DMA region %#lx+%#lx, want=%d, existing=%d\n",
dma_addr, size, offset, region->offset);
goto err;
}
@@ -172,8 +224,9 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
* the same file, however in the majority of cases we'll be
* using a single fd.
*/
- lm_log(lm_ctx, LM_ERR, "bad fd=%d for new DMA region %lx-%lx, "
- "existing fd=%d\n", fd, region->fd);
+ lm_log(dma->lm_ctx, LM_ERR,
+ "bad fd=%d for new DMA region %#lx-%#lx, existing fd=%d\n",
+ fd, region->fd);
goto err;
}
return idx;
@@ -184,16 +237,17 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
dma_addr < region->dma_addr + region->size) ||
(region->dma_addr >= dma_addr &&
region->dma_addr < dma_addr + size)) {
- lm_log(lm_ctx, LM_INF, "new DMA region %lx+%lx overlaps with DMA "
- "region %lx-%lx\n", dma_addr, size, region->dma_addr,
- region->size);
+ lm_log(dma->lm_ctx, LM_INF,
+ "new DMA region %#lx+%#lx overlaps with DMA region %#lx-%#lx\n",
+ dma_addr, size, region->dma_addr, region->size);
goto err;
}
}
if (dma->nregions == dma->max_regions) {
idx = dma->max_regions;
- lm_log(lm_ctx, LM_ERR, "reached maxed regions, recompile with higher number of DMA regions\n");
+ lm_log(dma->lm_ctx, LM_ERR,
+ "reached maxed regions, recompile with higher number of DMA regions\n");
goto err;
}
@@ -202,7 +256,7 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
page_size = fd_get_blocksize(fd);
if (page_size < 0) {
- lm_log(lm_ctx, LM_ERR, "bad page size %d\n", page_size);
+ lm_log(dma->lm_ctx, LM_ERR, "bad page size %d\n", page_size);
goto err;
}
page_size = MAX(page_size, getpagesize());
@@ -211,20 +265,21 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma,
region->size = size;
region->page_size = page_size;
region->offset = offset;
-
- region->fd = dup(fd); // dup the fd to get our own private copy
- if (region->fd < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to duplicate file descriptor: %s\n",
- strerror(errno));
- goto err;
- }
+ region->fd = fd;
+ region->refcnt = 0;
region->virt_addr = dma_map_region(region, PROT_READ | PROT_WRITE,
0, region->size);
if (region->virt_addr == MAP_FAILED) {
- lm_log(lm_ctx, LM_ERR, "failed to memory map DMA region %lx-%lx: %s\n",
+ lm_log(dma->lm_ctx, LM_ERR,
+ "failed to memory map DMA region %#lx-%#lx: %s\n",
dma_addr, dma_addr + size, strerror(errno));
- close(region->fd);
+ if (region->fd != -1) {
+ if (close(region->fd) == -1) {
+ lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n",
+ region->fd);
+ }
+ }
goto err;
}
dma->nregions++;
@@ -269,17 +324,17 @@ dma_map_region(dma_memory_region_t *region, int prot, size_t offset, size_t len)
return mmap_base + (offset - mmap_offset);
}
-void
+int
dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len)
{
mmap_round((size_t *)&virt_addr, &len, region->page_size);
- munmap(virt_addr, len);
+ return munmap(virt_addr, len);
}
int
_dma_addr_sg_split(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg)
+ dma_sg_t *sg, int max_sg, int prot)
{
int idx;
int cnt = 0;
@@ -295,9 +350,13 @@ _dma_addr_sg_split(const dma_controller_t *dma,
size_t region_len = MIN(region_end - dma_addr, len);
if (cnt < max_sg) {
+ sg[cnt].dma_addr = region->dma_addr;
sg[cnt].region = idx;
sg[cnt].offset = dma_addr - region->dma_addr;
sg[cnt].length = region_len;
+ if (_dma_should_mark_dirty(dma, prot)) {
+ _dma_mark_dirty(dma, region, sg);
+ }
}
cnt++;
@@ -326,4 +385,117 @@ out:
return cnt;
}
+ssize_t _get_bitmap_size(size_t region_size, size_t pgsize)
+{
+ if (pgsize == 0) {
+ return -EINVAL;
+ }
+ if (region_size < pgsize) {
+ return -EINVAL;
+ }
+ size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0);
+ return (nr_pages / CHAR_BIT) + (nr_pages % CHAR_BIT != 0);
+}
+
+int dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize)
+{
+ int i;
+
+ assert(dma != NULL);
+
+ if (pgsize == 0) {
+ return -EINVAL;
+ }
+
+ if (dma->dirty_pgsize > 0) {
+ if (dma->dirty_pgsize != pgsize) {
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ for (i = 0; i < dma->nregions; i++) {
+ dma_memory_region_t *region = &dma->regions[i];
+ ssize_t bitmap_size = _get_bitmap_size(region->size, pgsize);
+ if (bitmap_size < 0) {
+ return bitmap_size;
+ }
+ region->dirty_bitmap = calloc(bitmap_size, sizeof(char));
+ if (region->dirty_bitmap == NULL) {
+ int j, ret = -errno;
+ for (j = 0; j < i; j++) {
+ free(region->dirty_bitmap);
+ region->dirty_bitmap = NULL;
+ }
+ return ret;
+ }
+ }
+ dma->dirty_pgsize = pgsize;
+ return 0;
+}
+
+int dma_controller_dirty_page_logging_stop(dma_controller_t *dma)
+{
+ int i;
+
+ assert(dma != NULL);
+
+ if (dma->dirty_pgsize == 0) {
+ return 0;
+ }
+
+ for (i = 0; i < dma->nregions; i++) {
+ free(dma->regions[i].dirty_bitmap);
+ dma->regions[i].dirty_bitmap = NULL;
+ }
+ dma->dirty_pgsize = 0;
+ return 0;
+}
+
+int
+dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len,
+ size_t pgsize, size_t size, char **data)
+{
+ int ret;
+ ssize_t bitmap_size;
+ dma_sg_t sg;
+ dma_memory_region_t *region;
+
+ assert(dma != NULL);
+ assert(data != NULL);
+
+ /*
+ * FIXME for now we support IOVAs that match exactly the DMA region. This
+ * is purely for simplifying the implementation. We MUST allow arbitrary
+ * IOVAs.
+ */
+ ret = dma_addr_to_sg(dma, addr, len, &sg, 1, PROT_NONE);
+ if (ret != 1 || sg.dma_addr != addr || sg.length != len) {
+ return -ENOTSUP;
+ }
+
+ if (pgsize != dma->dirty_pgsize) {
+ return -EINVAL;
+ }
+
+ bitmap_size = _get_bitmap_size(len, pgsize);
+ if (bitmap_size < 0) {
+ return bitmap_size;
+ }
+
+ /*
+ * FIXME they must be equal because this is how much data the client
+ * expects to receive.
+ */
+ if (size != (size_t)bitmap_size) {
+ return -EINVAL;
+ }
+
+ region = &dma->regions[sg.region];
+
+ *data = region->dirty_bitmap;
+
+ return 0;
+}
+
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
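
The dirty-page tracking introduced above allocates one bit per page for each
DMA region, rounding up to whole pages and then to whole bytes. A hedged usage
sketch (region_addr, region_len and the chosen page size are assumed example
values; error handling trimmed):

    /* a 1 MiB region logged at 4 KiB granularity needs 256 bits, i.e. 32 bytes */
    ssize_t bitmap_size = _get_bitmap_size(1 << 20, 1 << 12);      /* == 32 */

    if (dma_controller_dirty_page_logging_start(dma, 1 << 12) == 0) {
        char *bitmap;

        /* later, when the client asks for the dirty bitmap of that region
         * (for now addr/len must match the region exactly, per the FIXME) */
        if (dma_controller_dirty_page_get(dma, region_addr, region_len,
                                          1 << 12, bitmap_size, &bitmap) == 0) {
            /* bitmap points at the region's dirty_bitmap; bit i covers page i */
        }

        dma_controller_dirty_page_logging_stop(dma);
    }
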
diff --git a/lib/dma.h b/lib/dma.h
index 1c41dce..7715b89 100644
--- a/lib/dma.h
+++ b/lib/dma.h
@@ -32,6 +32,11 @@
#define DMA_DMA_H
/*
+ * FIXME check whether DMA regions must be page aligned. If so then the
+ * implementation can be greatly simplified.
+ */
+
+/*
* This library emulates a DMA controller for a device emulation application to
* perform DMA operations on a foreign memory space.
*
@@ -72,6 +77,8 @@
#include "muser.h"
#include "common.h"
+struct lm_ctx;
+
typedef struct {
dma_addr_t dma_addr; // DMA address of this region
size_t size; // Size of this region
@@ -79,19 +86,23 @@ typedef struct {
int page_size; // Page size of this fd
off_t offset; // File offset
void *virt_addr; // Virtual address of this region
+ int refcnt; // Number of users of this region
+ char *dirty_bitmap; // Dirty page bitmap
} dma_memory_region_t;
typedef struct {
int max_regions;
int nregions;
+ struct lm_ctx *lm_ctx;
+ size_t dirty_pgsize; // Dirty page granularity
dma_memory_region_t regions[0];
} dma_controller_t;
dma_controller_t *
-dma_controller_create(int max_regions);
+dma_controller_create(lm_ctx_t *lm_ctx, int max_regions);
void
-dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma);
+dma_controller_destroy(dma_controller_t *dma);
/* Registers a new memory region.
* Returns:
@@ -101,19 +112,72 @@ dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma);
* (e.g. due to conflict with existing region).
*/
int
-dma_controller_add_region(lm_ctx_t *ctx, dma_controller_t *dma,
+dma_controller_add_region(dma_controller_t *dma,
dma_addr_t dma_addr, size_t size,
int fd, off_t offset);
int
-dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr,
- size_t size, int fd);
+dma_controller_remove_region(dma_controller_t *dma,
+ dma_addr_t dma_addr, size_t size,
+ int (*unmap_dma) (void*, uint64_t), void *data);
// Helper for dma_addr_to_sg() slow path.
int
_dma_addr_sg_split(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg);
+ dma_sg_t *sg, int max_sg, int prot);
+
+static bool
+_dma_should_mark_dirty(const dma_controller_t *dma, int prot)
+{
+ assert(dma != NULL);
+
+ return (prot & PROT_WRITE) == PROT_WRITE && dma->dirty_pgsize > 0;
+}
+
+static size_t
+_get_pgstart(size_t pgsize, uint64_t base_addr, uint64_t offset)
+{
+ return (offset - base_addr) / pgsize;
+}
+
+static size_t
+_get_pgend(size_t pgsize, uint64_t len, size_t start)
+{
+ return start + (len / pgsize) + (len % pgsize != 0) - 1;
+}
+
+static void
+_dma_bitmap_get_pgrange(const dma_controller_t *dma,
+ const dma_memory_region_t *region,
+ const dma_sg_t *sg, size_t *start, size_t *end)
+{
+ assert(dma != NULL);
+ assert(region != NULL);
+ assert(sg != NULL);
+ assert(start != NULL);
+ assert(end != NULL);
+
+ *start = _get_pgstart(dma->dirty_pgsize, region->dma_addr, sg->offset);
+ *end = _get_pgend(dma->dirty_pgsize, sg->length, *start);
+}
+
+static void
+_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region,
+ dma_sg_t *sg)
+{
+ size_t i, start, end;
+
+ assert(dma != NULL);
+ assert(region != NULL);
+ assert(sg != NULL);
+ assert(region->dirty_bitmap != NULL);
+
+ _dma_bitmap_get_pgrange(dma, region, sg, &start, &end);
+ for (i = start; i <= end; i++) {
+ region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
+ }
+}
/* Takes a linear dma address span and returns a sg list suitable for DMA.
* A single linear dma address span may need to be split into multiple
@@ -129,7 +193,7 @@ _dma_addr_sg_split(const dma_controller_t *dma,
static inline int
dma_addr_to_sg(const dma_controller_t *dma,
dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg)
+ dma_sg_t *sg, int max_sg, int prot)
{
static __thread int region_hint;
int cnt;
@@ -139,14 +203,19 @@ dma_addr_to_sg(const dma_controller_t *dma,
// Fast path: single region.
if (likely(max_sg > 0 && len > 0 &&
- dma_addr >= region->dma_addr && dma_addr + len <= region_end)) {
+ dma_addr >= region->dma_addr && dma_addr + len <= region_end &&
+ region_hint < dma->nregions)) {
+ sg->dma_addr = region->dma_addr;
sg->region = region_hint;
sg->offset = dma_addr - region->dma_addr;
sg->length = len;
+ if (_dma_should_mark_dirty(dma, prot)) {
+ _dma_mark_dirty(dma, region, sg);
+ }
return 1;
}
// Slow path: search through regions.
- cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg);
+ cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg, prot);
if (likely(cnt > 0)) {
region_hint = sg->region;
}
@@ -157,7 +226,7 @@ void *
dma_map_region(dma_memory_region_t *region, int prot,
size_t offset, size_t len);
-void
+int
dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len);
static inline int
@@ -168,31 +237,53 @@ dma_map_sg(dma_controller_t *dma, const dma_sg_t *sg, struct iovec *iov,
int i;
for (i = 0; i < cnt; i++) {
+ lm_log(dma->lm_ctx, LM_DBG, "map %#lx-%#lx\n",
+ sg->dma_addr + sg->offset, sg->dma_addr + sg->offset + sg->length);
region = &dma->regions[sg[i].region];
iov[i].iov_base = region->virt_addr + sg[i].offset;
iov[i].iov_len = sg[i].length;
+ region->refcnt++;
}
return 0;
}
+/* FIXME useless define */
#define UNUSED __attribute__((unused))
static inline void
-dma_unmap_sg(UNUSED dma_controller_t *dma, UNUSED const dma_sg_t *sg,
- UNUSED struct iovec *iov, UNUSED int cnt)
+dma_unmap_sg(dma_controller_t *dma, const dma_sg_t *sg,
+ UNUSED struct iovec *iov, int cnt)
{
- /* just a placeholder for now */
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ dma_memory_region_t *r;
+ /*
+ * FIXME this double loop will be removed if we replace the array with
+ * tfind(3)
+ */
+ for (r = dma->regions;
+ r < dma->regions + dma->nregions && r->dma_addr != sg[i].dma_addr;
+ r++);
+ if (r > dma->regions + dma->nregions) {
+ /* bad region */
+ continue;
+ }
+ lm_log(dma->lm_ctx, LM_DBG, "unmap %#lx-%#lx\n",
+ sg[i].dma_addr + sg[i].offset, sg[i].dma_addr + sg[i].offset + sg[i].length);
+ r->refcnt--;
+ }
return;
}
static inline void *
-dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len)
+dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, int prot)
{
dma_sg_t sg;
struct iovec iov;
- if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1) == 1 &&
+ if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, prot) == 1 &&
dma_map_sg(dma, &sg, &iov, 1) == 0) {
return iov.iov_base;
}
@@ -211,12 +302,26 @@ dma_unmap_addr(dma_controller_t *dma,
};
int r;
- r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1);
+ r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_NONE);
assert(r == 1);
dma_unmap_sg(dma, &sg, &iov, 1);
}
+int
+dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize);
+
+int
+dma_controller_dirty_page_logging_stop(dma_controller_t *dma);
+
+int
+dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len,
+ size_t pgsize, size_t size, char **data);
+
+bool
+dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr,
+ size_t size);
+
#endif /* DMA_DMA_H */
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
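
dma_addr_to_sg() now takes the requested protection so that, when dirty-page
logging is enabled, writable accesses mark the covered pages dirty, and
dma_map_sg()/dma_unmap_sg() maintain a per-region refcount that must drop to
zero before dma_controller_remove_region() can complete. A minimal sketch of a
device writing into guest memory (dma_addr, len and src are assumed example
variables; a single scatter/gather entry is assumed; error handling trimmed):

    dma_sg_t sg;
    struct iovec iov;

    /* asking for PROT_WRITE is what records the pages in the dirty bitmap */
    if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_READ | PROT_WRITE) == 1 &&
        dma_map_sg(dma, &sg, &iov, 1) == 0) {       /* takes a region refcount */
        memcpy(iov.iov_base, src, len);             /* device writes guest memory */
        dma_unmap_sg(dma, &sg, &iov, 1);            /* drops the refcount */
    }
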
diff --git a/lib/muser.h b/lib/muser.h
index f3330fe..a39d477 100644
--- a/lib/muser.h
+++ b/lib/muser.h
@@ -37,22 +37,27 @@
#include <sys/uio.h>
#include <unistd.h>
+#include "vfio_user.h"
#include "pci.h"
+#include "caps/pm.h"
+#include "caps/px.h"
+#include "caps/msi.h"
+#include "caps/msix.h"
-/*
- * Influential enviroment variables:
- *
- * LM_TERSE_LOGGING: define to make libmuser log only erroneous PCI accesses.
- * (this should really be done with a more fine grained debug
- * level)
- */
-#ifndef LM_TERSE_LOGGING
-#define LM_TERSE_LOGGING 0
-#endif
+#define LIB_MUSER_VFIO_USER_VERS_MJ 0
+#define LIB_MUSER_VFIO_USER_VERS_MN 1
+
+#define VFIO_NAME "vfio"
+#define VFIO_DIR "/dev/" VFIO_NAME "/"
+#define VFIO_CONTAINER VFIO_DIR "/" VFIO_NAME
+
+#define MUSER_DIR "/var/run/muser/"
+#define MUSER_SOCK "cntrl"
typedef uint64_t dma_addr_t;
typedef struct {
+ dma_addr_t dma_addr;
int region;
int length;
uint64_t offset;
@@ -134,6 +139,8 @@ typedef struct {
/*
* Callback function that is called when the region is read or written.
+ * Note that the memory of the region is owned by the user, except for the
+ * standard header (first 64 bytes) of the PCI configuration space.
*/
lm_region_access_t *fn;
@@ -149,9 +156,12 @@ enum {
LM_DEV_INTX_IRQ_IDX,
LM_DEV_MSI_IRQ_IDX,
LM_DEV_MSIX_IRQ_IDX,
- LM_DEV_NUM_IRQS = 3
+ LM_DEV_ERR_IRQ_INDEX,
+ LM_DEV_REQ_IRQ_INDEX,
+ LM_DEV_NUM_IRQS
};
+/* FIXME these are PCI regions */
enum {
LM_DEV_BAR0_REG_IDX,
LM_DEV_BAR1_REG_IDX,
@@ -162,7 +172,15 @@ enum {
LM_DEV_ROM_REG_IDX,
LM_DEV_CFG_REG_IDX,
LM_DEV_VGA_REG_IDX,
- LM_DEV_NUM_REGS = 9
+ /*
+ * FIXME this doesn't really belong here, but it simplifies the implementation
+ * for now. A migration region can exist for non-PCI devices (can its index be
+ * anything?). In any case, we should allow the user to define custom regions
+ * at will; by fixing the migration region in that position we don't allow
+ * this.
+ */
+ LM_DEV_MIGRATION_REG_IDX,
+ LM_DEV_NUM_REGS = 10, /* TODO rename to LM_DEV_NUM_PCI_REGS */
};
typedef struct {
@@ -191,7 +209,7 @@ typedef struct {
} lm_pci_info_t;
/*
- * Returns a pointer to the non-standard part of the PCI configuration space.
+ * Returns a pointer to the standard part of the PCI configuration space.
*/
lm_pci_config_space_t *lm_get_pci_config_space(lm_ctx_t *lm_ctx);
@@ -208,7 +226,7 @@ typedef enum {
*
* @lm_log_fn_t: typedef for log function.
*/
-typedef void (lm_log_fn_t) (void *pvt, const char *msg);
+typedef void (lm_log_fn_t) (void *pvt, lm_log_lvl_t lvl, const char *msg);
/**
* Callback function that gets called when a capability is accessed. The
@@ -228,26 +246,77 @@ typedef ssize_t (lm_cap_access_t) (void *pvt, uint8_t id,
char *buf, size_t count,
loff_t offset, bool is_write);
+/* FIXME does it have to be packed as well? */
+typedef union {
+ struct msicap msi;
+ struct msixcap msix;
+ struct pmcap pm;
+ struct pxcap px;
+} lm_cap_t;
+
+typedef enum {
+ LM_TRANS_KERNEL,
+ LM_TRANS_SOCK,
+ LM_TRANS_MAX
+} lm_trans_t;
+
+#define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF
+
+/*
+ * FIXME the names of migration callback functions are probably far too long,
+ * but for now it helps with the implementation.
+ */
+typedef int (lm_migration_callback_t)(void *pvt);
+
+typedef enum {
+ LM_MIGR_STATE_STOP,
+ LM_MIGR_STATE_START,
+ LM_MIGR_STATE_STOP_AND_COPY,
+ LM_MIGR_STATE_PRE_COPY,
+ LM_MIGR_STATE_RESUME
+} lm_migr_state_t;
+
typedef struct {
+ /* migration state transition callback */
+ /* TODO rename to lm_migration_state_transition_callback */
+ /* FIXME maybe we should create a single callback and pass the state? */
+ int (*transition)(void *pvt, lm_migr_state_t state);
+
+ /* Callbacks for saving device state */
+
/*
- * Capability ID, as defined by the PCI specification. Also defined as
- * PCI_CAP_ID_XXX in <linux/pci_regs.h>.
+ * Function that is called to retrieve pending migration data. If migration
+ * data were previously made available (function prepare_data has been
+ * called) then calling this function signifies that they have been read
+ * (e.g. migration data can be discarded). If the function returns 0 then
+ * migration has finished and this function won't be called again.
*/
- uint8_t id;
+ __u64 (*get_pending_bytes)(void *pvt);
/*
- * Size of the capability.
+ * Function that is called to instruct the device to prepare migration data.
+ * The function must return only after migration data are available at the
+ * specified offset.
*/
- size_t size;
+ int (*prepare_data)(void *pvt, __u64 *offset, __u64 *size);
/*
- * Function to call back when the capability gets read or written.
+ * Function that is called to read migration data. offset and size can
+ * be any subrange of the offset and size previously returned by
+ * prepare_data. The function must return the amount of data read. This
+ * function can be called even if the migration data can be memory mapped.
+ *
+ * Does this mean that reading data_offset/data_size updates the values?
*/
- lm_cap_access_t *fn;
-} lm_cap_t;
+ size_t (*read_data)(void *pvt, void *buf, __u64 count, __u64 offset);
-#define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF
+ /* Callback for restoring device state */
+
+ /* Function that is called for writing previously stored device state. */
+ size_t (*write_data)(void *pvt, void *data, __u64 size);
+
+} lm_migration_callbacks_t;
/**
* Device information structure, used to create the lm_ctx.
@@ -287,16 +356,36 @@ typedef struct {
int (*reset) (void *pvt);
/*
- * PCI capabilities. The user needs to only define the ID and size of each
- * capability. The actual capability is not maintained by libmuser. When a
- * capability is accessed the appropriate callback function is called.
+ * Function that is called when the guest maps a DMA region. Optional.
+ */
+ void (*map_dma) (void *pvt, uint64_t iova, uint64_t len);
+
+ /*
+ * Function that is called when the guest unmaps a DMA region. The device
+ * must release all references to that region before the callback returns.
+ * This is required if you want to be able to access guest memory.
*/
- lm_cap_t caps[LM_MAX_CAPS];
+ int (*unmap_dma) (void *pvt, uint64_t iova);
+
+ lm_trans_t trans;
/*
- * Number of capabilities in above array.
+ * Attaching to the transport is non-blocking. The library will not attempt
+ * to attach during context creation time. The caller must then manually
+ * call lm_ctx_try_attach(), which is non-blocking, as many times as
+ * necessary.
+ */
+#define LM_FLAG_ATTACH_NB (1 << 0)
+ uint64_t flags;
+
+ /*
+ * PCI capabilities.
*/
int nr_caps;
+ lm_cap_t **caps;
+
+ lm_migration_callbacks_t migration_callbacks;
+
} lm_dev_info_t;
/**
@@ -339,18 +428,49 @@ int
lm_ctx_run(lm_dev_info_t *dev_info);
/**
+ * Polls, without blocking, an lm_ctx. This is an alternative to using
+ * a thread and making a blocking call to lm_ctx_drive(). Instead, the
+ * application can periodically poll the context directly from one of
+ * its own threads.
+ *
+ * This is only allowed when LM_FLAG_ATTACH_NB is specified during creation.
+ *
+ * @lm_ctx: The libmuser context to poll
+ *
+ * @returns 0 on success, -errno on failure.
+ */
+int
+lm_ctx_poll(lm_ctx_t *lm_ctx);
+
+/**
* Triggers an interrupt.
*
+ * libmuser takes care of using the correct IRQ type (IRQ index: INTx or MSI/X),
+ * the caller only needs to specify the sub-index.
+ *
+ * @lm_ctx: the libmuser context to trigger interrupt
+ * @subindex: vector subindex to trigger interrupt on
+ *
+ * @returns 0 on success, or -1 on failure. Sets errno.
+ */
+int
+lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
+
+/**
+ * Sends a message to the client to trigger an interrupt.
+ *
* libmuser takes care of using the IRQ type (INTx, MSI/X), the caller only
* needs to specify the sub-index.
+ * This API can be used to trigger an interrupt by sending a message to the client.
*
* @lm_ctx: the libmuser context to trigger interrupt
* @subindex: vector subindex to trigger interrupt on
*
* @returns 0 on success, or -1 on failure. Sets errno.
*/
+
int
-lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
+lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex);
/* Helper functions */
@@ -366,12 +486,15 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
* than can be individually mapped in the program's virtual memory. A single
* linear guest physical address span may need to be split into multiple
* scatter/gather regions due to limitations of how memory can be mapped.
+ * Field unmap_dma must have been provided at context creation time in order
+ * to use this function.
*
* @lm_ctx: the libmuser context
* @dma_addr: the guest physical address
* @len: size of memory to be mapped
* @sg: array that receives the scatter/gather entries to be mapped
* @max_sg: maximum number of elements in above array
+ * @prot: protection as define in <sys/mman.h>
*
* @returns the number of scatter/gather entries created on success, and on
* failure:
@@ -381,12 +504,14 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex);
*/
int
lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, uint32_t len,
- dma_sg_t *sg, int max_sg);
+ dma_sg_t *sg, int max_sg, int prot);
/**
* Maps a list scatter/gather entries from the guest's physical address space
* to the program's virtual memory. It is the caller's responsibility to remove
* the mappings by calling lm_unmap_sg.
+ * Field unmap_dma must have been provided at context creation time in order
+ * to use this function.
*
* @lm_ctx: the libmuser context
* @sg: array of scatter/gather entries returned by lm_addr_to_sg
@@ -403,6 +528,8 @@ lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg,
/**
* Unmaps a list scatter/gather entries (previously mapped by lm_map_sg) from
* the program's virtual memory.
+ * Field unmap_dma must have been provided at context creation time in order
+ * to use this function.
*
* @lm_ctx: the libmuser context
* @sg: array of scatter/gather entries to unmap
@@ -426,16 +553,59 @@ lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg,
int
lm_get_region(loff_t pos, size_t count, loff_t *off);
+/**
+ * Read from the dma region exposed by the client.
+ *
+ * @lm_ctx: the libmuser context
+ * @sg: a DMA segment obtained from dma_addr_to_sg
+ * @data: data buffer to read into
+ */
+int
+lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data);
+
+/**
+ * Write to the dma region exposed by the client.
+ *
+ * @lm_ctx: the libmuser context
+ * @sg: a DMA segment obtained from dma_addr_to_sg
+ * @data: data buffer to write
+ */
+int
+lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data);
+
/*
* Advanced stuff.
*/
/**
- * Returns the non-standard part of the PCI configuragion space.
+ * Returns the non-standard part of the PCI configuration space.
*/
uint8_t *
lm_get_pci_non_std_config_space(lm_ctx_t *lm_ctx);
+/*
+ * Attempts to attach to the transport. LM_FLAG_ATTACH_NB must be set when
+ * creating the context. Returns 0 on success and -1 on error. If errno is set
+ * to EAGAIN or EWOULDBLOCK then the transport is not ready to attach to and the
+ * operation must be retried.
+ */
+int
+lm_ctx_try_attach(lm_ctx_t *lm_ctx);
+
+/*
+ * FIXME need to make sure that there can be at most one capability with a given
+ * ID, otherwise this function will return the first one with this ID.
+ */
+uint8_t *
+lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id);
+
+void
+lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...);
+
+/* FIXME */
+int muser_send_fds(int sock, int *fds, size_t count);
+ssize_t muser_recv_fds(int sock, int *fds, size_t count);
+
#endif /* LIB_MUSER_H */
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
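
Putting the new non-blocking pieces together: a device that sets
LM_FLAG_ATTACH_NB drives the context from its own event loop instead of
blocking in lm_ctx_run(). A hedged sketch (the context-creation call and
do_other_work() are placeholders, not shown in this hunk):

    lm_ctx_t *ctx = create_context(&dev_info);  /* dev_info.flags |= LM_FLAG_ATTACH_NB */

    while (lm_ctx_try_attach(ctx) == -1) {
        if (errno != EAGAIN && errno != EWOULDBLOCK) {
            break;                              /* transport failed, not just "not ready" */
        }
        do_other_work();                        /* client hasn't connected yet */
    }

    for (;;) {
        lm_ctx_poll(ctx);                       /* service pending vfio-user requests */
        do_other_work();
    }
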
diff --git a/lib/muser_ctx.c b/lib/muser_ctx.c
index 0de3ac0..92155d7 100644
--- a/lib/muser_ctx.c
+++ b/lib/muser_ctx.c
@@ -47,13 +47,22 @@
#include <stdarg.h>
#include <linux/vfio.h>
#include <sys/param.h>
+#include <sys/un.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <sys/select.h>
-#include "../kmod/muser.h"
#include "muser.h"
#include "muser_priv.h"
#include "dma.h"
#include "cap.h"
+#define MAX_FDS 8
+
+#define IOMMU_GRP_NAME "iommu_group"
+
typedef enum {
IRQ_NONE = 0,
IRQ_INTX,
@@ -61,6 +70,14 @@ typedef enum {
IRQ_MSIX,
} irq_type_t;
+char *irq_to_str[] = {
+ [LM_DEV_INTX_IRQ_IDX] = "INTx",
+ [LM_DEV_MSI_IRQ_IDX] = "MSI",
+ [LM_DEV_MSIX_IRQ_IDX] = "MSI-X",
+ [LM_DEV_ERR_IRQ_INDEX] = "ERR",
+ [LM_DEV_REQ_IRQ_INDEX] = "REQ"
+};
+
typedef struct {
irq_type_t type; /* irq type this device is using */
int err_efd; /* eventfd for irq err */
@@ -69,27 +86,517 @@ typedef struct {
int efds[0]; /* XXX must be last */
} lm_irqs_t;
-/*
- * Macro that ensures that a particular struct member is last. Doesn't work for
- * flexible array members.
- */
-#define MUST_BE_LAST(s, m, t) \
- _Static_assert(sizeof(s) - offsetof(s, m) == sizeof(t), \
- #t " " #m " must be last member in " #s)
+enum migration_iteration_state {
+ VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL,
+ VFIO_USER_MIGRATION_ITERATION_STATE_STARTED,
+ VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED,
+ VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED
+};
struct lm_ctx {
void *pvt;
dma_controller_t *dma;
int fd;
+ int conn_fd;
int (*reset) (void *pvt);
lm_log_lvl_t log_lvl;
lm_log_fn_t *log;
lm_pci_info_t pci_info;
lm_pci_config_space_t *pci_config_space;
+ lm_trans_t trans;
struct caps *caps;
+ uint64_t flags;
+ char *uuid;
+ void (*map_dma) (void *pvt, uint64_t iova, uint64_t len);
+ int (*unmap_dma) (void *pvt, uint64_t iova);
+
+ /* TODO there should be a void * variable to store transport-specific stuff */
+ /* LM_TRANS_SOCK */
+ char *iommu_dir;
+ int iommu_dir_fd;
+ int sock_flags;
+
+ int client_max_fds;
+
+ struct {
+ struct vfio_device_migration_info info;
+ size_t pgsize;
+ lm_migration_callbacks_t callbacks;
+ struct {
+ enum migration_iteration_state state;
+ __u64 offset;
+ __u64 size;
+ } iter;
+ } migration;
+
lm_irqs_t irqs; /* XXX must be last */
};
-MUST_BE_LAST(struct lm_ctx, irqs, lm_irqs_t);
+
+
+/* function prototypes */
+static void
+free_sparse_mmap_areas(lm_reg_info_t*);
+
+static inline int recv_blocking(int sock, void *buf, size_t len, int flags)
+{
+ int f = fcntl(sock, F_GETFL, 0);
+ int ret, fret;
+
+ fret = fcntl(sock, F_SETFL, f & ~O_NONBLOCK);
+ assert(fret != -1);
+
+ ret = recv(sock, buf, len, flags);
+
+ fret = fcntl(sock, F_SETFL, f);
+ assert(fret != -1);
+
+ return ret;
+}
+
+static int
+init_sock(lm_ctx_t *lm_ctx)
+{
+ struct sockaddr_un addr = { .sun_family = AF_UNIX };
+ int ret, unix_sock;
+ mode_t mode;
+
+ assert(lm_ctx != NULL);
+
+ lm_ctx->iommu_dir = strdup(lm_ctx->uuid);
+ if (!lm_ctx->iommu_dir) {
+ return -ENOMEM;
+ }
+
+ /* FIXME SPDK can't easily run as non-root */
+ mode = umask(0000);
+
+ if ((unix_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+ ret = errno;
+ goto out;
+ }
+
+ if (lm_ctx->flags & LM_FLAG_ATTACH_NB) {
+ ret = fcntl(unix_sock, F_SETFL,
+ fcntl(unix_sock, F_GETFL, 0) | O_NONBLOCK);
+ if (ret < 0) {
+ ret = errno;
+ goto close_unix_sock;
+ }
+ lm_ctx->sock_flags = MSG_DONTWAIT | MSG_WAITALL;
+ } else {
+ lm_ctx->sock_flags = 0;
+ }
+
+ lm_ctx->iommu_dir_fd = open(lm_ctx->iommu_dir, O_DIRECTORY);
+ if (lm_ctx->iommu_dir_fd < 0) {
+ ret = errno;
+ goto close_unix_sock;
+ }
+
+ ret = snprintf(addr.sun_path, sizeof addr.sun_path, "%s/" MUSER_SOCK,
+ lm_ctx->iommu_dir);
+ if (ret >= (int)sizeof addr.sun_path) {
+ ret = ENAMETOOLONG;
+ goto close_iommu_dir_fd;
+ }
+ if (ret < 0) {
+ goto close_iommu_dir_fd;
+ }
+
+ /* start listening business */
+ ret = bind(unix_sock, (struct sockaddr*)&addr, sizeof(addr));
+ if (ret < 0) {
+ ret = errno;
+ goto close_iommu_dir_fd;
+ }
+
+ ret = listen(unix_sock, 0);
+ if (ret < 0) {
+ ret = errno;
+ goto close_iommu_dir_fd;
+ }
+
+ umask(mode);
+ return unix_sock;
+
+close_iommu_dir_fd:
+ close(lm_ctx->iommu_dir_fd);
+close_unix_sock:
+ close(unix_sock);
+out:
+ return -ret;
+}
+
+static void
+__free_s(char **p)
+{
+ free(*p);
+}
+
+int
+_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *fds, int count)
+{
+ int ret;
+ struct vfio_user_header hdr = {.msg_id = msg_id};
+ struct msghdr msg;
+ size_t i;
+
+ if (nr_iovecs == 0) {
+ iovecs = alloca(sizeof(*iovecs));
+ nr_iovecs = 1;
+ }
+
+ memset(&msg, 0, sizeof(msg));
+
+ if (is_reply) {
+ hdr.flags.type = VFIO_USER_F_TYPE_REPLY;
+ } else {
+ hdr.cmd = cmd;
+ hdr.flags.type = VFIO_USER_F_TYPE_COMMAND;
+ }
+
+ iovecs[0].iov_base = &hdr;
+ iovecs[0].iov_len = sizeof(hdr);
+
+ for (i = 0; i < nr_iovecs; i++) {
+ hdr.msg_size += iovecs[i].iov_len;
+ }
+
+ msg.msg_iovlen = nr_iovecs;
+ msg.msg_iov = iovecs;
+
+ if (fds != NULL) {
+ size_t size = count * sizeof *fds;
+ char *buf = alloca(CMSG_SPACE(size));
+
+ msg.msg_control = buf;
+ msg.msg_controllen = CMSG_SPACE(size);
+
+ struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(size);
+ memcpy(CMSG_DATA(cmsg), fds, size);
+ }
+
+ ret = sendmsg(sock, &msg, 0);
+ if (ret == -1) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+int
+send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ void *data, size_t data_len,
+ int *fds, size_t count) {
+
+ struct iovec iovecs[2] = {
+ [1] = {
+ .iov_base = data,
+ .iov_len = data_len
+ }
+ };
+ return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs,
+ ARRAY_SIZE(iovecs), fds, count);
+}
+
+int
+send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply,
+ char *caps)
+{
+ int ret;
+ char *data;
+
+ ret = asprintf(&data,
+ "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}",
+ major, minor, caps != NULL ? caps : "{}");
+ if (ret == -1) {
+ return -1;
+ }
+ ret = send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data,
+ ret, NULL, 0);
+ free(data);
+ return ret;
+}
+
+int
+recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
+ uint16_t *msg_id, void *data, size_t *len)
+{
+ int ret;
+
+ ret = recv_blocking(sock, hdr, sizeof(*hdr), 0);
+ if (ret == -1) {
+ return -errno;
+ }
+ if (ret < (int)sizeof(*hdr)) {
+ return -EINVAL;
+ }
+
+ if (is_reply) {
+ if (hdr->msg_id != *msg_id) {
+ return -EINVAL;
+ }
+
+ if (hdr->flags.type != VFIO_USER_F_TYPE_REPLY) {
+ return -EINVAL;
+ }
+
+ if (hdr->flags.error == 1U) {
+ if (hdr->error_no <= 0) {
+ hdr->error_no = EINVAL;
+ }
+ return -hdr->error_no;
+ }
+ } else {
+ if (hdr->flags.type != VFIO_USER_F_TYPE_COMMAND) {
+ return -EINVAL;
+ }
+ *msg_id = hdr->msg_id;
+ }
+
+ if (len != NULL && *len > 0 && hdr->msg_size > sizeof *hdr) {
+ ret = recv_blocking(sock, data, MIN(hdr->msg_size - sizeof *hdr, *len),
+ 0);
+ if (ret < 0) {
+ return ret;
+ }
+ if (*len != (size_t)ret) { /* FIXME we should allow receiving less */
+ return -EINVAL;
+ }
+ *len = ret;
+ }
+ return 0;
+}
+
+int
+recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply,
+ int *max_fds, size_t *pgsize)
+{
+ int ret;
+ struct vfio_user_header hdr;
+ char *data __attribute__((__cleanup__(__free_s))) = NULL;
+
+ ret = recv_vfio_user_msg(sock, &hdr, is_reply, msg_id, NULL, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+ hdr.msg_size -= sizeof(hdr);
+ data = malloc(hdr.msg_size);
+ if (data == NULL) {
+ return -errno;
+ }
+ ret = recv_blocking(sock, data, hdr.msg_size, 0);
+ if (ret == -1) {
+ return -errno;
+ }
+ if (ret < (int)hdr.msg_size) {
+ return -EINVAL;
+ }
+
+ /* FIXME use proper parsing */
+ ret = sscanf(data,
+ "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d, migration: {pgsize: %lu}}}",
+ major, minor, max_fds, pgsize);
+ if (ret != 4) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int
+_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len)
+{
+ int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs,
+ send_fds, fd_count);
+ if (ret < 0) {
+ return ret;
+ }
+ if (hdr == NULL) {
+ hdr = alloca(sizeof *hdr);
+ }
+ return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len);
+}
+
+int
+send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ void *send_data, size_t send_len,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len)
+{
+ struct iovec iovecs[2] = {
+ [1] = {
+ .iov_base = send_data,
+ .iov_len = send_len
+ }
+ };
+ return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs,
+ ARRAY_SIZE(iovecs), send_fds, fd_count,
+ hdr, recv_data, recv_len);
+}
+
+static int
+set_version(lm_ctx_t *lm_ctx, int sock)
+{
+ int ret;
+ int client_mj, client_mn;
+ uint16_t msg_id = 0;
+ char *server_caps;
+
+ ret = asprintf(&server_caps, "{max_fds: %d, migration: {pgsize: %ld}}",
+ MAX_FDS, sysconf(_SC_PAGESIZE));
+ if (ret == -1) {
+ return -ENOMEM;
+ }
+
+ ret = send_version(sock, LIB_MUSER_VFIO_USER_VERS_MJ,
+ LIB_MUSER_VFIO_USER_VERS_MN, msg_id, false, server_caps);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_DBG, "failed to send version: %s", strerror(-ret));
+ goto out;
+ }
+
+ ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true,
+ &lm_ctx->client_max_fds, &lm_ctx->migration.pgsize);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_DBG, "failed to receive version: %s", strerror(-ret));
+ goto out;
+ }
+ if (client_mj != LIB_MUSER_VFIO_USER_VERS_MJ ||
+ client_mn != LIB_MUSER_VFIO_USER_VERS_MN) {
+ lm_log(lm_ctx, LM_DBG, "version mismatch, server=%d.%d, client=%d.%d",
+ LIB_MUSER_VFIO_USER_VERS_MJ, LIB_MUSER_VFIO_USER_VERS_MN,
+ client_mj, client_mn);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (lm_ctx->migration.pgsize == 0) {
+ lm_log(lm_ctx, LM_ERR, "bad migration page size");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* FIXME need to check max_fds */
+
+ lm_ctx->migration.pgsize = MIN(lm_ctx->migration.pgsize,
+ sysconf(_SC_PAGESIZE));
+out:
+ free(server_caps);
+ return ret;
+}
+
+/**
+ * Accepts a connection on the listening socket and negotiates the protocol
+ * version and capabilities with the client.
+ *
+ * lm_ctx: libmuser context
+ */
+static int
+open_sock(lm_ctx_t *lm_ctx)
+{
+ int ret;
+ int conn_fd;
+
+ assert(lm_ctx != NULL);
+
+ conn_fd = accept(lm_ctx->fd, NULL, NULL);
+ if (conn_fd == -1) {
+ return conn_fd;
+ }
+
+ /* send version and caps */
+ ret = set_version(lm_ctx, conn_fd);
+ if (ret < 0) {
+ return ret;
+ }
+
+ lm_ctx->conn_fd = conn_fd;
+ return conn_fd;
+}
+
+static int
+close_sock(lm_ctx_t *lm_ctx)
+{
+ return close(lm_ctx->conn_fd);
+}
+
+static int
+get_request_sock(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ int *fds, int *nr_fds)
+{
+ int ret;
+ struct iovec iov = {.iov_base = hdr, .iov_len = sizeof *hdr};
+ struct msghdr msg = {.msg_iov = &iov, .msg_iovlen = 1};
+ struct cmsghdr *cmsg;
+
+ msg.msg_controllen = CMSG_SPACE(sizeof(int) * *nr_fds);
+ msg.msg_control = alloca(msg.msg_controllen);
+
+ /*
+ * TODO ideally we should set O_NONBLOCK on the fd so that the syscall is
+     * faster (?). I tried that and got short reads, so we would need to store
+     * the partially received buffer somewhere and retry (see the sketch after
+     * this function).
+ */
+ ret = recvmsg(lm_ctx->conn_fd, &msg, lm_ctx->sock_flags);
+ if (ret == -1) {
+ return -errno;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+ if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) {
+ continue;
+ }
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
+ return -EINVAL;
+ }
+ int size = cmsg->cmsg_len - CMSG_LEN(0);
+ if (size % sizeof(int) != 0) {
+ return -EINVAL;
+ }
+ *nr_fds = (int)(size / sizeof(int));
+ memcpy(fds, CMSG_DATA(cmsg), *nr_fds * sizeof(int));
+ break;
+ }
+
+ return ret;
+}
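+
+/*
+ * A minimal sketch (not part of this change) of the retry logic hinted at in
+ * the TODO above: if the connection were left in O_NONBLOCK mode, the
+ * partially received header would have to be accumulated across calls. The
+ * struct and function names here are illustrative only.
+ */
+struct partial_hdr {
+    struct vfio_user_header hdr;
+    size_t got;                     /* bytes of hdr received so far */
+};
+
+static int __attribute__((unused))
+recv_header_nonblock(int conn_fd, struct partial_hdr *p)
+{
+    ssize_t ret = recv(conn_fd, (char *)&p->hdr + p->got,
+                       sizeof p->hdr - p->got, MSG_DONTWAIT);
+    if (ret == -1) {
+        return -errno;              /* -EAGAIN: nothing available yet */
+    }
+    if (ret == 0) {
+        return -ENOTCONN;           /* peer closed the connection */
+    }
+    p->got += ret;
+    if (p->got < sizeof p->hdr) {
+        return -EAGAIN;             /* header still incomplete, call again */
+    }
+    p->got = 0;                     /* reset for the next message */
+    return 0;
+}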
+
+static ssize_t
+recv_fds_sock(lm_ctx_t *lm_ctx, void *buf, size_t size)
+{
+ ssize_t ret = muser_recv_fds(lm_ctx->conn_fd, buf, size / sizeof(int));
+ if (ret < 0) {
+ return ret;
+ }
+ return ret * sizeof(int);
+}
+
+static struct transport_ops {
+ int (*init)(lm_ctx_t*);
+ int (*attach)(lm_ctx_t*);
+    int (*detach)(lm_ctx_t*);
+ int (*get_request)(lm_ctx_t*, struct vfio_user_header*, int *fds, int *nr_fds);
+ ssize_t (*recv_fds)(lm_ctx_t*, void *buf, size_t size);
+} transports_ops[] = {
+ [LM_TRANS_SOCK] = {
+ .init = init_sock,
+ .attach = open_sock,
+ .detach = close_sock,
+ .recv_fds = recv_fds_sock,
+ .get_request = get_request_sock,
+ }
+};
#define LM2VFIO_IRQT(type) (type - 1)
@@ -98,6 +605,7 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...)
{
va_list ap;
char buf[BUFSIZ];
+ int _errno = errno;
assert(lm_ctx != NULL);
@@ -108,7 +616,8 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...)
va_start(ap, fmt);
vsnprintf(buf, sizeof buf, fmt, ap);
va_end(ap);
- lm_ctx->log(lm_ctx->pvt, buf);
+ lm_ctx->log(lm_ctx->pvt, lvl, buf);
+ errno = _errno;
}
static const char *
@@ -137,11 +646,14 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index)
case VFIO_PCI_INTX_IRQ_INDEX:
case VFIO_PCI_MSI_IRQ_INDEX:
case VFIO_PCI_MSIX_IRQ_INDEX:
- lm_log(lm_ctx, LM_DBG, "disabling IRQ %s\n", vfio_irq_idx_to_str(index));
+ lm_log(lm_ctx, LM_DBG, "disabling IRQ %s", vfio_irq_idx_to_str(index));
lm_ctx->irqs.type = IRQ_NONE;
for (i = 0; i < lm_ctx->irqs.max_ivs; i++) {
if (lm_ctx->irqs.efds[i] >= 0) {
- (void)close(lm_ctx->irqs.efds[i]);
+ if (close(lm_ctx->irqs.efds[i]) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m",
+ lm_ctx->irqs.efds[i]);
+ }
lm_ctx->irqs.efds[i] = -1;
}
}
@@ -155,12 +667,17 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index)
}
if (irq_efd != NULL) {
- (void)close(*irq_efd);
- *irq_efd = -1;
+ if (*irq_efd != -1) {
+ if (close(*irq_efd) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m",
+ *irq_efd);
+ }
+ *irq_efd = -1;
+ }
return 0;
}
- lm_log(lm_ctx, LM_DBG, "failed to disable IRQs\n");
+ lm_log(lm_ctx, LM_DBG, "failed to disable IRQs");
return -EINVAL;
}
@@ -178,9 +695,8 @@ irqs_set_data_none(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
val = 1;
ret = eventfd_write(efd, val);
if (ret == -1) {
- ret = -errno;
- lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m\n");
- return ret;
+ lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m");
+ return -errno;
}
}
}
@@ -206,9 +722,8 @@ irqs_set_data_bool(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
val = 1;
ret = eventfd_write(efd, val);
if (ret == -1) {
- ret = -errno;
- lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m\n");
- return ret;
+ lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m");
+ return -errno;
}
}
}
@@ -228,13 +743,16 @@ irqs_set_data_eventfd(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data
i++, d32++) {
efd = lm_ctx->irqs.efds[i];
if (efd >= 0) {
- (void) close(efd);
+ if (close(efd) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", efd);
+ }
+
lm_ctx->irqs.efds[i] = -1;
}
if (*d32 >= 0) {
lm_ctx->irqs.efds[i] = *d32;
}
- lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d\n", i, lm_ctx->irqs.efds[i]);
+ lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d", i, lm_ctx->irqs.efds[i]);
}
return 0;
@@ -252,7 +770,7 @@ irqs_trigger(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
return irqs_disable(lm_ctx, irq_set->index);
}
- lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=0x%x\n",
+ lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=%#lx",
vfio_irq_idx_to_str(irq_set->index), irq_set->flags);
switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
@@ -334,6 +852,17 @@ dev_set_irqs_validate(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set)
return 0;
}
+static int
+device_reset(lm_ctx_t *lm_ctx)
+{
+ lm_log(lm_ctx, LM_DBG, "Device reset called by client");
+ if (lm_ctx->reset != NULL) {
+ return lm_ctx->reset(lm_ctx->pvt);
+ }
+
+ return 0;
+}
+
static long
dev_set_irqs(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data)
{
@@ -368,7 +897,8 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info)
// Ensure provided argsz is sufficiently big and index is within bounds.
if ((irq_info->argsz < sizeof(struct vfio_irq_info)) ||
(irq_info->index >= LM_DEV_NUM_IRQS)) {
- lm_log(lm_ctx, LM_DBG, "bad irq_info\n");
+ lm_log(lm_ctx, LM_DBG, "bad irq_info (size=%d index=%d)\n",
+ irq_info->argsz, irq_info->index);
return -EINVAL;
}
@@ -380,66 +910,94 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info)
/*
* Populate the sparse mmap capability information to vfio-client.
- * kernel/muser constructs the response for VFIO_DEVICE_GET_REGION_INFO
- * accommodating sparse mmap information.
 * Sparse mmap information follows struct vfio_region_info and cap_offset
* points accordingly.
*/
static int
-dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg,
- struct vfio_region_info *vfio_reg)
+dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index,
+ struct vfio_region_info **vfio_reg)
{
+ struct vfio_info_cap_header *header;
+ struct vfio_region_info_cap_type *type = NULL;
struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
struct lm_sparse_mmap_areas *mmap_areas;
int nr_mmap_areas, i;
- size_t size;
- ssize_t ret;
-
- if (lm_reg->mmap_areas == NULL)
- return -EINVAL;
+ size_t type_size = 0;
+ size_t sparse_size = 0;
+ size_t cap_size;
+ void *cap_ptr;
- nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas;
- size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas));
+ if (reg_index == LM_DEV_MIGRATION_REG_IDX) {
+ type_size = sizeof(struct vfio_region_info_cap_type);
+ }
- /*
- * If vfio_reg does not have enough space to accommodate sparse info then
- * set the argsz with the expected size and return. Vfio client will call
- * back after reallocating the vfio_reg
- */
+ if (lm_reg->mmap_areas != NULL) {
+ nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas;
+ sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas));
+ }
- if (vfio_reg->argsz < size + sizeof(*vfio_reg)) {
- vfio_reg->argsz = size + sizeof(*vfio_reg);
- vfio_reg->cap_offset = 0;
+ cap_size = type_size + sparse_size;
+ if (cap_size == 0) {
return 0;
}
- lm_log(lm_ctx, LM_DBG, "%s: size %llu, nr_mmap_areas %u\n", __func__, size,
- nr_mmap_areas);
- sparse = calloc(1, size);
- if (sparse == NULL)
+    /* TODO doesn't need to be calloc, we overwrite it entirely */
+ header = calloc(1, cap_size);
+ if (header == NULL) {
return -ENOMEM;
- sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
- sparse->header.version = 1;
- sparse->header.next = 0;
- sparse->nr_areas = nr_mmap_areas;
+ }
+
+ if (reg_index == LM_DEV_MIGRATION_REG_IDX) {
+ type = (struct vfio_region_info_cap_type*)header;
+ type->header.id = VFIO_REGION_INFO_CAP_TYPE;
+ type->header.version = 1;
+ type->header.next = 0;
+ type->type = VFIO_REGION_TYPE_MIGRATION;
+ type->subtype = VFIO_REGION_SUBTYPE_MIGRATION;
+ (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info);
+ }
- mmap_areas = lm_reg->mmap_areas;
- for (i = 0; i < nr_mmap_areas; i++) {
- sparse->areas[i].offset = mmap_areas->areas[i].start;
- sparse->areas[i].size = mmap_areas->areas[i].size;
+ if (lm_reg->mmap_areas != NULL) {
+ if (type != NULL) {
+ type->header.next = (*vfio_reg)->cap_offset + sizeof(struct vfio_region_info_cap_type);
+ sparse = (struct vfio_region_info_cap_sparse_mmap*)(type + 1);
+ } else {
+ (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info);
+ sparse = (struct vfio_region_info_cap_sparse_mmap*)header;
+ }
+ sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+ sparse->header.version = 1;
+ sparse->header.next = 0;
+ sparse->nr_areas = nr_mmap_areas;
+
+ lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__,
+ sparse_size, nr_mmap_areas);
+ mmap_areas = lm_reg->mmap_areas;
+ for (i = 0; i < nr_mmap_areas; i++) {
+ sparse->areas[i].offset = mmap_areas->areas[i].start;
+ sparse->areas[i].size = mmap_areas->areas[i].size;
+ lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__,
+ i, sparse->areas[i].offset, sparse->areas[i].size);
+ }
}
- /* write the sparse mmap cap info to vfio-client user pages */
- ret = write(lm_ctx->fd, sparse, size);
- if (ret != (ssize_t)size) {
- free(sparse);
- return -EIO;
+ /*
+ * FIXME VFIO_REGION_INFO_FLAG_MMAP is valid if the region is
+ * memory-mappable in general, not only if it supports sparse mmap.
+ */
+ (*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS;
+
+ (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg);
+ *vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz);
+ if (*vfio_reg == NULL) {
+ free(header);
+ return -ENOMEM;
}
- vfio_reg->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS;
- vfio_reg->cap_offset = sizeof(*vfio_reg);
+ cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset;
+ memcpy(cap_ptr, header, cap_size);
- free(sparse);
+ free(header);
return 0;
}
@@ -458,42 +1016,73 @@ offset_to_region(uint64_t offset)
return (offset >> LM_REGION_SHIFT) & LM_REGION_MASK;
}
+#ifdef LM_VERBOSE_LOGGING
+void
+dump_buffer(const char *prefix, const char *buf, uint32_t count)
+{
+ int i;
+ const size_t bytes_per_line = 0x8;
+
+ if (strcmp(prefix, "")) {
+ fprintf(stderr, "%s\n", prefix);
+ }
+ for (i = 0; i < (int)count; i++) {
+ if (i % bytes_per_line != 0) {
+ fprintf(stderr, " ");
+ }
+ /* TODO valgrind emits a warning if count is 1 */
+ fprintf(stderr,"0x%02x", *(buf + i));
+ if ((i + 1) % bytes_per_line == 0) {
+ fprintf(stderr, "\n");
+ }
+ }
+ if (i % bytes_per_line != 0) {
+ fprintf(stderr, "\n");
+ }
+}
+#else
+#define dump_buffer(prefix, buf, count)
+#endif
+
static long
-dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info *vfio_reg)
+dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg)
{
lm_reg_info_t *lm_reg;
int err;
assert(lm_ctx != NULL);
- assert(vfio_reg != NULL);
- lm_reg = &lm_ctx->pci_info.reg_info[vfio_reg->index];
+ assert(*vfio_reg != NULL);
+ lm_reg = &lm_ctx->pci_info.reg_info[(*vfio_reg)->index];
// Ensure provided argsz is sufficiently big and index is within bounds.
- if ((vfio_reg->argsz < sizeof(struct vfio_region_info)) ||
- (vfio_reg->index >= LM_DEV_NUM_REGS)) {
+ if (((*vfio_reg)->argsz < sizeof(struct vfio_region_info)) ||
+ ((*vfio_reg)->index >= LM_DEV_NUM_REGS)) {
+ lm_log(lm_ctx, LM_DBG, "bad args argsz=%d index=%d",
+ (*vfio_reg)->argsz, (*vfio_reg)->index);
return -EINVAL;
}
- vfio_reg->offset = region_to_offset(vfio_reg->index);
- vfio_reg->flags = lm_reg->flags;
- vfio_reg->size = lm_reg->size;
+ (*vfio_reg)->offset = region_to_offset((*vfio_reg)->index);
+ (*vfio_reg)->flags = lm_reg->flags;
+ (*vfio_reg)->size = lm_reg->size;
- if (lm_reg->mmap_areas != NULL) {
- err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, vfio_reg);
- if (err) {
- return err;
- }
+ err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg);
+ if (err) {
+ return err;
}
- lm_log(lm_ctx, LM_DBG, "region_info[%d]\n", vfio_reg->index);
- dump_buffer(lm_ctx, "", (char*)vfio_reg, sizeof *vfio_reg);
+ lm_log(lm_ctx, LM_DBG, "region_info[%d] offset %#lx flags %#x size %llu "
+ "argsz %llu",
+ (*vfio_reg)->index, (*vfio_reg)->offset, (*vfio_reg)->flags,
+ (*vfio_reg)->size, (*vfio_reg)->argsz);
return 0;
}
static long
-dev_get_info(struct vfio_device_info *dev_info)
+dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info)
{
+ assert(lm_ctx != NULL);
assert(dev_info != NULL);
// Ensure provided argsz is sufficiently big.
@@ -508,173 +1097,81 @@ dev_get_info(struct vfio_device_info *dev_info)
return 0;
}
-static long
-do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data)
-{
- int err = -ENOTSUP;
-
- assert(lm_ctx != NULL);
- switch (cmd_ioctl->vfio_cmd) {
- case VFIO_DEVICE_GET_INFO:
- err = dev_get_info(&cmd_ioctl->data.dev_info);
- break;
- case VFIO_DEVICE_GET_REGION_INFO:
- err = dev_get_reginfo(lm_ctx, &cmd_ioctl->data.reg_info);
- break;
- case VFIO_DEVICE_GET_IRQ_INFO:
- err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info);
- break;
- case VFIO_DEVICE_SET_IRQS:
- err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data);
- break;
- case VFIO_DEVICE_RESET:
- if (lm_ctx->reset != NULL) {
- return lm_ctx->reset(lm_ctx->pvt);
- }
- lm_log(lm_ctx, LM_DBG, "reset called but not reset function present\n");
- break;
- }
-
- return err;
-}
-
-static void
-get_path_from_fd(lm_ctx_t *lm_ctx, int fd, char *buf)
-{
- int err;
- ssize_t ret;
- char pathname[PATH_MAX];
-
- err = snprintf(pathname, PATH_MAX, "/proc/self/fd/%d", fd);
- if (err >= PATH_MAX || err == -1) {
- buf[0] = '\0';
- }
- ret = readlink(pathname, buf, PATH_MAX);
- if (ret == -1) {
- lm_log(lm_ctx, LM_DBG, "failed to readlink %s: %m\n", pathname);
- ret = 0;
- } else if (ret == PATH_MAX) {
- lm_log(lm_ctx, LM_DBG, "failed to readlink %s, output truncated\n",
- pathname);
- ret -= 1;
- }
- buf[ret] = '\0';
-}
-
-static int
-muser_dma_unmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- int err;
- char buf[PATH_MAX];
-
- get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf);
-
- lm_log(lm_ctx, LM_INF, "removing DMA region fd=%d path=%s %#lx-%#lx\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len);
-
- if (lm_ctx->dma == NULL) {
- lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
- return -EINVAL;
- }
-
- err = dma_controller_remove_region(lm_ctx->dma,
- cmd->mmap.request.addr,
- cmd->mmap.request.len,
- cmd->mmap.request.fd);
- if (err != 0) {
- lm_log(lm_ctx, LM_ERR, "failed to remove DMA region fd=%d path=%s %#lx-%#lx: %s\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len,
- strerror(err));
- }
-
- return err;
-}
-
-static int
-muser_dma_map(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
-{
- int err;
- char buf[PATH_MAX];
-
- get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf);
-
- lm_log(lm_ctx, LM_INF, "adding DMA region fd=%d path=%s iova=%#lx-%#lx offset=%#lx\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len,
- cmd->mmap.request.offset);
-
- if (lm_ctx->dma == NULL) {
- lm_log(lm_ctx, LM_ERR, "DMA not initialized\n");
- return -EINVAL;
- }
-
- err = dma_controller_add_region(lm_ctx, lm_ctx->dma,
- cmd->mmap.request.addr,
- cmd->mmap.request.len,
- cmd->mmap.request.fd,
- cmd->mmap.request.offset);
- if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to add DMA region fd=%d path=%s %#lx-%#lx: %d\n",
- cmd->mmap.request.fd, buf, cmd->mmap.request.addr,
- cmd->mmap.request.addr + cmd->mmap.request.len, err);
- }
-
- return 0;
+int
+muser_send_fds(int sock, int *fds, size_t count) {
+ struct msghdr msg = { 0 };
+ size_t size = count * sizeof *fds;
+ char buf[CMSG_SPACE(size)];
+ memset(buf, '\0', sizeof(buf));
+
+ /* XXX requires at least one byte */
+ struct iovec io = { .iov_base = "\0", .iov_len = 1 };
+
+ msg.msg_iov = &io;
+ msg.msg_iovlen = 1;
+ msg.msg_control = buf;
+ msg.msg_controllen = sizeof(buf);
+
+ struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(size);
+ memcpy(CMSG_DATA(cmsg), fds, size);
+ msg.msg_controllen = CMSG_SPACE(size);
+ return sendmsg(sock, &msg, 0);
}
-/*
- * Callback that is executed when device memory is to be mmap'd.
- */
-static int
-muser_mmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
+ssize_t
+muser_recv_fds(int sock, int *fds, size_t count)
{
- int region, err = 0;
- unsigned long addr;
- unsigned long len = cmd->mmap.request.len;
- loff_t offset = cmd->mmap.request.addr;
+ int ret;
+ struct cmsghdr *cmsg;
+ size_t fds_size;
+ char msg_buf[sysconf(_SC_PAGESIZE)];
+ struct iovec io = {.iov_base = msg_buf, .iov_len = sizeof(msg_buf)};
+ char cmsg_buf[sysconf(_SC_PAGESIZE)];
+ struct msghdr msg = {
+ .msg_iov = &io,
+ .msg_iovlen = 1,
+ .msg_control = cmsg_buf,
+ .msg_controllen = sizeof(cmsg_buf)
+ };
- region = lm_get_region(offset, len, &offset);
- if (region < 0) {
- lm_log(lm_ctx, LM_ERR, "bad region %d\n", region);
- err = EINVAL;
- goto out;
+ if (fds == NULL || count <= 0) {
+ errno = EINVAL;
+ return -1;
}
- if (lm_ctx->pci_info.reg_info[region].map == NULL) {
- lm_log(lm_ctx, LM_ERR, "region not mmapable\n");
- err = ENOTSUP;
- goto out;
+ ret = recvmsg(sock, &msg, 0);
+ if (ret == -1) {
+ return ret;
}
- addr = lm_ctx->pci_info.reg_info[region].map(lm_ctx->pvt, offset, len);
- if ((void *)addr == MAP_FAILED) {
- err = errno;
- lm_log(lm_ctx, LM_ERR, "failed to mmap: %m\n");
- goto out;
+ cmsg = CMSG_FIRSTHDR(&msg);
+ if (cmsg == NULL) {
+ errno = EINVAL;
+ return -1;
}
- cmd->mmap.response = addr;
-
-out:
- if (err != 0) {
- lm_log(lm_ctx, LM_ERR, "failed to mmap device memory %#x-%#lx: %s\n",
- offset, offset + len, strerror(err));
+ fds_size = cmsg->cmsg_len - sizeof *cmsg;
+ if ((fds_size % sizeof(int)) != 0 || fds_size / sizeof (int) > count) {
+ errno = EINVAL;
+ return -1;
}
+ memcpy((void*)fds, CMSG_DATA(cmsg), cmsg->cmsg_len - sizeof *cmsg);
- return -err;
+ return fds_size / sizeof(int);
}
/*
- * Returns the number of bytes communicated to the kernel (may be less than
- * ret), or a negative number on error.
+ * Returns the number of bytes sent (may be less than count), or a negative
+ * number on error.
*/
static int
post_read(lm_ctx_t *lm_ctx, char *rwbuf, ssize_t count)
{
ssize_t ret;
- ret = write(lm_ctx->fd, rwbuf, count);
+ ret = write(lm_ctx->conn_fd, rwbuf, count);
if (ret != count) {
lm_log(lm_ctx, LM_ERR, "%s: bad muser write: %lu/%lu, %s\n",
__func__, ret, count, strerror(errno));
@@ -719,17 +1216,274 @@ handle_pci_config_space_access(lm_ctx_t *lm_ctx, char *buf, size_t count,
int ret;
count = MIN(pci_config_space_size(lm_ctx), count);
- ret = cap_maybe_access(lm_ctx->caps, lm_ctx->pvt, buf, count, pos, is_write);
+ if (is_write) {
+ ret = cap_maybe_access(lm_ctx, lm_ctx->caps, buf, count, pos);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count,
+ pos);
+ return ret;
+ }
+ } else {
+ memcpy(buf, lm_ctx->pci_config_space->raw + pos, count);
+ }
+ return count;
+}
+
+/* valid migration state transitions */
+__u32 migration_states[VFIO_DEVICE_STATE_MASK] = {
+ [VFIO_DEVICE_STATE_STOP] = 1 << VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING] = /* running */
+ (1 << VFIO_DEVICE_STATE_STOP) |
+ (1 << VFIO_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_DEVICE_STATE_SAVING) |
+ (1 << (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) |
+ (1 << VFIO_DEVICE_STATE_RESUMING),
+ [VFIO_DEVICE_STATE_SAVING] = /* stop-and-copy */
+ (1 << VFIO_DEVICE_STATE_STOP) |
+ (1 << VFIO_DEVICE_STATE_SAVING),
+ [VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING] = /* pre-copy */
+ (1 << VFIO_DEVICE_STATE_SAVING) |
+        (1 << (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)),
+ [VFIO_DEVICE_STATE_RESUMING] = /* resuming */
+ (1 << VFIO_DEVICE_STATE_RUNNING) |
+ (1 << VFIO_DEVICE_STATE_RESUMING)
+};
+
+static bool
+_migration_state_transition_is_valid(__u32 from, __u32 to)
+{
+ return migration_states[from] & (1 << to);
+}
+
+static ssize_t
+handle_migration_device_state(lm_ctx_t *lm_ctx, __u32 *device_state,
+ bool is_write) {
+
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(device_state != NULL);
+
+ if (!is_write) {
+ *device_state = lm_ctx->migration.info.device_state;
+ return 0;
+ }
+
+ if (*device_state & ~VFIO_DEVICE_STATE_MASK) {
+ return -EINVAL;
+ }
+
+ if (!_migration_state_transition_is_valid(lm_ctx->migration.info.device_state,
+ *device_state)) {
+ return -EINVAL;
+ }
+
+ switch (*device_state) {
+ case VFIO_DEVICE_STATE_STOP:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_STOP);
+ break;
+ case VFIO_DEVICE_STATE_RUNNING:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_START);
+ break;
+ case VFIO_DEVICE_STATE_SAVING:
+ /*
+ * FIXME How should the device operate during the stop-and-copy
+ * phase? Should we only allow the migration data to be read from
+             * the migration region, e.g. should accesses to any other region
+             * fail? This might be a good question to send to LKML.
+ */
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_STOP_AND_COPY);
+ break;
+ case VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_PRE_COPY);
+ break;
+ case VFIO_DEVICE_STATE_RESUMING:
+ ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt,
+ LM_MIGR_STATE_RESUME);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret == 0) {
+ lm_ctx->migration.info.device_state = *device_state;
+ }
+
+ return ret;
+}
+
+static ssize_t
+handle_migration_pending_bytes(lm_ctx_t *lm_ctx, __u64 *pending_bytes,
+ bool is_write)
+{
+ assert(lm_ctx != NULL);
+ assert(pending_bytes != NULL);
+
+ if (is_write) {
+ return -EINVAL;
+ }
+
+ if (lm_ctx->migration.iter.state == VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED) {
+ *pending_bytes = 0;
+ return 0;
+ }
+
+ *pending_bytes = lm_ctx->migration.callbacks.get_pending_bytes(lm_ctx->pvt);
+
+ switch (lm_ctx->migration.iter.state) {
+ case VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL:
+ case VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED:
+ /*
+ * FIXME what happens if data haven't been consumed in the previous
+ * iteration? Ask on LKML.
+ */
+ if (*pending_bytes == 0) {
+ lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED;
+ } else {
+ lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_STARTED;
+ }
+ break;
+ case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED:
+ /*
+ * Repeated reads of pending_bytes should not have any side effects.
+ * FIXME does it have to be the same as the previous value? Can it
+ * increase or even decrease? I suppose it can't be lower than
+ * data_size? Ask on LKML.
+ */
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t
+handle_migration_data_offset(lm_ctx_t *lm_ctx, __u64 *offset, bool is_write)
+{
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(offset != NULL);
+
+ if (is_write) {
+ return -EINVAL;
+ }
+
+ switch (lm_ctx->migration.iter.state) {
+ case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED:
+ break;
+ default:
+ /*
+ * FIXME it's not clear whether these registers can be accessed in
+ * other parts of the iteration, need clarification on the
+ * following:
+ *
+ * Read on data_offset and data_size should return the offset and
+ * size of the current buffer if the user application reads
+ * data_offset and data_size more than once here.
+ */
+ return -EINVAL;
+ }
+
+ ret = lm_ctx->migration.callbacks.prepare_data(lm_ctx->pvt,
+ &lm_ctx->migration.iter.offset,
+ &lm_ctx->migration.iter.size);
if (ret < 0) {
- lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count,
- pos);
return ret;
}
- return count;
+
+ *offset = lm_ctx->migration.iter.offset + sizeof(struct vfio_device_migration_info);
+
+ return ret;
+}
+
+static ssize_t
+handle_migration_data_size(lm_ctx_t *lm_ctx, __u64 *size, bool is_write)
+{
+ assert(lm_ctx != NULL);
+ assert(size != NULL);
+
+ if (is_write) {
+ return -EINVAL;
+ }
+
+ switch (lm_ctx->migration.iter.state) {
+ case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED:
+ break;
+ default:
+ /* FIXME see comment in handle_migration_data_offset */
+ return -EINVAL;
+ }
+
+ *size = lm_ctx->migration.iter.size;
+
+ return 0;
}
static ssize_t
-do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
+handle_migration_region_access(lm_ctx_t *lm_ctx, char *buf, size_t count,
+ loff_t pos, bool is_write)
+{
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(buf != NULL);
+
+ if (pos + count > lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size) {
+ lm_log(lm_ctx, LM_ERR, "read %#x-%#x past end of migration region",
+ pos, pos + count - 1);
+ return -EINVAL;
+ }
+ switch (pos) {
+ case offsetof(struct vfio_device_migration_info, device_state):
+ if (count != sizeof(lm_ctx->migration.info.device_state)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_device_state(lm_ctx, (__u32*)buf,
+ is_write);
+ break;
+ case offsetof(struct vfio_device_migration_info, pending_bytes):
+ if (count != sizeof(lm_ctx->migration.info.pending_bytes)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_pending_bytes(lm_ctx, (__u64*)buf, is_write);
+ break;
+ case offsetof(struct vfio_device_migration_info, data_offset):
+ if (count != sizeof(lm_ctx->migration.info.data_offset)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_data_offset(lm_ctx, (__u64*)buf, is_write);
+ break;
+ case offsetof(struct vfio_device_migration_info, data_size):
+ if (count != sizeof(lm_ctx->migration.info.data_size)) {
+ return -EINVAL;
+ }
+ ret = handle_migration_data_size(lm_ctx, (__u64*)buf, is_write);
+ break;
+ default:
+ if (is_write) {
+ /* FIXME how do we handle the offset? */
+ ret = lm_ctx->migration.callbacks.write_data(lm_ctx->pvt,
+ buf, count);
+ } else {
+ ret = lm_ctx->migration.callbacks.read_data(lm_ctx->pvt,
+ buf, count,
+ pos - sizeof(struct vfio_device_migration_info));
+ }
+ }
+
+ if (ret == 0) {
+ ret = count;
+ }
+ return ret;
+}
+
+static ssize_t
+do_access(lm_ctx_t *lm_ctx, char *buf, uint8_t count, uint64_t pos, bool is_write)
{
int idx;
loff_t offset;
@@ -737,7 +1491,7 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
assert(lm_ctx != NULL);
assert(buf != NULL);
- assert(count > 0);
+ assert(count == 1 || count == 2 || count == 4 || count == 8);
pci_info = &lm_ctx->pci_info;
idx = lm_get_region(pos, count, &offset);
@@ -756,6 +1510,11 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
is_write);
}
+ if (idx == LM_DEV_MIGRATION_REG_IDX) {
+ return handle_migration_region_access(lm_ctx, buf, count, offset,
+ is_write);
+ }
+
/*
* Checking whether a callback exists might sound expensive however this
* code is not performance critical. This works well when we don't expect a
@@ -777,12 +1536,15 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write)
* error.
*
* TODO function name same lm_access_t, fix
+ * FIXME we must be able to return values up to UINT32_MAX, or a negative
+ * value on error. Better to make the return value an int and return the
+ * number of bytes processed via an argument.
*/
ssize_t
-lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
+lm_access(lm_ctx_t *lm_ctx, char *buf, uint32_t count, uint64_t *ppos,
bool is_write)
{
- unsigned int done = 0;
+ uint32_t done = 0;
int ret;
assert(lm_ctx != NULL);
@@ -792,7 +1554,10 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
size_t size;
/*
* Limit accesses to qword and enforce alignment. Figure out whether
- * the PCI spec requires this.
+ * the PCI spec requires this
+ * FIXME while this makes sense for registers, we might be able to relax
+ * this requirement and make some transfers more efficient. Maybe make
+ * this a per-region option that can be set by the user?
*/
if (count >= 8 && !(*ppos % 8)) {
size = 8;
@@ -805,15 +1570,16 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
}
ret = do_access(lm_ctx, buf, size, *ppos, is_write);
if (ret <= 0) {
- lm_log(lm_ctx, LM_ERR, "failed to %s %llx@%lx: %s\n",
- is_write ? "write" : "read", size, *ppos, strerror(-ret));
+ lm_log(lm_ctx, LM_ERR, "failed to %s %#lx-%#lx: %s",
+ is_write ? "write to" : "read from", *ppos, *ppos + size - 1,
+ strerror(-ret));
/*
* TODO if ret < 0 then it might contain a legitimate error code, why replace it with EFAULT?
*/
return -EFAULT;
}
if (ret != (int)size) {
- lm_log(lm_ctx, LM_DBG, "bad read %d != %d\n", ret, size);
+ lm_log(lm_ctx, LM_DBG, "bad read %d != %d", ret, size);
}
count -= size;
done += size;
@@ -824,50 +1590,54 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos,
}
static inline int
-muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write)
+muser_access(lm_ctx_t *lm_ctx, bool is_write, void **data, uint32_t count,
+ uint64_t *pos)
{
+ struct vfio_user_region_access *region_access;
char *rwbuf;
int err;
- size_t count = 0, _count;
- ssize_t ret;
+ uint32_t processed = 0, _count;
+ int ret;
+
+ assert(pos != NULL);
/* TODO how big do we expect count to be? Can we use alloca(3) instead? */
- rwbuf = calloc(1, cmd->rw.count);
- if (rwbuf == NULL) {
+ region_access = calloc(1, sizeof(*region_access) + count);
+ if (region_access == NULL) {
lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n");
return -1;
}
+ rwbuf = (char*)(region_access + 1);
-#ifndef LM_TERSE_LOGGING
- lm_log(lm_ctx, LM_DBG, "%s %x@%lx\n", is_write ? "W" : "R", cmd->rw.count,
- cmd->rw.pos);
-#endif
+ lm_log(lm_ctx, LM_DBG, "%s %#lx-%#lx", is_write ? "W" : "R", *pos,
+ *pos + count - 1);
- /* copy data to be written from kernel to user space */
+ /* receive data to be written */
if (is_write) {
- err = read(lm_ctx->fd, rwbuf, cmd->rw.count);
+ err = read(lm_ctx->conn_fd, rwbuf, count);
/*
* FIXME this is wrong, we should be checking for
- * err != cmd->rw.count
+ * err != count
*/
if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s\n",
+ lm_log(lm_ctx, LM_ERR, "failed to receive write payload: %s",
strerror(errno));
goto out;
}
err = 0;
-#ifndef LM_TERSE_LOGGING
- dump_buffer(lm_ctx, "buffer write", rwbuf, cmd->rw.count);
+#ifdef LM_VERBOSE_LOGGING
+ dump_buffer("buffer write", rwbuf, count);
#endif
}
- count = _count = cmd->rw.count;
- cmd->err = muser_pci_hdr_access(lm_ctx, &_count, &cmd->rw.pos,
- is_write, rwbuf);
- if (cmd->err) {
- lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %d\n", cmd->err);
-#ifndef LM_TERSE_LOGGING
- dump_buffer(lm_ctx, "buffer write", rwbuf, _count);
+ _count = count;
+ ret = muser_pci_hdr_access(lm_ctx, &_count, pos, is_write, rwbuf);
+ if (ret != 0) {
+ /* FIXME shouldn't we fail here? */
+ lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %s",
+ strerror(-ret));
+#ifdef LM_VERBOSE_LOGGING
+ dump_buffer("buffer write", rwbuf, _count);
#endif
}
@@ -875,150 +1645,618 @@ muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write)
* count is how much has been processed by muser_pci_hdr_access,
* _count is how much there's left to be processed by lm_access
*/
- count -= _count;
- ret = lm_access(lm_ctx, rwbuf + count, _count, &cmd->rw.pos,
- is_write);
- if (!is_write && ret >= 0) {
- ret += count;
- err = post_read(lm_ctx, rwbuf, ret);
- if (!LM_TERSE_LOGGING && err == ret) {
- dump_buffer(lm_ctx, "buffer read", rwbuf, ret);
+ processed = count - _count;
+ ret = lm_access(lm_ctx, rwbuf + processed, _count, pos, is_write);
+ if (ret >= 0) {
+ ret += processed;
+ if (data != NULL) {
+ /*
+             * FIXME the spec doesn't specify whether the rest of the
+ * region_access struct needs to be populated.
+ */
+ region_access->count = ret;
+ *data = region_access;
+ return ret;
+ } else if (!is_write) {
+ err = post_read(lm_ctx, rwbuf, ret);
+#ifdef LM_VERBOSE_LOGGING
+ if (err == ret) {
+ dump_buffer("buffer read", rwbuf, ret);
+ }
+#endif
}
}
out:
- free(rwbuf);
+ free(region_access);
- return err;
+ return ret;
+}
+
+static int handle_device_get_region_info(lm_ctx_t *lm_ctx,
+ struct vfio_user_header *hdr,
+ struct vfio_region_info **dev_reg_info)
+{
+ struct vfio_region_info *reg_info;
+ int ret;
+
+ reg_info = calloc(sizeof(*reg_info), 1);
+ if (reg_info == NULL) {
+ return -ENOMEM;
+ }
+
+ if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*reg_info)) {
+ free(reg_info);
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, reg_info, sizeof(*reg_info), 0);
+ if (ret < 0) {
+ free(reg_info);
+ return -errno;
+ }
+
+ ret = dev_get_reginfo(lm_ctx, &reg_info);
+ if (ret < 0) {
+ free(reg_info);
+ return ret;
+ }
+ *dev_reg_info = reg_info;
+
+ return 0;
+}
+
+static int handle_device_get_info(lm_ctx_t *lm_ctx,
+ struct vfio_user_header *hdr,
+ struct vfio_device_info *dev_info)
+{
+ int ret;
+
+ if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*dev_info)) {
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, dev_info, sizeof(*dev_info), 0);
+ if (ret < 0) {
+ return -errno;
+ }
+
+ ret = dev_get_info(lm_ctx, dev_info);
+ if (ret < 0) {
+ return ret;
+ }
+
+ lm_log(lm_ctx, LM_DBG, "sent devinfo flags %#x, num_regions %d, num_irqs"
+ " %d", dev_info->flags, dev_info->num_regions, dev_info->num_irqs);
+ return ret;
}
static int
-muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd *cmd)
+handle_device_get_irq_info(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ struct vfio_irq_info *irq_info)
{
- void *data = NULL;
- size_t size = 0;
int ret;
- /* TODO make this a function that returns the size */
- if (cmd->ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS) {
- uint32_t flags = cmd->ioctl.data.irq_set.flags;
- switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) {
+ assert(lm_ctx != NULL);
+ assert(irq_info != NULL);
+
+ hdr->msg_size -= sizeof *hdr;
+
+ if (hdr->msg_size != sizeof *irq_info) {
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, irq_info, hdr->msg_size, 0);
+ if (ret < 0) {
+ return -errno;
+ }
+ if (ret != (int)hdr->msg_size) {
+ assert(false); /* FIXME */
+ }
+
+ return dev_get_irqinfo(lm_ctx, irq_info);
+}
+
+static int
+handle_device_set_irqs(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ int *fds, int nr_fds)
+{
+ int ret;
+ struct vfio_irq_set *irq_set;
+ void *data;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+
+ hdr->msg_size -= sizeof *hdr;
+
+ if (hdr->msg_size < sizeof *irq_set) {
+ return -EINVAL;
+ }
+
+ irq_set = alloca(hdr->msg_size); /* FIXME */
+
+ ret = recv(lm_ctx->conn_fd, irq_set, hdr->msg_size, 0);
+ if (ret < 0) {
+ return -errno;
+ }
+ if (ret != (int)hdr->msg_size) {
+ assert(false); /* FIXME */
+ }
+ if (ret != (int)irq_set->argsz) {
+ assert(false); /* FIXME */
+ }
+ switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
case VFIO_IRQ_SET_DATA_EVENTFD:
- size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count;
+ data = fds;
+ if (nr_fds != (int)irq_set->count) {
+ return -EINVAL;
+ }
break;
case VFIO_IRQ_SET_DATA_BOOL:
- size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count;
+ data = irq_set + 1;
break;
+ }
+
+ return dev_set_irqs(lm_ctx, irq_set, data);
+}
+
+static int
+handle_dma_map_or_unmap(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, bool map,
+ int *fds, int nr_fds)
+{
+ int ret, i;
+ int nr_dma_regions;
+ struct vfio_user_dma_region *dma_regions;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+
+ hdr->msg_size -= sizeof *hdr;
+
+ if (hdr->msg_size % sizeof(struct vfio_user_dma_region) != 0) {
+ lm_log(lm_ctx, LM_ERR, "bad size of DMA regions %d", hdr->msg_size);
+ return -EINVAL;
+ }
+
+ nr_dma_regions = (int)(hdr->msg_size / sizeof(struct vfio_user_dma_region));
+ if (map && nr_dma_regions != nr_fds) {
+ lm_log(lm_ctx, LM_ERR, "expected %d fds but got %d instead",
+ nr_dma_regions, nr_fds);
+ return -EINVAL;
+ }
+
+ dma_regions = alloca(nr_dma_regions * sizeof(*dma_regions));
+
+ ret = recv(lm_ctx->conn_fd, dma_regions, hdr->msg_size, 0);
+ if (ret == -1) {
+ lm_log(lm_ctx, LM_ERR, "failed to receive DMA region entries: %m");
+ return -errno;
+ }
+
+ if (lm_ctx->dma == NULL) {
+ return 0;
+ }
+
+ for (i = 0; i < nr_dma_regions; i++) {
+ if (map) {
+ if (dma_regions[i].flags != VFIO_USER_F_DMA_REGION_MAPPABLE) {
+ /*
+ * FIXME implement non-mappable DMA regions. This requires changing
+ * dma.c to not take a file descriptor.
+ */
+ assert(false);
+ }
+
+ ret = dma_controller_add_region(lm_ctx->dma,
+ dma_regions[i].addr,
+ dma_regions[i].size,
+ fds[i],
+ dma_regions[i].offset);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_INF,
+ "failed to add DMA region %#lx-%#lx offset=%#lx fd=%d: %s",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1,
+ dma_regions[i].offset, fds[i],
+ strerror(-ret));
+ } else {
+ lm_log(lm_ctx, LM_DBG,
+ "added DMA region %#lx-%#lx offset=%#lx fd=%d",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1,
+ dma_regions[i].offset, fds[i]);
+ }
+ } else {
+ ret = dma_controller_remove_region(lm_ctx->dma,
+ dma_regions[i].addr,
+ dma_regions[i].size,
+ lm_ctx->unmap_dma, lm_ctx->pvt);
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_INF,
+ "failed to remove DMA region %#lx-%#lx: %s",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1,
+ strerror(-ret));
+ } else {
+ lm_log(lm_ctx, LM_DBG,
+ "removed DMA region %#lx-%#lx",
+ dma_regions[i].addr,
+ dma_regions[i].addr + dma_regions[i].size - 1);
+ }
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ if (lm_ctx->map_dma != NULL) {
+ lm_ctx->map_dma(lm_ctx->pvt, dma_regions[i].addr, dma_regions[i].size);
}
}
+ return 0;
+}
- if (size != 0) {
- data = calloc(1, size);
- if (data == NULL) {
-#ifdef DEBUG
- perror("calloc");
-#endif
- return -1;
+static int
+handle_device_reset(lm_ctx_t *lm_ctx)
+{
+ return device_reset(lm_ctx);
+}
+
+static int
+handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ void **data, size_t *len)
+{
+ struct vfio_user_region_access region_access;
+ uint64_t count, offset;
+ int ret;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+ assert(data != NULL);
+
+ /*
+ * TODO Since muser_access doesn't have to handle the kernel case any more,
+ * we can avoid having to do an additional read/recv inside muser_access
+ * (one recv for struct region_access and another for the write data) by
+ * doing a single recvmsg here with an iovec where the first element of the
+ * array will be struct vfio_user_region_access and the second a buffer if
+ * it's a write. The size of the write buffer is: hdr->msg_size - sizeof
+     * *hdr - sizeof region_access, and should be equal to region_access.count
+     * (see the sketch after this function).
+ */
+
+ hdr->msg_size -= sizeof *hdr;
+ if (hdr->msg_size < sizeof region_access) {
+ lm_log(lm_ctx, LM_ERR, "message size too small (%d)", hdr->msg_size);
+ return -EINVAL;
+ }
+
+ ret = recv(lm_ctx->conn_fd, &region_access, sizeof region_access, 0);
+ if (ret == -1) {
+ lm_log(lm_ctx, LM_ERR, "failed to recv: %m");
+ return -errno;
+ }
+ if (ret != sizeof region_access) {
+ lm_log(lm_ctx, LM_ERR, "bad region_access size %d", ret);
+ return -EINVAL;
+ }
+ if (region_access.region >= LM_DEV_NUM_REGS || region_access.count <= 0 ) {
+ lm_log(lm_ctx, LM_ERR, "bad region %d and/or count %d",
+ region_access.region, region_access.count);
+ return -EINVAL;
+ }
+ count = region_access.count;
+ offset = region_to_offset(region_access.region) + region_access.offset;
+
+ ret = muser_access(lm_ctx, hdr->cmd == VFIO_USER_REGION_WRITE,
+ data, count, &offset);
+ if (ret != (int)region_access.count) {
+ lm_log(lm_ctx, LM_ERR, "bad region access acount, expected=%d, actual=%d",
+ region_access.count, ret);
+ /* FIXME we should return whatever has been accessed, not an error */
+ if (ret >= 0) {
+ ret = -EINVAL;
}
+ return ret;
+ }
- ret = read(lm_ctx->fd, data, size);
- if (ret < 0) {
-#ifdef DEBUG
- perror("read failed");
-#endif
+ *len = sizeof(region_access);
+ if (hdr->cmd == VFIO_USER_REGION_READ) {
+ *len += region_access.count;
+ }
+
+ return 0;
+}
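+
+/*
+ * Sketch only (assumption, not used by this change) of the single-recvmsg
+ * approach described in the TODO inside handle_region_access: receive struct
+ * vfio_user_region_access and the write payload with one recvmsg(2) instead
+ * of two separate recv(2) calls.
+ */
+static ssize_t __attribute__((unused))
+recv_region_access_once(int conn_fd, struct vfio_user_region_access *access,
+                        void *payload, size_t payload_len)
+{
+    struct iovec iov[2] = {
+        [0] = { .iov_base = access, .iov_len = sizeof *access },
+        [1] = { .iov_base = payload, .iov_len = payload_len }
+    };
+    struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2 };
+    ssize_t ret;
+
+    ret = recvmsg(conn_fd, &msg, MSG_WAITALL);
+    if (ret == -1) {
+        return -errno;
+    }
+    return ret;
+}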
+
+static int
+handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ struct iovec **iovecs, size_t *nr_iovecs)
+{
+ int size, ret;
+ size_t i;
+ struct vfio_iommu_type1_dirty_bitmap_get *ranges;
+
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+ assert(iovecs != NULL);
+ assert(nr_iovecs != NULL);
+
+ size = hdr->msg_size - sizeof(*hdr) - sizeof(struct vfio_iommu_type1_dirty_bitmap);
+ if (size % sizeof(struct vfio_iommu_type1_dirty_bitmap_get) != 0) {
+ return -EINVAL;
+ }
+ ranges = malloc(size);
+ if (ranges == NULL) {
+ return -errno;
+ }
+ ret = recv(lm_ctx->conn_fd, ranges, size, 0);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+ if (ret != size) {
+ ret = -EINVAL;
+ goto out;
+ }
+ *nr_iovecs = 1 + size / sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
+ *iovecs = malloc(*nr_iovecs * sizeof(struct iovec));
+ if (*iovecs == NULL) {
+ ret = -errno;
+ goto out;
+ }
+
+ for (i = 1; i < *nr_iovecs; i++) {
+ struct vfio_iommu_type1_dirty_bitmap_get *r = &ranges[(i - 1)]; /* FIXME ugly indexing */
+ ret = dma_controller_dirty_page_get(lm_ctx->dma, r->iova, r->size,
+ r->bitmap.pgsize, r->bitmap.size,
+ (char**)&((*iovecs)[i].iov_base));
+ if (ret != 0) {
goto out;
}
+ (*iovecs)[i].iov_len = r->bitmap.size;
}
+out:
+ if (ret != 0) {
+ if (*iovecs != NULL) {
+ free(*iovecs);
+ *iovecs = NULL;
+ }
+ }
+ free(ranges);
+ return ret;
+}
- ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, data);
+static int
+handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ struct iovec **iovecs, size_t *nr_iovecs)
+{
+ struct vfio_iommu_type1_dirty_bitmap dirty_bitmap;
+ int ret;
-out:
+ assert(lm_ctx != NULL);
+ assert(hdr != NULL);
+ assert(iovecs != NULL);
+ assert(nr_iovecs != NULL);
+
+ if (hdr->msg_size - sizeof *hdr < sizeof dirty_bitmap) {
+ lm_log(lm_ctx, LM_ERR, "invalid header size %lu", hdr->msg_size);
+ return -EINVAL;
+ }
+
+ /* FIXME must also check argsz */
+
+ ret = recv(lm_ctx->conn_fd, &dirty_bitmap, sizeof dirty_bitmap, 0);
+ if (ret == -1) {
+ return -errno;
+ }
+ if ((size_t)ret < sizeof dirty_bitmap) {
+ return -EINVAL;
+ }
+
+ if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+ ret = dma_controller_dirty_page_logging_start(lm_ctx->dma,
+ lm_ctx->migration.pgsize);
+ } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+ ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma);
+ } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+ ret = handle_dirty_pages_get(lm_ctx, hdr, iovecs, nr_iovecs);
+ } else {
+ ret = -EINVAL;
+ }
- free(data);
return ret;
}
+/*
+ * FIXME return value is messed up, sometimes we return -1 and set errno while
+ * other times we return -errno. Fix.
+ */
+
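+/*
+ * Sketch (assumption, not part of this change) of the common helper hinted at
+ * by the FIXME inside process_request below: validate the size advertised in
+ * the header and receive the command-specific payload in one place, always
+ * returning -errno style errors.
+ */
+static int __attribute__((unused))
+recv_payload(int conn_fd, const struct vfio_user_header *hdr,
+             void *payload, size_t payload_size)
+{
+    int ret;
+
+    if (hdr->msg_size != sizeof *hdr + payload_size) {
+        return -EINVAL;
+    }
+    ret = recv_blocking(conn_fd, payload, payload_size, 0);
+    if (ret == -1) {
+        return -errno;
+    }
+    if ((size_t)ret != payload_size) {
+        return -EINVAL;
+    }
+    return 0;
+}
+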
static int
-drive_loop(lm_ctx_t *lm_ctx)
+process_request(lm_ctx_t *lm_ctx)
{
- struct muser_cmd cmd = { 0 };
- int err;
+ struct vfio_user_header hdr = { 0, };
+ int ret;
+ int *fds = NULL;
+ int nr_fds;
+ struct vfio_irq_info irq_info;
+ struct vfio_device_info dev_info;
+ struct vfio_region_info *dev_reg_info = NULL;
+ struct iovec _iovecs[2] = { { 0, } };
+ struct iovec *iovecs = NULL;
+ size_t nr_iovecs = 0;
+ bool free_iovec_data = true;
- do {
- err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd);
- if (err < 0) {
- return err;
+ assert(lm_ctx != NULL);
+
+ if (lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size > 0 &&
+ lm_ctx->migration.info.device_state == VFIO_DEVICE_STATE_STOP) {
+ return -ESHUTDOWN;
+ }
+
+ nr_fds = lm_ctx->client_max_fds;
+ fds = alloca(nr_fds * sizeof(int));
+
+ /* FIXME get request shouldn't set errno, it should return it as -errno */
+ ret = transports_ops[lm_ctx->trans].get_request(lm_ctx, &hdr, fds, &nr_fds);
+ if (unlikely(ret < 0)) {
+ if (ret == -EAGAIN || ret == -EWOULDBLOCK) {
+ return 0;
+ }
+ if (ret != -EINTR) {
+ lm_log(lm_ctx, LM_ERR, "failed to receive request: %s", strerror(-ret));
}
+ return ret;
+ }
+ if (unlikely(ret == 0)) {
+ if (errno == EINTR) {
+ return -EINTR;
+ }
+ if (errno == 0) {
+ lm_log(lm_ctx, LM_INF, "VFIO client closed connection");
+ } else {
+ lm_log(lm_ctx, LM_ERR, "end of file: %m");
+ }
+ return -ENOTCONN;
+ }
+
+ if (ret < (int)sizeof hdr) {
+ lm_log(lm_ctx, LM_ERR, "short header read %d", ret);
+ return -EINVAL;
+ }
- switch (cmd.type) {
- case MUSER_IOCTL:
- err = muser_ioctl(lm_ctx, &cmd);
+ if (hdr.flags.type != VFIO_USER_F_TYPE_COMMAND) {
+ lm_log(lm_ctx, LM_ERR, "header not a request");
+ return -EINVAL;
+ }
+
+ if (hdr.msg_size < sizeof hdr) {
+ lm_log(lm_ctx, LM_ERR, "bad size in header %d", hdr.msg_size);
+ return -EINVAL;
+ }
+
+    /* FIXME most of the following handlers check that hdr.msg_size is at
+     * least as large as the command-specific struct and then do an additional
+     * recv(2) for that data. We should eliminate this duplicated code and
+     * move it here (see the recv_payload sketch before this function).
+     */
+
+ switch (hdr.cmd) {
+ case VFIO_USER_DMA_MAP:
+ case VFIO_USER_DMA_UNMAP:
+ ret = handle_dma_map_or_unmap(lm_ctx, &hdr,
+ hdr.cmd == VFIO_USER_DMA_MAP,
+ fds, nr_fds);
break;
- case MUSER_READ:
- case MUSER_WRITE:
- err = muser_access(lm_ctx, &cmd, cmd.type == MUSER_WRITE);
+ case VFIO_USER_DEVICE_GET_INFO:
+ ret = handle_device_get_info(lm_ctx, &hdr, &dev_info);
+ if (ret == 0) {
+ _iovecs[1].iov_base = &dev_info;
+ _iovecs[1].iov_len = dev_info.argsz;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
+ }
break;
- case MUSER_MMAP:
- err = muser_mmap(lm_ctx, &cmd);
+ case VFIO_USER_DEVICE_GET_REGION_INFO:
+ ret = handle_device_get_region_info(lm_ctx, &hdr, &dev_reg_info);
+ if (ret == 0) {
+ _iovecs[1].iov_base = dev_reg_info;
+ _iovecs[1].iov_len = dev_reg_info->argsz;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
+ }
+ break;
+ case VFIO_USER_DEVICE_GET_IRQ_INFO:
+ ret = handle_device_get_irq_info(lm_ctx, &hdr, &irq_info);
+ if (ret == 0) {
+ _iovecs[1].iov_base = &irq_info;
+ _iovecs[1].iov_len = sizeof irq_info;
+ iovecs = _iovecs;
+ nr_iovecs = 2;
+ }
break;
- case MUSER_DMA_MMAP:
- err = muser_dma_map(lm_ctx, &cmd);
+ case VFIO_USER_DEVICE_SET_IRQS:
+ ret = handle_device_set_irqs(lm_ctx, &hdr, fds, nr_fds);
break;
- case MUSER_DMA_MUNMAP:
- err = muser_dma_unmap(lm_ctx, &cmd);
+ case VFIO_USER_REGION_READ:
+ case VFIO_USER_REGION_WRITE:
+ iovecs = _iovecs;
+ ret = handle_region_access(lm_ctx, &hdr, &iovecs[1].iov_base,
+ &iovecs[1].iov_len);
+ nr_iovecs = 2;
+ break;
+ case VFIO_USER_DEVICE_RESET:
+ ret = handle_device_reset(lm_ctx);
+ break;
+ case VFIO_USER_DIRTY_PAGES:
+ ret = handle_dirty_pages(lm_ctx, &hdr, &iovecs, &nr_iovecs);
+ if (ret >= 0) {
+ free_iovec_data = false;
+ }
break;
default:
- lm_log(lm_ctx, LM_ERR, "bad command %d\n", cmd.type);
- continue;
- }
- cmd.err = err;
- err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_DONE, &cmd);
- if (err < 0) {
- lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n",
- strerror(errno));
+ lm_log(lm_ctx, LM_ERR, "bad command %d", hdr.cmd);
+ return -EINVAL;
+ }
+
+ /*
+     * TODO: if command handling fails, set the error number in the reply
+     * message accordingly.
+ */
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to handle command %d: %s", hdr.cmd,
+ strerror(-ret));
+ assert(false); /* FIXME */
+ }
+ ret = _send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true,
+ 0, iovecs, nr_iovecs, NULL, 0);
+ if (unlikely(ret < 0)) {
+ lm_log(lm_ctx, LM_ERR, "failed to complete command: %s",
+ strerror(-ret));
+ }
+ if (iovecs != NULL && iovecs != _iovecs) {
+ if (free_iovec_data) {
+ size_t i;
+ for (i = 0; i < nr_iovecs; i++) {
+ free(iovecs[i].iov_base);
+ }
}
- // TODO: Figure out a clean way to get out of the loop.
- } while (1);
+ free(iovecs);
+ }
- return err;
+ return ret;
}
int
lm_ctx_drive(lm_ctx_t *lm_ctx)
{
+ int err;
+
if (lm_ctx == NULL) {
errno = EINVAL;
return -1;
}
- return drive_loop(lm_ctx);
-}
+ do {
+ err = process_request(lm_ctx);
+ } while (err >= 0);
-static int
-dev_detach(int dev_fd)
-{
- return close(dev_fd);
+ return err;
}
-static int
-dev_attach(const char *uuid)
+int
+lm_ctx_poll(lm_ctx_t *lm_ctx)
{
- char *path;
- int dev_fd;
int err;
- err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", uuid);
- if (err != (int)(strlen(MUSER_DEVNODE) + strlen(uuid) + 6)) {
- return -1;
+ if (unlikely((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0)) {
+ return -ENOTSUP;
}
- dev_fd = open(path, O_RDWR);
-
- free(path);
+ err = process_request(lm_ctx);
- return dev_fd;
+ return err >= 0 ? 0 : err;
}
+/* FIXME this is not enough anymore, check muser_mmap */
void *
lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length)
{
@@ -1035,38 +2273,64 @@ lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length)
lm_ctx->fd, offset);
}
-int
-lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t vector)
+static int validate_irq_subindex(lm_ctx_t *lm_ctx, uint32_t subindex)
{
- eventfd_t val = 1;
- if ((lm_ctx == NULL) || (vector >= lm_ctx->irqs.max_ivs)) {
- lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", vector,
+ if ((lm_ctx == NULL) || (subindex >= lm_ctx->irqs.max_ivs)) {
+ lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", subindex,
lm_ctx->irqs.max_ivs);
+ /* FIXME should return -errno */
errno = EINVAL;
return -1;
}
- if (lm_ctx->irqs.efds[vector] == -1) {
- lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", vector);
+ return 0;
+}
+
+int
+lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex)
+{
+ int ret;
+ eventfd_t val = 1;
+
+ ret = validate_irq_subindex(lm_ctx, subindex);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (lm_ctx->irqs.efds[subindex] == -1) {
+ lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", subindex);
+ /* FIXME should return -errno */
errno = ENOENT;
return -1;
}
- if (vector == LM_DEV_INTX_IRQ_IDX && !lm_ctx->pci_config_space->hdr.cmd.id) {
- lm_log(lm_ctx, LM_ERR, "failed to trigger INTx IRQ, INTx disabled\n");
- errno = EINVAL;
+ return eventfd_write(lm_ctx->irqs.efds[subindex], val);
+}
+
+int
+lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex)
+{
+ int ret, msg_id = 1;
+ struct vfio_user_irq_info irq_info;
+
+ ret = validate_irq_subindex(lm_ctx, subindex);
+ if (ret < 0) {
return -1;
- } else if (vector == LM_DEV_MSIX_IRQ_IDX) {
- /*
- * FIXME must check that MSI-X capability exists during creation time
- * FIXME need to check that MSI-X is enabled and that it's not masked.
- * Currently that's not possible because libmuser doesn't care about
- * the internals of a capability.
- */
}
- return eventfd_write(lm_ctx->irqs.efds[vector], val);
+ irq_info.subindex = subindex;
+ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id,
+ VFIO_USER_VM_INTERRUPT,
+ &irq_info, sizeof irq_info,
+ NULL, 0, NULL, NULL, 0);
+ if (ret < 0) {
+ /* FIXME should return -errno */
+ errno = -ret;
+ return -1;
+ }
+
+ return 0;
}
static void
@@ -1081,16 +2345,50 @@ free_sparse_mmap_areas(lm_reg_info_t *reg_info)
void
lm_ctx_destroy(lm_ctx_t *lm_ctx)
{
+ int ret;
+
if (lm_ctx == NULL) {
return;
}
+ free(lm_ctx->uuid);
+
+ /*
+ * FIXME The following cleanup can be dangerous depending on how lm_ctx_destroy
+ * is called since it might delete files it did not create. Improve by
+     * acquiring a lock on the directory (see the sketch after this function).
+ */
+
+ if (lm_ctx->iommu_dir_fd != -1) {
+ if ((ret = unlinkat(lm_ctx->iommu_dir_fd, IOMMU_GRP_NAME, 0)) == -1
+ && errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove " IOMMU_GRP_NAME ": "
+ "%m\n");
+ }
+ if ((ret = unlinkat(lm_ctx->iommu_dir_fd, MUSER_SOCK, 0)) == -1 &&
+ errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove " MUSER_SOCK ": %m\n");
+ }
+ if (close(lm_ctx->iommu_dir_fd) == -1) {
+ lm_log(lm_ctx, LM_DBG, "failed to close IOMMU dir fd %d: %m\n",
+ lm_ctx->iommu_dir_fd);
+ }
+ }
+ if (lm_ctx->iommu_dir != NULL) {
+ if ((ret = rmdir(lm_ctx->iommu_dir)) == -1 && errno != ENOENT) {
+ lm_log(lm_ctx, LM_DBG, "failed to remove %s: %m\n",
+ lm_ctx->iommu_dir);
+ }
+ free(lm_ctx->iommu_dir);
+ }
+
free(lm_ctx->pci_config_space);
- dev_detach(lm_ctx->fd);
+ transports_ops[lm_ctx->trans].detach(lm_ctx);
if (lm_ctx->dma != NULL) {
- dma_controller_destroy(lm_ctx, lm_ctx->dma);
+ dma_controller_destroy(lm_ctx->dma);
}
free_sparse_mmap_areas(lm_ctx->pci_info.reg_info);
+ free(lm_ctx->caps);
free(lm_ctx);
// FIXME: Maybe close any open irq efds? Unmap stuff?
}
@@ -1125,6 +2423,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
{
lm_reg_info_t *cfg_reg;
const lm_reg_info_t zero_reg = { 0 };
+ lm_reg_info_t *migr_reg;
int i;
assert(lm_ctx != NULL);
@@ -1171,7 +2470,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
// Initialise capabilities.
if (dev_info->nr_caps > 0) {
- lm_ctx->caps = caps_create(dev_info->caps, dev_info->nr_caps);
+ lm_ctx->caps = caps_create(lm_ctx, dev_info->caps, dev_info->nr_caps);
if (lm_ctx->caps == NULL) {
lm_log(lm_ctx, LM_ERR, "failed to create PCI capabilities: %m\n");
goto err;
@@ -1181,6 +2480,28 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info)
lm_ctx->pci_config_space->hdr.cap = PCI_STD_HEADER_SIZEOF;
}
+ /*
+ * Check the migration region.
+ */
+ migr_reg = &lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX];
+ if (migr_reg->size > 0) {
+ if (migr_reg->size < sizeof(struct vfio_device_migration_info)) {
+ return -EINVAL;
+ }
+
+ /* FIXME this should be done in lm_ctx_run or poll */
+ lm_ctx->migration.info.device_state = VFIO_DEVICE_STATE_RUNNING;
+
+ lm_ctx->migration.callbacks = dev_info->migration_callbacks;
+ if (lm_ctx->migration.callbacks.transition == NULL ||
+ lm_ctx->migration.callbacks.get_pending_bytes == NULL ||
+ lm_ctx->migration.callbacks.prepare_data == NULL ||
+ lm_ctx->migration.callbacks.read_data == NULL ||
+ lm_ctx->migration.callbacks.write_data == NULL) {
+ return -EINVAL;
+ }
+ }
+
return 0;
err:
@@ -1212,6 +2533,18 @@ pci_info_bounce(lm_pci_info_t *dst, const lm_pci_info_t *src)
dst->cc = src->cc;
}
+int
+lm_ctx_try_attach(lm_ctx_t *lm_ctx)
+{
+ assert(lm_ctx != NULL);
+
+ if ((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0) {
+ errno = EINVAL;
+ return -1;
+ }
+ return transports_ops[lm_ctx->trans].attach(lm_ctx);
+}
+
lm_ctx_t *
lm_ctx_create(const lm_dev_info_t *dev_info)
{
@@ -1226,6 +2559,11 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
return NULL;
}
+ if (dev_info->trans != LM_TRANS_SOCK) {
+ errno = EINVAL;
+ return NULL;
+ }
+
/*
* FIXME need to check that the number of MSI and MSI-X IRQs are valid
* (1, 2, 4, 8, 16 or 32 for MSI and up to 2048 for MSI-X).
@@ -1244,6 +2582,9 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
if (lm_ctx == NULL) {
return NULL;
}
+ lm_ctx->trans = dev_info->trans;
+
+ lm_ctx->iommu_dir_fd = -1;
// Set context irq information.
for (i = 0; i < max_ivs; i++) {
@@ -1259,10 +2600,26 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
lm_ctx->log = dev_info->log;
lm_ctx->log_lvl = dev_info->log_lvl;
lm_ctx->reset = dev_info->reset;
+ lm_ctx->flags = dev_info->flags;
+
+ lm_ctx->uuid = strdup(dev_info->uuid);
+ if (lm_ctx->uuid == NULL) {
+ err = errno;
+ goto out;
+ }
// Bounce the provided pci_info into the context.
pci_info_bounce(&lm_ctx->pci_info, &dev_info->pci_info);
+ /*
+ * FIXME above memcpy also copies reg_info->mmap_areas. If pci_config_setup
+ * fails then we try to free reg_info->mmap_areas, which is wrong because
+ * this is a user pointer.
+ */
+ for (i = 0; i < ARRAY_SIZE(lm_ctx->pci_info.reg_info); i++) {
+ lm_ctx->pci_info.reg_info[i].mmap_areas = NULL;
+ }
+
// Setup the PCI config space for this context.
err = pci_config_setup(lm_ctx, dev_info);
if (err != 0) {
@@ -1276,65 +2633,53 @@ lm_ctx_create(const lm_dev_info_t *dev_info)
goto out;
}
- // Attach to the muser control device.
- lm_ctx->fd = dev_attach(dev_info->uuid);
- if (lm_ctx->fd == -1) {
- err = errno;
- goto out;
+ if (transports_ops[dev_info->trans].init != NULL) {
+ err = transports_ops[dev_info->trans].init(lm_ctx);
+ if (err < 0) {
+ goto out;
+ }
+ lm_ctx->fd = err;
+ }
+ err = 0;
+
+    // Attach to the transport. With LM_FLAG_ATTACH_NB the caller is expected
+    // to call lm_ctx_try_attach() later.
+ if ((dev_info->flags & LM_FLAG_ATTACH_NB) == 0) {
+ lm_ctx->conn_fd = transports_ops[dev_info->trans].attach(lm_ctx);
+ if (lm_ctx->conn_fd < 0) {
+ err = lm_ctx->conn_fd;
+            if (err != -EINTR) {
+ lm_log(lm_ctx, LM_ERR, "failed to attach: %s",
+ strerror(-err));
+ }
+ goto out;
+ }
}
+ lm_ctx->map_dma = dev_info->map_dma;
+ lm_ctx->unmap_dma = dev_info->unmap_dma;
+
// Create the internal DMA controller.
- lm_ctx->dma = dma_controller_create(LM_DMA_REGIONS);
- if (lm_ctx->dma == NULL) {
- err = errno;
- goto out;
+ if (lm_ctx->unmap_dma != NULL) {
+ lm_ctx->dma = dma_controller_create(lm_ctx, LM_DMA_REGIONS);
+ if (lm_ctx->dma == NULL) {
+ err = errno;
+ goto out;
+ }
}
out:
- if (err) {
- if (lm_ctx) {
- dma_controller_destroy(lm_ctx, lm_ctx->dma);
- dev_detach(lm_ctx->fd);
- free_sparse_mmap_areas(lm_ctx->pci_info.reg_info);
- free(lm_ctx->pci_config_space);
- free(lm_ctx);
+ if (err != 0) {
+ if (lm_ctx != NULL) {
+ lm_ctx_destroy(lm_ctx);
lm_ctx = NULL;
}
- errno = err;
+ errno = -err;
}
return lm_ctx;
}
-#ifdef DEBUG
-static void
-dump_buffer(lm_ctx_t *lm_ctx, const char *prefix,
- const char *buf, uint32_t count)
-{
- int i;
- const size_t bytes_per_line = 0x8;
-
- if (strcmp(prefix, "")) {
- lm_log(lm_ctx, LM_DBG, "%s\n", prefix);
- }
- for (i = 0; i < (int)count; i++) {
- if (i % bytes_per_line != 0) {
- lm_log(lm_ctx, LM_DBG, " ");
- }
- /* TODO valgrind emits a warning if count is 1 */
- lm_log(lm_ctx, LM_DBG, "0x%02x", *(buf + i));
- if ((i + 1) % bytes_per_line == 0) {
- lm_log(lm_ctx, LM_DBG, "\n");
- }
- }
- if (i % bytes_per_line != 0) {
- lm_log(lm_ctx, LM_DBG, "\n");
- }
-}
-#else
-#define dump_buffer(lm_ctx, prefix, buf, count)
-#endif
-
/*
* Returns a pointer to the standard part of the PCI configuration space.
*/
@@ -1364,21 +2709,34 @@ lm_get_region_info(lm_ctx_t *lm_ctx)
inline int
lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr,
- uint32_t len, dma_sg_t *sg, int max_sg)
+ uint32_t len, dma_sg_t *sg, int max_sg, int prot)
{
- return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg);
+ assert(lm_ctx != NULL);
+
+ if (unlikely(lm_ctx->unmap_dma == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
+ return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg, prot);
}
inline int
lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg,
struct iovec *iov, int cnt)
{
+ if (unlikely(lm_ctx->unmap_dma == NULL)) {
+ errno = EINVAL;
+ return -1;
+ }
return dma_map_sg(lm_ctx->dma, sg, iov, cnt);
}
inline void
lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, struct iovec *iov, int cnt)
{
+ if (unlikely(lm_ctx->unmap_dma == NULL)) {
+ return;
+ }
return dma_unmap_sg(lm_ctx->dma, sg, iov, cnt);
}
@@ -1396,4 +2754,66 @@ lm_ctx_run(lm_dev_info_t *dev_info)
return ret;
}
+uint8_t *
+lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id)
+{
+ assert(lm_ctx != NULL);
+
+ return cap_find_by_id(lm_ctx, id);
+}
+
+int
+lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data)
+{
+ struct vfio_user_dma_region_access *dma_recv;
+ struct vfio_user_dma_region_access dma_send;
+ int recv_size;
+ int msg_id = 1, ret;
+
+ assert(lm_ctx != NULL);
+ assert(sg != NULL);
+
+ recv_size = sizeof(*dma_recv) + sg->length;
+
+ dma_recv = calloc(recv_size, 1);
+ if (dma_recv == NULL) {
+ return -ENOMEM;
+ }
+
+ dma_send.addr = sg->dma_addr;
+ dma_send.count = sg->length;
+ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_READ,
+ &dma_send, sizeof dma_send, NULL, 0, NULL,
+ dma_recv, recv_size);
+ memcpy(data, dma_recv->data, sg->length); /* FIXME no need for memcpy */
+ free(dma_recv);
+
+ return ret;
+}
+
+int
+lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data)
+{
+ struct vfio_user_dma_region_access *dma_send, dma_recv;
+ int send_size = sizeof(*dma_send) + sg->length;
+ int msg_id = 1, ret;
+
+ assert(lm_ctx != NULL);
+ assert(sg != NULL);
+
+ dma_send = calloc(send_size, 1);
+ if (dma_send == NULL) {
+ return -ENOMEM;
+ }
+ dma_send->addr = sg->dma_addr;
+ dma_send->count = sg->length;
+ memcpy(dma_send->data, data, sg->length); /* FIXME no need to copy! */
+ ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_WRITE,
+ dma_send, send_size,
+ NULL, 0, NULL, &dma_recv, sizeof(dma_recv));
+ free(dma_send);
+
+ return ret;
+}
+
/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
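[Editor's note, not part of the patch] The non-blocking attach path added above (LM_FLAG_ATTACH_NB together with lm_ctx_try_attach() and lm_ctx_poll()) is easiest to follow from the caller's side. The sketch below is illustrative only: the helper name is made up, it assumes the lm_dev_info_t has already been populated by device-specific code, and it assumes the transport reports "no client connected yet" as EAGAIN, none of which is guaranteed by the code in this patch.

    #include <errno.h>
    #include <unistd.h>
    #include "muser.h"

    static int serve_nonblocking(lm_dev_info_t *dev_info)
    {
        lm_ctx_t *lm_ctx;
        int ret;

        dev_info->trans = LM_TRANS_SOCK;       /* the only transport accepted */
        dev_info->flags |= LM_FLAG_ATTACH_NB;  /* don't block in lm_ctx_create */

        lm_ctx = lm_ctx_create(dev_info);
        if (lm_ctx == NULL) {
            return -errno;
        }

        /* wait for a client (e.g. a VMM) to connect to the socket */
        while ((ret = lm_ctx_try_attach(lm_ctx)) < 0) {
            if (ret != -EAGAIN && errno != EAGAIN) {   /* assumed convention */
                lm_ctx_destroy(lm_ctx);
                return ret;
            }
            usleep(1000);                      /* crude back-off, illustrative */
        }

        /* process one request per call; a real device would multiplex this
         * with its own event loop */
        do {
            ret = lm_ctx_poll(lm_ctx);
        } while (ret == 0);

        lm_ctx_destroy(lm_ctx);
        return ret;
    }
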
diff --git a/lib/muser_pci.c b/lib/muser_pci.c
index 36692ab..2846301 100644
--- a/lib/muser_pci.c
+++ b/lib/muser_pci.c
@@ -52,7 +52,7 @@ muser_pci_hdr_write_bar(lm_ctx_t *lm_ctx, uint16_t bar_index, const char *buf)
lm_reg_info_t *reg_info = lm_get_region_info(lm_ctx);
lm_pci_hdr_t *hdr;
- assert(lm_ctx);
+ assert(lm_ctx != NULL);
if (reg_info[bar_index].size == 0) {
return;
@@ -86,15 +86,15 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci,
{
uint16_t v;
- assert(ctx);
+ assert(ctx != NULL);
if (count != 2) {
lm_log(ctx, LM_ERR, "bad write command size %d\n", count);
return -EINVAL;
}
- assert(pci);
- assert(buf);
+ assert(pci != NULL);
+ assert(buf != NULL);
v = *(uint16_t*)buf;
@@ -153,17 +153,35 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci,
if ((v & PCI_COMMAND_INTX_DISABLE) == PCI_COMMAND_INTX_DISABLE) {
if (!pci->hdr.cmd.id) {
pci->hdr.cmd.id = 0x1;
- lm_log(ctx, LM_INF, "INTx emulation enabled\n");
+ lm_log(ctx, LM_INF, "INTx emulation disabled\n");
}
v &= ~PCI_COMMAND_INTX_DISABLE;
} else {
if (pci->hdr.cmd.id) {
pci->hdr.cmd.id = 0x0;
- lm_log(ctx, LM_INF, "INTx emulation disabled\n");
+ lm_log(ctx, LM_INF, "INTx emulation enabled\n");
}
}
- if (v) {
+ if ((v & PCI_COMMAND_INVALIDATE) == PCI_COMMAND_INVALIDATE) {
+ if (!pci->hdr.cmd.mwie) {
+ pci->hdr.cmd.mwie = 1U;
+ lm_log(ctx, LM_INF, "memory write and invalidate enabled\n");
+ }
+ v &= ~PCI_COMMAND_INVALIDATE;
+ } else {
+ if (pci->hdr.cmd.mwie) {
+ pci->hdr.cmd.mwie = 0;
+ lm_log(ctx, LM_INF, "memory write and invalidate disabled");
+ }
+ }
+
+ if ((v & PCI_COMMAND_VGA_PALETTE) == PCI_COMMAND_VGA_PALETTE) {
+ lm_log(ctx, LM_INF, "enabling VGA palette snooping ignored\n");
+ v &= ~PCI_COMMAND_VGA_PALETTE;
+ }
+
+ if (v != 0) {
lm_log(ctx, LM_ERR, "unconsumed command flags %x\n", v);
return -EINVAL;
}
@@ -177,8 +195,8 @@ handle_erom_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci,
{
uint32_t v;
- assert(ctx);
- assert(pci);
+ assert(ctx != NULL);
+ assert(pci != NULL);
if (count != 0x4) {
lm_log(ctx, LM_ERR, "bad EROM count %d\n", count);
@@ -207,8 +225,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset,
lm_pci_config_space_t *pci;
int ret = 0;
- assert(lm_ctx);
- assert(buf);
+ assert(lm_ctx != NULL);
+ assert(buf != NULL);
pci = lm_get_pci_config_space(lm_ctx);
@@ -248,8 +266,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset,
ret = -EINVAL;
}
-#ifndef LM_TERSE_LOGGING
- dump_buffer(lm_ctx, "PCI header", pci->hdr.raw, 0xff);
+#ifdef LM_VERBOSE_LOGGING
+ dump_buffer("PCI header", (char*)pci->hdr.raw, 0xff);
#endif
return ret;
@@ -263,18 +281,18 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset,
* @count: output parameter that receives the number of bytes read/written
*/
static inline int
-muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool is_write,
+muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool is_write,
char *buf)
{
- size_t _count;
+ uint32_t _count;
loff_t _pos;
int err = 0;
- assert(lm_ctx);
- assert(count);
- assert(pos);
- assert(buf);
+ assert(lm_ctx != NULL);
+ assert(count != NULL);
+ assert(pos != NULL);
+ assert(buf != NULL);
_pos = *pos - region_to_offset(LM_DEV_CFG_REG_IDX);
_count = MIN(*count, PCI_STD_HEADER_SIZEOF - _pos);
@@ -290,20 +308,21 @@ muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
}
static inline bool
-muser_is_pci_hdr_access(loff_t pos)
+muser_is_pci_hdr_access(uint64_t pos)
{
- const off_t off = (loff_t) region_to_offset(LM_DEV_CFG_REG_IDX);
- return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF;
+ const uint64_t off = region_to_offset(LM_DEV_CFG_REG_IDX);
+ return pos >= off && pos - off < PCI_STD_HEADER_SIZEOF;
}
+/* FIXME this function is misleading, remove it */
int
-muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool is_write,
+muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool is_write,
char *buf)
{
- assert(lm_ctx);
- assert(count);
- assert(pos);
+ assert(lm_ctx != NULL);
+ assert(count != NULL);
+ assert(pos != NULL);
if (!muser_is_pci_hdr_access(*pos)) {
return 0;
diff --git a/lib/muser_priv.h b/lib/muser_priv.h
index aa29f5a..097874a 100644
--- a/lib/muser_priv.h
+++ b/lib/muser_priv.h
@@ -35,9 +35,11 @@
#include "muser.h"
+extern char *irq_to_str[];
+
int
-muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count,
- loff_t *pos, bool write, char *buf);
+muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count,
+ uint64_t *pos, bool write, char *buf);
lm_reg_info_t *
lm_get_region_info(lm_ctx_t *lm_ctx);
@@ -45,4 +47,111 @@ lm_get_region_info(lm_ctx_t *lm_ctx);
uint64_t
region_to_offset(uint32_t region);
+int
+_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *fds, int count);
+
+int
+send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ void *data, size_t data_len,
+ int *fds, size_t count);
+
+
+int
+recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
+ uint16_t *msg_id, void *data, size_t *len);
+
+int
+send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply,
+ char *caps);
+
+int
+recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply,
+ int *max_fds, size_t *pgsize);
+
+int
+_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len);
+
+int
+send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ void *send_data, size_t send_len,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len);
+
+/* FIXME copied from include/linux/stddef.h, is this OK license-wise? */
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
+
+static inline ssize_t get_minsz(unsigned int cmd)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ return offsetofend(struct vfio_device_info, num_irqs);
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return offsetofend(struct vfio_region_info, offset);
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ return offsetofend(struct vfio_irq_info, count);
+ case VFIO_DEVICE_SET_IRQS:
+ return offsetofend(struct vfio_irq_set, count);
+ case VFIO_GROUP_GET_STATUS:
+ return offsetofend(struct vfio_group_status, flags);
+ case VFIO_GET_API_VERSION:
+ return 0;
+ case VFIO_CHECK_EXTENSION:
+ case VFIO_GROUP_SET_CONTAINER:
+ case VFIO_GROUP_UNSET_CONTAINER:
+ case VFIO_SET_IOMMU:
+ return sizeof(int);
+ case VFIO_IOMMU_GET_INFO:
+ return offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+ case VFIO_IOMMU_MAP_DMA:
+ return offsetofend(struct vfio_iommu_type1_dma_map, size);
+ case VFIO_IOMMU_UNMAP_DMA:
+ return offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+ case VFIO_GROUP_GET_DEVICE_FD:
+ case VFIO_DEVICE_RESET:
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
+static inline const char* vfio_cmd_to_str(int cmd) {
+ switch (cmd) {
+ case VFIO_GET_API_VERSION: return "VFIO_GET_API_VERSION";
+ case VFIO_CHECK_EXTENSION: return "VFIO_CHECK_EXTENSION";
+ case VFIO_SET_IOMMU: return "VFIO_SET_IOMMU";
+ case VFIO_GROUP_GET_STATUS: return "VFIO_GROUP_GET_STATUS";
+ case VFIO_GROUP_SET_CONTAINER: return "VFIO_GROUP_SET_CONTAINER";
+ case VFIO_GROUP_UNSET_CONTAINER: return "VFIO_GROUP_UNSET_CONTAINER";
+ case VFIO_GROUP_GET_DEVICE_FD: return "VFIO_GROUP_GET_DEVICE_FD";
+ case VFIO_DEVICE_GET_INFO: return "VFIO_DEVICE_GET_INFO";
+ case VFIO_DEVICE_GET_REGION_INFO: return "VFIO_DEVICE_GET_REGION_INFO";
+ case VFIO_DEVICE_GET_IRQ_INFO: return "VFIO_DEVICE_GET_IRQ_INFO";
+ case VFIO_DEVICE_SET_IRQS: return "VFIO_DEVICE_SET_IRQS";
+ case VFIO_DEVICE_RESET: return "VFIO_DEVICE_RESET";
+ case VFIO_IOMMU_GET_INFO: return "VFIO_IOMMU_GET_INFO/VFIO_DEVICE_GET_PCI_HOT_RESET_INFO/VFIO_IOMMU_SPAPR_TCE_GET_INFO";
+ case VFIO_IOMMU_MAP_DMA: return "VFIO_IOMMU_MAP_DMA/VFIO_DEVICE_PCI_HOT_RESET";
+ case VFIO_IOMMU_UNMAP_DMA: return "VFIO_IOMMU_UNMAP_DMA";
+ case VFIO_IOMMU_ENABLE: return "VFIO_IOMMU_ENABLE";
+ case VFIO_IOMMU_DISABLE: return "VFIO_IOMMU_DISABLE";
+ case VFIO_EEH_PE_OP: return "VFIO_EEH_PE_OP";
+ case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_REGISTER_MEMORY";
+ case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY";
+ case VFIO_IOMMU_SPAPR_TCE_CREATE: return "VFIO_IOMMU_SPAPR_TCE_CREATE";
+ case VFIO_IOMMU_SPAPR_TCE_REMOVE: return "VFIO_IOMMU_SPAPR_TCE_REMOVE";
+ }
+ return NULL;
+}
+
#endif /* MUSER_PRIV_H */
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
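[Editor's note, not part of the patch] The helpers declared above pair each vfio-user command with its reply. Below is a rough sketch of how a peer might issue a VFIO_USER_DEVICE_RESET with no payload, using only the prototypes shown here; the function name is made up, and whether the helper already folds the reply's error_no into its return value is an assumption, so the final check may be redundant.

    #include "vfio_user.h"
    #include "muser_priv.h"

    /* Illustrative only: request a device reset over an established
     * vfio-user connection and wait for the reply. msg_id handling is
     * simplified. */
    static int request_device_reset(int sock, uint16_t msg_id)
    {
        struct vfio_user_header hdr;
        int ret;

        ret = send_recv_vfio_user_msg(sock, msg_id, VFIO_USER_DEVICE_RESET,
                                      NULL, 0,   /* no command payload */
                                      NULL, 0,   /* no file descriptors */
                                      &hdr,      /* reply header */
                                      NULL, 0);  /* no reply payload expected */
        if (ret < 0) {
            return ret;              /* -errno, like the other helpers */
        }
        /* assumption: the reply header's error flag may still need checking
         * even when the helper itself returns success */
        return hdr.flags.error ? -(int)hdr.error_no : 0;
    }
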
diff --git a/lib/vfio_user.h b/lib/vfio_user.h
new file mode 100644
index 0000000..19f751a
--- /dev/null
+++ b/lib/vfio_user.h
@@ -0,0 +1,167 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VFIO_USER_H
+#define _VFIO_USER_H
+
+#include <inttypes.h>
+#include <linux/vfio.h>
+#include <linux/version.h>
+
+enum vfio_user_command {
+ VFIO_USER_VERSION = 1,
+ VFIO_USER_DMA_MAP = 2,
+ VFIO_USER_DMA_UNMAP = 3,
+ VFIO_USER_DEVICE_GET_INFO = 4,
+ VFIO_USER_DEVICE_GET_REGION_INFO = 5,
+ VFIO_USER_DEVICE_GET_IRQ_INFO = 6,
+ VFIO_USER_DEVICE_SET_IRQS = 7,
+ VFIO_USER_REGION_READ = 8,
+ VFIO_USER_REGION_WRITE = 9,
+ VFIO_USER_DMA_READ = 10,
+ VFIO_USER_DMA_WRITE = 11,
+ VFIO_USER_VM_INTERRUPT = 12,
+ VFIO_USER_DEVICE_RESET = 13,
+ VFIO_USER_DIRTY_PAGES = 14,
+ VFIO_USER_MAX,
+};
+
+enum vfio_user_message_type {
+ VFIO_USER_MESSAGE_COMMAND = 0,
+ VFIO_USER_MESSAGE_REPLY = 1,
+};
+
+#define VFIO_USER_FLAGS_NO_REPLY (0x1)
+
+struct vfio_user_header {
+ uint16_t msg_id;
+ uint16_t cmd;
+ uint32_t msg_size;
+ struct {
+ uint32_t type : 4;
+#define VFIO_USER_F_TYPE_COMMAND 0
+#define VFIO_USER_F_TYPE_REPLY 1
+ uint32_t no_reply : 1;
+ uint32_t error : 1;
+ uint32_t resvd : 26;
+ } flags;
+ uint32_t error_no;
+} __attribute__((packed));
+
+struct vfio_user_dma_region {
+ uint64_t addr;
+ uint64_t size;
+ uint64_t offset;
+ uint32_t prot;
+ uint32_t flags;
+#define VFIO_USER_F_DMA_REGION_MAPPABLE (0x0)
+} __attribute__((packed));
+
+struct vfio_user_region_access {
+ uint64_t offset;
+ uint32_t region;
+ uint32_t count;
+ uint8_t data[];
+} __attribute__((packed));
+
+struct vfio_user_dma_region_access {
+ uint64_t addr;
+ uint32_t count;
+ uint8_t data[];
+} __attribute__((packed));
+
+struct vfio_user_irq_info {
+ uint32_t subindex;
+} __attribute__((packed));
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0)
+
+/* copied from <linux/vfio.h> */
+
+#define VFIO_REGION_TYPE_MIGRATION (3)
+#define VFIO_REGION_SUBTYPE_MIGRATION (1)
+
+struct vfio_device_migration_info {
+ __u32 device_state; /* VFIO device state */
+#define VFIO_DEVICE_STATE_STOP (0)
+#define VFIO_DEVICE_STATE_RUNNING (1 << 0)
+#define VFIO_DEVICE_STATE_SAVING (1 << 1)
+#define VFIO_DEVICE_STATE_RESUMING (1 << 2)
+#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
+ VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING)
+
+#define VFIO_DEVICE_STATE_VALID(state) \
+ (state & VFIO_DEVICE_STATE_RESUMING ? \
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING))
+
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING)
+
+ __u32 reserved;
+ __u64 pending_bytes;
+ __u64 data_offset;
+ __u64 data_size;
+};
+
+struct vfio_bitmap {
+ __u64 pgsize; /* page size for bitmap in bytes */
+ __u64 size; /* in bytes */
+ __u64 *data; /* one bit per page */
+};
+
+struct vfio_iommu_type1_dirty_bitmap {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+ __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+ __u64 iova; /* IO virtual address */
+ __u64 size; /* Size of iova range */
+ struct vfio_bitmap bitmap;
+};
+
+#endif
+
+#endif
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
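
[Editor's note, not part of the patch] To make the wire format concrete, here is a rough sketch of packing a VFIO_USER_REGION_READ command from the structures above. The helper name is made up, and whether msg_size covers the header as well as the payload, and how msg_ids are allocated, are assumptions of the sketch rather than anything this header specifies.

    #include <stdlib.h>
    #include "vfio_user.h"

    /* Illustrative only: build a VFIO_USER_REGION_READ command asking for
     * `count' bytes at `offset' within `region'.  The reply is expected to
     * carry the same structure with data[] filled in. */
    static void *pack_region_read(uint16_t msg_id, uint32_t region,
                                  uint64_t offset, uint32_t count, size_t *len)
    {
        struct vfio_user_header *hdr;
        struct vfio_user_region_access *acc;
        void *buf;

        *len = sizeof(*hdr) + sizeof(*acc);
        buf = calloc(1, *len);
        if (buf == NULL) {
            return NULL;
        }

        hdr = buf;
        hdr->msg_id = msg_id;
        hdr->cmd = VFIO_USER_REGION_READ;
        hdr->msg_size = (uint32_t)*len;     /* assumed to include the header */
        hdr->flags.type = VFIO_USER_F_TYPE_COMMAND;

        acc = (struct vfio_user_region_access *)(hdr + 1);
        acc->offset = offset;
        acc->region = region;
        acc->count = count;

        return buf;
    }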