author     Thanos Makatos <thanos.makatos@nutanix.com>    2020-11-11 07:35:10 -0500
committer  Thanos Makatos <thanos.makatos@nutanix.com>    2020-11-11 07:35:10 -0500
commit     b6e8c7b39456e7c0a164f87c48eac2bbfd6d85f3 (patch)
tree       c94839c02cde83bca416221bd906e4952fbc8c53 /lib
parent     b9a2e75360e14e59db651d6081894e0cf20e7c2d (diff)
parent     985940e6539eaf8f41e0b6421938b5bf5c1db22c (diff)
Merge branch 'vfio-user'
Diffstat (limited to 'lib')
-rw-r--r--  lib/CMakeLists.txt     4
-rw-r--r--  lib/cap.c            444
-rw-r--r--  lib/cap.h              9
-rw-r--r--  lib/caps/common.h     46
-rw-r--r--  lib/caps/msi.h         9
-rw-r--r--  lib/caps/msix.h        9
-rw-r--r--  lib/caps/pm.h         15
-rw-r--r--  lib/caps/px.h          9
-rw-r--r--  lib/common.h           8
-rw-r--r--  lib/dma.c            248
-rw-r--r--  lib/dma.h            137
-rw-r--r--  lib/muser.h          232
-rw-r--r--  lib/muser_ctx.c     2242
-rw-r--r--  lib/muser_pci.c       75
-rw-r--r--  lib/muser_priv.h     113
-rw-r--r--  lib/vfio_user.h      167
16 files changed, 3093 insertions, 674 deletions
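The most user-visible change in this merge is the capability API: lm_cap_t stops being an id/size/callback triple and becomes a union of the concrete capability structs (struct pmcap, struct msicap, struct msixcap, struct pxcap), which caps_create() now copies into configuration space itself, chaining the next pointers and dispatching writes to built-in handlers such as handle_pm_write(). A minimal sketch of declaring a Power Management capability under the new scheme; the field values are illustrative, not taken from this commit:

    #include "muser.h"

    /* caps_create() fills in hdr.next when it links the capability list;
     * PCI_CAP_ID_PM comes from <linux/pci_regs.h> */
    static struct pmcap pm = {
        .hdr = { .id = PCI_CAP_ID_PM },
        .pc  = { .vs = 3 },            /* illustrative PM spec version */
    };

    static lm_cap_t *caps[] = { (lm_cap_t *)&pm };

    static void set_caps(lm_dev_info_t *dev_info)
    {
        dev_info->caps = caps;         /* was: array of {id, size, fn} entries */
        dev_info->nr_caps = 1;
    }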
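The dirty-page tracking added to lib/dma.c sizes each region's bitmap at one bit per page, rounding both divisions up. The arithmetic from _get_bitmap_size() is small enough to recompute standalone; a sketch with illustrative numbers:

    #include <limits.h>
    #include <stdio.h>
    #include <sys/types.h>

    /* same rounding as _get_bitmap_size() in lib/dma.c */
    static ssize_t bitmap_bytes(size_t region_size, size_t pgsize)
    {
        if (pgsize == 0 || region_size < pgsize) {
            return -1;
        }
        size_t nr_pages = region_size / pgsize + (region_size % pgsize != 0);
        return nr_pages / CHAR_BIT + (nr_pages % CHAR_BIT != 0);
    }

    int main(void)
    {
        /* a 1 MiB region at 4 KiB granularity: 256 pages -> 32 bitmap bytes */
        printf("%zd\n", bitmap_bytes(1UL << 20, 1UL << 12));
        return 0;
    }

dma_controller_dirty_page_get() hands this bitmap back verbatim, which is why it rejects requests whose size argument differs from the computed bitmap size.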
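The new socket transport also gains a non-blocking mode: with LM_FLAG_ATTACH_NB set in lm_dev_info_t.flags, the library does not attach during context creation, and the application drives attachment and request processing itself via lm_ctx_try_attach() and lm_ctx_poll(). A sketch of that loop, assuming a hypothetical helper create_nonblocking_ctx() that builds the context from an lm_dev_info_t with trans = LM_TRANS_SOCK and that flag set:

    #include <errno.h>
    #include <unistd.h>
    #include "muser.h"

    /* hypothetical helper: creates the context with LM_TRANS_SOCK and
     * LM_FLAG_ATTACH_NB, returning NULL on failure */
    extern lm_ctx_t *create_nonblocking_ctx(void);

    static int serve(void)
    {
        lm_ctx_t *ctx = create_nonblocking_ctx();

        if (ctx == NULL) {
            return -1;
        }

        /* not attached yet: retry until a client connects to the socket */
        while (lm_ctx_try_attach(ctx) == -1) {
            if (errno != EAGAIN && errno != EWOULDBLOCK) {
                return -1;
            }
            usleep(10 * 1000);  /* or wait for readiness in an event loop */
        }

        /* process requests from one of the application's own threads */
        for (;;) {
            int ret = lm_ctx_poll(ctx);
            if (ret < 0) {
                return ret;
            }
        }
    }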
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index e2084fe..bc9e4b8 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -32,14 +32,14 @@ set(CMAKE_C_FLAGS "-Wall -Wextra -Werror -fPIC") set(CMAKE_C_FLAGS_DEBUG "-O0 -ggdb") add_library(muser SHARED - ../kmod/muser.h + vfio_user.h muser.h muser_priv.h common.h) target_link_libraries(muser muser_ctx muser_pci dma cap) set_target_properties(muser PROPERTIES LINKER_LANGUAGE C) -set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h") +set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h;vfio_user.h") set(UT_CFLAGS "-O0 -ggdb --coverage") set(UT_LFLAGS "--coverage") @@ -34,56 +34,60 @@ #include <errno.h> #include <stdlib.h> #include <stdio.h> +#include <stddef.h> +#include <string.h> #include "muser.h" #include "cap.h" struct cap { - uint8_t start; - uint8_t end; - uint8_t id; - lm_cap_access_t *fn; + uint8_t start; + uint8_t end; }; struct caps { - struct cap caps[LM_MAX_CAPS]; - int nr_caps; + struct cap caps[LM_MAX_CAPS]; /* FIXME only needs to be as big as nr_caps */ + unsigned int nr_caps; }; /* * Tells whether a capability is being accessed. */ static bool -cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset) +cap_is_accessed(struct cap *caps, int nr_caps, size_t count, loff_t offset) { - /* - * Ignore if it's at the standard PCI header. The first capability starts - * right after that. - */ - if (offset < PCI_STD_HEADER_SIZEOF) { - return false; - } - - /* ignore if there are no capabilities */ - if (!nr_caps) { + if (nr_caps == 0) { return false; } - assert(caps); + assert(caps != NULL); - /* - * Ignore if it's before the first capability. This check is probably - * redundant since we assume that the first capability starts right after - * the standard PCI header. - * TODO should we check that it doesn't cross into the first capability? - */ if (offset < caps[0].start) { + /* write starts before first capability */ + + if (offset + count <= caps[0].start) { + /* write ends before first capability */ + return false; + } + + /* + * FIXME write starts before capabilities but extends into them. I don't + * think that the while loop in lm_access will allow this in the first + * place. + */ + assert(false); + } else if (offset > caps[nr_caps - 1].end) { + /* write starts after last capability */ return false; } - /* ignore if it's past the last capability */ - if (offset > caps[nr_caps - 1].end) { - return false; + if (offset + count > (size_t)(caps[nr_caps - 1].end + 1)) { + /* + * FIXME write starts within capabilities but extends past them, I think + * that this _is_ possible, e.g. MSI-X is 12 bytes (PCI_CAP_MSIX_SIZEOF) + * and the host writes to first 8 bytes and then writes 8 more. + */ + assert(false); } return true; } @@ -92,151 +96,369 @@ cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset) * Returns the PCI capability that is contained within the specified region * (offset + count). */ -static struct cap * -cap_find(struct cap *caps, int nr_caps, loff_t offset, size_t count) +static uint8_t * +cap_find(lm_pci_config_space_t *config_space, struct caps *caps, loff_t offset, + size_t count) { struct cap *cap; - cap = caps; - while (cap < caps + nr_caps) { + assert(config_space != NULL); + assert(caps != NULL); + + cap = caps->caps; + while (cap < caps->caps + caps->nr_caps) { /* - * TODO this assumes that at most one capability is read. It might be - * legitimate to read an arbitrary number of bytes, which we could - * support. For now lets explicitly fail such cases. 
+ * FIXME ensure that at most one capability is written to. It might + * legitimate to write to two capabilities at the same time. */ - if (offset >= cap->start && offset + count - 1 <= cap->end) { - return cap; + if (offset >= cap->start && offset <= cap->end) { + if (offset + count - 1 > cap->end) { + assert(false); + } + return config_space->raw + cap->start; } cap++; } - /* this means that the access spans more than a capability */ return NULL; } -/* - * Tells whether the header of a PCI capability is accessed. - */ static bool -cap_header_is_accessed(struct cap *cap, loff_t offset) +cap_is_valid(uint8_t id) { - assert(cap); - return offset - cap->start <= 1; + /* TODO 0 is a valid capability ID (Null Capability), check + * https://pcisig.com/sites/default/files/files/PCI_Code-ID_r_1_11__v24_Jan_2019.pdf: + * + */ + return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX; } -/* - * Reads the header of a PCI capability. - */ -static int -cap_header_access(struct caps *caps, struct cap *cap, char *buf, - loff_t offset, size_t count, bool is_write) +uint8_t * +cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id) { - int n; + uint8_t *pos; + lm_pci_config_space_t *config_space; - /* - * We don't allow ID and next to be written. TODO not sure what the PCI - * spec says about this, need to check. - */ - if (is_write) { - return -EINVAL; + if (!cap_is_valid(id)) { + errno = EINVAL; + return NULL; } - assert(caps); - assert(cap); - n = 0; - /* - * We handle reads to ID and next, the rest is handled by the callback. - */ - if (offset == cap->start && count > 0) { /* ID */ - buf[n++] = cap->id; - offset++; - count--; + config_space = lm_get_pci_config_space(lm_ctx); + + if (config_space->hdr.cap == 0) { + errno = ENOENT; + return NULL; } - if (offset == cap->start + 1 && count > 0) { /* next */ - if ((cap - caps->caps) / sizeof *cap == (size_t)(caps->nr_caps - 1)) { - buf[n++] = 0; - } else { - buf[n++] = (cap + 1)->start; + pos = config_space->raw + config_space->hdr.cap; + while (true) { + if (*(pos + PCI_CAP_LIST_ID) == id) { + return pos; } - - offset++; - count--; + if (*(pos + PCI_CAP_LIST_NEXT) == 0) { + break; + } + pos = config_space->raw + *(pos + PCI_CAP_LIST_NEXT); } - return n; + errno = ENOENT; + return NULL; } +/* + * Tells whether the header of a PCI capability is accessed. 
+ */ +static bool +cap_header_is_accessed(uint8_t cap_offset, loff_t offset) +{ + return offset - cap_offset <= 1; +} + +typedef ssize_t (cap_access) (lm_ctx_t *lm_ctx, uint8_t *cap, char *buf, + size_t count, loff_t offset); + +static ssize_t +handle_pmcs_write(lm_ctx_t *lm_ctx, struct pmcap *pm, + const struct pmcs *const pmcs) +{ + + if (pm->pmcs.ps != pmcs->ps) { + lm_log(lm_ctx, LM_DBG, "power state set to %#x\n", pmcs->ps); + } + if (pm->pmcs.pmee != pmcs->pmee) { + lm_log(lm_ctx, LM_DBG, "PME enable set to %#x\n", pmcs->pmee); + } + if (pm->pmcs.dse != pmcs->dse) { + lm_log(lm_ctx, LM_DBG, "data select set to %#x\n", pmcs->dse); + } + if (pm->pmcs.pmes != pmcs->pmes) { + lm_log(lm_ctx, LM_DBG, "PME status set to %#x\n", pmcs->pmes); + } + pm->pmcs = *pmcs; + return 0; +} + +static ssize_t +handle_pm_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf, + const size_t count, const loff_t offset) +{ + struct pmcap *pm = (struct pmcap *)cap; + + switch (offset) { + case offsetof(struct pmcap, pc): + if (count != sizeof(struct pc)) { + return -EINVAL; + } + assert(false); /* FIXME implement */ + case offsetof(struct pmcap, pmcs): + if (count != sizeof(struct pmcs)) { + return -EINVAL; + } + return handle_pmcs_write(lm_ctx, pm, (struct pmcs *)buf); + } + return -EINVAL; +} + +static ssize_t +handle_mxc_write(lm_ctx_t *lm_ctx, struct msixcap *msix, + const struct mxc *const mxc) +{ + assert(msix != NULL); + assert(mxc != NULL); + + if (mxc->mxe != msix->mxc.mxe) { + lm_log(lm_ctx, LM_DBG, "%s MSI-X\n", mxc->mxe ? "enable" : "disable"); + msix->mxc.mxe = mxc->mxe; + } + + if (mxc->fm != msix->mxc.fm) { + if (mxc->fm) { + lm_log(lm_ctx, LM_DBG, "all MSI-X vectors masked\n"); + } else { + lm_log(lm_ctx, LM_DBG, + "vector's mask bit determines whether vector is masked\n"); + } + msix->mxc.fm = mxc->fm; + } + + return sizeof(struct mxc); +} + +static ssize_t +handle_msix_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf, + const size_t count, const loff_t offset) +{ + struct msixcap *msix = (struct msixcap *)cap; + + if (count == sizeof(struct mxc)) { + switch (offset) { + case offsetof(struct msixcap, mxc): + return handle_mxc_write(lm_ctx, msix, (struct mxc *)buf); + default: + lm_log(lm_ctx, LM_ERR, "invalid MSI-X write offset %ld\n", offset); + return -EINVAL; + } + } + lm_log(lm_ctx, LM_ERR, "invalid MSI-X write size %lu\n", count); + return -EINVAL; +} + +static int +handle_px_pxdc_write(lm_ctx_t *lm_ctx, struct pxcap *px, const union pxdc *const p) +{ + assert(px != NULL); + assert(p != NULL); + + if (p->cere != px->pxdc.cere) { + px->pxdc.cere = p->cere; + lm_log(lm_ctx, LM_DBG, "CERE %s\n", p->cere ? "enable" : "disable"); + } + + if (p->nfere != px->pxdc.nfere) { + px->pxdc.nfere = p->nfere; + lm_log(lm_ctx, LM_DBG, "NFERE %s\n", p->nfere ? "enable" : "disable"); + } + + if (p->fere != px->pxdc.fere) { + px->pxdc.fere = p->fere; + lm_log(lm_ctx, LM_DBG, "FERE %s\n", p->fere ? "enable" : "disable"); + } + + if (p->urre != px->pxdc.urre) { + px->pxdc.urre = p->urre; + lm_log(lm_ctx, LM_DBG, "URRE %s\n", p->urre ? "enable" : "disable"); + } + + if (p->ero != px->pxdc.ero) { + px->pxdc.ero = p->ero; + lm_log(lm_ctx, LM_DBG, "ERO %s\n", p->ero ? "enable" : "disable"); + } + + if (p->mps != px->pxdc.mps) { + px->pxdc.mps = p->mps; + lm_log(lm_ctx, LM_DBG, "MPS set to %d\n", p->mps); + } + + if (p->ete != px->pxdc.ete) { + px->pxdc.ete = p->ete; + lm_log(lm_ctx, LM_DBG, "ETE %s\n", p->ete ? 
"enable" : "disable"); + } + + if (p->pfe != px->pxdc.pfe) { + px->pxdc.pfe = p->pfe; + lm_log(lm_ctx, LM_DBG, "PFE %s\n", p->pfe ? "enable" : "disable"); + } + + if (p->appme != px->pxdc.appme) { + px->pxdc.appme = p->appme; + lm_log(lm_ctx, LM_DBG, "APPME %s\n", p->appme ? "enable" : "disable"); + } + + if (p->ens != px->pxdc.ens) { + px->pxdc.ens = p->ens; + lm_log(lm_ctx, LM_DBG, "ENS %s\n", p->ens ? "enable" : "disable"); + } + + if (p->mrrs != px->pxdc.mrrs) { + px->pxdc.mrrs = p->mrrs; + lm_log(lm_ctx, LM_DBG, "MRRS set to %d\n", p->mrrs); + } + + if (p->iflr) { + lm_log(lm_ctx, LM_DBG, + "initiate function level reset\n"); + } + + return 0; +} + +static int +handle_px_write_2_bytes(lm_ctx_t *lm_ctx, struct pxcap *px, char *const buf, + loff_t off) +{ + switch (off) { + case offsetof(struct pxcap, pxdc): + return handle_px_pxdc_write(lm_ctx, px, (union pxdc *)buf); + } + return -EINVAL; +} + +static ssize_t +handle_px_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf, + size_t count, loff_t offset) +{ + struct pxcap *px = (struct pxcap *)cap; + + int err = -EINVAL; + switch (count) { + case 2: + err = handle_px_write_2_bytes(lm_ctx, px, buf, offset); + break; + } + if (err != 0) { + return err; + } + return count; +} + +static const struct cap_handler { + char *name; + size_t size; + cap_access *fn; +} cap_handlers[PCI_CAP_ID_MAX + 1] = { + [PCI_CAP_ID_PM] = {"PM", PCI_PM_SIZEOF, handle_pm_write}, + [PCI_CAP_ID_EXP] = {"PCI Express", PCI_CAP_EXP_ENDPOINT_SIZEOF_V2, + handle_px_write}, + [PCI_CAP_ID_MSIX] = {"MSI-X", PCI_CAP_MSIX_SIZEOF, handle_msix_write}, +}; + ssize_t -cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count, - loff_t offset, bool is_write) +cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count, + loff_t offset) { - struct cap *cap; + lm_pci_config_space_t *config_space; + uint8_t *cap; - if (!caps) { + if (caps == NULL) { return 0; } - if (!count) { + if (count == 0) { return 0; } - if (!cap_is_accessed(caps->caps, caps->nr_caps, offset)) { + if (!cap_is_accessed(caps->caps, caps->nr_caps, count, offset)) { return 0; } /* we're now guaranteed that the access is within some capability */ - cap = cap_find(caps->caps, caps->nr_caps, offset, count); + config_space = lm_get_pci_config_space(lm_ctx); + cap = cap_find(config_space, caps, offset, count); + assert(cap != NULL); /* FIXME */ - if (!cap) { - return 0; - } - - if (cap_header_is_accessed(cap, offset)) { - return cap_header_access(caps, cap, buf, offset, count, is_write); - } - if (count > 0) { - return cap->fn(pvt, cap->id, buf, count, offset - cap->start, is_write); + if (cap_header_is_accessed(cap - config_space->raw, offset)) { + /* FIXME how to deal with writes to capability header? 
*/ + assert(false); } - return 0; -} - -static bool -cap_is_valid(uint8_t id) -{ - return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX; + return cap_handlers[cap[PCI_CAP_LIST_ID]].fn(lm_ctx, cap, buf, count, + offset - (loff_t)(cap - config_space->raw)); } struct caps * -caps_create(const lm_cap_t *lm_caps, int nr_caps) +caps_create(lm_ctx_t *lm_ctx, lm_cap_t **lm_caps, int nr_caps) { - uint8_t prev_end; int i, err = 0; - struct caps *caps = NULL; + uint8_t *prev; + uint8_t next; + lm_pci_config_space_t *config_space; + struct caps *caps; if (nr_caps <= 0 || nr_caps >= LM_MAX_CAPS) { err = EINVAL; goto out; } - assert(lm_caps); + assert(lm_caps != NULL); caps = calloc(1, sizeof *caps); - if (!caps) { - err = errno; + if (caps == NULL) { goto out; } - prev_end = PCI_STD_HEADER_SIZEOF - 1; + config_space = lm_get_pci_config_space(lm_ctx); + /* points to the next field of the previous capability */ + prev = &config_space->hdr.cap; + + /* relative offset that points where the next capability should be placed */ + next = PCI_STD_HEADER_SIZEOF; + for (i = 0; i < nr_caps; i++) { - if (!cap_is_valid(lm_caps[i].id) || !lm_caps[i].fn || !lm_caps[i].size) { + uint8_t *cap = (uint8_t*)lm_caps[i]; + uint8_t id = cap[PCI_CAP_LIST_ID]; + size_t size; + + if (!cap_is_valid(id)) { + err = EINVAL; + goto out; + } + + size = cap_handlers[id].size; + if (size == 0) { err = EINVAL; goto out; } - caps->caps[i].id = lm_caps[i].id; - caps->caps[i].fn = lm_caps[i].fn; - /* FIXME PCI capabilities must be dword aligned. */ - caps->caps[i].start = prev_end + 1; - caps->caps[i].end = prev_end = caps->caps[i].start + lm_caps[i].size - 1; + caps->caps[i].start = next; + caps->caps[i].end = next + size - 1; + + memcpy(&config_space->hdr.raw[next], cap, size); + *prev = next; + prev = &config_space->hdr.raw[next + PCI_CAP_LIST_NEXT]; + *prev = 0; + next += size; + assert(next % 4 == 0); /* FIXME */ + + lm_log(lm_ctx, LM_DBG, "initialized capability %s %#x-%#x\n", + cap_handlers[id].name, caps->caps[i].start, caps->caps[i].end); } caps->nr_caps = nr_caps; @@ -44,7 +44,7 @@ struct caps; * capabilities have been added. */ struct caps * -caps_create(const lm_cap_t *caps, int nr_caps); +caps_create(lm_ctx_t *lm_ctx, lm_cap_t **caps, int nr_caps); /* * Conditionally accesses the PCI capabilities. Returns: @@ -54,8 +54,11 @@ caps_create(const lm_cap_t *caps, int nr_caps); * <0: negative error code on error. */ ssize_t -cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count, - loff_t offset, bool is_write); +cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count, + loff_t offset); + +uint8_t * +cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id); #endif /* __CAP_H__ */ diff --git a/lib/caps/common.h b/lib/caps/common.h new file mode 100644 index 0000000..2181a3b --- /dev/null +++ b/lib/caps/common.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#ifndef LM_PCI_CAP_COMMON_H +#define LM_PCI_CAP_COMMON_H + +#include <stddef.h> + +struct cap_hdr { + uint8_t id; + uint8_t next; +} __attribute__((packed)); +_Static_assert(sizeof(struct cap_hdr) == 0x2, "bad PCI capability header size"); +_Static_assert(offsetof(struct cap_hdr, id) == PCI_CAP_LIST_ID, "bad offset"); +_Static_assert(offsetof(struct cap_hdr, next) == PCI_CAP_LIST_NEXT, "bad offset"); + +#endif /* LM_PCI_CAP_COMMON_H */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/caps/msi.h b/lib/caps/msi.h index b310ae9..5933006 100644 --- a/lib/caps/msi.h +++ b/lib/caps/msi.h @@ -33,11 +33,7 @@ #ifndef LM_PCI_CAP_MSI_H #define LM_PCI_CAP_MSI_H -struct mid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__ ((packed)); -_Static_assert(sizeof(struct mid) == 0x2, "bad MID size"); +#include "common.h" struct mc { unsigned int msie:1; @@ -56,7 +52,7 @@ struct ma { _Static_assert(sizeof(struct ma) == 0x4, "bad MA size"); struct msicap { - struct mid mid; + struct cap_hdr hdr; struct mc mc; struct ma ma; uint32_t mua; @@ -66,6 +62,7 @@ struct msicap { uint32_t mpend; } __attribute__ ((packed)); _Static_assert(sizeof(struct msicap) == 0x18, "bad MSICAP size"); +_Static_assert(offsetof(struct msicap, hdr) == 0, "bad offset"); #endif /* LM_CAP_MSI_H */ diff --git a/lib/caps/msix.h b/lib/caps/msix.h index b13c1c8..b0bc1a5 100644 --- a/lib/caps/msix.h +++ b/lib/caps/msix.h @@ -35,12 +35,6 @@ #include <linux/pci_regs.h> -struct mxid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__ ((packed)); -_Static_assert(sizeof(struct mxid) == 0x2, "bad MXID size"); - struct mxc { unsigned int ts:11; unsigned int reserved:3; @@ -63,12 +57,13 @@ _Static_assert(sizeof(struct mtab) == PCI_MSIX_PBA - PCI_MSIX_TABLE, "bad MPBA size"); struct msixcap { - struct mxid mxid; + struct cap_hdr hdr; struct mxc mxc; struct mtab mtab; struct mpba mpba; } __attribute__ ((packed)) __attribute__ ((aligned(4))); _Static_assert(sizeof(struct msixcap) == PCI_CAP_MSIX_SIZEOF, "bad MSI-X size"); +_Static_assert(offsetof(struct msixcap, hdr) == 0, "bad offset"); #endif /* LM_CAP_MSIX_H */ diff --git a/lib/caps/pm.h b/lib/caps/pm.h index ddae2e6..e976d95 100644 --- a/lib/caps/pm.h +++ b/lib/caps/pm.h @@ -33,11 +33,7 @@ #ifndef LM_PCI_CAP_PM_H #define LM_PCI_CAP_PM_H -struct pid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__((packed)); -_Static_assert(sizeof(struct pid) == 0x2, "bad PID size"); +#include "common.h" struct pc { unsigned int vs:3; @@ -60,15 +56,16 @@ struct pmcs { unsigned 
int dse:4; unsigned int dsc:2; unsigned int pmes:1; -}; -_Static_assert(sizeof(struct pc) == 0x2, "bad PC size"); +} __attribute__((packed)); +_Static_assert(sizeof(struct pc) == 0x2, "bad PMCS size"); struct pmcap { - struct pid pid; + struct cap_hdr hdr; struct pc pc; struct pmcs pmcs; -} __attribute__((packed)) __attribute__ ((aligned(8))); +} __attribute__((packed)) __attribute__ ((aligned(8))); /* FIXME why does it need to be aligned? */ _Static_assert(sizeof(struct pmcap) == PCI_PM_SIZEOF, "bad PC size"); +_Static_assert(offsetof(struct pmcap, hdr) == 0, "bad offset"); #endif /* LM_CAP_PM_H */ diff --git a/lib/caps/px.h b/lib/caps/px.h index ce17cfe..28a04d5 100644 --- a/lib/caps/px.h +++ b/lib/caps/px.h @@ -33,11 +33,7 @@ #ifndef LM_PCI_CAP_PX_H #define LM_PCI_CAP_PX_H -struct pxid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__((packed)); -_Static_assert(sizeof(struct pxid) == 0x2, "bad PXID size"); +#include "common.h" struct pxcaps { unsigned int ver:4; @@ -133,7 +129,7 @@ _Static_assert(sizeof(struct pxdc2) == 0x2, "bad PXDC2 size"); * the whole struct. */ struct pxcap { - struct pxid pxid; + struct cap_hdr hdr; struct pxcaps pxcaps; struct pxdcap pxdcap; union pxdc pxdc; @@ -147,6 +143,7 @@ struct pxcap { } __attribute__((packed)); _Static_assert(sizeof(struct pxcap) == 0x2a, "bad PCI Express Capability size"); +_Static_assert(offsetof(struct pxcap, hdr) == 0, "bad offset"); #endif /* LM_PCI_CAP_PX_H */ diff --git a/lib/common.h b/lib/common.h index 27d6735..f5de4d8 100644 --- a/lib/common.h +++ b/lib/common.h @@ -45,18 +45,18 @@ #define likely(e) __builtin_expect(!!(e), 1) #define unlikely(e) __builtin_expect(e, 0) +/* XXX NB 2nd argument must be power of two */ #define ROUND_DOWN(x, a) ((x) & ~((a)-1)) #define ROUND_UP(x,a) ROUND_DOWN((x)+(a)-1, a) void lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...); -#ifdef DEBUG +#ifdef LM_VERBOSE_LOGGING void -dump_buffer(lm_ctx_t *lm_ctx, const char *prefix, - const char *buf, uint32_t count); +dump_buffer(const char *prefix, const char *buf, uint32_t count); #else -#define dump_buffer(lm_ctx, prefix, buf, count) +#define dump_buffer(prefix, buf, count) #endif #endif /* __COMMON_H__ */ @@ -66,7 +66,7 @@ fds_are_same_file(int fd1, int fd2) } dma_controller_t * -dma_controller_create(int max_regions) +dma_controller_create(lm_ctx_t *lm_ctx, int max_regions) { dma_controller_t *dma; @@ -77,37 +77,89 @@ dma_controller_create(int max_regions) return dma; } + dma->lm_ctx = lm_ctx; dma->max_regions = max_regions; dma->nregions = 0; memset(dma->regions, 0, max_regions * sizeof(dma->regions[0])); + dma->dirty_pgsize = 0; return dma; } static void -_dma_controller_do_remove_region(dma_memory_region_t *region) +_dma_controller_do_remove_region(dma_controller_t *dma, + dma_memory_region_t *region) { - assert(region); - dma_unmap_region(region, region->virt_addr, region->size); - (void)close(region->fd); + int err; + + assert(dma != NULL); + assert(region != NULL); + + err = dma_unmap_region(region, region->virt_addr, region->size); + if (err != 0) { + lm_log(dma->lm_ctx, LM_DBG, "failed to unmap fd=%d vaddr=%#lx-%#lx\n", + region->fd, region->virt_addr, region->size); + } + if (region->fd != -1) { + if (close(region->fd) == -1) { + lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n", region->fd); + } + } +} + +/* + * FIXME no longer used. Also, it doesn't work for addresses that span two + * DMA regions. 
+ */ +bool +dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, + size_t size) +{ + dma_memory_region_t *region; + int i; + + for (i = 0; i < dma->nregions; i++) { + region = &dma->regions[i]; + if (dma_addr == region->dma_addr && size <= region->size) { + return true; + } + } + + return false; } /* FIXME not thread safe */ int -dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr, - size_t size, int fd) +dma_controller_remove_region(dma_controller_t *dma, + dma_addr_t dma_addr, size_t size, + int (*unmap_dma) (void*, uint64_t), void *data) { int idx; dma_memory_region_t *region; + int err; - assert(dma); + assert(dma != NULL); for (idx = 0; idx < dma->nregions; idx++) { region = &dma->regions[idx]; - if (region->dma_addr == dma_addr && region->size == size && - fds_are_same_file(region->fd, fd)) { - _dma_controller_do_remove_region(region); + if (region->dma_addr == dma_addr && region->size == size) { + if (region->refcnt > 0) { + err = unmap_dma(data, region->dma_addr); + if (err != 0) { + lm_log(dma->lm_ctx, LM_ERR, + "failed to notify of removal of DMA region %#lx-%#lx: %s\n", + region->dma_addr, region->dma_addr + region->size, + strerror(-err)); + return err; + } + assert(region->refcnt == 0); + } + _dma_controller_do_remove_region(dma, region); if (dma->nregions > 1) + /* + * FIXME valgrind complains with 'Source and destination overlap in memcpy', + * check whether memmove eliminates this warning. + */ memcpy(region, &dma->regions[dma->nregions - 1], sizeof *region); dma->nregions--; @@ -118,7 +170,7 @@ dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr, } static inline void -dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma) +dma_controller_remove_regions(dma_controller_t *dma) { int i; @@ -127,26 +179,26 @@ dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma) for (i = 0; i < dma->nregions; i++) { dma_memory_region_t *region = &dma->regions[i]; - lm_log(ctx, LM_INF, "unmap vaddr=%lx IOVA=%lx\n", + lm_log(dma->lm_ctx, LM_INF, "unmap vaddr=%#lx IOVA=%#lx", region->virt_addr, region->dma_addr); - _dma_controller_do_remove_region(region); + _dma_controller_do_remove_region(dma, region); } } void -dma_controller_destroy(lm_ctx_t *lm_ctx, dma_controller_t *dma) +dma_controller_destroy(dma_controller_t *dma) { if (dma == NULL) { return; } - dma_controller_remove_regions(lm_ctx, dma); + dma_controller_remove_regions(dma); free(dma); } int -dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, +dma_controller_add_region(dma_controller_t *dma, dma_addr_t dma_addr, size_t size, int fd, off_t offset) { @@ -160,8 +212,8 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, /* First check if this is the same exact region. */ if (region->dma_addr == dma_addr && region->size == size) { if (offset != region->offset) { - lm_log(lm_ctx, LM_ERR, "bad offset for new DMA region %lx+%lx, " - "want=%d, existing=%d\n", + lm_log(dma->lm_ctx, LM_ERR, + "bad offset for new DMA region %#lx+%#lx, want=%d, existing=%d\n", dma_addr, size, offset, region->offset); goto err; } @@ -172,8 +224,9 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, * the same file, however in the majority of cases we'll be * using a single fd. 
*/ - lm_log(lm_ctx, LM_ERR, "bad fd=%d for new DMA region %lx-%lx, " - "existing fd=%d\n", fd, region->fd); + lm_log(dma->lm_ctx, LM_ERR, + "bad fd=%d for new DMA region %#lx-%#lx, existing fd=%d\n", + fd, region->fd); goto err; } return idx; @@ -184,16 +237,17 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, dma_addr < region->dma_addr + region->size) || (region->dma_addr >= dma_addr && region->dma_addr < dma_addr + size)) { - lm_log(lm_ctx, LM_INF, "new DMA region %lx+%lx overlaps with DMA " - "region %lx-%lx\n", dma_addr, size, region->dma_addr, - region->size); + lm_log(dma->lm_ctx, LM_INF, + "new DMA region %#lx+%#lx overlaps with DMA region %#lx-%#lx\n", + dma_addr, size, region->dma_addr, region->size); goto err; } } if (dma->nregions == dma->max_regions) { idx = dma->max_regions; - lm_log(lm_ctx, LM_ERR, "reached maxed regions, recompile with higher number of DMA regions\n"); + lm_log(dma->lm_ctx, LM_ERR, + "reached maxed regions, recompile with higher number of DMA regions\n"); goto err; } @@ -202,7 +256,7 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, page_size = fd_get_blocksize(fd); if (page_size < 0) { - lm_log(lm_ctx, LM_ERR, "bad page size %d\n", page_size); + lm_log(dma->lm_ctx, LM_ERR, "bad page size %d\n", page_size); goto err; } page_size = MAX(page_size, getpagesize()); @@ -211,20 +265,21 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, region->size = size; region->page_size = page_size; region->offset = offset; - - region->fd = dup(fd); // dup the fd to get our own private copy - if (region->fd < 0) { - lm_log(lm_ctx, LM_ERR, "failed to duplicate file descriptor: %s\n", - strerror(errno)); - goto err; - } + region->fd = fd; + region->refcnt = 0; region->virt_addr = dma_map_region(region, PROT_READ | PROT_WRITE, 0, region->size); if (region->virt_addr == MAP_FAILED) { - lm_log(lm_ctx, LM_ERR, "failed to memory map DMA region %lx-%lx: %s\n", + lm_log(dma->lm_ctx, LM_ERR, + "failed to memory map DMA region %#lx-%#lx: %s\n", dma_addr, dma_addr + size, strerror(errno)); - close(region->fd); + if (region->fd != -1) { + if (close(region->fd) == -1) { + lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n", + region->fd); + } + } goto err; } dma->nregions++; @@ -269,17 +324,17 @@ dma_map_region(dma_memory_region_t *region, int prot, size_t offset, size_t len) return mmap_base + (offset - mmap_offset); } -void +int dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len) { mmap_round((size_t *)&virt_addr, &len, region->page_size); - munmap(virt_addr, len); + return munmap(virt_addr, len); } int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { int idx; int cnt = 0; @@ -295,9 +350,13 @@ _dma_addr_sg_split(const dma_controller_t *dma, size_t region_len = MIN(region_end - dma_addr, len); if (cnt < max_sg) { + sg[cnt].dma_addr = region->dma_addr; sg[cnt].region = idx; sg[cnt].offset = dma_addr - region->dma_addr; sg[cnt].length = region_len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, sg); + } } cnt++; @@ -326,4 +385,117 @@ out: return cnt; } +ssize_t _get_bitmap_size(size_t region_size, size_t pgsize) +{ + if (pgsize == 0) { + return -EINVAL; + } + if (region_size < pgsize) { + return -EINVAL; + } + size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0); + return (nr_pages / CHAR_BIT) + (nr_pages % CHAR_BIT != 0); +} + +int 
dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize) +{ + int i; + + assert(dma != NULL); + + if (pgsize == 0) { + return -EINVAL; + } + + if (dma->dirty_pgsize > 0) { + if (dma->dirty_pgsize != pgsize) { + return -EINVAL; + } + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + dma_memory_region_t *region = &dma->regions[i]; + ssize_t bitmap_size = _get_bitmap_size(region->size, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + region->dirty_bitmap = calloc(bitmap_size, sizeof(char)); + if (region->dirty_bitmap == NULL) { + int j, ret = -errno; + for (j = 0; j < i; j++) { + free(region->dirty_bitmap); + region->dirty_bitmap = NULL; + } + return ret; + } + } + dma->dirty_pgsize = pgsize; + return 0; +} + +int dma_controller_dirty_page_logging_stop(dma_controller_t *dma) +{ + int i; + + assert(dma != NULL); + + if (dma->dirty_pgsize == 0) { + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + free(dma->regions[i].dirty_bitmap); + dma->regions[i].dirty_bitmap = NULL; + } + dma->dirty_pgsize = 0; + return 0; +} + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data) +{ + int ret; + ssize_t bitmap_size; + dma_sg_t sg; + dma_memory_region_t *region; + + assert(dma != NULL); + assert(data != NULL); + + /* + * FIXME for now we support IOVAs that match exactly the DMA region. This + * is purely for simplifying the implementation. We MUST allow arbitrary + * IOVAs. + */ + ret = dma_addr_to_sg(dma, addr, len, &sg, 1, PROT_NONE); + if (ret != 1 || sg.dma_addr != addr || sg.length != len) { + return -ENOTSUP; + } + + if (pgsize != dma->dirty_pgsize) { + return -EINVAL; + } + + bitmap_size = _get_bitmap_size(len, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + + /* + * FIXME they must be equal because this is how much data the client + * expects to receive. + */ + if (size != (size_t)bitmap_size) { + return -EINVAL; + } + + region = &dma->regions[sg.region]; + + *data = region->dirty_bitmap; + + return 0; +} + /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ @@ -32,6 +32,11 @@ #define DMA_DMA_H /* + * FIXME check whether DMA regions must be page aligned. If so then the + * implementation can be greatly simpified. + */ + +/* * This library emulates a DMA controller for a device emulation application to * perform DMA operations on a foreign memory space. * @@ -72,6 +77,8 @@ #include "muser.h" #include "common.h" +struct lm_ctx; + typedef struct { dma_addr_t dma_addr; // DMA address of this region size_t size; // Size of this region @@ -79,19 +86,23 @@ typedef struct { int page_size; // Page size of this fd off_t offset; // File offset void *virt_addr; // Virtual address of this region + int refcnt; // Number of users of this region + char *dirty_bitmap; // Dirty page bitmap } dma_memory_region_t; typedef struct { int max_regions; int nregions; + struct lm_ctx *lm_ctx; + size_t dirty_pgsize; // Dirty page granularity dma_memory_region_t regions[0]; } dma_controller_t; dma_controller_t * -dma_controller_create(int max_regions); +dma_controller_create(lm_ctx_t *lm_ctx, int max_regions); void -dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma); +dma_controller_destroy(dma_controller_t *dma); /* Registers a new memory region. * Returns: @@ -101,19 +112,72 @@ dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma); * (e.g. due to conflict with existing region). 
*/ int -dma_controller_add_region(lm_ctx_t *ctx, dma_controller_t *dma, +dma_controller_add_region(dma_controller_t *dma, dma_addr_t dma_addr, size_t size, int fd, off_t offset); int -dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr, - size_t size, int fd); +dma_controller_remove_region(dma_controller_t *dma, + dma_addr_t dma_addr, size_t size, + int (*unmap_dma) (void*, uint64_t), void *data); // Helper for dma_addr_to_sg() slow path. int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); + +static bool +_dma_should_mark_dirty(const dma_controller_t *dma, int prot) +{ + assert(dma != NULL); + + return (prot & PROT_WRITE) == PROT_WRITE && dma->dirty_pgsize > 0; +} + +static size_t +_get_pgstart(size_t pgsize, uint64_t base_addr, uint64_t offset) +{ + return (offset - base_addr) / pgsize; +} + +static size_t +_get_pgend(size_t pgsize, uint64_t len, size_t start) +{ + return start + (len / pgsize) + (len % pgsize != 0) - 1; +} + +static void +_dma_bitmap_get_pgrange(const dma_controller_t *dma, + const dma_memory_region_t *region, + const dma_sg_t *sg, size_t *start, size_t *end) +{ + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(start != NULL); + assert(end != NULL); + + *start = _get_pgstart(dma->dirty_pgsize, region->dma_addr, sg->offset); + *end = _get_pgend(dma->dirty_pgsize, sg->length, *start); +} + +static void +_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region, + dma_sg_t *sg) +{ + size_t i, start, end; + + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(region->dirty_bitmap != NULL); + + _dma_bitmap_get_pgrange(dma, region, sg, &start, &end); + for (i = start; i <= end; i++) { + region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT); + } +} /* Takes a linear dma address span and returns a sg list suitable for DMA. * A single linear dma address span may need to be split into multiple @@ -129,7 +193,7 @@ _dma_addr_sg_split(const dma_controller_t *dma, static inline int dma_addr_to_sg(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { static __thread int region_hint; int cnt; @@ -139,14 +203,19 @@ dma_addr_to_sg(const dma_controller_t *dma, // Fast path: single region. if (likely(max_sg > 0 && len > 0 && - dma_addr >= region->dma_addr && dma_addr + len <= region_end)) { + dma_addr >= region->dma_addr && dma_addr + len <= region_end && + region_hint < dma->nregions)) { + sg->dma_addr = region->dma_addr; sg->region = region_hint; sg->offset = dma_addr - region->dma_addr; sg->length = len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, sg); + } return 1; } // Slow path: search through regions. 
- cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg); + cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg, prot); if (likely(cnt > 0)) { region_hint = sg->region; } @@ -157,7 +226,7 @@ void * dma_map_region(dma_memory_region_t *region, int prot, size_t offset, size_t len); -void +int dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len); static inline int @@ -168,31 +237,53 @@ dma_map_sg(dma_controller_t *dma, const dma_sg_t *sg, struct iovec *iov, int i; for (i = 0; i < cnt; i++) { + lm_log(dma->lm_ctx, LM_DBG, "map %#lx-%#lx\n", + sg->dma_addr + sg->offset, sg->dma_addr + sg->offset + sg->length); region = &dma->regions[sg[i].region]; iov[i].iov_base = region->virt_addr + sg[i].offset; iov[i].iov_len = sg[i].length; + region->refcnt++; } return 0; } +/* FIXME useless define */ #define UNUSED __attribute__((unused)) static inline void -dma_unmap_sg(UNUSED dma_controller_t *dma, UNUSED const dma_sg_t *sg, - UNUSED struct iovec *iov, UNUSED int cnt) +dma_unmap_sg(dma_controller_t *dma, const dma_sg_t *sg, + UNUSED struct iovec *iov, int cnt) { - /* just a placeholder for now */ + int i; + + for (i = 0; i < cnt; i++) { + dma_memory_region_t *r; + /* + * FIXME this double loop will be removed if we replace the array with + * tfind(3) + */ + for (r = dma->regions; + r < dma->regions + dma->nregions && r->dma_addr != sg[i].dma_addr; + r++); + if (r > dma->regions + dma->nregions) { + /* bad region */ + continue; + } + lm_log(dma->lm_ctx, LM_DBG, "unmap %#lx-%#lx\n", + sg[i].dma_addr + sg[i].offset, sg[i].dma_addr + sg[i].offset + sg[i].length); + r->refcnt--; + } return; } static inline void * -dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len) +dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, int prot) { dma_sg_t sg; struct iovec iov; - if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1) == 1 && + if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, prot) == 1 && dma_map_sg(dma, &sg, &iov, 1) == 0) { return iov.iov_base; } @@ -211,12 +302,26 @@ dma_unmap_addr(dma_controller_t *dma, }; int r; - r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1); + r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_NONE); assert(r == 1); dma_unmap_sg(dma, &sg, &iov, 1); } +int +dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize); + +int +dma_controller_dirty_page_logging_stop(dma_controller_t *dma); + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data); + +bool +dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, + size_t size); + #endif /* DMA_DMA_H */ /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/muser.h b/lib/muser.h index f3330fe..a39d477 100644 --- a/lib/muser.h +++ b/lib/muser.h @@ -37,22 +37,27 @@ #include <sys/uio.h> #include <unistd.h> +#include "vfio_user.h" #include "pci.h" +#include "caps/pm.h" +#include "caps/px.h" +#include "caps/msi.h" +#include "caps/msix.h" -/* - * Influential enviroment variables: - * - * LM_TERSE_LOGGING: define to make libmuser log only erroneous PCI accesses. 
- * (this should really be done with a more fine grained debug - * level) - */ -#ifndef LM_TERSE_LOGGING -#define LM_TERSE_LOGGING 0 -#endif +#define LIB_MUSER_VFIO_USER_VERS_MJ 0 +#define LIB_MUSER_VFIO_USER_VERS_MN 1 + +#define VFIO_NAME "vfio" +#define VFIO_DIR "/dev/" VFIO_NAME "/" +#define VFIO_CONTAINER VFIO_DIR "/" VFIO_NAME + +#define MUSER_DIR "/var/run/muser/" +#define MUSER_SOCK "cntrl" typedef uint64_t dma_addr_t; typedef struct { + dma_addr_t dma_addr; int region; int length; uint64_t offset; @@ -134,6 +139,8 @@ typedef struct { /* * Callback function that is called when the region is read or written. + * Note that the memory of the region is owned by the user, except for the + * standard header (first 64 bytes) of the PCI configuration space. */ lm_region_access_t *fn; @@ -149,9 +156,12 @@ enum { LM_DEV_INTX_IRQ_IDX, LM_DEV_MSI_IRQ_IDX, LM_DEV_MSIX_IRQ_IDX, - LM_DEV_NUM_IRQS = 3 + LM_DEV_ERR_IRQ_INDEX, + LM_DEV_REQ_IRQ_INDEX, + LM_DEV_NUM_IRQS }; +/* FIXME these are PCI regions */ enum { LM_DEV_BAR0_REG_IDX, LM_DEV_BAR1_REG_IDX, @@ -162,7 +172,15 @@ enum { LM_DEV_ROM_REG_IDX, LM_DEV_CFG_REG_IDX, LM_DEV_VGA_REG_IDX, - LM_DEV_NUM_REGS = 9 + /* + * FIXME this really belong here, but simplifies implementation for now. A + * migration region can exist for non-PCI devices (can its index be + * anything?). In any case, we should allow the user to define custom regions + * at will, by fixing the migration region in that position we don't allow + * this. + */ + LM_DEV_MIGRATION_REG_IDX, + LM_DEV_NUM_REGS = 10, /* TODO rename to LM_DEV_NUM_PCI_REGS */ }; typedef struct { @@ -191,7 +209,7 @@ typedef struct { } lm_pci_info_t; /* - * Returns a pointer to the non-standard part of the PCI configuration space. + * Returns a pointer to the standard part of the PCI configuration space. */ lm_pci_config_space_t *lm_get_pci_config_space(lm_ctx_t *lm_ctx); @@ -208,7 +226,7 @@ typedef enum { * * @lm_log_fn_t: typedef for log function. */ -typedef void (lm_log_fn_t) (void *pvt, const char *msg); +typedef void (lm_log_fn_t) (void *pvt, lm_log_lvl_t lvl, const char *msg); /** * Callback function that gets called when a capability is accessed. The @@ -228,26 +246,77 @@ typedef ssize_t (lm_cap_access_t) (void *pvt, uint8_t id, char *buf, size_t count, loff_t offset, bool is_write); +/* FIXME does it have to be packed as well? */ +typedef union { + struct msicap msi; + struct msixcap msix; + struct pmcap pm; + struct pxcap px; +} lm_cap_t; + +typedef enum { + LM_TRANS_KERNEL, + LM_TRANS_SOCK, + LM_TRANS_MAX +} lm_trans_t; + +#define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF + +/* + * FIXME the names of migration callback functions are probably far too long, + * but for now it helps with the implementation. + */ +typedef int (lm_migration_callback_t)(void *pvt); + +typedef enum { + LM_MIGR_STATE_STOP, + LM_MIGR_STATE_START, + LM_MIGR_STATE_STOP_AND_COPY, + LM_MIGR_STATE_PRE_COPY, + LM_MIGR_STATE_RESUME +} lm_migr_state_t; + typedef struct { + /* migration state transition callback */ + /* TODO rename to lm_migration_state_transition_callback */ + /* FIXME maybe we should create a single callback and pass the state? */ + int (*transition)(void *pvt, lm_migr_state_t state); + + /* Callbacks for saving device state */ + /* - * Capability ID, as defined by the PCI specification. Also defined as - * PCI_CAP_ID_XXX in <linux/pci_regs.h>. + * Function that is called to retrieve pending migration data. 
If migration + * data were previously made available (function prepare_data has been + * called) then calling this function signifies that they have been read + * (e.g. migration data can be discarded). If the function returns 0 then + * migration has finished and this function won't be called again. */ - uint8_t id; + __u64 (*get_pending_bytes)(void *pvt); /* - * Size of the capability. + * Function that is called to instruct the device to prepare migration data. + * The function must return only after migration data are available at the + * specified offset. */ - size_t size; + int (*prepare_data)(void *pvt, __u64 *offset, __u64 *size); /* - * Function to call back when the capability gets read or written. + * Function that is called to read migration data. offset and size can + * be any subrange on the offset and size previously returned by + * prepare_data. The function must return the amount of data read. This + * function can be called even if the migration data can be memory mapped. + * + * Does this mean that reading data_offset/data_size updates the values? */ - lm_cap_access_t *fn; -} lm_cap_t; + size_t (*read_data)(void *pvt, void *buf, __u64 count, __u64 offset); -#define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF + /* Callback for restoring device state */ + + /* Fuction that is called for writing previously stored device state. */ + size_t (*write_data)(void *pvt, void *data, __u64 size); + +} lm_migration_callbacks_t; /** * Device information structure, used to create the lm_ctx. @@ -287,16 +356,36 @@ typedef struct { int (*reset) (void *pvt); /* - * PCI capabilities. The user needs to only define the ID and size of each - * capability. The actual capability is not maintained by libmuser. When a - * capability is accessed the appropriate callback function is called. + * Function that is called when the guest maps a DMA region. Optional. + */ + void (*map_dma) (void *pvt, uint64_t iova, uint64_t len); + + /* + * Function that is called when the guest unmaps a DMA region. The device + * must release all references to that region before the callback returns. + * This is required if you want to be able to access guest memory. */ - lm_cap_t caps[LM_MAX_CAPS]; + int (*unmap_dma) (void *pvt, uint64_t iova); + + lm_trans_t trans; /* - * Number of capabilities in above array. + * Attaching to the transport is non-blocking. The library will not attempt + * to attach during context creation time. The caller must then manually + * call lm_ctx_try_attach(), which is non-blocking, as many times as + * necessary. + */ +#define LM_FLAG_ATTACH_NB (1 << 0) + uint64_t flags; + + /* + * PCI capabilities. */ int nr_caps; + lm_cap_t **caps; + + lm_migration_callbacks_t migration_callbacks; + } lm_dev_info_t; /** @@ -339,18 +428,49 @@ int lm_ctx_run(lm_dev_info_t *dev_info); /** + * Polls, without blocking, an lm_ctx. This is an alternative to using + * a thread and making a blocking call to lm_ctx_drive(). Instead, the + * application can periodically poll the context directly from one of + * its own threads. + * + * This is only allowed when LM_FLAG_ATTACH_NB is specified during creation. + * + * @lm_ctx: The libmuser context to poll + * + * @returns 0 on success, -errno on failure. + */ +int +lm_ctx_poll(lm_ctx_t *lm_ctx); + +/** * Triggers an interrupt. * + * libmuser takes care of using the correct IRQ type (IRQ index: INTx or MSI/X), + * the caller only needs to specify the sub-index. 
+ * + * @lm_ctx: the libmuser context to trigger interrupt + * @subindex: vector subindex to trigger interrupt on + * + * @returns 0 on success, or -1 on failure. Sets errno. + */ +int +lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); + +/** + * Sends message to client to trigger an interrupt. + * * libmuser takes care of using the IRQ type (INTx, MSI/X), the caller only * needs to specify the sub-index. + * This api can be used to trigger interrupt by sending message to client. * * @lm_ctx: the libmuser context to trigger interrupt * @subindex: vector subindex to trigger interrupt on * * @returns 0 on success, or -1 on failure. Sets errno. */ + int -lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); +lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex); /* Helper functions */ @@ -366,12 +486,15 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); * than can be individually mapped in the program's virtual memory. A single * linear guest physical address span may need to be split into multiple * scatter/gather regions due to limitations of how memory can be mapped. + * Field unmap_dma must have been provided at context creation time in order + * to use this function. * * @lm_ctx: the libmuser context * @dma_addr: the guest physical address * @len: size of memory to be mapped * @sg: array that receives the scatter/gather entries to be mapped * @max_sg: maximum number of elements in above array + * @prot: protection as define in <sys/mman.h> * * @returns the number of scatter/gather entries created on success, and on * failure: @@ -381,12 +504,14 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); */ int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); /** * Maps a list scatter/gather entries from the guest's physical address space * to the program's virtual memory. It is the caller's responsibility to remove * the mappings by calling lm_unmap_sg. + * Field unmap_dma must have been provided at context creation time in order + * to use this function. * * @lm_ctx: the libmuser context * @sg: array of scatter/gather entries returned by lm_addr_to_sg @@ -403,6 +528,8 @@ lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, /** * Unmaps a list scatter/gather entries (previously mapped by lm_map_sg) from * the program's virtual memory. + * Field unmap_dma must have been provided at context creation time in order + * to use this function. * * @lm_ctx: the libmuser context * @sg: array of scatter/gather entries to unmap @@ -426,16 +553,59 @@ lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, int lm_get_region(loff_t pos, size_t count, loff_t *off); +/** + * Read from the dma region exposed by the client. + * + * @lm_ctx: the libmuser context + * @sg: a DMA segment obtained from dma_addr_to_sg + * @data: data buffer to read into + */ +int +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); + +/** + * Write to the dma region exposed by the client. + * + * @lm_ctx: the libmuser context + * @sg: a DMA segment obtained from dma_addr_to_sg + * @data: data buffer to write + */ +int +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); + /* * Advanced stuff. */ /** - * Returns the non-standard part of the PCI configuragion space. + * Returns the non-standard part of the PCI configuration space. */ uint8_t * lm_get_pci_non_std_config_space(lm_ctx_t *lm_ctx); +/* + * Attempts to attach to the transport. LM_FLAG_ATTACH_NB must be set when + * creating the context. Returns 0 on success and -1 on error. 
If errno is set + * to EAGAIN or EWOULDBLOCK then the transport is not ready to attach to and the + * operation must be retried. + */ +int +lm_ctx_try_attach(lm_ctx_t *lm_ctx); + +/* + * FIXME need to make sure that there can be at most one capability with a given + * ID, otherwise this function will return the first one with this ID. + */ +uint8_t * +lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id); + +void +lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...); + +/* FIXME */ +int muser_send_fds(int sock, int *fds, size_t count); +ssize_t muser_recv_fds(int sock, int *fds, size_t count); + #endif /* LIB_MUSER_H */ /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/muser_ctx.c b/lib/muser_ctx.c index 0de3ac0..92155d7 100644 --- a/lib/muser_ctx.c +++ b/lib/muser_ctx.c @@ -47,13 +47,22 @@ #include <stdarg.h> #include <linux/vfio.h> #include <sys/param.h> +#include <sys/un.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <time.h> +#include <sys/select.h> -#include "../kmod/muser.h" #include "muser.h" #include "muser_priv.h" #include "dma.h" #include "cap.h" +#define MAX_FDS 8 + +#define IOMMU_GRP_NAME "iommu_group" + typedef enum { IRQ_NONE = 0, IRQ_INTX, @@ -61,6 +70,14 @@ typedef enum { IRQ_MSIX, } irq_type_t; +char *irq_to_str[] = { + [LM_DEV_INTX_IRQ_IDX] = "INTx", + [LM_DEV_MSI_IRQ_IDX] = "MSI", + [LM_DEV_MSIX_IRQ_IDX] = "MSI-X", + [LM_DEV_ERR_IRQ_INDEX] = "ERR", + [LM_DEV_REQ_IRQ_INDEX] = "REQ" +}; + typedef struct { irq_type_t type; /* irq type this device is using */ int err_efd; /* eventfd for irq err */ @@ -69,27 +86,517 @@ typedef struct { int efds[0]; /* XXX must be last */ } lm_irqs_t; -/* - * Macro that ensures that a particular struct member is last. Doesn't work for - * flexible array members. 
- */ -#define MUST_BE_LAST(s, m, t) \ - _Static_assert(sizeof(s) - offsetof(s, m) == sizeof(t), \ - #t " " #m " must be last member in " #s) +enum migration_iteration_state { + VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL, + VFIO_USER_MIGRATION_ITERATION_STATE_STARTED, + VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED, + VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED +}; struct lm_ctx { void *pvt; dma_controller_t *dma; int fd; + int conn_fd; int (*reset) (void *pvt); lm_log_lvl_t log_lvl; lm_log_fn_t *log; lm_pci_info_t pci_info; lm_pci_config_space_t *pci_config_space; + lm_trans_t trans; struct caps *caps; + uint64_t flags; + char *uuid; + void (*map_dma) (void *pvt, uint64_t iova, uint64_t len); + int (*unmap_dma) (void *pvt, uint64_t iova); + + /* TODO there should be a void * variable to store transport-specific stuff */ + /* LM_TRANS_SOCK */ + char *iommu_dir; + int iommu_dir_fd; + int sock_flags; + + int client_max_fds; + + struct { + struct vfio_device_migration_info info; + size_t pgsize; + lm_migration_callbacks_t callbacks; + struct { + enum migration_iteration_state state; + __u64 offset; + __u64 size; + } iter; + } migration; + lm_irqs_t irqs; /* XXX must be last */ }; -MUST_BE_LAST(struct lm_ctx, irqs, lm_irqs_t); + + +/* function prototypes */ +static void +free_sparse_mmap_areas(lm_reg_info_t*); + +static inline int recv_blocking(int sock, void *buf, size_t len, int flags) +{ + int f = fcntl(sock, F_GETFL, 0); + int ret, fret; + + fret = fcntl(sock, F_SETFL, f & ~O_NONBLOCK); + assert(fret != -1); + + ret = recv(sock, buf, len, flags); + + fret = fcntl(sock, F_SETFL, f); + assert(fret != -1); + + return ret; +} + +static int +init_sock(lm_ctx_t *lm_ctx) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + int ret, unix_sock; + mode_t mode; + + assert(lm_ctx != NULL); + + lm_ctx->iommu_dir = strdup(lm_ctx->uuid); + if (!lm_ctx->iommu_dir) { + return -ENOMEM; + } + + /* FIXME SPDK can't easily run as non-root */ + mode = umask(0000); + + if ((unix_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + ret = errno; + goto out; + } + + if (lm_ctx->flags & LM_FLAG_ATTACH_NB) { + ret = fcntl(unix_sock, F_SETFL, + fcntl(unix_sock, F_GETFL, 0) | O_NONBLOCK); + if (ret < 0) { + ret = errno; + goto close_unix_sock; + } + lm_ctx->sock_flags = MSG_DONTWAIT | MSG_WAITALL; + } else { + lm_ctx->sock_flags = 0; + } + + lm_ctx->iommu_dir_fd = open(lm_ctx->iommu_dir, O_DIRECTORY); + if (lm_ctx->iommu_dir_fd < 0) { + ret = errno; + goto close_unix_sock; + } + + ret = snprintf(addr.sun_path, sizeof addr.sun_path, "%s/" MUSER_SOCK, + lm_ctx->iommu_dir); + if (ret >= (int)sizeof addr.sun_path) { + ret = ENAMETOOLONG; + goto close_iommu_dir_fd; + } + if (ret < 0) { + goto close_iommu_dir_fd; + } + + /* start listening business */ + ret = bind(unix_sock, (struct sockaddr*)&addr, sizeof(addr)); + if (ret < 0) { + ret = errno; + goto close_iommu_dir_fd; + } + + ret = listen(unix_sock, 0); + if (ret < 0) { + ret = errno; + goto close_iommu_dir_fd; + } + + umask(mode); + return unix_sock; + +close_iommu_dir_fd: + close(lm_ctx->iommu_dir_fd); +close_unix_sock: + close(unix_sock); +out: + return -ret; +} + +static void +__free_s(char **p) +{ + free(*p); +} + +int +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *fds, int count) +{ + int ret; + struct vfio_user_header hdr = {.msg_id = msg_id}; + struct msghdr msg; + size_t i; + + if (nr_iovecs == 0) { + iovecs = alloca(sizeof(*iovecs)); + nr_iovecs = 1; + } + + 
memset(&msg, 0, sizeof(msg)); + + if (is_reply) { + hdr.flags.type = VFIO_USER_F_TYPE_REPLY; + } else { + hdr.cmd = cmd; + hdr.flags.type = VFIO_USER_F_TYPE_COMMAND; + } + + iovecs[0].iov_base = &hdr; + iovecs[0].iov_len = sizeof(hdr); + + for (i = 0; i < nr_iovecs; i++) { + hdr.msg_size += iovecs[i].iov_len; + } + + msg.msg_iovlen = nr_iovecs; + msg.msg_iov = iovecs; + + if (fds != NULL) { + size_t size = count * sizeof *fds; + char *buf = alloca(CMSG_SPACE(size)); + + msg.msg_control = buf; + msg.msg_controllen = CMSG_SPACE(size); + + struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(size); + memcpy(CMSG_DATA(cmsg), fds, size); + } + + ret = sendmsg(sock, &msg, 0); + if (ret == -1) { + return -errno; + } + + return 0; +} + +int +send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count) { + + struct iovec iovecs[2] = { + [1] = { + .iov_base = data, + .iov_len = data_len + } + }; + return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs, + ARRAY_SIZE(iovecs), fds, count); +} + +int +send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, + char *caps) +{ + int ret; + char *data; + + ret = asprintf(&data, + "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}", + major, minor, caps != NULL ? caps : "{}"); + if (ret == -1) { + return -1; + } + ret = send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data, + ret, NULL, 0); + free(data); + return ret; +} + +int +recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, + uint16_t *msg_id, void *data, size_t *len) +{ + int ret; + + ret = recv_blocking(sock, hdr, sizeof(*hdr), 0); + if (ret == -1) { + return -errno; + } + if (ret < (int)sizeof(*hdr)) { + return -EINVAL; + } + + if (is_reply) { + if (hdr->msg_id != *msg_id) { + return -EINVAL; + } + + if (hdr->flags.type != VFIO_USER_F_TYPE_REPLY) { + return -EINVAL; + } + + if (hdr->flags.error == 1U) { + if (hdr->error_no <= 0) { + hdr->error_no = EINVAL; + } + return -hdr->error_no; + } + } else { + if (hdr->flags.type != VFIO_USER_F_TYPE_COMMAND) { + return -EINVAL; + } + *msg_id = hdr->msg_id; + } + + if (len != NULL && *len > 0 && hdr->msg_size > sizeof *hdr) { + ret = recv_blocking(sock, data, MIN(hdr->msg_size - sizeof *hdr, *len), + 0); + if (ret < 0) { + return ret; + } + if (*len != (size_t)ret) { /* FIXME we should allow receiving less */ + return -EINVAL; + } + *len = ret; + } + return 0; +} + +int +recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, + int *max_fds, size_t *pgsize) +{ + int ret; + struct vfio_user_header hdr; + char *data __attribute__((__cleanup__(__free_s))) = NULL; + + ret = recv_vfio_user_msg(sock, &hdr, is_reply, msg_id, NULL, NULL); + if (ret < 0) { + return ret; + } + + hdr.msg_size -= sizeof(hdr); + data = malloc(hdr.msg_size); + if (data == NULL) { + return -errno; + } + ret = recv_blocking(sock, data, hdr.msg_size, 0); + if (ret == -1) { + return -errno; + } + if (ret < (int)hdr.msg_size) { + return -EINVAL; + } + + /* FIXME use proper parsing */ + ret = sscanf(data, + "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d, migration: {pgsize: %lu}}}", + major, minor, max_fds, pgsize); + if (ret != 4) { + return -EINVAL; + } + return 0; +} + +int +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int 
*send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs, + send_fds, fd_count); + if (ret < 0) { + return ret; + } + if (hdr == NULL) { + hdr = alloca(sizeof *hdr); + } + return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len); +} + +int +send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + struct iovec iovecs[2] = { + [1] = { + .iov_base = send_data, + .iov_len = send_len + } + }; + return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs, + ARRAY_SIZE(iovecs), send_fds, fd_count, + hdr, recv_data, recv_len); +} + +static int +set_version(lm_ctx_t *lm_ctx, int sock) +{ + int ret; + int client_mj, client_mn; + uint16_t msg_id = 0; + char *server_caps; + + ret = asprintf(&server_caps, "{max_fds: %d, migration: {pgsize: %ld}}", + MAX_FDS, sysconf(_SC_PAGESIZE)); + if (ret == -1) { + return -ENOMEM; + } + + ret = send_version(sock, LIB_MUSER_VFIO_USER_VERS_MJ, + LIB_MUSER_VFIO_USER_VERS_MN, msg_id, false, server_caps); + if (ret < 0) { + lm_log(lm_ctx, LM_DBG, "failed to send version: %s", strerror(-ret)); + goto out; + } + + ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true, + &lm_ctx->client_max_fds, &lm_ctx->migration.pgsize); + if (ret < 0) { + lm_log(lm_ctx, LM_DBG, "failed to receive version: %s", strerror(-ret)); + goto out; + } + if (client_mj != LIB_MUSER_VFIO_USER_VERS_MJ || + client_mn != LIB_MUSER_VFIO_USER_VERS_MN) { + lm_log(lm_ctx, LM_DBG, "version mismatch, server=%d.%d, client=%d.%d", + LIB_MUSER_VFIO_USER_VERS_MJ, LIB_MUSER_VFIO_USER_VERS_MN, + client_mj, client_mn); + ret = -EINVAL; + goto out; + } + if (lm_ctx->migration.pgsize == 0) { + lm_log(lm_ctx, LM_ERR, "bad migration page size"); + ret = -EINVAL; + goto out; + } + + /* FIXME need to check max_fds */ + + lm_ctx->migration.pgsize = MIN(lm_ctx->migration.pgsize, + sysconf(_SC_PAGESIZE)); +out: + free(server_caps); + return ret; +} + +/** + * lm_ctx: libmuser context + * iommu_dir: full path to the IOMMU group to create. All parent directories + * must already exist. + */ +static int +open_sock(lm_ctx_t *lm_ctx) +{ + int ret; + int conn_fd; + + assert(lm_ctx != NULL); + + conn_fd = accept(lm_ctx->fd, NULL, NULL); + if (conn_fd == -1) { + return conn_fd; + } + + /* send version and caps */ + ret = set_version(lm_ctx, conn_fd); + if (ret < 0) { + return ret; + } + + lm_ctx->conn_fd = conn_fd; + return conn_fd; +} + +static int +close_sock(lm_ctx_t *lm_ctx) +{ + return close(lm_ctx->conn_fd); +} + +static int +get_request_sock(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + int *fds, int *nr_fds) +{ + int ret; + struct iovec iov = {.iov_base = hdr, .iov_len = sizeof *hdr}; + struct msghdr msg = {.msg_iov = &iov, .msg_iovlen = 1}; + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(sizeof(int) * *nr_fds); + msg.msg_control = alloca(msg.msg_controllen); + + /* + * TODO ideally we should set O_NONBLOCK on the fd so that the syscall is + * faster (?). I tried that and get short reads, so we need to store the + * partially received buffer somewhere and retry. 
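+ * A proper non-blocking implementation would have to keep, per connection,
+ * a buffer with the partially received header/payload and how much of it
+ * has arrived so far, and resume from there on the next call.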
+ */ + ret = recvmsg(lm_ctx->conn_fd, &msg, lm_ctx->sock_flags); + if (ret == -1) { + return -errno; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) { + continue; + } + if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) { + return -EINVAL; + } + int size = cmsg->cmsg_len - CMSG_LEN(0); + if (size % sizeof(int) != 0) { + return -EINVAL; + } + *nr_fds = (int)(size / sizeof(int)); + memcpy(fds, CMSG_DATA(cmsg), *nr_fds * sizeof(int)); + break; + } + + return ret; +} + +static ssize_t +recv_fds_sock(lm_ctx_t *lm_ctx, void *buf, size_t size) +{ + ssize_t ret = muser_recv_fds(lm_ctx->conn_fd, buf, size / sizeof(int)); + if (ret < 0) { + return ret; + } + return ret * sizeof(int); +} + +static struct transport_ops { + int (*init)(lm_ctx_t*); + int (*attach)(lm_ctx_t*); + int(*detach)(lm_ctx_t*); + int (*get_request)(lm_ctx_t*, struct vfio_user_header*, int *fds, int *nr_fds); + ssize_t (*recv_fds)(lm_ctx_t*, void *buf, size_t size); +} transports_ops[] = { + [LM_TRANS_SOCK] = { + .init = init_sock, + .attach = open_sock, + .detach = close_sock, + .recv_fds = recv_fds_sock, + .get_request = get_request_sock, + } +}; #define LM2VFIO_IRQT(type) (type - 1) @@ -98,6 +605,7 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...) { va_list ap; char buf[BUFSIZ]; + int _errno = errno; assert(lm_ctx != NULL); @@ -108,7 +616,8 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...) va_start(ap, fmt); vsnprintf(buf, sizeof buf, fmt, ap); va_end(ap); - lm_ctx->log(lm_ctx->pvt, buf); + lm_ctx->log(lm_ctx->pvt, lvl, buf); + errno = _errno; } static const char * @@ -137,11 +646,14 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index) case VFIO_PCI_INTX_IRQ_INDEX: case VFIO_PCI_MSI_IRQ_INDEX: case VFIO_PCI_MSIX_IRQ_INDEX: - lm_log(lm_ctx, LM_DBG, "disabling IRQ %s\n", vfio_irq_idx_to_str(index)); + lm_log(lm_ctx, LM_DBG, "disabling IRQ %s", vfio_irq_idx_to_str(index)); lm_ctx->irqs.type = IRQ_NONE; for (i = 0; i < lm_ctx->irqs.max_ivs; i++) { if (lm_ctx->irqs.efds[i] >= 0) { - (void)close(lm_ctx->irqs.efds[i]); + if (close(lm_ctx->irqs.efds[i]) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", + lm_ctx->irqs.efds[i]); + } lm_ctx->irqs.efds[i] = -1; } } @@ -155,12 +667,17 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index) } if (irq_efd != NULL) { - (void)close(*irq_efd); - *irq_efd = -1; + if (*irq_efd != -1) { + if (close(*irq_efd) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", + *irq_efd); + } + *irq_efd = -1; + } return 0; } - lm_log(lm_ctx, LM_DBG, "failed to disable IRQs\n"); + lm_log(lm_ctx, LM_DBG, "failed to disable IRQs"); return -EINVAL; } @@ -178,9 +695,8 @@ irqs_set_data_none(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set) val = 1; ret = eventfd_write(efd, val); if (ret == -1) { - ret = -errno; - lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m\n"); - return ret; + lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m"); + return -errno; } } } @@ -206,9 +722,8 @@ irqs_set_data_bool(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) val = 1; ret = eventfd_write(efd, val); if (ret == -1) { - ret = -errno; - lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m\n"); - return ret; + lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m"); + return -errno; } } } @@ -228,13 +743,16 @@ irqs_set_data_eventfd(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data i++, d32++) { efd = lm_ctx->irqs.efds[i]; if (efd 
>= 0) { - (void) close(efd); + if (close(efd) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", efd); + } + lm_ctx->irqs.efds[i] = -1; } if (*d32 >= 0) { lm_ctx->irqs.efds[i] = *d32; } - lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d\n", i, lm_ctx->irqs.efds[i]); + lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d", i, lm_ctx->irqs.efds[i]); } return 0; @@ -252,7 +770,7 @@ irqs_trigger(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) return irqs_disable(lm_ctx, irq_set->index); } - lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=0x%x\n", + lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=%#lx", vfio_irq_idx_to_str(irq_set->index), irq_set->flags); switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { @@ -334,6 +852,17 @@ dev_set_irqs_validate(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set) return 0; } +static int +device_reset(lm_ctx_t *lm_ctx) +{ + lm_log(lm_ctx, LM_DBG, "Device reset called by client"); + if (lm_ctx->reset != NULL) { + return lm_ctx->reset(lm_ctx->pvt); + } + + return 0; +} + static long dev_set_irqs(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) { @@ -368,7 +897,8 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info) // Ensure provided argsz is sufficiently big and index is within bounds. if ((irq_info->argsz < sizeof(struct vfio_irq_info)) || (irq_info->index >= LM_DEV_NUM_IRQS)) { - lm_log(lm_ctx, LM_DBG, "bad irq_info\n"); + lm_log(lm_ctx, LM_DBG, "bad irq_info (size=%d index=%d)\n", + irq_info->argsz, irq_info->index); return -EINVAL; } @@ -380,66 +910,94 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info) /* * Populate the sparse mmap capability information to vfio-client. - * kernel/muser constructs the response for VFIO_DEVICE_GET_REGION_INFO - * accommodating sparse mmap information. * Sparse mmap information stays after struct vfio_region_info and cap_offest * points accordingly. */ static int -dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, - struct vfio_region_info *vfio_reg) +dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index, + struct vfio_region_info **vfio_reg) { + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type = NULL; struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct lm_sparse_mmap_areas *mmap_areas; int nr_mmap_areas, i; - size_t size; - ssize_t ret; - - if (lm_reg->mmap_areas == NULL) - return -EINVAL; + size_t type_size = 0; + size_t sparse_size = 0; + size_t cap_size; + void *cap_ptr; - nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; - size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type_size = sizeof(struct vfio_region_info_cap_type); + } - /* - * If vfio_reg does not have enough space to accommodate sparse info then - * set the argsz with the expected size and return. 
Vfio client will call - * back after reallocating the vfio_reg - */ + if (lm_reg->mmap_areas != NULL) { + nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; + sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); + } - if (vfio_reg->argsz < size + sizeof(*vfio_reg)) { - vfio_reg->argsz = size + sizeof(*vfio_reg); - vfio_reg->cap_offset = 0; + cap_size = type_size + sparse_size; + if (cap_size == 0) { return 0; } - lm_log(lm_ctx, LM_DBG, "%s: size %llu, nr_mmap_areas %u\n", __func__, size, - nr_mmap_areas); - sparse = calloc(1, size); - if (sparse == NULL) + /* TODO deosn't need to be calloc, we overwrite it entirely */ + header = calloc(1, cap_size); + if (header == NULL) { return -ENOMEM; - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->header.next = 0; - sparse->nr_areas = nr_mmap_areas; + } + + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type = (struct vfio_region_info_cap_type*)header; + type->header.id = VFIO_REGION_INFO_CAP_TYPE; + type->header.version = 1; + type->header.next = 0; + type->type = VFIO_REGION_TYPE_MIGRATION; + type->subtype = VFIO_REGION_SUBTYPE_MIGRATION; + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); + } - mmap_areas = lm_reg->mmap_areas; - for (i = 0; i < nr_mmap_areas; i++) { - sparse->areas[i].offset = mmap_areas->areas[i].start; - sparse->areas[i].size = mmap_areas->areas[i].size; + if (lm_reg->mmap_areas != NULL) { + if (type != NULL) { + type->header.next = (*vfio_reg)->cap_offset + sizeof(struct vfio_region_info_cap_type); + sparse = (struct vfio_region_info_cap_sparse_mmap*)(type + 1); + } else { + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); + sparse = (struct vfio_region_info_cap_sparse_mmap*)header; + } + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->header.next = 0; + sparse->nr_areas = nr_mmap_areas; + + lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__, + sparse_size, nr_mmap_areas); + mmap_areas = lm_reg->mmap_areas; + for (i = 0; i < nr_mmap_areas; i++) { + sparse->areas[i].offset = mmap_areas->areas[i].start; + sparse->areas[i].size = mmap_areas->areas[i].size; + lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__, + i, sparse->areas[i].offset, sparse->areas[i].size); + } } - /* write the sparse mmap cap info to vfio-client user pages */ - ret = write(lm_ctx->fd, sparse, size); - if (ret != (ssize_t)size) { - free(sparse); - return -EIO; + /* + * FIXME VFIO_REGION_INFO_FLAG_MMAP is valid if the region is + * memory-mappable in general, not only if it supports sparse mmap. 
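+ * In other words, the flag should be derived from the region's own
+ * properties (e.g. lm_reg->flags) rather than be implied by the presence
+ * of a capability list.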
+ */ + (*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS; + + (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg); + *vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz); + if (*vfio_reg == NULL) { + free(header); + return -ENOMEM; } - vfio_reg->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS; - vfio_reg->cap_offset = sizeof(*vfio_reg); + cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset; + memcpy(cap_ptr, header, cap_size); - free(sparse); + free(header); return 0; } @@ -458,42 +1016,73 @@ offset_to_region(uint64_t offset) return (offset >> LM_REGION_SHIFT) & LM_REGION_MASK; } +#ifdef LM_VERBOSE_LOGGING +void +dump_buffer(const char *prefix, const char *buf, uint32_t count) +{ + int i; + const size_t bytes_per_line = 0x8; + + if (strcmp(prefix, "")) { + fprintf(stderr, "%s\n", prefix); + } + for (i = 0; i < (int)count; i++) { + if (i % bytes_per_line != 0) { + fprintf(stderr, " "); + } + /* TODO valgrind emits a warning if count is 1 */ + fprintf(stderr,"0x%02x", *(buf + i)); + if ((i + 1) % bytes_per_line == 0) { + fprintf(stderr, "\n"); + } + } + if (i % bytes_per_line != 0) { + fprintf(stderr, "\n"); + } +} +#else +#define dump_buffer(prefix, buf, count) +#endif + static long -dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info *vfio_reg) +dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg) { lm_reg_info_t *lm_reg; int err; assert(lm_ctx != NULL); - assert(vfio_reg != NULL); - lm_reg = &lm_ctx->pci_info.reg_info[vfio_reg->index]; + assert(*vfio_reg != NULL); + lm_reg = &lm_ctx->pci_info.reg_info[(*vfio_reg)->index]; // Ensure provided argsz is sufficiently big and index is within bounds. - if ((vfio_reg->argsz < sizeof(struct vfio_region_info)) || - (vfio_reg->index >= LM_DEV_NUM_REGS)) { + if (((*vfio_reg)->argsz < sizeof(struct vfio_region_info)) || + ((*vfio_reg)->index >= LM_DEV_NUM_REGS)) { + lm_log(lm_ctx, LM_DBG, "bad args argsz=%d index=%d", + (*vfio_reg)->argsz, (*vfio_reg)->index); return -EINVAL; } - vfio_reg->offset = region_to_offset(vfio_reg->index); - vfio_reg->flags = lm_reg->flags; - vfio_reg->size = lm_reg->size; + (*vfio_reg)->offset = region_to_offset((*vfio_reg)->index); + (*vfio_reg)->flags = lm_reg->flags; + (*vfio_reg)->size = lm_reg->size; - if (lm_reg->mmap_areas != NULL) { - err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, vfio_reg); - if (err) { - return err; - } + err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg); + if (err) { + return err; } - lm_log(lm_ctx, LM_DBG, "region_info[%d]\n", vfio_reg->index); - dump_buffer(lm_ctx, "", (char*)vfio_reg, sizeof *vfio_reg); + lm_log(lm_ctx, LM_DBG, "region_info[%d] offset %#lx flags %#x size %llu " + "argsz %llu", + (*vfio_reg)->index, (*vfio_reg)->offset, (*vfio_reg)->flags, + (*vfio_reg)->size, (*vfio_reg)->argsz); return 0; } static long -dev_get_info(struct vfio_device_info *dev_info) +dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info) { + assert(lm_ctx != NULL); assert(dev_info != NULL); // Ensure provided argsz is sufficiently big. 
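The capability chain built by dev_get_sparse_mmap_cap() above lands directly
after struct vfio_region_info in the reply, with cap_offset pointing at the
first header. A minimal sketch of how a client could walk that chain,
assuming <linux/vfio.h> and a reply buffer already resized to reg->argsz
(region_info_cap_find is a hypothetical helper, not part of this patch):

static struct vfio_info_cap_header *
region_info_cap_find(struct vfio_region_info *reg, uint16_t id)
{
    uint32_t off = reg->cap_offset;

    if (!(reg->flags & VFIO_REGION_INFO_FLAG_CAPS) || off == 0) {
        return NULL;
    }
    /* each header stores the offset of the next one; 0 terminates */
    while (off != 0 &&
           off + sizeof(struct vfio_info_cap_header) <= reg->argsz) {
        struct vfio_info_cap_header *hdr =
            (struct vfio_info_cap_header *)((char *)reg + off);

        if (hdr->id == id) {
            return hdr;
        }
        off = hdr->next;
    }
    return NULL;
}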
@@ -508,173 +1097,81 @@ dev_get_info(struct vfio_device_info *dev_info) return 0; } -static long -do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data) -{ - int err = -ENOTSUP; - - assert(lm_ctx != NULL); - switch (cmd_ioctl->vfio_cmd) { - case VFIO_DEVICE_GET_INFO: - err = dev_get_info(&cmd_ioctl->data.dev_info); - break; - case VFIO_DEVICE_GET_REGION_INFO: - err = dev_get_reginfo(lm_ctx, &cmd_ioctl->data.reg_info); - break; - case VFIO_DEVICE_GET_IRQ_INFO: - err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info); - break; - case VFIO_DEVICE_SET_IRQS: - err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data); - break; - case VFIO_DEVICE_RESET: - if (lm_ctx->reset != NULL) { - return lm_ctx->reset(lm_ctx->pvt); - } - lm_log(lm_ctx, LM_DBG, "reset called but not reset function present\n"); - break; - } - - return err; -} - -static void -get_path_from_fd(lm_ctx_t *lm_ctx, int fd, char *buf) -{ - int err; - ssize_t ret; - char pathname[PATH_MAX]; - - err = snprintf(pathname, PATH_MAX, "/proc/self/fd/%d", fd); - if (err >= PATH_MAX || err == -1) { - buf[0] = '\0'; - } - ret = readlink(pathname, buf, PATH_MAX); - if (ret == -1) { - lm_log(lm_ctx, LM_DBG, "failed to readlink %s: %m\n", pathname); - ret = 0; - } else if (ret == PATH_MAX) { - lm_log(lm_ctx, LM_DBG, "failed to readlink %s, output truncated\n", - pathname); - ret -= 1; - } - buf[ret] = '\0'; -} - -static int -muser_dma_unmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) -{ - int err; - char buf[PATH_MAX]; - - get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf); - - lm_log(lm_ctx, LM_INF, "removing DMA region fd=%d path=%s %#lx-%#lx\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len); - - if (lm_ctx->dma == NULL) { - lm_log(lm_ctx, LM_ERR, "DMA not initialized\n"); - return -EINVAL; - } - - err = dma_controller_remove_region(lm_ctx->dma, - cmd->mmap.request.addr, - cmd->mmap.request.len, - cmd->mmap.request.fd); - if (err != 0) { - lm_log(lm_ctx, LM_ERR, "failed to remove DMA region fd=%d path=%s %#lx-%#lx: %s\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len, - strerror(err)); - } - - return err; -} - -static int -muser_dma_map(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) -{ - int err; - char buf[PATH_MAX]; - - get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf); - - lm_log(lm_ctx, LM_INF, "adding DMA region fd=%d path=%s iova=%#lx-%#lx offset=%#lx\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len, - cmd->mmap.request.offset); - - if (lm_ctx->dma == NULL) { - lm_log(lm_ctx, LM_ERR, "DMA not initialized\n"); - return -EINVAL; - } - - err = dma_controller_add_region(lm_ctx, lm_ctx->dma, - cmd->mmap.request.addr, - cmd->mmap.request.len, - cmd->mmap.request.fd, - cmd->mmap.request.offset); - if (err < 0) { - lm_log(lm_ctx, LM_ERR, "failed to add DMA region fd=%d path=%s %#lx-%#lx: %d\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len, err); - } - - return 0; +int +muser_send_fds(int sock, int *fds, size_t count) { + struct msghdr msg = { 0 }; + size_t size = count * sizeof *fds; + char buf[CMSG_SPACE(size)]; + memset(buf, '\0', sizeof(buf)); + + /* XXX requires at least one byte */ + struct iovec io = { .iov_base = "\0", .iov_len = 1 }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg); + 
cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(size); + memcpy(CMSG_DATA(cmsg), fds, size); + msg.msg_controllen = CMSG_SPACE(size); + return sendmsg(sock, &msg, 0); } -/* - * Callback that is executed when device memory is to be mmap'd. - */ -static int -muser_mmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) +ssize_t +muser_recv_fds(int sock, int *fds, size_t count) { - int region, err = 0; - unsigned long addr; - unsigned long len = cmd->mmap.request.len; - loff_t offset = cmd->mmap.request.addr; + int ret; + struct cmsghdr *cmsg; + size_t fds_size; + char msg_buf[sysconf(_SC_PAGESIZE)]; + struct iovec io = {.iov_base = msg_buf, .iov_len = sizeof(msg_buf)}; + char cmsg_buf[sysconf(_SC_PAGESIZE)]; + struct msghdr msg = { + .msg_iov = &io, + .msg_iovlen = 1, + .msg_control = cmsg_buf, + .msg_controllen = sizeof(cmsg_buf) + }; - region = lm_get_region(offset, len, &offset); - if (region < 0) { - lm_log(lm_ctx, LM_ERR, "bad region %d\n", region); - err = EINVAL; - goto out; + if (fds == NULL || count <= 0) { + errno = EINVAL; + return -1; } - if (lm_ctx->pci_info.reg_info[region].map == NULL) { - lm_log(lm_ctx, LM_ERR, "region not mmapable\n"); - err = ENOTSUP; - goto out; + ret = recvmsg(sock, &msg, 0); + if (ret == -1) { + return ret; } - addr = lm_ctx->pci_info.reg_info[region].map(lm_ctx->pvt, offset, len); - if ((void *)addr == MAP_FAILED) { - err = errno; - lm_log(lm_ctx, LM_ERR, "failed to mmap: %m\n"); - goto out; + cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg == NULL) { + errno = EINVAL; + return -1; } - cmd->mmap.response = addr; - -out: - if (err != 0) { - lm_log(lm_ctx, LM_ERR, "failed to mmap device memory %#x-%#lx: %s\n", - offset, offset + len, strerror(err)); + fds_size = cmsg->cmsg_len - sizeof *cmsg; + if ((fds_size % sizeof(int)) != 0 || fds_size / sizeof (int) > count) { + errno = EINVAL; + return -1; } + memcpy((void*)fds, CMSG_DATA(cmsg), cmsg->cmsg_len - sizeof *cmsg); - return -err; + return fds_size / sizeof(int); } /* - * Returns the number of bytes communicated to the kernel (may be less than - * ret), or a negative number on error. + * Returns the number of bytes sent (may be less than ret), or a negative + * number on error. 
*/ static int post_read(lm_ctx_t *lm_ctx, char *rwbuf, ssize_t count) { ssize_t ret; - ret = write(lm_ctx->fd, rwbuf, count); + ret = write(lm_ctx->conn_fd, rwbuf, count); if (ret != count) { lm_log(lm_ctx, LM_ERR, "%s: bad muser write: %lu/%lu, %s\n", __func__, ret, count, strerror(errno)); @@ -719,17 +1216,274 @@ handle_pci_config_space_access(lm_ctx_t *lm_ctx, char *buf, size_t count, int ret; count = MIN(pci_config_space_size(lm_ctx), count); - ret = cap_maybe_access(lm_ctx->caps, lm_ctx->pvt, buf, count, pos, is_write); + if (is_write) { + ret = cap_maybe_access(lm_ctx, lm_ctx->caps, buf, count, pos); + if (ret < 0) { + lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count, + pos); + return ret; + } + } else { + memcpy(buf, lm_ctx->pci_config_space->raw + pos, count); + } + return count; +} + +/* valid migration state transitions */ +__u32 migration_states[VFIO_DEVICE_STATE_MASK] = { + [VFIO_DEVICE_STATE_STOP] = 1 << VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING] = /* running */ + (1 << VFIO_DEVICE_STATE_STOP) | + (1 << VFIO_DEVICE_STATE_RUNNING) | + (1 << VFIO_DEVICE_STATE_SAVING) | + (1 << (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) | + (1 << VFIO_DEVICE_STATE_RESUMING), + [VFIO_DEVICE_STATE_SAVING] = /* stop-and-copy */ + (1 << VFIO_DEVICE_STATE_STOP) | + (1 << VFIO_DEVICE_STATE_SAVING), + [VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING] = /* pre-copy */ + (1 << VFIO_DEVICE_STATE_SAVING) | + (1 << VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING), + [VFIO_DEVICE_STATE_RESUMING] = /* resuming */ + (1 << VFIO_DEVICE_STATE_RUNNING) | + (1 << VFIO_DEVICE_STATE_RESUMING) +}; + +static bool +_migration_state_transition_is_valid(__u32 from, __u32 to) +{ + return migration_states[from] & (1 << to); +} + +static ssize_t +handle_migration_device_state(lm_ctx_t *lm_ctx, __u32 *device_state, + bool is_write) { + + int ret; + + assert(lm_ctx != NULL); + assert(device_state != NULL); + + if (!is_write) { + *device_state = lm_ctx->migration.info.device_state; + return 0; + } + + if (*device_state & ~VFIO_DEVICE_STATE_MASK) { + return -EINVAL; + } + + if (!_migration_state_transition_is_valid(lm_ctx->migration.info.device_state, + *device_state)) { + return -EINVAL; + } + + switch (*device_state) { + case VFIO_DEVICE_STATE_STOP: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_STOP); + break; + case VFIO_DEVICE_STATE_RUNNING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_START); + break; + case VFIO_DEVICE_STATE_SAVING: + /* + * FIXME How should the device operate during the stop-and-copy + * phase? Should we only allow the migration data to be read from + * the migration region? E.g. Access to any other region should be + * failed? This might be a good question to send to LKML. 
+ */ + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_STOP_AND_COPY); + break; + case VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_PRE_COPY); + break; + case VFIO_DEVICE_STATE_RESUMING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_RESUME); + break; + default: + ret = -EINVAL; + } + + if (ret == 0) { + lm_ctx->migration.info.device_state = *device_state; + } + + return ret; +} + +static ssize_t +handle_migration_pending_bytes(lm_ctx_t *lm_ctx, __u64 *pending_bytes, + bool is_write) +{ + assert(lm_ctx != NULL); + assert(pending_bytes != NULL); + + if (is_write) { + return -EINVAL; + } + + if (lm_ctx->migration.iter.state == VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED) { + *pending_bytes = 0; + return 0; + } + + *pending_bytes = lm_ctx->migration.callbacks.get_pending_bytes(lm_ctx->pvt); + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL: + case VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED: + /* + * FIXME what happens if data haven't been consumed in the previous + * iteration? Ask on LKML. + */ + if (*pending_bytes == 0) { + lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED; + } else { + lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_STARTED; + } + break; + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + /* + * Repeated reads of pending_bytes should not have any side effects. + * FIXME does it have to be the same as the previous value? Can it + * increase or even decrease? I suppose it can't be lower than + * data_size? Ask on LKML. + */ + break; + default: + return -EINVAL; + } + return 0; +} + +static ssize_t +handle_migration_data_offset(lm_ctx_t *lm_ctx, __u64 *offset, bool is_write) +{ + int ret; + + assert(lm_ctx != NULL); + assert(offset != NULL); + + if (is_write) { + return -EINVAL; + } + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + break; + default: + /* + * FIXME it's not clear whether these registers can be accessed in + * other parts of the iteration, need clarification on the + * following: + * + * Read on data_offset and data_size should return the offset and + * size of the current buffer if the user application reads + * data_offset and data_size more than once here. 
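+ * For now the access is only accepted while the iteration is in the
+ * STARTED state and fails with -EINVAL otherwise.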
+ */ + return -EINVAL; + } + + ret = lm_ctx->migration.callbacks.prepare_data(lm_ctx->pvt, + &lm_ctx->migration.iter.offset, + &lm_ctx->migration.iter.size); if (ret < 0) { - lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count, - pos); return ret; } - return count; + + *offset = lm_ctx->migration.iter.offset + sizeof(struct vfio_device_migration_info); + + return ret; +} + +static ssize_t +handle_migration_data_size(lm_ctx_t *lm_ctx, __u64 *size, bool is_write) +{ + assert(lm_ctx != NULL); + assert(size != NULL); + + if (is_write) { + return -EINVAL; + } + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + break; + default: + /* FIXME see comment in handle_migration_data_offset */ + return -EINVAL; + } + + *size = lm_ctx->migration.iter.size; + + return 0; } static ssize_t -do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) +handle_migration_region_access(lm_ctx_t *lm_ctx, char *buf, size_t count, + loff_t pos, bool is_write) +{ + int ret; + + assert(lm_ctx != NULL); + assert(buf != NULL); + + if (pos + count > lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size) { + lm_log(lm_ctx, LM_ERR, "read %#x-%#x past end of migration region", + pos, pos + count - 1); + return -EINVAL; + } + switch (pos) { + case offsetof(struct vfio_device_migration_info, device_state): + if (count != sizeof(lm_ctx->migration.info.device_state)) { + return -EINVAL; + } + ret = handle_migration_device_state(lm_ctx, (__u32*)buf, + is_write); + break; + case offsetof(struct vfio_device_migration_info, pending_bytes): + if (count != sizeof(lm_ctx->migration.info.pending_bytes)) { + return -EINVAL; + } + ret = handle_migration_pending_bytes(lm_ctx, (__u64*)buf, is_write); + break; + case offsetof(struct vfio_device_migration_info, data_offset): + if (count != sizeof(lm_ctx->migration.info.data_offset)) { + return -EINVAL; + } + ret = handle_migration_data_offset(lm_ctx, (__u64*)buf, is_write); + break; + case offsetof(struct vfio_device_migration_info, data_size): + if (count != sizeof(lm_ctx->migration.info.data_size)) { + return -EINVAL; + } + ret = handle_migration_data_size(lm_ctx, (__u64*)buf, is_write); + break; + default: + if (is_write) { + /* FIXME how do we handle the offset? */ + ret = lm_ctx->migration.callbacks.write_data(lm_ctx->pvt, + buf, count); + } else { + ret = lm_ctx->migration.callbacks.read_data(lm_ctx->pvt, + buf, count, + pos - sizeof(struct vfio_device_migration_info)); + } + } + + if (ret == 0) { + ret = count; + } + return ret; +} + +static ssize_t +do_access(lm_ctx_t *lm_ctx, char *buf, uint8_t count, uint64_t pos, bool is_write) { int idx; loff_t offset; @@ -737,7 +1491,7 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) assert(lm_ctx != NULL); assert(buf != NULL); - assert(count > 0); + assert(count == 1 || count == 2 || count == 4 || count == 8); pci_info = &lm_ctx->pci_info; idx = lm_get_region(pos, count, &offset); @@ -756,6 +1510,11 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) is_write); } + if (idx == LM_DEV_MIGRATION_REG_IDX) { + return handle_migration_region_access(lm_ctx, buf, count, offset, + is_write); + } + /* * Checking whether a callback exists might sound expensive however this * code is not performance critical. This works well when we don't expect a @@ -777,12 +1536,15 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) * error. 
* * TODO function name same lm_access_t, fix + * FIXME we must be able to return values up to uint32_t bit, or negative on + * error. Better to make return value an int and return the number of bytes + * processed via an argument. */ ssize_t -lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, +lm_access(lm_ctx_t *lm_ctx, char *buf, uint32_t count, uint64_t *ppos, bool is_write) { - unsigned int done = 0; + uint32_t done = 0; int ret; assert(lm_ctx != NULL); @@ -792,7 +1554,10 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, size_t size; /* * Limit accesses to qword and enforce alignment. Figure out whether - * the PCI spec requires this. + * the PCI spec requires this + * FIXME while this makes sense for registers, we might be able to relax + * this requirement and make some transfers more efficient. Maybe make + * this a per-region option that can be set by the user? */ if (count >= 8 && !(*ppos % 8)) { size = 8; @@ -805,15 +1570,16 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, } ret = do_access(lm_ctx, buf, size, *ppos, is_write); if (ret <= 0) { - lm_log(lm_ctx, LM_ERR, "failed to %s %llx@%lx: %s\n", - is_write ? "write" : "read", size, *ppos, strerror(-ret)); + lm_log(lm_ctx, LM_ERR, "failed to %s %#lx-%#lx: %s", + is_write ? "write to" : "read from", *ppos, *ppos + size - 1, + strerror(-ret)); /* * TODO if ret < 0 then it might contain a legitimate error code, why replace it with EFAULT? */ return -EFAULT; } if (ret != (int)size) { - lm_log(lm_ctx, LM_DBG, "bad read %d != %d\n", ret, size); + lm_log(lm_ctx, LM_DBG, "bad read %d != %d", ret, size); } count -= size; done += size; @@ -824,50 +1590,54 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, } static inline int -muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write) +muser_access(lm_ctx_t *lm_ctx, bool is_write, void **data, uint32_t count, + uint64_t *pos) { + struct vfio_user_region_access *region_access; char *rwbuf; int err; - size_t count = 0, _count; - ssize_t ret; + uint32_t processed = 0, _count; + int ret; + + assert(pos != NULL); /* TODO how big do we expect count to be? Can we use alloca(3) instead? */ - rwbuf = calloc(1, cmd->rw.count); - if (rwbuf == NULL) { + region_access = calloc(1, sizeof(*region_access) + count); + if (region_access == NULL) { lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n"); return -1; } + rwbuf = (char*)(region_access + 1); -#ifndef LM_TERSE_LOGGING - lm_log(lm_ctx, LM_DBG, "%s %x@%lx\n", is_write ? "W" : "R", cmd->rw.count, - cmd->rw.pos); -#endif + lm_log(lm_ctx, LM_DBG, "%s %#lx-%#lx", is_write ? 
"W" : "R", *pos, + *pos + count - 1); - /* copy data to be written from kernel to user space */ + /* receive data to be written */ if (is_write) { - err = read(lm_ctx->fd, rwbuf, cmd->rw.count); + err = read(lm_ctx->conn_fd, rwbuf, count); /* * FIXME this is wrong, we should be checking for - * err != cmd->rw.count + * err != count */ if (err < 0) { - lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s\n", + lm_log(lm_ctx, LM_ERR, "failed to receive write payload: %s", strerror(errno)); goto out; } err = 0; -#ifndef LM_TERSE_LOGGING - dump_buffer(lm_ctx, "buffer write", rwbuf, cmd->rw.count); +#ifdef LM_VERBOSE_LOGGING + dump_buffer("buffer write", rwbuf, count); #endif } - count = _count = cmd->rw.count; - cmd->err = muser_pci_hdr_access(lm_ctx, &_count, &cmd->rw.pos, - is_write, rwbuf); - if (cmd->err) { - lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %d\n", cmd->err); -#ifndef LM_TERSE_LOGGING - dump_buffer(lm_ctx, "buffer write", rwbuf, _count); + _count = count; + ret = muser_pci_hdr_access(lm_ctx, &_count, pos, is_write, rwbuf); + if (ret != 0) { + /* FIXME shouldn't we fail here? */ + lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %s", + strerror(-ret)); +#ifdef LM_VERBOSE_LOGGING + dump_buffer("buffer write", rwbuf, _count); #endif } @@ -875,150 +1645,618 @@ muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write) * count is how much has been processed by muser_pci_hdr_access, * _count is how much there's left to be processed by lm_access */ - count -= _count; - ret = lm_access(lm_ctx, rwbuf + count, _count, &cmd->rw.pos, - is_write); - if (!is_write && ret >= 0) { - ret += count; - err = post_read(lm_ctx, rwbuf, ret); - if (!LM_TERSE_LOGGING && err == ret) { - dump_buffer(lm_ctx, "buffer read", rwbuf, ret); + processed = count - _count; + ret = lm_access(lm_ctx, rwbuf + processed, _count, pos, is_write); + if (ret >= 0) { + ret += processed; + if (data != NULL) { + /* + * FIXME the spec doesn't specify whether the reset of the + * region_access struct needs to be populated. 
+ */ + region_access->count = ret; + *data = region_access; + return ret; + } else if (!is_write) { + err = post_read(lm_ctx, rwbuf, ret); +#ifdef LM_VERBOSE_LOGGING + if (err == ret) { + dump_buffer("buffer read", rwbuf, ret); + } +#endif } } out: - free(rwbuf); + free(region_access); - return err; + return ret; +} + +static int handle_device_get_region_info(lm_ctx_t *lm_ctx, + struct vfio_user_header *hdr, + struct vfio_region_info **dev_reg_info) +{ + struct vfio_region_info *reg_info; + int ret; + + reg_info = calloc(sizeof(*reg_info), 1); + if (reg_info == NULL) { + return -ENOMEM; + } + + if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*reg_info)) { + free(reg_info); + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, reg_info, sizeof(*reg_info), 0); + if (ret < 0) { + free(reg_info); + return -errno; + } + + ret = dev_get_reginfo(lm_ctx, ®_info); + if (ret < 0) { + free(reg_info); + return ret; + } + *dev_reg_info = reg_info; + + return 0; +} + +static int handle_device_get_info(lm_ctx_t *lm_ctx, + struct vfio_user_header *hdr, + struct vfio_device_info *dev_info) +{ + int ret; + + if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*dev_info)) { + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, dev_info, sizeof(*dev_info), 0); + if (ret < 0) { + return -errno; + } + + ret = dev_get_info(lm_ctx, dev_info); + if (ret < 0) { + return ret; + } + + lm_log(lm_ctx, LM_DBG, "sent devinfo flags %#x, num_regions %d, num_irqs" + " %d", dev_info->flags, dev_info->num_regions, dev_info->num_irqs); + return ret; } static int -muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) +handle_device_get_irq_info(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct vfio_irq_info *irq_info) { - void *data = NULL; - size_t size = 0; int ret; - /* TODO make this a function that returns the size */ - if (cmd->ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS) { - uint32_t flags = cmd->ioctl.data.irq_set.flags; - switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) { + assert(lm_ctx != NULL); + assert(irq_info != NULL); + + hdr->msg_size -= sizeof *hdr; + + if (hdr->msg_size != sizeof *irq_info) { + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, irq_info, hdr->msg_size, 0); + if (ret < 0) { + return -errno; + } + if (ret != (int)hdr->msg_size) { + assert(false); /* FIXME */ + } + + return dev_get_irqinfo(lm_ctx, irq_info); +} + +static int +handle_device_set_irqs(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + int *fds, int nr_fds) +{ + int ret; + struct vfio_irq_set *irq_set; + void *data; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + + hdr->msg_size -= sizeof *hdr; + + if (hdr->msg_size < sizeof *irq_set) { + return -EINVAL; + } + + irq_set = alloca(hdr->msg_size); /* FIXME */ + + ret = recv(lm_ctx->conn_fd, irq_set, hdr->msg_size, 0); + if (ret < 0) { + return -errno; + } + if (ret != (int)hdr->msg_size) { + assert(false); /* FIXME */ + } + if (ret != (int)irq_set->argsz) { + assert(false); /* FIXME */ + } + switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { case VFIO_IRQ_SET_DATA_EVENTFD: - size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count; + data = fds; + if (nr_fds != (int)irq_set->count) { + return -EINVAL; + } break; case VFIO_IRQ_SET_DATA_BOOL: - size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count; + data = irq_set + 1; break; + } + + return dev_set_irqs(lm_ctx, irq_set, data); +} + +static int +handle_dma_map_or_unmap(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, bool map, + int *fds, int nr_fds) +{ + int ret, i; + int nr_dma_regions; + struct vfio_user_dma_region *dma_regions; + + 
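+ /*
+  * The payload is an array of struct vfio_user_dma_region entries; for a
+  * map request one file descriptor per region is expected, passed as
+  * SCM_RIGHTS ancillary data.
+  */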
assert(lm_ctx != NULL); + assert(hdr != NULL); + + hdr->msg_size -= sizeof *hdr; + + if (hdr->msg_size % sizeof(struct vfio_user_dma_region) != 0) { + lm_log(lm_ctx, LM_ERR, "bad size of DMA regions %d", hdr->msg_size); + return -EINVAL; + } + + nr_dma_regions = (int)(hdr->msg_size / sizeof(struct vfio_user_dma_region)); + if (map && nr_dma_regions != nr_fds) { + lm_log(lm_ctx, LM_ERR, "expected %d fds but got %d instead", + nr_dma_regions, nr_fds); + return -EINVAL; + } + + dma_regions = alloca(nr_dma_regions * sizeof(*dma_regions)); + + ret = recv(lm_ctx->conn_fd, dma_regions, hdr->msg_size, 0); + if (ret == -1) { + lm_log(lm_ctx, LM_ERR, "failed to receive DMA region entries: %m"); + return -errno; + } + + if (lm_ctx->dma == NULL) { + return 0; + } + + for (i = 0; i < nr_dma_regions; i++) { + if (map) { + if (dma_regions[i].flags != VFIO_USER_F_DMA_REGION_MAPPABLE) { + /* + * FIXME implement non-mappable DMA regions. This requires changing + * dma.c to not take a file descriptor. + */ + assert(false); + } + + ret = dma_controller_add_region(lm_ctx->dma, + dma_regions[i].addr, + dma_regions[i].size, + fds[i], + dma_regions[i].offset); + if (ret < 0) { + lm_log(lm_ctx, LM_INF, + "failed to add DMA region %#lx-%#lx offset=%#lx fd=%d: %s", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1, + dma_regions[i].offset, fds[i], + strerror(-ret)); + } else { + lm_log(lm_ctx, LM_DBG, + "added DMA region %#lx-%#lx offset=%#lx fd=%d", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1, + dma_regions[i].offset, fds[i]); + } + } else { + ret = dma_controller_remove_region(lm_ctx->dma, + dma_regions[i].addr, + dma_regions[i].size, + lm_ctx->unmap_dma, lm_ctx->pvt); + if (ret < 0) { + lm_log(lm_ctx, LM_INF, + "failed to remove DMA region %#lx-%#lx: %s", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1, + strerror(-ret)); + } else { + lm_log(lm_ctx, LM_DBG, + "removed DMA region %#lx-%#lx", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1); + } + } + if (ret < 0) { + return ret; + } + if (lm_ctx->map_dma != NULL) { + lm_ctx->map_dma(lm_ctx->pvt, dma_regions[i].addr, dma_regions[i].size); } } + return 0; +} - if (size != 0) { - data = calloc(1, size); - if (data == NULL) { -#ifdef DEBUG - perror("calloc"); -#endif - return -1; +static int +handle_device_reset(lm_ctx_t *lm_ctx) +{ + return device_reset(lm_ctx); +} + +static int +handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + void **data, size_t *len) +{ + struct vfio_user_region_access region_access; + uint64_t count, offset; + int ret; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(data != NULL); + + /* + * TODO Since muser_access doesn't have to handle the kernel case any more, + * we can avoid having to do an additional read/recv inside muser_access + * (one recv for struct region_access and another for the write data) by + * doing a single recvmsg here with an iovec where the first element of the + * array will be struct vfio_user_region_access and the second a buffer if + * it's a write. The size of the write buffer is: hdr->msg_size - sizeof + * *hdr - sizeof region_access, and should be equal to region_access.count. 
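+ * Roughly (untested sketch, wbuf being the write payload buffer):
+ *
+ *   struct iovec iov[2] = {
+ *       { .iov_base = &region_access, .iov_len = sizeof region_access },
+ *       { .iov_base = wbuf,
+ *         .iov_len = hdr->msg_size - sizeof *hdr - sizeof region_access }
+ *   };
+ *   struct msghdr m = { .msg_iov = iov, .msg_iovlen = 2 };
+ *   recvmsg(lm_ctx->conn_fd, &m, MSG_WAITALL);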
+ */ + + hdr->msg_size -= sizeof *hdr; + if (hdr->msg_size < sizeof region_access) { + lm_log(lm_ctx, LM_ERR, "message size too small (%d)", hdr->msg_size); + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, ®ion_access, sizeof region_access, 0); + if (ret == -1) { + lm_log(lm_ctx, LM_ERR, "failed to recv: %m"); + return -errno; + } + if (ret != sizeof region_access) { + lm_log(lm_ctx, LM_ERR, "bad region_access size %d", ret); + return -EINVAL; + } + if (region_access.region >= LM_DEV_NUM_REGS || region_access.count <= 0 ) { + lm_log(lm_ctx, LM_ERR, "bad region %d and/or count %d", + region_access.region, region_access.count); + return -EINVAL; + } + count = region_access.count; + offset = region_to_offset(region_access.region) + region_access.offset; + + ret = muser_access(lm_ctx, hdr->cmd == VFIO_USER_REGION_WRITE, + data, count, &offset); + if (ret != (int)region_access.count) { + lm_log(lm_ctx, LM_ERR, "bad region access acount, expected=%d, actual=%d", + region_access.count, ret); + /* FIXME we should return whatever has been accessed, not an error */ + if (ret >= 0) { + ret = -EINVAL; } + return ret; + } - ret = read(lm_ctx->fd, data, size); - if (ret < 0) { -#ifdef DEBUG - perror("read failed"); -#endif + *len = sizeof(region_access); + if (hdr->cmd == VFIO_USER_REGION_READ) { + *len += region_access.count; + } + + return 0; +} + +static int +handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + int size, ret; + size_t i; + struct vfio_iommu_type1_dirty_bitmap_get *ranges; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + size = hdr->msg_size - sizeof(*hdr) - sizeof(struct vfio_iommu_type1_dirty_bitmap); + if (size % sizeof(struct vfio_iommu_type1_dirty_bitmap_get) != 0) { + return -EINVAL; + } + ranges = malloc(size); + if (ranges == NULL) { + return -errno; + } + ret = recv(lm_ctx->conn_fd, ranges, size, 0); + if (ret == -1) { + ret = -errno; + goto out; + } + if (ret != size) { + ret = -EINVAL; + goto out; + } + *nr_iovecs = 1 + size / sizeof(struct vfio_iommu_type1_dirty_bitmap_get); + *iovecs = malloc(*nr_iovecs * sizeof(struct iovec)); + if (*iovecs == NULL) { + ret = -errno; + goto out; + } + + for (i = 1; i < *nr_iovecs; i++) { + struct vfio_iommu_type1_dirty_bitmap_get *r = &ranges[(i - 1)]; /* FIXME ugly indexing */ + ret = dma_controller_dirty_page_get(lm_ctx->dma, r->iova, r->size, + r->bitmap.pgsize, r->bitmap.size, + (char**)&((*iovecs)[i].iov_base)); + if (ret != 0) { goto out; } + (*iovecs)[i].iov_len = r->bitmap.size; } +out: + if (ret != 0) { + if (*iovecs != NULL) { + free(*iovecs); + *iovecs = NULL; + } + } + free(ranges); + return ret; +} - ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, data); +static int +handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap; + int ret; -out: + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + if (hdr->msg_size - sizeof *hdr < sizeof dirty_bitmap) { + lm_log(lm_ctx, LM_ERR, "invalid header size %lu", hdr->msg_size); + return -EINVAL; + } + + /* FIXME must also check argsz */ + + ret = recv(lm_ctx->conn_fd, &dirty_bitmap, sizeof dirty_bitmap, 0); + if (ret == -1) { + return -errno; + } + if ((size_t)ret < sizeof dirty_bitmap) { + return -EINVAL; + } + + if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + ret = 
dma_controller_dirty_page_logging_start(lm_ctx->dma, + lm_ctx->migration.pgsize); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { + ret = handle_dirty_pages_get(lm_ctx, hdr, iovecs, nr_iovecs); + } else { + ret = -EINVAL; + } - free(data); return ret; } +/* + * FIXME return value is messed up, sometimes we return -1 and set errno while + * other times we return -errno. Fix. + */ + static int -drive_loop(lm_ctx_t *lm_ctx) +process_request(lm_ctx_t *lm_ctx) { - struct muser_cmd cmd = { 0 }; - int err; + struct vfio_user_header hdr = { 0, }; + int ret; + int *fds = NULL; + int nr_fds; + struct vfio_irq_info irq_info; + struct vfio_device_info dev_info; + struct vfio_region_info *dev_reg_info = NULL; + struct iovec _iovecs[2] = { { 0, } }; + struct iovec *iovecs = NULL; + size_t nr_iovecs = 0; + bool free_iovec_data = true; - do { - err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd); - if (err < 0) { - return err; + assert(lm_ctx != NULL); + + if (lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size > 0 && + lm_ctx->migration.info.device_state == VFIO_DEVICE_STATE_STOP) { + return -ESHUTDOWN; + } + + nr_fds = lm_ctx->client_max_fds; + fds = alloca(nr_fds * sizeof(int)); + + /* FIXME get request shouldn't set errno, it should return it as -errno */ + ret = transports_ops[lm_ctx->trans].get_request(lm_ctx, &hdr, fds, &nr_fds); + if (unlikely(ret < 0)) { + if (ret == -EAGAIN || ret == -EWOULDBLOCK) { + return 0; + } + if (ret != -EINTR) { + lm_log(lm_ctx, LM_ERR, "failed to receive request: %s", strerror(-ret)); } + return ret; + } + if (unlikely(ret == 0)) { + if (errno == EINTR) { + return -EINTR; + } + if (errno == 0) { + lm_log(lm_ctx, LM_INF, "VFIO client closed connection"); + } else { + lm_log(lm_ctx, LM_ERR, "end of file: %m"); + } + return -ENOTCONN; + } + + if (ret < (int)sizeof hdr) { + lm_log(lm_ctx, LM_ERR, "short header read %d", ret); + return -EINVAL; + } - switch (cmd.type) { - case MUSER_IOCTL: - err = muser_ioctl(lm_ctx, &cmd); + if (hdr.flags.type != VFIO_USER_F_TYPE_COMMAND) { + lm_log(lm_ctx, LM_ERR, "header not a request"); + return -EINVAL; + } + + if (hdr.msg_size < sizeof hdr) { + lm_log(lm_ctx, LM_ERR, "bad size in header %d", hdr.msg_size); + return -EINVAL; + } + + /* FIXME in most of the following function we check that hdr.count is >= + * than the command-specific struct and there is an additional recv(2) for + * that data. We should eliminate duplicating this common code and move it + * here. 
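+ * E.g. a helper along the lines of recv_payload(lm_ctx, &hdr, buf, size)
+ * (name hypothetical) could do the size check and the recv(2) in one
+ * place.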
+ */ + + switch (hdr.cmd) { + case VFIO_USER_DMA_MAP: + case VFIO_USER_DMA_UNMAP: + ret = handle_dma_map_or_unmap(lm_ctx, &hdr, + hdr.cmd == VFIO_USER_DMA_MAP, + fds, nr_fds); break; - case MUSER_READ: - case MUSER_WRITE: - err = muser_access(lm_ctx, &cmd, cmd.type == MUSER_WRITE); + case VFIO_USER_DEVICE_GET_INFO: + ret = handle_device_get_info(lm_ctx, &hdr, &dev_info); + if (ret == 0) { + _iovecs[1].iov_base = &dev_info; + _iovecs[1].iov_len = dev_info.argsz; + iovecs = _iovecs; + nr_iovecs = 2; + } break; - case MUSER_MMAP: - err = muser_mmap(lm_ctx, &cmd); + case VFIO_USER_DEVICE_GET_REGION_INFO: + ret = handle_device_get_region_info(lm_ctx, &hdr, &dev_reg_info); + if (ret == 0) { + _iovecs[1].iov_base = dev_reg_info; + _iovecs[1].iov_len = dev_reg_info->argsz; + iovecs = _iovecs; + nr_iovecs = 2; + } + break; + case VFIO_USER_DEVICE_GET_IRQ_INFO: + ret = handle_device_get_irq_info(lm_ctx, &hdr, &irq_info); + if (ret == 0) { + _iovecs[1].iov_base = &irq_info; + _iovecs[1].iov_len = sizeof irq_info; + iovecs = _iovecs; + nr_iovecs = 2; + } break; - case MUSER_DMA_MMAP: - err = muser_dma_map(lm_ctx, &cmd); + case VFIO_USER_DEVICE_SET_IRQS: + ret = handle_device_set_irqs(lm_ctx, &hdr, fds, nr_fds); break; - case MUSER_DMA_MUNMAP: - err = muser_dma_unmap(lm_ctx, &cmd); + case VFIO_USER_REGION_READ: + case VFIO_USER_REGION_WRITE: + iovecs = _iovecs; + ret = handle_region_access(lm_ctx, &hdr, &iovecs[1].iov_base, + &iovecs[1].iov_len); + nr_iovecs = 2; + break; + case VFIO_USER_DEVICE_RESET: + ret = handle_device_reset(lm_ctx); + break; + case VFIO_USER_DIRTY_PAGES: + ret = handle_dirty_pages(lm_ctx, &hdr, &iovecs, &nr_iovecs); + if (ret >= 0) { + free_iovec_data = false; + } break; default: - lm_log(lm_ctx, LM_ERR, "bad command %d\n", cmd.type); - continue; - } - cmd.err = err; - err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_DONE, &cmd); - if (err < 0) { - lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n", - strerror(errno)); + lm_log(lm_ctx, LM_ERR, "bad command %d", hdr.cmd); + return -EINVAL; + } + + /* + * TODO: In case of error during command handling set errno respectively + * in the reply message. + */ + if (ret < 0) { + lm_log(lm_ctx, LM_ERR, "failed to handle command %d: %s", hdr.cmd, + strerror(-ret)); + assert(false); /* FIXME */ + } + ret = _send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true, + 0, iovecs, nr_iovecs, NULL, 0); + if (unlikely(ret < 0)) { + lm_log(lm_ctx, LM_ERR, "failed to complete command: %s", + strerror(-ret)); + } + if (iovecs != NULL && iovecs != _iovecs) { + if (free_iovec_data) { + size_t i; + for (i = 0; i < nr_iovecs; i++) { + free(iovecs[i].iov_base); + } } - // TODO: Figure out a clean way to get out of the loop. - } while (1); + free(iovecs); + } - return err; + return ret; } int lm_ctx_drive(lm_ctx_t *lm_ctx) { + int err; + if (lm_ctx == NULL) { errno = EINVAL; return -1; } - return drive_loop(lm_ctx); -} + do { + err = process_request(lm_ctx); + } while (err >= 0); -static int -dev_detach(int dev_fd) -{ - return close(dev_fd); + return err; } -static int -dev_attach(const char *uuid) +int +lm_ctx_poll(lm_ctx_t *lm_ctx) { - char *path; - int dev_fd; int err; - err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", uuid); - if (err != (int)(strlen(MUSER_DEVNODE) + strlen(uuid) + 6)) { - return -1; + if (unlikely((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0)) { + return -ENOTSUP; } - dev_fd = open(path, O_RDWR); - - free(path); + err = process_request(lm_ctx); - return dev_fd; + return err >= 0 ? 
0 : err; } +/* FIXME this is not enough anymore, check muser_mmap */ void * lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length) { @@ -1035,38 +2273,64 @@ lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length) lm_ctx->fd, offset); } -int -lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t vector) +static int validate_irq_subindex(lm_ctx_t *lm_ctx, uint32_t subindex) { - eventfd_t val = 1; - if ((lm_ctx == NULL) || (vector >= lm_ctx->irqs.max_ivs)) { - lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", vector, + if ((lm_ctx == NULL) || (subindex >= lm_ctx->irqs.max_ivs)) { + lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", subindex, lm_ctx->irqs.max_ivs); + /* FIXME should return -errno */ errno = EINVAL; return -1; } - if (lm_ctx->irqs.efds[vector] == -1) { - lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", vector); + return 0; +} + +int +lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex) +{ + int ret; + eventfd_t val = 1; + + ret = validate_irq_subindex(lm_ctx, subindex); + if (ret < 0) { + return ret; + } + + if (lm_ctx->irqs.efds[subindex] == -1) { + lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", subindex); + /* FIXME should return -errno */ errno = ENOENT; return -1; } - if (vector == LM_DEV_INTX_IRQ_IDX && !lm_ctx->pci_config_space->hdr.cmd.id) { - lm_log(lm_ctx, LM_ERR, "failed to trigger INTx IRQ, INTx disabled\n"); - errno = EINVAL; + return eventfd_write(lm_ctx->irqs.efds[subindex], val); +} + +int +lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex) +{ + int ret, msg_id = 1; + struct vfio_user_irq_info irq_info; + + ret = validate_irq_subindex(lm_ctx, subindex); + if (ret < 0) { return -1; - } else if (vector == LM_DEV_MSIX_IRQ_IDX) { - /* - * FIXME must check that MSI-X capability exists during creation time - * FIXME need to check that MSI-X is enabled and that it's not masked. - * Currently that's not possible because libmuser doesn't care about - * the internals of a capability. - */ } - return eventfd_write(lm_ctx->irqs.efds[vector], val); + irq_info.subindex = subindex; + ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, + VFIO_USER_VM_INTERRUPT, + &irq_info, sizeof irq_info, + NULL, 0, NULL, NULL, 0); + if (ret < 0) { + /* FIXME should return -errno */ + errno = -ret; + return -1; + } + + return 0; } static void @@ -1081,16 +2345,50 @@ free_sparse_mmap_areas(lm_reg_info_t *reg_info) void lm_ctx_destroy(lm_ctx_t *lm_ctx) { + int ret; + if (lm_ctx == NULL) { return; } + free(lm_ctx->uuid); + + /* + * FIXME The following cleanup can be dangerous depending on how lm_ctx_destroy + * is called since it might delete files it did not create. Improve by + * acquiring a lock on the directory. 
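+ * One option might be to flock(2) iommu_dir_fd in init_sock and hold the
+ * lock for the lifetime of the context.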
+ */ + + if (lm_ctx->iommu_dir_fd != -1) { + if ((ret = unlinkat(lm_ctx->iommu_dir_fd, IOMMU_GRP_NAME, 0)) == -1 + && errno != ENOENT) { + lm_log(lm_ctx, LM_DBG, "failed to remove " IOMMU_GRP_NAME ": " + "%m\n"); + } + if ((ret = unlinkat(lm_ctx->iommu_dir_fd, MUSER_SOCK, 0)) == -1 && + errno != ENOENT) { + lm_log(lm_ctx, LM_DBG, "failed to remove " MUSER_SOCK ": %m\n"); + } + if (close(lm_ctx->iommu_dir_fd) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IOMMU dir fd %d: %m\n", + lm_ctx->iommu_dir_fd); + } + } + if (lm_ctx->iommu_dir != NULL) { + if ((ret = rmdir(lm_ctx->iommu_dir)) == -1 && errno != ENOENT) { + lm_log(lm_ctx, LM_DBG, "failed to remove %s: %m\n", + lm_ctx->iommu_dir); + } + free(lm_ctx->iommu_dir); + } + free(lm_ctx->pci_config_space); - dev_detach(lm_ctx->fd); + transports_ops[lm_ctx->trans].detach(lm_ctx); if (lm_ctx->dma != NULL) { - dma_controller_destroy(lm_ctx, lm_ctx->dma); + dma_controller_destroy(lm_ctx->dma); } free_sparse_mmap_areas(lm_ctx->pci_info.reg_info); + free(lm_ctx->caps); free(lm_ctx); // FIXME: Maybe close any open irq efds? Unmap stuff? } @@ -1125,6 +2423,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) { lm_reg_info_t *cfg_reg; const lm_reg_info_t zero_reg = { 0 }; + lm_reg_info_t *migr_reg; int i; assert(lm_ctx != NULL); @@ -1171,7 +2470,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) // Initialise capabilities. if (dev_info->nr_caps > 0) { - lm_ctx->caps = caps_create(dev_info->caps, dev_info->nr_caps); + lm_ctx->caps = caps_create(lm_ctx, dev_info->caps, dev_info->nr_caps); if (lm_ctx->caps == NULL) { lm_log(lm_ctx, LM_ERR, "failed to create PCI capabilities: %m\n"); goto err; @@ -1181,6 +2480,28 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) lm_ctx->pci_config_space->hdr.cap = PCI_STD_HEADER_SIZEOF; } + /* + * Check the migration region. + */ + migr_reg = &lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX]; + if (migr_reg->size > 0) { + if (migr_reg->size < sizeof(struct vfio_device_migration_info)) { + return -EINVAL; + } + + /* FIXME this should be done in lm_ctx_run or poll */ + lm_ctx->migration.info.device_state = VFIO_DEVICE_STATE_RUNNING; + + lm_ctx->migration.callbacks = dev_info->migration_callbacks; + if (lm_ctx->migration.callbacks.transition == NULL || + lm_ctx->migration.callbacks.get_pending_bytes == NULL || + lm_ctx->migration.callbacks.prepare_data == NULL || + lm_ctx->migration.callbacks.read_data == NULL || + lm_ctx->migration.callbacks.write_data == NULL) { + return -EINVAL; + } + } + return 0; err: @@ -1212,6 +2533,18 @@ pci_info_bounce(lm_pci_info_t *dst, const lm_pci_info_t *src) dst->cc = src->cc; } +int +lm_ctx_try_attach(lm_ctx_t *lm_ctx) +{ + assert(lm_ctx != NULL); + + if ((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0) { + errno = EINVAL; + return -1; + } + return transports_ops[lm_ctx->trans].attach(lm_ctx); +} + lm_ctx_t * lm_ctx_create(const lm_dev_info_t *dev_info) { @@ -1226,6 +2559,11 @@ lm_ctx_create(const lm_dev_info_t *dev_info) return NULL; } + if (dev_info->trans != LM_TRANS_SOCK) { + errno = EINVAL; + return NULL; + } + /* * FIXME need to check that the number of MSI and MSI-X IRQs are valid * (1, 2, 4, 8, 16 or 32 for MSI and up to 2048 for MSI-X). @@ -1244,6 +2582,9 @@ lm_ctx_create(const lm_dev_info_t *dev_info) if (lm_ctx == NULL) { return NULL; } + lm_ctx->trans = dev_info->trans; + + lm_ctx->iommu_dir_fd = -1; // Set context irq information. 
for (i = 0; i < max_ivs; i++) { @@ -1259,10 +2600,26 @@ lm_ctx_create(const lm_dev_info_t *dev_info) lm_ctx->log = dev_info->log; lm_ctx->log_lvl = dev_info->log_lvl; lm_ctx->reset = dev_info->reset; + lm_ctx->flags = dev_info->flags; + + lm_ctx->uuid = strdup(dev_info->uuid); + if (lm_ctx->uuid == NULL) { + err = errno; + goto out; + } // Bounce the provided pci_info into the context. pci_info_bounce(&lm_ctx->pci_info, &dev_info->pci_info); + /* + * FIXME above memcpy also copies reg_info->mmap_areas. If pci_config_setup + * fails then we try to free reg_info->mmap_areas, which is wrong because + * this is a user pointer. + */ + for (i = 0; i < ARRAY_SIZE(lm_ctx->pci_info.reg_info); i++) { + lm_ctx->pci_info.reg_info[i].mmap_areas = NULL; + } + // Setup the PCI config space for this context. err = pci_config_setup(lm_ctx, dev_info); if (err != 0) { @@ -1276,65 +2633,53 @@ lm_ctx_create(const lm_dev_info_t *dev_info) goto out; } - // Attach to the muser control device. - lm_ctx->fd = dev_attach(dev_info->uuid); - if (lm_ctx->fd == -1) { - err = errno; - goto out; + if (transports_ops[dev_info->trans].init != NULL) { + err = transports_ops[dev_info->trans].init(lm_ctx); + if (err < 0) { + goto out; + } + lm_ctx->fd = err; + } + err = 0; + + // Attach to the muser control device. With LM_FLAG_ATTACH_NB caller is + // always expected to call lm_ctx_try_attach(). + if ((dev_info->flags & LM_FLAG_ATTACH_NB) == 0) { + lm_ctx->conn_fd = transports_ops[dev_info->trans].attach(lm_ctx); + if (lm_ctx->conn_fd < 0) { + err = lm_ctx->conn_fd; + if (err != EINTR) { + lm_log(lm_ctx, LM_ERR, "failed to attach: %s", + strerror(-err)); + } + goto out; + } } + lm_ctx->map_dma = dev_info->map_dma; + lm_ctx->unmap_dma = dev_info->unmap_dma; + // Create the internal DMA controller. - lm_ctx->dma = dma_controller_create(LM_DMA_REGIONS); - if (lm_ctx->dma == NULL) { - err = errno; - goto out; + if (lm_ctx->unmap_dma != NULL) { + lm_ctx->dma = dma_controller_create(lm_ctx, LM_DMA_REGIONS); + if (lm_ctx->dma == NULL) { + err = errno; + goto out; + } } out: - if (err) { - if (lm_ctx) { - dma_controller_destroy(lm_ctx, lm_ctx->dma); - dev_detach(lm_ctx->fd); - free_sparse_mmap_areas(lm_ctx->pci_info.reg_info); - free(lm_ctx->pci_config_space); - free(lm_ctx); + if (err != 0) { + if (lm_ctx != NULL) { + lm_ctx_destroy(lm_ctx); lm_ctx = NULL; } - errno = err; + errno = -err; } return lm_ctx; } -#ifdef DEBUG -static void -dump_buffer(lm_ctx_t *lm_ctx, const char *prefix, - const char *buf, uint32_t count) -{ - int i; - const size_t bytes_per_line = 0x8; - - if (strcmp(prefix, "")) { - lm_log(lm_ctx, LM_DBG, "%s\n", prefix); - } - for (i = 0; i < (int)count; i++) { - if (i % bytes_per_line != 0) { - lm_log(lm_ctx, LM_DBG, " "); - } - /* TODO valgrind emits a warning if count is 1 */ - lm_log(lm_ctx, LM_DBG, "0x%02x", *(buf + i)); - if ((i + 1) % bytes_per_line == 0) { - lm_log(lm_ctx, LM_DBG, "\n"); - } - } - if (i % bytes_per_line != 0) { - lm_log(lm_ctx, LM_DBG, "\n"); - } -} -#else -#define dump_buffer(lm_ctx, prefix, buf, count) -#endif - /* * Returns a pointer to the standard part of the PCI configuration space. 
*/ @@ -1364,21 +2709,34 @@ lm_get_region_info(lm_ctx_t *lm_ctx) inline int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, - uint32_t len, dma_sg_t *sg, int max_sg) + uint32_t len, dma_sg_t *sg, int max_sg, int prot) { - return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg); + assert(lm_ctx != NULL); + + if (unlikely(lm_ctx->unmap_dma == NULL)) { + errno = EINVAL; + return -1; + } + return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg, prot); } inline int lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, struct iovec *iov, int cnt) { + if (unlikely(lm_ctx->unmap_dma == NULL)) { + errno = EINVAL; + return -1; + } return dma_map_sg(lm_ctx->dma, sg, iov, cnt); } inline void lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, struct iovec *iov, int cnt) { + if (unlikely(lm_ctx->unmap_dma == NULL)) { + return; + } return dma_unmap_sg(lm_ctx->dma, sg, iov, cnt); } @@ -1396,4 +2754,66 @@ lm_ctx_run(lm_dev_info_t *dev_info) return ret; } +uint8_t * +lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id) +{ + assert(lm_ctx != NULL); + + return cap_find_by_id(lm_ctx, id); +} + +int +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) +{ + struct vfio_user_dma_region_access *dma_recv; + struct vfio_user_dma_region_access dma_send; + int recv_size; + int msg_id = 1, ret; + + assert(lm_ctx != NULL); + assert(sg != NULL); + + recv_size = sizeof(*dma_recv) + sg->length; + + dma_recv = calloc(recv_size, 1); + if (dma_recv == NULL) { + return -ENOMEM; + } + + dma_send.addr = sg->dma_addr; + dma_send.count = sg->length; + ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_READ, + &dma_send, sizeof dma_send, NULL, 0, NULL, + dma_recv, recv_size); + memcpy(data, dma_recv->data, sg->length); /* FIXME no need for memcpy */ + free(dma_recv); + + return ret; +} + +int +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) +{ + struct vfio_user_dma_region_access *dma_send, dma_recv; + int send_size = sizeof(*dma_send) + sg->length; + int msg_id = 1, ret; + + assert(lm_ctx != NULL); + assert(sg != NULL); + + dma_send = calloc(send_size, 1); + if (dma_send == NULL) { + return -ENOMEM; + } + dma_send->addr = sg->dma_addr; + dma_send->count = sg->length; + memcpy(dma_send->data, data, sg->length); /* FIXME no need to copy! 
*/ + ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_WRITE, + dma_send, send_size, + NULL, 0, NULL, &dma_recv, sizeof(dma_recv)); + free(dma_send); + + return ret; +} + /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/muser_pci.c b/lib/muser_pci.c index 36692ab..2846301 100644 --- a/lib/muser_pci.c +++ b/lib/muser_pci.c @@ -52,7 +52,7 @@ muser_pci_hdr_write_bar(lm_ctx_t *lm_ctx, uint16_t bar_index, const char *buf) lm_reg_info_t *reg_info = lm_get_region_info(lm_ctx); lm_pci_hdr_t *hdr; - assert(lm_ctx); + assert(lm_ctx != NULL); if (reg_info[bar_index].size == 0) { return; @@ -86,15 +86,15 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci, { uint16_t v; - assert(ctx); + assert(ctx != NULL); if (count != 2) { lm_log(ctx, LM_ERR, "bad write command size %d\n", count); return -EINVAL; } - assert(pci); - assert(buf); + assert(pci != NULL); + assert(buf != NULL); v = *(uint16_t*)buf; @@ -153,17 +153,35 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci, if ((v & PCI_COMMAND_INTX_DISABLE) == PCI_COMMAND_INTX_DISABLE) { if (!pci->hdr.cmd.id) { pci->hdr.cmd.id = 0x1; - lm_log(ctx, LM_INF, "INTx emulation enabled\n"); + lm_log(ctx, LM_INF, "INTx emulation disabled\n"); } v &= ~PCI_COMMAND_INTX_DISABLE; } else { if (pci->hdr.cmd.id) { pci->hdr.cmd.id = 0x0; - lm_log(ctx, LM_INF, "INTx emulation disabled\n"); + lm_log(ctx, LM_INF, "INTx emulation enabled\n"); } } - if (v) { + if ((v & PCI_COMMAND_INVALIDATE) == PCI_COMMAND_INVALIDATE) { + if (!pci->hdr.cmd.mwie) { + pci->hdr.cmd.mwie = 1U; + lm_log(ctx, LM_INF, "memory write and invalidate enabled\n"); + } + v &= ~PCI_COMMAND_INVALIDATE; + } else { + if (pci->hdr.cmd.mwie) { + pci->hdr.cmd.mwie = 0; + lm_log(ctx, LM_INF, "memory write and invalidate disabled"); + } + } + + if ((v & PCI_COMMAND_VGA_PALETTE) == PCI_COMMAND_VGA_PALETTE) { + lm_log(ctx, LM_INF, "enabling VGA palette snooping ignored\n"); + v &= ~PCI_COMMAND_VGA_PALETTE; + } + + if (v != 0) { lm_log(ctx, LM_ERR, "unconsumed command flags %x\n", v); return -EINVAL; } @@ -177,8 +195,8 @@ handle_erom_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci, { uint32_t v; - assert(ctx); - assert(pci); + assert(ctx != NULL); + assert(pci != NULL); if (count != 0x4) { lm_log(ctx, LM_ERR, "bad EROM count %d\n", count); @@ -207,8 +225,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset, lm_pci_config_space_t *pci; int ret = 0; - assert(lm_ctx); - assert(buf); + assert(lm_ctx != NULL); + assert(buf != NULL); pci = lm_get_pci_config_space(lm_ctx); @@ -248,8 +266,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset, ret = -EINVAL; } -#ifndef LM_TERSE_LOGGING - dump_buffer(lm_ctx, "PCI header", pci->hdr.raw, 0xff); +#ifdef LM_VERBOSE_LOGGING + dump_buffer("PCI header", (char*)pci->hdr.raw, 0xff); #endif return ret; @@ -263,18 +281,18 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset, * @count: output parameter that receives the number of bytes read/written */ static inline int -muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, - loff_t *pos, bool is_write, +muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count, + uint64_t *pos, bool is_write, char *buf) { - size_t _count; + uint32_t _count; loff_t _pos; int err = 0; - assert(lm_ctx); - assert(count); - assert(pos); - assert(buf); + assert(lm_ctx != NULL); + assert(count != NULL); + assert(pos != NULL); + assert(buf != NULL); _pos = *pos - region_to_offset(LM_DEV_CFG_REG_IDX); _count = MIN(*count, PCI_STD_HEADER_SIZEOF - _pos); @@ -290,20 +308,21 
@@ muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, } static inline bool -muser_is_pci_hdr_access(loff_t pos) +muser_is_pci_hdr_access(uint64_t pos) { - const off_t off = (loff_t) region_to_offset(LM_DEV_CFG_REG_IDX); - return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF; + const uint64_t off = region_to_offset(LM_DEV_CFG_REG_IDX); + return pos >= off && pos - off < PCI_STD_HEADER_SIZEOF; } +/* FIXME this function is misleading, remove it */ int -muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, - loff_t *pos, bool is_write, +muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count, + uint64_t *pos, bool is_write, char *buf) { - assert(lm_ctx); - assert(count); - assert(pos); + assert(lm_ctx != NULL); + assert(count != NULL); + assert(pos != NULL); if (!muser_is_pci_hdr_access(*pos)) { return 0; diff --git a/lib/muser_priv.h b/lib/muser_priv.h index aa29f5a..097874a 100644 --- a/lib/muser_priv.h +++ b/lib/muser_priv.h @@ -35,9 +35,11 @@ #include "muser.h" +extern char *irq_to_str[]; + int -muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, - loff_t *pos, bool write, char *buf); +muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count, + uint64_t *pos, bool write, char *buf); lm_reg_info_t * lm_get_region_info(lm_ctx_t *lm_ctx); @@ -45,4 +47,111 @@ lm_get_region_info(lm_ctx_t *lm_ctx); uint64_t region_to_offset(uint32_t region); +int +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *fds, int count); + +int +send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count); + + +int +recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, + uint16_t *msg_id, void *data, size_t *len); + +int +send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, + char *caps); + +int +recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, + int *max_fds, size_t *pgsize); + +int +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len); + +int +send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len); + +/* FIXME copied from include/linux/stddef.h, is this OK license-wise? 
*/ +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) + +static inline ssize_t get_minsz(unsigned int cmd) +{ + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return offsetofend(struct vfio_device_info, num_irqs); + case VFIO_DEVICE_GET_REGION_INFO: + return offsetofend(struct vfio_region_info, offset); + case VFIO_DEVICE_GET_IRQ_INFO: + return offsetofend(struct vfio_irq_info, count); + case VFIO_DEVICE_SET_IRQS: + return offsetofend(struct vfio_irq_set, count); + case VFIO_GROUP_GET_STATUS: + return offsetofend(struct vfio_group_status, flags); + case VFIO_GET_API_VERSION: + return 0; + case VFIO_CHECK_EXTENSION: + case VFIO_GROUP_SET_CONTAINER: + case VFIO_GROUP_UNSET_CONTAINER: + case VFIO_SET_IOMMU: + return sizeof(int); + case VFIO_IOMMU_GET_INFO: + return offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + case VFIO_IOMMU_MAP_DMA: + return offsetofend(struct vfio_iommu_type1_dma_map, size); + case VFIO_IOMMU_UNMAP_DMA: + return offsetofend(struct vfio_iommu_type1_dma_unmap, size); + case VFIO_GROUP_GET_DEVICE_FD: + case VFIO_DEVICE_RESET: + return 0; + } + return -EOPNOTSUPP; +} + +static inline const char* vfio_cmd_to_str(int cmd) { + switch (cmd) { + case VFIO_GET_API_VERSION: return "VFIO_GET_API_VERSION"; + case VFIO_CHECK_EXTENSION: return "VFIO_CHECK_EXTENSION"; + case VFIO_SET_IOMMU: return "VFIO_SET_IOMMU"; + case VFIO_GROUP_GET_STATUS: return "VFIO_GROUP_GET_STATUS"; + case VFIO_GROUP_SET_CONTAINER: return "VFIO_GROUP_SET_CONTAINER"; + case VFIO_GROUP_UNSET_CONTAINER: return "VFIO_GROUP_UNSET_CONTAINER"; + case VFIO_GROUP_GET_DEVICE_FD: return "VFIO_GROUP_GET_DEVICE_FD"; + case VFIO_DEVICE_GET_INFO: return "VFIO_DEVICE_GET_INFO"; + case VFIO_DEVICE_GET_REGION_INFO: return "VFIO_DEVICE_GET_REGION_INFO"; + case VFIO_DEVICE_GET_IRQ_INFO: return "VFIO_DEVICE_GET_IRQ_INFO"; + case VFIO_DEVICE_SET_IRQS: return "VFIO_DEVICE_SET_IRQS"; + case VFIO_DEVICE_RESET: return "VFIO_DEVICE_RESET"; + case VFIO_IOMMU_GET_INFO: return "VFIO_IOMMU_GET_INFO/VFIO_DEVICE_GET_PCI_HOT_RESET_INFO/VFIO_IOMMU_SPAPR_TCE_GET_INFO"; + case VFIO_IOMMU_MAP_DMA: return "VFIO_IOMMU_MAP_DMA/VFIO_DEVICE_PCI_HOT_RESET"; + case VFIO_IOMMU_UNMAP_DMA: return "VFIO_IOMMU_UNMAP_DMA"; + case VFIO_IOMMU_ENABLE: return "VFIO_IOMMU_ENABLE"; + case VFIO_IOMMU_DISABLE: return "VFIO_IOMMU_DISABLE"; + case VFIO_EEH_PE_OP: return "VFIO_EEH_PE_OP"; + case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_REGISTER_MEMORY"; + case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY"; + case VFIO_IOMMU_SPAPR_TCE_CREATE: return "VFIO_IOMMU_SPAPR_TCE_CREATE"; + case VFIO_IOMMU_SPAPR_TCE_REMOVE: return "VFIO_IOMMU_SPAPR_TCE_REMOVE"; + } + return NULL; +} + #endif /* MUSER_PRIV_H */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/vfio_user.h b/lib/vfio_user.h new file mode 100644 index 0000000..19f751a --- /dev/null +++ b/lib/vfio_user.h @@ -0,0 +1,167 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VFIO_USER_H +#define _VFIO_USER_H + +#include <inttypes.h> +#include <linux/vfio.h> +#include <linux/version.h> + +enum vfio_user_command { + VFIO_USER_VERSION = 1, + VFIO_USER_DMA_MAP = 2, + VFIO_USER_DMA_UNMAP = 3, + VFIO_USER_DEVICE_GET_INFO = 4, + VFIO_USER_DEVICE_GET_REGION_INFO = 5, + VFIO_USER_DEVICE_GET_IRQ_INFO = 6, + VFIO_USER_DEVICE_SET_IRQS = 7, + VFIO_USER_REGION_READ = 8, + VFIO_USER_REGION_WRITE = 9, + VFIO_USER_DMA_READ = 10, + VFIO_USER_DMA_WRITE = 11, + VFIO_USER_VM_INTERRUPT = 12, + VFIO_USER_DEVICE_RESET = 13, + VFIO_USER_DIRTY_PAGES = 14, + VFIO_USER_MAX, +}; + +enum vfio_user_message_type { + VFIO_USER_MESSAGE_COMMAND = 0, + VFIO_USER_MESSAGE_REPLY = 1, +}; + +#define VFIO_USER_FLAGS_NO_REPLY (0x1) + +struct vfio_user_header { + uint16_t msg_id; + uint16_t cmd; + uint32_t msg_size; + struct { + uint32_t type : 4; +#define VFIO_USER_F_TYPE_COMMAND 0 +#define VFIO_USER_F_TYPE_REPLY 1 + uint32_t no_reply : 1; + uint32_t error : 1; + uint32_t resvd : 26; + } flags; + uint32_t error_no; +} __attribute__((packed)); + +struct vfio_user_dma_region { + uint64_t addr; + uint64_t size; + uint64_t offset; + uint32_t prot; + uint32_t flags; +#define VFIO_USER_F_DMA_REGION_MAPPABLE (0x0) +} __attribute__((packed)); + +struct vfio_user_region_access { + uint64_t offset; + uint32_t region; + uint32_t count; + uint8_t data[]; +} __attribute__((packed)); + +struct vfio_user_dma_region_access { + uint64_t addr; + uint32_t count; + uint8_t data[]; +} __attribute__((packed)); + +struct vfio_user_irq_info { + uint32_t subindex; +} __attribute__((packed)); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) + +/* copied from <linux/vfio.h> */ + +#define VFIO_REGION_TYPE_MIGRATION (3) +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? 
\
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING))
+
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING)
+
+ __u32 reserved;
+ __u64 pending_bytes;
+ __u64 data_offset;
+ __u64 data_size;
+};
+
+struct vfio_bitmap {
+ __u64 pgsize; /* page size for bitmap in bytes */
+ __u64 size; /* in bytes */
+ __u64 *data; /* one bit per page */
+};
+
+struct vfio_iommu_type1_dirty_bitmap {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+ __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+ __u64 iova; /* IO virtual address */
+ __u64 size; /* Size of iova range */
+ struct vfio_bitmap bitmap;
+};
+
+#endif
+
+#endif
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
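
The lm_irq_trigger()/lm_irq_message() pair above gives a device server two ways to raise an interrupt: write the per-vector eventfd that the client registered via VFIO_USER_DEVICE_SET_IRQS, or send an explicit VFIO_USER_VM_INTERRUPT message. A minimal sketch combining the two; the fall-back-on-ENOENT policy is illustrative, not part of the library:

#include <errno.h>

#include "muser.h"

/*
 * Raise vector 0 of whichever IRQ type the client has configured.
 * lm_irq_trigger() writes the per-vector eventfd; if none was registered it
 * fails with errno == ENOENT, in which case we fall back to an explicit
 * VFIO_USER_VM_INTERRUPT message via lm_irq_message().
 */
static int
raise_vector0(lm_ctx_t *lm_ctx)
{
    if (lm_irq_trigger(lm_ctx, 0) == 0) {
        return 0;
    }
    if (errno == ENOENT) {
        return lm_irq_message(lm_ctx, 0);
    }
    return -1;
}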
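
lm_ctx_create() now refuses any transport other than LM_TRANS_SOCK, and with LM_FLAG_ATTACH_NB it defers accepting the client to lm_ctx_try_attach(). A sketch of that flow, assuming a missing client surfaces as EAGAIN on the non-blocking listener; the error conventions are explicitly in flux in this tree (the FIXMEs note a mix of -1/errno and -errno), so the retry test below copes with either:

#include <errno.h>
#include <stddef.h>
#include <unistd.h>

#include "muser.h"

static lm_ctx_t *
create_and_attach(lm_dev_info_t *dev_info)
{
    lm_ctx_t *lm_ctx;
    int ret;

    dev_info->trans = LM_TRANS_SOCK;        /* the only accepted transport */
    dev_info->flags |= LM_FLAG_ATTACH_NB;   /* don't block in lm_ctx_create */

    lm_ctx = lm_ctx_create(dev_info);
    if (lm_ctx == NULL) {
        return NULL;
    }

    while ((ret = lm_ctx_try_attach(lm_ctx)) < 0) {
        int err = ret == -1 ? errno : -ret;

        if (err != EAGAIN && err != EWOULDBLOCK) {
            lm_ctx_destroy(lm_ctx);
            return NULL;
        }
        usleep(100 * 1000);                 /* no client yet, try again */
    }
    return lm_ctx;
}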
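
The DMA helpers now take a protection argument and are gated on the device having registered an unmap_dma callback at creation time. A sketch of the usual access pattern, lm_addr_to_sg() then lm_map_sg(), falling back to the new message-based lm_dma_read() when the region cannot be mapped; it assumes lm_addr_to_sg() returns the number of scatter-gather entries used and that the range fits in a single entry:

#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include "muser.h"

/*
 * Read guest memory at 'gpa'. If the region is mappable into the server,
 * lm_map_sg() yields an iovec we can memcpy() from directly; otherwise
 * lm_dma_read() round-trips a VFIO_USER_DMA_READ message to the client.
 */
static int
read_guest(lm_ctx_t *lm_ctx, dma_addr_t gpa, void *buf, uint32_t len)
{
    dma_sg_t sg;
    struct iovec iov;

    if (lm_addr_to_sg(lm_ctx, gpa, len, &sg, 1, PROT_READ) != 1) {
        return -1;                          /* not one contiguous region */
    }
    if (lm_map_sg(lm_ctx, &sg, &iov, 1) == 0) {
        memcpy(buf, iov.iov_base, len);
        lm_unmap_sg(lm_ctx, &sg, &iov, 1);
        return 0;
    }
    return lm_dma_read(lm_ctx, &sg, buf);   /* not mappable: go over the wire */
}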
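
lm_ctx_get_cap() exposes cap_find_by_id(), returning a pointer into the emulated configuration space for a given capability ID. A sketch that reads the MSI-X message control word, assuming the standard offsets from <linux/pci_regs.h>:

#include <stdint.h>
#include <string.h>
#include <linux/pci_regs.h>

#include "muser.h"

/*
 * Fetch the MSI-X message control word; PCI_MSIX_FLAGS is its offset within
 * the capability. memcpy() avoids an unaligned/aliasing read from the raw
 * config-space buffer. Returns 0 if the capability is absent.
 */
static uint16_t
msix_control(lm_ctx_t *lm_ctx)
{
    uint8_t *cap = lm_ctx_get_cap(lm_ctx, PCI_CAP_ID_MSIX);
    uint16_t ctrl = 0;

    if (cap != NULL) {
        memcpy(&ctrl, cap + PCI_MSIX_FLAGS, sizeof(ctrl));
    }
    return ctrl;
}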
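
get_minsz() computes, per ioctl, the smallest argsz a caller may pass: the offset just past the last mandatory field, which is exactly what offsetofend() expresses. A self-contained check of the VFIO_DEVICE_GET_INFO case:

#include <assert.h>
#include <stddef.h>
#include <linux/vfio.h>

#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
#define offsetofend(TYPE, MEMBER) \
    (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))

int main(void)
{
    /*
     * Everything up to and including num_irqs must be present: argsz,
     * flags, num_regions and num_irqs are four consecutive __u32s.
     */
    assert(offsetofend(struct vfio_device_info, num_irqs) == 16);
    return 0;
}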
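
vfio_user.h defines the on-the-wire framing: every message starts with a vfio_user_header, and msg_size is taken here to cover the header plus everything that follows it. A sender-side sketch of VFIO_USER_DMA_WRITE, the same layout lm_dma_write() builds; 'sock' is assumed to be the connected UNIX-domain socket and a short write is treated as an error for brevity:

#include <stdint.h>
#include <sys/socket.h>
#include <sys/uio.h>

#include "vfio_user.h"

static int
send_dma_write(int sock, uint16_t msg_id, uint64_t addr,
               const void *data, uint32_t count)
{
    /* header, then the DMA access descriptor, then the payload */
    struct vfio_user_header hdr = {
        .msg_id = msg_id,
        .cmd = VFIO_USER_DMA_WRITE,
        .msg_size = sizeof(hdr) +
                    sizeof(struct vfio_user_dma_region_access) + count,
        .flags = { .type = VFIO_USER_F_TYPE_COMMAND },
    };
    struct vfio_user_dma_region_access access = {
        .addr = addr,
        .count = count,
    };
    struct iovec iov[3] = {
        { .iov_base = &hdr,         .iov_len = sizeof(hdr) },
        { .iov_base = &access,      .iov_len = sizeof(access) },
        { .iov_base = (void *)data, .iov_len = count },
    };
    struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 3 };

    return sendmsg(sock, &msg, 0) == (ssize_t)hdr.msg_size ? 0 : -1;
}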