author     Thanos Makatos <thanos.makatos@nutanix.com>    2020-11-11 07:35:10 -0500
committer  Thanos Makatos <thanos.makatos@nutanix.com>    2020-11-11 07:35:10 -0500
commit     b6e8c7b39456e7c0a164f87c48eac2bbfd6d85f3 (patch)
tree       c94839c02cde83bca416221bd906e4952fbc8c53 /lib
parent     b9a2e75360e14e59db651d6081894e0cf20e7c2d (diff)
parent     985940e6539eaf8f41e0b6421938b5bf5c1db22c (diff)
Merge branch 'vfio-user'
Diffstat (limited to 'lib')
-rw-r--r--  lib/CMakeLists.txt     4
-rw-r--r--  lib/cap.c            444
-rw-r--r--  lib/cap.h              9
-rw-r--r--  lib/caps/common.h     46
-rw-r--r--  lib/caps/msi.h         9
-rw-r--r--  lib/caps/msix.h        9
-rw-r--r--  lib/caps/pm.h         15
-rw-r--r--  lib/caps/px.h          9
-rw-r--r--  lib/common.h           8
-rw-r--r--  lib/dma.c            248
-rw-r--r--  lib/dma.h            137
-rw-r--r--  lib/muser.h          232
-rw-r--r--  lib/muser_ctx.c     2242
-rw-r--r--  lib/muser_pci.c       75
-rw-r--r--  lib/muser_priv.h     113
-rw-r--r--  lib/vfio_user.h      167
16 files changed, 3093 insertions, 674 deletions
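The most user-visible change in this merge is the capability API: lm_cap_t stops being an id/size/callback triple and becomes a union of the concrete capability structs (struct pmcap, struct msicap, struct msixcap, struct pxcap), which caps_create() now copies into configuration space itself, chaining the next pointers and dispatching writes to built-in handlers such as handle_pm_write(). A minimal sketch of declaring a Power Management capability under the new scheme; the field values are illustrative, not taken from this commit:

    #include "muser.h"

    /* caps_create() fills in hdr.next when it links the capability list;
     * PCI_CAP_ID_PM comes from <linux/pci_regs.h> */
    static struct pmcap pm = {
        .hdr = { .id = PCI_CAP_ID_PM },
        .pc  = { .vs = 3 },            /* illustrative PM spec version */
    };

    static lm_cap_t *caps[] = { (lm_cap_t *)&pm };

    static void set_caps(lm_dev_info_t *dev_info)
    {
        dev_info->caps = caps;         /* was: array of {id, size, fn} entries */
        dev_info->nr_caps = 1;
    }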
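The dirty-page tracking added to lib/dma.c sizes each region's bitmap at one bit per page, rounding both divisions up. The arithmetic from _get_bitmap_size() is small enough to recompute standalone; a sketch with illustrative numbers:

    #include <limits.h>
    #include <stdio.h>
    #include <sys/types.h>

    /* same rounding as _get_bitmap_size() in lib/dma.c */
    static ssize_t bitmap_bytes(size_t region_size, size_t pgsize)
    {
        if (pgsize == 0 || region_size < pgsize) {
            return -1;
        }
        size_t nr_pages = region_size / pgsize + (region_size % pgsize != 0);
        return nr_pages / CHAR_BIT + (nr_pages % CHAR_BIT != 0);
    }

    int main(void)
    {
        /* a 1 MiB region at 4 KiB granularity: 256 pages -> 32 bitmap bytes */
        printf("%zd\n", bitmap_bytes(1UL << 20, 1UL << 12));
        return 0;
    }

dma_controller_dirty_page_get() hands this bitmap back verbatim, which is why it rejects requests whose size argument differs from the computed bitmap size.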
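The new socket transport also gains a non-blocking mode: with LM_FLAG_ATTACH_NB set in lm_dev_info_t.flags, the library does not attach during context creation, and the application drives attachment and request processing itself via lm_ctx_try_attach() and lm_ctx_poll(). A sketch of that loop, assuming a hypothetical helper create_nonblocking_ctx() that builds the context from an lm_dev_info_t with trans = LM_TRANS_SOCK and that flag set:

    #include <errno.h>
    #include <unistd.h>
    #include "muser.h"

    /* hypothetical helper: creates the context with LM_TRANS_SOCK and
     * LM_FLAG_ATTACH_NB, returning NULL on failure */
    extern lm_ctx_t *create_nonblocking_ctx(void);

    static int serve(void)
    {
        lm_ctx_t *ctx = create_nonblocking_ctx();

        if (ctx == NULL) {
            return -1;
        }

        /* not attached yet: retry until a client connects to the socket */
        while (lm_ctx_try_attach(ctx) == -1) {
            if (errno != EAGAIN && errno != EWOULDBLOCK) {
                return -1;
            }
            usleep(10 * 1000);  /* or wait for readiness in an event loop */
        }

        /* process requests from one of the application's own threads */
        for (;;) {
            int ret = lm_ctx_poll(ctx);
            if (ret < 0) {
                return ret;
            }
        }
    }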
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index e2084fe..bc9e4b8 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -32,14 +32,14 @@ set(CMAKE_C_FLAGS "-Wall -Wextra -Werror -fPIC") set(CMAKE_C_FLAGS_DEBUG "-O0 -ggdb") add_library(muser SHARED - ../kmod/muser.h + vfio_user.h muser.h muser_priv.h common.h) target_link_libraries(muser muser_ctx muser_pci dma cap) set_target_properties(muser PROPERTIES LINKER_LANGUAGE C) -set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h") +set_target_properties(muser PROPERTIES PUBLIC_HEADER "muser.h;pci.h;vfio_user.h") set(UT_CFLAGS "-O0 -ggdb --coverage") set(UT_LFLAGS "--coverage") @@ -34,56 +34,60 @@ #include <errno.h> #include <stdlib.h> #include <stdio.h> +#include <stddef.h> +#include <string.h> #include "muser.h" #include "cap.h" struct cap { - uint8_t start; - uint8_t end; - uint8_t id; - lm_cap_access_t *fn; + uint8_t start; + uint8_t end; }; struct caps { - struct cap caps[LM_MAX_CAPS]; - int nr_caps; + struct cap caps[LM_MAX_CAPS]; /* FIXME only needs to be as big as nr_caps */ + unsigned int nr_caps; }; /* * Tells whether a capability is being accessed. */ static bool -cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset) +cap_is_accessed(struct cap *caps, int nr_caps, size_t count, loff_t offset) { - /* - * Ignore if it's at the standard PCI header. The first capability starts - * right after that. - */ - if (offset < PCI_STD_HEADER_SIZEOF) { - return false; - } - - /* ignore if there are no capabilities */ - if (!nr_caps) { + if (nr_caps == 0) { return false; } - assert(caps); + assert(caps != NULL); - /* - * Ignore if it's before the first capability. This check is probably - * redundant since we assume that the first capability starts right after - * the standard PCI header. - * TODO should we check that it doesn't cross into the first capability? - */ if (offset < caps[0].start) { + /* write starts before first capability */ + + if (offset + count <= caps[0].start) { + /* write ends before first capability */ + return false; + } + + /* + * FIXME write starts before capabilities but extends into them. I don't + * think that the while loop in lm_access will allow this in the first + * place. + */ + assert(false); + } else if (offset > caps[nr_caps - 1].end) { + /* write starts after last capability */ return false; } - /* ignore if it's past the last capability */ - if (offset > caps[nr_caps - 1].end) { - return false; + if (offset + count > (size_t)(caps[nr_caps - 1].end + 1)) { + /* + * FIXME write starts within capabilities but extends past them, I think + * that this _is_ possible, e.g. MSI-X is 12 bytes (PCI_CAP_MSIX_SIZEOF) + * and the host writes to first 8 bytes and then writes 8 more. + */ + assert(false); } return true; } @@ -92,151 +96,369 @@ cap_is_accessed(struct cap *caps, int nr_caps, loff_t offset) * Returns the PCI capability that is contained within the specified region * (offset + count). */ -static struct cap * -cap_find(struct cap *caps, int nr_caps, loff_t offset, size_t count) +static uint8_t * +cap_find(lm_pci_config_space_t *config_space, struct caps *caps, loff_t offset, + size_t count) { struct cap *cap; - cap = caps; - while (cap < caps + nr_caps) { + assert(config_space != NULL); + assert(caps != NULL); + + cap = caps->caps; + while (cap < caps->caps + caps->nr_caps) { /* - * TODO this assumes that at most one capability is read. It might be - * legitimate to read an arbitrary number of bytes, which we could - * support. For now lets explicitly fail such cases. 
+ * FIXME ensure that at most one capability is written to. It might + * legitimate to write to two capabilities at the same time. */ - if (offset >= cap->start && offset + count - 1 <= cap->end) { - return cap; + if (offset >= cap->start && offset <= cap->end) { + if (offset + count - 1 > cap->end) { + assert(false); + } + return config_space->raw + cap->start; } cap++; } - /* this means that the access spans more than a capability */ return NULL; } -/* - * Tells whether the header of a PCI capability is accessed. - */ static bool -cap_header_is_accessed(struct cap *cap, loff_t offset) +cap_is_valid(uint8_t id) { - assert(cap); - return offset - cap->start <= 1; + /* TODO 0 is a valid capability ID (Null Capability), check + * https://pcisig.com/sites/default/files/files/PCI_Code-ID_r_1_11__v24_Jan_2019.pdf: + * + */ + return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX; } -/* - * Reads the header of a PCI capability. - */ -static int -cap_header_access(struct caps *caps, struct cap *cap, char *buf, - loff_t offset, size_t count, bool is_write) +uint8_t * +cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id) { - int n; + uint8_t *pos; + lm_pci_config_space_t *config_space; - /* - * We don't allow ID and next to be written. TODO not sure what the PCI - * spec says about this, need to check. - */ - if (is_write) { - return -EINVAL; + if (!cap_is_valid(id)) { + errno = EINVAL; + return NULL; } - assert(caps); - assert(cap); - n = 0; - /* - * We handle reads to ID and next, the rest is handled by the callback. - */ - if (offset == cap->start && count > 0) { /* ID */ - buf[n++] = cap->id; - offset++; - count--; + config_space = lm_get_pci_config_space(lm_ctx); + + if (config_space->hdr.cap == 0) { + errno = ENOENT; + return NULL; } - if (offset == cap->start + 1 && count > 0) { /* next */ - if ((cap - caps->caps) / sizeof *cap == (size_t)(caps->nr_caps - 1)) { - buf[n++] = 0; - } else { - buf[n++] = (cap + 1)->start; + pos = config_space->raw + config_space->hdr.cap; + while (true) { + if (*(pos + PCI_CAP_LIST_ID) == id) { + return pos; } - - offset++; - count--; + if (*(pos + PCI_CAP_LIST_NEXT) == 0) { + break; + } + pos = config_space->raw + *(pos + PCI_CAP_LIST_NEXT); } - return n; + errno = ENOENT; + return NULL; } +/* + * Tells whether the header of a PCI capability is accessed. 
+ */ +static bool +cap_header_is_accessed(uint8_t cap_offset, loff_t offset) +{ + return offset - cap_offset <= 1; +} + +typedef ssize_t (cap_access) (lm_ctx_t *lm_ctx, uint8_t *cap, char *buf, + size_t count, loff_t offset); + +static ssize_t +handle_pmcs_write(lm_ctx_t *lm_ctx, struct pmcap *pm, + const struct pmcs *const pmcs) +{ + + if (pm->pmcs.ps != pmcs->ps) { + lm_log(lm_ctx, LM_DBG, "power state set to %#x\n", pmcs->ps); + } + if (pm->pmcs.pmee != pmcs->pmee) { + lm_log(lm_ctx, LM_DBG, "PME enable set to %#x\n", pmcs->pmee); + } + if (pm->pmcs.dse != pmcs->dse) { + lm_log(lm_ctx, LM_DBG, "data select set to %#x\n", pmcs->dse); + } + if (pm->pmcs.pmes != pmcs->pmes) { + lm_log(lm_ctx, LM_DBG, "PME status set to %#x\n", pmcs->pmes); + } + pm->pmcs = *pmcs; + return 0; +} + +static ssize_t +handle_pm_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf, + const size_t count, const loff_t offset) +{ + struct pmcap *pm = (struct pmcap *)cap; + + switch (offset) { + case offsetof(struct pmcap, pc): + if (count != sizeof(struct pc)) { + return -EINVAL; + } + assert(false); /* FIXME implement */ + case offsetof(struct pmcap, pmcs): + if (count != sizeof(struct pmcs)) { + return -EINVAL; + } + return handle_pmcs_write(lm_ctx, pm, (struct pmcs *)buf); + } + return -EINVAL; +} + +static ssize_t +handle_mxc_write(lm_ctx_t *lm_ctx, struct msixcap *msix, + const struct mxc *const mxc) +{ + assert(msix != NULL); + assert(mxc != NULL); + + if (mxc->mxe != msix->mxc.mxe) { + lm_log(lm_ctx, LM_DBG, "%s MSI-X\n", mxc->mxe ? "enable" : "disable"); + msix->mxc.mxe = mxc->mxe; + } + + if (mxc->fm != msix->mxc.fm) { + if (mxc->fm) { + lm_log(lm_ctx, LM_DBG, "all MSI-X vectors masked\n"); + } else { + lm_log(lm_ctx, LM_DBG, + "vector's mask bit determines whether vector is masked\n"); + } + msix->mxc.fm = mxc->fm; + } + + return sizeof(struct mxc); +} + +static ssize_t +handle_msix_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf, + const size_t count, const loff_t offset) +{ + struct msixcap *msix = (struct msixcap *)cap; + + if (count == sizeof(struct mxc)) { + switch (offset) { + case offsetof(struct msixcap, mxc): + return handle_mxc_write(lm_ctx, msix, (struct mxc *)buf); + default: + lm_log(lm_ctx, LM_ERR, "invalid MSI-X write offset %ld\n", offset); + return -EINVAL; + } + } + lm_log(lm_ctx, LM_ERR, "invalid MSI-X write size %lu\n", count); + return -EINVAL; +} + +static int +handle_px_pxdc_write(lm_ctx_t *lm_ctx, struct pxcap *px, const union pxdc *const p) +{ + assert(px != NULL); + assert(p != NULL); + + if (p->cere != px->pxdc.cere) { + px->pxdc.cere = p->cere; + lm_log(lm_ctx, LM_DBG, "CERE %s\n", p->cere ? "enable" : "disable"); + } + + if (p->nfere != px->pxdc.nfere) { + px->pxdc.nfere = p->nfere; + lm_log(lm_ctx, LM_DBG, "NFERE %s\n", p->nfere ? "enable" : "disable"); + } + + if (p->fere != px->pxdc.fere) { + px->pxdc.fere = p->fere; + lm_log(lm_ctx, LM_DBG, "FERE %s\n", p->fere ? "enable" : "disable"); + } + + if (p->urre != px->pxdc.urre) { + px->pxdc.urre = p->urre; + lm_log(lm_ctx, LM_DBG, "URRE %s\n", p->urre ? "enable" : "disable"); + } + + if (p->ero != px->pxdc.ero) { + px->pxdc.ero = p->ero; + lm_log(lm_ctx, LM_DBG, "ERO %s\n", p->ero ? "enable" : "disable"); + } + + if (p->mps != px->pxdc.mps) { + px->pxdc.mps = p->mps; + lm_log(lm_ctx, LM_DBG, "MPS set to %d\n", p->mps); + } + + if (p->ete != px->pxdc.ete) { + px->pxdc.ete = p->ete; + lm_log(lm_ctx, LM_DBG, "ETE %s\n", p->ete ? 
"enable" : "disable"); + } + + if (p->pfe != px->pxdc.pfe) { + px->pxdc.pfe = p->pfe; + lm_log(lm_ctx, LM_DBG, "PFE %s\n", p->pfe ? "enable" : "disable"); + } + + if (p->appme != px->pxdc.appme) { + px->pxdc.appme = p->appme; + lm_log(lm_ctx, LM_DBG, "APPME %s\n", p->appme ? "enable" : "disable"); + } + + if (p->ens != px->pxdc.ens) { + px->pxdc.ens = p->ens; + lm_log(lm_ctx, LM_DBG, "ENS %s\n", p->ens ? "enable" : "disable"); + } + + if (p->mrrs != px->pxdc.mrrs) { + px->pxdc.mrrs = p->mrrs; + lm_log(lm_ctx, LM_DBG, "MRRS set to %d\n", p->mrrs); + } + + if (p->iflr) { + lm_log(lm_ctx, LM_DBG, + "initiate function level reset\n"); + } + + return 0; +} + +static int +handle_px_write_2_bytes(lm_ctx_t *lm_ctx, struct pxcap *px, char *const buf, + loff_t off) +{ + switch (off) { + case offsetof(struct pxcap, pxdc): + return handle_px_pxdc_write(lm_ctx, px, (union pxdc *)buf); + } + return -EINVAL; +} + +static ssize_t +handle_px_write(lm_ctx_t *lm_ctx, uint8_t *cap, char *const buf, + size_t count, loff_t offset) +{ + struct pxcap *px = (struct pxcap *)cap; + + int err = -EINVAL; + switch (count) { + case 2: + err = handle_px_write_2_bytes(lm_ctx, px, buf, offset); + break; + } + if (err != 0) { + return err; + } + return count; +} + +static const struct cap_handler { + char *name; + size_t size; + cap_access *fn; +} cap_handlers[PCI_CAP_ID_MAX + 1] = { + [PCI_CAP_ID_PM] = {"PM", PCI_PM_SIZEOF, handle_pm_write}, + [PCI_CAP_ID_EXP] = {"PCI Express", PCI_CAP_EXP_ENDPOINT_SIZEOF_V2, + handle_px_write}, + [PCI_CAP_ID_MSIX] = {"MSI-X", PCI_CAP_MSIX_SIZEOF, handle_msix_write}, +}; + ssize_t -cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count, - loff_t offset, bool is_write) +cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count, + loff_t offset) { - struct cap *cap; + lm_pci_config_space_t *config_space; + uint8_t *cap; - if (!caps) { + if (caps == NULL) { return 0; } - if (!count) { + if (count == 0) { return 0; } - if (!cap_is_accessed(caps->caps, caps->nr_caps, offset)) { + if (!cap_is_accessed(caps->caps, caps->nr_caps, count, offset)) { return 0; } /* we're now guaranteed that the access is within some capability */ - cap = cap_find(caps->caps, caps->nr_caps, offset, count); + config_space = lm_get_pci_config_space(lm_ctx); + cap = cap_find(config_space, caps, offset, count); + assert(cap != NULL); /* FIXME */ - if (!cap) { - return 0; - } - - if (cap_header_is_accessed(cap, offset)) { - return cap_header_access(caps, cap, buf, offset, count, is_write); - } - if (count > 0) { - return cap->fn(pvt, cap->id, buf, count, offset - cap->start, is_write); + if (cap_header_is_accessed(cap - config_space->raw, offset)) { + /* FIXME how to deal with writes to capability header? 
*/ + assert(false); } - return 0; -} - -static bool -cap_is_valid(uint8_t id) -{ - return id >= PCI_CAP_ID_PM && id <= PCI_CAP_ID_MAX; + return cap_handlers[cap[PCI_CAP_LIST_ID]].fn(lm_ctx, cap, buf, count, + offset - (loff_t)(cap - config_space->raw)); } struct caps * -caps_create(const lm_cap_t *lm_caps, int nr_caps) +caps_create(lm_ctx_t *lm_ctx, lm_cap_t **lm_caps, int nr_caps) { - uint8_t prev_end; int i, err = 0; - struct caps *caps = NULL; + uint8_t *prev; + uint8_t next; + lm_pci_config_space_t *config_space; + struct caps *caps; if (nr_caps <= 0 || nr_caps >= LM_MAX_CAPS) { err = EINVAL; goto out; } - assert(lm_caps); + assert(lm_caps != NULL); caps = calloc(1, sizeof *caps); - if (!caps) { - err = errno; + if (caps == NULL) { goto out; } - prev_end = PCI_STD_HEADER_SIZEOF - 1; + config_space = lm_get_pci_config_space(lm_ctx); + /* points to the next field of the previous capability */ + prev = &config_space->hdr.cap; + + /* relative offset that points where the next capability should be placed */ + next = PCI_STD_HEADER_SIZEOF; + for (i = 0; i < nr_caps; i++) { - if (!cap_is_valid(lm_caps[i].id) || !lm_caps[i].fn || !lm_caps[i].size) { + uint8_t *cap = (uint8_t*)lm_caps[i]; + uint8_t id = cap[PCI_CAP_LIST_ID]; + size_t size; + + if (!cap_is_valid(id)) { + err = EINVAL; + goto out; + } + + size = cap_handlers[id].size; + if (size == 0) { err = EINVAL; goto out; } - caps->caps[i].id = lm_caps[i].id; - caps->caps[i].fn = lm_caps[i].fn; - /* FIXME PCI capabilities must be dword aligned. */ - caps->caps[i].start = prev_end + 1; - caps->caps[i].end = prev_end = caps->caps[i].start + lm_caps[i].size - 1; + caps->caps[i].start = next; + caps->caps[i].end = next + size - 1; + + memcpy(&config_space->hdr.raw[next], cap, size); + *prev = next; + prev = &config_space->hdr.raw[next + PCI_CAP_LIST_NEXT]; + *prev = 0; + next += size; + assert(next % 4 == 0); /* FIXME */ + + lm_log(lm_ctx, LM_DBG, "initialized capability %s %#x-%#x\n", + cap_handlers[id].name, caps->caps[i].start, caps->caps[i].end); } caps->nr_caps = nr_caps; @@ -44,7 +44,7 @@ struct caps; * capabilities have been added. */ struct caps * -caps_create(const lm_cap_t *caps, int nr_caps); +caps_create(lm_ctx_t *lm_ctx, lm_cap_t **caps, int nr_caps); /* * Conditionally accesses the PCI capabilities. Returns: @@ -54,8 +54,11 @@ caps_create(const lm_cap_t *caps, int nr_caps); * <0: negative error code on error. */ ssize_t -cap_maybe_access(struct caps *caps, void *pvt, char *buf, size_t count, - loff_t offset, bool is_write); +cap_maybe_access(lm_ctx_t *lm_ctx, struct caps *caps, char *buf, size_t count, + loff_t offset); + +uint8_t * +cap_find_by_id(lm_ctx_t *lm_ctx, uint8_t id); #endif /* __CAP_H__ */ diff --git a/lib/caps/common.h b/lib/caps/common.h new file mode 100644 index 0000000..2181a3b --- /dev/null +++ b/lib/caps/common.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#ifndef LM_PCI_CAP_COMMON_H +#define LM_PCI_CAP_COMMON_H + +#include <stddef.h> + +struct cap_hdr { + uint8_t id; + uint8_t next; +} __attribute__((packed)); +_Static_assert(sizeof(struct cap_hdr) == 0x2, "bad PCI capability header size"); +_Static_assert(offsetof(struct cap_hdr, id) == PCI_CAP_LIST_ID, "bad offset"); +_Static_assert(offsetof(struct cap_hdr, next) == PCI_CAP_LIST_NEXT, "bad offset"); + +#endif /* LM_PCI_CAP_COMMON_H */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/caps/msi.h b/lib/caps/msi.h index b310ae9..5933006 100644 --- a/lib/caps/msi.h +++ b/lib/caps/msi.h @@ -33,11 +33,7 @@ #ifndef LM_PCI_CAP_MSI_H #define LM_PCI_CAP_MSI_H -struct mid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__ ((packed)); -_Static_assert(sizeof(struct mid) == 0x2, "bad MID size"); +#include "common.h" struct mc { unsigned int msie:1; @@ -56,7 +52,7 @@ struct ma { _Static_assert(sizeof(struct ma) == 0x4, "bad MA size"); struct msicap { - struct mid mid; + struct cap_hdr hdr; struct mc mc; struct ma ma; uint32_t mua; @@ -66,6 +62,7 @@ struct msicap { uint32_t mpend; } __attribute__ ((packed)); _Static_assert(sizeof(struct msicap) == 0x18, "bad MSICAP size"); +_Static_assert(offsetof(struct msicap, hdr) == 0, "bad offset"); #endif /* LM_CAP_MSI_H */ diff --git a/lib/caps/msix.h b/lib/caps/msix.h index b13c1c8..b0bc1a5 100644 --- a/lib/caps/msix.h +++ b/lib/caps/msix.h @@ -35,12 +35,6 @@ #include <linux/pci_regs.h> -struct mxid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__ ((packed)); -_Static_assert(sizeof(struct mxid) == 0x2, "bad MXID size"); - struct mxc { unsigned int ts:11; unsigned int reserved:3; @@ -63,12 +57,13 @@ _Static_assert(sizeof(struct mtab) == PCI_MSIX_PBA - PCI_MSIX_TABLE, "bad MPBA size"); struct msixcap { - struct mxid mxid; + struct cap_hdr hdr; struct mxc mxc; struct mtab mtab; struct mpba mpba; } __attribute__ ((packed)) __attribute__ ((aligned(4))); _Static_assert(sizeof(struct msixcap) == PCI_CAP_MSIX_SIZEOF, "bad MSI-X size"); +_Static_assert(offsetof(struct msixcap, hdr) == 0, "bad offset"); #endif /* LM_CAP_MSIX_H */ diff --git a/lib/caps/pm.h b/lib/caps/pm.h index ddae2e6..e976d95 100644 --- a/lib/caps/pm.h +++ b/lib/caps/pm.h @@ -33,11 +33,7 @@ #ifndef LM_PCI_CAP_PM_H #define LM_PCI_CAP_PM_H -struct pid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__((packed)); -_Static_assert(sizeof(struct pid) == 0x2, "bad PID size"); +#include "common.h" struct pc { unsigned int vs:3; @@ -60,15 +56,16 @@ struct pmcs { unsigned 
int dse:4; unsigned int dsc:2; unsigned int pmes:1; -}; -_Static_assert(sizeof(struct pc) == 0x2, "bad PC size"); +} __attribute__((packed)); +_Static_assert(sizeof(struct pc) == 0x2, "bad PMCS size"); struct pmcap { - struct pid pid; + struct cap_hdr hdr; struct pc pc; struct pmcs pmcs; -} __attribute__((packed)) __attribute__ ((aligned(8))); +} __attribute__((packed)) __attribute__ ((aligned(8))); /* FIXME why does it need to be aligned? */ _Static_assert(sizeof(struct pmcap) == PCI_PM_SIZEOF, "bad PC size"); +_Static_assert(offsetof(struct pmcap, hdr) == 0, "bad offset"); #endif /* LM_CAP_PM_H */ diff --git a/lib/caps/px.h b/lib/caps/px.h index ce17cfe..28a04d5 100644 --- a/lib/caps/px.h +++ b/lib/caps/px.h @@ -33,11 +33,7 @@ #ifndef LM_PCI_CAP_PX_H #define LM_PCI_CAP_PX_H -struct pxid { - unsigned int cid:8; - unsigned int next:8; -} __attribute__((packed)); -_Static_assert(sizeof(struct pxid) == 0x2, "bad PXID size"); +#include "common.h" struct pxcaps { unsigned int ver:4; @@ -133,7 +129,7 @@ _Static_assert(sizeof(struct pxdc2) == 0x2, "bad PXDC2 size"); * the whole struct. */ struct pxcap { - struct pxid pxid; + struct cap_hdr hdr; struct pxcaps pxcaps; struct pxdcap pxdcap; union pxdc pxdc; @@ -147,6 +143,7 @@ struct pxcap { } __attribute__((packed)); _Static_assert(sizeof(struct pxcap) == 0x2a, "bad PCI Express Capability size"); +_Static_assert(offsetof(struct pxcap, hdr) == 0, "bad offset"); #endif /* LM_PCI_CAP_PX_H */ diff --git a/lib/common.h b/lib/common.h index 27d6735..f5de4d8 100644 --- a/lib/common.h +++ b/lib/common.h @@ -45,18 +45,18 @@ #define likely(e) __builtin_expect(!!(e), 1) #define unlikely(e) __builtin_expect(e, 0) +/* XXX NB 2nd argument must be power of two */ #define ROUND_DOWN(x, a) ((x) & ~((a)-1)) #define ROUND_UP(x,a) ROUND_DOWN((x)+(a)-1, a) void lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...); -#ifdef DEBUG +#ifdef LM_VERBOSE_LOGGING void -dump_buffer(lm_ctx_t *lm_ctx, const char *prefix, - const char *buf, uint32_t count); +dump_buffer(const char *prefix, const char *buf, uint32_t count); #else -#define dump_buffer(lm_ctx, prefix, buf, count) +#define dump_buffer(prefix, buf, count) #endif #endif /* __COMMON_H__ */ @@ -66,7 +66,7 @@ fds_are_same_file(int fd1, int fd2) } dma_controller_t * -dma_controller_create(int max_regions) +dma_controller_create(lm_ctx_t *lm_ctx, int max_regions) { dma_controller_t *dma; @@ -77,37 +77,89 @@ dma_controller_create(int max_regions) return dma; } + dma->lm_ctx = lm_ctx; dma->max_regions = max_regions; dma->nregions = 0; memset(dma->regions, 0, max_regions * sizeof(dma->regions[0])); + dma->dirty_pgsize = 0; return dma; } static void -_dma_controller_do_remove_region(dma_memory_region_t *region) +_dma_controller_do_remove_region(dma_controller_t *dma, + dma_memory_region_t *region) { - assert(region); - dma_unmap_region(region, region->virt_addr, region->size); - (void)close(region->fd); + int err; + + assert(dma != NULL); + assert(region != NULL); + + err = dma_unmap_region(region, region->virt_addr, region->size); + if (err != 0) { + lm_log(dma->lm_ctx, LM_DBG, "failed to unmap fd=%d vaddr=%#lx-%#lx\n", + region->fd, region->virt_addr, region->size); + } + if (region->fd != -1) { + if (close(region->fd) == -1) { + lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n", region->fd); + } + } +} + +/* + * FIXME no longer used. Also, it doesn't work for addresses that span two + * DMA regions. 
+ */ +bool +dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, + size_t size) +{ + dma_memory_region_t *region; + int i; + + for (i = 0; i < dma->nregions; i++) { + region = &dma->regions[i]; + if (dma_addr == region->dma_addr && size <= region->size) { + return true; + } + } + + return false; } /* FIXME not thread safe */ int -dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr, - size_t size, int fd) +dma_controller_remove_region(dma_controller_t *dma, + dma_addr_t dma_addr, size_t size, + int (*unmap_dma) (void*, uint64_t), void *data) { int idx; dma_memory_region_t *region; + int err; - assert(dma); + assert(dma != NULL); for (idx = 0; idx < dma->nregions; idx++) { region = &dma->regions[idx]; - if (region->dma_addr == dma_addr && region->size == size && - fds_are_same_file(region->fd, fd)) { - _dma_controller_do_remove_region(region); + if (region->dma_addr == dma_addr && region->size == size) { + if (region->refcnt > 0) { + err = unmap_dma(data, region->dma_addr); + if (err != 0) { + lm_log(dma->lm_ctx, LM_ERR, + "failed to notify of removal of DMA region %#lx-%#lx: %s\n", + region->dma_addr, region->dma_addr + region->size, + strerror(-err)); + return err; + } + assert(region->refcnt == 0); + } + _dma_controller_do_remove_region(dma, region); if (dma->nregions > 1) + /* + * FIXME valgrind complains with 'Source and destination overlap in memcpy', + * check whether memmove eliminates this warning. + */ memcpy(region, &dma->regions[dma->nregions - 1], sizeof *region); dma->nregions--; @@ -118,7 +170,7 @@ dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr, } static inline void -dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma) +dma_controller_remove_regions(dma_controller_t *dma) { int i; @@ -127,26 +179,26 @@ dma_controller_remove_regions(lm_ctx_t *ctx, dma_controller_t *dma) for (i = 0; i < dma->nregions; i++) { dma_memory_region_t *region = &dma->regions[i]; - lm_log(ctx, LM_INF, "unmap vaddr=%lx IOVA=%lx\n", + lm_log(dma->lm_ctx, LM_INF, "unmap vaddr=%#lx IOVA=%#lx", region->virt_addr, region->dma_addr); - _dma_controller_do_remove_region(region); + _dma_controller_do_remove_region(dma, region); } } void -dma_controller_destroy(lm_ctx_t *lm_ctx, dma_controller_t *dma) +dma_controller_destroy(dma_controller_t *dma) { if (dma == NULL) { return; } - dma_controller_remove_regions(lm_ctx, dma); + dma_controller_remove_regions(dma); free(dma); } int -dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, +dma_controller_add_region(dma_controller_t *dma, dma_addr_t dma_addr, size_t size, int fd, off_t offset) { @@ -160,8 +212,8 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, /* First check if this is the same exact region. */ if (region->dma_addr == dma_addr && region->size == size) { if (offset != region->offset) { - lm_log(lm_ctx, LM_ERR, "bad offset for new DMA region %lx+%lx, " - "want=%d, existing=%d\n", + lm_log(dma->lm_ctx, LM_ERR, + "bad offset for new DMA region %#lx+%#lx, want=%d, existing=%d\n", dma_addr, size, offset, region->offset); goto err; } @@ -172,8 +224,9 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, * the same file, however in the majority of cases we'll be * using a single fd. 
*/ - lm_log(lm_ctx, LM_ERR, "bad fd=%d for new DMA region %lx-%lx, " - "existing fd=%d\n", fd, region->fd); + lm_log(dma->lm_ctx, LM_ERR, + "bad fd=%d for new DMA region %#lx-%#lx, existing fd=%d\n", + fd, region->fd); goto err; } return idx; @@ -184,16 +237,17 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, dma_addr < region->dma_addr + region->size) || (region->dma_addr >= dma_addr && region->dma_addr < dma_addr + size)) { - lm_log(lm_ctx, LM_INF, "new DMA region %lx+%lx overlaps with DMA " - "region %lx-%lx\n", dma_addr, size, region->dma_addr, - region->size); + lm_log(dma->lm_ctx, LM_INF, + "new DMA region %#lx+%#lx overlaps with DMA region %#lx-%#lx\n", + dma_addr, size, region->dma_addr, region->size); goto err; } } if (dma->nregions == dma->max_regions) { idx = dma->max_regions; - lm_log(lm_ctx, LM_ERR, "reached maxed regions, recompile with higher number of DMA regions\n"); + lm_log(dma->lm_ctx, LM_ERR, + "reached maxed regions, recompile with higher number of DMA regions\n"); goto err; } @@ -202,7 +256,7 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, page_size = fd_get_blocksize(fd); if (page_size < 0) { - lm_log(lm_ctx, LM_ERR, "bad page size %d\n", page_size); + lm_log(dma->lm_ctx, LM_ERR, "bad page size %d\n", page_size); goto err; } page_size = MAX(page_size, getpagesize()); @@ -211,20 +265,21 @@ dma_controller_add_region(lm_ctx_t *lm_ctx, dma_controller_t *dma, region->size = size; region->page_size = page_size; region->offset = offset; - - region->fd = dup(fd); // dup the fd to get our own private copy - if (region->fd < 0) { - lm_log(lm_ctx, LM_ERR, "failed to duplicate file descriptor: %s\n", - strerror(errno)); - goto err; - } + region->fd = fd; + region->refcnt = 0; region->virt_addr = dma_map_region(region, PROT_READ | PROT_WRITE, 0, region->size); if (region->virt_addr == MAP_FAILED) { - lm_log(lm_ctx, LM_ERR, "failed to memory map DMA region %lx-%lx: %s\n", + lm_log(dma->lm_ctx, LM_ERR, + "failed to memory map DMA region %#lx-%#lx: %s\n", dma_addr, dma_addr + size, strerror(errno)); - close(region->fd); + if (region->fd != -1) { + if (close(region->fd) == -1) { + lm_log(dma->lm_ctx, LM_DBG, "failed to close fd %d: %m\n", + region->fd); + } + } goto err; } dma->nregions++; @@ -269,17 +324,17 @@ dma_map_region(dma_memory_region_t *region, int prot, size_t offset, size_t len) return mmap_base + (offset - mmap_offset); } -void +int dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len) { mmap_round((size_t *)&virt_addr, &len, region->page_size); - munmap(virt_addr, len); + return munmap(virt_addr, len); } int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { int idx; int cnt = 0; @@ -295,9 +350,13 @@ _dma_addr_sg_split(const dma_controller_t *dma, size_t region_len = MIN(region_end - dma_addr, len); if (cnt < max_sg) { + sg[cnt].dma_addr = region->dma_addr; sg[cnt].region = idx; sg[cnt].offset = dma_addr - region->dma_addr; sg[cnt].length = region_len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, sg); + } } cnt++; @@ -326,4 +385,117 @@ out: return cnt; } +ssize_t _get_bitmap_size(size_t region_size, size_t pgsize) +{ + if (pgsize == 0) { + return -EINVAL; + } + if (region_size < pgsize) { + return -EINVAL; + } + size_t nr_pages = (region_size / pgsize) + (region_size % pgsize != 0); + return (nr_pages / CHAR_BIT) + (nr_pages % CHAR_BIT != 0); +} + +int 
dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize) +{ + int i; + + assert(dma != NULL); + + if (pgsize == 0) { + return -EINVAL; + } + + if (dma->dirty_pgsize > 0) { + if (dma->dirty_pgsize != pgsize) { + return -EINVAL; + } + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + dma_memory_region_t *region = &dma->regions[i]; + ssize_t bitmap_size = _get_bitmap_size(region->size, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + region->dirty_bitmap = calloc(bitmap_size, sizeof(char)); + if (region->dirty_bitmap == NULL) { + int j, ret = -errno; + for (j = 0; j < i; j++) { + free(region->dirty_bitmap); + region->dirty_bitmap = NULL; + } + return ret; + } + } + dma->dirty_pgsize = pgsize; + return 0; +} + +int dma_controller_dirty_page_logging_stop(dma_controller_t *dma) +{ + int i; + + assert(dma != NULL); + + if (dma->dirty_pgsize == 0) { + return 0; + } + + for (i = 0; i < dma->nregions; i++) { + free(dma->regions[i].dirty_bitmap); + dma->regions[i].dirty_bitmap = NULL; + } + dma->dirty_pgsize = 0; + return 0; +} + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data) +{ + int ret; + ssize_t bitmap_size; + dma_sg_t sg; + dma_memory_region_t *region; + + assert(dma != NULL); + assert(data != NULL); + + /* + * FIXME for now we support IOVAs that match exactly the DMA region. This + * is purely for simplifying the implementation. We MUST allow arbitrary + * IOVAs. + */ + ret = dma_addr_to_sg(dma, addr, len, &sg, 1, PROT_NONE); + if (ret != 1 || sg.dma_addr != addr || sg.length != len) { + return -ENOTSUP; + } + + if (pgsize != dma->dirty_pgsize) { + return -EINVAL; + } + + bitmap_size = _get_bitmap_size(len, pgsize); + if (bitmap_size < 0) { + return bitmap_size; + } + + /* + * FIXME they must be equal because this is how much data the client + * expects to receive. + */ + if (size != (size_t)bitmap_size) { + return -EINVAL; + } + + region = &dma->regions[sg.region]; + + *data = region->dirty_bitmap; + + return 0; +} + /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ @@ -32,6 +32,11 @@ #define DMA_DMA_H /* + * FIXME check whether DMA regions must be page aligned. If so then the + * implementation can be greatly simpified. + */ + +/* * This library emulates a DMA controller for a device emulation application to * perform DMA operations on a foreign memory space. * @@ -72,6 +77,8 @@ #include "muser.h" #include "common.h" +struct lm_ctx; + typedef struct { dma_addr_t dma_addr; // DMA address of this region size_t size; // Size of this region @@ -79,19 +86,23 @@ typedef struct { int page_size; // Page size of this fd off_t offset; // File offset void *virt_addr; // Virtual address of this region + int refcnt; // Number of users of this region + char *dirty_bitmap; // Dirty page bitmap } dma_memory_region_t; typedef struct { int max_regions; int nregions; + struct lm_ctx *lm_ctx; + size_t dirty_pgsize; // Dirty page granularity dma_memory_region_t regions[0]; } dma_controller_t; dma_controller_t * -dma_controller_create(int max_regions); +dma_controller_create(lm_ctx_t *lm_ctx, int max_regions); void -dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma); +dma_controller_destroy(dma_controller_t *dma); /* Registers a new memory region. * Returns: @@ -101,19 +112,72 @@ dma_controller_destroy(lm_ctx_t *ctx, dma_controller_t *dma); * (e.g. due to conflict with existing region). 
*/ int -dma_controller_add_region(lm_ctx_t *ctx, dma_controller_t *dma, +dma_controller_add_region(dma_controller_t *dma, dma_addr_t dma_addr, size_t size, int fd, off_t offset); int -dma_controller_remove_region(dma_controller_t *dma, dma_addr_t dma_addr, - size_t size, int fd); +dma_controller_remove_region(dma_controller_t *dma, + dma_addr_t dma_addr, size_t size, + int (*unmap_dma) (void*, uint64_t), void *data); // Helper for dma_addr_to_sg() slow path. int _dma_addr_sg_split(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); + +static bool +_dma_should_mark_dirty(const dma_controller_t *dma, int prot) +{ + assert(dma != NULL); + + return (prot & PROT_WRITE) == PROT_WRITE && dma->dirty_pgsize > 0; +} + +static size_t +_get_pgstart(size_t pgsize, uint64_t base_addr, uint64_t offset) +{ + return (offset - base_addr) / pgsize; +} + +static size_t +_get_pgend(size_t pgsize, uint64_t len, size_t start) +{ + return start + (len / pgsize) + (len % pgsize != 0) - 1; +} + +static void +_dma_bitmap_get_pgrange(const dma_controller_t *dma, + const dma_memory_region_t *region, + const dma_sg_t *sg, size_t *start, size_t *end) +{ + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(start != NULL); + assert(end != NULL); + + *start = _get_pgstart(dma->dirty_pgsize, region->dma_addr, sg->offset); + *end = _get_pgend(dma->dirty_pgsize, sg->length, *start); +} + +static void +_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region, + dma_sg_t *sg) +{ + size_t i, start, end; + + assert(dma != NULL); + assert(region != NULL); + assert(sg != NULL); + assert(region->dirty_bitmap != NULL); + + _dma_bitmap_get_pgrange(dma, region, sg, &start, &end); + for (i = start; i <= end; i++) { + region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT); + } +} /* Takes a linear dma address span and returns a sg list suitable for DMA. * A single linear dma address span may need to be split into multiple @@ -129,7 +193,7 @@ _dma_addr_sg_split(const dma_controller_t *dma, static inline int dma_addr_to_sg(const dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg) + dma_sg_t *sg, int max_sg, int prot) { static __thread int region_hint; int cnt; @@ -139,14 +203,19 @@ dma_addr_to_sg(const dma_controller_t *dma, // Fast path: single region. if (likely(max_sg > 0 && len > 0 && - dma_addr >= region->dma_addr && dma_addr + len <= region_end)) { + dma_addr >= region->dma_addr && dma_addr + len <= region_end && + region_hint < dma->nregions)) { + sg->dma_addr = region->dma_addr; sg->region = region_hint; sg->offset = dma_addr - region->dma_addr; sg->length = len; + if (_dma_should_mark_dirty(dma, prot)) { + _dma_mark_dirty(dma, region, sg); + } return 1; } // Slow path: search through regions. 
- cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg); + cnt = _dma_addr_sg_split(dma, dma_addr, len, sg, max_sg, prot); if (likely(cnt > 0)) { region_hint = sg->region; } @@ -157,7 +226,7 @@ void * dma_map_region(dma_memory_region_t *region, int prot, size_t offset, size_t len); -void +int dma_unmap_region(dma_memory_region_t *region, void *virt_addr, size_t len); static inline int @@ -168,31 +237,53 @@ dma_map_sg(dma_controller_t *dma, const dma_sg_t *sg, struct iovec *iov, int i; for (i = 0; i < cnt; i++) { + lm_log(dma->lm_ctx, LM_DBG, "map %#lx-%#lx\n", + sg->dma_addr + sg->offset, sg->dma_addr + sg->offset + sg->length); region = &dma->regions[sg[i].region]; iov[i].iov_base = region->virt_addr + sg[i].offset; iov[i].iov_len = sg[i].length; + region->refcnt++; } return 0; } +/* FIXME useless define */ #define UNUSED __attribute__((unused)) static inline void -dma_unmap_sg(UNUSED dma_controller_t *dma, UNUSED const dma_sg_t *sg, - UNUSED struct iovec *iov, UNUSED int cnt) +dma_unmap_sg(dma_controller_t *dma, const dma_sg_t *sg, + UNUSED struct iovec *iov, int cnt) { - /* just a placeholder for now */ + int i; + + for (i = 0; i < cnt; i++) { + dma_memory_region_t *r; + /* + * FIXME this double loop will be removed if we replace the array with + * tfind(3) + */ + for (r = dma->regions; + r < dma->regions + dma->nregions && r->dma_addr != sg[i].dma_addr; + r++); + if (r > dma->regions + dma->nregions) { + /* bad region */ + continue; + } + lm_log(dma->lm_ctx, LM_DBG, "unmap %#lx-%#lx\n", + sg[i].dma_addr + sg[i].offset, sg[i].dma_addr + sg[i].offset + sg[i].length); + r->refcnt--; + } return; } static inline void * -dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len) +dma_map_addr(dma_controller_t *dma, dma_addr_t dma_addr, uint32_t len, int prot) { dma_sg_t sg; struct iovec iov; - if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1) == 1 && + if (dma_addr_to_sg(dma, dma_addr, len, &sg, 1, prot) == 1 && dma_map_sg(dma, &sg, &iov, 1) == 0) { return iov.iov_base; } @@ -211,12 +302,26 @@ dma_unmap_addr(dma_controller_t *dma, }; int r; - r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1); + r = dma_addr_to_sg(dma, dma_addr, len, &sg, 1, PROT_NONE); assert(r == 1); dma_unmap_sg(dma, &sg, &iov, 1); } +int +dma_controller_dirty_page_logging_start(dma_controller_t *dma, size_t pgsize); + +int +dma_controller_dirty_page_logging_stop(dma_controller_t *dma); + +int +dma_controller_dirty_page_get(dma_controller_t *dma, dma_addr_t addr, int len, + size_t pgsize, size_t size, char **data); + +bool +dma_controller_region_valid(dma_controller_t *dma, dma_addr_t dma_addr, + size_t size); + #endif /* DMA_DMA_H */ /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/muser.h b/lib/muser.h index f3330fe..a39d477 100644 --- a/lib/muser.h +++ b/lib/muser.h @@ -37,22 +37,27 @@ #include <sys/uio.h> #include <unistd.h> +#include "vfio_user.h" #include "pci.h" +#include "caps/pm.h" +#include "caps/px.h" +#include "caps/msi.h" +#include "caps/msix.h" -/* - * Influential enviroment variables: - * - * LM_TERSE_LOGGING: define to make libmuser log only erroneous PCI accesses. 
- * (this should really be done with a more fine grained debug - * level) - */ -#ifndef LM_TERSE_LOGGING -#define LM_TERSE_LOGGING 0 -#endif +#define LIB_MUSER_VFIO_USER_VERS_MJ 0 +#define LIB_MUSER_VFIO_USER_VERS_MN 1 + +#define VFIO_NAME "vfio" +#define VFIO_DIR "/dev/" VFIO_NAME "/" +#define VFIO_CONTAINER VFIO_DIR "/" VFIO_NAME + +#define MUSER_DIR "/var/run/muser/" +#define MUSER_SOCK "cntrl" typedef uint64_t dma_addr_t; typedef struct { + dma_addr_t dma_addr; int region; int length; uint64_t offset; @@ -134,6 +139,8 @@ typedef struct { /* * Callback function that is called when the region is read or written. + * Note that the memory of the region is owned by the user, except for the + * standard header (first 64 bytes) of the PCI configuration space. */ lm_region_access_t *fn; @@ -149,9 +156,12 @@ enum { LM_DEV_INTX_IRQ_IDX, LM_DEV_MSI_IRQ_IDX, LM_DEV_MSIX_IRQ_IDX, - LM_DEV_NUM_IRQS = 3 + LM_DEV_ERR_IRQ_INDEX, + LM_DEV_REQ_IRQ_INDEX, + LM_DEV_NUM_IRQS }; +/* FIXME these are PCI regions */ enum { LM_DEV_BAR0_REG_IDX, LM_DEV_BAR1_REG_IDX, @@ -162,7 +172,15 @@ enum { LM_DEV_ROM_REG_IDX, LM_DEV_CFG_REG_IDX, LM_DEV_VGA_REG_IDX, - LM_DEV_NUM_REGS = 9 + /* + * FIXME this really belong here, but simplifies implementation for now. A + * migration region can exist for non-PCI devices (can its index be + * anything?). In any case, we should allow the user to define custom regions + * at will, by fixing the migration region in that position we don't allow + * this. + */ + LM_DEV_MIGRATION_REG_IDX, + LM_DEV_NUM_REGS = 10, /* TODO rename to LM_DEV_NUM_PCI_REGS */ }; typedef struct { @@ -191,7 +209,7 @@ typedef struct { } lm_pci_info_t; /* - * Returns a pointer to the non-standard part of the PCI configuration space. + * Returns a pointer to the standard part of the PCI configuration space. */ lm_pci_config_space_t *lm_get_pci_config_space(lm_ctx_t *lm_ctx); @@ -208,7 +226,7 @@ typedef enum { * * @lm_log_fn_t: typedef for log function. */ -typedef void (lm_log_fn_t) (void *pvt, const char *msg); +typedef void (lm_log_fn_t) (void *pvt, lm_log_lvl_t lvl, const char *msg); /** * Callback function that gets called when a capability is accessed. The @@ -228,26 +246,77 @@ typedef ssize_t (lm_cap_access_t) (void *pvt, uint8_t id, char *buf, size_t count, loff_t offset, bool is_write); +/* FIXME does it have to be packed as well? */ +typedef union { + struct msicap msi; + struct msixcap msix; + struct pmcap pm; + struct pxcap px; +} lm_cap_t; + +typedef enum { + LM_TRANS_KERNEL, + LM_TRANS_SOCK, + LM_TRANS_MAX +} lm_trans_t; + +#define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF + +/* + * FIXME the names of migration callback functions are probably far too long, + * but for now it helps with the implementation. + */ +typedef int (lm_migration_callback_t)(void *pvt); + +typedef enum { + LM_MIGR_STATE_STOP, + LM_MIGR_STATE_START, + LM_MIGR_STATE_STOP_AND_COPY, + LM_MIGR_STATE_PRE_COPY, + LM_MIGR_STATE_RESUME +} lm_migr_state_t; + typedef struct { + /* migration state transition callback */ + /* TODO rename to lm_migration_state_transition_callback */ + /* FIXME maybe we should create a single callback and pass the state? */ + int (*transition)(void *pvt, lm_migr_state_t state); + + /* Callbacks for saving device state */ + /* - * Capability ID, as defined by the PCI specification. Also defined as - * PCI_CAP_ID_XXX in <linux/pci_regs.h>. + * Function that is called to retrieve pending migration data. 
If migration + * data were previously made available (function prepare_data has been + * called) then calling this function signifies that they have been read + * (e.g. migration data can be discarded). If the function returns 0 then + * migration has finished and this function won't be called again. */ - uint8_t id; + __u64 (*get_pending_bytes)(void *pvt); /* - * Size of the capability. + * Function that is called to instruct the device to prepare migration data. + * The function must return only after migration data are available at the + * specified offset. */ - size_t size; + int (*prepare_data)(void *pvt, __u64 *offset, __u64 *size); /* - * Function to call back when the capability gets read or written. + * Function that is called to read migration data. offset and size can + * be any subrange on the offset and size previously returned by + * prepare_data. The function must return the amount of data read. This + * function can be called even if the migration data can be memory mapped. + * + * Does this mean that reading data_offset/data_size updates the values? */ - lm_cap_access_t *fn; -} lm_cap_t; + size_t (*read_data)(void *pvt, void *buf, __u64 count, __u64 offset); -#define LM_MAX_CAPS (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF + /* Callback for restoring device state */ + + /* Fuction that is called for writing previously stored device state. */ + size_t (*write_data)(void *pvt, void *data, __u64 size); + +} lm_migration_callbacks_t; /** * Device information structure, used to create the lm_ctx. @@ -287,16 +356,36 @@ typedef struct { int (*reset) (void *pvt); /* - * PCI capabilities. The user needs to only define the ID and size of each - * capability. The actual capability is not maintained by libmuser. When a - * capability is accessed the appropriate callback function is called. + * Function that is called when the guest maps a DMA region. Optional. + */ + void (*map_dma) (void *pvt, uint64_t iova, uint64_t len); + + /* + * Function that is called when the guest unmaps a DMA region. The device + * must release all references to that region before the callback returns. + * This is required if you want to be able to access guest memory. */ - lm_cap_t caps[LM_MAX_CAPS]; + int (*unmap_dma) (void *pvt, uint64_t iova); + + lm_trans_t trans; /* - * Number of capabilities in above array. + * Attaching to the transport is non-blocking. The library will not attempt + * to attach during context creation time. The caller must then manually + * call lm_ctx_try_attach(), which is non-blocking, as many times as + * necessary. + */ +#define LM_FLAG_ATTACH_NB (1 << 0) + uint64_t flags; + + /* + * PCI capabilities. */ int nr_caps; + lm_cap_t **caps; + + lm_migration_callbacks_t migration_callbacks; + } lm_dev_info_t; /** @@ -339,18 +428,49 @@ int lm_ctx_run(lm_dev_info_t *dev_info); /** + * Polls, without blocking, an lm_ctx. This is an alternative to using + * a thread and making a blocking call to lm_ctx_drive(). Instead, the + * application can periodically poll the context directly from one of + * its own threads. + * + * This is only allowed when LM_FLAG_ATTACH_NB is specified during creation. + * + * @lm_ctx: The libmuser context to poll + * + * @returns 0 on success, -errno on failure. + */ +int +lm_ctx_poll(lm_ctx_t *lm_ctx); + +/** * Triggers an interrupt. * + * libmuser takes care of using the correct IRQ type (IRQ index: INTx or MSI/X), + * the caller only needs to specify the sub-index. 
+ * + * @lm_ctx: the libmuser context to trigger interrupt + * @subindex: vector subindex to trigger interrupt on + * + * @returns 0 on success, or -1 on failure. Sets errno. + */ +int +lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); + +/** + * Sends message to client to trigger an interrupt. + * * libmuser takes care of using the IRQ type (INTx, MSI/X), the caller only * needs to specify the sub-index. + * This api can be used to trigger interrupt by sending message to client. * * @lm_ctx: the libmuser context to trigger interrupt * @subindex: vector subindex to trigger interrupt on * * @returns 0 on success, or -1 on failure. Sets errno. */ + int -lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); +lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex); /* Helper functions */ @@ -366,12 +486,15 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); * than can be individually mapped in the program's virtual memory. A single * linear guest physical address span may need to be split into multiple * scatter/gather regions due to limitations of how memory can be mapped. + * Field unmap_dma must have been provided at context creation time in order + * to use this function. * * @lm_ctx: the libmuser context * @dma_addr: the guest physical address * @len: size of memory to be mapped * @sg: array that receives the scatter/gather entries to be mapped * @max_sg: maximum number of elements in above array + * @prot: protection as define in <sys/mman.h> * * @returns the number of scatter/gather entries created on success, and on * failure: @@ -381,12 +504,14 @@ lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex); */ int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, uint32_t len, - dma_sg_t *sg, int max_sg); + dma_sg_t *sg, int max_sg, int prot); /** * Maps a list scatter/gather entries from the guest's physical address space * to the program's virtual memory. It is the caller's responsibility to remove * the mappings by calling lm_unmap_sg. + * Field unmap_dma must have been provided at context creation time in order + * to use this function. * * @lm_ctx: the libmuser context * @sg: array of scatter/gather entries returned by lm_addr_to_sg @@ -403,6 +528,8 @@ lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, /** * Unmaps a list scatter/gather entries (previously mapped by lm_map_sg) from * the program's virtual memory. + * Field unmap_dma must have been provided at context creation time in order + * to use this function. * * @lm_ctx: the libmuser context * @sg: array of scatter/gather entries to unmap @@ -426,16 +553,59 @@ lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, int lm_get_region(loff_t pos, size_t count, loff_t *off); +/** + * Read from the dma region exposed by the client. + * + * @lm_ctx: the libmuser context + * @sg: a DMA segment obtained from dma_addr_to_sg + * @data: data buffer to read into + */ +int +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); + +/** + * Write to the dma region exposed by the client. + * + * @lm_ctx: the libmuser context + * @sg: a DMA segment obtained from dma_addr_to_sg + * @data: data buffer to write + */ +int +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data); + /* * Advanced stuff. */ /** - * Returns the non-standard part of the PCI configuragion space. + * Returns the non-standard part of the PCI configuration space. */ uint8_t * lm_get_pci_non_std_config_space(lm_ctx_t *lm_ctx); +/* + * Attempts to attach to the transport. LM_FLAG_ATTACH_NB must be set when + * creating the context. Returns 0 on success and -1 on error. 
If errno is set + * to EAGAIN or EWOULDBLOCK then the transport is not ready to attach to and the + * operation must be retried. + */ +int +lm_ctx_try_attach(lm_ctx_t *lm_ctx); + +/* + * FIXME need to make sure that there can be at most one capability with a given + * ID, otherwise this function will return the first one with this ID. + */ +uint8_t * +lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id); + +void +lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...); + +/* FIXME */ +int muser_send_fds(int sock, int *fds, size_t count); +ssize_t muser_recv_fds(int sock, int *fds, size_t count); + #endif /* LIB_MUSER_H */ /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/muser_ctx.c b/lib/muser_ctx.c index 0de3ac0..92155d7 100644 --- a/lib/muser_ctx.c +++ b/lib/muser_ctx.c @@ -47,13 +47,22 @@ #include <stdarg.h> #include <linux/vfio.h> #include <sys/param.h> +#include <sys/un.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <time.h> +#include <sys/select.h> -#include "../kmod/muser.h" #include "muser.h" #include "muser_priv.h" #include "dma.h" #include "cap.h" +#define MAX_FDS 8 + +#define IOMMU_GRP_NAME "iommu_group" + typedef enum { IRQ_NONE = 0, IRQ_INTX, @@ -61,6 +70,14 @@ typedef enum { IRQ_MSIX, } irq_type_t; +char *irq_to_str[] = { + [LM_DEV_INTX_IRQ_IDX] = "INTx", + [LM_DEV_MSI_IRQ_IDX] = "MSI", + [LM_DEV_MSIX_IRQ_IDX] = "MSI-X", + [LM_DEV_ERR_IRQ_INDEX] = "ERR", + [LM_DEV_REQ_IRQ_INDEX] = "REQ" +}; + typedef struct { irq_type_t type; /* irq type this device is using */ int err_efd; /* eventfd for irq err */ @@ -69,27 +86,517 @@ typedef struct { int efds[0]; /* XXX must be last */ } lm_irqs_t; -/* - * Macro that ensures that a particular struct member is last. Doesn't work for - * flexible array members. 
- */ -#define MUST_BE_LAST(s, m, t) \ - _Static_assert(sizeof(s) - offsetof(s, m) == sizeof(t), \ - #t " " #m " must be last member in " #s) +enum migration_iteration_state { + VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL, + VFIO_USER_MIGRATION_ITERATION_STATE_STARTED, + VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED, + VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED +}; struct lm_ctx { void *pvt; dma_controller_t *dma; int fd; + int conn_fd; int (*reset) (void *pvt); lm_log_lvl_t log_lvl; lm_log_fn_t *log; lm_pci_info_t pci_info; lm_pci_config_space_t *pci_config_space; + lm_trans_t trans; struct caps *caps; + uint64_t flags; + char *uuid; + void (*map_dma) (void *pvt, uint64_t iova, uint64_t len); + int (*unmap_dma) (void *pvt, uint64_t iova); + + /* TODO there should be a void * variable to store transport-specific stuff */ + /* LM_TRANS_SOCK */ + char *iommu_dir; + int iommu_dir_fd; + int sock_flags; + + int client_max_fds; + + struct { + struct vfio_device_migration_info info; + size_t pgsize; + lm_migration_callbacks_t callbacks; + struct { + enum migration_iteration_state state; + __u64 offset; + __u64 size; + } iter; + } migration; + lm_irqs_t irqs; /* XXX must be last */ }; -MUST_BE_LAST(struct lm_ctx, irqs, lm_irqs_t); + + +/* function prototypes */ +static void +free_sparse_mmap_areas(lm_reg_info_t*); + +static inline int recv_blocking(int sock, void *buf, size_t len, int flags) +{ + int f = fcntl(sock, F_GETFL, 0); + int ret, fret; + + fret = fcntl(sock, F_SETFL, f & ~O_NONBLOCK); + assert(fret != -1); + + ret = recv(sock, buf, len, flags); + + fret = fcntl(sock, F_SETFL, f); + assert(fret != -1); + + return ret; +} + +static int +init_sock(lm_ctx_t *lm_ctx) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + int ret, unix_sock; + mode_t mode; + + assert(lm_ctx != NULL); + + lm_ctx->iommu_dir = strdup(lm_ctx->uuid); + if (!lm_ctx->iommu_dir) { + return -ENOMEM; + } + + /* FIXME SPDK can't easily run as non-root */ + mode = umask(0000); + + if ((unix_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + ret = errno; + goto out; + } + + if (lm_ctx->flags & LM_FLAG_ATTACH_NB) { + ret = fcntl(unix_sock, F_SETFL, + fcntl(unix_sock, F_GETFL, 0) | O_NONBLOCK); + if (ret < 0) { + ret = errno; + goto close_unix_sock; + } + lm_ctx->sock_flags = MSG_DONTWAIT | MSG_WAITALL; + } else { + lm_ctx->sock_flags = 0; + } + + lm_ctx->iommu_dir_fd = open(lm_ctx->iommu_dir, O_DIRECTORY); + if (lm_ctx->iommu_dir_fd < 0) { + ret = errno; + goto close_unix_sock; + } + + ret = snprintf(addr.sun_path, sizeof addr.sun_path, "%s/" MUSER_SOCK, + lm_ctx->iommu_dir); + if (ret >= (int)sizeof addr.sun_path) { + ret = ENAMETOOLONG; + goto close_iommu_dir_fd; + } + if (ret < 0) { + goto close_iommu_dir_fd; + } + + /* start listening business */ + ret = bind(unix_sock, (struct sockaddr*)&addr, sizeof(addr)); + if (ret < 0) { + ret = errno; + goto close_iommu_dir_fd; + } + + ret = listen(unix_sock, 0); + if (ret < 0) { + ret = errno; + goto close_iommu_dir_fd; + } + + umask(mode); + return unix_sock; + +close_iommu_dir_fd: + close(lm_ctx->iommu_dir_fd); +close_unix_sock: + close(unix_sock); +out: + return -ret; +} + +static void +__free_s(char **p) +{ + free(*p); +} + +int +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *fds, int count) +{ + int ret; + struct vfio_user_header hdr = {.msg_id = msg_id}; + struct msghdr msg; + size_t i; + + if (nr_iovecs == 0) { + iovecs = alloca(sizeof(*iovecs)); + nr_iovecs = 1; + } + + 
memset(&msg, 0, sizeof(msg)); + + if (is_reply) { + hdr.flags.type = VFIO_USER_F_TYPE_REPLY; + } else { + hdr.cmd = cmd; + hdr.flags.type = VFIO_USER_F_TYPE_COMMAND; + } + + iovecs[0].iov_base = &hdr; + iovecs[0].iov_len = sizeof(hdr); + + for (i = 0; i < nr_iovecs; i++) { + hdr.msg_size += iovecs[i].iov_len; + } + + msg.msg_iovlen = nr_iovecs; + msg.msg_iov = iovecs; + + if (fds != NULL) { + size_t size = count * sizeof *fds; + char *buf = alloca(CMSG_SPACE(size)); + + msg.msg_control = buf; + msg.msg_controllen = CMSG_SPACE(size); + + struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(size); + memcpy(CMSG_DATA(cmsg), fds, size); + } + + ret = sendmsg(sock, &msg, 0); + if (ret == -1) { + return -errno; + } + + return 0; +} + +int +send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count) { + + struct iovec iovecs[2] = { + [1] = { + .iov_base = data, + .iov_len = data_len + } + }; + return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs, + ARRAY_SIZE(iovecs), fds, count); +} + +int +send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, + char *caps) +{ + int ret; + char *data; + + ret = asprintf(&data, + "{version: {\"major\": %d, \"minor\": %d}, capabilities: %s}", + major, minor, caps != NULL ? caps : "{}"); + if (ret == -1) { + return -1; + } + ret = send_vfio_user_msg(sock, msg_id, is_reply, VFIO_USER_VERSION, data, + ret, NULL, 0); + free(data); + return ret; +} + +int +recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, + uint16_t *msg_id, void *data, size_t *len) +{ + int ret; + + ret = recv_blocking(sock, hdr, sizeof(*hdr), 0); + if (ret == -1) { + return -errno; + } + if (ret < (int)sizeof(*hdr)) { + return -EINVAL; + } + + if (is_reply) { + if (hdr->msg_id != *msg_id) { + return -EINVAL; + } + + if (hdr->flags.type != VFIO_USER_F_TYPE_REPLY) { + return -EINVAL; + } + + if (hdr->flags.error == 1U) { + if (hdr->error_no <= 0) { + hdr->error_no = EINVAL; + } + return -hdr->error_no; + } + } else { + if (hdr->flags.type != VFIO_USER_F_TYPE_COMMAND) { + return -EINVAL; + } + *msg_id = hdr->msg_id; + } + + if (len != NULL && *len > 0 && hdr->msg_size > sizeof *hdr) { + ret = recv_blocking(sock, data, MIN(hdr->msg_size - sizeof *hdr, *len), + 0); + if (ret < 0) { + return ret; + } + if (*len != (size_t)ret) { /* FIXME we should allow receiving less */ + return -EINVAL; + } + *len = ret; + } + return 0; +} + +int +recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, + int *max_fds, size_t *pgsize) +{ + int ret; + struct vfio_user_header hdr; + char *data __attribute__((__cleanup__(__free_s))) = NULL; + + ret = recv_vfio_user_msg(sock, &hdr, is_reply, msg_id, NULL, NULL); + if (ret < 0) { + return ret; + } + + hdr.msg_size -= sizeof(hdr); + data = malloc(hdr.msg_size); + if (data == NULL) { + return -errno; + } + ret = recv_blocking(sock, data, hdr.msg_size, 0); + if (ret == -1) { + return -errno; + } + if (ret < (int)hdr.msg_size) { + return -EINVAL; + } + + /* FIXME use proper parsing */ + ret = sscanf(data, + "{version: {\"major\": %d, \"minor\": %d}, capabilities: {max_fds: %d, migration: {pgsize: %lu}}}", + major, minor, max_fds, pgsize); + if (ret != 4) { + return -EINVAL; + } + return 0; +} + +int +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int 
*send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs, + send_fds, fd_count); + if (ret < 0) { + return ret; + } + if (hdr == NULL) { + hdr = alloca(sizeof *hdr); + } + return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len); +} + +int +send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + struct iovec iovecs[2] = { + [1] = { + .iov_base = send_data, + .iov_len = send_len + } + }; + return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs, + ARRAY_SIZE(iovecs), send_fds, fd_count, + hdr, recv_data, recv_len); +} + +static int +set_version(lm_ctx_t *lm_ctx, int sock) +{ + int ret; + int client_mj, client_mn; + uint16_t msg_id = 0; + char *server_caps; + + ret = asprintf(&server_caps, "{max_fds: %d, migration: {pgsize: %ld}}", + MAX_FDS, sysconf(_SC_PAGESIZE)); + if (ret == -1) { + return -ENOMEM; + } + + ret = send_version(sock, LIB_MUSER_VFIO_USER_VERS_MJ, + LIB_MUSER_VFIO_USER_VERS_MN, msg_id, false, server_caps); + if (ret < 0) { + lm_log(lm_ctx, LM_DBG, "failed to send version: %s", strerror(-ret)); + goto out; + } + + ret = recv_version(sock, &client_mj, &client_mn, &msg_id, true, + &lm_ctx->client_max_fds, &lm_ctx->migration.pgsize); + if (ret < 0) { + lm_log(lm_ctx, LM_DBG, "failed to receive version: %s", strerror(-ret)); + goto out; + } + if (client_mj != LIB_MUSER_VFIO_USER_VERS_MJ || + client_mn != LIB_MUSER_VFIO_USER_VERS_MN) { + lm_log(lm_ctx, LM_DBG, "version mismatch, server=%d.%d, client=%d.%d", + LIB_MUSER_VFIO_USER_VERS_MJ, LIB_MUSER_VFIO_USER_VERS_MN, + client_mj, client_mn); + ret = -EINVAL; + goto out; + } + if (lm_ctx->migration.pgsize == 0) { + lm_log(lm_ctx, LM_ERR, "bad migration page size"); + ret = -EINVAL; + goto out; + } + + /* FIXME need to check max_fds */ + + lm_ctx->migration.pgsize = MIN(lm_ctx->migration.pgsize, + sysconf(_SC_PAGESIZE)); +out: + free(server_caps); + return ret; +} + +/** + * lm_ctx: libmuser context + * iommu_dir: full path to the IOMMU group to create. All parent directories + * must already exist. + */ +static int +open_sock(lm_ctx_t *lm_ctx) +{ + int ret; + int conn_fd; + + assert(lm_ctx != NULL); + + conn_fd = accept(lm_ctx->fd, NULL, NULL); + if (conn_fd == -1) { + return conn_fd; + } + + /* send version and caps */ + ret = set_version(lm_ctx, conn_fd); + if (ret < 0) { + return ret; + } + + lm_ctx->conn_fd = conn_fd; + return conn_fd; +} + +static int +close_sock(lm_ctx_t *lm_ctx) +{ + return close(lm_ctx->conn_fd); +} + +static int +get_request_sock(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + int *fds, int *nr_fds) +{ + int ret; + struct iovec iov = {.iov_base = hdr, .iov_len = sizeof *hdr}; + struct msghdr msg = {.msg_iov = &iov, .msg_iovlen = 1}; + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(sizeof(int) * *nr_fds); + msg.msg_control = alloca(msg.msg_controllen); + + /* + * TODO ideally we should set O_NONBLOCK on the fd so that the syscall is + * faster (?). I tried that and get short reads, so we need to store the + * partially received buffer somewhere and retry. 
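+ * A proper non-blocking implementation would have to keep, per connection,
+ * a buffer with the partially received header/payload and how much of it
+ * has arrived so far, and resume from there on the next call.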
+ */ + ret = recvmsg(lm_ctx->conn_fd, &msg, lm_ctx->sock_flags); + if (ret == -1) { + return -errno; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) { + continue; + } + if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) { + return -EINVAL; + } + int size = cmsg->cmsg_len - CMSG_LEN(0); + if (size % sizeof(int) != 0) { + return -EINVAL; + } + *nr_fds = (int)(size / sizeof(int)); + memcpy(fds, CMSG_DATA(cmsg), *nr_fds * sizeof(int)); + break; + } + + return ret; +} + +static ssize_t +recv_fds_sock(lm_ctx_t *lm_ctx, void *buf, size_t size) +{ + ssize_t ret = muser_recv_fds(lm_ctx->conn_fd, buf, size / sizeof(int)); + if (ret < 0) { + return ret; + } + return ret * sizeof(int); +} + +static struct transport_ops { + int (*init)(lm_ctx_t*); + int (*attach)(lm_ctx_t*); + int(*detach)(lm_ctx_t*); + int (*get_request)(lm_ctx_t*, struct vfio_user_header*, int *fds, int *nr_fds); + ssize_t (*recv_fds)(lm_ctx_t*, void *buf, size_t size); +} transports_ops[] = { + [LM_TRANS_SOCK] = { + .init = init_sock, + .attach = open_sock, + .detach = close_sock, + .recv_fds = recv_fds_sock, + .get_request = get_request_sock, + } +}; #define LM2VFIO_IRQT(type) (type - 1) @@ -98,6 +605,7 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...) { va_list ap; char buf[BUFSIZ]; + int _errno = errno; assert(lm_ctx != NULL); @@ -108,7 +616,8 @@ lm_log(lm_ctx_t *lm_ctx, lm_log_lvl_t lvl, const char *fmt, ...) va_start(ap, fmt); vsnprintf(buf, sizeof buf, fmt, ap); va_end(ap); - lm_ctx->log(lm_ctx->pvt, buf); + lm_ctx->log(lm_ctx->pvt, lvl, buf); + errno = _errno; } static const char * @@ -137,11 +646,14 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index) case VFIO_PCI_INTX_IRQ_INDEX: case VFIO_PCI_MSI_IRQ_INDEX: case VFIO_PCI_MSIX_IRQ_INDEX: - lm_log(lm_ctx, LM_DBG, "disabling IRQ %s\n", vfio_irq_idx_to_str(index)); + lm_log(lm_ctx, LM_DBG, "disabling IRQ %s", vfio_irq_idx_to_str(index)); lm_ctx->irqs.type = IRQ_NONE; for (i = 0; i < lm_ctx->irqs.max_ivs; i++) { if (lm_ctx->irqs.efds[i] >= 0) { - (void)close(lm_ctx->irqs.efds[i]); + if (close(lm_ctx->irqs.efds[i]) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", + lm_ctx->irqs.efds[i]); + } lm_ctx->irqs.efds[i] = -1; } } @@ -155,12 +667,17 @@ irqs_disable(lm_ctx_t *lm_ctx, uint32_t index) } if (irq_efd != NULL) { - (void)close(*irq_efd); - *irq_efd = -1; + if (*irq_efd != -1) { + if (close(*irq_efd) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", + *irq_efd); + } + *irq_efd = -1; + } return 0; } - lm_log(lm_ctx, LM_DBG, "failed to disable IRQs\n"); + lm_log(lm_ctx, LM_DBG, "failed to disable IRQs"); return -EINVAL; } @@ -178,9 +695,8 @@ irqs_set_data_none(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set) val = 1; ret = eventfd_write(efd, val); if (ret == -1) { - ret = -errno; - lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m\n"); - return ret; + lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to none: %m"); + return -errno; } } } @@ -206,9 +722,8 @@ irqs_set_data_bool(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) val = 1; ret = eventfd_write(efd, val); if (ret == -1) { - ret = -errno; - lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m\n"); - return ret; + lm_log(lm_ctx, LM_DBG, "IRQ: failed to set data to bool: %m"); + return -errno; } } } @@ -228,13 +743,16 @@ irqs_set_data_eventfd(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data i++, d32++) { efd = lm_ctx->irqs.efds[i]; if (efd 
>= 0) { - (void) close(efd); + if (close(efd) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IRQ fd %d: %m", efd); + } + lm_ctx->irqs.efds[i] = -1; } if (*d32 >= 0) { lm_ctx->irqs.efds[i] = *d32; } - lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d\n", i, lm_ctx->irqs.efds[i]); + lm_log(lm_ctx, LM_DBG, "event fd[%d]=%d", i, lm_ctx->irqs.efds[i]); } return 0; @@ -252,7 +770,7 @@ irqs_trigger(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) return irqs_disable(lm_ctx, irq_set->index); } - lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=0x%x\n", + lm_log(lm_ctx, LM_DBG, "setting IRQ %s flags=%#lx", vfio_irq_idx_to_str(irq_set->index), irq_set->flags); switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { @@ -334,6 +852,17 @@ dev_set_irqs_validate(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set) return 0; } +static int +device_reset(lm_ctx_t *lm_ctx) +{ + lm_log(lm_ctx, LM_DBG, "Device reset called by client"); + if (lm_ctx->reset != NULL) { + return lm_ctx->reset(lm_ctx->pvt); + } + + return 0; +} + static long dev_set_irqs(lm_ctx_t *lm_ctx, struct vfio_irq_set *irq_set, void *data) { @@ -368,7 +897,8 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info) // Ensure provided argsz is sufficiently big and index is within bounds. if ((irq_info->argsz < sizeof(struct vfio_irq_info)) || (irq_info->index >= LM_DEV_NUM_IRQS)) { - lm_log(lm_ctx, LM_DBG, "bad irq_info\n"); + lm_log(lm_ctx, LM_DBG, "bad irq_info (size=%d index=%d)\n", + irq_info->argsz, irq_info->index); return -EINVAL; } @@ -380,66 +910,94 @@ dev_get_irqinfo(lm_ctx_t *lm_ctx, struct vfio_irq_info *irq_info) /* * Populate the sparse mmap capability information to vfio-client. - * kernel/muser constructs the response for VFIO_DEVICE_GET_REGION_INFO - * accommodating sparse mmap information. * Sparse mmap information stays after struct vfio_region_info and cap_offest * points accordingly. */ static int -dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, - struct vfio_region_info *vfio_reg) +dev_get_sparse_mmap_cap(lm_ctx_t *lm_ctx, lm_reg_info_t *lm_reg, int reg_index, + struct vfio_region_info **vfio_reg) { + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type = NULL; struct vfio_region_info_cap_sparse_mmap *sparse = NULL; struct lm_sparse_mmap_areas *mmap_areas; int nr_mmap_areas, i; - size_t size; - ssize_t ret; - - if (lm_reg->mmap_areas == NULL) - return -EINVAL; + size_t type_size = 0; + size_t sparse_size = 0; + size_t cap_size; + void *cap_ptr; - nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; - size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type_size = sizeof(struct vfio_region_info_cap_type); + } - /* - * If vfio_reg does not have enough space to accommodate sparse info then - * set the argsz with the expected size and return. 
Vfio client will call - * back after reallocating the vfio_reg - */ + if (lm_reg->mmap_areas != NULL) { + nr_mmap_areas = lm_reg->mmap_areas->nr_mmap_areas; + sparse_size = sizeof(*sparse) + (nr_mmap_areas * sizeof(*sparse->areas)); + } - if (vfio_reg->argsz < size + sizeof(*vfio_reg)) { - vfio_reg->argsz = size + sizeof(*vfio_reg); - vfio_reg->cap_offset = 0; + cap_size = type_size + sparse_size; + if (cap_size == 0) { return 0; } - lm_log(lm_ctx, LM_DBG, "%s: size %llu, nr_mmap_areas %u\n", __func__, size, - nr_mmap_areas); - sparse = calloc(1, size); - if (sparse == NULL) + /* TODO deosn't need to be calloc, we overwrite it entirely */ + header = calloc(1, cap_size); + if (header == NULL) { return -ENOMEM; - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->header.next = 0; - sparse->nr_areas = nr_mmap_areas; + } + + if (reg_index == LM_DEV_MIGRATION_REG_IDX) { + type = (struct vfio_region_info_cap_type*)header; + type->header.id = VFIO_REGION_INFO_CAP_TYPE; + type->header.version = 1; + type->header.next = 0; + type->type = VFIO_REGION_TYPE_MIGRATION; + type->subtype = VFIO_REGION_SUBTYPE_MIGRATION; + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); + } - mmap_areas = lm_reg->mmap_areas; - for (i = 0; i < nr_mmap_areas; i++) { - sparse->areas[i].offset = mmap_areas->areas[i].start; - sparse->areas[i].size = mmap_areas->areas[i].size; + if (lm_reg->mmap_areas != NULL) { + if (type != NULL) { + type->header.next = (*vfio_reg)->cap_offset + sizeof(struct vfio_region_info_cap_type); + sparse = (struct vfio_region_info_cap_sparse_mmap*)(type + 1); + } else { + (*vfio_reg)->cap_offset = sizeof(struct vfio_region_info); + sparse = (struct vfio_region_info_cap_sparse_mmap*)header; + } + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->header.next = 0; + sparse->nr_areas = nr_mmap_areas; + + lm_log(lm_ctx, LM_DBG, "%s: capsize %llu, nr_mmap_areas %u", __func__, + sparse_size, nr_mmap_areas); + mmap_areas = lm_reg->mmap_areas; + for (i = 0; i < nr_mmap_areas; i++) { + sparse->areas[i].offset = mmap_areas->areas[i].start; + sparse->areas[i].size = mmap_areas->areas[i].size; + lm_log(lm_ctx, LM_DBG, "%s: area %d offset %#lx size %llu", __func__, + i, sparse->areas[i].offset, sparse->areas[i].size); + } } - /* write the sparse mmap cap info to vfio-client user pages */ - ret = write(lm_ctx->fd, sparse, size); - if (ret != (ssize_t)size) { - free(sparse); - return -EIO; + /* + * FIXME VFIO_REGION_INFO_FLAG_MMAP is valid if the region is + * memory-mappable in general, not only if it supports sparse mmap. 
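+ * In other words, the flag should be derived from the region's own
+ * properties (e.g. lm_reg->flags) rather than be implied by the presence
+ * of a capability list.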
+ */ + (*vfio_reg)->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS; + + (*vfio_reg)->argsz = cap_size + sizeof(**vfio_reg); + *vfio_reg = realloc(*vfio_reg, (*vfio_reg)->argsz); + if (*vfio_reg == NULL) { + free(header); + return -ENOMEM; } - vfio_reg->flags |= VFIO_REGION_INFO_FLAG_MMAP | VFIO_REGION_INFO_FLAG_CAPS; - vfio_reg->cap_offset = sizeof(*vfio_reg); + cap_ptr = (char *)*vfio_reg + (*vfio_reg)->cap_offset; + memcpy(cap_ptr, header, cap_size); - free(sparse); + free(header); return 0; } @@ -458,42 +1016,73 @@ offset_to_region(uint64_t offset) return (offset >> LM_REGION_SHIFT) & LM_REGION_MASK; } +#ifdef LM_VERBOSE_LOGGING +void +dump_buffer(const char *prefix, const char *buf, uint32_t count) +{ + int i; + const size_t bytes_per_line = 0x8; + + if (strcmp(prefix, "")) { + fprintf(stderr, "%s\n", prefix); + } + for (i = 0; i < (int)count; i++) { + if (i % bytes_per_line != 0) { + fprintf(stderr, " "); + } + /* TODO valgrind emits a warning if count is 1 */ + fprintf(stderr,"0x%02x", *(buf + i)); + if ((i + 1) % bytes_per_line == 0) { + fprintf(stderr, "\n"); + } + } + if (i % bytes_per_line != 0) { + fprintf(stderr, "\n"); + } +} +#else +#define dump_buffer(prefix, buf, count) +#endif + static long -dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info *vfio_reg) +dev_get_reginfo(lm_ctx_t *lm_ctx, struct vfio_region_info **vfio_reg) { lm_reg_info_t *lm_reg; int err; assert(lm_ctx != NULL); - assert(vfio_reg != NULL); - lm_reg = &lm_ctx->pci_info.reg_info[vfio_reg->index]; + assert(*vfio_reg != NULL); + lm_reg = &lm_ctx->pci_info.reg_info[(*vfio_reg)->index]; // Ensure provided argsz is sufficiently big and index is within bounds. - if ((vfio_reg->argsz < sizeof(struct vfio_region_info)) || - (vfio_reg->index >= LM_DEV_NUM_REGS)) { + if (((*vfio_reg)->argsz < sizeof(struct vfio_region_info)) || + ((*vfio_reg)->index >= LM_DEV_NUM_REGS)) { + lm_log(lm_ctx, LM_DBG, "bad args argsz=%d index=%d", + (*vfio_reg)->argsz, (*vfio_reg)->index); return -EINVAL; } - vfio_reg->offset = region_to_offset(vfio_reg->index); - vfio_reg->flags = lm_reg->flags; - vfio_reg->size = lm_reg->size; + (*vfio_reg)->offset = region_to_offset((*vfio_reg)->index); + (*vfio_reg)->flags = lm_reg->flags; + (*vfio_reg)->size = lm_reg->size; - if (lm_reg->mmap_areas != NULL) { - err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, vfio_reg); - if (err) { - return err; - } + err = dev_get_sparse_mmap_cap(lm_ctx, lm_reg, (*vfio_reg)->index, vfio_reg); + if (err) { + return err; } - lm_log(lm_ctx, LM_DBG, "region_info[%d]\n", vfio_reg->index); - dump_buffer(lm_ctx, "", (char*)vfio_reg, sizeof *vfio_reg); + lm_log(lm_ctx, LM_DBG, "region_info[%d] offset %#lx flags %#x size %llu " + "argsz %llu", + (*vfio_reg)->index, (*vfio_reg)->offset, (*vfio_reg)->flags, + (*vfio_reg)->size, (*vfio_reg)->argsz); return 0; } static long -dev_get_info(struct vfio_device_info *dev_info) +dev_get_info(lm_ctx_t *lm_ctx, struct vfio_device_info *dev_info) { + assert(lm_ctx != NULL); assert(dev_info != NULL); // Ensure provided argsz is sufficiently big. 
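The capability chain built by dev_get_sparse_mmap_cap() above lands directly
after struct vfio_region_info in the reply, with cap_offset pointing at the
first header. A minimal sketch of how a client could walk that chain,
assuming <linux/vfio.h> and a reply buffer already resized to reg->argsz
(region_info_cap_find is a hypothetical helper, not part of this patch):

static struct vfio_info_cap_header *
region_info_cap_find(struct vfio_region_info *reg, uint16_t id)
{
    uint32_t off = reg->cap_offset;

    if (!(reg->flags & VFIO_REGION_INFO_FLAG_CAPS) || off == 0) {
        return NULL;
    }
    /* each header stores the offset of the next one; 0 terminates */
    while (off != 0 &&
           off + sizeof(struct vfio_info_cap_header) <= reg->argsz) {
        struct vfio_info_cap_header *hdr =
            (struct vfio_info_cap_header *)((char *)reg + off);

        if (hdr->id == id) {
            return hdr;
        }
        off = hdr->next;
    }
    return NULL;
}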
@@ -508,173 +1097,81 @@ dev_get_info(struct vfio_device_info *dev_info) return 0; } -static long -do_muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd_ioctl *cmd_ioctl, void *data) -{ - int err = -ENOTSUP; - - assert(lm_ctx != NULL); - switch (cmd_ioctl->vfio_cmd) { - case VFIO_DEVICE_GET_INFO: - err = dev_get_info(&cmd_ioctl->data.dev_info); - break; - case VFIO_DEVICE_GET_REGION_INFO: - err = dev_get_reginfo(lm_ctx, &cmd_ioctl->data.reg_info); - break; - case VFIO_DEVICE_GET_IRQ_INFO: - err = dev_get_irqinfo(lm_ctx, &cmd_ioctl->data.irq_info); - break; - case VFIO_DEVICE_SET_IRQS: - err = dev_set_irqs(lm_ctx, &cmd_ioctl->data.irq_set, data); - break; - case VFIO_DEVICE_RESET: - if (lm_ctx->reset != NULL) { - return lm_ctx->reset(lm_ctx->pvt); - } - lm_log(lm_ctx, LM_DBG, "reset called but not reset function present\n"); - break; - } - - return err; -} - -static void -get_path_from_fd(lm_ctx_t *lm_ctx, int fd, char *buf) -{ - int err; - ssize_t ret; - char pathname[PATH_MAX]; - - err = snprintf(pathname, PATH_MAX, "/proc/self/fd/%d", fd); - if (err >= PATH_MAX || err == -1) { - buf[0] = '\0'; - } - ret = readlink(pathname, buf, PATH_MAX); - if (ret == -1) { - lm_log(lm_ctx, LM_DBG, "failed to readlink %s: %m\n", pathname); - ret = 0; - } else if (ret == PATH_MAX) { - lm_log(lm_ctx, LM_DBG, "failed to readlink %s, output truncated\n", - pathname); - ret -= 1; - } - buf[ret] = '\0'; -} - -static int -muser_dma_unmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) -{ - int err; - char buf[PATH_MAX]; - - get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf); - - lm_log(lm_ctx, LM_INF, "removing DMA region fd=%d path=%s %#lx-%#lx\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len); - - if (lm_ctx->dma == NULL) { - lm_log(lm_ctx, LM_ERR, "DMA not initialized\n"); - return -EINVAL; - } - - err = dma_controller_remove_region(lm_ctx->dma, - cmd->mmap.request.addr, - cmd->mmap.request.len, - cmd->mmap.request.fd); - if (err != 0) { - lm_log(lm_ctx, LM_ERR, "failed to remove DMA region fd=%d path=%s %#lx-%#lx: %s\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len, - strerror(err)); - } - - return err; -} - -static int -muser_dma_map(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) -{ - int err; - char buf[PATH_MAX]; - - get_path_from_fd(lm_ctx, cmd->mmap.request.fd, buf); - - lm_log(lm_ctx, LM_INF, "adding DMA region fd=%d path=%s iova=%#lx-%#lx offset=%#lx\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len, - cmd->mmap.request.offset); - - if (lm_ctx->dma == NULL) { - lm_log(lm_ctx, LM_ERR, "DMA not initialized\n"); - return -EINVAL; - } - - err = dma_controller_add_region(lm_ctx, lm_ctx->dma, - cmd->mmap.request.addr, - cmd->mmap.request.len, - cmd->mmap.request.fd, - cmd->mmap.request.offset); - if (err < 0) { - lm_log(lm_ctx, LM_ERR, "failed to add DMA region fd=%d path=%s %#lx-%#lx: %d\n", - cmd->mmap.request.fd, buf, cmd->mmap.request.addr, - cmd->mmap.request.addr + cmd->mmap.request.len, err); - } - - return 0; +int +muser_send_fds(int sock, int *fds, size_t count) { + struct msghdr msg = { 0 }; + size_t size = count * sizeof *fds; + char buf[CMSG_SPACE(size)]; + memset(buf, '\0', sizeof(buf)); + + /* XXX requires at least one byte */ + struct iovec io = { .iov_base = "\0", .iov_len = 1 }; + + msg.msg_iov = &io; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg); + 
cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(size); + memcpy(CMSG_DATA(cmsg), fds, size); + msg.msg_controllen = CMSG_SPACE(size); + return sendmsg(sock, &msg, 0); } -/* - * Callback that is executed when device memory is to be mmap'd. - */ -static int -muser_mmap(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) +ssize_t +muser_recv_fds(int sock, int *fds, size_t count) { - int region, err = 0; - unsigned long addr; - unsigned long len = cmd->mmap.request.len; - loff_t offset = cmd->mmap.request.addr; + int ret; + struct cmsghdr *cmsg; + size_t fds_size; + char msg_buf[sysconf(_SC_PAGESIZE)]; + struct iovec io = {.iov_base = msg_buf, .iov_len = sizeof(msg_buf)}; + char cmsg_buf[sysconf(_SC_PAGESIZE)]; + struct msghdr msg = { + .msg_iov = &io, + .msg_iovlen = 1, + .msg_control = cmsg_buf, + .msg_controllen = sizeof(cmsg_buf) + }; - region = lm_get_region(offset, len, &offset); - if (region < 0) { - lm_log(lm_ctx, LM_ERR, "bad region %d\n", region); - err = EINVAL; - goto out; + if (fds == NULL || count <= 0) { + errno = EINVAL; + return -1; } - if (lm_ctx->pci_info.reg_info[region].map == NULL) { - lm_log(lm_ctx, LM_ERR, "region not mmapable\n"); - err = ENOTSUP; - goto out; + ret = recvmsg(sock, &msg, 0); + if (ret == -1) { + return ret; } - addr = lm_ctx->pci_info.reg_info[region].map(lm_ctx->pvt, offset, len); - if ((void *)addr == MAP_FAILED) { - err = errno; - lm_log(lm_ctx, LM_ERR, "failed to mmap: %m\n"); - goto out; + cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg == NULL) { + errno = EINVAL; + return -1; } - cmd->mmap.response = addr; - -out: - if (err != 0) { - lm_log(lm_ctx, LM_ERR, "failed to mmap device memory %#x-%#lx: %s\n", - offset, offset + len, strerror(err)); + fds_size = cmsg->cmsg_len - sizeof *cmsg; + if ((fds_size % sizeof(int)) != 0 || fds_size / sizeof (int) > count) { + errno = EINVAL; + return -1; } + memcpy((void*)fds, CMSG_DATA(cmsg), cmsg->cmsg_len - sizeof *cmsg); - return -err; + return fds_size / sizeof(int); } /* - * Returns the number of bytes communicated to the kernel (may be less than - * ret), or a negative number on error. + * Returns the number of bytes sent (may be less than ret), or a negative + * number on error. 
*/ static int post_read(lm_ctx_t *lm_ctx, char *rwbuf, ssize_t count) { ssize_t ret; - ret = write(lm_ctx->fd, rwbuf, count); + ret = write(lm_ctx->conn_fd, rwbuf, count); if (ret != count) { lm_log(lm_ctx, LM_ERR, "%s: bad muser write: %lu/%lu, %s\n", __func__, ret, count, strerror(errno)); @@ -719,17 +1216,274 @@ handle_pci_config_space_access(lm_ctx_t *lm_ctx, char *buf, size_t count, int ret; count = MIN(pci_config_space_size(lm_ctx), count); - ret = cap_maybe_access(lm_ctx->caps, lm_ctx->pvt, buf, count, pos, is_write); + if (is_write) { + ret = cap_maybe_access(lm_ctx, lm_ctx->caps, buf, count, pos); + if (ret < 0) { + lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count, + pos); + return ret; + } + } else { + memcpy(buf, lm_ctx->pci_config_space->raw + pos, count); + } + return count; +} + +/* valid migration state transitions */ +__u32 migration_states[VFIO_DEVICE_STATE_MASK] = { + [VFIO_DEVICE_STATE_STOP] = 1 << VFIO_DEVICE_STATE_STOP, + [VFIO_DEVICE_STATE_RUNNING] = /* running */ + (1 << VFIO_DEVICE_STATE_STOP) | + (1 << VFIO_DEVICE_STATE_RUNNING) | + (1 << VFIO_DEVICE_STATE_SAVING) | + (1 << (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)) | + (1 << VFIO_DEVICE_STATE_RESUMING), + [VFIO_DEVICE_STATE_SAVING] = /* stop-and-copy */ + (1 << VFIO_DEVICE_STATE_STOP) | + (1 << VFIO_DEVICE_STATE_SAVING), + [VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING] = /* pre-copy */ + (1 << VFIO_DEVICE_STATE_SAVING) | + (1 << VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING), + [VFIO_DEVICE_STATE_RESUMING] = /* resuming */ + (1 << VFIO_DEVICE_STATE_RUNNING) | + (1 << VFIO_DEVICE_STATE_RESUMING) +}; + +static bool +_migration_state_transition_is_valid(__u32 from, __u32 to) +{ + return migration_states[from] & (1 << to); +} + +static ssize_t +handle_migration_device_state(lm_ctx_t *lm_ctx, __u32 *device_state, + bool is_write) { + + int ret; + + assert(lm_ctx != NULL); + assert(device_state != NULL); + + if (!is_write) { + *device_state = lm_ctx->migration.info.device_state; + return 0; + } + + if (*device_state & ~VFIO_DEVICE_STATE_MASK) { + return -EINVAL; + } + + if (!_migration_state_transition_is_valid(lm_ctx->migration.info.device_state, + *device_state)) { + return -EINVAL; + } + + switch (*device_state) { + case VFIO_DEVICE_STATE_STOP: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_STOP); + break; + case VFIO_DEVICE_STATE_RUNNING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_START); + break; + case VFIO_DEVICE_STATE_SAVING: + /* + * FIXME How should the device operate during the stop-and-copy + * phase? Should we only allow the migration data to be read from + * the migration region? E.g. Access to any other region should be + * failed? This might be a good question to send to LKML. 
+ */ + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_STOP_AND_COPY); + break; + case VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_PRE_COPY); + break; + case VFIO_DEVICE_STATE_RESUMING: + ret = lm_ctx->migration.callbacks.transition(lm_ctx->pvt, + LM_MIGR_STATE_RESUME); + break; + default: + ret = -EINVAL; + } + + if (ret == 0) { + lm_ctx->migration.info.device_state = *device_state; + } + + return ret; +} + +static ssize_t +handle_migration_pending_bytes(lm_ctx_t *lm_ctx, __u64 *pending_bytes, + bool is_write) +{ + assert(lm_ctx != NULL); + assert(pending_bytes != NULL); + + if (is_write) { + return -EINVAL; + } + + if (lm_ctx->migration.iter.state == VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED) { + *pending_bytes = 0; + return 0; + } + + *pending_bytes = lm_ctx->migration.callbacks.get_pending_bytes(lm_ctx->pvt); + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_INITIAL: + case VFIO_USER_MIGRATION_ITERATION_STATE_DATA_PREPARED: + /* + * FIXME what happens if data haven't been consumed in the previous + * iteration? Ask on LKML. + */ + if (*pending_bytes == 0) { + lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_FINISHED; + } else { + lm_ctx->migration.iter.state = VFIO_USER_MIGRATION_ITERATION_STATE_STARTED; + } + break; + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + /* + * Repeated reads of pending_bytes should not have any side effects. + * FIXME does it have to be the same as the previous value? Can it + * increase or even decrease? I suppose it can't be lower than + * data_size? Ask on LKML. + */ + break; + default: + return -EINVAL; + } + return 0; +} + +static ssize_t +handle_migration_data_offset(lm_ctx_t *lm_ctx, __u64 *offset, bool is_write) +{ + int ret; + + assert(lm_ctx != NULL); + assert(offset != NULL); + + if (is_write) { + return -EINVAL; + } + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + break; + default: + /* + * FIXME it's not clear whether these registers can be accessed in + * other parts of the iteration, need clarification on the + * following: + * + * Read on data_offset and data_size should return the offset and + * size of the current buffer if the user application reads + * data_offset and data_size more than once here. 
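+ * For now the access is only accepted while the iteration is in the
+ * STARTED state and fails with -EINVAL otherwise.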
+ */ + return -EINVAL; + } + + ret = lm_ctx->migration.callbacks.prepare_data(lm_ctx->pvt, + &lm_ctx->migration.iter.offset, + &lm_ctx->migration.iter.size); if (ret < 0) { - lm_log(lm_ctx, LM_ERR, "bad access to capabilities %u@%#x\n", count, - pos); return ret; } - return count; + + *offset = lm_ctx->migration.iter.offset + sizeof(struct vfio_device_migration_info); + + return ret; +} + +static ssize_t +handle_migration_data_size(lm_ctx_t *lm_ctx, __u64 *size, bool is_write) +{ + assert(lm_ctx != NULL); + assert(size != NULL); + + if (is_write) { + return -EINVAL; + } + + switch (lm_ctx->migration.iter.state) { + case VFIO_USER_MIGRATION_ITERATION_STATE_STARTED: + break; + default: + /* FIXME see comment in handle_migration_data_offset */ + return -EINVAL; + } + + *size = lm_ctx->migration.iter.size; + + return 0; } static ssize_t -do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) +handle_migration_region_access(lm_ctx_t *lm_ctx, char *buf, size_t count, + loff_t pos, bool is_write) +{ + int ret; + + assert(lm_ctx != NULL); + assert(buf != NULL); + + if (pos + count > lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size) { + lm_log(lm_ctx, LM_ERR, "read %#x-%#x past end of migration region", + pos, pos + count - 1); + return -EINVAL; + } + switch (pos) { + case offsetof(struct vfio_device_migration_info, device_state): + if (count != sizeof(lm_ctx->migration.info.device_state)) { + return -EINVAL; + } + ret = handle_migration_device_state(lm_ctx, (__u32*)buf, + is_write); + break; + case offsetof(struct vfio_device_migration_info, pending_bytes): + if (count != sizeof(lm_ctx->migration.info.pending_bytes)) { + return -EINVAL; + } + ret = handle_migration_pending_bytes(lm_ctx, (__u64*)buf, is_write); + break; + case offsetof(struct vfio_device_migration_info, data_offset): + if (count != sizeof(lm_ctx->migration.info.data_offset)) { + return -EINVAL; + } + ret = handle_migration_data_offset(lm_ctx, (__u64*)buf, is_write); + break; + case offsetof(struct vfio_device_migration_info, data_size): + if (count != sizeof(lm_ctx->migration.info.data_size)) { + return -EINVAL; + } + ret = handle_migration_data_size(lm_ctx, (__u64*)buf, is_write); + break; + default: + if (is_write) { + /* FIXME how do we handle the offset? */ + ret = lm_ctx->migration.callbacks.write_data(lm_ctx->pvt, + buf, count); + } else { + ret = lm_ctx->migration.callbacks.read_data(lm_ctx->pvt, + buf, count, + pos - sizeof(struct vfio_device_migration_info)); + } + } + + if (ret == 0) { + ret = count; + } + return ret; +} + +static ssize_t +do_access(lm_ctx_t *lm_ctx, char *buf, uint8_t count, uint64_t pos, bool is_write) { int idx; loff_t offset; @@ -737,7 +1491,7 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) assert(lm_ctx != NULL); assert(buf != NULL); - assert(count > 0); + assert(count == 1 || count == 2 || count == 4 || count == 8); pci_info = &lm_ctx->pci_info; idx = lm_get_region(pos, count, &offset); @@ -756,6 +1510,11 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) is_write); } + if (idx == LM_DEV_MIGRATION_REG_IDX) { + return handle_migration_region_access(lm_ctx, buf, count, offset, + is_write); + } + /* * Checking whether a callback exists might sound expensive however this * code is not performance critical. This works well when we don't expect a @@ -777,12 +1536,15 @@ do_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t pos, bool is_write) * error. 
* * TODO function name same lm_access_t, fix + * FIXME we must be able to return values up to uint32_t bit, or negative on + * error. Better to make return value an int and return the number of bytes + * processed via an argument. */ ssize_t -lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, +lm_access(lm_ctx_t *lm_ctx, char *buf, uint32_t count, uint64_t *ppos, bool is_write) { - unsigned int done = 0; + uint32_t done = 0; int ret; assert(lm_ctx != NULL); @@ -792,7 +1554,10 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, size_t size; /* * Limit accesses to qword and enforce alignment. Figure out whether - * the PCI spec requires this. + * the PCI spec requires this + * FIXME while this makes sense for registers, we might be able to relax + * this requirement and make some transfers more efficient. Maybe make + * this a per-region option that can be set by the user? */ if (count >= 8 && !(*ppos % 8)) { size = 8; @@ -805,15 +1570,16 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, } ret = do_access(lm_ctx, buf, size, *ppos, is_write); if (ret <= 0) { - lm_log(lm_ctx, LM_ERR, "failed to %s %llx@%lx: %s\n", - is_write ? "write" : "read", size, *ppos, strerror(-ret)); + lm_log(lm_ctx, LM_ERR, "failed to %s %#lx-%#lx: %s", + is_write ? "write to" : "read from", *ppos, *ppos + size - 1, + strerror(-ret)); /* * TODO if ret < 0 then it might contain a legitimate error code, why replace it with EFAULT? */ return -EFAULT; } if (ret != (int)size) { - lm_log(lm_ctx, LM_DBG, "bad read %d != %d\n", ret, size); + lm_log(lm_ctx, LM_DBG, "bad read %d != %d", ret, size); } count -= size; done += size; @@ -824,50 +1590,54 @@ lm_access(lm_ctx_t *lm_ctx, char *buf, size_t count, loff_t *ppos, } static inline int -muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write) +muser_access(lm_ctx_t *lm_ctx, bool is_write, void **data, uint32_t count, + uint64_t *pos) { + struct vfio_user_region_access *region_access; char *rwbuf; int err; - size_t count = 0, _count; - ssize_t ret; + uint32_t processed = 0, _count; + int ret; + + assert(pos != NULL); /* TODO how big do we expect count to be? Can we use alloca(3) instead? */ - rwbuf = calloc(1, cmd->rw.count); - if (rwbuf == NULL) { + region_access = calloc(1, sizeof(*region_access) + count); + if (region_access == NULL) { lm_log(lm_ctx, LM_ERR, "failed to allocate memory\n"); return -1; } + rwbuf = (char*)(region_access + 1); -#ifndef LM_TERSE_LOGGING - lm_log(lm_ctx, LM_DBG, "%s %x@%lx\n", is_write ? "W" : "R", cmd->rw.count, - cmd->rw.pos); -#endif + lm_log(lm_ctx, LM_DBG, "%s %#lx-%#lx", is_write ? 
"W" : "R", *pos, + *pos + count - 1); - /* copy data to be written from kernel to user space */ + /* receive data to be written */ if (is_write) { - err = read(lm_ctx->fd, rwbuf, cmd->rw.count); + err = read(lm_ctx->conn_fd, rwbuf, count); /* * FIXME this is wrong, we should be checking for - * err != cmd->rw.count + * err != count */ if (err < 0) { - lm_log(lm_ctx, LM_ERR, "failed to read from kernel: %s\n", + lm_log(lm_ctx, LM_ERR, "failed to receive write payload: %s", strerror(errno)); goto out; } err = 0; -#ifndef LM_TERSE_LOGGING - dump_buffer(lm_ctx, "buffer write", rwbuf, cmd->rw.count); +#ifdef LM_VERBOSE_LOGGING + dump_buffer("buffer write", rwbuf, count); #endif } - count = _count = cmd->rw.count; - cmd->err = muser_pci_hdr_access(lm_ctx, &_count, &cmd->rw.pos, - is_write, rwbuf); - if (cmd->err) { - lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %d\n", cmd->err); -#ifndef LM_TERSE_LOGGING - dump_buffer(lm_ctx, "buffer write", rwbuf, _count); + _count = count; + ret = muser_pci_hdr_access(lm_ctx, &_count, pos, is_write, rwbuf); + if (ret != 0) { + /* FIXME shouldn't we fail here? */ + lm_log(lm_ctx, LM_ERR, "failed to access PCI header: %s", + strerror(-ret)); +#ifdef LM_VERBOSE_LOGGING + dump_buffer("buffer write", rwbuf, _count); #endif } @@ -875,150 +1645,618 @@ muser_access(lm_ctx_t *lm_ctx, struct muser_cmd *cmd, bool is_write) * count is how much has been processed by muser_pci_hdr_access, * _count is how much there's left to be processed by lm_access */ - count -= _count; - ret = lm_access(lm_ctx, rwbuf + count, _count, &cmd->rw.pos, - is_write); - if (!is_write && ret >= 0) { - ret += count; - err = post_read(lm_ctx, rwbuf, ret); - if (!LM_TERSE_LOGGING && err == ret) { - dump_buffer(lm_ctx, "buffer read", rwbuf, ret); + processed = count - _count; + ret = lm_access(lm_ctx, rwbuf + processed, _count, pos, is_write); + if (ret >= 0) { + ret += processed; + if (data != NULL) { + /* + * FIXME the spec doesn't specify whether the reset of the + * region_access struct needs to be populated. 
+ */ + region_access->count = ret; + *data = region_access; + return ret; + } else if (!is_write) { + err = post_read(lm_ctx, rwbuf, ret); +#ifdef LM_VERBOSE_LOGGING + if (err == ret) { + dump_buffer("buffer read", rwbuf, ret); + } +#endif } } out: - free(rwbuf); + free(region_access); - return err; + return ret; +} + +static int handle_device_get_region_info(lm_ctx_t *lm_ctx, + struct vfio_user_header *hdr, + struct vfio_region_info **dev_reg_info) +{ + struct vfio_region_info *reg_info; + int ret; + + reg_info = calloc(sizeof(*reg_info), 1); + if (reg_info == NULL) { + return -ENOMEM; + } + + if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*reg_info)) { + free(reg_info); + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, reg_info, sizeof(*reg_info), 0); + if (ret < 0) { + free(reg_info); + return -errno; + } + + ret = dev_get_reginfo(lm_ctx, ®_info); + if (ret < 0) { + free(reg_info); + return ret; + } + *dev_reg_info = reg_info; + + return 0; +} + +static int handle_device_get_info(lm_ctx_t *lm_ctx, + struct vfio_user_header *hdr, + struct vfio_device_info *dev_info) +{ + int ret; + + if ((hdr->msg_size - sizeof(*hdr)) != sizeof(*dev_info)) { + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, dev_info, sizeof(*dev_info), 0); + if (ret < 0) { + return -errno; + } + + ret = dev_get_info(lm_ctx, dev_info); + if (ret < 0) { + return ret; + } + + lm_log(lm_ctx, LM_DBG, "sent devinfo flags %#x, num_regions %d, num_irqs" + " %d", dev_info->flags, dev_info->num_regions, dev_info->num_irqs); + return ret; } static int -muser_ioctl(lm_ctx_t *lm_ctx, struct muser_cmd *cmd) +handle_device_get_irq_info(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct vfio_irq_info *irq_info) { - void *data = NULL; - size_t size = 0; int ret; - /* TODO make this a function that returns the size */ - if (cmd->ioctl.vfio_cmd == VFIO_DEVICE_SET_IRQS) { - uint32_t flags = cmd->ioctl.data.irq_set.flags; - switch ((flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) { + assert(lm_ctx != NULL); + assert(irq_info != NULL); + + hdr->msg_size -= sizeof *hdr; + + if (hdr->msg_size != sizeof *irq_info) { + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, irq_info, hdr->msg_size, 0); + if (ret < 0) { + return -errno; + } + if (ret != (int)hdr->msg_size) { + assert(false); /* FIXME */ + } + + return dev_get_irqinfo(lm_ctx, irq_info); +} + +static int +handle_device_set_irqs(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + int *fds, int nr_fds) +{ + int ret; + struct vfio_irq_set *irq_set; + void *data; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + + hdr->msg_size -= sizeof *hdr; + + if (hdr->msg_size < sizeof *irq_set) { + return -EINVAL; + } + + irq_set = alloca(hdr->msg_size); /* FIXME */ + + ret = recv(lm_ctx->conn_fd, irq_set, hdr->msg_size, 0); + if (ret < 0) { + return -errno; + } + if (ret != (int)hdr->msg_size) { + assert(false); /* FIXME */ + } + if (ret != (int)irq_set->argsz) { + assert(false); /* FIXME */ + } + switch (irq_set->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { case VFIO_IRQ_SET_DATA_EVENTFD: - size = sizeof(int32_t) * cmd->ioctl.data.irq_set.count; + data = fds; + if (nr_fds != (int)irq_set->count) { + return -EINVAL; + } break; case VFIO_IRQ_SET_DATA_BOOL: - size = sizeof(uint8_t) * cmd->ioctl.data.irq_set.count; + data = irq_set + 1; break; + } + + return dev_set_irqs(lm_ctx, irq_set, data); +} + +static int +handle_dma_map_or_unmap(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, bool map, + int *fds, int nr_fds) +{ + int ret, i; + int nr_dma_regions; + struct vfio_user_dma_region *dma_regions; + + 
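+ /*
+  * The payload is an array of struct vfio_user_dma_region entries; for a
+  * map request one file descriptor per region is expected, passed as
+  * SCM_RIGHTS ancillary data.
+  */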
assert(lm_ctx != NULL); + assert(hdr != NULL); + + hdr->msg_size -= sizeof *hdr; + + if (hdr->msg_size % sizeof(struct vfio_user_dma_region) != 0) { + lm_log(lm_ctx, LM_ERR, "bad size of DMA regions %d", hdr->msg_size); + return -EINVAL; + } + + nr_dma_regions = (int)(hdr->msg_size / sizeof(struct vfio_user_dma_region)); + if (map && nr_dma_regions != nr_fds) { + lm_log(lm_ctx, LM_ERR, "expected %d fds but got %d instead", + nr_dma_regions, nr_fds); + return -EINVAL; + } + + dma_regions = alloca(nr_dma_regions * sizeof(*dma_regions)); + + ret = recv(lm_ctx->conn_fd, dma_regions, hdr->msg_size, 0); + if (ret == -1) { + lm_log(lm_ctx, LM_ERR, "failed to receive DMA region entries: %m"); + return -errno; + } + + if (lm_ctx->dma == NULL) { + return 0; + } + + for (i = 0; i < nr_dma_regions; i++) { + if (map) { + if (dma_regions[i].flags != VFIO_USER_F_DMA_REGION_MAPPABLE) { + /* + * FIXME implement non-mappable DMA regions. This requires changing + * dma.c to not take a file descriptor. + */ + assert(false); + } + + ret = dma_controller_add_region(lm_ctx->dma, + dma_regions[i].addr, + dma_regions[i].size, + fds[i], + dma_regions[i].offset); + if (ret < 0) { + lm_log(lm_ctx, LM_INF, + "failed to add DMA region %#lx-%#lx offset=%#lx fd=%d: %s", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1, + dma_regions[i].offset, fds[i], + strerror(-ret)); + } else { + lm_log(lm_ctx, LM_DBG, + "added DMA region %#lx-%#lx offset=%#lx fd=%d", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1, + dma_regions[i].offset, fds[i]); + } + } else { + ret = dma_controller_remove_region(lm_ctx->dma, + dma_regions[i].addr, + dma_regions[i].size, + lm_ctx->unmap_dma, lm_ctx->pvt); + if (ret < 0) { + lm_log(lm_ctx, LM_INF, + "failed to remove DMA region %#lx-%#lx: %s", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1, + strerror(-ret)); + } else { + lm_log(lm_ctx, LM_DBG, + "removed DMA region %#lx-%#lx", + dma_regions[i].addr, + dma_regions[i].addr + dma_regions[i].size - 1); + } + } + if (ret < 0) { + return ret; + } + if (lm_ctx->map_dma != NULL) { + lm_ctx->map_dma(lm_ctx->pvt, dma_regions[i].addr, dma_regions[i].size); } } + return 0; +} - if (size != 0) { - data = calloc(1, size); - if (data == NULL) { -#ifdef DEBUG - perror("calloc"); -#endif - return -1; +static int +handle_device_reset(lm_ctx_t *lm_ctx) +{ + return device_reset(lm_ctx); +} + +static int +handle_region_access(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + void **data, size_t *len) +{ + struct vfio_user_region_access region_access; + uint64_t count, offset; + int ret; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(data != NULL); + + /* + * TODO Since muser_access doesn't have to handle the kernel case any more, + * we can avoid having to do an additional read/recv inside muser_access + * (one recv for struct region_access and another for the write data) by + * doing a single recvmsg here with an iovec where the first element of the + * array will be struct vfio_user_region_access and the second a buffer if + * it's a write. The size of the write buffer is: hdr->msg_size - sizeof + * *hdr - sizeof region_access, and should be equal to region_access.count. 
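+ * Roughly (untested sketch, wbuf being the write payload buffer):
+ *
+ *   struct iovec iov[2] = {
+ *       { .iov_base = &region_access, .iov_len = sizeof region_access },
+ *       { .iov_base = wbuf,
+ *         .iov_len = hdr->msg_size - sizeof *hdr - sizeof region_access }
+ *   };
+ *   struct msghdr m = { .msg_iov = iov, .msg_iovlen = 2 };
+ *   recvmsg(lm_ctx->conn_fd, &m, MSG_WAITALL);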
+ */ + + hdr->msg_size -= sizeof *hdr; + if (hdr->msg_size < sizeof region_access) { + lm_log(lm_ctx, LM_ERR, "message size too small (%d)", hdr->msg_size); + return -EINVAL; + } + + ret = recv(lm_ctx->conn_fd, ®ion_access, sizeof region_access, 0); + if (ret == -1) { + lm_log(lm_ctx, LM_ERR, "failed to recv: %m"); + return -errno; + } + if (ret != sizeof region_access) { + lm_log(lm_ctx, LM_ERR, "bad region_access size %d", ret); + return -EINVAL; + } + if (region_access.region >= LM_DEV_NUM_REGS || region_access.count <= 0 ) { + lm_log(lm_ctx, LM_ERR, "bad region %d and/or count %d", + region_access.region, region_access.count); + return -EINVAL; + } + count = region_access.count; + offset = region_to_offset(region_access.region) + region_access.offset; + + ret = muser_access(lm_ctx, hdr->cmd == VFIO_USER_REGION_WRITE, + data, count, &offset); + if (ret != (int)region_access.count) { + lm_log(lm_ctx, LM_ERR, "bad region access acount, expected=%d, actual=%d", + region_access.count, ret); + /* FIXME we should return whatever has been accessed, not an error */ + if (ret >= 0) { + ret = -EINVAL; } + return ret; + } - ret = read(lm_ctx->fd, data, size); - if (ret < 0) { -#ifdef DEBUG - perror("read failed"); -#endif + *len = sizeof(region_access); + if (hdr->cmd == VFIO_USER_REGION_READ) { + *len += region_access.count; + } + + return 0; +} + +static int +handle_dirty_pages_get(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + int size, ret; + size_t i; + struct vfio_iommu_type1_dirty_bitmap_get *ranges; + + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + size = hdr->msg_size - sizeof(*hdr) - sizeof(struct vfio_iommu_type1_dirty_bitmap); + if (size % sizeof(struct vfio_iommu_type1_dirty_bitmap_get) != 0) { + return -EINVAL; + } + ranges = malloc(size); + if (ranges == NULL) { + return -errno; + } + ret = recv(lm_ctx->conn_fd, ranges, size, 0); + if (ret == -1) { + ret = -errno; + goto out; + } + if (ret != size) { + ret = -EINVAL; + goto out; + } + *nr_iovecs = 1 + size / sizeof(struct vfio_iommu_type1_dirty_bitmap_get); + *iovecs = malloc(*nr_iovecs * sizeof(struct iovec)); + if (*iovecs == NULL) { + ret = -errno; + goto out; + } + + for (i = 1; i < *nr_iovecs; i++) { + struct vfio_iommu_type1_dirty_bitmap_get *r = &ranges[(i - 1)]; /* FIXME ugly indexing */ + ret = dma_controller_dirty_page_get(lm_ctx->dma, r->iova, r->size, + r->bitmap.pgsize, r->bitmap.size, + (char**)&((*iovecs)[i].iov_base)); + if (ret != 0) { goto out; } + (*iovecs)[i].iov_len = r->bitmap.size; } +out: + if (ret != 0) { + if (*iovecs != NULL) { + free(*iovecs); + *iovecs = NULL; + } + } + free(ranges); + return ret; +} - ret = (int)do_muser_ioctl(lm_ctx, &cmd->ioctl, data); +static int +handle_dirty_pages(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + struct iovec **iovecs, size_t *nr_iovecs) +{ + struct vfio_iommu_type1_dirty_bitmap dirty_bitmap; + int ret; -out: + assert(lm_ctx != NULL); + assert(hdr != NULL); + assert(iovecs != NULL); + assert(nr_iovecs != NULL); + + if (hdr->msg_size - sizeof *hdr < sizeof dirty_bitmap) { + lm_log(lm_ctx, LM_ERR, "invalid header size %lu", hdr->msg_size); + return -EINVAL; + } + + /* FIXME must also check argsz */ + + ret = recv(lm_ctx->conn_fd, &dirty_bitmap, sizeof dirty_bitmap, 0); + if (ret == -1) { + return -errno; + } + if ((size_t)ret < sizeof dirty_bitmap) { + return -EINVAL; + } + + if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) { + ret = 
dma_controller_dirty_page_logging_start(lm_ctx->dma, + lm_ctx->migration.pgsize); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) { + ret = dma_controller_dirty_page_logging_stop(lm_ctx->dma); + } else if (dirty_bitmap.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) { + ret = handle_dirty_pages_get(lm_ctx, hdr, iovecs, nr_iovecs); + } else { + ret = -EINVAL; + } - free(data); return ret; } +/* + * FIXME return value is messed up, sometimes we return -1 and set errno while + * other times we return -errno. Fix. + */ + static int -drive_loop(lm_ctx_t *lm_ctx) +process_request(lm_ctx_t *lm_ctx) { - struct muser_cmd cmd = { 0 }; - int err; + struct vfio_user_header hdr = { 0, }; + int ret; + int *fds = NULL; + int nr_fds; + struct vfio_irq_info irq_info; + struct vfio_device_info dev_info; + struct vfio_region_info *dev_reg_info = NULL; + struct iovec _iovecs[2] = { { 0, } }; + struct iovec *iovecs = NULL; + size_t nr_iovecs = 0; + bool free_iovec_data = true; - do { - err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_WAIT, &cmd); - if (err < 0) { - return err; + assert(lm_ctx != NULL); + + if (lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX].size > 0 && + lm_ctx->migration.info.device_state == VFIO_DEVICE_STATE_STOP) { + return -ESHUTDOWN; + } + + nr_fds = lm_ctx->client_max_fds; + fds = alloca(nr_fds * sizeof(int)); + + /* FIXME get request shouldn't set errno, it should return it as -errno */ + ret = transports_ops[lm_ctx->trans].get_request(lm_ctx, &hdr, fds, &nr_fds); + if (unlikely(ret < 0)) { + if (ret == -EAGAIN || ret == -EWOULDBLOCK) { + return 0; + } + if (ret != -EINTR) { + lm_log(lm_ctx, LM_ERR, "failed to receive request: %s", strerror(-ret)); } + return ret; + } + if (unlikely(ret == 0)) { + if (errno == EINTR) { + return -EINTR; + } + if (errno == 0) { + lm_log(lm_ctx, LM_INF, "VFIO client closed connection"); + } else { + lm_log(lm_ctx, LM_ERR, "end of file: %m"); + } + return -ENOTCONN; + } + + if (ret < (int)sizeof hdr) { + lm_log(lm_ctx, LM_ERR, "short header read %d", ret); + return -EINVAL; + } - switch (cmd.type) { - case MUSER_IOCTL: - err = muser_ioctl(lm_ctx, &cmd); + if (hdr.flags.type != VFIO_USER_F_TYPE_COMMAND) { + lm_log(lm_ctx, LM_ERR, "header not a request"); + return -EINVAL; + } + + if (hdr.msg_size < sizeof hdr) { + lm_log(lm_ctx, LM_ERR, "bad size in header %d", hdr.msg_size); + return -EINVAL; + } + + /* FIXME in most of the following function we check that hdr.count is >= + * than the command-specific struct and there is an additional recv(2) for + * that data. We should eliminate duplicating this common code and move it + * here. 
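+ * E.g. a helper along the lines of recv_payload(lm_ctx, &hdr, buf, size)
+ * (name hypothetical) could do the size check and the recv(2) in one
+ * place.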
+ */ + + switch (hdr.cmd) { + case VFIO_USER_DMA_MAP: + case VFIO_USER_DMA_UNMAP: + ret = handle_dma_map_or_unmap(lm_ctx, &hdr, + hdr.cmd == VFIO_USER_DMA_MAP, + fds, nr_fds); break; - case MUSER_READ: - case MUSER_WRITE: - err = muser_access(lm_ctx, &cmd, cmd.type == MUSER_WRITE); + case VFIO_USER_DEVICE_GET_INFO: + ret = handle_device_get_info(lm_ctx, &hdr, &dev_info); + if (ret == 0) { + _iovecs[1].iov_base = &dev_info; + _iovecs[1].iov_len = dev_info.argsz; + iovecs = _iovecs; + nr_iovecs = 2; + } break; - case MUSER_MMAP: - err = muser_mmap(lm_ctx, &cmd); + case VFIO_USER_DEVICE_GET_REGION_INFO: + ret = handle_device_get_region_info(lm_ctx, &hdr, &dev_reg_info); + if (ret == 0) { + _iovecs[1].iov_base = dev_reg_info; + _iovecs[1].iov_len = dev_reg_info->argsz; + iovecs = _iovecs; + nr_iovecs = 2; + } + break; + case VFIO_USER_DEVICE_GET_IRQ_INFO: + ret = handle_device_get_irq_info(lm_ctx, &hdr, &irq_info); + if (ret == 0) { + _iovecs[1].iov_base = &irq_info; + _iovecs[1].iov_len = sizeof irq_info; + iovecs = _iovecs; + nr_iovecs = 2; + } break; - case MUSER_DMA_MMAP: - err = muser_dma_map(lm_ctx, &cmd); + case VFIO_USER_DEVICE_SET_IRQS: + ret = handle_device_set_irqs(lm_ctx, &hdr, fds, nr_fds); break; - case MUSER_DMA_MUNMAP: - err = muser_dma_unmap(lm_ctx, &cmd); + case VFIO_USER_REGION_READ: + case VFIO_USER_REGION_WRITE: + iovecs = _iovecs; + ret = handle_region_access(lm_ctx, &hdr, &iovecs[1].iov_base, + &iovecs[1].iov_len); + nr_iovecs = 2; + break; + case VFIO_USER_DEVICE_RESET: + ret = handle_device_reset(lm_ctx); + break; + case VFIO_USER_DIRTY_PAGES: + ret = handle_dirty_pages(lm_ctx, &hdr, &iovecs, &nr_iovecs); + if (ret >= 0) { + free_iovec_data = false; + } break; default: - lm_log(lm_ctx, LM_ERR, "bad command %d\n", cmd.type); - continue; - } - cmd.err = err; - err = ioctl(lm_ctx->fd, MUSER_DEV_CMD_DONE, &cmd); - if (err < 0) { - lm_log(lm_ctx, LM_ERR, "failed to complete command: %s\n", - strerror(errno)); + lm_log(lm_ctx, LM_ERR, "bad command %d", hdr.cmd); + return -EINVAL; + } + + /* + * TODO: In case of error during command handling set errno respectively + * in the reply message. + */ + if (ret < 0) { + lm_log(lm_ctx, LM_ERR, "failed to handle command %d: %s", hdr.cmd, + strerror(-ret)); + assert(false); /* FIXME */ + } + ret = _send_vfio_user_msg(lm_ctx->conn_fd, hdr.msg_id, true, + 0, iovecs, nr_iovecs, NULL, 0); + if (unlikely(ret < 0)) { + lm_log(lm_ctx, LM_ERR, "failed to complete command: %s", + strerror(-ret)); + } + if (iovecs != NULL && iovecs != _iovecs) { + if (free_iovec_data) { + size_t i; + for (i = 0; i < nr_iovecs; i++) { + free(iovecs[i].iov_base); + } } - // TODO: Figure out a clean way to get out of the loop. - } while (1); + free(iovecs); + } - return err; + return ret; } int lm_ctx_drive(lm_ctx_t *lm_ctx) { + int err; + if (lm_ctx == NULL) { errno = EINVAL; return -1; } - return drive_loop(lm_ctx); -} + do { + err = process_request(lm_ctx); + } while (err >= 0); -static int -dev_detach(int dev_fd) -{ - return close(dev_fd); + return err; } -static int -dev_attach(const char *uuid) +int +lm_ctx_poll(lm_ctx_t *lm_ctx) { - char *path; - int dev_fd; int err; - err = asprintf(&path, "/dev/" MUSER_DEVNODE "/%s", uuid); - if (err != (int)(strlen(MUSER_DEVNODE) + strlen(uuid) + 6)) { - return -1; + if (unlikely((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0)) { + return -ENOTSUP; } - dev_fd = open(path, O_RDWR); - - free(path); + err = process_request(lm_ctx); - return dev_fd; + return err >= 0 ? 
0 : err; } +/* FIXME this is not enough anymore, check muser_mmap */ void * lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length) { @@ -1035,38 +2273,64 @@ lm_mmap(lm_ctx_t *lm_ctx, off_t offset, size_t length) lm_ctx->fd, offset); } -int -lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t vector) +static int validate_irq_subindex(lm_ctx_t *lm_ctx, uint32_t subindex) { - eventfd_t val = 1; - if ((lm_ctx == NULL) || (vector >= lm_ctx->irqs.max_ivs)) { - lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", vector, + if ((lm_ctx == NULL) || (subindex >= lm_ctx->irqs.max_ivs)) { + lm_log(lm_ctx, LM_ERR, "bad IRQ %d, max=%d\n", subindex, lm_ctx->irqs.max_ivs); + /* FIXME should return -errno */ errno = EINVAL; return -1; } - if (lm_ctx->irqs.efds[vector] == -1) { - lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", vector); + return 0; +} + +int +lm_irq_trigger(lm_ctx_t *lm_ctx, uint32_t subindex) +{ + int ret; + eventfd_t val = 1; + + ret = validate_irq_subindex(lm_ctx, subindex); + if (ret < 0) { + return ret; + } + + if (lm_ctx->irqs.efds[subindex] == -1) { + lm_log(lm_ctx, LM_ERR, "no fd for interrupt %d\n", subindex); + /* FIXME should return -errno */ errno = ENOENT; return -1; } - if (vector == LM_DEV_INTX_IRQ_IDX && !lm_ctx->pci_config_space->hdr.cmd.id) { - lm_log(lm_ctx, LM_ERR, "failed to trigger INTx IRQ, INTx disabled\n"); - errno = EINVAL; + return eventfd_write(lm_ctx->irqs.efds[subindex], val); +} + +int +lm_irq_message(lm_ctx_t *lm_ctx, uint32_t subindex) +{ + int ret, msg_id = 1; + struct vfio_user_irq_info irq_info; + + ret = validate_irq_subindex(lm_ctx, subindex); + if (ret < 0) { return -1; - } else if (vector == LM_DEV_MSIX_IRQ_IDX) { - /* - * FIXME must check that MSI-X capability exists during creation time - * FIXME need to check that MSI-X is enabled and that it's not masked. - * Currently that's not possible because libmuser doesn't care about - * the internals of a capability. - */ } - return eventfd_write(lm_ctx->irqs.efds[vector], val); + irq_info.subindex = subindex; + ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, + VFIO_USER_VM_INTERRUPT, + &irq_info, sizeof irq_info, + NULL, 0, NULL, NULL, 0); + if (ret < 0) { + /* FIXME should return -errno */ + errno = -ret; + return -1; + } + + return 0; } static void @@ -1081,16 +2345,50 @@ free_sparse_mmap_areas(lm_reg_info_t *reg_info) void lm_ctx_destroy(lm_ctx_t *lm_ctx) { + int ret; + if (lm_ctx == NULL) { return; } + free(lm_ctx->uuid); + + /* + * FIXME The following cleanup can be dangerous depending on how lm_ctx_destroy + * is called since it might delete files it did not create. Improve by + * acquiring a lock on the directory. 
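+ * One option might be to flock(2) iommu_dir_fd in init_sock and hold the
+ * lock for the lifetime of the context.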
+ */ + + if (lm_ctx->iommu_dir_fd != -1) { + if ((ret = unlinkat(lm_ctx->iommu_dir_fd, IOMMU_GRP_NAME, 0)) == -1 + && errno != ENOENT) { + lm_log(lm_ctx, LM_DBG, "failed to remove " IOMMU_GRP_NAME ": " + "%m\n"); + } + if ((ret = unlinkat(lm_ctx->iommu_dir_fd, MUSER_SOCK, 0)) == -1 && + errno != ENOENT) { + lm_log(lm_ctx, LM_DBG, "failed to remove " MUSER_SOCK ": %m\n"); + } + if (close(lm_ctx->iommu_dir_fd) == -1) { + lm_log(lm_ctx, LM_DBG, "failed to close IOMMU dir fd %d: %m\n", + lm_ctx->iommu_dir_fd); + } + } + if (lm_ctx->iommu_dir != NULL) { + if ((ret = rmdir(lm_ctx->iommu_dir)) == -1 && errno != ENOENT) { + lm_log(lm_ctx, LM_DBG, "failed to remove %s: %m\n", + lm_ctx->iommu_dir); + } + free(lm_ctx->iommu_dir); + } + free(lm_ctx->pci_config_space); - dev_detach(lm_ctx->fd); + transports_ops[lm_ctx->trans].detach(lm_ctx); if (lm_ctx->dma != NULL) { - dma_controller_destroy(lm_ctx, lm_ctx->dma); + dma_controller_destroy(lm_ctx->dma); } free_sparse_mmap_areas(lm_ctx->pci_info.reg_info); + free(lm_ctx->caps); free(lm_ctx); // FIXME: Maybe close any open irq efds? Unmap stuff? } @@ -1125,6 +2423,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) { lm_reg_info_t *cfg_reg; const lm_reg_info_t zero_reg = { 0 }; + lm_reg_info_t *migr_reg; int i; assert(lm_ctx != NULL); @@ -1171,7 +2470,7 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) // Initialise capabilities. if (dev_info->nr_caps > 0) { - lm_ctx->caps = caps_create(dev_info->caps, dev_info->nr_caps); + lm_ctx->caps = caps_create(lm_ctx, dev_info->caps, dev_info->nr_caps); if (lm_ctx->caps == NULL) { lm_log(lm_ctx, LM_ERR, "failed to create PCI capabilities: %m\n"); goto err; @@ -1181,6 +2480,28 @@ pci_config_setup(lm_ctx_t *lm_ctx, const lm_dev_info_t *dev_info) lm_ctx->pci_config_space->hdr.cap = PCI_STD_HEADER_SIZEOF; } + /* + * Check the migration region. + */ + migr_reg = &lm_ctx->pci_info.reg_info[LM_DEV_MIGRATION_REG_IDX]; + if (migr_reg->size > 0) { + if (migr_reg->size < sizeof(struct vfio_device_migration_info)) { + return -EINVAL; + } + + /* FIXME this should be done in lm_ctx_run or poll */ + lm_ctx->migration.info.device_state = VFIO_DEVICE_STATE_RUNNING; + + lm_ctx->migration.callbacks = dev_info->migration_callbacks; + if (lm_ctx->migration.callbacks.transition == NULL || + lm_ctx->migration.callbacks.get_pending_bytes == NULL || + lm_ctx->migration.callbacks.prepare_data == NULL || + lm_ctx->migration.callbacks.read_data == NULL || + lm_ctx->migration.callbacks.write_data == NULL) { + return -EINVAL; + } + } + return 0; err: @@ -1212,6 +2533,18 @@ pci_info_bounce(lm_pci_info_t *dst, const lm_pci_info_t *src) dst->cc = src->cc; } +int +lm_ctx_try_attach(lm_ctx_t *lm_ctx) +{ + assert(lm_ctx != NULL); + + if ((lm_ctx->flags & LM_FLAG_ATTACH_NB) == 0) { + errno = EINVAL; + return -1; + } + return transports_ops[lm_ctx->trans].attach(lm_ctx); +} + lm_ctx_t * lm_ctx_create(const lm_dev_info_t *dev_info) { @@ -1226,6 +2559,11 @@ lm_ctx_create(const lm_dev_info_t *dev_info) return NULL; } + if (dev_info->trans != LM_TRANS_SOCK) { + errno = EINVAL; + return NULL; + } + /* * FIXME need to check that the number of MSI and MSI-X IRQs are valid * (1, 2, 4, 8, 16 or 32 for MSI and up to 2048 for MSI-X). @@ -1244,6 +2582,9 @@ lm_ctx_create(const lm_dev_info_t *dev_info) if (lm_ctx == NULL) { return NULL; } + lm_ctx->trans = dev_info->trans; + + lm_ctx->iommu_dir_fd = -1; // Set context irq information. 
for (i = 0; i < max_ivs; i++) { @@ -1259,10 +2600,26 @@ lm_ctx_create(const lm_dev_info_t *dev_info) lm_ctx->log = dev_info->log; lm_ctx->log_lvl = dev_info->log_lvl; lm_ctx->reset = dev_info->reset; + lm_ctx->flags = dev_info->flags; + + lm_ctx->uuid = strdup(dev_info->uuid); + if (lm_ctx->uuid == NULL) { + err = errno; + goto out; + } // Bounce the provided pci_info into the context. pci_info_bounce(&lm_ctx->pci_info, &dev_info->pci_info); + /* + * FIXME above memcpy also copies reg_info->mmap_areas. If pci_config_setup + * fails then we try to free reg_info->mmap_areas, which is wrong because + * this is a user pointer. + */ + for (i = 0; i < ARRAY_SIZE(lm_ctx->pci_info.reg_info); i++) { + lm_ctx->pci_info.reg_info[i].mmap_areas = NULL; + } + // Setup the PCI config space for this context. err = pci_config_setup(lm_ctx, dev_info); if (err != 0) { @@ -1276,65 +2633,53 @@ lm_ctx_create(const lm_dev_info_t *dev_info) goto out; } - // Attach to the muser control device. - lm_ctx->fd = dev_attach(dev_info->uuid); - if (lm_ctx->fd == -1) { - err = errno; - goto out; + if (transports_ops[dev_info->trans].init != NULL) { + err = transports_ops[dev_info->trans].init(lm_ctx); + if (err < 0) { + goto out; + } + lm_ctx->fd = err; + } + err = 0; + + // Attach to the muser control device. With LM_FLAG_ATTACH_NB caller is + // always expected to call lm_ctx_try_attach(). + if ((dev_info->flags & LM_FLAG_ATTACH_NB) == 0) { + lm_ctx->conn_fd = transports_ops[dev_info->trans].attach(lm_ctx); + if (lm_ctx->conn_fd < 0) { + err = lm_ctx->conn_fd; + if (err != EINTR) { + lm_log(lm_ctx, LM_ERR, "failed to attach: %s", + strerror(-err)); + } + goto out; + } } + lm_ctx->map_dma = dev_info->map_dma; + lm_ctx->unmap_dma = dev_info->unmap_dma; + // Create the internal DMA controller. - lm_ctx->dma = dma_controller_create(LM_DMA_REGIONS); - if (lm_ctx->dma == NULL) { - err = errno; - goto out; + if (lm_ctx->unmap_dma != NULL) { + lm_ctx->dma = dma_controller_create(lm_ctx, LM_DMA_REGIONS); + if (lm_ctx->dma == NULL) { + err = errno; + goto out; + } } out: - if (err) { - if (lm_ctx) { - dma_controller_destroy(lm_ctx, lm_ctx->dma); - dev_detach(lm_ctx->fd); - free_sparse_mmap_areas(lm_ctx->pci_info.reg_info); - free(lm_ctx->pci_config_space); - free(lm_ctx); + if (err != 0) { + if (lm_ctx != NULL) { + lm_ctx_destroy(lm_ctx); lm_ctx = NULL; } - errno = err; + errno = -err; } return lm_ctx; } -#ifdef DEBUG -static void -dump_buffer(lm_ctx_t *lm_ctx, const char *prefix, - const char *buf, uint32_t count) -{ - int i; - const size_t bytes_per_line = 0x8; - - if (strcmp(prefix, "")) { - lm_log(lm_ctx, LM_DBG, "%s\n", prefix); - } - for (i = 0; i < (int)count; i++) { - if (i % bytes_per_line != 0) { - lm_log(lm_ctx, LM_DBG, " "); - } - /* TODO valgrind emits a warning if count is 1 */ - lm_log(lm_ctx, LM_DBG, "0x%02x", *(buf + i)); - if ((i + 1) % bytes_per_line == 0) { - lm_log(lm_ctx, LM_DBG, "\n"); - } - } - if (i % bytes_per_line != 0) { - lm_log(lm_ctx, LM_DBG, "\n"); - } -} -#else -#define dump_buffer(lm_ctx, prefix, buf, count) -#endif - /* * Returns a pointer to the standard part of the PCI configuration space. 
*/ @@ -1364,21 +2709,34 @@ lm_get_region_info(lm_ctx_t *lm_ctx) inline int lm_addr_to_sg(lm_ctx_t *lm_ctx, dma_addr_t dma_addr, - uint32_t len, dma_sg_t *sg, int max_sg) + uint32_t len, dma_sg_t *sg, int max_sg, int prot) { - return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg); + assert(lm_ctx != NULL); + + if (unlikely(lm_ctx->unmap_dma == NULL)) { + errno = EINVAL; + return -1; + } + return dma_addr_to_sg(lm_ctx->dma, dma_addr, len, sg, max_sg, prot); } inline int lm_map_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, struct iovec *iov, int cnt) { + if (unlikely(lm_ctx->unmap_dma == NULL)) { + errno = EINVAL; + return -1; + } return dma_map_sg(lm_ctx->dma, sg, iov, cnt); } inline void lm_unmap_sg(lm_ctx_t *lm_ctx, const dma_sg_t *sg, struct iovec *iov, int cnt) { + if (unlikely(lm_ctx->unmap_dma == NULL)) { + return; + } return dma_unmap_sg(lm_ctx->dma, sg, iov, cnt); } @@ -1396,4 +2754,66 @@ lm_ctx_run(lm_dev_info_t *dev_info) return ret; } +uint8_t * +lm_ctx_get_cap(lm_ctx_t *lm_ctx, uint8_t id) +{ + assert(lm_ctx != NULL); + + return cap_find_by_id(lm_ctx, id); +} + +int +lm_dma_read(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) +{ + struct vfio_user_dma_region_access *dma_recv; + struct vfio_user_dma_region_access dma_send; + int recv_size; + int msg_id = 1, ret; + + assert(lm_ctx != NULL); + assert(sg != NULL); + + recv_size = sizeof(*dma_recv) + sg->length; + + dma_recv = calloc(recv_size, 1); + if (dma_recv == NULL) { + return -ENOMEM; + } + + dma_send.addr = sg->dma_addr; + dma_send.count = sg->length; + ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_READ, + &dma_send, sizeof dma_send, NULL, 0, NULL, + dma_recv, recv_size); + memcpy(data, dma_recv->data, sg->length); /* FIXME no need for memcpy */ + free(dma_recv); + + return ret; +} + +int +lm_dma_write(lm_ctx_t *lm_ctx, dma_sg_t *sg, void *data) +{ + struct vfio_user_dma_region_access *dma_send, dma_recv; + int send_size = sizeof(*dma_send) + sg->length; + int msg_id = 1, ret; + + assert(lm_ctx != NULL); + assert(sg != NULL); + + dma_send = calloc(send_size, 1); + if (dma_send == NULL) { + return -ENOMEM; + } + dma_send->addr = sg->dma_addr; + dma_send->count = sg->length; + memcpy(dma_send->data, data, sg->length); /* FIXME no need to copy! 
*/ + ret = send_recv_vfio_user_msg(lm_ctx->conn_fd, msg_id, VFIO_USER_DMA_WRITE, + dma_send, send_size, + NULL, 0, NULL, &dma_recv, sizeof(dma_recv)); + free(dma_send); + + return ret; +} + /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/muser_pci.c b/lib/muser_pci.c index 36692ab..2846301 100644 --- a/lib/muser_pci.c +++ b/lib/muser_pci.c @@ -52,7 +52,7 @@ muser_pci_hdr_write_bar(lm_ctx_t *lm_ctx, uint16_t bar_index, const char *buf) lm_reg_info_t *reg_info = lm_get_region_info(lm_ctx); lm_pci_hdr_t *hdr; - assert(lm_ctx); + assert(lm_ctx != NULL); if (reg_info[bar_index].size == 0) { return; @@ -86,15 +86,15 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci, { uint16_t v; - assert(ctx); + assert(ctx != NULL); if (count != 2) { lm_log(ctx, LM_ERR, "bad write command size %d\n", count); return -EINVAL; } - assert(pci); - assert(buf); + assert(pci != NULL); + assert(buf != NULL); v = *(uint16_t*)buf; @@ -153,17 +153,35 @@ handle_command_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci, if ((v & PCI_COMMAND_INTX_DISABLE) == PCI_COMMAND_INTX_DISABLE) { if (!pci->hdr.cmd.id) { pci->hdr.cmd.id = 0x1; - lm_log(ctx, LM_INF, "INTx emulation enabled\n"); + lm_log(ctx, LM_INF, "INTx emulation disabled\n"); } v &= ~PCI_COMMAND_INTX_DISABLE; } else { if (pci->hdr.cmd.id) { pci->hdr.cmd.id = 0x0; - lm_log(ctx, LM_INF, "INTx emulation disabled\n"); + lm_log(ctx, LM_INF, "INTx emulation enabled\n"); } } - if (v) { + if ((v & PCI_COMMAND_INVALIDATE) == PCI_COMMAND_INVALIDATE) { + if (!pci->hdr.cmd.mwie) { + pci->hdr.cmd.mwie = 1U; + lm_log(ctx, LM_INF, "memory write and invalidate enabled\n"); + } + v &= ~PCI_COMMAND_INVALIDATE; + } else { + if (pci->hdr.cmd.mwie) { + pci->hdr.cmd.mwie = 0; + lm_log(ctx, LM_INF, "memory write and invalidate disabled"); + } + } + + if ((v & PCI_COMMAND_VGA_PALETTE) == PCI_COMMAND_VGA_PALETTE) { + lm_log(ctx, LM_INF, "enabling VGA palette snooping ignored\n"); + v &= ~PCI_COMMAND_VGA_PALETTE; + } + + if (v != 0) { lm_log(ctx, LM_ERR, "unconsumed command flags %x\n", v); return -EINVAL; } @@ -177,8 +195,8 @@ handle_erom_write(lm_ctx_t *ctx, lm_pci_config_space_t *pci, { uint32_t v; - assert(ctx); - assert(pci); + assert(ctx != NULL); + assert(pci != NULL); if (count != 0x4) { lm_log(ctx, LM_ERR, "bad EROM count %d\n", count); @@ -207,8 +225,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset, lm_pci_config_space_t *pci; int ret = 0; - assert(lm_ctx); - assert(buf); + assert(lm_ctx != NULL); + assert(buf != NULL); pci = lm_get_pci_config_space(lm_ctx); @@ -248,8 +266,8 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset, ret = -EINVAL; } -#ifndef LM_TERSE_LOGGING - dump_buffer(lm_ctx, "PCI header", pci->hdr.raw, 0xff); +#ifdef LM_VERBOSE_LOGGING + dump_buffer("PCI header", (char*)pci->hdr.raw, 0xff); #endif return ret; @@ -263,18 +281,18 @@ muser_pci_hdr_write(lm_ctx_t *lm_ctx, uint16_t offset, * @count: output parameter that receives the number of bytes read/written */ static inline int -muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, - loff_t *pos, bool is_write, +muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count, + uint64_t *pos, bool is_write, char *buf) { - size_t _count; + uint32_t _count; loff_t _pos; int err = 0; - assert(lm_ctx); - assert(count); - assert(pos); - assert(buf); + assert(lm_ctx != NULL); + assert(count != NULL); + assert(pos != NULL); + assert(buf != NULL); _pos = *pos - region_to_offset(LM_DEV_CFG_REG_IDX); _count = MIN(*count, PCI_STD_HEADER_SIZEOF - _pos); @@ -290,20 +308,21 
@@ muser_do_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, } static inline bool -muser_is_pci_hdr_access(loff_t pos) +muser_is_pci_hdr_access(uint64_t pos) { - const off_t off = (loff_t) region_to_offset(LM_DEV_CFG_REG_IDX); - return pos - off >= 0 && pos - off < PCI_STD_HEADER_SIZEOF; + const uint64_t off = region_to_offset(LM_DEV_CFG_REG_IDX); + return pos >= off && pos - off < PCI_STD_HEADER_SIZEOF; } +/* FIXME this function is misleading, remove it */ int -muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, - loff_t *pos, bool is_write, +muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count, + uint64_t *pos, bool is_write, char *buf) { - assert(lm_ctx); - assert(count); - assert(pos); + assert(lm_ctx != NULL); + assert(count != NULL); + assert(pos != NULL); if (!muser_is_pci_hdr_access(*pos)) { return 0; diff --git a/lib/muser_priv.h b/lib/muser_priv.h index aa29f5a..097874a 100644 --- a/lib/muser_priv.h +++ b/lib/muser_priv.h @@ -35,9 +35,11 @@ #include "muser.h" +extern char *irq_to_str[]; + int -muser_pci_hdr_access(lm_ctx_t *lm_ctx, size_t *count, - loff_t *pos, bool write, char *buf); +muser_pci_hdr_access(lm_ctx_t *lm_ctx, uint32_t *count, + uint64_t *pos, bool write, char *buf); lm_reg_info_t * lm_get_region_info(lm_ctx_t *lm_ctx); @@ -45,4 +47,111 @@ lm_get_region_info(lm_ctx_t *lm_ctx); uint64_t region_to_offset(uint32_t region); +int +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *fds, int count); + +int +send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count); + + +int +recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, + uint16_t *msg_id, void *data, size_t *len); + +int +send_version(int sock, int major, int minor, uint16_t msg_id, bool is_reply, + char *caps); + +int +recv_version(int sock, int *major, int *minor, uint16_t *msg_id, bool is_reply, + int *max_fds, size_t *pgsize); + +int +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len); + +int +send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len); + +/* FIXME copied from include/linux/stddef.h, is this OK license-wise? 
*/ +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) + +static inline ssize_t get_minsz(unsigned int cmd) +{ + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return offsetofend(struct vfio_device_info, num_irqs); + case VFIO_DEVICE_GET_REGION_INFO: + return offsetofend(struct vfio_region_info, offset); + case VFIO_DEVICE_GET_IRQ_INFO: + return offsetofend(struct vfio_irq_info, count); + case VFIO_DEVICE_SET_IRQS: + return offsetofend(struct vfio_irq_set, count); + case VFIO_GROUP_GET_STATUS: + return offsetofend(struct vfio_group_status, flags); + case VFIO_GET_API_VERSION: + return 0; + case VFIO_CHECK_EXTENSION: + case VFIO_GROUP_SET_CONTAINER: + case VFIO_GROUP_UNSET_CONTAINER: + case VFIO_SET_IOMMU: + return sizeof(int); + case VFIO_IOMMU_GET_INFO: + return offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + case VFIO_IOMMU_MAP_DMA: + return offsetofend(struct vfio_iommu_type1_dma_map, size); + case VFIO_IOMMU_UNMAP_DMA: + return offsetofend(struct vfio_iommu_type1_dma_unmap, size); + case VFIO_GROUP_GET_DEVICE_FD: + case VFIO_DEVICE_RESET: + return 0; + } + return -EOPNOTSUPP; +} + +static inline const char* vfio_cmd_to_str(int cmd) { + switch (cmd) { + case VFIO_GET_API_VERSION: return "VFIO_GET_API_VERSION"; + case VFIO_CHECK_EXTENSION: return "VFIO_CHECK_EXTENSION"; + case VFIO_SET_IOMMU: return "VFIO_SET_IOMMU"; + case VFIO_GROUP_GET_STATUS: return "VFIO_GROUP_GET_STATUS"; + case VFIO_GROUP_SET_CONTAINER: return "VFIO_GROUP_SET_CONTAINER"; + case VFIO_GROUP_UNSET_CONTAINER: return "VFIO_GROUP_UNSET_CONTAINER"; + case VFIO_GROUP_GET_DEVICE_FD: return "VFIO_GROUP_GET_DEVICE_FD"; + case VFIO_DEVICE_GET_INFO: return "VFIO_DEVICE_GET_INFO"; + case VFIO_DEVICE_GET_REGION_INFO: return "VFIO_DEVICE_GET_REGION_INFO"; + case VFIO_DEVICE_GET_IRQ_INFO: return "VFIO_DEVICE_GET_IRQ_INFO"; + case VFIO_DEVICE_SET_IRQS: return "VFIO_DEVICE_SET_IRQS"; + case VFIO_DEVICE_RESET: return "VFIO_DEVICE_RESET"; + case VFIO_IOMMU_GET_INFO: return "VFIO_IOMMU_GET_INFO/VFIO_DEVICE_GET_PCI_HOT_RESET_INFO/VFIO_IOMMU_SPAPR_TCE_GET_INFO"; + case VFIO_IOMMU_MAP_DMA: return "VFIO_IOMMU_MAP_DMA/VFIO_DEVICE_PCI_HOT_RESET"; + case VFIO_IOMMU_UNMAP_DMA: return "VFIO_IOMMU_UNMAP_DMA"; + case VFIO_IOMMU_ENABLE: return "VFIO_IOMMU_ENABLE"; + case VFIO_IOMMU_DISABLE: return "VFIO_IOMMU_DISABLE"; + case VFIO_EEH_PE_OP: return "VFIO_EEH_PE_OP"; + case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_REGISTER_MEMORY"; + case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: return "VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY"; + case VFIO_IOMMU_SPAPR_TCE_CREATE: return "VFIO_IOMMU_SPAPR_TCE_CREATE"; + case VFIO_IOMMU_SPAPR_TCE_REMOVE: return "VFIO_IOMMU_SPAPR_TCE_REMOVE"; + } + return NULL; +} + #endif /* MUSER_PRIV_H */ + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lib/vfio_user.h b/lib/vfio_user.h new file mode 100644 index 0000000..19f751a --- /dev/null +++ b/lib/vfio_user.h @@ -0,0 +1,167 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VFIO_USER_H +#define _VFIO_USER_H + +#include <inttypes.h> +#include <linux/vfio.h> +#include <linux/version.h> + +enum vfio_user_command { + VFIO_USER_VERSION = 1, + VFIO_USER_DMA_MAP = 2, + VFIO_USER_DMA_UNMAP = 3, + VFIO_USER_DEVICE_GET_INFO = 4, + VFIO_USER_DEVICE_GET_REGION_INFO = 5, + VFIO_USER_DEVICE_GET_IRQ_INFO = 6, + VFIO_USER_DEVICE_SET_IRQS = 7, + VFIO_USER_REGION_READ = 8, + VFIO_USER_REGION_WRITE = 9, + VFIO_USER_DMA_READ = 10, + VFIO_USER_DMA_WRITE = 11, + VFIO_USER_VM_INTERRUPT = 12, + VFIO_USER_DEVICE_RESET = 13, + VFIO_USER_DIRTY_PAGES = 14, + VFIO_USER_MAX, +}; + +enum vfio_user_message_type { + VFIO_USER_MESSAGE_COMMAND = 0, + VFIO_USER_MESSAGE_REPLY = 1, +}; + +#define VFIO_USER_FLAGS_NO_REPLY (0x1) + +struct vfio_user_header { + uint16_t msg_id; + uint16_t cmd; + uint32_t msg_size; + struct { + uint32_t type : 4; +#define VFIO_USER_F_TYPE_COMMAND 0 +#define VFIO_USER_F_TYPE_REPLY 1 + uint32_t no_reply : 1; + uint32_t error : 1; + uint32_t resvd : 26; + } flags; + uint32_t error_no; +} __attribute__((packed)); + +struct vfio_user_dma_region { + uint64_t addr; + uint64_t size; + uint64_t offset; + uint32_t prot; + uint32_t flags; +#define VFIO_USER_F_DMA_REGION_MAPPABLE (0x0) +} __attribute__((packed)); + +struct vfio_user_region_access { + uint64_t offset; + uint32_t region; + uint32_t count; + uint8_t data[]; +} __attribute__((packed)); + +struct vfio_user_dma_region_access { + uint64_t addr; + uint32_t count; + uint8_t data[]; +} __attribute__((packed)); + +struct vfio_user_irq_info { + uint32_t subindex; +} __attribute__((packed)); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) + +/* copied from <linux/vfio.h> */ + +#define VFIO_REGION_TYPE_MIGRATION (3) +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | \ + VFIO_DEVICE_STATE_RESUMING) + +#define VFIO_DEVICE_STATE_VALID(state) \ + (state & VFIO_DEVICE_STATE_RESUMING ? 
\
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+
+#define VFIO_DEVICE_STATE_IS_ERROR(state) \
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING))
+
+#define VFIO_DEVICE_STATE_SET_ERROR(state) \
+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_SAVING | \
+ VFIO_DEVICE_STATE_RESUMING)
+
+ __u32 reserved;
+ __u64 pending_bytes;
+ __u64 data_offset;
+ __u64 data_size;
+};
+
+struct vfio_bitmap {
+ __u64 pgsize; /* page size for bitmap in bytes */
+ __u64 size; /* in bytes */
+ __u64 *data; /* one bit per page */
+};
+
+struct vfio_iommu_type1_dirty_bitmap {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_START (1 << 0)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP (1 << 1)
+#define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP (1 << 2)
+ __u8 data[];
+};
+
+struct vfio_iommu_type1_dirty_bitmap_get {
+ __u64 iova; /* IO virtual address */
+ __u64 size; /* Size of iova range */
+ struct vfio_bitmap bitmap;
+};
+
+#endif
+
+#endif
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */
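
The lm_irq_trigger()/lm_irq_message() pair above gives a device server two ways to raise an interrupt: write the per-vector eventfd that the client registered via VFIO_USER_DEVICE_SET_IRQS, or send an explicit VFIO_USER_VM_INTERRUPT message. A minimal sketch combining the two; the fall-back-on-ENOENT policy is illustrative, not part of the library:

#include <errno.h>

#include "muser.h"

/*
 * Raise vector 0 of whichever IRQ type the client has configured.
 * lm_irq_trigger() writes the per-vector eventfd; if none was registered it
 * fails with errno == ENOENT, in which case we fall back to an explicit
 * VFIO_USER_VM_INTERRUPT message via lm_irq_message().
 */
static int
raise_vector0(lm_ctx_t *lm_ctx)
{
    if (lm_irq_trigger(lm_ctx, 0) == 0) {
        return 0;
    }
    if (errno == ENOENT) {
        return lm_irq_message(lm_ctx, 0);
    }
    return -1;
}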
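
lm_ctx_create() now refuses any transport other than LM_TRANS_SOCK, and with LM_FLAG_ATTACH_NB it defers accepting the client to lm_ctx_try_attach(). A sketch of that flow, assuming a missing client surfaces as EAGAIN on the non-blocking listener; the error conventions are explicitly in flux in this tree (the FIXMEs note a mix of -1/errno and -errno), so the retry test below copes with either:

#include <errno.h>
#include <stddef.h>
#include <unistd.h>

#include "muser.h"

static lm_ctx_t *
create_and_attach(lm_dev_info_t *dev_info)
{
    lm_ctx_t *lm_ctx;
    int ret;

    dev_info->trans = LM_TRANS_SOCK;        /* the only accepted transport */
    dev_info->flags |= LM_FLAG_ATTACH_NB;   /* don't block in lm_ctx_create */

    lm_ctx = lm_ctx_create(dev_info);
    if (lm_ctx == NULL) {
        return NULL;
    }

    while ((ret = lm_ctx_try_attach(lm_ctx)) < 0) {
        int err = ret == -1 ? errno : -ret;

        if (err != EAGAIN && err != EWOULDBLOCK) {
            lm_ctx_destroy(lm_ctx);
            return NULL;
        }
        usleep(100 * 1000);                 /* no client yet, try again */
    }
    return lm_ctx;
}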
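
The DMA helpers now take a protection argument and are gated on the device having registered an unmap_dma callback at creation time. A sketch of the usual access pattern, lm_addr_to_sg() then lm_map_sg(), falling back to the new message-based lm_dma_read() when the region cannot be mapped; it assumes lm_addr_to_sg() returns the number of scatter-gather entries used and that the range fits in a single entry:

#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include "muser.h"

/*
 * Read guest memory at 'gpa'. If the region is mappable into the server,
 * lm_map_sg() yields an iovec we can memcpy() from directly; otherwise
 * lm_dma_read() round-trips a VFIO_USER_DMA_READ message to the client.
 */
static int
read_guest(lm_ctx_t *lm_ctx, dma_addr_t gpa, void *buf, uint32_t len)
{
    dma_sg_t sg;
    struct iovec iov;

    if (lm_addr_to_sg(lm_ctx, gpa, len, &sg, 1, PROT_READ) != 1) {
        return -1;                          /* not one contiguous region */
    }
    if (lm_map_sg(lm_ctx, &sg, &iov, 1) == 0) {
        memcpy(buf, iov.iov_base, len);
        lm_unmap_sg(lm_ctx, &sg, &iov, 1);
        return 0;
    }
    return lm_dma_read(lm_ctx, &sg, buf);   /* not mappable: go over the wire */
}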
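
lm_ctx_get_cap() exposes cap_find_by_id(), returning a pointer into the emulated configuration space for a given capability ID. A sketch that reads the MSI-X message control word, assuming the standard offsets from <linux/pci_regs.h>:

#include <stdint.h>
#include <string.h>
#include <linux/pci_regs.h>

#include "muser.h"

/*
 * Fetch the MSI-X message control word; PCI_MSIX_FLAGS is its offset within
 * the capability. memcpy() avoids an unaligned/aliasing read from the raw
 * config-space buffer. Returns 0 if the capability is absent.
 */
static uint16_t
msix_control(lm_ctx_t *lm_ctx)
{
    uint8_t *cap = lm_ctx_get_cap(lm_ctx, PCI_CAP_ID_MSIX);
    uint16_t ctrl = 0;

    if (cap != NULL) {
        memcpy(&ctrl, cap + PCI_MSIX_FLAGS, sizeof(ctrl));
    }
    return ctrl;
}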
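
get_minsz() computes, per ioctl, the smallest argsz a caller may pass: the offset just past the last mandatory field, which is exactly what offsetofend() expresses. A self-contained check of the VFIO_DEVICE_GET_INFO case:

#include <assert.h>
#include <stddef.h>
#include <linux/vfio.h>

#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
#define offsetofend(TYPE, MEMBER) \
    (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))

int main(void)
{
    /*
     * Everything up to and including num_irqs must be present: argsz,
     * flags, num_regions and num_irqs are four consecutive __u32s.
     */
    assert(offsetofend(struct vfio_device_info, num_irqs) == 16);
    return 0;
}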
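
vfio_user.h defines the on-the-wire framing: every message starts with a vfio_user_header, and msg_size is taken here to cover the header plus everything that follows it. A sender-side sketch of VFIO_USER_DMA_WRITE, the same layout lm_dma_write() builds; 'sock' is assumed to be the connected UNIX-domain socket and a short write is treated as an error for brevity:

#include <stdint.h>
#include <sys/socket.h>
#include <sys/uio.h>

#include "vfio_user.h"

static int
send_dma_write(int sock, uint16_t msg_id, uint64_t addr,
               const void *data, uint32_t count)
{
    /* header, then the DMA access descriptor, then the payload */
    struct vfio_user_header hdr = {
        .msg_id = msg_id,
        .cmd = VFIO_USER_DMA_WRITE,
        .msg_size = sizeof(hdr) +
                    sizeof(struct vfio_user_dma_region_access) + count,
        .flags = { .type = VFIO_USER_F_TYPE_COMMAND },
    };
    struct vfio_user_dma_region_access access = {
        .addr = addr,
        .count = count,
    };
    struct iovec iov[3] = {
        { .iov_base = &hdr,         .iov_len = sizeof(hdr) },
        { .iov_base = &access,      .iov_len = sizeof(access) },
        { .iov_base = (void *)data, .iov_len = count },
    };
    struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 3 };

    return sendmsg(sock, &msg, 0) == (ssize_t)hdr.msg_size ? 0 : -1;
}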