Diffstat (limited to 'util')
-rw-r--r--  util/Makefile.objs  |   1
-rw-r--r--  util/aio-posix.c    | 187
-rw-r--r--  util/async.c        | 237
-rw-r--r--  util/log.c          |   2
-rw-r--r--  util/module.c       |   7
-rw-r--r--  util/nvdimm-utils.c |  29
-rw-r--r--  util/oslib-posix.c  |  32
-rw-r--r--  util/vfio-helpers.c |   6
8 files changed, 334 insertions(+), 167 deletions(-)
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 11262aa..6b38b67 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o
util-obj-y += host-utils.o
util-obj-y += bitmap.o bitops.o hbitmap.o
util-obj-y += fifo8.o
+util-obj-y += nvdimm-utils.o
util-obj-y += cacheinfo.o
util-obj-y += error.o qemu-error.o
util-obj-y += qemu-print.o
diff --git a/util/aio-posix.c b/util/aio-posix.c
index a4977f5..9e1befc 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -15,6 +15,7 @@
#include "qemu/osdep.h"
#include "block/block.h"
+#include "qemu/rcu.h"
#include "qemu/rcu_queue.h"
#include "qemu/sockets.h"
#include "qemu/cutils.h"
@@ -31,12 +32,23 @@ struct AioHandler
AioPollFn *io_poll;
IOHandler *io_poll_begin;
IOHandler *io_poll_end;
- int deleted;
void *opaque;
bool is_external;
QLIST_ENTRY(AioHandler) node;
+ QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
+ QLIST_ENTRY(AioHandler) node_deleted;
};
+/* Add a handler to a ready list */
+static void add_ready_handler(AioHandlerList *ready_list,
+ AioHandler *node,
+ int revents)
+{
+ QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
+ node->pfd.revents = revents;
+ QLIST_INSERT_HEAD(ready_list, node, node_ready);
+}
+
#ifdef CONFIG_EPOLL_CREATE1
/* The fd number threshold to switch to epoll */
@@ -67,7 +79,7 @@ static bool aio_epoll_try_enable(AioContext *ctx)
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
int r;
- if (node->deleted || !node->pfd.events) {
+ if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
continue;
}
event.events = epoll_events_from_pfd(node->pfd.events);
@@ -104,17 +116,22 @@ static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
}
}
-static int aio_epoll(AioContext *ctx, GPollFD *pfds,
- unsigned npfd, int64_t timeout)
+static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
{
+ GPollFD pfd = {
+ .fd = ctx->epollfd,
+ .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
+ };
AioHandler *node;
int i, ret = 0;
struct epoll_event events[128];
- assert(npfd == 1);
- assert(pfds[0].fd == ctx->epollfd);
if (timeout > 0) {
- ret = qemu_poll_ns(pfds, npfd, timeout);
+ ret = qemu_poll_ns(&pfd, 1, timeout);
+ if (ret > 0) {
+ timeout = 0;
+ }
}
if (timeout <= 0 || ret > 0) {
ret = epoll_wait(ctx->epollfd, events,
@@ -125,11 +142,13 @@ static int aio_epoll(AioContext *ctx, GPollFD *pfds,
}
for (i = 0; i < ret; i++) {
int ev = events[i].events;
+ int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+ (ev & EPOLLOUT ? G_IO_OUT : 0) |
+ (ev & EPOLLHUP ? G_IO_HUP : 0) |
+ (ev & EPOLLERR ? G_IO_ERR : 0);
+
node = events[i].data.ptr;
- node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
- (ev & EPOLLOUT ? G_IO_OUT : 0) |
- (ev & EPOLLHUP ? G_IO_HUP : 0) |
- (ev & EPOLLERR ? G_IO_ERR : 0);
+ add_ready_handler(ready_list, node, revents);
}
}
out:
@@ -167,8 +186,8 @@ static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
}
-static int aio_epoll(AioContext *ctx, GPollFD *pfds,
- unsigned npfd, int64_t timeout)
+static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
{
assert(false);
}
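
The reworked aio_epoll() above waits in two steps: given a positive timeout it first polls the epoll fd itself (an epoll fd becomes readable exactly when events are pending), then drains those events with a non-blocking epoll_wait(). A minimal standalone sketch of that control flow, using plain poll() as a stand-in for qemu_poll_ns() and illustrative names throughout:

#include <poll.h>
#include <sys/epoll.h>

static int wait_then_drain(int epollfd, struct epoll_event *events,
                           int maxevents, int timeout_ms)
{
    struct pollfd pfd = { .fd = epollfd, .events = POLLIN };
    int ret = 0;

    if (timeout_ms > 0) {
        ret = poll(&pfd, 1, timeout_ms);
        if (ret > 0) {
            timeout_ms = 0;   /* events are pending; don't block again below */
        }
    }
    if (timeout_ms <= 0 || ret > 0) {
        /* -1 blocks, 0 only drains, mirroring the aio_epoll() flow above */
        ret = epoll_wait(epollfd, events, maxevents, timeout_ms);
    }
    return ret;
}
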
@@ -191,9 +210,11 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
AioHandler *node;
QLIST_FOREACH(node, &ctx->aio_handlers, node) {
- if (node->pfd.fd == fd)
- if (!node->deleted)
+ if (node->pfd.fd == fd) {
+ if (!QLIST_IS_INSERTED(node, node_deleted)) {
return node;
+ }
+ }
}
return NULL;
@@ -212,7 +233,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
/* If a read is in progress, just mark the node as deleted */
if (qemu_lockcnt_count(&ctx->list_lock)) {
- node->deleted = 1;
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
node->pfd.revents = 0;
return false;
}
@@ -354,7 +375,7 @@ static void poll_set_started(AioContext *ctx, bool started)
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
IOHandler *fn;
- if (node->deleted) {
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
continue;
}
@@ -411,43 +432,82 @@ bool aio_pending(AioContext *ctx)
return result;
}
-static bool aio_dispatch_handlers(AioContext *ctx)
+static void aio_free_deleted_handlers(AioContext *ctx)
{
- AioHandler *node, *tmp;
- bool progress = false;
+ AioHandler *node;
- QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
- int revents;
+ if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
+ return;
+ }
+ if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
+ return; /* we are nested, let the parent do the freeing */
+ }
- revents = node->pfd.revents & node->pfd.events;
- node->pfd.revents = 0;
+ while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
+ QLIST_REMOVE(node, node);
+ QLIST_REMOVE(node, node_deleted);
+ g_free(node);
+ }
- if (!node->deleted &&
- (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
- aio_node_check(ctx, node->is_external) &&
- node->io_read) {
- node->io_read(node->opaque);
+ qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
+}
- /* aio_notify() does not count as progress */
- if (node->opaque != &ctx->notifier) {
- progress = true;
- }
- }
- if (!node->deleted &&
- (revents & (G_IO_OUT | G_IO_ERR)) &&
- aio_node_check(ctx, node->is_external) &&
- node->io_write) {
- node->io_write(node->opaque);
+static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
+{
+ bool progress = false;
+ int revents;
+
+ revents = node->pfd.revents & node->pfd.events;
+ node->pfd.revents = 0;
+
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_read) {
+ node->io_read(node->opaque);
+
+ /* aio_notify() does not count as progress */
+ if (node->opaque != &ctx->notifier) {
progress = true;
}
+ }
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_OUT | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_write) {
+ node->io_write(node->opaque);
+ progress = true;
+ }
- if (node->deleted) {
- if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
- QLIST_REMOVE(node, node);
- g_free(node);
- qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
- }
- }
+ return progress;
+}
+
+/*
+ * If we have a list of ready handlers then this is more efficient than
+ * scanning all handlers with aio_dispatch_handlers().
+ */
+static bool aio_dispatch_ready_handlers(AioContext *ctx,
+ AioHandlerList *ready_list)
+{
+ bool progress = false;
+ AioHandler *node;
+
+ while ((node = QLIST_FIRST(ready_list))) {
+ QLIST_SAFE_REMOVE(node, node_ready);
+ progress = aio_dispatch_handler(ctx, node) || progress;
+ }
+
+ return progress;
+}
+
+/* Slower than aio_dispatch_ready_handlers() but only used via glib */
+static bool aio_dispatch_handlers(AioContext *ctx)
+{
+ AioHandler *node, *tmp;
+ bool progress = false;
+
+ QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
+ progress = aio_dispatch_handler(ctx, node) || progress;
}
return progress;
@@ -458,6 +518,7 @@ void aio_dispatch(AioContext *ctx)
qemu_lockcnt_inc(&ctx->list_lock);
aio_bh_poll(ctx);
aio_dispatch_handlers(ctx);
+ aio_free_deleted_handlers(ctx);
qemu_lockcnt_dec(&ctx->list_lock);
timerlistgroup_run_timers(&ctx->tlg);
@@ -514,8 +575,18 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
bool progress = false;
AioHandler *node;
+ /*
+ * Optimization: ->io_poll() handlers often contain RCU read critical
+ * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+ * -> rcu_read_lock() -> ... sequences with expensive memory
+ * synchronization primitives. Make the entire polling loop an RCU
+ * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+ * are cheap.
+ */
+ RCU_READ_LOCK_GUARD();
+
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
- if (!node->deleted && node->io_poll &&
+ if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
aio_node_check(ctx, node->is_external) &&
node->io_poll(node->opaque)) {
/*
@@ -609,6 +680,7 @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
bool aio_poll(AioContext *ctx, bool blocking)
{
+ AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
AioHandler *node;
int i;
int ret = 0;
@@ -649,7 +721,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
if (!aio_epoll_enabled(ctx)) {
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
- if (!node->deleted && node->pfd.events
+ if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
&& aio_node_check(ctx, node->is_external)) {
add_pollfd(node);
}
@@ -658,13 +730,8 @@ bool aio_poll(AioContext *ctx, bool blocking)
/* wait until next event */
if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
- AioHandler epoll_handler;
-
- epoll_handler.pfd.fd = ctx->epollfd;
- epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
- npfd = 0;
- add_pollfd(&epoll_handler);
- ret = aio_epoll(ctx, pollfds, npfd, timeout);
+ npfd = 0; /* pollfds[] is not being used */
+ ret = aio_epoll(ctx, &ready_list, timeout);
} else {
ret = qemu_poll_ns(pollfds, npfd, timeout);
}
@@ -719,7 +786,11 @@ bool aio_poll(AioContext *ctx, bool blocking)
/* if we have any readable fds, dispatch event */
if (ret > 0) {
for (i = 0; i < npfd; i++) {
- nodes[i]->pfd.revents = pollfds[i].revents;
+ int revents = pollfds[i].revents;
+
+ if (revents) {
+ add_ready_handler(&ready_list, nodes[i], revents);
+ }
}
}
@@ -728,9 +799,11 @@ bool aio_poll(AioContext *ctx, bool blocking)
progress |= aio_bh_poll(ctx);
if (ret > 0) {
- progress |= aio_dispatch_handlers(ctx);
+ progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
}
+ aio_free_deleted_handlers(ctx);
+
qemu_lockcnt_dec(&ctx->list_lock);
progress |= timerlistgroup_run_timers(&ctx->tlg);
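
For reference, a minimal standalone sketch of the ready-list idiom this patch introduces: the poller moves fired handlers onto a local list, and dispatch pops that list instead of rescanning every registered handler. All names below are illustrative stand-ins for the QLIST-based QEMU code; the on_ready_list flag plays the role of QLIST_IS_INSERTED()/QLIST_SAFE_REMOVE():

#include <stdio.h>
#include <sys/queue.h>

typedef struct Handler {
    int fd;
    int revents;
    LIST_ENTRY(Handler) node_ready;
    int on_ready_list;              /* stand-in for QLIST_IS_INSERTED() */
} Handler;

typedef LIST_HEAD(, Handler) HandlerList;

/* Analogous to add_ready_handler(): safe to call on an already-queued node */
static void add_ready(HandlerList *ready, Handler *h, int revents)
{
    if (h->on_ready_list) {         /* like QLIST_SAFE_REMOVE() */
        LIST_REMOVE(h, node_ready);
    }
    h->revents = revents;
    LIST_INSERT_HEAD(ready, h, node_ready);
    h->on_ready_list = 1;
}

/* Analogous to aio_dispatch_ready_handlers(): O(ready), not O(all handlers) */
static void dispatch_ready(HandlerList *ready)
{
    Handler *h;

    while ((h = LIST_FIRST(ready)) != NULL) {
        LIST_REMOVE(h, node_ready);
        h->on_ready_list = 0;
        printf("fd %d ready, revents 0x%x\n", h->fd, h->revents);
    }
}

int main(void)
{
    HandlerList ready = LIST_HEAD_INITIALIZER(ready);
    Handler a = { .fd = 3 }, b = { .fd = 4 };

    add_ready(&ready, &a, 1);
    add_ready(&ready, &b, 1);
    add_ready(&ready, &a, 3);       /* re-adding only updates revents */
    dispatch_ready(&ready);
    return 0;
}
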
diff --git a/util/async.c b/util/async.c
index c192a24..b94518b 100644
--- a/util/async.c
+++ b/util/async.c
@@ -29,6 +29,7 @@
#include "block/thread-pool.h"
#include "qemu/main-loop.h"
#include "qemu/atomic.h"
+#include "qemu/rcu_queue.h"
#include "block/raw-aio.h"
#include "qemu/coroutine_int.h"
#include "trace.h"
@@ -36,16 +37,76 @@
/***********************************************************/
/* bottom halves (can be seen as timers which expire ASAP) */
+/* QEMUBH::flags values */
+enum {
+ /* Already enqueued and waiting for aio_bh_poll() */
+ BH_PENDING = (1 << 0),
+
+ /* Invoke the callback */
+ BH_SCHEDULED = (1 << 1),
+
+ /* Delete without invoking callback */
+ BH_DELETED = (1 << 2),
+
+ /* Delete after invoking callback */
+ BH_ONESHOT = (1 << 3),
+
+ /* Schedule periodically when the event loop is idle */
+ BH_IDLE = (1 << 4),
+};
+
struct QEMUBH {
AioContext *ctx;
QEMUBHFunc *cb;
void *opaque;
- QEMUBH *next;
- bool scheduled;
- bool idle;
- bool deleted;
+ QSLIST_ENTRY(QEMUBH) next;
+ unsigned flags;
};
+/* Called concurrently from any thread */
+static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags)
+{
+ AioContext *ctx = bh->ctx;
+ unsigned old_flags;
+
+ /*
+ * The memory barrier implicit in atomic_fetch_or makes sure that:
+ * 1. idle & any writes needed by the callback are done before the
+ * locations are read in the aio_bh_poll.
+ * 2. ctx is loaded before the callback has a chance to execute and bh
+ * could be freed.
+ */
+ old_flags = atomic_fetch_or(&bh->flags, BH_PENDING | new_flags);
+ if (!(old_flags & BH_PENDING)) {
+ QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next);
+ }
+
+ aio_notify(ctx);
+}
+
+/* Only called from aio_bh_poll() and aio_ctx_finalize() */
+static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags)
+{
+ QEMUBH *bh = QSLIST_FIRST_RCU(head);
+
+ if (!bh) {
+ return NULL;
+ }
+
+ QSLIST_REMOVE_HEAD(head, next);
+
+ /*
+ * The atomic_and is paired with aio_bh_enqueue(). The implicit memory
+ * barrier ensures that the callback sees all writes done by the scheduling
+ * thread. It also ensures that the scheduling thread sees the cleared
+ * flag before bh->cb has run, and thus will call aio_notify again if
+ * necessary.
+ */
+ *flags = atomic_fetch_and(&bh->flags,
+ ~(BH_PENDING | BH_SCHEDULED | BH_IDLE));
+ return bh;
+}
+
void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
{
QEMUBH *bh;
@@ -55,15 +116,7 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
.cb = cb,
.opaque = opaque,
};
- qemu_lockcnt_lock(&ctx->list_lock);
- bh->next = ctx->first_bh;
- bh->scheduled = 1;
- bh->deleted = 1;
- /* Make sure that the members are ready before putting bh into list */
- smp_wmb();
- ctx->first_bh = bh;
- qemu_lockcnt_unlock(&ctx->list_lock);
- aio_notify(ctx);
+ aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT);
}
QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
@@ -75,12 +128,6 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
.cb = cb,
.opaque = opaque,
};
- qemu_lockcnt_lock(&ctx->list_lock);
- bh->next = ctx->first_bh;
- /* Make sure that the members are ready before putting bh into list */
- smp_wmb();
- ctx->first_bh = bh;
- qemu_lockcnt_unlock(&ctx->list_lock);
return bh;
}
@@ -89,91 +136,56 @@ void aio_bh_call(QEMUBH *bh)
bh->cb(bh->opaque);
}
-/* Multiple occurrences of aio_bh_poll cannot be called concurrently.
- * The count in ctx->list_lock is incremented before the call, and is
- * not affected by the call.
- */
+/* Multiple occurrences of aio_bh_poll cannot be called concurrently. */
int aio_bh_poll(AioContext *ctx)
{
- QEMUBH *bh, **bhp, *next;
- int ret;
- bool deleted = false;
-
- ret = 0;
- for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) {
- next = atomic_rcu_read(&bh->next);
- /* The atomic_xchg is paired with the one in qemu_bh_schedule. The
- * implicit memory barrier ensures that the callback sees all writes
- * done by the scheduling thread. It also ensures that the scheduling
- * thread sees the zero before bh->cb has run, and thus will call
- * aio_notify again if necessary.
- */
- if (atomic_xchg(&bh->scheduled, 0)) {
+ BHListSlice slice;
+ BHListSlice *s;
+ int ret = 0;
+
+ QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list);
+ QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next);
+
+ while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) {
+ QEMUBH *bh;
+ unsigned flags;
+
+ bh = aio_bh_dequeue(&s->bh_list, &flags);
+ if (!bh) {
+ QSIMPLEQ_REMOVE_HEAD(&ctx->bh_slice_list, next);
+ continue;
+ }
+
+ if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
/* Idle BHs don't count as progress */
- if (!bh->idle) {
+ if (!(flags & BH_IDLE)) {
ret = 1;
}
- bh->idle = 0;
aio_bh_call(bh);
}
- if (bh->deleted) {
- deleted = true;
+ if (flags & (BH_DELETED | BH_ONESHOT)) {
+ g_free(bh);
}
}
- /* remove deleted bhs */
- if (!deleted) {
- return ret;
- }
-
- if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
- bhp = &ctx->first_bh;
- while (*bhp) {
- bh = *bhp;
- if (bh->deleted && !bh->scheduled) {
- *bhp = bh->next;
- g_free(bh);
- } else {
- bhp = &bh->next;
- }
- }
- qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
- }
return ret;
}
void qemu_bh_schedule_idle(QEMUBH *bh)
{
- bh->idle = 1;
- /* Make sure that idle & any writes needed by the callback are done
- * before the locations are read in the aio_bh_poll.
- */
- atomic_mb_set(&bh->scheduled, 1);
+ aio_bh_enqueue(bh, BH_SCHEDULED | BH_IDLE);
}
void qemu_bh_schedule(QEMUBH *bh)
{
- AioContext *ctx;
-
- ctx = bh->ctx;
- bh->idle = 0;
- /* The memory barrier implicit in atomic_xchg makes sure that:
- * 1. idle & any writes needed by the callback are done before the
- * locations are read in the aio_bh_poll.
- * 2. ctx is loaded before scheduled is set and the callback has a chance
- * to execute.
- */
- if (atomic_xchg(&bh->scheduled, 1) == 0) {
- aio_notify(ctx);
- }
+ aio_bh_enqueue(bh, BH_SCHEDULED);
}
-
/* This func is async.
*/
void qemu_bh_cancel(QEMUBH *bh)
{
- atomic_mb_set(&bh->scheduled, 0);
+ atomic_and(&bh->flags, ~BH_SCHEDULED);
}
/* This func is async. The bottom half will do the delete action at the final
@@ -181,21 +193,16 @@ void qemu_bh_cancel(QEMUBH *bh)
*/
void qemu_bh_delete(QEMUBH *bh)
{
- bh->scheduled = 0;
- bh->deleted = 1;
+ aio_bh_enqueue(bh, BH_DELETED);
}
-int64_t
-aio_compute_timeout(AioContext *ctx)
+static int64_t aio_compute_bh_timeout(BHList *head, int timeout)
{
- int64_t deadline;
- int timeout = -1;
QEMUBH *bh;
- for (bh = atomic_rcu_read(&ctx->first_bh); bh;
- bh = atomic_rcu_read(&bh->next)) {
- if (bh->scheduled) {
- if (bh->idle) {
+ QSLIST_FOREACH_RCU(bh, head, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ if (bh->flags & BH_IDLE) {
/* idle bottom halves will be polled at least
* every 10ms */
timeout = 10000000;
@@ -207,6 +214,28 @@ aio_compute_timeout(AioContext *ctx)
}
}
+ return timeout;
+}
+
+int64_t
+aio_compute_timeout(AioContext *ctx)
+{
+ BHListSlice *s;
+ int64_t deadline;
+ int timeout = -1;
+
+ timeout = aio_compute_bh_timeout(&ctx->bh_list, timeout);
+ if (timeout == 0) {
+ return 0;
+ }
+
+ QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
+ timeout = aio_compute_bh_timeout(&s->bh_list, timeout);
+ if (timeout == 0) {
+ return 0;
+ }
+ }
+
deadline = timerlistgroup_deadline_ns(&ctx->tlg);
if (deadline == 0) {
return 0;
@@ -237,15 +266,24 @@ aio_ctx_check(GSource *source)
{
AioContext *ctx = (AioContext *) source;
QEMUBH *bh;
+ BHListSlice *s;
atomic_and(&ctx->notify_me, ~1);
aio_notify_accept(ctx);
- for (bh = ctx->first_bh; bh; bh = bh->next) {
- if (bh->scheduled) {
+ QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
return true;
}
}
+
+ QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
+ QSLIST_FOREACH_RCU(bh, &s->bh_list, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ return true;
+ }
+ }
+ }
return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0);
}
@@ -265,6 +303,8 @@ static void
aio_ctx_finalize(GSource *source)
{
AioContext *ctx = (AioContext *) source;
+ QEMUBH *bh;
+ unsigned flags;
thread_pool_free(ctx->thread_pool);
@@ -287,18 +327,15 @@ aio_ctx_finalize(GSource *source)
assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
qemu_bh_delete(ctx->co_schedule_bh);
- qemu_lockcnt_lock(&ctx->list_lock);
- assert(!qemu_lockcnt_count(&ctx->list_lock));
- while (ctx->first_bh) {
- QEMUBH *next = ctx->first_bh->next;
+ /* There must be no aio_bh_poll() calls going on */
+ assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list));
+ while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) {
/* qemu_bh_delete() must have been called on BHs in this AioContext */
- assert(ctx->first_bh->deleted);
+ assert(flags & BH_DELETED);
- g_free(ctx->first_bh);
- ctx->first_bh = next;
+ g_free(bh);
}
- qemu_lockcnt_unlock(&ctx->list_lock);
aio_set_event_notifier(ctx, &ctx->notifier, false, NULL, NULL);
event_notifier_cleanup(&ctx->notifier);
@@ -445,6 +482,8 @@ AioContext *aio_context_new(Error **errp)
AioContext *ctx;
ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+ QSLIST_INIT(&ctx->bh_list);
+ QSIMPLEQ_INIT(&ctx->bh_slice_list);
aio_context_setup(ctx);
ret = event_notifier_init(&ctx->notifier, false);
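
The bottom-half rework above hinges on a single atomic read-modify-write per enqueue: whichever thread flips BH_PENDING from 0 to 1 wins the right to insert the BH, so concurrent schedulers can never double-insert it. A compact sketch of that pattern with C11 atomics (a toy model under assumed names, not the QEMU types):

#include <stdatomic.h>
#include <stdbool.h>

enum { PENDING = 1u << 0, SCHEDULED = 1u << 1 };

typedef struct BH {
    _Atomic unsigned flags;
} BH;

/* Returns true only for the one caller that observed PENDING go 0 -> 1 */
static bool bh_mark_pending(BH *bh, unsigned new_flags)
{
    unsigned old = atomic_fetch_or(&bh->flags, PENDING | new_flags);
    return !(old & PENDING);
}

/* Dequeue side: take a snapshot and clear, as aio_bh_dequeue() does */
static unsigned bh_take_flags(BH *bh)
{
    return atomic_fetch_and(&bh->flags, ~(PENDING | SCHEDULED));
}

The caller that gets true back from bh_mark_pending() performs the single list insertion and the aio_notify(); every other concurrent scheduler merely ORs its flags into an already-pending BH.
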
diff --git a/util/log.c b/util/log.c
index 47f2827..2da6cb3 100644
--- a/util/log.c
+++ b/util/log.c
@@ -332,6 +332,8 @@ const QEMULogItem qemu_log_items[] = {
#ifdef CONFIG_PLUGIN
{ CPU_LOG_PLUGIN, "plugin", "output from TCG plugins\n"},
#endif
+ { LOG_STRACE, "strace",
+ "log every user-mode syscall, its input, and its result" },
{ 0, NULL, NULL },
};
diff --git a/util/module.c b/util/module.c
index 8c5315a..236a7bb 100644
--- a/util/module.c
+++ b/util/module.c
@@ -30,6 +30,7 @@ typedef struct ModuleEntry
typedef QTAILQ_HEAD(, ModuleEntry) ModuleTypeList;
static ModuleTypeList init_type_list[MODULE_INIT_MAX];
+static bool modules_init_done[MODULE_INIT_MAX];
static ModuleTypeList dso_init_list;
@@ -91,11 +92,17 @@ void module_call_init(module_init_type type)
ModuleTypeList *l;
ModuleEntry *e;
+ if (modules_init_done[type]) {
+ return;
+ }
+
l = find_type(type);
QTAILQ_FOREACH(e, l, node) {
e->init();
}
+
+ modules_init_done[type] = true;
}
#ifdef CONFIG_MODULES
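
The module.c change is a run-once guard: module_call_init() remembers which init types already ran and returns early on repeat calls. A toy illustration of the pattern (illustrative names, not the QEMU API):

#include <stdbool.h>
#include <stdio.h>

enum { INIT_QOM, INIT_MAX };
static bool init_done[INIT_MAX];

static void call_init(int type)
{
    if (init_done[type]) {
        return;                 /* constructors for this type already ran */
    }
    printf("running type-%d constructors\n", type);
    init_done[type] = true;
}

int main(void)
{
    call_init(INIT_QOM);
    call_init(INIT_QOM);        /* second call is a no-op */
    return 0;
}
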
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 0000000..5cc768c
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,29 @@
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+ GSList **list = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+ *list = g_slist_append(*list, DEVICE(obj));
+ }
+
+ object_child_foreach(obj, nvdimm_device_list, opaque);
+ return 0;
+}
+
+/*
+ * Enumerate NVDIMM devices and link them into the list that is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * a memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+ GSList *list = NULL;
+
+ object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+ return list;
+}
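
A hedged usage sketch for the new helper, assuming a QEMU build environment for the headers below: the caller owns only the list container, so g_slist_free() releases the links without touching the devices themselves:

#include "qemu/osdep.h"
#include "qemu/nvdimm-utils.h"

static unsigned count_nvdimms(void)
{
    GSList *list = nvdimm_get_device_list();
    unsigned n = g_slist_length(list);

    g_slist_free(list);     /* frees the links, not the DEVICE()s */
    return n;
}
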
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 5a291cc..897e8f3 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -76,6 +76,10 @@ static MemsetThread *memset_thread;
static int memset_num_threads;
static bool memset_thread_failed;
+static QemuMutex page_mutex;
+static QemuCond page_cond;
+static bool threads_created_flag;
+
int qemu_get_thread_id(void)
{
#if defined(__linux__)
@@ -403,6 +407,17 @@ static void *do_touch_pages(void *arg)
MemsetThread *memset_args = (MemsetThread *)arg;
sigset_t set, oldset;
+ /*
+ * On Linux, the page faults from the loop below can cause mmap_sem
+ * contention with allocation of the thread stacks. Do not start
+ * clearing until all threads have been created.
+ */
+ qemu_mutex_lock(&page_mutex);
+ while (!threads_created_flag) {
+ qemu_cond_wait(&page_cond, &page_mutex);
+ }
+ qemu_mutex_unlock(&page_mutex);
+
/* unblock SIGBUS */
sigemptyset(&set);
sigaddset(&set, SIGBUS);
@@ -451,27 +466,28 @@ static inline int get_memset_num_threads(int smp_cpus)
static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
int smp_cpus)
{
- size_t numpages_per_thread;
- size_t size_per_thread;
+ size_t numpages_per_thread, leftover;
char *addr = area;
int i = 0;
memset_thread_failed = false;
+ threads_created_flag = false;
memset_num_threads = get_memset_num_threads(smp_cpus);
memset_thread = g_new0(MemsetThread, memset_num_threads);
- numpages_per_thread = (numpages / memset_num_threads);
- size_per_thread = (hpagesize * numpages_per_thread);
+ numpages_per_thread = numpages / memset_num_threads;
+ leftover = numpages % memset_num_threads;
for (i = 0; i < memset_num_threads; i++) {
memset_thread[i].addr = addr;
- memset_thread[i].numpages = (i == (memset_num_threads - 1)) ?
- numpages : numpages_per_thread;
+ memset_thread[i].numpages = numpages_per_thread + (i < leftover);
memset_thread[i].hpagesize = hpagesize;
qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
do_touch_pages, &memset_thread[i],
QEMU_THREAD_JOINABLE);
- addr += size_per_thread;
- numpages -= numpages_per_thread;
+ addr += memset_thread[i].numpages * hpagesize;
}
+ threads_created_flag = true;
+ qemu_cond_broadcast(&page_cond);
+
for (i = 0; i < memset_num_threads; i++) {
qemu_thread_join(&memset_thread[i].pgthread);
}
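
The new numpages_per_thread + (i < leftover) expression spreads the remainder pages one per thread instead of giving them all to the last thread. A standalone sketch with a worked example, 10 pages over 4 threads yielding 3,3,2,2 rather than the old 2,2,2,4:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
    size_t numpages = 10;
    size_t hpagesize = 2u << 20;                /* 2 MiB huge pages, say */
    int nthreads = 4;
    size_t per_thread = numpages / nthreads;    /* 2 */
    size_t leftover = numpages % nthreads;      /* 2 */
    size_t off = 0;

    for (int i = 0; i < nthreads; i++) {
        /* the first `leftover` threads each take one extra page */
        size_t n = per_thread + (i < leftover);
        printf("thread %d: %zu pages at offset %zu\n", i, n, off);
        off += n * hpagesize;
    }
    return 0;
}
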
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 813f7ec..ddd9a96 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -545,7 +545,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
trace_qemu_vfio_do_mapping(s, host, size, iova);
if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
- error_report("VFIO_MAP_DMA: %d", -errno);
+ error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
return -errno;
}
return 0;
@@ -570,7 +570,7 @@ static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
assert(index >= 0 && index < s->nr_mappings);
if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
- error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
+ error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
}
memmove(mapping, &s->mappings[index + 1],
sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
@@ -669,7 +669,7 @@ int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
trace_qemu_vfio_dma_reset_temporary(s);
qemu_mutex_lock(&s->lock);
if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
- error_report("VFIO_UNMAP_DMA: %d", -errno);
+ error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
qemu_mutex_unlock(&s->lock);
return -errno;
}
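
The vfio-helpers changes replace raw negative errno values in messages with strerror()/error_setg_errno(). A standalone sketch of the same reporting style, including the save-errno-early habit that keeps the value from being clobbered by later calls (plain POSIX, not QEMU's error API):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

static int checked_ioctl(int fd, unsigned long req, void *arg)
{
    if (ioctl(fd, req, arg) < 0) {
        int saved = errno;          /* save before any call can clobber it */
        fprintf(stderr, "ioctl failed: %s\n", strerror(saved));
        return -saved;              /* callers still receive -errno */
    }
    return 0;
}
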