From f25c0b547916962d0b1be260b5b643287bea0851 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 18 Feb 2020 18:27:08 +0000 Subject: aio-posix: avoid reacquiring rcu_read_lock() when polling The first rcu_read_lock/unlock() is expensive. Nested calls are cheap. This optimization increases IOPS from 73k to 162k with a Linux guest that has 2 virtio-blk,num-queues=1 and 99 virtio-blk,num-queues=32 devices. Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Message-id: 20200218182708.914552-1-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- util/aio-posix.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'util') diff --git a/util/aio-posix.c b/util/aio-posix.c index a4977f5..f67f5b3 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -15,6 +15,7 @@ #include "qemu/osdep.h" #include "block/block.h" +#include "qemu/rcu.h" #include "qemu/rcu_queue.h" #include "qemu/sockets.h" #include "qemu/cutils.h" @@ -514,6 +515,16 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) bool progress = false; AioHandler *node; + /* + * Optimization: ->io_poll() handlers often contain RCU read critical + * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock() + * -> rcu_read_lock() -> ... sequences with expensive memory + * synchronization primitives. Make the entire polling loop an RCU + * critical section because nested rcu_read_lock()/rcu_read_unlock() calls + * are cheap. + */ + RCU_READ_LOCK_GUARD(); + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { if (!node->deleted && node->io_poll && aio_node_check(ctx, node->is_external) && -- cgit v1.1 From 8c6b0356b53977bcfdea5299db07884915425b0c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 21 Feb 2020 09:39:51 +0000 Subject: util/async: make bh_aio_poll() O(1) The ctx->first_bh list contains all created BHs, including those that are not scheduled. The list is iterated by the event loop and therefore has O(n) time complexity with respect to the number of created BHs. Rewrite BHs so that only scheduled or deleted BHs are enqueued. Only BHs that actually require action will be iterated. One semantic change is required: qemu_bh_delete() enqueues the BH and therefore invokes aio_notify(). The tests/test-aio.c:test_source_bh_delete_from_cb() test case assumed that g_main_context_iteration(NULL, false) returns false after qemu_bh_delete() but it now returns true for one iteration. Fix up the test case. This patch makes aio_compute_timeout() and aio_bh_poll() drop from a CPU profile reported by perf-top(1). Previously they combined to 9% CPU utilization when AioContext polling is commented out and the guest has 2 virtio-blk,num-queues=1 and 99 virtio-blk,num-queues=32 devices.
Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Message-id: 20200221093951.1414693-1-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- util/async.c | 237 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 138 insertions(+), 99 deletions(-) (limited to 'util') diff --git a/util/async.c b/util/async.c index c192a24..b94518b 100644 --- a/util/async.c +++ b/util/async.c @@ -29,6 +29,7 @@ #include "block/thread-pool.h" #include "qemu/main-loop.h" #include "qemu/atomic.h" +#include "qemu/rcu_queue.h" #include "block/raw-aio.h" #include "qemu/coroutine_int.h" #include "trace.h" @@ -36,16 +37,76 @@ /***********************************************************/ /* bottom halves (can be seen as timers which expire ASAP) */ +/* QEMUBH::flags values */ +enum { + /* Already enqueued and waiting for aio_bh_poll() */ + BH_PENDING = (1 << 0), + + /* Invoke the callback */ + BH_SCHEDULED = (1 << 1), + + /* Delete without invoking callback */ + BH_DELETED = (1 << 2), + + /* Delete after invoking callback */ + BH_ONESHOT = (1 << 3), + + /* Schedule periodically when the event loop is idle */ + BH_IDLE = (1 << 4), +}; + struct QEMUBH { AioContext *ctx; QEMUBHFunc *cb; void *opaque; - QEMUBH *next; - bool scheduled; - bool idle; - bool deleted; + QSLIST_ENTRY(QEMUBH) next; + unsigned flags; }; +/* Called concurrently from any thread */ +static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags) +{ + AioContext *ctx = bh->ctx; + unsigned old_flags; + + /* + * The memory barrier implicit in atomic_fetch_or makes sure that: + * 1. idle & any writes needed by the callback are done before the + * locations are read in the aio_bh_poll. + * 2. ctx is loaded before the callback has a chance to execute and bh + * could be freed. + */ + old_flags = atomic_fetch_or(&bh->flags, BH_PENDING | new_flags); + if (!(old_flags & BH_PENDING)) { + QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next); + } + + aio_notify(ctx); +} + +/* Only called from aio_bh_poll() and aio_ctx_finalize() */ +static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags) +{ + QEMUBH *bh = QSLIST_FIRST_RCU(head); + + if (!bh) { + return NULL; + } + + QSLIST_REMOVE_HEAD(head, next); + + /* + * The atomic_and is paired with aio_bh_enqueue(). The implicit memory + * barrier ensures that the callback sees all writes done by the scheduling + * thread. It also ensures that the scheduling thread sees the cleared + * flag before bh->cb has run, and thus will call aio_notify again if + * necessary. 
+ */ + *flags = atomic_fetch_and(&bh->flags, + ~(BH_PENDING | BH_SCHEDULED | BH_IDLE)); + return bh; +} + void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) { QEMUBH *bh; @@ -55,15 +116,7 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) .cb = cb, .opaque = opaque, }; - qemu_lockcnt_lock(&ctx->list_lock); - bh->next = ctx->first_bh; - bh->scheduled = 1; - bh->deleted = 1; - /* Make sure that the members are ready before putting bh into list */ - smp_wmb(); - ctx->first_bh = bh; - qemu_lockcnt_unlock(&ctx->list_lock); - aio_notify(ctx); + aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT); } QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) @@ -75,12 +128,6 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque) .cb = cb, .opaque = opaque, }; - qemu_lockcnt_lock(&ctx->list_lock); - bh->next = ctx->first_bh; - /* Make sure that the members are ready before putting bh into list */ - smp_wmb(); - ctx->first_bh = bh; - qemu_lockcnt_unlock(&ctx->list_lock); return bh; } @@ -89,91 +136,56 @@ void aio_bh_call(QEMUBH *bh) bh->cb(bh->opaque); } -/* Multiple occurrences of aio_bh_poll cannot be called concurrently. - * The count in ctx->list_lock is incremented before the call, and is - * not affected by the call. - */ +/* Multiple occurrences of aio_bh_poll cannot be called concurrently. */ int aio_bh_poll(AioContext *ctx) { - QEMUBH *bh, **bhp, *next; - int ret; - bool deleted = false; - - ret = 0; - for (bh = atomic_rcu_read(&ctx->first_bh); bh; bh = next) { - next = atomic_rcu_read(&bh->next); - /* The atomic_xchg is paired with the one in qemu_bh_schedule. The - * implicit memory barrier ensures that the callback sees all writes - * done by the scheduling thread. It also ensures that the scheduling - * thread sees the zero before bh->cb has run, and thus will call - * aio_notify again if necessary. - */ - if (atomic_xchg(&bh->scheduled, 0)) { + BHListSlice slice; + BHListSlice *s; + int ret = 0; + + QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list); + QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next); + + while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) { + QEMUBH *bh; + unsigned flags; + + bh = aio_bh_dequeue(&s->bh_list, &flags); + if (!bh) { + QSIMPLEQ_REMOVE_HEAD(&ctx->bh_slice_list, next); + continue; + } + + if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { /* Idle BHs don't count as progress */ - if (!bh->idle) { + if (!(flags & BH_IDLE)) { ret = 1; } - bh->idle = 0; aio_bh_call(bh); } - if (bh->deleted) { - deleted = true; + if (flags & (BH_DELETED | BH_ONESHOT)) { + g_free(bh); } } - /* remove deleted bhs */ - if (!deleted) { - return ret; - } - - if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) { - bhp = &ctx->first_bh; - while (*bhp) { - bh = *bhp; - if (bh->deleted && !bh->scheduled) { - *bhp = bh->next; - g_free(bh); - } else { - bhp = &bh->next; - } - } - qemu_lockcnt_inc_and_unlock(&ctx->list_lock); - } return ret; } void qemu_bh_schedule_idle(QEMUBH *bh) { - bh->idle = 1; - /* Make sure that idle & any writes needed by the callback are done - * before the locations are read in the aio_bh_poll. - */ - atomic_mb_set(&bh->scheduled, 1); + aio_bh_enqueue(bh, BH_SCHEDULED | BH_IDLE); } void qemu_bh_schedule(QEMUBH *bh) { - AioContext *ctx; - - ctx = bh->ctx; - bh->idle = 0; - /* The memory barrier implicit in atomic_xchg makes sure that: - * 1. idle & any writes needed by the callback are done before the - * locations are read in the aio_bh_poll. - * 2. 
ctx is loaded before scheduled is set and the callback has a chance - * to execute. - */ - if (atomic_xchg(&bh->scheduled, 1) == 0) { - aio_notify(ctx); - } + aio_bh_enqueue(bh, BH_SCHEDULED); } - /* This func is async. */ void qemu_bh_cancel(QEMUBH *bh) { - atomic_mb_set(&bh->scheduled, 0); + atomic_and(&bh->flags, ~BH_SCHEDULED); } /* This func is async.The bottom half will do the delete action at the finial @@ -181,21 +193,16 @@ void qemu_bh_cancel(QEMUBH *bh) */ void qemu_bh_delete(QEMUBH *bh) { - bh->scheduled = 0; - bh->deleted = 1; + aio_bh_enqueue(bh, BH_DELETED); } -int64_t -aio_compute_timeout(AioContext *ctx) +static int64_t aio_compute_bh_timeout(BHList *head, int timeout) { - int64_t deadline; - int timeout = -1; QEMUBH *bh; - for (bh = atomic_rcu_read(&ctx->first_bh); bh; - bh = atomic_rcu_read(&bh->next)) { - if (bh->scheduled) { - if (bh->idle) { + QSLIST_FOREACH_RCU(bh, head, next) { + if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { + if (bh->flags & BH_IDLE) { /* idle bottom halves will be polled at least * every 10ms */ timeout = 10000000; @@ -207,6 +214,28 @@ aio_compute_timeout(AioContext *ctx) } } + return timeout; +} + +int64_t +aio_compute_timeout(AioContext *ctx) +{ + BHListSlice *s; + int64_t deadline; + int timeout = -1; + + timeout = aio_compute_bh_timeout(&ctx->bh_list, timeout); + if (timeout == 0) { + return 0; + } + + QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) { + timeout = aio_compute_bh_timeout(&s->bh_list, timeout); + if (timeout == 0) { + return 0; + } + } + deadline = timerlistgroup_deadline_ns(&ctx->tlg); if (deadline == 0) { return 0; @@ -237,15 +266,24 @@ aio_ctx_check(GSource *source) { AioContext *ctx = (AioContext *) source; QEMUBH *bh; + BHListSlice *s; atomic_and(&ctx->notify_me, ~1); aio_notify_accept(ctx); - for (bh = ctx->first_bh; bh; bh = bh->next) { - if (bh->scheduled) { + QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) { + if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { return true; } } + + QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) { + QSLIST_FOREACH_RCU(bh, &s->bh_list, next) { + if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { + return true; + } + } + } return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0); } @@ -265,6 +303,8 @@ static void aio_ctx_finalize(GSource *source) { AioContext *ctx = (AioContext *) source; + QEMUBH *bh; + unsigned flags; thread_pool_free(ctx->thread_pool); @@ -287,18 +327,15 @@ aio_ctx_finalize(GSource *source) assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); - qemu_lockcnt_lock(&ctx->list_lock); - assert(!qemu_lockcnt_count(&ctx->list_lock)); - while (ctx->first_bh) { - QEMUBH *next = ctx->first_bh->next; + /* There must be no aio_bh_poll() calls going on */ + assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list)); + while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) { /* qemu_bh_delete() must have been called on BHs in this AioContext */ - assert(ctx->first_bh->deleted); + assert(flags & BH_DELETED); - g_free(ctx->first_bh); - ctx->first_bh = next; + g_free(bh); } - qemu_lockcnt_unlock(&ctx->list_lock); aio_set_event_notifier(ctx, &ctx->notifier, false, NULL, NULL); event_notifier_cleanup(&ctx->notifier); @@ -445,6 +482,8 @@ AioContext *aio_context_new(Error **errp) AioContext *ctx; ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext)); + QSLIST_INIT(&ctx->bh_list); + QSIMPLEQ_INIT(&ctx->bh_slice_list); aio_context_setup(ctx); ret = event_notifier_init(&ctx->notifier, false); -- cgit v1.1 
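For reference, the enqueue/dequeue discipline introduced by the util/async patch above can be illustrated with a small standalone C11 sketch. This is not the QEMU implementation: the MyBH names are made up, the list is a simple lock-free push instead of QSLIST_INSERT_HEAD_ATOMIC, and aio_notify() is omitted. The key idea is the same, though: scheduling sets a PENDING flag with an atomic fetch-or and only the thread that saw the flag clear links the element, so a BH is enqueued at most once; the consumer detaches the whole list, reads the link, and only then clears the flags so a concurrent re-schedule starts a fresh enqueue.

#include <stdatomic.h>
#include <stddef.h>

enum { MYBH_PENDING = 1u << 0, MYBH_SCHEDULED = 1u << 1 };

typedef struct MyBH {
    struct MyBH *next;
    _Atomic unsigned flags;
    void (*cb)(void *opaque);
    void *opaque;
} MyBH;

/* Producer side: safe to call concurrently from any thread. */
static void mybh_schedule(MyBH *bh, _Atomic(MyBH *) *list_head)
{
    unsigned old = atomic_fetch_or(&bh->flags, MYBH_PENDING | MYBH_SCHEDULED);

    if (!(old & MYBH_PENDING)) {
        /* Only the first scheduler links the BH: lock-free push onto the head. */
        MyBH *head = atomic_load(list_head);
        do {
            bh->next = head;
        } while (!atomic_compare_exchange_weak(list_head, &head, bh));
    }
    /* A real event loop would also kick itself here (aio_notify()). */
}

/* Consumer side: a single event-loop thread drains the list. */
static void mybh_poll(_Atomic(MyBH *) *list_head)
{
    /* Detach the current slice so concurrent producers start a new one. */
    MyBH *bh = atomic_exchange(list_head, NULL);

    while (bh) {
        /* Read the link before clearing PENDING; afterwards another thread
         * may re-enqueue this BH and overwrite bh->next. */
        MyBH *next = bh->next;
        unsigned flags = atomic_fetch_and(&bh->flags,
                                          ~(MYBH_PENDING | MYBH_SCHEDULED));
        if (flags & MYBH_SCHEDULED) {
            bh->cb(bh->opaque);
        }
        bh = next;
    }
}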
From ff29ed3a331d0cd26bcd30f7cd6c0c96c7d44eed Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 14 Feb 2020 17:17:08 +0000 Subject: aio-posix: fix use after leaving scope in aio_poll() epoll_handler is a stack variable and must not be accessed after it goes out of scope: if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) { AioHandler epoll_handler; ... add_pollfd(&epoll_handler); ret = aio_epoll(ctx, pollfds, npfd, timeout); } ... ... /* if we have any readable fds, dispatch event */ if (ret > 0) { for (i = 0; i < npfd; i++) { nodes[i]->pfd.revents = pollfds[i].revents; } } nodes[0] is &epoll_handler, which has already gone out of scope. There is no need to use pollfds[] for epoll. We don't need an AioHandler for the epoll fd. Signed-off-by: Stefan Hajnoczi Reviewed-by: Sergio Lopez Message-id: 20200214171712.541358-2-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- util/aio-posix.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'util') diff --git a/util/aio-posix.c b/util/aio-posix.c index f67f5b3..c964627 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -105,17 +105,18 @@ static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) } } -static int aio_epoll(AioContext *ctx, GPollFD *pfds, - unsigned npfd, int64_t timeout) +static int aio_epoll(AioContext *ctx, int64_t timeout) { + GPollFD pfd = { + .fd = ctx->epollfd, + .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR, + }; AioHandler *node; int i, ret = 0; struct epoll_event events[128]; - assert(npfd == 1); - assert(pfds[0].fd == ctx->epollfd); if (timeout > 0) { - ret = qemu_poll_ns(pfds, npfd, timeout); + ret = qemu_poll_ns(&pfd, 1, timeout); } if (timeout <= 0 || ret > 0) { ret = epoll_wait(ctx->epollfd, events, @@ -669,13 +670,8 @@ bool aio_poll(AioContext *ctx, bool blocking) /* wait until next event */ if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) { - AioHandler epoll_handler; - - epoll_handler.pfd.fd = ctx->epollfd; - epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR; - npfd = 0; - add_pollfd(&epoll_handler); - ret = aio_epoll(ctx, pollfds, npfd, timeout); + npfd = 0; /* pollfds[] is not being used */ + ret = aio_epoll(ctx, timeout); } else { ret = qemu_poll_ns(pollfds, npfd, timeout); } -- cgit v1.1 From ca8c6b22754b0f17818b1d1910d31f0aa1a49cc7 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 14 Feb 2020 17:17:09 +0000 Subject: aio-posix: don't pass ns timeout to epoll_wait() Don't pass the nanosecond timeout into epoll_wait(), which expects milliseconds. The epoll_wait() timeout value does not matter if qemu_poll_ns() determined that the poll fd is ready, but passing a value in the wrong units is still ugly. Pass a 0 timeout to epoll_wait() instead. 
Signed-off-by: Stefan Hajnoczi Reviewed-by: Sergio Lopez Message-id: 20200214171712.541358-3-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- util/aio-posix.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'util') diff --git a/util/aio-posix.c b/util/aio-posix.c index c964627..58765e5 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -117,6 +117,9 @@ static int aio_epoll(AioContext *ctx, int64_t timeout) if (timeout > 0) { ret = qemu_poll_ns(&pfd, 1, timeout); + if (ret > 0) { + timeout = 0; + } } if (timeout <= 0 || ret > 0) { ret = epoll_wait(ctx->epollfd, events, -- cgit v1.1 From 4749079ce033a94784cbe20a661abeac598ff057 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 14 Feb 2020 17:17:11 +0000 Subject: aio-posix: make AioHandler deletion O(1) It is not necessary to scan all AioHandlers for deletion. Keep a list of deleted handlers instead of scanning the full list of all handlers. The AioHandler->deleted field can be dropped. Let's check if the handler has been inserted into the deleted list instead. Add a new QLIST_IS_INSERTED() API for this check. Signed-off-by: Stefan Hajnoczi Reviewed-by: Sergio Lopez Message-id: 20200214171712.541358-5-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- util/aio-posix.c | 53 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'util') diff --git a/util/aio-posix.c b/util/aio-posix.c index 58765e5..b5cfdbd 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -32,10 +32,10 @@ struct AioHandler AioPollFn *io_poll; IOHandler *io_poll_begin; IOHandler *io_poll_end; - int deleted; void *opaque; bool is_external; QLIST_ENTRY(AioHandler) node; + QLIST_ENTRY(AioHandler) node_deleted; }; #ifdef CONFIG_EPOLL_CREATE1 @@ -68,7 +68,7 @@ static bool aio_epoll_try_enable(AioContext *ctx) QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { int r; - if (node->deleted || !node->pfd.events) { + if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) { continue; } event.events = epoll_events_from_pfd(node->pfd.events); @@ -196,9 +196,11 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd) AioHandler *node; QLIST_FOREACH(node, &ctx->aio_handlers, node) { - if (node->pfd.fd == fd) - if (!node->deleted) + if (node->pfd.fd == fd) { + if (!QLIST_IS_INSERTED(node, node_deleted)) { return node; + } + } } return NULL; @@ -217,7 +219,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) /* If a read is in progress, just mark the node as deleted */ if (qemu_lockcnt_count(&ctx->list_lock)) { - node->deleted = 1; + QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); node->pfd.revents = 0; return false; } @@ -359,7 +361,7 @@ static void poll_set_started(AioContext *ctx, bool started) QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { IOHandler *fn; - if (node->deleted) { + if (QLIST_IS_INSERTED(node, node_deleted)) { continue; } @@ -416,6 +418,26 @@ bool aio_pending(AioContext *ctx) return result; } +static void aio_free_deleted_handlers(AioContext *ctx) +{ + AioHandler *node; + + if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) { + return; + } + if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) { + return; /* we are nested, let the parent do the freeing */ + } + + while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) { + QLIST_REMOVE(node, node); + QLIST_REMOVE(node, node_deleted); + g_free(node); + } + + qemu_lockcnt_inc_and_unlock(&ctx->list_lock); +} + static bool aio_dispatch_handlers(AioContext *ctx) { AioHandler 
*node, *tmp; @@ -427,7 +449,7 @@ static bool aio_dispatch_handlers(AioContext *ctx) revents = node->pfd.revents & node->pfd.events; node->pfd.revents = 0; - if (!node->deleted && + if (!QLIST_IS_INSERTED(node, node_deleted) && (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && aio_node_check(ctx, node->is_external) && node->io_read) { @@ -438,21 +460,13 @@ static bool aio_dispatch_handlers(AioContext *ctx) progress = true; } } - if (!node->deleted && + if (!QLIST_IS_INSERTED(node, node_deleted) && (revents & (G_IO_OUT | G_IO_ERR)) && aio_node_check(ctx, node->is_external) && node->io_write) { node->io_write(node->opaque); progress = true; } - - if (node->deleted) { - if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) { - QLIST_REMOVE(node, node); - g_free(node); - qemu_lockcnt_inc_and_unlock(&ctx->list_lock); - } - } } return progress; @@ -463,6 +477,7 @@ void aio_dispatch(AioContext *ctx) qemu_lockcnt_inc(&ctx->list_lock); aio_bh_poll(ctx); aio_dispatch_handlers(ctx); + aio_free_deleted_handlers(ctx); qemu_lockcnt_dec(&ctx->list_lock); timerlistgroup_run_timers(&ctx->tlg); @@ -530,7 +545,7 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout) RCU_READ_LOCK_GUARD(); QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { - if (!node->deleted && node->io_poll && + if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll && aio_node_check(ctx, node->is_external) && node->io_poll(node->opaque)) { /* @@ -664,7 +679,7 @@ bool aio_poll(AioContext *ctx, bool blocking) if (!aio_epoll_enabled(ctx)) { QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { - if (!node->deleted && node->pfd.events + if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events && aio_node_check(ctx, node->is_external)) { add_pollfd(node); } @@ -741,6 +756,8 @@ bool aio_poll(AioContext *ctx, bool blocking) progress |= aio_dispatch_handlers(ctx); } + aio_free_deleted_handlers(ctx); + qemu_lockcnt_dec(&ctx->list_lock); progress |= timerlistgroup_run_timers(&ctx->tlg); -- cgit v1.1 From 7391d34c3cca09c0bb0140275839c6619b86ec0f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 14 Feb 2020 17:17:12 +0000 Subject: aio-posix: make AioHandler dispatch O(1) with epoll File descriptor monitoring is O(1) with epoll(7), but aio_dispatch_handlers() still scans all AioHandlers instead of dispatching just those that are ready. This makes aio_poll() O(n) with respect to the total number of registered handlers. Add a local ready_list to aio_poll() so that each nested aio_poll() builds a list of handlers ready to be dispatched. Since file descriptor polling is level-triggered, nested aio_poll() calls also see fds that were ready in the parent but not yet dispatched. This guarantees that nested aio_poll() invocations will dispatch all fds, even those that became ready before the nested invocation. Since only handlers ready to be dispatched are placed onto the ready_list, the new aio_dispatch_ready_handlers() function provides O(1) dispatch. Note that AioContext polling is still O(n) and currently cannot be fully disabled. This still needs to be fixed before aio_poll() is fully O(1). Signed-off-by: Stefan Hajnoczi Reviewed-by: Sergio Lopez Message-id: 20200214171712.541358-6-stefanha@redhat.com [Fix compilation error on macOS where there is no epoll(7). The aio_epoll() prototype was out of date and aio_add_ready_list() needed to be moved outside the ifdef.
--Stefan] Signed-off-by: Stefan Hajnoczi --- util/aio-posix.c | 110 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 32 deletions(-) (limited to 'util') diff --git a/util/aio-posix.c b/util/aio-posix.c index b5cfdbd..9e1befc 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -35,9 +35,20 @@ struct AioHandler void *opaque; bool is_external; QLIST_ENTRY(AioHandler) node; + QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ QLIST_ENTRY(AioHandler) node_deleted; }; +/* Add a handler to a ready list */ +static void add_ready_handler(AioHandlerList *ready_list, + AioHandler *node, + int revents) +{ + QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */ + node->pfd.revents = revents; + QLIST_INSERT_HEAD(ready_list, node, node_ready); +} + #ifdef CONFIG_EPOLL_CREATE1 /* The fd number threshold to switch to epoll */ @@ -105,7 +116,8 @@ static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) } } -static int aio_epoll(AioContext *ctx, int64_t timeout) +static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list, + int64_t timeout) { GPollFD pfd = { .fd = ctx->epollfd, @@ -130,11 +142,13 @@ static int aio_epoll(AioContext *ctx, int64_t timeout) } for (i = 0; i < ret; i++) { int ev = events[i].events; + int revents = (ev & EPOLLIN ? G_IO_IN : 0) | + (ev & EPOLLOUT ? G_IO_OUT : 0) | + (ev & EPOLLHUP ? G_IO_HUP : 0) | + (ev & EPOLLERR ? G_IO_ERR : 0); + node = events[i].data.ptr; - node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) | - (ev & EPOLLOUT ? G_IO_OUT : 0) | - (ev & EPOLLHUP ? G_IO_HUP : 0) | - (ev & EPOLLERR ? G_IO_ERR : 0); + add_ready_handler(ready_list, node, revents); } } out: @@ -172,8 +186,8 @@ static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new) { } -static int aio_epoll(AioContext *ctx, GPollFD *pfds, - unsigned npfd, int64_t timeout) +static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list, + int64_t timeout) { assert(false); } @@ -438,36 +452,63 @@ static void aio_free_deleted_handlers(AioContext *ctx) qemu_lockcnt_inc_and_unlock(&ctx->list_lock); } -static bool aio_dispatch_handlers(AioContext *ctx) +static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) { - AioHandler *node, *tmp; bool progress = false; + int revents; - QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) { - int revents; + revents = node->pfd.revents & node->pfd.events; + node->pfd.revents = 0; - revents = node->pfd.revents & node->pfd.events; - node->pfd.revents = 0; + if (!QLIST_IS_INSERTED(node, node_deleted) && + (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && + aio_node_check(ctx, node->is_external) && + node->io_read) { + node->io_read(node->opaque); - if (!QLIST_IS_INSERTED(node, node_deleted) && - (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && - aio_node_check(ctx, node->is_external) && - node->io_read) { - node->io_read(node->opaque); - - /* aio_notify() does not count as progress */ - if (node->opaque != &ctx->notifier) { - progress = true; - } - } - if (!QLIST_IS_INSERTED(node, node_deleted) && - (revents & (G_IO_OUT | G_IO_ERR)) && - aio_node_check(ctx, node->is_external) && - node->io_write) { - node->io_write(node->opaque); + /* aio_notify() does not count as progress */ + if (node->opaque != &ctx->notifier) { progress = true; } } + if (!QLIST_IS_INSERTED(node, node_deleted) && + (revents & (G_IO_OUT | G_IO_ERR)) && + aio_node_check(ctx, node->is_external) && + node->io_write) { + node->io_write(node->opaque); + progress = true; + } + + 
return progress; +} + +/* + * If we have a list of ready handlers then this is more efficient than + * scanning all handlers with aio_dispatch_handlers(). + */ +static bool aio_dispatch_ready_handlers(AioContext *ctx, + AioHandlerList *ready_list) +{ + bool progress = false; + AioHandler *node; + + while ((node = QLIST_FIRST(ready_list))) { + QLIST_SAFE_REMOVE(node, node_ready); + progress = aio_dispatch_handler(ctx, node) || progress; + } + + return progress; +} + +/* Slower than aio_dispatch_ready_handlers() but only used via glib */ +static bool aio_dispatch_handlers(AioContext *ctx) +{ + AioHandler *node, *tmp; + bool progress = false; + + QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) { + progress = aio_dispatch_handler(ctx, node) || progress; + } return progress; } @@ -639,6 +680,7 @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout) bool aio_poll(AioContext *ctx, bool blocking) { + AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list); AioHandler *node; int i; int ret = 0; @@ -689,7 +731,7 @@ bool aio_poll(AioContext *ctx, bool blocking) /* wait until next event */ if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) { npfd = 0; /* pollfds[] is not being used */ - ret = aio_epoll(ctx, timeout); + ret = aio_epoll(ctx, &ready_list, timeout); } else { ret = qemu_poll_ns(pollfds, npfd, timeout); } @@ -744,7 +786,11 @@ bool aio_poll(AioContext *ctx, bool blocking) /* if we have any readable fds, dispatch event */ if (ret > 0) { for (i = 0; i < npfd; i++) { - nodes[i]->pfd.revents = pollfds[i].revents; + int revents = pollfds[i].revents; + + if (revents) { + add_ready_handler(&ready_list, nodes[i], revents); + } } } @@ -753,7 +799,7 @@ bool aio_poll(AioContext *ctx, bool blocking) progress |= aio_bh_poll(ctx); if (ret > 0) { - progress |= aio_dispatch_handlers(ctx); + progress |= aio_dispatch_ready_handlers(ctx, &ready_list); } aio_free_deleted_handlers(ctx); -- cgit v1.1 From 46a07579ebb081493618bfa00ef8e241cd0dcc4f Mon Sep 17 00:00:00 2001 From: Alexander Bulekov Date: Wed, 19 Feb 2020 23:10:59 -0500 Subject: module: check module wasn't already initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The virtual-device fuzzer must initialize QOM, prior to running vl:qemu_init, so that it can use the qos_graph to identify the arguments required to initialize a guest for libqos-assisted fuzzing. This change prevents errors when vl:qemu_init tries to (re)initialize the previously initialized QOM module. Signed-off-by: Alexander Bulekov Reviewed-by: Stefan Hajnoczi Reviewed-by: Darren Kenny Reviewed-by: Philippe Mathieu-Daudé Message-id: 20200220041118.23264-4-alxndr@bu.edu Signed-off-by: Stefan Hajnoczi --- util/module.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'util') diff --git a/util/module.c b/util/module.c index 8c5315a..236a7bb 100644 --- a/util/module.c +++ b/util/module.c @@ -30,6 +30,7 @@ typedef struct ModuleEntry typedef QTAILQ_HEAD(, ModuleEntry) ModuleTypeList; static ModuleTypeList init_type_list[MODULE_INIT_MAX]; +static bool modules_init_done[MODULE_INIT_MAX]; static ModuleTypeList dso_init_list; @@ -91,11 +92,17 @@ void module_call_init(module_init_type type) ModuleTypeList *l; ModuleEntry *e; + if (modules_init_done[type]) { + return; + } + l = find_type(type); QTAILQ_FOREACH(e, l, node) { e->init(); } + + modules_init_done[type] = true; } #ifdef CONFIG_MODULES -- cgit v1.1
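The run-once guard in the module patch above is easy to demonstrate in isolation. The sketch below uses made-up names and a fixed-size registration table rather than QEMU's QTAILQ-based lists, and assumes single-threaded initialization, but it shows why an early module_call_init()-style call (for example from the fuzzer) makes the later call from qemu_init() a harmless no-op:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum { INIT_QOM, INIT_MAX };

typedef void (*init_fn)(void);

static init_fn init_lists[INIT_MAX][8]; /* registered constructors per type */
static size_t init_counts[INIT_MAX];
static bool init_done[INIT_MAX];        /* plays the role of modules_init_done[] */

static void register_init(int type, init_fn fn)
{
    init_lists[type][init_counts[type]++] = fn;
}

static void call_init(int type)
{
    if (init_done[type]) {
        return; /* already initialized earlier, nothing to redo */
    }
    for (size_t i = 0; i < init_counts[type]; i++) {
        init_lists[type][i]();
    }
    init_done[type] = true;
}

static void qom_types_init(void)
{
    puts("QOM types registered");
}

int main(void)
{
    register_init(INIT_QOM, qom_types_init);
    call_init(INIT_QOM); /* early caller (e.g. a fuzzer) initializes QOM */
    call_init(INIT_QOM); /* the later, second call is now a no-op */
    return 0;
}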