aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/block/aio.h8
-rw-r--r--util/aio-posix.c93
-rw-r--r--util/aio-posix.h2
-rw-r--r--util/trace-events2
4 files changed, 98 insertions, 7 deletions
diff --git a/include/block/aio.h b/include/block/aio.h
index f07ebb7..cb19891 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -227,6 +227,14 @@ struct AioContext {
int64_t poll_grow; /* polling time growth factor */
int64_t poll_shrink; /* polling time shrink factor */
+ /*
+ * List of handlers participating in userspace polling. Protected by
+ * ctx->list_lock. Iterated and modified mostly by the event loop thread
+ * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
+ * only touches the list to delete nodes if ctx->list_lock's count is zero.
+ */
+ AioHandlerList poll_aio_handlers;
+
/* Are we in polling mode or monitoring file descriptors? */
bool poll_started;
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 759989b..cd6cf0a 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -22,6 +22,9 @@
#include "trace.h"
#include "aio-posix.h"
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+
bool aio_poll_disabled(AioContext *ctx)
{
return atomic_read(&ctx->poll_disable_cnt);
@@ -78,6 +81,7 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
* deleted because deleted nodes are only cleaned up while
* no one is walking the handlers list.
*/
+ QLIST_SAFE_REMOVE(node, node_poll);
QLIST_REMOVE(node, node);
return true;
}
@@ -205,7 +209,7 @@ static bool poll_set_started(AioContext *ctx, bool started)
ctx->poll_started = started;
qemu_lockcnt_inc(&ctx->list_lock);
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
IOHandler *fn;
if (QLIST_IS_INSERTED(node, node_deleted)) {
@@ -286,6 +290,7 @@ static void aio_free_deleted_handlers(AioContext *ctx)
while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
QLIST_REMOVE(node, node);
QLIST_REMOVE(node, node_deleted);
+ QLIST_SAFE_REMOVE(node, node_poll);
g_free(node);
}
@@ -300,6 +305,22 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
revents = node->pfd.revents & node->pfd.events;
node->pfd.revents = 0;
+ /*
+ * Start polling AioHandlers when they become ready because activity is
+ * likely to continue. Note that starvation is theoretically possible when
+ * fdmon_supports_polling(), but only until the fd fires for the first
+ * time.
+ */
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ !QLIST_IS_INSERTED(node, node_poll) &&
+ node->io_poll) {
+ trace_poll_add(ctx, node, node->pfd.fd, revents);
+ if (ctx->poll_started && node->io_poll_begin) {
+ node->io_poll_begin(node->opaque);
+ }
+ QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+ }
+
if (!QLIST_IS_INSERTED(node, node_deleted) &&
(revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
aio_node_check(ctx, node->is_external) &&
@@ -364,15 +385,19 @@ void aio_dispatch(AioContext *ctx)
timerlistgroup_run_timers(&ctx->tlg);
}
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
+static bool run_poll_handlers_once(AioContext *ctx,
+ int64_t now,
+ int64_t *timeout)
{
bool progress = false;
AioHandler *node;
+ AioHandler *tmp;
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
- if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
- aio_node_check(ctx, node->is_external) &&
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (aio_node_check(ctx, node->is_external) &&
node->io_poll(node->opaque)) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
/*
* Polling was successful, exit try_poll_mode immediately
* to adjust the next polling time.
@@ -389,6 +414,50 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
return progress;
}
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+ return ctx->fdmon_ops->need_wait != aio_poll_disabled;
+}
+
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+ AioHandler *node;
+ AioHandler *tmp;
+ bool progress = false;
+
+ /*
+ * File descriptor monitoring implementations without userspace polling
+ * support suffer from starvation when a subset of handlers is polled
+ * because fds will not be processed in a timely fashion. Don't remove
+ * idle poll handlers.
+ */
+ if (!fdmon_supports_polling(ctx)) {
+ return false;
+ }
+
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (node->poll_idle_timeout == 0LL) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+ } else if (now >= node->poll_idle_timeout) {
+ trace_poll_remove(ctx, node, node->pfd.fd);
+ node->poll_idle_timeout = 0LL;
+ QLIST_SAFE_REMOVE(node, node_poll);
+ if (ctx->poll_started && node->io_poll_end) {
+ node->io_poll_end(node->opaque);
+
+ /*
+ * Final poll in case ->io_poll_end() races with an event.
+ * Nevermind about re-adding the handler in the rare case where
+ * this causes progress.
+ */
+ progress = node->io_poll(node->opaque) || progress;
+ }
+ }
+ }
+
+ return progress;
+}
+
/* run_poll_handlers:
* @ctx: the AioContext
* @max_ns: maximum time to poll for, in nanoseconds
@@ -424,12 +493,17 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
do {
- progress = run_poll_handlers_once(ctx, timeout);
+ progress = run_poll_handlers_once(ctx, start_time, timeout);
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
max_ns = qemu_soonest_timeout(*timeout, max_ns);
assert(!(max_ns && progress));
} while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
+ if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+ *timeout = 0;
+ progress = true;
+ }
+
/* If time has passed with no successful polling, adjust *timeout to
* keep the same ending time.
*/
@@ -454,8 +528,13 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
*/
static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
{
- int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+ int64_t max_ns;
+
+ if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+ return false;
+ }
+ max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
poll_set_started(ctx, true);
diff --git a/util/aio-posix.h b/util/aio-posix.h
index 55fc771..c80c045 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -30,10 +30,12 @@ struct AioHandler {
QLIST_ENTRY(AioHandler) node;
QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
QLIST_ENTRY(AioHandler) node_deleted;
+ QLIST_ENTRY(AioHandler) node_poll;
#ifdef CONFIG_LINUX_IO_URING
QSLIST_ENTRY(AioHandler) node_submitted;
unsigned flags; /* see fdmon-io_uring.c */
#endif
+ int64_t poll_idle_timeout; /* when to stop userspace polling */
bool is_external;
};
diff --git a/util/trace-events b/util/trace-events
index 83b6639..0ce4282 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -5,6 +5,8 @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_
run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
# async.c
aio_co_schedule(void *ctx, void *co) "ctx %p co %p"