From 34dc97b9a0e592bc466bdb0bbfe45d77304a72b6 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 17 Aug 2018 14:53:05 +0200
Subject: blockjob: Wake up BDS when job becomes idle

In the context of draining a BDS, the .drained_poll callback of block
jobs is called. If this returns true (i.e. there is still some activity
pending), the drain operation may call aio_poll() with blocking=true to
wait for completion.

As soon as the pending activity is completed and the job finally arrives
in a quiescent state (i.e. its coroutine either yields with busy=false
or terminates), the block job must notify the aio_poll() loop to wake
up, otherwise we get a deadlock if both are running in different
threads.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 include/block/blockjob.h | 13 +++++++++++++
 include/qemu/job.h       |  3 +++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 32c00b7..2290bbb 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -70,6 +70,9 @@ typedef struct BlockJob {
     /** Called when the job transitions to READY */
     Notifier ready_notifier;
 
+    /** Called when the job coroutine yields or terminates */
+    Notifier idle_notifier;
+
     /** BlockDriverStates that are involved in this block job */
     GSList *nodes;
 } BlockJob;
@@ -119,6 +122,16 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
 void block_job_remove_all_bdrv(BlockJob *job);
 
 /**
+ * block_job_wakeup_all_bdrv:
+ * @job: The block job
+ *
+ * Calls bdrv_wakeup() for all BlockDriverStates that have been added to the
+ * job. This function is to be called whenever child_job_drained_poll() would
+ * go from true to false to notify waiting drain requests.
+ */
+void block_job_wakeup_all_bdrv(BlockJob *job);
+
+/**
  * block_job_set_speed:
  * @job: The job to set the speed for.
  * @speed: The new value
diff --git a/include/qemu/job.h b/include/qemu/job.h
index 5cb0681..b4a784d 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -156,6 +156,9 @@ typedef struct Job {
     /** Notifiers called when the job transitions to READY */
     NotifierList on_ready;
 
+    /** Notifiers called when the job coroutine yields or terminates */
+    NotifierList on_idle;
+
     /** Element of the list of jobs */
     QLIST_ENTRY(Job) job_list;
 
-- 
cgit v1.1


From 486574483aba988c83b20e7d3f1ccd50c4c333d8 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 5 Sep 2018 18:14:17 +0200
Subject: aio-wait: Increase num_waiters even in home thread

Even if AIO_WAIT_WHILE() is called in the home context of the
AioContext, we still want to allow the condition to change depending on
other threads as long as they kick the AioWait. Specfically block jobs
can be running in an I/O thread and should then be able to kick a drain
in the main loop context.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 include/block/aio-wait.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index c85a62f..600fad1 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -76,6 +76,8 @@ typedef struct {
     bool waited_ = false;                                          \
     AioWait *wait_ = (wait);                                       \
     AioContext *ctx_ = (ctx);                                      \
+    /* Increment wait_->num_waiters before evaluating cond. */     \
+    atomic_inc(&wait_->num_waiters);                               \
     if (ctx_ && in_aio_context_home_thread(ctx_)) {                \
         while ((cond)) {                                           \
             aio_poll(ctx_, true);                                  \
@@ -84,8 +86,6 @@ typedef struct {
     } else {                                                       \
         assert(qemu_get_current_aio_context() ==                   \
                qemu_get_aio_context());                            \
-        /* Increment wait_->num_waiters before evaluating cond. */ \
-        atomic_inc(&wait_->num_waiters);                           \
         while ((cond)) {                                           \
             if (ctx_) {                                            \
                 aio_context_release(ctx_);                         \
@@ -96,8 +96,8 @@ typedef struct {
             }                                                      \
             waited_ = true;                                        \
         }                                                          \
-        atomic_dec(&wait_->num_waiters);                           \
     }                                                              \
+    atomic_dec(&wait_->num_waiters);                               \
     waited_; })
 
 /**
-- 
cgit v1.1


From 30c070a547322a5e41ce129d540bca3653b1a9c8 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 17 Aug 2018 17:29:08 +0200
Subject: test-blockjob: Acquire AioContext around job_cancel_sync()

All callers in QEMU proper hold the AioContext lock when calling
job_finish_sync(). test-blockjob should do the same when it calls the
function indirectly through job_cancel_sync().

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
---
 include/qemu/job.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/qemu/job.h b/include/qemu/job.h
index b4a784d..63c60ef 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -524,6 +524,8 @@ void job_user_cancel(Job *job, bool force, Error **errp);
  *
  * Returns the return value from the job if the job actually completed
  * during the call, or -ECANCELED if it was canceled.
+ *
+ * Callers must hold the AioContext lock of job->aio_context.
  */
 int job_cancel_sync(Job *job);
 
@@ -541,6 +543,8 @@ void job_cancel_sync_all(void);
  * function).
  *
  * Returns the return value from the job.
+ *
+ * Callers must hold the AioContext lock of job->aio_context.
  */
 int job_complete_sync(Job *job, Error **errp);
 
@@ -566,6 +570,8 @@ void job_dismiss(Job **job, Error **errp);
  *
  * Returns 0 if the job is successfully completed, -ECANCELED if the job was
  * cancelled before completing, and -errno in other error cases.
+ *
+ * Callers must hold the AioContext lock of job->aio_context.
  */
 int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp);
 
-- 
cgit v1.1


From aa1361d54aac43094b98024b8b6c804eb6e41661 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 17 Aug 2018 18:54:18 +0200
Subject: block: Add missing locking in bdrv_co_drain_bh_cb()

bdrv_do_drained_begin/end() assume that they are called with the
AioContext lock of bs held. If we call drain functions from a coroutine
with the AioContext lock held, we yield and schedule a BH to move out of
coroutine context. This means that the lock for the home context of the
coroutine is released and must be re-acquired in the bottom half.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 include/qemu/coroutine.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index 6f8a487..9801e7f 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -90,6 +90,11 @@ void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co);
 void coroutine_fn qemu_coroutine_yield(void);
 
 /**
+ * Get the AioContext of the given coroutine
+ */
+AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co);
+
+/**
  * Get the currently executing coroutine
  */
 Coroutine *coroutine_fn qemu_coroutine_self(void);
-- 
cgit v1.1


From b5a7a0573530698ee448b063ac01d485e30446bd Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 7 Sep 2018 15:31:22 +0200
Subject: blockjob: Lie better in child_job_drained_poll()

Block jobs claim in .drained_poll() that they are in a quiescent state
as soon as job->deferred_to_main_loop is true. This is obviously wrong,
they still have a completion BH to run. We only get away with this
because commit 91af091f923 added an unconditional aio_poll(false) to the
drain functions, but this is bypassing the regular drain mechanisms.

However, just removing this and telling that the job is still active
doesn't work either: The completion callbacks themselves call drain
functions (directly, or indirectly with bdrv_reopen), so they would
deadlock then.

As a better lie, tell that the job is active as long as the BH is
pending, but falsely call it quiescent from the point in the BH when the
completion callback is called. At this point, nested drain calls won't
deadlock because they ignore the job, and outer drains will wait for the
job to really reach a quiescent state because the callback is already
running.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 include/qemu/job.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 63c60ef..9e7cd1e 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -76,6 +76,9 @@ typedef struct Job {
      * Set to false by the job while the coroutine has yielded and may be
      * re-entered by job_enter(). There may still be I/O or event loop activity
      * pending. Accessed under block_job_mutex (in blockjob.c).
+     *
+     * When the job is deferred to the main loop, busy is true as long as the
+     * bottom half is still pending.
      */
     bool busy;
 
-- 
cgit v1.1


From cfe29d8294e06420e15d4938421ae006c8ac49e7 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 18 Sep 2018 17:09:16 +0200
Subject: block: Use a single global AioWait

When draining a block node, we recurse to its parent and for subtree
drains also to its children. A single AIO_WAIT_WHILE() is then used to
wait for bdrv_drain_poll() to become true, which depends on all of the
nodes we recursed to. However, if the respective child or parent becomes
quiescent and calls bdrv_wakeup(), only the AioWait of the child/parent
is checked, while AIO_WAIT_WHILE() depends on the AioWait of the
original node.

Fix this by using a single AioWait for all callers of AIO_WAIT_WHILE().

This may mean that the draining thread gets a few more unnecessary
wakeups because an unrelated operation got completed, but we already
wake it up when something _could_ have changed rather than only if it
has certainly changed.

Apart from that, drain is a slow path anyway. In theory it would be
possible to use wakeups more selectively and still correctly, but the
gains are likely not worth the additional complexity. In fact, this
patch is a nice simplification for some places in the code.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 include/block/aio-wait.h  | 22 +++++++++++-----------
 include/block/block.h     |  6 +-----
 include/block/block_int.h |  3 ---
 include/block/blockjob.h  | 10 ----------
 4 files changed, 12 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index 600fad1..afd0ff7 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -30,14 +30,15 @@
 /**
  * AioWait:
  *
- * An object that facilitates synchronous waiting on a condition.  The main
- * loop can wait on an operation running in an IOThread as follows:
+ * An object that facilitates synchronous waiting on a condition. A single
+ * global AioWait object (global_aio_wait) is used internally.
+ *
+ * The main loop can wait on an operation running in an IOThread as follows:
  *
- *   AioWait *wait = ...;
  *   AioContext *ctx = ...;
  *   MyWork work = { .done = false };
  *   schedule_my_work_in_iothread(ctx, &work);
- *   AIO_WAIT_WHILE(wait, ctx, !work.done);
+ *   AIO_WAIT_WHILE(ctx, !work.done);
  *
  * The IOThread must call aio_wait_kick() to notify the main loop when
  * work.done changes:
@@ -46,7 +47,7 @@
  *   {
  *       ...
  *       work.done = true;
- *       aio_wait_kick(wait);
+ *       aio_wait_kick();
  *   }
  */
 typedef struct {
@@ -54,9 +55,10 @@ typedef struct {
     unsigned num_waiters;
 } AioWait;
 
+extern AioWait global_aio_wait;
+
 /**
  * AIO_WAIT_WHILE:
- * @wait: the aio wait object
  * @ctx: the aio context, or NULL if multiple aio contexts (for which the
  *       caller does not hold a lock) are involved in the polling condition.
  * @cond: wait while this conditional expression is true
@@ -72,9 +74,9 @@ typedef struct {
  * wait on conditions between two IOThreads since that could lead to deadlock,
  * go via the main loop instead.
  */
-#define AIO_WAIT_WHILE(wait, ctx, cond) ({                         \
+#define AIO_WAIT_WHILE(ctx, cond) ({                               \
     bool waited_ = false;                                          \
-    AioWait *wait_ = (wait);                                       \
+    AioWait *wait_ = &global_aio_wait;                             \
     AioContext *ctx_ = (ctx);                                      \
     /* Increment wait_->num_waiters before evaluating cond. */     \
     atomic_inc(&wait_->num_waiters);                               \
@@ -102,14 +104,12 @@ typedef struct {
 
 /**
  * aio_wait_kick:
- * @wait: the aio wait object that should re-evaluate its condition
- *
  * Wake up the main thread if it is waiting on AIO_WAIT_WHILE().  During
  * synchronous operations performed in an IOThread, the main thread lets the
  * IOThread's event loop run, waiting for the operation to complete.  A
  * aio_wait_kick() call will wake up the main thread.
  */
-void aio_wait_kick(AioWait *wait);
+void aio_wait_kick(void);
 
 /**
  * aio_wait_bh_oneshot:
diff --git a/include/block/block.h b/include/block/block.h
index 4e0871a..4edc1e8 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -410,13 +410,9 @@ void bdrv_drain_all_begin(void);
 void bdrv_drain_all_end(void);
 void bdrv_drain_all(void);
 
-/* Returns NULL when bs == NULL */
-AioWait *bdrv_get_aio_wait(BlockDriverState *bs);
-
 #define BDRV_POLL_WHILE(bs, cond) ({                       \
     BlockDriverState *bs_ = (bs);                          \
-    AIO_WAIT_WHILE(bdrv_get_aio_wait(bs_),                 \
-                   bdrv_get_aio_context(bs_),              \
+    AIO_WAIT_WHILE(bdrv_get_aio_context(bs_),              \
                    cond); })
 
 int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 4000d2a..92ecbd8 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -794,9 +794,6 @@ struct BlockDriverState {
     unsigned int in_flight;
     unsigned int serialising_in_flight;
 
-    /* Kicked to signal main loop when a request completes. */
-    AioWait wait;
-
     /* counter for nested bdrv_io_plug.
      * Accessed with atomic ops.
     */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 2290bbb..ede0bd8 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -122,16 +122,6 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
 void block_job_remove_all_bdrv(BlockJob *job);
 
 /**
- * block_job_wakeup_all_bdrv:
- * @job: The block job
- *
- * Calls bdrv_wakeup() for all BlockDriverStates that have been added to the
- * job. This function is to be called whenever child_job_drained_poll() would
- * go from true to false to notify waiting drain requests.
- */
-void block_job_wakeup_all_bdrv(BlockJob *job);
-
-/**
  * block_job_set_speed:
  * @job: The job to set the speed for.
  * @speed: The new value
-- 
cgit v1.1