From 34dc97b9a0e592bc466bdb0bbfe45d77304a72b6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 14:53:05 +0200 Subject: blockjob: Wake up BDS when job becomes idle In the context of draining a BDS, the .drained_poll callback of block jobs is called. If this returns true (i.e. there is still some activity pending), the drain operation may call aio_poll() with blocking=true to wait for completion. As soon as the pending activity is completed and the job finally arrives in a quiescent state (i.e. its coroutine either yields with busy=false or terminates), the block job must notify the aio_poll() loop to wake up, otherwise we get a deadlock if both are running in different threads. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz --- include/qemu/job.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/qemu') diff --git a/include/qemu/job.h b/include/qemu/job.h index 5cb0681..b4a784d 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -156,6 +156,9 @@ typedef struct Job { /** Notifiers called when the job transitions to READY */ NotifierList on_ready; + /** Notifiers called when the job coroutine yields or terminates */ + NotifierList on_idle; + /** Element of the list of jobs */ QLIST_ENTRY(Job) job_list; -- cgit v1.1 From 30c070a547322a5e41ce129d540bca3653b1a9c8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 17:29:08 +0200 Subject: test-blockjob: Acquire AioContext around job_cancel_sync() All callers in QEMU proper hold the AioContext lock when calling job_finish_sync(). test-blockjob should do the same when it calls the function indirectly through job_cancel_sync(). Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng --- include/qemu/job.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/qemu') diff --git a/include/qemu/job.h b/include/qemu/job.h index b4a784d..63c60ef 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -524,6 +524,8 @@ void job_user_cancel(Job *job, bool force, Error **errp); * * Returns the return value from the job if the job actually completed * during the call, or -ECANCELED if it was canceled. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_cancel_sync(Job *job); @@ -541,6 +543,8 @@ void job_cancel_sync_all(void); * function). * * Returns the return value from the job. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_complete_sync(Job *job, Error **errp); @@ -566,6 +570,8 @@ void job_dismiss(Job **job, Error **errp); * * Returns 0 if the job is successfully completed, -ECANCELED if the job was * cancelled before completing, and -errno in other error cases. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp); -- cgit v1.1 From aa1361d54aac43094b98024b8b6c804eb6e41661 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 18:54:18 +0200 Subject: block: Add missing locking in bdrv_co_drain_bh_cb() bdrv_do_drained_begin/end() assume that they are called with the AioContext lock of bs held. If we call drain functions from a coroutine with the AioContext lock held, we yield and schedule a BH to move out of coroutine context. This means that the lock for the home context of the coroutine is released and must be re-acquired in the bottom half. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- include/qemu/coroutine.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/qemu') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 6f8a487..9801e7f 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -90,6 +90,11 @@ void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co); void coroutine_fn qemu_coroutine_yield(void); /** + * Get the AioContext of the given coroutine + */ +AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co); + +/** * Get the currently executing coroutine */ Coroutine *coroutine_fn qemu_coroutine_self(void); -- cgit v1.1 From b5a7a0573530698ee448b063ac01d485e30446bd Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 7 Sep 2018 15:31:22 +0200 Subject: blockjob: Lie better in child_job_drained_poll() Block jobs claim in .drained_poll() that they are in a quiescent state as soon as job->deferred_to_main_loop is true. This is obviously wrong, they still have a completion BH to run. We only get away with this because commit 91af091f923 added an unconditional aio_poll(false) to the drain functions, but this is bypassing the regular drain mechanisms. However, just removing this and telling that the job is still active doesn't work either: The completion callbacks themselves call drain functions (directly, or indirectly with bdrv_reopen), so they would deadlock then. As a better lie, tell that the job is active as long as the BH is pending, but falsely call it quiescent from the point in the BH when the completion callback is called. At this point, nested drain calls won't deadlock because they ignore the job, and outer drains will wait for the job to really reach a quiescent state because the callback is already running. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- include/qemu/job.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/qemu') diff --git a/include/qemu/job.h b/include/qemu/job.h index 63c60ef..9e7cd1e 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -76,6 +76,9 @@ typedef struct Job { * Set to false by the job while the coroutine has yielded and may be * re-entered by job_enter(). There may still be I/O or event loop activity * pending. Accessed under block_job_mutex (in blockjob.c). + * + * When the job is deferred to the main loop, busy is true as long as the + * bottom half is still pending. */ bool busy; -- cgit v1.1