From 3c605f4074ebeb97970eb660fb56a9cb06525923 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 27 Jun 2017 17:18:20 +0200 Subject: commit: Add top-node/base-node options The block-commit QMP command required specifying the top and base nodes of the commit jobs using the file name of that node. While this works in simple cases (local files with absolute paths), the file names generated for more complicated setups can be hard to predict. The block-commit command has more problems than just this, so we want to replace it altogether in the long run, but libvirt needs a reliable way to address nodes now. So we don't want to wait for a new, cleaner command, but just add the minimal thing needed right now. This adds two new options top-node and base-node to the command, which allow specifying node names instead. They are mutually exclusive with the old options. Signed-off-by: Kevin Wolf --- blockdev.c | 32 ++++++++++++++++++++++++++++++-- qapi/block-core.json | 24 ++++++++++++++++++------ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/blockdev.c b/blockdev.c index d4b4240..a8755bd 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3214,7 +3214,9 @@ out: } void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, + bool has_base_node, const char *base_node, bool has_base, const char *base, + bool has_top_node, const char *top_node, bool has_top, const char *top, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, @@ -3275,7 +3277,20 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, /* default top_bs is the active layer */ top_bs = bs; - if (has_top && top) { + if (has_top_node && has_top) { + error_setg(errp, "'top-node' and 'top' are mutually exclusive"); + goto out; + } else if (has_top_node) { + top_bs = bdrv_lookup_bs(NULL, top_node, errp); + if (top_bs == NULL) { + goto out; + } + if (!bdrv_chain_contains(bs, top_bs)) { + error_setg(errp, "'%s' is not in this backing file chain", + top_node); + goto out; + } + } else if (has_top && top) { if (strcmp(bs->filename, top) != 0) { top_bs = bdrv_find_backing_image(bs, top); } @@ -3288,7 +3303,20 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, assert(bdrv_get_aio_context(top_bs) == aio_context); - if (has_base && base) { + if (has_base_node && has_base) { + error_setg(errp, "'base-node' and 'base' are mutually exclusive"); + goto out; + } else if (has_base_node) { + base_bs = bdrv_lookup_bs(NULL, base_node, errp); + if (base_bs == NULL) { + goto out; + } + if (!bdrv_chain_contains(top_bs, base_bs)) { + error_setg(errp, "'%s' is not in this backing file chain", + base_node); + goto out; + } + } else if (has_base && base) { base_bs = bdrv_find_backing_image(top_bs, base); } else { base_bs = bdrv_find_base(top_bs); diff --git a/qapi/block-core.json b/qapi/block-core.json index c0b3d33..ac3b48e 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1457,12 +1457,23 @@ # # @device: the device name or node-name of a root node # -# @base: The file name of the backing image to write data into. -# If not specified, this is the deepest backing image. +# @base-node: The node name of the backing image to write data into. +# If not specified, this is the deepest backing image. +# (since: 3.1) # -# @top: The file name of the backing image within the image chain, -# which contains the topmost data to be committed down. If -# not specified, this is the active layer. +# @base: Same as @base-node, except that it is a file name rather than a node +# name. This must be the exact filename string that was used to open the +# node; other strings, even if addressing the same file, are not +# accepted (deprecated, use @base-node instead) +# +# @top-node: The node name of the backing image within the image chain +# which contains the topmost data to be committed down. If +# not specified, this is the active layer. (since: 3.1) +# +# @top: Same as @top-node, except that it is a file name rather than a node +# name. This must be the exact filename string that was used to open the +# node; other strings, even if addressing the same file, are not +# accepted (deprecated, use @base-node instead) # # @backing-file: The backing file string to write into the overlay # image of 'top'. If 'top' is the active layer, @@ -1528,7 +1539,8 @@ # ## { 'command': 'block-commit', - 'data': { '*job-id': 'str', 'device': 'str', '*base': 'str', '*top': 'str', + 'data': { '*job-id': 'str', 'device': 'str', '*base-node': 'str', + '*base': 'str', '*top-node': 'str', '*top': 'str', '*backing-file': 'str', '*speed': 'int', '*filter-node-name': 'str', '*auto-finalize': 'bool', '*auto-dismiss': 'bool' } } -- cgit v1.1 From d57177a48fc604e5427921bf20b22ee0e6d578b3 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 10 Aug 2018 18:20:45 +0200 Subject: qemu-iotests: Test commit with top-node/base-node This adds some tests for block-commit with the new options top-node and base-node (taking node names) instead of top and base (taking file names). Signed-off-by: Kevin Wolf --- tests/qemu-iotests/040 | 52 ++++++++++++++++++++++++++++++++++++++++++++-- tests/qemu-iotests/040.out | 4 ++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 index 1beb5e6..1cb1cee 100755 --- a/tests/qemu-iotests/040 +++ b/tests/qemu-iotests/040 @@ -57,9 +57,12 @@ class ImageCommitTestCase(iotests.QMPTestCase): self.assert_no_active_block_jobs() self.vm.shutdown() - def run_commit_test(self, top, base, need_ready=False): + def run_commit_test(self, top, base, need_ready=False, node_names=False): self.assert_no_active_block_jobs() - result = self.vm.qmp('block-commit', device='drive0', top=top, base=base) + if node_names: + result = self.vm.qmp('block-commit', device='drive0', top_node=top, base_node=base) + else: + result = self.vm.qmp('block-commit', device='drive0', top=top, base=base) self.assert_qmp(result, 'return', {}) self.wait_for_complete(need_ready) @@ -101,6 +104,11 @@ class TestSingleDrive(ImageCommitTestCase): self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xab 0 524288', backing_img).find("verification failed")) self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed")) + def test_commit_node(self): + self.run_commit_test("mid", "base", node_names=True) + self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xab 0 524288', backing_img).find("verification failed")) + self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xef 524288 524288', backing_img).find("verification failed")) + def test_device_not_found(self): result = self.vm.qmp('block-commit', device='nonexistent', top='%s' % mid_img) self.assert_qmp(result, 'error/class', 'DeviceNotFound') @@ -123,6 +131,30 @@ class TestSingleDrive(ImageCommitTestCase): self.assert_qmp(result, 'error/class', 'GenericError') self.assert_qmp(result, 'error/desc', 'Base \'badfile\' not found') + def test_top_node_invalid(self): + self.assert_no_active_block_jobs() + result = self.vm.qmp('block-commit', device='drive0', top_node='badfile', base_node='base') + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', "Cannot find device= nor node_name=badfile") + + def test_base_node_invalid(self): + self.assert_no_active_block_jobs() + result = self.vm.qmp('block-commit', device='drive0', top_node='mid', base_node='badfile') + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', "Cannot find device= nor node_name=badfile") + + def test_top_path_and_node(self): + self.assert_no_active_block_jobs() + result = self.vm.qmp('block-commit', device='drive0', top_node='mid', base_node='base', top='%s' % mid_img) + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', "'top-node' and 'top' are mutually exclusive") + + def test_base_path_and_node(self): + self.assert_no_active_block_jobs() + result = self.vm.qmp('block-commit', device='drive0', top_node='mid', base_node='base', base='%s' % backing_img) + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', "'base-node' and 'base' are mutually exclusive") + def test_top_is_active(self): self.run_commit_test(test_img, backing_img, need_ready=True) self.assertEqual(-1, qemu_io('-f', 'raw', '-c', 'read -P 0xab 0 524288', backing_img).find("verification failed")) @@ -139,6 +171,22 @@ class TestSingleDrive(ImageCommitTestCase): self.assert_qmp(result, 'error/class', 'GenericError') self.assert_qmp(result, 'error/desc', 'Base \'%s\' not found' % mid_img) + def test_top_and_base_node_reversed(self): + self.assert_no_active_block_jobs() + result = self.vm.qmp('block-commit', device='drive0', top_node='base', base_node='top') + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', "'top' is not in this backing file chain") + + def test_top_node_in_wrong_chain(self): + self.assert_no_active_block_jobs() + + result = self.vm.qmp('blockdev-add', driver='null-co', node_name='null') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('block-commit', device='drive0', top_node='null', base_node='base') + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', "'null' is not in this backing file chain") + # When the job is running on a BB that is automatically deleted on hot # unplug, the job is cancelled when the device disappears def test_hot_unplug(self): diff --git a/tests/qemu-iotests/040.out b/tests/qemu-iotests/040.out index e20a75c..802ffaa 100644 --- a/tests/qemu-iotests/040.out +++ b/tests/qemu-iotests/040.out @@ -1,5 +1,5 @@ -............................. +........................................... ---------------------------------------------------------------------- -Ran 29 tests +Ran 43 tests OK -- cgit v1.1 From e091f0e905a4481f347913420f327d427f18d9d4 Mon Sep 17 00:00:00 2001 From: Sergio Lopez Date: Wed, 5 Sep 2018 13:23:34 +0200 Subject: block/linux-aio: acquire AioContext before qemu_laio_process_completions In qemu_laio_process_completions_and_submit, the AioContext is acquired before the ioq_submit iteration and after qemu_laio_process_completions, but the latter is not thread safe either. This change avoids a number of random crashes when the Main Thread and an IO Thread collide processing completions for the same AioContext. This is an example of such crash: - The IO Thread is trying to acquire the AioContext at aio_co_enter, which evidences that it didn't lock it before: Thread 3 (Thread 0x7fdfd8bd8700 (LWP 36743)): #0 0x00007fdfe0dd542d in __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 #1 0x00007fdfe0dd0de6 in _L_lock_870 () at /lib64/libpthread.so.0 #2 0x00007fdfe0dd0cdf in __GI___pthread_mutex_lock (mutex=mutex@entry=0x5631fde0e6c0) at ../nptl/pthread_mutex_lock.c:114 #3 0x00005631fc0603a7 in qemu_mutex_lock_impl (mutex=0x5631fde0e6c0, file=0x5631fc23520f "util/async.c", line=511) at util/qemu-thread-posix.c:66 #4 0x00005631fc05b558 in aio_co_enter (ctx=0x5631fde0e660, co=0x7fdfcc0c2b40) at util/async.c:493 #5 0x00005631fc05b5ac in aio_co_wake (co=) at util/async.c:478 #6 0x00005631fbfc51ad in qemu_laio_process_completion (laiocb=) at block/linux-aio.c:104 #7 0x00005631fbfc523c in qemu_laio_process_completions (s=s@entry=0x7fdfc0297670) at block/linux-aio.c:222 #8 0x00005631fbfc5499 in qemu_laio_process_completions_and_submit (s=0x7fdfc0297670) at block/linux-aio.c:237 #9 0x00005631fc05d978 in aio_dispatch_handlers (ctx=ctx@entry=0x5631fde0e660) at util/aio-posix.c:406 #10 0x00005631fc05e3ea in aio_poll (ctx=0x5631fde0e660, blocking=blocking@entry=true) at util/aio-posix.c:693 #11 0x00005631fbd7ad96 in iothread_run (opaque=0x5631fde0e1c0) at iothread.c:64 #12 0x00007fdfe0dcee25 in start_thread (arg=0x7fdfd8bd8700) at pthread_create.c:308 #13 0x00007fdfe0afc34d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:113 - The Main Thread is also processing completions from the same AioContext, and crashes due to failed assertion at util/iov.c:78: Thread 1 (Thread 0x7fdfeb5eac80 (LWP 36740)): #0 0x00007fdfe0a391f7 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 #1 0x00007fdfe0a3a8e8 in __GI_abort () at abort.c:90 #2 0x00007fdfe0a32266 in __assert_fail_base (fmt=0x7fdfe0b84e68 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0x5631fc238ccb "offset == 0", file=file@entry=0x5631fc23698e "util/iov.c", line=line@entry=78, function=function@entry=0x5631fc236adc <__PRETTY_FUNCTION__.15220> "iov_memset") at assert.c:92 #3 0x00007fdfe0a32312 in __GI___assert_fail (assertion=assertion@entry=0x5631fc238ccb "offset == 0", file=file@entry=0x5631fc23698e "util/iov.c", line=line@entry=78, function=function@entry=0x5631fc236adc <__PRETTY_FUNCTION__.15220> "iov_memset") at assert.c:101 #4 0x00005631fc065287 in iov_memset (iov=, iov_cnt=, offset=, offset@entry=65536, fillc=fillc@entry=0, bytes=15515191315812405248) at util/iov.c:78 #5 0x00005631fc065a63 in qemu_iovec_memset (qiov=, offset=offset@entry=65536, fillc=fillc@entry=0, bytes=) at util/iov.c:410 #6 0x00005631fbfc5178 in qemu_laio_process_completion (laiocb=0x7fdd920df630) at block/linux-aio.c:88 #7 0x00005631fbfc523c in qemu_laio_process_completions (s=s@entry=0x7fdfc0297670) at block/linux-aio.c:222 #8 0x00005631fbfc5499 in qemu_laio_process_completions_and_submit (s=0x7fdfc0297670) at block/linux-aio.c:237 #9 0x00005631fbfc54ed in qemu_laio_poll_cb (opaque=) at block/linux-aio.c:272 #10 0x00005631fc05d85e in run_poll_handlers_once (ctx=ctx@entry=0x5631fde0e660) at util/aio-posix.c:497 #11 0x00005631fc05e2ca in aio_poll (blocking=false, ctx=0x5631fde0e660) at util/aio-posix.c:574 #12 0x00005631fc05e2ca in aio_poll (ctx=0x5631fde0e660, blocking=blocking@entry=false) at util/aio-posix.c:604 #13 0x00005631fbfcb8a3 in bdrv_do_drained_begin (ignore_parent=, recursive=, bs=) at block/io.c:273 #14 0x00005631fbfcb8a3 in bdrv_do_drained_begin (bs=0x5631fe8b6200, recursive=, parent=0x0, ignore_bds_parents=, poll=) at block/io.c:390 #15 0x00005631fbfbcd2e in blk_drain (blk=0x5631fe83ac80) at block/block-backend.c:1590 #16 0x00005631fbfbe138 in blk_remove_bs (blk=blk@entry=0x5631fe83ac80) at block/block-backend.c:774 #17 0x00005631fbfbe3d6 in blk_unref (blk=0x5631fe83ac80) at block/block-backend.c:401 #18 0x00005631fbfbe3d6 in blk_unref (blk=0x5631fe83ac80) at block/block-backend.c:449 #19 0x00005631fbfc9a69 in commit_complete (job=0x5631fe8b94b0, opaque=0x7fdfcc1bb080) at block/commit.c:92 #20 0x00005631fbf7d662 in job_defer_to_main_loop_bh (opaque=0x7fdfcc1b4560) at job.c:973 #21 0x00005631fc05ad41 in aio_bh_poll (bh=0x7fdfcc01ad90) at util/async.c:90 #22 0x00005631fc05ad41 in aio_bh_poll (ctx=ctx@entry=0x5631fddffdb0) at util/async.c:118 #23 0x00005631fc05e210 in aio_dispatch (ctx=0x5631fddffdb0) at util/aio-posix.c:436 #24 0x00005631fc05ac1e in aio_ctx_dispatch (source=, callback=, user_data=) at util/async.c:261 #25 0x00007fdfeaae44c9 in g_main_context_dispatch (context=0x5631fde00140) at gmain.c:3201 #26 0x00007fdfeaae44c9 in g_main_context_dispatch (context=context@entry=0x5631fde00140) at gmain.c:3854 #27 0x00005631fc05d503 in main_loop_wait () at util/main-loop.c:215 #28 0x00005631fc05d503 in main_loop_wait (timeout=) at util/main-loop.c:238 #29 0x00005631fc05d503 in main_loop_wait (nonblocking=nonblocking@entry=0) at util/main-loop.c:497 #30 0x00005631fbd81412 in main_loop () at vl.c:1866 #31 0x00005631fbc18ff3 in main (argc=, argv=, envp=) at vl.c:4647 - A closer examination shows that s->io_q.in_flight appears to have gone backwards: (gdb) frame 7 #7 0x00005631fbfc523c in qemu_laio_process_completions (s=s@entry=0x7fdfc0297670) at block/linux-aio.c:222 222 qemu_laio_process_completion(laiocb); (gdb) p s $2 = (LinuxAioState *) 0x7fdfc0297670 (gdb) p *s $3 = {aio_context = 0x5631fde0e660, ctx = 0x7fdfeb43b000, e = {rfd = 33, wfd = 33}, io_q = {plugged = 0, in_queue = 0, in_flight = 4294967280, blocked = false, pending = {sqh_first = 0x0, sqh_last = 0x7fdfc0297698}}, completion_bh = 0x7fdfc0280ef0, event_idx = 21, event_max = 241} (gdb) p/x s->io_q.in_flight $4 = 0xfffffff0 Signed-off-by: Sergio Lopez Signed-off-by: Kevin Wolf --- block/linux-aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/linux-aio.c b/block/linux-aio.c index 19eb922..217ce60 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -234,9 +234,9 @@ static void qemu_laio_process_completions(LinuxAioState *s) static void qemu_laio_process_completions_and_submit(LinuxAioState *s) { + aio_context_acquire(s->aio_context); qemu_laio_process_completions(s); - aio_context_acquire(s->aio_context); if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { ioq_submit(s); } -- cgit v1.1 From 8961be33e8ca7e809c603223803ea66ef7ea5be7 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 6 Sep 2018 17:25:41 +0300 Subject: block: Fix use after free error in bdrv_open_inherit() When a block device is opened with BDRV_O_SNAPSHOT and the bdrv_append_temp_snapshot() call fails then the error code path tries to unref the already destroyed 'options' QDict. This can be reproduced easily by setting TMPDIR to a location where the QEMU process can't write: $ TMPDIR=/nonexistent $QEMU -drive driver=null-co,snapshot=on Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf --- block.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block.c b/block.c index 0dbb1fc..a381c8e 100644 --- a/block.c +++ b/block.c @@ -2792,6 +2792,7 @@ static BlockDriverState *bdrv_open_inherit(const char *filename, bdrv_parent_cb_change_media(bs, true); qobject_unref(options); + options = NULL; /* For snapshot=on, create a temporary qcow2 overlay. bs points to the * temporary snapshot afterwards. */ -- cgit v1.1 From 6a7014ef22ad3cf9110ca0e178f73a67a6483e00 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 10 Sep 2018 12:29:23 +0300 Subject: qemu-iotests: Test snapshot=on with nonexistent TMPDIR We just fixed a bug that was causing a use-after-free when QEMU was unable to create a temporary snapshot. This is a test case for this scenario. Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf --- tests/qemu-iotests/051 | 3 +++ tests/qemu-iotests/051.out | 3 +++ tests/qemu-iotests/051.pc.out | 3 +++ 3 files changed, 9 insertions(+) diff --git a/tests/qemu-iotests/051 b/tests/qemu-iotests/051 index ee9c820..25d3b2d 100755 --- a/tests/qemu-iotests/051 +++ b/tests/qemu-iotests/051 @@ -354,6 +354,9 @@ printf %b "qemu-io $device_id \"write -P 0x33 0 4k\"\ncommit $device_id\n" | $QEMU_IO -c "read -P 0x33 0 4k" "$TEST_IMG" | _filter_qemu_io +# Using snapshot=on with a non-existent TMPDIR +TMPDIR=/nonexistent run_qemu -drive driver=null-co,snapshot=on + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/051.out b/tests/qemu-iotests/051.out index b727350..793af2a 100644 --- a/tests/qemu-iotests/051.out +++ b/tests/qemu-iotests/051.out @@ -455,4 +455,7 @@ wrote 4096/4096 bytes at offset 0 read 4096/4096 bytes at offset 0 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Testing: -drive driver=null-co,snapshot=on +QEMU_PROG: -drive driver=null-co,snapshot=on: Could not get temporary filename: No such file or directory + *** done diff --git a/tests/qemu-iotests/051.pc.out b/tests/qemu-iotests/051.pc.out index e9257fe..ca64eda 100644 --- a/tests/qemu-iotests/051.pc.out +++ b/tests/qemu-iotests/051.pc.out @@ -527,4 +527,7 @@ wrote 4096/4096 bytes at offset 0 read 4096/4096 bytes at offset 0 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Testing: -drive driver=null-co,snapshot=on +QEMU_PROG: -drive driver=null-co,snapshot=on: Could not get temporary filename: No such file or directory + *** done -- cgit v1.1 From 6808ae0417131f8dbe7b051256dff7a16634dc1d Mon Sep 17 00:00:00 2001 From: Sergio Lopez Date: Wed, 5 Sep 2018 11:33:51 +0200 Subject: util/async: use qemu_aio_coroutine_enter in co_schedule_bh_cb AIO Coroutines shouldn't by managed by an AioContext different than the one assigned when they are created. aio_co_enter avoids entering a coroutine from a different AioContext, calling aio_co_schedule instead. Scheduled coroutines are then entered by co_schedule_bh_cb using qemu_coroutine_enter, which just calls qemu_aio_coroutine_enter with the current AioContext obtained with qemu_get_current_aio_context. Eventually, co->ctx will be set to the AioContext passed as an argument to qemu_aio_coroutine_enter. This means that, if an IO Thread's AioConext is being processed by the Main Thread (due to aio_poll being called with a BDS AioContext, as it happens in AIO_WAIT_WHILE among other places), the AioContext from some coroutines may be wrongly replaced with the one from the Main Thread. This is the root cause behind some crashes, mainly triggered by the drain code at block/io.c. The most common are these abort and failed assertion: util/async.c:aio_co_schedule 456 if (scheduled) { 457 fprintf(stderr, 458 "%s: Co-routine was already scheduled in '%s'\n", 459 __func__, scheduled); 460 abort(); 461 } util/qemu-coroutine-lock.c: 286 assert(mutex->holder == self); But it's also known to cause random errors at different locations, and even SIGSEGV with broken coroutine backtraces. By using qemu_aio_coroutine_enter directly in co_schedule_bh_cb, we can pass the correct AioContext as an argument, making sure co->ctx is not wrongly altered. Signed-off-by: Sergio Lopez Signed-off-by: Kevin Wolf --- util/async.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/async.c b/util/async.c index 05979f8..c10642a 100644 --- a/util/async.c +++ b/util/async.c @@ -400,7 +400,7 @@ static void co_schedule_bh_cb(void *opaque) /* Protected by write barrier in qemu_aio_coroutine_enter */ atomic_set(&co->scheduled, NULL); - qemu_coroutine_enter(co); + qemu_aio_coroutine_enter(ctx, co); aio_context_release(ctx); } } -- cgit v1.1 From 49880165a44f26dc84651858750facdee31f2513 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 24 Aug 2018 10:43:42 +0800 Subject: job: Fix nested aio_poll() hanging in job_txn_apply All callers have acquired ctx already. Doing that again results in aio_poll() hang. This fixes the problem that a BDRV_POLL_WHILE() in the callback cannot make progress because ctx is recursively locked, for example, when drive-backup finishes. There are two callers of job_finalize(): fam@lemon:~/work/qemu [master]$ git grep -w -A1 '^\s*job_finalize' blockdev.c: job_finalize(&job->job, errp); blockdev.c- aio_context_release(aio_context); -- job-qmp.c: job_finalize(job, errp); job-qmp.c- aio_context_release(aio_context); -- tests/test-blockjob.c: job_finalize(&job->job, &error_abort); tests/test-blockjob.c- assert(job->job.status == JOB_STATUS_CONCLUDED); Ignoring the test, it's easy to see both callers to job_finalize (and job_do_finalize) have acquired the context. Cc: qemu-stable@nongnu.org Reported-by: Gu Nini Reviewed-by: Eric Blake Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- job.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/job.c b/job.c index 192f168..3dd31a5 100644 --- a/job.c +++ b/job.c @@ -136,21 +136,13 @@ static void job_txn_del_job(Job *job) } } -static int job_txn_apply(JobTxn *txn, int fn(Job *), bool lock) +static int job_txn_apply(JobTxn *txn, int fn(Job *)) { - AioContext *ctx; Job *job, *next; int rc = 0; QLIST_FOREACH_SAFE(job, &txn->jobs, txn_list, next) { - if (lock) { - ctx = job->aio_context; - aio_context_acquire(ctx); - } rc = fn(job); - if (lock) { - aio_context_release(ctx); - } if (rc) { break; } @@ -780,11 +772,11 @@ static void job_do_finalize(Job *job) assert(job && job->txn); /* prepare the transaction to complete */ - rc = job_txn_apply(job->txn, job_prepare, true); + rc = job_txn_apply(job->txn, job_prepare); if (rc) { job_completed_txn_abort(job); } else { - job_txn_apply(job->txn, job_finalize_single, true); + job_txn_apply(job->txn, job_finalize_single); } } @@ -830,10 +822,10 @@ static void job_completed_txn_success(Job *job) assert(other_job->ret == 0); } - job_txn_apply(txn, job_transition_to_pending, false); + job_txn_apply(txn, job_transition_to_pending); /* If no jobs need manual finalization, automatically do so */ - if (job_txn_apply(txn, job_needs_finalize, false) == 0) { + if (job_txn_apply(txn, job_needs_finalize) == 0) { job_do_finalize(job); } } -- cgit v1.1 From d1756c780b7879fb64e41135feac781d84a1f995 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 12 Sep 2018 13:50:38 +0200 Subject: job: Fix missing locking due to mismerge job_completed() had a problem with double locking that was recently fixed independently by two different commits: "job: Fix nested aio_poll() hanging in job_txn_apply" "jobs: add exit shim" One fix removed the first aio_context_acquire(), the other fix removed the other one. Now we have a bug again and the code is run without any locking. Add it back in one of the places. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz Reviewed-by: John Snow --- job.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/job.c b/job.c index 3dd31a5..d17b1c8 100644 --- a/job.c +++ b/job.c @@ -847,7 +847,11 @@ static void job_completed(Job *job) static void job_exit(void *opaque) { Job *job = (Job *)opaque; + AioContext *ctx = job->aio_context; + + aio_context_acquire(ctx); job_completed(job); + aio_context_release(ctx); } /** -- cgit v1.1 From 34dc97b9a0e592bc466bdb0bbfe45d77304a72b6 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 14:53:05 +0200 Subject: blockjob: Wake up BDS when job becomes idle In the context of draining a BDS, the .drained_poll callback of block jobs is called. If this returns true (i.e. there is still some activity pending), the drain operation may call aio_poll() with blocking=true to wait for completion. As soon as the pending activity is completed and the job finally arrives in a quiescent state (i.e. its coroutine either yields with busy=false or terminates), the block job must notify the aio_poll() loop to wake up, otherwise we get a deadlock if both are running in different threads. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz --- blockjob.c | 18 ++++++++++++++++++ include/block/blockjob.h | 13 +++++++++++++ include/qemu/job.h | 3 +++ job.c | 7 +++++++ 4 files changed, 41 insertions(+) diff --git a/blockjob.c b/blockjob.c index bf7ef48..58dbd87 100644 --- a/blockjob.c +++ b/blockjob.c @@ -221,6 +221,22 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, return 0; } +void block_job_wakeup_all_bdrv(BlockJob *job) +{ + GSList *l; + + for (l = job->nodes; l; l = l->next) { + BdrvChild *c = l->data; + bdrv_wakeup(c->bs); + } +} + +static void block_job_on_idle(Notifier *n, void *opaque) +{ + BlockJob *job = opaque; + block_job_wakeup_all_bdrv(job); +} + bool block_job_is_internal(BlockJob *job) { return (job->job.id == NULL); @@ -416,6 +432,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, job->finalize_completed_notifier.notify = block_job_event_completed; job->pending_notifier.notify = block_job_event_pending; job->ready_notifier.notify = block_job_event_ready; + job->idle_notifier.notify = block_job_on_idle; notifier_list_add(&job->job.on_finalize_cancelled, &job->finalize_cancelled_notifier); @@ -423,6 +440,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver, &job->finalize_completed_notifier); notifier_list_add(&job->job.on_pending, &job->pending_notifier); notifier_list_add(&job->job.on_ready, &job->ready_notifier); + notifier_list_add(&job->job.on_idle, &job->idle_notifier); error_setg(&job->blocker, "block device is in use by block job: %s", job_type_str(&job->job)); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 32c00b7..2290bbb 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -70,6 +70,9 @@ typedef struct BlockJob { /** Called when the job transitions to READY */ Notifier ready_notifier; + /** Called when the job coroutine yields or terminates */ + Notifier idle_notifier; + /** BlockDriverStates that are involved in this block job */ GSList *nodes; } BlockJob; @@ -119,6 +122,16 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, void block_job_remove_all_bdrv(BlockJob *job); /** + * block_job_wakeup_all_bdrv: + * @job: The block job + * + * Calls bdrv_wakeup() for all BlockDriverStates that have been added to the + * job. This function is to be called whenever child_job_drained_poll() would + * go from true to false to notify waiting drain requests. + */ +void block_job_wakeup_all_bdrv(BlockJob *job); + +/** * block_job_set_speed: * @job: The job to set the speed for. * @speed: The new value diff --git a/include/qemu/job.h b/include/qemu/job.h index 5cb0681..b4a784d 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -156,6 +156,9 @@ typedef struct Job { /** Notifiers called when the job transitions to READY */ NotifierList on_ready; + /** Notifiers called when the job coroutine yields or terminates */ + NotifierList on_idle; + /** Element of the list of jobs */ QLIST_ENTRY(Job) job_list; diff --git a/job.c b/job.c index d17b1c8..4812962 100644 --- a/job.c +++ b/job.c @@ -402,6 +402,11 @@ static void job_event_ready(Job *job) notifier_list_notify(&job->on_ready, job); } +static void job_event_idle(Job *job) +{ + notifier_list_notify(&job->on_idle, job); +} + void job_enter_cond(Job *job, bool(*fn)(Job *job)) { if (!job_started(job)) { @@ -447,6 +452,7 @@ static void coroutine_fn job_do_yield(Job *job, uint64_t ns) timer_mod(&job->sleep_timer, ns); } job->busy = false; + job_event_idle(job); job_unlock(); qemu_coroutine_yield(); @@ -865,6 +871,7 @@ static void coroutine_fn job_co_entry(void *opaque) assert(job && job->driver && job->driver->run); job_pause_point(job); job->ret = job->driver->run(job, &job->err); + job_event_idle(job); job->deferred_to_main_loop = true; aio_bh_schedule_oneshot(qemu_get_aio_context(), job_exit, job); } -- cgit v1.1 From 486574483aba988c83b20e7d3f1ccd50c4c333d8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 5 Sep 2018 18:14:17 +0200 Subject: aio-wait: Increase num_waiters even in home thread Even if AIO_WAIT_WHILE() is called in the home context of the AioContext, we still want to allow the condition to change depending on other threads as long as they kick the AioWait. Specfically block jobs can be running in an I/O thread and should then be able to kick a drain in the main loop context. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng --- include/block/aio-wait.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h index c85a62f..600fad1 100644 --- a/include/block/aio-wait.h +++ b/include/block/aio-wait.h @@ -76,6 +76,8 @@ typedef struct { bool waited_ = false; \ AioWait *wait_ = (wait); \ AioContext *ctx_ = (ctx); \ + /* Increment wait_->num_waiters before evaluating cond. */ \ + atomic_inc(&wait_->num_waiters); \ if (ctx_ && in_aio_context_home_thread(ctx_)) { \ while ((cond)) { \ aio_poll(ctx_, true); \ @@ -84,8 +86,6 @@ typedef struct { } else { \ assert(qemu_get_current_aio_context() == \ qemu_get_aio_context()); \ - /* Increment wait_->num_waiters before evaluating cond. */ \ - atomic_inc(&wait_->num_waiters); \ while ((cond)) { \ if (ctx_) { \ aio_context_release(ctx_); \ @@ -96,8 +96,8 @@ typedef struct { } \ waited_ = true; \ } \ - atomic_dec(&wait_->num_waiters); \ } \ + atomic_dec(&wait_->num_waiters); \ waited_; }) /** -- cgit v1.1 From f62c172959cd2b6de4dd8ba782e855d64d94764b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 4 Sep 2018 18:23:06 +0200 Subject: test-bdrv-drain: Drain with block jobs in an I/O thread This extends the existing drain test with a block job to include variants where the block job runs in a different AioContext. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng --- tests/test-bdrv-drain.c | 92 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 86 insertions(+), 6 deletions(-) diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index 89ac15e..57da22a 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -174,6 +174,28 @@ static void do_drain_end(enum drain_type drain_type, BlockDriverState *bs) } } +static void do_drain_begin_unlocked(enum drain_type drain_type, BlockDriverState *bs) +{ + if (drain_type != BDRV_DRAIN_ALL) { + aio_context_acquire(bdrv_get_aio_context(bs)); + } + do_drain_begin(drain_type, bs); + if (drain_type != BDRV_DRAIN_ALL) { + aio_context_release(bdrv_get_aio_context(bs)); + } +} + +static void do_drain_end_unlocked(enum drain_type drain_type, BlockDriverState *bs) +{ + if (drain_type != BDRV_DRAIN_ALL) { + aio_context_acquire(bdrv_get_aio_context(bs)); + } + do_drain_end(drain_type, bs); + if (drain_type != BDRV_DRAIN_ALL) { + aio_context_release(bdrv_get_aio_context(bs)); + } +} + static void test_drv_cb_common(enum drain_type drain_type, bool recursive) { BlockBackend *blk; @@ -785,11 +807,13 @@ BlockJobDriver test_job_driver = { }, }; -static void test_blockjob_common(enum drain_type drain_type) +static void test_blockjob_common(enum drain_type drain_type, bool use_iothread) { BlockBackend *blk_src, *blk_target; BlockDriverState *src, *target; BlockJob *job; + IOThread *iothread = NULL; + AioContext *ctx; int ret; src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR, @@ -797,21 +821,31 @@ static void test_blockjob_common(enum drain_type drain_type) blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); blk_insert_bs(blk_src, src, &error_abort); + if (use_iothread) { + iothread = iothread_new(); + ctx = iothread_get_aio_context(iothread); + blk_set_aio_context(blk_src, ctx); + } else { + ctx = qemu_get_aio_context(); + } + target = bdrv_new_open_driver(&bdrv_test, "target", BDRV_O_RDWR, &error_abort); blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); blk_insert_bs(blk_target, target, &error_abort); + aio_context_acquire(ctx); job = block_job_create("job0", &test_job_driver, NULL, src, 0, BLK_PERM_ALL, 0, 0, NULL, NULL, &error_abort); block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort); job_start(&job->job); + aio_context_release(ctx); g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_false(job->job.paused); g_assert_true(job->job.busy); /* We're in job_sleep_ns() */ - do_drain_begin(drain_type, src); + do_drain_begin_unlocked(drain_type, src); if (drain_type == BDRV_DRAIN_ALL) { /* bdrv_drain_all() drains both src and target */ @@ -822,7 +856,14 @@ static void test_blockjob_common(enum drain_type drain_type) g_assert_true(job->job.paused); g_assert_false(job->job.busy); /* The job is paused */ - do_drain_end(drain_type, src); + do_drain_end_unlocked(drain_type, src); + + if (use_iothread) { + /* paused is reset in the I/O thread, wait for it */ + while (job->job.paused) { + aio_poll(qemu_get_aio_context(), false); + } + } g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_false(job->job.paused); @@ -841,32 +882,64 @@ static void test_blockjob_common(enum drain_type drain_type) do_drain_end(drain_type, target); + if (use_iothread) { + /* paused is reset in the I/O thread, wait for it */ + while (job->job.paused) { + aio_poll(qemu_get_aio_context(), false); + } + } + g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_false(job->job.paused); g_assert_true(job->job.busy); /* We're in job_sleep_ns() */ + aio_context_acquire(ctx); ret = job_complete_sync(&job->job, &error_abort); g_assert_cmpint(ret, ==, 0); + if (use_iothread) { + blk_set_aio_context(blk_src, qemu_get_aio_context()); + } + aio_context_release(ctx); + blk_unref(blk_src); blk_unref(blk_target); bdrv_unref(src); bdrv_unref(target); + + if (iothread) { + iothread_join(iothread); + } } static void test_blockjob_drain_all(void) { - test_blockjob_common(BDRV_DRAIN_ALL); + test_blockjob_common(BDRV_DRAIN_ALL, false); } static void test_blockjob_drain(void) { - test_blockjob_common(BDRV_DRAIN); + test_blockjob_common(BDRV_DRAIN, false); } static void test_blockjob_drain_subtree(void) { - test_blockjob_common(BDRV_SUBTREE_DRAIN); + test_blockjob_common(BDRV_SUBTREE_DRAIN, false); +} + +static void test_blockjob_iothread_drain_all(void) +{ + test_blockjob_common(BDRV_DRAIN_ALL, true); +} + +static void test_blockjob_iothread_drain(void) +{ + test_blockjob_common(BDRV_DRAIN, true); +} + +static void test_blockjob_iothread_drain_subtree(void) +{ + test_blockjob_common(BDRV_SUBTREE_DRAIN, true); } @@ -1338,6 +1411,13 @@ int main(int argc, char **argv) g_test_add_func("/bdrv-drain/blockjob/drain_subtree", test_blockjob_drain_subtree); + g_test_add_func("/bdrv-drain/blockjob/iothread/drain_all", + test_blockjob_iothread_drain_all); + g_test_add_func("/bdrv-drain/blockjob/iothread/drain", + test_blockjob_iothread_drain); + g_test_add_func("/bdrv-drain/blockjob/iothread/drain_subtree", + test_blockjob_iothread_drain_subtree); + g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain); g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all); g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain); -- cgit v1.1 From 30c070a547322a5e41ce129d540bca3653b1a9c8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 17:29:08 +0200 Subject: test-blockjob: Acquire AioContext around job_cancel_sync() All callers in QEMU proper hold the AioContext lock when calling job_finish_sync(). test-blockjob should do the same when it calls the function indirectly through job_cancel_sync(). Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng --- include/qemu/job.h | 6 ++++++ tests/test-blockjob.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/include/qemu/job.h b/include/qemu/job.h index b4a784d..63c60ef 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -524,6 +524,8 @@ void job_user_cancel(Job *job, bool force, Error **errp); * * Returns the return value from the job if the job actually completed * during the call, or -ECANCELED if it was canceled. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_cancel_sync(Job *job); @@ -541,6 +543,8 @@ void job_cancel_sync_all(void); * function). * * Returns the return value from the job. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_complete_sync(Job *job, Error **errp); @@ -566,6 +570,8 @@ void job_dismiss(Job **job, Error **errp); * * Returns 0 if the job is successfully completed, -ECANCELED if the job was * cancelled before completing, and -errno in other error cases. + * + * Callers must hold the AioContext lock of job->aio_context. */ int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp); diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c index de4c1c2..652d1e8 100644 --- a/tests/test-blockjob.c +++ b/tests/test-blockjob.c @@ -223,6 +223,10 @@ static void cancel_common(CancelJob *s) BlockJob *job = &s->common; BlockBackend *blk = s->blk; JobStatus sts = job->job.status; + AioContext *ctx; + + ctx = job->job.aio_context; + aio_context_acquire(ctx); job_cancel_sync(&job->job); if (sts != JOB_STATUS_CREATED && sts != JOB_STATUS_CONCLUDED) { @@ -232,6 +236,8 @@ static void cancel_common(CancelJob *s) assert(job->job.status == JOB_STATUS_NULL); job_unref(&job->job); destroy_blk(blk); + + aio_context_release(ctx); } static void test_cancel_created(void) -- cgit v1.1 From de0fbe64806321fc3e6399bfab360553db87a41d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 14:58:49 +0200 Subject: job: Use AIO_WAIT_WHILE() in job_finish_sync() job_finish_sync() needs to release the AioContext lock of the job before calling aio_poll(). Otherwise, callbacks called by aio_poll() would possibly take the lock a second time and run into a deadlock with a nested AIO_WAIT_WHILE() call. Also, job_drain() without aio_poll() isn't necessarily enough to make progress on a job, it could depend on bottom halves to be executed. Combine both open-coded while loops into a single AIO_WAIT_WHILE() call that solves both of these problems. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz --- job.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/job.c b/job.c index 4812962..7ec8c3b 100644 --- a/job.c +++ b/job.c @@ -29,6 +29,7 @@ #include "qemu/job.h" #include "qemu/id.h" #include "qemu/main-loop.h" +#include "block/aio-wait.h" #include "trace-root.h" #include "qapi/qapi-events-job.h" @@ -962,6 +963,7 @@ void job_complete(Job *job, Error **errp) int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp) { Error *local_err = NULL; + AioWait dummy_wait = {}; int ret; job_ref(job); @@ -974,14 +976,10 @@ int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp) job_unref(job); return -EBUSY; } - /* job_drain calls job_enter, and it should be enough to induce progress - * until the job completes or moves to the main thread. */ - while (!job->deferred_to_main_loop && !job_is_completed(job)) { - job_drain(job); - } - while (!job_is_completed(job)) { - aio_poll(qemu_get_aio_context(), true); - } + + AIO_WAIT_WHILE(&dummy_wait, job->aio_context, + (job_drain(job), !job_is_completed(job))); + ret = (job_is_cancelled(job) && job->ret == 0) ? -ECANCELED : job->ret; job_unref(job); return ret; -- cgit v1.1 From ae23dde9dd486e57e152a0ebc9802caddedc45fc Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 6 Sep 2018 11:58:01 +0200 Subject: test-bdrv-drain: Test AIO_WAIT_WHILE() in completion callback This is a regression test for a deadlock that occurred in block job completion callbacks (via job_defer_to_main_loop) because the AioContext lock was taken twice: once in job_finish_sync() and then again in job_defer_to_main_loop_bh(). This would cause AIO_WAIT_WHILE() to hang. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng --- tests/test-bdrv-drain.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index 57da22a..c3c17b9 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -774,6 +774,15 @@ typedef struct TestBlockJob { bool should_complete; } TestBlockJob; +static int test_job_prepare(Job *job) +{ + TestBlockJob *s = container_of(job, TestBlockJob, common.job); + + /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */ + blk_flush(s->common.blk); + return 0; +} + static int coroutine_fn test_job_run(Job *job, Error **errp) { TestBlockJob *s = container_of(job, TestBlockJob, common.job); @@ -804,6 +813,7 @@ BlockJobDriver test_job_driver = { .drain = block_job_drain, .run = test_job_run, .complete = test_job_complete, + .prepare = test_job_prepare, }, }; -- cgit v1.1 From aa1361d54aac43094b98024b8b6c804eb6e41661 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 18:54:18 +0200 Subject: block: Add missing locking in bdrv_co_drain_bh_cb() bdrv_do_drained_begin/end() assume that they are called with the AioContext lock of bs held. If we call drain functions from a coroutine with the AioContext lock held, we yield and schedule a BH to move out of coroutine context. This means that the lock for the home context of the coroutine is released and must be re-acquired in the bottom half. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block/io.c | 15 +++++++++++++++ include/qemu/coroutine.h | 5 +++++ util/qemu-coroutine.c | 5 +++++ 3 files changed, 25 insertions(+) diff --git a/block/io.c b/block/io.c index 7100344..914ba78 100644 --- a/block/io.c +++ b/block/io.c @@ -288,6 +288,18 @@ static void bdrv_co_drain_bh_cb(void *opaque) BlockDriverState *bs = data->bs; if (bs) { + AioContext *ctx = bdrv_get_aio_context(bs); + AioContext *co_ctx = qemu_coroutine_get_aio_context(co); + + /* + * When the coroutine yielded, the lock for its home context was + * released, so we need to re-acquire it here. If it explicitly + * acquired a different context, the lock is still held and we don't + * want to lock it a second time (or AIO_WAIT_WHILE() would hang). + */ + if (ctx == co_ctx) { + aio_context_acquire(ctx); + } bdrv_dec_in_flight(bs); if (data->begin) { bdrv_do_drained_begin(bs, data->recursive, data->parent, @@ -296,6 +308,9 @@ static void bdrv_co_drain_bh_cb(void *opaque) bdrv_do_drained_end(bs, data->recursive, data->parent, data->ignore_bds_parents); } + if (ctx == co_ctx) { + aio_context_release(ctx); + } } else { assert(data->begin); bdrv_drain_all_begin(); diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 6f8a487..9801e7f 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -90,6 +90,11 @@ void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co); void coroutine_fn qemu_coroutine_yield(void); /** + * Get the AioContext of the given coroutine + */ +AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co); + +/** * Get the currently executing coroutine */ Coroutine *coroutine_fn qemu_coroutine_self(void); diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 1ba4191..2295928 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -198,3 +198,8 @@ bool qemu_coroutine_entered(Coroutine *co) { return co->caller; } + +AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co) +{ + return co->ctx; +} -- cgit v1.1 From fe5258a503a87e69be37c9ac48799e293809386e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 6 Sep 2018 17:43:49 +0200 Subject: block-backend: Add .drained_poll callback A bdrv_drain operation must ensure that all parents are quiesced, this includes BlockBackends. Otherwise, callbacks called by requests that are completed on the BDS layer, but not quite yet on the BlockBackend layer could still create new requests. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz --- block/block-backend.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/block-backend.c b/block/block-backend.c index 14a1b7a..4e7d08a 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -121,6 +121,7 @@ static void blk_root_inherit_options(int *child_flags, QDict *child_options, abort(); } static void blk_root_drained_begin(BdrvChild *child); +static bool blk_root_drained_poll(BdrvChild *child); static void blk_root_drained_end(BdrvChild *child); static void blk_root_change_media(BdrvChild *child, bool load); @@ -294,6 +295,7 @@ static const BdrvChildRole child_root = { .get_parent_desc = blk_root_get_parent_desc, .drained_begin = blk_root_drained_begin, + .drained_poll = blk_root_drained_poll, .drained_end = blk_root_drained_end, .activate = blk_root_activate, @@ -2189,6 +2191,13 @@ static void blk_root_drained_begin(BdrvChild *child) } } +static bool blk_root_drained_poll(BdrvChild *child) +{ + BlockBackend *blk = child->opaque; + assert(blk->quiesce_counter); + return !!blk->in_flight; +} + static void blk_root_drained_end(BdrvChild *child) { BlockBackend *blk = child->opaque; -- cgit v1.1 From 5ca9d21bd1c8eeb578d0964e31bd03d47c25773d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 7 Sep 2018 13:45:54 +0200 Subject: block-backend: Fix potential double blk_delete() blk_unref() first decreases the refcount of the BlockBackend and calls blk_delete() if the refcount reaches zero. Requests can still be in flight at this point, they are only drained during blk_delete(): At this point, arbitrary callbacks can run. If any callback takes a temporary BlockBackend reference, it will first increase the refcount to 1 and then decrease it to 0 again, triggering another blk_delete(). This will cause a use-after-free crash in the outer blk_delete(). Fix it by draining the BlockBackend before decreasing to refcount to 0. Assert in blk_ref() that it never takes the first refcount (which would mean that the BlockBackend is already being deleted). Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz --- block/block-backend.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/block-backend.c b/block/block-backend.c index 4e7d08a..f71fdb4 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -435,6 +435,7 @@ int blk_get_refcnt(BlockBackend *blk) */ void blk_ref(BlockBackend *blk) { + assert(blk->refcnt > 0); blk->refcnt++; } @@ -447,7 +448,13 @@ void blk_unref(BlockBackend *blk) { if (blk) { assert(blk->refcnt > 0); - if (!--blk->refcnt) { + if (blk->refcnt > 1) { + blk->refcnt--; + } else { + blk_drain(blk); + /* blk_drain() cannot resurrect blk, nobody held a reference */ + assert(blk->refcnt == 1); + blk->refcnt = 0; blk_delete(blk); } } -- cgit v1.1 From 46aaf2a566e364a62315219255099cbf1c9b990d Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 6 Sep 2018 17:47:22 +0200 Subject: block-backend: Decrease in_flight only after callback Request callbacks can do pretty much anything, including operations that will yield from the coroutine (such as draining the backend). In that case, a decreased in_flight would be visible to other code and could lead to a drain completing while the callback hasn't actually completed yet. Note that reordering these operations forbids calling drain directly inside an AIO callback. As Paolo explains, indirectly calling it is okay: - Calling it through a coroutine is okay, because then bdrv_drained_begin() goes through bdrv_co_yield_to_drain() and you have in_flight=2 when bdrv_co_yield_to_drain() yields, then soon in_flight=1 when the aio_co_wake() in the AIO callback completes, then in_flight=0 after the bottom half starts. - Calling it through a bottom half would be okay too, as long as the AIO callback remembers to do inc_in_flight/dec_in_flight just like bdrv_co_yield_to_drain() and bdrv_co_drain_bh_cb() do A few more important cases that come to mind: - A coroutine that yields because of I/O is okay, with a sequence similar to bdrv_co_yield_to_drain(). - A coroutine that yields with no I/O pending will correctly decrease in_flight to zero before yielding. - Calling more AIO from the callback won't overflow the counter just because of mutual recursion, because AIO functions always yield at least once before invoking the callback. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz Reviewed-by: Paolo Bonzini --- block/block-backend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/block-backend.c b/block/block-backend.c index f71fdb4..551e4e1 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1339,8 +1339,8 @@ static const AIOCBInfo blk_aio_em_aiocb_info = { static void blk_aio_complete(BlkAioEmAIOCB *acb) { if (acb->has_returned) { - blk_dec_in_flight(acb->rwco.blk); acb->common.cb(acb->common.opaque, acb->rwco.ret); + blk_dec_in_flight(acb->rwco.blk); qemu_aio_unref(acb); } } -- cgit v1.1 From b5a7a0573530698ee448b063ac01d485e30446bd Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 7 Sep 2018 15:31:22 +0200 Subject: blockjob: Lie better in child_job_drained_poll() Block jobs claim in .drained_poll() that they are in a quiescent state as soon as job->deferred_to_main_loop is true. This is obviously wrong, they still have a completion BH to run. We only get away with this because commit 91af091f923 added an unconditional aio_poll(false) to the drain functions, but this is bypassing the regular drain mechanisms. However, just removing this and telling that the job is still active doesn't work either: The completion callbacks themselves call drain functions (directly, or indirectly with bdrv_reopen), so they would deadlock then. As a better lie, tell that the job is active as long as the BH is pending, but falsely call it quiescent from the point in the BH when the completion callback is called. At this point, nested drain calls won't deadlock because they ignore the job, and outer drains will wait for the job to really reach a quiescent state because the callback is already running. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- blockjob.c | 2 +- include/qemu/job.h | 3 +++ job.c | 11 ++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/blockjob.c b/blockjob.c index 58dbd87..4d53422 100644 --- a/blockjob.c +++ b/blockjob.c @@ -164,7 +164,7 @@ static bool child_job_drained_poll(BdrvChild *c) /* An inactive or completed job doesn't have any pending requests. Jobs * with !job->busy are either already paused or have a pause point after * being reentered, so no job driver code will run before they pause. */ - if (!job->busy || job_is_completed(job) || job->deferred_to_main_loop) { + if (!job->busy || job_is_completed(job)) { return false; } diff --git a/include/qemu/job.h b/include/qemu/job.h index 63c60ef..9e7cd1e 100644 --- a/include/qemu/job.h +++ b/include/qemu/job.h @@ -76,6 +76,9 @@ typedef struct Job { * Set to false by the job while the coroutine has yielded and may be * re-entered by job_enter(). There may still be I/O or event loop activity * pending. Accessed under block_job_mutex (in blockjob.c). + * + * When the job is deferred to the main loop, busy is true as long as the + * bottom half is still pending. */ bool busy; diff --git a/job.c b/job.c index 7ec8c3b..518f603 100644 --- a/job.c +++ b/job.c @@ -857,7 +857,16 @@ static void job_exit(void *opaque) AioContext *ctx = job->aio_context; aio_context_acquire(ctx); + + /* This is a lie, we're not quiescent, but still doing the completion + * callbacks. However, completion callbacks tend to involve operations that + * drain block nodes, and if .drained_poll still returned true, we would + * deadlock. */ + job->busy = false; + job_event_idle(job); + job_completed(job); + aio_context_release(ctx); } @@ -872,8 +881,8 @@ static void coroutine_fn job_co_entry(void *opaque) assert(job && job->driver && job->driver->run); job_pause_point(job); job->ret = job->driver->run(job, &job->err); - job_event_idle(job); job->deferred_to_main_loop = true; + job->busy = true; aio_bh_schedule_oneshot(qemu_get_aio_context(), job_exit, job); } -- cgit v1.1 From 4cf077b59fc73eec29f8b7d082919dbb278bdc86 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 17 Aug 2018 14:58:49 +0200 Subject: block: Remove aio_poll() in bdrv_drain_poll variants bdrv_drain_poll_top_level() was buggy because it didn't release the AioContext lock of the node to be drained before calling aio_poll(). This way, callbacks called by aio_poll() would possibly take the lock a second time and run into a deadlock with a nested AIO_WAIT_WHILE() call. However, it turns out that the aio_poll() call isn't actually needed any more. It was introduced in commit 91af091f923, which is effectively reverted by this patch. The cases it was supposed to fix are now covered by bdrv_drain_poll(), which waits for block jobs to reach a quiescent state. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng Reviewed-by: Max Reitz --- block/io.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/block/io.c b/block/io.c index 914ba78..8b81ff3 100644 --- a/block/io.c +++ b/block/io.c @@ -268,10 +268,6 @@ bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive, BdrvChild *ignore_parent) { - /* Execute pending BHs first and check everything else only after the BHs - * have executed. */ - while (aio_poll(bs->aio_context, false)); - return bdrv_drain_poll(bs, recursive, ignore_parent, false); } @@ -511,10 +507,6 @@ static bool bdrv_drain_all_poll(void) BlockDriverState *bs = NULL; bool result = false; - /* Execute pending BHs first (may modify the graph) and check everything - * else only after the BHs have executed. */ - while (aio_poll(qemu_get_aio_context(), false)); - /* bdrv_drain_poll() can't make changes to the graph and we are holding the * main AioContext lock, so iterating bdrv_next_all_states() is safe. */ while ((bs = bdrv_next_all_states(bs))) { -- cgit v1.1 From ecc1a5c790cf2c7732cb9755ca388c2fe108d1a1 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 6 Sep 2018 12:28:10 +0200 Subject: test-bdrv-drain: Test nested poll in bdrv_drain_poll_top_level() This is a regression test for a deadlock that could occur in callbacks called from the aio_poll() in bdrv_drain_poll_top_level(). The AioContext lock wasn't released and therefore would be taken a second time in the callback. This would cause a possible AIO_WAIT_WHILE() in the callback to hang. Signed-off-by: Kevin Wolf Reviewed-by: Fam Zheng --- tests/test-bdrv-drain.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index c3c17b9..e105c0a 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -636,6 +636,17 @@ static void test_iothread_aio_cb(void *opaque, int ret) qemu_event_set(&done_event); } +static void test_iothread_main_thread_bh(void *opaque) +{ + struct test_iothread_data *data = opaque; + + /* Test that the AioContext is not yet locked in a random BH that is + * executed during drain, otherwise this would deadlock. */ + aio_context_acquire(bdrv_get_aio_context(data->bs)); + bdrv_flush(data->bs); + aio_context_release(bdrv_get_aio_context(data->bs)); +} + /* * Starts an AIO request on a BDS that runs in the AioContext of iothread 1. * The request involves a BH on iothread 2 before it can complete. @@ -705,6 +716,8 @@ static void test_iothread_common(enum drain_type drain_type, int drain_thread) aio_context_acquire(ctx_a); } + aio_bh_schedule_oneshot(ctx_a, test_iothread_main_thread_bh, &data); + /* The request is running on the IOThread a. Draining its block device * will make sure that it has completed as far as the BDS is concerned, * but the drain in this thread can continue immediately after -- cgit v1.1 From 644f3a29bd4974aefd46d2adb5062d86063c8a50 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 13 Sep 2018 14:35:27 +0200 Subject: job: Avoid deadlocks in job_completed_txn_abort() Amongst others, job_finalize_single() calls the .prepare/.commit/.abort callbacks of the individual job driver. Recently, their use was adapted for all block jobs so that they involve code calling AIO_WAIT_WHILE() now. Such code must be called under the AioContext lock for the respective job, but without holding any other AioContext lock. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- job.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/job.c b/job.c index 518f603..93aea79 100644 --- a/job.c +++ b/job.c @@ -718,6 +718,7 @@ static void job_cancel_async(Job *job, bool force) static void job_completed_txn_abort(Job *job) { + AioContext *outer_ctx = job->aio_context; AioContext *ctx; JobTxn *txn = job->txn; Job *other_job; @@ -731,23 +732,26 @@ static void job_completed_txn_abort(Job *job) txn->aborting = true; job_txn_ref(txn); - /* We are the first failed job. Cancel other jobs. */ - QLIST_FOREACH(other_job, &txn->jobs, txn_list) { - ctx = other_job->aio_context; - aio_context_acquire(ctx); - } + /* We can only hold the single job's AioContext lock while calling + * job_finalize_single() because the finalization callbacks can involve + * calls of AIO_WAIT_WHILE(), which could deadlock otherwise. */ + aio_context_release(outer_ctx); /* Other jobs are effectively cancelled by us, set the status for * them; this job, however, may or may not be cancelled, depending * on the caller, so leave it. */ QLIST_FOREACH(other_job, &txn->jobs, txn_list) { if (other_job != job) { + ctx = other_job->aio_context; + aio_context_acquire(ctx); job_cancel_async(other_job, false); + aio_context_release(ctx); } } while (!QLIST_EMPTY(&txn->jobs)) { other_job = QLIST_FIRST(&txn->jobs); ctx = other_job->aio_context; + aio_context_acquire(ctx); if (!job_is_completed(other_job)) { assert(job_is_cancelled(other_job)); job_finish_sync(other_job, NULL, NULL); @@ -756,6 +760,8 @@ static void job_completed_txn_abort(Job *job) aio_context_release(ctx); } + aio_context_acquire(outer_ctx); + job_txn_unref(txn); } -- cgit v1.1 From d49725af46a7710cde02cc120b7f1e485154b483 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 13 Sep 2018 14:39:14 +0200 Subject: test-bdrv-drain: AIO_WAIT_WHILE() in job .commit/.abort This adds tests for calling AIO_WAIT_WHILE() in the .commit and .abort callbacks. Both reasons why .abort could be called for a single job are tested: Either .run or .prepare could return an error. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- tests/test-bdrv-drain.c | 116 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 104 insertions(+), 12 deletions(-) diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index e105c0a..bbc56a0 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -784,6 +784,8 @@ static void test_iothread_drain_subtree(void) typedef struct TestBlockJob { BlockJob common; + int run_ret; + int prepare_ret; bool should_complete; } TestBlockJob; @@ -793,7 +795,23 @@ static int test_job_prepare(Job *job) /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */ blk_flush(s->common.blk); - return 0; + return s->prepare_ret; +} + +static void test_job_commit(Job *job) +{ + TestBlockJob *s = container_of(job, TestBlockJob, common.job); + + /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */ + blk_flush(s->common.blk); +} + +static void test_job_abort(Job *job) +{ + TestBlockJob *s = container_of(job, TestBlockJob, common.job); + + /* Provoke an AIO_WAIT_WHILE() call to verify there is no deadlock */ + blk_flush(s->common.blk); } static int coroutine_fn test_job_run(Job *job, Error **errp) @@ -809,7 +827,7 @@ static int coroutine_fn test_job_run(Job *job, Error **errp) job_pause_point(&s->common.job); } - return 0; + return s->run_ret; } static void test_job_complete(Job *job, Error **errp) @@ -827,14 +845,24 @@ BlockJobDriver test_job_driver = { .run = test_job_run, .complete = test_job_complete, .prepare = test_job_prepare, + .commit = test_job_commit, + .abort = test_job_abort, }, }; -static void test_blockjob_common(enum drain_type drain_type, bool use_iothread) +enum test_job_result { + TEST_JOB_SUCCESS, + TEST_JOB_FAIL_RUN, + TEST_JOB_FAIL_PREPARE, +}; + +static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, + enum test_job_result result) { BlockBackend *blk_src, *blk_target; BlockDriverState *src, *target; BlockJob *job; + TestBlockJob *tjob; IOThread *iothread = NULL; AioContext *ctx; int ret; @@ -858,9 +886,23 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread) blk_insert_bs(blk_target, target, &error_abort); aio_context_acquire(ctx); - job = block_job_create("job0", &test_job_driver, NULL, src, 0, BLK_PERM_ALL, - 0, 0, NULL, NULL, &error_abort); + tjob = block_job_create("job0", &test_job_driver, NULL, src, + 0, BLK_PERM_ALL, + 0, 0, NULL, NULL, &error_abort); + job = &tjob->common; block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, &error_abort); + + switch (result) { + case TEST_JOB_SUCCESS: + break; + case TEST_JOB_FAIL_RUN: + tjob->run_ret = -EIO; + break; + case TEST_JOB_FAIL_PREPARE: + tjob->prepare_ret = -EIO; + break; + } + job_start(&job->job); aio_context_release(ctx); @@ -918,7 +960,7 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread) aio_context_acquire(ctx); ret = job_complete_sync(&job->job, &error_abort); - g_assert_cmpint(ret, ==, 0); + g_assert_cmpint(ret, ==, (result == TEST_JOB_SUCCESS ? 0 : -EIO)); if (use_iothread) { blk_set_aio_context(blk_src, qemu_get_aio_context()); @@ -937,32 +979,68 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread) static void test_blockjob_drain_all(void) { - test_blockjob_common(BDRV_DRAIN_ALL, false); + test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_SUCCESS); } static void test_blockjob_drain(void) { - test_blockjob_common(BDRV_DRAIN, false); + test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_SUCCESS); } static void test_blockjob_drain_subtree(void) { - test_blockjob_common(BDRV_SUBTREE_DRAIN, false); + test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_SUCCESS); +} + +static void test_blockjob_error_drain_all(void) +{ + test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_RUN); + test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_FAIL_PREPARE); +} + +static void test_blockjob_error_drain(void) +{ + test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_RUN); + test_blockjob_common(BDRV_DRAIN, false, TEST_JOB_FAIL_PREPARE); +} + +static void test_blockjob_error_drain_subtree(void) +{ + test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_RUN); + test_blockjob_common(BDRV_SUBTREE_DRAIN, false, TEST_JOB_FAIL_PREPARE); } static void test_blockjob_iothread_drain_all(void) { - test_blockjob_common(BDRV_DRAIN_ALL, true); + test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_SUCCESS); } static void test_blockjob_iothread_drain(void) { - test_blockjob_common(BDRV_DRAIN, true); + test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_SUCCESS); } static void test_blockjob_iothread_drain_subtree(void) { - test_blockjob_common(BDRV_SUBTREE_DRAIN, true); + test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_SUCCESS); +} + +static void test_blockjob_iothread_error_drain_all(void) +{ + test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_RUN); + test_blockjob_common(BDRV_DRAIN_ALL, true, TEST_JOB_FAIL_PREPARE); +} + +static void test_blockjob_iothread_error_drain(void) +{ + test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_RUN); + test_blockjob_common(BDRV_DRAIN, true, TEST_JOB_FAIL_PREPARE); +} + +static void test_blockjob_iothread_error_drain_subtree(void) +{ + test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_RUN); + test_blockjob_common(BDRV_SUBTREE_DRAIN, true, TEST_JOB_FAIL_PREPARE); } @@ -1434,6 +1512,13 @@ int main(int argc, char **argv) g_test_add_func("/bdrv-drain/blockjob/drain_subtree", test_blockjob_drain_subtree); + g_test_add_func("/bdrv-drain/blockjob/error/drain_all", + test_blockjob_error_drain_all); + g_test_add_func("/bdrv-drain/blockjob/error/drain", + test_blockjob_error_drain); + g_test_add_func("/bdrv-drain/blockjob/error/drain_subtree", + test_blockjob_error_drain_subtree); + g_test_add_func("/bdrv-drain/blockjob/iothread/drain_all", test_blockjob_iothread_drain_all); g_test_add_func("/bdrv-drain/blockjob/iothread/drain", @@ -1441,6 +1526,13 @@ int main(int argc, char **argv) g_test_add_func("/bdrv-drain/blockjob/iothread/drain_subtree", test_blockjob_iothread_drain_subtree); + g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_all", + test_blockjob_iothread_error_drain_all); + g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain", + test_blockjob_iothread_error_drain); + g_test_add_func("/bdrv-drain/blockjob/iothread/error/drain_subtree", + test_blockjob_iothread_error_drain_subtree); + g_test_add_func("/bdrv-drain/deletion/drain", test_delete_by_drain); g_test_add_func("/bdrv-drain/detach/drain_all", test_detach_by_drain_all); g_test_add_func("/bdrv-drain/detach/drain", test_detach_by_drain); -- cgit v1.1 From 5599c162c3bec2bc8f0123e4d5802a70d9984b3b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 20 Sep 2018 15:51:21 +0200 Subject: test-bdrv-drain: Fix outdated comments Commit 89bd030533e changed the test case from using job_sleep_ns() to using qemu_co_sleep_ns() instead. Also, block_job_sleep_ns() became job_sleep_ns() in commit 5d43e86e11f. In both cases, some comments in the test case were not updated. Do that now. Reported-by: Max Reitz Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake --- tests/test-bdrv-drain.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index bbc56a0..f367e6c 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -820,9 +820,9 @@ static int coroutine_fn test_job_run(Job *job, Error **errp) job_transition_to_ready(&s->common.job); while (!s->should_complete) { - /* Avoid block_job_sleep_ns() because it marks the job as !busy. We - * want to emulate some actual activity (probably some I/O) here so - * that drain has to wait for this acitivity to stop. */ + /* Avoid job_sleep_ns() because it marks the job as !busy. We want to + * emulate some actual activity (probably some I/O) here so that drain + * has to wait for this activity to stop. */ qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); job_pause_point(&s->common.job); } @@ -908,7 +908,7 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_false(job->job.paused); - g_assert_true(job->job.busy); /* We're in job_sleep_ns() */ + g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */ do_drain_begin_unlocked(drain_type, src); @@ -956,7 +956,7 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_false(job->job.paused); - g_assert_true(job->job.busy); /* We're in job_sleep_ns() */ + g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */ aio_context_acquire(ctx); ret = job_complete_sync(&job->job, &error_abort); -- cgit v1.1 From cfe29d8294e06420e15d4938421ae006c8ac49e7 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 18 Sep 2018 17:09:16 +0200 Subject: block: Use a single global AioWait When draining a block node, we recurse to its parent and for subtree drains also to its children. A single AIO_WAIT_WHILE() is then used to wait for bdrv_drain_poll() to become true, which depends on all of the nodes we recursed to. However, if the respective child or parent becomes quiescent and calls bdrv_wakeup(), only the AioWait of the child/parent is checked, while AIO_WAIT_WHILE() depends on the AioWait of the original node. Fix this by using a single AioWait for all callers of AIO_WAIT_WHILE(). This may mean that the draining thread gets a few more unnecessary wakeups because an unrelated operation got completed, but we already wake it up when something _could_ have changed rather than only if it has certainly changed. Apart from that, drain is a slow path anyway. In theory it would be possible to use wakeups more selectively and still correctly, but the gains are likely not worth the additional complexity. In fact, this patch is a nice simplification for some places in the code. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Max Reitz --- block.c | 5 ----- block/block-backend.c | 11 ++++------- block/io.c | 7 ++----- blockjob.c | 13 +------------ include/block/aio-wait.h | 22 +++++++++++----------- include/block/block.h | 6 +----- include/block/block_int.h | 3 --- include/block/blockjob.h | 10 ---------- job.c | 3 +-- util/aio-wait.c | 11 ++++++----- 10 files changed, 26 insertions(+), 65 deletions(-) diff --git a/block.c b/block.c index a381c8e..c298ca6 100644 --- a/block.c +++ b/block.c @@ -4886,11 +4886,6 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs) return bs ? bs->aio_context : qemu_get_aio_context(); } -AioWait *bdrv_get_aio_wait(BlockDriverState *bs) -{ - return bs ? &bs->wait : NULL; -} - void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co) { aio_co_enter(bdrv_get_aio_context(bs), co); diff --git a/block/block-backend.c b/block/block-backend.c index 551e4e1..7b1ec50 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -88,7 +88,6 @@ struct BlockBackend { * Accessed with atomic ops. */ unsigned int in_flight; - AioWait wait; }; typedef struct BlockBackendAIOCB { @@ -1298,7 +1297,7 @@ static void blk_inc_in_flight(BlockBackend *blk) static void blk_dec_in_flight(BlockBackend *blk) { atomic_dec(&blk->in_flight); - aio_wait_kick(&blk->wait); + aio_wait_kick(); } static void error_callback_bh(void *opaque) @@ -1599,9 +1598,8 @@ void blk_drain(BlockBackend *blk) } /* We may have -ENOMEDIUM completions in flight */ - AIO_WAIT_WHILE(&blk->wait, - blk_get_aio_context(blk), - atomic_mb_read(&blk->in_flight) > 0); + AIO_WAIT_WHILE(blk_get_aio_context(blk), + atomic_mb_read(&blk->in_flight) > 0); if (bs) { bdrv_drained_end(bs); @@ -1620,8 +1618,7 @@ void blk_drain_all(void) aio_context_acquire(ctx); /* We may have -ENOMEDIUM completions in flight */ - AIO_WAIT_WHILE(&blk->wait, ctx, - atomic_mb_read(&blk->in_flight) > 0); + AIO_WAIT_WHILE(ctx, atomic_mb_read(&blk->in_flight) > 0); aio_context_release(ctx); } diff --git a/block/io.c b/block/io.c index 8b81ff3..bd9d688 100644 --- a/block/io.c +++ b/block/io.c @@ -38,8 +38,6 @@ /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) -static AioWait drain_all_aio_wait; - static void bdrv_parent_cb_resize(BlockDriverState *bs); static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags); @@ -557,7 +555,7 @@ void bdrv_drain_all_begin(void) } /* Now poll the in-flight requests */ - AIO_WAIT_WHILE(&drain_all_aio_wait, NULL, bdrv_drain_all_poll()); + AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll()); while ((bs = bdrv_next_all_states(bs))) { bdrv_drain_assert_idle(bs); @@ -713,8 +711,7 @@ void bdrv_inc_in_flight(BlockDriverState *bs) void bdrv_wakeup(BlockDriverState *bs) { - aio_wait_kick(bdrv_get_aio_wait(bs)); - aio_wait_kick(&drain_all_aio_wait); + aio_wait_kick(); } void bdrv_dec_in_flight(BlockDriverState *bs) diff --git a/blockjob.c b/blockjob.c index 4d53422..58de8cb 100644 --- a/blockjob.c +++ b/blockjob.c @@ -221,20 +221,9 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, return 0; } -void block_job_wakeup_all_bdrv(BlockJob *job) -{ - GSList *l; - - for (l = job->nodes; l; l = l->next) { - BdrvChild *c = l->data; - bdrv_wakeup(c->bs); - } -} - static void block_job_on_idle(Notifier *n, void *opaque) { - BlockJob *job = opaque; - block_job_wakeup_all_bdrv(job); + aio_wait_kick(); } bool block_job_is_internal(BlockJob *job) diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h index 600fad1..afd0ff7 100644 --- a/include/block/aio-wait.h +++ b/include/block/aio-wait.h @@ -30,14 +30,15 @@ /** * AioWait: * - * An object that facilitates synchronous waiting on a condition. The main - * loop can wait on an operation running in an IOThread as follows: + * An object that facilitates synchronous waiting on a condition. A single + * global AioWait object (global_aio_wait) is used internally. + * + * The main loop can wait on an operation running in an IOThread as follows: * - * AioWait *wait = ...; * AioContext *ctx = ...; * MyWork work = { .done = false }; * schedule_my_work_in_iothread(ctx, &work); - * AIO_WAIT_WHILE(wait, ctx, !work.done); + * AIO_WAIT_WHILE(ctx, !work.done); * * The IOThread must call aio_wait_kick() to notify the main loop when * work.done changes: @@ -46,7 +47,7 @@ * { * ... * work.done = true; - * aio_wait_kick(wait); + * aio_wait_kick(); * } */ typedef struct { @@ -54,9 +55,10 @@ typedef struct { unsigned num_waiters; } AioWait; +extern AioWait global_aio_wait; + /** * AIO_WAIT_WHILE: - * @wait: the aio wait object * @ctx: the aio context, or NULL if multiple aio contexts (for which the * caller does not hold a lock) are involved in the polling condition. * @cond: wait while this conditional expression is true @@ -72,9 +74,9 @@ typedef struct { * wait on conditions between two IOThreads since that could lead to deadlock, * go via the main loop instead. */ -#define AIO_WAIT_WHILE(wait, ctx, cond) ({ \ +#define AIO_WAIT_WHILE(ctx, cond) ({ \ bool waited_ = false; \ - AioWait *wait_ = (wait); \ + AioWait *wait_ = &global_aio_wait; \ AioContext *ctx_ = (ctx); \ /* Increment wait_->num_waiters before evaluating cond. */ \ atomic_inc(&wait_->num_waiters); \ @@ -102,14 +104,12 @@ typedef struct { /** * aio_wait_kick: - * @wait: the aio wait object that should re-evaluate its condition - * * Wake up the main thread if it is waiting on AIO_WAIT_WHILE(). During * synchronous operations performed in an IOThread, the main thread lets the * IOThread's event loop run, waiting for the operation to complete. A * aio_wait_kick() call will wake up the main thread. */ -void aio_wait_kick(AioWait *wait); +void aio_wait_kick(void); /** * aio_wait_bh_oneshot: diff --git a/include/block/block.h b/include/block/block.h index 4e0871a..4edc1e8 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -410,13 +410,9 @@ void bdrv_drain_all_begin(void); void bdrv_drain_all_end(void); void bdrv_drain_all(void); -/* Returns NULL when bs == NULL */ -AioWait *bdrv_get_aio_wait(BlockDriverState *bs); - #define BDRV_POLL_WHILE(bs, cond) ({ \ BlockDriverState *bs_ = (bs); \ - AIO_WAIT_WHILE(bdrv_get_aio_wait(bs_), \ - bdrv_get_aio_context(bs_), \ + AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \ cond); }) int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes); diff --git a/include/block/block_int.h b/include/block/block_int.h index 4000d2a..92ecbd8 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -794,9 +794,6 @@ struct BlockDriverState { unsigned int in_flight; unsigned int serialising_in_flight; - /* Kicked to signal main loop when a request completes. */ - AioWait wait; - /* counter for nested bdrv_io_plug. * Accessed with atomic ops. */ diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 2290bbb..ede0bd8 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -122,16 +122,6 @@ int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs, void block_job_remove_all_bdrv(BlockJob *job); /** - * block_job_wakeup_all_bdrv: - * @job: The block job - * - * Calls bdrv_wakeup() for all BlockDriverStates that have been added to the - * job. This function is to be called whenever child_job_drained_poll() would - * go from true to false to notify waiting drain requests. - */ -void block_job_wakeup_all_bdrv(BlockJob *job); - -/** * block_job_set_speed: * @job: The job to set the speed for. * @speed: The new value diff --git a/job.c b/job.c index 93aea79..c65e01b 100644 --- a/job.c +++ b/job.c @@ -978,7 +978,6 @@ void job_complete(Job *job, Error **errp) int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp) { Error *local_err = NULL; - AioWait dummy_wait = {}; int ret; job_ref(job); @@ -992,7 +991,7 @@ int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error **errp) return -EBUSY; } - AIO_WAIT_WHILE(&dummy_wait, job->aio_context, + AIO_WAIT_WHILE(job->aio_context, (job_drain(job), !job_is_completed(job))); ret = (job_is_cancelled(job) && job->ret == 0) ? -ECANCELED : job->ret; diff --git a/util/aio-wait.c b/util/aio-wait.c index b8a8f86..b487749 100644 --- a/util/aio-wait.c +++ b/util/aio-wait.c @@ -26,21 +26,22 @@ #include "qemu/main-loop.h" #include "block/aio-wait.h" +AioWait global_aio_wait; + static void dummy_bh_cb(void *opaque) { /* The point is to make AIO_WAIT_WHILE()'s aio_poll() return */ } -void aio_wait_kick(AioWait *wait) +void aio_wait_kick(void) { /* The barrier (or an atomic op) is in the caller. */ - if (atomic_read(&wait->num_waiters)) { + if (atomic_read(&global_aio_wait.num_waiters)) { aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); } } typedef struct { - AioWait wait; bool done; QEMUBHFunc *cb; void *opaque; @@ -54,7 +55,7 @@ static void aio_wait_bh(void *opaque) data->cb(data->opaque); data->done = true; - aio_wait_kick(&data->wait); + aio_wait_kick(); } void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) @@ -67,5 +68,5 @@ void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) assert(qemu_get_current_aio_context() == qemu_get_aio_context()); aio_bh_schedule_oneshot(ctx, aio_wait_bh, &data); - AIO_WAIT_WHILE(&data.wait, ctx, !data.done); + AIO_WAIT_WHILE(ctx, !data.done); } -- cgit v1.1 From d8b3afd597d54e496809b05ac39ac29a5799664f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 20 Sep 2018 17:39:13 +0200 Subject: test-bdrv-drain: Test draining job source child and parent For the block job drain test, don't only test draining the source and the target node, but create a backing chain for the source (source_backing <- source <- source_overlay) and test draining each of the nodes in it. When using iothreads, the source node (and therefore the job) is in a different AioContext than the drain, which happens from the main thread. This way, the main thread waits in AIO_WAIT_WHILE() for the iothread to make process and aio_wait_kick() is required to notify it. The test validates that calling bdrv_wakeup() for a child or a parent node will actually notify AIO_WAIT_WHILE() instead of letting it hang. Increase the sleep time a bit (to 1 ms) because the test case is racy and with the shorter sleep, it didn't reproduce the bug it is supposed to test for me under 'rr record -n'. This was because bdrv_drain_invoke_entry() (in the main thread) was only called after the job had already reached the pause point, so we got a bdrv_dec_in_flight() from the main thread and the additional aio_wait_kick() when the job becomes idle (that we really wanted to test here) wasn't even necessary any more to make progress. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Max Reitz --- tests/test-bdrv-drain.c | 77 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index f367e6c..c9f29c8 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -786,6 +786,7 @@ typedef struct TestBlockJob { BlockJob common; int run_ret; int prepare_ret; + bool running; bool should_complete; } TestBlockJob; @@ -818,12 +819,17 @@ static int coroutine_fn test_job_run(Job *job, Error **errp) { TestBlockJob *s = container_of(job, TestBlockJob, common.job); + /* We are running the actual job code past the pause point in + * job_co_entry(). */ + s->running = true; + job_transition_to_ready(&s->common.job); while (!s->should_complete) { /* Avoid job_sleep_ns() because it marks the job as !busy. We want to * emulate some actual activity (probably some I/O) here so that drain * has to wait for this activity to stop. */ - qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000); + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000); + job_pause_point(&s->common.job); } @@ -856,11 +862,19 @@ enum test_job_result { TEST_JOB_FAIL_PREPARE, }; -static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, - enum test_job_result result) +enum test_job_drain_node { + TEST_JOB_DRAIN_SRC, + TEST_JOB_DRAIN_SRC_CHILD, + TEST_JOB_DRAIN_SRC_PARENT, +}; + +static void test_blockjob_common_drain_node(enum drain_type drain_type, + bool use_iothread, + enum test_job_result result, + enum test_job_drain_node drain_node) { BlockBackend *blk_src, *blk_target; - BlockDriverState *src, *target; + BlockDriverState *src, *src_backing, *src_overlay, *target, *drain_bs; BlockJob *job; TestBlockJob *tjob; IOThread *iothread = NULL; @@ -869,8 +883,32 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, src = bdrv_new_open_driver(&bdrv_test, "source", BDRV_O_RDWR, &error_abort); + src_backing = bdrv_new_open_driver(&bdrv_test, "source-backing", + BDRV_O_RDWR, &error_abort); + src_overlay = bdrv_new_open_driver(&bdrv_test, "source-overlay", + BDRV_O_RDWR, &error_abort); + + bdrv_set_backing_hd(src_overlay, src, &error_abort); + bdrv_unref(src); + bdrv_set_backing_hd(src, src_backing, &error_abort); + bdrv_unref(src_backing); + blk_src = blk_new(BLK_PERM_ALL, BLK_PERM_ALL); - blk_insert_bs(blk_src, src, &error_abort); + blk_insert_bs(blk_src, src_overlay, &error_abort); + + switch (drain_node) { + case TEST_JOB_DRAIN_SRC: + drain_bs = src; + break; + case TEST_JOB_DRAIN_SRC_CHILD: + drain_bs = src_backing; + break; + case TEST_JOB_DRAIN_SRC_PARENT: + drain_bs = src_overlay; + break; + default: + g_assert_not_reached(); + } if (use_iothread) { iothread = iothread_new(); @@ -906,11 +944,21 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, job_start(&job->job); aio_context_release(ctx); + if (use_iothread) { + /* job_co_entry() is run in the I/O thread, wait for the actual job + * code to start (we don't want to catch the job in the pause point in + * job_co_entry(). */ + while (!tjob->running) { + aio_poll(qemu_get_aio_context(), false); + } + } + g_assert_cmpint(job->job.pause_count, ==, 0); g_assert_false(job->job.paused); + g_assert_true(tjob->running); g_assert_true(job->job.busy); /* We're in qemu_co_sleep_ns() */ - do_drain_begin_unlocked(drain_type, src); + do_drain_begin_unlocked(drain_type, drain_bs); if (drain_type == BDRV_DRAIN_ALL) { /* bdrv_drain_all() drains both src and target */ @@ -921,7 +969,7 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, g_assert_true(job->job.paused); g_assert_false(job->job.busy); /* The job is paused */ - do_drain_end_unlocked(drain_type, src); + do_drain_end_unlocked(drain_type, drain_bs); if (use_iothread) { /* paused is reset in the I/O thread, wait for it */ @@ -969,7 +1017,7 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, blk_unref(blk_src); blk_unref(blk_target); - bdrv_unref(src); + bdrv_unref(src_overlay); bdrv_unref(target); if (iothread) { @@ -977,6 +1025,19 @@ static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, } } +static void test_blockjob_common(enum drain_type drain_type, bool use_iothread, + enum test_job_result result) +{ + test_blockjob_common_drain_node(drain_type, use_iothread, result, + TEST_JOB_DRAIN_SRC); + test_blockjob_common_drain_node(drain_type, use_iothread, result, + TEST_JOB_DRAIN_SRC_CHILD); + if (drain_type == BDRV_SUBTREE_DRAIN) { + test_blockjob_common_drain_node(drain_type, use_iothread, result, + TEST_JOB_DRAIN_SRC_PARENT); + } +} + static void test_blockjob_drain_all(void) { test_blockjob_common(BDRV_DRAIN_ALL, false, TEST_JOB_SUCCESS); -- cgit v1.1