From 4be6a6d118123340f16fb8b3bf45220d0f8ed6d4 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 29 Jun 2018 18:01:31 +0200 Subject: block: Poll after drain on attaching a node Commit dcf94a23b1 ('block: Don't poll in parent drain callbacks') removed polling in bdrv_child_cb_drained_begin() on the grounds that the original bdrv_drain() already will poll and BdrvChildRole.drained_begin calls must not cause graph changes (and therefore must not call aio_poll() or the recursion through the graph will break. This reasoning is correct for calls through bdrv_do_drained_begin(). However, BdrvChildRole.drained_begin is also called when a node that is already in a drained section (i.e. bdrv_do_drained_begin() has already returned and therefore can't poll any more) is attached to a new parent. In this case, we must explicitly poll to have all requests completed before the drained new child can be attached to the parent. In bdrv_replace_child_noperm(), we know that we're not inside the recursion of bdrv_do_drained_begin() because graph changes are not allowed there, and bdrv_replace_child_noperm() is a graph change. The call of BdrvChildRole.drained_begin() must therefore be followed by a BDRV_POLL_WHILE() that waits for the completion of requests. Reported-by: Max Reitz Signed-off-by: Kevin Wolf --- block/io.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 1a2272f..038449f 100644 --- a/block/io.c +++ b/block/io.c @@ -52,9 +52,7 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore, if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { continue; } - if (c->role->drained_begin) { - c->role->drained_begin(c); - } + bdrv_parent_drained_begin_single(c, false); } } @@ -73,6 +71,14 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore, } } +static bool bdrv_parent_drained_poll_single(BdrvChild *c) +{ + if (c->role->drained_poll) { + return c->role->drained_poll(c); + } + return false; +} + static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, bool ignore_bds_parents) { @@ -83,14 +89,22 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore, if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) { continue; } - if (c->role->drained_poll) { - busy |= c->role->drained_poll(c); - } + busy |= bdrv_parent_drained_poll_single(c); } return busy; } +void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) +{ + if (c->role->drained_begin) { + c->role->drained_begin(c); + } + if (poll) { + BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c)); + } +} + static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src) { dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer); -- cgit v1.1 From b0ddcbbb36a66a605eb232b905cb49b1cc72e74e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 6 Jul 2018 18:41:07 +0200 Subject: block: Fix copy-on-read crash with partial final cluster If the virtual disk size isn't aligned to full clusters, bdrv_co_do_copy_on_readv() may get pnum == 0 before having the full cluster completed, which will let it run into an assertion failure: qemu-io: block/io.c:1203: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. Check for EOF, assert that we read at least as much as the read request originally wanted to have (which is true at EOF because otherwise bdrv_check_byte_request() would already have returned an error) and return success early even though we couldn't copy the full cluster. Signed-off-by: Kevin Wolf --- block/io.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/io.c b/block/io.c index 038449f..4c08311 100644 --- a/block/io.c +++ b/block/io.c @@ -1200,6 +1200,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, pnum = MIN(cluster_bytes, max_transfer); } + /* Stop at EOF if the image ends in the middle of the cluster */ + if (ret == 0 && pnum == 0) { + assert(progress >= bytes); + break; + } + assert(skip_bytes < pnum); if (ret <= 0) { -- cgit v1.1 From 999658a05e61a8d87b65827da665302bb44ce8c9 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:16 +0300 Subject: block/io: fix copy_range Here two things are fixed: 1. Architecture On each recursion step, we go to the child of src or dst, only for one of them. So, it's wrong to create tracked requests for both on each step. It leads to tracked requests duplication. 2. Wait for serializing requests on write path independently of BDRV_REQ_NO_SERIALISING Before commit 9ded4a01149 "backup: Use copy offloading", BDRV_REQ_NO_SERIALISING was used for only one case: read in copy-on-write operation during backup. Also, the flag was handled only on read path (in bdrv_co_preadv and bdrv_aligned_preadv). After 9ded4a01149, flag is used for not waiting serializing operations on backup target (in same case of copy-on-write operation). This behavior change is unsubstantiated and potentially dangerous, let's drop it and add additional asserts and documentation. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 4c08311..3a321d6 100644 --- a/block/io.c +++ b/block/io.c @@ -1592,6 +1592,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); + /* BDRV_REQ_NO_SERIALISING is only for read operation */ + assert(!(flags & BDRV_REQ_NO_SERIALISING)); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); assert(req->overlap_offset <= offset); @@ -2916,7 +2918,7 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, BdrvRequestFlags flags, bool recurse_src) { - BdrvTrackedRequest src_req, dst_req; + BdrvTrackedRequest req; int ret; if (!dst || !dst->bs) { @@ -2943,32 +2945,42 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, || src->bs->encrypted || dst->bs->encrypted) { return -ENOTSUP; } - bdrv_inc_in_flight(src->bs); - bdrv_inc_in_flight(dst->bs); - tracked_request_begin(&src_req, src->bs, src_offset, - bytes, BDRV_TRACKED_READ); - tracked_request_begin(&dst_req, dst->bs, dst_offset, - bytes, BDRV_TRACKED_WRITE); - if (!(flags & BDRV_REQ_NO_SERIALISING)) { - wait_serialising_requests(&src_req); - wait_serialising_requests(&dst_req); - } if (recurse_src) { + bdrv_inc_in_flight(src->bs); + tracked_request_begin(&req, src->bs, src_offset, bytes, + BDRV_TRACKED_READ); + + if (!(flags & BDRV_REQ_NO_SERIALISING)) { + wait_serialising_requests(&req); + } + ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, src, src_offset, dst, dst_offset, bytes, flags); + + tracked_request_end(&req); + bdrv_dec_in_flight(src->bs); } else { + bdrv_inc_in_flight(dst->bs); + tracked_request_begin(&req, dst->bs, dst_offset, bytes, + BDRV_TRACKED_WRITE); + + /* BDRV_REQ_NO_SERIALISING is only for read operation, + * so we ignore it in flags. + */ + wait_serialising_requests(&req); + ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, src, src_offset, dst, dst_offset, bytes, flags); + + tracked_request_end(&req); + bdrv_dec_in_flight(dst->bs); } - tracked_request_end(&src_req); - tracked_request_end(&dst_req); - bdrv_dec_in_flight(src->bs); - bdrv_dec_in_flight(dst->bs); + return ret; } -- cgit v1.1 From 67b51fb998c697afb5d744066fcbde53e04fe941 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:17 +0300 Subject: block: split flags in copy_range Pass read flags and write flags separately. This is needed to handle coming BDRV_REQ_NO_SERIALISING clearly in following patches. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/backup.c | 2 +- block/block-backend.c | 5 +++-- block/file-posix.c | 21 +++++++++++++-------- block/io.c | 46 +++++++++++++++++++++++++--------------------- block/iscsi.c | 9 ++++++--- block/qcow2.c | 20 +++++++++++--------- block/raw-format.c | 24 ++++++++++++++++-------- 7 files changed, 75 insertions(+), 52 deletions(-) (limited to 'block') diff --git a/block/backup.c b/block/backup.c index 81895dd..f3e4e81 100644 --- a/block/backup.c +++ b/block/backup.c @@ -163,7 +163,7 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, hbitmap_reset(job->copy_bitmap, start / job->cluster_size, nr_clusters); ret = blk_co_copy_range(blk, start, job->target, start, nbytes, - is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); + is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0, 0); if (ret < 0) { trace_backup_do_cow_copy_range_fail(job, start, ret); hbitmap_set(job->copy_bitmap, start / job->cluster_size, diff --git a/block/block-backend.c b/block/block-backend.c index 6b75bca..ac8c3e0 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2218,7 +2218,8 @@ void blk_unregister_buf(BlockBackend *blk, void *host) int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, BlockBackend *blk_out, int64_t off_out, - int bytes, BdrvRequestFlags flags) + int bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { int r; r = blk_check_byte_request(blk_in, off_in, bytes); @@ -2231,5 +2232,5 @@ int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in, } return bdrv_co_copy_range(blk_in->root, off_in, blk_out->root, off_out, - bytes, flags); + bytes, read_flags, write_flags); } diff --git a/block/file-posix.c b/block/file-posix.c index 98987b8..4fec8cb 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2589,18 +2589,23 @@ static void raw_abort_perm_update(BlockDriverState *bs) raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL); } -static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) +static int coroutine_fn raw_co_copy_range_from( + BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, + BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, + BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { - return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); } static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { BDRVRawState *s = bs->opaque; BDRVRawState *src_s; diff --git a/block/io.c b/block/io.c index 3a321d6..75ab26f 100644 --- a/block/io.c +++ b/block/io.c @@ -2910,13 +2910,11 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host) } } -static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, - uint64_t src_offset, - BdrvChild *dst, - uint64_t dst_offset, - uint64_t bytes, - BdrvRequestFlags flags, - bool recurse_src) +static int coroutine_fn bdrv_co_copy_range_internal( + BdrvChild *src, uint64_t src_offset, BdrvChild *dst, + uint64_t dst_offset, uint64_t bytes, + BdrvRequestFlags read_flags, BdrvRequestFlags write_flags, + bool recurse_src) { BdrvTrackedRequest req; int ret; @@ -2928,8 +2926,8 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, if (ret) { return ret; } - if (flags & BDRV_REQ_ZERO_WRITE) { - return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags); + if (write_flags & BDRV_REQ_ZERO_WRITE) { + return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags); } if (!src || !src->bs) { @@ -2951,14 +2949,15 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, tracked_request_begin(&req, src->bs, src_offset, bytes, BDRV_TRACKED_READ); - if (!(flags & BDRV_REQ_NO_SERIALISING)) { + if (!(read_flags & BDRV_REQ_NO_SERIALISING)) { wait_serialising_requests(&req); } ret = src->bs->drv->bdrv_co_copy_range_from(src->bs, src, src_offset, dst, dst_offset, - bytes, flags); + bytes, + read_flags, write_flags); tracked_request_end(&req); bdrv_dec_in_flight(src->bs); @@ -2967,15 +2966,15 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, tracked_request_begin(&req, dst->bs, dst_offset, bytes, BDRV_TRACKED_WRITE); - /* BDRV_REQ_NO_SERIALISING is only for read operation, - * so we ignore it in flags. - */ + /* BDRV_REQ_NO_SERIALISING is only for read operation */ + assert(!(write_flags & BDRV_REQ_NO_SERIALISING)); wait_serialising_requests(&req); ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, src, src_offset, dst, dst_offset, - bytes, flags); + bytes, + read_flags, write_flags); tracked_request_end(&req); bdrv_dec_in_flight(dst->bs); @@ -2990,10 +2989,12 @@ static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src, * semantics. */ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, - bytes, flags, true); + bytes, read_flags, write_flags, true); } /* Copy range from @src to @dst. @@ -3002,19 +3003,22 @@ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, * semantics. */ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, - bytes, flags, false); + bytes, read_flags, write_flags, false); } int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { return bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, - bytes, flags); + bytes, read_flags, write_flags); } static void bdrv_parent_cb_resize(BlockDriverState *bs) diff --git a/block/iscsi.c b/block/iscsi.c index ead2bd5..38b917a 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2193,9 +2193,11 @@ static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs, BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, - BdrvRequestFlags flags) + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { - return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags); + return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); } static struct scsi_task *iscsi_xcopy_task(int param_len) @@ -2332,7 +2334,8 @@ static int coroutine_fn iscsi_co_copy_range_to(BlockDriverState *bs, BdrvChild *dst, uint64_t dst_offset, uint64_t bytes, - BdrvRequestFlags flags) + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { IscsiLun *dst_lun = dst->bs->opaque; IscsiLun *src_lun; diff --git a/block/qcow2.c b/block/qcow2.c index 33b61b7..867ce02 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3253,13 +3253,14 @@ static int coroutine_fn qcow2_co_copy_range_from(BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { BDRVQcow2State *s = bs->opaque; int ret; unsigned int cur_bytes; /* number of bytes in current iteration */ BdrvChild *child = NULL; - BdrvRequestFlags cur_flags; + BdrvRequestFlags cur_write_flags; assert(!bs->encrypted); qemu_co_mutex_lock(&s->lock); @@ -3268,7 +3269,7 @@ qcow2_co_copy_range_from(BlockDriverState *bs, uint64_t copy_offset = 0; /* prepare next request */ cur_bytes = MIN(bytes, INT_MAX); - cur_flags = flags; + cur_write_flags = write_flags; ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, ©_offset); if (ret < 0) { @@ -3280,20 +3281,20 @@ qcow2_co_copy_range_from(BlockDriverState *bs, if (bs->backing && bs->backing->bs) { int64_t backing_length = bdrv_getlength(bs->backing->bs); if (src_offset >= backing_length) { - cur_flags |= BDRV_REQ_ZERO_WRITE; + cur_write_flags |= BDRV_REQ_ZERO_WRITE; } else { child = bs->backing; cur_bytes = MIN(cur_bytes, backing_length - src_offset); copy_offset = src_offset; } } else { - cur_flags |= BDRV_REQ_ZERO_WRITE; + cur_write_flags |= BDRV_REQ_ZERO_WRITE; } break; case QCOW2_CLUSTER_ZERO_PLAIN: case QCOW2_CLUSTER_ZERO_ALLOC: - cur_flags |= BDRV_REQ_ZERO_WRITE; + cur_write_flags |= BDRV_REQ_ZERO_WRITE; break; case QCOW2_CLUSTER_COMPRESSED: @@ -3317,7 +3318,7 @@ qcow2_co_copy_range_from(BlockDriverState *bs, ret = bdrv_co_copy_range_from(child, copy_offset, dst, dst_offset, - cur_bytes, cur_flags); + cur_bytes, read_flags, cur_write_flags); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto out; @@ -3338,7 +3339,8 @@ static int coroutine_fn qcow2_co_copy_range_to(BlockDriverState *bs, BdrvChild *src, uint64_t src_offset, BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + uint64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { BDRVQcow2State *s = bs->opaque; int offset_in_cluster; @@ -3382,7 +3384,7 @@ qcow2_co_copy_range_to(BlockDriverState *bs, ret = bdrv_co_copy_range_to(src, src_offset, bs->file, cluster_offset + offset_in_cluster, - cur_bytes, flags); + cur_bytes, read_flags, write_flags); qemu_co_mutex_lock(&s->lock); if (ret < 0) { goto fail; diff --git a/block/raw-format.c b/block/raw-format.c index b78da56..a359198 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -498,9 +498,13 @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) } static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { int ret; @@ -509,13 +513,17 @@ static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs, return ret; } return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset, - bytes, flags); + bytes, read_flags, write_flags); } static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, - BdrvChild *src, uint64_t src_offset, - BdrvChild *dst, uint64_t dst_offset, - uint64_t bytes, BdrvRequestFlags flags) + BdrvChild *src, + uint64_t src_offset, + BdrvChild *dst, + uint64_t dst_offset, + uint64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags) { int ret; @@ -524,7 +532,7 @@ static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs, return ret; } return bdrv_co_copy_range_to(src, src_offset, bs->file, dst_offset, bytes, - flags); + read_flags, write_flags); } BlockDriver bdrv_raw = { -- cgit v1.1 From 09d2f948462f4979d18f573a0734d1daae8e67a9 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:18 +0300 Subject: block: add BDRV_REQ_SERIALISING flag Serialized writes should be used in copy-on-write of backup(sync=none) for image fleecing scheme. We need to change an assert in bdrv_aligned_pwritev, added in 28de2dcd88de. The assert may fail now, because call to wait_serialising_requests here may become first call to it for this request with serializing flag set. It occurs if the request is aligned (otherwise, we should already set serializing flag before calling bdrv_aligned_pwritev and correspondingly waited for all intersecting requests). However, for aligned requests, we should not care about outdating of previously read data, as there no such data. Therefore, let's just update an assert to not care about aligned requests. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 75ab26f..6be9c40 100644 --- a/block/io.c +++ b/block/io.c @@ -637,6 +637,18 @@ static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); } +static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req) +{ + /* + * If the request is serialising, overlap_offset and overlap_bytes are set, + * so we can check if the request is aligned. Otherwise, don't care and + * return false. + */ + + return req->serialising && (req->offset == req->overlap_offset) && + (req->bytes == req->overlap_bytes); +} + /** * Round a region to cluster boundaries */ @@ -1311,6 +1323,9 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, mark_request_serialising(req, bdrv_get_cluster_size(bs)); } + /* BDRV_REQ_SERIALISING is only for write operation */ + assert(!(flags & BDRV_REQ_SERIALISING)); + if (!(flags & BDRV_REQ_NO_SERIALISING)) { wait_serialising_requests(req); } @@ -1594,8 +1609,14 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, /* BDRV_REQ_NO_SERIALISING is only for read operation */ assert(!(flags & BDRV_REQ_NO_SERIALISING)); + + if (flags & BDRV_REQ_SERIALISING) { + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); + assert(!waited || !req->serialising || + is_request_serialising_and_aligned(req)); assert(req->overlap_offset <= offset); assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); if (flags & BDRV_REQ_WRITE_UNCHANGED) { @@ -2949,6 +2970,8 @@ static int coroutine_fn bdrv_co_copy_range_internal( tracked_request_begin(&req, src->bs, src_offset, bytes, BDRV_TRACKED_READ); + /* BDRV_REQ_SERIALISING is only for write operation */ + assert(!(read_flags & BDRV_REQ_SERIALISING)); if (!(read_flags & BDRV_REQ_NO_SERIALISING)) { wait_serialising_requests(&req); } @@ -2968,6 +2991,9 @@ static int coroutine_fn bdrv_co_copy_range_internal( /* BDRV_REQ_NO_SERIALISING is only for read operation */ assert(!(write_flags & BDRV_REQ_NO_SERIALISING)); + if (write_flags & BDRV_REQ_SERIALISING) { + mark_request_serialising(&req, bdrv_get_cluster_size(dst->bs)); + } wait_serialising_requests(&req); ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, -- cgit v1.1 From f8d59dfb40bbc6f5aeea57c8aac1e68c1d2454ee Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Mon, 9 Jul 2018 19:37:19 +0300 Subject: block/backup: fix fleecing scheme: use serialized writes Fleecing scheme works as follows: we want a kind of temporary snapshot of active drive A. We create temporary image B, with B->backing = A. Then we start backup(sync=none) from A to B. From this point, B reads as point-in-time snapshot of A (A continues to be active drive, accepting guest IO). This scheme needs some additional synchronization between reads from B and backup COW operations, otherwise, the following situation is theoretically possible: (assume B is qcow2, client is NBD client, reading from B) 1. client starts reading and take qcow2 mutex in qcow2_co_preadv, and goes up to l2 table loading (assume cache miss) 2) guest write => backup COW => qcow2 write => try to take qcow2 mutex => waiting 3. l2 table loaded, we see that cluster is UNALLOCATED, go to "case QCOW2_CLUSTER_UNALLOCATED" and unlock mutex before bdrv_co_preadv(bs->backing, ...) 4) aha, mutex unlocked, backup COW continues, and we finally finish guest write and change cluster in our active disk A 5. actually, do bdrv_co_preadv(bs->backing, ...) and read _new updated_ data. To avoid this, let's make backup writes serializing, to not intersect with reads from B. Note: we expand range of handled cases from (sync=none and B->backing = A) to just (A in backing chain of B), to finally allow safe reading from B during backup for all cases when A in backing chain of B, i.e. B formally looks like point-in-time snapshot of A. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/backup.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/backup.c b/block/backup.c index f3e4e81..319fc92 100644 --- a/block/backup.c +++ b/block/backup.c @@ -47,6 +47,8 @@ typedef struct BackupBlockJob { HBitmap *copy_bitmap; bool use_copy_range; int64_t copy_range_size; + + bool serialize_target_writes; } BackupBlockJob; static const BlockJobDriver backup_job_driver; @@ -102,6 +104,8 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, QEMUIOVector qiov; BlockBackend *blk = job->common.blk; int nbytes; + int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0; + int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0; hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1); nbytes = MIN(job->cluster_size, job->len - start); @@ -112,8 +116,7 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, iov.iov_len = nbytes; qemu_iovec_init_external(&qiov, &iov, 1); - ret = blk_co_preadv(blk, start, qiov.size, &qiov, - is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0); + ret = blk_co_preadv(blk, start, qiov.size, &qiov, read_flags); if (ret < 0) { trace_backup_do_cow_read_fail(job, start, ret); if (error_is_read) { @@ -124,11 +127,11 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job, if (qemu_iovec_is_zero(&qiov)) { ret = blk_co_pwrite_zeroes(job->target, start, - qiov.size, BDRV_REQ_MAY_UNMAP); + qiov.size, write_flags | BDRV_REQ_MAY_UNMAP); } else { ret = blk_co_pwritev(job->target, start, - qiov.size, &qiov, - job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0); + qiov.size, &qiov, write_flags | + (job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0)); } if (ret < 0) { trace_backup_do_cow_write_fail(job, start, ret); @@ -156,6 +159,8 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, int nr_clusters; BlockBackend *blk = job->common.blk; int nbytes; + int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0; + int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0; assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size)); nbytes = MIN(job->copy_range_size, end - start); @@ -163,7 +168,7 @@ static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job, hbitmap_reset(job->copy_bitmap, start / job->cluster_size, nr_clusters); ret = blk_co_copy_range(blk, start, job->target, start, nbytes, - is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0, 0); + read_flags, write_flags); if (ret < 0) { trace_backup_do_cow_copy_range_fail(job, start, ret); hbitmap_set(job->copy_bitmap, start / job->cluster_size, @@ -701,6 +706,9 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs, sync_bitmap : NULL; job->compress = compress; + /* Detect image-fleecing (and similar) schemes */ + job->serialize_target_writes = bdrv_chain_contains(target, bs); + /* If there is no backing file on the target, we cannot rely on COW if our * backup cluster size is smaller than the target cluster size. Even for * targets with a backing file, try to avoid COW if possible. */ -- cgit v1.1 From ba814c82bbf0daf053c52dcdf0eb97b1f6ab0970 Mon Sep 17 00:00:00 2001 From: Ari Sundholm Date: Fri, 6 Jul 2018 15:00:38 +0300 Subject: block/blklogwrites: Make sure the log sector size is not too small The sector size needs to be large enough to accommodate the data structures for the log super block and log write entries. This was previously not properly checked, which made it possible to cause QEMU to badly misbehave. Signed-off-by: Ari Sundholm Signed-off-by: Kevin Wolf --- block/blklogwrites.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blklogwrites.c b/block/blklogwrites.c index 63bf6b3..efa2c7a 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c @@ -89,7 +89,10 @@ static inline uint32_t blk_log_writes_log2(uint32_t value) static inline bool blk_log_writes_sector_size_valid(uint32_t sector_size) { - return sector_size < (1ull << 24) && is_power_of_2(sector_size); + return is_power_of_2(sector_size) && + sector_size >= sizeof(struct log_write_super) && + sector_size >= sizeof(struct log_write_entry) && + sector_size < (1ull << 24); } static uint64_t blk_log_writes_find_cur_log_sector(BdrvChild *log, -- cgit v1.1 From 44e8b4689c6e3aba4df08a1201f02ac7bf3d2fdb Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 6 Jul 2018 15:06:18 +0200 Subject: Revert "block: Remove deprecated -drive option serial" This reverts commit b0083267444a5e0f28391f6c2831a539f878d424. Hold off removing this for one more QEMU release (current libvirt release still uses it.) Signed-off-by: Cornelia Huck Signed-off-by: Kevin Wolf --- block/block-backend.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/block-backend.c b/block/block-backend.c index ac8c3e0..e94e3d5 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -419,6 +419,7 @@ static void drive_info_del(DriveInfo *dinfo) return; } qemu_opts_del(dinfo->opts); + g_free(dinfo->serial); g_free(dinfo); } -- cgit v1.1 From f8a30874ca4dd6560b5827433f07877e60960946 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:15 +0800 Subject: block: Prefix file driver trace points with "file_" With in one module, trace points usually have a common prefix named after the module name. paio_submit and paio_submit_co are the only two trace points so far in the two file protocol drivers. As we are adding more, having a common prefix here is better so that trace points can be enabled with a glob. Rename them. Suggested-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/file-posix.c | 2 +- block/file-win32.c | 2 +- block/trace-events | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/file-posix.c b/block/file-posix.c index 4fec8cb..1185c7c 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1743,7 +1743,7 @@ static int paio_submit_co_full(BlockDriverState *bs, int fd, assert(qiov->size == bytes); } - trace_paio_submit_co(offset, bytes, type); + trace_file_paio_submit_co(offset, bytes, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_co(pool, aio_worker, acb); } diff --git a/block/file-win32.c b/block/file-win32.c index 0411fe8..f1e2187 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -162,7 +162,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, acb->aio_nbytes = count; acb->aio_offset = offset; - trace_paio_submit(acb, opaque, offset, count, type); + trace_file_paio_submit(acb, opaque, offset, count, type); pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } diff --git a/block/trace-events b/block/trace-events index c35287b..854d552 100644 --- a/block/trace-events +++ b/block/trace-events @@ -55,8 +55,8 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p" # block/file-win32.c # block/file-posix.c -paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" -paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" +file_paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" +file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" # block/qcow2.c qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" -- cgit v1.1 From ecc983a507bec9d3130434702d7031bfd372ba74 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:16 +0800 Subject: block: Add copy offloading trace points A few trace points that can help reveal what is happening in a copy offloading I/O path. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/file-posix.c | 2 ++ block/io.c | 4 ++++ block/iscsi.c | 3 +++ block/trace-events | 6 ++++++ 4 files changed, 15 insertions(+) (limited to 'block') diff --git a/block/file-posix.c b/block/file-posix.c index 1185c7c..09f6b93 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1488,6 +1488,8 @@ static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb) ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off, aiocb->aio_fd2, &out_off, bytes, 0); + trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off, + aiocb->aio_fd2, out_off, bytes, 0, ret); if (ret == 0) { /* No progress (e.g. when beyond EOF), let the caller fall back to * buffer I/O. */ diff --git a/block/io.c b/block/io.c index 6be9c40..9b2aa04 100644 --- a/block/io.c +++ b/block/io.c @@ -3019,6 +3019,8 @@ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { + trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, bytes, read_flags, write_flags, true); } @@ -3033,6 +3035,8 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset, BdrvRequestFlags read_flags, BdrvRequestFlags write_flags) { + trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, + read_flags, write_flags); return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset, bytes, read_flags, write_flags, false); } diff --git a/block/iscsi.c b/block/iscsi.c index 38b917a..bb69faf 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -44,6 +44,7 @@ #include "qapi/qmp/qstring.h" #include "crypto/secret.h" #include "scsi/utils.h" +#include "trace.h" /* Conflict between scsi/utils.h and libiscsi! :( */ #define SCSI_XFER_NONE ISCSI_XFER_NONE @@ -2399,6 +2400,8 @@ retry: } out_unlock: + + trace_iscsi_xcopy(src_lun, src_offset, dst_lun, dst_offset, bytes, r); g_free(iscsi_task.task); qemu_mutex_unlock(&dst_lun->mutex); g_free(iscsi_task.err_str); diff --git a/block/trace-events b/block/trace-events index 854d552..3e8c47b 100644 --- a/block/trace-events +++ b/block/trace-events @@ -15,6 +15,8 @@ bdrv_co_preadv(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs bdrv_co_pwritev(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) "bs %p offset %"PRId64" nbytes %"PRId64" flags 0x%x" bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags 0x%x" bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, int64_t cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %"PRId64 +bdrv_co_copy_range_from(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" +bdrv_co_copy_range_to(void *src, uint64_t src_offset, void *dst, uint64_t dst_offset, uint64_t bytes, int read_flags, int write_flags) "src %p offset %"PRIu64" dst %p offset %"PRIu64" bytes %"PRIu64" rw flags 0x%x 0x%x" # block/stream.c stream_one_iteration(void *s, int64_t offset, uint64_t bytes, int is_allocated) "s %p offset %" PRId64 " bytes %" PRIu64 " is_allocated %d" @@ -57,6 +59,7 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p" # block/file-posix.c file_paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" +file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 # block/qcow2.c qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" @@ -150,3 +153,6 @@ nvme_free_req_queue_wait(void *q) "q %p" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" + +# block/iscsi.c +iscsi_xcopy(void *src_lun, uint64_t src_off, void *dst_lun, uint64_t dst_off, uint64_t bytes, int ret) "src_lun %p offset %"PRIu64" dst_lun %p offset %"PRIu64" bytes %"PRIu64" ret %d" -- cgit v1.1 From 0b9fd3f467dc5ac041fa014cd28c949b25b87d25 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:17 +0800 Subject: block: Use BdrvChild to discard Other I/O functions are already using a BdrvChild pointer in the API, so make discard do the same. It makes it possible to initiate the same permission checks before doing I/O, and much easier to share the helper functions for this, which will be added and used by write, truncate and copy range paths. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/blkdebug.c | 2 +- block/blklogwrites.c | 2 +- block/blkreplay.c | 2 +- block/block-backend.c | 2 +- block/copy-on-read.c | 2 +- block/io.c | 18 +++++++++--------- block/mirror.c | 2 +- block/qcow2-refcount.c | 2 +- block/raw-format.c | 2 +- block/throttle.c | 2 +- 10 files changed, 18 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blkdebug.c b/block/blkdebug.c index 526af2a..0457bf5 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -625,7 +625,7 @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs, return err; } - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs, diff --git a/block/blklogwrites.c b/block/blklogwrites.c index efa2c7a..ff98cd5 100644 --- a/block/blklogwrites.c +++ b/block/blklogwrites.c @@ -486,7 +486,7 @@ static int coroutine_fn blk_log_writes_co_do_file_flush(BlkLogWritesFileReq *fr) static int coroutine_fn blk_log_writes_co_do_file_pdiscard(BlkLogWritesFileReq *fr) { - return bdrv_co_pdiscard(fr->bs->file->bs, fr->offset, fr->bytes); + return bdrv_co_pdiscard(fr->bs->file, fr->offset, fr->bytes); } static int coroutine_fn diff --git a/block/blkreplay.c b/block/blkreplay.c index b016dbe..766150a 100755 --- a/block/blkreplay.c +++ b/block/blkreplay.c @@ -113,7 +113,7 @@ static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { uint64_t reqid = blkreplay_next_id(); - int ret = bdrv_co_pdiscard(bs->file->bs, offset, bytes); + int ret = bdrv_co_pdiscard(bs->file, offset, bytes); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); diff --git a/block/block-backend.c b/block/block-backend.c index e94e3d5..f2f75a9 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1560,7 +1560,7 @@ int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes) return ret; } - return bdrv_co_pdiscard(blk_bs(blk), offset, bytes); + return bdrv_co_pdiscard(blk->root, offset, bytes); } int blk_co_flush(BlockBackend *blk) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 1dcdaee..a19164f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -116,7 +116,7 @@ static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) { - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } diff --git a/block/io.c b/block/io.c index 9b2aa04..fbcd933 100644 --- a/block/io.c +++ b/block/io.c @@ -2633,7 +2633,7 @@ int bdrv_flush(BlockDriverState *bs) } typedef struct DiscardCo { - BlockDriverState *bs; + BdrvChild *child; int64_t offset; int bytes; int ret; @@ -2642,17 +2642,17 @@ static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque) { DiscardCo *rwco = opaque; - rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes); + rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); } -int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, - int bytes) +int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) { BdrvTrackedRequest req; int max_pdiscard, ret; int head, tail, align; + BlockDriverState *bs = child->bs; - if (!bs->drv) { + if (!bs || !bs->drv) { return -ENOMEDIUM; } @@ -2763,11 +2763,11 @@ out: return ret; } -int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) +int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes) { Coroutine *co; DiscardCo rwco = { - .bs = bs, + .child = child, .offset = offset, .bytes = bytes, .ret = NOT_DONE, @@ -2778,8 +2778,8 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes) bdrv_pdiscard_co_entry(&rwco); } else { co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco); - bdrv_coroutine_enter(bs, co); - BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE); + bdrv_coroutine_enter(child->bs, co); + BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); } return rwco.ret; diff --git a/block/mirror.c b/block/mirror.c index 61bd9f3..b48c3f8 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1333,7 +1333,7 @@ static int coroutine_fn bdrv_mirror_top_do_write(BlockDriverState *bs, break; case MIRROR_METHOD_DISCARD: - ret = bdrv_co_pdiscard(bs->backing->bs, offset, bytes); + ret = bdrv_co_pdiscard(bs->backing, offset, bytes); break; default: diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 18c729a..4e1589a 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -734,7 +734,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) /* Discard is optional, ignore the return value */ if (ret >= 0) { - bdrv_pdiscard(bs->file->bs, d->offset, d->bytes); + bdrv_pdiscard(bs->file, d->offset, d->bytes); } g_free(d); diff --git a/block/raw-format.c b/block/raw-format.c index a359198..dee2628 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -297,7 +297,7 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs, if (ret) { return ret; } - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } static int64_t raw_getlength(BlockDriverState *bs) diff --git a/block/throttle.c b/block/throttle.c index f617f23..636c976 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -149,7 +149,7 @@ static int coroutine_fn throttle_co_pdiscard(BlockDriverState *bs, ThrottleGroupMember *tgm = bs->opaque; throttle_group_co_io_limits_intercept(tgm, bytes, true); - return bdrv_co_pdiscard(bs->file->bs, offset, bytes); + return bdrv_co_pdiscard(bs->file, offset, bytes); } static int throttle_co_flush(BlockDriverState *bs) -- cgit v1.1 From 22931a15336e8b7726965c699981fd108620014b Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:18 +0800 Subject: block: Use uint64_t for BdrvTrackedRequest byte fields This matches the types used for bytes in the rest parts of block layer. In the case of bdrv_co_truncate, new_bytes can be the image size which probably doesn't fit in a 32 bit int. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index fbcd933..6293612 100644 --- a/block/io.c +++ b/block/io.c @@ -601,9 +601,11 @@ static void tracked_request_end(BdrvTrackedRequest *req) static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, int64_t offset, - unsigned int bytes, + uint64_t bytes, enum BdrvTrackedRequestType type) { + assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes); + *req = (BdrvTrackedRequest){ .bs = bs, .offset = offset, @@ -625,7 +627,7 @@ static void tracked_request_begin(BdrvTrackedRequest *req, static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) { int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - overlap_offset; if (!req->serialising) { @@ -683,7 +685,7 @@ static int bdrv_get_cluster_size(BlockDriverState *bs) } static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) + int64_t offset, uint64_t bytes) { /* aaaa bbbb */ if (offset >= req->overlap_offset + req->overlap_bytes) { -- cgit v1.1 From 85fe24796ddf70f2b6a5045952826aedffa55ca2 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:19 +0800 Subject: block: Extract common write req handling As a mechanical refactoring patch, this is the first step towards unified and more correct write code paths. This is helpful because multiple BlockDriverState fields need to be updated after modifying image data, and it's hard to maintain in multiple places such as copy offload, discard and truncate. Suggested-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 91 +++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 6293612..313fe07 100644 --- a/block/io.c +++ b/block/io.c @@ -1575,6 +1575,61 @@ fail: return ret; } +static inline int coroutine_fn +bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, + BdrvTrackedRequest *req, int flags) +{ + BlockDriverState *bs = child->bs; + bool waited; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + + if (bs->read_only) { + return -EPERM; + } + + /* BDRV_REQ_NO_SERIALISING is only for read operation */ + assert(!(flags & BDRV_REQ_NO_SERIALISING)); + assert(!(bs->open_flags & BDRV_O_INACTIVE)); + assert((bs->open_flags & BDRV_O_NO_IO) == 0); + assert(!(flags & ~BDRV_REQ_MASK)); + + if (flags & BDRV_REQ_SERIALISING) { + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + waited = wait_serialising_requests(req); + + assert(!waited || !req->serialising || + is_request_serialising_and_aligned(req)); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + if (flags & BDRV_REQ_WRITE_UNCHANGED) { + assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); + } else { + assert(child->perm & BLK_PERM_WRITE); + } + assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); + return notifier_with_return_list_notify(&bs->before_write_notifiers, req); +} + +static inline void coroutine_fn +bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, + BdrvTrackedRequest *req, int ret) +{ + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + BlockDriverState *bs = child->bs; + + atomic_inc(&bs->write_gen); + bdrv_set_dirty(bs, offset, bytes); + + stat64_max(&bs->wr_highest_offset, offset + bytes); + + if (ret == 0) { + bs->total_sectors = MAX(bs->total_sectors, end_sector); + } +} + /* * Forwards an already correctly aligned write request to the BlockDriver, * after possibly fragmenting it. @@ -1585,10 +1640,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, { BlockDriverState *bs = child->bs; BlockDriver *drv = bs->drv; - bool waited; int ret; - int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); uint64_t bytes_remaining = bytes; int max_transfer; @@ -1604,31 +1657,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, assert((offset & (align - 1)) == 0); assert((bytes & (align - 1)) == 0); assert(!qiov || bytes == qiov->size); - assert((bs->open_flags & BDRV_O_NO_IO) == 0); - assert(!(flags & ~BDRV_REQ_MASK)); max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX), align); - /* BDRV_REQ_NO_SERIALISING is only for read operation */ - assert(!(flags & BDRV_REQ_NO_SERIALISING)); - - if (flags & BDRV_REQ_SERIALISING) { - mark_request_serialising(req, bdrv_get_cluster_size(bs)); - } - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising || - is_request_serialising_and_aligned(req)); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - if (flags & BDRV_REQ_WRITE_UNCHANGED) { - assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); - } else { - assert(child->perm & BLK_PERM_WRITE); - } - assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && @@ -1677,15 +1709,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, } bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - atomic_inc(&bs->write_gen); - bdrv_set_dirty(bs, offset, bytes); - - stat64_max(&bs->wr_highest_offset, offset + bytes); - if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, end_sector); ret = 0; } + bdrv_co_write_req_finish(child, offset, bytes, req, ret); return ret; } @@ -1800,10 +1827,6 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, if (!bs->drv) { return -ENOMEDIUM; } - if (bs->read_only) { - return -EPERM; - } - assert(!(bs->open_flags & BDRV_O_INACTIVE)); ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { -- cgit v1.1 From 7f8f03ef6d5c82fee1c22be06fc9de4a16ad7059 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:20 +0800 Subject: block: Fix handling of image enlarging write Two problems exist when a write request that enlarges the image (i.e. write beyond EOF) finishes: 1) parent is not notified about size change; 2) dirty bitmap is not resized although we try to set the dirty bits; Fix them just like how bdrv_co_truncate works. Reported-by: Kevin Wolf Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 313fe07..c88b2b4 100644 --- a/block/io.c +++ b/block/io.c @@ -40,6 +40,7 @@ static AioWait drain_all_aio_wait; +static void bdrv_parent_cb_resize(BlockDriverState *bs); static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, BdrvRequestFlags flags); @@ -1621,13 +1622,16 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, BlockDriverState *bs = child->bs; atomic_inc(&bs->write_gen); - bdrv_set_dirty(bs, offset, bytes); stat64_max(&bs->wr_highest_offset, offset + bytes); - if (ret == 0) { - bs->total_sectors = MAX(bs->total_sectors, end_sector); + if (ret == 0 && + end_sector > bs->total_sectors) { + bs->total_sectors = end_sector; + bdrv_parent_cb_resize(bs); + bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); } + bdrv_set_dirty(bs, offset, bytes); } /* -- cgit v1.1 From 00695c27a00a4b3333604aeac50c43269cc151a3 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:21 +0800 Subject: block: Use common req handling for discard Reuse the new bdrv_co_write_req_prepare/finish helpers. The variation here is that discard requests don't affect bs->wr_highest_offset, and it cannot extend the image. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index c88b2b4..72cfd9b 100644 --- a/block/io.c +++ b/block/io.c @@ -1623,15 +1623,32 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, atomic_inc(&bs->write_gen); - stat64_max(&bs->wr_highest_offset, offset + bytes); - + /* + * Discard cannot extend the image, but in error handling cases, such as + * when reverting a qcow2 cluster allocation, the discarded range can pass + * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD + * here. Instead, just skip it, since semantically a discard request + * beyond EOF cannot expand the image anyway. + */ if (ret == 0 && - end_sector > bs->total_sectors) { + end_sector > bs->total_sectors && + req->type != BDRV_TRACKED_DISCARD) { bs->total_sectors = end_sector; bdrv_parent_cb_resize(bs); bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); } - bdrv_set_dirty(bs, offset, bytes); + if (req->bytes) { + switch (req->type) { + case BDRV_TRACKED_WRITE: + stat64_max(&bs->wr_highest_offset, offset + bytes); + /* fall through, to set dirty bits */ + case BDRV_TRACKED_DISCARD: + bdrv_set_dirty(bs, offset, bytes); + break; + default: + break; + } + } } /* @@ -2692,10 +2709,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { return ret; - } else if (bs->read_only) { - return -EPERM; } - assert(!(bs->open_flags & BDRV_O_INACTIVE)); /* Do nothing if disabled. */ if (!(bs->open_flags & BDRV_O_UNMAP)) { @@ -2719,7 +2733,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) bdrv_inc_in_flight(bs); tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD); - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0); if (ret < 0) { goto out; } @@ -2785,8 +2799,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes) } ret = 0; out: - atomic_inc(&bs->write_gen); - bdrv_set_dirty(bs, req.offset, req.bytes); + bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret); tracked_request_end(&req); bdrv_dec_in_flight(bs); return ret; -- cgit v1.1 From 0eb1e891126f0cde52e88384696ad67076bdc341 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:22 +0800 Subject: block: Use common req handling in copy offloading This brings the request handling logic inline with write and discard, fixing write_gen, resize_cb, dirty bitmaps and image size refreshing. The last of these issues broke iotest case 222, which is now fixed. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 72cfd9b..2832214 100644 --- a/block/io.c +++ b/block/io.c @@ -3030,20 +3030,16 @@ static int coroutine_fn bdrv_co_copy_range_internal( bdrv_inc_in_flight(dst->bs); tracked_request_begin(&req, dst->bs, dst_offset, bytes, BDRV_TRACKED_WRITE); - - /* BDRV_REQ_NO_SERIALISING is only for read operation */ - assert(!(write_flags & BDRV_REQ_NO_SERIALISING)); - if (write_flags & BDRV_REQ_SERIALISING) { - mark_request_serialising(&req, bdrv_get_cluster_size(dst->bs)); - } - wait_serialising_requests(&req); - - ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, - src, src_offset, - dst, dst_offset, - bytes, - read_flags, write_flags); - + ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req, + write_flags); + if (!ret) { + ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs, + src, src_offset, + dst, dst_offset, + bytes, + read_flags, write_flags); + } + bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret); tracked_request_end(&req); bdrv_dec_in_flight(dst->bs); } -- cgit v1.1 From 5416a11eb55d46c376dde97c429b89e9b4e1a94f Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:23 +0800 Subject: block: Fix bdrv_co_truncate overlap check If we are growing the image and potentially using preallocation for the new area, we need to make sure that no write requests are made to the "preallocated" area which is [@old_size, @offset), not [@offset, offset * 2 - @old_size). Signed-off-by: Fam Zheng Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 2832214..77d38ca 100644 --- a/block/io.c +++ b/block/io.c @@ -3136,7 +3136,8 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, } bdrv_inc_in_flight(bs); - tracked_request_begin(&req, bs, offset, new_bytes, BDRV_TRACKED_TRUNCATE); + tracked_request_begin(&req, bs, offset - new_bytes, new_bytes, + BDRV_TRACKED_TRUNCATE); /* If we are growing the image and potentially using preallocation for the * new area, we need to make sure that no write requests are made to it -- cgit v1.1 From cd47d792d7a27a57f4b621e2ff1ed8f4e83de1e9 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Tue, 10 Jul 2018 14:31:24 +0800 Subject: block: Use common write req handling in truncate Truncation is the last to convert from open coded req handling to reusing helpers. This time the permission check in prepare has to adapt to the new caller: it checks a different permission bit, and doesn't trigger the before write notifier. Also, truncation should always trigger a bs->total_sectors update and in turn call parent resize_cb. Update the condition in finish helper, too. It's intended to do a duplicated bs->read_only check before calling bdrv_co_write_req_prepare() so that we can be more informative with the error message, as bdrv_co_write_req_prepare() doesn't have Error parameter. Signed-off-by: Fam Zheng Signed-off-by: Kevin Wolf --- block/io.c | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 77d38ca..7100344 100644 --- a/block/io.c +++ b/block/io.c @@ -1604,14 +1604,24 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, is_request_serialising_and_aligned(req)); assert(req->overlap_offset <= offset); assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); - if (flags & BDRV_REQ_WRITE_UNCHANGED) { - assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); - } else { - assert(child->perm & BLK_PERM_WRITE); + switch (req->type) { + case BDRV_TRACKED_WRITE: + case BDRV_TRACKED_DISCARD: + if (flags & BDRV_REQ_WRITE_UNCHANGED) { + assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); + } else { + assert(child->perm & BLK_PERM_WRITE); + } + return notifier_with_return_list_notify(&bs->before_write_notifiers, + req); + case BDRV_TRACKED_TRUNCATE: + assert(child->perm & BLK_PERM_RESIZE); + return 0; + default: + abort(); } - assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); - return notifier_with_return_list_notify(&bs->before_write_notifiers, req); } static inline void coroutine_fn @@ -1631,8 +1641,9 @@ bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes, * beyond EOF cannot expand the image anyway. */ if (ret == 0 && - end_sector > bs->total_sectors && - req->type != BDRV_TRACKED_DISCARD) { + (req->type == BDRV_TRACKED_TRUNCATE || + end_sector > bs->total_sectors) && + req->type != BDRV_TRACKED_DISCARD) { bs->total_sectors = end_sector; bdrv_parent_cb_resize(bs); bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS); @@ -3111,7 +3122,6 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, int64_t old_size, new_bytes; int ret; - assert(child->perm & BLK_PERM_RESIZE); /* if bs->drv == NULL, bs is closed, so there's nothing to do here */ if (!drv) { @@ -3144,7 +3154,18 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, * concurrently or they might be overwritten by preallocation. */ if (new_bytes) { mark_request_serialising(&req, 1); - wait_serialising_requests(&req); + } + if (bs->read_only) { + error_setg(errp, "Image is read-only"); + ret = -EACCES; + goto out; + } + ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req, + 0); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to prepare request for truncation"); + goto out; } if (!drv->bdrv_co_truncate) { @@ -3156,13 +3177,6 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, ret = -ENOTSUP; goto out; } - if (bs->read_only) { - error_setg(errp, "Image is read-only"); - ret = -EACCES; - goto out; - } - - assert(!(bs->open_flags & BDRV_O_INACTIVE)); ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp); if (ret < 0) { @@ -3174,9 +3188,10 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, } else { offset = bs->total_sectors * BDRV_SECTOR_SIZE; } - bdrv_dirty_bitmap_truncate(bs, offset); - bdrv_parent_cb_resize(bs); - atomic_inc(&bs->write_gen); + /* It's possible that truncation succeeded but refresh_total_sectors + * failed, but the latter doesn't affect how we should finish the request. + * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */ + bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0); out: tracked_request_end(&req); -- cgit v1.1