From 2e36da62cf36d83897f922969e4523317e5f6ad3 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 21 Oct 2020 17:58:40 +0300 Subject: block/io.c: drop assertion on double waiting for request serialisation The comments states, that on misaligned request we should have already been waiting. But for bdrv_padding_rmw_read, we called bdrv_mark_request_serialising with align = request_alignment, and now we serialise with align = cluster_size. So we may have to wait again with larger alignment. Note, that the only user of BDRV_REQ_SERIALISING is backup which issues cluster-aligned requests, so seems the assertion should not fire for now. But it's wrong anyway. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Paolo Bonzini Message-Id: <20201021145859.11201-3-vsementsov@virtuozzo.com> Signed-off-by: Max Reitz --- block/io.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 24205f5..2737010 100644 --- a/block/io.c +++ b/block/io.c @@ -1827,7 +1827,6 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, BdrvTrackedRequest *req, int flags) { BlockDriverState *bs = child->bs; - bool waited; int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); if (bs->read_only) { @@ -1839,15 +1838,7 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, assert(!(flags & ~BDRV_REQ_MASK)); if (flags & BDRV_REQ_SERIALISING) { - waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs)); - /* - * For a misaligned request we should have already waited earlier, - * because we come after bdrv_padding_rmw_read which must be called - * with the request already marked as serialising. - */ - assert(!waited || - (req->offset == req->overlap_offset && - req->bytes == req->overlap_bytes)); + bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs)); } else { bdrv_wait_serialising_requests(req); } -- cgit v1.1 From 3183937ff943f0bd0c43bb2625be1a040677ce25 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 21 Oct 2020 17:58:41 +0300 Subject: block/io: split out bdrv_find_conflicting_request To be reused in separate. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Stefan Hajnoczi Message-Id: <20201021145859.11201-4-vsementsov@virtuozzo.com> Signed-off-by: Max Reitz --- block/io.c | 71 ++++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 30 deletions(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index 2737010..640136b 100644 --- a/block/io.c +++ b/block/io.c @@ -754,43 +754,54 @@ static bool tracked_request_overlaps(BdrvTrackedRequest *req, return true; } +/* Called with self->bs->reqs_lock held */ +static BdrvTrackedRequest * +bdrv_find_conflicting_request(BdrvTrackedRequest *self) +{ + BdrvTrackedRequest *req; + + QLIST_FOREACH(req, &self->bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* + * Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* + * If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). 
+             */
+            if (!req->waiting_for) {
+                return req;
+            }
+        }
+    }
+
+    return NULL;
+}
+
 static bool coroutine_fn
 bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
                                       BdrvTrackedRequest *self)
 {
     BdrvTrackedRequest *req;
-    bool retry;
     bool waited = false;
 
-    do {
-        retry = false;
-        QLIST_FOREACH(req, &bs->tracked_requests, list) {
-            if (req == self || (!req->serialising && !self->serialising)) {
-                continue;
-            }
-            if (tracked_request_overlaps(req, self->overlap_offset,
-                                         self->overlap_bytes))
-            {
-                /* Hitting this means there was a reentrant request, for
-                 * example, a block driver issuing nested requests. This must
-                 * never happen since it means deadlock.
-                 */
-                assert(qemu_coroutine_self() != req->co);
-
-                /* If the request is already (indirectly) waiting for us, or
-                 * will wait for us as soon as it wakes up, then just go on
-                 * (instead of producing a deadlock in the former case). */
-                if (!req->waiting_for) {
-                    self->waiting_for = req;
-                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
-                    self->waiting_for = NULL;
-                    retry = true;
-                    waited = true;
-                    break;
-                }
-            }
-        }
-    } while (retry);
+    while ((req = bdrv_find_conflicting_request(self))) {
+        self->waiting_for = req;
+        qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
+        self->waiting_for = NULL;
+        waited = true;
+    }
+
     return waited;
 }
--
cgit v1.1


From ec1c8868316f9ef33baa695400cb13cf19d1dc78 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy
Date: Wed, 21 Oct 2020 17:58:42 +0300
Subject: block/io: bdrv_wait_serialising_requests_locked: drop extra bs arg

bs is linked in req, so there is no need to pass it separately. Most of
the tracked-requests API doesn't have a bs argument. Actually, after
this patch only tracked_request_begin has one, but that is intentional.

While at it, also add a comment about what "_locked" means.
Signed-off-by: Vladimir Sementsov-Ogievskiy
Reviewed-by: Stefan Hajnoczi
Message-Id: <20201021145859.11201-5-vsementsov@virtuozzo.com>
Signed-off-by: Max Reitz
---
 block/io.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/io.c b/block/io.c
index 640136b..d701537 100644
--- a/block/io.c
+++ b/block/io.c
@@ -788,16 +788,16 @@ bdrv_find_conflicting_request(BdrvTrackedRequest *self)
     return NULL;
 }
 
+/* Called with self->bs->reqs_lock held */
 static bool coroutine_fn
-bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
-                                      BdrvTrackedRequest *self)
+bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
 {
     BdrvTrackedRequest *req;
     bool waited = false;
 
     while ((req = bdrv_find_conflicting_request(self))) {
         self->waiting_for = req;
-        qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
+        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
         self->waiting_for = NULL;
         waited = true;
     }
@@ -821,7 +821,7 @@ bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 
-    waited = bdrv_wait_serialising_requests_locked(bs, req);
+    waited = bdrv_wait_serialising_requests_locked(req);
     qemu_co_mutex_unlock(&bs->reqs_lock);
     return waited;
 }
@@ -903,7 +903,7 @@ static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self
     }
 
     qemu_co_mutex_lock(&bs->reqs_lock);
-    waited = bdrv_wait_serialising_requests_locked(bs, self);
+    waited = bdrv_wait_serialising_requests_locked(self);
     qemu_co_mutex_unlock(&bs->reqs_lock);
 
     return waited;
--
cgit v1.1


From 8ac5aab255d5c55d21bb33f4f6dd6dc58319e512 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy
Date: Wed, 21 Oct 2020 17:58:43 +0300
Subject: block: bdrv_mark_request_serialising: split non-waiting function

We'll need a separate function that only "marks" a request as
serialising with a specified alignment but does not wait for
conflicting requests. It will be like the old
bdrv_mark_request_serialising(), before
bdrv_wait_serialising_requests_locked() was merged into it.

To reduce the possible mess, let's do the following: the public
function that does both marking and waiting will be called
bdrv_make_request_serialising(), and the private function that only
"marks" will be called tracked_request_set_serialising().
Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-Id: <20201021145859.11201-6-vsementsov@virtuozzo.com> Signed-off-by: Max Reitz --- block/file-posix.c | 2 +- block/io.c | 35 +++++++++++++++++++++++------------ 2 files changed, 24 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/file-posix.c b/block/file-posix.c index 9804681..00cdaaa 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2953,7 +2953,7 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int bytes, assert(bdrv_check_request(req->offset, req->bytes) == 0); - bdrv_mark_request_serialising(req, bs->bl.request_alignment); + bdrv_make_request_serialising(req, bs->bl.request_alignment); } #endif diff --git a/block/io.c b/block/io.c index d701537..fadfed3 100644 --- a/block/io.c +++ b/block/io.c @@ -805,15 +805,14 @@ bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self) return waited; } -bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +/* Called with req->bs->reqs_lock held */ +static void tracked_request_set_serialising(BdrvTrackedRequest *req, + uint64_t align) { - BlockDriverState *bs = req->bs; int64_t overlap_offset = req->offset & ~(align - 1); uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - overlap_offset; - bool waited; - qemu_co_mutex_lock(&bs->reqs_lock); if (!req->serialising) { qatomic_inc(&req->bs->serialising_in_flight); req->serialising = true; @@ -821,9 +820,6 @@ bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) req->overlap_offset = MIN(req->overlap_offset, overlap_offset); req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); - waited = bdrv_wait_serialising_requests_locked(req); - qemu_co_mutex_unlock(&bs->reqs_lock); - return waited; } /** @@ -909,6 +905,21 @@ static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self return waited; } +bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req, + uint64_t align) +{ + bool waited; + + qemu_co_mutex_lock(&req->bs->reqs_lock); + + tracked_request_set_serialising(req, align); + waited = bdrv_wait_serialising_requests_locked(req); + + qemu_co_mutex_unlock(&req->bs->reqs_lock); + + return waited; +} + int bdrv_check_request(int64_t offset, int64_t bytes) { if (offset < 0 || bytes < 0) { @@ -1434,7 +1445,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, * with each other for the same cluster. For example, in copy-on-read * it ensures that the CoR read and write operations are atomic and * guest writes cannot interleave between them. 
*/ - bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs)); + bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); } else { bdrv_wait_serialising_requests(req); } @@ -1849,7 +1860,7 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, assert(!(flags & ~BDRV_REQ_MASK)); if (flags & BDRV_REQ_SERIALISING) { - bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs)); + bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); } else { bdrv_wait_serialising_requests(req); } @@ -2015,7 +2026,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, padding = bdrv_init_padding(bs, offset, bytes, &pad); if (padding) { - bdrv_mark_request_serialising(req, align); + bdrv_make_request_serialising(req, align); bdrv_padding_rmw_read(child, req, &pad, true); @@ -2129,7 +2140,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child, } if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) { - bdrv_mark_request_serialising(&req, align); + bdrv_make_request_serialising(&req, align); bdrv_padding_rmw_read(child, &req, &pad, false); } @@ -3250,7 +3261,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, * new area, we need to make sure that no write requests are made to it * concurrently or they might be overwritten by preallocation. */ if (new_bytes) { - bdrv_mark_request_serialising(&req, 1); + bdrv_make_request_serialising(&req, 1); } if (bs->read_only) { error_setg(errp, "Image is read-only"); -- cgit v1.1 From d1a764d126aa9bd1b519855607f55daf266b07bf Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 21 Oct 2020 17:58:44 +0300 Subject: block: introduce BDRV_REQ_NO_WAIT flag Add flag to make serialising request no wait: if there are conflicting requests, just return error immediately. It's will be used in upcoming preallocate filter. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Max Reitz Message-Id: <20201021145859.11201-7-vsementsov@virtuozzo.com> Signed-off-by: Max Reitz --- block/io.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/io.c b/block/io.c index fadfed3..95b1c56 100644 --- a/block/io.c +++ b/block/io.c @@ -1858,9 +1858,18 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes, assert(!(bs->open_flags & BDRV_O_INACTIVE)); assert((bs->open_flags & BDRV_O_NO_IO) == 0); assert(!(flags & ~BDRV_REQ_MASK)); + assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING))); if (flags & BDRV_REQ_SERIALISING) { - bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs)); + QEMU_LOCK_GUARD(&bs->reqs_lock); + + tracked_request_set_serialising(req, bdrv_get_cluster_size(bs)); + + if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) { + return -EBUSY; + } + + bdrv_wait_serialising_requests_locked(req); } else { bdrv_wait_serialising_requests(req); } -- cgit v1.1 From 33fa2222eb044147e75e5ec395e1fd53328bc9fb Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Wed, 21 Oct 2020 17:58:46 +0300 Subject: block: introduce preallocate filter It's intended to be inserted between format and protocol nodes to preallocate additional space (expanding protocol file) on writes crossing EOF. It improves performance for file-systems with slow allocation. 
Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20201021145859.11201-9-vsementsov@virtuozzo.com> Reviewed-by: Max Reitz [mreitz: Two comment fixes, and bumped the version from 5.2 to 6.0] Signed-off-by: Max Reitz --- block/meson.build | 1 + block/preallocate.c | 559 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 560 insertions(+) create mode 100644 block/preallocate.c (limited to 'block') diff --git a/block/meson.build b/block/meson.build index 5dcc1e5..7595d86 100644 --- a/block/meson.build +++ b/block/meson.build @@ -12,6 +12,7 @@ block_ss.add(files( 'block-copy.c', 'commit.c', 'copy-on-read.c', + 'preallocate.c', 'create.c', 'crypto.c', 'dirty-bitmap.c', diff --git a/block/preallocate.c b/block/preallocate.c new file mode 100644 index 0000000..b619206 --- /dev/null +++ b/block/preallocate.c @@ -0,0 +1,559 @@ +/* + * preallocate filter driver + * + * The driver performs preallocate operation: it is injected above + * some node, and before each write over EOF it does additional preallocating + * write-zeroes request. + * + * Copyright (c) 2020 Virtuozzo International GmbH. + * + * Author: + * Sementsov-Ogievskiy Vladimir + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qemu/module.h" +#include "qemu/option.h" +#include "qemu/units.h" +#include "block/block_int.h" + + +typedef struct PreallocateOpts { + int64_t prealloc_size; + int64_t prealloc_align; +} PreallocateOpts; + +typedef struct BDRVPreallocateState { + PreallocateOpts opts; + + /* + * Track real data end, to crop preallocation on close. If < 0 the status is + * unknown. + * + * @data_end is a maximum of file size on open (or when we get write/resize + * permissions) and all write request ends after it. So it's safe to + * truncate to data_end if it is valid. + */ + int64_t data_end; + + /* + * Start of trailing preallocated area which reads as zero. May be smaller + * than data_end, if user does over-EOF write zero operation. If < 0 the + * status is unknown. + * + * If both @zero_start and @file_end are valid, the region + * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end + * is not valid, @zero_start doesn't make much sense. + */ + int64_t zero_start; + + /* + * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs), + * to avoid extra lseek() calls on each write operation. If < 0 the status + * is unknown. + */ + int64_t file_end; + + /* + * All three states @data_end, @zero_start and @file_end are guaranteed to + * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and + * BLK_PERM_WRITE permissions on file child. 
+ */ +} BDRVPreallocateState; + +#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" +#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" +static QemuOptsList runtime_opts = { + .name = "preallocate", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = PREALLOCATE_OPT_PREALLOC_ALIGN, + .type = QEMU_OPT_SIZE, + .help = "on preallocation, align file length to this number, " + "default 1M", + }, + { + .name = PREALLOCATE_OPT_PREALLOC_SIZE, + .type = QEMU_OPT_SIZE, + .help = "how much to preallocate, default 128M", + }, + { /* end of list */ } + }, +}; + +static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options, + BlockDriverState *child_bs, Error **errp) +{ + QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + + if (!qemu_opts_absorb_qdict(opts, options, errp)) { + return false; + } + + dest->prealloc_align = + qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); + dest->prealloc_size = + qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); + + qemu_opts_del(opts); + + if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) { + error_setg(errp, "prealloc-align parameter of preallocate filter " + "is not aligned to %llu", BDRV_SECTOR_SIZE); + return false; + } + + if (!QEMU_IS_ALIGNED(dest->prealloc_align, + child_bs->bl.request_alignment)) { + error_setg(errp, "prealloc-align parameter of preallocate filter " + "is not aligned to underlying node request alignment " + "(%" PRIi32 ")", child_bs->bl.request_alignment); + return false; + } + + return true; +} + +static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVPreallocateState *s = bs->opaque; + + /* + * s->data_end and friends should be initialized on permission update. + * For this to work, mark them invalid. + */ + s->file_end = s->zero_start = s->data_end = -EINVAL; + + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, + false, errp); + if (!bs->file) { + return -EINVAL; + } + + if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) { + return -EINVAL; + } + + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); + + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & + bs->file->bs->supported_zero_flags); + + return 0; +} + +static void preallocate_close(BlockDriverState *bs) +{ + int ret; + BDRVPreallocateState *s = bs->opaque; + + if (s->data_end < 0) { + return; + } + + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + return; + } + } + + if (s->data_end < s->file_end) { + ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, + NULL); + s->file_end = ret < 0 ? ret : s->data_end; + } +} + + +/* + * Handle reopen. + * + * We must implement reopen handlers, otherwise reopen just don't work. Handle + * new options and don't care about preallocation state, as it is handled in + * set/check permission handlers. 
+ */ + +static int preallocate_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + PreallocateOpts *opts = g_new0(PreallocateOpts, 1); + + if (!preallocate_absorb_opts(opts, reopen_state->options, + reopen_state->bs->file->bs, errp)) { + g_free(opts); + return -EINVAL; + } + + reopen_state->opaque = opts; + + return 0; +} + +static void preallocate_reopen_commit(BDRVReopenState *state) +{ + BDRVPreallocateState *s = state->bs->opaque; + + s->opts = *(PreallocateOpts *)state->opaque; + + g_free(state->opaque); + state->opaque = NULL; +} + +static void preallocate_reopen_abort(BDRVReopenState *state) +{ + g_free(state->opaque); + state->opaque = NULL; +} + +static coroutine_fn int preallocate_co_preadv_part( + BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, int flags) +{ + return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags); +} + +static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs, + int64_t offset, int bytes) +{ + return bdrv_co_pdiscard(bs->file, offset, bytes); +} + +static bool can_write_resize(uint64_t perm) +{ + return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE); +} + +static bool has_prealloc_perms(BlockDriverState *bs) +{ + BDRVPreallocateState *s = bs->opaque; + + if (can_write_resize(bs->file->perm)) { + assert(!(bs->file->shared_perm & BLK_PERM_WRITE)); + assert(!(bs->file->shared_perm & BLK_PERM_RESIZE)); + return true; + } + + assert(s->data_end < 0); + assert(s->zero_start < 0); + assert(s->file_end < 0); + return false; +} + +/* + * Call on each write. Returns true if @want_merge_zero is true and the region + * [offset, offset + bytes) is zeroed (as a result of this call or earlier + * preallocation). + * + * want_merge_zero is used to merge write-zero request with preallocation in + * one bdrv_co_pwrite_zeroes() call. + */ +static bool coroutine_fn handle_write(BlockDriverState *bs, int64_t offset, + int64_t bytes, bool want_merge_zero) +{ + BDRVPreallocateState *s = bs->opaque; + int64_t end = offset + bytes; + int64_t prealloc_start, prealloc_end; + int ret; + + if (!has_prealloc_perms(bs)) { + /* We don't have state neither should try to recover it */ + return false; + } + + if (s->data_end < 0) { + s->data_end = bdrv_getlength(bs->file->bs); + if (s->data_end < 0) { + return false; + } + + if (s->file_end < 0) { + s->file_end = s->data_end; + } + } + + if (end <= s->data_end) { + return false; + } + + /* We have valid s->data_end, and request writes beyond it. */ + + s->data_end = end; + if (s->zero_start < 0 || !want_merge_zero) { + s->zero_start = end; + } + + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + return false; + } + } + + /* Now s->data_end, s->zero_start and s->file_end are valid. */ + + if (end <= s->file_end) { + /* No preallocation needed. */ + return want_merge_zero && offset >= s->zero_start; + } + + /* Now we want new preallocation, as request writes beyond s->file_end. */ + + prealloc_start = want_merge_zero ? 
MIN(offset, s->file_end) : s->file_end; + prealloc_end = QEMU_ALIGN_UP(end + s->opts.prealloc_size, + s->opts.prealloc_align); + + ret = bdrv_co_pwrite_zeroes( + bs->file, prealloc_start, prealloc_end - prealloc_start, + BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); + if (ret < 0) { + s->file_end = ret; + return false; + } + + s->file_end = prealloc_end; + return want_merge_zero; +} + +static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int bytes, BdrvRequestFlags flags) +{ + bool want_merge_zero = + !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK)); + if (handle_write(bs, offset, bytes, want_merge_zero)) { + return 0; + } + + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); +} + +static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) +{ + handle_write(bs, offset, bytes, false); + + return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, + flags); +} + +static int coroutine_fn +preallocate_co_truncate(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp) +{ + ERRP_GUARD(); + BDRVPreallocateState *s = bs->opaque; + int ret; + + if (s->data_end >= 0 && offset > s->data_end) { + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + error_setg(errp, "failed to get file length"); + return s->file_end; + } + } + + if (prealloc == PREALLOC_MODE_FALLOC) { + /* + * If offset <= s->file_end, the task is already done, just + * update s->data_end, to move part of "filter preallocation" + * to "preallocation requested by user". + * Otherwise just proceed to preallocate missing part. + */ + if (offset <= s->file_end) { + s->data_end = offset; + return 0; + } + } else { + /* + * We have to drop our preallocation, to + * - avoid "Cannot use preallocation for shrinking files" in + * case of offset < file_end + * - give PREALLOC_MODE_OFF a chance to keep small disk + * usage + * - give PREALLOC_MODE_FULL a chance to actually write the + * whole region as user expects + */ + if (s->file_end > s->data_end) { + ret = bdrv_co_truncate(bs->file, s->data_end, true, + PREALLOC_MODE_OFF, 0, errp); + if (ret < 0) { + s->file_end = ret; + error_prepend(errp, "preallocate-filter: failed to drop " + "write-zero preallocation: "); + return ret; + } + s->file_end = s->data_end; + } + } + + s->data_end = offset; + } + + ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp); + if (ret < 0) { + s->file_end = s->zero_start = s->data_end = ret; + return ret; + } + + if (has_prealloc_perms(bs)) { + s->file_end = s->zero_start = s->data_end = offset; + } + return 0; +} + +static int coroutine_fn preallocate_co_flush(BlockDriverState *bs) +{ + return bdrv_co_flush(bs->file->bs); +} + +static int64_t preallocate_getlength(BlockDriverState *bs) +{ + int64_t ret; + BDRVPreallocateState *s = bs->opaque; + + if (s->data_end >= 0) { + return s->data_end; + } + + ret = bdrv_getlength(bs->file->bs); + + if (has_prealloc_perms(bs)) { + s->file_end = s->zero_start = s->data_end = ret; + } + + return ret; +} + +static int preallocate_check_perm(BlockDriverState *bs, + uint64_t perm, uint64_t shared, Error **errp) +{ + BDRVPreallocateState *s = bs->opaque; + + if (s->data_end >= 0 && !can_write_resize(perm)) { + /* + * Lose permissions. 
+ * We should truncate in check_perm, as in set_perm bs->file->perm will + * be already changed, and we should not violate it. + */ + if (s->file_end < 0) { + s->file_end = bdrv_getlength(bs->file->bs); + if (s->file_end < 0) { + error_setg(errp, "Failed to get file length"); + return s->file_end; + } + } + + if (s->data_end < s->file_end) { + int ret = bdrv_truncate(bs->file, s->data_end, true, + PREALLOC_MODE_OFF, 0, NULL); + if (ret < 0) { + error_setg(errp, "Failed to drop preallocation"); + s->file_end = ret; + return ret; + } + s->file_end = s->data_end; + } + } + + return 0; +} + +static void preallocate_set_perm(BlockDriverState *bs, + uint64_t perm, uint64_t shared) +{ + BDRVPreallocateState *s = bs->opaque; + + if (can_write_resize(perm)) { + if (s->data_end < 0) { + s->data_end = s->file_end = s->zero_start = + bdrv_getlength(bs->file->bs); + } + } else { + /* + * We drop our permissions, as well as allow shared + * permissions (see preallocate_child_perm), anyone will be able to + * change the child, so mark all states invalid. We'll regain control if + * get good permissions back. + */ + s->data_end = s->file_end = s->zero_start = -EINVAL; + } +} + +static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, + BdrvChildRole role, BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) +{ + bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); + + if (can_write_resize(perm)) { + /* This should come by default, but let's enforce: */ + *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE; + + /* + * Don't share, to keep our states s->file_end, s->data_end and + * s->zero_start valid. + */ + *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE); + } +} + +BlockDriver bdrv_preallocate_filter = { + .format_name = "preallocate", + .instance_size = sizeof(BDRVPreallocateState), + + .bdrv_getlength = preallocate_getlength, + .bdrv_open = preallocate_open, + .bdrv_close = preallocate_close, + + .bdrv_reopen_prepare = preallocate_reopen_prepare, + .bdrv_reopen_commit = preallocate_reopen_commit, + .bdrv_reopen_abort = preallocate_reopen_abort, + + .bdrv_co_preadv_part = preallocate_co_preadv_part, + .bdrv_co_pwritev_part = preallocate_co_pwritev_part, + .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, + .bdrv_co_pdiscard = preallocate_co_pdiscard, + .bdrv_co_flush = preallocate_co_flush, + .bdrv_co_truncate = preallocate_co_truncate, + + .bdrv_check_perm = preallocate_check_perm, + .bdrv_set_perm = preallocate_set_perm, + .bdrv_child_perm = preallocate_child_perm, + + .has_variable_length = true, + .is_filter = true, +}; + +static void bdrv_preallocate_init(void) +{ + bdrv_register(&bdrv_preallocate_filter); +} + +block_init(bdrv_preallocate_init); -- cgit v1.1 From ef9bba1484bb8fb5fda53a7bf90bf5e1a8e6a9f6 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Fri, 13 Nov 2020 17:52:31 +0100 Subject: quorum: Implement bdrv_co_block_status() The quorum driver does not implement bdrv_co_block_status() and because of that it always reports to contain data even if all its children are known to be empty. One consequence of this is that if we for example create a quorum with a size of 10GB and we mirror it to a new image the operation will write 10GB of actual zeroes to the destination image wasting a lot of time and disk space. 
Since a quorum has an arbitrary number of children of potentially different formats there is no way to report all possible allocation status flags in a way that makes sense, so this implementation only reports when a given region is known to contain zeroes (BDRV_BLOCK_ZERO) or not (BDRV_BLOCK_DATA). If all children agree that a region contains zeroes then we can return BDRV_BLOCK_ZERO using the smallest size reported by the children (because all agree that a region of at least that size contains zeroes). If at least one child disagrees we have to return BDRV_BLOCK_DATA. In this case we use the largest of the sizes reported by the children that didn't return BDRV_BLOCK_ZERO (because we know that there won't be an agreement for at least that size). Signed-off-by: Alberto Garcia Tested-by: Tao Xu Reviewed-by: Max Reitz Message-Id: Signed-off-by: Max Reitz --- block/quorum.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'block') diff --git a/block/quorum.c b/block/quorum.c index 4b08a19..ae62b20 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -18,6 +18,7 @@ #include "qemu/module.h" #include "qemu/option.h" #include "block/block_int.h" +#include "block/coroutines.h" #include "block/qdict.h" #include "qapi/error.h" #include "qapi/qapi-events-block.h" @@ -1179,6 +1180,56 @@ static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c, | DEFAULT_PERM_UNCHANGED; } +/* + * Each one of the children can report different status flags even + * when they contain the same data, so what this function does is + * return BDRV_BLOCK_ZERO if *all* children agree that a certain + * region contains zeroes, and BDRV_BLOCK_DATA otherwise. + */ +static int coroutine_fn quorum_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t count, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +{ + BDRVQuorumState *s = bs->opaque; + int i, ret; + int64_t pnum_zero = count; + int64_t pnum_data = 0; + + for (i = 0; i < s->num_children; i++) { + int64_t bytes; + ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false, + want_zero, offset, count, + &bytes, NULL, NULL, NULL); + if (ret < 0) { + quorum_report_bad(QUORUM_OP_TYPE_READ, offset, count, + s->children[i]->bs->node_name, ret); + pnum_data = count; + break; + } + /* + * Even if all children agree about whether there are zeroes + * or not at @offset they might disagree on the size, so use + * the smallest when reporting BDRV_BLOCK_ZERO and the largest + * when reporting BDRV_BLOCK_DATA. + */ + if (ret & BDRV_BLOCK_ZERO) { + pnum_zero = MIN(pnum_zero, bytes); + } else { + pnum_data = MAX(pnum_data, bytes); + } + } + + if (pnum_data) { + *pnum = pnum_data; + return BDRV_BLOCK_DATA; + } else { + *pnum = pnum_zero; + return BDRV_BLOCK_ZERO; + } +} + static const char *const quorum_strong_runtime_opts[] = { QUORUM_OPT_VOTE_THRESHOLD, QUORUM_OPT_BLKVERIFY, @@ -1197,6 +1248,7 @@ static BlockDriver bdrv_quorum = { .bdrv_close = quorum_close, .bdrv_gather_child_options = quorum_gather_child_options, .bdrv_dirname = quorum_dirname, + .bdrv_co_block_status = quorum_co_block_status, .bdrv_co_flush_to_disk = quorum_co_flush, -- cgit v1.1 From 5cddb2e95f8d9e9ee535964df4136b562ce268e1 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Fri, 13 Nov 2020 17:52:32 +0100 Subject: quorum: Implement bdrv_co_pwrite_zeroes() This simply calls bdrv_co_pwrite_zeroes() in all children. bs->supported_zero_flags is also set to the flags that are supported by all children. 
Signed-off-by: Alberto Garcia Message-Id: <2f09c842781fe336b4c2e40036bba577b7430190.1605286097.git.berto@igalia.com> Reviewed-by: Max Reitz Signed-off-by: Max Reitz --- block/quorum.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/quorum.c b/block/quorum.c index ae62b20..0bd7545 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -692,8 +692,13 @@ static void write_quorum_entry(void *opaque) QuorumChildRequest *sacb = &acb->qcrs[i]; sacb->bs = s->children[i]->bs; - sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, - acb->qiov, acb->flags); + if (acb->flags & BDRV_REQ_ZERO_WRITE) { + sacb->ret = bdrv_co_pwrite_zeroes(s->children[i], acb->offset, + acb->bytes, acb->flags); + } else { + sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, + acb->qiov, acb->flags); + } if (sacb->ret == 0) { acb->success_count++; } else { @@ -739,6 +744,14 @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, return ret; } +static int quorum_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int bytes, BdrvRequestFlags flags) + +{ + return quorum_co_pwritev(bs, offset, bytes, NULL, + flags | BDRV_REQ_ZERO_WRITE); +} + static int64_t quorum_getlength(BlockDriverState *bs) { BDRVQuorumState *s = bs->opaque; @@ -897,6 +910,21 @@ static QemuOptsList quorum_runtime_opts = { }, }; +static void quorum_refresh_flags(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + bs->supported_zero_flags = + BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; + + for (i = 0; i < s->num_children; i++) { + bs->supported_zero_flags &= s->children[i]->bs->supported_zero_flags; + } + + bs->supported_zero_flags |= BDRV_REQ_WRITE_UNCHANGED; +} + static int quorum_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -991,6 +1019,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, s->next_child_index = s->num_children; bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + quorum_refresh_flags(bs); g_free(opened); goto exit; @@ -1062,6 +1091,7 @@ static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, } s->children = g_renew(BdrvChild *, s->children, s->num_children + 1); s->children[s->num_children++] = child; + quorum_refresh_flags(bs); out: bdrv_drained_end(bs); @@ -1106,6 +1136,7 @@ static void quorum_del_child(BlockDriverState *bs, BdrvChild *child, s->children = g_renew(BdrvChild *, s->children, --s->num_children); bdrv_unref_child(bs, child); + quorum_refresh_flags(bs); bdrv_drained_end(bs); } @@ -1256,6 +1287,7 @@ static BlockDriver bdrv_quorum = { .bdrv_co_preadv = quorum_co_preadv, .bdrv_co_pwritev = quorum_co_pwritev, + .bdrv_co_pwrite_zeroes = quorum_co_pwrite_zeroes, .bdrv_add_child = quorum_add_child, .bdrv_del_child = quorum_del_child, -- cgit v1.1 From c8807c5edcc8bd8917a5b7531d47ef6a99e07bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 10 Dec 2020 13:52:02 +0100 Subject: block/nvme: Implement fake truncate() coroutine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVMe drive cannot be shrunk. Since commit c80d8b06cfa we can use the @exact parameter (set to false) to return success if the block device is larger than the requested offset (even if we can not be shrunk). 
Use this parameter to implement the NVMe truncate() coroutine,
similarly to how it is done for the iscsi and file-posix drivers
(see commit 82325ae5f2f "Evaluate @exact in protocol drivers").

Reported-by: Xueqiang Wei
Suggested-by: Max Reitz
Signed-off-by: Philippe Mathieu-Daudé
Message-Id: <20201210125202.858656-1-philmd@redhat.com>
Signed-off-by: Max Reitz
---
 block/nvme.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'block')

diff --git a/block/nvme.c b/block/nvme.c
index a06a188..5a6fbac 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -1389,6 +1389,29 @@ out:
 }
 
+static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
+                                         bool exact, PreallocMode prealloc,
+                                         BdrvRequestFlags flags, Error **errp)
+{
+    int64_t cur_length;
+
+    if (prealloc != PREALLOC_MODE_OFF) {
+        error_setg(errp, "Unsupported preallocation mode '%s'",
+                   PreallocMode_str(prealloc));
+        return -ENOTSUP;
+    }
+
+    cur_length = nvme_getlength(bs);
+    if (offset != cur_length && exact) {
+        error_setg(errp, "Cannot resize NVMe devices");
+        return -ENOTSUP;
+    } else if (offset > cur_length) {
+        error_setg(errp, "Cannot grow NVMe devices");
+        return -EINVAL;
+    }
+
+    return 0;
+}
 
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
@@ -1523,6 +1546,7 @@ static BlockDriver bdrv_nvme = {
     .bdrv_close               = nvme_close,
     .bdrv_getlength           = nvme_getlength,
     .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
+    .bdrv_co_truncate         = nvme_co_truncate,
 
     .bdrv_co_preadv           = nvme_co_preadv,
     .bdrv_co_pwritev          = nvme_co_pwritev,
--
cgit v1.1
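The following usage sketch is not part of the patch series above. It assumes
the "preallocate" driver is also exposed through the QAPI schema (done
elsewhere in this series, not shown here), and the node names and image path
are made up. It shows how the filter introduced in block/preallocate.c might
be inserted between a qcow2 format node and its file protocol node:

    qemu-system-x86_64 \
        -blockdev node-name=proto,driver=file,filename=disk.qcow2 \
        -blockdev node-name=prealloc,driver=preallocate,file=proto \
        -blockdev node-name=fmt,driver=qcow2,file=prealloc \
        -device virtio-blk-pci,drive=fmt

With such a chain, a qcow2 write that crosses the current end of the
underlying file makes the filter issue a single write-zeroes request that
extends the file to the write end plus prealloc-size, rounded up to
prealloc-align (128M and 1M by default), so subsequent allocating writes land
inside an already-extended file.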