diff options
42 files changed, 1437 insertions, 331 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index cd8d6b1..018ed62 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2755,12 +2755,13 @@ S: Supported F: util/async.c F: util/aio-*.c F: util/aio-*.h +F: util/defer-call.c F: util/fdmon-*.c F: block/io.c -F: block/plug.c F: migration/block* F: include/block/aio.h F: include/block/aio-wait.h +F: include/qemu/defer-call.h F: scripts/qemugdb/aio.py F: tests/unit/test-fdmon-epoll.c T: git https://github.com/stefanha/qemu.git block @@ -5200,7 +5200,7 @@ static void bdrv_close(BlockDriverState *bs) bs->drv = NULL; } - bdrv_graph_wrlock(NULL); + bdrv_graph_wrlock(bs); QLIST_FOREACH_SAFE(child, &bs->children, next, next) { bdrv_unref_child(bs, child); } diff --git a/block/blkio.c b/block/blkio.c index 1dd4956..0a0a6c0 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -13,6 +13,7 @@ #include "block/block_int.h" #include "exec/memory.h" #include "exec/cpu-common.h" /* for qemu_ram_get_fd() */ +#include "qemu/defer-call.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "qapi/qmp/qdict.h" @@ -312,10 +313,10 @@ static void blkio_detach_aio_context(BlockDriverState *bs) } /* - * Called by blk_io_unplug() or immediately if not plugged. Called without - * blkio_lock. + * Called by defer_call_end() or immediately if not in a deferred section. + * Called without blkio_lock. */ -static void blkio_unplug_fn(void *opaque) +static void blkio_deferred_fn(void *opaque) { BDRVBlkioState *s = opaque; @@ -332,7 +333,7 @@ static void blkio_submit_io(BlockDriverState *bs) { BDRVBlkioState *s = bs->opaque; - blk_io_plug_call(blkio_unplug_fn, s); + defer_call(blkio_deferred_fn, s); } static int coroutine_fn diff --git a/block/io_uring.c b/block/io_uring.c index 69d9820..7cdd00e 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -15,6 +15,7 @@ #include "block/block.h" #include "block/raw-aio.h" #include "qemu/coroutine.h" +#include "qemu/defer-call.h" #include "qapi/error.h" #include "sysemu/block-backend.h" #include "trace.h" @@ -124,6 +125,9 @@ static void luring_process_completions(LuringState *s) { struct io_uring_cqe *cqes; int total_bytes; + + defer_call_begin(); + /* * Request completion callbacks can run the nested event loop. * Schedule ourselves so the nested event loop will "see" remaining @@ -216,7 +220,10 @@ end: aio_co_wake(luringcb->co); } } + qemu_bh_cancel(s->completion_bh); + + defer_call_end(); } static int ioq_submit(LuringState *s) @@ -306,7 +313,7 @@ static void ioq_init(LuringQueue *io_q) io_q->blocked = false; } -static void luring_unplug_fn(void *opaque) +static void luring_deferred_fn(void *opaque) { LuringState *s = opaque; trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue, @@ -367,7 +374,7 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, return ret; } - blk_io_plug_call(luring_unplug_fn, s); + defer_call(luring_deferred_fn, s); } return 0; } diff --git a/block/linux-aio.c b/block/linux-aio.c index 1a51503..ec05d94 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -14,6 +14,7 @@ #include "block/raw-aio.h" #include "qemu/event_notifier.h" #include "qemu/coroutine.h" +#include "qemu/defer-call.h" #include "qapi/error.h" #include "sysemu/block-backend.h" @@ -204,6 +205,8 @@ static void qemu_laio_process_completions(LinuxAioState *s) { struct io_event *events; + defer_call_begin(); + /* Reschedule so nested event loops see currently pending completions */ qemu_bh_schedule(s->completion_bh); @@ -230,6 +233,8 @@ static void qemu_laio_process_completions(LinuxAioState *s) * own `for` loop. If we are the last all counters dropped to zero. */ s->event_max = 0; s->event_idx = 0; + + defer_call_end(); } static void qemu_laio_process_completions_and_submit(LinuxAioState *s) @@ -353,7 +358,7 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch) return max_batch; } -static void laio_unplug_fn(void *opaque) +static void laio_deferred_fn(void *opaque) { LinuxAioState *s = opaque; @@ -393,7 +398,7 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) { ioq_submit(s); } else { - blk_io_plug_call(laio_unplug_fn, s); + defer_call(laio_deferred_fn, s); } } diff --git a/block/meson.build b/block/meson.build index f351b9d..59ff6d3 100644 --- a/block/meson.build +++ b/block/meson.build @@ -21,7 +21,6 @@ block_ss.add(files( 'mirror.c', 'nbd.c', 'null.c', - 'plug.c', 'preallocate.c', 'progress_meter.c', 'qapi.c', diff --git a/block/mirror.c b/block/mirror.c index dcd88de..c839542 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -55,10 +55,18 @@ typedef struct MirrorBlockJob { BlockMirrorBackingMode backing_mode; /* Whether the target image requires explicit zero-initialization */ bool zero_target; + /* + * To be accesssed with atomics. Written only under the BQL (required by the + * current implementation of mirror_change()). + */ MirrorCopyMode copy_mode; BlockdevOnError on_source_error, on_target_error; - /* Set when the target is synced (dirty bitmap is clean, nothing - * in flight) and the job is running in active mode */ + /* + * To be accessed with atomics. + * + * Set when the target is synced (dirty bitmap is clean, nothing in flight) + * and the job is running in active mode. + */ bool actively_synced; bool should_complete; int64_t granularity; @@ -122,7 +130,7 @@ typedef enum MirrorMethod { static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, int error) { - s->actively_synced = false; + qatomic_set(&s->actively_synced, false); if (read) { return block_job_error_action(&s->common, s->on_source_error, true, error); @@ -962,7 +970,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) if (s->bdev_length == 0) { /* Transition to the READY state and wait for complete. */ job_transition_to_ready(&s->common.job); - s->actively_synced = true; + qatomic_set(&s->actively_synced, true); while (!job_cancel_requested(&s->common.job) && !s->should_complete) { job_yield(&s->common.job); } @@ -1074,9 +1082,9 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) * the target in a consistent state. */ job_transition_to_ready(&s->common.job); - if (s->copy_mode != MIRROR_COPY_MODE_BACKGROUND) { - s->actively_synced = true; - } + } + if (qatomic_read(&s->copy_mode) != MIRROR_COPY_MODE_BACKGROUND) { + qatomic_set(&s->actively_synced, true); } should_complete = s->should_complete || @@ -1246,6 +1254,48 @@ static bool commit_active_cancel(Job *job, bool force) return force || !job_is_ready(job); } +static void mirror_change(BlockJob *job, BlockJobChangeOptions *opts, + Error **errp) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + BlockJobChangeOptionsMirror *change_opts = &opts->u.mirror; + MirrorCopyMode current; + + /* + * The implementation relies on the fact that copy_mode is only written + * under the BQL. Otherwise, further synchronization would be required. + */ + + GLOBAL_STATE_CODE(); + + if (qatomic_read(&s->copy_mode) == change_opts->copy_mode) { + return; + } + + if (change_opts->copy_mode != MIRROR_COPY_MODE_WRITE_BLOCKING) { + error_setg(errp, "Change to copy mode '%s' is not implemented", + MirrorCopyMode_str(change_opts->copy_mode)); + return; + } + + current = qatomic_cmpxchg(&s->copy_mode, MIRROR_COPY_MODE_BACKGROUND, + change_opts->copy_mode); + if (current != MIRROR_COPY_MODE_BACKGROUND) { + error_setg(errp, "Expected current copy mode '%s', got '%s'", + MirrorCopyMode_str(MIRROR_COPY_MODE_BACKGROUND), + MirrorCopyMode_str(current)); + } +} + +static void mirror_query(BlockJob *job, BlockJobInfo *info) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + info->u.mirror = (BlockJobInfoMirror) { + .actively_synced = qatomic_read(&s->actively_synced), + }; +} + static const BlockJobDriver mirror_job_driver = { .job_driver = { .instance_size = sizeof(MirrorBlockJob), @@ -1260,6 +1310,8 @@ static const BlockJobDriver mirror_job_driver = { .cancel = mirror_cancel, }, .drained_poll = mirror_drained_poll, + .change = mirror_change, + .query = mirror_query, }; static const BlockJobDriver commit_active_job_driver = { @@ -1378,7 +1430,7 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method, bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity); bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset, bitmap_end - bitmap_offset); - job->actively_synced = false; + qatomic_set(&job->actively_synced, false); action = mirror_error_action(job, false, -ret); if (action == BLOCK_ERROR_ACTION_REPORT) { @@ -1437,7 +1489,8 @@ static void coroutine_fn GRAPH_RDLOCK active_write_settle(MirrorOp *op) uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes, op->s->granularity); - if (!--op->s->in_active_write_counter && op->s->actively_synced) { + if (!--op->s->in_active_write_counter && + qatomic_read(&op->s->actively_synced)) { BdrvChild *source = op->s->mirror_top_bs->backing; if (QLIST_FIRST(&source->bs->parents) == source && @@ -1463,21 +1516,21 @@ bdrv_mirror_top_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes, return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags); } +static bool should_copy_to_target(MirrorBDSOpaque *s) +{ + return s->job && s->job->ret >= 0 && + !job_is_cancelled(&s->job->common.job) && + qatomic_read(&s->job->copy_mode) == MIRROR_COPY_MODE_WRITE_BLOCKING; +} + static int coroutine_fn GRAPH_RDLOCK bdrv_mirror_top_do_write(BlockDriverState *bs, MirrorMethod method, - uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, - int flags) + bool copy_to_target, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { MirrorOp *op = NULL; MirrorBDSOpaque *s = bs->opaque; int ret = 0; - bool copy_to_target = false; - - if (s->job) { - copy_to_target = s->job->ret >= 0 && - !job_is_cancelled(&s->job->common.job) && - s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; - } if (copy_to_target) { op = active_write_prepare(s->job, offset, bytes); @@ -1500,6 +1553,11 @@ bdrv_mirror_top_do_write(BlockDriverState *bs, MirrorMethod method, abort(); } + if (!copy_to_target && s->job && s->job->dirty_bitmap) { + qatomic_set(&s->job->actively_synced, false); + bdrv_set_dirty_bitmap(s->job->dirty_bitmap, offset, bytes); + } + if (ret < 0) { goto out; } @@ -1519,17 +1577,10 @@ static int coroutine_fn GRAPH_RDLOCK bdrv_mirror_top_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { - MirrorBDSOpaque *s = bs->opaque; QEMUIOVector bounce_qiov; void *bounce_buf; int ret = 0; - bool copy_to_target = false; - - if (s->job) { - copy_to_target = s->job->ret >= 0 && - !job_is_cancelled(&s->job->common.job) && - s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING; - } + bool copy_to_target = should_copy_to_target(bs->opaque); if (copy_to_target) { /* The guest might concurrently modify the data to write; but @@ -1546,8 +1597,8 @@ bdrv_mirror_top_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes, flags &= ~BDRV_REQ_REGISTERED_BUF; } - ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov, - flags); + ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, copy_to_target, + offset, bytes, qiov, flags); if (copy_to_target) { qemu_iovec_destroy(&bounce_qiov); @@ -1570,15 +1621,17 @@ static int coroutine_fn GRAPH_RDLOCK bdrv_mirror_top_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, BdrvRequestFlags flags) { - return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, offset, bytes, NULL, - flags); + bool copy_to_target = should_copy_to_target(bs->opaque); + return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_ZERO, copy_to_target, + offset, bytes, NULL, flags); } static int coroutine_fn GRAPH_RDLOCK bdrv_mirror_top_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) { - return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, offset, bytes, - NULL, 0); + bool copy_to_target = should_copy_to_target(bs->opaque); + return bdrv_mirror_top_do_write(bs, MIRROR_METHOD_DISCARD, copy_to_target, + offset, bytes, NULL, 0); } static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs) @@ -1813,7 +1866,7 @@ static BlockJob *mirror_start_job( s->is_none_mode = is_none_mode; s->backing_mode = backing_mode; s->zero_target = zero_target; - s->copy_mode = copy_mode; + qatomic_set(&s->copy_mode, copy_mode); s->base = base; s->base_overlay = bdrv_find_overlay(bs, base); s->granularity = granularity; @@ -1823,13 +1876,17 @@ static BlockJob *mirror_start_job( s->should_complete = true; } - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + s->dirty_bitmap = bdrv_create_dirty_bitmap(s->mirror_top_bs, granularity, + NULL, errp); if (!s->dirty_bitmap) { goto fail; } - if (s->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) { - bdrv_disable_dirty_bitmap(s->dirty_bitmap); - } + + /* + * The dirty bitmap is set by bdrv_mirror_top_do_write() when not in active + * mode. + */ + bdrv_disable_dirty_bitmap(s->dirty_bitmap); ret = block_job_add_bdrv(&s->common, "source", bs, 0, BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE | diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 7645c7e..5b2c597 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -846,7 +846,7 @@ void hmp_info_block_jobs(Monitor *mon, const QDict *qdict) } while (list) { - if (strcmp(list->value->type, "stream") == 0) { + if (list->value->type == JOB_TYPE_STREAM) { monitor_printf(mon, "Streaming device %s: Completed %" PRId64 " of %" PRId64 " bytes, speed limit %" PRId64 " bytes/s\n", @@ -858,7 +858,7 @@ void hmp_info_block_jobs(Monitor *mon, const QDict *qdict) monitor_printf(mon, "Type %s, device %s: Completed %" PRId64 " of %" PRId64 " bytes, speed limit %" PRId64 " bytes/s\n", - list->value->type, + JobType_str(list->value->type), list->value->device, list->value->offset, list->value->len, diff --git a/block/nvme.c b/block/nvme.c index b6e95f0..96b3f8f 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -16,6 +16,7 @@ #include "qapi/error.h" #include "qapi/qmp/qdict.h" #include "qapi/qmp/qstring.h" +#include "qemu/defer-call.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/module.h" @@ -476,7 +477,7 @@ static void nvme_trace_command(const NvmeCmd *cmd) } } -static void nvme_unplug_fn(void *opaque) +static void nvme_deferred_fn(void *opaque) { NVMeQueuePair *q = opaque; @@ -503,7 +504,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, q->need_kick++; qemu_mutex_unlock(&q->lock); - blk_io_plug_call(nvme_unplug_fn, q); + defer_call(nvme_deferred_fn, q); } static void nvme_admin_cmd_sync_cb(void *opaque, int ret) diff --git a/block/plug.c b/block/plug.c deleted file mode 100644 index 98a155d..0000000 --- a/block/plug.c +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Block I/O plugging - * - * Copyright Red Hat. - * - * This API defers a function call within a blk_io_plug()/blk_io_unplug() - * section, allowing multiple calls to batch up. This is a performance - * optimization that is used in the block layer to submit several I/O requests - * at once instead of individually: - * - * blk_io_plug(); <-- start of plugged region - * ... - * blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call - * blk_io_plug_call(my_func, my_obj); <-- another - * blk_io_plug_call(my_func, my_obj); <-- another - * ... - * blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once - * - * This code is actually generic and not tied to the block layer. If another - * subsystem needs this functionality, it could be renamed. - */ - -#include "qemu/osdep.h" -#include "qemu/coroutine-tls.h" -#include "qemu/notify.h" -#include "qemu/thread.h" -#include "sysemu/block-backend.h" - -/* A function call that has been deferred until unplug() */ -typedef struct { - void (*fn)(void *); - void *opaque; -} UnplugFn; - -/* Per-thread state */ -typedef struct { - unsigned count; /* how many times has plug() been called? */ - GArray *unplug_fns; /* functions to call at unplug time */ -} Plug; - -/* Use get_ptr_plug() to fetch this thread-local value */ -QEMU_DEFINE_STATIC_CO_TLS(Plug, plug); - -/* Called at thread cleanup time */ -static void blk_io_plug_atexit(Notifier *n, void *value) -{ - Plug *plug = get_ptr_plug(); - g_array_free(plug->unplug_fns, TRUE); -} - -/* This won't involve coroutines, so use __thread */ -static __thread Notifier blk_io_plug_atexit_notifier; - -/** - * blk_io_plug_call: - * @fn: a function pointer to be invoked - * @opaque: a user-defined argument to @fn() - * - * Call @fn(@opaque) immediately if not within a blk_io_plug()/blk_io_unplug() - * section. - * - * Otherwise defer the call until the end of the outermost - * blk_io_plug()/blk_io_unplug() section in this thread. If the same - * @fn/@opaque pair has already been deferred, it will only be called once upon - * blk_io_unplug() so that accumulated calls are batched into a single call. - * - * The caller must ensure that @opaque is not freed before @fn() is invoked. - */ -void blk_io_plug_call(void (*fn)(void *), void *opaque) -{ - Plug *plug = get_ptr_plug(); - - /* Call immediately if we're not plugged */ - if (plug->count == 0) { - fn(opaque); - return; - } - - GArray *array = plug->unplug_fns; - if (!array) { - array = g_array_new(FALSE, FALSE, sizeof(UnplugFn)); - plug->unplug_fns = array; - blk_io_plug_atexit_notifier.notify = blk_io_plug_atexit; - qemu_thread_atexit_add(&blk_io_plug_atexit_notifier); - } - - UnplugFn *fns = (UnplugFn *)array->data; - UnplugFn new_fn = { - .fn = fn, - .opaque = opaque, - }; - - /* - * There won't be many, so do a linear search. If this becomes a bottleneck - * then a binary search (glib 2.62+) or different data structure could be - * used. - */ - for (guint i = 0; i < array->len; i++) { - if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) { - return; /* already exists */ - } - } - - g_array_append_val(array, new_fn); -} - -/** - * blk_io_plug: Defer blk_io_plug_call() functions until blk_io_unplug() - * - * blk_io_plug/unplug are thread-local operations. This means that multiple - * threads can simultaneously call plug/unplug, but the caller must ensure that - * each unplug() is called in the same thread of the matching plug(). - * - * Nesting is supported. blk_io_plug_call() functions are only called at the - * outermost blk_io_unplug(). - */ -void blk_io_plug(void) -{ - Plug *plug = get_ptr_plug(); - - assert(plug->count < UINT32_MAX); - - plug->count++; -} - -/** - * blk_io_unplug: Run any pending blk_io_plug_call() functions - * - * There must have been a matching blk_io_plug() call in the same thread prior - * to this blk_io_unplug() call. - */ -void blk_io_unplug(void) -{ - Plug *plug = get_ptr_plug(); - - assert(plug->count > 0); - - if (--plug->count > 0) { - return; - } - - GArray *array = plug->unplug_fns; - if (!array) { - return; - } - - UnplugFn *fns = (UnplugFn *)array->data; - - for (guint i = 0; i < array->len; i++) { - fns[i].fn(fns[i].opaque); - } - - /* - * This resets the array without freeing memory so that appending is cheap - * in the future. - */ - g_array_set_size(array, 0); -} diff --git a/block/qapi-sysemu.c b/block/qapi-sysemu.c index 3f614cb..1618cd2 100644 --- a/block/qapi-sysemu.c +++ b/block/qapi-sysemu.c @@ -237,6 +237,7 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk, BlockDriverState *bs, Error **errp) { Error *local_err = NULL; + AioContext *ctx; bool has_device; int ret; @@ -258,7 +259,11 @@ static void qmp_blockdev_insert_anon_medium(BlockBackend *blk, return; } + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); ret = blk_insert_bs(blk, bs, errp); + aio_context_release(ctx); + if (ret < 0) { return; } @@ -2968,6 +2968,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, if (replaces) { BlockDriverState *to_replace_bs; + AioContext *aio_context; AioContext *replace_aio_context; int64_t bs_size, replace_size; @@ -2982,10 +2983,19 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs, return; } + aio_context = bdrv_get_aio_context(bs); replace_aio_context = bdrv_get_aio_context(to_replace_bs); - aio_context_acquire(replace_aio_context); + /* + * bdrv_getlength() is a co-wrapper and uses AIO_WAIT_WHILE. Be sure not + * to acquire the same AioContext twice. + */ + if (replace_aio_context != aio_context) { + aio_context_acquire(replace_aio_context); + } replace_size = bdrv_getlength(to_replace_bs); - aio_context_release(replace_aio_context); + if (replace_aio_context != aio_context) { + aio_context_release(replace_aio_context); + } if (replace_size < 0) { error_setg_errno(errp, -replace_size, @@ -3382,6 +3392,20 @@ void qmp_block_job_dismiss(const char *id, Error **errp) job_dismiss_locked(&job, errp); } +void qmp_block_job_change(BlockJobChangeOptions *opts, Error **errp) +{ + BlockJob *job; + + JOB_LOCK_GUARD(); + job = find_block_job_locked(opts->id, errp); + + if (!job) { + return; + } + + block_job_change_locked(job, opts, errp); +} + void qmp_change_backing_file(const char *device, const char *image_node_name, const char *backing_file, @@ -198,7 +198,9 @@ void block_job_remove_all_bdrv(BlockJob *job) * one to make sure that such a concurrent access does not attempt * to process an already freed BdrvChild. */ + aio_context_release(job->job.aio_context); bdrv_graph_wrlock(NULL); + aio_context_acquire(job->job.aio_context); while (job->nodes) { GSList *l = job->nodes; BdrvChild *c = l->data; @@ -328,6 +330,26 @@ static bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) return block_job_set_speed_locked(job, speed, errp); } +void block_job_change_locked(BlockJob *job, BlockJobChangeOptions *opts, + Error **errp) +{ + const BlockJobDriver *drv = block_job_driver(job); + + GLOBAL_STATE_CODE(); + + if (job_apply_verb_locked(&job->job, JOB_VERB_CHANGE, errp)) { + return; + } + + if (drv->change) { + job_unlock(); + drv->change(job, opts, errp); + job_lock(); + } else { + error_setg(errp, "Job type does not support change"); + } +} + void block_job_ratelimit_processed_bytes(BlockJob *job, uint64_t n) { IO_CODE(); @@ -356,6 +378,7 @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp) { BlockJobInfo *info; uint64_t progress_current, progress_total; + const BlockJobDriver *drv = block_job_driver(job); GLOBAL_STATE_CODE(); @@ -368,7 +391,7 @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp) &progress_total); info = g_new0(BlockJobInfo, 1); - info->type = g_strdup(job_type_str(&job->job)); + info->type = job_type(&job->job); info->device = g_strdup(job->job.id); info->busy = job->job.busy; info->paused = job->job.pause_count > 0; @@ -385,6 +408,11 @@ BlockJobInfo *block_job_query_locked(BlockJob *job, Error **errp) g_strdup(error_get_pretty(job->job.err)) : g_strdup(strerror(-job->job.ret)); } + if (drv->query) { + job_unlock(); + drv->query(job, info); + job_lock(); + } return info; } diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index ca5a277..4459c06 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -667,7 +667,7 @@ Command description: List, apply, create or delete snapshots in image *FILENAME*. -.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-p] [-u] -b BACKING_FILE [-F BACKING_FMT] FILENAME +.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-p] [-u] [-c] -b BACKING_FILE [-F BACKING_FMT] FILENAME Changes the backing file of an image. Only the formats ``qcow2`` and ``qed`` support changing the backing file. @@ -694,7 +694,9 @@ Command description: In order to achieve this, any clusters that differ between *BACKING_FILE* and the old backing file of *FILENAME* are merged - into *FILENAME* before actually changing the backing file. + into *FILENAME* before actually changing the backing file. With the + ``-c`` option specified, the clusters which are being merged (but not + the entire *FILENAME* image) are compressed when written. Note that the safe mode is an expensive operation, comparable to converting an image. It only works if the old backing file still diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c index da36fcf..f83bb0f 100644 --- a/hw/block/dataplane/virtio-blk.c +++ b/hw/block/dataplane/virtio-blk.c @@ -31,9 +31,6 @@ struct VirtIOBlockDataPlane { VirtIOBlkConf *conf; VirtIODevice *vdev; - QEMUBH *bh; /* bh for guest notification */ - unsigned long *batch_notify_vqs; - bool batch_notifications; /* Note that these EventNotifiers are assigned by value. This is * fine as long as you do not call event_notifier_cleanup on them @@ -47,36 +44,7 @@ struct VirtIOBlockDataPlane { /* Raise an interrupt to signal guest, if necessary */ void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq) { - if (s->batch_notifications) { - set_bit(virtio_get_queue_index(vq), s->batch_notify_vqs); - qemu_bh_schedule(s->bh); - } else { - virtio_notify_irqfd(s->vdev, vq); - } -} - -static void notify_guest_bh(void *opaque) -{ - VirtIOBlockDataPlane *s = opaque; - unsigned nvqs = s->conf->num_queues; - unsigned long bitmap[BITS_TO_LONGS(nvqs)]; - unsigned j; - - memcpy(bitmap, s->batch_notify_vqs, sizeof(bitmap)); - memset(s->batch_notify_vqs, 0, sizeof(bitmap)); - - for (j = 0; j < nvqs; j += BITS_PER_LONG) { - unsigned long bits = bitmap[j / BITS_PER_LONG]; - - while (bits != 0) { - unsigned i = j + ctzl(bits); - VirtQueue *vq = virtio_get_queue(s->vdev, i); - - virtio_notify_irqfd(s->vdev, vq); - - bits &= bits - 1; /* clear right-most bit */ - } - } + virtio_notify_irqfd(s->vdev, vq); } /* Context: QEMU global mutex held */ @@ -126,9 +94,6 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf, } else { s->ctx = qemu_get_aio_context(); } - s->bh = aio_bh_new_guarded(s->ctx, notify_guest_bh, s, - &DEVICE(vdev)->mem_reentrancy_guard); - s->batch_notify_vqs = bitmap_new(conf->num_queues); *dataplane = s; @@ -146,8 +111,6 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s) vblk = VIRTIO_BLK(s->vdev); assert(!vblk->dataplane_started); - g_free(s->batch_notify_vqs); - qemu_bh_delete(s->bh); if (s->iothread) { object_unref(OBJECT(s->iothread)); } @@ -173,12 +136,6 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev) s->starting = true; - if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { - s->batch_notifications = true; - } else { - s->batch_notifications = false; - } - /* Set up guest notifier (irq) */ r = k->set_guest_notifiers(qbus->parent, nvqs, true); if (r != 0) { @@ -370,9 +327,6 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev) aio_context_release(s->ctx); - qemu_bh_cancel(s->bh); - notify_guest_bh(s); /* final chance to notify guest */ - /* Clean up guest notifier (irq) */ k->set_guest_notifiers(qbus->parent, nvqs, false); diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c index 3b6f2b0..c4bb28c 100644 --- a/hw/block/dataplane/xen-block.c +++ b/hw/block/dataplane/xen-block.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "qemu/defer-call.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/memalign.h" @@ -509,7 +510,7 @@ static int xen_block_get_request(XenBlockDataPlane *dataplane, /* * Threshold of in-flight requests above which we will start using - * blk_io_plug()/blk_io_unplug() to batch requests. + * defer_call_begin()/defer_call_end() to batch requests. */ #define IO_PLUG_THRESHOLD 1 @@ -537,7 +538,7 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane) * is below us. */ if (inflight_atstart > IO_PLUG_THRESHOLD) { - blk_io_plug(); + defer_call_begin(); } while (rc != rp) { /* pull request from ring */ @@ -577,12 +578,12 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane) if (inflight_atstart > IO_PLUG_THRESHOLD && batched >= inflight_atstart) { - blk_io_unplug(); + defer_call_end(); } xen_block_do_aio(request); if (inflight_atstart > IO_PLUG_THRESHOLD) { if (batched >= inflight_atstart) { - blk_io_plug(); + defer_call_begin(); batched = 0; } else { batched++; @@ -590,7 +591,7 @@ static bool xen_block_handle_requests(XenBlockDataPlane *dataplane) } } if (inflight_atstart > IO_PLUG_THRESHOLD) { - blk_io_unplug(); + defer_call_end(); } return done_something; diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 39e7f23..a1f8e15 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -12,6 +12,7 @@ */ #include "qemu/osdep.h" +#include "qemu/defer-call.h" #include "qapi/error.h" #include "qemu/iov.h" #include "qemu/module.h" @@ -1134,7 +1135,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) bool suppress_notifications = virtio_queue_get_notification(vq); aio_context_acquire(blk_get_aio_context(s->blk)); - blk_io_plug(); + defer_call_begin(); do { if (suppress_notifications) { @@ -1158,7 +1159,7 @@ void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq) virtio_blk_submit_multireq(s, &mrb); } - blk_io_unplug(); + defer_call_end(); aio_context_release(blk_get_aio_context(s->blk)); } diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index fa53f09..9c751bf 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -18,6 +18,7 @@ #include "standard-headers/linux/virtio_ids.h" #include "hw/virtio/virtio-scsi.h" #include "migration/qemu-file-types.h" +#include "qemu/defer-call.h" #include "qemu/error-report.h" #include "qemu/iov.h" #include "qemu/module.h" @@ -799,7 +800,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req) return -ENOBUFS; } scsi_req_ref(req->sreq); - blk_io_plug(); + defer_call_begin(); object_unref(OBJECT(d)); return 0; } @@ -810,7 +811,7 @@ static void virtio_scsi_handle_cmd_req_submit(VirtIOSCSI *s, VirtIOSCSIReq *req) if (scsi_req_enqueue(sreq)) { scsi_req_continue(sreq); } - blk_io_unplug(); + defer_call_end(); scsi_req_unref(sreq); } @@ -836,7 +837,7 @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq) while (!QTAILQ_EMPTY(&reqs)) { req = QTAILQ_FIRST(&reqs); QTAILQ_REMOVE(&reqs, req, next); - blk_io_unplug(); + defer_call_end(); scsi_req_unref(req->sreq); virtqueue_detach_element(req->vq, &req->elem, 0); virtio_scsi_free_req(req); diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 1cb9027..0af7a28 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -73,6 +73,7 @@ virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned int idx) " virtqueue_flush(void *vq, unsigned int count) "vq %p count %u" virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) "vq %p elem %p in_num %u out_num %u" virtio_queue_notify(void *vdev, int n, void *vq) "vdev %p n %d vq %p" +virtio_notify_irqfd_deferred_fn(void *vdev, void *vq) "vdev %p vq %p" virtio_notify_irqfd(void *vdev, void *vq) "vdev %p vq %p" virtio_notify(void *vdev, void *vq) "vdev %p vq %p" virtio_set_status(void *vdev, uint8_t val) "vdev %p val %u" diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index fb24bc9..e510557 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -15,6 +15,7 @@ #include "qapi/error.h" #include "qapi/qapi-commands-virtio.h" #include "trace.h" +#include "qemu/defer-call.h" #include "qemu/error-report.h" #include "qemu/log.h" #include "qemu/main-loop.h" @@ -2445,6 +2446,16 @@ static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq) } } +/* Batch irqs while inside a defer_call_begin()/defer_call_end() section */ +static void virtio_notify_irqfd_deferred_fn(void *opaque) +{ + EventNotifier *notifier = opaque; + VirtQueue *vq = container_of(notifier, VirtQueue, guest_notifier); + + trace_virtio_notify_irqfd_deferred_fn(vq->vdev, vq); + event_notifier_set(notifier); +} + void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) { WITH_RCU_READ_LOCK_GUARD() { @@ -2471,7 +2482,7 @@ void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) * to an atomic operation. */ virtio_set_isr(vq->vdev, 0x1); - event_notifier_set(&vq->guest_notifier); + defer_call(virtio_notify_irqfd_deferred_fn, &vq->guest_notifier); } static void virtio_irq(VirtQueue *vq) diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 058b0c8..95854f1 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -173,6 +173,17 @@ bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs); bool block_job_set_speed_locked(BlockJob *job, int64_t speed, Error **errp); /** + * block_job_change_locked: + * @job: The job to change. + * @opts: The new options. + * @errp: Error object. + * + * Change the job according to opts. + */ +void block_job_change_locked(BlockJob *job, BlockJobChangeOptions *opts, + Error **errp); + +/** * block_job_query_locked: * @job: The job to get information about. * diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h index 1048240..18ee6f7 100644 --- a/include/block/blockjob_int.h +++ b/include/block/blockjob_int.h @@ -67,6 +67,18 @@ struct BlockJobDriver { void (*attached_aio_context)(BlockJob *job, AioContext *new_context); void (*set_speed)(BlockJob *job, int64_t speed); + + /* + * Change the @job's options according to @opts. + * + * Note that this can already be called before the job coroutine is running. + */ + void (*change)(BlockJob *job, BlockJobChangeOptions *opts, Error **errp); + + /* + * Query information specific to this kind of block job. + */ + void (*query)(BlockJob *job, BlockJobInfo *info); }; /* diff --git a/include/qemu/defer-call.h b/include/qemu/defer-call.h new file mode 100644 index 0000000..e2c1d24 --- /dev/null +++ b/include/qemu/defer-call.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Deferred calls + * + * Copyright Red Hat. + */ + +#ifndef QEMU_DEFER_CALL_H +#define QEMU_DEFER_CALL_H + +/* See documentation in util/defer-call.c */ +void defer_call_begin(void); +void defer_call_end(void); +void defer_call(void (*fn)(void *), void *opaque); + +#endif /* QEMU_DEFER_CALL_H */ diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index be4dcef..d174275 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -100,10 +100,6 @@ void blk_iostatus_set_err(BlockBackend *blk, int error); int blk_get_max_iov(BlockBackend *blk); int blk_get_max_hw_iov(BlockBackend *blk); -void blk_io_plug(void); -void blk_io_unplug(void); -void blk_io_plug_call(void (*fn)(void *), void *opaque); - AioContext *blk_get_aio_context(BlockBackend *blk); BlockAcctStats *blk_get_stats(BlockBackend *blk); void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, @@ -80,6 +80,7 @@ bool JobVerbTable[JOB_VERB__MAX][JOB_STATUS__MAX] = { [JOB_VERB_COMPLETE] = {0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, [JOB_VERB_FINALIZE] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, [JOB_VERB_DISMISS] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, + [JOB_VERB_CHANGE] = {0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0}, }; /* Transactional group of jobs */ diff --git a/qapi/block-core.json b/qapi/block-core.json index 89751d8..9996125 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1353,6 +1353,20 @@ 'data': ['background', 'write-blocking'] } ## +# @BlockJobInfoMirror: +# +# Information specific to mirror block jobs. +# +# @actively-synced: Whether the source is actively synced to the +# target, i.e. same data and new writes are done synchronously to +# both. +# +# Since 8.2 +## +{ 'struct': 'BlockJobInfoMirror', + 'data': { 'actively-synced': 'bool' } } + +## # @BlockJobInfo: # # Information about a long-running block device operation. @@ -1395,13 +1409,15 @@ # # Since: 1.1 ## -{ 'struct': 'BlockJobInfo', - 'data': {'type': 'str', 'device': 'str', 'len': 'int', +{ 'union': 'BlockJobInfo', + 'base': {'type': 'JobType', 'device': 'str', 'len': 'int', 'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int', 'io-status': 'BlockDeviceIoStatus', 'ready': 'bool', 'status': 'JobStatus', 'auto-finalize': 'bool', 'auto-dismiss': 'bool', - '*error': 'str' } } + '*error': 'str' }, + 'discriminator': 'type', + 'data': { 'mirror': 'BlockJobInfoMirror' } } ## # @query-block-jobs: @@ -3045,6 +3061,43 @@ 'allow-preconfig': true } ## +# @BlockJobChangeOptionsMirror: +# +# @copy-mode: Switch to this copy mode. Currently, only the switch +# from 'background' to 'write-blocking' is implemented. +# +# Since: 8.2 +## +{ 'struct': 'BlockJobChangeOptionsMirror', + 'data': { 'copy-mode' : 'MirrorCopyMode' } } + +## +# @BlockJobChangeOptions: +# +# Block job options that can be changed after job creation. +# +# @id: The job identifier +# +# @type: The job type +# +# Since 8.2 +## +{ 'union': 'BlockJobChangeOptions', + 'base': { 'id': 'str', 'type': 'JobType' }, + 'discriminator': 'type', + 'data': { 'mirror': 'BlockJobChangeOptionsMirror' } } + +## +# @block-job-change: +# +# Change the block job's options. +# +# Since: 8.2 +## +{ 'command': 'block-job-change', + 'data': 'BlockJobChangeOptions', 'boxed': true } + +## # @BlockdevDiscardOptions: # # Determines how to handle discard requests. diff --git a/qapi/job.json b/qapi/job.json index 7f0ba09..b395720 100644 --- a/qapi/job.json +++ b/qapi/job.json @@ -105,11 +105,13 @@ # # @finalize: see @job-finalize # +# @change: see @block-job-change (since 8.2) +# # Since: 2.12 ## { 'enum': 'JobVerb', 'data': ['cancel', 'pause', 'resume', 'set-speed', 'complete', 'dismiss', - 'finalize' ] } + 'finalize', 'change' ] } ## # @JOB_STATUS_CHANGE: diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 1b1dab5..068692d 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -88,9 +88,9 @@ SRST ERST DEF("rebase", img_rebase, - "rebase [--object objectdef] [--image-opts] [-U] [-q] [-f fmt] [-t cache] [-T src_cache] [-p] [-u] -b backing_file [-F backing_fmt] filename") + "rebase [--object objectdef] [--image-opts] [-U] [-q] [-f fmt] [-t cache] [-T src_cache] [-p] [-u] [-c] -b backing_file [-F backing_fmt] filename") SRST -.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-p] [-u] -b BACKING_FILE [-F BACKING_FMT] FILENAME +.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-p] [-u] [-c] -b BACKING_FILE [-F BACKING_FMT] FILENAME ERST DEF("resize", img_resize, @@ -1274,23 +1274,29 @@ static int is_allocated_sectors_min(const uint8_t *buf, int n, int *pnum, } /* - * Compares two buffers sector by sector. Returns 0 if the first - * sector of each buffer matches, non-zero otherwise. + * Compares two buffers chunk by chunk, where @chsize is the chunk size. + * If @chsize is 0, default chunk size of BDRV_SECTOR_SIZE is used. + * Returns 0 if the first chunk of each buffer matches, non-zero otherwise. * - * pnum is set to the sector-aligned size of the buffer prefix that - * has the same matching status as the first sector. + * @pnum is set to the size of the buffer prefix aligned to @chsize that + * has the same matching status as the first chunk. */ static int compare_buffers(const uint8_t *buf1, const uint8_t *buf2, - int64_t bytes, int64_t *pnum) + int64_t bytes, uint64_t chsize, int64_t *pnum) { bool res; - int64_t i = MIN(bytes, BDRV_SECTOR_SIZE); + int64_t i; assert(bytes > 0); + if (!chsize) { + chsize = BDRV_SECTOR_SIZE; + } + i = MIN(bytes, chsize); + res = !!memcmp(buf1, buf2, i); while (i < bytes) { - int64_t len = MIN(bytes - i, BDRV_SECTOR_SIZE); + int64_t len = MIN(bytes - i, chsize); if (!!memcmp(buf1 + i, buf2 + i, len) != res) { break; @@ -1559,7 +1565,7 @@ static int img_compare(int argc, char **argv) ret = 4; goto out; } - ret = compare_buffers(buf1, buf2, chunk, &pnum); + ret = compare_buffers(buf1, buf2, chunk, 0, &pnum); if (ret || pnum != chunk) { qprintf(quiet, "Content mismatch at offset %" PRId64 "!\n", offset + (ret ? 0 : pnum)); @@ -3524,16 +3530,20 @@ static int img_rebase(int argc, char **argv) uint8_t *buf_new = NULL; BlockDriverState *bs = NULL, *prefix_chain_bs = NULL; BlockDriverState *unfiltered_bs; + BlockDriverInfo bdi = {0}; char *filename; const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg; int c, flags, src_flags, ret; + BdrvRequestFlags write_flags = 0; bool writethrough, src_writethrough; int unsafe = 0; bool force_share = false; int progress = 0; bool quiet = false; + bool compress = false; Error *local_err = NULL; bool image_opts = false; + int64_t write_align; /* Parse commandline parameters */ fmt = NULL; @@ -3547,9 +3557,10 @@ static int img_rebase(int argc, char **argv) {"object", required_argument, 0, OPTION_OBJECT}, {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS}, {"force-share", no_argument, 0, 'U'}, + {"compress", no_argument, 0, 'c'}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, ":hf:F:b:upt:T:qU", + c = getopt_long(argc, argv, ":hf:F:b:upt:T:qUc", long_options, NULL); if (c == -1) { break; @@ -3597,6 +3608,9 @@ static int img_rebase(int argc, char **argv) case 'U': force_share = true; break; + case 'c': + compress = true; + break; } } @@ -3649,6 +3663,14 @@ static int img_rebase(int argc, char **argv) unfiltered_bs = bdrv_skip_filters(bs); + if (compress && !block_driver_can_compress(unfiltered_bs->drv)) { + error_report("Compression not supported for this file format"); + ret = -1; + goto out; + } else if (compress) { + write_flags |= BDRV_REQ_WRITE_COMPRESSED; + } + if (out_basefmt != NULL) { if (bdrv_find_format(out_basefmt) == NULL) { error_report("Invalid format name: '%s'", out_basefmt); @@ -3657,6 +3679,20 @@ static int img_rebase(int argc, char **argv) } } + /* + * We need overlay subcluster size (or cluster size in case writes are + * compressed) to make sure write requests are aligned. + */ + ret = bdrv_get_info(unfiltered_bs, &bdi); + if (ret < 0) { + error_report("could not get block driver info"); + goto out; + } else if (bdi.subcluster_size == 0) { + bdi.cluster_size = bdi.subcluster_size = 1; + } + + write_align = compress ? bdi.cluster_size : bdi.subcluster_size; + /* For safe rebasing we need to compare old and new backing file */ if (!unsafe) { QDict *options = NULL; @@ -3756,11 +3792,16 @@ static int img_rebase(int argc, char **argv) int64_t old_backing_size = 0; int64_t new_backing_size = 0; uint64_t offset; - int64_t n; + int64_t n, n_old = 0, n_new = 0; float local_progress = 0; - buf_old = blk_blockalign(blk, IO_BUF_SIZE); - buf_new = blk_blockalign(blk, IO_BUF_SIZE); + if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk_old_backing)) > + bdrv_opt_mem_align(blk_bs(blk))) { + buf_old = blk_blockalign(blk_old_backing, IO_BUF_SIZE); + } else { + buf_old = blk_blockalign(blk, IO_BUF_SIZE); + } + buf_new = blk_blockalign(blk_new_backing, IO_BUF_SIZE); size = blk_getlength(blk); if (size < 0) { @@ -3797,7 +3838,8 @@ static int img_rebase(int argc, char **argv) } for (offset = 0; offset < size; offset += n) { - bool buf_old_is_zero = false; + bool old_backing_eof = false; + int64_t n_alloc; /* How many bytes can we handle with the next read? */ n = MIN(IO_BUF_SIZE, size - offset); @@ -3814,6 +3856,8 @@ static int img_rebase(int argc, char **argv) } if (prefix_chain_bs) { + uint64_t bytes = n; + /* * If cluster wasn't changed since prefix_chain, we don't need * to take action @@ -3826,38 +3870,60 @@ static int img_rebase(int argc, char **argv) strerror(-ret)); goto out; } - if (!ret) { + if (!ret && n) { continue; } + if (!n) { + /* + * If we've reached EOF of the old backing, it means that + * offsets beyond the old backing size were read as zeroes. + * Now we will need to explicitly zero the cluster in + * order to preserve that state after the rebase. + */ + n = bytes; + } } /* + * At this point we know that the region [offset; offset + n) + * is unallocated within the target image. This region might be + * unaligned to the target image's (sub)cluster boundaries, as + * old backing may have smaller clusters (or have subclusters). + * We extend it to the aligned boundaries to avoid CoW on + * partial writes in blk_pwrite(), + */ + n += offset - QEMU_ALIGN_DOWN(offset, write_align); + offset = QEMU_ALIGN_DOWN(offset, write_align); + n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n); + n = MIN(n, size - offset); + assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) && + n_alloc == n); + + /* + * Much like with the target image, we'll try to read as much + * of the old and new backings as we can. + */ + n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset)); + n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset)); + + /* * Read old and new backing file and take into consideration that * backing files may be smaller than the COW image. */ - if (offset >= old_backing_size) { - memset(buf_old, 0, n); - buf_old_is_zero = true; + memset(buf_old + n_old, 0, n - n_old); + if (!n_old) { + old_backing_eof = true; } else { - if (offset + n > old_backing_size) { - n = old_backing_size - offset; - } - - ret = blk_pread(blk_old_backing, offset, n, buf_old, 0); + ret = blk_pread(blk_old_backing, offset, n_old, buf_old, 0); if (ret < 0) { error_report("error while reading from old backing file"); goto out; } } - if (offset >= new_backing_size || !blk_new_backing) { - memset(buf_new, 0, n); - } else { - if (offset + n > new_backing_size) { - n = new_backing_size - offset; - } - - ret = blk_pread(blk_new_backing, offset, n, buf_new, 0); + memset(buf_new + n_new, 0, n - n_new); + if (n_new) { + ret = blk_pread(blk_new_backing, offset, n_new, buf_new, 0); if (ret < 0) { error_report("error while reading from new backing file"); goto out; @@ -3871,13 +3937,14 @@ static int img_rebase(int argc, char **argv) int64_t pnum; if (compare_buffers(buf_old + written, buf_new + written, - n - written, &pnum)) + n - written, write_align, &pnum)) { - if (buf_old_is_zero) { + if (old_backing_eof) { ret = blk_pwrite_zeroes(blk, offset + written, pnum, 0); } else { + assert(written + pnum <= IO_BUF_SIZE); ret = blk_pwrite(blk, offset + written, pnum, - buf_old + written, 0); + buf_old + written, write_flags); } if (ret < 0) { error_report("Error while writing to COW image: %s", @@ -3887,6 +3954,9 @@ static int img_rebase(int argc, char **argv) } written += pnum; + if (offset + written >= old_backing_size) { + old_backing_eof = true; + } } qemu_progress_print(local_progress, 100); } diff --git a/tests/qemu-iotests/024 b/tests/qemu-iotests/024 index 25a564a..285f17e 100755 --- a/tests/qemu-iotests/024 +++ b/tests/qemu-iotests/024 @@ -199,6 +199,123 @@ echo # $BASE_OLD and $BASE_NEW) $QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map +# Check that rebase within the chain is working when +# overlay_size > old_backing_size +# +# base_new <-- base_old <-- overlay +# +# Backing (new): 11 11 11 11 11 +# Backing (old): 22 22 22 22 +# Overlay: -- -- -- -- -- +# +# As a result, overlay should contain data identical to base_old, with the +# last cluster remaining unallocated. + +echo +echo "=== Test rebase within one backing chain ===" +echo + +echo "Creating backing chain" +echo + +TEST_IMG=$BASE_NEW _make_test_img $(( CLUSTER_SIZE * 5 )) +TEST_IMG=$BASE_OLD _make_test_img -b "$BASE_NEW" -F $IMGFMT \ + $(( CLUSTER_SIZE * 4 )) +TEST_IMG=$OVERLAY _make_test_img -b "$BASE_OLD" -F $IMGFMT \ + $(( CLUSTER_SIZE * 5 )) + +echo +echo "Fill backing files with data" +echo + +$QEMU_IO "$BASE_NEW" -c "write -P 0x11 0 $(( CLUSTER_SIZE * 5 ))" \ + | _filter_qemu_io +$QEMU_IO "$BASE_OLD" -c "write -P 0x22 0 $(( CLUSTER_SIZE * 4 ))" \ + | _filter_qemu_io + +echo +echo "Check the last cluster is zeroed in overlay before the rebase" +echo +$QEMU_IO "$OVERLAY" -c "read -P 0x00 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \ + | _filter_qemu_io + +echo +echo "Rebase onto another image in the same chain" +echo + +$QEMU_IMG rebase -b "$BASE_NEW" -F $IMGFMT "$OVERLAY" + +echo "Verify that data is read the same before and after rebase" +echo + +# Verify the first 4 clusters are still read the same as in the old base +$QEMU_IO "$OVERLAY" -c "read -P 0x22 0 $(( CLUSTER_SIZE * 4 ))" \ + | _filter_qemu_io +# Verify the last cluster still reads as zeroes +$QEMU_IO "$OVERLAY" -c "read -P 0x00 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \ + | _filter_qemu_io + +echo + +# Check that rebase within the chain is working when +# overlay cluster size > backings cluster size +# (here overlay cluster size == 2 * backings cluster size) +# +# base_new <-- base_old <-- overlay +# +# Backing (new): -- -- -- -- -- -- +# Backing (old): -- 11 -- -- 22 -- +# Overlay: |-- --|-- --|-- --| +# +# We should end up having 1st and 3rd cluster allocated, and their halves +# being read as zeroes. + +echo +echo "=== Test rebase with different cluster sizes ===" +echo + +echo "Creating backing chain" +echo + +TEST_IMG=$BASE_NEW _make_test_img $(( CLUSTER_SIZE * 6 )) +TEST_IMG=$BASE_OLD _make_test_img -b "$BASE_NEW" -F $IMGFMT \ + $(( CLUSTER_SIZE * 6 )) +CLUSTER_SIZE=$(( CLUSTER_SIZE * 2 )) TEST_IMG=$OVERLAY \ + _make_test_img -b "$BASE_OLD" -F $IMGFMT $(( CLUSTER_SIZE * 6 )) + +TEST_IMG=$OVERLAY _img_info + +echo +echo "Fill backing files with data" +echo + +$QEMU_IO "$BASE_OLD" -c "write -P 0x11 $CLUSTER_SIZE $CLUSTER_SIZE" \ + -c "write -P 0x22 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \ + | _filter_qemu_io + +echo +echo "Rebase onto another image in the same chain" +echo + +$QEMU_IMG rebase -b "$BASE_NEW" -F $IMGFMT "$OVERLAY" + +echo "Verify that data is read the same before and after rebase" +echo + +$QEMU_IO "$OVERLAY" -c "read -P 0x00 0 $CLUSTER_SIZE" \ + -c "read -P 0x11 $CLUSTER_SIZE $CLUSTER_SIZE" \ + -c "read -P 0x00 $(( CLUSTER_SIZE * 2 )) $(( CLUSTER_SIZE * 2 ))" \ + -c "read -P 0x22 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \ + -c "read -P 0x00 $(( CLUSTER_SIZE * 5 )) $CLUSTER_SIZE" \ + | _filter_qemu_io + +echo +echo "Verify that untouched cluster remains unallocated" +echo + +$QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map + +echo # success, all done echo "*** done" diff --git a/tests/qemu-iotests/024.out b/tests/qemu-iotests/024.out index 973a5a3..e1e8eea 100644 --- a/tests/qemu-iotests/024.out +++ b/tests/qemu-iotests/024.out @@ -171,4 +171,77 @@ read 65536/65536 bytes at offset 196608 Offset Length File 0 0x30000 TEST_DIR/subdir/t.IMGFMT 0x30000 0x10000 TEST_DIR/subdir/t.IMGFMT.base_new + +=== Test rebase within one backing chain === + +Creating backing chain + +Formatting 'TEST_DIR/subdir/t.IMGFMT.base_new', fmt=IMGFMT size=327680 +Formatting 'TEST_DIR/subdir/t.IMGFMT.base_old', fmt=IMGFMT size=262144 backing_file=TEST_DIR/subdir/t.IMGFMT.base_new backing_fmt=IMGFMT +Formatting 'TEST_DIR/subdir/t.IMGFMT', fmt=IMGFMT size=327680 backing_file=TEST_DIR/subdir/t.IMGFMT.base_old backing_fmt=IMGFMT + +Fill backing files with data + +wrote 327680/327680 bytes at offset 0 +320 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 262144/262144 bytes at offset 0 +256 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Check the last cluster is zeroed in overlay before the rebase + +read 65536/65536 bytes at offset 262144 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Rebase onto another image in the same chain + +Verify that data is read the same before and after rebase + +read 262144/262144 bytes at offset 0 +256 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 262144 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + + +=== Test rebase with different cluster sizes === + +Creating backing chain + +Formatting 'TEST_DIR/subdir/t.IMGFMT.base_new', fmt=IMGFMT size=393216 +Formatting 'TEST_DIR/subdir/t.IMGFMT.base_old', fmt=IMGFMT size=393216 backing_file=TEST_DIR/subdir/t.IMGFMT.base_new backing_fmt=IMGFMT +Formatting 'TEST_DIR/subdir/t.IMGFMT', fmt=IMGFMT size=393216 backing_file=TEST_DIR/subdir/t.IMGFMT.base_old backing_fmt=IMGFMT +image: TEST_DIR/subdir/t.IMGFMT +file format: IMGFMT +virtual size: 384 KiB (393216 bytes) +cluster_size: 131072 +backing file: TEST_DIR/subdir/t.IMGFMT.base_old +backing file format: IMGFMT + +Fill backing files with data + +wrote 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 262144 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Rebase onto another image in the same chain + +Verify that data is read the same before and after rebase + +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 65536 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 131072/131072 bytes at offset 131072 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 262144 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 327680 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Verify that untouched cluster remains unallocated + +Offset Length File +0 0x20000 TEST_DIR/subdir/t.IMGFMT +0x40000 0x20000 TEST_DIR/subdir/t.IMGFMT + *** done diff --git a/tests/qemu-iotests/109.out b/tests/qemu-iotests/109.out index 2611d6a..965c9a6 100644 --- a/tests/qemu-iotests/109.out +++ b/tests/qemu-iotests/109.out @@ -38,7 +38,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 1024, "offset": 1024, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 1024, "offset": 1024, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -90,7 +90,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 197120, "offset": 197120, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 197120, "offset": 197120, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 197120, "offset": 197120, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -142,7 +142,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 327680, "offset": 327680, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 327680, "offset": 327680, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 327680, "offset": 327680, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -194,7 +194,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 1024, "offset": 1024, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 1024, "offset": 1024, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 1024, "offset": 1024, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -246,7 +246,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 65536, "offset": 65536, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 65536, "offset": 65536, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -298,7 +298,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 2560, "offset": 2560, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 2560, "offset": 2560, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -349,7 +349,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2560, "offset": 2560, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 2560, "offset": 2560, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 2560, "offset": 2560, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -400,7 +400,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 31457280, "offset": 31457280, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 31457280, "offset": 31457280, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 31457280, "offset": 31457280, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -451,7 +451,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 327680, "offset": 327680, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 327680, "offset": 327680, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 327680, "offset": 327680, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -502,7 +502,7 @@ read 512/512 bytes at offset 0 {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 2048, "offset": 2048, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 2048, "offset": 2048, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 2048, "offset": 2048, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -533,7 +533,7 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 512, "offset": 512, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 512, "offset": 512, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} @@ -557,7 +557,7 @@ Images are identical. {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "src"}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 512, "offset": 512, "speed": 0, "type": "mirror"}} {"execute":"query-block-jobs"} -{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 512, "offset": 512, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror"}]} +{"return": [{"auto-finalize": true, "io-status": "ok", "device": "src", "auto-dismiss": true, "busy": false, "len": 512, "offset": 512, "status": "ready", "paused": false, "speed": 0, "ready": true, "type": "mirror", "actively-synced": false}]} {"execute":"quit"} {"return": {}} {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} diff --git a/tests/qemu-iotests/118 b/tests/qemu-iotests/118 index 10dc474..6a4210c 100755 --- a/tests/qemu-iotests/118 +++ b/tests/qemu-iotests/118 @@ -277,7 +277,8 @@ class TestInitiallyFilled(GeneralChangeTestsBaseClass): 'file.driver=file', 'file.filename=%s' % old_img ]) if self.interface == 'scsi': - self.vm.add_device('virtio-scsi-pci') + self.vm.add_object('iothread,id=iothread0') + self.vm.add_device('virtio-scsi-pci,iothread=iothread0') self.vm.add_device('%s,drive=drive0,id=%s' % (interface_to_device_name(self.interface), self.device_name)) @@ -312,7 +313,8 @@ class TestInitiallyEmpty(GeneralChangeTestsBaseClass): if self.use_drive: self.vm.add_drive(None, 'media=%s' % self.media, 'none') if self.interface == 'scsi': - self.vm.add_device('virtio-scsi-pci') + self.vm.add_object('iothread,id=iothread0') + self.vm.add_device('virtio-scsi-pci,iothread=iothread0') self.vm.add_device('%s,%sid=%s' % (interface_to_device_name(self.interface), 'drive=drive0,' if self.use_drive else '', diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 index c7c2cad..59a6faf 100755 --- a/tests/qemu-iotests/271 +++ b/tests/qemu-iotests/271 @@ -899,6 +899,137 @@ _concurrent_io | $QEMU_IO | _filter_qemu_io | \ sed -e 's/\(20480\|40960\)/OFFSET/' _concurrent_verify | $QEMU_IO | _filter_qemu_io +############################################################ +############################################################ +############################################################ + +echo +echo "### Rebase of qcow2 images with subclusters ###" +echo + +l2_offset=$((0x400000)) + +# Check that rebase operation preserve holes between allocated subclusters +# within one cluster (i.e. does not allocate extra space). Check that the +# data is preserved as well. +# +# Base (new backing): -- -- -- ... -- -- -- +# Mid (old backing): -- 11 -- ... -- 22 -- +# Top: -- -- -- ... -- -- -- + +echo "### Preservation of unallocated holes after rebase ###" +echo + +echo "# create backing chain" +echo + +TEST_IMG="$TEST_IMG.base" _make_test_img -o cluster_size=1M,extended_l2=on 1M +TEST_IMG="$TEST_IMG.mid" _make_test_img -o cluster_size=1M,extended_l2=on \ + -b "$TEST_IMG.base" -F qcow2 1M +TEST_IMG="$TEST_IMG.top" _make_test_img -o cluster_size=1M,extended_l2=on \ + -b "$TEST_IMG.mid" -F qcow2 1M + +echo +echo "# fill old backing with data (separate subclusters within cluster)" +echo + +$QEMU_IO -c "write -P 0x11 32k 32k" \ + -c "write -P 0x22 $(( 30 * 32 ))k 32k" \ + "$TEST_IMG.mid" | _filter_qemu_io + +echo +echo "# rebase topmost image onto the new backing" +echo + +$QEMU_IMG rebase -b "$TEST_IMG.base" -F qcow2 "$TEST_IMG.top" + +echo "# verify that data is read the same before and after rebase" +echo + +$QEMU_IO -c "read -P 0x00 0 32k" \ + -c "read -P 0x11 32k 32k" \ + -c "read -P 0x00 64k $(( 28 * 32 ))k" \ + -c "read -P 0x22 $(( 30 * 32 ))k 32k" \ + -c "read -P 0x00 $(( 31 * 32 ))k 32k" \ + "$TEST_IMG.top" | _filter_qemu_io + +echo +echo "# verify that only selected subclusters remain allocated" +echo + +$QEMU_IMG map "$TEST_IMG.top" | _filter_testdir + +echo +echo "# verify image bitmap" +echo + +TEST_IMG="$TEST_IMG.top" alloc="1 30" zero="" _verify_l2_bitmap 0 + +# Check that rebase with compression works correctly with images containing +# subclusters. When compression is enabled and we allocate a new +# subcluster within the target (overlay) image, we expect the entire cluster +# containing that subcluster to become compressed. +# +# Here we expect 1st and 3rd clusters of the top (overlay) image to become +# compressed after the rebase, while cluster 2 to remain unallocated and +# be read from the base (new backing) image. +# +# Base (new backing): |-- -- .. -- --|11 11 .. 11 11|-- -- .. -- --| +# Mid (old backing): |-- -- .. -- 22|-- -- .. -- --|33 -- .. -- --| +# Top: |-- -- .. -- --|-- -- -- -- --|-- -- .. -- --| + +echo +echo "### Rebase with compression for images with subclusters ###" +echo + +echo "# create backing chain" +echo + +TEST_IMG="$TEST_IMG.base" _make_test_img -o cluster_size=1M,extended_l2=on 3M +TEST_IMG="$TEST_IMG.mid" _make_test_img -o cluster_size=1M,extended_l2=on \ + -b "$TEST_IMG.base" -F qcow2 3M +TEST_IMG="$TEST_IMG.top" _make_test_img -o cluster_size=1M,extended_l2=on \ + -b "$TEST_IMG.mid" -F qcow2 3M + +echo +echo "# fill old and new backing with data" +echo + +$QEMU_IO -c "write -P 0x11 1M 1M" "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -P 0x22 $(( 31 * 32 ))k 32k" \ + -c "write -P 0x33 $(( 64 * 32 ))k 32k" \ + "$TEST_IMG.mid" | _filter_qemu_io + +echo +echo "# rebase topmost image onto the new backing, with compression" +echo + +$QEMU_IMG rebase -c -b "$TEST_IMG.base" -F qcow2 "$TEST_IMG.top" + +echo "# verify that the 1st and 3rd clusters've become compressed" +echo + +$QEMU_IMG map --output=json "$TEST_IMG.top" | _filter_testdir + +echo +echo "# verify that data is read the same before and after rebase" +echo + +$QEMU_IO -c "read -P 0x22 $(( 31 * 32 ))k 32k" \ + -c "read -P 0x11 1M 1M" \ + -c "read -P 0x33 $(( 64 * 32 ))k 32k" \ + "$TEST_IMG.top" | _filter_qemu_io + +echo +echo "# verify image bitmap" +echo + +# For compressed clusters bitmap is always 0. For unallocated cluster +# there should be no entry at all, thus bitmap is also 0. +TEST_IMG="$TEST_IMG.top" alloc="" zero="" _verify_l2_bitmap 0 +TEST_IMG="$TEST_IMG.top" alloc="" zero="" _verify_l2_bitmap 1 +TEST_IMG="$TEST_IMG.top" alloc="" zero="" _verify_l2_bitmap 2 + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/271.out b/tests/qemu-iotests/271.out index 5be780d..0b24d50 100644 --- a/tests/qemu-iotests/271.out +++ b/tests/qemu-iotests/271.out @@ -723,4 +723,86 @@ wrote 2048/2048 bytes at offset OFFSET 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 2048/2048 bytes at offset OFFSET 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +### Rebase of qcow2 images with subclusters ### + +### Preservation of unallocated holes after rebase ### + +# create backing chain + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1048576 +Formatting 'TEST_DIR/t.IMGFMT.mid', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT +Formatting 'TEST_DIR/t.IMGFMT.top', fmt=IMGFMT size=1048576 backing_file=TEST_DIR/t.IMGFMT.mid backing_fmt=IMGFMT + +# fill old backing with data (separate subclusters within cluster) + +wrote 32768/32768 bytes at offset 32768 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 32768/32768 bytes at offset 983040 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +# rebase topmost image onto the new backing + +# verify that data is read the same before and after rebase + +read 32768/32768 bytes at offset 0 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 32768/32768 bytes at offset 32768 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 917504/917504 bytes at offset 65536 +896 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 32768/32768 bytes at offset 983040 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 32768/32768 bytes at offset 1015808 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +# verify that only selected subclusters remain allocated + +Offset Length Mapped to File +0x8000 0x8000 0x508000 TEST_DIR/t.qcow2.top +0xf0000 0x8000 0x5f0000 TEST_DIR/t.qcow2.top + +# verify image bitmap + +L2 entry #0: 0x8000000000500000 0000000040000002 + +### Rebase with compression for images with subclusters ### + +# create backing chain + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=3145728 +Formatting 'TEST_DIR/t.IMGFMT.mid', fmt=IMGFMT size=3145728 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT +Formatting 'TEST_DIR/t.IMGFMT.top', fmt=IMGFMT size=3145728 backing_file=TEST_DIR/t.IMGFMT.mid backing_fmt=IMGFMT + +# fill old and new backing with data + +wrote 1048576/1048576 bytes at offset 1048576 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 32768/32768 bytes at offset 1015808 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 32768/32768 bytes at offset 2097152 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +# rebase topmost image onto the new backing, with compression + +# verify that the 1st and 3rd clusters've become compressed + +[{ "start": 0, "length": 1048576, "depth": 0, "present": true, "zero": false, "data": true, "compressed": true}, +{ "start": 1048576, "length": 1048576, "depth": 1, "present": true, "zero": false, "data": true, "compressed": false, "offset": 5242880}, +{ "start": 2097152, "length": 1048576, "depth": 0, "present": true, "zero": false, "data": true, "compressed": true}] + +# verify that data is read the same before and after rebase + +read 32768/32768 bytes at offset 1015808 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1048576/1048576 bytes at offset 1048576 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 32768/32768 bytes at offset 2097152 +32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +# verify image bitmap + +L2 entry #0: 0x4008000000500000 0000000000000000 +L2 entry #1: 0x0000000000000000 0000000000000000 +L2 entry #2: 0x400800000050040b 0000000000000000 *** done diff --git a/tests/qemu-iotests/314 b/tests/qemu-iotests/314 new file mode 100755 index 0000000..96d7b4d --- /dev/null +++ b/tests/qemu-iotests/314 @@ -0,0 +1,165 @@ +#!/usr/bin/env bash +# group: rw backing auto quick +# +# Test qemu-img rebase with compression +# +# Copyright (c) 2023 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +# creator +owner=andrey.drobyshev@virtuozzo.com + +seq=`basename $0` +echo "QA output created by $seq" + +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img + _rm_test_img "$TEST_IMG.base" + _rm_test_img "$TEST_IMG.itmd" +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +# Want the size divisible by 2 and 3 +size=$(( 48 * 1024 * 1024 )) +half_size=$(( size / 2 )) +third_size=$(( size / 3 )) + +# 1. "qemu-img rebase -c" should refuse working with any format which doesn't +# support compression. We only check "-f raw" here. +echo +echo "=== Testing compressed rebase format compatibility ===" +echo + +$QEMU_IMG create -f raw "$TEST_IMG" "$size" | _filter_img_create +$QEMU_IMG rebase -c -f raw -b "" "$TEST_IMG" + +# 2. Write the 1st half of $size to backing file (compressed), 2nd half -- to +# the top image (also compressed). Rebase the top image onto no backing file, +# with compression (i.e. "qemu-img -c -b ''"). Check that the resulting image +# has the written data preserved, and "qemu-img check" reports 100% clusters +# as compressed. +echo +echo "=== Testing rebase with compression onto no backing file ===" +echo + +TEST_IMG="$TEST_IMG.base" _make_test_img $size +_make_test_img -b "$TEST_IMG.base" -F $IMGFMT $size + +$QEMU_IO -c "write -c -P 0xaa 0 $half_size" "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -c -P 0xbb $half_size $half_size" "$TEST_IMG" \ + | _filter_qemu_io + +$QEMU_IMG rebase -c -f $IMGFMT -b "" "$TEST_IMG" + +$QEMU_IO -c "read -P 0xaa 0 $half_size" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0xbb $half_size $half_size" "$TEST_IMG" | _filter_qemu_io + +$QEMU_IMG check "$TEST_IMG" | _filter_testdir + +# 3. Same as the previous one, but with raw backing file (hence write to +# the backing is uncompressed). +echo +echo "=== Testing rebase with compression with raw backing file ===" +echo + +$QEMU_IMG create -f raw "$TEST_IMG.base" "$half_size" | _filter_img_create +_make_test_img -b "$TEST_IMG.base" -F raw $size + +$QEMU_IO -f raw -c "write -P 0xaa 0 $half_size" "$TEST_IMG.base" \ + | _filter_qemu_io +$QEMU_IO -c "write -c -P 0xbb $half_size $half_size" \ + "$TEST_IMG" | _filter_qemu_io + +$QEMU_IMG rebase -c -f $IMGFMT -b "" "$TEST_IMG" + +$QEMU_IO -c "read -P 0xaa 0 $half_size" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0xbb $half_size $half_size" "$TEST_IMG" | _filter_qemu_io + +$QEMU_IMG check "$TEST_IMG" | _filter_testdir + +# 4. Create a backing chain base<--itmd<--img, filling 1st, 2nd and 3rd +# thirds of them, respectively (with compression). Rebase img onto base, +# effectively deleting itmd from the chain, and check that written data is +# preserved in the resulting image. Also check that "qemu-img check" reports +# 100% clusters as compressed. +echo +echo "=== Testing compressed rebase removing single delta from the chain ===" +echo + +TEST_IMG="$TEST_IMG.base" _make_test_img $size +TEST_IMG="$TEST_IMG.itmd" _make_test_img -b "$TEST_IMG.base" -F $IMGFMT $size +_make_test_img -b "$TEST_IMG.itmd" -F $IMGFMT $size + +$QEMU_IO -c "write -c -P 0xaa 0 $third_size" \ + "$TEST_IMG.base" | _filter_qemu_io +$QEMU_IO -c "write -c -P 0xbb $third_size $third_size" \ + "$TEST_IMG.itmd" | _filter_qemu_io +$QEMU_IO -c "write -c -P 0xcc $((third_size * 2 )) $third_size" \ + "$TEST_IMG" | _filter_qemu_io + +$QEMU_IMG rebase -c -f $IMGFMT -b "$TEST_IMG.base" -F $IMGFMT "$TEST_IMG" + +$QEMU_IO -c "read -P 0xaa 0 $third_size" "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0xbb $third_size $third_size" \ + "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -c "read -P 0xcc $(( third_size * 2 )) $third_size" \ + "$TEST_IMG" | _filter_qemu_io + +$QEMU_IMG check "$TEST_IMG" | _filter_testdir + +# 5. Create one-cluster backing and overlay images, and fill only the first +# (half - 1) bytes of the backing with data (uncompressed). Rebase the +# overlay onto no backing file with compression. Check that data is still +# read correctly, and that cluster is now really compressed ("qemu-img check" +# reports 100% clusters as compressed. +echo +echo "=== Testing compressed rebase with unaligned unmerged data ===" +echo + +CLUSTER_SIZE=65536 + +TEST_IMG="$TEST_IMG.base" _make_test_img $CLUSTER_SIZE +_make_test_img -b "$TEST_IMG.base" -F $IMGFMT $CLUSTER_SIZE + +$QEMU_IO -c "write -P 0xaa 0 $(( CLUSTER_SIZE / 2 - 1 ))" $TEST_IMG.base \ + | _filter_qemu_io + +$QEMU_IMG rebase -c -f $IMGFMT -b "" "$TEST_IMG" + +$QEMU_IO -c "read -P 0xaa 0 $(( CLUSTER_SIZE / 2 - 1 ))" "$TEST_IMG" \ + | _filter_qemu_io +$QEMU_IO -c \ + "read -P 0x00 $(( CLUSTER_SIZE / 2 - 1 )) $(( CLUSTER_SIZE / 2 + 1 ))" \ + "$TEST_IMG" | _filter_qemu_io + +$QEMU_IMG check "$TEST_IMG" | _filter_testdir + +# success, all done +echo +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/314.out b/tests/qemu-iotests/314.out new file mode 100644 index 0000000..ac9337a --- /dev/null +++ b/tests/qemu-iotests/314.out @@ -0,0 +1,75 @@ +QA output created by 314 + +=== Testing compressed rebase format compatibility === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=raw size=50331648 +qemu-img: Compression not supported for this file format + +=== Testing rebase with compression onto no backing file === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=50331648 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=50331648 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT +wrote 25165824/25165824 bytes at offset 0 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 25165824/25165824 bytes at offset 25165824 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 25165824/25165824 bytes at offset 0 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 25165824/25165824 bytes at offset 25165824 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +No errors were found on the image. +768/768 = 100.00% allocated, 100.00% fragmented, 100.00% compressed clusters +Image end offset: 458752 + +=== Testing rebase with compression with raw backing file === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=raw size=25165824 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=50331648 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=raw +wrote 25165824/25165824 bytes at offset 0 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 25165824/25165824 bytes at offset 25165824 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 25165824/25165824 bytes at offset 0 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 25165824/25165824 bytes at offset 25165824 +24 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +No errors were found on the image. +768/768 = 100.00% allocated, 100.00% fragmented, 100.00% compressed clusters +Image end offset: 458752 + +=== Testing compressed rebase removing single delta from the chain === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=50331648 +Formatting 'TEST_DIR/t.IMGFMT.itmd', fmt=IMGFMT size=50331648 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=50331648 backing_file=TEST_DIR/t.IMGFMT.itmd backing_fmt=IMGFMT +wrote 16777216/16777216 bytes at offset 0 +16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 16777216/16777216 bytes at offset 16777216 +16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 16777216/16777216 bytes at offset 33554432 +16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 16777216/16777216 bytes at offset 0 +16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 16777216/16777216 bytes at offset 16777216 +16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 16777216/16777216 bytes at offset 33554432 +16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +No errors were found on the image. +512/768 = 66.67% allocated, 100.00% fragmented, 100.00% compressed clusters +Image end offset: 458752 + +=== Testing compressed rebase with unaligned unmerged data === + +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=65536 +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT +wrote 32767/32767 bytes at offset 0 +31.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 32767/32767 bytes at offset 0 +31.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 32769/32769 bytes at offset 32767 +32.001 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +No errors were found on the image. +1/1 = 100.00% allocated, 100.00% fragmented, 100.00% compressed clusters +Image end offset: 393216 + +*** done diff --git a/tests/qemu-iotests/tests/mirror-change-copy-mode b/tests/qemu-iotests/tests/mirror-change-copy-mode new file mode 100755 index 0000000..51788b8 --- /dev/null +++ b/tests/qemu-iotests/tests/mirror-change-copy-mode @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# group: rw +# +# Test for changing mirror copy mode from background to active +# +# Copyright (C) 2023 Proxmox Server Solutions GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import time + +import iotests +from iotests import qemu_img, QemuStorageDaemon + +iops_target = 8 +iops_source = iops_target * 2 +image_size = 1 * 1024 * 1024 +source_img = os.path.join(iotests.test_dir, 'source.' + iotests.imgfmt) +target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt) +nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock') + +class TestMirrorChangeCopyMode(iotests.QMPTestCase): + + def setUp(self): + qemu_img('create', '-f', iotests.imgfmt, source_img, str(image_size)) + qemu_img('create', '-f', iotests.imgfmt, target_img, str(image_size)) + + self.qsd = QemuStorageDaemon('--nbd-server', + f'addr.type=unix,addr.path={nbd_sock}', + qmp=True) + + self.qsd.cmd('object-add', { + 'qom-type': 'throttle-group', + 'id': 'thrgr-target', + 'limits': { + 'iops-write': iops_target, + 'iops-write-max': iops_target + } + }) + + self.qsd.cmd('blockdev-add', { + 'node-name': 'target', + 'driver': 'throttle', + 'throttle-group': 'thrgr-target', + 'file': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': target_img + } + } + }) + + self.qsd.cmd('block-export-add', { + 'id': 'exp0', + 'type': 'nbd', + 'node-name': 'target', + 'writable': True + }) + + self.vm = iotests.VM() + self.vm.add_args('-drive', + f'file={source_img},if=none,format={iotests.imgfmt},' + f'iops_wr={iops_source},' + f'iops_wr_max={iops_source},' + 'id=source') + self.vm.launch() + + self.vm.cmd('blockdev-add', { + 'node-name': 'target', + 'driver': 'nbd', + 'export': 'target', + 'server': { + 'type': 'unix', + 'path': nbd_sock + } + }) + + + def tearDown(self): + self.vm.shutdown() + self.qsd.stop() + self.check_qemu_io_errors() + self.check_images_identical() + os.remove(source_img) + os.remove(target_img) + + # Once the VM is shut down we can parse the log and see if qemu-io ran + # without errors. + def check_qemu_io_errors(self): + self.assertFalse(self.vm.is_running()) + log = self.vm.get_log() + for line in log.split("\n"): + assert not line.startswith("Pattern verification failed") + + def check_images_identical(self): + qemu_img('compare', '-f', iotests.imgfmt, source_img, target_img) + + def start_mirror(self): + self.vm.cmd('blockdev-mirror', + job_id='mirror', + device='source', + target='target', + filter_node_name='mirror-top', + sync='full', + copy_mode='background') + + def test_background_to_active(self): + self.vm.hmp_qemu_io('source', f'write 0 {image_size}') + self.vm.hmp_qemu_io('target', f'write 0 {image_size}') + + self.start_mirror() + + result = self.vm.cmd('query-block-jobs') + assert not result[0]['actively-synced'] + + self.vm.event_wait('BLOCK_JOB_READY') + + result = self.vm.cmd('query-block-jobs') + assert not result[0]['actively-synced'] + + # Start some background requests. + reqs = 4 * iops_source + req_size = image_size // reqs + for i in range(0, reqs): + req = f'aio_write -P 7 {req_size * i} {req_size}' + self.vm.hmp_qemu_io('source', req) + + # Wait for the first few requests. + time.sleep(1) + self.vm.qtest(f'clock_step {1 * 1000 * 1000 * 1000}') + + result = self.vm.cmd('query-block-jobs') + # There should've been new requests. + assert result[0]['len'] > image_size + # To verify later that not all requests were completed at this point. + len_before_change = result[0]['len'] + + # Change the copy mode while requests are happening. + self.vm.cmd('block-job-change', + id='mirror', + type='mirror', + copy_mode='write-blocking') + + # Wait until image is actively synced. + while True: + time.sleep(0.1) + self.vm.qtest(f'clock_step {100 * 1000 * 1000}') + result = self.vm.cmd('query-block-jobs') + if result[0]['actively-synced']: + break + + # Because of throttling, not all requests should have been completed + # above. + result = self.vm.cmd('query-block-jobs') + assert result[0]['len'] > len_before_change + + # Issue enough requests for a few seconds only touching the first half + # of the image. + reqs = 4 * iops_target + req_size = image_size // 2 // reqs + for i in range(0, reqs): + req = f'aio_write -P 19 {req_size * i} {req_size}' + self.vm.hmp_qemu_io('source', req) + + # Now issue a synchronous write in the second half of the image and + # immediately verify that it was written to the target too. This would + # fail without switching the copy mode. Note that this only produces a + # log line and the actual checking happens during tearDown(). + req_args = f'-P 37 {3 * (image_size // 4)} {req_size}' + self.vm.hmp_qemu_io('source', f'write {req_args}') + self.vm.hmp_qemu_io('target', f'read {req_args}') + + self.vm.cmd('block-job-cancel', device='mirror') + while len(self.vm.cmd('query-block-jobs')) > 0: + time.sleep(0.1) + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2', 'raw'], + supported_protocols=['file']) diff --git a/tests/qemu-iotests/tests/mirror-change-copy-mode.out b/tests/qemu-iotests/tests/mirror-change-copy-mode.out new file mode 100644 index 0000000..ae1213e --- /dev/null +++ b/tests/qemu-iotests/tests/mirror-change-copy-mode.out @@ -0,0 +1,5 @@ +. +---------------------------------------------------------------------- +Ran 1 tests + +OK diff --git a/util/defer-call.c b/util/defer-call.c new file mode 100644 index 0000000..037dc0a --- /dev/null +++ b/util/defer-call.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Deferred calls + * + * Copyright Red Hat. + * + * This API defers a function call within a defer_call_begin()/defer_call_end() + * section, allowing multiple calls to batch up. This is a performance + * optimization that is used in the block layer to submit several I/O requests + * at once instead of individually: + * + * defer_call_begin(); <-- start of section + * ... + * defer_call(my_func, my_obj); <-- deferred my_func(my_obj) call + * defer_call(my_func, my_obj); <-- another + * defer_call(my_func, my_obj); <-- another + * ... + * defer_call_end(); <-- end of section, my_func(my_obj) is called once + */ + +#include "qemu/osdep.h" +#include "qemu/coroutine-tls.h" +#include "qemu/notify.h" +#include "qemu/thread.h" +#include "qemu/defer-call.h" + +/* A function call that has been deferred until defer_call_end() */ +typedef struct { + void (*fn)(void *); + void *opaque; +} DeferredCall; + +/* Per-thread state */ +typedef struct { + unsigned nesting_level; + GArray *deferred_call_array; +} DeferCallThreadState; + +/* Use get_ptr_defer_call_thread_state() to fetch this thread-local value */ +QEMU_DEFINE_STATIC_CO_TLS(DeferCallThreadState, defer_call_thread_state); + +/* Called at thread cleanup time */ +static void defer_call_atexit(Notifier *n, void *value) +{ + DeferCallThreadState *thread_state = get_ptr_defer_call_thread_state(); + g_array_free(thread_state->deferred_call_array, TRUE); +} + +/* This won't involve coroutines, so use __thread */ +static __thread Notifier defer_call_atexit_notifier; + +/** + * defer_call: + * @fn: a function pointer to be invoked + * @opaque: a user-defined argument to @fn() + * + * Call @fn(@opaque) immediately if not within a + * defer_call_begin()/defer_call_end() section. + * + * Otherwise defer the call until the end of the outermost + * defer_call_begin()/defer_call_end() section in this thread. If the same + * @fn/@opaque pair has already been deferred, it will only be called once upon + * defer_call_end() so that accumulated calls are batched into a single call. + * + * The caller must ensure that @opaque is not freed before @fn() is invoked. + */ +void defer_call(void (*fn)(void *), void *opaque) +{ + DeferCallThreadState *thread_state = get_ptr_defer_call_thread_state(); + + /* Call immediately if we're not deferring calls */ + if (thread_state->nesting_level == 0) { + fn(opaque); + return; + } + + GArray *array = thread_state->deferred_call_array; + if (!array) { + array = g_array_new(FALSE, FALSE, sizeof(DeferredCall)); + thread_state->deferred_call_array = array; + defer_call_atexit_notifier.notify = defer_call_atexit; + qemu_thread_atexit_add(&defer_call_atexit_notifier); + } + + DeferredCall *fns = (DeferredCall *)array->data; + DeferredCall new_fn = { + .fn = fn, + .opaque = opaque, + }; + + /* + * There won't be many, so do a linear search. If this becomes a bottleneck + * then a binary search (glib 2.62+) or different data structure could be + * used. + */ + for (guint i = 0; i < array->len; i++) { + if (memcmp(&fns[i], &new_fn, sizeof(new_fn)) == 0) { + return; /* already exists */ + } + } + + g_array_append_val(array, new_fn); +} + +/** + * defer_call_begin: Defer defer_call() functions until defer_call_end() + * + * defer_call_begin() and defer_call_end() are thread-local operations. The + * caller must ensure that each defer_call_begin() has a matching + * defer_call_end() in the same thread. + * + * Nesting is supported. defer_call() functions are only called at the + * outermost defer_call_end(). + */ +void defer_call_begin(void) +{ + DeferCallThreadState *thread_state = get_ptr_defer_call_thread_state(); + + assert(thread_state->nesting_level < UINT32_MAX); + + thread_state->nesting_level++; +} + +/** + * defer_call_end: Run any pending defer_call() functions + * + * There must have been a matching defer_call_begin() call in the same thread + * prior to this defer_call_end() call. + */ +void defer_call_end(void) +{ + DeferCallThreadState *thread_state = get_ptr_defer_call_thread_state(); + + assert(thread_state->nesting_level > 0); + + if (--thread_state->nesting_level > 0) { + return; + } + + GArray *array = thread_state->deferred_call_array; + if (!array) { + return; + } + + DeferredCall *fns = (DeferredCall *)array->data; + + for (guint i = 0; i < array->len; i++) { + fns[i].fn(fns[i].opaque); + } + + /* + * This resets the array without freeing memory so that appending is cheap + * in the future. + */ + g_array_set_size(array, 0); +} diff --git a/util/meson.build b/util/meson.build index c4827fd..769b24f 100644 --- a/util/meson.build +++ b/util/meson.build @@ -28,6 +28,7 @@ util_ss.add(when: 'CONFIG_WIN32', if_true: pathcch) if glib_has_gslice util_ss.add(files('qtree.c')) endif +util_ss.add(files('defer-call.c')) util_ss.add(files('envlist.c', 'path.c', 'module.c')) util_ss.add(files('host-utils.c')) util_ss.add(files('bitmap.c', 'bitops.c')) diff --git a/util/thread-pool.c b/util/thread-pool.c index 22f9ba3..27eb777 100644 --- a/util/thread-pool.c +++ b/util/thread-pool.c @@ -15,6 +15,7 @@ * GNU GPL, version 2 or (at your option) any later version. */ #include "qemu/osdep.h" +#include "qemu/defer-call.h" #include "qemu/queue.h" #include "qemu/thread.h" #include "qemu/coroutine.h" @@ -175,6 +176,8 @@ static void thread_pool_completion_bh(void *opaque) ThreadPool *pool = opaque; ThreadPoolElement *elem, *next; + defer_call_begin(); /* cb() may use defer_call() to coalesce work */ + restart: QLIST_FOREACH_SAFE(elem, &pool->head, all, next) { if (elem->state != THREAD_DONE) { @@ -208,6 +211,8 @@ restart: qemu_aio_unref(elem); } } + + defer_call_end(); } static void thread_pool_cancel(BlockAIOCB *acb) |