diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2021-09-15 18:55:59 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2021-09-15 18:55:59 +0100 |
commit | 57b6f58c1d0df757c9311496c32d502925056894 (patch) | |
tree | ed3efc670580fda321854d892382347f93c7c212 | |
parent | 7b7ab2d6c99987e46aa53478798a05fcaf02226e (diff) | |
parent | 1899bf47375ad40555dcdff12ba49b4b8b82df38 (diff) | |
download | qemu-57b6f58c1d0df757c9311496c32d502925056894.zip qemu-57b6f58c1d0df757c9311496c32d502925056894.tar.gz qemu-57b6f58c1d0df757c9311496c32d502925056894.tar.bz2 |
Merge remote-tracking branch 'remotes/hreitz/tags/pull-block-2021-09-15' into staging
Block patches:
- Block-status cache for data regions
- qcow2 optimization (when using subclusters)
- iotests delinting, and let 297 (lint checker) cover named iotests
- qcow2 check improvements
- Added -F (target backing file format) option to qemu-img convert
- Mirror job fix
- Fix for when a migration is initiated while a backup job runs
- Fix for uncached qemu-img convert to a volume with 4k sectors (for an
unaligned image)
- Minor gluster driver fix
# gpg: Signature made Wed 15 Sep 2021 18:39:11 BST
# gpg: using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg: issuer "hreitz@redhat.com"
# gpg: Good signature from "Hanna Reitz <hreitz@redhat.com>" [marginal]
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg: It is not certain that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00 4D34 A1FA 40D0 9801 9CDF
* remotes/hreitz/tags/pull-block-2021-09-15: (32 commits)
qemu-img: Add -F shorthand to convert
qcow2-refcount: check_refblocks(): add separate message for reserved
qcow2-refcount: check_refcounts_l1(): check reserved bits
qcow2-refcount: improve style of check_refcounts_l1()
qcow2-refcount: check_refcounts_l2(): check reserved bits
qcow2-refcount: check_refcounts_l2(): check l2_bitmap
qcow2-refcount: fix_l2_entry_by_zero(): also zero L2 entry bitmap
qcow2-refcount: introduce fix_l2_entry_by_zero()
qcow2: introduce qcow2_parse_compressed_l2_entry() helper
qcow2: compressed read: simplify cluster descriptor passing
qcow2-refcount: improve style of check_refcounts_l2()
qemu-img: Allow target be aligned to sector size
qcow2: handle_dependencies(): relax conflict detection
qcow2: refactor handle_dependencies() loop body
simplebench: add img_bench_templater.py
block: bdrv_inactivate_recurse(): check for permissions and fix crash
tests: add migrate-during-backup
block/mirror: fix NULL pointer dereference in mirror_wait_on_conflicts()
iotests/297: Cover tests/
mirror-top-perms: Fix AbnormalShutdown path
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | block.c | 88 | ||||
-rw-r--r-- | block/file-posix.c | 7 | ||||
-rw-r--r-- | block/gluster.c | 23 | ||||
-rw-r--r-- | block/io.c | 68 | ||||
-rw-r--r-- | block/iscsi.c | 3 | ||||
-rw-r--r-- | block/mirror.c | 25 | ||||
-rw-r--r-- | block/qcow2-cluster.c | 78 | ||||
-rw-r--r-- | block/qcow2-refcount.c | 326 | ||||
-rw-r--r-- | block/qcow2.c | 13 | ||||
-rw-r--r-- | block/qcow2.h | 7 | ||||
-rw-r--r-- | docs/tools/qemu-img.rst | 4 | ||||
-rw-r--r-- | include/block/block_int.h | 61 | ||||
-rw-r--r-- | qemu-img-cmds.hx | 2 | ||||
-rw-r--r-- | qemu-img.c | 18 | ||||
-rwxr-xr-x | scripts/simplebench/img_bench_templater.py | 95 | ||||
-rw-r--r-- | scripts/simplebench/table_templater.py | 62 | ||||
-rwxr-xr-x | tests/qemu-iotests/122 | 2 | ||||
-rwxr-xr-x | tests/qemu-iotests/271 | 5 | ||||
-rw-r--r-- | tests/qemu-iotests/271.out | 4 | ||||
-rwxr-xr-x | tests/qemu-iotests/297 | 9 | ||||
-rw-r--r-- | tests/qemu-iotests/iotests.py | 12 | ||||
-rwxr-xr-x | tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test | 13 | ||||
-rwxr-xr-x | tests/qemu-iotests/tests/migrate-bitmaps-test | 43 | ||||
-rwxr-xr-x | tests/qemu-iotests/tests/migrate-during-backup | 97 | ||||
-rw-r--r-- | tests/qemu-iotests/tests/migrate-during-backup.out | 5 | ||||
-rwxr-xr-x | tests/qemu-iotests/tests/mirror-top-perms | 2 |
26 files changed, 855 insertions, 217 deletions
@@ -49,6 +49,8 @@ #include "qemu/timer.h" #include "qemu/cutils.h" #include "qemu/id.h" +#include "qemu/range.h" +#include "qemu/rcu.h" #include "block/coroutines.h" #ifdef CONFIG_BSD @@ -401,6 +403,9 @@ BlockDriverState *bdrv_new(void) qemu_co_queue_init(&bs->flush_queue); + qemu_co_mutex_init(&bs->bsc_modify_lock); + bs->block_status_cache = g_new0(BdrvBlockStatusCache, 1); + for (i = 0; i < bdrv_drain_all_count; i++) { bdrv_drained_begin(bs); } @@ -4694,6 +4699,8 @@ static void bdrv_close(BlockDriverState *bs) bs->explicit_options = NULL; qobject_unref(bs->full_open_options); bs->full_open_options = NULL; + g_free(bs->block_status_cache); + bs->block_status_cache = NULL; bdrv_release_named_dirty_bitmaps(bs); assert(QLIST_EMPTY(&bs->dirty_bitmaps)); @@ -6319,6 +6326,7 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs) { BdrvChild *child, *parent; int ret; + uint64_t cumulative_perms, cumulative_shared_perms; if (!bs->drv) { return -ENOMEDIUM; @@ -6349,6 +6357,13 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs) } } + bdrv_get_cumulative_perm(bs, &cumulative_perms, + &cumulative_shared_perms); + if (cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) { + /* Our inactive parents still need write access. Inactivation failed. */ + return -EPERM; + } + bs->open_flags |= BDRV_O_INACTIVE; /* @@ -7684,3 +7699,76 @@ BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs) { return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs))); } + +/** + * Check whether [offset, offset + bytes) overlaps with the cached + * block-status data region. + * + * If so, and @pnum is not NULL, set *pnum to `bsc.data_end - offset`, + * which is what bdrv_bsc_is_data()'s interface needs. + * Otherwise, *pnum is not touched. + */ +static bool bdrv_bsc_range_overlaps_locked(BlockDriverState *bs, + int64_t offset, int64_t bytes, + int64_t *pnum) +{ + BdrvBlockStatusCache *bsc = qatomic_rcu_read(&bs->block_status_cache); + bool overlaps; + + overlaps = + qatomic_read(&bsc->valid) && + ranges_overlap(offset, bytes, bsc->data_start, + bsc->data_end - bsc->data_start); + + if (overlaps && pnum) { + *pnum = bsc->data_end - offset; + } + + return overlaps; +} + +/** + * See block_int.h for this function's documentation. + */ +bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum) +{ + RCU_READ_LOCK_GUARD(); + + return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum); +} + +/** + * See block_int.h for this function's documentation. + */ +void bdrv_bsc_invalidate_range(BlockDriverState *bs, + int64_t offset, int64_t bytes) +{ + RCU_READ_LOCK_GUARD(); + + if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) { + qatomic_set(&bs->block_status_cache->valid, false); + } +} + +/** + * See block_int.h for this function's documentation. + */ +void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes) +{ + BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1); + BdrvBlockStatusCache *old_bsc; + + *new_bsc = (BdrvBlockStatusCache) { + .valid = true, + .data_start = offset, + .data_end = offset + bytes, + }; + + QEMU_LOCK_GUARD(&bs->bsc_modify_lock); + + old_bsc = qatomic_rcu_read(&bs->block_status_cache); + qatomic_rcu_set(&bs->block_status_cache, new_bsc); + if (old_bsc) { + g_free_rcu(old_bsc, rcu); + } +} diff --git a/block/file-posix.c b/block/file-posix.c index cb9bffe..9f35e56 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2744,7 +2744,8 @@ static int find_allocation(BlockDriverState *bs, off_t start, * the specified offset) that are known to be in the same * allocated/unallocated state. * - * 'bytes' is the max value 'pnum' should be set to. + * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may + * well exceed it. */ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, bool want_zero, @@ -2782,7 +2783,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, } else if (data == offset) { /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ - *pnum = MIN(bytes, hole - offset); + *pnum = hole - offset; /* * We are not allowed to return partial sectors, though, so @@ -2801,7 +2802,7 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs, } else { /* On a hole, compute bytes to the beginning of the next extent. */ assert(hole == offset); - *pnum = MIN(bytes, data - offset); + *pnum = data - offset; ret = BDRV_BLOCK_ZERO; } *map = offset; diff --git a/block/gluster.c b/block/gluster.c index e8ee14c..d51938e 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1461,7 +1461,8 @@ exit: * the specified offset) that are known to be in the same * allocated/unallocated state. * - * 'bytes' is the max value 'pnum' should be set to. + * 'bytes' is a soft cap for 'pnum'. If the information is free, 'pnum' may + * well exceed it. * * (Based on raw_co_block_status() from file-posix.c.) */ @@ -1477,6 +1478,8 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, off_t data = 0, hole = 0; int ret = -EINVAL; + assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment)); + if (!s->fd) { return ret; } @@ -1500,12 +1503,26 @@ static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs, } else if (data == offset) { /* On a data extent, compute bytes to the end of the extent, * possibly including a partial sector at EOF. */ - *pnum = MIN(bytes, hole - offset); + *pnum = hole - offset; + + /* + * We are not allowed to return partial sectors, though, so + * round up if necessary. + */ + if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) { + int64_t file_length = qemu_gluster_getlength(bs); + if (file_length > 0) { + /* Ignore errors, this is just a safeguard */ + assert(hole == file_length); + } + *pnum = ROUND_UP(*pnum, bs->bl.request_alignment); + } + ret = BDRV_BLOCK_DATA; } else { /* On a hole, compute bytes to the beginning of the next extent. */ assert(hole == offset); - *pnum = MIN(bytes, data - offset); + *pnum = data - offset; ret = BDRV_BLOCK_ZERO; } @@ -1883,6 +1883,9 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, return -ENOTSUP; } + /* Invalidate the cached block-status data range if this write overlaps */ + bdrv_bsc_invalidate_range(bs, offset, bytes); + assert(alignment % bs->bl.request_alignment == 0); head = offset % alignment; tail = (offset + bytes) % alignment; @@ -2447,9 +2450,65 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; if (bs->drv->bdrv_co_block_status) { - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, - aligned_bytes, pnum, &local_map, - &local_file); + /* + * Use the block-status cache only for protocol nodes: Format + * drivers are generally quick to inquire the status, but protocol + * drivers often need to get information from outside of qemu, so + * we do not have control over the actual implementation. There + * have been cases where inquiring the status took an unreasonably + * long time, and we can do nothing in qemu to fix it. + * This is especially problematic for images with large data areas, + * because finding the few holes in them and giving them special + * treatment does not gain much performance. Therefore, we try to + * cache the last-identified data region. + * + * Second, limiting ourselves to protocol nodes allows us to assume + * the block status for data regions to be DATA | OFFSET_VALID, and + * that the host offset is the same as the guest offset. + * + * Note that it is possible that external writers zero parts of + * the cached regions without the cache being invalidated, and so + * we may report zeroes as data. This is not catastrophic, + * however, because reporting zeroes as data is fine. + */ + if (QLIST_EMPTY(&bs->children) && + bdrv_bsc_is_data(bs, aligned_offset, pnum)) + { + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; + local_file = bs; + local_map = aligned_offset; + } else { + ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, + aligned_bytes, pnum, &local_map, + &local_file); + + /* + * Note that checking QLIST_EMPTY(&bs->children) is also done when + * the cache is queried above. Technically, we do not need to check + * it here; the worst that can happen is that we fill the cache for + * non-protocol nodes, and then it is never used. However, filling + * the cache requires an RCU update, so double check here to avoid + * such an update if possible. + */ + if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && + QLIST_EMPTY(&bs->children)) + { + /* + * When a protocol driver reports BLOCK_OFFSET_VALID, the + * returned local_map value must be the same as the offset we + * have passed (aligned_offset), and local_bs must be the node + * itself. + * Assert this, because we follow this rule when reading from + * the cache (see the `local_file = bs` and + * `local_map = aligned_offset` assignments above), and the + * result the cache delivers must be the same as the driver + * would deliver. + */ + assert(local_file == bs); + assert(local_map == aligned_offset); + bdrv_bsc_fill(bs, aligned_offset, *pnum); + } + } } else { /* Default code for filters */ @@ -3002,6 +3061,9 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, return 0; } + /* Invalidate the cached block-status data range if this discard overlaps */ + bdrv_bsc_invalidate_range(bs, offset, bytes); + /* Discard is advisory, but some devices track and coalesce * unaligned requests, so we must pass everything down rather than * round here. Still, most devices will just silently ignore diff --git a/block/iscsi.c b/block/iscsi.c index 4d2a416..8523840 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -781,9 +781,6 @@ retry: iscsi_allocmap_set_allocated(iscsilun, offset, *pnum); } - if (*pnum > bytes) { - *pnum = bytes; - } out_unlock: qemu_mutex_unlock(&iscsilun->mutex); g_free(iTask.err_str); diff --git a/block/mirror.c b/block/mirror.c index 98fc66e..85b781b 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -160,18 +160,25 @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp *self, if (ranges_overlap(self_start_chunk, self_nb_chunks, op_start_chunk, op_nb_chunks)) { - /* - * If the operation is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). - */ - if (op->waiting_for_op) { - continue; + if (self) { + /* + * If the operation is already (indirectly) waiting for us, + * or will wait for us as soon as it wakes up, then just go + * on (instead of producing a deadlock in the former case). + */ + if (op->waiting_for_op) { + continue; + } + + self->waiting_for_op = op; } - self->waiting_for_op = op; qemu_co_queue_wait(&op->waiting_requests, NULL); - self->waiting_for_op = NULL; + + if (self) { + self->waiting_for_op = NULL; + } + break; } } diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index bd05978..4ebb49a 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -556,8 +556,7 @@ static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, * offset needs to be aligned to a cluster boundary. * * If the cluster is unallocated then *host_offset will be 0. - * If the cluster is compressed then *host_offset will contain the - * complete compressed cluster descriptor. + * If the cluster is compressed then *host_offset will contain the l2 entry. * * On entry, *bytes is the maximum number of contiguous bytes starting at * offset that we are interested in. @@ -660,7 +659,7 @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset, ret = -EIO; goto fail; } - *host_offset = l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK; + *host_offset = l2_entry; break; case QCOW2_SUBCLUSTER_ZERO_PLAIN: case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN: @@ -1400,29 +1399,47 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, if (end <= old_start || start >= old_end) { /* No intersection */ + continue; + } + + if (old_alloc->keep_old_clusters && + (end <= l2meta_cow_start(old_alloc) || + start >= l2meta_cow_end(old_alloc))) + { + /* + * Clusters intersect but COW areas don't. And cluster itself is + * already allocated. So, there is no actual conflict. + */ + continue; + } + + /* Conflict */ + + if (start < old_start) { + /* Stop at the start of a running allocation */ + bytes = old_start - start; } else { - if (start < old_start) { - /* Stop at the start of a running allocation */ - bytes = old_start - start; - } else { - bytes = 0; - } + bytes = 0; + } - /* Stop if already an l2meta exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ - if (bytes == 0 && *m) { - *cur_bytes = 0; - return 0; - } + /* + * Stop if an l2meta already exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. + */ + if (bytes == 0 && *m) { + *cur_bytes = 0; + return 0; + } - if (bytes == 0) { - /* Wait for the dependency to complete. We need to recheck - * the free/allocated clusters when we continue. */ - qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); - return -EAGAIN; - } + if (bytes == 0) { + /* + * Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. + */ + qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); + return -EAGAIN; } } @@ -2463,3 +2480,18 @@ fail: g_free(l1_table); return ret; } + +void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, + uint64_t *coffset, int *csize) +{ + BDRVQcow2State *s = bs->opaque; + int nb_csectors; + + assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); + + *coffset = l2_entry & s->cluster_offset_mask; + + nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; + *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - + (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1)); +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 8e649b0..4614572 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1177,11 +1177,11 @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry, switch (ctype) { case QCOW2_CLUSTER_COMPRESSED: { - int64_t offset = (l2_entry & s->cluster_offset_mask) - & QCOW2_COMPRESSED_SECTOR_MASK; - int size = QCOW2_COMPRESSED_SECTOR_SIZE * - (((l2_entry >> s->csize_shift) & s->csize_mask) + 1); - qcow2_free_clusters(bs, offset, size, type); + uint64_t coffset; + int csize; + + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); + qcow2_free_clusters(bs, coffset, csize, type); } break; case QCOW2_CLUSTER_NORMAL: @@ -1247,7 +1247,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, bool l1_allocated = false; int64_t old_entry, old_l2_offset; unsigned slice, slice_size2, n_slices; - int i, j, l1_modified = 0, nb_csectors; + int i, j, l1_modified = 0; int ret; assert(addend >= -1 && addend <= 1); @@ -1318,14 +1318,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, switch (qcow2_get_cluster_type(bs, entry)) { case QCOW2_CLUSTER_COMPRESSED: - nb_csectors = ((entry >> s->csize_shift) & - s->csize_mask) + 1; if (addend != 0) { - uint64_t coffset = (entry & s->cluster_offset_mask) - & QCOW2_COMPRESSED_SECTOR_MASK; + uint64_t coffset; + int csize; + + qcow2_parse_compressed_l2_entry(bs, entry, + &coffset, &csize); ret = update_refcount( - bs, coffset, - nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE, + bs, coffset, csize, abs(addend), addend < 0, QCOW2_DISCARD_SNAPSHOT); if (ret < 0) { @@ -1588,6 +1588,66 @@ enum { }; /* + * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present + * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN). + * + * This function decrements res->corruptions on success, so the caller is + * responsible to increment res->corruptions prior to the call. + * + * On failure in-memory @l2_table may be modified. + */ +static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, + uint64_t l2_offset, + uint64_t *l2_table, int l2_index, bool active, + bool *metadata_overlap) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t)); + uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s); + int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2; + + if (has_subclusters(s)) { + uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index); + + /* Allocated subclusters become zero */ + l2_bitmap |= l2_bitmap << 32; + l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES; + + set_l2_bitmap(s, l2_table, l2_index, l2_bitmap); + set_l2_entry(s, l2_table, l2_index, 0); + } else { + set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO); + } + + ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s), + false); + if (metadata_overlap) { + *metadata_overlap = ret < 0; + } + if (ret < 0) { + fprintf(stderr, "ERROR: Overlap check failed\n"); + goto fail; + } + + ret = bdrv_pwrite_sync(bs->file, l2e_offset, &l2_table[idx], + l2_entry_size(s)); + if (ret < 0) { + fprintf(stderr, "ERROR: Failed to overwrite L2 " + "table entry: %s\n", strerror(-ret)); + goto fail; + } + + res->corruptions--; + res->corruptions_fixed++; + return 0; + +fail: + res->check_errors++; + return ret; +} + +/* * Increases the refcount in the given refcount table for the all clusters * referenced in the L2 table. While doing so, performs some checks on L2 * entries. @@ -1601,26 +1661,41 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, int flags, BdrvCheckMode fix, bool active) { BDRVQcow2State *s = bs->opaque; - uint64_t *l2_table, l2_entry; + uint64_t l2_entry, l2_bitmap; uint64_t next_contiguous_offset = 0; - int i, l2_size, nb_csectors, ret; + int i, ret; + size_t l2_size_bytes = s->l2_size * l2_entry_size(s); + g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes); + bool metadata_overlap; /* Read L2 table from disk */ - l2_size = s->l2_size * l2_entry_size(s); - l2_table = g_malloc(l2_size); - - ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); + ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); res->check_errors++; - goto fail; + return ret; } /* Do the actual checks */ - for(i = 0; i < s->l2_size; i++) { + for (i = 0; i < s->l2_size; i++) { + uint64_t coffset; + int csize; + QCow2ClusterType type; + l2_entry = get_l2_entry(s, l2_table, i); + l2_bitmap = get_l2_bitmap(s, l2_table, i); + type = qcow2_get_cluster_type(bs, l2_entry); + + if (type != QCOW2_CLUSTER_COMPRESSED) { + /* Check reserved bits of Standard Cluster Descriptor */ + if (l2_entry & L2E_STD_RESERVED_MASK) { + fprintf(stderr, "ERROR found l2 entry with reserved bits set: " + "%" PRIx64 "\n", l2_entry); + res->corruptions++; + } + } - switch (qcow2_get_cluster_type(bs, l2_entry)) { + switch (type) { case QCOW2_CLUSTER_COMPRESSED: /* Compressed clusters don't have QCOW_OFLAG_COPIED */ if (l2_entry & QCOW_OFLAG_COPIED) { @@ -1638,23 +1713,28 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, break; } + if (l2_bitmap) { + fprintf(stderr, "ERROR compressed cluster %d with non-zero " + "subcluster allocation bitmap, entry=0x%" PRIx64 "\n", + i, l2_entry); + res->corruptions++; + break; + } + /* Mark cluster as used */ - nb_csectors = ((l2_entry >> s->csize_shift) & - s->csize_mask) + 1; - l2_entry &= s->cluster_offset_mask; + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); ret = qcow2_inc_refcounts_imrt( - bs, res, refcount_table, refcount_table_size, - l2_entry & QCOW2_COMPRESSED_SECTOR_MASK, - nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE); + bs, res, refcount_table, refcount_table_size, coffset, csize); if (ret < 0) { - goto fail; + return ret; } if (flags & CHECK_FRAG_INFO) { res->bfi.allocated_clusters++; res->bfi.compressed_clusters++; - /* Compressed clusters are fragmented by nature. Since they + /* + * Compressed clusters are fragmented by nature. Since they * take up sub-sector space but we only have sector granularity * I/O we need to re-read the same sectors even for adjacent * compressed clusters. @@ -1668,13 +1748,19 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, { uint64_t offset = l2_entry & L2E_OFFSET_MASK; + if ((l2_bitmap >> 32) & l2_bitmap) { + res->corruptions++; + fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated " + "cluster has corrupted subcluster allocation bitmap\n", + offset); + } + /* Correct offsets are cluster aligned */ if (offset_into_cluster(s, offset)) { bool contains_data; res->corruptions++; if (has_subclusters(s)) { - uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, i); contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC); } else { contains_data = !(l2_entry & QCOW_OFLAG_ZERO); @@ -1687,40 +1773,30 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", offset); if (fix & BDRV_FIX_ERRORS) { - int idx = i * (l2_entry_size(s) / sizeof(uint64_t)); - uint64_t l2e_offset = - l2_offset + (uint64_t)i * l2_entry_size(s); - int ign = active ? QCOW2_OL_ACTIVE_L2 : - QCOW2_OL_INACTIVE_L2; - - l2_entry = has_subclusters(s) ? 0 : QCOW_OFLAG_ZERO; - set_l2_entry(s, l2_table, i, l2_entry); - ret = qcow2_pre_write_overlap_check(bs, ign, - l2e_offset, l2_entry_size(s), false); - if (ret < 0) { - fprintf(stderr, "ERROR: Overlap check failed\n"); - res->check_errors++; - /* Something is seriously wrong, so abort checking - * this L2 table */ - goto fail; + ret = fix_l2_entry_by_zero(bs, res, l2_offset, + l2_table, i, active, + &metadata_overlap); + if (metadata_overlap) { + /* + * Something is seriously wrong, so abort checking + * this L2 table. + */ + return ret; } - ret = bdrv_pwrite_sync(bs->file, l2e_offset, - &l2_table[idx], - l2_entry_size(s)); - if (ret < 0) { - fprintf(stderr, "ERROR: Failed to overwrite L2 " - "table entry: %s\n", strerror(-ret)); - res->check_errors++; - /* Do not abort, continue checking the rest of this - * L2 table's entries */ - } else { - res->corruptions--; - res->corruptions_fixed++; - /* Skip marking the cluster as used - * (it is unused now) */ + if (ret == 0) { + /* + * Skip marking the cluster as used + * (it is unused now). + */ continue; } + + /* + * Failed to fix. + * Do not abort, continue checking the rest of this + * L2 table's entries. + */ } } else { fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is " @@ -1743,14 +1819,23 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, refcount_table_size, offset, s->cluster_size); if (ret < 0) { - goto fail; + return ret; } } break; } case QCOW2_CLUSTER_ZERO_PLAIN: + /* Impossible when image has subclusters */ + assert(!l2_bitmap); + break; + case QCOW2_CLUSTER_UNALLOCATED: + if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) { + res->corruptions++; + fprintf(stderr, "ERROR: Unallocated " + "cluster has non-zero subcluster allocation map\n"); + } break; default: @@ -1758,12 +1843,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, } } - g_free(l2_table); return 0; - -fail: - g_free(l2_table); - return ret; } /* @@ -1782,71 +1862,79 @@ static int check_refcounts_l1(BlockDriverState *bs, int flags, BdrvCheckMode fix, bool active) { BDRVQcow2State *s = bs->opaque; - uint64_t *l1_table = NULL, l2_offset, l1_size2; + size_t l1_size_bytes = l1_size * L1E_SIZE; + g_autofree uint64_t *l1_table = NULL; + uint64_t l2_offset; int i, ret; - l1_size2 = l1_size * L1E_SIZE; + if (!l1_size) { + return 0; + } /* Mark L1 table as used */ ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size, - l1_table_offset, l1_size2); + l1_table_offset, l1_size_bytes); if (ret < 0) { - goto fail; + return ret; + } + + l1_table = g_try_malloc(l1_size_bytes); + if (l1_table == NULL) { + res->check_errors++; + return -ENOMEM; } /* Read L1 table entries from disk */ - if (l1_size2 > 0) { - l1_table = g_try_malloc(l1_size2); - if (l1_table == NULL) { - ret = -ENOMEM; - res->check_errors++; - goto fail; - } - ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); - if (ret < 0) { - fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); - res->check_errors++; - goto fail; - } - for(i = 0;i < l1_size; i++) - be64_to_cpus(&l1_table[i]); + ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size_bytes); + if (ret < 0) { + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + res->check_errors++; + return ret; + } + + for (i = 0; i < l1_size; i++) { + be64_to_cpus(&l1_table[i]); } /* Do the actual checks */ - for(i = 0; i < l1_size; i++) { - l2_offset = l1_table[i]; - if (l2_offset) { - /* Mark L2 table as used */ - l2_offset &= L1E_OFFSET_MASK; - ret = qcow2_inc_refcounts_imrt(bs, res, - refcount_table, refcount_table_size, - l2_offset, s->cluster_size); - if (ret < 0) { - goto fail; - } + for (i = 0; i < l1_size; i++) { + if (!l1_table[i]) { + continue; + } - /* L2 tables are cluster aligned */ - if (offset_into_cluster(s, l2_offset)) { - fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " - "cluster aligned; L1 entry corrupted\n", l2_offset); - res->corruptions++; - } + if (l1_table[i] & L1E_RESERVED_MASK) { + fprintf(stderr, "ERROR found L1 entry with reserved bits set: " + "%" PRIx64 "\n", l1_table[i]); + res->corruptions++; + } - /* Process and check L2 entries */ - ret = check_refcounts_l2(bs, res, refcount_table, - refcount_table_size, l2_offset, flags, - fix, active); - if (ret < 0) { - goto fail; - } + l2_offset = l1_table[i] & L1E_OFFSET_MASK; + + /* Mark L2 table as used */ + ret = qcow2_inc_refcounts_imrt(bs, res, + refcount_table, refcount_table_size, + l2_offset, s->cluster_size); + if (ret < 0) { + return ret; + } + + /* L2 tables are cluster aligned */ + if (offset_into_cluster(s, l2_offset)) { + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " + "cluster aligned; L1 entry corrupted\n", l2_offset); + res->corruptions++; + } + + /* Process and check L2 entries */ + ret = check_refcounts_l2(bs, res, refcount_table, + refcount_table_size, l2_offset, flags, + fix, active); + if (ret < 0) { + return ret; } } - g_free(l1_table); - return 0; -fail: - g_free(l1_table); - return ret; + return 0; } /* @@ -2001,9 +2089,17 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, for(i = 0; i < s->refcount_table_size; i++) { uint64_t offset, cluster; - offset = s->refcount_table[i]; + offset = s->refcount_table[i] & REFT_OFFSET_MASK; cluster = offset >> s->cluster_bits; + if (s->refcount_table[i] & REFT_RESERVED_MASK) { + fprintf(stderr, "ERROR refcount table entry %" PRId64 " has " + "reserved bits set\n", i); + res->corruptions++; + *rebuild = true; + continue; + } + /* Refcount blocks are cluster aligned */ if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR refcount block %" PRId64 " is not " diff --git a/block/qcow2.c b/block/qcow2.c index 9f1b646..02f9f3e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -74,7 +74,7 @@ typedef struct { static int coroutine_fn qcow2_co_preadv_compressed(BlockDriverState *bs, - uint64_t cluster_descriptor, + uint64_t l2_entry, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, @@ -2205,7 +2205,7 @@ typedef struct Qcow2AioTask { BlockDriverState *bs; QCow2SubclusterType subcluster_type; /* only for read */ - uint64_t host_offset; /* or full descriptor in compressed clusters */ + uint64_t host_offset; /* or l2_entry for compressed read */ uint64_t offset; uint64_t bytes; QEMUIOVector *qiov; @@ -4693,22 +4693,19 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs, static int coroutine_fn qcow2_co_preadv_compressed(BlockDriverState *bs, - uint64_t cluster_descriptor, + uint64_t l2_entry, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, size_t qiov_offset) { BDRVQcow2State *s = bs->opaque; - int ret = 0, csize, nb_csectors; + int ret = 0, csize; uint64_t coffset; uint8_t *buf, *out_buf; int offset_in_cluster = offset_into_cluster(s, offset); - coffset = cluster_descriptor & s->cluster_offset_mask; - nb_csectors = ((cluster_descriptor >> s->csize_shift) & s->csize_mask) + 1; - csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - - (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK); + qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); buf = g_try_malloc(csize); if (!buf) { diff --git a/block/qcow2.h b/block/qcow2.h index 0fe5f74..fd48a89 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -110,7 +110,6 @@ /* Defined in the qcow2 spec (compressed cluster descriptor) */ #define QCOW2_COMPRESSED_SECTOR_SIZE 512U -#define QCOW2_COMPRESSED_SECTOR_MASK (~(QCOW2_COMPRESSED_SECTOR_SIZE - 1ULL)) /* Must be at least 2 to cover COW */ #define MIN_L2_CACHE_SIZE 2 /* cache entries */ @@ -587,10 +586,12 @@ typedef enum QCow2MetadataOverlap { (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) #define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L1E_RESERVED_MASK 0x7f000000000001ffULL #define L2E_OFFSET_MASK 0x00fffffffffffe00ULL -#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL +#define L2E_STD_RESERVED_MASK 0x3f000000000001feULL #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL +#define REFT_RESERVED_MASK 0x1ffULL #define INV_OFFSET (-1ULL) @@ -914,6 +915,8 @@ int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size, uint64_t *host_offset); +void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, + uint64_t *coffset, int *csize); int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m); diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index fe6c30d..d58980a 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -415,7 +415,7 @@ Command description: 4 Error on reading data -.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME +.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE [-F backing_fmt]] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME Convert the disk image *FILENAME* or a snapshot *SNAPSHOT_PARAM* to disk image *OUTPUT_FILENAME* using format *OUTPUT_FMT*. It can @@ -439,7 +439,7 @@ Command description: You can use the *BACKING_FILE* option to force the output image to be created as a copy on write image of the specified base image; the *BACKING_FILE* should have the same content as the input's base image, - however the path, image format, etc may differ. + however the path, image format (as given by *BACKING_FMT*), etc may differ. If a relative path name is given, the backing file is looked up relative to the directory containing *OUTPUT_FILENAME*. diff --git a/include/block/block_int.h b/include/block/block_int.h index f1a54db..5451f89 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -34,6 +34,7 @@ #include "qemu/hbitmap.h" #include "block/snapshot.h" #include "qemu/throttle.h" +#include "qemu/rcu.h" #define BLOCK_FLAG_LAZY_REFCOUNTS 8 @@ -347,6 +348,15 @@ struct BlockDriver { * clamped to bdrv_getlength() and aligned to request_alignment, * as well as non-NULL pnum, map, and file; in turn, the driver * must return an error or set pnum to an aligned non-zero value. + * + * Note that @bytes is just a hint on how big of a region the + * caller wants to inspect. It is not a limit on *pnum. + * Implementations are free to return larger values of *pnum if + * doing so does not incur a performance penalty. + * + * block/io.c's bdrv_co_block_status() will utilize an unclamped + * *pnum value for the block-status cache on protocol nodes, prior + * to clamping *pnum for return to its caller. */ int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, @@ -840,11 +850,23 @@ struct BdrvChild { }; /* - * Note: the function bdrv_append() copies and swaps contents of - * BlockDriverStates, so if you add new fields to this struct, please - * inspect bdrv_append() to determine if the new fields need to be - * copied as well. + * Allows bdrv_co_block_status() to cache one data region for a + * protocol node. + * + * @valid: Whether the cache is valid (should be accessed with atomic + * functions so this can be reset by RCU readers) + * @data_start: Offset where we know (or strongly assume) is data + * @data_end: Offset where the data region ends (which is not necessarily + * the start of a zeroed region) */ +typedef struct BdrvBlockStatusCache { + struct rcu_head rcu; + + bool valid; + int64_t data_start; + int64_t data_end; +} BdrvBlockStatusCache; + struct BlockDriverState { /* Protected by big QEMU lock or read-only after opening. No special * locking needed during I/O... @@ -1010,6 +1032,11 @@ struct BlockDriverState { /* BdrvChild links to this node may never be frozen */ bool never_freeze; + + /* Lock for block-status cache RCU writers */ + CoMutex bsc_modify_lock; + /* Always non-NULL, but must only be dereferenced under an RCU read guard */ + BdrvBlockStatusCache *block_status_cache; }; struct BlockBackendRootState { @@ -1435,4 +1462,30 @@ static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs) */ void bdrv_drain_all_end_quiesce(BlockDriverState *bs); +/** + * Check whether the given offset is in the cached block-status data + * region. + * + * If it is, and @pnum is not NULL, *pnum is set to + * `bsc.data_end - offset`, i.e. how many bytes, starting from + * @offset, are data (according to the cache). + * Otherwise, *pnum is not touched. + */ +bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum); + +/** + * If [offset, offset + bytes) overlaps with the currently cached + * block-status region, invalidate the cache. + * + * (To be used by I/O paths that cause data regions to be zero or + * holes.) + */ +void bdrv_bsc_invalidate_range(BlockDriverState *bs, + int64_t offset, int64_t bytes); + +/** + * Mark the range [offset, offset + bytes) as a data region. + */ +void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes); + #endif /* BLOCK_INT_H */ diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index b3620f2..4c4d94a 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -46,7 +46,7 @@ SRST ERST DEF("convert", img_convert, - "convert [--object objectdef] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file] [-o options] [-l snapshot_param] [-S sparse_size] [-r rate_limit] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") + "convert [--object objectdef] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-T src_cache] [-O output_fmt] [-B backing_file [-F backing_fmt]] [-o options] [-l snapshot_param] [-S sparse_size] [-r rate_limit] [-m num_coroutines] [-W] [--salvage] filename [filename2 [...]] output_filename") SRST .. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] [--salvage] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME ERST @@ -2183,7 +2183,8 @@ static int img_convert(int argc, char **argv) int c, bs_i, flags, src_flags = BDRV_O_NO_SHARE; const char *fmt = NULL, *out_fmt = NULL, *cache = "unsafe", *src_cache = BDRV_DEFAULT_CACHE, *out_baseimg = NULL, - *out_filename, *out_baseimg_param, *snapshot_name = NULL; + *out_filename, *out_baseimg_param, *snapshot_name = NULL, + *backing_fmt = NULL; BlockDriver *drv = NULL, *proto_drv = NULL; BlockDriverInfo bdi; BlockDriverState *out_bs; @@ -2223,7 +2224,7 @@ static int img_convert(int argc, char **argv) {"skip-broken-bitmaps", no_argument, 0, OPTION_SKIP_BROKEN}, {0, 0, 0, 0} }; - c = getopt_long(argc, argv, ":hf:O:B:Cco:l:S:pt:T:qnm:WUr:", + c = getopt_long(argc, argv, ":hf:O:B:CcF:o:l:S:pt:T:qnm:WUr:", long_options, NULL); if (c == -1) { break; @@ -2253,6 +2254,9 @@ static int img_convert(int argc, char **argv) case 'c': s.compressed = true; break; + case 'F': + backing_fmt = optarg; + break; case 'o': if (accumulate_options(&options, optarg) < 0) { goto fail_getopt; @@ -2521,7 +2525,7 @@ static int img_convert(int argc, char **argv) qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s.total_sectors * BDRV_SECTOR_SIZE, &error_abort); - ret = add_old_style_options(out_fmt, opts, out_baseimg, NULL); + ret = add_old_style_options(out_fmt, opts, out_baseimg, backing_fmt); if (ret < 0) { goto out; } @@ -2628,6 +2632,14 @@ static int img_convert(int argc, char **argv) goto out; } + if (flags & BDRV_O_NOCACHE) { + /* + * If we open the target with O_DIRECT, it may be necessary to + * extend its size to align to the physical sector size. + */ + flags |= BDRV_O_RESIZE; + } + if (skip_create) { s.target = img_open(tgt_image_opts, out_filename, out_fmt, flags, writethrough, s.quiet, false); diff --git a/scripts/simplebench/img_bench_templater.py b/scripts/simplebench/img_bench_templater.py new file mode 100755 index 0000000..f8e1540 --- /dev/null +++ b/scripts/simplebench/img_bench_templater.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# +# Process img-bench test templates +# +# Copyright (c) 2021 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + + +import sys +import subprocess +import re +import json + +import simplebench +from results_to_text import results_to_text +from table_templater import Templater + + +def bench_func(env, case): + test = templater.gen(env['data'], case['data']) + + p = subprocess.run(test, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, universal_newlines=True) + + if p.returncode == 0: + try: + m = re.search(r'Run completed in (\d+.\d+) seconds.', p.stdout) + return {'seconds': float(m.group(1))} + except Exception: + return {'error': f'failed to parse qemu-img output: {p.stdout}'} + else: + return {'error': f'qemu-img failed: {p.returncode}: {p.stdout}'} + + +if __name__ == '__main__': + if len(sys.argv) > 1: + print(""" +Usage: img_bench_templater.py < path/to/test-template.sh + +This script generates performance tests from a test template (example below), +runs them, and displays the results in a table. The template is read from +stdin. It must be written in bash and end with a `qemu-img bench` invocation +(whose result is parsed to get the test instance’s result). + +Use the following syntax in the template to create the various different test +instances: + + column templating: {var1|var2|...} - test will use different values in + different columns. You may use several {} constructions in the test, in this + case product of all choice-sets will be used. + + row templating: [var1|var2|...] - similar thing to define rows (test-cases) + +Test template example: + +Assume you want to compare two qemu-img binaries, called qemu-img-old and +qemu-img-new in your build directory in two test-cases with 4K writes and 64K +writes. The template may look like this: + +qemu_img=/path/to/qemu/build/qemu-img-{old|new} +$qemu_img create -f qcow2 /ssd/x.qcow2 1G +$qemu_img bench -c 100 -d 8 [-s 4K|-s 64K] -w -t none -n /ssd/x.qcow2 + +When passing this to stdin of img_bench_templater.py, the resulting comparison +table will contain two columns (for two binaries) and two rows (for two +test-cases). + +In addition to displaying the results, script also stores results in JSON +format into results.json file in current directory. +""") + sys.exit() + + templater = Templater(sys.stdin.read()) + + envs = [{'id': ' / '.join(x), 'data': x} for x in templater.columns] + cases = [{'id': ' / '.join(x), 'data': x} for x in templater.rows] + + result = simplebench.bench(bench_func, envs, cases, count=5, + initial_run=False) + print(results_to_text(result)) + with open('results.json', 'w') as f: + json.dump(result, f, indent=4) diff --git a/scripts/simplebench/table_templater.py b/scripts/simplebench/table_templater.py new file mode 100644 index 0000000..950f3b3 --- /dev/null +++ b/scripts/simplebench/table_templater.py @@ -0,0 +1,62 @@ +# Parser for test templates +# +# Copyright (c) 2021 Virtuozzo International GmbH. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import itertools +from lark import Lark + +grammar = """ +start: ( text | column_switch | row_switch )+ + +column_switch: "{" text ["|" text]+ "}" +row_switch: "[" text ["|" text]+ "]" +text: /[^|{}\[\]]+/ +""" + +parser = Lark(grammar) + +class Templater: + def __init__(self, template): + self.tree = parser.parse(template) + + c_switches = [] + r_switches = [] + for x in self.tree.children: + if x.data == 'column_switch': + c_switches.append([el.children[0].value for el in x.children]) + elif x.data == 'row_switch': + r_switches.append([el.children[0].value for el in x.children]) + + self.columns = list(itertools.product(*c_switches)) + self.rows = list(itertools.product(*r_switches)) + + def gen(self, column, row): + i = 0 + j = 0 + result = [] + + for x in self.tree.children: + if x.data == 'text': + result.append(x.children[0].value) + elif x.data == 'column_switch': + result.append(column[i]) + i += 1 + elif x.data == 'row_switch': + result.append(row[j]) + j += 1 + + return ''.join(result) diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 index 5d550ed..efb260d 100755 --- a/tests/qemu-iotests/122 +++ b/tests/qemu-iotests/122 @@ -67,7 +67,7 @@ echo _make_test_img -b "$TEST_IMG".base -F $IMGFMT $QEMU_IO -c "write -P 0 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base -o backing_fmt=$IMGFMT \ +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base -F $IMGFMT \ "$TEST_IMG" "$TEST_IMG".orig $QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir $QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base -o backing_fmt=$IMGFMT \ diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271 index 599b849..2775b4d 100755 --- a/tests/qemu-iotests/271 +++ b/tests/qemu-iotests/271 @@ -893,7 +893,10 @@ EOF } _make_test_img -o extended_l2=on 1M -_concurrent_io | $QEMU_IO | _filter_qemu_io +# Second and third writes in _concurrent_io() are independent and may finish in +# different order. So, filter offset out to match both possible variants. +_concurrent_io | $QEMU_IO | _filter_qemu_io | \ + $SED -e 's/\(20480\|40960\)/OFFSET/' _concurrent_verify | $QEMU_IO | _filter_qemu_io # success, all done diff --git a/tests/qemu-iotests/271.out b/tests/qemu-iotests/271.out index 81043ba..5be780d 100644 --- a/tests/qemu-iotests/271.out +++ b/tests/qemu-iotests/271.out @@ -719,8 +719,8 @@ blkdebug: Suspended request 'A' blkdebug: Resuming request 'A' wrote 2048/2048 bytes at offset 30720 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 2048/2048 bytes at offset 20480 +wrote 2048/2048 bytes at offset OFFSET 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 2048/2048 bytes at offset 40960 +wrote 2048/2048 bytes at offset OFFSET 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) *** done diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297 index 345b617..b04cba5 100755 --- a/tests/qemu-iotests/297 +++ b/tests/qemu-iotests/297 @@ -29,7 +29,7 @@ import iotests SKIP_FILES = ( '030', '040', '041', '044', '045', '055', '056', '057', '065', '093', '096', '118', '124', '132', '136', '139', '147', '148', '149', - '151', '152', '155', '163', '165', '169', '194', '196', '199', '202', + '151', '152', '155', '163', '165', '194', '196', '202', '203', '205', '206', '207', '208', '210', '211', '212', '213', '216', '218', '219', '224', '228', '234', '235', '236', '237', '238', '240', '242', '245', '246', '248', '255', '256', '257', '258', '260', @@ -46,7 +46,7 @@ def is_python_file(filename): if filename.endswith('.py'): return True - with open(filename) as f: + with open(filename, encoding='utf-8') as f: try: first_line = f.readline() return re.match('^#!.*python', first_line) is not None @@ -55,8 +55,9 @@ def is_python_file(filename): def run_linters(): - files = [filename for filename in (set(os.listdir('.')) - set(SKIP_FILES)) - if is_python_file(filename)] + named_tests = [f'tests/{entry}' for entry in os.listdir('tests')] + check_tests = set(os.listdir('.') + named_tests) - set(SKIP_FILES) + files = [filename for filename in check_tests if is_python_file(filename)] iotests.logger.debug('Files to be checked:') iotests.logger.debug(', '.join(sorted(files))) diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 11276f3..ce06cf5 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -610,7 +610,7 @@ class VM(qtest.QEMUQtestMachine): return valgrind_filename = f"{test_dir}/{self._popen.pid}.valgrind" if self.exitcode() == 99: - with open(valgrind_filename) as f: + with open(valgrind_filename, encoding='utf-8') as f: print(f.read()) else: os.remove(valgrind_filename) @@ -703,7 +703,7 @@ class VM(qtest.QEMUQtestMachine): def flatten_qmp_object(self, obj, output=None, basestr=''): if output is None: - output = dict() + output = {} if isinstance(obj, list): for i, item in enumerate(obj): self.flatten_qmp_object(item, output, basestr + str(i) + '.') @@ -716,7 +716,7 @@ class VM(qtest.QEMUQtestMachine): def qmp_to_opts(self, obj): obj = self.flatten_qmp_object(obj) - output_list = list() + output_list = [] for key in obj: output_list += [key + '=' + obj[key]] return ','.join(output_list) @@ -1121,7 +1121,8 @@ def notrun(reason): # Each test in qemu-iotests has a number ("seq") seq = os.path.basename(sys.argv[0]) - with open('%s/%s.notrun' % (output_dir, seq), 'w') as outfile: + with open('%s/%s.notrun' % (output_dir, seq), 'w', encoding='utf-8') \ + as outfile: outfile.write(reason + '\n') logger.warning("%s not run: %s", seq, reason) sys.exit(0) @@ -1135,7 +1136,8 @@ def case_notrun(reason): # Each test in qemu-iotests has a number ("seq") seq = os.path.basename(sys.argv[0]) - with open('%s/%s.casenotrun' % (output_dir, seq), 'a') as outfile: + with open('%s/%s.casenotrun' % (output_dir, seq), 'a', encoding='utf-8') \ + as outfile: outfile.write(' [case not run] ' + reason + '\n') def _verify_image_format(supported_fmts: Sequence[str] = (), diff --git a/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test index 584062b..00ebb5c 100755 --- a/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test +++ b/tests/qemu-iotests/tests/migrate-bitmaps-postcopy-test @@ -132,10 +132,10 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap0') - self.discards1_sha256 = result['return']['sha256'] + discards1_sha256 = result['return']['sha256'] # Check, that updating the bitmap by discards works - assert self.discards1_sha256 != empty_sha256 + assert discards1_sha256 != empty_sha256 # We want to calculate resulting sha256. Do it in bitmap0, so, disable # other bitmaps @@ -148,7 +148,7 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap0') - self.all_discards_sha256 = result['return']['sha256'] + all_discards_sha256 = result['return']['sha256'] # Now, enable some bitmaps, to be updated during migration for i in range(2, nb_bitmaps, 2): @@ -173,10 +173,11 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): event_resume = self.vm_b.event_wait('RESUME') self.vm_b_events.append(event_resume) - return event_resume + return (event_resume, discards1_sha256, all_discards_sha256) def test_postcopy_success(self): - event_resume = self.start_postcopy() + event_resume, discards1_sha256, all_discards_sha256 = \ + self.start_postcopy() # enabled bitmaps should be updated apply_discards(self.vm_b, discards2) @@ -217,7 +218,7 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): for i in range(0, nb_bitmaps, 5): result = self.vm_b.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap{}'.format(i)) - sha = self.discards1_sha256 if i % 2 else self.all_discards_sha256 + sha = discards1_sha256 if i % 2 else all_discards_sha256 self.assert_qmp(result, 'return/sha256', sha) def test_early_shutdown_destination(self): diff --git a/tests/qemu-iotests/tests/migrate-bitmaps-test b/tests/qemu-iotests/tests/migrate-bitmaps-test index a5c7bc8..dc431c3 100755 --- a/tests/qemu-iotests/tests/migrate-bitmaps-test +++ b/tests/qemu-iotests/tests/migrate-bitmaps-test @@ -20,11 +20,10 @@ # import os -import iotests -import time import itertools import operator import re +import iotests from iotests import qemu_img, qemu_img_create, Timeout @@ -37,6 +36,12 @@ mig_cmd = 'exec: cat > ' + mig_file incoming_cmd = 'exec: cat ' + mig_file +def get_bitmap_hash(vm): + result = vm.qmp('x-debug-block-dirty-bitmap-sha256', + node='drive0', name='bitmap0') + return result['return']['sha256'] + + class TestDirtyBitmapMigration(iotests.QMPTestCase): def tearDown(self): self.vm_a.shutdown() @@ -62,21 +67,16 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): params['persistent'] = True result = vm.qmp('block-dirty-bitmap-add', **params) - self.assert_qmp(result, 'return', {}); - - def get_bitmap_hash(self, vm): - result = vm.qmp('x-debug-block-dirty-bitmap-sha256', - node='drive0', name='bitmap0') - return result['return']['sha256'] + self.assert_qmp(result, 'return', {}) def check_bitmap(self, vm, sha256): result = vm.qmp('x-debug-block-dirty-bitmap-sha256', node='drive0', name='bitmap0') if sha256: - self.assert_qmp(result, 'return/sha256', sha256); + self.assert_qmp(result, 'return/sha256', sha256) else: self.assert_qmp(result, 'error/desc', - "Dirty bitmap 'bitmap0' not found"); + "Dirty bitmap 'bitmap0' not found") def do_test_migration_resume_source(self, persistent, migrate_bitmaps): granularity = 512 @@ -97,7 +97,7 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): self.add_bitmap(self.vm_a, granularity, persistent) for r in regions: self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r) - sha256 = self.get_bitmap_hash(self.vm_a) + sha256 = get_bitmap_hash(self.vm_a) result = self.vm_a.qmp('migrate', uri=mig_cmd) while True: @@ -106,7 +106,7 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): break while True: result = self.vm_a.qmp('query-status') - if (result['return']['status'] == 'postmigrate'): + if result['return']['status'] == 'postmigrate': break # test that bitmap is still here @@ -164,7 +164,7 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): self.add_bitmap(self.vm_a, granularity, persistent) for r in regions: self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r) - sha256 = self.get_bitmap_hash(self.vm_a) + sha256 = get_bitmap_hash(self.vm_a) if pre_shutdown: self.vm_a.shutdown() @@ -214,16 +214,22 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): self.check_bitmap(self.vm_b, sha256 if persistent else False) -def inject_test_case(klass, name, method, *args, **kwargs): +def inject_test_case(klass, suffix, method, *args, **kwargs): mc = operator.methodcaller(method, *args, **kwargs) - setattr(klass, 'test_' + method + name, lambda self: mc(self)) + # We want to add a function attribute to `klass`, so that it is + # correctly converted to a method on instantiation. The + # methodcaller object `mc` is a callable, not a function, so we + # need the lambda to turn it into a function. + # pylint: disable=unnecessary-lambda + setattr(klass, 'test_' + method + suffix, lambda self: mc(self)) + for cmb in list(itertools.product((True, False), repeat=5)): name = ('_' if cmb[0] else '_not_') + 'persistent_' name += ('_' if cmb[1] else '_not_') + 'migbitmap_' name += '_online' if cmb[2] else '_offline' name += '_shared' if cmb[3] else '_nonshared' - if (cmb[4]): + if cmb[4]: name += '__pre_shutdown' inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration', @@ -270,7 +276,8 @@ class TestDirtyBitmapBackingMigration(iotests.QMPTestCase): self.assert_qmp(result, 'return', {}) # Check that the bitmaps are there - for node in self.vm.qmp('query-named-block-nodes', flat=True)['return']: + nodes = self.vm.qmp('query-named-block-nodes', flat=True)['return'] + for node in nodes: if 'node0' in node['node-name']: self.assert_qmp(node, 'dirty-bitmaps[0]/name', 'bmap0') @@ -287,7 +294,7 @@ class TestDirtyBitmapBackingMigration(iotests.QMPTestCase): """ Continue the source after migration. """ - result = self.vm.qmp('migrate', uri=f'exec: cat > /dev/null') + result = self.vm.qmp('migrate', uri='exec: cat > /dev/null') self.assert_qmp(result, 'return', {}) with Timeout(10, 'Migration timeout'): diff --git a/tests/qemu-iotests/tests/migrate-during-backup b/tests/qemu-iotests/tests/migrate-during-backup new file mode 100755 index 0000000..3410322 --- /dev/null +++ b/tests/qemu-iotests/tests/migrate-during-backup @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# group: migration +# +# Copyright (c) 2021 Virtuozzo International GmbH +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +import os +import iotests +from iotests import qemu_img_create, qemu_io + + +disk_a = os.path.join(iotests.test_dir, 'disk_a') +disk_b = os.path.join(iotests.test_dir, 'disk_b') +size = '1M' +mig_file = os.path.join(iotests.test_dir, 'mig_file') +mig_cmd = 'exec: cat > ' + mig_file + + +class TestMigrateDuringBackup(iotests.QMPTestCase): + def tearDown(self): + self.vm.shutdown() + os.remove(disk_a) + os.remove(disk_b) + os.remove(mig_file) + + def setUp(self): + qemu_img_create('-f', iotests.imgfmt, disk_a, size) + qemu_img_create('-f', iotests.imgfmt, disk_b, size) + qemu_io('-c', f'write 0 {size}', disk_a) + + self.vm = iotests.VM().add_drive(disk_a) + self.vm.launch() + result = self.vm.qmp('blockdev-add', { + 'node-name': 'target', + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': disk_b + } + }) + self.assert_qmp(result, 'return', {}) + + def test_migrate(self): + result = self.vm.qmp('blockdev-backup', device='drive0', + target='target', sync='full', + speed=1, x_perf={ + 'max-workers': 1, + 'max-chunk': 64 * 1024 + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('job-pause', id='drive0') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('migrate-set-capabilities', + capabilities=[{'capability': 'events', + 'state': True}]) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp('migrate', uri=mig_cmd) + self.assert_qmp(result, 'return', {}) + + e = self.vm.events_wait((('MIGRATION', + {'data': {'status': 'completed'}}), + ('MIGRATION', + {'data': {'status': 'failed'}}))) + + # Don't assert that e is 'failed' now: this way we'll miss + # possible crash when backup continues :) + + result = self.vm.qmp('block-job-set-speed', device='drive0', + speed=0) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp('job-resume', id='drive0') + self.assert_qmp(result, 'return', {}) + + # For future: if something changes so that both migration + # and backup pass, let's not miss that moment, as it may + # be a bug as well as improvement. + self.assert_qmp(e, 'data/status', 'failed') + + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2'], + supported_protocols=['file']) diff --git a/tests/qemu-iotests/tests/migrate-during-backup.out b/tests/qemu-iotests/tests/migrate-during-backup.out new file mode 100644 index 0000000..ae1213e --- /dev/null +++ b/tests/qemu-iotests/tests/migrate-during-backup.out @@ -0,0 +1,5 @@ +. +---------------------------------------------------------------------- +Ran 1 tests + +OK diff --git a/tests/qemu-iotests/tests/mirror-top-perms b/tests/qemu-iotests/tests/mirror-top-perms index 451a066..2fc8dd6 100755 --- a/tests/qemu-iotests/tests/mirror-top-perms +++ b/tests/qemu-iotests/tests/mirror-top-perms @@ -47,7 +47,7 @@ class TestMirrorTopPerms(iotests.QMPTestCase): def tearDown(self): try: self.vm.shutdown() - except qemu.machine.AbnormalShutdown: + except qemu.machine.machine.AbnormalShutdown: pass if self.vm_b is not None: |