author    | Richard Henderson <richard.henderson@linaro.org> | 2023-06-05 10:27:31 -0700
committer | Richard Henderson <richard.henderson@linaro.org> | 2023-06-05 10:27:31 -0700
commit    | b52daaf2c868f2bab102eb5acbf55b2917f46aea (patch)
tree      | 7717ebe9fd1dec6af5ff339400c645413c22c97f
parent    | afa351fe36d4a09842e06a0159325568ac8f923b (diff)
parent    | 42a2890a76f4783cd1c212f27856edcf2b5e8a75 (diff)
download  | qemu-b52daaf2c868f2bab102eb5acbf55b2917f46aea.zip
          | qemu-b52daaf2c868f2bab102eb5acbf55b2917f46aea.tar.gz
          | qemu-b52daaf2c868f2bab102eb5acbf55b2917f46aea.tar.bz2
Merge tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu into staging
Block patches
- Fix padding of unaligned vectored requests to match the host alignment
  for vectors with 1023 or 1024 buffers (a standalone sketch of the IOV_MAX
  limit at play follows this list)
- Refactor and fix bugs in the parallels driver's image check functionality
- Add an option to the qcow2 driver to retain (qcow2-level) allocations
on discard requests from the guest (while still forwarding the discard
to the lower level and marking the range as zero)
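The first item works around the host's IOV_MAX cap on vectored I/O. As a minimal standalone illustration (plain C, not QEMU code; writing to /dev/null stands in for the image file), POSIX writev() rejects a vector of more than IOV_MAX elements outright, which is exactly what a 1024-element guest request hits once head/tail padding buffers are prepended and appended:

```c
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    static char byte = 0;
    struct iovec iov[IOV_MAX + 1];   /* IOV_MAX is 1024 on Linux */
    int fd = open("/dev/null", O_WRONLY);

    for (int i = 0; i < IOV_MAX + 1; i++) {
        iov[i].iov_base = &byte;
        iov[i].iov_len = 1;
    }

    /* Exactly IOV_MAX elements: accepted */
    printf("writev(IOV_MAX):     %zd\n", writev(fd, iov, IOV_MAX));

    /* One element more, as padding can cause: fails with EINVAL */
    printf("writev(IOV_MAX + 1): %zd (%s)\n",
           writev(fd, iov, IOV_MAX + 1), strerror(errno));

    close(fd);
    return 0;
}
```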
# -----BEGIN PGP SIGNATURE-----
#
# iQJGBAABCAAwFiEEy2LXoO44KeRfAE00ofpA0JgBnN8FAmR+AT4SHGhyZWl0ekBy
# ZWRoYXQuY29tAAoJEKH6QNCYAZzfnboQAKD6YrreZLoseomRfqOAoApSf6yOdcHk
# 6kfsvzwzjosomsF1Pkzm4851vX5PyDqTdeu0iViM+pxanVO1b494q1P4VcAERqMB
# iZVs68R6M0l6HV9btWFGm+ibHJf4FapdntkIdwog1ka5TIhw5oDWCVNLigjhIoRv
# sM37Bgf14kC3sFTR++0HESsyU1eUP5gJjwJbPZ2IgJBmzYay0is1z5nHA/3VUswu
# 8dKnGQDsv62EtlK7PK8cU2BhLOeNi6Wr3bAb6Wf2QLB5e0qRb7oAkqNx5/UcTznk
# a3XMC1aiWhYvM/+DaYIpQUcIPgA8xQ1KHKeD6WjbGfLgZBqseX0aGWMByUsiY8Bo
# +BPIBnUDrbiPnAKB/XLQfnzlE+s7121/JpEbB7AkZqVFRGuw8Wur4tbc2fzvy8Pw
# x/uQfv3ZPi/2Lf6u7hv/TVHubXi8jucVgx3Ubu5Jeo3901S4/KOQBQ4BQ/GYIGQX
# 38ijSROcEd0eQJ1mTKPEctouxjSZCghNSbrn9DfsL1V3VWqWNKKGCU3hM+RQ1SJT
# 688qvnyYt8QZfTsiDSHR/GfKsufG0DkoqE7c9IhSEPohecAH8Rrc3HcLut7fuwD2
# gCFQhm68CPwwRmBjPCY6Zi1RDzeOyFBSWN31T6t0yTb4OHJ/3/cSZVBJtwwkOVbx
# zwabHDNdY5Kw
# =GuoL
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 05 Jun 2023 08:37:34 AM PDT
# gpg: using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg: issuer "hreitz@redhat.com"
# gpg: Good signature from "Hanna Reitz <hreitz@redhat.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00 4D34 A1FA 40D0 9801 9CDF
* tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu:
qcow2: add discard-no-unref option
parallels: Incorrect condition in out-of-image check
parallels: Replace qemu_co_mutex_lock by WITH_QEMU_LOCK_GUARD
parallels: Move statistic collection to a separate function
parallels: Move check of leaks to a separate function
parallels: Fix statistics calculation
parallels: Move check of cluster outside image to a separate function
parallels: Move check of unclean image to a separate function
parallels: Use generic infrastructure for BAT writing in parallels_co_check()
parallels: create parallels_set_bat_entry_helper() to assign BAT value
parallels: Fix image_end_offset and data_end after out-of-image check
parallels: Fix high_off calculation in parallels_co_check()
parallels: Out of image offset in BAT leads to image inflation
iotests/iov-padding: New test
util/iov: Remove qemu_iovec_init_extended()
block: Collapse padded I/O vecs exceeding IOV_MAX
util/iov: Make qiov_slice() public
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r-- | block/io.c                               | 166
-rw-r--r-- | block/parallels.c                        | 190
-rw-r--r-- | block/qcow2-cluster.c                    |  32
-rw-r--r-- | block/qcow2.c                            |  18
-rw-r--r-- | block/qcow2.h                            |   3
-rw-r--r-- | include/qemu/iov.h                       |   8
-rw-r--r-- | qapi/block-core.json                     |  12
-rw-r--r-- | qemu-options.hx                          |  12
-rwxr-xr-x | tests/qemu-iotests/tests/iov-padding     |  85
-rw-r--r-- | tests/qemu-iotests/tests/iov-padding.out |  59
-rw-r--r-- | util/iov.c                               |  89
11 files changed, 523 insertions(+), 151 deletions(-)
diff --git a/block/io.c b/block/io.c
--- a/block/io.c
+++ b/block/io.c
@@ -1441,6 +1441,14 @@ out:
  * @merge_reads is true for small requests,
  * if @buf_len == @head + bytes + @tail. In this case it is possible that both
  * head and tail exist but @buf_len == align and @tail_buf == @buf.
+ *
+ * @write is true for write requests, false for read requests.
+ *
+ * If padding makes the vector too long (exceeding IOV_MAX), then we need to
+ * merge existing vector elements into a single one. @collapse_bounce_buf acts
+ * as the bounce buffer in such cases. @pre_collapse_qiov has the pre-collapse
+ * I/O vector elements, so for read requests, the data can be copied back after
+ * the read is done.
  */
 typedef struct BdrvRequestPadding {
     uint8_t *buf;
@@ -1449,11 +1457,17 @@ typedef struct BdrvRequestPadding {
     size_t head;
     size_t tail;
     bool merge_reads;
+    bool write;
     QEMUIOVector local_qiov;
+
+    uint8_t *collapse_bounce_buf;
+    size_t collapse_len;
+    QEMUIOVector pre_collapse_qiov;
 } BdrvRequestPadding;
 
 static bool bdrv_init_padding(BlockDriverState *bs,
                               int64_t offset, int64_t bytes,
+                              bool write,
                               BdrvRequestPadding *pad)
 {
     int64_t align = bs->bl.request_alignment;
@@ -1485,6 +1499,8 @@ static bool bdrv_init_padding(BlockDriverState *bs,
         pad->tail_buf = pad->buf + pad->buf_len - align;
     }
 
+    pad->write = write;
+
     return true;
 }
 
@@ -1549,8 +1565,23 @@ zero_mem:
     return 0;
 }
 
-static void bdrv_padding_destroy(BdrvRequestPadding *pad)
+/**
+ * Free *pad's associated buffers, and perform any necessary finalization
+ * steps.
+ */
+static void bdrv_padding_finalize(BdrvRequestPadding *pad)
 {
+    if (pad->collapse_bounce_buf) {
+        if (!pad->write) {
+            /*
+             * If padding required elements in the vector to be collapsed into
+             * a bounce buffer, copy the bounce buffer content back
+             */
+            qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
+                                pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_vfree(pad->collapse_bounce_buf);
+        qemu_iovec_destroy(&pad->pre_collapse_qiov);
+    }
     if (pad->buf) {
         qemu_vfree(pad->buf);
         qemu_iovec_destroy(&pad->local_qiov);
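The copy-back in bdrv_padding_finalize() is the read-side half of the collapse trick: reads land in the contiguous bounce buffer and must be scattered back into the guest's original elements. A minimal sketch with no QEMU dependencies (iov_gather/iov_scatter are hypothetical stand-ins for qemu_iovec_to_buf()/qemu_iovec_from_buf()):

```c
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Hypothetical stand-in for qemu_iovec_to_buf(): linearize elements */
static void iov_gather(char *buf, const struct iovec *iov, int n)
{
    for (int i = 0; i < n; i++) {
        memcpy(buf, iov[i].iov_base, iov[i].iov_len);
        buf += iov[i].iov_len;
    }
}

/* Hypothetical stand-in for qemu_iovec_from_buf(): scatter back */
static void iov_scatter(const char *buf, const struct iovec *iov, int n)
{
    for (int i = 0; i < n; i++) {
        memcpy(iov[i].iov_base, buf, iov[i].iov_len);
        buf += iov[i].iov_len;
    }
}

int main(void)
{
    char a[4] = "aaaa", b[4] = "bbbb";
    struct iovec pre_collapse[2] = {
        { .iov_base = a, .iov_len = sizeof(a) },
        { .iov_base = b, .iov_len = sizeof(b) },
    };
    char bounce[8];

    /* Write path: collapse the two elements into the bounce buffer */
    iov_gather(bounce, pre_collapse, 2);
    printf("bounce after write-side gather: %.8s\n", bounce);

    /* Read path: the driver filled the bounce buffer; finalization
     * scatters it back into the original elements */
    memcpy(bounce, "12345678", sizeof(bounce));
    iov_scatter(bounce, pre_collapse, 2);
    printf("elements after read-side scatter: %.4s %.4s\n", a, b);
    return 0;
}
```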
@@ -1559,12 +1590,109 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
 }
 
 /*
+ * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
+ * ensuring that the resulting vector will not exceed IOV_MAX elements.
+ *
+ * To ensure this, when necessary, the first two or three elements of @iov are
+ * merged into pad->collapse_bounce_buf and replaced by a reference to that
+ * bounce buffer in pad->local_qiov.
+ *
+ * After performing a read request, the data from the bounce buffer must be
+ * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
+ */
+static int bdrv_create_padded_qiov(BlockDriverState *bs,
+                                   BdrvRequestPadding *pad,
+                                   struct iovec *iov, int niov,
+                                   size_t iov_offset, size_t bytes)
+{
+    int padded_niov, surplus_count, collapse_count;
+
+    /* Assert this invariant */
+    assert(niov <= IOV_MAX);
+
+    /*
+     * Cannot pad if resulting length would exceed SIZE_MAX. Returning an
+     * error to the guest is not ideal, but there is little else we can do.
+     * At least this will practically never happen on 64-bit systems.
+     */
+    if (SIZE_MAX - pad->head < bytes ||
+        SIZE_MAX - pad->head - bytes < pad->tail)
+    {
+        return -EINVAL;
+    }
+
+    /* Length of the resulting IOV if we just concatenated everything */
+    padded_niov = !!pad->head + niov + !!pad->tail;
+
+    qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
+
+    if (pad->head) {
+        qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
+    }
+
+    /*
+     * If padded_niov > IOV_MAX, we cannot just concatenate everything.
+     * Instead, merge the first two or three elements of @iov to reduce the
+     * number of vector elements as necessary.
+     */
+    if (padded_niov > IOV_MAX) {
+        /*
+         * Only head and tail can have led to the number of entries exceeding
+         * IOV_MAX, so we can exceed it by the head and tail at most. We need
+         * to reduce the number of elements by `surplus_count`, so we merge
+         * that many elements plus one into one element.
+         */
+        surplus_count = padded_niov - IOV_MAX;
+        assert(surplus_count <= !!pad->head + !!pad->tail);
+        collapse_count = surplus_count + 1;
+
+        /*
+         * Move the elements to collapse into `pad->pre_collapse_qiov`, then
+         * advance `iov` (and associated variables) by those elements.
+         */
+        qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
+        qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
+                              collapse_count, iov_offset, SIZE_MAX);
+        iov += collapse_count;
+        iov_offset = 0;
+        niov -= collapse_count;
+        bytes -= pad->pre_collapse_qiov.size;
+
+        /*
+         * Construct the bounce buffer to match the length of the to-collapse
+         * vector elements, and for write requests, initialize it with the
+         * data from those elements. Then add it to `pad->local_qiov`.
+         */
+        pad->collapse_len = pad->pre_collapse_qiov.size;
+        pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
+        if (pad->write) {
+            qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
+                              pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->collapse_bounce_buf, pad->collapse_len);
+    }
+
+    qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
+
+    if (pad->tail) {
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->buf + pad->buf_len - pad->tail, pad->tail);
+    }
+
+    assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
+    return 0;
+}
+
+/*
  * bdrv_pad_request
  *
  * Exchange request parameters with padded request if needed. Don't include RMW
  * read of padding, bdrv_padding_rmw_read() should be called separately if
  * needed.
  *
+ * @write is true for write requests, false for read requests.
+ *
  * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
  * - on function start they represent original request
  * - on failure or when padding is not needed they are unchanged
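The counting in bdrv_create_padded_qiov() is compact; unrolled as a standalone sketch (assuming IOV_MAX is 1024, as on Linux), it shows why merging surplus_count + 1 elements into one bounce buffer is exactly enough:

```c
#include <stdio.h>

#define IOV_MAX 1024   /* assumption: Linux's value */

static void plan(int niov, int head, int tail)
{
    /* Result length if head, guest vector and tail were just concatenated */
    int padded_niov = !!head + niov + !!tail;

    if (padded_niov <= IOV_MAX) {
        printf("niov=%d head=%d tail=%d -> no collapse needed\n",
               niov, !!head, !!tail);
        return;
    }

    /* Padding adds at most two elements, so the surplus is 1 or 2; merging
     * surplus_count + 1 guest elements into one bounce buffer removes
     * exactly surplus_count elements from the final vector. */
    int surplus_count = padded_niov - IOV_MAX;
    int collapse_count = surplus_count + 1;

    printf("niov=%d head=%d tail=%d -> collapse first %d elements "
           "(final niov=%d)\n", niov, !!head, !!tail, collapse_count,
           padded_niov - surplus_count);
}

int main(void)
{
    plan(1024, 0, 0);       /* fits exactly */
    plan(1024, 4096, 0);    /* head only: merge 2 into 1 */
    plan(1024, 512, 512);   /* head and tail: merge 3 into 1 */
    plan(1023, 512, 512);   /* 1023 + 2 = 1025: merge 2 into 1 */
    return 0;
}
```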
@@ -1573,26 +1701,34 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
 static int bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, int64_t *bytes,
+                            bool write,
                             BdrvRequestPadding *pad, bool *padded,
                             BdrvRequestFlags *flags)
 {
     int ret;
+    struct iovec *sliced_iov;
+    int sliced_niov;
+    size_t sliced_head, sliced_tail;
 
     bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
 
-    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
+    if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
         if (padded) {
             *padded = false;
         }
         return 0;
     }
 
-    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
-                                   *qiov, *qiov_offset, *bytes,
-                                   pad->buf + pad->buf_len - pad->tail,
-                                   pad->tail);
+    sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
+                                  &sliced_head, &sliced_tail,
+                                  &sliced_niov);
+
+    /* Guaranteed by bdrv_check_qiov_request() */
+    assert(*bytes <= SIZE_MAX);
+    ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
+                                  sliced_head, *bytes);
     if (ret < 0) {
-        bdrv_padding_destroy(pad);
+        bdrv_padding_finalize(pad);
         return ret;
     }
 
     *bytes += pad->head + pad->tail;
@@ -1659,8 +1795,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
         flags |= BDRV_REQ_COPY_ON_READ;
     }
 
-    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                           NULL, &flags);
+    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
+                           &pad, NULL, &flags);
     if (ret < 0) {
         goto fail;
     }
@@ -1670,7 +1806,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
                               bs->bl.request_alignment,
                               qiov, qiov_offset, flags);
     tracked_request_end(&req);
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
 fail:
     bdrv_dec_in_flight(bs);
@@ -2002,7 +2138,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
     /* This flag doesn't make sense for padding or zero writes */
     flags &= ~BDRV_REQ_REGISTERED_BUF;
 
-    padding = bdrv_init_padding(bs, offset, bytes, &pad);
+    padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
     if (padding) {
         assert(!(flags & BDRV_REQ_NO_WAIT));
         bdrv_make_request_serialising(req, align);
@@ -2050,7 +2186,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
     }
 
 out:
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
     return ret;
 }
@@ -2118,8 +2254,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
      * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
      * alignment only if there is no ZERO flag.
      */
-    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                           &padded, &flags);
+    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
+                           &pad, &padded, &flags);
     if (ret < 0) {
         return ret;
     }
@@ -2149,7 +2285,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                                qiov, qiov_offset, flags);
 
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
 out:
     tracked_request_end(&req);
diff --git a/block/parallels.c b/block/parallels.c
index d8a3f13..7c263d5 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -165,6 +165,13 @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
     return start_off;
 }
 
+static void parallels_set_bat_entry(BDRVParallelsState *s,
+                                    uint32_t index, uint32_t offset)
+{
+    s->bat_bitmap[index] = cpu_to_le32(offset);
+    bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 1);
+}
+
 static int64_t coroutine_fn GRAPH_RDLOCK
 allocate_clusters(BlockDriverState *bs, int64_t sector_num,
                   int nb_sectors, int *pnum)
@@ -251,10 +258,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
     }
 
     for (i = 0; i < to_allocate; i++) {
-        s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
+        parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier);
         s->data_end += s->tracks;
-        bitmap_set(s->bat_dirty_bmap,
-                   bat_entry_off(idx + i) / s->bat_dirty_block, 1);
     }
 
     return bat2sect(s, idx) + sector_num % s->tracks;
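parallels_set_bat_entry() exists so that storing a BAT entry and flagging its dirty-bitmap block can never drift apart again, as they had in allocate_clusters() versus the check path. A simplified standalone model (all names, sizes and the flat byte array standing in for QEMU's bitmap are hypothetical, and the little-endian store is elided):

```c
#include <stdint.h>
#include <stdio.h>

#define BAT_DIRTY_BLOCK 64u              /* bytes covered per dirty flag */
#define BAT_ENTRY_SIZE  sizeof(uint32_t)

static uint32_t bat[256];
static uint8_t dirty[sizeof(bat) / BAT_DIRTY_BLOCK];

/* Single choke point: update the entry AND mark the covering block dirty */
static void set_bat_entry(uint32_t index, uint32_t offset)
{
    bat[index] = offset;                 /* cpu_to_le32() in the real code */
    dirty[index * BAT_ENTRY_SIZE / BAT_DIRTY_BLOCK] = 1;
}

int main(void)
{
    set_bat_entry(0, 0x100);             /* byte 0  -> dirty block 0 */
    set_bat_entry(17, 0x200);            /* byte 68 -> dirty block 1 */
    for (unsigned i = 0; i < 4; i++) {
        printf("dirty block %u: %d\n", i, dirty[i]);
    }
    return 0;
}
```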
"Repairing" : "ERROR", i); res->corruptions++; if (fix & BDRV_FIX_ERRORS) { - prev_off = 0; - s->bat_bitmap[i] = 0; + parallels_set_bat_entry(s, i, 0); res->corruptions_fixed++; - flush_bat = true; - continue; } + continue; } - - res->bfi.allocated_clusters++; - if (off > high_off) { + if (high_off < off) { high_off = off; } + } - if (prev_off != 0 && (prev_off + s->cluster_size) != off) { - res->bfi.fragmented_clusters++; - } - prev_off = off; + if (high_off == 0) { + res->image_end_offset = s->data_end << BDRV_SECTOR_BITS; + } else { + res->image_end_offset = high_off + s->cluster_size; + s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS; } - ret = 0; - if (flush_bat) { - ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0); - if (ret < 0) { - res->check_errors++; - goto out; - } + return 0; +} + +static int coroutine_fn GRAPH_RDLOCK +parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVParallelsState *s = bs->opaque; + int64_t size; + int ret; + + size = bdrv_getlength(bs->file->bs); + if (size < 0) { + res->check_errors++; + return size; } - res->image_end_offset = high_off + s->cluster_size; if (size > res->image_end_offset) { int64_t count; count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size); @@ -510,14 +515,74 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res, if (ret < 0) { error_report_err(local_err); res->check_errors++; - goto out; + return ret; } res->leaks_fixed += count; } } -out: - qemu_co_mutex_unlock(&s->lock); + return 0; +} + +static void parallels_collect_statistics(BlockDriverState *bs, + BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVParallelsState *s = bs->opaque; + int64_t off, prev_off; + uint32_t i; + + res->bfi.total_clusters = s->bat_size; + res->bfi.compressed_clusters = 0; /* compression is not supported */ + + prev_off = 0; + for (i = 0; i < s->bat_size; i++) { + off = bat2sect(s, i) << BDRV_SECTOR_BITS; + /* + * If BDRV_FIX_ERRORS is not set, out-of-image BAT entries were not + * fixed. Skip not allocated and out-of-image BAT entries. 
@@ -510,14 +515,74 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
             if (ret < 0) {
                 error_report_err(local_err);
                 res->check_errors++;
-                goto out;
+                return ret;
             }
             res->leaks_fixed += count;
         }
     }
 
-out:
-    qemu_co_mutex_unlock(&s->lock);
+    return 0;
+}
+
+static void parallels_collect_statistics(BlockDriverState *bs,
+                                         BdrvCheckResult *res,
+                                         BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int64_t off, prev_off;
+    uint32_t i;
+
+    res->bfi.total_clusters = s->bat_size;
+    res->bfi.compressed_clusters = 0; /* compression is not supported */
+
+    prev_off = 0;
+    for (i = 0; i < s->bat_size; i++) {
+        off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+        /*
+         * If BDRV_FIX_ERRORS is not set, out-of-image BAT entries were not
+         * fixed. Skip not allocated and out-of-image BAT entries.
+         */
+        if (off == 0 || off + s->cluster_size > res->image_end_offset) {
+            prev_off = 0;
+            continue;
+        }
+
+        if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
+            res->bfi.fragmented_clusters++;
+        }
+        prev_off = off;
+        res->bfi.allocated_clusters++;
+    }
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
+                   BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int ret;
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        parallels_check_unclean(bs, res, fix);
+
+        ret = parallels_check_outside_image(bs, res, fix);
+        if (ret < 0) {
+            return ret;
+        }
+
+        ret = parallels_check_leak(bs, res, fix);
+        if (ret < 0) {
+            return ret;
+        }
+
+        parallels_collect_statistics(bs, res, fix);
+    }
+
+    ret = bdrv_co_flush(bs);
+    if (ret < 0) {
+        res->check_errors++;
+    }
 
     return ret;
 }
@@ -733,6 +798,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVParallelsState *s = bs->opaque;
     ParallelsHeader ph;
     int ret, size, i;
+    int64_t file_nb_sectors;
     QemuOpts *opts = NULL;
     Error *local_err = NULL;
     char *buf;
@@ -742,6 +808,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 
+    file_nb_sectors = bdrv_nb_sectors(bs->file->bs);
+    if (file_nb_sectors < 0) {
+        return -EINVAL;
+    }
+
     ret = bdrv_pread(bs->file, 0, sizeof(ph), &ph, 0);
     if (ret < 0) {
         goto fail;
@@ -806,6 +877,17 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
 
     for (i = 0; i < s->bat_size; i++) {
         int64_t off = bat2sect(s, i);
+        if (off >= file_nb_sectors) {
+            if (flags & BDRV_O_CHECK) {
+                continue;
+            }
+            error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
+                       "is larger than file size (%" PRIi64 ")",
+                       off << BDRV_SECTOR_BITS, i,
+                       file_nb_sectors << BDRV_SECTOR_BITS);
+            ret = -EINVAL;
+            goto fail;
+        }
         if (off >= s->data_end) {
             s->data_end = off + s->tracks;
         }
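The rewritten parallels_co_check() relies on WITH_QEMU_LOCK_GUARD(), so each helper can simply return on error and the coroutine mutex is still released. The QEMU macro is built on the compiler's cleanup attribute; a rough standalone analogue using GCC/Clang's `__attribute__((cleanup))` with pthreads (the real macro is more general than this sketch):

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Cleanup handler: runs automatically when the guarded scope is left */
static void unlocker(pthread_mutex_t **m)
{
    pthread_mutex_unlock(*m);
    printf("unlocked on scope exit\n");
}

/* Lock on entry; the cleanup attribute unlocks on ANY exit path,
 * including early return. The body runs exactly once. */
#define WITH_LOCK_GUARD(m) \
    for (pthread_mutex_t *guard_ __attribute__((cleanup(unlocker))) = \
             (pthread_mutex_lock(m), (m)), *once_ = (m); \
         once_; once_ = NULL)

static int check_something(int fail)
{
    WITH_LOCK_GUARD(&lock) {
        if (fail) {
            return -1;   /* no explicit unlock needed */
        }
        printf("checked under lock\n");
    }
    return 0;
}

int main(void)
{
    printf("ret=%d\n", check_something(0));
    printf("ret=%d\n", check_something(1));
    return 0;
}
```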
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 39cda7f..2e76de0 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1925,6 +1925,10 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
         uint64_t new_l2_bitmap = old_l2_bitmap;
         QCow2ClusterType cluster_type =
             qcow2_get_cluster_type(bs, old_l2_entry);
+        bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
+                              !full_discard &&
+                              (s->discard_no_unref &&
+                               type == QCOW2_DISCARD_REQUEST);
 
         /*
          * If full_discard is true, the cluster should not read back as zeroes,
@@ -1943,10 +1947,22 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
             new_l2_entry = new_l2_bitmap = 0;
         } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
             if (has_subclusters(s)) {
-                new_l2_entry = 0;
+                if (keep_reference) {
+                    new_l2_entry = old_l2_entry;
+                } else {
+                    new_l2_entry = 0;
+                }
                 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
             } else {
-                new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
+                if (s->qcow_version >= 3) {
+                    if (keep_reference) {
+                        new_l2_entry |= QCOW_OFLAG_ZERO;
+                    } else {
+                        new_l2_entry = QCOW_OFLAG_ZERO;
+                    }
+                } else {
+                    new_l2_entry = 0;
+                }
             }
         }
 
@@ -1960,8 +1976,16 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
         if (has_subclusters(s)) {
             set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
         }
-        /* Then decrease the refcount */
-        qcow2_free_any_cluster(bs, old_l2_entry, type);
+        if (!keep_reference) {
+            /* Then decrease the refcount */
+            qcow2_free_any_cluster(bs, old_l2_entry, type);
+        } else if (s->discard_passthrough[type] &&
+                   (cluster_type == QCOW2_CLUSTER_NORMAL ||
+                    cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+            /* If we keep the reference, pass on the discard still */
+            bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
+                          s->cluster_size);
+        }
     }
 
     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
diff --git a/block/qcow2.c b/block/qcow2.c
index 7f39483..e23edd4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -682,6 +682,7 @@ static const char *const mutable_opts[] = {
     QCOW2_OPT_DISCARD_REQUEST,
     QCOW2_OPT_DISCARD_SNAPSHOT,
     QCOW2_OPT_DISCARD_OTHER,
+    QCOW2_OPT_DISCARD_NO_UNREF,
     QCOW2_OPT_OVERLAP,
     QCOW2_OPT_OVERLAP_TEMPLATE,
     QCOW2_OPT_OVERLAP_MAIN_HEADER,
@@ -727,6 +728,11 @@ static QemuOptsList qcow2_runtime_opts = {
             .help = "Generate discard requests when other clusters are freed",
         },
         {
+            .name = QCOW2_OPT_DISCARD_NO_UNREF,
+            .type = QEMU_OPT_BOOL,
+            .help = "Do not unreference discarded clusters",
+        },
+        {
             .name = QCOW2_OPT_OVERLAP,
             .type = QEMU_OPT_STRING,
             .help = "Selects which overlap checks to perform from a range of "
@@ -969,6 +975,7 @@ typedef struct Qcow2ReopenState {
     bool use_lazy_refcounts;
     int overlap_check;
     bool discard_passthrough[QCOW2_DISCARD_MAX];
+    bool discard_no_unref;
     uint64_t cache_clean_interval;
     QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
 } Qcow2ReopenState;
@@ -1140,6 +1147,15 @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
     r->discard_passthrough[QCOW2_DISCARD_OTHER] =
         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
 
+    r->discard_no_unref = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_NO_UNREF,
+                                            false);
+    if (r->discard_no_unref && s->qcow_version < 3) {
+        error_setg(errp,
+                   "discard-no-unref is only supported since qcow2 version 3");
+        ret = -EINVAL;
+        goto fail;
+    }
+
     switch (s->crypt_method_header) {
     case QCOW_CRYPT_NONE:
         if (encryptfmt) {
@@ -1220,6 +1236,8 @@ static void qcow2_update_options_commit(BlockDriverState *bs,
         s->discard_passthrough[i] = r->discard_passthrough[i];
     }
 
+    s->discard_no_unref = r->discard_no_unref;
+
     if (s->cache_clean_interval != r->cache_clean_interval) {
         cache_clean_timer_del(bs);
         s->cache_clean_interval = r->cache_clean_interval;
diff --git a/block/qcow2.h b/block/qcow2.h
index 4f67eb9..ea9adb5 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -133,6 +133,7 @@
 #define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
 #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
 #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+#define QCOW2_OPT_DISCARD_NO_UNREF "discard-no-unref"
 #define QCOW2_OPT_OVERLAP "overlap-check"
 #define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
 #define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
@@ -385,6 +386,8 @@ typedef struct BDRVQcow2State {
 
     bool discard_passthrough[QCOW2_DISCARD_MAX];
 
+    bool discard_no_unref;
+
     int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
     bool signaled_corruption;
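A condensed model of the new decision in discard_in_l2_slice(): with discard-no-unref enabled, a guest discard still makes the range read back as zeroes, but the L2 entry keeps its host-cluster reference (note the `|=` versus `=` in the diff, which preserves the host offset bits). The constants and helper below are simplified stand-ins, not the qcow2 code:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OFLAG_ZERO (1ull << 0)   /* stand-in for QCOW_OFLAG_ZERO */

static uint64_t discard_entry(uint64_t old_entry, bool compressed,
                              bool full_discard, bool no_unref,
                              bool from_guest, bool *unref)
{
    bool keep_reference = !compressed && !full_discard &&
                          no_unref && from_guest;

    *unref = !keep_reference;
    if (full_discard) {
        return 0;                      /* read from the backing file again */
    }
    if (keep_reference) {
        return old_entry | OFLAG_ZERO; /* reads as zero, cluster kept */
    }
    return OFLAG_ZERO;                 /* reads as zero, cluster freed */
}

int main(void)
{
    bool unref;
    uint64_t e;

    e = discard_entry(0x50000, false, false, true, true, &unref);
    printf("no_unref=on:  entry=0x%llx unref=%d\n",
           (unsigned long long)e, unref);

    e = discard_entry(0x50000, false, false, false, true, &unref);
    printf("no_unref=off: entry=0x%llx unref=%d\n",
           (unsigned long long)e, unref);
    return 0;
}
```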
diff --git a/include/qemu/iov.h b/include/qemu/iov.h
index 9330746..63a1c01 100644
--- a/include/qemu/iov.h
+++ b/include/qemu/iov.h
@@ -222,13 +222,11 @@ static inline void *qemu_iovec_buf(QEMUIOVector *qiov)
 
 void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
-int qemu_iovec_init_extended(
-        QEMUIOVector *qiov,
-        void *head_buf, size_t head_len,
-        QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
-        void *tail_buf, size_t tail_len);
 void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
                            size_t offset, size_t len);
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+                               size_t offset, size_t len,
+                               size_t *head, size_t *tail, int *niov);
 int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
 void qemu_iovec_concat(QEMUIOVector *dst,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 4bf8917..5dd5f7e 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3478,6 +3478,17 @@
 # @pass-discard-other: whether discard requests for the data source
 #     should be issued on other occasions where a cluster gets freed
 #
+# @discard-no-unref: when enabled, discards from the guest will not
+#     cause cluster allocations to be relinquished.  This prevents
+#     qcow2 fragmentation that would be caused by such discards.
+#     Besides potential performance degradation, such fragmentation
+#     can lead to increased allocation of clusters past the end of the
+#     image file, resulting in image files whose file length can grow
+#     much larger than their guest disk size would suggest.  If image
+#     file length is of concern (e.g. when storing qcow2 images
+#     directly on block devices), you should consider enabling this
+#     option.  (since 8.1)
+#
 # @overlap-check: which overlap checks to perform for writes to the
 #     image, defaults to 'cached' (since 2.2)
 #
@@ -3516,6 +3527,7 @@
             '*pass-discard-request': 'bool',
             '*pass-discard-snapshot': 'bool',
             '*pass-discard-other': 'bool',
+            '*discard-no-unref': 'bool',
             '*overlap-check': 'Qcow2OverlapChecks',
             '*cache-size': 'int',
             '*l2-cache-size': 'int',
diff --git a/qemu-options.hx b/qemu-options.hx
index b37eb96..b57489d 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1431,6 +1431,18 @@ SRST
         issued on other occasions where a cluster gets freed
         (on/off; default: off)
 
+    ``discard-no-unref``
+        When enabled, discards from the guest will not cause cluster
+        allocations to be relinquished. This prevents qcow2 fragmentation
+        that would be caused by such discards. Besides potential
+        performance degradation, such fragmentation can lead to increased
+        allocation of clusters past the end of the image file,
+        resulting in image files whose file length can grow much larger
+        than their guest disk size would suggest.
+        If image file length is of concern (e.g. when storing qcow2
+        images directly on block devices), you should consider enabling
+        this option.
+
     ``overlap-check``
         Which overlap checks to perform for writes to the image
        (none/constant/cached/all; default: cached). For details or
diff --git a/tests/qemu-iotests/tests/iov-padding b/tests/qemu-iotests/tests/iov-padding
new file mode 100755
index 0000000..b960490
--- /dev/null
+++ b/tests/qemu-iotests/tests/iov-padding
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# group: rw quick
+#
+# Check the interaction of request padding (to fit alignment restrictions) with
+# vectored I/O from the guest
+#
+# Copyright Red Hat
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+seq=$(basename $0)
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+cd ..
+. ./common.rc
+. ./common.filter
+
+_supported_fmt raw
+_supported_proto file
+
+_make_test_img 1M
+
+IMGSPEC="driver=blkdebug,align=4096,image.driver=file,image.filename=$TEST_IMG"
+
+# Four combinations:
+# - Offset 4096, length 1023 * 512 + 512: Fully aligned to 4k
+# - Offset 4096, length 1023 * 512 + 4096: Head is aligned, tail is not
+# - Offset 512, length 1023 * 512 + 512: Neither head nor tail are aligned
+# - Offset 512, length 1023 * 512 + 4096: Tail is aligned, head is not
+for start_offset in 4096 512; do
+    for last_element_length in 512 4096; do
+        length=$((1023 * 512 + $last_element_length))
+
+        echo
+        echo "== performing 1024-element vectored requests to image (offset: $start_offset; length: $length) =="
+
+        # Fill with data for testing
+        $QEMU_IO -c 'write -P 1 0 1M' "$TEST_IMG" | _filter_qemu_io
+
+        # 1023 512-byte buffers, and then one with length $last_element_length
+        cmd_params="-P 2 $start_offset $(yes 512 | head -n 1023 | tr '\n' ' ') $last_element_length"
+        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
+            -c "writev $cmd_params" \
+            --image-opts \
+            "$IMGSPEC" \
+            | _filter_qemu_io
+
+        # Read all patterns -- read the part we just wrote with writev twice,
+        # once "normally", and once with a readv, so we see that that works, too
+        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
+            -c "read -P 1 0 $start_offset" \
+            -c "read -P 2 $start_offset $length" \
+            -c "readv $cmd_params" \
+            -c "read -P 1 $((start_offset + length)) $((1024 * 1024 - length - start_offset))" \
+            --image-opts \
+            "$IMGSPEC" \
+            | _filter_qemu_io
+    done
+done
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
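Recomputing the test's four geometries (alignment 4096 via blkdebug, and assuming IOV_MAX is 1024 as on Linux) shows which combinations force the new collapse path, namely every case where head or tail padding bumps the 1024-element vector over the limit:

```c
#include <stdio.h>

int main(void)
{
    const int align = 4096;      /* blkdebug align=4096 from the test */
    const int iov_max = 1024;    /* assumption: Linux's IOV_MAX */
    int starts[] = { 4096, 512 };
    int lasts[] = { 512, 4096 };

    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            long offset = starts[i];
            long length = 1023L * 512 + lasts[j];
            int head = offset % align != 0;            /* head padding? */
            int tail = (offset + length) % align != 0; /* tail padding? */
            int padded = head + 1024 + tail;           /* 1024 elements */

            printf("offset=%-4ld length=%-6ld head=%d tail=%d "
                   "padded_niov=%d%s\n", offset, length, head, tail,
                   padded, padded > iov_max ? "  -> collapse" : "");
        }
    }
    return 0;
}
```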
diff --git a/tests/qemu-iotests/tests/iov-padding.out b/tests/qemu-iotests/tests/iov-padding.out
new file mode 100644
index 0000000..e07a91f
--- /dev/null
+++ b/tests/qemu-iotests/tests/iov-padding.out
@@ -0,0 +1,59 @@
+QA output created by iov-padding
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
+
+== performing 1024-element vectored requests to image (offset: 4096; length: 524288) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 520192/520192 bytes at offset 528384
+508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 4096; length: 527872) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 516608/516608 bytes at offset 531968
+504.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 512; length: 524288) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 523776/523776 bytes at offset 524800
+511.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 512; length: 527872) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 520192/520192 bytes at offset 528384
+508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/util/iov.c b/util/iov.c
--- a/util/iov.c
+++ b/util/iov.c
@@ -378,15 +378,15 @@ static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset,
 }
 
 /*
- * qiov_slice
+ * qemu_iovec_slice
  *
  * Find subarray of iovec's, containing requested range. @head would
  * be offset in first iov (returned by the function), @tail would be
  * count of extra bytes in last iovec (returned iov + @niov - 1).
  */
-static struct iovec *qiov_slice(QEMUIOVector *qiov,
-                                size_t offset, size_t len,
-                                size_t *head, size_t *tail, int *niov)
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+                               size_t offset, size_t len,
+                               size_t *head, size_t *tail, int *niov)
 {
     struct iovec *iov, *end_iov;
 
@@ -411,76 +411,12 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len)
     size_t head, tail;
     int niov;
 
-    qiov_slice(qiov, offset, len, &head, &tail, &niov);
+    qemu_iovec_slice(qiov, offset, len, &head, &tail, &niov);
 
     return niov;
 }
 
 /*
- * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov,
- * and @tail_buf buffer into new qiov.
- */
-int qemu_iovec_init_extended(
-        QEMUIOVector *qiov,
-        void *head_buf, size_t head_len,
-        QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
-        void *tail_buf, size_t tail_len)
-{
-    size_t mid_head, mid_tail;
-    int total_niov, mid_niov = 0;
-    struct iovec *p, *mid_iov = NULL;
-
-    assert(mid_qiov->niov <= IOV_MAX);
-
-    if (SIZE_MAX - head_len < mid_len ||
-        SIZE_MAX - head_len - mid_len < tail_len)
-    {
-        return -EINVAL;
-    }
-
-    if (mid_len) {
-        mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
-                             &mid_head, &mid_tail, &mid_niov);
-    }
-
-    total_niov = !!head_len + mid_niov + !!tail_len;
-    if (total_niov > IOV_MAX) {
-        return -EINVAL;
-    }
-
-    if (total_niov == 1) {
-        qemu_iovec_init_buf(qiov, NULL, 0);
-        p = &qiov->local_iov;
-    } else {
-        qiov->niov = qiov->nalloc = total_niov;
-        qiov->size = head_len + mid_len + tail_len;
-        p = qiov->iov = g_new(struct iovec, qiov->niov);
-    }
-
-    if (head_len) {
-        p->iov_base = head_buf;
-        p->iov_len = head_len;
-        p++;
-    }
-
-    assert(!mid_niov == !mid_len);
-    if (mid_niov) {
-        memcpy(p, mid_iov, mid_niov * sizeof(*p));
-        p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
-        p[0].iov_len -= mid_head;
-        p[mid_niov - 1].iov_len -= mid_tail;
-        p += mid_niov;
-    }
-
-    if (tail_len) {
-        p->iov_base = tail_buf;
-        p->iov_len = tail_len;
-    }
-
-    return 0;
-}
-
-/*
  * Check if the contents of subrange of qiov data is all zeroes.
  */
 bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
@@ -511,14 +447,21 @@ bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
 void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
                            size_t offset, size_t len)
 {
-    int ret;
+    struct iovec *slice_iov;
+    int slice_niov;
+    size_t slice_head, slice_tail;
 
     assert(source->size >= len);
     assert(source->size - len >= offset);
 
-    /* We shrink the request, so we can't overflow neither size_t nor MAX_IOV */
-    ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
-    assert(ret == 0);
+    slice_iov = qemu_iovec_slice(source, offset, len,
+                                 &slice_head, &slice_tail, &slice_niov);
+    if (slice_niov == 1) {
+        qemu_iovec_init_buf(qiov, slice_iov[0].iov_base + slice_head, len);
+    } else {
+        qemu_iovec_init(qiov, slice_niov);
+        qemu_iovec_concat_iov(qiov, slice_iov, slice_niov, slice_head, len);
+    }
 }
 
 void qemu_iovec_destroy(QEMUIOVector *qiov)
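To close, a self-contained model of the contract of the newly public qemu_iovec_slice(): given a byte range, find the covering subarray of iovec elements, plus @head (bytes to skip in the first returned element) and @tail (surplus bytes in the last one). This is a simplified re-implementation for illustration, not the QEMU source:

```c
#include <stdio.h>
#include <sys/uio.h>

static struct iovec *iov_slice(struct iovec *iov, int niov,
                               size_t offset, size_t len,
                               size_t *head, size_t *tail, int *slice_niov)
{
    /* Skip whole elements that lie entirely before @offset */
    while (offset >= iov->iov_len) {
        offset -= iov->iov_len;
        iov++;
        niov--;
    }
    *head = offset;

    /* Count elements until @len bytes (after @head) are covered */
    size_t covered = iov->iov_len - offset;
    int n = 1;
    while (covered < len) {
        covered += iov[n].iov_len;
        n++;
    }
    *tail = covered - len;
    *slice_niov = n;
    return iov;
}

int main(void)
{
    char buf[24];
    struct iovec v[3] = {
        { buf, 8 }, { buf + 8, 8 }, { buf + 16, 8 },
    };
    size_t head, tail;
    int niov;

    /* Slice bytes [5, 19) out of three 8-byte elements */
    struct iovec *s = iov_slice(v, 3, 5, 14, &head, &tail, &niov);
    printf("first element index=%ld niov=%d head=%zu tail=%zu\n",
           (long)(s - v), niov, head, tail); /* index=0 niov=3 head=5 tail=5 */
    return 0;
}
```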