author     Richard Henderson <richard.henderson@linaro.org>   2023-06-05 10:27:31 -0700
committer  Richard Henderson <richard.henderson@linaro.org>   2023-06-05 10:27:31 -0700
commit     b52daaf2c868f2bab102eb5acbf55b2917f46aea (patch)
tree       7717ebe9fd1dec6af5ff339400c645413c22c97f
parent     afa351fe36d4a09842e06a0159325568ac8f923b (diff)
parent     42a2890a76f4783cd1c212f27856edcf2b5e8a75 (diff)
Merge tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu into staging
Block patches

- Fix padding of unaligned vectored requests to match the host alignment
  for vectors with 1023 or 1024 buffers
- Refactor and fix bugs in parallels's image check functionality
- Add an option to the qcow2 driver to retain (qcow2-level) allocations
  on discard requests from the guest (while still forwarding the discard
  to the lower level and marking the range as zero)

# -----BEGIN PGP SIGNATURE-----
#
# iQJGBAABCAAwFiEEy2LXoO44KeRfAE00ofpA0JgBnN8FAmR+AT4SHGhyZWl0ekBy
# ZWRoYXQuY29tAAoJEKH6QNCYAZzfnboQAKD6YrreZLoseomRfqOAoApSf6yOdcHk
# 6kfsvzwzjosomsF1Pkzm4851vX5PyDqTdeu0iViM+pxanVO1b494q1P4VcAERqMB
# iZVs68R6M0l6HV9btWFGm+ibHJf4FapdntkIdwog1ka5TIhw5oDWCVNLigjhIoRv
# sM37Bgf14kC3sFTR++0HESsyU1eUP5gJjwJbPZ2IgJBmzYay0is1z5nHA/3VUswu
# 8dKnGQDsv62EtlK7PK8cU2BhLOeNi6Wr3bAb6Wf2QLB5e0qRb7oAkqNx5/UcTznk
# a3XMC1aiWhYvM/+DaYIpQUcIPgA8xQ1KHKeD6WjbGfLgZBqseX0aGWMByUsiY8Bo
# +BPIBnUDrbiPnAKB/XLQfnzlE+s7121/JpEbB7AkZqVFRGuw8Wur4tbc2fzvy8Pw
# x/uQfv3ZPi/2Lf6u7hv/TVHubXi8jucVgx3Ubu5Jeo3901S4/KOQBQ4BQ/GYIGQX
# 38ijSROcEd0eQJ1mTKPEctouxjSZCghNSbrn9DfsL1V3VWqWNKKGCU3hM+RQ1SJT
# 688qvnyYt8QZfTsiDSHR/GfKsufG0DkoqE7c9IhSEPohecAH8Rrc3HcLut7fuwD2
# gCFQhm68CPwwRmBjPCY6Zi1RDzeOyFBSWN31T6t0yTb4OHJ/3/cSZVBJtwwkOVbx
# zwabHDNdY5Kw
# =GuoL
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 05 Jun 2023 08:37:34 AM PDT
# gpg:                using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg: issuer "hreitz@redhat.com"
# gpg: Good signature from "Hanna Reitz <hreitz@redhat.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00 4D34 A1FA 40D0 9801 9CDF

* tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu:
  qcow2: add discard-no-unref option
  parallels: Incorrect condition in out-of-image check
  parallels: Replace qemu_co_mutex_lock by WITH_QEMU_LOCK_GUARD
  parallels: Move statistic collection to a separate function
  parallels: Move check of leaks to a separate function
  parallels: Fix statistics calculation
  parallels: Move check of cluster outside image to a separate function
  parallels: Move check of unclean image to a separate function
  parallels: Use generic infrastructure for BAT writing in parallels_co_check()
  parallels: create parallels_set_bat_entry_helper() to assign BAT value
  parallels: Fix image_end_offset and data_end after out-of-image check
  parallels: Fix high_off calculation in parallels_co_check()
  parallels: Out of image offset in BAT leads to image inflation
  iotests/iov-padding: New test
  util/iov: Remove qemu_iovec_init_extended()
  block: Collapse padded I/O vecs exceeding IOV_MAX
  util/iov: Make qiov_slice() public

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r--  block/io.c                               | 166
-rw-r--r--  block/parallels.c                        | 190
-rw-r--r--  block/qcow2-cluster.c                    |  32
-rw-r--r--  block/qcow2.c                            |  18
-rw-r--r--  block/qcow2.h                            |   3
-rw-r--r--  include/qemu/iov.h                       |   8
-rw-r--r--  qapi/block-core.json                     |  12
-rw-r--r--  qemu-options.hx                          |  12
-rwxr-xr-x  tests/qemu-iotests/tests/iov-padding     |  85
-rw-r--r--  tests/qemu-iotests/tests/iov-padding.out |  59
-rw-r--r--  util/iov.c                               |  89
11 files changed, 523 insertions, 151 deletions
diff --git a/block/io.c b/block/io.c
index f2dfc7c..30748f0 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1441,6 +1441,14 @@ out:
* @merge_reads is true for small requests,
* if @buf_len == @head + bytes + @tail. In this case it is possible that both
* head and tail exist but @buf_len == align and @tail_buf == @buf.
+ *
+ * @write is true for write requests, false for read requests.
+ *
+ * If padding makes the vector too long (exceeding IOV_MAX), then we need to
+ * merge existing vector elements into a single one. @collapse_bounce_buf acts
+ * as the bounce buffer in such cases. @pre_collapse_qiov holds the
+ * pre-collapse I/O vector elements, so that for read requests the data can
+ * be copied back after the read is done.
*/
typedef struct BdrvRequestPadding {
uint8_t *buf;
@@ -1449,11 +1457,17 @@ typedef struct BdrvRequestPadding {
size_t head;
size_t tail;
bool merge_reads;
+ bool write;
QEMUIOVector local_qiov;
+
+ uint8_t *collapse_bounce_buf;
+ size_t collapse_len;
+ QEMUIOVector pre_collapse_qiov;
} BdrvRequestPadding;
static bool bdrv_init_padding(BlockDriverState *bs,
int64_t offset, int64_t bytes,
+ bool write,
BdrvRequestPadding *pad)
{
int64_t align = bs->bl.request_alignment;
@@ -1485,6 +1499,8 @@ static bool bdrv_init_padding(BlockDriverState *bs,
pad->tail_buf = pad->buf + pad->buf_len - align;
}
+ pad->write = write;
+
return true;
}
@@ -1549,8 +1565,23 @@ zero_mem:
return 0;
}
-static void bdrv_padding_destroy(BdrvRequestPadding *pad)
+/**
+ * Free *pad's associated buffers, and perform any necessary finalization steps.
+ */
+static void bdrv_padding_finalize(BdrvRequestPadding *pad)
{
+ if (pad->collapse_bounce_buf) {
+ if (!pad->write) {
+ /*
+ * If padding required elements in the vector to be collapsed into a
+ * bounce buffer, copy the bounce buffer content back
+ */
+ qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
+ pad->collapse_bounce_buf, pad->collapse_len);
+ }
+ qemu_vfree(pad->collapse_bounce_buf);
+ qemu_iovec_destroy(&pad->pre_collapse_qiov);
+ }
if (pad->buf) {
qemu_vfree(pad->buf);
qemu_iovec_destroy(&pad->local_qiov);
@@ -1559,12 +1590,109 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
}
/*
+ * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
+ * ensuring that the resulting vector will not exceed IOV_MAX elements.
+ *
+ * To ensure this, when necessary, the first two or three elements of @iov are
+ * merged into pad->collapse_bounce_buf and replaced by a reference to that
+ * bounce buffer in pad->local_qiov.
+ *
+ * After performing a read request, the data from the bounce buffer must be
+ * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
+ */
+static int bdrv_create_padded_qiov(BlockDriverState *bs,
+ BdrvRequestPadding *pad,
+ struct iovec *iov, int niov,
+ size_t iov_offset, size_t bytes)
+{
+ int padded_niov, surplus_count, collapse_count;
+
+ /* The caller's vector must itself already respect the IOV_MAX limit */
+ assert(niov <= IOV_MAX);
+
+ /*
+ * Cannot pad if resulting length would exceed SIZE_MAX. Returning an error
+ * to the guest is not ideal, but there is little else we can do. At least
+ * this will practically never happen on 64-bit systems.
+ */
+ if (SIZE_MAX - pad->head < bytes ||
+ SIZE_MAX - pad->head - bytes < pad->tail)
+ {
+ return -EINVAL;
+ }
+
+ /* Length of the resulting IOV if we just concatenated everything */
+ padded_niov = !!pad->head + niov + !!pad->tail;
+
+ qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
+
+ if (pad->head) {
+ qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
+ }
+
+ /*
+ * If padded_niov > IOV_MAX, we cannot just concatenate everything.
+ * Instead, merge the first two or three elements of @iov to reduce the
+ * number of vector elements as necessary.
+ */
+ if (padded_niov > IOV_MAX) {
+ /*
+ * Only head and tail can have led to the number of entries exceeding
+ * IOV_MAX, so we can exceed it by the head and tail at most. We need
+ * to reduce the number of elements by `surplus_count`, so we merge that
+ * many elements plus one into one element.
+ */
+ surplus_count = padded_niov - IOV_MAX;
+ assert(surplus_count <= !!pad->head + !!pad->tail);
+ collapse_count = surplus_count + 1;
+
+ /*
+ * Move the elements to collapse into `pad->pre_collapse_qiov`, then
+ * advance `iov` (and associated variables) by those elements.
+ */
+ qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
+ qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
+ collapse_count, iov_offset, SIZE_MAX);
+ iov += collapse_count;
+ iov_offset = 0;
+ niov -= collapse_count;
+ bytes -= pad->pre_collapse_qiov.size;
+
+ /*
+ * Construct the bounce buffer to match the length of the to-collapse
+ * vector elements, and for write requests, initialize it with the data
+ * from those elements. Then add it to `pad->local_qiov`.
+ */
+ pad->collapse_len = pad->pre_collapse_qiov.size;
+ pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
+ if (pad->write) {
+ qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
+ pad->collapse_bounce_buf, pad->collapse_len);
+ }
+ qemu_iovec_add(&pad->local_qiov,
+ pad->collapse_bounce_buf, pad->collapse_len);
+ }
+
+ qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
+
+ if (pad->tail) {
+ qemu_iovec_add(&pad->local_qiov,
+ pad->buf + pad->buf_len - pad->tail, pad->tail);
+ }
+
+ assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
+ return 0;
+}
+
+/*
* bdrv_pad_request
*
* Exchange request parameters with padded request if needed. Don't include RMW
* read of padding, bdrv_padding_rmw_read() should be called separately if
* needed.
*
+ * @write is true for write requests, false for read requests.
+ *
* Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
* - on function start they represent original request
* - on failure or when padding is not needed they are unchanged
@@ -1573,26 +1701,34 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
static int bdrv_pad_request(BlockDriverState *bs,
QEMUIOVector **qiov, size_t *qiov_offset,
int64_t *offset, int64_t *bytes,
+ bool write,
BdrvRequestPadding *pad, bool *padded,
BdrvRequestFlags *flags)
{
int ret;
+ struct iovec *sliced_iov;
+ int sliced_niov;
+ size_t sliced_head, sliced_tail;
bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
- if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
+ if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
if (padded) {
*padded = false;
}
return 0;
}
- ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
- *qiov, *qiov_offset, *bytes,
- pad->buf + pad->buf_len - pad->tail,
- pad->tail);
+ sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
+ &sliced_head, &sliced_tail,
+ &sliced_niov);
+
+ /* Guaranteed by bdrv_check_qiov_request() */
+ assert(*bytes <= SIZE_MAX);
+ ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
+ sliced_head, *bytes);
if (ret < 0) {
- bdrv_padding_destroy(pad);
+ bdrv_padding_finalize(pad);
return ret;
}
*bytes += pad->head + pad->tail;
@@ -1659,8 +1795,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
flags |= BDRV_REQ_COPY_ON_READ;
}
- ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
- NULL, &flags);
+ ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
+ &pad, NULL, &flags);
if (ret < 0) {
goto fail;
}
@@ -1670,7 +1806,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
bs->bl.request_alignment,
qiov, qiov_offset, flags);
tracked_request_end(&req);
- bdrv_padding_destroy(&pad);
+ bdrv_padding_finalize(&pad);
fail:
bdrv_dec_in_flight(bs);
@@ -2002,7 +2138,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
/* This flag doesn't make sense for padding or zero writes */
flags &= ~BDRV_REQ_REGISTERED_BUF;
- padding = bdrv_init_padding(bs, offset, bytes, &pad);
+ padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
if (padding) {
assert(!(flags & BDRV_REQ_NO_WAIT));
bdrv_make_request_serialising(req, align);
@@ -2050,7 +2186,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
}
out:
- bdrv_padding_destroy(&pad);
+ bdrv_padding_finalize(&pad);
return ret;
}
@@ -2118,8 +2254,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
* bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
* alignment only if there is no ZERO flag.
*/
- ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
- &padded, &flags);
+ ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
+ &pad, &padded, &flags);
if (ret < 0) {
return ret;
}
@@ -2149,7 +2285,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
qiov, qiov_offset, flags);
- bdrv_padding_destroy(&pad);
+ bdrv_padding_finalize(&pad);
out:
tracked_request_end(&req);
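
To see the collapse arithmetic above with concrete numbers, here is a minimal,
self-contained sketch (assuming IOV_MAX is 1024, as on Linux; all names below
are illustrative, not QEMU's). With a 1024-element guest vector and both head
and tail padding needed, padded_niov is 1026, the surplus is 2, and merging
surplus_count + 1 = 3 elements into one bounce-buffer element brings the final
count back to exactly IOV_MAX:

    /* Illustrative arithmetic only; IOV_MAX assumed to be 1024. */
    #include <assert.h>
    #include <stdio.h>

    #define IOV_MAX 1024

    int main(void)
    {
        int niov = 1024;                 /* guest vector already at the limit */
        int has_head = 1, has_tail = 1;  /* both paddings needed */
        int padded_niov = has_head + niov + has_tail;   /* 1026 */
        int surplus_count = padded_niov - IOV_MAX;      /* 2 */
        int collapse_count = surplus_count + 1;         /* merge 3 into 1 */
        int final_niov = has_head + 1 /* bounce buffer */
                         + (niov - collapse_count) + has_tail;

        assert(final_niov == IOV_MAX);
        printf("collapsed %d elements, final niov = %d\n",
               collapse_count, final_niov);
        return 0;
    }
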
diff --git a/block/parallels.c b/block/parallels.c
index d8a3f13..7c263d5 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -165,6 +165,13 @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
return start_off;
}
+static void parallels_set_bat_entry(BDRVParallelsState *s,
+ uint32_t index, uint32_t offset)
+{
+ s->bat_bitmap[index] = cpu_to_le32(offset);
+ bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 1);
+}
+
static int64_t coroutine_fn GRAPH_RDLOCK
allocate_clusters(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, int *pnum)
@@ -251,10 +258,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
}
for (i = 0; i < to_allocate; i++) {
- s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
+ parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier);
s->data_end += s->tracks;
- bitmap_set(s->bat_dirty_bmap,
- bat_entry_off(idx + i) / s->bat_dirty_block, 1);
}
return bat2sect(s, idx) + sector_num % s->tracks;
@@ -415,16 +420,33 @@ parallels_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
return ret;
}
+static void parallels_check_unclean(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVParallelsState *s = bs->opaque;
+
+ if (!s->header_unclean) {
+ return;
+ }
+
+ fprintf(stderr, "%s image was not closed correctly\n",
+ fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
+ res->corruptions++;
+ if (fix & BDRV_FIX_ERRORS) {
+ /* parallels_close will do the job right */
+ res->corruptions_fixed++;
+ s->header_unclean = false;
+ }
+}
static int coroutine_fn GRAPH_RDLOCK
-parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
- BdrvCheckMode fix)
+parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
- int64_t size, prev_off, high_off;
- int ret;
uint32_t i;
- bool flush_bat = false;
+ int64_t off, high_off, size;
size = bdrv_getlength(bs->file->bs);
if (size < 0) {
@@ -432,65 +454,48 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
return size;
}
- qemu_co_mutex_lock(&s->lock);
- if (s->header_unclean) {
- fprintf(stderr, "%s image was not closed correctly\n",
- fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
- res->corruptions++;
- if (fix & BDRV_FIX_ERRORS) {
- /* parallels_close will do the job right */
- res->corruptions_fixed++;
- s->header_unclean = false;
- }
- }
-
- res->bfi.total_clusters = s->bat_size;
- res->bfi.compressed_clusters = 0; /* compression is not supported */
-
high_off = 0;
- prev_off = 0;
for (i = 0; i < s->bat_size; i++) {
- int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
- if (off == 0) {
- prev_off = 0;
- continue;
- }
-
- /* cluster outside the image */
- if (off > size) {
+ off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+ if (off + s->cluster_size > size) {
fprintf(stderr, "%s cluster %u is outside image\n",
fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
res->corruptions++;
if (fix & BDRV_FIX_ERRORS) {
- prev_off = 0;
- s->bat_bitmap[i] = 0;
+ parallels_set_bat_entry(s, i, 0);
res->corruptions_fixed++;
- flush_bat = true;
- continue;
}
+ continue;
}
-
- res->bfi.allocated_clusters++;
- if (off > high_off) {
+ if (high_off < off) {
high_off = off;
}
+ }
- if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
- res->bfi.fragmented_clusters++;
- }
- prev_off = off;
+ if (high_off == 0) {
+ res->image_end_offset = s->data_end << BDRV_SECTOR_BITS;
+ } else {
+ res->image_end_offset = high_off + s->cluster_size;
+ s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
}
- ret = 0;
- if (flush_bat) {
- ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0);
- if (ret < 0) {
- res->check_errors++;
- goto out;
- }
+ return 0;
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVParallelsState *s = bs->opaque;
+ int64_t size;
+ int ret;
+
+ size = bdrv_getlength(bs->file->bs);
+ if (size < 0) {
+ res->check_errors++;
+ return size;
}
- res->image_end_offset = high_off + s->cluster_size;
if (size > res->image_end_offset) {
int64_t count;
count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size);
@@ -510,14 +515,74 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
if (ret < 0) {
error_report_err(local_err);
res->check_errors++;
- goto out;
+ return ret;
}
res->leaks_fixed += count;
}
}
-out:
- qemu_co_mutex_unlock(&s->lock);
+ return 0;
+}
+
+static void parallels_collect_statistics(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVParallelsState *s = bs->opaque;
+ int64_t off, prev_off;
+ uint32_t i;
+
+ res->bfi.total_clusters = s->bat_size;
+ res->bfi.compressed_clusters = 0; /* compression is not supported */
+
+ prev_off = 0;
+ for (i = 0; i < s->bat_size; i++) {
+ off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+ /*
+ * If BDRV_FIX_ERRORS is not set, out-of-image BAT entries were not
+ * fixed. Skip unallocated and out-of-image BAT entries.
+ */
+ if (off == 0 || off + s->cluster_size > res->image_end_offset) {
+ prev_off = 0;
+ continue;
+ }
+
+ if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
+ res->bfi.fragmented_clusters++;
+ }
+ prev_off = off;
+ res->bfi.allocated_clusters++;
+ }
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+ BDRVParallelsState *s = bs->opaque;
+ int ret;
+
+ WITH_QEMU_LOCK_GUARD(&s->lock) {
+ parallels_check_unclean(bs, res, fix);
+
+ ret = parallels_check_outside_image(bs, res, fix);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = parallels_check_leak(bs, res, fix);
+ if (ret < 0) {
+ return ret;
+ }
+
+ parallels_collect_statistics(bs, res, fix);
+ }
+
+ ret = bdrv_co_flush(bs);
+ if (ret < 0) {
+ res->check_errors++;
+ }
+
return ret;
}
@@ -733,6 +798,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
BDRVParallelsState *s = bs->opaque;
ParallelsHeader ph;
int ret, size, i;
+ int64_t file_nb_sectors;
QemuOpts *opts = NULL;
Error *local_err = NULL;
char *buf;
@@ -742,6 +808,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
return ret;
}
+ file_nb_sectors = bdrv_nb_sectors(bs->file->bs);
+ if (file_nb_sectors < 0) {
+ return -EINVAL;
+ }
+
ret = bdrv_pread(bs->file, 0, sizeof(ph), &ph, 0);
if (ret < 0) {
goto fail;
@@ -806,6 +877,17 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
for (i = 0; i < s->bat_size; i++) {
int64_t off = bat2sect(s, i);
+ if (off >= file_nb_sectors) {
+ if (flags & BDRV_O_CHECK) {
+ continue;
+ }
+ error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
+ "is larger than file size (%" PRIi64 ")",
+ off << BDRV_SECTOR_BITS, i,
+ file_nb_sectors << BDRV_SECTOR_BITS);
+ ret = -EINVAL;
+ goto fail;
+ }
if (off >= s->data_end) {
s->data_end = off + s->tracks;
}
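
The key fix in the refactored check above is the out-of-image condition: it now
compares the cluster's end offset (off + cluster_size) against the file size
instead of only the start offset. A stand-alone sketch of the difference, with
made-up cluster and image sizes:

    /* Sketch of the corrected bounds check; sizes are hypothetical. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool cluster_outside_image(int64_t off, int64_t cluster_size,
                                      int64_t image_size)
    {
        /* The old check (off > image_size) missed a cluster that starts
         * inside the file but extends past its end. */
        return off + cluster_size > image_size;
    }

    int main(void)
    {
        int64_t cluster = 2048, size = 10240;
        printf("%d\n", cluster_outside_image(9216, cluster, size)); /* 1: straddles EOF */
        printf("%d\n", cluster_outside_image(8192, cluster, size)); /* 0: fits exactly */
        return 0;
    }
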
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 39cda7f..2e76de0 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1925,6 +1925,10 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
uint64_t new_l2_bitmap = old_l2_bitmap;
QCow2ClusterType cluster_type =
qcow2_get_cluster_type(bs, old_l2_entry);
+ bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
+ !full_discard &&
+ (s->discard_no_unref &&
+ type == QCOW2_DISCARD_REQUEST);
/*
* If full_discard is true, the cluster should not read back as zeroes,
@@ -1943,10 +1947,22 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
new_l2_entry = new_l2_bitmap = 0;
} else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
if (has_subclusters(s)) {
- new_l2_entry = 0;
+ if (keep_reference) {
+ new_l2_entry = old_l2_entry;
+ } else {
+ new_l2_entry = 0;
+ }
new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
} else {
- new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
+ if (s->qcow_version >= 3) {
+ if (keep_reference) {
+ new_l2_entry |= QCOW_OFLAG_ZERO;
+ } else {
+ new_l2_entry = QCOW_OFLAG_ZERO;
+ }
+ } else {
+ new_l2_entry = 0;
+ }
}
}
@@ -1960,8 +1976,16 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
if (has_subclusters(s)) {
set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
}
- /* Then decrease the refcount */
- qcow2_free_any_cluster(bs, old_l2_entry, type);
+ if (!keep_reference) {
+ /* Then decrease the refcount */
+ qcow2_free_any_cluster(bs, old_l2_entry, type);
+ } else if (s->discard_passthrough[type] &&
+ (cluster_type == QCOW2_CLUSTER_NORMAL ||
+ cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+ /* If we keep the reference, pass on the discard still */
+ bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
+ s->cluster_size);
+ }
}
qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
diff --git a/block/qcow2.c b/block/qcow2.c
index 7f39483..e23edd4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -682,6 +682,7 @@ static const char *const mutable_opts[] = {
QCOW2_OPT_DISCARD_REQUEST,
QCOW2_OPT_DISCARD_SNAPSHOT,
QCOW2_OPT_DISCARD_OTHER,
+ QCOW2_OPT_DISCARD_NO_UNREF,
QCOW2_OPT_OVERLAP,
QCOW2_OPT_OVERLAP_TEMPLATE,
QCOW2_OPT_OVERLAP_MAIN_HEADER,
@@ -727,6 +728,11 @@ static QemuOptsList qcow2_runtime_opts = {
.help = "Generate discard requests when other clusters are freed",
},
{
+ .name = QCOW2_OPT_DISCARD_NO_UNREF,
+ .type = QEMU_OPT_BOOL,
+ .help = "Do not unreference discarded clusters",
+ },
+ {
.name = QCOW2_OPT_OVERLAP,
.type = QEMU_OPT_STRING,
.help = "Selects which overlap checks to perform from a range of "
@@ -969,6 +975,7 @@ typedef struct Qcow2ReopenState {
bool use_lazy_refcounts;
int overlap_check;
bool discard_passthrough[QCOW2_DISCARD_MAX];
+ bool discard_no_unref;
uint64_t cache_clean_interval;
QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
} Qcow2ReopenState;
@@ -1140,6 +1147,15 @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
r->discard_passthrough[QCOW2_DISCARD_OTHER] =
qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
+ r->discard_no_unref = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_NO_UNREF,
+ false);
+ if (r->discard_no_unref && s->qcow_version < 3) {
+ error_setg(errp,
+ "discard-no-unref is only supported since qcow2 version 3");
+ ret = -EINVAL;
+ goto fail;
+ }
+
switch (s->crypt_method_header) {
case QCOW_CRYPT_NONE:
if (encryptfmt) {
@@ -1220,6 +1236,8 @@ static void qcow2_update_options_commit(BlockDriverState *bs,
s->discard_passthrough[i] = r->discard_passthrough[i];
}
+ s->discard_no_unref = r->discard_no_unref;
+
if (s->cache_clean_interval != r->cache_clean_interval) {
cache_clean_timer_del(bs);
s->cache_clean_interval = r->cache_clean_interval;
diff --git a/block/qcow2.h b/block/qcow2.h
index 4f67eb9..ea9adb5 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -133,6 +133,7 @@
#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+#define QCOW2_OPT_DISCARD_NO_UNREF "discard-no-unref"
#define QCOW2_OPT_OVERLAP "overlap-check"
#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
@@ -385,6 +386,8 @@ typedef struct BDRVQcow2State {
bool discard_passthrough[QCOW2_DISCARD_MAX];
+ bool discard_no_unref;
+
int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
bool signaled_corruption;
diff --git a/include/qemu/iov.h b/include/qemu/iov.h
index 9330746..63a1c01 100644
--- a/include/qemu/iov.h
+++ b/include/qemu/iov.h
@@ -222,13 +222,11 @@ static inline void *qemu_iovec_buf(QEMUIOVector *qiov)
void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
-int qemu_iovec_init_extended(
- QEMUIOVector *qiov,
- void *head_buf, size_t head_len,
- QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
- void *tail_buf, size_t tail_len);
void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
size_t offset, size_t len);
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+ size_t offset, size_t len,
+ size_t *head, size_t *tail, int *niov);
int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len);
void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
void qemu_iovec_concat(QEMUIOVector *dst,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 4bf8917..5dd5f7e 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3478,6 +3478,17 @@
# @pass-discard-other: whether discard requests for the data source
# should be issued on other occasions where a cluster gets freed
#
+# @discard-no-unref: when enabled, discards from the guest will not cause
+# cluster allocations to be relinquished. This prevents qcow2 fragmentation
+# that would be caused by such discards. Besides potential
+# performance degradation, such fragmentation can lead to increased
+# allocation of clusters past the end of the image file,
+# resulting in image files whose file length can grow much larger
+# than their guest disk size would suggest.
+# If image file length is of concern (e.g. when storing qcow2
+# images directly on block devices), you should consider enabling
+# this option. (since 8.1)
+#
# @overlap-check: which overlap checks to perform for writes to the
# image, defaults to 'cached' (since 2.2)
#
@@ -3516,6 +3527,7 @@
'*pass-discard-request': 'bool',
'*pass-discard-snapshot': 'bool',
'*pass-discard-other': 'bool',
+ '*discard-no-unref': 'bool',
'*overlap-check': 'Qcow2OverlapChecks',
'*cache-size': 'int',
'*l2-cache-size': 'int',
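
Since the schema above adds '*discard-no-unref' as an optional member of the
qcow2 blockdev options, the setting can be supplied through blockdev-add. A QMP
sketch for illustration (the node name and filename are hypothetical):

    { "execute": "blockdev-add",
      "arguments": { "driver": "qcow2",
                     "node-name": "disk0",
                     "discard-no-unref": true,
                     "file": { "driver": "file",
                               "filename": "disk.qcow2" } } }
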
diff --git a/qemu-options.hx b/qemu-options.hx
index b37eb96..b57489d 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1431,6 +1431,18 @@ SRST
issued on other occasions where a cluster gets freed
(on/off; default: off)
+ ``discard-no-unref``
+ When enabled, discards from the guest will not cause cluster
+ allocations to be relinquished. This prevents qcow2 fragmentation
+ that would be caused by such discards. Besides potential
+ performance degradation, such fragmentation can lead to increased
+ allocation of clusters past the end of the image file,
+ resulting in image files whose file length can grow much larger
+ than their guest disk size would suggest.
+ If image file length is of concern (e.g. when storing qcow2
+ images directly on block devices), you should consider enabling
+ this option.
+
``overlap-check``
Which overlap checks to perform for writes to the image
(none/constant/cached/all; default: cached). For details or
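
On the command line the option rides along with the other qcow2 runtime
options. An illustrative -blockdev fragment (paths and node name are
hypothetical):

    -blockdev driver=qcow2,node-name=disk0,discard-no-unref=on,file.driver=file,file.filename=disk.qcow2
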
diff --git a/tests/qemu-iotests/tests/iov-padding b/tests/qemu-iotests/tests/iov-padding
new file mode 100755
index 0000000..b960490
--- /dev/null
+++ b/tests/qemu-iotests/tests/iov-padding
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# group: rw quick
+#
+# Check the interaction of request padding (to fit alignment restrictions) with
+# vectored I/O from the guest
+#
+# Copyright Red Hat
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+seq=$(basename $0)
+echo "QA output created by $seq"
+
+status=1 # failure is the default!
+
+_cleanup()
+{
+ _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+cd ..
+. ./common.rc
+. ./common.filter
+
+_supported_fmt raw
+_supported_proto file
+
+_make_test_img 1M
+
+IMGSPEC="driver=blkdebug,align=4096,image.driver=file,image.filename=$TEST_IMG"
+
+# Four combinations:
+# - Offset 4096, length 1023 * 512 + 512: Fully aligned to 4k
+# - Offset 4096, length 1023 * 512 + 4096: Head is aligned, tail is not
+# - Offset 512, length 1023 * 512 + 512: Neither head nor tail are aligned
+# - Offset 512, length 1023 * 512 + 4096: Tail is aligned, head is not
+for start_offset in 4096 512; do
+ for last_element_length in 512 4096; do
+ length=$((1023 * 512 + $last_element_length))
+
+ echo
+ echo "== performing 1024-element vectored requests to image (offset: $start_offset; length: $length) =="
+
+ # Fill with data for testing
+ $QEMU_IO -c 'write -P 1 0 1M' "$TEST_IMG" | _filter_qemu_io
+
+ # 1023 512-byte buffers, and then one with length $last_element_length
+ cmd_params="-P 2 $start_offset $(yes 512 | head -n 1023 | tr '\n' ' ') $last_element_length"
+ QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
+ -c "writev $cmd_params" \
+ --image-opts \
+ "$IMGSPEC" \
+ | _filter_qemu_io
+
+ # Read all patterns -- read the part we just wrote with writev twice,
+ # once "normally", and once with a readv, so we see that that works, too
+ QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
+ -c "read -P 1 0 $start_offset" \
+ -c "read -P 2 $start_offset $length" \
+ -c "readv $cmd_params" \
+ -c "read -P 1 $((start_offset + length)) $((1024 * 1024 - length - start_offset))" \
+ --image-opts \
+ "$IMGSPEC" \
+ | _filter_qemu_io
+ done
+done
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/tests/iov-padding.out b/tests/qemu-iotests/tests/iov-padding.out
new file mode 100644
index 0000000..e07a91f
--- /dev/null
+++ b/tests/qemu-iotests/tests/iov-padding.out
@@ -0,0 +1,59 @@
+QA output created by iov-padding
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
+
+== performing 1024-element vectored requests to image (offset: 4096; length: 524288) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 520192/520192 bytes at offset 528384
+508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 4096; length: 527872) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 516608/516608 bytes at offset 531968
+504.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 512; length: 524288) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 523776/523776 bytes at offset 524800
+511.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 512; length: 527872) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 520192/520192 bytes at offset 528384
+508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/util/iov.c b/util/iov.c
index b4be580..866fb57 100644
--- a/util/iov.c
+++ b/util/iov.c
@@ -378,15 +378,15 @@ static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset,
}
/*
- * qiov_slice
+ * qemu_iovec_slice
*
 * Find the subarray of iovecs that contains the requested range. @head is
 * the offset into the first iov (returned by the function), and @tail is
 * the count of extra bytes in the last iovec (returned iov + @niov - 1).
*/
-static struct iovec *qiov_slice(QEMUIOVector *qiov,
- size_t offset, size_t len,
- size_t *head, size_t *tail, int *niov)
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+ size_t offset, size_t len,
+ size_t *head, size_t *tail, int *niov)
{
struct iovec *iov, *end_iov;
@@ -411,76 +411,12 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len)
size_t head, tail;
int niov;
- qiov_slice(qiov, offset, len, &head, &tail, &niov);
+ qemu_iovec_slice(qiov, offset, len, &head, &tail, &niov);
return niov;
}
/*
- * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov,
- * and @tail_buf buffer into new qiov.
- */
-int qemu_iovec_init_extended(
- QEMUIOVector *qiov,
- void *head_buf, size_t head_len,
- QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
- void *tail_buf, size_t tail_len)
-{
- size_t mid_head, mid_tail;
- int total_niov, mid_niov = 0;
- struct iovec *p, *mid_iov = NULL;
-
- assert(mid_qiov->niov <= IOV_MAX);
-
- if (SIZE_MAX - head_len < mid_len ||
- SIZE_MAX - head_len - mid_len < tail_len)
- {
- return -EINVAL;
- }
-
- if (mid_len) {
- mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
- &mid_head, &mid_tail, &mid_niov);
- }
-
- total_niov = !!head_len + mid_niov + !!tail_len;
- if (total_niov > IOV_MAX) {
- return -EINVAL;
- }
-
- if (total_niov == 1) {
- qemu_iovec_init_buf(qiov, NULL, 0);
- p = &qiov->local_iov;
- } else {
- qiov->niov = qiov->nalloc = total_niov;
- qiov->size = head_len + mid_len + tail_len;
- p = qiov->iov = g_new(struct iovec, qiov->niov);
- }
-
- if (head_len) {
- p->iov_base = head_buf;
- p->iov_len = head_len;
- p++;
- }
-
- assert(!mid_niov == !mid_len);
- if (mid_niov) {
- memcpy(p, mid_iov, mid_niov * sizeof(*p));
- p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
- p[0].iov_len -= mid_head;
- p[mid_niov - 1].iov_len -= mid_tail;
- p += mid_niov;
- }
-
- if (tail_len) {
- p->iov_base = tail_buf;
- p->iov_len = tail_len;
- }
-
- return 0;
-}
-
-/*
 * Check if the contents of a subrange of qiov data are all zeroes.
*/
bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
@@ -511,14 +447,21 @@ bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
size_t offset, size_t len)
{
- int ret;
+ struct iovec *slice_iov;
+ int slice_niov;
+ size_t slice_head, slice_tail;
assert(source->size >= len);
assert(source->size - len >= offset);
- /* We shrink the request, so we can't overflow neither size_t nor MAX_IOV */
- ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
- assert(ret == 0);
+ slice_iov = qemu_iovec_slice(source, offset, len,
+ &slice_head, &slice_tail, &slice_niov);
+ if (slice_niov == 1) {
+ qemu_iovec_init_buf(qiov, slice_iov[0].iov_base + slice_head, len);
+ } else {
+ qemu_iovec_init(qiov, slice_niov);
+ qemu_iovec_concat_iov(qiov, slice_iov, slice_niov, slice_head, len);
+ }
}
void qemu_iovec_destroy(QEMUIOVector *qiov)
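
To make the head/tail semantics of qemu_iovec_slice() concrete, here is a
self-contained sketch of the same idea over plain struct iovec arrays. This is
not QEMU's implementation; the function name and the example values are made
up:

    /* Sketch: locate the iovec subarray covering [offset, offset + len). */
    #include <stdio.h>
    #include <sys/uio.h>

    static struct iovec *iov_slice(struct iovec *iov, int niov,
                                   size_t offset, size_t len,
                                   size_t *head, size_t *tail, int *sniov)
    {
        struct iovec *p = iov, *end = iov + niov;

        /* Skip whole elements that lie before the requested offset */
        while (p < end && offset >= p->iov_len) {
            offset -= p->iov_len;
            p++;
        }
        *head = offset;

        /* Walk forward until the requested range is covered */
        size_t covered = p->iov_len - offset;
        struct iovec *q = p;
        while (q + 1 < end && covered < len) {
            q++;
            covered += q->iov_len;
        }
        *tail = covered - len;
        *sniov = (int)(q - p) + 1;
        return p;
    }

    int main(void)
    {
        char a[] = "aaaa", b[] = "bbbb", c[] = "cccc";
        struct iovec v[3] = { { a, 4 }, { b, 4 }, { c, 4 } };
        size_t head, tail;
        int niov;

        iov_slice(v, 3, 2, 8, &head, &tail, &niov);
        printf("head=%zu niov=%d tail=%zu\n", head, niov, tail);
        return 0;
    }

Slicing 8 bytes starting at offset 2 out of three 4-byte buffers yields a
3-element subarray with head = 2 and tail = 2, matching the contract described
in the comment above.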