author    | Richard Henderson <richard.henderson@linaro.org> | 2023-06-05 10:27:31 -0700
committer | Richard Henderson <richard.henderson@linaro.org> | 2023-06-05 10:27:31 -0700
commit    | b52daaf2c868f2bab102eb5acbf55b2917f46aea (patch)
tree      | 7717ebe9fd1dec6af5ff339400c645413c22c97f
parent    | afa351fe36d4a09842e06a0159325568ac8f923b (diff)
parent    | 42a2890a76f4783cd1c212f27856edcf2b5e8a75 (diff)
download  | qemu-b52daaf2c868f2bab102eb5acbf55b2917f46aea.zip
          | qemu-b52daaf2c868f2bab102eb5acbf55b2917f46aea.tar.gz
          | qemu-b52daaf2c868f2bab102eb5acbf55b2917f46aea.tar.bz2
Merge tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu into staging
Block patches
- Fix padding of unaligned vectored requests to match the host alignment
  for vectors with 1023 or 1024 buffers (a standalone sketch of the IOV_MAX
  limit at play follows this list)
- Refactor and fix bugs in the parallels driver's image check functionality
- Add an option to the qcow2 driver to retain (qcow2-level) allocations
on discard requests from the guest (while still forwarding the discard
to the lower level and marking the range as zero)
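The first item works around the host's IOV_MAX cap on vectored I/O. As a minimal standalone illustration (plain C, not QEMU code; writing to /dev/null stands in for the image file), POSIX writev() rejects a vector of more than IOV_MAX elements outright, which is exactly what a 1024-element guest request hits once head/tail padding buffers are prepended and appended:

```c
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    static char byte = 0;
    struct iovec iov[IOV_MAX + 1];   /* IOV_MAX is 1024 on Linux */
    int fd = open("/dev/null", O_WRONLY);

    for (int i = 0; i < IOV_MAX + 1; i++) {
        iov[i].iov_base = &byte;
        iov[i].iov_len = 1;
    }

    /* Exactly IOV_MAX elements: accepted */
    printf("writev(IOV_MAX):     %zd\n", writev(fd, iov, IOV_MAX));

    /* One element more, as padding can cause: fails with EINVAL */
    printf("writev(IOV_MAX + 1): %zd (%s)\n",
           writev(fd, iov, IOV_MAX + 1), strerror(errno));

    close(fd);
    return 0;
}
```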
# -----BEGIN PGP SIGNATURE-----
#
# iQJGBAABCAAwFiEEy2LXoO44KeRfAE00ofpA0JgBnN8FAmR+AT4SHGhyZWl0ekBy
# ZWRoYXQuY29tAAoJEKH6QNCYAZzfnboQAKD6YrreZLoseomRfqOAoApSf6yOdcHk
# 6kfsvzwzjosomsF1Pkzm4851vX5PyDqTdeu0iViM+pxanVO1b494q1P4VcAERqMB
# iZVs68R6M0l6HV9btWFGm+ibHJf4FapdntkIdwog1ka5TIhw5oDWCVNLigjhIoRv
# sM37Bgf14kC3sFTR++0HESsyU1eUP5gJjwJbPZ2IgJBmzYay0is1z5nHA/3VUswu
# 8dKnGQDsv62EtlK7PK8cU2BhLOeNi6Wr3bAb6Wf2QLB5e0qRb7oAkqNx5/UcTznk
# a3XMC1aiWhYvM/+DaYIpQUcIPgA8xQ1KHKeD6WjbGfLgZBqseX0aGWMByUsiY8Bo
# +BPIBnUDrbiPnAKB/XLQfnzlE+s7121/JpEbB7AkZqVFRGuw8Wur4tbc2fzvy8Pw
# x/uQfv3ZPi/2Lf6u7hv/TVHubXi8jucVgx3Ubu5Jeo3901S4/KOQBQ4BQ/GYIGQX
# 38ijSROcEd0eQJ1mTKPEctouxjSZCghNSbrn9DfsL1V3VWqWNKKGCU3hM+RQ1SJT
# 688qvnyYt8QZfTsiDSHR/GfKsufG0DkoqE7c9IhSEPohecAH8Rrc3HcLut7fuwD2
# gCFQhm68CPwwRmBjPCY6Zi1RDzeOyFBSWN31T6t0yTb4OHJ/3/cSZVBJtwwkOVbx
# zwabHDNdY5Kw
# =GuoL
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 05 Jun 2023 08:37:34 AM PDT
# gpg: using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg: issuer "hreitz@redhat.com"
# gpg: Good signature from "Hanna Reitz <hreitz@redhat.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00 4D34 A1FA 40D0 9801 9CDF
* tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu:
qcow2: add discard-no-unref option
parallels: Incorrect condition in out-of-image check
parallels: Replace qemu_co_mutex_lock by WITH_QEMU_LOCK_GUARD
parallels: Move statistic collection to a separate function
parallels: Move check of leaks to a separate function
parallels: Fix statistics calculation
parallels: Move check of cluster outside image to a separate function
parallels: Move check of unclean image to a separate function
parallels: Use generic infrastructure for BAT writing in parallels_co_check()
parallels: create parallels_set_bat_entry_helper() to assign BAT value
parallels: Fix image_end_offset and data_end after out-of-image check
parallels: Fix high_off calculation in parallels_co_check()
parallels: Out of image offset in BAT leads to image inflation
iotests/iov-padding: New test
util/iov: Remove qemu_iovec_init_extended()
block: Collapse padded I/O vecs exceeding IOV_MAX
util/iov: Make qiov_slice() public
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r-- | block/io.c                               | 166
-rw-r--r-- | block/parallels.c                        | 190
-rw-r--r-- | block/qcow2-cluster.c                    |  32
-rw-r--r-- | block/qcow2.c                            |  18
-rw-r--r-- | block/qcow2.h                            |   3
-rw-r--r-- | include/qemu/iov.h                       |   8
-rw-r--r-- | qapi/block-core.json                     |  12
-rw-r--r-- | qemu-options.hx                          |  12
-rwxr-xr-x | tests/qemu-iotests/tests/iov-padding     |  85
-rw-r--r-- | tests/qemu-iotests/tests/iov-padding.out |  59
-rw-r--r-- | util/iov.c                               |  89
11 files changed, 523 insertions(+), 151 deletions(-)
diff --git a/block/io.c b/block/io.c
--- a/block/io.c
+++ b/block/io.c
@@ -1441,6 +1441,14 @@ out:
  * @merge_reads is true for small requests,
  * if @buf_len == @head + bytes + @tail. In this case it is possible that both
  * head and tail exist but @buf_len == align and @tail_buf == @buf.
+ *
+ * @write is true for write requests, false for read requests.
+ *
+ * If padding makes the vector too long (exceeding IOV_MAX), then we need to
+ * merge existing vector elements into a single one. @collapse_bounce_buf acts
+ * as the bounce buffer in such cases. @pre_collapse_qiov has the pre-collapse
+ * I/O vector elements, so for read requests, the data can be copied back after
+ * the read is done.
  */
 typedef struct BdrvRequestPadding {
     uint8_t *buf;
@@ -1449,11 +1457,17 @@ typedef struct BdrvRequestPadding {
     size_t head;
     size_t tail;
     bool merge_reads;
+    bool write;
     QEMUIOVector local_qiov;
+
+    uint8_t *collapse_bounce_buf;
+    size_t collapse_len;
+    QEMUIOVector pre_collapse_qiov;
 } BdrvRequestPadding;
 
 static bool bdrv_init_padding(BlockDriverState *bs,
                               int64_t offset, int64_t bytes,
+                              bool write,
                               BdrvRequestPadding *pad)
 {
     int64_t align = bs->bl.request_alignment;
@@ -1485,6 +1499,8 @@ static bool bdrv_init_padding(BlockDriverState *bs,
         pad->tail_buf = pad->buf + pad->buf_len - align;
     }
 
+    pad->write = write;
+
     return true;
 }
 
@@ -1549,8 +1565,23 @@ zero_mem:
     return 0;
 }
 
-static void bdrv_padding_destroy(BdrvRequestPadding *pad)
+/**
+ * Free *pad's associated buffers, and perform any necessary finalization
+ * steps.
+ */
+static void bdrv_padding_finalize(BdrvRequestPadding *pad)
 {
+    if (pad->collapse_bounce_buf) {
+        if (!pad->write) {
+            /*
+             * If padding required elements in the vector to be collapsed into
+             * a bounce buffer, copy the bounce buffer content back
+             */
+            qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
+                                pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_vfree(pad->collapse_bounce_buf);
+        qemu_iovec_destroy(&pad->pre_collapse_qiov);
+    }
     if (pad->buf) {
         qemu_vfree(pad->buf);
         qemu_iovec_destroy(&pad->local_qiov);
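The copy-back in bdrv_padding_finalize() is the read-side half of the collapse trick: reads land in the contiguous bounce buffer and must be scattered back into the guest's original elements. A minimal sketch with no QEMU dependencies (iov_gather/iov_scatter are hypothetical stand-ins for qemu_iovec_to_buf()/qemu_iovec_from_buf()):

```c
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Hypothetical stand-in for qemu_iovec_to_buf(): linearize elements */
static void iov_gather(char *buf, const struct iovec *iov, int n)
{
    for (int i = 0; i < n; i++) {
        memcpy(buf, iov[i].iov_base, iov[i].iov_len);
        buf += iov[i].iov_len;
    }
}

/* Hypothetical stand-in for qemu_iovec_from_buf(): scatter back */
static void iov_scatter(const char *buf, const struct iovec *iov, int n)
{
    for (int i = 0; i < n; i++) {
        memcpy(iov[i].iov_base, buf, iov[i].iov_len);
        buf += iov[i].iov_len;
    }
}

int main(void)
{
    char a[4] = "aaaa", b[4] = "bbbb";
    struct iovec pre_collapse[2] = {
        { .iov_base = a, .iov_len = sizeof(a) },
        { .iov_base = b, .iov_len = sizeof(b) },
    };
    char bounce[8];

    /* Write path: collapse the two elements into the bounce buffer */
    iov_gather(bounce, pre_collapse, 2);
    printf("bounce after write-side gather: %.8s\n", bounce);

    /* Read path: the driver filled the bounce buffer; finalization
     * scatters it back into the original elements */
    memcpy(bounce, "12345678", sizeof(bounce));
    iov_scatter(bounce, pre_collapse, 2);
    printf("elements after read-side scatter: %.4s %.4s\n", a, b);
    return 0;
}
```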
@@ -1559,12 +1590,109 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
 }
 
 /*
+ * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
+ * ensuring that the resulting vector will not exceed IOV_MAX elements.
+ *
+ * To ensure this, when necessary, the first two or three elements of @iov are
+ * merged into pad->collapse_bounce_buf and replaced by a reference to that
+ * bounce buffer in pad->local_qiov.
+ *
+ * After performing a read request, the data from the bounce buffer must be
+ * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
+ */
+static int bdrv_create_padded_qiov(BlockDriverState *bs,
+                                   BdrvRequestPadding *pad,
+                                   struct iovec *iov, int niov,
+                                   size_t iov_offset, size_t bytes)
+{
+    int padded_niov, surplus_count, collapse_count;
+
+    /* Assert this invariant */
+    assert(niov <= IOV_MAX);
+
+    /*
+     * Cannot pad if resulting length would exceed SIZE_MAX. Returning an
+     * error to the guest is not ideal, but there is little else we can do.
+     * At least this will practically never happen on 64-bit systems.
+     */
+    if (SIZE_MAX - pad->head < bytes ||
+        SIZE_MAX - pad->head - bytes < pad->tail)
+    {
+        return -EINVAL;
+    }
+
+    /* Length of the resulting IOV if we just concatenated everything */
+    padded_niov = !!pad->head + niov + !!pad->tail;
+
+    qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
+
+    if (pad->head) {
+        qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
+    }
+
+    /*
+     * If padded_niov > IOV_MAX, we cannot just concatenate everything.
+     * Instead, merge the first two or three elements of @iov to reduce the
+     * number of vector elements as necessary.
+     */
+    if (padded_niov > IOV_MAX) {
+        /*
+         * Only head and tail can have led to the number of entries exceeding
+         * IOV_MAX, so we can exceed it by the head and tail at most. We need
+         * to reduce the number of elements by `surplus_count`, so we merge
+         * that many elements plus one into one element.
+         */
+        surplus_count = padded_niov - IOV_MAX;
+        assert(surplus_count <= !!pad->head + !!pad->tail);
+        collapse_count = surplus_count + 1;
+
+        /*
+         * Move the elements to collapse into `pad->pre_collapse_qiov`, then
+         * advance `iov` (and associated variables) by those elements.
+         */
+        qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
+        qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
+                              collapse_count, iov_offset, SIZE_MAX);
+        iov += collapse_count;
+        iov_offset = 0;
+        niov -= collapse_count;
+        bytes -= pad->pre_collapse_qiov.size;
+
+        /*
+         * Construct the bounce buffer to match the length of the to-collapse
+         * vector elements, and for write requests, initialize it with the
+         * data from those elements. Then add it to `pad->local_qiov`.
+         */
+        pad->collapse_len = pad->pre_collapse_qiov.size;
+        pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
+        if (pad->write) {
+            qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
+                              pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->collapse_bounce_buf, pad->collapse_len);
+    }
+
+    qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
+
+    if (pad->tail) {
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->buf + pad->buf_len - pad->tail, pad->tail);
+    }
+
+    assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
+    return 0;
+}
+
+/*
  * bdrv_pad_request
  *
  * Exchange request parameters with padded request if needed. Don't include RMW
  * read of padding, bdrv_padding_rmw_read() should be called separately if
  * needed.
  *
+ * @write is true for write requests, false for read requests.
+ *
  * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
  * - on function start they represent original request
  * - on failure or when padding is not needed they are unchanged
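The counting in bdrv_create_padded_qiov() is compact; unrolled as a standalone sketch (assuming IOV_MAX is 1024, as on Linux), it shows why merging surplus_count + 1 elements into one bounce buffer is exactly enough:

```c
#include <stdio.h>

#define IOV_MAX 1024   /* assumption: Linux's value */

static void plan(int niov, int head, int tail)
{
    /* Result length if head, guest vector and tail were just concatenated */
    int padded_niov = !!head + niov + !!tail;

    if (padded_niov <= IOV_MAX) {
        printf("niov=%d head=%d tail=%d -> no collapse needed\n",
               niov, !!head, !!tail);
        return;
    }

    /* Padding adds at most two elements, so the surplus is 1 or 2; merging
     * surplus_count + 1 guest elements into one bounce buffer removes
     * exactly surplus_count elements from the final vector. */
    int surplus_count = padded_niov - IOV_MAX;
    int collapse_count = surplus_count + 1;

    printf("niov=%d head=%d tail=%d -> collapse first %d elements "
           "(final niov=%d)\n", niov, !!head, !!tail, collapse_count,
           padded_niov - surplus_count);
}

int main(void)
{
    plan(1024, 0, 0);       /* fits exactly */
    plan(1024, 4096, 0);    /* head only: merge 2 into 1 */
    plan(1024, 512, 512);   /* head and tail: merge 3 into 1 */
    plan(1023, 512, 512);   /* 1023 + 2 = 1025: merge 2 into 1 */
    return 0;
}
```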
@@ -1573,26 +1701,34 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
 static int bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, int64_t *bytes,
+                            bool write,
                             BdrvRequestPadding *pad, bool *padded,
                             BdrvRequestFlags *flags)
 {
     int ret;
+    struct iovec *sliced_iov;
+    int sliced_niov;
+    size_t sliced_head, sliced_tail;
 
     bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
 
-    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
+    if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
         if (padded) {
             *padded = false;
         }
         return 0;
     }
 
-    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
-                                   *qiov, *qiov_offset, *bytes,
-                                   pad->buf + pad->buf_len - pad->tail,
-                                   pad->tail);
+    sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
+                                  &sliced_head, &sliced_tail,
+                                  &sliced_niov);
+
+    /* Guaranteed by bdrv_check_qiov_request() */
+    assert(*bytes <= SIZE_MAX);
+    ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
+                                  sliced_head, *bytes);
     if (ret < 0) {
-        bdrv_padding_destroy(pad);
+        bdrv_padding_finalize(pad);
         return ret;
     }
 
     *bytes += pad->head + pad->tail;
@@ -1659,8 +1795,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
         flags |= BDRV_REQ_COPY_ON_READ;
     }
 
-    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                           NULL, &flags);
+    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
+                           &pad, NULL, &flags);
     if (ret < 0) {
         goto fail;
     }
@@ -1670,7 +1806,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
                               bs->bl.request_alignment,
                               qiov, qiov_offset, flags);
     tracked_request_end(&req);
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
 fail:
     bdrv_dec_in_flight(bs);
@@ -2002,7 +2138,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
     /* This flag doesn't make sense for padding or zero writes */
     flags &= ~BDRV_REQ_REGISTERED_BUF;
 
-    padding = bdrv_init_padding(bs, offset, bytes, &pad);
+    padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
     if (padding) {
         assert(!(flags & BDRV_REQ_NO_WAIT));
         bdrv_make_request_serialising(req, align);
@@ -2050,7 +2186,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
     }
 
 out:
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
     return ret;
 }
@@ -2118,8 +2254,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
      * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
      * alignment only if there is no ZERO flag.
      */
-    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                           &padded, &flags);
+    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
+                           &pad, &padded, &flags);
     if (ret < 0) {
         return ret;
     }
@@ -2149,7 +2285,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                                qiov, qiov_offset, flags);
 
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
 out:
     tracked_request_end(&req);
diff --git a/block/parallels.c b/block/parallels.c
index d8a3f13..7c263d5 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -165,6 +165,13 @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
     return start_off;
 }
 
+static void parallels_set_bat_entry(BDRVParallelsState *s,
+                                    uint32_t index, uint32_t offset)
+{
+    s->bat_bitmap[index] = cpu_to_le32(offset);
+    bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 1);
+}
+
 static int64_t coroutine_fn GRAPH_RDLOCK
 allocate_clusters(BlockDriverState *bs, int64_t sector_num,
                   int nb_sectors, int *pnum)
@@ -251,10 +258,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
     }
 
     for (i = 0; i < to_allocate; i++) {
-        s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
+        parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier);
         s->data_end += s->tracks;
-        bitmap_set(s->bat_dirty_bmap,
-                   bat_entry_off(idx + i) / s->bat_dirty_block, 1);
     }
 
     return bat2sect(s, idx) + sector_num % s->tracks;
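parallels_set_bat_entry() exists so that storing a BAT entry and flagging its dirty-bitmap block can never drift apart again, as they had in allocate_clusters() versus the check path. A simplified standalone model (all names, sizes and the flat byte array standing in for QEMU's bitmap are hypothetical, and the little-endian store is elided):

```c
#include <stdint.h>
#include <stdio.h>

#define BAT_DIRTY_BLOCK 64u              /* bytes covered per dirty flag */
#define BAT_ENTRY_SIZE  sizeof(uint32_t)

static uint32_t bat[256];
static uint8_t dirty[sizeof(bat) / BAT_DIRTY_BLOCK];

/* Single choke point: update the entry AND mark the covering block dirty */
static void set_bat_entry(uint32_t index, uint32_t offset)
{
    bat[index] = offset;                 /* cpu_to_le32() in the real code */
    dirty[index * BAT_ENTRY_SIZE / BAT_DIRTY_BLOCK] = 1;
}

int main(void)
{
    set_bat_entry(0, 0x100);             /* byte 0  -> dirty block 0 */
    set_bat_entry(17, 0x200);            /* byte 68 -> dirty block 1 */
    for (unsigned i = 0; i < 4; i++) {
        printf("dirty block %u: %d\n", i, dirty[i]);
    }
    return 0;
}
```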
"Repairing" : "ERROR", i); res->corruptions++; if (fix & BDRV_FIX_ERRORS) { - prev_off = 0; - s->bat_bitmap[i] = 0; + parallels_set_bat_entry(s, i, 0); res->corruptions_fixed++; - flush_bat = true; - continue; } + continue; } - - res->bfi.allocated_clusters++; - if (off > high_off) { + if (high_off < off) { high_off = off; } + } - if (prev_off != 0 && (prev_off + s->cluster_size) != off) { - res->bfi.fragmented_clusters++; - } - prev_off = off; + if (high_off == 0) { + res->image_end_offset = s->data_end << BDRV_SECTOR_BITS; + } else { + res->image_end_offset = high_off + s->cluster_size; + s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS; } - ret = 0; - if (flush_bat) { - ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0); - if (ret < 0) { - res->check_errors++; - goto out; - } + return 0; +} + +static int coroutine_fn GRAPH_RDLOCK +parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVParallelsState *s = bs->opaque; + int64_t size; + int ret; + + size = bdrv_getlength(bs->file->bs); + if (size < 0) { + res->check_errors++; + return size; } - res->image_end_offset = high_off + s->cluster_size; if (size > res->image_end_offset) { int64_t count; count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size); @@ -510,14 +515,74 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res, if (ret < 0) { error_report_err(local_err); res->check_errors++; - goto out; + return ret; } res->leaks_fixed += count; } } -out: - qemu_co_mutex_unlock(&s->lock); + return 0; +} + +static void parallels_collect_statistics(BlockDriverState *bs, + BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVParallelsState *s = bs->opaque; + int64_t off, prev_off; + uint32_t i; + + res->bfi.total_clusters = s->bat_size; + res->bfi.compressed_clusters = 0; /* compression is not supported */ + + prev_off = 0; + for (i = 0; i < s->bat_size; i++) { + off = bat2sect(s, i) << BDRV_SECTOR_BITS; + /* + * If BDRV_FIX_ERRORS is not set, out-of-image BAT entries were not + * fixed. Skip not allocated and out-of-image BAT entries. 
@@ -510,14 +515,74 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
             if (ret < 0) {
                 error_report_err(local_err);
                 res->check_errors++;
-                goto out;
+                return ret;
             }
             res->leaks_fixed += count;
         }
     }
 
-out:
-    qemu_co_mutex_unlock(&s->lock);
+    return 0;
+}
+
+static void parallels_collect_statistics(BlockDriverState *bs,
+                                         BdrvCheckResult *res,
+                                         BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int64_t off, prev_off;
+    uint32_t i;
+
+    res->bfi.total_clusters = s->bat_size;
+    res->bfi.compressed_clusters = 0; /* compression is not supported */
+
+    prev_off = 0;
+    for (i = 0; i < s->bat_size; i++) {
+        off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+        /*
+         * If BDRV_FIX_ERRORS is not set, out-of-image BAT entries were not
+         * fixed. Skip not allocated and out-of-image BAT entries.
+         */
+        if (off == 0 || off + s->cluster_size > res->image_end_offset) {
+            prev_off = 0;
+            continue;
+        }
+
+        if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
+            res->bfi.fragmented_clusters++;
+        }
+        prev_off = off;
+        res->bfi.allocated_clusters++;
+    }
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
+                   BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int ret;
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        parallels_check_unclean(bs, res, fix);
+
+        ret = parallels_check_outside_image(bs, res, fix);
+        if (ret < 0) {
+            return ret;
+        }
+
+        ret = parallels_check_leak(bs, res, fix);
+        if (ret < 0) {
+            return ret;
+        }
+
+        parallels_collect_statistics(bs, res, fix);
+    }
+
+    ret = bdrv_co_flush(bs);
+    if (ret < 0) {
+        res->check_errors++;
+    }
 
     return ret;
 }
@@ -733,6 +798,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVParallelsState *s = bs->opaque;
     ParallelsHeader ph;
     int ret, size, i;
+    int64_t file_nb_sectors;
     QemuOpts *opts = NULL;
     Error *local_err = NULL;
     char *buf;
@@ -742,6 +808,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 
+    file_nb_sectors = bdrv_nb_sectors(bs->file->bs);
+    if (file_nb_sectors < 0) {
+        return -EINVAL;
+    }
+
     ret = bdrv_pread(bs->file, 0, sizeof(ph), &ph, 0);
     if (ret < 0) {
         goto fail;
@@ -806,6 +877,17 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
 
     for (i = 0; i < s->bat_size; i++) {
         int64_t off = bat2sect(s, i);
+        if (off >= file_nb_sectors) {
+            if (flags & BDRV_O_CHECK) {
+                continue;
+            }
+            error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
+                       "is larger than file size (%" PRIi64 ")",
+                       off << BDRV_SECTOR_BITS, i,
+                       file_nb_sectors << BDRV_SECTOR_BITS);
+            ret = -EINVAL;
+            goto fail;
+        }
         if (off >= s->data_end) {
             s->data_end = off + s->tracks;
         }
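The rewritten parallels_co_check() relies on WITH_QEMU_LOCK_GUARD(), so each helper can simply return on error and the coroutine mutex is still released. The QEMU macro is built on the compiler's cleanup attribute; a rough standalone analogue using GCC/Clang's `__attribute__((cleanup))` with pthreads (the real macro is more general than this sketch):

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Cleanup handler: runs automatically when the guarded scope is left */
static void unlocker(pthread_mutex_t **m)
{
    pthread_mutex_unlock(*m);
    printf("unlocked on scope exit\n");
}

/* Lock on entry; the cleanup attribute unlocks on ANY exit path,
 * including early return. The body runs exactly once. */
#define WITH_LOCK_GUARD(m) \
    for (pthread_mutex_t *guard_ __attribute__((cleanup(unlocker))) = \
             (pthread_mutex_lock(m), (m)), *once_ = (m); \
         once_; once_ = NULL)

static int check_something(int fail)
{
    WITH_LOCK_GUARD(&lock) {
        if (fail) {
            return -1;   /* no explicit unlock needed */
        }
        printf("checked under lock\n");
    }
    return 0;
}

int main(void)
{
    printf("ret=%d\n", check_something(0));
    printf("ret=%d\n", check_something(1));
    return 0;
}
```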
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 39cda7f..2e76de0 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1925,6 +1925,10 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
         uint64_t new_l2_bitmap = old_l2_bitmap;
         QCow2ClusterType cluster_type =
             qcow2_get_cluster_type(bs, old_l2_entry);
+        bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
+                              !full_discard &&
+                              (s->discard_no_unref &&
+                               type == QCOW2_DISCARD_REQUEST);
 
         /*
          * If full_discard is true, the cluster should not read back as zeroes,
@@ -1943,10 +1947,22 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
             new_l2_entry = new_l2_bitmap = 0;
         } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
             if (has_subclusters(s)) {
-                new_l2_entry = 0;
+                if (keep_reference) {
+                    new_l2_entry = old_l2_entry;
+                } else {
+                    new_l2_entry = 0;
+                }
                 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
             } else {
-                new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
+                if (s->qcow_version >= 3) {
+                    if (keep_reference) {
+                        new_l2_entry |= QCOW_OFLAG_ZERO;
+                    } else {
+                        new_l2_entry = QCOW_OFLAG_ZERO;
+                    }
+                } else {
+                    new_l2_entry = 0;
+                }
             }
         }
 
@@ -1960,8 +1976,16 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
         if (has_subclusters(s)) {
             set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
         }
-        /* Then decrease the refcount */
-        qcow2_free_any_cluster(bs, old_l2_entry, type);
+        if (!keep_reference) {
+            /* Then decrease the refcount */
+            qcow2_free_any_cluster(bs, old_l2_entry, type);
+        } else if (s->discard_passthrough[type] &&
+                   (cluster_type == QCOW2_CLUSTER_NORMAL ||
+                    cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+            /* If we keep the reference, pass on the discard still */
+            bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
+                          s->cluster_size);
+        }
     }
 
     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
diff --git a/block/qcow2.c b/block/qcow2.c
index 7f39483..e23edd4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -682,6 +682,7 @@ static const char *const mutable_opts[] = {
     QCOW2_OPT_DISCARD_REQUEST,
     QCOW2_OPT_DISCARD_SNAPSHOT,
     QCOW2_OPT_DISCARD_OTHER,
+    QCOW2_OPT_DISCARD_NO_UNREF,
     QCOW2_OPT_OVERLAP,
     QCOW2_OPT_OVERLAP_TEMPLATE,
     QCOW2_OPT_OVERLAP_MAIN_HEADER,
@@ -727,6 +728,11 @@ static QemuOptsList qcow2_runtime_opts = {
             .help = "Generate discard requests when other clusters are freed",
         },
         {
+            .name = QCOW2_OPT_DISCARD_NO_UNREF,
+            .type = QEMU_OPT_BOOL,
+            .help = "Do not unreference discarded clusters",
+        },
+        {
             .name = QCOW2_OPT_OVERLAP,
             .type = QEMU_OPT_STRING,
             .help = "Selects which overlap checks to perform from a range of "
@@ -969,6 +975,7 @@ typedef struct Qcow2ReopenState {
     bool use_lazy_refcounts;
     int overlap_check;
     bool discard_passthrough[QCOW2_DISCARD_MAX];
+    bool discard_no_unref;
     uint64_t cache_clean_interval;
     QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
 } Qcow2ReopenState;
@@ -1140,6 +1147,15 @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
     r->discard_passthrough[QCOW2_DISCARD_OTHER] =
         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
 
+    r->discard_no_unref = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_NO_UNREF,
+                                            false);
+    if (r->discard_no_unref && s->qcow_version < 3) {
+        error_setg(errp,
+                   "discard-no-unref is only supported since qcow2 version 3");
+        ret = -EINVAL;
+        goto fail;
+    }
+
     switch (s->crypt_method_header) {
     case QCOW_CRYPT_NONE:
         if (encryptfmt) {
@@ -1220,6 +1236,8 @@ static void qcow2_update_options_commit(BlockDriverState *bs,
         s->discard_passthrough[i] = r->discard_passthrough[i];
     }
 
+    s->discard_no_unref = r->discard_no_unref;
+
     if (s->cache_clean_interval != r->cache_clean_interval) {
         cache_clean_timer_del(bs);
         s->cache_clean_interval = r->cache_clean_interval;
diff --git a/block/qcow2.h b/block/qcow2.h
index 4f67eb9..ea9adb5 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -133,6 +133,7 @@
 #define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
 #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
 #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+#define QCOW2_OPT_DISCARD_NO_UNREF "discard-no-unref"
 #define QCOW2_OPT_OVERLAP "overlap-check"
 #define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
 #define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
@@ -385,6 +386,8 @@ typedef struct BDRVQcow2State {
 
     bool discard_passthrough[QCOW2_DISCARD_MAX];
 
+    bool discard_no_unref;
+
     int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
     bool signaled_corruption;
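A condensed model of the new decision in discard_in_l2_slice(): with discard-no-unref enabled, a guest discard still makes the range read back as zeroes, but the L2 entry keeps its host-cluster reference (note the `|=` versus `=` in the diff, which preserves the host offset bits). The constants and helper below are simplified stand-ins, not the qcow2 code:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OFLAG_ZERO (1ull << 0)   /* stand-in for QCOW_OFLAG_ZERO */

static uint64_t discard_entry(uint64_t old_entry, bool compressed,
                              bool full_discard, bool no_unref,
                              bool from_guest, bool *unref)
{
    bool keep_reference = !compressed && !full_discard &&
                          no_unref && from_guest;

    *unref = !keep_reference;
    if (full_discard) {
        return 0;                      /* read from the backing file again */
    }
    if (keep_reference) {
        return old_entry | OFLAG_ZERO; /* reads as zero, cluster kept */
    }
    return OFLAG_ZERO;                 /* reads as zero, cluster freed */
}

int main(void)
{
    bool unref;
    uint64_t e;

    e = discard_entry(0x50000, false, false, true, true, &unref);
    printf("no_unref=on:  entry=0x%llx unref=%d\n",
           (unsigned long long)e, unref);

    e = discard_entry(0x50000, false, false, false, true, &unref);
    printf("no_unref=off: entry=0x%llx unref=%d\n",
           (unsigned long long)e, unref);
    return 0;
}
```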
diff --git a/include/qemu/iov.h b/include/qemu/iov.h
index 9330746..63a1c01 100644
--- a/include/qemu/iov.h
+++ b/include/qemu/iov.h
@@ -222,13 +222,11 @@ static inline void *qemu_iovec_buf(QEMUIOVector *qiov)
 
 void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
-int qemu_iovec_init_extended(
-        QEMUIOVector *qiov,
-        void *head_buf, size_t head_len,
-        QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
-        void *tail_buf, size_t tail_len);
 void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
                            size_t offset, size_t len);
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+                               size_t offset, size_t len,
+                               size_t *head, size_t *tail, int *niov);
 int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
 void qemu_iovec_concat(QEMUIOVector *dst,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 4bf8917..5dd5f7e 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3478,6 +3478,17 @@
 # @pass-discard-other: whether discard requests for the data source
 #     should be issued on other occasions where a cluster gets freed
 #
+# @discard-no-unref: when enabled, discards from the guest will not
+#     cause cluster allocations to be relinquished.  This prevents
+#     qcow2 fragmentation that would be caused by such discards.
+#     Besides potential performance degradation, such fragmentation
+#     can lead to increased allocation of clusters past the end of the
+#     image file, resulting in image files whose file length can grow
+#     much larger than their guest disk size would suggest.  If image
+#     file length is of concern (e.g. when storing qcow2 images
+#     directly on block devices), you should consider enabling this
+#     option.  (since 8.1)
+#
 # @overlap-check: which overlap checks to perform for writes to the
 #     image, defaults to 'cached' (since 2.2)
 #
@@ -3516,6 +3527,7 @@
             '*pass-discard-request': 'bool',
             '*pass-discard-snapshot': 'bool',
             '*pass-discard-other': 'bool',
+            '*discard-no-unref': 'bool',
             '*overlap-check': 'Qcow2OverlapChecks',
             '*cache-size': 'int',
             '*l2-cache-size': 'int',
diff --git a/qemu-options.hx b/qemu-options.hx
index b37eb96..b57489d 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1431,6 +1431,18 @@ SRST
         issued on other occasions where a cluster gets freed
         (on/off; default: off)
 
+    ``discard-no-unref``
+        When enabled, discards from the guest will not cause cluster
+        allocations to be relinquished. This prevents qcow2 fragmentation
+        that would be caused by such discards. Besides potential
+        performance degradation, such fragmentation can lead to increased
+        allocation of clusters past the end of the image file,
+        resulting in image files whose file length can grow much larger
+        than their guest disk size would suggest.
+        If image file length is of concern (e.g. when storing qcow2
+        images directly on block devices), you should consider enabling
+        this option.
+
     ``overlap-check``
         Which overlap checks to perform for writes to the image
        (none/constant/cached/all; default: cached). For details or
diff --git a/tests/qemu-iotests/tests/iov-padding b/tests/qemu-iotests/tests/iov-padding
new file mode 100755
index 0000000..b960490
--- /dev/null
+++ b/tests/qemu-iotests/tests/iov-padding
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# group: rw quick
+#
+# Check the interaction of request padding (to fit alignment restrictions) with
+# vectored I/O from the guest
+#
+# Copyright Red Hat
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+seq=$(basename $0)
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+cd ..
+. ./common.rc
+. ./common.filter
+
+_supported_fmt raw
+_supported_proto file
+
+_make_test_img 1M
+
+IMGSPEC="driver=blkdebug,align=4096,image.driver=file,image.filename=$TEST_IMG"
+
+# Four combinations:
+# - Offset 4096, length 1023 * 512 + 512: Fully aligned to 4k
+# - Offset 4096, length 1023 * 512 + 4096: Head is aligned, tail is not
+# - Offset 512, length 1023 * 512 + 512: Neither head nor tail are aligned
+# - Offset 512, length 1023 * 512 + 4096: Tail is aligned, head is not
+for start_offset in 4096 512; do
+    for last_element_length in 512 4096; do
+        length=$((1023 * 512 + $last_element_length))
+
+        echo
+        echo "== performing 1024-element vectored requests to image (offset: $start_offset; length: $length) =="
+
+        # Fill with data for testing
+        $QEMU_IO -c 'write -P 1 0 1M' "$TEST_IMG" | _filter_qemu_io
+
+        # 1023 512-byte buffers, and then one with length $last_element_length
+        cmd_params="-P 2 $start_offset $(yes 512 | head -n 1023 | tr '\n' ' ') $last_element_length"
+        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
+            -c "writev $cmd_params" \
+            --image-opts \
+            "$IMGSPEC" \
+            | _filter_qemu_io
+
+        # Read all patterns -- read the part we just wrote with writev twice,
+        # once "normally", and once with a readv, so we see that that works, too
+        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
+            -c "read -P 1 0 $start_offset" \
+            -c "read -P 2 $start_offset $length" \
+            -c "readv $cmd_params" \
+            -c "read -P 1 $((start_offset + length)) $((1024 * 1024 - length - start_offset))" \
+            --image-opts \
+            "$IMGSPEC" \
+            | _filter_qemu_io
+    done
+done
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
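Recomputing the test's four geometries (alignment 4096 via blkdebug, and assuming IOV_MAX is 1024 as on Linux) shows which combinations force the new collapse path, namely every case where head or tail padding bumps the 1024-element vector over the limit:

```c
#include <stdio.h>

int main(void)
{
    const int align = 4096;      /* blkdebug align=4096 from the test */
    const int iov_max = 1024;    /* assumption: Linux's IOV_MAX */
    int starts[] = { 4096, 512 };
    int lasts[] = { 512, 4096 };

    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            long offset = starts[i];
            long length = 1023L * 512 + lasts[j];
            int head = offset % align != 0;            /* head padding? */
            int tail = (offset + length) % align != 0; /* tail padding? */
            int padded = head + 1024 + tail;           /* 1024 elements */

            printf("offset=%-4ld length=%-6ld head=%d tail=%d "
                   "padded_niov=%d%s\n", offset, length, head, tail,
                   padded, padded > iov_max ? "  -> collapse" : "");
        }
    }
    return 0;
}
```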
diff --git a/tests/qemu-iotests/tests/iov-padding.out b/tests/qemu-iotests/tests/iov-padding.out
new file mode 100644
index 0000000..e07a91f
--- /dev/null
+++ b/tests/qemu-iotests/tests/iov-padding.out
@@ -0,0 +1,59 @@
+QA output created by iov-padding
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
+
+== performing 1024-element vectored requests to image (offset: 4096; length: 524288) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 4096
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 520192/520192 bytes at offset 528384
+508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 4096; length: 527872) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 4096
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 516608/516608 bytes at offset 531968
+504.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 512; length: 524288) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 524288/524288 bytes at offset 512
+512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 523776/523776 bytes at offset 524800
+511.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== performing 1024-element vectored requests to image (offset: 512; length: 527872) ==
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 527872/527872 bytes at offset 512
+515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 520192/520192 bytes at offset 528384
+508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/util/iov.c b/util/iov.c
--- a/util/iov.c
+++ b/util/iov.c
@@ -378,15 +378,15 @@ static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset,
 }
 
 /*
- * qiov_slice
+ * qemu_iovec_slice
  *
  * Find subarray of iovec's, containing requested range. @head would
  * be offset in first iov (returned by the function), @tail would be
  * count of extra bytes in last iovec (returned iov + @niov - 1).
  */
-static struct iovec *qiov_slice(QEMUIOVector *qiov,
-                                size_t offset, size_t len,
-                                size_t *head, size_t *tail, int *niov)
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+                               size_t offset, size_t len,
+                               size_t *head, size_t *tail, int *niov)
 {
     struct iovec *iov, *end_iov;
 
@@ -411,76 +411,12 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len)
     size_t head, tail;
     int niov;
 
-    qiov_slice(qiov, offset, len, &head, &tail, &niov);
+    qemu_iovec_slice(qiov, offset, len, &head, &tail, &niov);
 
     return niov;
 }
 
 /*
- * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov,
- * and @tail_buf buffer into new qiov.
- */
-int qemu_iovec_init_extended(
-        QEMUIOVector *qiov,
-        void *head_buf, size_t head_len,
-        QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
-        void *tail_buf, size_t tail_len)
-{
-    size_t mid_head, mid_tail;
-    int total_niov, mid_niov = 0;
-    struct iovec *p, *mid_iov = NULL;
-
-    assert(mid_qiov->niov <= IOV_MAX);
-
-    if (SIZE_MAX - head_len < mid_len ||
-        SIZE_MAX - head_len - mid_len < tail_len)
-    {
-        return -EINVAL;
-    }
-
-    if (mid_len) {
-        mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
-                             &mid_head, &mid_tail, &mid_niov);
-    }
-
-    total_niov = !!head_len + mid_niov + !!tail_len;
-    if (total_niov > IOV_MAX) {
-        return -EINVAL;
-    }
-
-    if (total_niov == 1) {
-        qemu_iovec_init_buf(qiov, NULL, 0);
-        p = &qiov->local_iov;
-    } else {
-        qiov->niov = qiov->nalloc = total_niov;
-        qiov->size = head_len + mid_len + tail_len;
-        p = qiov->iov = g_new(struct iovec, qiov->niov);
-    }
-
-    if (head_len) {
-        p->iov_base = head_buf;
-        p->iov_len = head_len;
-        p++;
-    }
-
-    assert(!mid_niov == !mid_len);
-    if (mid_niov) {
-        memcpy(p, mid_iov, mid_niov * sizeof(*p));
-        p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
-        p[0].iov_len -= mid_head;
-        p[mid_niov - 1].iov_len -= mid_tail;
-        p += mid_niov;
-    }
-
-    if (tail_len) {
-        p->iov_base = tail_buf;
-        p->iov_len = tail_len;
-    }
-
-    return 0;
-}
-
-/*
  * Check if the contents of subrange of qiov data is all zeroes.
  */
 bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
@@ -511,14 +447,21 @@ bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
 void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
                            size_t offset, size_t len)
 {
-    int ret;
+    struct iovec *slice_iov;
+    int slice_niov;
+    size_t slice_head, slice_tail;
 
     assert(source->size >= len);
     assert(source->size - len >= offset);
 
-    /* We shrink the request, so we can't overflow neither size_t nor MAX_IOV */
-    ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
-    assert(ret == 0);
+    slice_iov = qemu_iovec_slice(source, offset, len,
+                                 &slice_head, &slice_tail, &slice_niov);
+    if (slice_niov == 1) {
+        qemu_iovec_init_buf(qiov, slice_iov[0].iov_base + slice_head, len);
+    } else {
+        qemu_iovec_init(qiov, slice_niov);
+        qemu_iovec_concat_iov(qiov, slice_iov, slice_niov, slice_head, len);
+    }
 }
 
 void qemu_iovec_destroy(QEMUIOVector *qiov)
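To close, a self-contained model of the contract of the newly public qemu_iovec_slice(): given a byte range, find the covering subarray of iovec elements, plus @head (bytes to skip in the first returned element) and @tail (surplus bytes in the last one). This is a simplified re-implementation for illustration, not the QEMU source:

```c
#include <stdio.h>
#include <sys/uio.h>

static struct iovec *iov_slice(struct iovec *iov, int niov,
                               size_t offset, size_t len,
                               size_t *head, size_t *tail, int *slice_niov)
{
    /* Skip whole elements that lie entirely before @offset */
    while (offset >= iov->iov_len) {
        offset -= iov->iov_len;
        iov++;
        niov--;
    }
    *head = offset;

    /* Count elements until @len bytes (after @head) are covered */
    size_t covered = iov->iov_len - offset;
    int n = 1;
    while (covered < len) {
        covered += iov[n].iov_len;
        n++;
    }
    *tail = covered - len;
    *slice_niov = n;
    return iov;
}

int main(void)
{
    char buf[24];
    struct iovec v[3] = {
        { buf, 8 }, { buf + 8, 8 }, { buf + 16, 8 },
    };
    size_t head, tail;
    int niov;

    /* Slice bytes [5, 19) out of three 8-byte elements */
    struct iovec *s = iov_slice(v, 3, 5, 14, &head, &tail, &niov);
    printf("first element index=%ld niov=%d head=%zu tail=%zu\n",
           (long)(s - v), niov, head, tail); /* index=0 niov=3 head=5 tail=5 */
    return 0;
}
```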