84 files changed, 2232 insertions, 486 deletions
diff --git a/block/block-backend.c b/block/block-backend.c
index eb22ff3..8b8f2a8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1178,9 +1178,10 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
     return ret;
 }
 
-int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                unsigned int bytes, QEMUIOVector *qiov,
-                                BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
+                                     unsigned int bytes,
+                                     QEMUIOVector *qiov, size_t qiov_offset,
+                                     BdrvRequestFlags flags)
 {
     int ret;
     BlockDriverState *bs;
@@ -1207,11 +1208,19 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
         flags |= BDRV_REQ_FUA;
     }
 
-    ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
+                               flags);
     bdrv_dec_in_flight(bs);
     return ret;
 }
 
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
+{
+    return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
+}
+
 typedef struct BlkRwCo {
     BlockBackend *blk;
     int64_t offset;
@@ -2063,15 +2072,15 @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
                    BDRV_REQ_WRITE_COMPRESSED);
 }
 
-int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
-                 Error **errp)
+int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
+                 PreallocMode prealloc, Error **errp)
 {
     if (!blk_is_available(blk)) {
         error_setg(errp, "No medium inserted");
         return -ENOMEDIUM;
     }
 
-    return bdrv_truncate(blk->root, offset, prealloc, errp);
+    return bdrv_truncate(blk->root, offset, exact, prealloc, errp);
 }
 
 static void blk_pdiscard_entry(void *opaque)
diff --git a/block/block-copy.c b/block/block-copy.c
index 066e3a7..c39cc9c 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -18,6 +18,11 @@
 #include "qapi/error.h"
 #include "block/block-copy.h"
 #include "sysemu/block-backend.h"
+#include "qemu/units.h"
+
+#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
+#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
+#define BLOCK_COPY_MAX_MEM (128 * MiB)
 
 static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                        int64_t start,
@@ -61,6 +66,7 @@ void block_copy_state_free(BlockCopyState *s)
     }
 
     bdrv_release_dirty_bitmap(s->copy_bitmap);
+    shres_destroy(s->mem);
     g_free(s);
 }
 
@@ -71,8 +77,9 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
     BlockCopyState *s;
     BdrvDirtyBitmap *copy_bitmap;
     uint32_t max_transfer =
-            MIN_NON_ZERO(INT_MAX, MIN_NON_ZERO(source->bs->bl.max_transfer,
-                                               target->bs->bl.max_transfer));
+            MIN_NON_ZERO(INT_MAX,
+                         MIN_NON_ZERO(source->bs->bl.max_transfer,
+                                      target->bs->bl.max_transfer));
 
     copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                            errp);
@@ -89,19 +96,31 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
         .cluster_size = cluster_size,
         .len = bdrv_dirty_bitmap_size(copy_bitmap),
         .write_flags = write_flags,
+        .mem = shres_create(BLOCK_COPY_MAX_MEM),
     };
 
-    s->copy_range_size = QEMU_ALIGN_DOWN(max_transfer, cluster_size),
-    /*
-     * Set use_copy_range, consider the following:
-     * 1. Compression is not supported for copy_range.
-     * 2. copy_range does not respect max_transfer (it's a TODO), so we factor
-     *    that in here. If max_transfer is smaller than the job->cluster_size,
-     *    we do not use copy_range (in that case it's zero after aligning down
-     *    above).
-     */
-    s->use_copy_range =
-        !(write_flags & BDRV_REQ_WRITE_COMPRESSED) && s->copy_range_size > 0;
+    if (max_transfer < cluster_size) {
+        /*
+         * copy_range does not respect max_transfer. We don't want to bother
+         * with requests smaller than block-copy cluster size, so fall back to
+         * buffered copying (read and write respect max_transfer on their
+         * own).
+         */
+        s->use_copy_range = false;
+        s->copy_size = cluster_size;
+    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
+        /* Compression is not supported for copy_range */
+        s->use_copy_range = false;
+        s->copy_size = MAX(cluster_size, BLOCK_COPY_MAX_BUFFER);
+    } else {
+        /*
+         * copy_range does not respect max_transfer (it's a TODO), so we factor
+         * that in here.
+         */
+        s->use_copy_range = true;
+        s->copy_size = MIN(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+                           QEMU_ALIGN_DOWN(max_transfer, cluster_size));
+    }
 
     QLIST_INIT(&s->inflight_reqs);
 
@@ -120,79 +139,71 @@ void block_copy_set_callbacks(
 }
 
 /*
- * Copy range to target with a bounce buffer and return the bytes copied. If
- * error occurred, return a negative error number
+ * block_copy_do_copy
+ *
+ * Do copy of cluster-aligned chunk. @end is allowed to exceed s->len only to
+ * cover last cluster when s->len is not aligned to clusters.
+ *
+ * No sync here: neither bitmap nor intersecting requests handling, only copy.
+ *
+ * Returns 0 on success.
  */
-static int coroutine_fn block_copy_with_bounce_buffer(BlockCopyState *s,
-                                                      int64_t start,
-                                                      int64_t end,
-                                                      bool *error_is_read,
-                                                      void **bounce_buffer)
+static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
+                                           int64_t start, int64_t end,
+                                           bool *error_is_read)
 {
     int ret;
-    int nbytes;
+    int nbytes = MIN(end, s->len) - start;
+    void *bounce_buffer = NULL;
 
     assert(QEMU_IS_ALIGNED(start, s->cluster_size));
-    bdrv_reset_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
-    nbytes = MIN(s->cluster_size, s->len - start);
-    if (!*bounce_buffer) {
-        *bounce_buffer = qemu_blockalign(s->source->bs, s->cluster_size);
+    assert(QEMU_IS_ALIGNED(end, s->cluster_size));
+    assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));
+
+    if (s->use_copy_range) {
+        ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
+                                 0, s->write_flags);
+        if (ret < 0) {
+            trace_block_copy_copy_range_fail(s, start, ret);
+            s->use_copy_range = false;
+            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
+            /* Fall back to read+write with allocated buffer */
+        } else {
+            goto out;
+        }
     }
 
-    ret = bdrv_co_pread(s->source, start, nbytes, *bounce_buffer, 0);
+    /*
+     * In case of failed copy_range request above, we may proceed with buffered
+     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
+     * be properly limited, so don't care too much.
+     */
+
+    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
+
+    ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
     if (ret < 0) {
-        trace_block_copy_with_bounce_buffer_read_fail(s, start, ret);
+        trace_block_copy_read_fail(s, start, ret);
         if (error_is_read) {
             *error_is_read = true;
         }
-        goto fail;
+        goto out;
     }
 
-    ret = bdrv_co_pwrite(s->target, start, nbytes, *bounce_buffer,
+    ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                          s->write_flags);
     if (ret < 0) {
-        trace_block_copy_with_bounce_buffer_write_fail(s, start, ret);
+        trace_block_copy_write_fail(s, start, ret);
         if (error_is_read) {
             *error_is_read = false;
         }
-        goto fail;
+        goto out;
     }
 
-    return nbytes;
-fail:
-    bdrv_set_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
-    return ret;
-
-}
-
-/*
- * Copy range to target and return the bytes copied. If error occurred, return a
- * negative error number.
- */
-static int coroutine_fn block_copy_with_offload(BlockCopyState *s,
-                                                int64_t start,
-                                                int64_t end)
-{
-    int ret;
-    int nr_clusters;
-    int nbytes;
-
-    assert(QEMU_IS_ALIGNED(s->copy_range_size, s->cluster_size));
-    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
-    nbytes = MIN(s->copy_range_size, MIN(end, s->len) - start);
-    nr_clusters = DIV_ROUND_UP(nbytes, s->cluster_size);
-    bdrv_reset_dirty_bitmap(s->copy_bitmap, start,
-                            s->cluster_size * nr_clusters);
-    ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
-                             0, s->write_flags);
-    if (ret < 0) {
-        trace_block_copy_with_offload_fail(s, start, ret);
-        bdrv_set_dirty_bitmap(s->copy_bitmap, start,
-                              s->cluster_size * nr_clusters);
-        return ret;
-    }
+out:
+    qemu_vfree(bounce_buffer);
 
-    return nbytes;
+    return ret;
 }
 
 /*
@@ -271,7 +282,6 @@ int coroutine_fn block_copy(BlockCopyState *s,
 {
     int ret = 0;
     int64_t end = bytes + start; /* bytes */
-    void *bounce_buffer = NULL;
     int64_t status_bytes;
     BlockCopyInFlightReq req;
 
@@ -289,7 +299,7 @@ int coroutine_fn block_copy(BlockCopyState *s,
     block_copy_inflight_req_begin(s, &req, start, end);
 
     while (start < end) {
-        int64_t dirty_end;
+        int64_t next_zero, chunk_end;
 
         if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
             trace_block_copy_skip(s, start);
@@ -297,10 +307,14 @@ int coroutine_fn block_copy(BlockCopyState *s,
             continue; /* already copied */
         }
 
-        dirty_end = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
-                                                (end - start));
-        if (dirty_end < 0) {
-            dirty_end = end;
+        chunk_end = MIN(end, start + s->copy_size);
+
+        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
+                                                chunk_end - start);
+        if (next_zero >= 0) {
+            assert(next_zero > start); /* start is dirty */
+            assert(next_zero < chunk_end); /* no need to do MIN() */
+            chunk_end = next_zero;
         }
 
         if (s->skip_unallocated) {
@@ -311,34 +325,26 @@ int coroutine_fn block_copy(BlockCopyState *s,
                 continue;
             }
             /* Clamp to known allocated region */
-            dirty_end = MIN(dirty_end, start + status_bytes);
+            chunk_end = MIN(chunk_end, start + status_bytes);
         }
 
         trace_block_copy_process(s, start);
 
-        if (s->use_copy_range) {
-            ret = block_copy_with_offload(s, start, dirty_end);
-            if (ret < 0) {
-                s->use_copy_range = false;
-            }
-        }
-        if (!s->use_copy_range) {
-            ret = block_copy_with_bounce_buffer(s, start, dirty_end,
-                                                error_is_read, &bounce_buffer);
-        }
+        bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
+
+        co_get_from_shres(s->mem, chunk_end - start);
+        ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
+        co_put_to_shres(s->mem, chunk_end - start);
         if (ret < 0) {
+            bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
             break;
         }
 
-        start += ret;
-        s->progress_bytes_callback(ret, s->progress_opaque);
+        s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
+        start = chunk_end;
         ret = 0;
     }
 
-    if (bounce_buffer) {
-        qemu_vfree(bounce_buffer);
-    }
-
     block_copy_inflight_req_end(&req);
 
     return ret;
diff --git a/block/commit.c b/block/commit.c
index bc84544..23c90b3 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -155,7 +155,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
     }
 
     if (base_len < len) {
-        ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL);
+        ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, NULL);
         if (ret) {
             goto out;
         }
@@ -471,7 +471,8 @@ int bdrv_commit(BlockDriverState *bs)
      * grow the backing file image if possible.  If not possible,
      * we must return an error */
     if (length > backing_length) {
-        ret = blk_truncate(backing, length, PREALLOC_MODE_OFF, &local_err);
+        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF,
+                           &local_err);
         if (ret < 0) {
             error_report_err(local_err);
             goto ro_cleanup;
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 6631f30..e95223d 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -73,13 +73,6 @@ static int64_t cor_getlength(BlockDriverState *bs)
 }
 
 
-static int coroutine_fn cor_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
-{
-    return bdrv_co_truncate(bs->file, offset, prealloc, errp);
-}
-
-
 static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
                                       uint64_t offset, uint64_t bytes,
                                       QEMUIOVector *qiov, int flags)
@@ -139,7 +132,6 @@ static BlockDriver bdrv_copy_on_read = {
     .bdrv_child_perm                    = cor_child_perm,
 
     .bdrv_getlength                     = cor_getlength,
-    .bdrv_co_truncate                   = cor_co_truncate,
 
     .bdrv_co_preadv                     = cor_co_preadv,
     .bdrv_co_pwritev                    = cor_co_pwritev,
diff --git a/block/crypto.c b/block/crypto.c
index 7eb6987..2482383 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -113,8 +113,8 @@ static ssize_t block_crypto_init_func(QCryptoBlock *block,
      * available to the guest, so we must take account of that
      * which will be used by the crypto header
      */
-    return blk_truncate(data->blk, data->size + headerlen, data->prealloc,
-                        errp);
+    return blk_truncate(data->blk, data->size + headerlen, false,
+                        data->prealloc, errp);
 }
 
 
@@ -297,7 +297,7 @@ static int block_crypto_co_create_generic(BlockDriverState *bs,
 }
 
 static int coroutine_fn
-block_crypto_co_truncate(BlockDriverState *bs, int64_t offset,
+block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
                          PreallocMode prealloc, Error **errp)
 {
     BlockCrypto *crypto = bs->opaque;
@@ -311,7 +311,7 @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset,
 
     offset += payload_offset;
 
-    return bdrv_co_truncate(bs->file, offset, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
 }
 
 static void block_crypto_close(BlockDriverState *bs)
diff --git a/block/file-posix.c b/block/file-posix.c
index 5d1995a..0b7e904 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2020,7 +2020,8 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
 }
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVRawState *s = bs->opaque;
     struct stat st;
@@ -2033,6 +2034,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
     }
 
     if (S_ISREG(st.st_mode)) {
+        /* Always resizes to the exact @offset */
         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
     }
 
@@ -2043,7 +2045,12 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
     }
 
     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
-        if (offset > raw_getlength(bs)) {
+        int64_t cur_length = raw_getlength(bs);
+
+        if (offset != cur_length && exact) {
+            error_setg(errp, "Cannot resize device files");
+            return -ENOTSUP;
+        } else if (offset > cur_length) {
             error_setg(errp, "Cannot grow device files");
             return -EINVAL;
         }
diff --git a/block/file-win32.c b/block/file-win32.c
index 41f55df..77e8ff7 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -468,7 +468,8 @@ static void raw_close(BlockDriverState *bs)
 }
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVRawState *s = bs->opaque;
     LONG low, high;
diff --git a/block/gluster.c b/block/gluster.c
index 64028b2..4fa4a77 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -1225,6 +1225,7 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
 
 static coroutine_fn int qemu_gluster_co_truncate(BlockDriverState *bs,
                                                  int64_t offset,
+                                                 bool exact,
                                                  PreallocMode prealloc,
                                                  Error **errp)
 {
diff --git a/block/io.c b/block/io.c
index e46d9e8..02659f9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3291,8 +3291,12 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
 
 /**
  * Truncate file to 'offset' bytes (needed only for file protocols)
+ *
+ * If 'exact' is true, the file must be resized to exactly the given
+ * 'offset'.  Otherwise, it is sufficient for the node to be at least
+ * 'offset' bytes in length.
  */
-int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
+int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                   PreallocMode prealloc, Error **errp)
 {
     BlockDriverState *bs = child->bs;
@@ -3347,20 +3351,19 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
         goto out;
     }
 
-    if (!drv->bdrv_co_truncate) {
-        if (bs->file && drv->is_filter) {
-            ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
-            goto out;
-        }
+    if (drv->bdrv_co_truncate) {
+        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp);
+    } else if (bs->file && drv->is_filter) {
+        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
+    } else {
         error_setg(errp, "Image format driver does not support resize");
         ret = -ENOTSUP;
         goto out;
     }
-
-    ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
     if (ret < 0) {
         goto out;
     }
+
     ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "Could not refresh total sector count");
@@ -3382,6 +3385,7 @@ out:
 typedef struct TruncateCo {
     BdrvChild *child;
     int64_t offset;
+    bool exact;
     PreallocMode prealloc;
     Error **errp;
     int ret;
@@ -3390,18 +3394,19 @@ typedef struct TruncateCo {
 static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
 {
     TruncateCo *tco = opaque;
-    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
-                                tco->errp);
+    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
+                                tco->prealloc, tco->errp);
     aio_wait_kick();
 }
 
-int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
-                  Error **errp)
+int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
+                  PreallocMode prealloc, Error **errp)
 {
     Coroutine *co;
     TruncateCo tco = {
         .child      = child,
         .offset     = offset,
+        .exact      = exact,
         .prealloc   = prealloc,
         .errp       = errp,
         .ret        = NOT_DONE,
diff --git a/block/iscsi.c b/block/iscsi.c
index 2ced150..2aea7e3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2123,9 +2123,11 @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
 }
 
 static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
-                                          PreallocMode prealloc, Error **errp)
+                                          bool exact, PreallocMode prealloc,
+                                          Error **errp)
 {
     IscsiLun *iscsilun = bs->opaque;
+    int64_t cur_length;
     Error *local_err = NULL;
 
     if (prealloc != PREALLOC_MODE_OFF) {
@@ -2145,7 +2147,11 @@ static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
         return -EIO;
     }
 
-    if (offset > iscsi_getlength(bs)) {
+    cur_length = iscsi_getlength(bs);
+    if (offset != cur_length && exact) {
+        error_setg(errp, "Cannot resize iSCSI devices");
+        return -ENOTSUP;
+    } else if (offset > cur_length) {
         error_setg(errp, "Cannot grow iSCSI devices");
         return -EINVAL;
     }
diff --git a/block/mirror.c b/block/mirror.c
index a6c50ca..f0f2d9d 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -620,11 +620,11 @@ static int mirror_exit_common(Job *job)
 {
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
     BlockJob *bjob = &s->common;
-    MirrorBDSOpaque *bs_opaque = s->mirror_top_bs->opaque;
+    MirrorBDSOpaque *bs_opaque;
     AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->mirror_top_bs->backing->bs;
-    BlockDriverState *target_bs = blk_bs(s->target);
-    BlockDriverState *mirror_top_bs = s->mirror_top_bs;
+    BlockDriverState *src;
+    BlockDriverState *target_bs;
+    BlockDriverState *mirror_top_bs;
     Error *local_err = NULL;
     bool abort = job->ret < 0;
     int ret = 0;
@@ -634,6 +634,11 @@ static int mirror_exit_common(Job *job)
     }
     s->prepared = true;
 
+    mirror_top_bs = s->mirror_top_bs;
+    bs_opaque = mirror_top_bs->opaque;
+    src = mirror_top_bs->backing->bs;
+    target_bs = blk_bs(s->target);
+
     if (bdrv_chain_contains(src, target_bs)) {
         bdrv_unfreeze_backing_chain(mirror_top_bs, target_bs);
     }
@@ -873,8 +878,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
         }
 
         if (s->bdev_length > base_length) {
-            ret = blk_truncate(s->target, s->bdev_length, PREALLOC_MODE_OFF,
-                               NULL);
+            ret = blk_truncate(s->target, s->bdev_length, false,
+                               PREALLOC_MODE_OFF, NULL);
             if (ret < 0) {
                 goto immediate_exit;
             }
@@ -1181,84 +1186,107 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
                      uint64_t offset, uint64_t bytes,
                      QEMUIOVector *qiov, int flags)
 {
-    QEMUIOVector target_qiov;
-    uint64_t dirty_offset = offset;
-    uint64_t dirty_bytes;
+    int ret;
+    size_t qiov_offset = 0;
+    int64_t bitmap_offset, bitmap_end;
 
-    if (qiov) {
-        qemu_iovec_init(&target_qiov, qiov->niov);
+    if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
+        bdrv_dirty_bitmap_get(job->dirty_bitmap, offset))
+    {
+            /*
+             * Dirty unaligned padding: ignore it.
+             *
+             * Reasoning:
+             * 1. If we copy it, we can't reset corresponding bit in
+             *    dirty_bitmap as there may be some "dirty" bytes still not
+             *    copied.
+             * 2. It's already dirty, so skipping it we don't diverge mirror
+             *    progress.
+             *
+             * Note that because of this, a guest write may not contribute
+             * to mirror convergence, but that's not bad, as we have the
+             * background mirroring process. If under some bad circumstances
+             * (high guest IO load) the background process starves, we will
+             * not converge anyway, even if each write contributed, as the
+             * guest is not guaranteed to rewrite the whole disk.
+             */
+            qiov_offset = QEMU_ALIGN_UP(offset, job->granularity) - offset;
+            if (bytes <= qiov_offset) {
+                /* nothing to do after shrink */
+                return;
+            }
+            offset += qiov_offset;
+            bytes -= qiov_offset;
     }
 
-    while (true) {
-        bool valid_area;
-        int ret;
+    if (!QEMU_IS_ALIGNED(offset + bytes, job->granularity) &&
+        bdrv_dirty_bitmap_get(job->dirty_bitmap, offset + bytes - 1))
+    {
+        uint64_t tail = (offset + bytes) % job->granularity;
 
-        bdrv_dirty_bitmap_lock(job->dirty_bitmap);
-        dirty_bytes = MIN(offset + bytes - dirty_offset, INT_MAX);
-        valid_area = bdrv_dirty_bitmap_next_dirty_area(job->dirty_bitmap,
-                                                       &dirty_offset,
-                                                       &dirty_bytes);
-        if (!valid_area) {
-            bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
-            break;
+        if (bytes <= tail) {
+            /* nothing to do after shrink */
+            return;
         }
+        bytes -= tail;
+    }
 
-        bdrv_reset_dirty_bitmap_locked(job->dirty_bitmap,
-                                       dirty_offset, dirty_bytes);
-        bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
-
-        job_progress_increase_remaining(&job->common.job, dirty_bytes);
-
-        assert(dirty_offset - offset <= SIZE_MAX);
-        if (qiov) {
-            qemu_iovec_reset(&target_qiov);
-            qemu_iovec_concat(&target_qiov, qiov,
-                              dirty_offset - offset, dirty_bytes);
-        }
+    /*
+     * Tails are either clean or shrunk, so for bitmap resetting
+     * we can safely shrink the range inward to aligned boundaries.
+     */
+    bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
+    bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
+    if (bitmap_offset < bitmap_end) {
+        bdrv_reset_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
+                                bitmap_end - bitmap_offset);
+    }
 
-        switch (method) {
-        case MIRROR_METHOD_COPY:
-            ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes,
-                                 qiov ? &target_qiov : NULL, flags);
-            break;
+    job_progress_increase_remaining(&job->common.job, bytes);
 
-        case MIRROR_METHOD_ZERO:
-            assert(!qiov);
-            ret = blk_co_pwrite_zeroes(job->target, dirty_offset, dirty_bytes,
-                                       flags);
-            break;
+    switch (method) {
+    case MIRROR_METHOD_COPY:
+        ret = blk_co_pwritev_part(job->target, offset, bytes,
+                                  qiov, qiov_offset, flags);
+        break;
 
-        case MIRROR_METHOD_DISCARD:
-            assert(!qiov);
-            ret = blk_co_pdiscard(job->target, dirty_offset, dirty_bytes);
-            break;
+    case MIRROR_METHOD_ZERO:
+        assert(!qiov);
+        ret = blk_co_pwrite_zeroes(job->target, offset, bytes, flags);
+        break;
 
-        default:
-            abort();
-        }
+    case MIRROR_METHOD_DISCARD:
+        assert(!qiov);
+        ret = blk_co_pdiscard(job->target, offset, bytes);
+        break;
 
-        if (ret >= 0) {
-            job_progress_update(&job->common.job, dirty_bytes);
-        } else {
-            BlockErrorAction action;
+    default:
+        abort();
+    }
 
-            bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_offset, dirty_bytes);
-            job->actively_synced = false;
+    if (ret >= 0) {
+        job_progress_update(&job->common.job, bytes);
+    } else {
+        BlockErrorAction action;
 
-            action = mirror_error_action(job, false, -ret);
-            if (action == BLOCK_ERROR_ACTION_REPORT) {
-                if (!job->ret) {
-                    job->ret = ret;
-                }
-                break;
+        /*
+         * We failed, so we should mark dirty the whole area, aligned up.
+         * Note that we don't care about shrunk tails if any: they were dirty
+         * at function start, and they must be still dirty, as we've locked
+         * the region for in-flight op.
+         */
+        bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity);
+        bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
+        bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
+                              bitmap_end - bitmap_offset);
+        job->actively_synced = false;
+
+        action = mirror_error_action(job, false, -ret);
+        if (action == BLOCK_ERROR_ACTION_REPORT) {
+            if (!job->ret) {
+                job->ret = ret;
             }
         }
-
-        dirty_offset += dirty_bytes;
-    }
-
-    if (qiov) {
-        qemu_iovec_destroy(&target_qiov);
     }
 }
 
@@ -1465,15 +1493,6 @@ static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
     *nshared = BLK_PERM_ALL;
 }
 
-static void bdrv_mirror_top_refresh_limits(BlockDriverState *bs, Error **errp)
-{
-    MirrorBDSOpaque *s = bs->opaque;
-
-    if (s && s->job && s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) {
-        bs->bl.request_alignment = s->job->granularity;
-    }
-}
-
 /* Dummy node that provides consistent read to its users without requiring it
  * from its backing file and that allows writes on the backing file chain. */
 static BlockDriver bdrv_mirror_top = {
@@ -1486,7 +1505,6 @@ static BlockDriver bdrv_mirror_top = {
     .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
     .bdrv_refresh_filename      = bdrv_mirror_top_refresh_filename,
     .bdrv_child_perm            = bdrv_mirror_top_child_perm,
-    .bdrv_refresh_limits        = bdrv_mirror_top_refresh_limits,
 };
 
 static BlockJob *mirror_start_job(
@@ -1634,29 +1652,13 @@ static BlockJob *mirror_start_job(
         s->should_complete = true;
     }
 
-    /*
-     * Must be called before we start tracking writes, but after
-     *
-     *     ((MirrorBlockJob *)
-     *         ((MirrorBDSOpaque *)
-     *             mirror_top_bs->opaque
-     *         )->job
-     *     )->copy_mode
-     *
-     * has the correct value.
-     * (We start tracking writes as of the following
-     * bdrv_create_dirty_bitmap() call.)
-     */
-    bdrv_refresh_limits(mirror_top_bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        goto fail;
-    }
-
     s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
     if (!s->dirty_bitmap) {
         goto fail;
     }
+    if (s->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) {
+        bdrv_disable_dirty_bitmap(s->dirty_bitmap);
+    }
 
     ret = block_job_add_bdrv(&s->common, "source", bs, 0,
                              BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |
diff --git a/block/nfs.c b/block/nfs.c
index 40f2349..9a6311e 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -752,7 +752,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 }
 
 static int coroutine_fn
-nfs_file_co_truncate(BlockDriverState *bs, int64_t offset,
+nfs_file_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
                      PreallocMode prealloc, Error **errp)
 {
     NFSClient *client = bs->opaque;
diff --git a/block/nvme.c b/block/nvme.c
index 910872e..d41c4bd 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -112,6 +112,9 @@ typedef struct {
     uint64_t max_transfer;
     bool plugged;
 
+    bool supports_write_zeroes;
+    bool supports_discard;
+
     CoMutex dma_map_lock;
     CoQueue dma_flush_queue;
 
@@ -423,6 +426,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     NvmeIdNs *idns;
     NvmeLBAF *lbaf;
     uint8_t *resp;
+    uint16_t oncs;
     int r;
     uint64_t iova;
     NvmeCmd cmd = {
@@ -460,6 +464,10 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                           s->page_size / sizeof(uint64_t) * s->page_size);
 
+    oncs = le16_to_cpu(idctrl->oncs);
+    s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS);
+    s->supports_discard = !!(oncs & NVME_ONCS_DSM);
+
     memset(resp, 0, 4096);
 
     cmd.cdw10 = 0;
@@ -472,6 +480,12 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->nsze = le64_to_cpu(idns->nsze);
     lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
 
+    if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) &&
+            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
+                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
+        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
+    }
+
     if (lbaf->ms) {
         error_setg(errp, "Namespaces with metadata are not yet supported");
         goto out;
@@ -766,6 +780,8 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
     int ret;
     BDRVNVMeState *s = bs->opaque;
 
+    bs->supported_write_flags = BDRV_REQ_FUA;
+
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
     qemu_opts_absorb_qdict(opts, options, &error_abort);
     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
@@ -794,7 +810,6 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
             goto fail;
         }
     }
-    bs->supported_write_flags = BDRV_REQ_FUA;
     return 0;
 fail:
     nvme_close(bs);
@@ -1088,6 +1103,140 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
 }
 
 
+/* Zero @bytes bytes at @offset using one NVMe Write Zeroes command.
+ * Returns -ENOTSUP if the command is unsupported or if the block count
+ * cannot be encoded exactly in the 0-based, 16-bit NLB field. */
+static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
+                                              int64_t offset,
+                                              int bytes,
+                                              BdrvRequestFlags flags)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    uint64_t nlb = (uint64_t)bytes >> s->blkshift;
+    uint32_t cdw12;
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_WRITE_ZEROS,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
+        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
+    };
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_write_zeroes || nlb > 0x10000 ||
+        (nlb << s->blkshift) != (uint64_t)bytes) {
+        return -ENOTSUP;
+    }
+    if (bytes == 0) {
+        return 0; /* 0-based NLB: "0 - 1" would wrap to 65536 blocks */
+    }
+    cdw12 = (nlb - 1) & 0xFFFF;
+    if (flags & BDRV_REQ_MAY_UNMAP) {
+        cdw12 |= (1 << 25); /* DEAC: the blocks may be deallocated */
+    }
+    if (flags & BDRV_REQ_FUA) {
+        cdw12 |= (1 << 30); /* FUA */
+    }
+    cmd.cdw12 = cpu_to_le32(cdw12);
+
+    trace_nvme_write_zeroes(s, offset, bytes, flags);
+    assert(s->nr_queues > 1);
+    req = nvme_get_free_req(ioq);
+    assert(req);
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
+    return data.ret;
+}
+
+/* Deallocate [offset, offset + bytes) with a single-range NVMe DSM command. */
+static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
+                                         int64_t offset,
+                                         int bytes)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    NvmeDsmRange *buf;
+    QEMUIOVector local_qiov;
+    int ret;
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_DSM,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32(0), /* number of ranges - 0 based */
+        .cdw11 = cpu_to_le32(1 << 2), /* deallocate bit */
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_discard) {
+        return -ENOTSUP;
+    }
+
+    assert(s->nr_queues > 1);
+    /* Page-sized buffer holding the single NvmeDsmRange descriptor */
+    buf = qemu_try_blockalign0(bs, s->page_size);
+    if (!buf) {
+        return -ENOMEM;
+    }
+
+    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
+    buf->slba = cpu_to_le64(offset >> s->blkshift);
+    buf->cattr = 0;
+    /* NOTE(review): buf is s->page_size bytes but 4096 are mapped; confirm */
+    qemu_iovec_init(&local_qiov, 1);
+    qemu_iovec_add(&local_qiov, buf, 4096);
+
+    req = nvme_get_free_req(ioq);
+    assert(req);
+    /* Map the buffer for DMA; on failure the request is marked not busy */
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        req->busy = false;
+        goto out;
+    }
+
+    trace_nvme_dsm(s, offset, bytes);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+    /* Always unmap; an unmap error takes precedence over data.ret */
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        goto out;
+    }
+
+    ret = data.ret;
+    trace_nvme_dsm_done(s, offset, bytes, ret);
+out:
+    qemu_iovec_destroy(&local_qiov);
+    qemu_vfree(buf);
+    return ret;
+
+}
+
+
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
 {
@@ -1192,6 +1341,10 @@ static BlockDriver bdrv_nvme = {
 
     .bdrv_co_preadv           = nvme_co_preadv,
     .bdrv_co_pwritev          = nvme_co_pwritev,
+
+    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
+    .bdrv_co_pdiscard         = nvme_co_pdiscard,
+
     .bdrv_co_flush_to_disk    = nvme_co_flush,
     .bdrv_reopen_prepare      = nvme_reopen_prepare,
 
diff --git a/block/parallels.c b/block/parallels.c
index f1dfb03..7a01997 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -203,7 +203,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
         } else {
             ret = bdrv_truncate(bs->file,
                                 (s->data_end + space) << BDRV_SECTOR_BITS,
-                                PREALLOC_MODE_OFF, NULL);
+                                false, PREALLOC_MODE_OFF, NULL);
         }
         if (ret < 0) {
             return ret;
@@ -487,7 +487,12 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
         res->leaks += count;
         if (fix & BDRV_FIX_LEAKS) {
             Error *local_err = NULL;
-            ret = bdrv_truncate(bs->file, res->image_end_offset,
+
+            /*
+             * In order to really repair the image, we must shrink it.
+             * That means we have to pass exact=true.
+             */
+            ret = bdrv_truncate(bs->file, res->image_end_offset, true,
                                 PREALLOC_MODE_OFF, &local_err);
             if (ret < 0) {
                 error_report_err(local_err);
@@ -563,11 +568,6 @@ static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts,
     blk_set_allow_write_beyond_eof(blk, true);
 
     /* Create image format */
-    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
-    if (ret < 0) {
-        goto out;
-    }
-
     bat_entries = DIV_ROUND_UP(total_size, cl_size);
     bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size);
     bat_sectors = (bat_sectors *  cl_size) >> BDRV_SECTOR_BITS;
@@ -885,7 +885,9 @@ static void parallels_close(BlockDriverState *bs)
     if ((bs->open_flags & BDRV_O_RDWR) && !(bs->open_flags & BDRV_O_INACTIVE)) {
         s->header->inuse = 0;
         parallels_update_header(bs);
-        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS,
+
+        /* errors are ignored, so we might as well pass exact=true */
+        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, true,
                       PREALLOC_MODE_OFF, NULL);
     }
 
diff --git a/block/qcow.c b/block/qcow.c
index 5bdf72b..fce8989 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -480,7 +480,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                     return -E2BIG;
                 }
                 ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size,
-                                    PREALLOC_MODE_OFF, NULL);
+                                    false, PREALLOC_MODE_OFF, NULL);
                 if (ret < 0) {
                     return ret;
                 }
@@ -858,11 +858,6 @@ static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
     blk_set_allow_write_beyond_eof(qcow_blk, true);
 
     /* Create image format */
-    ret = blk_truncate(qcow_blk, 0, PREALLOC_MODE_OFF, errp);
-    if (ret < 0) {
-        goto exit;
-    }
-
     memset(&header, 0, sizeof(header));
     header.magic = cpu_to_be32(QCOW_MAGIC);
     header.version = cpu_to_be32(QCOW_VERSION);
@@ -1038,7 +1033,7 @@ static int qcow_make_empty(BlockDriverState *bs)
     if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
             l1_length) < 0)
         return -1;
-    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length,
+    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false,
                         PREALLOC_MODE_OFF, NULL);
     if (ret < 0)
         return ret;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 0d64bf5..f67ac6b 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -2016,7 +2016,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                     goto resize_fail;
                 }
 
-                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
+                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
                                     PREALLOC_MODE_OFF, &local_err);
                 if (ret < 0) {
                     error_report_err(local_err);
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index d0e7fa9..5ab64da 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -29,29 +29,64 @@
 #include "qemu/error-report.h"
 #include "qemu/cutils.h"
 
+static void qcow2_free_single_snapshot(BlockDriverState *bs, int i)
+{
+    BDRVQcow2State *s = bs->opaque;
+    /* Free what entry @i owns; zeroing it leaves no stale pointers behind */
+    assert(i >= 0 && i < s->nb_snapshots);
+    g_free(s->snapshots[i].name);
+    g_free(s->snapshots[i].id_str);
+    g_free(s->snapshots[i].unknown_extra_data);
+    memset(&s->snapshots[i], 0, sizeof(s->snapshots[i]));
+}
+
 void qcow2_free_snapshots(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     int i;
 
     for(i = 0; i < s->nb_snapshots; i++) {
-        g_free(s->snapshots[i].name);
-        g_free(s->snapshots[i].id_str);
+        qcow2_free_single_snapshot(bs, i);
     }
     g_free(s->snapshots);
     s->snapshots = NULL;
     s->nb_snapshots = 0;
 }
 
-int qcow2_read_snapshots(BlockDriverState *bs)
+/*
+ * If @repair is true, try to repair a broken snapshot table instead
+ * of just returning an error:
+ *
+ * - If the snapshot table was too long, set *nb_clusters_reduced to
+ *   the number of snapshots removed off the end.
+ *   The caller will update the on-disk nb_snapshots accordingly;
+ *   this leaks clusters, but is safe.
+ *   (The on-disk information must be updated before
+ *   qcow2_check_refcounts(), because that function relies on
+ *   s->nb_snapshots to reflect the on-disk value.)
+ *
+ * - If there were snapshots with too much extra metadata, increment
+ *   *extra_data_dropped for each.
+ *   This requires the caller to eventually rewrite the whole snapshot
+ *   table, which requires cluster allocation.  Therefore, this should
+ *   be done only after qcow2_check_refcounts() made sure the refcount
+ *   structures are valid.
+ *   (In the meantime, the image is still valid because
+ *   qcow2_check_refcounts() does not do anything with snapshots'
+ *   extra data.)
+ */
+static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
+                                   int *nb_clusters_reduced,
+                                   int *extra_data_dropped,
+                                   Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     QCowSnapshotHeader h;
     QCowSnapshotExtraData extra;
     QCowSnapshot *sn;
     int i, id_str_size, name_size;
-    int64_t offset;
-    uint32_t extra_data_size;
+    int64_t offset, pre_sn_offset;
+    uint64_t table_length = 0;
     int ret;
 
     if (!s->nb_snapshots) {
@@ -64,10 +99,16 @@ int qcow2_read_snapshots(BlockDriverState *bs)
     s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots);
 
     for(i = 0; i < s->nb_snapshots; i++) {
+        bool truncate_unknown_extra_data = false;
+
+        pre_sn_offset = offset;
+        table_length = ROUND_UP(table_length, 8);
+
         /* Read statically sized part of the snapshot header */
         offset = ROUND_UP(offset, 8);
         ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
 
@@ -79,33 +120,77 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         sn->date_sec = be32_to_cpu(h.date_sec);
         sn->date_nsec = be32_to_cpu(h.date_nsec);
         sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
-        extra_data_size = be32_to_cpu(h.extra_data_size);
+        sn->extra_data_size = be32_to_cpu(h.extra_data_size);
 
         id_str_size = be16_to_cpu(h.id_str_size);
         name_size = be16_to_cpu(h.name_size);
 
-        /* Read extra data */
+        if (sn->extra_data_size > QCOW_MAX_SNAPSHOT_EXTRA_DATA) {
+            if (!repair) {
+                ret = -EFBIG;
+                error_setg(errp, "Too much extra metadata in snapshot table "
+                           "entry %i", i);
+                error_append_hint(errp, "You can force-remove this extra "
+                                  "metadata with qemu-img check -r all\n");
+                goto fail;
+            }
+
+            fprintf(stderr, "Discarding too much extra metadata in snapshot "
+                    "table entry %i (%" PRIu32 " > %u)\n",
+                    i, sn->extra_data_size, QCOW_MAX_SNAPSHOT_EXTRA_DATA);
+
+            (*extra_data_dropped)++;
+            truncate_unknown_extra_data = true;
+        }
+
+        /* Read known extra data */
         ret = bdrv_pread(bs->file, offset, &extra,
-                         MIN(sizeof(extra), extra_data_size));
+                         MIN(sizeof(extra), sn->extra_data_size));
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
-        offset += extra_data_size;
+        offset += MIN(sizeof(extra), sn->extra_data_size);
 
-        if (extra_data_size >= 8) {
+        if (sn->extra_data_size >= endof(QCowSnapshotExtraData,
+                                         vm_state_size_large)) {
             sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large);
         }
 
-        if (extra_data_size >= 16) {
+        if (sn->extra_data_size >= endof(QCowSnapshotExtraData, disk_size)) {
             sn->disk_size = be64_to_cpu(extra.disk_size);
         } else {
             sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
         }
 
+        if (sn->extra_data_size > sizeof(extra)) {
+            uint64_t extra_data_end;
+            size_t unknown_extra_data_size;
+
+            extra_data_end = offset + sn->extra_data_size - sizeof(extra);
+
+            if (truncate_unknown_extra_data) {
+                sn->extra_data_size = QCOW_MAX_SNAPSHOT_EXTRA_DATA;
+            }
+
+            /* Store unknown extra data */
+            unknown_extra_data_size = sn->extra_data_size - sizeof(extra);
+            sn->unknown_extra_data = g_malloc(unknown_extra_data_size);
+            ret = bdrv_pread(bs->file, offset, sn->unknown_extra_data,
+                             unknown_extra_data_size);
+            if (ret < 0) {
+                error_setg_errno(errp, -ret,
+                                 "Failed to read snapshot table");
+                goto fail;
+            }
+            offset = extra_data_end;
+        }
+
         /* Read snapshot ID */
         sn->id_str = g_malloc(id_str_size + 1);
         ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
         offset += id_str_size;
@@ -115,14 +200,47 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         sn->name = g_malloc(name_size + 1);
         ret = bdrv_pread(bs->file, offset, sn->name, name_size);
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
         offset += name_size;
         sn->name[name_size] = '\0';
 
-        if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) {
-            ret = -EFBIG;
-            goto fail;
+        /* Note that the extra data may have been truncated */
+        table_length += sizeof(h) + sn->extra_data_size + id_str_size +
+                        name_size;
+        if (!repair) {
+            assert(table_length == offset - s->snapshots_offset);
+        }
+
+        if (table_length > QCOW_MAX_SNAPSHOTS_SIZE ||
+            offset - s->snapshots_offset > INT_MAX)
+        {
+            if (!repair) {
+                ret = -EFBIG;
+                error_setg(errp, "Snapshot table is too big");
+                error_append_hint(errp, "You can force-remove all %u "
+                                  "overhanging snapshots with qemu-img check "
+                                  "-r all\n", s->nb_snapshots - i);
+                goto fail;
+            }
+
+            fprintf(stderr, "Discarding %u overhanging snapshots (snapshot "
+                    "table is too big)\n", s->nb_snapshots - i);
+
+            *nb_clusters_reduced += (s->nb_snapshots - i);
+
+            /* Discard current snapshot also */
+            qcow2_free_single_snapshot(bs, i);
+
+            /*
+             * This leaks all the rest of the snapshot table and the
+             * snapshots' clusters, but we run in check -r all mode,
+             * so qcow2_check_refcounts() will take care of it.
+             */
+            s->nb_snapshots = i;
+            offset = pre_sn_offset;
+            break;
         }
     }
 
@@ -135,8 +253,13 @@ fail:
     return ret;
 }
 
+int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
+{
+    return qcow2_do_read_snapshots(bs, false, NULL, NULL, errp); /* no repair */
+}
+
 /* add at the end of the file a new list of snapshots */
-static int qcow2_write_snapshots(BlockDriverState *bs)
+int qcow2_write_snapshots(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     QCowSnapshot *sn;
@@ -156,7 +279,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         sn = s->snapshots + i;
         offset = ROUND_UP(offset, 8);
         offset += sizeof(h);
-        offset += sizeof(extra);
+        offset += MAX(sizeof(extra), sn->extra_data_size);
         offset += strlen(sn->id_str);
         offset += strlen(sn->name);
 
@@ -203,7 +326,8 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         h.date_sec = cpu_to_be32(sn->date_sec);
         h.date_nsec = cpu_to_be32(sn->date_nsec);
         h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
-        h.extra_data_size = cpu_to_be32(sizeof(extra));
+        h.extra_data_size = cpu_to_be32(MAX(sizeof(extra),
+                                            sn->extra_data_size));
 
         memset(&extra, 0, sizeof(extra));
         extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size);
@@ -228,6 +352,22 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         }
         offset += sizeof(extra);
 
+        if (sn->extra_data_size > sizeof(extra)) {
+            size_t unknown_extra_data_size =
+                sn->extra_data_size - sizeof(extra);
+
+            /* qcow2_read_snapshots() ensures no unbounded allocation */
+            assert(unknown_extra_data_size <= BDRV_REQUEST_MAX_BYTES);
+            assert(sn->unknown_extra_data);
+
+            ret = bdrv_pwrite(bs->file, offset, sn->unknown_extra_data,
+                              unknown_extra_data_size);
+            if (ret < 0) {
+                goto fail;
+            }
+            offset += unknown_extra_data_size;
+        }
+
         ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
         if (ret < 0) {
             goto fail;
@@ -251,7 +391,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     }
 
     QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
-        offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+                      endof(QCowHeader, nb_snapshots));
 
     header_data.nb_snapshots        = cpu_to_be32(s->nb_snapshots);
     header_data.snapshots_offset    = cpu_to_be64(snapshots_offset);
@@ -277,6 +417,151 @@ fail:
     return ret;
 }
 
+/* Image check, snapshot part 1: re-read the snapshot table pointer from the
+ * header and load the table, in repair mode if @fix has BDRV_FIX_ERRORS.
+ * Problems found are counted in @result; returns 0 or a negative error. */
+int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
+                                                 BdrvCheckResult *result,
+                                                 BdrvCheckMode fix)
+{
+    BDRVQcow2State *s = bs->opaque;
+    Error *local_err = NULL;
+    int nb_clusters_reduced = 0;
+    int extra_data_dropped = 0;
+    int ret;
+    struct {
+        uint32_t nb_snapshots;
+        uint64_t snapshots_offset;
+    } QEMU_PACKED snapshot_table_pointer;
+
+    /* qcow2_do_open() discards this information in check mode */
+    ret = bdrv_pread(bs->file, offsetof(QCowHeader, nb_snapshots),
+                     &snapshot_table_pointer, sizeof(snapshot_table_pointer));
+    if (ret < 0) {
+        result->check_errors++;
+        fprintf(stderr, "ERROR failed to read the snapshot table pointer from "
+                "the image header: %s\n", strerror(-ret));
+        return ret;
+    }
+
+    s->snapshots_offset = be64_to_cpu(snapshot_table_pointer.snapshots_offset);
+    s->nb_snapshots = be32_to_cpu(snapshot_table_pointer.nb_snapshots);
+    /* In repair mode, clamp an excessive snapshot count to the maximum */
+    if (s->nb_snapshots > QCOW_MAX_SNAPSHOTS && (fix & BDRV_FIX_ERRORS)) {
+        fprintf(stderr, "Discarding %u overhanging snapshots\n",
+                s->nb_snapshots - QCOW_MAX_SNAPSHOTS);
+
+        nb_clusters_reduced += s->nb_snapshots - QCOW_MAX_SNAPSHOTS;
+        s->nb_snapshots = QCOW_MAX_SNAPSHOTS;
+    }
+
+    ret = qcow2_validate_table(bs, s->snapshots_offset, s->nb_snapshots,
+                               sizeof(QCowSnapshotHeader),
+                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
+                               "snapshot table", &local_err);
+    if (ret < 0) {
+        result->check_errors++;
+        error_reportf_err(local_err, "ERROR ");
+
+        if (s->nb_snapshots > QCOW_MAX_SNAPSHOTS) {
+            fprintf(stderr, "You can force-remove all %u overhanging snapshots "
+                    "with qemu-img check -r all\n",
+                    s->nb_snapshots - QCOW_MAX_SNAPSHOTS);
+        }
+
+        /* We did not read the snapshot table, so invalidate this information */
+        s->snapshots_offset = 0;
+        s->nb_snapshots = 0;
+
+        return ret;
+    }
+    /* s->lock is released while the table itself is being read */
+    qemu_co_mutex_unlock(&s->lock);
+    ret = qcow2_do_read_snapshots(bs, fix & BDRV_FIX_ERRORS,
+                                  &nb_clusters_reduced, &extra_data_dropped,
+                                  &local_err);
+    qemu_co_mutex_lock(&s->lock);
+    if (ret < 0) {
+        result->check_errors++;
+        error_reportf_err(local_err,
+                          "ERROR failed to read the snapshot table: ");
+
+        /* We did not read the snapshot table, so invalidate this information */
+        s->snapshots_offset = 0;
+        s->nb_snapshots = 0;
+
+        return ret;
+    }
+    result->corruptions += nb_clusters_reduced + extra_data_dropped;
+
+    if (nb_clusters_reduced) {
+        /*
+         * Update image header now, because:
+         * (1) qcow2_check_refcounts() relies on s->nb_snapshots to be
+         *     the same as what the image header says,
+         * (2) this leaks clusters, but qcow2_check_refcounts() will
+         *     fix that.
+         */
+        assert(fix & BDRV_FIX_ERRORS);
+
+        snapshot_table_pointer.nb_snapshots = cpu_to_be32(s->nb_snapshots);
+        ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+                               &snapshot_table_pointer.nb_snapshots,
+                               sizeof(snapshot_table_pointer.nb_snapshots));
+        if (ret < 0) {
+            result->check_errors++;
+            fprintf(stderr, "ERROR failed to update the snapshot count in the "
+                    "image header: %s\n", strerror(-ret));
+            return ret;
+        }
+
+        result->corruptions_fixed += nb_clusters_reduced;
+        result->corruptions -= nb_clusters_reduced;
+    }
+
+    /* Every v3 snapshot table entry needs at least 16 bytes of extra data */
+    if (s->qcow_version >= 3) {
+        int i;
+        for (i = 0; i < s->nb_snapshots; i++) {
+            if (s->snapshots[i].extra_data_size <
+                sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
+                sizeof_field(QCowSnapshotExtraData, disk_size))
+            {
+                result->corruptions++;
+                fprintf(stderr, "%s snapshot table entry %i is incomplete\n",
+                        fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
+            }
+        }
+    }
+
+    return 0;
+}
+
+int coroutine_fn qcow2_check_fix_snapshot_table(BlockDriverState *bs,
+                                                BdrvCheckResult *result,
+                                                BdrvCheckMode fix)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+    /* Repair by rewriting the whole table; only if asked and needed */
+    if (result->corruptions && (fix & BDRV_FIX_ERRORS)) {
+        qemu_co_mutex_unlock(&s->lock);
+        ret = qcow2_write_snapshots(bs);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            result->check_errors++;
+            fprintf(stderr, "ERROR failed to update snapshot table: %s\n",
+                    strerror(-ret));
+            return ret;
+        }
+        /* The rewrite is taken to have fixed everything counted so far */
+        result->corruptions_fixed += result->corruptions;
+        result->corruptions = 0;
+    }
+
+    return 0;
+}
+
 static void find_new_snapshot_id(BlockDriverState *bs,
                                  char *id_str, int id_str_size)
 {
@@ -370,6 +655,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     sn->date_sec = sn_info->date_sec;
     sn->date_nsec = sn_info->date_nsec;
     sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+    sn->extra_data_size = sizeof(QCowSnapshotExtraData);
 
     /* Allocate the L1 table of the snapshot and copy the current one there. */
     l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
@@ -641,6 +927,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs,
      * The snapshot is now unused, clean up. If we fail after this point, we
      * won't recover but just leak clusters.
      */
+    g_free(sn.unknown_extra_data);
     g_free(sn.id_str);
     g_free(sn.name);
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 0bc69e6..7c18721 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -570,11 +570,47 @@ int qcow2_mark_consistent(BlockDriverState *bs)
     return 0;
 }
 
+static void qcow2_add_check_result(BdrvCheckResult *out,
+                                   const BdrvCheckResult *src,
+                                   bool set_allocation_info)
+{
+    out->corruptions += src->corruptions;
+    out->leaks += src->leaks;
+    out->check_errors += src->check_errors;
+    out->corruptions_fixed += src->corruptions_fixed;
+    out->leaks_fixed += src->leaks_fixed;
+    /* image_end_offset and bfi are copied (not summed), only on request */
+    if (set_allocation_info) {
+        out->image_end_offset = src->image_end_offset;
+        out->bfi = src->bfi;
+    }
+}
+
 static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
                                               BdrvCheckResult *result,
                                               BdrvCheckMode fix)
 {
-    int ret = qcow2_check_refcounts(bs, result, fix);
+    BdrvCheckResult snapshot_res = {};
+    BdrvCheckResult refcount_res = {};
+    int ret;
+
+    memset(result, 0, sizeof(*result));
+
+    ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
+    if (ret < 0) {
+        qcow2_add_check_result(result, &snapshot_res, false);
+        return ret;
+    }
+
+    ret = qcow2_check_refcounts(bs, &refcount_res, fix);
+    qcow2_add_check_result(result, &refcount_res, true);
+    if (ret < 0) {
+        qcow2_add_check_result(result, &snapshot_res, false);
+        return ret;
+    }
+
+    ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix);
+    qcow2_add_check_result(result, &snapshot_res, false);
     if (ret < 0) {
         return ret;
     }
@@ -1410,17 +1446,22 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
         goto fail;
     }
 
-    /* The total size in bytes of the snapshot table is checked in
-     * qcow2_read_snapshots() because the size of each snapshot is
-     * variable and we don't know it yet.
-     * Here we only check the offset and number of snapshots. */
-    ret = qcow2_validate_table(bs, header.snapshots_offset,
-                               header.nb_snapshots,
-                               sizeof(QCowSnapshotHeader),
-                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
-                               "Snapshot table", errp);
-    if (ret < 0) {
-        goto fail;
+    if (!(flags & BDRV_O_CHECK)) {
+        /*
+         * The total size in bytes of the snapshot table is checked in
+         * qcow2_read_snapshots() because the size of each snapshot is
+         * variable and we don't know it yet.
+         * Here we only check the offset and number of snapshots.
+         */
+        ret = qcow2_validate_table(bs, header.snapshots_offset,
+                                   header.nb_snapshots,
+                                   sizeof(QCowSnapshotHeader),
+                                   sizeof(QCowSnapshotHeader) *
+                                       QCOW_MAX_SNAPSHOTS,
+                                   "Snapshot table", errp);
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
     /* read the level 1 table */
@@ -1580,14 +1621,19 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
         s->image_backing_file = g_strdup(bs->auto_backing_file);
     }
 
-    /* Internal snapshots */
-    s->snapshots_offset = header.snapshots_offset;
-    s->nb_snapshots = header.nb_snapshots;
+    /*
+     * Internal snapshots; skip reading them in check mode, because
+     * we do not need them then, and we do not want to abort because
+     * of a broken table.
+     */
+    if (!(flags & BDRV_O_CHECK)) {
+        s->snapshots_offset = header.snapshots_offset;
+        s->nb_snapshots = header.nb_snapshots;
 
-    ret = qcow2_read_snapshots(bs);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "Could not read snapshots");
-        goto fail;
+        ret = qcow2_read_snapshots(bs, errp);
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
     /* Clear unknown autoclear feature bits */
@@ -3028,8 +3074,8 @@ static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
         if (mode == PREALLOC_MODE_METADATA) {
             mode = PREALLOC_MODE_OFF;
         }
-        ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, mode,
-                               errp);
+        ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
+                               mode, errp);
         if (ret < 0) {
             return ret;
         }
@@ -3345,12 +3391,6 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
     }
     blk_set_allow_write_beyond_eof(blk, true);
 
-    /* Clear the protocol layer and preallocate it if necessary */
-    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
-    if (ret < 0) {
-        goto out;
-    }
-
     /* Write the header */
     QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
     header = g_malloc0(cluster_size);
@@ -3449,7 +3489,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
     }
 
     /* Okay, now that we have a valid image, let's give it the right size */
-    ret = blk_truncate(blk, qcow2_opts->size, qcow2_opts->preallocation, errp);
+    ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation,
+                       errp);
     if (ret < 0) {
         error_prepend(errp, "Could not resize image: ");
         goto out;
@@ -3897,7 +3938,8 @@ fail:
 }
 
 static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
-                                          PreallocMode prealloc, Error **errp)
+                                          bool exact, PreallocMode prealloc,
+                                          Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t old_length;
@@ -3985,8 +4027,15 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         if ((last_cluster + 1) * s->cluster_size < old_file_size) {
             Error *local_err = NULL;
 
+            /*
+             * Do not pass @exact here: It will not help the user if
+             * we get an error here just because they wanted to shrink
+             * their qcow2 image (on a block device) with qemu-img.
+             * (And on the qcow2 layer, the @exact requirement is
+             * always fulfilled, so there is no need to pass it on.)
+             */
             bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
-                             PREALLOC_MODE_OFF, &local_err);
+                             false, PREALLOC_MODE_OFF, &local_err);
             if (local_err) {
                 warn_reportf_err(local_err,
                                  "Failed to truncate the tail of the image: ");
@@ -4003,7 +4052,12 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
     switch (prealloc) {
     case PREALLOC_MODE_OFF:
         if (has_data_file(bs)) {
-            ret = bdrv_co_truncate(s->data_file, offset, prealloc, errp);
+            /*
+             * If the caller wants an exact resize, the external data
+             * file should be resized to the exact target size, too,
+             * so we pass @exact here.
+             */
+            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp);
             if (ret < 0) {
                 goto fail;
             }
@@ -4088,7 +4142,8 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         /* Allocate the data area */
         new_file_size = allocation_start +
                         nb_new_data_clusters * s->cluster_size;
-        ret = bdrv_co_truncate(bs->file, new_file_size, prealloc, errp);
+        /* Image file grows, so @exact does not matter */
+        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
         if (ret < 0) {
             error_prepend(errp, "Failed to resize underlying file: ");
             qcow2_free_clusters(bs, allocation_start,
@@ -4191,7 +4246,7 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
         if (len < 0) {
             return len;
         }
-        return bdrv_co_truncate(bs->file, len, PREALLOC_MODE_OFF, NULL);
+        return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL);
     }
 
     if (offset_into_cluster(s, offset)) {
@@ -4428,7 +4483,7 @@ static int make_completely_empty(BlockDriverState *bs)
         goto fail;
     }
 
-    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
+    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
                         PREALLOC_MODE_OFF, &local_err);
     if (ret < 0) {
         error_report_err(local_err);
@@ -4913,12 +4968,74 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version,
     return 0;
 }
 
+/*
+ * Upgrades an image's version.  Newer versions encompass all features of
+ * older ones, but some things may have to be presented differently.
+ * Returns 0 on success, or a negative errno with @errp set on failure.
+ */
+static int qcow2_upgrade(BlockDriverState *bs, int target_version,
+                         BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
+                         Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    bool need_snapshot_update;
+    int current_version = s->qcow_version;
+    int i;
+    int ret;
+
+    /* This is qcow2_upgrade(), not qcow2_downgrade() */
+    assert(target_version > current_version);
+
+    /* There are no other versions (yet) that you can upgrade to */
+    assert(target_version == 3);
+
+    status_cb(bs, 0, 2, cb_opaque);
+
+    /*
+     * In v2, snapshots do not need to have extra data.  v3 requires
+     * the 64-bit VM state size and the virtual disk size to be
+     * present.
+     * qcow2_write_snapshots() will always write the list in the
+     * v3-compliant format.
+     */
+    need_snapshot_update = false;
+    for (i = 0; i < s->nb_snapshots; i++) {
+        if (s->snapshots[i].extra_data_size <
+            sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
+            sizeof_field(QCowSnapshotExtraData, disk_size))
+        {
+            need_snapshot_update = true;
+            break;
+        }
+    }
+    if (need_snapshot_update) {
+        ret = qcow2_write_snapshots(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to update the snapshot table");
+            return ret;
+        }
+    }
+    status_cb(bs, 1, 2, cb_opaque);
+
+    s->qcow_version = target_version;
+    ret = qcow2_update_header(bs);
+    if (ret < 0) {
+        s->qcow_version = current_version;
+        error_setg_errno(errp, -ret, "Failed to update the image header");
+        return ret;
+    }
+    status_cb(bs, 2, 2, cb_opaque);
+
+    return 0;
+}
+
 typedef enum Qcow2AmendOperation {
     /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
      * statically initialized to so that the helper CB can discern the first
      * invocation from an operation change */
     QCOW2_NO_OPERATION = 0,
 
+    QCOW2_UPGRADING,
     QCOW2_CHANGING_REFCOUNT_ORDER,
     QCOW2_DOWNGRADING,
 } Qcow2AmendOperation;
@@ -5101,17 +5218,16 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
     helper_cb_info = (Qcow2AmendHelperCBInfo){
         .original_status_cb = status_cb,
         .original_cb_opaque = cb_opaque,
-        .total_operations = (new_version < old_version)
+        .total_operations = (new_version != old_version)
                           + (s->refcount_bits != refcount_bits)
     };
 
     /* Upgrade first (some features may require compat=1.1) */
     if (new_version > old_version) {
-        s->qcow_version = new_version;
-        ret = qcow2_update_header(bs);
+        helper_cb_info.current_operation = QCOW2_UPGRADING;
+        ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb,
+                            &helper_cb_info, errp);
         if (ret < 0) {
-            s->qcow_version = old_version;
-            error_setg_errno(errp, -ret, "Failed to update the image header");
             return ret;
         }
     }
@@ -5207,7 +5323,11 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
             return ret;
         }
 
-        ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, errp);
+        /*
+         * Amending image options should ensure that the image has
+         * exactly the given new values, so pass exact=true here.
+         */
+        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp);
         blk_unref(blk);
         if (ret < 0) {
             return ret;
diff --git a/block/qcow2.h b/block/qcow2.h
index 5cccd87..601c2e4 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -61,6 +61,9 @@
  * space for snapshot names and IDs */
 #define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS)
 
+/* Maximum amount of extra data per snapshot table entry to accept */
+#define QCOW_MAX_SNAPSHOT_EXTRA_DATA 1024
+
 /* Bitmap header extension constraints */
 #define QCOW2_MAX_BITMAPS 65535
 #define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS)
@@ -181,6 +184,10 @@ typedef struct QCowSnapshot {
     uint32_t date_sec;
     uint32_t date_nsec;
     uint64_t vm_clock_nsec;
+    /* Size of all extra data, including QCowSnapshotExtraData if available */
+    uint32_t extra_data_size;
+    /* Data beyond QCowSnapshotExtraData, if any */
+    void *unknown_extra_data;
 } QCowSnapshot;
 
 struct Qcow2Cache;
@@ -708,7 +715,15 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
                             Error **errp);
 
 void qcow2_free_snapshots(BlockDriverState *bs);
-int qcow2_read_snapshots(BlockDriverState *bs);
+int qcow2_read_snapshots(BlockDriverState *bs, Error **errp);
+int qcow2_write_snapshots(BlockDriverState *bs);
+
+int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
+                                                 BdrvCheckResult *result,
+                                                 BdrvCheckMode fix);
+int coroutine_fn qcow2_check_fix_snapshot_table(BlockDriverState *bs,
+                                                BdrvCheckResult *result,
+                                                BdrvCheckMode fix);
 
 /* qcow2-cache.c functions */
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
diff --git a/block/qed.c b/block/qed.c
index 0d8fd50..d8c4e5f 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -673,8 +673,11 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
 
     l1_size = header.cluster_size * header.table_size;
 
-    /* File must start empty and grow, check truncate is supported */
-    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
+    /*
+     * The QED format associates file length with allocation status,
+     * so a new file (which is empty) must have a length of 0.
+     */
+    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp);
     if (ret < 0) {
         goto out;
     }
@@ -1461,6 +1464,7 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
 
 static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
                                              int64_t offset,
+                                             bool exact,
                                              PreallocMode prealloc,
                                              Error **errp)
 {
diff --git a/block/raw-format.c b/block/raw-format.c
index 42c28cc..3a76ec7 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -370,7 +370,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 }
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVRawState *s = bs->opaque;
 
@@ -386,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
 
     s->size = offset;
     offset += s->offset;
-    return bdrv_co_truncate(bs->file, offset, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
 }
 
 static void raw_eject(BlockDriverState *bs, bool eject_flag)
diff --git a/block/rbd.c b/block/rbd.c
index c71e45d..027cbcc 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -1087,6 +1087,7 @@ static int64_t qemu_rbd_getlength(BlockDriverState *bs)
 
 static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
                                              int64_t offset,
+                                             bool exact,
                                              PreallocMode prealloc,
                                              Error **errp)
 {
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 773dfc6..cfa8433 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2285,7 +2285,8 @@ static int64_t sd_getlength(BlockDriverState *bs)
 }
 
 static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset,
-                                       PreallocMode prealloc, Error **errp)
+                                       bool exact, PreallocMode prealloc,
+                                       Error **errp)
 {
     BDRVSheepdogState *s = bs->opaque;
     int ret, fd;
@@ -2601,7 +2602,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
 
     assert(!flags);
     if (offset > s->inode.vdi_size) {
-        ret = sd_co_truncate(bs, offset, PREALLOC_MODE_OFF, NULL);
+        ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL);
         if (ret < 0) {
             return ret;
         }
diff --git a/block/ssh.c b/block/ssh.c
index 84d01e8..b4375cf 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -1295,7 +1295,8 @@ static int64_t ssh_getlength(BlockDriverState *bs)
 }
 
 static int coroutine_fn ssh_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVSSHState *s = bs->opaque;
 
diff --git a/block/trace-events b/block/trace-events
index b8d70f5..6ba86de 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -45,9 +45,9 @@ backup_do_cow_return(void *job, int64_t offset, uint64_t bytes, int ret) "job %p
 block_copy_skip(void *bcs, int64_t start) "bcs %p start %"PRId64
 block_copy_skip_range(void *bcs, int64_t start, uint64_t bytes) "bcs %p start %"PRId64" bytes %"PRId64
 block_copy_process(void *bcs, int64_t start) "bcs %p start %"PRId64
-block_copy_with_bounce_buffer_read_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
-block_copy_with_bounce_buffer_write_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
-block_copy_with_offload_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_copy_range_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_read_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_write_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
 
 # ../blockdev.c
 qmp_block_job_cancel(void *job) "job %p"
@@ -152,9 +152,12 @@ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6,
 nvme_handle_event(void *s) "s %p"
 nvme_poll_cb(void *s) "s %p"
 nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
+nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
 nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
 nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
diff --git a/block/vdi.c b/block/vdi.c
index 806ba7f..0142da7 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -874,7 +874,7 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
     }
 
     if (image_type == VDI_TYPE_STATIC) {
-        ret = blk_truncate(blk, offset + blocks * block_size,
+        ret = blk_truncate(blk, offset + blocks * block_size, false,
                            PREALLOC_MODE_OFF, errp);
         if (ret < 0) {
             error_prepend(errp, "Failed to statically allocate file");
diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index fdd3a7a..13a49c2 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -557,8 +557,8 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
                     ret = -EINVAL;
                     goto exit;
                 }
-                ret = bdrv_truncate(bs->file, new_file_size, PREALLOC_MODE_OFF,
-                                    NULL);
+                ret = bdrv_truncate(bs->file, new_file_size, false,
+                                    PREALLOC_MODE_OFF, NULL);
                 if (ret < 0) {
                     goto exit;
                 }
diff --git a/block/vhdx.c b/block/vhdx.c
index 371f226..f02d261 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1263,7 +1263,7 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
         return -EINVAL;
     }
 
-    return bdrv_truncate(bs->file, *new_offset + s->block_size,
+    return bdrv_truncate(bs->file, *new_offset + s->block_size, false,
                          PREALLOC_MODE_OFF, NULL);
 }
 
@@ -1702,12 +1702,13 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
     if (type == VHDX_TYPE_DYNAMIC) {
         /* All zeroes, so we can just extend the file - the end of the BAT
          * is the furthest thing we have written yet */
-        ret = blk_truncate(blk, data_file_offset, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
+                           errp);
         if (ret < 0) {
             goto exit;
         }
     } else if (type == VHDX_TYPE_FIXED) {
-        ret = blk_truncate(blk, data_file_offset + image_size,
+        ret = blk_truncate(blk, data_file_offset + image_size, false,
                            PREALLOC_MODE_OFF, errp);
         if (ret < 0) {
             goto exit;
diff --git a/block/vmdk.c b/block/vmdk.c
index fed3b50..20e909d 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2076,7 +2076,7 @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                 return length;
             }
             length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
-            ret = bdrv_truncate(s->extents[i].file, length,
+            ret = bdrv_truncate(s->extents[i].file, length, false,
                                 PREALLOC_MODE_OFF, NULL);
             if (ret < 0) {
                 return ret;
@@ -2118,7 +2118,7 @@ static int vmdk_init_extent(BlockBackend *blk,
     int gd_buf_size;
 
     if (flat) {
-        ret = blk_truncate(blk, filesize, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp);
         goto exit;
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
@@ -2181,7 +2181,7 @@ static int vmdk_init_extent(BlockBackend *blk,
         goto exit;
     }
 
-    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9,
+    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
                        PREALLOC_MODE_OFF, errp);
     if (ret < 0) {
         goto exit;
@@ -2523,7 +2523,7 @@ static int coroutine_fn vmdk_co_do_create(int64_t size,
     /* bdrv_pwrite write padding zeros to align to sector, we don't need that
      * for description file */
     if (desc_offset == 0) {
-        ret = blk_truncate(blk, desc_len, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp);
         if (ret < 0) {
             goto exit;
         }
diff --git a/block/vpc.c b/block/vpc.c
index 5cd3890..a655502 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -898,7 +898,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
     /* Add footer to total size */
     total_size += HEADER_SIZE;
 
-    ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, errp);
     if (ret < 0) {
         return ret;
     }
diff --git a/blockdev.c b/blockdev.c
index ba491e3..8e029e9 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3204,7 +3204,7 @@ void qmp_block_resize(bool has_device, const char *device,
     }
 
     bdrv_drained_begin(bs);
-    ret = blk_truncate(blk, size, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, errp);
     bdrv_drained_end(bs);
 
 out:
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index dd14b9a..9fa2eaf 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -42,9 +42,9 @@
  */
 static VirtIOFeature feature_sizes[] = {
     {.flags = 1ULL << VIRTIO_BLK_F_DISCARD,
-     .end = virtio_endof(struct virtio_blk_config, discard_sector_alignment)},
+     .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
-     .end = virtio_endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+     .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
     {}
 };
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 9f11422..2c4909c 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -90,15 +90,15 @@ static inline __virtio16 *virtio_net_rsc_ext_num_dupacks(
 
 static VirtIOFeature feature_sizes[] = {
     {.flags = 1ULL << VIRTIO_NET_F_MAC,
-     .end = virtio_endof(struct virtio_net_config, mac)},
+     .end = endof(struct virtio_net_config, mac)},
     {.flags = 1ULL << VIRTIO_NET_F_STATUS,
-     .end = virtio_endof(struct virtio_net_config, status)},
+     .end = endof(struct virtio_net_config, status)},
     {.flags = 1ULL << VIRTIO_NET_F_MQ,
-     .end = virtio_endof(struct virtio_net_config, max_virtqueue_pairs)},
+     .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
     {.flags = 1ULL << VIRTIO_NET_F_MTU,
-     .end = virtio_endof(struct virtio_net_config, mtu)},
+     .end = endof(struct virtio_net_config, mtu)},
     {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
-     .end = virtio_endof(struct virtio_net_config, duplex)},
+     .end = endof(struct virtio_net_config, duplex)},
     {}
 };
 
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index e2e135ff..0a16172 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -16,6 +16,7 @@
 #define BLOCK_COPY_H
 
 #include "block/block.h"
+#include "qemu/co-shared-resource.h"
 
 typedef struct BlockCopyInFlightReq {
     int64_t start_byte;
@@ -37,7 +38,7 @@ typedef struct BlockCopyState {
     BdrvDirtyBitmap *copy_bitmap;
     int64_t cluster_size;
     bool use_copy_range;
-    int64_t copy_range_size;
+    int64_t copy_size;
     uint64_t len;
     QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;
 
@@ -69,6 +70,8 @@ typedef struct BlockCopyState {
      */
     ProgressResetCallbackFunc progress_reset_callback;
     void *progress_opaque;
+
+    SharedResource *mem;
 } BlockCopyState;
 
 BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
diff --git a/include/block/block.h b/include/block/block.h
index 89606bd..1df9848 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -346,10 +346,10 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
     const char *backing_file);
 void bdrv_refresh_filename(BlockDriverState *bs);
 
-int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
+int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                   PreallocMode prealloc, Error **errp);
-int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
-                  Error **errp);
+int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
+                  PreallocMode prealloc, Error **errp);
 
 int64_t bdrv_nb_sectors(BlockDriverState *bs);
 int64_t bdrv_getlength(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index ca4ccac4..02dc003 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -334,8 +334,23 @@ struct BlockDriver {
      * bdrv_parse_filename.
      */
     const char *protocol_name;
+
+    /*
+     * Truncate @bs to @offset bytes using the given @prealloc mode
+     * when growing.  Modes other than PREALLOC_MODE_OFF should be
+     * rejected when shrinking @bs.
+     *
+     * If @exact is true, @bs must be resized to exactly @offset.
+     * Otherwise, it is sufficient for @bs (if it is a host block
+     * device and thus there is no way to resize it) to be at least
+     * @offset bytes in length.
+     *
+     * If @exact is true and this function fails but would succeed
+     * with @exact = false, it should return -ENOTSUP.
+     */
     int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset,
-                                         PreallocMode prealloc, Error **errp);
+                                         bool exact, PreallocMode prealloc,
+                                         Error **errp);
 
     int64_t (*bdrv_getlength)(BlockDriverState *bs);
     bool has_variable_length;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3ec8efc..ab5943b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -653,12 +653,29 @@ typedef struct NvmeIdNs {
     uint8_t     mc;
     uint8_t     dpc;
     uint8_t     dps;
-    uint8_t     res30[98];
+
+    uint8_t     nmic;
+    uint8_t     rescap;
+    uint8_t     fpi;
+    uint8_t     dlfeat;
+
+    uint8_t     res34[94];
     NvmeLBAF    lbaf[16];
     uint8_t     res192[192];
     uint8_t     vs[3712];
 } NvmeIdNs;
 
+
+/* Deallocate Logical Block Features */
+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
+#define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)    ((dlfeat) & 0x08)
+
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat)     ((dlfeat) & 0x7)
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED   0
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES      1
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES        2
+
+
 #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
 #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
 #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 5560f4a..bea3f14 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -37,13 +37,6 @@ static inline hwaddr vring_align(hwaddr addr,
     return QEMU_ALIGN_UP(addr, align);
 }
 
-/*
- * Calculate the number of bytes up to and including the given 'field' of
- * 'container'.
- */
-#define virtio_endof(container, field) \
-    (offsetof(container, field) + sizeof_field(container, field))
-
 typedef struct VirtIOFeature {
     uint64_t flags;
     size_t end;
diff --git a/include/qemu/co-shared-resource.h b/include/qemu/co-shared-resource.h
new file mode 100644
index 0000000..4e45030
--- /dev/null
+++ b/include/qemu/co-shared-resource.h
@@ -0,0 +1,71 @@
+/*
+ * Helper functionality for distributing a fixed total amount of
+ * an abstract resource among multiple coroutines.
+ *
+ * Copyright (c) 2019 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_CO_SHARED_RESOURCE_H
+#define QEMU_CO_SHARED_RESOURCE_H
+
+
+typedef struct SharedResource SharedResource;
+
+/*
+ * Create SharedResource structure
+ *
+ * @total: total amount of some resource to be shared between clients
+ *
+ * Note: this API is not thread-safe.
+ */
+SharedResource *shres_create(uint64_t total);
+
+/*
+ * Release SharedResource structure
+ *
+ * This function may only be called once everything allocated by all
+ * clients has been deallocated.
+ */
+void shres_destroy(SharedResource *s);
+
+/*
+ * Try to allocate an amount of @n.  Return true on success, and false
+ * if there is too little left of the collective resource to fulfill
+ * the request.
+ */
+bool co_try_get_from_shres(SharedResource *s, uint64_t n);
+
+/*
+ * Allocate an amount of @n, and, if necessary, yield until
+ * that becomes possible.
+ */
+void coroutine_fn co_get_from_shres(SharedResource *s, uint64_t n);
+
+/*
+ * Deallocate an amount of @n.  The total amount allocated by a caller
+ * does not need to be deallocated/released with a single call, but may
+ * be split over several calls.  For example, get(4), get(3), and then
+ * put(5), put(2).
+ */
+void coroutine_fn co_put_to_shres(SharedResource *s, uint64_t n);
+
+
+#endif /* QEMU_CO_SHARED_RESOURCE_H */
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index 7b93c73..85c02c1 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -60,6 +60,13 @@
 
 #define sizeof_field(type, field) sizeof(((type *)0)->field)
 
+/*
+ * Calculate the number of bytes up to and including the given 'field' of
+ * 'container'.
+ */
+#define endof(container, field) \
+    (offsetof(container, field) + sizeof_field(container, field))
+
 /* Convert from a base type to a parent type, with compile time checking.  */
 #ifdef __GNUC__
 #define DO_UPCAST(type, field, dev) ( __extension__ ( { \
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 368d53a..b198dec 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -121,6 +121,10 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque);
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags);
+int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
+                                     unsigned int bytes,
+                                     QEMUIOVector *qiov, size_t qiov_offset,
+                                     BdrvRequestFlags flags);
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags);
@@ -233,8 +237,8 @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                       int bytes, BdrvRequestFlags flags);
 int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
                           int bytes);
-int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
-                 Error **errp);
+int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
+                 PreallocMode prealloc, Error **errp);
 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes);
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
                      int64_t pos, int size);
diff --git a/python/qemu/machine.py b/python/qemu/machine.py
index 128a3d1..2024e8b 100644
--- a/python/qemu/machine.py
+++ b/python/qemu/machine.py
@@ -71,7 +71,7 @@ class QEMUMachine(object):
 
     def __init__(self, binary, args=None, wrapper=None, name=None,
                  test_dir="/var/tmp", monitor_address=None,
-                 socket_scm_helper=None):
+                 socket_scm_helper=None, sock_dir=None):
         '''
         Initialize a QEMUMachine
 
@@ -90,6 +90,8 @@ class QEMUMachine(object):
             wrapper = []
         if name is None:
             name = "qemu-%d" % os.getpid()
+        if sock_dir is None:
+            sock_dir = test_dir
         self._name = name
         self._monitor_address = monitor_address
         self._vm_monitor = None
@@ -106,12 +108,14 @@ class QEMUMachine(object):
         self._qemu_full_args = None
         self._test_dir = test_dir
         self._temp_dir = None
+        self._sock_dir = sock_dir
         self._launched = False
         self._machine = None
         self._console_set = False
         self._console_device_type = None
         self._console_address = None
         self._console_socket = None
+        self._remove_files = []
 
         # just in case logging wasn't configured by the main script:
         logging.basicConfig()
@@ -236,8 +240,9 @@ class QEMUMachine(object):
         if self._machine is not None:
             args.extend(['-machine', self._machine])
         if self._console_set:
-            self._console_address = os.path.join(self._temp_dir,
+            self._console_address = os.path.join(self._sock_dir,
                                                  self._name + "-console.sock")
+            self._remove_files.append(self._console_address)
             chardev = ('socket,id=console,path=%s,server,nowait' %
                        self._console_address)
             args.extend(['-chardev', chardev])
@@ -253,8 +258,9 @@ class QEMUMachine(object):
         if self._monitor_address is not None:
             self._vm_monitor = self._monitor_address
         else:
-            self._vm_monitor = os.path.join(self._temp_dir,
+            self._vm_monitor = os.path.join(self._sock_dir,
                                             self._name + "-monitor.sock")
+            self._remove_files.append(self._vm_monitor)
         self._qemu_log_path = os.path.join(self._temp_dir, self._name + ".log")
         self._qemu_log_file = open(self._qemu_log_path, 'wb')
 
@@ -279,6 +285,9 @@ class QEMUMachine(object):
             shutil.rmtree(self._temp_dir)
             self._temp_dir = None
 
+        while len(self._remove_files) > 0:
+            self._remove_if_exists(self._remove_files.pop())
+
     def launch(self):
         """
         Launch the VM and make sure we cleanup and expose the
diff --git a/python/qemu/qtest.py b/python/qemu/qtest.py
index 3f1d2cb..d24ad04 100644
--- a/python/qemu/qtest.py
+++ b/python/qemu/qtest.py
@@ -84,14 +84,17 @@ class QEMUQtestMachine(QEMUMachine):
     '''A QEMU VM'''
 
     def __init__(self, binary, args=None, name=None, test_dir="/var/tmp",
-                 socket_scm_helper=None):
+                 socket_scm_helper=None, sock_dir=None):
         if name is None:
             name = "qemu-%d" % os.getpid()
+        if sock_dir is None:
+            sock_dir = test_dir
         super(QEMUQtestMachine,
               self).__init__(binary, args, name=name, test_dir=test_dir,
-                             socket_scm_helper=socket_scm_helper)
+                             socket_scm_helper=socket_scm_helper,
+                             sock_dir=sock_dir)
         self._qtest = None
-        self._qtest_path = os.path.join(test_dir, name + "-qtest.sock")
+        self._qtest_path = os.path.join(sock_dir, name + "-qtest.sock")
 
     def _base_args(self):
         args = super(QEMUQtestMachine, self)._base_args()
diff --git a/qemu-img.c b/qemu-img.c
index 8b03ef8..95a24b9 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3656,7 +3656,7 @@ static int img_resize(int argc, char **argv)
     Error *err = NULL;
     int c, ret, relative;
     const char *filename, *fmt, *size;
-    int64_t n, total_size, current_size, new_size;
+    int64_t n, total_size, current_size;
     bool quiet = false;
     BlockBackend *blk = NULL;
     PreallocMode prealloc = PREALLOC_MODE_OFF;
@@ -3831,43 +3831,17 @@ static int img_resize(int argc, char **argv)
         }
     }
 
-    ret = blk_truncate(blk, total_size, prealloc, &err);
-    if (ret < 0) {
+    /*
+     * The user expects the image to have the desired size after
+     * resizing, so pass @exact=true.  It is of no use to report
+     * success when the image has not actually been resized.
+     */
+    ret = blk_truncate(blk, total_size, true, prealloc, &err);
+    if (!ret) {
+        qprintf(quiet, "Image resized.\n");
+    } else {
         error_report_err(err);
-        goto out;
-    }
-
-    new_size = blk_getlength(blk);
-    if (new_size < 0) {
-        error_report("Failed to verify truncated image length: %s",
-                     strerror(-new_size));
-        ret = -1;
-        goto out;
-    }
-
-    /* Some block drivers implement a truncation method, but only so
-     * the user can cause qemu to refresh the image's size from disk.
-     * The idea is that the user resizes the image outside of qemu and
-     * then invokes block_resize to inform qemu about it.
-     * (This includes iscsi and file-posix for device files.)
-     * Of course, that is not the behavior someone invoking
-     * qemu-img resize would find useful, so we catch that behavior
-     * here and tell the user. */
-    if (new_size != total_size && new_size == current_size) {
-        error_report("Image was not resized; resizing may not be supported "
-                     "for this image");
-        ret = -1;
-        goto out;
     }
-
-    if (new_size != total_size) {
-        warn_report("Image should have been resized to %" PRIi64
-                    " bytes, but was resized to %" PRIi64 " bytes",
-                    total_size, new_size);
-    }
-
-    qprintf(quiet, "Image resized.\n");
-
 out:
     blk_unref(blk);
     if (ret) {
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 349256a..1b7e700 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1710,7 +1710,12 @@ static int truncate_f(BlockBackend *blk, int argc, char **argv)
         return offset;
     }
 
-    ret = blk_truncate(blk, offset, PREALLOC_MODE_OFF, &local_err);
+    /*
+     * qemu-io is a debugging tool, so let us be strict here and pass
+     * exact=true.  It is better to err on the "emit more errors" side
+     * than to be overly permissive.
+     */
+    ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, &local_err);
     if (ret < 0) {
         error_report_err(local_err);
         return ret;
diff --git a/tests/qemu-iotests/083 b/tests/qemu-iotests/083
index b270550..10fdfc8 100755
--- a/tests/qemu-iotests/083
+++ b/tests/qemu-iotests/083
@@ -28,7 +28,7 @@ status=1	# failure is the default!
 
 _cleanup()
 {
-	rm -f nbd.sock
+	rm -f "$SOCK_DIR/nbd.sock"
 	rm -f nbd-fault-injector.out
 	rm -f nbd-fault-injector.conf
 }
@@ -80,10 +80,10 @@ EOF
 	if [ "$proto" = "tcp" ]; then
 		nbd_addr="127.0.0.1:0"
 	else
-		nbd_addr="$TEST_DIR/nbd.sock"
+		nbd_addr="$SOCK_DIR/nbd.sock"
 	fi
 
-	rm -f "$TEST_DIR/nbd.sock"
+	rm -f "$SOCK_DIR/nbd.sock"
 
         echo > "$TEST_DIR/nbd-fault-injector.out"
 	$PYTHON nbd-fault-injector.py $extra_args "$nbd_addr" "$TEST_DIR/nbd-fault-injector.conf" >"$TEST_DIR/nbd-fault-injector.out" 2>&1 &
diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out
index eee6dd1..2090ee6 100644
--- a/tests/qemu-iotests/083.out
+++ b/tests/qemu-iotests/083.out
@@ -110,43 +110,43 @@ read failed: Input/output error
 
 === Check disconnect before neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 8 neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 16 neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect before export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 4 export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 12 export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 16 export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect before neg2 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after neg2 ===
 
@@ -154,11 +154,11 @@ read failed: Input/output error
 
 === Check disconnect 8 neg2 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 10 neg2 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect before request ===
 
@@ -195,23 +195,23 @@ read 512/512 bytes at offset 0
 
 === Check disconnect before neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 8 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 16 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 24 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 28 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after neg-classic ===
 
diff --git a/tests/qemu-iotests/093 b/tests/qemu-iotests/093
index 3c4f517..f03fa24 100755
--- a/tests/qemu-iotests/093
+++ b/tests/qemu-iotests/093
@@ -24,7 +24,7 @@ import iotests
 nsec_per_sec = 1000000000
 
 class ThrottleTestCase(iotests.QMPTestCase):
-    test_img = "null-aio://"
+    test_driver = "null-aio"
     max_drives = 3
 
     def blockstats(self, device):
@@ -35,10 +35,14 @@ class ThrottleTestCase(iotests.QMPTestCase):
                 return stat['rd_bytes'], stat['rd_operations'], stat['wr_bytes'], stat['wr_operations']
         raise Exception("Device not found for blockstats: %s" % device)
 
+    def required_drivers(self):
+        return [self.test_driver]
+
+    @iotests.skip_if_unsupported(required_drivers)
     def setUp(self):
         self.vm = iotests.VM()
         for i in range(0, self.max_drives):
-            self.vm.add_drive(self.test_img, "file.read-zeroes=on")
+            self.vm.add_drive(self.test_driver + "://", "file.read-zeroes=on")
         self.vm.launch()
 
     def tearDown(self):
@@ -264,16 +268,15 @@ class ThrottleTestCase(iotests.QMPTestCase):
         self.assertEqual(self.blockstats('drive1')[0], 4096)
 
 class ThrottleTestCoroutine(ThrottleTestCase):
-    test_img = "null-co://"
+    test_driver = "null-co"
 
 class ThrottleTestGroupNames(iotests.QMPTestCase):
-    test_img = "null-aio://"
     max_drives = 3
 
     def setUp(self):
         self.vm = iotests.VM()
         for i in range(0, self.max_drives):
-            self.vm.add_drive(self.test_img,
+            self.vm.add_drive("null-co://",
                               "throttling.iops-total=100,file.read-zeroes=on")
         self.vm.launch()
 
@@ -376,10 +379,10 @@ class ThrottleTestRemovableMedia(iotests.QMPTestCase):
 
     def test_removable_media(self):
         # Add a couple of dummy nodes named cd0 and cd1
-        result = self.vm.qmp("blockdev-add", driver="null-aio",
+        result = self.vm.qmp("blockdev-add", driver="null-co",
                              read_zeroes=True, node_name="cd0")
         self.assert_qmp(result, 'return', {})
-        result = self.vm.qmp("blockdev-add", driver="null-aio",
+        result = self.vm.qmp("blockdev-add", driver="null-co",
                              read_zeroes=True, node_name="cd1")
         self.assert_qmp(result, 'return', {})
 
@@ -426,4 +429,6 @@ class ThrottleTestRemovableMedia(iotests.QMPTestCase):
 
 
 if __name__ == '__main__':
+    if 'null-co' not in iotests.supported_formats():
+        iotests.notrun('null-co driver support missing')
     iotests.main(supported_fmts=["raw"])
diff --git a/tests/qemu-iotests/136 b/tests/qemu-iotests/136
index a46a7b7..012ea11 100755
--- a/tests/qemu-iotests/136
+++ b/tests/qemu-iotests/136
@@ -30,7 +30,7 @@ bad_offset = bad_sector * 512
 blkdebug_file = os.path.join(iotests.test_dir, 'blkdebug.conf')
 
 class BlockDeviceStatsTestCase(iotests.QMPTestCase):
-    test_img = "null-aio://"
+    test_driver = "null-aio"
     total_rd_bytes = 0
     total_rd_ops = 0
     total_wr_bytes = 0
@@ -67,6 +67,10 @@ sector = "%d"
 ''' % (bad_sector, bad_sector))
         file.close()
 
+    def required_drivers(self):
+        return [self.test_driver]
+
+    @iotests.skip_if_unsupported(required_drivers)
     def setUp(self):
         drive_args = []
         drive_args.append("stats-intervals.0=%d" % interval_length)
@@ -76,8 +80,8 @@ sector = "%d"
                           (self.account_failed and "on" or "off"))
         drive_args.append("file.image.read-zeroes=on")
         self.create_blkdebug_file()
-        self.vm = iotests.VM().add_drive('blkdebug:%s:%s' %
-                                         (blkdebug_file, self.test_img),
+        self.vm = iotests.VM().add_drive('blkdebug:%s:%s://' %
+                                         (blkdebug_file, self.test_driver),
                                          ','.join(drive_args))
         self.vm.launch()
         # Set an initial value for the clock
@@ -337,7 +341,9 @@ class BlockDeviceStatsTestAccountBoth(BlockDeviceStatsTestCase):
     account_failed = True
 
 class BlockDeviceStatsTestCoroutine(BlockDeviceStatsTestCase):
-    test_img = "null-co://"
+    test_driver = "null-co"
 
 if __name__ == '__main__':
+    if 'null-co' not in iotests.supported_formats():
+        iotests.notrun('null-co driver support missing')
     iotests.main(supported_fmts=["raw"])
diff --git a/tests/qemu-iotests/140 b/tests/qemu-iotests/140
index b965b1d..8d2ce5d 100755
--- a/tests/qemu-iotests/140
+++ b/tests/qemu-iotests/140
@@ -34,7 +34,7 @@ _cleanup()
 {
     _cleanup_qemu
     _cleanup_test_img
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -69,7 +69,7 @@ _send_qemu_cmd $QEMU_HANDLE \
 _send_qemu_cmd $QEMU_HANDLE \
     "{ 'execute': 'nbd-server-start',
        'arguments': { 'addr': { 'type': 'unix',
-                                'data': { 'path': '$TEST_DIR/nbd' }}}}" \
+                                'data': { 'path': '$SOCK_DIR/nbd' }}}}" \
     'return'
 
 _send_qemu_cmd $QEMU_HANDLE \
@@ -78,7 +78,7 @@ _send_qemu_cmd $QEMU_HANDLE \
     'return'
 
 $QEMU_IO_PROG -f raw -r -c 'read -P 42 0 64k' \
-    "nbd+unix:///drv?socket=$TEST_DIR/nbd" 2>&1 \
+    "nbd+unix:///drv?socket=$SOCK_DIR/nbd" 2>&1 \
     | _filter_qemu_io | _filter_nbd
 
 _send_qemu_cmd $QEMU_HANDLE \
@@ -87,7 +87,7 @@ _send_qemu_cmd $QEMU_HANDLE \
     'return'
 
 $QEMU_IO_PROG -f raw -r -c close \
-    "nbd+unix:///drv?socket=$TEST_DIR/nbd" 2>&1 \
+    "nbd+unix:///drv?socket=$SOCK_DIR/nbd" 2>&1 \
     | _filter_qemu_io | _filter_nbd
 
 _send_qemu_cmd $QEMU_HANDLE \
diff --git a/tests/qemu-iotests/140.out b/tests/qemu-iotests/140.out
index 67fe44a..2511eb7 100644
--- a/tests/qemu-iotests/140.out
+++ b/tests/qemu-iotests/140.out
@@ -8,7 +8,7 @@ wrote 65536/65536 bytes at offset 0
 read 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 {"return": {}}
-qemu-io: can't open device nbd+unix:///drv?socket=TEST_DIR/nbd: Requested export not available
+qemu-io: can't open device nbd+unix:///drv?socket=SOCK_DIR/nbd: Requested export not available
 server reported: export 'drv' not present
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}}
diff --git a/tests/qemu-iotests/143 b/tests/qemu-iotests/143
index 92249ac..f649b36 100755
--- a/tests/qemu-iotests/143
+++ b/tests/qemu-iotests/143
@@ -29,7 +29,7 @@ status=1	# failure is the default!
 _cleanup()
 {
     _cleanup_qemu
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -51,12 +51,12 @@ _send_qemu_cmd $QEMU_HANDLE \
 _send_qemu_cmd $QEMU_HANDLE \
     "{ 'execute': 'nbd-server-start',
        'arguments': { 'addr': { 'type': 'unix',
-                                'data': { 'path': '$TEST_DIR/nbd' }}}}" \
+                                'data': { 'path': '$SOCK_DIR/nbd' }}}}" \
     'return'
 
 # This should just result in a client error, not in the server crashing
 $QEMU_IO_PROG -f raw -c quit \
-    "nbd+unix:///no_such_export?socket=$TEST_DIR/nbd" 2>&1 \
+    "nbd+unix:///no_such_export?socket=$SOCK_DIR/nbd" 2>&1 \
     | _filter_qemu_io | _filter_nbd
 
 _send_qemu_cmd $QEMU_HANDLE \
diff --git a/tests/qemu-iotests/143.out b/tests/qemu-iotests/143.out
index ee71b5a..037d34a 100644
--- a/tests/qemu-iotests/143.out
+++ b/tests/qemu-iotests/143.out
@@ -1,7 +1,7 @@
 QA output created by 143
 {"return": {}}
 {"return": {}}
-qemu-io: can't open device nbd+unix:///no_such_export?socket=TEST_DIR/nbd: Requested export not available
+qemu-io: can't open device nbd+unix:///no_such_export?socket=SOCK_DIR/nbd: Requested export not available
 server reported: export 'no_such_export' not present
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}}
diff --git a/tests/qemu-iotests/147 b/tests/qemu-iotests/147
index ab8480b..03fc2fa 100755
--- a/tests/qemu-iotests/147
+++ b/tests/qemu-iotests/147
@@ -32,7 +32,7 @@ NBD_IPV6_PORT_START = NBD_PORT_END
 NBD_IPV6_PORT_END   = NBD_IPV6_PORT_START + 1024
 
 test_img = os.path.join(iotests.test_dir, 'test.img')
-unix_socket = os.path.join(iotests.test_dir, 'nbd.socket')
+unix_socket = os.path.join(iotests.sock_dir, 'nbd.socket')
 
 
 def flatten_sock_addr(crumpled_address):
diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181
index e317e63..378c289 100755
--- a/tests/qemu-iotests/181
+++ b/tests/qemu-iotests/181
@@ -26,7 +26,7 @@ echo "QA output created by $seq"
 
 status=1	# failure is the default!
 
-MIG_SOCKET="${TEST_DIR}/migrate"
+MIG_SOCKET="${SOCK_DIR}/migrate"
 
 _cleanup()
 {
diff --git a/tests/qemu-iotests/182 b/tests/qemu-iotests/182
index 7f494eb..1ccb850 100755
--- a/tests/qemu-iotests/182
+++ b/tests/qemu-iotests/182
@@ -31,7 +31,7 @@ _cleanup()
 {
     _cleanup_test_img
     rm -f "$TEST_IMG.overlay"
-    rm -f "$TEST_DIR/nbd.socket"
+    rm -f "$SOCK_DIR/nbd.socket"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -133,7 +133,7 @@ success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
           'addr': {
               'type': 'unix',
               'data': {
-                  'path': '$TEST_DIR/nbd.socket'
+                  'path': '$SOCK_DIR/nbd.socket'
               } } } }" \
     'return' \
     'error'
diff --git a/tests/qemu-iotests/183 b/tests/qemu-iotests/183
index 04fb344..bced83f 100755
--- a/tests/qemu-iotests/183
+++ b/tests/qemu-iotests/183
@@ -26,7 +26,7 @@ echo "QA output created by $seq"
 
 status=1 # failure is the default!
 
-MIG_SOCKET="${TEST_DIR}/migrate"
+MIG_SOCKET="${SOCK_DIR}/migrate"
 
 _cleanup()
 {
diff --git a/tests/qemu-iotests/192 b/tests/qemu-iotests/192
index 0344322..d2ba55d 100755
--- a/tests/qemu-iotests/192
+++ b/tests/qemu-iotests/192
@@ -31,7 +31,7 @@ _cleanup()
 {
     _cleanup_qemu
     _cleanup_test_img
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -66,7 +66,7 @@ else
     QEMU_COMM_TIMEOUT=1
 fi
 
-_send_qemu_cmd $h "nbd_server_start unix:$TEST_DIR/nbd" "(qemu)"
+_send_qemu_cmd $h "nbd_server_start unix:$SOCK_DIR/nbd" "(qemu)"
 _send_qemu_cmd $h "nbd_server_add -w drive0" "(qemu)"
 _send_qemu_cmd $h "q" "(qemu)"
 
diff --git a/tests/qemu-iotests/192.out b/tests/qemu-iotests/192.out
index 1e0be4c..b9429db 100644
--- a/tests/qemu-iotests/192.out
+++ b/tests/qemu-iotests/192.out
@@ -1,7 +1,7 @@
 QA output created by 192
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) nbd_server_start unix:TEST_DIR/nbd
+(qemu) nbd_server_start unix:SOCK_DIR/nbd
 (qemu) nbd_server_add -w drive0
 (qemu) q
 *** done
diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194
index d746ab1..72e47e8 100755
--- a/tests/qemu-iotests/194
+++ b/tests/qemu-iotests/194
@@ -26,8 +26,8 @@ iotests.verify_platform(['linux'])
 
 with iotests.FilePath('source.img') as source_img_path, \
      iotests.FilePath('dest.img') as dest_img_path, \
-     iotests.FilePath('migration.sock') as migration_sock_path, \
-     iotests.FilePath('nbd.sock') as nbd_sock_path, \
+     iotests.FilePaths(['migration.sock', 'nbd.sock'], iotests.sock_dir) as \
+         [migration_sock_path, nbd_sock_path], \
      iotests.VM('source') as source_vm, \
      iotests.VM('dest') as dest_vm:
 
diff --git a/tests/qemu-iotests/201 b/tests/qemu-iotests/201
index 7abf740..86fa37e 100755
--- a/tests/qemu-iotests/201
+++ b/tests/qemu-iotests/201
@@ -24,7 +24,7 @@ echo "QA output created by $seq"
 
 status=1	# failure is the default!
 
-MIG_SOCKET="${TEST_DIR}/migrate"
+MIG_SOCKET="${SOCK_DIR}/migrate"
 
 # get standard environment, filters and checks
 . ./common.rc
diff --git a/tests/qemu-iotests/205 b/tests/qemu-iotests/205
index 76f6c5f..4bb2c21 100755
--- a/tests/qemu-iotests/205
+++ b/tests/qemu-iotests/205
@@ -24,7 +24,7 @@ import iotests
 import time
 from iotests import qemu_img_create, qemu_io, filter_qemu_io, QemuIoInteractive
 
-nbd_sock = os.path.join(iotests.test_dir, 'nbd_sock')
+nbd_sock = os.path.join(iotests.sock_dir, 'nbd_sock')
 nbd_uri = 'nbd+unix:///exp?socket=' + nbd_sock
 disk = os.path.join(iotests.test_dir, 'disk')
 
diff --git a/tests/qemu-iotests/208 b/tests/qemu-iotests/208
index 1e20238..546eb1d 100755
--- a/tests/qemu-iotests/208
+++ b/tests/qemu-iotests/208
@@ -26,7 +26,7 @@ iotests.verify_image_format(supported_fmts=['generic'])
 
 with iotests.FilePath('disk.img') as disk_img_path, \
      iotests.FilePath('disk-snapshot.img') as disk_snapshot_img_path, \
-     iotests.FilePath('nbd.sock') as nbd_sock_path, \
+     iotests.FilePath('nbd.sock', iotests.sock_dir) as nbd_sock_path, \
      iotests.VM() as vm:
 
     img_size = '10M'
diff --git a/tests/qemu-iotests/209 b/tests/qemu-iotests/209
index 259e991..e0f464b 100755
--- a/tests/qemu-iotests/209
+++ b/tests/qemu-iotests/209
@@ -24,7 +24,8 @@ from iotests import qemu_img_create, qemu_io, qemu_img_verbose, qemu_nbd, \
 
 iotests.verify_image_format(supported_fmts=['qcow2'])
 
-disk, nbd_sock = file_path('disk', 'nbd-sock')
+disk = file_path('disk')
+nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir)
 nbd_uri = 'nbd+unix:///exp?socket=' + nbd_sock
 
 qemu_img_create('-f', iotests.imgfmt, disk, '1M')
diff --git a/tests/qemu-iotests/222 b/tests/qemu-iotests/222
index 0ead56d..3f9f934 100644
--- a/tests/qemu-iotests/222
+++ b/tests/qemu-iotests/222
@@ -48,7 +48,7 @@ remainder = [("0xd5", "0x108000",  "32k"), # Right-end of partial-left [1]
 
 with iotests.FilePath('base.img') as base_img_path, \
      iotests.FilePath('fleece.img') as fleece_img_path, \
-     iotests.FilePath('nbd.sock') as nbd_sock_path, \
+     iotests.FilePath('nbd.sock', iotests.sock_dir) as nbd_sock_path, \
      iotests.VM() as vm:
 
     log('--- Setting up images ---')
diff --git a/tests/qemu-iotests/223 b/tests/qemu-iotests/223
index 2ba3d81..b5a80e5 100755
--- a/tests/qemu-iotests/223
+++ b/tests/qemu-iotests/223
@@ -28,7 +28,7 @@ _cleanup()
     nbd_server_stop
     _cleanup_test_img
     _cleanup_qemu
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -125,11 +125,11 @@ _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
   "arguments":{"device":"n"}}' "error" # Attempt add without server
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-start",
   "arguments":{"addr":{"type":"unix",
-    "data":{"path":"'"$TEST_DIR/nbd"'"}}}}' "return"
+    "data":{"path":"'"$SOCK_DIR/nbd"'"}}}}' "return"
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-start",
   "arguments":{"addr":{"type":"unix",
-    "data":{"path":"'"$TEST_DIR/nbd"1'"}}}}' "error" # Attempt second server
-$QEMU_NBD_PROG -L -k "$TEST_DIR/nbd"
+    "data":{"path":"'"$SOCK_DIR/nbd"1'"}}}}' "error" # Attempt second server
+$QEMU_NBD_PROG -L -k "$SOCK_DIR/nbd"
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
   "arguments":{"device":"n", "bitmap":"b"}}' "return"
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
@@ -145,14 +145,14 @@ _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
   "arguments":{"device":"n", "name":"n2", "writable":true,
   "bitmap":"b2"}}' "return"
-$QEMU_NBD_PROG -L -k "$TEST_DIR/nbd"
+$QEMU_NBD_PROG -L -k "$SOCK_DIR/nbd"
 
 echo
 echo "=== Contrast normal status to large granularity dirty-bitmap ==="
 echo
 
 QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
-IMG="driver=nbd,export=n,server.type=unix,server.path=$TEST_DIR/nbd"
+IMG="driver=nbd,export=n,server.type=unix,server.path=$SOCK_DIR/nbd"
 $QEMU_IO -r -c 'r -P 0x22 512 512' -c 'r -P 0 512k 512k' -c 'r -P 0x11 1m 1m' \
   -c 'r -P 0x33 2m 2m' --image-opts "$IMG" | _filter_qemu_io
 $QEMU_IMG map --output=json --image-opts \
@@ -164,7 +164,7 @@ echo
 echo "=== Contrast to small granularity dirty-bitmap ==="
 echo
 
-IMG="driver=nbd,export=n2,server.type=unix,server.path=$TEST_DIR/nbd"
+IMG="driver=nbd,export=n2,server.type=unix,server.path=$SOCK_DIR/nbd"
 $QEMU_IMG map --output=json --image-opts \
   "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b2" | _filter_qemu_img_map
 
diff --git a/tests/qemu-iotests/240 b/tests/qemu-iotests/240
index f73bc07..8b4337b 100755
--- a/tests/qemu-iotests/240
+++ b/tests/qemu-iotests/240
@@ -29,7 +29,7 @@ status=1	# failure is the default!
 
 _cleanup()
 {
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -135,7 +135,7 @@ echo
 run_qemu <<EOF
 { "execute": "qmp_capabilities" }
 { "execute": "blockdev-add", "arguments": {"driver": "null-co", "read-zeroes": true, "node-name": "hd0", "read-only": true}}
-{ "execute": "nbd-server-start", "arguments": {"addr":{"type":"unix","data":{"path":"$TEST_DIR/nbd"}}}}
+{ "execute": "nbd-server-start", "arguments": {"addr":{"type":"unix","data":{"path":"$SOCK_DIR/nbd"}}}}
 { "execute": "nbd-server-add", "arguments": {"device":"hd0"}}
 { "execute": "object-add", "arguments": {"qom-type": "iothread", "id": "iothread0"}}
 { "execute": "device_add", "arguments": {"id": "scsi0", "driver": "${virtio_scsi}", "iothread": "iothread0"}}
diff --git a/tests/qemu-iotests/241 b/tests/qemu-iotests/241
index 58b64eb..8dae8d3 100755
--- a/tests/qemu-iotests/241
+++ b/tests/qemu-iotests/241
@@ -23,8 +23,6 @@ echo "QA output created by $seq"
 
 status=1 # failure is the default!
 
-nbd_unix_socket=$TEST_DIR/test_qemu_nbd_socket
-
 _cleanup()
 {
     _cleanup_test_img
diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245
index 41218d5..e66a23c 100644
--- a/tests/qemu-iotests/245
+++ b/tests/qemu-iotests/245
@@ -598,7 +598,7 @@ class TestBlockdevReopen(iotests.QMPTestCase):
         ##################
         ###### null ######
         ##################
-        opts = {'driver': 'null-aio', 'node-name': 'root', 'size': 1024}
+        opts = {'driver': 'null-co', 'node-name': 'root', 'size': 1024}
 
         result = self.vm.qmp('blockdev-add', conv_keys = False, **opts)
         self.assert_qmp(result, 'return', {})
diff --git a/tests/qemu-iotests/261 b/tests/qemu-iotests/261
new file mode 100755
index 0000000..fb96bcf
--- /dev/null
+++ b/tests/qemu-iotests/261
@@ -0,0 +1,523 @@
+#!/usr/bin/env bash
+#
+# Test case for qcow2's handling of extra data in snapshot table entries
+# (and more generally, how certain cases of broken snapshot tables are
+# handled)
+#
+# Copyright (C) 2019 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mreitz@redhat.com
+
+seq=$(basename $0)
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+    rm -f "$TEST_IMG".v{2,3}.orig \
+          "$TEST_DIR"/sn{0,1,2}{,-pre,-extra,-post}
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+# This tests qcow2-specific low-level functionality
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+# (1) We create a v2 image that supports nothing but refcount_bits=16
+# (2) We do some refcount management on our own which expects
+#     refcount_bits=16
+_unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)'
+
+# Print the padded on-disk size of one snapshot table entry.  Parameters:
+#   $1: image filename
+#   $2: snapshot table entry offset in the image (default: 0)
+snapshot_table_entry_size()
+{
+    local entry_ofs=${2:-0} id_len name_len extra_len
+    id_len=$(peek_file_be "$1" $((entry_ofs + 12)) 2)
+    name_len=$(peek_file_be "$1" $((entry_ofs + 14)) 2)
+    extra_len=$(peek_file_be "$1" $((entry_ofs + 36)) 4)
+    # 40 fixed bytes + extra data + ID + name, rounded up to a multiple of 8
+    echo $(((40 + extra_len + id_len + name_len + 7) / 8 * 8))
+}
+
+# Print every snapshot table entry of an image in human-readable form.
+#   $1: image filename
+print_snapshot_table()
+{
+    local nb_entries offset i
+    local id_len name_len extra_len extra_ofs id_ofs name_ofs
+    nb_entries=$(peek_file_be "$1" 60 4)
+    offset=$(peek_file_be "$1" 64 8)
+    echo "Snapshots in $1:" | _filter_testdir | _filter_imgfmt
+
+    for ((i = 0; i < nb_entries; i++)); do
+        id_len=$(peek_file_be "$1" $((offset + 12)) 2)
+        name_len=$(peek_file_be "$1" $((offset + 14)) 2)
+        extra_len=$(peek_file_be "$1" $((offset + 36)) 4)
+        extra_ofs=$((offset + 40))
+        id_ofs=$((extra_ofs + extra_len))
+        name_ofs=$((id_ofs + id_len))
+
+        echo "  [$i]"
+        echo "    ID: $(peek_file_raw "$1" $id_ofs $id_len)"
+        echo "    Name: $(peek_file_raw "$1" $name_ofs $name_len)"
+        echo "    Extra data size: $extra_len"
+        if [ $extra_len -ge 8 ]; then
+            echo "    VM state size: $(peek_file_be "$1" $extra_ofs 8)"
+        fi
+        if [ $extra_len -ge 16 ]; then
+            echo "    Disk size: $(peek_file_be "$1" $((extra_ofs + 8)) 8)"
+        fi
+        if [ $extra_len -gt 16 ]; then
+            echo '    Unknown extra data:' \
+                "$(peek_file_raw "$1" $((extra_ofs + 16)) $((extra_len - 16)) \
+                   | tr -d '\0')"
+        fi
+
+        offset=$((offset + $(snapshot_table_entry_size "$1" $offset)))
+    done
+}
+
+# Mark every cluster of a byte range as allocated (refcount 1).  Only
+# refblock 0 is written, so this works only before cluster #32768.
+# Parameters:
+#   $1: Start offset of what to allocate
+#   $2: End offset (exclusive)
+refblock0_allocate()
+{
+    reftable_ofs=$(peek_file_be "$TEST_IMG" 48 8)
+    refblock_ofs=$(peek_file_be "$TEST_IMG" $reftable_ofs 8)
+    cluster=$(($1 / 65536))
+    ecluster=$((($2 + 65535) / 65536))
+
+    # Each cluster owns one 2-byte refcount entry inside the refblock
+    while ((cluster < ecluster)); do
+        if ((cluster >= 32768)); then
+            echo "*** Abort: Cluster $cluster exceeds refblock 0 ***"
+            exit 1
+        fi
+        poke_file "$TEST_IMG" $((refblock_ofs + cluster * 2)) '\x00\x01'
+        cluster=$((cluster + 1))
+    done
+}
+
+
+echo
+echo '=== Create v2 template ==='
+echo
+
+# Create v2 image with a snapshot table with three entries:
+# [0]: No extra data (valid with v2, not valid with v3)
+# [1]: Has extra data unknown to qemu
+# [2]: Has the 64-bit VM state size, but not the disk size (again,
+#      valid with v2, not valid with v3)
+
+TEST_IMG="$TEST_IMG.v2.orig" IMGOPTS='compat=0.10' _make_test_img 64M
+$QEMU_IMG snapshot -c sn0 "$TEST_IMG.v2.orig"
+$QEMU_IMG snapshot -c sn1 "$TEST_IMG.v2.orig"
+$QEMU_IMG snapshot -c sn2 "$TEST_IMG.v2.orig"
+
+# Copy out all existing snapshot table entries
+sn_table_ofs=$(peek_file_be "$TEST_IMG.v2.orig" 64 8)
+
+# ofs: Snapshot table entry offset
+# eds: Extra data size
+# ids: Name + ID size
+# len: Total entry length
+sn0_ofs=$sn_table_ofs
+sn0_eds=$(peek_file_be "$TEST_IMG.v2.orig" $((sn0_ofs + 36)) 4)
+sn0_ids=$(($(peek_file_be "$TEST_IMG.v2.orig" $((sn0_ofs + 12)) 2) +
+           $(peek_file_be "$TEST_IMG.v2.orig" $((sn0_ofs + 14)) 2)))
+sn0_len=$(snapshot_table_entry_size "$TEST_IMG.v2.orig" $sn0_ofs)
+sn1_ofs=$((sn0_ofs + sn0_len))
+sn1_eds=$(peek_file_be "$TEST_IMG.v2.orig" $((sn1_ofs + 36)) 4)
+sn1_ids=$(($(peek_file_be "$TEST_IMG.v2.orig" $((sn1_ofs + 12)) 2) +
+           $(peek_file_be "$TEST_IMG.v2.orig" $((sn1_ofs + 14)) 2)))
+sn1_len=$(snapshot_table_entry_size "$TEST_IMG.v2.orig" $sn1_ofs)
+sn2_ofs=$((sn1_ofs + sn1_len))
+sn2_eds=$(peek_file_be "$TEST_IMG.v2.orig" $((sn2_ofs + 36)) 4)
+sn2_ids=$(($(peek_file_be "$TEST_IMG.v2.orig" $((sn2_ofs + 12)) 2) +
+           $(peek_file_be "$TEST_IMG.v2.orig" $((sn2_ofs + 14)) 2)))
+sn2_len=$(snapshot_table_entry_size "$TEST_IMG.v2.orig" $sn2_ofs)
+
+# Data before extra data
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn0-pre" bs=1 skip=$sn0_ofs count=40 \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn1-pre" bs=1 skip=$sn1_ofs count=40 \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn2-pre" bs=1 skip=$sn2_ofs count=40 \
+    &> /dev/null
+
+# Extra data
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn0-extra" bs=1 \
+    skip=$((sn0_ofs + 40)) count=$sn0_eds &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn1-extra" bs=1 \
+    skip=$((sn1_ofs + 40)) count=$sn1_eds &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn2-extra" bs=1 \
+    skip=$((sn2_ofs + 40)) count=$sn2_eds &> /dev/null
+
+# Data after extra data
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn0-post" bs=1 \
+    skip=$((sn0_ofs + 40 + sn0_eds)) count=$sn0_ids \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn1-post" bs=1 \
+    skip=$((sn1_ofs + 40 + sn1_eds)) count=$sn1_ids \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn2-post" bs=1 \
+    skip=$((sn2_ofs + 40 + sn2_eds)) count=$sn2_ids \
+    &> /dev/null
+
+# Amend them, one by one
+# Set sn0's extra data size to 0
+poke_file "$TEST_DIR/sn0-pre" 36 '\x00\x00\x00\x00'
+truncate -s 0 "$TEST_DIR/sn0-extra"
+# Grow sn0-post to pad
+truncate -s $(($(snapshot_table_entry_size "$TEST_DIR/sn0-pre") - 40)) \
+    "$TEST_DIR/sn0-post"
+
+# Set sn1's extra data size to 42
+poke_file "$TEST_DIR/sn1-pre" 36 '\x00\x00\x00\x2a'
+truncate -s 42 "$TEST_DIR/sn1-extra"
+poke_file "$TEST_DIR/sn1-extra" 16 'very important data'
+# Grow sn1-post to pad
+truncate -s $(($(snapshot_table_entry_size "$TEST_DIR/sn1-pre") - 82)) \
+    "$TEST_DIR/sn1-post"
+
+# Set sn2's extra data size to 8
+poke_file "$TEST_DIR/sn2-pre" 36 '\x00\x00\x00\x08'
+truncate -s 8 "$TEST_DIR/sn2-extra"
+# Grow sn2-post to pad
+truncate -s $(($(snapshot_table_entry_size "$TEST_DIR/sn2-pre") - 48)) \
+    "$TEST_DIR/sn2-post"
+
+# Construct snapshot table
+cat "$TEST_DIR"/sn0-{pre,extra,post} \
+    "$TEST_DIR"/sn1-{pre,extra,post} \
+    "$TEST_DIR"/sn2-{pre,extra,post} \
+    | dd of="$TEST_IMG.v2.orig" bs=1 seek=$sn_table_ofs conv=notrunc \
+          &> /dev/null
+
+# Done!
+TEST_IMG="$TEST_IMG.v2.orig" _check_test_img
+print_snapshot_table "$TEST_IMG.v2.orig"
+
+echo
+echo '=== Upgrade to v3 ==='
+echo
+
+cp "$TEST_IMG.v2.orig" "$TEST_IMG.v3.orig"
+$QEMU_IMG amend -o compat=1.1 "$TEST_IMG.v3.orig"
+TEST_IMG="$TEST_IMG.v3.orig" _check_test_img
+print_snapshot_table "$TEST_IMG.v3.orig"
+
+echo
+echo '=== Repair botched v3 ==='
+echo
+
+# Force the v2 file to be v3.  v3 requires each snapshot table entry
+# to have at least 16 bytes of extra data, so it will not comply with
+# the qcow2 v3 specification; but we can fix that.
+cp "$TEST_IMG.v2.orig" "$TEST_IMG"
+
+# Set version
+poke_file "$TEST_IMG" 4 '\x00\x00\x00\x03'
+# Increase header length (necessary for v3)
+poke_file "$TEST_IMG" 100 '\x00\x00\x00\x68'
+# Set refcount order (necessary for v3)
+poke_file "$TEST_IMG" 96 '\x00\x00\x00\x04'
+
+_check_test_img -r all
+print_snapshot_table "$TEST_IMG"
+
+
+# From now on, just test the qcow2 version we are supposed to test.
+# (v3 by default, v2 by choice through $IMGOPTS.)
+# That works because we always write all known extra data when
+# updating the snapshot table, independent of the version.
+
+if echo "$IMGOPTS" | grep -q 'compat=\(0\.10\|v2\)' 2> /dev/null; then
+    subver=v2
+else
+    subver=v3
+fi
+
+echo
+echo '=== Add new snapshot ==='
+echo
+
+cp "$TEST_IMG.$subver.orig" "$TEST_IMG"
+$QEMU_IMG snapshot -c sn3 "$TEST_IMG"
+_check_test_img
+print_snapshot_table "$TEST_IMG"
+
+echo
+echo '=== Remove different snapshots ==='
+
+for sn in sn0 sn1 sn2; do
+    echo
+    echo "--- $sn ---"
+
+    cp "$TEST_IMG.$subver.orig" "$TEST_IMG"
+    $QEMU_IMG snapshot -d $sn "$TEST_IMG"
+    _check_test_img
+    print_snapshot_table "$TEST_IMG"
+done
+
+echo
+echo '=== Reject too much unknown extra data ==='
+echo
+
+cp "$TEST_IMG.$subver.orig" "$TEST_IMG"
+$QEMU_IMG snapshot -c sn3 "$TEST_IMG"
+
+sn_table_ofs=$(peek_file_be "$TEST_IMG" 64 8)
+sn0_ofs=$sn_table_ofs
+sn1_ofs=$((sn0_ofs + $(snapshot_table_entry_size "$TEST_IMG" $sn0_ofs)))
+sn2_ofs=$((sn1_ofs + $(snapshot_table_entry_size "$TEST_IMG" $sn1_ofs)))
+sn3_ofs=$((sn2_ofs + $(snapshot_table_entry_size "$TEST_IMG" $sn2_ofs)))
+
+# 64 kB of extra data should be rejected
+# (Note that this also induces a refcount error, because it spills
+# over to the next cluster.  That's a good way to test that we can
+# handle simultaneous snapshot table and refcount errors.)
+poke_file "$TEST_IMG" $((sn3_ofs + 36)) '\x00\x01\x00\x00'
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Should be repairable
+_check_test_img -r all
+
+echo
+echo '=== Snapshot table too big ==='
+echo
+
+sn_table_ofs=$(peek_file_be "$TEST_IMG.v3.orig" 64 8)
+
+# Fill a snapshot with 1 kB of extra data, a 65535-char ID, and a
+# 65535-char name, and repeat it as many times as necessary to fill
+# 64 MB (the maximum supported by qemu)
+
+touch "$TEST_DIR/sn0"
+
+# Full size (fixed + extra + ID + name + padding)
+sn_size=$((40 + 1024 + 65535 + 65535 + 2))
+
+# We only need the fixed part, though.
+truncate -s 40 "$TEST_DIR/sn0"
+
+# 65535-char ID string
+poke_file "$TEST_DIR/sn0" 12 '\xff\xff'
+# 65535-char name
+poke_file "$TEST_DIR/sn0" 14 '\xff\xff'
+# 1 kB of extra data
+poke_file "$TEST_DIR/sn0" 36 '\x00\x00\x04\x00'
+
+# Create test image
+_make_test_img 64M
+
+# Hook up snapshot table somewhere safe (at 1 MB)
+poke_file "$TEST_IMG" 64 '\x00\x00\x00\x00\x00\x10\x00\x00'
+
+offset=1048576
+size_written=0
+sn_count=0
+while [ $size_written -le $((64 * 1048576)) ]; do
+    dd if="$TEST_DIR/sn0" of="$TEST_IMG" bs=1 seek=$offset conv=notrunc \
+        &> /dev/null
+    offset=$((offset + sn_size))
+    size_written=$((size_written + sn_size))
+    sn_count=$((sn_count + 1))
+done
+truncate -s "$offset" "$TEST_IMG"
+
+# Give the last snapshot (the one to be removed) an L1 table so we can
+# see how that is handled when repairing the image
+# (Put it two clusters before 1 MB, and one L2 table one cluster
+# before 1 MB)
+poke_file "$TEST_IMG" $((offset - sn_size + 0)) \
+    '\x00\x00\x00\x00\x00\x0e\x00\x00'
+poke_file "$TEST_IMG" $((offset - sn_size + 8)) \
+    '\x00\x00\x00\x01'
+
+# Hook up the L2 table
+poke_file "$TEST_IMG" $((1048576 - 2 * 65536)) \
+    '\x80\x00\x00\x00\x00\x0f\x00\x00'
+
+# Make sure all of the clusters we just hooked up are allocated:
+# - The snapshot table
+# - The last snapshot's L1 and L2 table
+refblock0_allocate $((1048576 - 2 * 65536)) $offset
+
+poke_file "$TEST_IMG" 60 \
+    "$(printf '%08x' $sn_count | sed -e 's/\(..\)/\\x\1/g')"
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Should be repairable
+_check_test_img -r all
+
+echo
+echo "$((sn_count - 1)) snapshots should remain:"
+echo "  qemu-img info reports $(_img_info | grep -c '^ \{34\}') snapshots"
+echo "  Image header reports $(peek_file_be "$TEST_IMG" 60 4) snapshots"
+
+echo
+echo '=== Snapshot table too big with one entry with too much extra data ==='
+echo
+
+# For this test, we reuse the image from the previous case, which has
+# a snapshot table that is right at the limit.
+# Our layout looks like this:
+# - (a number of snapshot table entries)
+# - One snapshot with $extra_data_size extra data
+# - One normal snapshot that breaks the 64 MB boundary
+# - One normal snapshot beyond the 64 MB boundary
+#
+# $extra_data_size is calculated so that simply by virtue of it
+# decreasing to 1 kB, the penultimate snapshot will fit into the 64 MB
+# limit again.  The final snapshot will always be beyond the limit, so
+# that we can see that the repair algorithm does still determine the
+# limit to be somewhere, even when truncating one snapshot's extra
+# data.
+
+# The last case has removed the last snapshot, so calculate
+# $old_offset to get the current image's real length
+old_offset=$((offset - sn_size))
+
+# The layout from the previous test had one snapshot beyond the 64 MB
+# limit; we want the same (after the oversized extra data has been
+# truncated to 1 kB), so we drop the last three snapshots and
+# construct them from scratch.
+offset=$((offset - 3 * sn_size))
+sn_count=$((sn_count - 3))
+
+# Assuming we had already written one of the three snapshots
+# (necessary so we can calculate $extra_data_size next).
+size_written=$((size_written - 2 * sn_size))
+
+# Increase the extra data size so we go past the limit
+# (The -1024 comes from the 1 kB of extra data we already have)
+extra_data_size=$((64 * 1048576 + 8 - sn_size - (size_written - 1024)))
+
+poke_file "$TEST_IMG" $((offset + 36)) \
+    "$(printf '%08x' $extra_data_size | sed -e 's/\(..\)/\\x\1/g')"
+
+offset=$((offset + sn_size - 1024 + extra_data_size))
+size_written=$((size_written - 1024 + extra_data_size))
+sn_count=$((sn_count + 1))
+
+# Write the two normal snapshots
+for ((i = 0; i < 2; i++)); do
+    dd if="$TEST_DIR/sn0" of="$TEST_IMG" bs=1 seek=$offset conv=notrunc \
+        &> /dev/null
+    offset=$((offset + sn_size))
+    size_written=$((size_written + sn_size))
+    sn_count=$((sn_count + 1))
+
+    if [ $i = 0 ]; then
+        # Check that the penultimate snapshot is beyond the 64 MB limit
+        echo "Snapshot table size should equal $((64 * 1048576 + 8)):" \
+            $size_written
+        echo
+    fi
+done
+
+truncate -s $offset "$TEST_IMG"
+refblock0_allocate $old_offset $offset
+
+poke_file "$TEST_IMG" 60 \
+    "$(printf '%08x' $sn_count | sed -e 's/\(..\)/\\x\1/g')"
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Just truncating the extra data should be sufficient to shorten the
+# snapshot table so only one snapshot exceeds the 64 MB limit
+_check_test_img -r all
+
+echo
+echo '=== Too many snapshots ==='
+echo
+
+# Create a v2 image, for speed's sake: All-zero snapshot table entries
+# are only valid in v2.
+IMGOPTS='compat=0.10' _make_test_img 64M
+
+# Hook up snapshot table somewhere safe (at 1 MB)
+poke_file "$TEST_IMG" 64 '\x00\x00\x00\x00\x00\x10\x00\x00'
+# "Create" more than 65536 snapshots (twice that many here)
+poke_file "$TEST_IMG" 60 '\x00\x02\x00\x00'
+
+# 40-byte all-zero snapshot table entries are valid snapshots, but
+# only in v2 (v3 needs 16 bytes of extra data, so we would have to
+# write 131072x '\x10').
+truncate -s $((1048576 + 40 * 131072)) "$TEST_IMG"
+
+# But let us give one of the snapshots to be removed an L1 table so
+# we can see how that is handled when repairing the image.
+# (Put it two clusters before 1 MB, and one L2 table one cluster
+# before 1 MB)
+poke_file "$TEST_IMG" $((1048576 + 40 * 65536 + 0)) \
+    '\x00\x00\x00\x00\x00\x0e\x00\x00'
+poke_file "$TEST_IMG" $((1048576 + 40 * 65536 + 8)) \
+    '\x00\x00\x00\x01'
+
+# Hook up the L2 table
+poke_file "$TEST_IMG" $((1048576 - 2 * 65536)) \
+    '\x80\x00\x00\x00\x00\x0f\x00\x00'
+
+# Make sure all of the clusters we just hooked up are allocated:
+# - The snapshot table
+# - The L1 and L2 table of the snapshot entry set up above
+refblock0_allocate $((1048576 - 2 * 65536)) $((1048576 + 40 * 131072))
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Should be repairable
+_check_test_img -r all
+
+echo
+echo '65536 snapshots should remain:'
+echo "  qemu-img info reports $(_img_info | grep -c '^ \{34\}') snapshots"
+echo "  Image header reports $(peek_file_be "$TEST_IMG" 60 4) snapshots"
+
+# success, all done
+echo "*** done"
+status=0
diff --git a/tests/qemu-iotests/261.out b/tests/qemu-iotests/261.out
new file mode 100644
index 0000000..2600354
--- /dev/null
+++ b/tests/qemu-iotests/261.out
@@ -0,0 +1,346 @@
+QA output created by 261
+
+=== Create v2 template ===
+
+Formatting 'TEST_DIR/t.IMGFMT.v2.orig', fmt=IMGFMT size=67108864
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT.v2.orig:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 0
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 8
+    VM state size: 0
+
+=== Upgrade to v3 ===
+
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT.v3.orig:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+=== Repair botched v3 ===
+
+Repairing snapshot table entry 0 is incomplete
+Repairing snapshot table entry 2 is incomplete
+The following inconsistencies were found and repaired:
+
+    0 leaked clusters
+    2 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+=== Add new snapshot ===
+
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [3]
+    ID: 4
+    Name: sn3
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+=== Remove different snapshots ===
+
+--- sn0 ---
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [1]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+--- sn1 ---
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+--- sn2 ---
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+
+=== Reject too much unknown extra data ===
+
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Too much extra metadata in snapshot table entry 3
+You can force-remove this extra metadata with qemu-img check -r all
+
+qemu-img: ERROR failed to read the snapshot table: Too much extra metadata in snapshot table entry 3
+You can force-remove this extra metadata with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding too much extra metadata in snapshot table entry 3 (65536 > 1024)
+ERROR cluster 10 refcount=0 reference=1
+Rebuilding refcount structure
+Repairing cluster 1 refcount=1 reference=0
+Repairing cluster 2 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    0 leaked clusters
+    2 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+=== Snapshot table too big ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Snapshot table is too big
+You can force-remove all 1 overhanging snapshots with qemu-img check -r all
+
+qemu-img: ERROR failed to read the snapshot table: Snapshot table is too big
+You can force-remove all 1 overhanging snapshots with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding 1 overhanging snapshots (snapshot table is too big)
+Leaked cluster 14 refcount=1 reference=0
+Leaked cluster 15 refcount=1 reference=0
+Leaked cluster 1039 refcount=1 reference=0
+Leaked cluster 1040 refcount=1 reference=0
+Repairing cluster 14 refcount=1 reference=0
+Repairing cluster 15 refcount=1 reference=0
+Repairing cluster 1039 refcount=1 reference=0
+Repairing cluster 1040 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    4 leaked clusters
+    1 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+507 snapshots should remain:
+  qemu-img info reports 507 snapshots
+  Image header reports 507 snapshots
+
+=== Snapshot table too big with one entry with too much extra data ===
+
+Snapshot table size should equal 67108872: 67108872
+
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Too much extra metadata in snapshot table entry 505
+You can force-remove this extra metadata with qemu-img check -r all
+
+qemu-img: ERROR failed to read the snapshot table: Too much extra metadata in snapshot table entry 505
+You can force-remove this extra metadata with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding too much extra metadata in snapshot table entry 505 (116944 > 1024)
+Discarding 1 overhanging snapshots (snapshot table is too big)
+Leaked cluster 1041 refcount=1 reference=0
+Leaked cluster 1042 refcount=1 reference=0
+Repairing cluster 1041 refcount=1 reference=0
+Repairing cluster 1042 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    2 leaked clusters
+    2 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+=== Too many snapshots ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Snapshot table too large
+
+qemu-img: ERROR snapshot table too large
+You can force-remove all 65536 overhanging snapshots with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding 65536 overhanging snapshots
+Leaked cluster 14 refcount=1 reference=0
+Leaked cluster 15 refcount=1 reference=0
+Leaked cluster 56 refcount=1 reference=0
+Leaked cluster 57 refcount=1 reference=0
+Leaked cluster 58 refcount=1 reference=0
+Leaked cluster 59 refcount=1 reference=0
+Leaked cluster 60 refcount=1 reference=0
+Leaked cluster 61 refcount=1 reference=0
+Leaked cluster 62 refcount=1 reference=0
+Leaked cluster 63 refcount=1 reference=0
+Leaked cluster 64 refcount=1 reference=0
+Leaked cluster 65 refcount=1 reference=0
+Leaked cluster 66 refcount=1 reference=0
+Leaked cluster 67 refcount=1 reference=0
+Leaked cluster 68 refcount=1 reference=0
+Leaked cluster 69 refcount=1 reference=0
+Leaked cluster 70 refcount=1 reference=0
+Leaked cluster 71 refcount=1 reference=0
+Leaked cluster 72 refcount=1 reference=0
+Leaked cluster 73 refcount=1 reference=0
+Leaked cluster 74 refcount=1 reference=0
+Leaked cluster 75 refcount=1 reference=0
+Leaked cluster 76 refcount=1 reference=0
+Leaked cluster 77 refcount=1 reference=0
+Leaked cluster 78 refcount=1 reference=0
+Leaked cluster 79 refcount=1 reference=0
+Leaked cluster 80 refcount=1 reference=0
+Leaked cluster 81 refcount=1 reference=0
+Leaked cluster 82 refcount=1 reference=0
+Leaked cluster 83 refcount=1 reference=0
+Leaked cluster 84 refcount=1 reference=0
+Leaked cluster 85 refcount=1 reference=0
+Leaked cluster 86 refcount=1 reference=0
+Leaked cluster 87 refcount=1 reference=0
+Leaked cluster 88 refcount=1 reference=0
+Leaked cluster 89 refcount=1 reference=0
+Leaked cluster 90 refcount=1 reference=0
+Leaked cluster 91 refcount=1 reference=0
+Leaked cluster 92 refcount=1 reference=0
+Leaked cluster 93 refcount=1 reference=0
+Leaked cluster 94 refcount=1 reference=0
+Leaked cluster 95 refcount=1 reference=0
+Repairing cluster 14 refcount=1 reference=0
+Repairing cluster 15 refcount=1 reference=0
+Repairing cluster 56 refcount=1 reference=0
+Repairing cluster 57 refcount=1 reference=0
+Repairing cluster 58 refcount=1 reference=0
+Repairing cluster 59 refcount=1 reference=0
+Repairing cluster 60 refcount=1 reference=0
+Repairing cluster 61 refcount=1 reference=0
+Repairing cluster 62 refcount=1 reference=0
+Repairing cluster 63 refcount=1 reference=0
+Repairing cluster 64 refcount=1 reference=0
+Repairing cluster 65 refcount=1 reference=0
+Repairing cluster 66 refcount=1 reference=0
+Repairing cluster 67 refcount=1 reference=0
+Repairing cluster 68 refcount=1 reference=0
+Repairing cluster 69 refcount=1 reference=0
+Repairing cluster 70 refcount=1 reference=0
+Repairing cluster 71 refcount=1 reference=0
+Repairing cluster 72 refcount=1 reference=0
+Repairing cluster 73 refcount=1 reference=0
+Repairing cluster 74 refcount=1 reference=0
+Repairing cluster 75 refcount=1 reference=0
+Repairing cluster 76 refcount=1 reference=0
+Repairing cluster 77 refcount=1 reference=0
+Repairing cluster 78 refcount=1 reference=0
+Repairing cluster 79 refcount=1 reference=0
+Repairing cluster 80 refcount=1 reference=0
+Repairing cluster 81 refcount=1 reference=0
+Repairing cluster 82 refcount=1 reference=0
+Repairing cluster 83 refcount=1 reference=0
+Repairing cluster 84 refcount=1 reference=0
+Repairing cluster 85 refcount=1 reference=0
+Repairing cluster 86 refcount=1 reference=0
+Repairing cluster 87 refcount=1 reference=0
+Repairing cluster 88 refcount=1 reference=0
+Repairing cluster 89 refcount=1 reference=0
+Repairing cluster 90 refcount=1 reference=0
+Repairing cluster 91 refcount=1 reference=0
+Repairing cluster 92 refcount=1 reference=0
+Repairing cluster 93 refcount=1 reference=0
+Repairing cluster 94 refcount=1 reference=0
+Repairing cluster 95 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    42 leaked clusters
+    65536 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+65536 snapshots should remain:
+  qemu-img info reports 65536 snapshots
+  Image header reports 65536 snapshots
+*** done
diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264
index c8cd97a..1313664 100755
--- a/tests/qemu-iotests/264
+++ b/tests/qemu-iotests/264
@@ -24,6 +24,8 @@ import iotests
 from iotests import qemu_img_create, qemu_io_silent_check, file_path, \
         qemu_nbd_popen, log
 
+iotests.verify_image_format(supported_fmts=['qcow2'])
+
 disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
 nbd_uri = 'nbd+unix:///?socket=' + nbd_sock
 size = 5 * 1024 * 1024
diff --git a/tests/qemu-iotests/267 b/tests/qemu-iotests/267
index d37a67c..170e173 100755
--- a/tests/qemu-iotests/267
+++ b/tests/qemu-iotests/267
@@ -29,7 +29,7 @@ status=1	# failure is the default!
 _cleanup()
 {
     _cleanup_test_img
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -143,7 +143,7 @@ echo
 
 IMGOPTS="backing_file=$TEST_IMG.base" _make_test_img $size
 cat <<EOF |
-nbd_server_start unix:$TEST_DIR/nbd
+nbd_server_start unix:$SOCK_DIR/nbd
 nbd_server_add -w backing-fmt
 savevm snap0
 info snapshots
diff --git a/tests/qemu-iotests/267.out b/tests/qemu-iotests/267.out
index 9d812e3..8dddb4b 100644
--- a/tests/qemu-iotests/267.out
+++ b/tests/qemu-iotests/267.out
@@ -161,7 +161,7 @@ Internal snapshots on backing file:
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
 Testing: -blockdev driver=file,filename=TEST_DIR/t.IMGFMT.base,node-name=backing-file -blockdev driver=IMGFMT,file=backing-file,node-name=backing-fmt -blockdev driver=file,filename=TEST_DIR/t.IMGFMT,node-name=file -blockdev driver=IMGFMT,file=file,backing=backing-fmt,node-name=fmt
 QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) nbd_server_start unix:TEST_DIR/nbd
+(qemu) nbd_server_start unix:SOCK_DIR/nbd
 (qemu) nbd_server_add -w backing-fmt
 (qemu) savevm snap0
 (qemu) info snapshots
diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index 588c453..71fe388 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -97,6 +97,7 @@ IMGFMT        -- $FULL_IMGFMT_DETAILS
 IMGPROTO      -- $IMGPROTO
 PLATFORM      -- $FULL_HOST_DETAILS
 TEST_DIR      -- $TEST_DIR
+SOCK_DIR      -- $SOCK_DIR
 SOCKET_SCM_HELPER -- $SOCKET_SCM_HELPER
 
 EOF
@@ -116,10 +117,14 @@ set_prog_path()
 if [ -z "$TEST_DIR" ]; then
         TEST_DIR=$PWD/scratch
 fi
+mkdir -p "$TEST_DIR" || _init_error 'Failed to create TEST_DIR'
 
-if [ ! -e "$TEST_DIR" ]; then
-        mkdir "$TEST_DIR"
+tmp_sock_dir=false
+if [ -z "$SOCK_DIR" ]; then
+    SOCK_DIR=$(mktemp -d)
+    tmp_sock_dir=true
 fi
+mkdir -p "$SOCK_DIR" || _init_error 'Failed to create SOCK_DIR'
 
 diff="diff -u"
 verbose=false
@@ -534,6 +539,7 @@ if [ -z "$SAMPLE_IMG_DIR" ]; then
 fi
 
 export TEST_DIR
+export SOCK_DIR
 export SAMPLE_IMG_DIR
 
 if [ -s $tmp.list ]
@@ -716,6 +722,11 @@ END        { if (NR > 0) {
     rm -f "${TEST_DIR}"/*.out "${TEST_DIR}"/*.err "${TEST_DIR}"/*.time
     rm -f "${TEST_DIR}"/check.pid "${TEST_DIR}"/check.sts
     rm -f $tmp.*
+
+    if $tmp_sock_dir
+    then
+        rm -rf "$SOCK_DIR"
+    fi
 }
 
 trap "_wrapup; exit \$status" 0 1 2 3 15
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index 9f418b4..f870e00 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -43,7 +43,8 @@ _filter_qom_path()
 # replace occurrences of the actual TEST_DIR value with TEST_DIR
 _filter_testdir()
 {
-    $SED -e "s#$TEST_DIR/#TEST_DIR/#g"
+    $SED -e "s#$TEST_DIR/#TEST_DIR/#g" \
+         -e "s#$SOCK_DIR/#SOCK_DIR/#g"
 }
 
 # replace occurrences of the actual IMGFMT value with IMGFMT
@@ -124,6 +125,7 @@ _filter_img_create()
     $SED -e "s#$REMOTE_TEST_DIR#TEST_DIR#g" \
         -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
         -e "s#$TEST_DIR#TEST_DIR#g" \
+        -e "s#$SOCK_DIR#SOCK_DIR#g" \
         -e "s#$IMGFMT#IMGFMT#g" \
         -e 's#nbd:127.0.0.1:10810#TEST_DIR/t.IMGFMT#g' \
         -e "s# encryption=off##g" \
@@ -160,6 +162,7 @@ _filter_img_info()
     $SED -e "s#$REMOTE_TEST_DIR#TEST_DIR#g" \
         -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
         -e "s#$TEST_DIR#TEST_DIR#g" \
+        -e "s#$SOCK_DIR#SOCK_DIR#g" \
         -e "s#$IMGFMT#IMGFMT#g" \
         -e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \
         -e 's#json.*vdisk-id.*vxhs"}}#TEST_DIR/t.IMGFMT#' \
@@ -218,7 +221,7 @@ _filter_nbd()
     # Filter out the TCP port number since this changes between runs.
     $SED -e '/nbd\/.*\.c:/d' \
         -e 's#127\.0\.0\.1:[0-9]*#127.0.0.1:PORT#g' \
-        -e "s#?socket=$TEST_DIR#?socket=TEST_DIR#g" \
+        -e "s#?socket=$SOCK_DIR#?socket=SOCK_DIR#g" \
         -e 's#\(foo\|PORT/\?\|.sock\): Failed to .*$#\1#'
 }
 
diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
index 24b01b6..a8cae8f 100644
--- a/tests/qemu-iotests/common.nbd
+++ b/tests/qemu-iotests/common.nbd
@@ -19,7 +19,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-nbd_unix_socket="${TEST_DIR}/qemu-nbd.sock"
+nbd_unix_socket="${SOCK_DIR}/qemu-nbd.sock"
 nbd_tcp_addr="127.0.0.1"
 nbd_pid_file="${TEST_DIR}/qemu-nbd.pid"
 nbd_stderr_fifo="${TEST_DIR}/qemu-nbd.fifo"
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 12b4751..fa7bae2 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -53,6 +53,42 @@ poke_file()
     printf "$3" | dd "of=$1" bs=1 "seek=$2" conv=notrunc &>/dev/null
 }
 
+# peek_file_le 'test.img' 512 2 => 65534
+# Print the $3-byte little-endian unsigned integer at byte offset $2 of $1.
+peek_file_le()
+{
+    local val=0 shift=0 byte
+
+    # od's --endian option is a GNU extension, so read single bytes and
+    # assemble the value by hand, least significant byte first.
+    for byte in $(od -j"$2" -N"$3" -An -v -tu1 "$1"); do
+        val=$(( val | (byte << shift) ))
+        shift=$((shift + 8))
+    done
+    printf '%llu\n' "$val"
+}
+
+# peek_file_be 'test.img' 512 2 => 65279
+# Print the $3-byte big-endian unsigned integer at byte offset $2 of $1.
+peek_file_be()
+{
+    local val=0 byte
+
+    # od's --endian option is a GNU extension, so read single bytes and
+    # assemble the value by hand, most significant byte first.
+    for byte in $(od -j"$2" -N"$3" -An -v -tu1 "$1"); do
+        val=$(( (val << 8) | byte ))
+    done
+    printf '%llu\n' "$val"
+}
+
+# peek_file_raw 'test.img' 512 2 => '\xff\xfe'
+# Write the $3 raw bytes found at byte offset $2 of $1 to stdout.
+peek_file_raw()
+{
+    dd if="$1" bs=1 skip="$2" count="$3" status=none
+}
+
 
 if ! . ./common.config
     then
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index af322af..2887160 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -274,6 +274,7 @@
 257 rw
 258 rw quick
 260 rw quick
+261 rw
 262 rw quick migration
 263 rw quick
 264 rw
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 709def4..075f473 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -57,6 +57,7 @@ qemu_opts = os.environ.get('QEMU_OPTIONS', '').strip().split(' ')
 imgfmt = os.environ.get('IMGFMT', 'raw')
 imgproto = os.environ.get('IMGPROTO', 'file')
 test_dir = os.environ.get('TEST_DIR')
+sock_dir = os.environ.get('SOCK_DIR')
 output_dir = os.environ.get('OUTPUT_DIR', '.')
 cachemode = os.environ.get('CACHEMODE')
 qemu_default_machine = os.environ.get('QEMU_DEFAULT_MACHINE')
@@ -385,10 +386,10 @@ class FilePaths(object):
             qemu_img('create', img_path, '1G')
         # migration_sock_path is automatically deleted
     """
-    def __init__(self, names):
+    def __init__(self, names, base_dir=test_dir):
         self.paths = []
         for name in names:
-            self.paths.append(os.path.join(test_dir, file_pattern(name)))
+            self.paths.append(os.path.join(base_dir, file_pattern(name)))
 
     def __enter__(self):
         return self.paths
@@ -405,8 +406,8 @@ class FilePath(FilePaths):
     """
     FilePath is a specialization of FilePaths that takes a single filename.
     """
-    def __init__(self, name):
-        super(FilePath, self).__init__([name])
+    def __init__(self, name, base_dir=test_dir):
+        super(FilePath, self).__init__([name], base_dir)
 
     def __enter__(self):
         return self.paths[0]
@@ -419,7 +420,7 @@ def file_path_remover():
             pass
 
 
-def file_path(*names):
+def file_path(*names, base_dir=test_dir):
     ''' Another way to get auto-generated filename that cleans itself up.
 
     Use is as simple as:
@@ -435,7 +436,7 @@ def file_path(*names):
     paths = []
     for name in names:
         filename = file_pattern(name)
-        path = os.path.join(test_dir, filename)
+        path = os.path.join(base_dir, filename)
         file_path_remover.paths.append(path)
         paths.append(path)
 
@@ -456,7 +457,8 @@ class VM(qtest.QEMUQtestMachine):
         name = "qemu%s-%d" % (path_suffix, os.getpid())
         super(VM, self).__init__(qemu_prog, qemu_opts, name=name,
                                  test_dir=test_dir,
-                                 socket_scm_helper=socket_scm_helper)
+                                 socket_scm_helper=socket_scm_helper,
+                                 sock_dir=sock_dir)
         self._num_drives = 0
 
     def add_object(self, opts):
@@ -838,6 +840,11 @@ class QMPTestCase(unittest.TestCase):
             return self.pause_wait(job_id)
         return result
 
+    def case_skip(self, reason):
+        '''Mark this test case as not run, then skip it via skipTest()'''
+        case_notrun(reason)
+        self.skipTest(reason)
+
 
 def notrun(reason):
     '''Skip this test suite'''
@@ -849,7 +856,11 @@ def notrun(reason):
     sys.exit(0)
 
 def case_notrun(reason):
-    '''Skip this test case'''
+    '''Mark this test case as not having been run (without actually
+    skipping it, that is left to the caller).  See
+    QMPTestCase.case_skip() for a variant that actually skips the
+    current test case.'''
+
     # Each test in qemu-iotests has a number ("seq")
     seq = os.path.basename(sys.argv[0])
 
@@ -912,22 +923,34 @@ def qemu_pipe(*args):
 def supported_formats(read_only=False):
     '''Set 'read_only' to True to check ro-whitelist
        Otherwise, rw-whitelist is checked'''
-    format_message = qemu_pipe("-drive", "format=help")
-    line = 1 if read_only else 0
-    return format_message.splitlines()[line].split(":")[1].split()
+
+    if not hasattr(supported_formats, "formats"):
+        supported_formats.formats = {}  # cache, keyed by read_only
+
+    if read_only not in supported_formats.formats:  # not cached yet
+        format_message = qemu_pipe("-drive", "format=help")
+        line = 1 if read_only else 0  # index of the output line to parse
+        supported_formats.formats[read_only] = \
+            format_message.splitlines()[line].split(":")[1].split()
+
+    return supported_formats.formats[read_only]
 
 def skip_if_unsupported(required_formats=[], read_only=False):
     '''Skip Test Decorator
        Runs the test if all the required formats are whitelisted'''
     def skip_test_decorator(func):
-        def func_wrapper(*args, **kwargs):
-            usf_list = list(set(required_formats) -
-                            set(supported_formats(read_only)))
+        def func_wrapper(test_case: QMPTestCase, *args, **kwargs):
+            if callable(required_formats):
+                fmts = required_formats(test_case)
+            else:
+                fmts = required_formats
+
+            usf_list = list(set(fmts) - set(supported_formats(read_only)))
             if usf_list:
-                case_notrun('{}: formats {} are not whitelisted'.format(
-                    args[0], usf_list))
+                test_case.case_skip('{}: formats {} are not whitelisted'.format(
+                    test_case, usf_list))
             else:
-                return func(*args, **kwargs)
+                return func(test_case, *args, **kwargs)
         return func_wrapper
     return skip_test_decorator
 
@@ -950,8 +973,15 @@ def execute_unittest(output, verbosity, debug):
         unittest.main(testRunner=runner)
     finally:
         if not debug:
-            sys.stderr.write(re.sub(r'Ran (\d+) tests? in [\d.]+s',
-                                    r'Ran \1 tests', output.getvalue()))
+            out = output.getvalue()
+            out = re.sub(r'Ran (\d+) tests? in [\d.]+s', r'Ran \1 tests', out)
+
+            # Hide skipped tests from the reference output: drop the summary
+            # suffix and turn the 's' markers on the first line into '.'.
+            out = re.sub(r'OK \(skipped=\d+\)', 'OK', out)
+            # partition() copes with output lacking a newline; split() raised
+            head, newline, rest = out.partition('\n')
+            sys.stderr.write(head.replace('s', '.') + newline + rest)
 
 def execute_test(test_function=None,
                  supported_fmts=[], supported_oses=['linux'],
diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index cfe30ba..0c86180 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -45,7 +45,7 @@ static int coroutine_fn bdrv_test_co_pdiscard(BlockDriverState *bs,
 }
 
 static int coroutine_fn
-bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset,
+bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
                       PreallocMode prealloc, Error **errp)
 {
     return 0;
@@ -185,18 +185,18 @@ static void test_sync_op_truncate(BdrvChild *c)
     int ret;
 
     /* Normal success path */
-    ret = bdrv_truncate(c, 65536, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
     g_assert_cmpint(ret, ==, 0);
 
     /* Early error: Negative offset */
-    ret = bdrv_truncate(c, -2, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, NULL);
     g_assert_cmpint(ret, ==, -EINVAL);
 
     /* Error: Read-only image */
     c->bs->read_only = true;
     c->bs->open_flags &= ~BDRV_O_RDWR;
 
-    ret = bdrv_truncate(c, 65536, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
     g_assert_cmpint(ret, ==, -EACCES);
 
     c->bs->read_only = false;
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 41bf59d..df124af 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -37,6 +37,7 @@ util-obj-y += rcu.o
 util-obj-$(CONFIG_MEMBARRIER) += sys_membarrier.o
 util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
+util-obj-y += qemu-co-shared-resource.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
 util-obj-y += timed-average.o
diff --git a/util/hbitmap.c b/util/hbitmap.c
index 66db87c..242c6e5 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -387,6 +387,10 @@ void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
     uint64_t first, n;
     uint64_t last = start + count - 1;
 
+    if (count == 0) {
+        return;
+    }
+
     trace_hbitmap_set(hb, start, count,
                       start >> hb->granularity, last >> hb->granularity);
 
@@ -478,6 +482,10 @@ void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
     uint64_t last = start + count - 1;
     uint64_t gran = 1ULL << hb->granularity;
 
+    if (count == 0) {
+        return;
+    }
+
     assert(QEMU_IS_ALIGNED(start, gran));
     assert(QEMU_IS_ALIGNED(count, gran) || (start + count == hb->orig_size));
 
diff --git a/util/qemu-co-shared-resource.c b/util/qemu-co-shared-resource.c
new file mode 100644
index 0000000..1c83cd9
--- /dev/null
+++ b/util/qemu-co-shared-resource.c
@@ -0,0 +1,76 @@
+/*
+ * Helper functionality for distributing a fixed total amount of
+ * an abstract resource among multiple coroutines.
+ *
+ * Copyright (c) 2019 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/co-shared-resource.h"
+
+struct SharedResource {
+    uint64_t total;     /* amount the pool is created with */
+    uint64_t available; /* amount not currently taken; at most @total */
+
+    CoQueue queue;      /* waiters blocked in co_get_from_shres() */
+};
+
+/* Allocate a pool of @total units, all of them initially available. */
+SharedResource *shres_create(uint64_t total)
+{
+    SharedResource *res = g_new0(SharedResource, 1);
+
+    qemu_co_queue_init(&res->queue);
+    res->total = res->available = total;
+    return res;
+}
+
+void shres_destroy(SharedResource *s)
+{
+    assert(s->available == s->total); /* nothing may still be taken */
+    g_free(s);
+}
+
+/* Try to take @n units without blocking; returns true on success. */
+bool co_try_get_from_shres(SharedResource *s, uint64_t n)
+{
+    if (n > s->available) {
+        return false;
+    }
+    s->available -= n;
+    return true;
+}
+
+void coroutine_fn co_get_from_shres(SharedResource *s, uint64_t n)
+{
+    assert(n <= s->total); /* a larger request could never succeed */
+    while (!co_try_get_from_shres(s, n)) {
+        qemu_co_queue_wait(&s->queue, NULL); /* until co_put_to_shres() */
+    }
+}
+
+void coroutine_fn co_put_to_shres(SharedResource *s, uint64_t n)
+{
+    assert(s->total - s->available >= n); /* cannot return more than taken */
+    s->available += n;
+    qemu_co_queue_restart_all(&s->queue); /* let all waiters retry */
+}