aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2019-10-28 14:40:00 +0000
committerPeter Maydell <peter.maydell@linaro.org>2019-10-28 14:40:01 +0000
commitaaffb853359829a37daaf883c773e8320b55c723 (patch)
tree65d111156f4480afe7cb6c4552d490eb1947e3cd /block
parent9bb73502321d46f4d320fa17aa38201445783fc4 (diff)
parentba9c45139e2938b8d20ce407db83a31bc9e5066c (diff)
downloadqemu-aaffb853359829a37daaf883c773e8320b55c723.zip
qemu-aaffb853359829a37daaf883c773e8320b55c723.tar.gz
qemu-aaffb853359829a37daaf883c773e8320b55c723.tar.bz2
Merge remote-tracking branch 'remotes/maxreitz/tags/pull-block-2019-10-28' into staging
Block patches for softfreeze:
- iotest patches
- Improve performance of the mirror block job in write-blocking mode
- Limit memory usage for the backup block job
- Add discard and write-zeroes support to the NVMe host block driver
- Fix a bug in the mirror job
- Prevent the qcow2 driver from creating technically non-compliant qcow2 v3 images (where there is not enough extra data for snapshot table entries)
- Allow callers of bdrv_truncate() (etc.) to determine whether the file must be resized to the exact given size or whether it is OK for block devices not to shrink

# gpg: Signature made Mon 28 Oct 2019 12:13:53 GMT
# gpg: using RSA key 91BEB60A30DB3E8857D11829F407DB0061D5CF40
# gpg: issuer "mreitz@redhat.com"
# gpg: Good signature from "Max Reitz <mreitz@redhat.com>" [full]
# Primary key fingerprint: 91BE B60A 30DB 3E88 57D1 1829 F407 DB00 61D5 CF40

* remotes/maxreitz/tags/pull-block-2019-10-28: (69 commits)
qemu-iotests: restrict 264 to qcow2 only
Revert "qemu-img: Check post-truncation size"
block: Pass truncate exact=true where reasonable
block: Let format drivers pass @exact
block: Evaluate @exact in protocol drivers
block: Add @exact parameter to bdrv_co_truncate()
block: Do not truncate file node when formatting
block/cor: Drop cor_co_truncate()
block: Handle filter truncation like native impl.
iotests: Test qcow2's snapshot table handling
iotests: Add peek_file* functions
qcow2: Fix v3 snapshot table entry compliancy
qcow2: Repair snapshot table with too many entries
qcow2: Fix overly long snapshot tables
qcow2: Keep track of the snapshot table length
qcow2: Fix broken snapshot table entries
qcow2: Add qcow2_check_fix_snapshot_table()
qcow2: Separate qcow2_check_read_snapshot_table()
qcow2: Write v3-compliant snapshot list on upgrade
qcow2: Put qcow2_upgrade() into its own function
...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'block')
-rw-r--r--block/block-backend.c23
-rw-r--r--block/block-copy.c182
-rw-r--r--block/commit.c5
-rw-r--r--block/copy-on-read.c8
-rw-r--r--block/crypto.c8
-rw-r--r--block/file-posix.c11
-rw-r--r--block/file-win32.c3
-rw-r--r--block/gluster.c1
-rw-r--r--block/io.c29
-rw-r--r--block/iscsi.c10
-rw-r--r--block/mirror.c198
-rw-r--r--block/nfs.c2
-rw-r--r--block/nvme.c155
-rw-r--r--block/parallels.c18
-rw-r--r--block/qcow.c9
-rw-r--r--block/qcow2-refcount.c2
-rw-r--r--block/qcow2-snapshot.c323
-rw-r--r--block/qcow2.c200
-rw-r--r--block/qcow2.h17
-rw-r--r--block/qed.c8
-rw-r--r--block/raw-format.c5
-rw-r--r--block/rbd.c1
-rw-r--r--block/sheepdog.c5
-rw-r--r--block/ssh.c3
-rw-r--r--block/trace-events9
-rw-r--r--block/vdi.c2
-rw-r--r--block/vhdx-log.c4
-rw-r--r--block/vhdx.c7
-rw-r--r--block/vmdk.c8
-rw-r--r--block/vpc.c2
30 files changed, 936 insertions, 322 deletions
diff --git a/block/block-backend.c b/block/block-backend.c
index eb22ff3..8b8f2a8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1178,9 +1178,10 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
return ret;
}
-int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
- unsigned int bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
+ unsigned int bytes,
+ QEMUIOVector *qiov, size_t qiov_offset,
+ BdrvRequestFlags flags)
{
int ret;
BlockDriverState *bs;
@@ -1207,11 +1208,19 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
flags |= BDRV_REQ_FUA;
}
- ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+ ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
+ flags);
bdrv_dec_in_flight(bs);
return ret;
}
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+ unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags)
+{
+ return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
+}
+
typedef struct BlkRwCo {
BlockBackend *blk;
int64_t offset;
@@ -2063,15 +2072,15 @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
BDRV_REQ_WRITE_COMPRESSED);
}
-int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
- Error **errp)
+int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
+ PreallocMode prealloc, Error **errp)
{
if (!blk_is_available(blk)) {
error_setg(errp, "No medium inserted");
return -ENOMEDIUM;
}
- return bdrv_truncate(blk->root, offset, prealloc, errp);
+ return bdrv_truncate(blk->root, offset, exact, prealloc, errp);
}
static void blk_pdiscard_entry(void *opaque)
diff --git a/block/block-copy.c b/block/block-copy.c
index 066e3a7..c39cc9c 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -18,6 +18,11 @@
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
+#include "qemu/units.h"
+
+#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
+#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
+#define BLOCK_COPY_MAX_MEM (128 * MiB)
static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
int64_t start,
@@ -61,6 +66,7 @@ void block_copy_state_free(BlockCopyState *s)
}
bdrv_release_dirty_bitmap(s->copy_bitmap);
+ shres_destroy(s->mem);
g_free(s);
}
@@ -71,8 +77,9 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
BlockCopyState *s;
BdrvDirtyBitmap *copy_bitmap;
uint32_t max_transfer =
- MIN_NON_ZERO(INT_MAX, MIN_NON_ZERO(source->bs->bl.max_transfer,
- target->bs->bl.max_transfer));
+ MIN_NON_ZERO(INT_MAX,
+ MIN_NON_ZERO(source->bs->bl.max_transfer,
+ target->bs->bl.max_transfer));
copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
errp);
@@ -89,19 +96,31 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
.cluster_size = cluster_size,
.len = bdrv_dirty_bitmap_size(copy_bitmap),
.write_flags = write_flags,
+ .mem = shres_create(BLOCK_COPY_MAX_MEM),
};
- s->copy_range_size = QEMU_ALIGN_DOWN(max_transfer, cluster_size),
- /*
- * Set use_copy_range, consider the following:
- * 1. Compression is not supported for copy_range.
- * 2. copy_range does not respect max_transfer (it's a TODO), so we factor
- * that in here. If max_transfer is smaller than the job->cluster_size,
- * we do not use copy_range (in that case it's zero after aligning down
- * above).
- */
- s->use_copy_range =
- !(write_flags & BDRV_REQ_WRITE_COMPRESSED) && s->copy_range_size > 0;
+ if (max_transfer < cluster_size) {
+ /*
+ * copy_range does not respect max_transfer. We don't want to bother
+ * with requests smaller than block-copy cluster size, so fallback to
+ * buffered copying (read and write respect max_transfer on their
+ * behalf).
+ */
+ s->use_copy_range = false;
+ s->copy_size = cluster_size;
+ } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
+ /* Compression is not supported for copy_range */
+ s->use_copy_range = false;
+ s->copy_size = MAX(cluster_size, BLOCK_COPY_MAX_BUFFER);
+ } else {
+ /*
+ * copy_range does not respect max_transfer (it's a TODO), so we factor
+ * that in here.
+ */
+ s->use_copy_range = true;
+ s->copy_size = MIN(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+ QEMU_ALIGN_DOWN(max_transfer, cluster_size));
+ }
QLIST_INIT(&s->inflight_reqs);
@@ -120,79 +139,71 @@ void block_copy_set_callbacks(
}
/*
- * Copy range to target with a bounce buffer and return the bytes copied. If
- * error occurred, return a negative error number
+ * block_copy_do_copy
+ *
+ * Do copy of cluster-aligned chunk. @end is allowed to exceed s->len only to
+ * cover last cluster when s->len is not aligned to clusters.
+ *
+ * No sync here: neither bitmap nor intersecting-requests handling, only copy.
+ *
+ * Returns 0 on success.
*/
-static int coroutine_fn block_copy_with_bounce_buffer(BlockCopyState *s,
- int64_t start,
- int64_t end,
- bool *error_is_read,
- void **bounce_buffer)
+static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
+ int64_t start, int64_t end,
+ bool *error_is_read)
{
int ret;
- int nbytes;
+ int nbytes = MIN(end, s->len) - start;
+ void *bounce_buffer = NULL;
assert(QEMU_IS_ALIGNED(start, s->cluster_size));
- bdrv_reset_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
- nbytes = MIN(s->cluster_size, s->len - start);
- if (!*bounce_buffer) {
- *bounce_buffer = qemu_blockalign(s->source->bs, s->cluster_size);
+ assert(QEMU_IS_ALIGNED(end, s->cluster_size));
+ assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));
+
+ if (s->use_copy_range) {
+ ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
+ 0, s->write_flags);
+ if (ret < 0) {
+ trace_block_copy_copy_range_fail(s, start, ret);
+ s->use_copy_range = false;
+ s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
+ /* Fallback to read+write with allocated buffer */
+ } else {
+ goto out;
+ }
}
- ret = bdrv_co_pread(s->source, start, nbytes, *bounce_buffer, 0);
+ /*
+ * In case of failed copy_range request above, we may proceed with buffered
+ * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
+ * be properly limited, so don't care too much.
+ */
+
+ bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
+
+ ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
if (ret < 0) {
- trace_block_copy_with_bounce_buffer_read_fail(s, start, ret);
+ trace_block_copy_read_fail(s, start, ret);
if (error_is_read) {
*error_is_read = true;
}
- goto fail;
+ goto out;
}
- ret = bdrv_co_pwrite(s->target, start, nbytes, *bounce_buffer,
+ ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
s->write_flags);
if (ret < 0) {
- trace_block_copy_with_bounce_buffer_write_fail(s, start, ret);
+ trace_block_copy_write_fail(s, start, ret);
if (error_is_read) {
*error_is_read = false;
}
- goto fail;
+ goto out;
}
- return nbytes;
-fail:
- bdrv_set_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
- return ret;
-
-}
-
-/*
- * Copy range to target and return the bytes copied. If error occurred, return a
- * negative error number.
- */
-static int coroutine_fn block_copy_with_offload(BlockCopyState *s,
- int64_t start,
- int64_t end)
-{
- int ret;
- int nr_clusters;
- int nbytes;
-
- assert(QEMU_IS_ALIGNED(s->copy_range_size, s->cluster_size));
- assert(QEMU_IS_ALIGNED(start, s->cluster_size));
- nbytes = MIN(s->copy_range_size, MIN(end, s->len) - start);
- nr_clusters = DIV_ROUND_UP(nbytes, s->cluster_size);
- bdrv_reset_dirty_bitmap(s->copy_bitmap, start,
- s->cluster_size * nr_clusters);
- ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
- 0, s->write_flags);
- if (ret < 0) {
- trace_block_copy_with_offload_fail(s, start, ret);
- bdrv_set_dirty_bitmap(s->copy_bitmap, start,
- s->cluster_size * nr_clusters);
- return ret;
- }
+out:
+ qemu_vfree(bounce_buffer);
- return nbytes;
+ return ret;
}
/*
@@ -271,7 +282,6 @@ int coroutine_fn block_copy(BlockCopyState *s,
{
int ret = 0;
int64_t end = bytes + start; /* bytes */
- void *bounce_buffer = NULL;
int64_t status_bytes;
BlockCopyInFlightReq req;
@@ -289,7 +299,7 @@ int coroutine_fn block_copy(BlockCopyState *s,
block_copy_inflight_req_begin(s, &req, start, end);
while (start < end) {
- int64_t dirty_end;
+ int64_t next_zero, chunk_end;
if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
trace_block_copy_skip(s, start);
@@ -297,10 +307,14 @@ int coroutine_fn block_copy(BlockCopyState *s,
continue; /* already copied */
}
- dirty_end = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
- (end - start));
- if (dirty_end < 0) {
- dirty_end = end;
+ chunk_end = MIN(end, start + s->copy_size);
+
+ next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
+ chunk_end - start);
+ if (next_zero >= 0) {
+ assert(next_zero > start); /* start is dirty */
+ assert(next_zero < chunk_end); /* no need to do MIN() */
+ chunk_end = next_zero;
}
if (s->skip_unallocated) {
@@ -311,34 +325,26 @@ int coroutine_fn block_copy(BlockCopyState *s,
continue;
}
/* Clamp to known allocated region */
- dirty_end = MIN(dirty_end, start + status_bytes);
+ chunk_end = MIN(chunk_end, start + status_bytes);
}
trace_block_copy_process(s, start);
- if (s->use_copy_range) {
- ret = block_copy_with_offload(s, start, dirty_end);
- if (ret < 0) {
- s->use_copy_range = false;
- }
- }
- if (!s->use_copy_range) {
- ret = block_copy_with_bounce_buffer(s, start, dirty_end,
- error_is_read, &bounce_buffer);
- }
+ bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
+
+ co_get_from_shres(s->mem, chunk_end - start);
+ ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
+ co_put_to_shres(s->mem, chunk_end - start);
if (ret < 0) {
+ bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
break;
}
- start += ret;
- s->progress_bytes_callback(ret, s->progress_opaque);
+ s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
+ start = chunk_end;
ret = 0;
}
- if (bounce_buffer) {
- qemu_vfree(bounce_buffer);
- }
-
block_copy_inflight_req_end(&req);
return ret;
diff --git a/block/commit.c b/block/commit.c
index bc84544..23c90b3 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -155,7 +155,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
}
if (base_len < len) {
- ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL);
+ ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, NULL);
if (ret) {
goto out;
}
@@ -471,7 +471,8 @@ int bdrv_commit(BlockDriverState *bs)
* grow the backing file image if possible. If not possible,
* we must return an error */
if (length > backing_length) {
- ret = blk_truncate(backing, length, PREALLOC_MODE_OFF, &local_err);
+ ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF,
+ &local_err);
if (ret < 0) {
error_report_err(local_err);
goto ro_cleanup;
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 6631f30..e95223d 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -73,13 +73,6 @@ static int64_t cor_getlength(BlockDriverState *bs)
}
-static int coroutine_fn cor_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
-{
- return bdrv_co_truncate(bs->file, offset, prealloc, errp);
-}
-
-
static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
@@ -139,7 +132,6 @@ static BlockDriver bdrv_copy_on_read = {
.bdrv_child_perm = cor_child_perm,
.bdrv_getlength = cor_getlength,
- .bdrv_co_truncate = cor_co_truncate,
.bdrv_co_preadv = cor_co_preadv,
.bdrv_co_pwritev = cor_co_pwritev,
diff --git a/block/crypto.c b/block/crypto.c
index 7eb6987..2482383 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -113,8 +113,8 @@ static ssize_t block_crypto_init_func(QCryptoBlock *block,
* available to the guest, so we must take account of that
* which will be used by the crypto header
*/
- return blk_truncate(data->blk, data->size + headerlen, data->prealloc,
- errp);
+ return blk_truncate(data->blk, data->size + headerlen, false,
+ data->prealloc, errp);
}
@@ -297,7 +297,7 @@ static int block_crypto_co_create_generic(BlockDriverState *bs,
}
static int coroutine_fn
-block_crypto_co_truncate(BlockDriverState *bs, int64_t offset,
+block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
PreallocMode prealloc, Error **errp)
{
BlockCrypto *crypto = bs->opaque;
@@ -311,7 +311,7 @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset,
offset += payload_offset;
- return bdrv_co_truncate(bs->file, offset, prealloc, errp);
+ return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
}
static void block_crypto_close(BlockDriverState *bs)
diff --git a/block/file-posix.c b/block/file-posix.c
index 5d1995a..0b7e904 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2020,7 +2020,8 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
}
static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ Error **errp)
{
BDRVRawState *s = bs->opaque;
struct stat st;
@@ -2033,6 +2034,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
}
if (S_ISREG(st.st_mode)) {
+ /* Always resizes to the exact @offset */
return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
}
@@ -2043,7 +2045,12 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
}
if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
- if (offset > raw_getlength(bs)) {
+ int64_t cur_length = raw_getlength(bs);
+
+ if (offset != cur_length && exact) {
+ error_setg(errp, "Cannot resize device files");
+ return -ENOTSUP;
+ } else if (offset > cur_length) {
error_setg(errp, "Cannot grow device files");
return -EINVAL;
}
diff --git a/block/file-win32.c b/block/file-win32.c
index 41f55df..77e8ff7 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -468,7 +468,8 @@ static void raw_close(BlockDriverState *bs)
}
static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ Error **errp)
{
BDRVRawState *s = bs->opaque;
LONG low, high;
diff --git a/block/gluster.c b/block/gluster.c
index 64028b2..4fa4a77 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -1225,6 +1225,7 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
static coroutine_fn int qemu_gluster_co_truncate(BlockDriverState *bs,
int64_t offset,
+ bool exact,
PreallocMode prealloc,
Error **errp)
{
diff --git a/block/io.c b/block/io.c
index e46d9e8..02659f9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3291,8 +3291,12 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
/**
* Truncate file to 'offset' bytes (needed only for file protocols)
+ *
+ * If 'exact' is true, the file must be resized to exactly the given
+ * 'offset'. Otherwise, it is sufficient for the node to be at least
+ * 'offset' bytes in length.
*/
-int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
+int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
PreallocMode prealloc, Error **errp)
{
BlockDriverState *bs = child->bs;
@@ -3347,20 +3351,19 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
goto out;
}
- if (!drv->bdrv_co_truncate) {
- if (bs->file && drv->is_filter) {
- ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
- goto out;
- }
+ if (drv->bdrv_co_truncate) {
+ ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp);
+ } else if (bs->file && drv->is_filter) {
+ ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
+ } else {
error_setg(errp, "Image format driver does not support resize");
ret = -ENOTSUP;
goto out;
}
-
- ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
if (ret < 0) {
goto out;
}
+
ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not refresh total sector count");
@@ -3382,6 +3385,7 @@ out:
typedef struct TruncateCo {
BdrvChild *child;
int64_t offset;
+ bool exact;
PreallocMode prealloc;
Error **errp;
int ret;
@@ -3390,18 +3394,19 @@ typedef struct TruncateCo {
static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
{
TruncateCo *tco = opaque;
- tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
- tco->errp);
+ tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
+ tco->prealloc, tco->errp);
aio_wait_kick();
}
-int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
- Error **errp)
+int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
+ PreallocMode prealloc, Error **errp)
{
Coroutine *co;
TruncateCo tco = {
.child = child,
.offset = offset,
+ .exact = exact,
.prealloc = prealloc,
.errp = errp,
.ret = NOT_DONE,
diff --git a/block/iscsi.c b/block/iscsi.c
index 2ced150..2aea7e3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2123,9 +2123,11 @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
}
static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ Error **errp)
{
IscsiLun *iscsilun = bs->opaque;
+ int64_t cur_length;
Error *local_err = NULL;
if (prealloc != PREALLOC_MODE_OFF) {
@@ -2145,7 +2147,11 @@ static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
return -EIO;
}
- if (offset > iscsi_getlength(bs)) {
+ cur_length = iscsi_getlength(bs);
+ if (offset != cur_length && exact) {
+ error_setg(errp, "Cannot resize iSCSI devices");
+ return -ENOTSUP;
+ } else if (offset > cur_length) {
error_setg(errp, "Cannot grow iSCSI devices");
return -EINVAL;
}
diff --git a/block/mirror.c b/block/mirror.c
index a6c50ca..f0f2d9d 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -620,11 +620,11 @@ static int mirror_exit_common(Job *job)
{
MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
BlockJob *bjob = &s->common;
- MirrorBDSOpaque *bs_opaque = s->mirror_top_bs->opaque;
+ MirrorBDSOpaque *bs_opaque;
AioContext *replace_aio_context = NULL;
- BlockDriverState *src = s->mirror_top_bs->backing->bs;
- BlockDriverState *target_bs = blk_bs(s->target);
- BlockDriverState *mirror_top_bs = s->mirror_top_bs;
+ BlockDriverState *src;
+ BlockDriverState *target_bs;
+ BlockDriverState *mirror_top_bs;
Error *local_err = NULL;
bool abort = job->ret < 0;
int ret = 0;
@@ -634,6 +634,11 @@ static int mirror_exit_common(Job *job)
}
s->prepared = true;
+ mirror_top_bs = s->mirror_top_bs;
+ bs_opaque = mirror_top_bs->opaque;
+ src = mirror_top_bs->backing->bs;
+ target_bs = blk_bs(s->target);
+
if (bdrv_chain_contains(src, target_bs)) {
bdrv_unfreeze_backing_chain(mirror_top_bs, target_bs);
}
@@ -873,8 +878,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
}
if (s->bdev_length > base_length) {
- ret = blk_truncate(s->target, s->bdev_length, PREALLOC_MODE_OFF,
- NULL);
+ ret = blk_truncate(s->target, s->bdev_length, false,
+ PREALLOC_MODE_OFF, NULL);
if (ret < 0) {
goto immediate_exit;
}
@@ -1181,84 +1186,107 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
- QEMUIOVector target_qiov;
- uint64_t dirty_offset = offset;
- uint64_t dirty_bytes;
+ int ret;
+ size_t qiov_offset = 0;
+ int64_t bitmap_offset, bitmap_end;
- if (qiov) {
- qemu_iovec_init(&target_qiov, qiov->niov);
+ if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
+ bdrv_dirty_bitmap_get(job->dirty_bitmap, offset))
+ {
+ /*
+ * Dirty unaligned padding: ignore it.
+ *
+ * Reasoning:
+ * 1. If we copy it, we can't reset corresponding bit in
+ * dirty_bitmap as there may be some "dirty" bytes still not
+ * copied.
+ * 2. It's already dirty, so skipping it we don't diverge mirror
+ * progress.
+ *
+ * Note that because of this, a guest write may make no contribution
+ * to mirror convergence, but that's not bad, as we have the background
+ * mirroring process. If under some bad circumstances (high guest
+ * IO load) the background process starves, we will not converge anyway,
+ * even if each write contributes, as the guest is not guaranteed to
+ * rewrite the whole disk.
+ */
+ qiov_offset = QEMU_ALIGN_UP(offset, job->granularity) - offset;
+ if (bytes <= qiov_offset) {
+ /* nothing to do after shrink */
+ return;
+ }
+ offset += qiov_offset;
+ bytes -= qiov_offset;
}
- while (true) {
- bool valid_area;
- int ret;
+ if (!QEMU_IS_ALIGNED(offset + bytes, job->granularity) &&
+ bdrv_dirty_bitmap_get(job->dirty_bitmap, offset + bytes - 1))
+ {
+ uint64_t tail = (offset + bytes) % job->granularity;
- bdrv_dirty_bitmap_lock(job->dirty_bitmap);
- dirty_bytes = MIN(offset + bytes - dirty_offset, INT_MAX);
- valid_area = bdrv_dirty_bitmap_next_dirty_area(job->dirty_bitmap,
- &dirty_offset,
- &dirty_bytes);
- if (!valid_area) {
- bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
- break;
+ if (bytes <= tail) {
+ /* nothing to do after shrink */
+ return;
}
+ bytes -= tail;
+ }
- bdrv_reset_dirty_bitmap_locked(job->dirty_bitmap,
- dirty_offset, dirty_bytes);
- bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
-
- job_progress_increase_remaining(&job->common.job, dirty_bytes);
-
- assert(dirty_offset - offset <= SIZE_MAX);
- if (qiov) {
- qemu_iovec_reset(&target_qiov);
- qemu_iovec_concat(&target_qiov, qiov,
- dirty_offset - offset, dirty_bytes);
- }
+ /*
+ * Tails are either clean or shrunk, so for bitmap resetting
+ * we safely align the range down.
+ */
+ bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
+ bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
+ if (bitmap_offset < bitmap_end) {
+ bdrv_reset_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
+ bitmap_end - bitmap_offset);
+ }
- switch (method) {
- case MIRROR_METHOD_COPY:
- ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes,
- qiov ? &target_qiov : NULL, flags);
- break;
+ job_progress_increase_remaining(&job->common.job, bytes);
- case MIRROR_METHOD_ZERO:
- assert(!qiov);
- ret = blk_co_pwrite_zeroes(job->target, dirty_offset, dirty_bytes,
- flags);
- break;
+ switch (method) {
+ case MIRROR_METHOD_COPY:
+ ret = blk_co_pwritev_part(job->target, offset, bytes,
+ qiov, qiov_offset, flags);
+ break;
- case MIRROR_METHOD_DISCARD:
- assert(!qiov);
- ret = blk_co_pdiscard(job->target, dirty_offset, dirty_bytes);
- break;
+ case MIRROR_METHOD_ZERO:
+ assert(!qiov);
+ ret = blk_co_pwrite_zeroes(job->target, offset, bytes, flags);
+ break;
- default:
- abort();
- }
+ case MIRROR_METHOD_DISCARD:
+ assert(!qiov);
+ ret = blk_co_pdiscard(job->target, offset, bytes);
+ break;
- if (ret >= 0) {
- job_progress_update(&job->common.job, dirty_bytes);
- } else {
- BlockErrorAction action;
+ default:
+ abort();
+ }
- bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_offset, dirty_bytes);
- job->actively_synced = false;
+ if (ret >= 0) {
+ job_progress_update(&job->common.job, bytes);
+ } else {
+ BlockErrorAction action;
- action = mirror_error_action(job, false, -ret);
- if (action == BLOCK_ERROR_ACTION_REPORT) {
- if (!job->ret) {
- job->ret = ret;
- }
- break;
+ /*
+ * We failed, so we should mark dirty the whole area, aligned up.
+ * Note that we don't care about shrunk tails if any: they were dirty
+ * at function start, and they must be still dirty, as we've locked
+ * the region for in-flight op.
+ */
+ bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity);
+ bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
+ bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
+ bitmap_end - bitmap_offset);
+ job->actively_synced = false;
+
+ action = mirror_error_action(job, false, -ret);
+ if (action == BLOCK_ERROR_ACTION_REPORT) {
+ if (!job->ret) {
+ job->ret = ret;
}
}
-
- dirty_offset += dirty_bytes;
- }
-
- if (qiov) {
- qemu_iovec_destroy(&target_qiov);
}
}
@@ -1465,15 +1493,6 @@ static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
*nshared = BLK_PERM_ALL;
}
-static void bdrv_mirror_top_refresh_limits(BlockDriverState *bs, Error **errp)
-{
- MirrorBDSOpaque *s = bs->opaque;
-
- if (s && s->job && s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) {
- bs->bl.request_alignment = s->job->granularity;
- }
-}
-
/* Dummy node that provides consistent read to its users without requiring it
* from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_mirror_top = {
@@ -1486,7 +1505,6 @@ static BlockDriver bdrv_mirror_top = {
.bdrv_co_block_status = bdrv_co_block_status_from_backing,
.bdrv_refresh_filename = bdrv_mirror_top_refresh_filename,
.bdrv_child_perm = bdrv_mirror_top_child_perm,
- .bdrv_refresh_limits = bdrv_mirror_top_refresh_limits,
};
static BlockJob *mirror_start_job(
@@ -1634,29 +1652,13 @@ static BlockJob *mirror_start_job(
s->should_complete = true;
}
- /*
- * Must be called before we start tracking writes, but after
- *
- * ((MirrorBlockJob *)
- * ((MirrorBDSOpaque *)
- * mirror_top_bs->opaque
- * )->job
- * )->copy_mode
- *
- * has the correct value.
- * (We start tracking writes as of the following
- * bdrv_create_dirty_bitmap() call.)
- */
- bdrv_refresh_limits(mirror_top_bs, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- goto fail;
- }
-
s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
if (!s->dirty_bitmap) {
goto fail;
}
+ if (s->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) {
+ bdrv_disable_dirty_bitmap(s->dirty_bitmap);
+ }
ret = block_job_add_bdrv(&s->common, "source", bs, 0,
BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |
diff --git a/block/nfs.c b/block/nfs.c
index 40f2349..9a6311e 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -752,7 +752,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
}
static int coroutine_fn
-nfs_file_co_truncate(BlockDriverState *bs, int64_t offset,
+nfs_file_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
PreallocMode prealloc, Error **errp)
{
NFSClient *client = bs->opaque;
diff --git a/block/nvme.c b/block/nvme.c
index 910872e..d41c4bd 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -112,6 +112,9 @@ typedef struct {
uint64_t max_transfer;
bool plugged;
+ bool supports_write_zeroes;
+ bool supports_discard;
+
CoMutex dma_map_lock;
CoQueue dma_flush_queue;
@@ -423,6 +426,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
NvmeIdNs *idns;
NvmeLBAF *lbaf;
uint8_t *resp;
+ uint16_t oncs;
int r;
uint64_t iova;
NvmeCmd cmd = {
@@ -460,6 +464,10 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
s->max_transfer = MIN_NON_ZERO(s->max_transfer,
s->page_size / sizeof(uint64_t) * s->page_size);
+ oncs = le16_to_cpu(idctrl->oncs);
+ s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS);
+ s->supports_discard = !!(oncs & NVME_ONCS_DSM);
+
memset(resp, 0, 4096);
cmd.cdw10 = 0;
@@ -472,6 +480,12 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
s->nsze = le64_to_cpu(idns->nsze);
lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
+ if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) &&
+ NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
+ NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
+ bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
+ }
+
if (lbaf->ms) {
error_setg(errp, "Namespaces with metadata are not yet supported");
goto out;
@@ -766,6 +780,8 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
int ret;
BDRVNVMeState *s = bs->opaque;
+ bs->supported_write_flags = BDRV_REQ_FUA;
+
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &error_abort);
device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
@@ -794,7 +810,6 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
goto fail;
}
}
- bs->supported_write_flags = BDRV_REQ_FUA;
return 0;
fail:
nvme_close(bs);
@@ -1088,6 +1103,140 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
}
+/*
+ * Zero @bytes bytes starting at @offset using the NVMe Write Zeroes command.
+ *
+ * @flags: BDRV_REQ_MAY_UNMAP sets bit 25 of CDW12 and BDRV_REQ_FUA sets
+ *         bit 30; other flags are ignored here.
+ *
+ * Returns 0 on success (including for an empty request), -ENOTSUP if the
+ * controller does not advertise Write Zeroes or if the range cannot be
+ * encoded exactly in a single command, or the status reported through
+ * nvme_rw_cb.
+ */
+static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
+                                              int64_t offset,
+                                              int bytes,
+                                              BdrvRequestFlags flags)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    const uint64_t blk_mask = (1ULL << s->blkshift) - 1;
+    uint64_t slba;
+    uint64_t nlb;
+    uint32_t cdw12;
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_WRITE_ZEROS,
+        .nsid = cpu_to_le32(s->nsid),
+    };
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_write_zeroes) {
+        return -ENOTSUP;
+    }
+
+    /*
+     * The block count in CDW12 is zero-based, so an empty request cannot be
+     * expressed: (0 - 1) & 0xFFFF would ask the device to zero 65536 blocks.
+     */
+    if (bytes == 0) {
+        return 0;
+    }
+
+    /*
+     * The command addresses whole logical blocks, and the zero-based count
+     * field is 16 bits wide.  Refuse anything that would be rounded or
+     * truncated rather than zeroing a different range than requested and
+     * reporting success.
+     * NOTE(review): -ENOTSUP is chosen on the assumption that the generic
+     * block layer then falls back to writing zero buffers -- confirm.
+     */
+    if (((uint64_t)offset | (uint64_t)bytes) & blk_mask) {
+        return -ENOTSUP;
+    }
+    nlb = (uint64_t)bytes >> s->blkshift;
+    if (nlb > 0x10000) {
+        return -ENOTSUP;
+    }
+
+    slba = (uint64_t)offset >> s->blkshift;
+    cmd.cdw10 = cpu_to_le32(slba & 0xFFFFFFFF);
+    cmd.cdw11 = cpu_to_le32((slba >> 32) & 0xFFFFFFFF);
+
+    cdw12 = (uint32_t)(nlb - 1);
+
+    if (flags & BDRV_REQ_MAY_UNMAP) {
+        cdw12 |= (1 << 25);
+    }
+
+    if (flags & BDRV_REQ_FUA) {
+        cdw12 |= (1 << 30);
+    }
+
+    cmd.cdw12 = cpu_to_le32(cdw12);
+
+    trace_nvme_write_zeroes(s, offset, bytes, flags);
+    assert(s->nr_queues > 1);
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    /* Sleep until the completion callback replaces -EINPROGRESS */
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
+    return data.ret;
+}
+
+
+/*
+ * Discard @bytes bytes starting at @offset by sending a Dataset Management
+ * command with the deallocate bit set and a single range descriptor.
+ *
+ * Returns -ENOTSUP if the controller does not advertise Dataset Management,
+ * -ENOMEM if the descriptor buffer cannot be allocated, the error from
+ * mapping or unmapping that buffer, or otherwise the status reported through
+ * nvme_rw_cb.
+ */
+static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
+                                         int64_t offset,
+                                         int bytes)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    NvmeDsmRange *buf;
+    QEMUIOVector local_qiov;
+    int ret;
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_DSM,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
+        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_discard) {
+        return -ENOTSUP;
+    }
+
+    assert(s->nr_queues > 1);
+
+    /* One zero-filled page holds the single range descriptor */
+    buf = qemu_try_blockalign0(bs, s->page_size);
+    if (!buf) {
+        return -ENOMEM;
+    }
+
+    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
+    buf->slba = cpu_to_le64(offset >> s->blkshift);
+    buf->cattr = 0;
+
+    qemu_iovec_init(&local_qiov, 1);
+    /*
+     * Map exactly what was allocated.  A hard-coded 4096 here disagrees with
+     * the allocation above whenever the device page size is not 4096.
+     */
+    qemu_iovec_add(&local_qiov, buf, s->page_size);
+
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        /* The request was never submitted; hand it back */
+        req->busy = false;
+        goto out;
+    }
+
+    trace_nvme_dsm(s, offset, bytes);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    /* Sleep until the completion callback replaces -EINPROGRESS */
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        goto out;
+    }
+
+    ret = data.ret;
+    trace_nvme_dsm_done(s, offset, bytes, ret);
+out:
+    qemu_iovec_destroy(&local_qiov);
+    qemu_vfree(buf);
+    return ret;
+}
+
+
static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
BlockReopenQueue *queue, Error **errp)
{
@@ -1192,6 +1341,10 @@ static BlockDriver bdrv_nvme = {
.bdrv_co_preadv = nvme_co_preadv,
.bdrv_co_pwritev = nvme_co_pwritev,
+
+ .bdrv_co_pwrite_zeroes = nvme_co_pwrite_zeroes,
+ .bdrv_co_pdiscard = nvme_co_pdiscard,
+
.bdrv_co_flush_to_disk = nvme_co_flush,
.bdrv_reopen_prepare = nvme_reopen_prepare,
diff --git a/block/parallels.c b/block/parallels.c
index f1dfb03..7a01997 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -203,7 +203,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
} else {
ret = bdrv_truncate(bs->file,
(s->data_end + space) << BDRV_SECTOR_BITS,
- PREALLOC_MODE_OFF, NULL);
+ false, PREALLOC_MODE_OFF, NULL);
}
if (ret < 0) {
return ret;
@@ -487,7 +487,12 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
res->leaks += count;
if (fix & BDRV_FIX_LEAKS) {
Error *local_err = NULL;
- ret = bdrv_truncate(bs->file, res->image_end_offset,
+
+ /*
+ * In order to really repair the image, we must shrink it.
+ * That means we have to pass exact=true.
+ */
+ ret = bdrv_truncate(bs->file, res->image_end_offset, true,
PREALLOC_MODE_OFF, &local_err);
if (ret < 0) {
error_report_err(local_err);
@@ -563,11 +568,6 @@ static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts,
blk_set_allow_write_beyond_eof(blk, true);
/* Create image format */
- ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
- if (ret < 0) {
- goto out;
- }
-
bat_entries = DIV_ROUND_UP(total_size, cl_size);
bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size);
bat_sectors = (bat_sectors * cl_size) >> BDRV_SECTOR_BITS;
@@ -885,7 +885,9 @@ static void parallels_close(BlockDriverState *bs)
if ((bs->open_flags & BDRV_O_RDWR) && !(bs->open_flags & BDRV_O_INACTIVE)) {
s->header->inuse = 0;
parallels_update_header(bs);
- bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS,
+
+ /* errors are ignored, so we might as well pass exact=true */
+ bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, true,
PREALLOC_MODE_OFF, NULL);
}
diff --git a/block/qcow.c b/block/qcow.c
index 5bdf72b..fce8989 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -480,7 +480,7 @@ static int get_cluster_offset(BlockDriverState *bs,
return -E2BIG;
}
ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size,
- PREALLOC_MODE_OFF, NULL);
+ false, PREALLOC_MODE_OFF, NULL);
if (ret < 0) {
return ret;
}
@@ -858,11 +858,6 @@ static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
blk_set_allow_write_beyond_eof(qcow_blk, true);
/* Create image format */
- ret = blk_truncate(qcow_blk, 0, PREALLOC_MODE_OFF, errp);
- if (ret < 0) {
- goto exit;
- }
-
memset(&header, 0, sizeof(header));
header.magic = cpu_to_be32(QCOW_MAGIC);
header.version = cpu_to_be32(QCOW_VERSION);
@@ -1038,7 +1033,7 @@ static int qcow_make_empty(BlockDriverState *bs)
if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
l1_length) < 0)
return -1;
- ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length,
+ ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false,
PREALLOC_MODE_OFF, NULL);
if (ret < 0)
return ret;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 0d64bf5..f67ac6b 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -2016,7 +2016,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
goto resize_fail;
}
- ret = bdrv_truncate(bs->file, offset + s->cluster_size,
+ ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
PREALLOC_MODE_OFF, &local_err);
if (ret < 0) {
error_report_err(local_err);
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index d0e7fa9..5ab64da 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -29,29 +29,64 @@
#include "qemu/error-report.h"
#include "qemu/cutils.h"
+/*
+ * Free everything owned by snapshot table slot @i (name, ID string and
+ * unknown extra data) and zero the slot.  @i must be a valid index.
+ */
+static void qcow2_free_single_snapshot(BlockDriverState *bs, int i)
+{
+    BDRVQcow2State *s = bs->opaque;
+    QCowSnapshot *entry;
+
+    assert(i >= 0 && i < s->nb_snapshots);
+    entry = &s->snapshots[i];
+
+    g_free(entry->unknown_extra_data);
+    g_free(entry->id_str);
+    g_free(entry->name);
+
+    /* Leave no dangling pointers behind in the table */
+    memset(entry, 0, sizeof(*entry));
+}
+
void qcow2_free_snapshots(BlockDriverState *bs)
{
BDRVQcow2State *s = bs->opaque;
int i;
for(i = 0; i < s->nb_snapshots; i++) {
- g_free(s->snapshots[i].name);
- g_free(s->snapshots[i].id_str);
+ qcow2_free_single_snapshot(bs, i);
}
g_free(s->snapshots);
s->snapshots = NULL;
s->nb_snapshots = 0;
}
-int qcow2_read_snapshots(BlockDriverState *bs)
+/*
+ * If @repair is true, try to repair a broken snapshot table instead
+ * of just returning an error:
+ *
+ * - If the snapshot table was too long, set *nb_clusters_reduced to
+ * the number of snapshots removed off the end.
+ * The caller will update the on-disk nb_snapshots accordingly;
+ * this leaks clusters, but is safe.
+ * (The on-disk information must be updated before
+ * qcow2_check_refcounts(), because that function relies on
+ * s->nb_snapshots to reflect the on-disk value.)
+ *
+ * - If there were snapshots with too much extra metadata, increment
+ * *extra_data_dropped for each.
+ * This requires the caller to eventually rewrite the whole snapshot
+ * table, which requires cluster allocation. Therefore, this should
+ * be done only after qcow2_check_refcounts() made sure the refcount
+ * structures are valid.
+ * (In the meantime, the image is still valid because
+ * qcow2_check_refcounts() does not do anything with snapshots'
+ * extra data.)
+ */
+static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
+ int *nb_clusters_reduced,
+ int *extra_data_dropped,
+ Error **errp)
{
BDRVQcow2State *s = bs->opaque;
QCowSnapshotHeader h;
QCowSnapshotExtraData extra;
QCowSnapshot *sn;
int i, id_str_size, name_size;
- int64_t offset;
- uint32_t extra_data_size;
+ int64_t offset, pre_sn_offset;
+ uint64_t table_length = 0;
int ret;
if (!s->nb_snapshots) {
@@ -64,10 +99,16 @@ int qcow2_read_snapshots(BlockDriverState *bs)
s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots);
for(i = 0; i < s->nb_snapshots; i++) {
+ bool truncate_unknown_extra_data = false;
+
+ pre_sn_offset = offset;
+ table_length = ROUND_UP(table_length, 8);
+
/* Read statically sized part of the snapshot header */
offset = ROUND_UP(offset, 8);
ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to read snapshot table");
goto fail;
}
@@ -79,33 +120,77 @@ int qcow2_read_snapshots(BlockDriverState *bs)
sn->date_sec = be32_to_cpu(h.date_sec);
sn->date_nsec = be32_to_cpu(h.date_nsec);
sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
- extra_data_size = be32_to_cpu(h.extra_data_size);
+ sn->extra_data_size = be32_to_cpu(h.extra_data_size);
id_str_size = be16_to_cpu(h.id_str_size);
name_size = be16_to_cpu(h.name_size);
- /* Read extra data */
+ if (sn->extra_data_size > QCOW_MAX_SNAPSHOT_EXTRA_DATA) {
+ if (!repair) {
+ ret = -EFBIG;
+ error_setg(errp, "Too much extra metadata in snapshot table "
+ "entry %i", i);
+ error_append_hint(errp, "You can force-remove this extra "
+ "metadata with qemu-img check -r all\n");
+ goto fail;
+ }
+
+ fprintf(stderr, "Discarding too much extra metadata in snapshot "
+ "table entry %i (%" PRIu32 " > %u)\n",
+ i, sn->extra_data_size, QCOW_MAX_SNAPSHOT_EXTRA_DATA);
+
+ (*extra_data_dropped)++;
+ truncate_unknown_extra_data = true;
+ }
+
+ /* Read known extra data */
ret = bdrv_pread(bs->file, offset, &extra,
- MIN(sizeof(extra), extra_data_size));
+ MIN(sizeof(extra), sn->extra_data_size));
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to read snapshot table");
goto fail;
}
- offset += extra_data_size;
+ offset += MIN(sizeof(extra), sn->extra_data_size);
- if (extra_data_size >= 8) {
+ if (sn->extra_data_size >= endof(QCowSnapshotExtraData,
+ vm_state_size_large)) {
sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large);
}
- if (extra_data_size >= 16) {
+ if (sn->extra_data_size >= endof(QCowSnapshotExtraData, disk_size)) {
sn->disk_size = be64_to_cpu(extra.disk_size);
} else {
sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
}
+ if (sn->extra_data_size > sizeof(extra)) {
+ uint64_t extra_data_end;
+ size_t unknown_extra_data_size;
+
+ extra_data_end = offset + sn->extra_data_size - sizeof(extra);
+
+ if (truncate_unknown_extra_data) {
+ sn->extra_data_size = QCOW_MAX_SNAPSHOT_EXTRA_DATA;
+ }
+
+ /* Store unknown extra data */
+ unknown_extra_data_size = sn->extra_data_size - sizeof(extra);
+ sn->unknown_extra_data = g_malloc(unknown_extra_data_size);
+ ret = bdrv_pread(bs->file, offset, sn->unknown_extra_data,
+ unknown_extra_data_size);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret,
+ "Failed to read snapshot table");
+ goto fail;
+ }
+ offset = extra_data_end;
+ }
+
/* Read snapshot ID */
sn->id_str = g_malloc(id_str_size + 1);
ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to read snapshot table");
goto fail;
}
offset += id_str_size;
@@ -115,14 +200,47 @@ int qcow2_read_snapshots(BlockDriverState *bs)
sn->name = g_malloc(name_size + 1);
ret = bdrv_pread(bs->file, offset, sn->name, name_size);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to read snapshot table");
goto fail;
}
offset += name_size;
sn->name[name_size] = '\0';
- if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) {
- ret = -EFBIG;
- goto fail;
+ /* Note that the extra data may have been truncated */
+ table_length += sizeof(h) + sn->extra_data_size + id_str_size +
+ name_size;
+ if (!repair) {
+ assert(table_length == offset - s->snapshots_offset);
+ }
+
+ if (table_length > QCOW_MAX_SNAPSHOTS_SIZE ||
+ offset - s->snapshots_offset > INT_MAX)
+ {
+ if (!repair) {
+ ret = -EFBIG;
+ error_setg(errp, "Snapshot table is too big");
+ error_append_hint(errp, "You can force-remove all %u "
+ "overhanging snapshots with qemu-img check "
+ "-r all\n", s->nb_snapshots - i);
+ goto fail;
+ }
+
+ fprintf(stderr, "Discarding %u overhanging snapshots (snapshot "
+ "table is too big)\n", s->nb_snapshots - i);
+
+ *nb_clusters_reduced += (s->nb_snapshots - i);
+
+ /* Discard current snapshot also */
+ qcow2_free_single_snapshot(bs, i);
+
+ /*
+ * This leaks all the rest of the snapshot table and the
+ * snapshots' clusters, but we run in check -r all mode,
+ * so qcow2_check_refcounts() will take care of it.
+ */
+ s->nb_snapshots = i;
+ offset = pre_sn_offset;
+ break;
}
}
@@ -135,8 +253,13 @@ fail:
return ret;
}
+/*
+ * Read the snapshot table without attempting any repair; a broken table
+ * results in an error reported through @errp.
+ */
+int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
+{
+    /* No repair requested, so no repair counters are passed */
+    const bool repair = false;
+
+    return qcow2_do_read_snapshots(bs, repair, NULL, NULL, errp);
+}
+
/* add at the end of the file a new list of snapshots */
-static int qcow2_write_snapshots(BlockDriverState *bs)
+int qcow2_write_snapshots(BlockDriverState *bs)
{
BDRVQcow2State *s = bs->opaque;
QCowSnapshot *sn;
@@ -156,7 +279,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
sn = s->snapshots + i;
offset = ROUND_UP(offset, 8);
offset += sizeof(h);
- offset += sizeof(extra);
+ offset += MAX(sizeof(extra), sn->extra_data_size);
offset += strlen(sn->id_str);
offset += strlen(sn->name);
@@ -203,7 +326,8 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
h.date_sec = cpu_to_be32(sn->date_sec);
h.date_nsec = cpu_to_be32(sn->date_nsec);
h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
- h.extra_data_size = cpu_to_be32(sizeof(extra));
+ h.extra_data_size = cpu_to_be32(MAX(sizeof(extra),
+ sn->extra_data_size));
memset(&extra, 0, sizeof(extra));
extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size);
@@ -228,6 +352,22 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
}
offset += sizeof(extra);
+ if (sn->extra_data_size > sizeof(extra)) {
+ size_t unknown_extra_data_size =
+ sn->extra_data_size - sizeof(extra);
+
+ /* qcow2_read_snapshots() ensures no unbounded allocation */
+ assert(unknown_extra_data_size <= BDRV_REQUEST_MAX_BYTES);
+ assert(sn->unknown_extra_data);
+
+ ret = bdrv_pwrite(bs->file, offset, sn->unknown_extra_data,
+ unknown_extra_data_size);
+ if (ret < 0) {
+ goto fail;
+ }
+ offset += unknown_extra_data_size;
+ }
+
ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
if (ret < 0) {
goto fail;
@@ -251,7 +391,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
}
QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
- offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+ endof(QCowHeader, nb_snapshots));
header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots);
header_data.snapshots_offset = cpu_to_be64(snapshots_offset);
@@ -277,6 +417,151 @@ fail:
return ret;
}
+/*
+ * Check-mode loader for the snapshot table.
+ *
+ * Re-reads the snapshot table pointer from the image header, validates it
+ * and loads the table.  If @fix contains BDRV_FIX_ERRORS, overhanging
+ * snapshots are dropped and the snapshot count in the image header is
+ * updated right away.  Problems that are not fixed here (dropped extra data,
+ * incomplete v3 entries) are only counted in @result->corruptions.
+ *
+ * s->lock is released around the table read and re-taken afterwards, so
+ * presumably the caller holds it on entry.
+ *
+ * Returns 0 on success and a negative errno on failure.
+ */
+int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
+                                                 BdrvCheckResult *result,
+                                                 BdrvCheckMode fix)
+{
+    BDRVQcow2State *s = bs->opaque;
+    const bool repair = fix & BDRV_FIX_ERRORS;
+    const size_t v3_min_extra_data =
+        sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
+        sizeof_field(QCowSnapshotExtraData, disk_size);
+    Error *local_err = NULL;
+    int snapshots_dropped = 0;
+    int extra_data_dropped = 0;
+    int ret;
+    int i;
+    struct {
+        uint32_t nb_snapshots;
+        uint64_t snapshots_offset;
+    } QEMU_PACKED table_ptr;
+
+    /* qcow2_do_open() discards this information in check mode */
+    ret = bdrv_pread(bs->file, offsetof(QCowHeader, nb_snapshots),
+                     &table_ptr, sizeof(table_ptr));
+    if (ret < 0) {
+        result->check_errors++;
+        fprintf(stderr, "ERROR failed to read the snapshot table pointer from "
+                "the image header: %s\n", strerror(-ret));
+        return ret;
+    }
+
+    s->snapshots_offset = be64_to_cpu(table_ptr.snapshots_offset);
+    s->nb_snapshots = be32_to_cpu(table_ptr.nb_snapshots);
+
+    /* In repair mode, clamp an excessive snapshot count up front */
+    if (s->nb_snapshots > QCOW_MAX_SNAPSHOTS && repair) {
+        fprintf(stderr, "Discarding %u overhanging snapshots\n",
+                s->nb_snapshots - QCOW_MAX_SNAPSHOTS);
+
+        snapshots_dropped += s->nb_snapshots - QCOW_MAX_SNAPSHOTS;
+        s->nb_snapshots = QCOW_MAX_SNAPSHOTS;
+    }
+
+    ret = qcow2_validate_table(bs, s->snapshots_offset, s->nb_snapshots,
+                               sizeof(QCowSnapshotHeader),
+                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
+                               "snapshot table", &local_err);
+    if (ret < 0) {
+        result->check_errors++;
+        error_reportf_err(local_err, "ERROR ");
+
+        if (s->nb_snapshots > QCOW_MAX_SNAPSHOTS) {
+            fprintf(stderr, "You can force-remove all %u overhanging snapshots "
+                    "with qemu-img check -r all\n",
+                    s->nb_snapshots - QCOW_MAX_SNAPSHOTS);
+        }
+
+        /* We did not read the snapshot table, so invalidate this information */
+        s->snapshots_offset = 0;
+        s->nb_snapshots = 0;
+
+        return ret;
+    }
+
+    qemu_co_mutex_unlock(&s->lock);
+    ret = qcow2_do_read_snapshots(bs, repair,
+                                  &snapshots_dropped, &extra_data_dropped,
+                                  &local_err);
+    qemu_co_mutex_lock(&s->lock);
+    if (ret < 0) {
+        result->check_errors++;
+        error_reportf_err(local_err,
+                          "ERROR failed to read the snapshot table: ");
+
+        /* We did not read the snapshot table, so invalidate this information */
+        s->snapshots_offset = 0;
+        s->nb_snapshots = 0;
+
+        return ret;
+    }
+    result->corruptions += snapshots_dropped + extra_data_dropped;
+
+    if (snapshots_dropped) {
+        /*
+         * Update image header now, because:
+         * (1) qcow2_check_refcounts() relies on s->nb_snapshots to be
+         *     the same as what the image header says,
+         * (2) this leaks clusters, but qcow2_check_refcounts() will
+         *     fix that.
+         */
+        assert(repair);
+
+        table_ptr.nb_snapshots = cpu_to_be32(s->nb_snapshots);
+        ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+                               &table_ptr.nb_snapshots,
+                               sizeof(table_ptr.nb_snapshots));
+        if (ret < 0) {
+            result->check_errors++;
+            fprintf(stderr, "ERROR failed to update the snapshot count in the "
+                    "image header: %s\n", strerror(-ret));
+            return ret;
+        }
+
+        /* The dropped snapshots are now reflected on disk */
+        result->corruptions_fixed += snapshots_dropped;
+        result->corruptions -= snapshots_dropped;
+    }
+
+    /*
+     * All of v3 images' snapshot table entries need to have at least
+     * 16 bytes of extra data.
+     */
+    if (s->qcow_version < 3) {
+        return 0;
+    }
+
+    for (i = 0; i < s->nb_snapshots; i++) {
+        if (s->snapshots[i].extra_data_size < v3_min_extra_data) {
+            result->corruptions++;
+            fprintf(stderr, "%s snapshot table entry %i is incomplete\n",
+                    repair ? "Repairing" : "ERROR", i);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Rewrite the snapshot table if @result records corruptions and @fix
+ * contains BDRV_FIX_ERRORS.  On success all recorded corruptions are moved
+ * to @result->corruptions_fixed.
+ *
+ * s->lock is released around the table write and re-taken afterwards.
+ *
+ * Returns 0 on success (or if there was nothing to do) and a negative errno
+ * if writing the table failed.
+ */
+int coroutine_fn qcow2_check_fix_snapshot_table(BlockDriverState *bs,
+                                                BdrvCheckResult *result,
+                                                BdrvCheckMode fix)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+
+    /* Nothing to repair, or repairing was not requested */
+    if (!result->corruptions || !(fix & BDRV_FIX_ERRORS)) {
+        return 0;
+    }
+
+    qemu_co_mutex_unlock(&s->lock);
+    ret = qcow2_write_snapshots(bs);
+    qemu_co_mutex_lock(&s->lock);
+
+    if (ret < 0) {
+        result->check_errors++;
+        fprintf(stderr, "ERROR failed to update snapshot table: %s\n",
+                strerror(-ret));
+        return ret;
+    }
+
+    result->corruptions_fixed += result->corruptions;
+    result->corruptions = 0;
+
+    return 0;
+}
+
static void find_new_snapshot_id(BlockDriverState *bs,
char *id_str, int id_str_size)
{
@@ -370,6 +655,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
sn->date_sec = sn_info->date_sec;
sn->date_nsec = sn_info->date_nsec;
sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+ sn->extra_data_size = sizeof(QCowSnapshotExtraData);
/* Allocate the L1 table of the snapshot and copy the current one there. */
l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
@@ -641,6 +927,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs,
* The snapshot is now unused, clean up. If we fail after this point, we
* won't recover but just leak clusters.
*/
+ g_free(sn.unknown_extra_data);
g_free(sn.id_str);
g_free(sn.name);
diff --git a/block/qcow2.c b/block/qcow2.c
index 0bc69e6..7c18721 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -570,11 +570,47 @@ int qcow2_mark_consistent(BlockDriverState *bs)
return 0;
}
+/*
+ * Accumulate the counters of @src into @out.  If @set_allocation_info is
+ * true, additionally overwrite @out's image_end_offset and bfi with the
+ * values from @src.
+ */
+static void qcow2_add_check_result(BdrvCheckResult *out,
+                                   const BdrvCheckResult *src,
+                                   bool set_allocation_info)
+{
+    /* Problems found but not (yet) repaired */
+    out->corruptions += src->corruptions;
+    out->leaks += src->leaks;
+
+    /* Problems that were repaired */
+    out->corruptions_fixed += src->corruptions_fixed;
+    out->leaks_fixed += src->leaks_fixed;
+
+    out->check_errors += src->check_errors;
+
+    if (!set_allocation_info) {
+        return;
+    }
+
+    out->image_end_offset = src->image_end_offset;
+    out->bfi = src->bfi;
+}
+
static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
BdrvCheckResult *result,
BdrvCheckMode fix)
{
- int ret = qcow2_check_refcounts(bs, result, fix);
+ BdrvCheckResult snapshot_res = {};
+ BdrvCheckResult refcount_res = {};
+ int ret;
+
+ memset(result, 0, sizeof(*result));
+
+ ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
+ if (ret < 0) {
+ qcow2_add_check_result(result, &snapshot_res, false);
+ return ret;
+ }
+
+ ret = qcow2_check_refcounts(bs, &refcount_res, fix);
+ qcow2_add_check_result(result, &refcount_res, true);
+ if (ret < 0) {
+ qcow2_add_check_result(result, &snapshot_res, false);
+ return ret;
+ }
+
+ ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix);
+ qcow2_add_check_result(result, &snapshot_res, false);
if (ret < 0) {
return ret;
}
@@ -1410,17 +1446,22 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
goto fail;
}
- /* The total size in bytes of the snapshot table is checked in
- * qcow2_read_snapshots() because the size of each snapshot is
- * variable and we don't know it yet.
- * Here we only check the offset and number of snapshots. */
- ret = qcow2_validate_table(bs, header.snapshots_offset,
- header.nb_snapshots,
- sizeof(QCowSnapshotHeader),
- sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
- "Snapshot table", errp);
- if (ret < 0) {
- goto fail;
+ if (!(flags & BDRV_O_CHECK)) {
+ /*
+ * The total size in bytes of the snapshot table is checked in
+ * qcow2_read_snapshots() because the size of each snapshot is
+ * variable and we don't know it yet.
+ * Here we only check the offset and number of snapshots.
+ */
+ ret = qcow2_validate_table(bs, header.snapshots_offset,
+ header.nb_snapshots,
+ sizeof(QCowSnapshotHeader),
+ sizeof(QCowSnapshotHeader) *
+ QCOW_MAX_SNAPSHOTS,
+ "Snapshot table", errp);
+ if (ret < 0) {
+ goto fail;
+ }
}
/* read the level 1 table */
@@ -1580,14 +1621,19 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
s->image_backing_file = g_strdup(bs->auto_backing_file);
}
- /* Internal snapshots */
- s->snapshots_offset = header.snapshots_offset;
- s->nb_snapshots = header.nb_snapshots;
+ /*
+ * Internal snapshots; skip reading them in check mode, because
+ * we do not need them then, and we do not want to abort because
+ * of a broken table.
+ */
+ if (!(flags & BDRV_O_CHECK)) {
+ s->snapshots_offset = header.snapshots_offset;
+ s->nb_snapshots = header.nb_snapshots;
- ret = qcow2_read_snapshots(bs);
- if (ret < 0) {
- error_setg_errno(errp, -ret, "Could not read snapshots");
- goto fail;
+ ret = qcow2_read_snapshots(bs, errp);
+ if (ret < 0) {
+ goto fail;
+ }
}
/* Clear unknown autoclear feature bits */
@@ -3028,8 +3074,8 @@ static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
if (mode == PREALLOC_MODE_METADATA) {
mode = PREALLOC_MODE_OFF;
}
- ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, mode,
- errp);
+ ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
+ mode, errp);
if (ret < 0) {
return ret;
}
@@ -3345,12 +3391,6 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
}
blk_set_allow_write_beyond_eof(blk, true);
- /* Clear the protocol layer and preallocate it if necessary */
- ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
- if (ret < 0) {
- goto out;
- }
-
/* Write the header */
QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
header = g_malloc0(cluster_size);
@@ -3449,7 +3489,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
}
/* Okay, now that we have a valid image, let's give it the right size */
- ret = blk_truncate(blk, qcow2_opts->size, qcow2_opts->preallocation, errp);
+ ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation,
+ errp);
if (ret < 0) {
error_prepend(errp, "Could not resize image: ");
goto out;
@@ -3897,7 +3938,8 @@ fail:
}
static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ Error **errp)
{
BDRVQcow2State *s = bs->opaque;
uint64_t old_length;
@@ -3985,8 +4027,15 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
if ((last_cluster + 1) * s->cluster_size < old_file_size) {
Error *local_err = NULL;
+ /*
+ * Do not pass @exact here: It will not help the user if
+ * we get an error here just because they wanted to shrink
+ * their qcow2 image (on a block device) with qemu-img.
+ * (And on the qcow2 layer, the @exact requirement is
+ * always fulfilled, so there is no need to pass it on.)
+ */
bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
- PREALLOC_MODE_OFF, &local_err);
+ false, PREALLOC_MODE_OFF, &local_err);
if (local_err) {
warn_reportf_err(local_err,
"Failed to truncate the tail of the image: ");
@@ -4003,7 +4052,12 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
switch (prealloc) {
case PREALLOC_MODE_OFF:
if (has_data_file(bs)) {
- ret = bdrv_co_truncate(s->data_file, offset, prealloc, errp);
+ /*
+ * If the caller wants an exact resize, the external data
+ * file should be resized to the exact target size, too,
+ * so we pass @exact here.
+ */
+ ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp);
if (ret < 0) {
goto fail;
}
@@ -4088,7 +4142,8 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
/* Allocate the data area */
new_file_size = allocation_start +
nb_new_data_clusters * s->cluster_size;
- ret = bdrv_co_truncate(bs->file, new_file_size, prealloc, errp);
+ /* Image file grows, so @exact does not matter */
+ ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
if (ret < 0) {
error_prepend(errp, "Failed to resize underlying file: ");
qcow2_free_clusters(bs, allocation_start,
@@ -4191,7 +4246,7 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
if (len < 0) {
return len;
}
- return bdrv_co_truncate(bs->file, len, PREALLOC_MODE_OFF, NULL);
+ return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL);
}
if (offset_into_cluster(s, offset)) {
@@ -4428,7 +4483,7 @@ static int make_completely_empty(BlockDriverState *bs)
goto fail;
}
- ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
+ ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
PREALLOC_MODE_OFF, &local_err);
if (ret < 0) {
error_report_err(local_err);
@@ -4913,12 +4968,74 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version,
return 0;
}
+/*
+ * Raise the image's format version to @target_version (only 3 is accepted).
+ * Newer versions are a superset of older ones, but some structures have to
+ * be stored differently, so this may rewrite the snapshot table before the
+ * header is updated.
+ *
+ * Progress is reported through @status_cb in three steps (0/2, 1/2, 2/2).
+ * Returns 0 on success; on failure sets @errp and returns a negative errno,
+ * restoring s->qcow_version if the header update failed.
+ */
+static int qcow2_upgrade(BlockDriverState *bs, int target_version,
+                         BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
+                         Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    const int current_version = s->qcow_version;
+    const size_t v3_min_extra_data =
+        sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
+        sizeof_field(QCowSnapshotExtraData, disk_size);
+    bool need_snapshot_update = false;
+    int ret;
+    int i;
+
+    /* This is qcow2_upgrade(), not qcow2_downgrade() */
+    assert(target_version > current_version);
+
+    /* There are no other versions (yet) that you can upgrade to */
+    assert(target_version == 3);
+
+    status_cb(bs, 0, 2, cb_opaque);
+
+    /*
+     * In v2, snapshots do not need to have extra data.  v3 requires
+     * the 64-bit VM state size and the virtual disk size to be
+     * present.
+     * qcow2_write_snapshots() will always write the list in the
+     * v3-compliant format.
+     */
+    for (i = 0; i < s->nb_snapshots && !need_snapshot_update; i++) {
+        need_snapshot_update =
+            s->snapshots[i].extra_data_size < v3_min_extra_data;
+    }
+
+    if (need_snapshot_update) {
+        ret = qcow2_write_snapshots(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to update the snapshot table");
+            return ret;
+        }
+    }
+    status_cb(bs, 1, 2, cb_opaque);
+
+    /* Switch the in-memory version first; roll it back if the write fails */
+    s->qcow_version = target_version;
+    ret = qcow2_update_header(bs);
+    if (ret < 0) {
+        s->qcow_version = current_version;
+        error_setg_errno(errp, -ret, "Failed to update the image header");
+        return ret;
+    }
+    status_cb(bs, 2, 2, cb_opaque);
+
+    return 0;
+}
+
typedef enum Qcow2AmendOperation {
/* This is the value Qcow2AmendHelperCBInfo::last_operation will be
* statically initialized to so that the helper CB can discern the first
* invocation from an operation change */
QCOW2_NO_OPERATION = 0,
+ QCOW2_UPGRADING,
QCOW2_CHANGING_REFCOUNT_ORDER,
QCOW2_DOWNGRADING,
} Qcow2AmendOperation;
@@ -5101,17 +5218,16 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
helper_cb_info = (Qcow2AmendHelperCBInfo){
.original_status_cb = status_cb,
.original_cb_opaque = cb_opaque,
- .total_operations = (new_version < old_version)
+ .total_operations = (new_version != old_version)
+ (s->refcount_bits != refcount_bits)
};
/* Upgrade first (some features may require compat=1.1) */
if (new_version > old_version) {
- s->qcow_version = new_version;
- ret = qcow2_update_header(bs);
+ helper_cb_info.current_operation = QCOW2_UPGRADING;
+ ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb,
+ &helper_cb_info, errp);
if (ret < 0) {
- s->qcow_version = old_version;
- error_setg_errno(errp, -ret, "Failed to update the image header");
return ret;
}
}
@@ -5207,7 +5323,11 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
return ret;
}
- ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, errp);
+ /*
+ * Amending image options should ensure that the image has
+ * exactly the given new values, so pass exact=true here.
+ */
+ ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp);
blk_unref(blk);
if (ret < 0) {
return ret;
diff --git a/block/qcow2.h b/block/qcow2.h
index 5cccd87..601c2e4 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -61,6 +61,9 @@
* space for snapshot names and IDs */
#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS)
+/* Maximum amount of extra data per snapshot table entry to accept */
+#define QCOW_MAX_SNAPSHOT_EXTRA_DATA 1024
+
/* Bitmap header extension constraints */
#define QCOW2_MAX_BITMAPS 65535
#define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS)
@@ -181,6 +184,10 @@ typedef struct QCowSnapshot {
uint32_t date_sec;
uint32_t date_nsec;
uint64_t vm_clock_nsec;
+ /* Size of all extra data, including QCowSnapshotExtraData if available */
+ uint32_t extra_data_size;
+ /* Data beyond QCowSnapshotExtraData, if any */
+ void *unknown_extra_data;
} QCowSnapshot;
struct Qcow2Cache;
@@ -708,7 +715,15 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
Error **errp);
void qcow2_free_snapshots(BlockDriverState *bs);
-int qcow2_read_snapshots(BlockDriverState *bs);
+int qcow2_read_snapshots(BlockDriverState *bs, Error **errp);
+int qcow2_write_snapshots(BlockDriverState *bs);
+
+int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
+ BdrvCheckResult *result,
+ BdrvCheckMode fix);
+int coroutine_fn qcow2_check_fix_snapshot_table(BlockDriverState *bs,
+ BdrvCheckResult *result,
+ BdrvCheckMode fix);
/* qcow2-cache.c functions */
Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
diff --git a/block/qed.c b/block/qed.c
index 0d8fd50..d8c4e5f 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -673,8 +673,11 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
l1_size = header.cluster_size * header.table_size;
- /* File must start empty and grow, check truncate is supported */
- ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
+ /*
+ * The QED format associates file length with allocation status,
+ * so a new file (which is empty) must have a length of 0.
+ */
+ ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp);
if (ret < 0) {
goto out;
}
@@ -1461,6 +1464,7 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
int64_t offset,
+ bool exact,
PreallocMode prealloc,
Error **errp)
{
diff --git a/block/raw-format.c b/block/raw-format.c
index 42c28cc..3a76ec7 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -370,7 +370,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
}
static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ Error **errp)
{
BDRVRawState *s = bs->opaque;
@@ -386,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
s->size = offset;
offset += s->offset;
- return bdrv_co_truncate(bs->file, offset, prealloc, errp);
+ return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
}
static void raw_eject(BlockDriverState *bs, bool eject_flag)
diff --git a/block/rbd.c b/block/rbd.c
index c71e45d..027cbcc 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -1087,6 +1087,7 @@ static int64_t qemu_rbd_getlength(BlockDriverState *bs)
static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
int64_t offset,
+ bool exact,
PreallocMode prealloc,
Error **errp)
{
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 773dfc6..cfa8433 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2285,7 +2285,8 @@ static int64_t sd_getlength(BlockDriverState *bs)
}
static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ Error **errp)
{
BDRVSheepdogState *s = bs->opaque;
int ret, fd;
@@ -2601,7 +2602,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
assert(!flags);
if (offset > s->inode.vdi_size) {
- ret = sd_co_truncate(bs, offset, PREALLOC_MODE_OFF, NULL);
+ ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL);
if (ret < 0) {
return ret;
}
diff --git a/block/ssh.c b/block/ssh.c
index 84d01e8..b4375cf 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -1295,7 +1295,8 @@ static int64_t ssh_getlength(BlockDriverState *bs)
}
static int coroutine_fn ssh_co_truncate(BlockDriverState *bs, int64_t offset,
- PreallocMode prealloc, Error **errp)
+ bool exact, PreallocMode prealloc,
+ Error **errp)
{
BDRVSSHState *s = bs->opaque;
diff --git a/block/trace-events b/block/trace-events
index b8d70f5..6ba86de 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -45,9 +45,9 @@ backup_do_cow_return(void *job, int64_t offset, uint64_t bytes, int ret) "job %p
block_copy_skip(void *bcs, int64_t start) "bcs %p start %"PRId64
block_copy_skip_range(void *bcs, int64_t start, uint64_t bytes) "bcs %p start %"PRId64" bytes %"PRId64
block_copy_process(void *bcs, int64_t start) "bcs %p start %"PRId64
-block_copy_with_bounce_buffer_read_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
-block_copy_with_bounce_buffer_write_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
-block_copy_with_offload_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_copy_range_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_read_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_write_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
# ../blockdev.c
qmp_block_job_cancel(void *job) "job %p"
@@ -152,9 +152,12 @@ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6,
nvme_handle_event(void *s) "s %p"
nvme_poll_cb(void *s) "s %p"
nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
+nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
nvme_dma_map_flush(void *s) "s %p"
nvme_free_req_queue_wait(void *q) "q %p"
nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
diff --git a/block/vdi.c b/block/vdi.c
index 806ba7f..0142da7 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -874,7 +874,7 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
}
if (image_type == VDI_TYPE_STATIC) {
- ret = blk_truncate(blk, offset + blocks * block_size,
+ ret = blk_truncate(blk, offset + blocks * block_size, false,
PREALLOC_MODE_OFF, errp);
if (ret < 0) {
error_prepend(errp, "Failed to statically allocate file");
diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index fdd3a7a..13a49c2 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -557,8 +557,8 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
ret = -EINVAL;
goto exit;
}
- ret = bdrv_truncate(bs->file, new_file_size, PREALLOC_MODE_OFF,
- NULL);
+ ret = bdrv_truncate(bs->file, new_file_size, false,
+ PREALLOC_MODE_OFF, NULL);
if (ret < 0) {
goto exit;
}
diff --git a/block/vhdx.c b/block/vhdx.c
index 371f226..f02d261 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1263,7 +1263,7 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
return -EINVAL;
}
- return bdrv_truncate(bs->file, *new_offset + s->block_size,
+ return bdrv_truncate(bs->file, *new_offset + s->block_size, false,
PREALLOC_MODE_OFF, NULL);
}
@@ -1702,12 +1702,13 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
if (type == VHDX_TYPE_DYNAMIC) {
/* All zeroes, so we can just extend the file - the end of the BAT
* is the furthest thing we have written yet */
- ret = blk_truncate(blk, data_file_offset, PREALLOC_MODE_OFF, errp);
+ ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
+ errp);
if (ret < 0) {
goto exit;
}
} else if (type == VHDX_TYPE_FIXED) {
- ret = blk_truncate(blk, data_file_offset + image_size,
+ ret = blk_truncate(blk, data_file_offset + image_size, false,
PREALLOC_MODE_OFF, errp);
if (ret < 0) {
goto exit;
diff --git a/block/vmdk.c b/block/vmdk.c
index fed3b50..20e909d 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2076,7 +2076,7 @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
return length;
}
length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
- ret = bdrv_truncate(s->extents[i].file, length,
+ ret = bdrv_truncate(s->extents[i].file, length, false,
PREALLOC_MODE_OFF, NULL);
if (ret < 0) {
return ret;
@@ -2118,7 +2118,7 @@ static int vmdk_init_extent(BlockBackend *blk,
int gd_buf_size;
if (flat) {
- ret = blk_truncate(blk, filesize, PREALLOC_MODE_OFF, errp);
+ ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp);
goto exit;
}
magic = cpu_to_be32(VMDK4_MAGIC);
@@ -2181,7 +2181,7 @@ static int vmdk_init_extent(BlockBackend *blk,
goto exit;
}
- ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9,
+ ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
PREALLOC_MODE_OFF, errp);
if (ret < 0) {
goto exit;
@@ -2523,7 +2523,7 @@ static int coroutine_fn vmdk_co_do_create(int64_t size,
/* bdrv_pwrite write padding zeros to align to sector, we don't need that
* for description file */
if (desc_offset == 0) {
- ret = blk_truncate(blk, desc_len, PREALLOC_MODE_OFF, errp);
+ ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp);
if (ret < 0) {
goto exit;
}
diff --git a/block/vpc.c b/block/vpc.c
index 5cd3890..a655502 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -898,7 +898,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
/* Add footer to total size */
total_size += HEADER_SIZE;
- ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
+ ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, errp);
if (ret < 0) {
return ret;
}