From 3f3f20dcd34fc2fcf6dea2fe4e9b45d1c4d67288 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 11 Feb 2015 17:19:57 +0100
Subject: vpc: Fix size in fixed image creation

If total_sectors is rounded to match the geometry, total_size needs to
be changed as well. Otherwise we end up with an image whose geometry
describes a disk larger than the image file, which doesn't end well.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 block/vpc.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/vpc.c b/block/vpc.c
index 46803b1..7fddbf0 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -801,6 +801,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
     }
 
     total_sectors = (int64_t) cyls * heads * secs_per_cyl;
+    total_size = total_sectors * BDRV_SECTOR_SIZE;
 
     /* Prepare the Hard Disk Footer */
     memset(buf, 0, 1024);
@@ -822,13 +823,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
     /* Version of Virtual PC 2007 */
     footer->major = cpu_to_be16(0x0005);
     footer->minor = cpu_to_be16(0x0003);
-    if (disk_type == VHD_DYNAMIC) {
-        footer->orig_size = cpu_to_be64(total_sectors * 512);
-        footer->size = cpu_to_be64(total_sectors * 512);
-    } else {
-        footer->orig_size = cpu_to_be64(total_size);
-        footer->size = cpu_to_be64(total_size);
-    }
+    footer->orig_size = cpu_to_be64(total_size);
+    footer->size = cpu_to_be64(total_size);
     footer->cyls = cpu_to_be16(cyls);
     footer->heads = heads;
     footer->secs_per_cyl = secs_per_cyl;
-- 
cgit v1.1


From 0cc84887068eeb59eed84dbab6547b39e83d739c Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 11 Feb 2015 15:56:01 +0100
Subject: vpc: Implement bdrv_co_get_block_status()

This implements bdrv_co_get_block_status() for VHD images. This can
significantly speed up qemu-img convert operations, because sparseness
can only be taken into account when this function is implemented.
(Before, converting a 1 TB empty image took several minutes for me; now
it's instantaneous.)

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 block/vpc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/vpc.c b/block/vpc.c
index 7fddbf0..1533b6a 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -597,6 +597,51 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
+static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *pnum)
+{
+    BDRVVPCState *s = bs->opaque;
+    VHDFooter *footer = (VHDFooter*) s->footer_buf;
+    int64_t start, offset, next;
+    bool allocated;
+    int n;
+
+    if (be32_to_cpu(footer->type) == VHD_FIXED) {
+        *pnum = nb_sectors;
+        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA |
+               (sector_num << BDRV_SECTOR_BITS);
+    }
+
+    offset = get_sector_offset(bs, sector_num, 0);
+    start = offset;
+    allocated = (offset != -1);
+    *pnum = 0;
+
+    do {
+        /* All sectors in a block are contiguous (without using the bitmap) */
+        n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
+          - sector_num;
+        n = MIN(n, nb_sectors);
+
+        *pnum += n;
+        sector_num += n;
+        nb_sectors -= n;
+        next = start + (*pnum * BDRV_SECTOR_SIZE);
+
+        if (nb_sectors == 0) {
+            break;
+        }
+
+        offset = get_sector_offset(bs, sector_num, 0);
+    } while ((allocated && offset == next) || (!allocated && offset == -1));
+
+    if (allocated) {
+        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
+    } else {
+        return 0;
+    }
+}
+
 /*
  * Calculates the number of cylinders, heads and sectors per cylinder
  * based on a given number of sectors. This is the algorithm described
@@ -903,8 +948,9 @@ static BlockDriver bdrv_vpc = {
     .bdrv_reopen_prepare    = vpc_reopen_prepare,
     .bdrv_create            = vpc_create,
 
-    .bdrv_read              = vpc_co_read,
-    .bdrv_write             = vpc_co_write,
+    .bdrv_read                  = vpc_co_read,
+    .bdrv_write                 = vpc_co_write,
+    .bdrv_co_get_block_status   = vpc_co_get_block_status,
 
     .bdrv_get_info          = vpc_get_info,
 
-- 
cgit v1.1


From 876eb1b0cc2b04927739cba10e4e73e8b990d65e Mon Sep 17 00:00:00 2001
From: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Date: Fri, 13 Feb 2015 18:20:53 +0900
Subject: sheepdog: selectable object size support

Previously, the sheepdog block driver in qemu used a hard-coded VDI
object size. This patch lets users select the VDI object size.

When you start qemu, you don't need to specify any additional
command-line option.

However, when you create a VDI with a non-default object size using the
qemu-img command, you need to specify the object_size option.

For example, to create a VDI with an 8 MB object size, use the
following command:

 # qemu-img create -o object_size=8M sheepdog:test1 100M

In addition, when you don't specify the object_size option, the default
value of the sheepdog cluster is used for creating the VDI.

 # qemu-img create sheepdog:test2 100M

Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Acked-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 133 insertions(+), 22 deletions(-)

(limited to 'block')

diff --git a/block/sheepdog.c b/block/sheepdog.c
index d17ee36..a2679c2 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -37,6 +37,7 @@
 #define SD_OP_READ_VDIS      0x15
 #define SD_OP_FLUSH_VDI      0x16
 #define SD_OP_DEL_VDI        0x17
+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
 
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
@@ -91,6 +92,7 @@
 #define SD_NR_VDIS   (1U << 24)
 #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
 #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
+#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
 /*
  * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
  * (SD_EC_MAX_STRIP - 1) for parity strips
@@ -167,7 +169,8 @@ typedef struct SheepdogVdiReq {
     uint32_t base_vdi_id;
     uint8_t copies;
     uint8_t copy_policy;
-    uint8_t reserved[2];
+    uint8_t store_policy;
+    uint8_t block_size_shift;
     uint32_t snapid;
     uint32_t type;
     uint32_t pad[2];
@@ -186,6 +189,21 @@ typedef struct SheepdogVdiRsp {
     uint32_t pad[5];
 } SheepdogVdiRsp;
 
+typedef struct SheepdogClusterRsp {
+    uint8_t proto_ver;
+    uint8_t opcode;
+    uint16_t flags;
+    uint32_t epoch;
+    uint32_t id;
+    uint32_t data_length;
+    uint32_t result;
+    uint8_t nr_copies;
+    uint8_t copy_policy;
+    uint8_t block_size_shift;
+    uint8_t __pad1;
+    uint32_t __pad2[6];
+} SheepdogClusterRsp;
+
 typedef struct SheepdogInode {
     char name[SD_MAX_VDI_LEN];
     char tag[SD_MAX_VDI_TAG_LEN];
@@ -1541,6 +1559,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
     hdr.vdi_size = s->inode.vdi_size;
     hdr.copy_policy = s->inode.copy_policy;
     hdr.copies = s->inode.nr_copies;
+    hdr.block_size_shift = s->inode.block_size_shift;
 
     ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
 
@@ -1566,9 +1585,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
 static int sd_prealloc(const char *filename, Error **errp)
 {
     BlockDriverState *bs = NULL;
+    BDRVSheepdogState *base = NULL;
+    unsigned long buf_size;
     uint32_t idx, max_idx;
+    uint32_t object_size;
     int64_t vdi_size;
-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
+    void *buf = NULL;
     int ret;
 
     ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
@@ -1582,18 +1604,24 @@ static int sd_prealloc(const char *filename, Error **errp)
         ret = vdi_size;
         goto out;
     }
-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
+
+    base = bs->opaque;
+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
+    buf = g_malloc0(buf_size);
+
+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
 
     for (idx = 0; idx < max_idx; idx++) {
         /*
          * The created image can be a cloned image, so we need to read
          * a data from the source image.
          */
-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
         if (ret < 0) {
             goto out;
         }
-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
         if (ret < 0) {
             goto out;
         }
@@ -1666,6 +1694,27 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
     return 0;
 }
 
+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
+{
+    struct SheepdogInode *inode = &s->inode;
+    uint64_t object_size;
+    int obj_order;
+
+    object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
+    if (object_size) {
+        if ((object_size - 1) & object_size) {    /* not a power of 2? */
+            return -EINVAL;
+        }
+        obj_order = ffs(object_size) - 1;
+        if (obj_order < 20 || obj_order > 31) {
+            return -EINVAL;
+        }
+        inode->block_size_shift = (uint8_t)obj_order;
+    }
+
+    return 0;
+}
+
 static int sd_create(const char *filename, QemuOpts *opts,
                      Error **errp)
 {
@@ -1676,6 +1725,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
     BDRVSheepdogState *s;
     char tag[SD_MAX_VDI_TAG_LEN];
     uint32_t snapid;
+    uint64_t max_vdi_size;
     bool prealloc = false;
 
     s = g_new0(BDRVSheepdogState, 1);
@@ -1714,10 +1764,11 @@ static int sd_create(const char *filename, QemuOpts *opts,
             goto out;
         }
     }
-
-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
-        error_setg(errp, "too big image size");
-        ret = -EINVAL;
+    ret = parse_block_size_shift(s, opts);
+    if (ret < 0) {
+        error_setg(errp, "Invalid object_size."
+                         " obect_size needs to be power of 2"
+                         " and be limited from 2^20 to 2^31");
         goto out;
     }
 
@@ -1754,6 +1805,51 @@ static int sd_create(const char *filename, QemuOpts *opts,
     }
 
     s->aio_context = qemu_get_aio_context();
+
+    /* if block_size_shift is not specified, get cluster default value */
+    if (s->inode.block_size_shift == 0) {
+        SheepdogVdiReq hdr;
+        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
+        Error *local_err = NULL;
+        int fd;
+        unsigned int wlen = 0, rlen = 0;
+
+        fd = connect_to_sdog(s, &local_err);
+        if (fd < 0) {
+            error_report("%s", error_get_pretty(local_err));
+            error_free(local_err);
+            ret = -EIO;
+            goto out;
+        }
+
+        memset(&hdr, 0, sizeof(hdr));
+        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
+        hdr.proto_ver = SD_PROTO_VER;
+
+        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+                     NULL, &wlen, &rlen);
+        closesocket(fd);
+        if (ret) {
+            error_setg_errno(errp, -ret, "failed to get cluster default");
+            goto out;
+        }
+        if (rsp->result == SD_RES_SUCCESS) {
+            s->inode.block_size_shift = rsp->block_size_shift;
+        } else {
+            s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
+        }
+    }
+
+    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
+
+    if (s->inode.vdi_size > max_vdi_size) {
+        error_setg(errp, "An image is too large."
+                         " The maximum image size is %"PRIu64 "GB",
+                         max_vdi_size / 1024 / 1024 / 1024);
+        ret = -EINVAL;
+        goto out;
+    }
+
     ret = do_sd_create(s, &vid, 0, errp);
     if (ret) {
         goto out;
@@ -1823,11 +1919,13 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
     BDRVSheepdogState *s = bs->opaque;
     int ret, fd;
     unsigned int datalen;
+    uint64_t max_vdi_size;
 
+    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
     if (offset < s->inode.vdi_size) {
         error_report("shrinking is not supported");
         return -EINVAL;
-    } else if (offset > SD_MAX_VDI_SIZE) {
+    } else if (offset > max_vdi_size) {
         error_report("too big image size");
         return -EINVAL;
     }
@@ -2005,9 +2103,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
     SheepdogAIOCB *acb = p;
     int ret = 0;
     unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
-    unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
+    unsigned long idx;
+    uint32_t object_size;
     uint64_t oid;
-    uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
+    uint64_t offset;
     BDRVSheepdogState *s = acb->common.bs->opaque;
     SheepdogInode *inode = &s->inode;
     AIOReq *aio_req;
@@ -2024,6 +2123,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
         }
     }
 
+    object_size = (UINT32_C(1) << inode->block_size_shift);
+    idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
+    offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
+
     /*
      * Make sure we don't free the aiocb before we are done with all requests.
      * This additional reference is dropped at the end of this function.
@@ -2037,7 +2140,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
 
         oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
 
-        len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
+        len = MIN(total - done, object_size - offset);
 
         switch (acb->aiocb_type) {
         case AIOCB_READ_UDATA:
@@ -2061,7 +2164,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
              * We discard the object only when the whole object is
              * 1) allocated 2) trimmed. Otherwise, simply skip it.
              */
-            if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) {
+            if (len != object_size || inode->data_vdi_id[idx] == 0) {
                 goto done;
             }
             break;
@@ -2414,6 +2517,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
     uint64_t offset;
     uint32_t vdi_index;
     uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
+    uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
 
     fd = connect_to_sdog(s, &local_err);
     if (fd < 0) {
@@ -2422,10 +2526,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
     }
 
     while (remaining) {
-        vdi_index = pos / SD_DATA_OBJ_SIZE;
-        offset = pos % SD_DATA_OBJ_SIZE;
+        vdi_index = pos / object_size;
+        offset = pos % object_size;
 
-        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
+        data_len = MIN(remaining, object_size - offset);
 
         vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
 
@@ -2512,10 +2616,11 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
+    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
     uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
-    unsigned long start = offset / SD_DATA_OBJ_SIZE,
+    unsigned long start = offset / object_size,
                   end = DIV_ROUND_UP((sector_num + nb_sectors) *
-                                     BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
+                                     BDRV_SECTOR_SIZE, object_size);
     unsigned long idx;
     int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 
@@ -2534,7 +2639,7 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
         }
     }
 
-    *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE;
+    *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
     if (*pnum > nb_sectors) {
         *pnum = nb_sectors;
     }
@@ -2545,14 +2650,15 @@ static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
-    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
+    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
+    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
     uint64_t size = 0;
 
     for (i = 0; i < last; i++) {
         if (inode->data_vdi_id[i] == 0) {
             continue;
         }
-        size += SD_DATA_OBJ_SIZE;
+        size += object_size;
     }
     return size;
 }
@@ -2581,6 +2687,11 @@ static QemuOptsList sd_create_opts = {
             .type = QEMU_OPT_STRING,
             .help = "Redundancy of the image"
         },
+        {
+            .name = BLOCK_OPT_OBJECT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Object size of the image"
+        },
         { /* end of list */ }
     }
 };
-- 
cgit v1.1


From a6dcf097fad2773fdee9ea12f8452dcc259e9ee1 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Thu, 12 Feb 2015 08:35:49 +0300
Subject: block/raw-posix: fix compilation warning on OSX

block/raw-posix.c:947:19: warning: unused variable 's' [-Wunused-variable]
    BDRVRawState *s = aiocb->bs->opaque;

This variable is used only when one of the following macros is defined:
CONFIG_XFS, CONFIG_FALLOCATE, CONFIG_FALLOCATE_PUNCH_HOLE or
CONFIG_FALLOCATE_ZERO_RANGE. Fortunately, CONFIG_FALLOCATE_PUNCH_HOLE
and CONFIG_FALLOCATE_ZERO_RANGE can be defined only along with
CONFIG_FALLOCATE. Therefore checking for CONFIG_XFS or CONFIG_FALLOCATE
is enough.

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Peter Maydell <peter.maydell@linaro.org>
CC: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/raw-posix.c b/block/raw-posix.c
index b5f077a..c0b46ca 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -944,7 +944,9 @@ static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
 
 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
 {
+#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
     BDRVRawState *s = aiocb->bs->opaque;
+#endif
 
     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
         return handle_aiocb_write_zeroes_block(aiocb);
-- 
cgit v1.1


From 20a1f9d07125bead22efd1dc208b4d14ae1b2a21 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 17 Feb 2015 14:47:54 +0100
Subject: qcow2: Remove unused struct QCowCreateState

The only user went away five years ago with commit a9420734 ('qcow2:
Simplify image creation'). It's about time to remove it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qcow2.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'block')

diff --git a/block/qcow2.h b/block/qcow2.h
index 6e39a1b..0fee29b 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -275,17 +275,6 @@ typedef struct BDRVQcowState {
     bool cache_discards;
 } BDRVQcowState;
 
-/* XXX: use std qcow open function ? */
-typedef struct QCowCreateState {
-    int cluster_size;
-    int cluster_bits;
-    uint16_t *refcount_block;
-    uint64_t *refcount_table;
-    int64_t l1_table_offset;
-    int64_t refcount_table_offset;
-    int64_t refcount_block_offset;
-} QCowCreateState;
-
 struct QCowAIOCB;
 
 typedef struct Qcow2COWRegion {
-- 
cgit v1.1


From 346a53df38e29021e4a9e8c2a759b05744902857 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:43 -0500
Subject: qcow2: Add two new fields to BDRVQcowState

Add two new fields regarding refcount information (the bit width of
every entry and the maximum refcount value) to the BDRVQcowState.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 4 ++--
 block/qcow2.c          | 3 +++
 block/qcow2.h          | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 9b80ca7..e124a54 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -584,7 +584,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 
         refcount = be16_to_cpu(refcount_block[block_index]);
         refcount += addend;
-        if (refcount < 0 || refcount > 0xffff) {
+        if (refcount < 0 || refcount > s->refcount_max) {
             ret = -EINVAL;
             goto fail;
         }
@@ -775,7 +775,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
             return refcount;
         }
 
-        if (refcount == 0xffff) {
+        if (refcount == s->refcount_max) {
             offset = 0;
         }
     }
diff --git a/block/qcow2.c b/block/qcow2.c
index 50e0a94..e04ba6d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -684,6 +684,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
     s->refcount_order = header.refcount_order;
+    s->refcount_bits = 1 << s->refcount_order;
+    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
+    s->refcount_max += s->refcount_max - 1;
 
     if (header.crypt_method > QCOW_CRYPT_AES) {
         error_setg(errp, "Unsupported encryption method: %" PRIu32,
diff --git a/block/qcow2.h b/block/qcow2.h
index 0fee29b..55138c9 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -258,6 +258,8 @@ typedef struct BDRVQcowState {
     int qcow_version;
     bool use_lazy_refcounts;
     int refcount_order;
+    int refcount_bits;
+    uint64_t refcount_max;
 
     bool discard_passthrough[QCOW2_DISCARD_MAX];
 
-- 
cgit v1.1


From 0709c5a1530b046183b6e96d9631affcff76c1fc Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:44 -0500
Subject: qcow2: Add refcount_bits to format-specific info

Add the bit width of every refcount entry to the format-specific
information.

In contrast to lazy_refcounts and the corrupt flag, this should always
be emitted, even for compat=0.10, although that version does not support
any refcount width other than 16 bits. This is because if a boolean is
optional, one normally assumes it to be false when omitted; but if an
integer is not specified, it is rather difficult to guess its value.

This new field breaks some test outputs, fix them.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/qcow2.c b/block/qcow2.c
index e04ba6d..5c92803 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2482,7 +2482,8 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
     };
     if (s->qcow_version == 2) {
         *spec_info->qcow2 = (ImageInfoSpecificQCow2){
-            .compat = g_strdup("0.10"),
+            .compat             = g_strdup("0.10"),
+            .refcount_bits      = s->refcount_bits,
         };
     } else if (s->qcow_version == 3) {
         *spec_info->qcow2 = (ImageInfoSpecificQCow2){
@@ -2493,6 +2494,7 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
             .corrupt            = s->incompatible_features &
                                   QCOW2_INCOMPAT_CORRUPT,
             .has_corrupt        = true,
+            .refcount_bits      = s->refcount_bits,
         };
     }
 
-- 
cgit v1.1


From c6e9d8ae6629aaf3c6d483032d219b988d78583f Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:45 -0500
Subject: qcow2: Do not return new value after refcount update

qcow2_update_cluster_refcount() does not have any quick access to the
new refcount value, it has to call qcow2_get_refcount(). Some callers do
not need that new value at all, others call qcow2_get_refcount()
themselves anyway (albeit in a different code path, which can however be
easily changed), therefore there is no advantage in making
qcow2_update_cluster_refcount() return the new value. Drop it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index e124a54..1a85bcd 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -631,8 +631,7 @@ fail:
 /*
  * Increases or decreases the refcount of a given cluster.
  *
- * If the return value is non-negative, it is the new refcount of the cluster.
- * If it is negative, it is -errno and indicates an error.
+ * On success 0 is returned; on failure -errno is returned.
  */
 int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
@@ -648,7 +647,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs,
         return ret;
     }
 
-    return qcow2_get_refcount(bs, cluster_index);
+    return 0;
 }
 
 
@@ -976,13 +975,15 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             break;
                         }
                         if (addend != 0) {
-                            refcount = qcow2_update_cluster_refcount(bs,
+                            ret = qcow2_update_cluster_refcount(bs,
                                     cluster_index, addend,
                                     QCOW2_DISCARD_SNAPSHOT);
-                        } else {
-                            refcount = qcow2_get_refcount(bs, cluster_index);
+                            if (ret < 0) {
+                                goto fail;
+                            }
                         }
 
+                        refcount = qcow2_get_refcount(bs, cluster_index);
                         if (refcount < 0) {
                             ret = refcount;
                             goto fail;
@@ -1017,11 +1018,15 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 
 
             if (addend != 0) {
-                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
-                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
-            } else {
-                refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
+                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
+                                                        s->cluster_bits,
+                                                    addend,
+                                                    QCOW2_DISCARD_SNAPSHOT);
+                if (ret < 0) {
+                    goto fail;
+                }
             }
+            refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
             if (refcount < 0) {
                 ret = refcount;
                 goto fail;
-- 
cgit v1.1


From 7324c10f96c821b00d691e2c8ced67d8536bf1d6 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:46 -0500
Subject: qcow2: Only return status from qcow2_get_refcount

Refcounts can theoretically be of type uint64_t; in order to be able to
represent the full range, qcow2_get_refcount() cannot use a single
variable to represent both all refcount values and also keep some values
reserved for errors.

One solution would be to add an Error pointer parameter to
qcow2_get_refcount(); however, no caller could (currently) pass that
error message, so it would have to be emitted immediately and be
passed to the next caller by returning -EIO or something similar.
Therefore, an Error parameter does not offer any advantages here.

The solution applied by this patch is simpler to use. Because no caller
would be able to pass the error message, they would have to print it and
free it, whereas with this patch the caller only needs to pass the
returned integer (which is often a no-op from the code perspective,
because that integer will be stored in a variable "ret" which will be
returned by the fail path of many callers).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c  |  8 ++---
 block/qcow2-refcount.c | 86 +++++++++++++++++++++++++++-----------------------
 block/qcow2.h          |  3 +-
 3 files changed, 53 insertions(+), 44 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 183177d..ee50500 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1640,7 +1640,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
     for (i = 0; i < l1_size; i++) {
         uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
         bool l2_dirty = false;
-        int l2_refcount;
+        uint16_t l2_refcount;
 
         if (!l2_offset) {
             /* unallocated */
@@ -1672,9 +1672,9 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
             goto fail;
         }
 
-        l2_refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
-        if (l2_refcount < 0) {
-            ret = l2_refcount;
+        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
+                                 &l2_refcount);
+        if (ret < 0) {
             goto fail;
         }
 
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 1a85bcd..d853490 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -87,26 +87,29 @@ static int load_refcount_block(BlockDriverState *bs,
 }
 
 /*
- * Returns the refcount of the cluster given by its index. Any non-negative
- * return value is the refcount of the cluster, negative values are -errno
- * and indicate an error.
+ * Retrieves the refcount of the cluster given by its index and stores it in
+ * *refcount. Returns 0 on success and -errno on failure.
  */
-int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index)
+int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
+                       uint16_t *refcount)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t refcount_table_index, block_index;
     int64_t refcount_block_offset;
     int ret;
     uint16_t *refcount_block;
-    uint16_t refcount;
 
     refcount_table_index = cluster_index >> s->refcount_block_bits;
-    if (refcount_table_index >= s->refcount_table_size)
+    if (refcount_table_index >= s->refcount_table_size) {
+        *refcount = 0;
         return 0;
+    }
     refcount_block_offset =
         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
-    if (!refcount_block_offset)
+    if (!refcount_block_offset) {
+        *refcount = 0;
         return 0;
+    }
 
     if (offset_into_cluster(s, refcount_block_offset)) {
         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
@@ -122,7 +125,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index)
     }
 
     block_index = cluster_index & (s->refcount_block_size - 1);
-    refcount = be16_to_cpu(refcount_block[block_index]);
+    *refcount = be16_to_cpu(refcount_block[block_index]);
 
     ret = qcow2_cache_put(bs, s->refcount_block_cache,
         (void**) &refcount_block);
@@ -130,7 +133,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index)
         return ret;
     }
 
-    return refcount;
+    return 0;
 }
 
 /*
@@ -662,16 +665,17 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t i, nb_clusters;
-    int refcount;
+    uint16_t refcount;
+    int ret;
 
     nb_clusters = size_to_clusters(s, size);
 retry:
     for(i = 0; i < nb_clusters; i++) {
         uint64_t next_cluster_index = s->free_cluster_index++;
-        refcount = qcow2_get_refcount(bs, next_cluster_index);
+        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
 
-        if (refcount < 0) {
-            return refcount;
+        if (ret < 0) {
+            return ret;
         } else if (refcount != 0) {
             goto retry;
         }
@@ -721,7 +725,8 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
     BDRVQcowState *s = bs->opaque;
     uint64_t cluster_index;
     uint64_t i;
-    int refcount, ret;
+    uint16_t refcount;
+    int ret;
 
     assert(nb_clusters >= 0);
     if (nb_clusters == 0) {
@@ -732,10 +737,9 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
         /* Check how many clusters there are free */
         cluster_index = offset >> s->cluster_bits;
         for(i = 0; i < nb_clusters; i++) {
-            refcount = qcow2_get_refcount(bs, cluster_index++);
-
-            if (refcount < 0) {
-                return refcount;
+            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
+            if (ret < 0) {
+                return ret;
             } else if (refcount != 0) {
                 break;
             }
@@ -769,9 +773,10 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
     offset = s->free_byte_offset;
 
     if (offset) {
-        int refcount = qcow2_get_refcount(bs, offset >> s->cluster_bits);
-        if (refcount < 0) {
-            return refcount;
+        uint16_t refcount;
+        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
+        if (ret < 0) {
+            return ret;
         }
 
         if (refcount == s->refcount_max) {
@@ -878,7 +883,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
     bool l1_allocated = false;
     int64_t old_offset, old_l2_offset;
-    int i, j, l1_modified = 0, nb_csectors, refcount;
+    int i, j, l1_modified = 0, nb_csectors;
+    uint16_t refcount;
     int ret;
 
     l2_table = NULL;
@@ -983,9 +989,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             }
                         }
 
-                        refcount = qcow2_get_refcount(bs, cluster_index);
-                        if (refcount < 0) {
-                            ret = refcount;
+                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
+                        if (ret < 0) {
                             goto fail;
                         }
                         break;
@@ -1026,9 +1031,9 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                     goto fail;
                 }
             }
-            refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
-            if (refcount < 0) {
-                ret = refcount;
+            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
+                                     &refcount);
+            if (ret < 0) {
                 goto fail;
             } else if (refcount == 1) {
                 l2_offset |= QCOW_OFLAG_COPIED;
@@ -1346,7 +1351,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
     BDRVQcowState *s = bs->opaque;
     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
     int ret;
-    int refcount;
+    uint16_t refcount;
     int i, j;
 
     for (i = 0; i < s->l1_size; i++) {
@@ -1358,8 +1363,9 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
             continue;
         }
 
-        refcount = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits);
-        if (refcount < 0) {
+        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
+                                 &refcount);
+        if (ret < 0) {
             /* don't print message nor increment check_errors */
             continue;
         }
@@ -1400,9 +1406,10 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
 
             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
-                refcount = qcow2_get_refcount(bs,
-                                              data_offset >> s->cluster_bits);
-                if (refcount < 0) {
+                ret = qcow2_get_refcount(bs,
+                                         data_offset >> s->cluster_bits,
+                                         &refcount);
+                if (ret < 0) {
                     /* don't print message nor increment check_errors */
                     continue;
                 }
@@ -1634,13 +1641,14 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 {
     BDRVQcowState *s = bs->opaque;
     int64_t i;
-    int refcount1, refcount2, ret;
+    uint16_t refcount1, refcount2;
+    int ret;
 
     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
-        refcount1 = qcow2_get_refcount(bs, i);
-        if (refcount1 < 0) {
+        ret = qcow2_get_refcount(bs, i, &refcount1);
+        if (ret < 0) {
             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
-                i, strerror(-refcount1));
+                    i, strerror(-ret));
             res->check_errors++;
             continue;
         }
@@ -1670,7 +1678,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 
             if (num_fixed) {
                 ret = update_refcount(bs, i << s->cluster_bits, 1,
-                                      refcount2 - refcount1,
+                                      (int)refcount2 - (int)refcount1,
                                       QCOW2_DISCARD_ALWAYS);
                 if (ret >= 0) {
                     (*num_fixed)++;
diff --git a/block/qcow2.h b/block/qcow2.h
index 55138c9..a33431f 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -478,7 +478,8 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
 int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);
 
-int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index);
+int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
+                       uint16_t *refcount);
 
 int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
                                   int addend, enum qcow2_discard_type type);
-- 
cgit v1.1


From 2aabe7c7a16cee6b1b54592fa05b5f9c23c89bc0 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:47 -0500
Subject: qcow2: Use unsigned addend for update_refcount()

update_refcount() and qcow2_update_cluster_refcount() currently take a
signed addend. At least one caller passes a value directly derived from
an absolute refcount that should be reached ("l2_refcount - 1" in
expand_zero_clusters_in_l1()). Therefore, the addend should be unsigned
as well; this will be especially important for 64-bit refcounts.

Because update_refcount() then no longer knows whether the refcount
should be increased or decreased, it now requires an additional flag
which specifies exactly that. The same applies to
qcow2_update_cluster_refcount().

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c  |  3 ++-
 block/qcow2-refcount.c | 65 +++++++++++++++++++++++++++++++++-----------------
 block/qcow2.h          |  8 ++++++-
 3 files changed, 52 insertions(+), 24 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index ee50500..405329a 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1707,7 +1707,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                     /* For shared L2 tables, set the refcount accordingly (it is
                      * already 1 and needs to be l2_refcount) */
                     ret = qcow2_update_cluster_refcount(bs,
-                            offset >> s->cluster_bits, l2_refcount - 1,
+                            offset >> s->cluster_bits,
+                            refcount_diff(1, l2_refcount), false,
                             QCOW2_DISCARD_OTHER);
                     if (ret < 0) {
                         qcow2_free_clusters(bs, offset, s->cluster_size,
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index d853490..354a355 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,8 +29,8 @@
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-                            int64_t offset, int64_t length,
-                            int addend, enum qcow2_discard_type type);
+                            int64_t offset, int64_t length, uint16_t addend,
+                            bool decrease, enum qcow2_discard_type type);
 
 
 /*********************************************************/
@@ -263,7 +263,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
     } else {
         /* Described somewhere else. This can recurse at most twice before we
          * arrive at a block that describes itself. */
-        ret = update_refcount(bs, new_block, s->cluster_size, 1,
+        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                               QCOW2_DISCARD_NEVER);
         if (ret < 0) {
             goto fail_block;
@@ -530,8 +530,14 @@ found:
 }
 
 /* XXX: cache several refcount block clusters ? */
+/* @addend is the absolute value of the addend; if @decrease is set, @addend
+ * will be subtracted from the current refcount, otherwise it will be added */
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-    int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
+                                                   int64_t offset,
+                                                   int64_t length,
+                                                   uint16_t addend,
+                                                   bool decrease,
+                                                   enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
     int64_t start, last, cluster_offset;
@@ -540,8 +546,9 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     int ret;
 
 #ifdef DEBUG_ALLOC2
-    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
-           offset, length, addend);
+    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
+            " addend=%s%" PRIu16 "\n", offset, length, decrease ? "-" : "",
+            addend);
 #endif
     if (length < 0) {
         return -EINVAL;
@@ -549,7 +556,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         return 0;
     }
 
-    if (addend < 0) {
+    if (decrease) {
         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
             s->l2_table_cache);
     }
@@ -559,7 +566,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     for(cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size)
     {
-        int block_index, refcount;
+        int block_index;
+        uint16_t refcount;
         int64_t cluster_index = cluster_offset >> s->cluster_bits;
         int64_t table_index = cluster_index >> s->refcount_block_bits;
 
@@ -586,11 +594,18 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         block_index = cluster_index & (s->refcount_block_size - 1);
 
         refcount = be16_to_cpu(refcount_block[block_index]);
-        refcount += addend;
-        if (refcount < 0 || refcount > s->refcount_max) {
+        if (decrease ? ((uint16_t)(refcount - addend) > refcount)
+                     : ((uint16_t)(refcount + addend) < refcount ||
+                        (uint16_t)(refcount + addend) > s->refcount_max))
+        {
             ret = -EINVAL;
             goto fail;
         }
+        if (decrease) {
+            refcount -= addend;
+        } else {
+            refcount += addend;
+        }
         if (refcount == 0 && cluster_index < s->free_cluster_index) {
             s->free_cluster_index = cluster_index;
         }
@@ -623,8 +638,8 @@ fail:
      */
     if (ret < 0) {
         int dummy;
-        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
-                                QCOW2_DISCARD_NEVER);
+        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
+                                !decrease, QCOW2_DISCARD_NEVER);
         (void)dummy;
     }
 
@@ -634,18 +649,21 @@ fail:
 /*
  * Increases or decreases the refcount of a given cluster.
  *
+ * @addend is the absolute value of the addend; if @decrease is set, @addend
+ * will be subtracted from the current refcount, otherwise it will be added.
+ *
  * On success 0 is returned; on failure -errno is returned.
  */
 int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
-                                  int addend,
+                                  uint16_t addend, bool decrease,
                                   enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
 
     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
-                          type);
+                          decrease, type);
     if (ret < 0) {
         return ret;
     }
@@ -709,7 +727,7 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
             return offset;
         }
 
-        ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
     } while (ret == -EAGAIN);
 
     if (ret < 0) {
@@ -746,7 +764,7 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
         }
 
         /* And then allocate them */
-        ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                               QCOW2_DISCARD_NEVER);
     } while (ret == -EAGAIN);
 
@@ -797,7 +815,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
     }
 
     assert(offset);
-    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+    ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
     if (ret < 0) {
         return ret;
     }
@@ -821,7 +839,7 @@ void qcow2_free_clusters(BlockDriverState *bs,
     int ret;
 
     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
-    ret = update_refcount(bs, offset, size, -1, type);
+    ret = update_refcount(bs, offset, size, 1, true, type);
     if (ret < 0) {
         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
         /* TODO Remember the clusters to free them later and avoid leaking */
@@ -887,6 +905,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     uint16_t refcount;
     int ret;
 
+    assert(addend >= -1 && addend <= 1);
+
     l2_table = NULL;
     l1_table = NULL;
     l1_size2 = l1_size * sizeof(uint64_t);
@@ -951,7 +971,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                         if (addend != 0) {
                             ret = update_refcount(bs,
                                 (offset & s->cluster_offset_mask) & ~511,
-                                nb_csectors * 512, addend,
+                                nb_csectors * 512, abs(addend), addend < 0,
                                 QCOW2_DISCARD_SNAPSHOT);
                             if (ret < 0) {
                                 goto fail;
@@ -982,7 +1002,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                         }
                         if (addend != 0) {
                             ret = qcow2_update_cluster_refcount(bs,
-                                    cluster_index, addend,
+                                    cluster_index, abs(addend), addend < 0,
                                     QCOW2_DISCARD_SNAPSHOT);
                             if (ret < 0) {
                                 goto fail;
@@ -1025,7 +1045,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
             if (addend != 0) {
                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                         s->cluster_bits,
-                                                    addend,
+                                                    abs(addend), addend < 0,
                                                     QCOW2_DISCARD_SNAPSHOT);
                 if (ret < 0) {
                     goto fail;
@@ -1678,7 +1698,8 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 
             if (num_fixed) {
                 ret = update_refcount(bs, i << s->cluster_bits, 1,
-                                      (int)refcount2 - (int)refcount1,
+                                      refcount_diff(refcount1, refcount2),
+                                      refcount1 > refcount2,
                                       QCOW2_DISCARD_ALWAYS);
                 if (ret >= 0) {
                     (*num_fixed)++;
diff --git a/block/qcow2.h b/block/qcow2.h
index a33431f..40910d8 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -459,6 +459,11 @@ static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
         + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
 }
 
+static inline uint16_t refcount_diff(uint16_t r1, uint16_t r2)
+{
+    return r1 > r2 ? r1 - r2 : r2 - r1;
+}
+
 // FIXME Need qcow2_ prefix to global functions
 
 /* qcow2.c functions */
@@ -482,7 +487,8 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                        uint16_t *refcount);
 
 int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
-                                  int addend, enum qcow2_discard_type type);
+                                  uint16_t addend, bool decrease,
+                                  enum qcow2_discard_type type);
 
 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size);
 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
-- 
cgit v1.1


From 0e06528e980b8ac7695a219f8405d3cdc52a1381 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:48 -0500
Subject: qcow2: Use 64 bits for refcount values

Refcounts may have a width of up to 64 bits, so qemu should use the same
width to represent refcount values internally.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c  |  2 +-
 block/qcow2-refcount.c | 40 +++++++++++++++++++---------------------
 block/qcow2.h          |  6 +++---
 3 files changed, 23 insertions(+), 25 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 405329a..ed2b44d 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1640,7 +1640,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
     for (i = 0; i < l1_size; i++) {
         uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
         bool l2_dirty = false;
-        uint16_t l2_refcount;
+        uint64_t l2_refcount;
 
         if (!l2_offset) {
             /* unallocated */
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 354a355..e86a1d6 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,7 +29,7 @@
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
-                            int64_t offset, int64_t length, uint16_t addend,
+                            int64_t offset, int64_t length, uint64_t addend,
                             bool decrease, enum qcow2_discard_type type);
 
 
@@ -91,7 +91,7 @@ static int load_refcount_block(BlockDriverState *bs,
  * *refcount. Returns 0 on success and -errno on failure.
  */
 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
-                       uint16_t *refcount)
+                       uint64_t *refcount)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t refcount_table_index, block_index;
@@ -535,7 +535,7 @@ found:
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                    int64_t offset,
                                                    int64_t length,
-                                                   uint16_t addend,
+                                                   uint64_t addend,
                                                    bool decrease,
                                                    enum qcow2_discard_type type)
 {
@@ -547,7 +547,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 
 #ifdef DEBUG_ALLOC2
     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
-            " addend=%s%" PRIu16 "\n", offset, length, decrease ? "-" : "",
+            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
             addend);
 #endif
     if (length < 0) {
@@ -567,7 +567,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         cluster_offset += s->cluster_size)
     {
         int block_index;
-        uint16_t refcount;
+        uint64_t refcount;
         int64_t cluster_index = cluster_offset >> s->cluster_bits;
         int64_t table_index = cluster_index >> s->refcount_block_bits;
 
@@ -594,9 +594,9 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         block_index = cluster_index & (s->refcount_block_size - 1);
 
         refcount = be16_to_cpu(refcount_block[block_index]);
-        if (decrease ? ((uint16_t)(refcount - addend) > refcount)
-                     : ((uint16_t)(refcount + addend) < refcount ||
-                        (uint16_t)(refcount + addend) > s->refcount_max))
+        if (decrease ? (refcount - addend > refcount)
+                     : (refcount + addend < refcount ||
+                        refcount + addend > s->refcount_max))
         {
             ret = -EINVAL;
             goto fail;
@@ -656,7 +656,7 @@ fail:
  */
 int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                   int64_t cluster_index,
-                                  uint16_t addend, bool decrease,
+                                  uint64_t addend, bool decrease,
                                   enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
@@ -682,8 +682,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs,
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t i, nb_clusters;
-    uint16_t refcount;
+    uint64_t i, nb_clusters, refcount;
     int ret;
 
     nb_clusters = size_to_clusters(s, size);
@@ -741,9 +740,8 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
     int nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t cluster_index;
+    uint64_t cluster_index, refcount;
     uint64_t i;
-    uint16_t refcount;
     int ret;
 
     assert(nb_clusters >= 0);
@@ -791,7 +789,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
     offset = s->free_byte_offset;
 
     if (offset) {
-        uint16_t refcount;
+        uint64_t refcount;
         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
         if (ret < 0) {
             return ret;
@@ -898,11 +896,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     int64_t l1_table_offset, int l1_size, int addend)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
+    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
     bool l1_allocated = false;
     int64_t old_offset, old_l2_offset;
     int i, j, l1_modified = 0, nb_csectors;
-    uint16_t refcount;
     int ret;
 
     assert(addend >= -1 && addend <= 1);
@@ -1371,7 +1368,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
     BDRVQcowState *s = bs->opaque;
     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
     int ret;
-    uint16_t refcount;
+    uint64_t refcount;
     int i, j;
 
     for (i = 0; i < s->l1_size; i++) {
@@ -1391,7 +1388,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
         }
         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
-                    "l1_entry=%" PRIx64 " refcount=%d\n",
+                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                     fix & BDRV_FIX_ERRORS ? "Repairing" :
                                             "ERROR",
                     i, l1_entry, refcount);
@@ -1435,7 +1432,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                 }
                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
-                            "l2_entry=%" PRIx64 " refcount=%d\n",
+                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                             fix & BDRV_FIX_ERRORS ? "Repairing" :
                                                     "ERROR",
                             l2_entry, refcount);
@@ -1661,7 +1658,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 {
     BDRVQcowState *s = bs->opaque;
     int64_t i;
-    uint16_t refcount1, refcount2;
+    uint64_t refcount1, refcount2;
     int ret;
 
     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
@@ -1690,7 +1687,8 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                 num_fixed = &res->corruptions_fixed;
             }
 
-            fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
+            fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
+                    " reference=%" PRIu64 "\n",
                    num_fixed != NULL     ? "Repairing" :
                    refcount1 < refcount2 ? "ERROR" :
                                            "Leaked",
diff --git a/block/qcow2.h b/block/qcow2.h
index 40910d8..a9108f5 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -459,7 +459,7 @@ static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
         + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
 }
 
-static inline uint16_t refcount_diff(uint16_t r1, uint16_t r2)
+static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
 {
     return r1 > r2 ? r1 - r2 : r2 - r1;
 }
@@ -484,10 +484,10 @@ int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);
 
 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
-                       uint16_t *refcount);
+                       uint64_t *refcount);
 
 int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
-                                  uint16_t addend, bool decrease,
+                                  uint64_t addend, bool decrease,
                                   enum qcow2_discard_type type);
 
 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size);
-- 
cgit v1.1


From 5fee192efdfe2161d392491e6f68bffb406b18e1 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:49 -0500
Subject: qcow2: Helper for refcount array reallocation

Add a helper function for reallocating a refcount array, independent of
the refcount order. The newly allocated space is zeroed and the function
handles failed reallocations gracefully.

The helper function will always align the buffer size to a cluster
boundary; if storing the refcounts in such an array in big endian byte
order, this makes it possible to write parts of the array directly as
refcount blocks into the image file.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 130 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 81 insertions(+), 49 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index e86a1d6..497364f 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1095,6 +1095,63 @@ fail:
 /* refcount checking functions */
 
 
+static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries)
+{
+    /* This assertion holds because there is no way we can address more than
+     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
+     * offsets have to be representable in bytes); due to every cluster
+     * corresponding to one refcount entry, we are well below that limit */
+    assert(entries < (UINT64_C(1) << (64 - 9)));
+
+    /* Thanks to the assertion this will not overflow, because
+     * s->refcount_order < 7.
+     * (note: x << s->refcount_order == x * s->refcount_bits) */
+    return DIV_ROUND_UP(entries << s->refcount_order, 8);
+}
+
+/**
+ * Reallocates *array so that it can hold new_size entries. *size must contain
+ * the current number of entries in *array. If the reallocation fails, *array
+ * and *size will not be modified and -errno will be returned. If the
+ * reallocation is successful, *array will be set to the new buffer, *size
+ * will be set to new_size and 0 will be returned. The size of the reallocated
+ * refcount array buffer will be aligned to a cluster boundary, and the newly
+ * allocated area will be zeroed.
+ */
+static int realloc_refcount_array(BDRVQcowState *s, uint16_t **array,
+                                  int64_t *size, int64_t new_size)
+{
+    size_t old_byte_size, new_byte_size;
+    uint16_t *new_ptr;
+
+    /* Round to clusters so the array can be directly written to disk */
+    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
+                    * s->cluster_size;
+    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
+                    * s->cluster_size;
+
+    if (new_byte_size == old_byte_size) {
+        *size = new_size;
+        return 0;
+    }
+
+    assert(new_byte_size > 0);
+
+    new_ptr = g_try_realloc(*array, new_byte_size);
+    if (!new_ptr) {
+        return -ENOMEM;
+    }
+
+    if (new_byte_size > old_byte_size) {
+        memset((void *)((uintptr_t)new_ptr + old_byte_size), 0,
+               new_byte_size - old_byte_size);
+    }
+
+    *array = new_ptr;
+    *size  = new_size;
+
+    return 0;
+}
 
 /*
  * Increases the refcount for a range of clusters in a given refcount table.
@@ -1111,6 +1168,7 @@ static int inc_refcounts(BlockDriverState *bs,
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t start, last, cluster_offset, k;
+    int ret;
 
     if (size <= 0) {
         return 0;
@@ -1122,23 +1180,12 @@ static int inc_refcounts(BlockDriverState *bs,
         cluster_offset += s->cluster_size) {
         k = cluster_offset >> s->cluster_bits;
         if (k >= *refcount_table_size) {
-            int64_t old_refcount_table_size = *refcount_table_size;
-            uint16_t *new_refcount_table;
-
-            *refcount_table_size = k + 1;
-            new_refcount_table = g_try_realloc(*refcount_table,
-                                               *refcount_table_size *
-                                               sizeof(**refcount_table));
-            if (!new_refcount_table) {
-                *refcount_table_size = old_refcount_table_size;
+            ret = realloc_refcount_array(s, refcount_table,
+                                         refcount_table_size, k + 1);
+            if (ret < 0) {
                 res->check_errors++;
-                return -ENOMEM;
+                return ret;
             }
-            *refcount_table = new_refcount_table;
-
-            memset(*refcount_table + old_refcount_table_size, 0,
-                   (*refcount_table_size - old_refcount_table_size) *
-                   sizeof(**refcount_table));
         }
 
         if (++(*refcount_table)[k] == 0) {
@@ -1507,8 +1554,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
 
             if (fix & BDRV_FIX_ERRORS) {
-                int64_t old_nb_clusters = *nb_clusters;
-                uint16_t *new_refcount_table;
+                int64_t new_nb_clusters;
 
                 if (offset > INT64_MAX - s->cluster_size) {
                     ret = -EINVAL;
@@ -1525,22 +1571,15 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                     goto resize_fail;
                 }
 
-                *nb_clusters = size_to_clusters(s, size);
-                assert(*nb_clusters >= old_nb_clusters);
+                new_nb_clusters = size_to_clusters(s, size);
+                assert(new_nb_clusters >= *nb_clusters);
 
-                new_refcount_table = g_try_realloc(*refcount_table,
-                                                   *nb_clusters *
-                                                   sizeof(**refcount_table));
-                if (!new_refcount_table) {
-                    *nb_clusters = old_nb_clusters;
+                ret = realloc_refcount_array(s, refcount_table,
+                                             nb_clusters, new_nb_clusters);
+                if (ret < 0) {
                     res->check_errors++;
-                    return -ENOMEM;
+                    return ret;
                 }
-                *refcount_table = new_refcount_table;
-
-                memset(*refcount_table + old_nb_clusters, 0,
-                       (*nb_clusters - old_nb_clusters) *
-                       sizeof(**refcount_table));
 
                 if (cluster >= *nb_clusters) {
                     ret = -EINVAL;
@@ -1600,10 +1639,12 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
     int ret;
 
     if (!*refcount_table) {
-        *refcount_table = g_try_new0(uint16_t, *nb_clusters);
-        if (*nb_clusters && *refcount_table == NULL) {
+        int64_t old_size = 0;
+        ret = realloc_refcount_array(s, refcount_table,
+                                     &old_size, *nb_clusters);
+        if (ret < 0) {
             res->check_errors++;
-            return -ENOMEM;
+            return ret;
         }
     }
 
@@ -1737,6 +1778,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
     int64_t cluster = *first_free_cluster, i;
     bool first_gap = true;
     int contiguous_free_clusters;
+    int ret;
 
     /* Starting at *first_free_cluster, find a range of at least cluster_count
      * continuously free clusters */
@@ -1766,28 +1808,18 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
     /* If no such range could be found, grow the in-memory refcount table
      * accordingly to append free clusters at the end of the image */
     if (contiguous_free_clusters < cluster_count) {
-        int64_t old_imrt_nb_clusters = *imrt_nb_clusters;
-        uint16_t *new_refcount_table;
-
         /* contiguous_free_clusters clusters are already empty at the image end;
          * we need cluster_count clusters; therefore, we have to allocate
          * cluster_count - contiguous_free_clusters new clusters at the end of
          * the image (which is the current value of cluster; note that cluster
          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
          * the image end) */
-        *imrt_nb_clusters = cluster + cluster_count - contiguous_free_clusters;
-        new_refcount_table = g_try_realloc(*refcount_table,
-                                           *imrt_nb_clusters *
-                                           sizeof(**refcount_table));
-        if (!new_refcount_table) {
-            *imrt_nb_clusters = old_imrt_nb_clusters;
-            return -ENOMEM;
-        }
-        *refcount_table = new_refcount_table;
-
-        memset(*refcount_table + old_imrt_nb_clusters, 0,
-               (*imrt_nb_clusters - old_imrt_nb_clusters) *
-               sizeof(**refcount_table));
+        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
+                                     cluster + cluster_count
+                                     - contiguous_free_clusters);
+        if (ret < 0) {
+            return ret;
+        }
     }
 
     /* Go back to the first free cluster */
-- 
cgit v1.1


From 7453c96b78c2b09aa72924f933bb9616e5474194 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:50 -0500
Subject: qcow2: Helper function for refcount modification

Since refcounts do not always have to be a uint16_t, refcount blocks and
arrays in memory should no longer have a specific type (thus they become
pointers to void), and two helper functions (a getter and a setter) are
used for accessing them. Those functions are called indirectly through
function pointers in the BDRVQcowState so they may later be exchanged
for different refcount orders.

With the check and repair functions using these helper functions, the
refcount array they are creating will be in big endian byte order;
additionally, using realloc_refcount_array() makes the size of this
refcount array always cluster-aligned. Both combined allow
rebuild_refcount_structure() to drop the bounce buffer which was used to
convert parts of the refcount array to big endian byte order and store
them on disk. Instead, those parts can now be written directly.

[ kwolf: Fixed a build failure on 32 bit and another with old glib ]

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 126 ++++++++++++++++++++++++++++---------------------
 block/qcow2.h          |   8 ++++
 2 files changed, 81 insertions(+), 53 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 497364f..c0c1313 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -32,6 +32,11 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int64_t offset, int64_t length, uint64_t addend,
                             bool decrease, enum qcow2_discard_type type);
 
+static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
+
+static void set_refcount_ro4(void *refcount_array, uint64_t index,
+                             uint64_t value);
+
 
 /*********************************************************/
 /* refcount handling */
@@ -42,6 +47,9 @@ int qcow2_refcount_init(BlockDriverState *bs)
     unsigned int refcount_table_size2, i;
     int ret;
 
+    s->get_refcount = &get_refcount_ro4;
+    s->set_refcount = &set_refcount_ro4;
+
     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
     s->refcount_table = g_try_malloc(refcount_table_size2);
@@ -72,6 +80,19 @@ void qcow2_refcount_close(BlockDriverState *bs)
 }
 
 
+static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
+{
+    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
+}
+
+static void set_refcount_ro4(void *refcount_array, uint64_t index,
+                             uint64_t value)
+{
+    assert(!(value >> 16));
+    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
+}
+
+
 static int load_refcount_block(BlockDriverState *bs,
                                int64_t refcount_block_offset,
                                void **refcount_block)
@@ -97,7 +118,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
     uint64_t refcount_table_index, block_index;
     int64_t refcount_block_offset;
     int ret;
-    uint16_t *refcount_block;
+    void *refcount_block;
 
     refcount_table_index = cluster_index >> s->refcount_block_bits;
     if (refcount_table_index >= s->refcount_table_size) {
@@ -119,16 +140,15 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
     }
 
     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
-        (void**) &refcount_block);
+                          &refcount_block);
     if (ret < 0) {
         return ret;
     }
 
     block_index = cluster_index & (s->refcount_block_size - 1);
-    *refcount = be16_to_cpu(refcount_block[block_index]);
+    *refcount = s->get_refcount(refcount_block, block_index);
 
-    ret = qcow2_cache_put(bs, s->refcount_block_cache,
-        (void**) &refcount_block);
+    ret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
     if (ret < 0) {
         return ret;
     }
@@ -172,7 +192,7 @@ static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
  * Returns 0 on success or -errno in error case
  */
 static int alloc_refcount_block(BlockDriverState *bs,
-    int64_t cluster_index, uint16_t **refcount_block)
+                                int64_t cluster_index, void **refcount_block)
 {
     BDRVQcowState *s = bs->opaque;
     unsigned int refcount_table_index;
@@ -199,7 +219,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
             }
 
              return load_refcount_block(bs, refcount_block_offset,
-                 (void**) refcount_block);
+                                        refcount_block);
         }
     }
 
@@ -249,7 +269,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
         /* Zero the new refcount block before updating it */
         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
-            (void**) refcount_block);
+                                    refcount_block);
         if (ret < 0) {
             goto fail_block;
         }
@@ -259,7 +279,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
         /* The block describes itself, need to update the cache */
         int block_index = (new_block >> s->cluster_bits) &
             (s->refcount_block_size - 1);
-        (*refcount_block)[block_index] = cpu_to_be16(1);
+        s->set_refcount(*refcount_block, block_index, 1);
     } else {
         /* Described somewhere else. This can recurse at most twice before we
          * arrive at a block that describes itself. */
@@ -277,7 +297,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
         /* Initialize the new refcount block only after updating its refcount,
          * update_refcount uses the refcount cache itself */
         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
-            (void**) refcount_block);
+                                    refcount_block);
         if (ret < 0) {
             goto fail_block;
         }
@@ -311,7 +331,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
         return -EAGAIN;
     }
 
-    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+    ret = qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
     if (ret < 0) {
         goto fail_block;
     }
@@ -365,7 +385,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
         s->cluster_size;
     uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
     uint64_t *new_table = g_try_new0(uint64_t, table_size);
-    uint16_t *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
+    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);
 
     assert(table_size > 0 && blocks_clusters > 0);
     if (new_table == NULL || new_blocks == NULL) {
@@ -387,7 +407,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
     uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
     int block = 0;
     for (i = 0; i < table_clusters + blocks_clusters; i++) {
-        new_blocks[block++] = cpu_to_be16(1);
+        s->set_refcount(new_blocks, block++, 1);
     }
 
     /* Write refcount blocks to disk */
@@ -440,7 +460,7 @@ static int alloc_refcount_block(BlockDriverState *bs,
     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                         QCOW2_DISCARD_OTHER);
 
-    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
+    ret = load_refcount_block(bs, new_block, refcount_block);
     if (ret < 0) {
         return ret;
     }
@@ -455,7 +475,7 @@ fail_table:
     g_free(new_table);
 fail_block:
     if (*refcount_block != NULL) {
-        qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
+        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
     }
     return ret;
 }
@@ -541,7 +561,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
 {
     BDRVQcowState *s = bs->opaque;
     int64_t start, last, cluster_offset;
-    uint16_t *refcount_block = NULL;
+    void *refcount_block = NULL;
     int64_t old_table_index = -1;
     int ret;
 
@@ -575,7 +595,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         if (table_index != old_table_index) {
             if (refcount_block) {
                 ret = qcow2_cache_put(bs, s->refcount_block_cache,
-                    (void**) &refcount_block);
+                                      &refcount_block);
                 if (ret < 0) {
                     goto fail;
                 }
@@ -593,7 +613,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         /* we can update the count and save it */
         block_index = cluster_index & (s->refcount_block_size - 1);
 
-        refcount = be16_to_cpu(refcount_block[block_index]);
+        refcount = s->get_refcount(refcount_block, block_index);
         if (decrease ? (refcount - addend > refcount)
                      : (refcount + addend < refcount ||
                         refcount + addend > s->refcount_max))
@@ -609,7 +629,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
         if (refcount == 0 && cluster_index < s->free_cluster_index) {
             s->free_cluster_index = cluster_index;
         }
-        refcount_block[block_index] = cpu_to_be16(refcount);
+        s->set_refcount(refcount_block, block_index, refcount);
 
         if (refcount == 0 && s->discard_passthrough[type]) {
             update_refcount_discard(bs, cluster_offset, s->cluster_size);
@@ -625,8 +645,7 @@ fail:
     /* Write last changed block to disk */
     if (refcount_block) {
         int wret;
-        wret = qcow2_cache_put(bs, s->refcount_block_cache,
-            (void**) &refcount_block);
+        wret = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
         if (wret < 0) {
             return ret < 0 ? ret : wret;
         }
@@ -1118,11 +1137,11 @@ static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries)
  * refcount array buffer will be aligned to a cluster boundary, and the newly
  * allocated area will be zeroed.
  */
-static int realloc_refcount_array(BDRVQcowState *s, uint16_t **array,
+static int realloc_refcount_array(BDRVQcowState *s, void **array,
                                   int64_t *size, int64_t new_size)
 {
     size_t old_byte_size, new_byte_size;
-    uint16_t *new_ptr;
+    void *new_ptr;
 
     /* Round to clusters so the array can be directly written to disk */
     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
@@ -1162,12 +1181,12 @@ static int realloc_refcount_array(BDRVQcowState *s, uint16_t **array,
  */
 static int inc_refcounts(BlockDriverState *bs,
                          BdrvCheckResult *res,
-                         uint16_t **refcount_table,
+                         void **refcount_table,
                          int64_t *refcount_table_size,
                          int64_t offset, int64_t size)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t start, last, cluster_offset, k;
+    uint64_t start, last, cluster_offset, k, refcount;
     int ret;
 
     if (size <= 0) {
@@ -1188,11 +1207,14 @@ static int inc_refcounts(BlockDriverState *bs,
             }
         }
 
-        if (++(*refcount_table)[k] == 0) {
+        refcount = s->get_refcount(*refcount_table, k);
+        if (refcount == s->refcount_max) {
             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                     "\n", cluster_offset);
             res->corruptions++;
+            continue;
         }
+        s->set_refcount(*refcount_table, k, refcount + 1);
     }
 
     return 0;
@@ -1212,8 +1234,9 @@ enum {
  * error occurred.
  */
 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
-    uint16_t **refcount_table, int64_t *refcount_table_size, int64_t l2_offset,
-    int flags)
+                              void **refcount_table,
+                              int64_t *refcount_table_size, int64_t l2_offset,
+                              int flags)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t *l2_table, l2_entry;
@@ -1330,7 +1353,7 @@ fail:
  */
 static int check_refcounts_l1(BlockDriverState *bs,
                               BdrvCheckResult *res,
-                              uint16_t **refcount_table,
+                              void **refcount_table,
                               int64_t *refcount_table_size,
                               int64_t l1_table_offset, int l1_size,
                               int flags)
@@ -1529,7 +1552,7 @@ fail:
  */
 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                            BdrvCheckMode fix, bool *rebuild,
-                           uint16_t **refcount_table, int64_t *nb_clusters)
+                           void **refcount_table, int64_t *nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
     int64_t i, size;
@@ -1614,9 +1637,10 @@ resize_fail:
             if (ret < 0) {
                 return ret;
             }
-            if ((*refcount_table)[cluster] != 1) {
+            if (s->get_refcount(*refcount_table, cluster) != 1) {
                 fprintf(stderr, "ERROR refcount block %" PRId64
-                        " refcount=%d\n", i, (*refcount_table)[cluster]);
+                        " refcount=%" PRIu64 "\n", i,
+                        s->get_refcount(*refcount_table, cluster));
                 res->corruptions++;
                 *rebuild = true;
             }
@@ -1631,7 +1655,7 @@ resize_fail:
  */
 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                                BdrvCheckMode fix, bool *rebuild,
-                               uint16_t **refcount_table, int64_t *nb_clusters)
+                               void **refcount_table, int64_t *nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
     int64_t i;
@@ -1695,7 +1719,7 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                               BdrvCheckMode fix, bool *rebuild,
                               int64_t *highest_cluster,
-                              uint16_t *refcount_table, int64_t nb_clusters)
+                              void *refcount_table, int64_t nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
     int64_t i;
@@ -1711,7 +1735,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
             continue;
         }
 
-        refcount2 = refcount_table[i];
+        refcount2 = s->get_refcount(refcount_table, i);
 
         if (refcount1 > 0 || refcount2 > 0) {
             *highest_cluster = i;
@@ -1770,7 +1794,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
  */
 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
                                    int cluster_count,
-                                   uint16_t **refcount_table,
+                                   void **refcount_table,
                                    int64_t *imrt_nb_clusters,
                                    int64_t *first_free_cluster)
 {
@@ -1787,7 +1811,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
          contiguous_free_clusters < cluster_count;
          cluster++)
     {
-        if (!(*refcount_table)[cluster]) {
+        if (!s->get_refcount(*refcount_table, cluster)) {
             contiguous_free_clusters++;
             if (first_gap) {
                 /* If this is the first free cluster found, update
@@ -1825,7 +1849,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
     /* Go back to the first free cluster */
     cluster -= contiguous_free_clusters;
     for (i = 0; i < cluster_count; i++) {
-        (*refcount_table)[cluster + i] = 1;
+        s->set_refcount(*refcount_table, cluster + i, 1);
     }
 
     return cluster << s->cluster_bits;
@@ -1841,7 +1865,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs,
  */
 static int rebuild_refcount_structure(BlockDriverState *bs,
                                       BdrvCheckResult *res,
-                                      uint16_t **refcount_table,
+                                      void **refcount_table,
                                       int64_t *nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
@@ -1849,8 +1873,8 @@ static int rebuild_refcount_structure(BlockDriverState *bs,
     int64_t refblock_offset, refblock_start, refblock_index;
     uint32_t reftable_size = 0;
     uint64_t *on_disk_reftable = NULL;
-    uint16_t *on_disk_refblock;
-    int i, ret = 0;
+    void *on_disk_refblock;
+    int ret = 0;
     struct {
         uint64_t reftable_offset;
         uint32_t reftable_clusters;
@@ -1860,7 +1884,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs,
 
 write_refblocks:
     for (; cluster < *nb_clusters; cluster++) {
-        if (!(*refcount_table)[cluster]) {
+        if (!s->get_refcount(*refcount_table, cluster)) {
             continue;
         }
 
@@ -1933,17 +1957,13 @@ write_refblocks:
             goto fail;
         }
 
-        on_disk_refblock = qemu_blockalign0(bs->file, s->cluster_size);
-        for (i = 0; i < s->refcount_block_size &&
-                    refblock_start + i < *nb_clusters; i++)
-        {
-            on_disk_refblock[i] =
-                cpu_to_be16((*refcount_table)[refblock_start + i]);
-        }
+        /* The size of *refcount_table is always cluster-aligned, therefore the
+         * write operation will not overflow */
+        on_disk_refblock = (void *)((char *) *refcount_table +
+                                    refblock_index * s->cluster_size);
 
         ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
-                         (void *)on_disk_refblock, s->cluster_sectors);
-        qemu_vfree(on_disk_refblock);
+                         on_disk_refblock, s->cluster_sectors);
         if (ret < 0) {
             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
             goto fail;
@@ -2038,7 +2058,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
     BDRVQcowState *s = bs->opaque;
     BdrvCheckResult pre_compare_res;
     int64_t size, highest_cluster, nb_clusters;
-    uint16_t *refcount_table = NULL;
+    void *refcount_table = NULL;
     bool rebuild = false;
     int ret;
 
@@ -2087,7 +2107,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
         /* Because the old reftable has been exchanged for a new one the
          * references have to be recalculated */
         rebuild = false;
-        memset(refcount_table, 0, nb_clusters * sizeof(uint16_t));
+        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
                                   &nb_clusters);
         if (ret < 0) {
diff --git a/block/qcow2.h b/block/qcow2.h
index a9108f5..aa6d367 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -213,6 +213,11 @@ typedef struct Qcow2DiscardRegion {
     QTAILQ_ENTRY(Qcow2DiscardRegion) next;
 } Qcow2DiscardRegion;
 
+typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array,
+                                      uint64_t index);
+typedef void Qcow2SetRefcountFunc(void *refcount_array,
+                                  uint64_t index, uint64_t value);
+
 typedef struct BDRVQcowState {
     int cluster_bits;
     int cluster_size;
@@ -261,6 +266,9 @@ typedef struct BDRVQcowState {
     int refcount_bits;
     uint64_t refcount_max;
 
+    Qcow2GetRefcountFunc *get_refcount;
+    Qcow2SetRefcountFunc *set_refcount;
+
     bool discard_passthrough[QCOW2_DISCARD_MAX];
 
     int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
-- 
cgit v1.1


From 59c0cb7830be68080c6a0d1449253b954350e57d Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:51 -0500
Subject: qcow2: More helpers for refcount modification

Add helper functions for getting and setting refcounts in a refcount
array for any possible refcount order, and choose the correct one during
refcount initialization.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-refcount.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 119 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index c0c1313..dc8d186 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -32,10 +32,49 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                             int64_t offset, int64_t length, uint64_t addend,
                             bool decrease, enum qcow2_discard_type type);
 
+static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
+static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
+static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
+static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
+static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
+static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
 
+static void set_refcount_ro0(void *refcount_array, uint64_t index,
+                             uint64_t value);
+static void set_refcount_ro1(void *refcount_array, uint64_t index,
+                             uint64_t value);
+static void set_refcount_ro2(void *refcount_array, uint64_t index,
+                             uint64_t value);
+static void set_refcount_ro3(void *refcount_array, uint64_t index,
+                             uint64_t value);
 static void set_refcount_ro4(void *refcount_array, uint64_t index,
                              uint64_t value);
+static void set_refcount_ro5(void *refcount_array, uint64_t index,
+                             uint64_t value);
+static void set_refcount_ro6(void *refcount_array, uint64_t index,
+                             uint64_t value);
+
+
+static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
+    &get_refcount_ro0,
+    &get_refcount_ro1,
+    &get_refcount_ro2,
+    &get_refcount_ro3,
+    &get_refcount_ro4,
+    &get_refcount_ro5,
+    &get_refcount_ro6
+};
+
+static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
+    &set_refcount_ro0,
+    &set_refcount_ro1,
+    &set_refcount_ro2,
+    &set_refcount_ro3,
+    &set_refcount_ro4,
+    &set_refcount_ro5,
+    &set_refcount_ro6
+};
 
 
 /*********************************************************/
@@ -47,8 +86,10 @@ int qcow2_refcount_init(BlockDriverState *bs)
     unsigned int refcount_table_size2, i;
     int ret;
 
-    s->get_refcount = &get_refcount_ro4;
-    s->set_refcount = &set_refcount_ro4;
+    assert(s->refcount_order >= 0 && s->refcount_order <= 6);
+
+    s->get_refcount = get_refcount_funcs[s->refcount_order];
+    s->set_refcount = set_refcount_funcs[s->refcount_order];
 
     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
@@ -80,6 +121,59 @@ void qcow2_refcount_close(BlockDriverState *bs)
 }
 
 
+static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
+{
+    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
+}
+
+static void set_refcount_ro0(void *refcount_array, uint64_t index,
+                             uint64_t value)
+{
+    assert(!(value >> 1));
+    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
+    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
+}
+
+static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
+{
+    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
+           & 0x3;
+}
+
+static void set_refcount_ro1(void *refcount_array, uint64_t index,
+                             uint64_t value)
+{
+    assert(!(value >> 2));
+    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
+    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
+}
+
+static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
+{
+    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
+           & 0xf;
+}
+
+static void set_refcount_ro2(void *refcount_array, uint64_t index,
+                             uint64_t value)
+{
+    assert(!(value >> 4));
+    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
+    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
+}
+
+static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
+{
+    return ((const uint8_t *)refcount_array)[index];
+}
+
+static void set_refcount_ro3(void *refcount_array, uint64_t index,
+                             uint64_t value)
+{
+    assert(!(value >> 8));
+    ((uint8_t *)refcount_array)[index] = value;
+}
+
 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
 {
     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
@@ -92,6 +186,29 @@ static void set_refcount_ro4(void *refcount_array, uint64_t index,
     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
 }
 
+static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
+{
+    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
+}
+
+static void set_refcount_ro5(void *refcount_array, uint64_t index,
+                             uint64_t value)
+{
+    assert(!(value >> 32));
+    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
+}
+
+static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
+{
+    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
+}
+
+static void set_refcount_ro6(void *refcount_array, uint64_t index,
+                             uint64_t value)
+{
+    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
+}
+
 
 static int load_refcount_block(BlockDriverState *bs,
                                int64_t refcount_block_offset,
-- 
cgit v1.1


From b72faf9f78377e38342f78ffacdfa0726f5e7511 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 10 Feb 2015 15:28:52 -0500
Subject: qcow2: Open images with refcount order != 4

No longer refuse to open images with a different refcount entry width
than 16 bits; only reject images with a refcount width larger than 64
bits (which is prohibited by the specification).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/qcow2.c b/block/qcow2.c
index 5c92803..52731ab 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -677,10 +677,10 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* Check support for various header values */
-    if (header.refcount_order != 4) {
-        report_unsupported(bs, errp, "%d bit reference counts",
-                           1 << header.refcount_order);
-        ret = -ENOTSUP;
+    if (header.refcount_order > 6) {
+        error_setg(errp, "Reference count entry width too large; may not "
+                   "exceed 64 bits");
+        ret = -EINVAL;
         goto fail;
     }
     s->refcount_order = header.refcount_order;
-- 
cgit v1.1


From bd4b167f84840865b788e9f236585f77a77b10fc Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Feb 2015 17:40:46 -0500
Subject: qcow2: refcount_order parameter for qcow2_create2

Add a refcount_order parameter to qcow2_create2(), use that value for
the image header and for calculating the size required for
preallocation.

For now, always pass 4.

This addition requires changes to the calculation of the file size for
the "full" and "falloc" preallocation modes. That in turn is a nice
opportunity to add a comment about that calculation not necessarily
being exact (and that being intentional).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)

(limited to 'block')

diff --git a/block/qcow2.c b/block/qcow2.c
index 52731ab..7a35f97 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1783,7 +1783,7 @@ static int preallocate(BlockDriverState *bs)
 static int qcow2_create2(const char *filename, int64_t total_size,
                          const char *backing_file, const char *backing_format,
                          int flags, size_t cluster_size, PreallocMode prealloc,
-                         QemuOpts *opts, int version,
+                         QemuOpts *opts, int version, int refcount_order,
                          Error **errp)
 {
     /* Calculate cluster_bits */
@@ -1816,9 +1816,21 @@ static int qcow2_create2(const char *filename, int64_t total_size,
     int ret;
 
     if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
+        /* Note: The following calculation does not need to be exact; if it is a
+         * bit off, either some bytes will be "leaked" (which is fine) or we
+         * will need to increase the file size by some bytes (which is fine,
+         * too, as long as the bulk is allocated here). Therefore, using
+         * floating point arithmetic is fine. */
         int64_t meta_size = 0;
         uint64_t nreftablee, nrefblocke, nl1e, nl2e;
         int64_t aligned_total_size = align_offset(total_size, cluster_size);
+        int refblock_bits, refblock_size;
+        /* refcount entry size in bytes */
+        double rces = (1 << refcount_order) / 8.;
+
+        /* see qcow2_open() */
+        refblock_bits = cluster_bits - (refcount_order - 3);
+        refblock_size = 1 << refblock_bits;
 
         /* header: 1 cluster */
         meta_size += cluster_size;
@@ -1843,20 +1855,20 @@ static int qcow2_create2(const char *filename, int64_t total_size,
          *   c = cluster size
          *   y1 = number of refcount blocks entries
          *   y2 = meta size including everything
+         *   rces = refcount entry size in bytes
          * then,
          *   y1 = (y2 + a)/c
-         *   y2 = y1 * sizeof(u16) + y1 * sizeof(u16) * sizeof(u64) / c + m
+         *   y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m
          * we can get y1:
-         *   y1 = (a + m) / (c - sizeof(u16) - sizeof(u16) * sizeof(u64) / c)
+         *   y1 = (a + m) / (c - rces - rces * sizeof(u64) / c)
          */
-        nrefblocke = (aligned_total_size + meta_size + cluster_size) /
-            (cluster_size - sizeof(uint16_t) -
-             1.0 * sizeof(uint16_t) * sizeof(uint64_t) / cluster_size);
-        nrefblocke = align_offset(nrefblocke, cluster_size / sizeof(uint16_t));
-        meta_size += nrefblocke * sizeof(uint16_t);
+        nrefblocke = (aligned_total_size + meta_size + cluster_size)
+                   / (cluster_size - rces - rces * sizeof(uint64_t)
+                                                 / cluster_size);
+        meta_size += DIV_ROUND_UP(nrefblocke, refblock_size) * cluster_size;
 
         /* total size of refcount tables */
-        nreftablee = nrefblocke * sizeof(uint16_t) / cluster_size;
+        nreftablee = nrefblocke / refblock_size;
         nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t));
         meta_size += nreftablee * sizeof(uint64_t);
 
@@ -1892,7 +1904,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
         .l1_size                    = cpu_to_be32(0),
         .refcount_table_offset      = cpu_to_be64(cluster_size),
         .refcount_table_clusters    = cpu_to_be32(1),
-        .refcount_order             = cpu_to_be32(4),
+        .refcount_order             = cpu_to_be32(refcount_order),
         .header_length              = cpu_to_be32(sizeof(*header)),
     };
 
@@ -2011,6 +2023,8 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
     size_t cluster_size = DEFAULT_CLUSTER_SIZE;
     PreallocMode prealloc;
     int version = 3;
+    uint64_t refcount_bits = 16;
+    int refcount_order;
     Error *local_err = NULL;
     int ret;
 
@@ -2065,8 +2079,19 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
         goto finish;
     }
 
+    if (version < 3 && refcount_bits != 16) {
+        error_setg(errp, "Different refcount widths than 16 bits require "
+                   "compatibility level 1.1 or above (use compat=1.1 or "
+                   "greater)");
+        ret = -EINVAL;
+        goto finish;
+    }
+
+    refcount_order = ffs(refcount_bits) - 1;
+
     ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
-                        cluster_size, prealloc, opts, version, &local_err);
+                        cluster_size, prealloc, opts, version, refcount_order,
+                        &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
     }
-- 
cgit v1.1


From 8a17b83cc3d4aa61ed24e2682aaf8ada7130f03d Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Feb 2015 17:40:47 -0500
Subject: qcow2: Use symbolic macros in qcow2_amend_options

qcow2_amend_options() should not compare options against some inline
strings but rather use the symbolic macros available for each of the
creation options.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'block')

diff --git a/block/qcow2.c b/block/qcow2.c
index 7a35f97..64bb7b8 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2672,8 +2672,8 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
             continue;
         }
 
-        if (!strcmp(desc->name, "compat")) {
-            compat = qemu_opt_get(opts, "compat");
+        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
+            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
             if (!compat) {
                 /* preserve default */
             } else if (!strcmp(compat, "0.10")) {
@@ -2684,32 +2684,33 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
                 fprintf(stderr, "Unknown compatibility level %s.\n", compat);
                 return -EINVAL;
             }
-        } else if (!strcmp(desc->name, "preallocation")) {
+        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
             fprintf(stderr, "Cannot change preallocation mode.\n");
             return -ENOTSUP;
-        } else if (!strcmp(desc->name, "size")) {
-            new_size = qemu_opt_get_size(opts, "size", 0);
-        } else if (!strcmp(desc->name, "backing_file")) {
-            backing_file = qemu_opt_get(opts, "backing_file");
-        } else if (!strcmp(desc->name, "backing_fmt")) {
-            backing_format = qemu_opt_get(opts, "backing_fmt");
-        } else if (!strcmp(desc->name, "encryption")) {
-            encrypt = qemu_opt_get_bool(opts, "encryption", s->crypt_method);
+        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
+            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
+        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
+        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
+            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
+        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
+            encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
+                                        s->crypt_method);
             if (encrypt != !!s->crypt_method) {
                 fprintf(stderr, "Changing the encryption flag is not "
                         "supported.\n");
                 return -ENOTSUP;
             }
-        } else if (!strcmp(desc->name, "cluster_size")) {
-            cluster_size = qemu_opt_get_size(opts, "cluster_size",
+        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
+            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
                                              cluster_size);
             if (cluster_size != s->cluster_size) {
                 fprintf(stderr, "Changing the cluster size is not "
                         "supported.\n");
                 return -ENOTSUP;
             }
-        } else if (!strcmp(desc->name, "lazy_refcounts")) {
-            lazy_refcounts = qemu_opt_get_bool(opts, "lazy_refcounts",
+        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
+            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
                                                lazy_refcounts);
         } else {
             /* if this assertion fails, this probably means a new option was
-- 
cgit v1.1


From 06d05fa738915ab82577289a0b81b3e600c4e749 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Feb 2015 17:40:49 -0500
Subject: qcow2: Allow creation with refcount order != 4

Add a creation option to qcow2 for setting the refcount order of images
to be created, and respect that option's value.

This breaks some test outputs, fix them.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'block')

diff --git a/block/qcow2.c b/block/qcow2.c
index 64bb7b8..8bfb094 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2079,6 +2079,15 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
         goto finish;
     }
 
+    refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS,
+                                            refcount_bits);
+    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
+        error_setg(errp, "Refcount width must be a power of two and may not "
+                   "exceed 64 bits");
+        ret = -EINVAL;
+        goto finish;
+    }
+
     if (version < 3 && refcount_bits != 16) {
         error_setg(errp, "Different refcount widths than 16 bits require "
                    "compatibility level 1.1 or above (use compat=1.1 or "
@@ -2712,6 +2721,9 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
         } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
             lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
                                                lazy_refcounts);
+        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
+            error_report("Cannot change refcount entry width");
+            return -ENOTSUP;
         } else {
             /* if this assertion fails, this probably means a new option was
              * added without having it covered here */
@@ -2881,6 +2893,12 @@ static QemuOptsList qcow2_create_opts = {
             .help = "Postpone refcount updates",
             .def_value_str = "off"
         },
+        {
+            .name = BLOCK_OPT_REFCOUNT_BITS,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Width of a reference count entry in bits",
+            .def_value_str = "16"
+        },
         { /* end of list */ }
     }
 };
-- 
cgit v1.1


From a069e2f1372a0a823ab506fc019852a2a652aa54 Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Fri, 6 Feb 2015 16:26:17 -0500
Subject: blkdebug: fix "once" rule

Background:
  The blkdebug scripts are currently engineered so that when a debug
event occurs, a prefilter browses a master list of parsed rules for a
certain event and adds them to an "active list" of rules to be used for
the forthcoming action, provided the events and state numbers match.

  Then, once the request is received, the last active rule is used to
inject an error if certain parameters match.

  This active list is cleared every time the prefilter injects a new
rule for the first time during a debug event.

  The "once" rule currently causes the error injection, if it is
triggered, to only clear the active list. This is insufficient for
preventing future injections of the same rule.

Remedy:
  This patch /deletes/ the rule from the list that the prefilter
browses, so it is gone for good. In V2, we remove only the rule of
interest from the active list instead of allowing the "once" rule to
clear the entire list of active rules.

Impact:
  This affects iotests 026. Several ENOSPC tests that used "once" can
be seen to have output that shows multiple failure messages. After
this patch, the error messages tend to be smaller and less severe, but
the injection can still be seen to be working. I have patched the
expected output to expect the smaller error messages.

Signed-off-by: John Snow <jsnow@redhat.com>
Message-id: 1423257977-25630-1-git-send-email-jsnow@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/blkdebug.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 9ce35cd..63611e0 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -472,12 +472,14 @@ static BlockAIOCB *inject_error(BlockDriverState *bs,
     int error = rule->options.inject.error;
     struct BlkdebugAIOCB *acb;
     QEMUBH *bh;
+    bool immediately = rule->options.inject.immediately;
 
     if (rule->options.inject.once) {
-        QSIMPLEQ_INIT(&s->active_rules);
+        QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next);
+        remove_rule(rule);
     }
 
-    if (rule->options.inject.immediately) {
+    if (immediately) {
         return NULL;
     }
 
-- 
cgit v1.1


From 8a4ed0d1b16d8932362ffecf8f6f79e6340a585f Mon Sep 17 00:00:00 2001
From: Ekaterina Tumanova <tumanova@linux.vnet.ibm.com>
Date: Mon, 16 Feb 2015 12:47:55 +0100
Subject: raw-posix: Factor block size detection out of raw_probe_alignment()

Put it in new probe_logical_blocksize().

Signed-off-by: Ekaterina Tumanova <tumanova@linux.vnet.ibm.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1424087278-49393-3-git-send-email-tumanova@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 51 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 16 deletions(-)

(limited to 'block')

diff --git a/block/raw-posix.c b/block/raw-posix.c
index c0b46ca..34d403d 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -218,39 +218,58 @@ static int raw_normalize_devicepath(const char **filename)
 }
 #endif
 
-static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
+/*
+ * Get logical block size via ioctl. On success store it in @sector_size_p.
+ */
+static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
 {
-    BDRVRawState *s = bs->opaque;
-    char *buf;
     unsigned int sector_size;
+    bool success = false;
 
-    /* For /dev/sg devices the alignment is not really used.
-       With buffered I/O, we don't have any restrictions. */
-    if (bs->sg || !s->needs_alignment) {
-        bs->request_alignment = 1;
-        s->buf_align = 1;
-        return;
-    }
+    errno = ENOTSUP;
 
     /* Try a few ioctls to get the right size */
-    bs->request_alignment = 0;
-    s->buf_align = 0;
-
 #ifdef BLKSSZGET
     if (ioctl(fd, BLKSSZGET, &sector_size) >= 0) {
-        bs->request_alignment = sector_size;
+        *sector_size_p = sector_size;
+        success = true;
     }
 #endif
 #ifdef DKIOCGETBLOCKSIZE
     if (ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
-        bs->request_alignment = sector_size;
+        *sector_size_p = sector_size;
+        success = true;
     }
 #endif
 #ifdef DIOCGSECTORSIZE
     if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
-        bs->request_alignment = sector_size;
+        *sector_size_p = sector_size;
+        success = true;
     }
 #endif
+
+    return success ? 0 : -errno;
+}
+
+static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
+{
+    BDRVRawState *s = bs->opaque;
+    char *buf;
+
+    /* For /dev/sg devices the alignment is not really used.
+       With buffered I/O, we don't have any restrictions. */
+    if (bs->sg || !s->needs_alignment) {
+        bs->request_alignment = 1;
+        s->buf_align = 1;
+        return;
+    }
+
+    bs->request_alignment = 0;
+    s->buf_align = 0;
+    /* Let's try to use the logical blocksize for the alignment. */
+    if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) {
+        bs->request_alignment = 0;
+    }
 #ifdef CONFIG_XFS
     if (s->is_xfs) {
         struct dioattr da;
-- 
cgit v1.1


From 1a9335e4a94d0f0c635a5284aa3ba5dc50d24f70 Mon Sep 17 00:00:00 2001
From: Ekaterina Tumanova <tumanova@linux.vnet.ibm.com>
Date: Mon, 16 Feb 2015 12:47:56 +0100
Subject: block: Add driver methods to probe blocksizes and geometry

Introduce driver methods for probing disk blocksizes (physical and
logical) and hard drive geometry.
The methods are only implemented for "host_device". For "raw" devices,
the driver calls the child's method.

For now, geometry detection will only work for DASD devices. To check
for a DASD, a local check_for_dasd function was introduced. It calls the
BIODASDINFO2 ioctl and returns its rc.

The blocksize detection function will likewise only probe sizes for DASD
devices.

Signed-off-by: Ekaterina Tumanova <tumanova@linux.vnet.ibm.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1424087278-49393-4-git-send-email-tumanova@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/raw_bsd.c   |  12 +++++++
 2 files changed, 115 insertions(+)

(limited to 'block')

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 34d403d..3263d2b 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -56,6 +56,10 @@
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
+#include <linux/hdreg.h>
+#ifdef __s390__
+#include <asm/dasd.h>
+#endif
 #ifndef FS_NOCOW_FL
 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
 #endif
@@ -251,6 +255,23 @@ static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
     return success ? 0 : -errno;
 }
 
+/**
+ * Get physical block size of @fd.
+ * On success, store it in @blk_size and return 0.
+ * On failure, return -errno.
+ */
+static int probe_physical_blocksize(int fd, unsigned int *blk_size)
+{
+#ifdef BLKPBSZGET
+    if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
+        return -errno;
+    }
+    return 0;
+#else
+    return -ENOTSUP;
+#endif
+}
+
 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -674,6 +695,86 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
     bs->bl.opt_mem_alignment = s->buf_align;
 }
 
+static int check_for_dasd(int fd)
+{
+#ifdef BIODASDINFO2
+    struct dasd_information2_t info = {0};
+
+    return ioctl(fd, BIODASDINFO2, &info);
+#else
+    return -1;
+#endif
+}
+
+/**
+ * Try to get @bs's logical and physical block size.
+ * On success, store them in @bsz and return zero.
+ * On failure, return negative errno.
+ */
+static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
+{
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
+    /* If DASD, get blocksizes */
+    if (check_for_dasd(s->fd) < 0) {
+        return -ENOTSUP;
+    }
+    ret = probe_logical_blocksize(s->fd, &bsz->log);
+    if (ret < 0) {
+        return ret;
+    }
+    return probe_physical_blocksize(s->fd, &bsz->phys);
+}
+
+/**
+ * Try to get @bs's geometry: cyls, heads, sectors.
+ * On success, store them in @geo and return 0.
+ * On failure return -errno.
+ * (Allows block driver to assign default geometry values that guest sees)
+ */
+#ifdef __linux__
+static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
+{
+    BDRVRawState *s = bs->opaque;
+    struct hd_geometry ioctl_geo = {0};
+    uint32_t blksize;
+
+    /* If DASD, get its geometry */
+    if (check_for_dasd(s->fd) < 0) {
+        return -ENOTSUP;
+    }
+    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
+        return -errno;
+    }
+    /* HDIO_GETGEO may return success even though geo contains zeros
+       (e.g. certain multipath setups) */
+    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
+        return -ENOTSUP;
+    }
+    /* Do not return a geometry for partition */
+    if (ioctl_geo.start != 0) {
+        return -ENOTSUP;
+    }
+    geo->heads = ioctl_geo.heads;
+    geo->sectors = ioctl_geo.sectors;
+    if (!probe_physical_blocksize(s->fd, &blksize)) {
+        /* overwrite cyls: HDIO_GETGEO result is incorrect for big drives */
+        geo->cylinders = bdrv_nb_sectors(bs) / (blksize / BDRV_SECTOR_SIZE)
+                                             / (geo->heads * geo->sectors);
+        return 0;
+    }
+    geo->cylinders = ioctl_geo.cylinders;
+
+    return 0;
+}
+#else /* __linux__ */
+static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
+{
+    return -ENOTSUP;
+}
+#endif
+
 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 {
     int ret;
@@ -2215,6 +2316,8 @@ static BlockDriver bdrv_host_device = {
     .bdrv_get_info = raw_get_info,
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
+    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+    .bdrv_probe_geometry = hdev_probe_geometry,
 
     .bdrv_detach_aio_context = raw_detach_aio_context,
     .bdrv_attach_aio_context = raw_attach_aio_context,
diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 05b02c7..e3d2d04 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -235,6 +235,16 @@ static int raw_probe(const uint8_t *buf, int buf_size, const char *filename)
     return 1;
 }
 
+static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
+{
+    return bdrv_probe_blocksizes(bs->file, bsz);
+}
+
+static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
+{
+    return bdrv_probe_geometry(bs->file, geo);
+}
+
 BlockDriver bdrv_raw = {
     .format_name          = "raw",
     .bdrv_probe           = &raw_probe,
@@ -252,6 +262,8 @@ BlockDriver bdrv_raw = {
     .has_variable_length  = true,
     .bdrv_get_info        = &raw_get_info,
     .bdrv_refresh_limits  = &raw_refresh_limits,
+    .bdrv_probe_blocksizes = &raw_probe_blocksizes,
+    .bdrv_probe_geometry  = &raw_probe_geometry,
     .bdrv_is_inserted     = &raw_is_inserted,
     .bdrv_media_changed   = &raw_media_changed,
     .bdrv_eject           = &raw_eject,
-- 
cgit v1.1


From f0272c4db2a914453a6254f80bfe3b708a0b2426 Mon Sep 17 00:00:00 2001
From: Ekaterina Tumanova <tumanova@linux.vnet.ibm.com>
Date: Mon, 16 Feb 2015 12:47:57 +0100
Subject: block-backend: Add wrappers for blocksizes and geometry probing

Signed-off-by: Ekaterina Tumanova <tumanova@linux.vnet.ibm.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 1424087278-49393-5-git-send-email-tumanova@linux.vnet.ibm.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/block-backend.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'block')

diff --git a/block/block-backend.c b/block/block-backend.c
index bfb0418..48b6e4c 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -892,3 +892,13 @@ int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
 {
     return bdrv_load_vmstate(blk->bs, buf, pos, size);
 }
+
+int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
+{
+    return bdrv_probe_blocksizes(blk->bs, bsz);
+}
+
+int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
+{
+    return bdrv_probe_geometry(blk->bs, geo);
+}
-- 
cgit v1.1


From 833a7cc36e63653641558ba27148076f9a32062f Mon Sep 17 00:00:00 2001
From: Liu Yuan <liuyuan@cmss.chinamobile.com>
Date: Wed, 18 Feb 2015 11:57:55 +0800
Subject: sheepdog: fix confused return values

These functions mix up -1 and -errno in their return values, which might
cause trouble for error handling in the call chain.

This patch lets them return -errno and adds some comments.

Cc: qemu-devel@nongnu.org
Cc: Markus Armbruster <armbru@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Reported-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Liu Yuan <liuyuan@cmss.chinamobile.com>
Message-id: 1424231875-7131-1-git-send-email-namei.unix@gmail.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'block')

diff --git a/block/sheepdog.c b/block/sheepdog.c
index a2679c2..60a4853 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -545,6 +545,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
     return acb;
 }
 
+/* Return -EIO in case of error, file descriptor on success */
 static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
 {
     int fd;
@@ -564,11 +565,14 @@ static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
 
     if (fd >= 0) {
         qemu_set_nonblock(fd);
+    } else {
+        fd = -EIO;
     }
 
     return fd;
 }
 
+/* Return 0 on success and -errno in case of error */
 static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
                                     unsigned int *wlen)
 {
@@ -577,11 +581,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
     ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
     if (ret != sizeof(*hdr)) {
         error_report("failed to send a req, %s", strerror(errno));
+        ret = -socket_error();
         return ret;
     }
 
     ret = qemu_co_send(sockfd, data, *wlen);
     if (ret != *wlen) {
+        ret = -socket_error();
         error_report("failed to send a req, %s", strerror(errno));
     }
 
@@ -656,6 +662,11 @@ out:
     srco->finished = true;
 }
 
+/*
+ * Send the request to the sheep in a synchronous manner.
+ *
+ * Return 0 on success, -errno in case of error.
+ */
 static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
                   void *data, unsigned int *wlen, unsigned int *rlen)
 {
-- 
cgit v1.1


From f0ab6f109630940146cbaf47d0cd99993ddba824 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 27 Feb 2015 14:54:39 -0500
Subject: block/vdi: Add locking for parallel requests

When allocating a new cluster, the first write to it must be the one
doing the allocation, because that one pads its write request to the
cluster size; if another write to that cluster is executed before it,
that write will be overwritten due to the padding.

See https://bugs.launchpad.net/qemu/+bug/1422307 for what can go wrong
without this patch.

Cc: qemu-stable <qemu-stable@nongnu.org>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'block')

diff --git a/block/vdi.c b/block/vdi.c
index 74030c6..53bd02f 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -53,6 +53,7 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "migration/migration.h"
+#include "block/coroutine.h"
 
 #if defined(CONFIG_UUID)
 #include <uuid/uuid.h>
@@ -196,6 +197,8 @@ typedef struct {
     /* VDI header (converted to host endianness). */
     VdiHeader header;
 
+    CoMutex write_lock;
+
     Error *migration_blocker;
 } BDRVVdiState;
 
@@ -504,6 +507,8 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
               "vdi", bdrv_get_device_name(bs), "live migration");
     migrate_add_blocker(s->migration_blocker);
 
+    qemu_co_mutex_init(&s->write_lock);
+
     return 0;
 
  fail_free_bmap:
@@ -639,11 +644,31 @@ static int vdi_co_write(BlockDriverState *bs,
                    buf, n_sectors * SECTOR_SIZE);
             memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
                    (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
+
+            /* Note that this coroutine does not yield anywhere from reading the
+             * bmap entry until here, so in regards to all the coroutines trying
+             * to write to this cluster, the one doing the allocation will
+             * always be the first to try to acquire the lock.
+             * Therefore, it is also the first that will actually be able to
+             * acquire the lock and thus the padded cluster is written before
+             * the other coroutines can write to the affected area. */
+            qemu_co_mutex_lock(&s->write_lock);
             ret = bdrv_write(bs->file, offset, block, s->block_sectors);
+            qemu_co_mutex_unlock(&s->write_lock);
         } else {
             uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                               (uint64_t)bmap_entry * s->block_sectors +
                               sector_in_block;
+            qemu_co_mutex_lock(&s->write_lock);
+            /* This lock is only used to make sure the following write operation
+             * is executed after the write issued by the coroutine allocating
+             * this cluster, therefore we do not need to keep it locked.
+             * As stated above, the allocating coroutine will always try to lock
+             * the mutex before all the other concurrent accesses to that
+             * cluster, therefore at this point we can be absolutely certain
+             * that that write operation has returned (there may be other writes
+             * in flight, but they do not concern this very operation). */
+            qemu_co_mutex_unlock(&s->write_lock);
             ret = bdrv_write(bs->file, offset, buf, n_sectors);
         }
 
-- 
cgit v1.1


From 27994d587940b0c72d5f1d69f6e1a62a02f26dc9 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Thu, 12 Feb 2015 14:49:50 +0100
Subject: sheepdog: Fix misleading error messages in sd_snapshot_create()

If do_sd_create() fails, it first reports the error returned, then
reports another one with strerror(errno).  errno is meaningless at
that point.

Report just one error combining the valid information from both
messages.

Reported-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Liu Yuan <namei.unix@gmail.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 60a4853..c14172c 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2339,9 +2339,8 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 
     ret = do_sd_create(s, &new_vid, 1, &local_err);
     if (ret < 0) {
-        error_report_err(local_err);
-        error_report("failed to create inode for snapshot. %s",
-                     strerror(errno));
+        error_report("failed to create inode for snapshot: %s",
+                     error_get_pretty(local_err));
         goto cleanup;
     }
 
-- 
cgit v1.1


From 22d182e82b4ba2fb78b2cc22bcec4e6a440b0ad6 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 5 Mar 2015 15:38:17 -0600
Subject: block/raw-posix: fix launching with failed disks

Since commit c25f53b06eba1575d5d0e92a0132455c97825b83 ("raw: Probe
required direct I/O alignment") QEMU has failed to launch if image files
produce I/O errors.

Previously, QEMU would launch successfully and the guest would see the
errors when attempting I/O.

This is a regression and may prevent multipath I/O inside the guest,
where QEMU must launch and let the guest figure out by itself which
disks are online.

Tweak the alignment probing code in raw-posix.c to explicitly look for
EINVAL on Linux instead of bailing.  The kernel refuses misaligned
requests with this error code and other error codes can be ignored.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 3263d2b..f0b4488 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -272,6 +272,31 @@ static int probe_physical_blocksize(int fd, unsigned int *blk_size)
 #endif
 }
 
+/* Check if read is allowed with given memory buffer and length.
+ *
+ * This function is used to check O_DIRECT memory buffer and request alignment.
+ */
+static bool raw_is_io_aligned(int fd, void *buf, size_t len)
+{
+    ssize_t ret = pread(fd, buf, len, 0);
+
+    if (ret >= 0) {
+        return true;
+    }
+
+#ifdef __linux__
+    /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
+     * other errors (e.g. real I/O error), which could happen on a failed
+     * drive, since we only care about probing alignment.
+     */
+    if (errno != EINVAL) {
+        return true;
+    }
+#endif
+
+    return false;
+}
+
 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -307,7 +332,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
         size_t align;
         buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
         for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
-            if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
+            if (raw_is_io_aligned(fd, buf + align, MAX_BLOCKSIZE)) {
                 s->buf_align = align;
                 break;
             }
@@ -319,7 +344,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
         size_t align;
         buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
         for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
-            if (pread(fd, buf, align, 0) >= 0) {
+            if (raw_is_io_aligned(fd, buf, align)) {
                 bs->request_alignment = align;
                 break;
             }
-- 
cgit v1.1