From 407bc15033b2a8faeb7ca42aab63b7bcede76e10 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 12 Mar 2015 22:54:42 +0800 Subject: savevm: create snapshot failed when id_str already exists The command "virsh create" will fail in the following situation: the VM has two disks, vda and vdb. vda has snapshot s1 with id "1"; vdb doesn't have s1 but has snapshot s2 with id "1". When we then run the command "virsh create s1", del_existing_snapshots() only deletes s1 in vda, and bdrv_snapshot_create() tries to create vdb's snapshot s1 with id "1", but id "1" already exists in vdb with name "s2"! The simplest way is to call find_new_snapshot_id() unconditionally. Signed-off-by: Yi Wang Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2-snapshot.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 2aa9dcb..17bb211 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -351,10 +351,8 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) memset(sn, 0, sizeof(*sn)); - /* Generate an ID if it wasn't passed */ - if (sn_info->id_str[0] == '\0') { - find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); - } + /* Generate an ID */ + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); /* Check that the ID is unique */ if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { -- cgit v1.1 From ecdda9e03d73d2cc1c82c00cccc02f087741b6a5 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 16 Mar 2015 18:22:05 +0200 Subject: MAINTAINERS: Add myself as the maintainer of the Quorum driver Signed-off-by: Alberto Garcia Message-id: 1426522925-14444-1-git-send-email-berto@igalia.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7aab80b..7c5e71e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1186,3 +1186,9 @@ Bootdevice M: Gonglei S: Maintained F: bootdevice.c + +Quorum +M: Alberto Garcia +S: Supported +F: block/quorum.c +L: qemu-block@nongnu.org -- cgit v1.1 From 588ef9d411339012fc3c94bfad8911e9d0a517a2 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:23 +0000 Subject: bt-sdp: fix broken uuids power-of-2 calculation The binary search in sdp_uuid_match() only works when the number of elements to search is a power of two. lo = record->uuid; hi = record->uuids; while (hi >>= 1) if (lo[hi] <= val) lo += hi; return *lo == val; I noticed that the record->uuids calculation in sdp_service_record_build() was suspect: record->uuids = 1 << ffs(record->uuids - 1); Unlike most ffs(val) - 1 users, the expression is ffs(val - 1)! Actually ffs() is the wrong function to use for power-of-2. Use pow2ceil() to achieve the correct effect. Now the record->uuid[] array is sized correctly and the binary search in sdp_uuid_match() should work. I'm not sure how to run/test this code.
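To see why the old expression undersizes the array, here is a minimal standalone sketch (an editorial illustration, not code from the QEMU tree; pow2ceil() below is a simplified stand-in for the helper in include/qemu/host-utils.h):

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    /* Simplified stand-in for QEMU's pow2ceil(): round up to a power of two. */
    static uint32_t pow2ceil(uint32_t value)
    {
        uint32_t n = 1;
        while (n < value) {
            n <<= 1;
        }
        return n;
    }

    int main(void)
    {
        for (int uuids = 1; uuids <= 9; uuids++) {
            int buggy = 1 << ffs(uuids - 1);   /* old expression */
            uint32_t fixed = pow2ceil(uuids);  /* new expression */
            printf("uuids=%d buggy=%d fixed=%u%s\n", uuids, buggy, fixed,
                   buggy < uuids ? " <-- undersized" : "");
        }
        return 0;
    }

For uuids == 4, 6, 7 or 8 the old expression yields an array smaller than the element count, violating the power-of-two assumption of the binary search above; pow2ceil() never does.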
Cc: Andrzej Zaborowski Cc: qemu-stable@nongnu.org Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-2-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- hw/bt/sdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/bt/sdp.c b/hw/bt/sdp.c index 218e075..c903747 100644 --- a/hw/bt/sdp.c +++ b/hw/bt/sdp.c @@ -707,7 +707,7 @@ static void sdp_service_record_build(struct sdp_service_record_s *record, len += sdp_attr_max_size(&def->attributes[record->attributes ++].data, &record->uuids); } - record->uuids = 1 << ffs(record->uuids - 1); + record->uuids = pow2ceil(record->uuids); record->attribute_list = g_malloc0(record->attributes * sizeof(*record->attribute_list)); record->uuid = -- cgit v1.1 From ad5f5fdca83cccd1a4c269b1fd8ba2fce8d1ba26 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:24 +0000 Subject: hw/arm/nseries: convert ffs(3) to ctz32() It is not clear from the code how a 0 parameter should be handled by the hardware. Keep the same behavior as ffs(0) - 1 == -1. Cc: Andrzej Zaborowski Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-3-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- hw/arm/nseries.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c index 2a5406d..d243159 100644 --- a/hw/arm/nseries.c +++ b/hw/arm/nseries.c @@ -579,7 +579,10 @@ static uint32_t mipid_txrx(void *opaque, uint32_t cmd, int len) case 0x26: /* GAMSET */ if (!s->pm) { - s->gamma = ffs(s->param[0] & 0xf) - 1; + s->gamma = ctz32(s->param[0] & 0xf); + if (s->gamma == 32) { + s->gamma = -1; /* XXX: should this be 0? */ + } } else if (s->pm < 0) { s->pm = 1; } -- cgit v1.1 From 5863d374a32c98a7adb4c5e49d62de3cdc16d2ea Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:25 +0000 Subject: uninorth: convert ffs(3) to ctz32() It is not clear from the code how a 0 parameter should be handled by the hardware. Keep the same behavior as ffs(0) - 1 == -1. Cc: Alexander Graf Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-4-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- hw/pci-host/uninorth.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/pci-host/uninorth.c b/hw/pci-host/uninorth.c index 53f2b59..f0144eb 100644 --- a/hw/pci-host/uninorth.c +++ b/hw/pci-host/uninorth.c @@ -92,7 +92,10 @@ static uint32_t unin_get_config_reg(uint32_t reg, uint32_t addr) uint32_t slot, func; /* Grab CFA0 style values */ - slot = ffs(reg & 0xfffff800) - 1; + slot = ctz32(reg & 0xfffff800); + if (slot == 32) { + slot = -1; /* XXX: should this be 0? */ + } func = (reg >> 8) & 7; /* ... and then convert them to x86 format */ -- cgit v1.1 From 786a4ea82ec9c87e3a895cf41081029b285a5fe5 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:26 +0000 Subject: Convert (ffs(val) - 1) to ctz32(val) This commit was generated mechanically by coccinelle from the following semantic patch: @@ expression val; @@ - (ffs(val) - 1) + ctz32(val) The call sites have been audited to ensure the ffs(0) - 1 == -1 case never occurs (due to input validation, asserts, etc). Therefore we don't need to worry about the fact that ctz32(0) == 32. 
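The audit rule can be sanity-checked with a small standalone sketch (an editorial illustration, not from the QEMU tree; ctz32() below is a simplified stand-in for the helper in include/qemu/host-utils.h): ffs(val) - 1 and ctz32(val) agree for every non-zero value and differ only at zero.

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    /* Simplified stand-in for QEMU's ctz32(): trailing zero count, 32 for 0. */
    static int ctz32(uint32_t val)
    {
        return val ? __builtin_ctz(val) : 32;
    }

    int main(void)
    {
        const uint32_t samples[] = { 0x1, 0x8, 0x100, 0x40000000, 0 };
        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            uint32_t v = samples[i];
            /* Identical for v != 0; at v == 0: ffs gives -1, ctz32 gives 32. */
            printf("val=0x%08x ffs(val)-1=%d ctz32(val)=%d\n",
                   (unsigned)v, ffs(v) - 1, ctz32(v));
        }
        return 0;
    }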
Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-5-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- block.c | 2 +- block/qcow2-refcount.c | 2 +- block/qcow2.c | 4 ++-- block/qed.c | 4 ++-- block/rbd.c | 2 +- block/sheepdog.c | 2 +- hw/acpi/pcihp.c | 2 +- hw/arm/pxa2xx_gpio.c | 2 +- hw/arm/strongarm.c | 4 ++-- hw/display/tc6393xb.c | 2 +- hw/gpio/max7310.c | 2 +- hw/gpio/zaurus.c | 2 +- hw/pci-host/bonito.c | 2 +- hw/pci/msi.c | 12 ++++++------ hw/pci/pcie_aer.c | 2 +- hw/pci/shpc.c | 10 +++++----- hw/pci/slotid_cap.c | 2 +- hw/ppc/ppce500_spin.c | 2 +- hw/scsi/megasas.c | 2 +- include/hw/pci/pci.h | 16 ++++++++-------- include/hw/pci/pcie_regs.h | 18 +++++++++--------- target-ppc/cpu.h | 4 ++-- 22 files changed, 50 insertions(+), 50 deletions(-) diff --git a/block.c b/block.c index f2f8ae7..d7781ee 100644 --- a/block.c +++ b/block.c @@ -5452,7 +5452,7 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, return NULL; } bitmap = g_new0(BdrvDirtyBitmap, 1); - bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(granularity)); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); return bitmap; } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 6cbae1d..f47260b 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -2450,7 +2450,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, if (ret < 0) { return ret; } else if (ret > 0) { - int metadata_ol_bitnr = ffs(ret) - 1; + int metadata_ol_bitnr = ctz32(ret); assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " diff --git a/block/qcow2.c b/block/qcow2.c index 316a8db..f692978 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1802,7 +1802,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, { /* Calculate cluster_bits */ int cluster_bits; - cluster_bits = ffs(cluster_size) - 1; + cluster_bits = ctz32(cluster_size); if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { @@ -2110,7 +2110,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) goto finish; } - refcount_order = ffs(refcount_bits) - 1; + refcount_order = ctz32(refcount_bits); ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, cluster_size, prealloc, opts, version, refcount_order, diff --git a/block/qed.c b/block/qed.c index 892b13c..9d90888 100644 --- a/block/qed.c +++ b/block/qed.c @@ -436,9 +436,9 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->table_nelems = (s->header.cluster_size * s->header.table_size) / sizeof(uint64_t); - s->l2_shift = ffs(s->header.cluster_size) - 1; + s->l2_shift = ctz32(s->header.cluster_size); s->l2_mask = s->table_nelems - 1; - s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1; + s->l1_shift = s->l2_shift + ctz32(s->table_nelems); /* Header size calculation must not overflow uint32_t */ if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { diff --git a/block/rbd.c b/block/rbd.c index f3ab2dd..fbe87e0 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -325,7 +325,7 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) error_setg(errp, "obj size too small"); return -EINVAL; } - obj_order = ffs(objsize) - 1; + obj_order = ctz32(objsize); } clientname = qemu_rbd_parse_clientname(conf, clientname_buf); diff --git 
a/block/sheepdog.c b/block/sheepdog.c index c14172c..2d5f06a 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -1716,7 +1716,7 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) if ((object_size - 1) & object_size) { /* not a power of 2? */ return -EINVAL; } - obj_order = ffs(object_size) - 1; + obj_order = ctz32(object_size); if (obj_order < 20 || obj_order > 31) { return -EINVAL; } diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 612fec0..77e1126 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -120,7 +120,7 @@ static bool acpi_pcihp_pc_no_hotplug(AcpiPciHpState *s, PCIDevice *dev) static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slots) { BusChild *kid, *next; - int slot = ffs(slots) - 1; + int slot = ctz32(slots); PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel); if (!bus) { diff --git a/hw/arm/pxa2xx_gpio.c b/hw/arm/pxa2xx_gpio.c index 354ccf1..c89c804 100644 --- a/hw/arm/pxa2xx_gpio.c +++ b/hw/arm/pxa2xx_gpio.c @@ -137,7 +137,7 @@ static void pxa2xx_gpio_handler_update(PXA2xxGPIOInfo *s) { level = s->olevel[i] & s->dir[i]; for (diff = s->prev_level[i] ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); line = bit + 32 * i; qemu_set_irq(s->handler[line], (level >> bit) & 1); } diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c index 1ddea6d..da9fc1d 100644 --- a/hw/arm/strongarm.c +++ b/hw/arm/strongarm.c @@ -528,7 +528,7 @@ static void strongarm_gpio_handler_update(StrongARMGPIOInfo *s) level = s->olevel & s->dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } @@ -745,7 +745,7 @@ static void strongarm_ppc_handler_update(StrongARMPPCInfo *s) level = s->olevel & s->dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/display/tc6393xb.c b/hw/display/tc6393xb.c index 4306adc..66b7ade 100644 --- a/hw/display/tc6393xb.c +++ b/hw/display/tc6393xb.c @@ -171,7 +171,7 @@ static void tc6393xb_gpio_handler_update(TC6393xbState *s) level = s->gpio_level & s->gpio_dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/gpio/max7310.c b/hw/gpio/max7310.c index 7fbf313..2f59b13 100644 --- a/hw/gpio/max7310.c +++ b/hw/gpio/max7310.c @@ -96,7 +96,7 @@ static int max7310_tx(I2CSlave *i2c, uint8_t data) case 0x01: /* Output port */ for (diff = (data ^ s->level) & ~s->direction; diff; diff &= ~(1 << line)) { - line = ffs(diff) - 1; + line = ctz32(diff); if (s->handler[line]) qemu_set_irq(s->handler[line], (data >> line) & 1); } diff --git a/hw/gpio/zaurus.c b/hw/gpio/zaurus.c index 9408342..24a7727 100644 --- a/hw/gpio/zaurus.c +++ b/hw/gpio/zaurus.c @@ -65,7 +65,7 @@ static inline void scoop_gpio_handler_update(ScoopInfo *s) { level = s->gpio_level & s->gpio_dir; for (diff = s->prev_level ^ level; diff; diff ^= 1 << bit) { - bit = ffs(diff) - 1; + bit = ctz32(diff); qemu_set_irq(s->handler[bit], (level >> bit) & 1); } diff --git a/hw/pci-host/bonito.c b/hw/pci-host/bonito.c index 8134d0b..3a731fe 100644 --- a/hw/pci-host/bonito.c +++ b/hw/pci-host/bonito.c @@ -427,7 +427,7 @@ static uint32_t bonito_sbridge_pciaddr(void *opaque, hwaddr addr) cfgaddr |= (s->regs[BONITO_PCIMAP_CFG] & 0xffff) << 16; idsel = (cfgaddr & BONITO_PCICONF_IDSEL_MASK) >> 
BONITO_PCICONF_IDSEL_OFFSET; - devno = ffs(idsel) - 1; + devno = ctz32(idsel); funno = (cfgaddr & BONITO_PCICONF_FUN_MASK) >> BONITO_PCICONF_FUN_OFFSET; regno = (cfgaddr & BONITO_PCICONF_REG_MASK) >> BONITO_PCICONF_REG_OFFSET; diff --git a/hw/pci/msi.c b/hw/pci/msi.c index 916e1a1..2949938 100644 --- a/hw/pci/msi.c +++ b/hw/pci/msi.c @@ -72,7 +72,7 @@ static inline uint8_t msi_cap_sizeof(uint16_t flags) static inline unsigned int msi_nr_vectors(uint16_t flags) { return 1U << - ((flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1)); + ((flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE)); } static inline uint8_t msi_flags_off(const PCIDevice* dev) @@ -175,9 +175,9 @@ int msi_init(struct PCIDevice *dev, uint8_t offset, assert(nr_vectors > 0); assert(nr_vectors <= PCI_MSI_VECTORS_MAX); /* the nr of MSI vectors is up to 32 */ - vectors_order = ffs(nr_vectors) - 1; + vectors_order = ctz32(nr_vectors); - flags = vectors_order << (ffs(PCI_MSI_FLAGS_QMASK) - 1); + flags = vectors_order << ctz32(PCI_MSI_FLAGS_QMASK); if (msi64bit) { flags |= PCI_MSI_FLAGS_64BIT; } @@ -355,12 +355,12 @@ void msi_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len) * just don't crash the host */ log_num_vecs = - (flags & PCI_MSI_FLAGS_QSIZE) >> (ffs(PCI_MSI_FLAGS_QSIZE) - 1); + (flags & PCI_MSI_FLAGS_QSIZE) >> ctz32(PCI_MSI_FLAGS_QSIZE); log_max_vecs = - (flags & PCI_MSI_FLAGS_QMASK) >> (ffs(PCI_MSI_FLAGS_QMASK) - 1); + (flags & PCI_MSI_FLAGS_QMASK) >> ctz32(PCI_MSI_FLAGS_QMASK); if (log_num_vecs > log_max_vecs) { flags &= ~PCI_MSI_FLAGS_QSIZE; - flags |= log_max_vecs << (ffs(PCI_MSI_FLAGS_QSIZE) - 1); + flags |= log_max_vecs << ctz32(PCI_MSI_FLAGS_QSIZE); pci_set_word(dev->config + msi_flags_off(dev), flags); } diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c index eaa3e6e..b48c09c 100644 --- a/hw/pci/pcie_aer.c +++ b/hw/pci/pcie_aer.c @@ -410,7 +410,7 @@ static void pcie_aer_msg(PCIDevice *dev, const PCIEAERMsg *msg) static void pcie_aer_update_log(PCIDevice *dev, const PCIEAERErr *err) { uint8_t *aer_cap = dev->config + dev->exp.aer_cap; - uint8_t first_bit = ffs(err->status) - 1; + uint8_t first_bit = ctz32(err->status); uint32_t errcap = pci_get_long(aer_cap + PCI_ERR_CAP); int i; diff --git a/hw/pci/shpc.c b/hw/pci/shpc.c index 759910f..a706486 100644 --- a/hw/pci/shpc.c +++ b/hw/pci/shpc.c @@ -61,7 +61,7 @@ /* Same slot state masks are used for command and status registers */ #define SHPC_SLOT_STATE_MASK 0x03 #define SHPC_SLOT_STATE_SHIFT \ - (ffs(SHPC_SLOT_STATE_MASK) - 1) + ctz32(SHPC_SLOT_STATE_MASK) #define SHPC_STATE_NO 0x0 #define SHPC_STATE_PWRONLY 0x1 @@ -70,10 +70,10 @@ #define SHPC_SLOT_PWR_LED_MASK 0xC #define SHPC_SLOT_PWR_LED_SHIFT \ - (ffs(SHPC_SLOT_PWR_LED_MASK) - 1) + ctz32(SHPC_SLOT_PWR_LED_MASK) #define SHPC_SLOT_ATTN_LED_MASK 0x30 #define SHPC_SLOT_ATTN_LED_SHIFT \ - (ffs(SHPC_SLOT_ATTN_LED_MASK) - 1) + ctz32(SHPC_SLOT_ATTN_LED_MASK) #define SHPC_LED_NO 0x0 #define SHPC_LED_ON 0x1 @@ -136,7 +136,7 @@ static int roundup_pow_of_two(int x) static uint16_t shpc_get_status(SHPCDevice *shpc, int slot, uint16_t msk) { uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot); - return (pci_get_word(status) & msk) >> (ffs(msk) - 1); + return (pci_get_word(status) & msk) >> ctz32(msk); } static void shpc_set_status(SHPCDevice *shpc, @@ -144,7 +144,7 @@ static void shpc_set_status(SHPCDevice *shpc, { uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot); pci_word_test_and_clear_mask(status, msk); - pci_word_test_and_set_mask(status, value << (ffs(msk) - 1)); + 
pci_word_test_and_set_mask(status, value << ctz32(msk)); } static void shpc_interrupt_update(PCIDevice *d) diff --git a/hw/pci/slotid_cap.c b/hw/pci/slotid_cap.c index 62f7bae..1c01d34 100644 --- a/hw/pci/slotid_cap.c +++ b/hw/pci/slotid_cap.c @@ -3,7 +3,7 @@ #include "qemu/error-report.h" #define SLOTID_CAP_LENGTH 4 -#define SLOTID_NSLOTS_SHIFT (ffs(PCI_SID_ESR_NSLOTS) - 1) +#define SLOTID_NSLOTS_SHIFT ctz32(PCI_SID_ESR_NSLOTS) int slotid_cap_init(PCIDevice *d, int nslots, uint8_t chassis, diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c index d49f2b8..a99f7b0 100644 --- a/hw/ppc/ppce500_spin.c +++ b/hw/ppc/ppce500_spin.c @@ -74,7 +74,7 @@ static void spin_reset(void *opaque) /* Create -kernel TLB entries for BookE, linearly spanning 256MB. */ static inline hwaddr booke206_page_size_to_tlb(uint64_t size) { - return (ffs(size >> 10) - 1) >> 1; + return ctz32(size >> 10) >> 1; } static void mmubooke_create_initial_mapping(CPUPPCState *env, diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index ad7317b..91a5d97 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -804,7 +804,7 @@ static int megasas_ctrl_get_info(MegasasState *s, MegasasCmd *cmd) MFI_INFO_LDOPS_READ_POLICY); info.max_strips_per_io = cpu_to_le16(s->fw_sge); info.stripe_sz_ops.min = 3; - info.stripe_sz_ops.max = ffs(MEGASAS_MAX_SECTORS + 1) - 1; + info.stripe_sz_ops.max = ctz32(MEGASAS_MAX_SECTORS + 1); info.properties.pred_fail_poll_interval = cpu_to_le16(300); info.properties.intr_throttle_cnt = cpu_to_le16(16); info.properties.intr_throttle_timeout = cpu_to_le16(50); diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index b97c295..d4ffead 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -568,7 +568,7 @@ static inline void pci_set_byte_by_mask(uint8_t *config, uint8_t mask, uint8_t reg) { uint8_t val = pci_get_byte(config); - uint8_t rval = reg << (ffs(mask) - 1); + uint8_t rval = reg << ctz32(mask); pci_set_byte(config, (~mask & val) | (mask & rval)); } @@ -576,14 +576,14 @@ static inline uint8_t pci_get_byte_by_mask(uint8_t *config, uint8_t mask) { uint8_t val = pci_get_byte(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_word_by_mask(uint8_t *config, uint16_t mask, uint16_t reg) { uint16_t val = pci_get_word(config); - uint16_t rval = reg << (ffs(mask) - 1); + uint16_t rval = reg << ctz32(mask); pci_set_word(config, (~mask & val) | (mask & rval)); } @@ -591,14 +591,14 @@ static inline uint16_t pci_get_word_by_mask(uint8_t *config, uint16_t mask) { uint16_t val = pci_get_word(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_long_by_mask(uint8_t *config, uint32_t mask, uint32_t reg) { uint32_t val = pci_get_long(config); - uint32_t rval = reg << (ffs(mask) - 1); + uint32_t rval = reg << ctz32(mask); pci_set_long(config, (~mask & val) | (mask & rval)); } @@ -606,14 +606,14 @@ static inline uint32_t pci_get_long_by_mask(uint8_t *config, uint32_t mask) { uint32_t val = pci_get_long(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } static inline void pci_set_quad_by_mask(uint8_t *config, uint64_t mask, uint64_t reg) { uint64_t val = pci_get_quad(config); - uint64_t rval = reg << (ffs(mask) - 1); + uint64_t rval = reg << ctz32(mask); pci_set_quad(config, (~mask & val) | (mask & rval)); } @@ -621,7 +621,7 @@ static inline uint64_t pci_get_quad_by_mask(uint8_t *config, uint64_t mask) { uint64_t val = 
pci_get_quad(config); - return (val & mask) >> (ffs(mask) - 1); + return (val & mask) >> ctz32(mask); } PCIDevice *pci_create_multifunction(PCIBus *bus, int devfn, bool multifunction, diff --git a/include/hw/pci/pcie_regs.h b/include/hw/pci/pcie_regs.h index 848ab1c..6a28b33 100644 --- a/include/hw/pci/pcie_regs.h +++ b/include/hw/pci/pcie_regs.h @@ -27,34 +27,34 @@ /* PCI_EXP_FLAGS */ #define PCI_EXP_FLAGS_VER2 2 /* for now, supports only ver. 2 */ -#define PCI_EXP_FLAGS_IRQ_SHIFT (ffs(PCI_EXP_FLAGS_IRQ) - 1) -#define PCI_EXP_FLAGS_TYPE_SHIFT (ffs(PCI_EXP_FLAGS_TYPE) - 1) +#define PCI_EXP_FLAGS_IRQ_SHIFT ctz32(PCI_EXP_FLAGS_IRQ) +#define PCI_EXP_FLAGS_TYPE_SHIFT ctz32(PCI_EXP_FLAGS_TYPE) /* PCI_EXP_LINK{CAP, STA} */ /* link speed */ #define PCI_EXP_LNK_LS_25 1 -#define PCI_EXP_LNK_MLW_SHIFT (ffs(PCI_EXP_LNKCAP_MLW) - 1) +#define PCI_EXP_LNK_MLW_SHIFT ctz32(PCI_EXP_LNKCAP_MLW) #define PCI_EXP_LNK_MLW_1 (1 << PCI_EXP_LNK_MLW_SHIFT) /* PCI_EXP_LINKCAP */ -#define PCI_EXP_LNKCAP_ASPMS_SHIFT (ffs(PCI_EXP_LNKCAP_ASPMS) - 1) +#define PCI_EXP_LNKCAP_ASPMS_SHIFT ctz32(PCI_EXP_LNKCAP_ASPMS) #define PCI_EXP_LNKCAP_ASPMS_0S (1 << PCI_EXP_LNKCAP_ASPMS_SHIFT) -#define PCI_EXP_LNKCAP_PN_SHIFT (ffs(PCI_EXP_LNKCAP_PN) - 1) +#define PCI_EXP_LNKCAP_PN_SHIFT ctz32(PCI_EXP_LNKCAP_PN) -#define PCI_EXP_SLTCAP_PSN_SHIFT (ffs(PCI_EXP_SLTCAP_PSN) - 1) +#define PCI_EXP_SLTCAP_PSN_SHIFT ctz32(PCI_EXP_SLTCAP_PSN) #define PCI_EXP_SLTCTL_IND_RESERVED 0x0 #define PCI_EXP_SLTCTL_IND_ON 0x1 #define PCI_EXP_SLTCTL_IND_BLINK 0x2 #define PCI_EXP_SLTCTL_IND_OFF 0x3 -#define PCI_EXP_SLTCTL_AIC_SHIFT (ffs(PCI_EXP_SLTCTL_AIC) - 1) +#define PCI_EXP_SLTCTL_AIC_SHIFT ctz32(PCI_EXP_SLTCTL_AIC) #define PCI_EXP_SLTCTL_AIC_OFF \ (PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_AIC_SHIFT) -#define PCI_EXP_SLTCTL_PIC_SHIFT (ffs(PCI_EXP_SLTCTL_PIC) - 1) +#define PCI_EXP_SLTCTL_PIC_SHIFT ctz32(PCI_EXP_SLTCTL_PIC) #define PCI_EXP_SLTCTL_PIC_OFF \ (PCI_EXP_SLTCTL_IND_OFF << PCI_EXP_SLTCTL_PIC_SHIFT) #define PCI_EXP_SLTCTL_PIC_ON \ @@ -109,7 +109,7 @@ #define PCI_ERR_ROOT_IRQ_MAX 32 #define PCI_ERR_ROOT_IRQ 0xf8000000 -#define PCI_ERR_ROOT_IRQ_SHIFT (ffs(PCI_ERR_ROOT_IRQ) - 1) +#define PCI_ERR_ROOT_IRQ_SHIFT ctz32(PCI_ERR_ROOT_IRQ) #define PCI_ERR_ROOT_STATUS_REPORT_MASK (PCI_ERR_ROOT_COR_RCV | \ PCI_ERR_ROOT_MULTI_COR_RCV | \ PCI_ERR_ROOT_UNCOR_RCV | \ diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h index f15815f..c05c503 100644 --- a/target-ppc/cpu.h +++ b/target-ppc/cpu.h @@ -2251,8 +2251,8 @@ static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int tlbn, { int r; uint32_t ways = booke206_tlb_ways(env, tlbn); - int ways_bits = ffs(ways) - 1; - int tlb_bits = ffs(booke206_tlb_size(env, tlbn)) - 1; + int ways_bits = ctz32(ways); + int tlb_bits = ctz32(booke206_tlb_size(env, tlbn)); int i; way &= ways - 1; -- cgit v1.1 From bd2a88840e2496e29442f333c8fdd6491e831a35 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:27 +0000 Subject: Convert ffs() != 0 callers to ctz32() There are a number of ffs(3) callers that do roughly: bit = ffs(val); if (bit) { do_something(bit - 1); } This pattern can be converted to ctz32() like this: zeroes = ctz32(val); if (zeroes != 32) { do_something(zeroes); } Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-6-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- hw/arm/omap1.c | 6 ++---- hw/char/virtio-serial-bus.c | 8 ++++---- hw/gpio/omap_gpio.c | 13 +++++-------- hw/i2c/omap_i2c.c | 10 +++++++--- hw/intc/allwinner-a10-pic.c | 8 ++++---- kvm-all.c | 8 
++++---- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/hw/arm/omap1.c b/hw/arm/omap1.c index 91ffb58..de2b289 100644 --- a/hw/arm/omap1.c +++ b/hw/arm/omap1.c @@ -2004,8 +2004,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr, case 0x04: /* OUTPUT_REG */ diff = (s->outputs ^ value) & ~s->dir; s->outputs = value; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -2017,8 +2016,7 @@ static void omap_mpuio_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index e336bdb..6e2ad82 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -814,12 +814,12 @@ static uint32_t find_free_port_id(VirtIOSerial *vser) max_nr_ports = vser->serial.max_virtserial_ports; for (i = 0; i < (max_nr_ports + 31) / 32; i++) { - uint32_t map, bit; + uint32_t map, zeroes; map = vser->ports_map[i]; - bit = ffs(~map); - if (bit) { - return (bit - 1) + i * 32; + zeroes = ctz32(~map); + if (zeroes != 32) { + return zeroes + i * 32; } } return VIRTIO_CONSOLE_BAD_ID; } diff --git a/hw/gpio/omap_gpio.c b/hw/gpio/omap_gpio.c index 9a43486..d92f8cf 100644 --- a/hw/gpio/omap_gpio.c +++ b/hw/gpio/omap_gpio.c @@ -125,8 +125,7 @@ static void omap_gpio_write(void *opaque, hwaddr addr, case 0x04: /* DATA_OUTPUT */ diff = (s->outputs ^ value) & ~s->dir; s->outputs = value; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -138,8 +137,7 @@ s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { if (s->handler[ln]) qemu_set_irq(s->handler[ln], (value >> ln) & 1); diff &= ~(1 << ln); @@ -253,8 +251,7 @@ static inline void omap2_gpio_module_out_update(struct omap2_gpio_s *s, s->outputs ^= diff; diff &= ~s->dir; - while ((ln = ffs(diff))) { - ln --; + while ((ln = ctz32(diff)) != 32) { qemu_set_irq(s->handler[ln], (s->outputs >> ln) & 1); diff &= ~(1 << ln); } @@ -442,8 +439,8 @@ static void omap2_gpio_module_write(void *opaque, hwaddr addr, s->dir = value; value = s->outputs & ~s->dir; - while ((ln = ffs(diff))) { - diff &= ~(1 << --ln); + while ((ln = ctz32(diff)) != 32) { + diff &= ~(1 << ln); qemu_set_irq(s->handler[ln], (value >> ln) & 1); } diff --git a/hw/i2c/omap_i2c.c b/hw/i2c/omap_i2c.c index d63278d..b6f544a 100644 --- a/hw/i2c/omap_i2c.c +++ b/hw/i2c/omap_i2c.c @@ -171,9 +171,13 @@ static uint32_t omap_i2c_read(void *opaque, hwaddr addr) case 0x0c: /* I2C_IV */ if (s->revision >= OMAP2_INTR_REV) break; - ret = ffs(s->stat & s->mask); - if (ret) - s->stat ^= 1 << (ret - 1); + ret = ctz32(s->stat & s->mask); + if (ret != 32) { + s->stat ^= 1 << ret; + ret++; + } else { + ret = 0; + } omap_i2c_interrupts_update(s); return ret; diff --git a/hw/intc/allwinner-a10-pic.c b/hw/intc/allwinner-a10-pic.c index de820b9..eed7621 100644 --- a/hw/intc/allwinner-a10-pic.c +++ b/hw/intc/allwinner-a10-pic.c @@ -23,7 +23,7 @@ static void aw_a10_pic_update(AwA10PICState *s) { uint8_t i; - int irq = 0, fiq = 0, pending; + int irq = 0, fiq = 0, zeroes; s->vector = 0; @@ -32,9 +32,9 @@ static void
aw_a10_pic_update(AwA10PICState *s) fiq |= s->select[i] & s->irq_pending[i] & ~s->mask[i]; if (!s->vector) { - pending = ffs(s->irq_pending[i] & ~s->mask[i]); - if (pending) { - s->vector = (i * 32 + pending - 1) * 4; + zeroes = ctz32(s->irq_pending[i] & ~s->mask[i]); + if (zeroes != 32) { + s->vector = (i * 32 + zeroes) * 4; } } } diff --git a/kvm-all.c b/kvm-all.c index 4ec153d..2a717e5 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1141,18 +1141,18 @@ static int kvm_irqchip_get_virq(KVMState *s) { uint32_t *word = s->used_gsi_bitmap; int max_words = ALIGN(s->gsi_count, 32) / 32; - int i, bit; + int i, zeroes; bool retry = true; again: /* Return the lowest unused GSI in the bitmap */ for (i = 0; i < max_words; i++) { - bit = ffs(~word[i]); - if (!bit) { + zeroes = ctz32(~word[i]); + if (zeroes == 32) { continue; } - return bit - 1 + i * 32; + return zeroes + i * 32; } if (!s->direct_msi && retry) { retry = false; -- cgit v1.1 From c9d933185181cb1cf81bc4c9e5c3a10a5934b017 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:28 +0000 Subject: sd: convert sd_normal_command() ffs(3) call to ctz32() ffs() cannot be replaced with ctz32() when the argument might be zero, because ffs(0) returns 0 while ctz32(0) returns 32. The ffs(3) call in sd_normal_command() is a special case though. It can be converted to ctz32() + 1 because the argument is never zero: if (!(req.arg >> 8) || (req.arg >> (ctz32(req.arg & ~0xff) + 1))) { ~~~~~~~~~~~~~~~ ^--------------- req.arg cannot be zero Cc: Markus Armbruster Cc: Peter Crosthwaite Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-7-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- hw/sd/sd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/sd/sd.c b/hw/sd/sd.c index f955265..8abf0c9 100644 --- a/hw/sd/sd.c +++ b/hw/sd/sd.c @@ -796,8 +796,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, sd->vhs = 0; /* No response if not exactly one VHS bit is set. */ - if (!(req.arg >> 8) || (req.arg >> ffs(req.arg & ~0xff))) + if (!(req.arg >> 8) || (req.arg >> (ctz32(req.arg & ~0xff) + 1))) { return sd->spi ? sd_r7 : sd_r0; + } /* Accept. */ sd->vhs = req.arg; -- cgit v1.1 From 41074f3d3ff0e9a3c6f638627c12ebbf6d757cea Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 23 Mar 2015 15:29:29 +0000 Subject: omap_intc: convert ffs(3) to ctz32() in omap_inth_sir_update() Rewrite the loop using level &= level - 1 to clear the least significant bit after each iteration. This simplifies the loop and makes it easy to replace ffs(3) with ctz32(). Cc: Peter Maydell Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-8-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- hw/intc/omap_intc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hw/intc/omap_intc.c b/hw/intc/omap_intc.c index ad3931c..e9b38a3 100644 --- a/hw/intc/omap_intc.c +++ b/hw/intc/omap_intc.c @@ -60,7 +60,7 @@ struct omap_intr_handler_s { static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq) { - int i, j, sir_intr, p_intr, p, f; + int i, j, sir_intr, p_intr, p; uint32_t level; sir_intr = 0; p_intr = 255; @@ -72,14 +72,15 @@ static void omap_inth_sir_update(struct omap_intr_handler_s *s, int is_fiq) for (j = 0; j < s->nbanks; ++j) { level = s->bank[j].irqs & ~s->bank[j].mask & (is_fiq ? 
s->bank[j].fiq : ~s->bank[j].fiq); - for (f = ffs(level), i = f - 1, level >>= f - 1; f; i += f, - level >>= f) { + + while (level != 0) { + i = ctz32(level); p = s->bank[j].priority[i]; if (p <= p_intr) { p_intr = p; sir_intr = 32 * j + i; } - f = ffs(level >> 1); + level &= level - 1; } } s->sir_intr[is_fiq] = sir_intr; -- cgit v1.1 From f450a85899585776ccd0913d2361dd8f82666e44 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:30 +0000 Subject: os-win32: drop ffs(3) prototype The lack of ffs(3) in the MinGW headers is a hint that we shouldn't rely on it. MinGW 4.9.2 does not make it available for linking when QEMU's ./configure --enable-debug is used (release builds are fine though). Now that all QEMU code has been switched to ctz32() there is no need for ffs(3). Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-9-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- include/sysemu/os-win32.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h index 9cc9e08..4035c4f 100644 --- a/include/sysemu/os-win32.h +++ b/include/sysemu/os-win32.h @@ -72,9 +72,6 @@ #define sigsetjmp(env, savemask) setjmp(env) #define siglongjmp(env, val) longjmp(env, val) -/* Declaration of ffs() is missing in MinGW's strings.h. */ -int ffs(int i); - /* Missing POSIX functions. Don't use MinGW-w64 macros. */ #undef gmtime_r struct tm *gmtime_r(const time_t *timep, struct tm *result); -- cgit v1.1 From 8b6ee9aeb3f0508ed2a41381cde13bdb8707b7be Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 23 Mar 2015 15:29:31 +0000 Subject: checkpatch: complain about ffs(3) calls The ffs(3) family of functions is not portable. MinGW doesn't always provide the function. Use ctz32() or ctz64() instead. Signed-off-by: Stefan Hajnoczi Message-id: 1427124571-28598-10-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5df61f9..7f0aae9 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2911,6 +2911,17 @@ sub process { if ($rawline =~ /\b(?:Qemu|QEmu)\b/) { WARN("use QEMU instead of Qemu or QEmu\n" . $herecurr); } + +# check for non-portable ffs() calls that have portable alternatives in QEMU + if ($line =~ /\bffs\(/) { + ERROR("use ctz32() instead of ffs()\n" . $herecurr); + } + if ($line =~ /\bffsl\(/) { + ERROR("use ctz32() or ctz64() instead of ffsl()\n" . $herecurr); + } + if ($line =~ /\bffsll\(/) { + ERROR("use ctz64() instead of ffsll()\n" . $herecurr); + } } # If we have no input at all, then there is nothing to report on -- cgit v1.1 From de50a20a4cc368d241d67c600f8c0f667186a8b5 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 25 Mar 2015 15:27:26 +0800 Subject: block: Switch to host monotonic clock for IO throttling Currently, throttle timers won't make any progress when the VCPUs are not running, which would stall the request queue in utils, qtest, vm suspending, and live migration, without special handling. Block jobs are confusingly inconsistent with and without throttling: if the user sets a bps limit, stops the VM, then starts a block job, the block job will not make any progress; by contrast, if the user unsets the bps limit, or if it's not set, the block job will run normally. After this patch, with the host clock, even if the VCPUs are stopped, the throttle queues will be processed. This patch also opens up the potential to add throttling to bdrv_drain_all.
Currently all requests are drained immediately. In other words, whenever it is called, IO throttling becomes ineffective (examples: system reset, migration, and many block job operations). This is a loophole that the guest could exploit. If we use the host clock, we can later just trust the nested poll. This could be done on top. Note that for qemu-iotests case 093, which uses qtest, we still keep the VM clock so the script can control the clock stepping in order to be deterministic. Reviewed-by: Paolo Bonzini Reviewed-by: Alberto Garcia Signed-off-by: Fam Zheng Message-id: 1427268446-6426-1-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index d7781ee..670d7e4 100644 --- a/block.c +++ b/block.c @@ -30,6 +30,7 @@ #include "qapi/qmp/qjson.h" #include "sysemu/block-backend.h" #include "sysemu/sysemu.h" +#include "sysemu/qtest.h" #include "qemu/notify.h" #include "block/coroutine.h" #include "block/qapi.h" @@ -181,10 +182,16 @@ static void bdrv_throttle_write_timer_cb(void *opaque) /* should be called before bdrv_set_io_limits if a limit is set */ void bdrv_io_limits_enable(BlockDriverState *bs) { + int clock_type = QEMU_CLOCK_REALTIME; + + if (qtest_enabled()) { + /* For testing block IO throttling only */ + clock_type = QEMU_CLOCK_VIRTUAL; + } assert(!bs->io_limits_enabled); throttle_init(&bs->throttle_state, bdrv_get_aio_context(bs), - QEMU_CLOCK_VIRTUAL, + clock_type, bdrv_throttle_read_timer_cb, bdrv_throttle_write_timer_cb, bs); -- cgit v1.1 From e98ab097092e54999f046e9efa1ca1dd52f0c9e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 Feb 2015 17:26:50 +0100 Subject: aio-posix: move pollfds to thread-local storage By using thread-local storage, aio_poll can stop using global data during g_poll_ns. This will make it possible to drop callbacks from rfifolock. [Moved npfd = 0 assignment to end of walking_handlers region as suggested by Paolo. This resolves the assert(npfd == 0) assertion failure in pollfds_cleanup(). --Stefan] Signed-off-by: Paolo Bonzini Reviewed-by: Stefan Hajnoczi Message-id: 1424449612-18215-2-git-send-email-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- aio-posix.c | 78 ++++++++++++++++++++++++++++++++++++++--------------- async.c | 2 -- include/block/aio.h | 3 --- 3 files changed, 57 insertions(+), 26 deletions(-) diff --git a/aio-posix.c b/aio-posix.c index cbd4c34..296cd9b 100644 --- a/aio-posix.c +++ b/aio-posix.c @@ -24,7 +24,6 @@ struct AioHandler IOHandler *io_read; IOHandler *io_write; int deleted; - int pollfds_idx; void *opaque; QLIST_ENTRY(AioHandler) node; }; @@ -83,7 +82,6 @@ void aio_set_fd_handler(AioContext *ctx, node->io_read = io_read; node->io_write = io_write; node->opaque = opaque; - node->pollfds_idx = -1; node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0); node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0); @@ -186,12 +184,59 @@ bool aio_dispatch(AioContext *ctx) return progress; } +/* These thread-local variables are used only in a small part of aio_poll + * around the call to the poll() system call. In particular they are not + * used while aio_poll is performing callbacks, which makes it much easier + * to think about reentrancy! + * + * Stack-allocated arrays would be perfect but they have size limitations; + * heap allocation is expensive enough that we want to reuse arrays across + * calls to aio_poll().
And because poll() has to be called without holding + * any lock, the arrays cannot be stored in AioContext. Thread-local data + * has none of the disadvantages of these three options. + */ +static __thread GPollFD *pollfds; +static __thread AioHandler **nodes; +static __thread unsigned npfd, nalloc; +static __thread Notifier pollfds_cleanup_notifier; + +static void pollfds_cleanup(Notifier *n, void *unused) +{ + g_assert(npfd == 0); + g_free(pollfds); + g_free(nodes); + nalloc = 0; +} + +static void add_pollfd(AioHandler *node) +{ + if (npfd == nalloc) { + if (nalloc == 0) { + pollfds_cleanup_notifier.notify = pollfds_cleanup; + qemu_thread_atexit_add(&pollfds_cleanup_notifier); + nalloc = 8; + } else { + g_assert(nalloc <= INT_MAX); + nalloc *= 2; + } + pollfds = g_renew(GPollFD, pollfds, nalloc); + nodes = g_renew(AioHandler *, nodes, nalloc); + } + nodes[npfd] = node; + pollfds[npfd] = (GPollFD) { + .fd = node->pfd.fd, + .events = node->pfd.events, + }; + npfd++; +} + bool aio_poll(AioContext *ctx, bool blocking) { AioHandler *node; bool was_dispatching; - int ret; + int i, ret; bool progress; + int64_t timeout; was_dispatching = ctx->dispatching; progress = false; @@ -210,39 +255,30 @@ bool aio_poll(AioContext *ctx, bool blocking) ctx->walking_handlers++; - g_array_set_size(ctx->pollfds, 0); + assert(npfd == 0); /* fill pollfds */ QLIST_FOREACH(node, &ctx->aio_handlers, node) { - node->pollfds_idx = -1; if (!node->deleted && node->pfd.events) { - GPollFD pfd = { - .fd = node->pfd.fd, - .events = node->pfd.events, - }; - node->pollfds_idx = ctx->pollfds->len; - g_array_append_val(ctx->pollfds, pfd); + add_pollfd(node); } } - ctx->walking_handlers--; + timeout = blocking ? aio_compute_timeout(ctx) : 0; /* wait until next event */ - ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data, - ctx->pollfds->len, - blocking ? aio_compute_timeout(ctx) : 0); + ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout); /* if we have any readable fds, dispatch event */ if (ret > 0) { - QLIST_FOREACH(node, &ctx->aio_handlers, node) { - if (node->pollfds_idx != -1) { - GPollFD *pfd = &g_array_index(ctx->pollfds, GPollFD, - node->pollfds_idx); - node->pfd.revents = pfd->revents; - } + for (i = 0; i < npfd; i++) { + nodes[i]->pfd.revents = pollfds[i].revents; } } + npfd = 0; + ctx->walking_handlers--; + /* Run dispatch even if there were no readable fds to run timers */ aio_set_dispatching(ctx, true); if (aio_dispatch(ctx)) { diff --git a/async.c b/async.c index 2b51e87..77d080d 100644 --- a/async.c +++ b/async.c @@ -230,7 +230,6 @@ aio_ctx_finalize(GSource *source) event_notifier_cleanup(&ctx->notifier); rfifolock_destroy(&ctx->lock); qemu_mutex_destroy(&ctx->bh_lock); - g_array_free(ctx->pollfds, TRUE); timerlistgroup_deinit(&ctx->tlg); } @@ -302,7 +301,6 @@ AioContext *aio_context_new(Error **errp) aio_set_event_notifier(ctx, &ctx->notifier, (EventNotifierHandler *) event_notifier_test_and_clear); - ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); ctx->thread_pool = NULL; qemu_mutex_init(&ctx->bh_lock); rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx); diff --git a/include/block/aio.h b/include/block/aio.h index 7d1e26b..0dc7a25 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -82,9 +82,6 @@ struct AioContext { /* Used for aio_notify. 
*/ EventNotifier notifier; - /* GPollFDs for aio_poll() */ - GArray *pollfds; - /* Thread pool for performing work and receiving completion callbacks */ struct ThreadPool *thread_pool; -- cgit v1.1 From 49110174f8835ec3d5ca7fc076ee1f51c18564fe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 Feb 2015 17:26:51 +0100 Subject: AioContext: acquire/release AioContext during aio_poll This is the first step in pushing down acquire/release, and will let rfifolock drop the contention callback feature. Signed-off-by: Paolo Bonzini Reviewed-by: Stefan Hajnoczi Message-id: 1424449612-18215-3-git-send-email-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- aio-posix.c | 9 +++++++++ aio-win32.c | 8 ++++++++ include/block/aio.h | 13 +++++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/aio-posix.c b/aio-posix.c index 296cd9b..4abec38 100644 --- a/aio-posix.c +++ b/aio-posix.c @@ -238,6 +238,7 @@ bool aio_poll(AioContext *ctx, bool blocking) bool progress; int64_t timeout; + aio_context_acquire(ctx); was_dispatching = ctx->dispatching; progress = false; @@ -267,7 +268,13 @@ bool aio_poll(AioContext *ctx, bool blocking) timeout = blocking ? aio_compute_timeout(ctx) : 0; /* wait until next event */ + if (timeout) { + aio_context_release(ctx); + } ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout); + if (timeout) { + aio_context_acquire(ctx); + } /* if we have any readable fds, dispatch event */ if (ret > 0) { @@ -286,5 +293,7 @@ bool aio_poll(AioContext *ctx, bool blocking) } aio_set_dispatching(ctx, was_dispatching); + aio_context_release(ctx); + return progress; } diff --git a/aio-win32.c b/aio-win32.c index e6f4ced..233d8f5 100644 --- a/aio-win32.c +++ b/aio-win32.c @@ -283,6 +283,7 @@ bool aio_poll(AioContext *ctx, bool blocking) int count; int timeout; + aio_context_acquire(ctx); have_select_revents = aio_prepare(ctx); if (have_select_revents) { blocking = false; @@ -323,7 +324,13 @@ bool aio_poll(AioContext *ctx, bool blocking) timeout = blocking ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0; + if (timeout) { + aio_context_release(ctx); + } ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (timeout) { + aio_context_acquire(ctx); + } aio_set_dispatching(ctx, true); if (first && aio_bh_poll(ctx)) { @@ -349,5 +356,6 @@ bool aio_poll(AioContext *ctx, bool blocking) progress |= timerlistgroup_run_timers(&ctx->tlg); aio_set_dispatching(ctx, was_dispatching); + aio_context_release(ctx); return progress; } diff --git a/include/block/aio.h b/include/block/aio.h index 0dc7a25..d2bb423 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -118,13 +118,14 @@ void aio_context_ref(AioContext *ctx); void aio_context_unref(AioContext *ctx); /* Take ownership of the AioContext. If the AioContext will be shared between - * threads, a thread must have ownership when calling aio_poll(). + * threads, and a thread does not want to be interrupted, it will have to + * take ownership around calls to aio_poll(). Otherwise, aio_poll() + * automatically takes care of calling aio_context_acquire and + * aio_context_release. * - * Note that multiple threads calling aio_poll() means timers, BHs, and - * callbacks may be invoked from a different thread than they were registered - * from. Therefore, code must use AioContext acquire/release or use - * fine-grained synchronization to protect shared state if other threads will - * be accessing it simultaneously. 
+ * Access to timers and BHs from a thread that has not acquired AioContext + * is possible. Access to callbacks for now must be done while the AioContext + * is owned by the thread (FIXME). */ void aio_context_acquire(AioContext *ctx); -- cgit v1.1 From a0710f7995f914e3044e5899bd8ff6c43c62f916 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 Feb 2015 17:26:52 +0100 Subject: iothread: release iothread around aio_poll This is the first step towards having fine-grained critical sections in dataplane threads, which resolves lock ordering problems between address_space_* functions (which need the BQL when doing MMIO, even after we complete RCU-based dispatch) and the AioContext. Because AioContext does not use contention callbacks anymore, the unit test has to be changed. Signed-off-by: Paolo Bonzini Reviewed-by: Stefan Hajnoczi Message-id: 1424449612-18215-4-git-send-email-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- async.c | 8 +------- iothread.c | 11 ++--------- tests/test-aio.c | 19 +++++++++++-------- 3 files changed, 14 insertions(+), 24 deletions(-) diff --git a/async.c b/async.c index 77d080d..46d9e63 100644 --- a/async.c +++ b/async.c @@ -280,12 +280,6 @@ static void aio_timerlist_notify(void *opaque) aio_notify(opaque); } -static void aio_rfifolock_cb(void *opaque) -{ - /* Kick owner thread in case they are blocked in aio_poll() */ - aio_notify(opaque); -} - AioContext *aio_context_new(Error **errp) { int ret; @@ -303,7 +297,7 @@ AioContext *aio_context_new(Error **errp) event_notifier_test_and_clear); ctx->thread_pool = NULL; qemu_mutex_init(&ctx->bh_lock); - rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx); + rfifolock_init(&ctx->lock, NULL, NULL); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); return ctx; diff --git a/iothread.c b/iothread.c index 342a23f..a1f9109 100644 --- a/iothread.c +++ b/iothread.c @@ -31,21 +31,14 @@ typedef ObjectClass IOThreadClass; static void *iothread_run(void *opaque) { IOThread *iothread = opaque; - bool blocking; qemu_mutex_lock(&iothread->init_done_lock); iothread->thread_id = qemu_get_thread_id(); qemu_cond_signal(&iothread->init_done_cond); qemu_mutex_unlock(&iothread->init_done_lock); - while (!iothread->stopping) { - aio_context_acquire(iothread->ctx); - blocking = true; - while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) { - /* Progress was made, keep going */ - blocking = false; - } - aio_context_release(iothread->ctx); + while (!atomic_read(&iothread->stopping)) { + aio_poll(iothread->ctx, true); } return NULL; } diff --git a/tests/test-aio.c b/tests/test-aio.c index a7cb5c9..4b0cb45 100644 --- a/tests/test-aio.c +++ b/tests/test-aio.c @@ -107,6 +107,7 @@ static void test_notify(void) typedef struct { QemuMutex start_lock; + EventNotifier notifier; bool thread_acquired; } AcquireTestData; @@ -118,6 +119,8 @@ static void *test_acquire_thread(void *opaque) qemu_mutex_lock(&data->start_lock); qemu_mutex_unlock(&data->start_lock); + g_usleep(500000); + event_notifier_set(&data->notifier); aio_context_acquire(ctx); aio_context_release(ctx); @@ -126,20 +129,19 @@ static void *test_acquire_thread(void *opaque) return NULL; } -static void dummy_notifier_read(EventNotifier *unused) +static void dummy_notifier_read(EventNotifier *n) { - g_assert(false); /* should never be invoked */ + event_notifier_test_and_clear(n); } static void test_acquire(void) { QemuThread thread; - EventNotifier notifier; AcquireTestData data; /* Dummy event notifier ensures aio_poll() will block */ - 
event_notifier_init(&notifier, false); - aio_set_event_notifier(ctx, &notifier, dummy_notifier_read); + event_notifier_init(&data.notifier, false); + aio_set_event_notifier(ctx, &data.notifier, dummy_notifier_read); g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */ qemu_mutex_init(&data.start_lock); @@ -153,12 +155,13 @@ static void test_acquire(void) /* Block in aio_poll(), let other thread kick us and acquire context */ aio_context_acquire(ctx); qemu_mutex_unlock(&data.start_lock); /* let the thread run */ - g_assert(!aio_poll(ctx, true)); + g_assert(aio_poll(ctx, true)); + g_assert(!data.thread_acquired); aio_context_release(ctx); qemu_thread_join(&thread); - aio_set_event_notifier(ctx, &notifier, NULL); - event_notifier_cleanup(&notifier); + aio_set_event_notifier(ctx, &data.notifier, NULL); + event_notifier_cleanup(&data.notifier); g_assert(data.thread_acquired); } -- cgit v1.1 From 0df89e8e6f62aea32a7302e73a86b7bfe5821018 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 19 Mar 2015 13:33:31 +0100 Subject: block-backend: Expose bdrv_write_zeroes() Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- block/block-backend.c | 11 +++++++++++ include/sysemu/block-backend.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/block/block-backend.c b/block/block-backend.c index 48b6e4c..93e46f3 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -515,6 +515,17 @@ int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, return bdrv_write(blk->bs, sector_num, buf, nb_sectors); } +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + static void error_callback_bh(void *opaque) { struct BlockBackendAIOCB *acb = opaque; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 77e9b9c..b4a4d5e 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -87,6 +87,8 @@ int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, int nb_sectors); int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, int nb_sectors); +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags); BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque); -- cgit v1.1 From 690c7301600162421b928c7f26fd488fd8fa464e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 19 Mar 2015 13:33:32 +0100 Subject: qemu-img convert: Rewrite copying logic The implementation of qemu-img convert is (a) messy, (b) buggy, and (c) less efficient than possible. The changes required to beat some sense into it are massive enough that incremental changes would only make my and the reviewers' life harder. So throw it away and reimplement it from scratch. Let me give some examples of what I mean by messy, buggy and inefficient: (a) The copying logic of qemu-img convert has two separate branches for compressed and normal target images, which roughly do the same - except for a little code that handles actual differences between compressed and uncompressed images, and much more code that implements just a different set of optimisations and bugs. This is unnecessary code duplication, and makes the code for compressed output (unsurprisingly) suffer from bitrot.
The code for uncompressed output is run twice to count the total length for the progress bar. In the first run it just takes a shortcut and runs only half the loop, and when it's done, it toggles a boolean, jumps out of the loop with a backwards goto and starts over. Works, but pretty is something different. (b) Converting while keeping a backing file (-B option) is broken in several ways. This includes not writing to the image file if the input has zero clusters or data filled with zeros (ignoring that the backing file will be visible instead). It also doesn't correctly limit every iteration of the copy loop to sectors of the same status, so that too many sectors may be copied to the target image. For -B this gives an unexpected result; for other images it just does more work than necessary. Conversion with a compressed target completely ignores any target backing file. (c) qemu-img convert skips reading and writing an area if it knows from metadata that copying isn't needed (except for the bug mentioned above that ignores a status change in some cases). It does, however, read from the source even if it knows that it will read zeros, and then search for non-zero bytes in the read buffer, if it's possible that a write might be needed. This reimplementation of the copying core reorganises the code to remove the duplication and have a much more obvious code flow, by essentially splitting the copy iteration loop into three parts: 1. Find the number of contiguous sectors of the same status at the current offset (This can also be called in a separate loop before the copying loop in order to determine the total sectors for the progress bar.) 2. Read sectors. If the status implies that there is no data there to read (zero or unallocated cluster), don't do anything. 3. Write sectors depending on the status. If it's data, write it. If we want the backing file to be visible (with -B), don't write it. If it's zeroed, skip it if you can, otherwise use bdrv_write_zeroes() to optimise the write at least where possible.
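The loop structure is easiest to see on a toy example before reading the diff. Here is a standalone sketch of step 1 (an editorial illustration, not QEMU code; the hypothetical status[] array stands in for bdrv_get_block_status() results):

    #include <stdio.h>

    enum BlockStatus { BLK_DATA, BLK_ZERO, BLK_BACKING_FILE };

    /* Hypothetical per-sector status map for a six-sector image. */
    static const enum BlockStatus status[] = {
        BLK_DATA, BLK_DATA, BLK_ZERO, BLK_ZERO, BLK_ZERO, BLK_BACKING_FILE,
    };
    static const int total_sectors = sizeof(status) / sizeof(status[0]);

    /* Step 1: number of contiguous sectors sharing one status. */
    static int run_length(int sector_num)
    {
        int n = 1;
        while (sector_num + n < total_sectors &&
               status[sector_num + n] == status[sector_num]) {
            n++;
        }
        return n;
    }

    int main(void)
    {
        static const char *const names[] = { "DATA", "ZERO", "BACKING_FILE" };
        int sector_num = 0;
        while (sector_num < total_sectors) {
            int n = run_length(sector_num);
            /* Steps 2 and 3 (read, then write or skip) act on the whole run. */
            printf("sectors [%d, %d): %s\n", sector_num, sector_num + n,
                   names[status[sector_num]]);
            sector_num += n;
        }
        return 0;
    }

Each run is then read (skipped for zero/unallocated ranges) and written according to its status in one go, which is what convert_iteration_sectors(), convert_read() and convert_write() in the diff below implement.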
Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- qemu-img.c | 516 +++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 310 insertions(+), 206 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 9dddfbe..8d30e43 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1305,20 +1305,312 @@ out3: return ret; } +enum ImgConvertBlockStatus { + BLK_DATA, + BLK_ZERO, + BLK_BACKING_FILE, +}; + +typedef struct ImgConvertState { + BlockBackend **src; + int64_t *src_sectors; + int src_cur, src_num; + int64_t src_cur_offset; + int64_t total_sectors; + int64_t allocated_sectors; + enum ImgConvertBlockStatus status; + int64_t sector_next_status; + BlockBackend *target; + bool has_zero_init; + bool compressed; + bool target_has_backing; + int min_sparse; + size_t cluster_sectors; + size_t buf_sectors; +} ImgConvertState; + +static void convert_select_part(ImgConvertState *s, int64_t sector_num) +{ + assert(sector_num >= s->src_cur_offset); + while (sector_num - s->src_cur_offset >= s->src_sectors[s->src_cur]) { + s->src_cur_offset += s->src_sectors[s->src_cur]; + s->src_cur++; + assert(s->src_cur < s->src_num); + } +} + +static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) +{ + int64_t ret; + int n; + + convert_select_part(s, sector_num); + + assert(s->total_sectors > sector_num); + n = MIN(s->total_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + + if (s->sector_next_status <= sector_num) { + ret = bdrv_get_block_status(blk_bs(s->src[s->src_cur]), + sector_num - s->src_cur_offset, + n, &n); + if (ret < 0) { + return ret; + } + + if (ret & BDRV_BLOCK_ZERO) { + s->status = BLK_ZERO; + } else if (ret & BDRV_BLOCK_DATA) { + s->status = BLK_DATA; + } else if (!s->target_has_backing) { + /* Without a target backing file we must copy over the contents of + * the backing file as well. */ + /* TODO Check block status of the backing file chain to avoid + * needlessly reading zeroes and limiting the iteration to the + * buffer size */ + s->status = BLK_DATA; + } else { + s->status = BLK_BACKING_FILE; + } + + s->sector_next_status = sector_num + n; + } + + n = MIN(n, s->sector_next_status - sector_num); + if (s->status == BLK_DATA) { + n = MIN(n, s->buf_sectors); + } + + /* We need to write complete clusters for compressed images, so if an + * unallocated area is shorter than that, we must consider the whole + * cluster allocated. */ + if (s->compressed) { + if (n < s->cluster_sectors) { + n = MIN(s->cluster_sectors, s->total_sectors - sector_num); + s->status = BLK_DATA; + } else { + n = QEMU_ALIGN_DOWN(n, s->cluster_sectors); + } + } + + return n; +} + +static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors, + uint8_t *buf) +{ + int n; + int ret; + + if (s->status == BLK_ZERO || s->status == BLK_BACKING_FILE) { + return 0; + } + + assert(nb_sectors <= s->buf_sectors); + while (nb_sectors > 0) { + BlockBackend *blk; + int64_t bs_sectors; + + /* In the case of compression with multiple source files, we can get a + * nb_sectors that spreads into the next part. So we must be able to + * read across multiple BDSes for one convert_read() call. 
*/ + convert_select_part(s, sector_num); + blk = s->src[s->src_cur]; + bs_sectors = s->src_sectors[s->src_cur]; + + n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset)); + ret = blk_read(blk, sector_num - s->src_cur_offset, buf, n); + if (ret < 0) { + return ret; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, + const uint8_t *buf) +{ + int ret; + + while (nb_sectors > 0) { + int n = nb_sectors; + + switch (s->status) { + case BLK_BACKING_FILE: + /* If we have a backing file, leave clusters unallocated that are + * unallocated in the source image, so that the backing file is + * visible at the respective offset. */ + assert(s->target_has_backing); + break; + + case BLK_DATA: + /* We must always write compressed clusters as a whole, so don't + * try to find zeroed parts in the buffer. We can only save the + * write if the buffer is completely zeroed and we're allowed to + * keep the target sparse. */ + if (s->compressed) { + if (s->has_zero_init && s->min_sparse && + buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) + { + assert(!s->target_has_backing); + break; + } + + ret = blk_write_compressed(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + + /* If there is real non-zero data or we're told to keep the target + * fully allocated (-S 0), we must write it. Otherwise we can treat + * it as zero sectors. */ + if (!s->min_sparse || + is_allocated_sectors_min(buf, n, &n, s->min_sparse)) + { + ret = blk_write(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + /* fall-through */ + + case BLK_ZERO: + if (s->has_zero_init) { + break; + } + ret = blk_write_zeroes(s->target, sector_num, n, 0); + if (ret < 0) { + return ret; + } + break; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_do_copy(ImgConvertState *s) +{ + uint8_t *buf = NULL; + int64_t sector_num, allocated_done; + int ret; + int n; + + /* Check whether we have zero initialisation or can get it efficiently */ + s->has_zero_init = s->min_sparse && !s->target_has_backing + ? bdrv_has_zero_init(blk_bs(s->target)) + : false; + + if (!s->has_zero_init && !s->target_has_backing && + bdrv_can_write_zeroes_with_unmap(blk_bs(s->target))) + { + ret = bdrv_make_zero(blk_bs(s->target), BDRV_REQ_MAY_UNMAP); + if (ret == 0) { + s->has_zero_init = true; + } + } + + /* Allocate buffer for copied data. For compressed images, only one cluster + * can be copied at a time. 
*/ + if (s->compressed) { + if (s->cluster_sectors <= 0 || s->cluster_sectors > s->buf_sectors) { + error_report("invalid cluster size"); + ret = -EINVAL; + goto fail; + } + s->buf_sectors = s->cluster_sectors; + } + buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE); + + /* Calculate allocated sectors for progress */ + s->allocated_sectors = 0; + sector_num = 0; + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + s->allocated_sectors += n; + } + sector_num += n; + } + + /* Do the copy */ + s->src_cur = 0; + s->src_cur_offset = 0; + s->sector_next_status = 0; + + sector_num = 0; + allocated_done = 0; + + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + allocated_done += n; + qemu_progress_print(100.0 * allocated_done / s->allocated_sectors, + 0); + } + + ret = convert_read(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while reading sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + ret = convert_write(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + sector_num += n; + } + + if (s->compressed) { + /* signal EOF to align */ + ret = blk_write_compressed(s->target, 0, NULL, 0); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + qemu_vfree(buf); + return ret; +} + static int img_convert(int argc, char **argv) { - int c, n, n1, bs_n, bs_i, compress, cluster_sectors, skip_create; + int c, bs_n, bs_i, compress, cluster_sectors, skip_create; int64_t ret = 0; int progress = 0, flags, src_flags; const char *fmt, *out_fmt, *cache, *src_cache, *out_baseimg, *out_filename; BlockDriver *drv, *proto_drv; BlockBackend **blk = NULL, *out_blk = NULL; BlockDriverState **bs = NULL, *out_bs = NULL; - int64_t total_sectors, nb_sectors, sector_num, bs_offset; + int64_t total_sectors; int64_t *bs_sectors = NULL; - uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; - const uint8_t *buf1; BlockDriverInfo bdi; QemuOpts *opts = NULL; QemuOptsList *create_opts = NULL; @@ -1329,6 +1621,7 @@ static int img_convert(int argc, char **argv) bool quiet = false; Error *local_err = NULL; QemuOpts *sn_opts = NULL; + ImgConvertState state; fmt = NULL; out_fmt = "raw"; @@ -1627,9 +1920,6 @@ static int img_convert(int argc, char **argv) } out_bs = blk_bs(out_blk); - bs_i = 0; - bs_offset = 0; - /* increase bufsectors from the default 4096 (2M) if opt_transfer_length * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB) * as maximum. 
*/ @@ -1638,8 +1928,6 @@ static int img_convert(int argc, char **argv) out_bs->bl.discard_alignment)) ); - buf = blk_blockalign(out_blk, bufsectors * BDRV_SECTOR_SIZE); - if (skip_create) { int64_t output_sectors = blk_nb_sectors(out_blk); if (output_sectors < 0) { @@ -1666,203 +1954,20 @@ static int img_convert(int argc, char **argv) cluster_sectors = bdi.cluster_size / BDRV_SECTOR_SIZE; } - if (compress) { - if (cluster_sectors <= 0 || cluster_sectors > bufsectors) { - error_report("invalid cluster size"); - ret = -1; - goto out; - } - sector_num = 0; - - nb_sectors = total_sectors; - - for(;;) { - int64_t bs_num; - int remainder; - uint8_t *buf2; - - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) - break; - if (nb_sectors >= cluster_sectors) - n = cluster_sectors; - else - n = nb_sectors; - - bs_num = sector_num - bs_offset; - assert (bs_num >= 0); - remainder = n; - buf2 = buf; - while (remainder > 0) { - int nlow; - while (bs_num == bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i++; - assert (bs_i < bs_n); - bs_num = 0; - /* printf("changing part: sector_num=%" PRId64 ", " - "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64 - "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - assert (bs_num < bs_sectors[bs_i]); - - nlow = remainder > bs_sectors[bs_i] - bs_num - ? bs_sectors[bs_i] - bs_num : remainder; - - ret = blk_read(blk[bs_i], bs_num, buf2, nlow); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - bs_num, strerror(-ret)); - goto out; - } - - buf2 += nlow * 512; - bs_num += nlow; - - remainder -= nlow; - } - assert (remainder == 0); - - if (!buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) { - ret = blk_write_compressed(out_blk, sector_num, buf, n); - if (ret != 0) { - error_report("error while compressing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n; - qemu_progress_print(100.0 * sector_num / total_sectors, 0); - } - /* signal EOF to align */ - blk_write_compressed(out_blk, 0, NULL, 0); - } else { - int64_t sectors_to_read, sectors_read, sector_num_next_status; - bool count_allocated_sectors; - int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0; - - if (!has_zero_init && bdrv_can_write_zeroes_with_unmap(out_bs)) { - ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP); - if (ret < 0) { - goto out; - } - has_zero_init = 1; - } - - sectors_to_read = total_sectors; - count_allocated_sectors = progress && (out_baseimg || has_zero_init); -restart: - sector_num = 0; // total number of sectors converted so far - sectors_read = 0; - sector_num_next_status = 0; - - for(;;) { - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) { - if (count_allocated_sectors) { - sectors_to_read = sectors_read; - count_allocated_sectors = false; - goto restart; - } - ret = 0; - break; - } - - while (sector_num - bs_offset >= bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i ++; - assert (bs_i < bs_n); - /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, " - "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n", - sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - - if ((out_baseimg || has_zero_init) && - sector_num >= sector_num_next_status) { - n = nb_sectors > INT_MAX ? 
INT_MAX : nb_sectors; - ret = bdrv_get_block_status(bs[bs_i], sector_num - bs_offset, - n, &n1); - if (ret < 0) { - error_report("error while reading block status of sector %" - PRId64 ": %s", sector_num - bs_offset, - strerror(-ret)); - goto out; - } - /* If the output image is zero initialized, we are not working - * on a shared base and the input is zero we can skip the next - * n1 sectors */ - if (has_zero_init && !out_baseimg && (ret & BDRV_BLOCK_ZERO)) { - sector_num += n1; - continue; - } - /* If the output image is being created as a copy on write - * image, assume that sectors which are unallocated in the - * input image are present in both the output's and input's - * base images (no need to copy them). */ - if (out_baseimg) { - if (!(ret & BDRV_BLOCK_DATA)) { - sector_num += n1; - continue; - } - /* The next 'n1' sectors are allocated in the input image. - * Copy only those as they may be followed by unallocated - * sectors. */ - nb_sectors = n1; - } - /* avoid redundant callouts to get_block_status */ - sector_num_next_status = sector_num + n1; - } - - n = MIN(nb_sectors, bufsectors); - - /* round down request length to an aligned sector, but - * do not bother doing this on short requests. They happen - * when we found an all-zero area, and the next sector to - * write will not be sector_num + n. */ - if (cluster_sectors > 0 && n >= cluster_sectors) { - int64_t next_aligned_sector = (sector_num + n); - next_aligned_sector -= next_aligned_sector % cluster_sectors; - if (sector_num + n > next_aligned_sector) { - n = next_aligned_sector - sector_num; - } - } - - n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset)); - - sectors_read += n; - if (count_allocated_sectors) { - sector_num += n; - continue; - } + state = (ImgConvertState) { + .src = blk, + .src_sectors = bs_sectors, + .src_num = bs_n, + .total_sectors = total_sectors, + .target = out_blk, + .compressed = compress, + .target_has_backing = (bool) out_baseimg, + .min_sparse = min_sparse, + .cluster_sectors = cluster_sectors, + .buf_sectors = bufsectors, + }; + ret = convert_do_copy(&state); - n1 = n; - ret = blk_read(blk[bs_i], sector_num - bs_offset, buf, n); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - sector_num - bs_offset, strerror(-ret)); - goto out; - } - /* NOTE: at the same time we convert, we do not write zero - sectors to have a chance to compress the image. Ideally, we - should add a specific call to have the info to go faster */ - buf1 = buf; - while (n > 0) { - if (!has_zero_init || - is_allocated_sectors_min(buf1, n, &n1, min_sparse)) { - ret = blk_write(out_blk, sector_num, buf1, n1); - if (ret < 0) { - error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n1; - n -= n1; - buf1 += n1 * 512; - } - qemu_progress_print(100.0 * sectors_read / sectors_to_read, 0); - } - } out: if (!ret) { qemu_progress_print(100, 0); @@ -1870,7 +1975,6 @@ out: qemu_progress_end(); qemu_opts_del(opts); qemu_opts_free(create_opts); - qemu_vfree(buf); qemu_opts_del(sn_opts); blk_unref(out_blk); g_free(bs); -- cgit v1.1 From e4f587492331df0ac50bad6131ea273d527af796 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 19 Mar 2015 13:33:33 +0100 Subject: qemu-iotests: Some qemu-img convert tests This adds a regression test for some problems that the qemu-img convert rewrite just fixed. 
Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz --- tests/qemu-iotests/122 | 223 +++++++++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/122.out | 209 ++++++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 433 insertions(+) create mode 100755 tests/qemu-iotests/122 create mode 100644 tests/qemu-iotests/122.out diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 new file mode 100755 index 0000000..350ca9c --- /dev/null +++ b/tests/qemu-iotests/122 @@ -0,0 +1,223 @@ +#!/bin/bash +# +# Test some qemu-img convert cases +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +# creator +owner=kwolf@redhat.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +tmp=/tmp/$$ +status=1 # failure is the default! + +_cleanup() +{ + rm -f "$TEST_IMG".[123] + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + + +TEST_IMG="$TEST_IMG".base _make_test_img 64M +$QEMU_IO -c "write -P 0x11 0 64M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Check allocation status regression with -B ===" +echo + +_make_test_img -b "$TEST_IMG".base +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IMG map "$TEST_IMG".orig | _filter_qemu_img_map + + +echo +echo "=== Check that zero clusters are kept in overlay ===" +echo + +_make_test_img -b "$TEST_IMG".base + +$QEMU_IO -c "write -P 0 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IO -c "write -z 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Concatenate multiple source images ===" +echo + +TEST_IMG="$TEST_IMG".1 _make_test_img 4M +TEST_IMG="$TEST_IMG".2 _make_test_img 4M +TEST_IMG="$TEST_IMG".3 _make_test_img 4M + +$QEMU_IO -c "write -P 0x11 0 64k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x22 0 64k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x33 0 64k" "$TEST_IMG".3 2>&1 | _filter_qemu_io | 
_filter_testdir + +$QEMU_IMG convert -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG map "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0x11 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 8M 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# -B can't be combined with concatenation +$QEMU_IMG convert -O $IMGFMT -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG" +$QEMU_IMG convert -O $IMGFMT -c -B "$TEST_IMG".base "$TEST_IMG".[123] "$TEST_IMG" + + +echo +echo "=== Compression with misaligned allocations and image sizes ===" +echo + +TEST_IMG="$TEST_IMG".1 _make_test_img 1023k -o cluster_size=1024 +TEST_IMG="$TEST_IMG".2 _make_test_img 1023k -o cluster_size=1024 + +$QEMU_IO -c "write -P 0x11 16k 16k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x22 130k 130k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x33 1022k 1k" "$TEST_IMG".1 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0x44 0k 1k" "$TEST_IMG".2 2>&1 | _filter_qemu_io | _filter_testdir + +$QEMU_IMG convert -c -O $IMGFMT "$TEST_IMG".[12] "$TEST_IMG" +$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map +$QEMU_IO -c "read -P 0 0k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 16k 16k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32k 98k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 130k 130k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 260k 762k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x33 1022k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x44 1023k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + + +echo +echo "=== Full allocation with -S 0 ===" +echo + +# Standalone image +_make_test_img 64M +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write -P 0 3M 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo convert -S 0: +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0: +$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 3M 61M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +# With backing file +TEST_IMG="$TEST_IMG".base _make_test_img 64M +$QEMU_IO -c "write -P 0x11 0 32M" "$TEST_IMG".base 2>&1 | _filter_qemu_io | _filter_testdir + 
+_make_test_img -b "$TEST_IMG".base 64M +$QEMU_IO -c "write -P 0x22 0 3M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +echo +echo convert -S 0 with source backing file: +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0 with source backing file: +$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +# With keeping the backing file +echo +echo convert -S 0 -B ... +$QEMU_IMG convert -O $IMGFMT -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + +echo +echo convert -c -S 0 -B ... +$QEMU_IMG convert -O $IMGFMT -c -S 0 "$TEST_IMG" "$TEST_IMG".orig +$QEMU_IO -c "read -P 0x22 0 3M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 3M 29M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0 32M 32M" "$TEST_IMG".orig 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + + +echo +echo "=== Non-zero -S ===" +echo + +_make_test_img 64M -o cluster_size=1k +$QEMU_IO -c "write -P 0 0 64k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 0 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 8k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "write 17k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +for min_sparse in 4k 8k; do + echo + echo convert -S $min_sparse + $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig + $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map + + echo + echo convert -c -S $min_sparse + # For compressed images, -S values other than 0 are ignored + $QEMU_IMG convert -O $IMGFMT -o cluster_size=1k -c -S $min_sparse "$TEST_IMG" "$TEST_IMG".orig + $QEMU_IMG map --output=json "$TEST_IMG".orig | _filter_qemu_img_map +done + +# success, all done +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out new file mode 100644 index 0000000..1f853b9 --- /dev/null +++ b/tests/qemu-iotests/122.out @@ -0,0 +1,209 @@ +QA output created by 122 +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +wrote 67108864/67108864 bytes at offset 0 +64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Check allocation status regression with -B === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX 
YYY/sec and XXX ops/sec) +Offset Length File +0 0x300000 TEST_DIR/t.IMGFMT.orig +0x300000 0x3d00000 TEST_DIR/t.IMGFMT.base + +=== Check that zero clusters are kept in overlay === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Concatenate multiple source images === + +Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=4194304 +Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=4194304 +Formatting 'TEST_DIR/t.IMGFMT.3', fmt=IMGFMT size=4194304 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x10000 TEST_DIR/t.IMGFMT +0x400000 0x10000 TEST_DIR/t.IMGFMT +0x800000 0x10000 TEST_DIR/t.IMGFMT +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 4194304 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 8388608 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 65536, "length": 4128768, "depth": 0, "zero": true, "data": false}, +{ "start": 4194304, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 4259840, "length": 4128768, "depth": 0, "zero": true, "data": false}, +{ "start": 8388608, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 8454144, "length": 4128768, "depth": 0, "zero": true, "data": false}] +read 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 4194304 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 65536/65536 bytes at offset 8388608 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qemu-img: -B makes no sense when concatenating multiple input images +qemu-img: -B makes no sense when concatenating multiple input images + +=== Compression with misaligned allocations and image sizes === + +Formatting 'TEST_DIR/t.IMGFMT.1', fmt=IMGFMT size=1047552 +Formatting 'TEST_DIR/t.IMGFMT.2', fmt=IMGFMT size=1047552 +wrote 16384/16384 bytes at offset 16384 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 133120/133120 bytes at offset 133120 +130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 1046528 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 0 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 65536, "length": 65536, "depth": 0, "zero": true, "data": false}, +{ "start": 131072, "length": 196608, "depth": 0, "zero": false, "data": true}, +{ "start": 327680, "length": 655360, "depth": 0, "zero": 
true, "data": false}, +{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true}, +{ "start": 1048576, "length": 1046528, "depth": 0, "zero": true, "data": false}] +read 16384/16384 bytes at offset 0 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 16384/16384 bytes at offset 16384 +16 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 100352/100352 bytes at offset 32768 +98 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 133120/133120 bytes at offset 133120 +130 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 780288/780288 bytes at offset 266240 +762 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 1046528 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 1047552 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1046528/1046528 bytes at offset 1048576 +1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Full allocation with -S 0 === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 3145728/3145728 bytes at offset 3145728 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 0: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 63963136/63963136 bytes at offset 3145728 +61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true, "offset": 327680}, +{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}] + +convert -c -S 0: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 63963136/63963136 bytes at offset 3145728 +61 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 6291456, "depth": 0, "zero": false, "data": true}, +{ "start": 6291456, "length": 60817408, "depth": 0, "zero": true, "data": false}] +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 +wrote 33554432/33554432 bytes at offset 0 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file='TEST_DIR/t.IMGFMT.base' +wrote 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 0 with source backing file: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}] + +convert -c -S 0 with source backing file: +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}] + +convert -S 0 -B ... 
+read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true, "offset": 327680}] + +convert -c -S 0 -B ... +read 3145728/3145728 bytes at offset 0 +3 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 30408704/30408704 bytes at offset 3145728 +29 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 33554432/33554432 bytes at offset 33554432 +32 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 67108864, "depth": 0, "zero": false, "data": true}] + +=== Non-zero -S === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 0 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 8192 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 1024/1024 bytes at offset 17408 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +convert -S 4k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 8192}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 9216}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 10240}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -c -S 4k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -S 8k +[{ "start": 0, "length": 9216, "depth": 0, "zero": false, "data": true, "offset": 8192}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true, "offset": 17408}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] + +convert -c -S 8k +[{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 1024, "length": 7168, "depth": 0, "zero": true, "data": false}, +{ "start": 8192, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 9216, "length": 8192, "depth": 0, "zero": true, "data": false}, +{ "start": 17408, "length": 1024, "depth": 0, "zero": false, "data": true}, +{ "start": 18432, "length": 67090432, "depth": 0, "zero": true, "data": false}] +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index bcf2578..3cf55e5 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -122,6 +122,7 @@ 115 rw auto 116 rw auto quick 121 rw auto +122 rw auto 123 rw auto quick 128 rw auto quick 130 rw auto quick -- cgit v1.1 From 8eedfbd4a50299f03b3630659c34ad1b01f69370 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 14 Apr 2015 16:32:45 +0200 
Subject: blkdebug: Add bdrv_truncate() This is, amongst others, required for qemu-iotests 033 to run as intended on VHDX, which uses explicit bdrv_truncate() calls to bs->file when allocating new blocks. Signed-off-by: Kevin Wolf Reviewed-by: Jeff Cody --- block/blkdebug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/blkdebug.c b/block/blkdebug.c index 63611e0..3c30edb 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -721,6 +721,11 @@ static int64_t blkdebug_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file, offset); +} + static void blkdebug_refresh_filename(BlockDriverState *bs) { QDict *opts; @@ -779,6 +784,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_file_open = blkdebug_open, .bdrv_close = blkdebug_close, .bdrv_getlength = blkdebug_getlength, + .bdrv_truncate = blkdebug_truncate, .bdrv_refresh_filename = blkdebug_refresh_filename, .bdrv_aio_readv = blkdebug_aio_readv, -- cgit v1.1 From d1a126c53ddc563b7b731cee013e0362f7a5f22f Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 14 Apr 2015 16:36:16 +0200 Subject: vhdx: Fix zero-fill iov length Fix the length of the zero-fill for the back, which was accidentally using the same value as for the front. This is caught by qemu-iotests 033. For consistency, change the code for the front as well to use the length stored in the iov (it is the same value, copied four lines above). Signed-off-by: Kevin Wolf Acked-by: Jeff Cody --- block/vhdx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/vhdx.c b/block/vhdx.c index bb3ed45..e24062f 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1269,7 +1269,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); memset(iov1.iov_base, 0, iov1.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, - sinfo.block_offset); + iov1.iov_len); sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; } @@ -1285,7 +1285,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); memset(iov2.iov_base, 0, iov2.iov_len); qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, - sinfo.block_offset); + iov2.iov_len); sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; } } -- cgit v1.1 From 1faa5bb73247339bf3d797433a9ade990ef0fb32 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 2 Apr 2015 17:39:22 +0100 Subject: thread-pool: clean up thread_pool_completion_bh() This patch simplifies thread_pool_completion_bh(). The function first checks elem->state: if (elem->state != THREAD_DONE) { continue; } It then goes on to check elem->state == THREAD_DONE although we already know this must be the case. The QLIST_REMOVE() is duplicated down both branches of an if-else statement so that can be lifted out as well. 
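Condensed, the redundant control flow being removed looked like this (sketch of the old loop body; unrelated details elided):

    if (elem->state != THREAD_DONE) {
        continue;
    }
    if (elem->state == THREAD_DONE) {              /* always true here */
        trace_thread_pool_complete(pool, elem, elem->common.opaque,
                                   elem->ret);
    }
    if (elem->state == THREAD_DONE && elem->common.cb) {
        QLIST_REMOVE(elem, all);
        /* ... invoke callback ... */
    } else {
        QLIST_REMOVE(elem, all);                   /* duplicated removal */
        qemu_aio_unref(elem);
    }

After the patch, the trace and the QLIST_REMOVE() happen once, unconditionally, before the callback check.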
Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Message-id: 1427992762-10126-1-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- thread-pool.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/thread-pool.c b/thread-pool.c index e2cac8e..ac909f4 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -170,12 +170,12 @@ restart: if (elem->state != THREAD_DONE) { continue; } - if (elem->state == THREAD_DONE) { - trace_thread_pool_complete(pool, elem, elem->common.opaque, - elem->ret); - } - if (elem->state == THREAD_DONE && elem->common.cb) { - QLIST_REMOVE(elem, all); + + trace_thread_pool_complete(pool, elem, elem->common.opaque, + elem->ret); + QLIST_REMOVE(elem, all); + + if (elem->common.cb) { /* Read state before ret. */ smp_rmb(); @@ -188,8 +188,6 @@ restart: qemu_aio_unref(elem); goto restart; } else { - /* remove the request */ - QLIST_REMOVE(elem, all); qemu_aio_unref(elem); } } -- cgit v1.1 From 9eddd6a4b3b187ba50038800b6e4aeda4973b365 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 26 Mar 2015 22:42:34 +0000 Subject: scripts: add 'qemu coroutine' command to qemu-gdb.py The 'qemu coroutine ' GDB command prints the backtrace for a CoroutineUContext. This is useful for peeking inside yielded coroutines that are waiting for file descriptor events, timers, etc. For example: $ gdb tests/test-coroutine (gdb) b test_yield (gdb) r (gdb) b qemu_coroutine_enter (gdb) c (gdb) c Continuing. Breakpoint 2, qemu_coroutine_enter (co=0x555555c66520, opaque=0x0) at qemu-coroutine.c:103 103 { (gdb) source scripts/qemu-gdb.py (gdb) qemu coroutine 0x555555c66520 #0 0x000055555557a740 in qemu_coroutine_switch (from_=, to_=0x7ffff7f90a70, action=COROUTINE_YIELD) at coroutine-ucontext.c:177 #1 0x0000555555566af9 in yield_5_times (opaque=0x7fffffffdbb7) at tests/test-coroutine.c:107 #2 0x000055555557a7aa in coroutine_trampoline (i0=, i1=) at coroutine-ucontext.c:80 #3 0x00007ffff08de000 in __start_context () at /lib64/libc.so.6 Signed-off-by: Stefan Hajnoczi Message-id: 1427409754-8556-1-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- scripts/qemu-gdb.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/scripts/qemu-gdb.py b/scripts/qemu-gdb.py index 8a0f305..6c7f4fb 100644 --- a/scripts/qemu-gdb.py +++ b/scripts/qemu-gdb.py @@ -22,12 +22,86 @@ def isnull(ptr): def int128(p): return long(p['lo']) + (long(p['hi']) << 64) +def get_fs_base(): + '''Fetch %fs base value using arch_prctl(ARCH_GET_FS)''' + # %rsp - 120 is scratch space according to the SystemV ABI + old = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)') + gdb.execute('call arch_prctl(0x1003, $rsp - 120)', False, True) + fs_base = gdb.parse_and_eval('*(uint64_t*)($rsp - 120)') + gdb.execute('set *(uint64_t*)($rsp - 120) = %s' % old, False, True) + return fs_base + +def get_glibc_pointer_guard(): + '''Fetch glibc pointer guard value''' + fs_base = get_fs_base() + return gdb.parse_and_eval('*(uint64_t*)((uint64_t)%s + 0x30)' % fs_base) + +def glibc_ptr_demangle(val, pointer_guard): + '''Undo effect of glibc's PTR_MANGLE()''' + return gdb.parse_and_eval('(((uint64_t)%s >> 0x11) | ((uint64_t)%s << (64 - 0x11))) ^ (uint64_t)%s' % (val, val, pointer_guard)) + +def bt_jmpbuf(jmpbuf): + '''Backtrace a jmpbuf''' + JB_RBX = 0 + JB_RBP = 1 + JB_R12 = 2 + JB_R13 = 3 + JB_R14 = 4 + JB_R15 = 5 + JB_RSP = 6 + JB_PC = 7 + + old_rbx = gdb.parse_and_eval('(uint64_t)$rbx') + old_rbp = gdb.parse_and_eval('(uint64_t)$rbp') + old_rsp = 
gdb.parse_and_eval('(uint64_t)$rsp') + old_r12 = gdb.parse_and_eval('(uint64_t)$r12') + old_r13 = gdb.parse_and_eval('(uint64_t)$r13') + old_r14 = gdb.parse_and_eval('(uint64_t)$r14') + old_r15 = gdb.parse_and_eval('(uint64_t)$r15') + old_rip = gdb.parse_and_eval('(uint64_t)$rip') + + pointer_guard = get_glibc_pointer_guard() + gdb.execute('set $rbx = %s' % jmpbuf[JB_RBX]) + gdb.execute('set $rbp = %s' % glibc_ptr_demangle(jmpbuf[JB_RBP], pointer_guard)) + gdb.execute('set $rsp = %s' % glibc_ptr_demangle(jmpbuf[JB_RSP], pointer_guard)) + gdb.execute('set $r12 = %s' % jmpbuf[JB_R12]) + gdb.execute('set $r13 = %s' % jmpbuf[JB_R13]) + gdb.execute('set $r14 = %s' % jmpbuf[JB_R14]) + gdb.execute('set $r15 = %s' % jmpbuf[JB_R15]) + gdb.execute('set $rip = %s' % glibc_ptr_demangle(jmpbuf[JB_PC], pointer_guard)) + + gdb.execute('bt') + + gdb.execute('set $rbx = %s' % old_rbx) + gdb.execute('set $rbp = %s' % old_rbp) + gdb.execute('set $rsp = %s' % old_rsp) + gdb.execute('set $r12 = %s' % old_r12) + gdb.execute('set $r13 = %s' % old_r13) + gdb.execute('set $r14 = %s' % old_r14) + gdb.execute('set $r15 = %s' % old_r15) + gdb.execute('set $rip = %s' % old_rip) + class QemuCommand(gdb.Command): '''Prefix for QEMU debug support commands''' def __init__(self): gdb.Command.__init__(self, 'qemu', gdb.COMMAND_DATA, gdb.COMPLETE_NONE, True) +class CoroutineCommand(gdb.Command): + '''Display coroutine backtrace''' + def __init__(self): + gdb.Command.__init__(self, 'qemu coroutine', gdb.COMMAND_DATA, + gdb.COMPLETE_NONE) + + def invoke(self, arg, from_tty): + argv = gdb.string_to_argv(arg) + if len(argv) != 1: + gdb.write('usage: qemu coroutine \n') + return + + coroutine_pointer = gdb.parse_and_eval(argv[0]).cast(gdb.lookup_type('CoroutineUContext').pointer()) + bt_jmpbuf(coroutine_pointer['env']['__jmpbuf']) + class MtreeCommand(gdb.Command): '''Display the memory tree hierarchy''' def __init__(self): @@ -86,4 +160,5 @@ class MtreeCommand(gdb.Command): subregion = subregion['subregions_link']['tqe_next'] QemuCommand() +CoroutineCommand() MtreeCommand() -- cgit v1.1 From e5e51dd3af6a0872dedce290ee41437b5aeed109 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 1 Apr 2015 09:45:38 +0800 Subject: block/null: Latency simulation by adding new option "latency-ns" Aio context switch should just work because the requests will be drained, so the scheduled timer(s) on the old context will be freed. 
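As a usage example (illustrative invocation, not part of this series), a null drive that adds roughly 100 microseconds of latency to each request could be created with something like:

    -drive if=none,id=null0,driver=null-co,latency-ns=100000

In the coroutine backend the delay is implemented with co_aio_sleep_ns() in the request coroutine; in the AIO backend a timer on the BDS's AioContext replaces the usual completion bottom half, which is why switching contexts only requires the requests to be drained.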
Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Message-id: 1427852740-24315-2-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/null.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++------ qapi/block-core.json | 5 ++++- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/block/null.c b/block/null.c index ec2bd27..267d3e6 100644 --- a/block/null.c +++ b/block/null.c @@ -12,8 +12,11 @@ #include "block/block_int.h" +#define NULL_OPT_LATENCY "latency-ns" + typedef struct { int64_t length; + int64_t latency_ns; } BDRVNullState; static QemuOptsList runtime_opts = { @@ -30,6 +33,12 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_SIZE, .help = "size of the null block", }, + { + .name = NULL_OPT_LATENCY, + .type = QEMU_OPT_NUMBER, + .help = "nanoseconds (approximated) to wait " + "before completing request", + }, { /* end of list */ } }, }; @@ -39,13 +48,20 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, { QemuOpts *opts; BDRVNullState *s = bs->opaque; + int ret = 0; opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &error_abort); s->length = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); + s->latency_ns = + qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); + if (s->latency_ns < 0) { + error_setg(errp, "latency-ns is invalid"); + ret = -EINVAL; + } qemu_opts_del(opts); - return 0; + return ret; } static void null_close(BlockDriverState *bs) @@ -58,28 +74,40 @@ static int64_t null_getlength(BlockDriverState *bs) return s->length; } +static coroutine_fn int null_co_common(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + + if (s->latency_ns) { + co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, + s->latency_ns); + } + return 0; +} + static coroutine_fn int null_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return 0; + return null_co_common(bs); } static coroutine_fn int null_co_flush(BlockDriverState *bs) { - return 0; + return null_co_common(bs); } typedef struct { BlockAIOCB common; QEMUBH *bh; + QEMUTimer timer; } NullAIOCB; static const AIOCBInfo null_aiocb_info = { @@ -94,15 +122,33 @@ static void null_bh_cb(void *opaque) qemu_aio_unref(acb); } +static void null_timer_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + timer_deinit(&acb->timer); + qemu_aio_unref(acb); +} + static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { NullAIOCB *acb; + BDRVNullState *s = bs->opaque; acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); - qemu_bh_schedule(acb->bh); + /* Only emulate latency after vcpu is running. 
*/ + if (s->latency_ns) { + aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + null_timer_cb, acb); + timer_mod_ns(&acb->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); + } else { + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); + qemu_bh_schedule(acb->bh); + } return &acb->common; } diff --git a/qapi/block-core.json b/qapi/block-core.json index 7873084..e158a7c 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1310,11 +1310,14 @@ # Driver specific block device options for the null backend. # # @size: #optional size of the device in bytes. +# @latency-ns: #optional emulated latency (in nanoseconds) in processing +# requests. Default to zero which completes requests immediately. +# (Since 2.4) # # Since: 2.2 ## { 'type': 'BlockdevOptionsNull', - 'data': { '*size': 'int' } } + 'data': { '*size': 'int', '*latency-ns': 'uint64' } } ## # @BlockdevOptionsVVFAT -- cgit v1.1 From 1c2b49a17282f3abd9ccf71b65d0be62d3b3192e Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 1 Apr 2015 09:45:39 +0800 Subject: block/null: Support reopen Reopen is used in block-commit. With this always-succeed operation, it is now possible to test committing to a null drive, by specifying "null-aio://" or "null-co://" as the backing image when creating the qcow2 image. Signed-off-by: Fam Zheng Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1427852740-24315-3-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/null.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/null.c b/block/null.c index 267d3e6..7d08323 100644 --- a/block/null.c +++ b/block/null.c @@ -177,6 +177,12 @@ static BlockAIOCB *null_aio_flush(BlockDriverState *bs, return null_aio_common(bs, cb, opaque); } +static int null_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -189,6 +195,7 @@ static BlockDriver bdrv_null_co = { .bdrv_co_readv = null_co_readv, .bdrv_co_writev = null_co_writev, .bdrv_co_flush_to_disk = null_co_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static BlockDriver bdrv_null_aio = { @@ -203,6 +210,7 @@ static BlockDriver bdrv_null_aio = { .bdrv_aio_readv = null_aio_readv, .bdrv_aio_writev = null_aio_writev, .bdrv_aio_flush = null_aio_flush, + .bdrv_reopen_prepare = null_reopen_prepare, }; static void bdrv_null_init(void) -- cgit v1.1 From 199667a8c843d268f0fe80f09041b8c7193f1ba5 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 1 Apr 2015 09:45:40 +0800 Subject: MAINTAINERS: Add Fam Zheng as Null block driver maintainer Signed-off-by: Fam Zheng Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1427852740-24315-4-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7c5e71e..b5ab755 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1182,6 +1182,12 @@ S: Supported F: block/gluster.c T: git git://github.com/codyprime/qemu-kvm-jtc.git block +Null Block Driver +M: Fam Zheng +L: qemu-block@nongnu.org +S: Supported +F: block/null.c + Bootdevice M: Gonglei S: Maintained -- cgit v1.1 From 751ebd76e654bd1e65da08ecf694325282b4cfcc Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 3 Apr 2015 22:05:18 +0800 Subject: blockjob: Allow nested pause This patch changes 
block_job_pause to increase the pause counter and block_job_resume to decrease it. The counter will allow calling block_job_pause/block_job_resume unconditionally on a job when we need to suspend the IO temporarily. From now on, each block_job_resume must be paired with a block_job_pause to keep the counter balanced. The user pause from QMP or HMP will only trigger block_job_pause once until it's resumed, this is achieved by adding a user_paused flag in BlockJob. One occurrence of block_job_resume in mirror_complete is replaced with block_job_enter which does what is necessary. In block_job_cancel, the cancel flag is good enough to instruct coroutines to quit loop, so use block_job_enter to replace the unpaired block_job_resume. Upon block job IO error, user is notified about the entering to the pause state, so this pause belongs to user pause, set the flag accordingly and expect a matching QMP resume. [Extended doc comments as suggested by Paolo Bonzini . --Stefan] Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Reviewed-by: Alberto Garcia Message-id: 1428069921-2957-2-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/mirror.c | 2 +- blockdev.c | 8 +++++--- blockjob.c | 23 +++++++++++++++++------ include/block/blockjob.h | 22 ++++++++++++++++++---- 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 4056164..65b1718 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -634,7 +634,7 @@ static void mirror_complete(BlockJob *job, Error **errp) } s->should_complete = true; - block_job_resume(job); + block_job_enter(&s->common); } static const BlockJobDriver mirror_job_driver = { diff --git a/blockdev.c b/blockdev.c index fbb3a79..9132d69 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2699,7 +2699,7 @@ void qmp_block_job_cancel(const char *device, force = false; } - if (job->paused && !force) { + if (job->user_paused && !force) { error_setg(errp, "The block job for device '%s' is currently paused", device); goto out; @@ -2716,10 +2716,11 @@ void qmp_block_job_pause(const char *device, Error **errp) AioContext *aio_context; BlockJob *job = find_block_job(device, &aio_context, errp); - if (!job) { + if (!job || job->user_paused) { return; } + job->user_paused = true; trace_qmp_block_job_pause(job); block_job_pause(job); aio_context_release(aio_context); @@ -2730,10 +2731,11 @@ void qmp_block_job_resume(const char *device, Error **errp) AioContext *aio_context; BlockJob *job = find_block_job(device, &aio_context, errp); - if (!job) { + if (!job || !job->user_paused) { return; } + job->user_paused = false; trace_qmp_block_job_resume(job); block_job_resume(job); aio_context_release(aio_context); diff --git a/blockjob.c b/blockjob.c index ba2255d..2755465 100644 --- a/blockjob.c +++ b/blockjob.c @@ -107,7 +107,7 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) void block_job_complete(BlockJob *job, Error **errp) { - if (job->paused || job->cancelled || !job->driver->complete) { + if (job->pause_count || job->cancelled || !job->driver->complete) { error_set(errp, QERR_BLOCK_JOB_NOT_READY, bdrv_get_device_name(job->bs)); return; @@ -118,17 +118,26 @@ void block_job_complete(BlockJob *job, Error **errp) void block_job_pause(BlockJob *job) { - job->paused = true; + job->pause_count++; } bool block_job_is_paused(BlockJob *job) { - return job->paused; + return job->pause_count > 0; } void block_job_resume(BlockJob *job) { - job->paused = false; + 
assert(job->pause_count > 0); + job->pause_count--; + if (job->pause_count) { + return; + } + block_job_enter(job); +} + +void block_job_enter(BlockJob *job) +{ block_job_iostatus_reset(job); if (job->co && !job->busy) { qemu_coroutine_enter(job->co, NULL); @@ -138,7 +147,7 @@ void block_job_resume(BlockJob *job) void block_job_cancel(BlockJob *job) { job->cancelled = true; - block_job_resume(job); + block_job_enter(job); } bool block_job_is_cancelled(BlockJob *job) @@ -258,7 +267,7 @@ BlockJobInfo *block_job_query(BlockJob *job) info->device = g_strdup(bdrv_get_device_name(job->bs)); info->len = job->len; info->busy = job->busy; - info->paused = job->paused; + info->paused = job->pause_count > 0; info->offset = job->offset; info->speed = job->speed; info->io_status = job->iostatus; @@ -335,6 +344,8 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs, IO_OPERATION_TYPE_WRITE, action, &error_abort); if (action == BLOCK_ERROR_ACTION_STOP) { + /* make the pause user visible, which will be resumed from QMP. */ + job->user_paused = true; block_job_pause(job); block_job_iostatus_set_err(job, error); if (bs != job->bs) { diff --git a/include/block/blockjob.h b/include/block/blockjob.h index b6d4ebb..57d8ef1 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -79,10 +79,16 @@ struct BlockJob { bool cancelled; /** - * Set to true if the job is either paused, or will pause itself - * as soon as possible (if busy == true). + * Counter for pause request. If non-zero, the block job is either paused, + * or if busy == true will pause itself as soon as possible. */ - bool paused; + int pause_count; + + /** + * Set to true if the job is paused by user. Can be unpaused with the + * block-job-resume QMP command. + */ + bool user_paused; /** * Set to false by the job while it is in a quiescent state, where @@ -225,11 +231,19 @@ void block_job_pause(BlockJob *job); * block_job_resume: * @job: The job to be resumed. * - * Resume the specified job. + * Resume the specified job. Must be paired with a preceding block_job_pause. */ void block_job_resume(BlockJob *job); /** + * block_job_enter: + * @job: The job to enter. + * + * Continue the specified job by entering the coroutine. + */ +void block_job_enter(BlockJob *job); + +/** * block_job_event_cancelled: * @job: The job whose information is requested. * -- cgit v1.1 From 69da3b0b47c8f6016e9109fcfa608e9e7e99bc05 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 3 Apr 2015 22:05:19 +0800 Subject: block: Pause block jobs in bdrv_drain_all This is necessary to suppress more IO requests from being generated from block job coroutines. 
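The resulting shape of bdrv_drain_all() is (sketch of the patch below; AioContext acquire/release omitted): pause every device's job, run the existing drain loop, then resume:

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bs->job) {
            block_job_pause(bs->job);
        }
    }

    /* ... existing busy/drain loop ... */

    QTAILQ_FOREACH(bs, &bdrv_states, device_list) {
        if (bs->job) {
            block_job_resume(bs->job);
        }
    }

The unconditional pause/resume pairing is safe because of the nested pause counter introduced in the previous patch.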
Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Reviewed-by: Alberto Garcia Message-id: 1428069921-2957-3-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/block.c b/block.c index 670d7e4..70aed80 100644 --- a/block.c +++ b/block.c @@ -2040,6 +2040,16 @@ void bdrv_drain_all(void) bool busy = true; BlockDriverState *bs; + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + } + while (busy) { busy = false; @@ -2051,6 +2061,16 @@ void bdrv_drain_all(void) aio_context_release(aio_context); } } + + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } } /* make a BlockDriverState anonymous by removing from bdrv_state and -- cgit v1.1 From e62303a437af72141c8d04c36799521a56d6f4f6 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 3 Apr 2015 22:05:20 +0800 Subject: qemu-iotests: Test that "stop" doesn't drain block jobs Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Reviewed-by: Alberto Garcia Message-id: 1428069921-2957-4-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/129 | 86 ++++++++++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/129.out | 5 +++ tests/qemu-iotests/group | 1 + 3 files changed, 92 insertions(+) create mode 100644 tests/qemu-iotests/129 create mode 100644 tests/qemu-iotests/129.out diff --git a/tests/qemu-iotests/129 b/tests/qemu-iotests/129 new file mode 100644 index 0000000..9e87e1c --- /dev/null +++ b/tests/qemu-iotests/129 @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# +# Tests that "bdrv_drain_all" doesn't drain block jobs +# +# Copyright (C) 2015 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +import os +import iotests +import time + +class TestStopWithBlockJob(iotests.QMPTestCase): + test_img = os.path.join(iotests.test_dir, 'test.img') + target_img = os.path.join(iotests.test_dir, 'target.img') + base_img = os.path.join(iotests.test_dir, 'base.img') + + def setUp(self): + iotests.qemu_img('create', '-f', iotests.imgfmt, self.base_img, "1G") + iotests.qemu_img('create', '-f', iotests.imgfmt, self.test_img, "-b", self.base_img) + iotests.qemu_io('-f', iotests.imgfmt, '-c', 'write -P0x5d 1M 128M', self.test_img) + self.vm = iotests.VM().add_drive(self.test_img) + self.vm.launch() + + def tearDown(self): + params = {"device": "drive0", + "bps": 0, + "bps_rd": 0, + "bps_wr": 0, + "iops": 0, + "iops_rd": 0, + "iops_wr": 0, + } + result = self.vm.qmp("block_set_io_throttle", conv_keys=False, + **params) + self.vm.shutdown() + + def do_test_stop(self, cmd, **args): + """Test 'stop' while block job is running on a throttled drive. + The 'stop' command shouldn't drain the job""" + params = {"device": "drive0", + "bps": 1024, + "bps_rd": 0, + "bps_wr": 0, + "iops": 0, + "iops_rd": 0, + "iops_wr": 0, + } + result = self.vm.qmp("block_set_io_throttle", conv_keys=False, + **params) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp(cmd, **args) + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp("stop") + self.assert_qmp(result, 'return', {}) + result = self.vm.qmp("query-block-jobs") + self.assert_qmp(result, 'return[0]/busy', True) + self.assert_qmp(result, 'return[0]/ready', False) + + def test_drive_mirror(self): + self.do_test_stop("drive-mirror", device="drive0", + target=self.target_img, + sync="full") + + def test_drive_backup(self): + self.do_test_stop("drive-backup", device="drive0", + target=self.target_img, + sync="full") + + def test_block_commit(self): + self.do_test_stop("block-commit", device="drive0") + +if __name__ == '__main__': + iotests.main(supported_fmts=["qcow2"]) diff --git a/tests/qemu-iotests/129.out b/tests/qemu-iotests/129.out new file mode 100644 index 0000000..8d7e9967 --- /dev/null +++ b/tests/qemu-iotests/129.out @@ -0,0 +1,5 @@ +... +---------------------------------------------------------------------- +Ran 3 tests + +OK diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 3cf55e5..7c0d639 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -125,4 +125,5 @@ 122 rw auto 123 rw auto quick 128 rw auto quick +129 rw auto quick 130 rw auto quick -- cgit v1.1 From a7282330c01364ef00260749bc6a37c7f16ec047 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 3 Apr 2015 22:05:21 +0800 Subject: blockjob: Update function name in comments Signed-off-by: Fam Zheng Reviewed-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Reviewed-by: Alberto Garcia Message-id: 1428069921-2957-5-git-send-email-famz@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/backup.c | 2 +- block/mirror.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/backup.c b/block/backup.c index 1c535b1..3312476 100644 --- a/block/backup.c +++ b/block/backup.c @@ -287,7 +287,7 @@ static void coroutine_fn backup_run(void *opaque) break; } - /* we need to yield so that qemu_aio_flush() returns. + /* we need to yield so that bdrv_drain_all() returns. 
* (without, VM does not reboot) */ if (job->common.speed) { diff --git a/block/mirror.c b/block/mirror.c index 65b1718..d421fce 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -475,7 +475,7 @@ static void coroutine_fn mirror_run(void *opaque) (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; /* Note that even when no rate limit is applied we need to yield - * periodically with no pending I/O so that qemu_aio_flush() returns. + * periodically with no pending I/O so that bdrv_drain_all() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. */ -- cgit v1.1 From 0b5a24454fc551f0294fe93821e8c643214a55f5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 28 Mar 2015 07:37:18 +0100 Subject: block: avoid unnecessary bottom halves bdrv_aio_* APIs can use coroutines to achieve asynchronicity. However, the coroutine may terminate without having yielded back to the caller (for example because of something that invokes a nested event loop, or because the coroutine is doing nothing at all). In this case, the bdrv_aio_* API must delay the completion to the next iteration of the main loop, because bdrv_aio_* will never invoke the callback before returning. This can be done with a bottom half, and indeed bdrv_aio_* always uses one for simplicity. It is possible to gain some performance (~3%) by avoiding this in the common case. A new field in the BlockAIOCBCoroutine struct is set to true until the first time the coroutine has yielded to its creator, and completion goes through a new function bdrv_co_complete. If the flag is false, bdrv_co_complete invokes the callback immediately. If it is true, the caller will notice that the coroutine has completed and schedule the bottom half itself. 
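For illustration, a hypothetical caller shows why completion must be deferred until the bdrv_aio_* call has returned (the helper below is a made-up placeholder, not a QEMU function):

    /* The caller still uses the returned acb after the call, so the
     * callback must not have fired before bdrv_aio_readv() returns. */
    BlockAIOCB *acb = bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                     cb, opaque);
    save_for_possible_cancel(acb);  /* hypothetical helper; this would race
                                     * if cb had already completed the acb */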
Signed-off-by: Paolo Bonzini Reviewed-by: Stefan Hajnoczi Message-id: 1427524638-28157-1-git-send-email-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/block.c b/block.c index 70aed80..d13b2e7 100644 --- a/block.c +++ b/block.c @@ -4849,6 +4849,7 @@ typedef struct BlockAIOCBCoroutine { BlockAIOCB common; BlockRequest req; bool is_write; + bool need_bh; bool *done; QEMUBH* bh; } BlockAIOCBCoroutine; @@ -4857,14 +4858,32 @@ static const AIOCBInfo bdrv_em_co_aiocb_info = { .aiocb_size = sizeof(BlockAIOCBCoroutine), }; +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + static void bdrv_co_em_bh(void *opaque) { BlockAIOCBCoroutine *acb = opaque; - acb->common.cb(acb->common.opaque, acb->req.error); - + assert(!acb->need_bh); qemu_bh_delete(acb->bh); - qemu_aio_unref(acb); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } } /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ @@ -4881,8 +4900,7 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque) acb->req.nb_sectors, acb->req.qiov, acb->req.flags); } - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + bdrv_co_complete(acb); } static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, @@ -4898,6 +4916,8 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, BlockAIOCBCoroutine *acb; acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; acb->req.sector = sector_num; acb->req.nb_sectors = nb_sectors; acb->req.qiov = qiov; @@ -4907,6 +4927,7 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, co = qemu_coroutine_create(bdrv_co_do_rw); qemu_coroutine_enter(co, acb); + bdrv_co_maybe_schedule_bh(acb); return &acb->common; } @@ -4916,8 +4937,7 @@ static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) BlockDriverState *bs = acb->common.bs; acb->req.error = bdrv_co_flush(bs); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + bdrv_co_complete(acb); } BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, @@ -4929,10 +4949,13 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, BlockAIOCBCoroutine *acb; acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; co = qemu_coroutine_create(bdrv_aio_flush_co_entry); qemu_coroutine_enter(co, acb); + bdrv_co_maybe_schedule_bh(acb); return &acb->common; } @@ -4942,8 +4965,7 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) BlockDriverState *bs = acb->common.bs; acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + bdrv_co_complete(acb); } BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, @@ -4956,11 +4978,14 @@ BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + 
acb->req.error = -EINPROGRESS; acb->req.sector = sector_num; acb->req.nb_sectors = nb_sectors; co = qemu_coroutine_create(bdrv_aio_discard_co_entry); qemu_coroutine_enter(co, acb); + bdrv_co_maybe_schedule_bh(acb); return &acb->common; } -- cgit v1.1 From 4eb867e98c1815d9d7a2a9380182005df12064a7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 12 Apr 2015 17:55:17 +0200 Subject: virtio_blk: comment fix Update the virtio-blk header from the latest Linux, including comment fixups. Signed-off-by: Michael S. Tsirkin Message-id: 1428854036-12806-1-git-send-email-mst@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- include/standard-headers/linux/virtio_blk.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h index 12016b4..cd601f4 100644 --- a/include/standard-headers/linux/virtio_blk.h +++ b/include/standard-headers/linux/virtio_blk.h @@ -58,7 +58,7 @@ struct virtio_blk_config { uint32_t size_max; /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ uint32_t seg_max; - /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */ + /* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */ struct virtio_blk_geometry { uint16_t cylinders; uint8_t heads; @@ -117,7 +117,11 @@ struct virtio_blk_config { #define VIRTIO_BLK_T_BARRIER 0x80000000 #endif /* !VIRTIO_BLK_NO_LEGACY */ -/* This is the first element of the read scatter-gather list. */ +/* + * This comes first in the read scatter-gather list. + * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, + * this is the first element of the read scatter-gather list. + */ struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __virtio32 type; -- cgit v1.1 From d07063e46047242c4f010ff9ddbff5e02f15d9e7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 14 Apr 2015 17:29:47 +0200 Subject: m25p80: add missing blk_attach_dev_nofail Of the block devices that poked into -drive options via drive_get_next, m25p80 was the only one that also did not attach itself to the BlockBackend. Since sd does it, and all other devices go through a "drive" property, with this change all block backends attached to the guest will have a non-NULL result for blk_get_attached_dev(). Signed-off-by: Paolo Bonzini Reviewed-by: Peter Crosthwaite Message-id: 1429025387-11077-1-git-send-email-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- hw/block/m25p80.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index afe243b..728e384 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -629,6 +629,7 @@ static int m25p80_init(SSISlave *ss) if (dinfo) { DB_PRINT_L(0, "Binding to IF_MTD drive\n"); s->blk = blk_by_legacy_dinfo(dinfo); + blk_attach_dev_nofail(s->blk, s); /* FIXME: Move to late init */ if (blk_read(s->blk, 0, s->storage, -- cgit v1.1 From c485cf9c9277ca9b3d5227c99a13c374e812f42b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 15 Apr 2015 10:43:44 +0100 Subject: m25p80: fix s->blk usage before assignment Delay the call to blk_blockalign() until s->blk has been assigned. This never caused a crash because blk_blockalign(NULL, size) defaults to 4096 alignment but it's technically incorrect. 
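To make the ordering concrete, a minimal sketch based only on the behaviour described above:

    /* blk_blockalign(NULL, size) silently falls back to 4096-byte
     * alignment, so allocate the buffer only after the backend is known: */
    s->blk = blk_by_legacy_dinfo(dinfo);           /* assign the backend first */
    s->storage = blk_blockalign(s->blk, s->size);  /* then align for it */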
Signed-off-by: Stefan Hajnoczi Reviewed-by: Paolo Bonzini Message-id: 1429091024-25098-1-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- hw/block/m25p80.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index 728e384..efc43dd 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -621,7 +621,6 @@ static int m25p80_init(SSISlave *ss) s->size = s->pi->sector_size * s->pi->n_sectors; s->dirty_page = -1; - s->storage = blk_blockalign(s->blk, s->size); /* FIXME use a qdev drive property instead of drive_get_next() */ dinfo = drive_get_next(IF_MTD); @@ -631,6 +630,8 @@ static int m25p80_init(SSISlave *ss) s->blk = blk_by_legacy_dinfo(dinfo); blk_attach_dev_nofail(s->blk, s); + s->storage = blk_blockalign(s->blk, s->size); + /* FIXME: Move to late init */ if (blk_read(s->blk, 0, s->storage, DIV_ROUND_UP(s->size, BDRV_SECTOR_SIZE))) { @@ -639,6 +640,7 @@ static int m25p80_init(SSISlave *ss) } } else { DB_PRINT_L(0, "No BDRV - binding to RAM\n"); + s->storage = blk_blockalign(NULL, s->size); memset(s->storage, 0xFF, s->size); } -- cgit v1.1 From ec683d604069dcdaaa516789274bc0cdc14e5247 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 15 Apr 2015 11:43:42 +0100 Subject: block: document block-stream in qmp-commands.hx The 'block-stream' QMP command is documented in block-core.json but not qmp-commands.hx. Add a summary of the command to qmp-commands.hx (similar to the documentation for 'block-commit'). Reported-by: Kashyap Chamarthy Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Reviewed-by: Max Reitz Message-id: 1429094622-26218-1-git-send-email-stefanha@redhat.com Signed-off-by: Kevin Wolf --- qmp-commands.hx | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/qmp-commands.hx b/qmp-commands.hx index 1e59541..e6ae026 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1007,6 +1007,43 @@ EQMP .mhandler.cmd_new = qmp_marshal_input_block_stream, }, +SQMP +block-stream +------------ + +Copy data from a backing file into a block device. + +Arguments: + +- "device": The device's ID, must be unique (json-string) +- "base": The file name of the backing image above which copying starts + (json-string, optional) +- "backing-file": The backing file string to write into the active layer. This + filename is not validated. + + If a pathname string is such that it cannot be resolved by + QEMU, that means that subsequent QMP or HMP commands must use + node-names for the image in question, as filename lookup + methods will fail. + + If not specified, QEMU will automatically determine the + backing file string to use, or error out if there is no + obvious choice. Care should be taken when specifying the + string, to specify a valid filename or protocol. + (json-string, optional) (Since 2.1) +- "speed": the maximum speed, in bytes per second (json-int, optional) +- "on-error": the action to take on an error (default 'report'). 'stop' and + 'enospc' can only be used if the block device supports io-status. 
+ (json-string, optional) (Since 2.1) + +Example: + +-> { "execute": "block-stream", "arguments": { "device": "virtio0", + "base": "/tmp/master.qcow2" } } +<- { "return": {} } + +EQMP + { .name = "block-commit", .args_type = "device:B,base:s?,top:s?,backing-file:s?,speed:o?", -- cgit v1.1 From 9b2aa84f87f5b95cb0295dcae38fbfbf115df2be Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Wed, 8 Apr 2015 12:29:18 +0300 Subject: block: add bdrv_get_device_or_node_name() This function gets the device name associated with a BlockDriverState, or its node name if the device name is empty. Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Reviewed-by: Eric Blake Message-id: 4fa30aa8d61d9052ce266fd5429a59a14e941255.1428485266.git.berto@igalia.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 9 +++++++++ block/quorum.c | 5 +---- include/block/block.h | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/block.c b/block.c index d13b2e7..cc8010e 100644 --- a/block.c +++ b/block.c @@ -3980,6 +3980,15 @@ const char *bdrv_get_device_name(const BlockDriverState *bs) return bs->blk ? blk_name(bs->blk) : ""; } +/* This can be used to identify nodes that might not have a device + * name associated. Since node and device names live in the same + * namespace, the result is unambiguous. The exception is if both are + * absent, then this returns an empty (non-null) string. */ +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) +{ + return bs->blk ? blk_name(bs->blk) : bs->node_name; +} + int bdrv_get_flags(BlockDriverState *bs) { return bs->open_flags; diff --git a/block/quorum.c b/block/quorum.c index 437b122..f91ef75 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -226,10 +226,7 @@ static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) static void quorum_report_failure(QuorumAIOCB *acb) { - const char *reference = bdrv_get_device_name(acb->common.bs)[0] ? - bdrv_get_device_name(acb->common.bs) : - acb->common.bs->node_name; - + const char *reference = bdrv_get_device_or_node_name(acb->common.bs); qapi_event_send_quorum_failure(reference, acb->sector_num, acb->nb_sectors, &error_abort); } diff --git a/include/block/block.h b/include/block/block.h index 4c57d63..b285e0d 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -398,6 +398,7 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), void *opaque); const char *bdrv_get_node_name(const BlockDriverState *bs); const char *bdrv_get_device_name(const BlockDriverState *bs); +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs); int bdrv_get_flags(BlockDriverState *bs); int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); -- cgit v1.1 From 81e5f78a9f4f13548ec1edddaf780d339f18e2d2 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Wed, 8 Apr 2015 12:29:19 +0300 Subject: block: use bdrv_get_device_or_node_name() in error messages There are several error messages that identify a BlockDriverState by its device name. However those errors can be produced in nodes that don't have a device name associated. In those cases we should use bdrv_get_device_or_node_name() to fall back to the node name and produce a more meaningful message. The messages are also updated to use the more generic term 'node' instead of 'device'. 
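For a concrete feel of the change, a hypothetical before/after for an anonymous node (the message texts below are illustrative, not taken from the patch):

    /* before: "Device '' is busy: block device is in use by block job"
     * after:  "Node 'node0' is busy: block device is in use by block job" */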
Signed-off-by: Alberto Garcia Reviewed-by: Max Reitz Reviewed-by: Eric Blake Message-id: 9823a1f0514fdb0692e92868661c38a9e00a12d6.1428485266.git.berto@igalia.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 24 ++++++++++++------------ block/qcow.c | 8 ++++---- block/qcow2.c | 2 +- block/qed.c | 2 +- block/snapshot.c | 12 ++++++------ block/vdi.c | 6 +++--- block/vhdx.c | 6 +++--- block/vmdk.c | 8 ++++---- block/vpc.c | 6 +++--- block/vvfat.c | 7 ++++--- blockdev.c | 9 +++++---- include/qapi/qmp/qerror.h | 6 ------ 12 files changed, 46 insertions(+), 50 deletions(-) diff --git a/block.c b/block.c index cc8010e..002972a 100644 --- a/block.c +++ b/block.c @@ -1231,8 +1231,8 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd) bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker); } else if (backing_hd) { error_setg(&bs->backing_blocker, - "device is used as backing hd of '%s'", - bdrv_get_device_name(bs)); + "node is used as backing hd of '%s'", + bdrv_get_device_or_node_name(bs)); } bs->backing_hd = backing_hd; @@ -1819,8 +1819,8 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, * to r/w */ if (!(reopen_state->bs->open_flags & BDRV_O_ALLOW_RDWR) && reopen_state->flags & BDRV_O_RDWR) { - error_set(errp, QERR_DEVICE_IS_READ_ONLY, - bdrv_get_device_name(reopen_state->bs)); + error_setg(errp, "Node '%s' is read only", + bdrv_get_device_or_node_name(reopen_state->bs)); goto error; } @@ -1846,9 +1846,9 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue, } else { /* It is currently mandatory to have a bdrv_reopen_prepare() * handler for each supported drv. */ - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(reopen_state->bs), - "reopening of file"); + error_setg(errp, "Block format '%s' used by node '%s' " + "does not support reopening files", drv->format_name, + bdrv_get_device_or_node_name(reopen_state->bs)); ret = -1; goto error; } @@ -3824,8 +3824,8 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) { if (key) { if (!bdrv_is_encrypted(bs)) { - error_setg(errp, "Device '%s' is not encrypted", - bdrv_get_device_name(bs)); + error_setg(errp, "Node '%s' is not encrypted", + bdrv_get_device_or_node_name(bs)); } else if (bdrv_set_key(bs, key) < 0) { error_set(errp, QERR_INVALID_PASSWORD); } @@ -3833,7 +3833,7 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) if (bdrv_key_required(bs)) { error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED, "'%s' (%s) is encrypted", - bdrv_get_device_name(bs), + bdrv_get_device_or_node_name(bs), bdrv_get_encrypted_filename(bs)); } } @@ -5633,8 +5633,8 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp) if (!QLIST_EMPTY(&bs->op_blockers[op])) { blocker = QLIST_FIRST(&bs->op_blockers[op]); if (errp) { - error_setg(errp, "Device '%s' is busy: %s", - bdrv_get_device_name(bs), + error_setg(errp, "Node '%s' is busy: %s", + bdrv_get_device_or_node_name(bs), error_get_pretty(blocker->reason)); } return true; diff --git a/block/qcow.c b/block/qcow.c index 0558969..ab89328 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -124,7 +124,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, snprintf(version, sizeof(version), "QCOW version %" PRIu32, header.version); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow", version); + bdrv_get_device_or_node_name(bs), "qcow", version); ret = -ENOTSUP; 
goto fail; } @@ -229,9 +229,9 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when qcow images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "qcow", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The qcow format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->lock); diff --git a/block/qcow2.c b/block/qcow2.c index f692978..6084eae 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -208,7 +208,7 @@ static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, va_end(ap); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "qcow2", msg); + bdrv_get_device_or_node_name(bs), "qcow2", msg); } static void report_unsupported_feature(BlockDriverState *bs, diff --git a/block/qed.c b/block/qed.c index 9d90888..5bbe069 100644 --- a/block/qed.c +++ b/block/qed.c @@ -408,7 +408,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "QED", buf); + bdrv_get_device_or_node_name(bs), "QED", buf); return -ENOTSUP; } if (!qed_is_cluster_size_valid(s->header.cluster_size)) { diff --git a/block/snapshot.c b/block/snapshot.c index 698e1a1..50ae610 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -246,9 +246,9 @@ int bdrv_snapshot_delete(BlockDriverState *bs, if (bs->file) { return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "internal snapshot deletion"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshot deletion", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } @@ -329,9 +329,9 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs, if (drv->bdrv_snapshot_load_tmp) { return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - drv->format_name, bdrv_get_device_name(bs), - "temporarily load internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support temporarily loading internal snapshots", + drv->format_name, bdrv_get_device_name(bs)); return -ENOTSUP; } diff --git a/block/vdi.c b/block/vdi.c index 53bd02f..7642ef3 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -502,9 +502,9 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, } /* Disable migration when vdi images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vdi", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vdi format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); qemu_co_mutex_init(&s->write_lock); diff --git a/block/vhdx.c b/block/vhdx.c index e24062f..0776de7 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -1002,9 +1002,9 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, /* TODO: differencing files */ /* Disable migration when VHDX images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vhdx", bdrv_get_device_name(bs), "live migration"); + 
error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; diff --git a/block/vmdk.c b/block/vmdk.c index 8410a15..fd94b8f 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -669,7 +669,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, snprintf(buf, sizeof(buf), "VMDK version %" PRId32, le32_to_cpu(header.version)); error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_name(bs), "vmdk", buf); + bdrv_get_device_or_node_name(bs), "vmdk", buf); return -ENOTSUP; } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { /* VMware KB 2064959 explains that version 3 added support for @@ -962,9 +962,9 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->lock); /* Disable migration when VMDK images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vmdk", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); g_free(buf); return 0; diff --git a/block/vpc.c b/block/vpc.c index 43e768e..37572ba 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -318,9 +318,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->lock); /* Disable migration when VHD images are used */ - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vpc", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, "The vpc format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); return 0; diff --git a/block/vvfat.c b/block/vvfat.c index 9be632f..e803589 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -1180,9 +1180,10 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, /* Disable migration when vvfat is used rw */ if (s->qcow) { - error_set(&s->migration_blocker, - QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - "vvfat (rw)", bdrv_get_device_name(bs), "live migration"); + error_setg(&s->migration_blocker, + "The vvfat (rw) format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); migrate_add_blocker(s->migration_blocker); } diff --git a/blockdev.c b/blockdev.c index 9132d69..e67c701 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1248,13 +1248,14 @@ static void internal_snapshot_prepare(BlkTransactionState *common, } if (bdrv_is_read_only(bs)) { - error_set(errp, QERR_DEVICE_IS_READ_ONLY, device); + error_setg(errp, "Device '%s' is read only", device); return; } if (!bdrv_can_snapshot(bs)) { - error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, - bs->drv->format_name, device, "internal snapshot"); + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshots", + bs->drv->format_name, device); return; } @@ -2055,7 +2056,7 @@ void qmp_block_resize(bool has_device, const char *device, error_set(errp, QERR_UNSUPPORTED); break; case -EACCES: - error_set(errp, QERR_DEVICE_IS_READ_ONLY, device); + error_setg(errp, "Device '%s' is read only", device); break; case -EBUSY: error_set(errp, QERR_DEVICE_IN_USE, device); diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h index 57a62d4..e567339 100644 --- a/include/qapi/qmp/qerror.h +++ 
b/include/qapi/qmp/qerror.h @@ -37,9 +37,6 @@ void qerror_report_err(Error *err); #define QERR_BASE_NOT_FOUND \ ERROR_CLASS_GENERIC_ERROR, "Base '%s' not found" -#define QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED \ - ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by device '%s' does not support feature '%s'" - #define QERR_BLOCK_JOB_NOT_READY \ ERROR_CLASS_GENERIC_ERROR, "The active block job for device '%s' cannot be completed" @@ -58,9 +55,6 @@ void qerror_report_err(Error *err); #define QERR_DEVICE_IN_USE \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' is in use" -#define QERR_DEVICE_IS_READ_ONLY \ - ERROR_CLASS_GENERIC_ERROR, "Device '%s' is read only" - #define QERR_DEVICE_NO_HOTPLUG \ ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging" -- cgit v1.1 From dc881b441d74b8fc6c9c007cd03d5d05bca388dd Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Wed, 8 Apr 2015 12:29:20 +0300 Subject: block: add 'node-name' field to BLOCK_IMAGE_CORRUPTED Since this event can occur in nodes that cannot have a device name associated, include also a field with the node name. Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Max Reitz Message-id: 147cec5b3594f4bec0cb41c98afe5fcbfb67567c.1428485266.git.berto@igalia.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/qcow2.c | 8 ++++++-- docs/qmp/qmp-events.txt | 21 +++++++++++++-------- qapi/block-core.json | 17 +++++++++++------ 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 6084eae..b9a72e3 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2824,6 +2824,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int64_t size, const char *message_format, ...) { BDRVQcowState *s = bs->opaque; + const char *node_name; char *message; va_list ap; @@ -2847,8 +2848,11 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, "corruption events will be suppressed\n", message); } - qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message, - offset >= 0, offset, size >= 0, size, + node_name = bdrv_get_node_name(bs); + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), + *node_name != '\0', node_name, + message, offset >= 0, offset, + size >= 0, size, fatal, &error_abort); g_free(message); diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt index d759d19..b19e490 100644 --- a/docs/qmp/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -31,21 +31,26 @@ Example: BLOCK_IMAGE_CORRUPTED --------------------- -Emitted when a disk image is being marked corrupt. +Emitted when a disk image is being marked corrupt. The image can be +identified by its device or node name. The 'device' field is always +present for compatibility reasons, but it can be empty ("") if the +image does not have a device name associated. 
Data: -- "device": Device name (json-string) -- "msg": Informative message (e.g., reason for the corruption) (json-string) -- "offset": If the corruption resulted from an image access, this is the access - offset into the image (json-int) -- "size": If the corruption resulted from an image access, this is the access - size (json-int) +- "device": Device name (json-string) +- "node-name": Node name (json-string, optional) +- "msg": Informative message (e.g., reason for the corruption) + (json-string) +- "offset": If the corruption resulted from an image access, this + is the access offset into the image (json-int) +- "size": If the corruption resulted from an image access, this + is the access size (json-int) Example: { "event": "BLOCK_IMAGE_CORRUPTED", - "data": { "device": "ide0-hd0", + "data": { "device": "ide0-hd0", "node-name": "node0", "msg": "Prevented active L1 table overwrite", "offset": 196608, "size": 65536 }, "timestamp": { "seconds": 1378126126, "microseconds": 966463 } } diff --git a/qapi/block-core.json b/qapi/block-core.json index e158a7c..82a8ae5 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1757,7 +1757,11 @@ # # Emitted when a corruption has been detected in a disk image # -# @device: device name +# @device: device name. This is always present for compatibility +# reasons, but it can be empty ("") if the image does not +# have a device name associated. +# +# @node-name: #optional node name (Since: 2.4) # # @msg: informative message for human consumption, such as the kind of # corruption being detected. It should not be parsed by machine as it is @@ -1776,11 +1780,12 @@ # Since: 1.7 ## { 'event': 'BLOCK_IMAGE_CORRUPTED', - 'data': { 'device' : 'str', - 'msg' : 'str', - '*offset': 'int', - '*size' : 'int', - 'fatal' : 'bool' } } + 'data': { 'device' : 'str', + '*node-name' : 'str', + 'msg' : 'str', + '*offset' : 'int', + '*size' : 'int', + 'fatal' : 'bool' } } ## # @BLOCK_IO_ERROR -- cgit v1.1 From 9419874f709469de16c1bced7731bfecb07fe1cf Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 22 Apr 2015 11:15:10 +0100 Subject: Revert "hmp: fix crash in 'info block -n -v'" This reverts commit 638b8366200130cc7cf7a026630bc6bfb63b0c4c. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- hmp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hmp.c b/hmp.c index f142d36..f31ae27 100644 --- a/hmp.c +++ b/hmp.c @@ -391,8 +391,7 @@ static void print_block_info(Monitor *mon, BlockInfo *info, inserted->iops_size); } - /* TODO: inserted->image should never be null */ - if (verbose && inserted->image) { + if (verbose) { monitor_printf(mon, "\nImages:\n"); image_info = inserted->image; while (1) { -- cgit v1.1 From d5a8ee60a0fbc20a2c2d02f3bda1bb1bd365f1ee Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Fri, 17 Apr 2015 14:52:43 +0300 Subject: qmp: fill in the image field in BlockDeviceInfo The image field in BlockDeviceInfo is supposed to contain an ImageInfo object. However that is being filled in by bdrv_query_info(), not by bdrv_block_device_info(), which is where BlockDeviceInfo is actually created. Anyone calling bdrv_block_device_info() directly will get a null image field. As a consequence of this, the HMP command 'info block -n -v' crashes QEMU. This patch moves the code that fills in that field from bdrv_query_info() to bdrv_block_device_info(). 
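Illustrative only: with the field now filled in by bdrv_block_device_info() itself, a direct caller can walk the image chain without the previous NULL check (a sketch using the QAPI types named in this patch):

    Error *err = NULL;
    BlockDeviceInfo *info = bdrv_block_device_info(bs, &err);
    if (info) {
        ImageInfo *img = info->image;    /* no longer NULL on success */
        while (img->has_backing_image) {
            img = img->backing_image;    /* walk the backing chain */
        }
    }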
Signed-off-by: Alberto Garcia Message-id: 1429271563-3765-1-git-send-email-berto@igalia.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 9 +++++++-- block/qapi.c | 46 +++++++++++++++++++++++++--------------------- blockdev.c | 2 +- include/block/block.h | 2 +- include/block/qapi.h | 2 +- 5 files changed, 35 insertions(+), 26 deletions(-) diff --git a/block.c b/block.c index 002972a..101b50c 100644 --- a/block.c +++ b/block.c @@ -3897,15 +3897,20 @@ BlockDriverState *bdrv_find_node(const char *node_name) } /* Put this QMP function here so it can access the static graph_bdrv_states. */ -BlockDeviceInfoList *bdrv_named_nodes_list(void) +BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp) { BlockDeviceInfoList *list, *entry; BlockDriverState *bs; list = NULL; QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + BlockDeviceInfo *info = bdrv_block_device_info(bs, errp); + if (!info) { + qapi_free_BlockDeviceInfoList(list); + return NULL; + } entry = g_malloc0(sizeof(*entry)); - entry->value = bdrv_block_device_info(bs); + entry->value = info; entry->next = list; list = entry; } diff --git a/block/qapi.c b/block/qapi.c index 8a19aed..063dd1b 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -31,8 +31,10 @@ #include "qapi/qmp/types.h" #include "sysemu/block-backend.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) { + ImageInfo **p_image_info; + BlockDriverState *bs0; BlockDeviceInfo *info = g_malloc0(sizeof(*info)); info->file = g_strdup(bs->filename); @@ -92,6 +94,25 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) info->write_threshold = bdrv_write_threshold_get(bs); + bs0 = bs; + p_image_info = &info->image; + while (1) { + Error *local_err = NULL; + bdrv_query_image_info(bs0, p_image_info, &local_err); + if (local_err) { + error_propagate(errp, local_err); + qapi_free_BlockDeviceInfo(info); + return NULL; + } + if (bs0->drv && bs0->backing_hd) { + bs0 = bs0->backing_hd; + (*p_image_info)->has_backing_image = true; + p_image_info = &((*p_image_info)->backing_image); + } else { + break; + } + } + return info; } @@ -264,9 +285,6 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, { BlockInfo *info = g_malloc0(sizeof(*info)); BlockDriverState *bs = blk_bs(blk); - BlockDriverState *bs0; - ImageInfo **p_image_info; - Error *local_err = NULL; info->device = g_strdup(blk_name(blk)); info->type = g_strdup("unknown"); info->locked = blk_dev_is_medium_locked(blk); @@ -289,23 +307,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, if (bs->drv) { info->has_inserted = true; - info->inserted = bdrv_block_device_info(bs); - - bs0 = bs; - p_image_info = &info->inserted->image; - while (1) { - bdrv_query_image_info(bs0, p_image_info, &local_err); - if (local_err) { - error_propagate(errp, local_err); - goto err; - } - if (bs0->drv && bs0->backing_hd) { - bs0 = bs0->backing_hd; - (*p_image_info)->has_backing_image = true; - p_image_info = &((*p_image_info)->backing_image); - } else { - break; - } + info->inserted = bdrv_block_device_info(bs, errp); + if (info->inserted == NULL) { + goto err; } } diff --git a/blockdev.c b/blockdev.c index e67c701..a136b2e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2392,7 +2392,7 @@ out: BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) { - return bdrv_named_nodes_list(); + return bdrv_named_nodes_list(errp); } void qmp_blockdev_backup(const char *device, const char *target, diff --git 
a/include/block/block.h b/include/block/block.h index b285e0d..d79714a 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -382,7 +382,7 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked); void bdrv_eject(BlockDriverState *bs, bool eject_flag); const char *bdrv_get_format_name(BlockDriverState *bs); BlockDriverState *bdrv_find_node(const char *node_name); -BlockDeviceInfoList *bdrv_named_nodes_list(void); +BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp); BlockDriverState *bdrv_lookup_bs(const char *device, const char *node_name, Error **errp); diff --git a/include/block/qapi.h b/include/block/qapi.h index 168d788..327549d 100644 --- a/include/block/qapi.h +++ b/include/block/qapi.h @@ -29,7 +29,7 @@ #include "block/block.h" #include "block/snapshot.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs); +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp); int bdrv_query_snapshot_info_list(BlockDriverState *bs, SnapshotInfoList **p_list, Error **errp); -- cgit v1.1 From 20474e9aa040b9a255c63127f1eb873c29c54f68 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:25 +0200 Subject: block/iscsi: do not forget to logout from target We actually were always impolitely dropping the connection and not cleanly logging out. CC: qemu-stable@nongnu.org Signed-off-by: Peter Lieven Message-id: 1429193313-4263-2-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index ba33290..be8af46 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1501,6 +1501,9 @@ out: if (ret) { if (iscsi != NULL) { + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } iscsi_destroy_context(iscsi); } memset(iscsilun, 0, sizeof(IscsiLun)); @@ -1514,6 +1517,9 @@ static void iscsi_close(BlockDriverState *bs) struct iscsi_context *iscsi = iscsilun->iscsi; iscsi_detach_aio_context(bs); + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } iscsi_destroy_context(iscsi); g_free(iscsilun->zeroblock); g_free(iscsilun->allocationmap); -- cgit v1.1 From 0a386e48527d16e5dedbc1ff62aa0042a1cbdac5 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:26 +0200 Subject: block/iscsi: change all iscsilun properties from uint8_t to bool Signed-off-by: Peter Lieven Message-id: 1429193313-4263-3-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index be8af46..6cf7e99 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -57,9 +57,6 @@ typedef struct IscsiLun { int events; QEMUTimer *nop_timer; QEMUTimer *event_timer; - uint8_t lbpme; - uint8_t lbprz; - uint8_t has_write_same; struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; @@ -67,6 +64,9 @@ typedef struct IscsiLun { int cluster_sectors; bool use_16_for_rw; bool write_protected; + bool lbpme; + bool lbprz; + bool has_write_same; } IscsiLun; typedef struct IscsiTask { @@ -460,7 +460,7 @@ static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, *pnum = nb_sectors; /* LUN does not support logical block provisioning */ - if (iscsilun->lbpme == 0) { + if (!iscsilun->lbpme) { goto out; } @@ -1121,8 +1121,8 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) } else { iscsilun->block_size = 
rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; - iscsilun->lbpme = rc16->lbpme; - iscsilun->lbprz = rc16->lbprz; + iscsilun->lbpme = !!rc16->lbpme; + iscsilun->lbprz = !!rc16->lbprz; iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); } } @@ -1655,7 +1655,7 @@ out: static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { IscsiLun *iscsilun = bs->opaque; - bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->unallocated_blocks_are_zero = iscsilun->lbprz; bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; return 0; -- cgit v1.1 From 7191f2080c70228c6483b6604cc1c18943d8d766 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:27 +0200 Subject: block/iscsi: rename iscsi_write_protected and let it return void Signed-off-by: Peter Lieven Message-id: 1429193313-4263-4-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 6cf7e99..221c9fc 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1253,11 +1253,11 @@ static void iscsi_attach_aio_context(BlockDriverState *bs, iscsi_timed_set_events, iscsilun); } -static bool iscsi_is_write_protected(IscsiLun *iscsilun) +static void iscsi_modesense_sync(IscsiLun *iscsilun) { struct scsi_task *task; struct scsi_mode_sense *ms = NULL; - bool wrprotected = false; + iscsilun->write_protected = false; task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, 1, SCSI_MODESENSE_PC_CURRENT, @@ -1278,13 +1278,12 @@ static bool iscsi_is_write_protected(IscsiLun *iscsilun) iscsi_get_error(iscsilun->iscsi)); goto out; } - wrprotected = ms->device_specific_parameter & 0x80; + iscsilun->write_protected = ms->device_specific_parameter & 0x80; out: if (task) { scsi_free_scsi_task(task); } - return wrprotected; } /* @@ -1403,7 +1402,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, scsi_free_scsi_task(task); task = NULL; - iscsilun->write_protected = iscsi_is_write_protected(iscsilun); + iscsi_modesense_sync(iscsilun); + /* Check the write protect flag of the LUN if we want to write */ if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && iscsilun->write_protected) { -- cgit v1.1 From 752ce45150d3d70aabc4eb46a7a9cdfd8b4640fd Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:28 +0200 Subject: block/iscsi: store DPOFUA bit from the modesense command Signed-off-by: Peter Lieven Message-id: 1429193313-4263-5-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index 221c9fc..237faa1 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -66,6 +66,7 @@ typedef struct IscsiLun { bool write_protected; bool lbpme; bool lbprz; + bool dpofua; bool has_write_same; } IscsiLun; @@ -1258,6 +1259,7 @@ static void iscsi_modesense_sync(IscsiLun *iscsilun) struct scsi_task *task; struct scsi_mode_sense *ms = NULL; iscsilun->write_protected = false; + iscsilun->dpofua = false; task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, 1, SCSI_MODESENSE_PC_CURRENT, @@ -1279,6 +1281,7 @@ static void iscsi_modesense_sync(IscsiLun *iscsilun) goto out; } iscsilun->write_protected = ms->device_specific_parameter & 0x80; + iscsilun->dpofua = ms->device_specific_parameter & 0x10; out: if (task) { -- 
cgit v1.1 From 73b5394e2e4af3bbe01e221fa395373facc67f78 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:29 +0200 Subject: block/iscsi: optimize WRITE10/16 if cache.writeback is not set SCSI allows the initiator to tell the target not to complete a write command until the data has been written to the disk. Use this so-called FUA bit, if it is supported, to optimize WRITE commands when writeback is not allowed. In that case qemu otherwise always issues a WRITE followed by a FLUSH, costing two round trips. If we set the FUA bit we can skip the following FLUSH. Signed-off-by: Peter Lieven Message-id: 1429193313-4263-6-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 237faa1..6033330 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -68,6 +68,7 @@ typedef struct IscsiLun { bool lbprz; bool dpofua; bool has_write_same; + bool force_next_flush; } IscsiLun; typedef struct IscsiTask { @@ -80,6 +81,7 @@ typedef struct IscsiTask { QEMUBH *bh; IscsiLun *iscsilun; QEMUTimer retry_timer; + bool force_next_flush; } IscsiTask; typedef struct IscsiAIOCB { @@ -200,6 +202,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, } } error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } else { + iTask->iscsilun->force_next_flush |= iTask->force_next_flush; } out: @@ -370,6 +374,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; + int fua; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; } @@ -385,15 +390,17 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: + fua = iscsilun->dpofua && !bs->enable_write_cache; + iTask.force_next_flush = !fua; if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } else { iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, + iscsilun->block_size, 0, 0, fua, 0, 0, iscsi_co_generic_cb, &iTask); } if (iTask.task == NULL) { @@ -621,8 +628,12 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) return 0; } - iscsi_co_init_iscsitask(iscsilun, &iTask); + if (!iscsilun->force_next_flush) { + return 0; + } + iscsilun->force_next_flush = false; + iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { @@ -918,6 +929,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, } iscsi_co_init_iscsitask(iscsilun, &iTask); + iTask.force_next_flush = true; retry: if (use_16_for_ws) { iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, -- cgit v1.1 From 59dd0a22ca4c3ac70c37263208b9e49cfeacf2e4 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:30 +0200 Subject: block/iscsi: increase retry count The idea is that a command in a BUSY condition is retried for up to approximately 60 seconds before it is failed. This should be far higher than any command timeout in the guest. 
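As a rough sanity check of that figure (illustration only; it assumes each retry delay averages the corresponding iscsi_retry_times entry, in milliseconds):

    /* 8 + 32 + 128 + 512 + 2048 + 8192 + 32768 = 43688 ms expected across
     * the seven retries; with the randomized per-retry delays the total
     * can approach the quoted minute. */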
Signed-off-by: Peter Lieven Message-id: 1429193313-4263-7-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/iscsi.c b/block/iscsi.c index 6033330..5999f74 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -103,7 +103,7 @@ typedef struct IscsiAIOCB { #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) -static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048}; +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; /* this threshold is a trade-off knob to choose between * the potential additional overhead of an extra GET_LBA_STATUS request -- cgit v1.1 From e380aff831c24b37c023010852e7ddd2ae1ec385 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:31 +0200 Subject: block/iscsi: handle SCSI_STATUS_TASK_SET_FULL A target may issue a SCSI_STATUS_TASK_SET_FULL status if more than one "BUSY" command is queued already. Signed-off-by: Peter Lieven Message-id: 1429193313-4263-8-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 5999f74..328907b 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -186,10 +186,13 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, iTask->do_retry = 1; goto out; } - if (status == SCSI_STATUS_BUSY) { + /* status 0x28 is SCSI_TASK_SET_FULL. It was first introduced + * in libiscsi 1.10.0. Hardcode this value here to avoid + * the need to bump the libiscsi requirement to 1.10.0 */ + if (status == SCSI_STATUS_BUSY || status == 0x28) { unsigned retry_time = exp_random(iscsi_retry_times[iTask->retries - 1]); - error_report("iSCSI Busy (retry #%u in %u ms): %s", + error_report("iSCSI Busy/TaskSetFull (retry #%u in %u ms): %s", iTask->retries, retry_time, iscsi_get_error(iscsi)); aio_timer_init(iTask->iscsilun->aio_context, -- cgit v1.1 From 03e40fef4678f9a42846c91a804b6d3c820e8b90 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:32 +0200 Subject: block/iscsi: bump year in copyright notice Signed-off-by: Peter Lieven Message-id: 1429193313-4263-9-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/iscsi.c b/block/iscsi.c index 328907b..8364f97 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg - * Copyright (c) 2012-2014 Peter Lieven + * Copyright (c) 2012-2015 Peter Lieven * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal -- cgit v1.1 From 9eac3622a2b1159ab50b10540e822f3e58fdc383 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Thu, 16 Apr 2015 16:08:33 +0200 Subject: block/iscsi: use the allocationmap also if cache.direct=on The allocationmap is only a hint. The driver always double-checks that blocks marked unallocated in the cache are still unallocated before taking the fast path and returning zeroes. So using the allocationmap is migration safe and it can also be enabled with cache.direct=on. 
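A sketch of that double-check (the helper names below are illustrative placeholders, not the driver's actual identifiers):

    /* The map is a pure hint: "unallocated" is re-verified on the wire
     * before the zero fast path, so a stale map can cost an extra request
     * but can never return wrong data. */
    if (!allocmap_marks_allocated(iscsilun, sector_num, nb_sectors) &&
        lun_confirms_unallocated(iscsilun, sector_num, nb_sectors)) {
        return read_as_zeroes(qiov);   /* fast path, verified */
    }
    /* otherwise fall through to a normal READ */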
Signed-off-by: Peter Lieven Message-id: 1429193313-4263-10-git-send-email-pl@kamp.de Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/iscsi.c b/block/iscsi.c index 8364f97..8fca1d3 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1499,7 +1499,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; - if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) { + if (iscsilun->lbprz) { iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); if (iscsilun->allocationmap == NULL) { ret = -ENOMEM; -- cgit v1.1 From efcfa278dca27f1c9db8b8283eac54f5e19074e7 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:49 -0400 Subject: docs: incremental backup documentation Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Eric Blake Message-id: 1429314609-29776-2-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- docs/bitmaps.md | 352 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 352 insertions(+) create mode 100644 docs/bitmaps.md diff --git a/docs/bitmaps.md b/docs/bitmaps.md new file mode 100644 index 0000000..f066b48 --- /dev/null +++ b/docs/bitmaps.md @@ -0,0 +1,352 @@ + + +# Dirty Bitmaps and Incremental Backup + +* Dirty Bitmaps are objects that track which data needs to be backed up for the + next incremental backup. + +* Dirty bitmaps can be created at any time and attached to any node + (not just complete drives.) + +## Dirty Bitmap Names + +* A dirty bitmap's name is unique to the node, but bitmaps attached to different + nodes can share the same name. + +## Bitmap Modes + +* A Bitmap can be "frozen," which means that it is currently in-use by a backup + operation and cannot be deleted, renamed, written to, reset, + etc. + +## Basic QMP Usage + +### Supported Commands ### + +* block-dirty-bitmap-add +* block-dirty-bitmap-remove +* block-dirty-bitmap-clear + +### Creation + +* To create a new bitmap, enabled, on the drive with id=drive0: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +* This bitmap will have a default granularity that matches the cluster size of + its associated drive, if available, clamped to between [4KiB, 64KiB]. + The current default for qcow2 is 64KiB. + +* To create a new bitmap that tracks changes in 32KiB segments: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0", + "granularity": 32768 + } +} +``` + +### Deletion + +* Bitmaps that are frozen cannot be deleted. + +* Deleting the bitmap does not impact any other bitmaps attached to the same + node, nor does it affect any backups already created from this node. + +* Because bitmaps are only unique to the node to which they are attached, + you must specify the node/drive name here, too. + +```json +{ "execute": "block-dirty-bitmap-remove", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +### Resetting + +* Resetting a bitmap will clear all information it holds. + +* An incremental backup created from an empty bitmap will copy no data, + as if nothing has changed. 
+ +```json +{ "execute": "block-dirty-bitmap-clear", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +## Transactions (Not yet implemented) + +* Transactional commands are forthcoming in a future version, + and are not yet available for use. This section serves as + documentation of intent for their design and usage. + +### Justification + +Bitmaps can be safely modified when the VM is paused or halted by using +the basic QMP commands. For instance, you might perform the following actions: + +1. Boot the VM in a paused state. +2. Create a full drive backup of drive0. +3. Create a new bitmap attached to drive0. +4. Resume execution of the VM. +5. Incremental backups are ready to be created. + +At this point, the bitmap and drive backup would be correctly in sync, +and incremental backups made from this point forward would be correctly aligned +to the full drive backup. + +This is not particularly useful if we decide we want to start incremental +backups after the VM has been running for a while, for which we will need to +perform actions such as the following: + +1. Boot the VM and begin execution. +2. Using a single transaction, perform the following operations: + * Create bitmap0. + * Create a full drive backup of drive0. +3. Incremental backups are now ready to be created. + +### Supported Bitmap Transactions + +* block-dirty-bitmap-add +* block-dirty-bitmap-clear + +The usages are identical to their respective QMP commands, but see below +for examples. + +### Example: New Incremental Backup + +As outlined in the justification, perhaps we want to create a new incremental +backup chain attached to a drive. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-add", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +### Example: New Incremental Backup Anchor Point + +Maybe we just want to create a new full backup with an existing bitmap and +want to reset the bitmap to track the new chain. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-clear", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +## Incremental Backups + +The star of the show. + +**Nota Bene!** Only incremental backups of entire drives are supported for now. +So despite the fact that you can attach a bitmap to any arbitrary node, they are +only currently useful when attached to the root node. This is because +drive-backup only supports drives/devices instead of arbitrary nodes. + +### Example: First Incremental Backup + +1. Create a full backup and sync it to the dirty bitmap, as in the transactional +examples above; or with the VM offline, manually create a full copy and then +create a new bitmap before the VM begins execution. + + * Let's assume the full backup is named 'full_backup.img'. + * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. + +2. Create a destination image for the incremental backup that utilizes the +full backup as a backing image. + + * Let's assume it is named 'incremental.0.img'. + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +3. 
Issue the incremental backup command: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +### Example: Second Incremental Backup + +1. Create a new destination image for the incremental backup that points to the + previous one, e.g.: 'incremental.1.img' + + ```sh + # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 + ``` + +2. Issue a new incremental backup command. The only difference here is that we + have changed the target image below. + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.1.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +## Errors + +* In the event of an error that occurs after a backup job is successfully + launched, either by a direct QMP command or a QMP transaction, the user + will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied + by a BLOCK_JOB_ERROR event. + +* In the case of the job being cancelled, the user will receive a + BLOCK_JOB_CANCELLED event instead of a pair of COMPLETED and ERROR events. + +* In either case, the incremental backup data contained within the bitmap is + safely rolled back, and the data within the bitmap is not lost. The image + file created for the failed attempt can be safely deleted. + +* Once the underlying problem is fixed (e.g. more storage space is freed up), + you can simply retry the incremental backup command with the same bitmap. + +### Example + +1. Create a target image: + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +2. Attempt to create an incremental backup via QMP: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +3. Receive an event notifying us of failure: + + ```json + { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "No space left on device", + "device": "drive0", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +4. Delete the failed incremental image, and re-create it: + + ```sh + # rm incremental.0.img + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +5. Retry the command after fixing the underlying problem, + such as freeing up space on the backup volume: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "dirty-bitmap", + "mode": "existing" + } + } + ``` + +6. Receive confirmation that the job completed successfully: + + ```json + { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, + "data": { "device": "drive0", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864}, + "event": "BLOCK_JOB_COMPLETED" } + ``` + + -- cgit v1.1 From 0db6e54a8a2c6e16780356422da671b71f862341 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Fri, 17 Apr 2015 19:49:50 -0400 Subject: qapi: Add optional field "name" to block dirty bitmap This field will be set for user-created dirty bitmaps. Also pass in an error pointer to bdrv_create_dirty_bitmap, so when a name is already taken on this BDS, it can report an error message.
This is not a global check; two BDSes can have dirty bitmaps with a common name. Implement bdrv_find_dirty_bitmap to find a dirty bitmap by name; it will be used later when other QMP commands want to reference a dirty bitmap by name. Add bdrv_dirty_bitmap_make_anon, which unsets the name of a dirty bitmap. Signed-off-by: Fam Zheng Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-3-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 32 +++++++++++++++++++++++++++++++- block/mirror.c | 2 +- include/block/block.h | 7 ++++++- migration/block.c | 2 +- qapi/block-core.json | 4 +++- 5 files changed, 42 insertions(+), 5 deletions(-) diff --git a/block.c b/block.c index 101b50c..d25ab51 100644 --- a/block.c +++ b/block.c @@ -54,6 +54,7 @@ struct BdrvDirtyBitmap { HBitmap *bitmap; + char *name; QLIST_ENTRY(BdrvDirtyBitmap) list; }; @@ -5501,7 +5502,28 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) +{ + BdrvDirtyBitmap *bm; + + assert(name); + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->name && !strcmp(name, bm->name)) { + return bm; + } + } + return NULL; +} + +void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +{ + g_free(bitmap->name); + bitmap->name = NULL; +} + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + int granularity, + const char *name, Error **errp) { int64_t bitmap_size; @@ -5509,6 +5531,10 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, assert((granularity & (granularity - 1)) == 0); + if (name && bdrv_find_dirty_bitmap(bs, name)) { + error_setg(errp, "Bitmap already exists: %s", name); + return NULL; + } granularity >>= BDRV_SECTOR_BITS; assert(granularity); bitmap_size = bdrv_nb_sectors(bs); @@ -5519,6 +5545,7 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, } bitmap = g_new0(BdrvDirtyBitmap, 1); bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(granularity)); + bitmap->name = g_strdup(name); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); return bitmap; } @@ -5530,6 +5557,7 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) if (bm == bitmap) { QLIST_REMOVE(bitmap, list); hbitmap_free(bitmap->bitmap); + g_free(bitmap->name); g_free(bitmap); return; } @@ -5548,6 +5576,8 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) info->count = bdrv_get_dirty_count(bs, bm); info->granularity = ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); + info->has_name = !!bm->name; + info->name = g_strdup(bm->name); entry->value = info; *plist = entry; plist = &entry->next; diff --git a/block/mirror.c b/block/mirror.c index d421fce..f3bd24a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -703,7 +703,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { return; } diff --git a/include/block/block.h b/include/block/block.h index d79714a..feae17c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -450,8 +450,13 
@@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); struct HBitmapIter; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + int granularity, + const char *name, Error **errp); +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, + const char *name); +void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); diff --git a/migration/block.c b/migration/block.c index 085c0fa..02a7d26 100644 --- a/migration/block.c +++ b/migration/block.c @@ -320,7 +320,7 @@ static int set_dirty_tracking(void) QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, - NULL); + NULL, NULL); if (!bmds->dirty_bitmap) { ret = -errno; goto fail; diff --git a/qapi/block-core.json b/qapi/block-core.json index 82a8ae5..915cdc0 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -330,6 +330,8 @@ # # Block dirty bitmap information. # +# @name: #optional the name of the dirty bitmap (Since 2.4) +# # @count: number of dirty bytes according to the dirty bitmap # # @granularity: granularity of the dirty bitmap in bytes (since 1.4) @@ -337,7 +339,7 @@ # Since: 1.3 ## { 'type': 'BlockDirtyInfo', - 'data': {'count': 'int', 'granularity': 'int'} } + 'data': {'*name': 'str', 'count': 'int', 'granularity': 'int'} } ## # @BlockInfo: -- cgit v1.1 From 5fba6c0e50b66691568b34d5a2f4be0b39f5e20a Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:51 -0400 Subject: qmp: Ensure consistent granularity type We treat this field with a variety of different types everywhere in the code. Now it's just uint32_t. 
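As a purely illustrative sketch (not part of this patch; the helper name is hypothetical), the invariant that makes uint32_t safe here is that granularity is asserted to be a power of two, and user input is validated later in this series to be at least 512 bytes:

    static uint32_t granularity_to_sectors(uint32_t granularity)
    {
        assert(granularity >= 512);
        assert((granularity & (granularity - 1)) == 0); /* power of two */
        return granularity >> BDRV_SECTOR_BITS; /* sector count is nonzero */
    }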
Signed-off-by: John Snow Reviewed-by: Eric Blake Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-4-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 11 ++++++----- block/mirror.c | 4 ++-- include/block/block.h | 2 +- include/block/block_int.h | 2 +- qapi/block-core.json | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/block.c b/block.c index d25ab51..424e00d 100644 --- a/block.c +++ b/block.c @@ -5522,12 +5522,13 @@ void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) } BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, - int granularity, + uint32_t granularity, const char *name, Error **errp) { int64_t bitmap_size; BdrvDirtyBitmap *bitmap; + uint32_t sector_granularity; assert((granularity & (granularity - 1)) == 0); @@ -5535,8 +5536,8 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, error_setg(errp, "Bitmap already exists: %s", name); return NULL; } - granularity >>= BDRV_SECTOR_BITS; - assert(granularity); + sector_granularity = granularity >> BDRV_SECTOR_BITS; + assert(sector_granularity); bitmap_size = bdrv_nb_sectors(bs); if (bitmap_size < 0) { error_setg_errno(errp, -bitmap_size, "could not get length of device"); @@ -5544,7 +5545,7 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, return NULL; } bitmap = g_new0(BdrvDirtyBitmap, 1); - bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(granularity)); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); bitmap->name = g_strdup(name); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); return bitmap; @@ -5575,7 +5576,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); info->count = bdrv_get_dirty_count(bs, bm); info->granularity = - ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); + ((uint32_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); info->has_name = !!bm->name; info->name = g_strdup(bm->name); entry->value = info; diff --git a/block/mirror.c b/block/mirror.c index f3bd24a..83e330e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -656,7 +656,7 @@ static const BlockJobDriver commit_active_job_driver = { static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, + int64_t speed, uint32_t granularity, int64_t buf_size, BlockdevOnError on_source_error, BlockdevOnError on_target_error, @@ -717,7 +717,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, diff --git a/include/block/block.h b/include/block/block.h index feae17c..77cdf91 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -451,7 +451,7 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); struct HBitmapIter; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, - int granularity, + uint32_t granularity, const char *name, Error **errp); BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, diff --git a/include/block/block_int.h b/include/block/block_int.h index 
dccb092..fb9e100 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -590,7 +590,7 @@ void commit_active_start(BlockDriverState *bs, BlockDriverState *base, */ void mirror_start(BlockDriverState *bs, BlockDriverState *target, const char *replaces, - int64_t speed, int64_t granularity, int64_t buf_size, + int64_t speed, uint32_t granularity, int64_t buf_size, MirrorSyncMode mode, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, diff --git a/qapi/block-core.json b/qapi/block-core.json index 915cdc0..7dc6441 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -339,7 +339,7 @@ # Since: 1.3 ## { 'type': 'BlockDirtyInfo', - 'data': {'*name': 'str', 'count': 'int', 'granularity': 'int'} } + 'data': {'*name': 'str', 'count': 'int', 'granularity': 'uint32'} } ## # @BlockInfo: -- cgit v1.1 From 341ebc2f81b14862347e4d4c1fcb3759f815237a Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:52 -0400 Subject: qmp: Add block-dirty-bitmap-add and block-dirty-bitmap-remove The new command pair is added to manage a user created dirty bitmap. The dirty bitmap's name is mandatory and must be unique for the same device, but different devices can have bitmaps with the same names. The granularity is an optional field. If it is not specified, we will choose a default granularity based on the cluster size if available, clamped to between 4K and 64K to mirror how the 'mirror' code was already choosing granularity. If we do not have cluster size info available, we choose 64K. This code has been factored out into a helper shared with block/mirror. This patch also introduces the 'block_dirty_bitmap_lookup' helper, which takes a device name and a dirty bitmap name and validates the lookup, returning NULL and setting errp if there is a problem with either field. This helper will be re-used in future patches in this series. The types added to block-core.json will be re-used in future patches in this series, see: 'qapi: Add transaction support to block-dirty-bitmap-{add, enable, disable}' Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Eric Blake Message-id: 1429314609-29776-5-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 20 +++++++++ block/mirror.c | 10 +---- blockdev.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/block/block.h | 1 + qapi/block-core.json | 55 ++++++++++++++++++++++++ qmp-commands.hx | 56 ++++++++++++++++++++++++ 6 files changed, 250 insertions(+), 9 deletions(-) diff --git a/block.c b/block.c index 424e00d..b268174 100644 --- a/block.c +++ b/block.c @@ -5596,6 +5596,26 @@ int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector } } +/** + * Chooses a default granularity based on the existing cluster size, + * but clamped between [4K, 64K]. Defaults to 64K in the case that there + * is no cluster size information available. 
+ */ +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + uint32_t granularity; + + if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + + return granularity; +} + void bdrv_dirty_iter_init(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) { diff --git a/block/mirror.c b/block/mirror.c index 83e330e..29c4545 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -668,15 +668,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, MirrorBlockJob *s; if (granularity == 0) { - /* Choose the default granularity based on the target file's cluster - * size, clamped between 4k and 64k. */ - BlockDriverInfo bdi; - if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } + granularity = bdrv_get_default_bitmap_granularity(target); } assert ((granularity & (granularity - 1)) == 0); diff --git a/blockdev.c b/blockdev.c index a136b2e..5dde1e8 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1164,6 +1164,68 @@ out_aio_context: return NULL; } +/** + * block_dirty_bitmap_lookup: + * Return a dirty bitmap (if present), after validating + * the node reference and bitmap names. + * + * @node: The name of the BDS node to search for bitmaps + * @name: The name of the bitmap to search for + * @pbs: Output pointer for BDS lookup, if desired. Can be NULL. + * @paio: Output pointer for aio_context acquisition, if desired. Can be NULL. + * @errp: Output pointer for error information. Can be NULL. + * + * @return: A bitmap object on success, or NULL on failure. 
+ */ +static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node, + const char *name, + BlockDriverState **pbs, + AioContext **paio, + Error **errp) +{ + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; + AioContext *aio_context; + + if (!node) { + error_setg(errp, "Node cannot be NULL"); + return NULL; + } + if (!name) { + error_setg(errp, "Bitmap name cannot be NULL"); + return NULL; + } + bs = bdrv_lookup_bs(node, node, NULL); + if (!bs) { + error_setg(errp, "Node '%s' not found", node); + return NULL; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + bitmap = bdrv_find_dirty_bitmap(bs, name); + if (!bitmap) { + error_setg(errp, "Dirty bitmap '%s' not found", name); + goto fail; + } + + if (pbs) { + *pbs = bs; + } + if (paio) { + *paio = aio_context; + } else { + aio_context_release(aio_context); + } + + return bitmap; + + fail: + aio_context_release(aio_context); + return NULL; +} + /* New and old BlockDriverState structs for atomic group operations */ typedef struct BlkTransactionState BlkTransactionState; @@ -1954,6 +2016,61 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, aio_context_release(aio_context); } +void qmp_block_dirty_bitmap_add(const char *node, const char *name, + bool has_granularity, uint32_t granularity, + Error **errp) +{ + AioContext *aio_context; + BlockDriverState *bs; + + if (!name || name[0] == '\0') { + error_setg(errp, "Bitmap name cannot be empty"); + return; + } + + bs = bdrv_lookup_bs(node, node, errp); + if (!bs) { + return; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + if (has_granularity) { + if (granularity < 512 || !is_power_of_2(granularity)) { + error_setg(errp, "Granularity must be power of 2 " + "and at least 512"); + goto out; + } + } else { + /* Default to cluster size, if available: */ + granularity = bdrv_get_default_bitmap_granularity(bs); + } + + bdrv_create_dirty_bitmap(bs, granularity, name, errp); + + out: + aio_context_release(aio_context); +} + +void qmp_block_dirty_bitmap_remove(const char *node, const char *name, + Error **errp) +{ + AioContext *aio_context; + BlockDriverState *bs; + BdrvDirtyBitmap *bitmap; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + if (!bitmap || !bs) { + return; + } + + bdrv_dirty_bitmap_make_anon(bs, bitmap); + bdrv_release_dirty_bitmap(bs, bitmap); + + aio_context_release(aio_context); +} + int hmp_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) { const char *id = qdict_get_str(qdict, "id"); diff --git a/include/block/block.h b/include/block/block.h index 77cdf91..283117f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -459,6 +459,7 @@ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); diff --git a/qapi/block-core.json b/qapi/block-core.json index 7dc6441..6237f9d 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -960,6 +960,61 @@ '*on-target-error': 'BlockdevOnError' } } ## +# @BlockDirtyBitmap +# +# @node: name of 
device/node which the bitmap is tracking +# +# @name: name of the dirty bitmap +# +# Since 2.4 +## +{ 'type': 'BlockDirtyBitmap', + 'data': { 'node': 'str', 'name': 'str' } } + +## +# @BlockDirtyBitmapAdd +# +# @node: name of device/node which the bitmap is tracking +# +# @name: name of the dirty bitmap +# +# @granularity: #optional the bitmap granularity, default is 64k for +# block-dirty-bitmap-add +# +# Since 2.4 +## +{ 'type': 'BlockDirtyBitmapAdd', + 'data': { 'node': 'str', 'name': 'str', '*granularity': 'uint32' } } + +## +# @block-dirty-bitmap-add +# +# Create a dirty bitmap with a name on the node +# +# Returns: nothing on success +# If @node is not a valid block device or node, DeviceNotFound +# If @name is already taken, GenericError with an explanation +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-add', + 'data': 'BlockDirtyBitmapAdd' } + +## +# @block-dirty-bitmap-remove +# +# Remove a dirty bitmap on the node +# +# Returns: nothing on success +# If @node is not a valid block device or node, DeviceNotFound +# If @name is not found, GenericError with an explanation +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-remove', + 'data': 'BlockDirtyBitmap' } + +## # @block_set_io_throttle: # # Change I/O throttle limits for a block drive. diff --git a/qmp-commands.hx b/qmp-commands.hx index e6ae026..1aeab50 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1306,6 +1306,62 @@ Example: EQMP { + .name = "block-dirty-bitmap-add", + .args_type = "node:B,name:s,granularity:i?", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_add, + }, + +SQMP + +block-dirty-bitmap-add +---------------------- +Since 2.4 + +Create a dirty bitmap with a name on the device, and start tracking the writes. + +Arguments: + +- "node": device/node on which to create dirty bitmap (json-string) +- "name": name of the new dirty bitmap (json-string) +- "granularity": granularity to track writes with (int, optional) + +Example: + +-> { "execute": "block-dirty-bitmap-add", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { + .name = "block-dirty-bitmap-remove", + .args_type = "node:B,name:s", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_remove, + }, + +SQMP + +block-dirty-bitmap-remove +------------------------- +Since 2.4 + +Stop write tracking and remove the dirty bitmap that was created with +block-dirty-bitmap-add. + +Arguments: + +- "node": device/node on which to remove dirty bitmap (json-string) +- "name": name of the dirty bitmap to remove (json-string) + +Example: + +-> { "execute": "block-dirty-bitmap-remove", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { .name = "blockdev-snapshot-sync", .args_type = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?", .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync, -- cgit v1.1 From 592fdd02ae987a439a2ba25a2a973673f1484805 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:53 -0400 Subject: block: Introduce bdrv_dirty_bitmap_granularity() This returns the granularity (in bytes) of dirty bitmap, which matches the QMP interface and the existing query interface. 
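As an illustrative aside (assuming BDRV_SECTOR_SIZE == 512, i.e. BDRV_SECTOR_BITS == 9), the byte value is just the underlying hbitmap shift re-expressed, so either form can be recovered from the other:

    uint32_t g = bdrv_dirty_bitmap_granularity(bitmap); /* in bytes */
    int shift = ctz32(g) - BDRV_SECTOR_BITS; /* == hbitmap_granularity() */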
Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-6-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 8 ++++++-- include/block/block.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index b268174..b8df11d 100644 --- a/block.c +++ b/block.c @@ -5575,8 +5575,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); info->count = bdrv_get_dirty_count(bs, bm); - info->granularity = - ((uint32_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); + info->granularity = bdrv_dirty_bitmap_granularity(bm); info->has_name = !!bm->name; info->name = g_strdup(bm->name); entry->value = info; @@ -5616,6 +5615,11 @@ uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) return granularity; } +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) +{ + return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); +} + void bdrv_dirty_iter_init(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) { diff --git a/include/block/block.h b/include/block/block.h index 283117f..0c0dd6d 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -460,6 +460,7 @@ void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); -- cgit v1.1 From 8515efbef1759b9143f06e9722c8f4e145032181 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:54 -0400 Subject: hbitmap: cache array lengths As a convenience: between incremental backups, bitmap migrations and bitmap persistence we seem to need to recalculate these a lot. Because the lengths are a little bit-twiddly, let's just solidly cache them and be done with it. Reviewed-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: John Snow Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-7-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- util/hbitmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/util/hbitmap.c b/util/hbitmap.c index ab13971..5b78613 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -90,6 +90,9 @@ struct HBitmap { * bitmap will still allocate HBITMAP_LEVELS arrays. */ unsigned long *levels[HBITMAP_LEVELS]; + + /* The length of each levels[] array. */ + uint64_t sizes[HBITMAP_LEVELS]; }; /* Advance hbi to the next nonzero word and return it. 
hbi->pos @@ -384,6 +387,7 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity) hb->granularity = granularity; for (i = HBITMAP_LEVELS; i-- > 0; ) { size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + hb->sizes[i] = size; hb->levels[i] = g_new0(unsigned long, size); } -- cgit v1.1 From be58721dbf882fa8830f3669f499b0a5b501e90f Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:55 -0400 Subject: hbitmap: add hbitmap_merge We add a bitmap merge operation to assist in error cases where we wish to combine two bitmaps together. This is algorithmically O(bits) provided HBITMAP_LEVELS remains constant. For a full bitmap on a 64bit machine: sum(bits/64^k, k, 0, HBITMAP_LEVELS) ~= 1.01587 * bits We may be able to improve running speed for particularly sparse bitmaps by using iterators, but the running time for dense maps will be worse. We present the simpler solution first, and we can refine it later if needed. Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-8-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- include/qemu/hbitmap.h | 13 +++++++++++++ util/hbitmap.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index 550d7ce..6cb2d0e 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -65,6 +65,19 @@ struct HBitmapIter { HBitmap *hbitmap_alloc(uint64_t size, int granularity); /** + * hbitmap_merge: + * @a: The bitmap to store the result in. + * @b: The bitmap to merge into @a. + * @return true if the merge was successful, + * false if it was not attempted. + * + * Merge two bitmaps together. + * A := A (BITOR) B. + * B is left unmodified. + */ +bool hbitmap_merge(HBitmap *a, const HBitmap *b); + +/** * hbitmap_empty: * @hb: HBitmap to operate on. * diff --git a/util/hbitmap.c b/util/hbitmap.c index 5b78613..150d6e9 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -399,3 +399,36 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity) hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); return hb; } + +/** + * Given HBitmaps A and B, let A := A (BITOR) B. + * Bitmap B will not be modified. + * + * @return true if the merge was successful, + * false if it was not attempted. + */ +bool hbitmap_merge(HBitmap *a, const HBitmap *b) +{ + int i; + uint64_t j; + + if ((a->size != b->size) || (a->granularity != b->granularity)) { + return false; + } + + if (hbitmap_count(b) == 0) { + return true; + } + + /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant. + * It may be possible to improve running times for sparsely populated maps + * by using hbitmap_iter_next, but this is suboptimal for dense maps. + */ + for (i = HBITMAP_LEVELS - 1; i >= 0; i--) { + for (j = 0; j < a->sizes[i]; j++) { + a->levels[i][j] |= b->levels[i][j]; + } + } + + return true; +} -- cgit v1.1 From b8e6fb752e43b45b428487c244cab35f0ab94b10 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:56 -0400 Subject: block: Add bitmap disabled status Add a status indicating the enabled/disabled state of the bitmap. A bitmap is by default enabled, but you can lock the bitmap into a read-only state by setting disabled = true. A previous version of this patch added a QMP interface for changing the state of the bitmap, but it has since been removed for now until a use case emerges where this state must be revealed to the user. 
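The toggle itself is just a pair of internal setters; an illustrative use, not taken from this patch:

    bdrv_disable_dirty_bitmap(bitmap);  /* bitmap stops recording writes */
    assert(!bdrv_dirty_bitmap_enabled(bitmap));
    bdrv_enable_dirty_bitmap(bitmap);   /* bitmap records writes again */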
The disabled state WILL be used internally for bitmap migration and bitmap persistence. Signed-off-by: Fam Zheng Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-9-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 25 +++++++++++++++++++++++++ include/block/block.h | 3 +++ 2 files changed, 28 insertions(+) diff --git a/block.c b/block.c index b8df11d..0e63524 100644 --- a/block.c +++ b/block.c @@ -55,6 +55,7 @@ struct BdrvDirtyBitmap { HBitmap *bitmap; char *name; + bool disabled; QLIST_ENTRY(BdrvDirtyBitmap) list; }; @@ -5547,10 +5548,16 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, bitmap = g_new0(BdrvDirtyBitmap, 1); bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); bitmap->name = g_strdup(name); + bitmap->disabled = false; QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); return bitmap; } +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) +{ + return !bitmap->disabled; +} + void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { BdrvDirtyBitmap *bm, *next; @@ -5565,6 +5572,16 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) } } +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + bitmap->disabled = true; +} + +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + bitmap->disabled = false; +} + BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) { BdrvDirtyBitmap *bm; @@ -5629,12 +5646,14 @@ void bdrv_dirty_iter_init(BlockDriverState *bs, void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { + assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { + assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } @@ -5643,6 +5662,9 @@ static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } } @@ -5652,6 +5674,9 @@ static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } } diff --git a/include/block/block.h b/include/block/block.h index 0c0dd6d..5d131f0 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -458,9 +458,12 @@ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name); void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, 
int64_t cur_sector, int nr_sectors); -- cgit v1.1 From 9bd2b08f27b9c27bb40d73b6466321b8c635086e Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:57 -0400 Subject: block: Add bitmap successors A bitmap successor is an anonymous BdrvDirtyBitmap that is intended to be created just prior to a sensitive operation (e.g. Incremental Backup) that can either succeed or fail, but during the course of which we still want a bitmap tracking writes. On creating a successor, we "freeze" the parent bitmap which prevents its deletion, enabling, anonymization, or creating a bitmap with the same name. On success, the parent bitmap can "abdicate" responsibility to the successor, which will inherit its name. The successor will have been tracking writes during the course of the backup operation. The parent will be safely deleted. On failure, we can "reclaim" the successor from the parent, unifying them such that the resulting bitmap describes all writes occurring since the last successful backup, for instance. Reclamation will thaw the parent, but not explicitly re-enable it. BdrvDirtyBitmap operations that target a single bitmap are protected by assertions that the bitmap is not frozen and/or disabled. BdrvDirtyBitmap operations that target a group of bitmaps, such as bdrv_{set,reset}_dirty will ignore frozen/disabled drives with a conditional instead. Internal functions that enable/disable dirty bitmaps have assertions added to them to prevent modifying frozen bitmaps. Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Eric Blake Message-id: 1429314609-29776-10-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++- blockdev.c | 7 ++++ include/block/block.h | 10 +++++ qapi/block-core.json | 1 + 4 files changed, 121 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 0e63524..dffa9de 100644 --- a/block.c +++ b/block.c @@ -52,8 +52,17 @@ #include #endif +/** + * A BdrvDirtyBitmap can be in three possible states: + * (1) successor is NULL and disabled is false: full r/w mode + * (2) successor is NULL and disabled is true: read only mode ("disabled") + * (3) successor is set: frozen mode. + * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, + * or enabled. A frozen bitmap can only abdicate() or reclaim(). + */ struct BdrvDirtyBitmap { HBitmap *bitmap; + BdrvDirtyBitmap *successor; char *name; bool disabled; QLIST_ENTRY(BdrvDirtyBitmap) list; @@ -5518,6 +5527,7 @@ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { + assert(!bdrv_dirty_bitmap_frozen(bitmap)); g_free(bitmap->name); bitmap->name = NULL; } @@ -5553,9 +5563,98 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, return bitmap; } +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) +{ + return bitmap->successor; +} + bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) { - return !bitmap->disabled; + return !(bitmap->disabled || bitmap->successor); +} + +/** + * Create a successor bitmap destined to replace this bitmap after an operation. + * Requires that the bitmap is not frozen and has no successor. 
+ */ +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, Error **errp) +{ + uint64_t granularity; + BdrvDirtyBitmap *child; + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, "Cannot create a successor for a bitmap that is " + "currently frozen"); + return -1; + } + assert(!bitmap->successor); + + /* Create an anonymous successor */ + granularity = bdrv_dirty_bitmap_granularity(bitmap); + child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + if (!child) { + return -1; + } + + /* Successor will be on or off based on our current state. */ + child->disabled = bitmap->disabled; + + /* Install the successor and freeze the parent */ + bitmap->successor = child; + return 0; +} + +/** + * For a bitmap with a successor, yield our name to the successor, + * delete the old bitmap, and return a handle to the new bitmap. + */ +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp) +{ + char *name; + BdrvDirtyBitmap *successor = bitmap->successor; + + if (successor == NULL) { + error_setg(errp, "Cannot relinquish control if " + "there's no successor present"); + return NULL; + } + + name = bitmap->name; + bitmap->name = NULL; + successor->name = name; + bitmap->successor = NULL; + bdrv_release_dirty_bitmap(bs, bitmap); + + return successor; +} + +/** + * In cases of failure where we can no longer safely delete the parent, + * we may wish to re-join the parent and child/successor. + * The merged parent will be un-frozen, but not explicitly re-enabled. + */ +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *parent, + Error **errp) +{ + BdrvDirtyBitmap *successor = parent->successor; + + if (!successor) { + error_setg(errp, "Cannot reclaim a successor when none is present"); + return NULL; + } + + if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { + error_setg(errp, "Merging of parent and successor bitmap failed"); + return NULL; + } + bdrv_release_dirty_bitmap(bs, successor); + parent->successor = NULL; + + return parent; } void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) @@ -5563,6 +5662,7 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) BdrvDirtyBitmap *bm, *next; QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { if (bm == bitmap) { + assert(!bdrv_dirty_bitmap_frozen(bm)); QLIST_REMOVE(bitmap, list); hbitmap_free(bitmap->bitmap); g_free(bitmap->name); @@ -5574,11 +5674,13 @@ void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { + assert(!bdrv_dirty_bitmap_frozen(bitmap)); bitmap->disabled = true; } void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) { + assert(!bdrv_dirty_bitmap_frozen(bitmap)); bitmap->disabled = false; } diff --git a/blockdev.c b/blockdev.c index 5dde1e8..e8b712e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2065,9 +2065,16 @@ void qmp_block_dirty_bitmap_remove(const char *node, const char *name, return; } + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently frozen and cannot be removed", + name); + goto out; + } bdrv_dirty_bitmap_make_anon(bs, bitmap); bdrv_release_dirty_bitmap(bs, bitmap); + out: aio_context_release(aio_context); } diff --git a/include/block/block.h b/include/block/block.h index 5d131f0..1635c22 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -454,6 +454,15 @@ BdrvDirtyBitmap 
*bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, const char *name, Error **errp); +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp); BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name); void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); @@ -464,6 +473,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); diff --git a/qapi/block-core.json b/qapi/block-core.json index 6237f9d..f3b92d8 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1008,6 +1008,7 @@ # Returns: nothing on success # If @node is not a valid block device or node, DeviceNotFound # If @name is not found, GenericError with an explanation +# If @name is frozen by an operation, GenericError # # Since 2.4 ## -- cgit v1.1 From d58d84539784d27c826924a79d9436178b07ff69 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:58 -0400 Subject: qmp: Add support of "dirty-bitmap" sync mode for drive-backup For "dirty-bitmap" sync mode, the block job will iterate through the given dirty bitmap to decide whether a sector needs backup (backing up all dirty clusters and skipping clean ones), analogous to the allocation-based conditions used by the "top" sync mode. Signed-off-by: Fam Zheng Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-11-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 9 +++ block/backup.c | 155 +++++++++++++++++++++++++++++++++++++++------- block/mirror.c | 4 ++ blockdev.c | 18 +++++- hmp.c | 3 +- include/block/block.h | 1 + include/block/block_int.h | 2 + qapi/block-core.json | 14 +++-- qmp-commands.hx | 8 ++- 9 files changed, 181 insertions(+), 33 deletions(-) diff --git a/block.c b/block.c index dffa9de..9dc5c8c 100644 --- a/block.c +++ b/block.c @@ -5783,6 +5783,15 @@ static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, } } +/** + * Advance an HBitmapIter to an arbitrary offset. 
+ */ +void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) +{ + assert(hbi->hb); + hbitmap_iter_init(hbi, hbi->hb, offset); +} + int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { return hbitmap_count(bitmap->bitmap); diff --git a/block/backup.c b/block/backup.c index 3312476..ddf9027 100644 --- a/block/backup.c +++ b/block/backup.c @@ -37,6 +37,8 @@ typedef struct CowRequest { typedef struct BackupBlockJob { BlockJob common; BlockDriverState *target; + /* bitmap for sync=dirty-bitmap */ + BdrvDirtyBitmap *sync_bitmap; MirrorSyncMode sync_mode; RateLimit limit; BlockdevOnError on_source_error; @@ -242,6 +244,91 @@ static void backup_complete(BlockJob *job, void *opaque) g_free(data); } +static bool coroutine_fn yield_and_check(BackupBlockJob *job) +{ + if (block_job_is_cancelled(&job->common)) { + return true; + } + + /* we need to yield so that bdrv_drain_all() returns. + * (without, VM does not reboot) + */ + if (job->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, + job->sectors_read); + job->sectors_read = 0; + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); + } else { + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&job->common)) { + return true; + } + + return false; +} + +static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +{ + bool error_is_read; + int ret = 0; + int clusters_per_iter; + uint32_t granularity; + int64_t sector; + int64_t cluster; + int64_t end; + int64_t last_cluster = -1; + BlockDriverState *bs = job->common.bs; + HBitmapIter hbi; + + granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); + clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + bdrv_dirty_iter_init(bs, job->sync_bitmap, &hbi); + + /* Find the next dirty sector(s) */ + while ((sector = hbitmap_iter_next(&hbi)) != -1) { + cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + + /* Fake progress updates for any clusters we skipped */ + if (cluster != last_cluster + 1) { + job->common.offset += ((cluster - last_cluster - 1) * + BACKUP_CLUSTER_SIZE); + } + + for (end = cluster + clusters_per_iter; cluster < end; cluster++) { + do { + if (yield_and_check(job)) { + return ret; + } + ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + if ((ret < 0) && + backup_error_action(job, error_is_read, -ret) == + BLOCK_ERROR_ACTION_REPORT) { + return ret; + } + } while (ret < 0); + } + + /* If the bitmap granularity is smaller than the backup granularity, + * we need to advance the iterator pointer to the next cluster. 
*/ + if (granularity < BACKUP_CLUSTER_SIZE) { + bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + } + + last_cluster = cluster - 1; + } + + /* Play some final catchup with the progress meter */ + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + if (last_cluster + 1 < end) { + job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + } + + return ret; +} + static void coroutine_fn backup_run(void *opaque) { BackupBlockJob *job = opaque; @@ -259,8 +346,7 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_init(&job->flush_rwlock); start = 0; - end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE, - BACKUP_SECTORS_PER_CLUSTER); + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); job->bitmap = hbitmap_alloc(end, 0); @@ -278,28 +364,13 @@ static void coroutine_fn backup_run(void *opaque) qemu_coroutine_yield(); job->common.busy = true; } + } else if (job->sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + ret = backup_run_incremental(job); } else { /* Both FULL and TOP SYNC_MODE's require copying.. */ for (; start < end; start++) { bool error_is_read; - - if (block_job_is_cancelled(&job->common)) { - break; - } - - /* we need to yield so that bdrv_drain_all() returns. - * (without, VM does not reboot) - */ - if (job->common.speed) { - uint64_t delay_ns = ratelimit_calculate_delay( - &job->limit, job->sectors_read); - job->sectors_read = 0; - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); - } else { - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&job->common)) { + if (yield_and_check(job)) { break; } @@ -357,6 +428,18 @@ static void coroutine_fn backup_run(void *opaque) qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); + if (job->sync_bitmap) { + BdrvDirtyBitmap *bm; + if (ret < 0) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } + } hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); @@ -369,6 +452,7 @@ static void coroutine_fn backup_run(void *opaque) void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, @@ -412,17 +496,36 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } + if (sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + if (!sync_bitmap) { + error_setg(errp, "must provide a valid bitmap name for " + "\"dirty-bitmap\" sync mode"); + return; + } + + /* Create a new bitmap, and freeze/disable this one. 
*/ + if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { + return; + } + } else if (sync_bitmap) { + error_setg(errp, + "a sync_bitmap was provided to backup_run, " + "but received an incompatible sync_mode (%s)", + MirrorSyncMode_lookup[sync_mode]); + return; + } + len = bdrv_getlength(bs); if (len < 0) { error_setg_errno(errp, -len, "unable to get length for '%s'", bdrv_get_device_name(bs)); - return; + goto error; } BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { - return; + goto error; } bdrv_op_block_all(target, job->common.blocker); @@ -431,7 +534,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, job->on_target_error = on_target_error; job->target = target; job->sync_mode = sync_mode; + job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_DIRTY_BITMAP ? + sync_bitmap : NULL; job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); qemu_coroutine_enter(job->common.co, job); + return; + + error: + if (sync_bitmap) { + bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); + } } diff --git a/block/mirror.c b/block/mirror.c index 29c4545..898f8ce 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -718,6 +718,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode; BlockDriverState *base; + if (mode == MIRROR_SYNC_MODE_DIRTY_BITMAP) { + error_setg(errp, "Sync mode 'dirty-bitmap' not supported"); + return; + } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; mirror_start_job(bs, target, replaces, diff --git a/blockdev.c b/blockdev.c index e8b712e..62d862c 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1585,6 +1585,7 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp) backup->sync, backup->has_mode, backup->mode, backup->has_speed, backup->speed, + backup->has_bitmap, backup->bitmap, backup->has_on_source_error, backup->on_source_error, backup->has_on_target_error, backup->on_target_error, &local_err); @@ -2395,6 +2396,7 @@ void qmp_drive_backup(const char *device, const char *target, enum MirrorSyncMode sync, bool has_mode, enum NewImageMode mode, bool has_speed, int64_t speed, + bool has_bitmap, const char *bitmap, bool has_on_source_error, BlockdevOnError on_source_error, bool has_on_target_error, BlockdevOnError on_target_error, Error **errp) @@ -2403,6 +2405,7 @@ void qmp_drive_backup(const char *device, const char *target, BlockDriverState *bs; BlockDriverState *target_bs; BlockDriverState *source = NULL; + BdrvDirtyBitmap *bmap = NULL; AioContext *aio_context; BlockDriver *drv = NULL; Error *local_err = NULL; @@ -2502,7 +2505,16 @@ void qmp_drive_backup(const char *device, const char *target, bdrv_set_aio_context(target_bs, aio_context); - backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, + if (has_bitmap) { + bmap = bdrv_find_dirty_bitmap(bs, bitmap); + if (!bmap) { + error_setg(errp, "Bitmap '%s' could not be found", bitmap); + goto out; + } + } + + backup_start(bs, target_bs, speed, sync, bmap, + on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); @@ -2563,8 +2575,8 @@ void qmp_blockdev_backup(const char *device, const char *target, bdrv_ref(target_bs); bdrv_set_aio_context(target_bs, aio_context); - backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, - block_job_cb, bs, &local_err); + backup_start(bs, target_bs, speed, sync, NULL, on_source_error, + 
on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { bdrv_unref(target_bs); error_propagate(errp, local_err); diff --git a/hmp.c b/hmp.c index f31ae27..d85d913 100644 --- a/hmp.c +++ b/hmp.c @@ -1061,7 +1061,8 @@ void hmp_drive_backup(Monitor *mon, const QDict *qdict) qmp_drive_backup(device, filename, !!format, format, full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP, - true, mode, false, 0, false, 0, false, 0, &err); + true, mode, false, 0, false, NULL, + false, 0, false, 0, &err); hmp_handle_error(mon, &err); } diff --git a/include/block/block.h b/include/block/block.h index 1635c22..035cf7b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -481,6 +481,7 @@ void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); void bdrv_dirty_iter_init(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); +void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); void bdrv_enable_copy_on_read(BlockDriverState *bs); diff --git a/include/block/block_int.h b/include/block/block_int.h index fb9e100..e0d5561 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -602,6 +602,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, * @target: Block device to write to. * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @sync_mode: What parts of the disk image should be copied to the destination. + * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_DIRTY_BITMAP. * @on_source_error: The action to take upon error reading from the source. * @on_target_error: The action to take upon error writing to the target. * @cb: Completion function for the job. @@ -612,6 +613,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, */ void backup_start(BlockDriverState *bs, BlockDriverState *target, int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, diff --git a/qapi/block-core.json b/qapi/block-core.json index f3b92d8..8a4c7f2 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -512,10 +512,12 @@ # # @none: only copy data written from now on # +# @dirty-bitmap: only copy data described by the dirty bitmap. Since: 2.4 +# # Since: 1.3 ## { 'enum': 'MirrorSyncMode', - 'data': ['top', 'full', 'none'] } + 'data': ['top', 'full', 'none', 'dirty-bitmap'] } ## # @BlockJobType: @@ -690,14 +692,18 @@ # probe if @mode is 'existing', else the format of the source # # @sync: what parts of the disk image should be copied to the destination -# (all the disk, only the sectors allocated in the topmost image, or -# only new I/O). +# (all the disk, only the sectors allocated in the topmost image, from a +# dirty bitmap, or only new I/O). # # @mode: #optional whether and how QEMU should create a new image, default is # 'absolute-paths'. # # @speed: #optional the maximum speed, in bytes per second # +# @bitmap: #optional the name of dirty bitmap if sync is "dirty-bitmap". +# Must be present if sync is "dirty-bitmap", must NOT be present +# otherwise. (Since 2.4) +# # @on-source-error: #optional the action to take on an error on the source, # default 'report'. 'stop' and 'enospc' can only be used # if the block device supports io-status (see BlockInfo). 
@@ -715,7 +721,7 @@ { 'type': 'DriveBackup', 'data': { 'device': 'str', 'target': 'str', '*format': 'str', 'sync': 'MirrorSyncMode', '*mode': 'NewImageMode', - '*speed': 'int', + '*speed': 'int', '*bitmap': 'str', '*on-source-error': 'BlockdevOnError', '*on-target-error': 'BlockdevOnError' } } diff --git a/qmp-commands.hx b/qmp-commands.hx index 1aeab50..9b10d3d 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1110,7 +1110,7 @@ EQMP { .name = "drive-backup", .args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?," - "on-source-error:s?,on-target-error:s?", + "bitmap:s?,on-source-error:s?,on-target-error:s?", .mhandler.cmd_new = qmp_marshal_input_drive_backup, }, @@ -1137,8 +1137,10 @@ Arguments: (json-string, optional) - "sync": what parts of the disk image should be copied to the destination; possibilities include "full" for all the disk, "top" for only the sectors - allocated in the topmost image, or "none" to only replicate new I/O - (MirrorSyncMode). + allocated in the topmost image, "dirty-bitmap" for only the dirty sectors in + the bitmap, or "none" to only replicate new I/O (MirrorSyncMode). +- "bitmap": dirty bitmap name for sync==dirty-bitmap. Must be present if sync + is "dirty-bitmap", must NOT be present otherwise. - "mode": whether and how QEMU should create a new image (NewImageMode, optional, default 'absolute-paths') - "speed": the maximum speed, in bytes per second (json-int, optional) -- cgit v1.1 From e74e6b78e6fe0c9ee426d1278fff45f5fa0af766 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:49:59 -0400 Subject: qmp: add block-dirty-bitmap-clear Add bdrv_clear_dirty_bitmap and a matching QMP command, qmp_block_dirty_bitmap_clear that enables a user to reset the bitmap attached to a drive. This allows us to reset a bitmap in the event of a full drive backup. 
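An illustrative pairing (node and bitmap names are placeholders) is to clear the bitmap and immediately take a fresh full backup, so that afterwards the bitmap describes exactly the writes made since that backup:

-> { "execute": "block-dirty-bitmap-clear",
     "arguments": { "node": "drive0", "name": "bitmap0" } }
<- { "return": {} }
-> { "execute": "drive-backup",
     "arguments": { "device": "drive0", "sync": "full",
                    "target": "full_backup.img" } }
<- { "return": {} }

Issued as two separate commands this leaves a window for guest writes between them; docs/bitmaps.md shows the transactional variant that starts both atomically.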
Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Eric Blake Message-id: 1429314609-29776-12-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 8 ++++++++ blockdev.c | 34 ++++++++++++++++++++++++++++++++++ include/block/block.h | 1 + qapi/block-core.json | 14 ++++++++++++++ qmp-commands.hx | 29 +++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+) diff --git a/block.c b/block.c index 9dc5c8c..b7f1f00 100644 --- a/block.c +++ b/block.c @@ -63,6 +63,7 @@ struct BdrvDirtyBitmap { HBitmap *bitmap; BdrvDirtyBitmap *successor; + int64_t size; char *name; bool disabled; QLIST_ENTRY(BdrvDirtyBitmap) list; @@ -5557,6 +5558,7 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, } bitmap = g_new0(BdrvDirtyBitmap, 1); bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); + bitmap->size = bitmap_size; bitmap->name = g_strdup(name); bitmap->disabled = false; QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); @@ -5759,6 +5761,12 @@ void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_reset(bitmap->bitmap, 0, bitmap->size); +} + static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { diff --git a/blockdev.c b/blockdev.c index 62d862c..5905946 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2079,6 +2079,40 @@ void qmp_block_dirty_bitmap_remove(const char *node, const char *name, aio_context_release(aio_context); } +/** + * Completely clear a bitmap, for the purposes of synchronizing a bitmap + * immediately after a full backup operation. 
+ */ +void qmp_block_dirty_bitmap_clear(const char *node, const char *name, + Error **errp) +{ + AioContext *aio_context; + BdrvDirtyBitmap *bitmap; + BlockDriverState *bs; + + bitmap = block_dirty_bitmap_lookup(node, name, &bs, &aio_context, errp); + if (!bitmap || !bs) { + return; + } + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently frozen and cannot be modified", + name); + goto out; + } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { + error_setg(errp, + "Bitmap '%s' is currently disabled and cannot be cleared", + name); + goto out; + } + + bdrv_clear_dirty_bitmap(bitmap); + + out: + aio_context_release(aio_context); +} + int hmp_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) { const char *id = qdict_get_str(qdict, "id"); diff --git a/include/block/block.h b/include/block/block.h index 035cf7b..12bf145 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -479,6 +479,7 @@ void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap); void bdrv_dirty_iter_init(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); diff --git a/qapi/block-core.json b/qapi/block-core.json index 8a4c7f2..e275dc2 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1022,6 +1022,20 @@ 'data': 'BlockDirtyBitmap' } ## +# @block-dirty-bitmap-clear: +# +# Clear (reset) a dirty bitmap on the device +# +# Returns: nothing on success +# If @node is not a valid block device, DeviceNotFound +# If @name is not found, GenericError with an explanation +# +# Since 2.4 +## +{ 'command': 'block-dirty-bitmap-clear', + 'data': 'BlockDirtyBitmap' } + +## # @block_set_io_throttle: # # Change I/O throttle limits for a block drive. diff --git a/qmp-commands.hx b/qmp-commands.hx index 9b10d3d..213508f 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1364,6 +1364,35 @@ Example: EQMP { + .name = "block-dirty-bitmap-clear", + .args_type = "node:B,name:s", + .mhandler.cmd_new = qmp_marshal_input_block_dirty_bitmap_clear, + }, + +SQMP + +block-dirty-bitmap-clear +------------------------ +Since 2.4 + +Reset the dirty bitmap associated with a node so that an incremental backup +from this point in time forward will only back up clusters modified after this +clear operation. + +Arguments: + +- "node": device/node on which to clear the dirty bitmap (json-string) +- "name": name of the dirty bitmap to clear (json-string) + +Example: + +-> { "execute": "block-dirty-bitmap-clear", "arguments": { "node": "drive0", + "name": "bitmap0" } } +<- { "return": {} } + +EQMP + + { .name = "blockdev-snapshot-sync", .args_type = "device:s?,node-name:s?,snapshot-file:s,snapshot-node-name:s?,format:s?,mode:s?", .mhandler.cmd_new = qmp_marshal_input_blockdev_snapshot_sync, -- cgit v1.1 From a113534ffb8f2580d323e6397e6908d5f4bfa0b7 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:00 -0400 Subject: qmp: Add dirty bitmap status field in query-block Add the "frozen" status boolean to inform clients when a bitmap is busy with a task.
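As a sketch only (field values hypothetical, surrounding BlockInfo fields omitted, placement abbreviated), a query-block reply would now carry the new flag per bitmap:

    "dirty-bitmaps": [
        { "name": "bitmap0", "count": 0, "granularity": 65536, "frozen": false }
    ]

A frozen bitmap is one that currently has a successor, e.g. one in use by a running backup job.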
Signed-off-by: Fam Zheng Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Eric Blake Message-id: 1429314609-29776-13-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 1 + qapi/block-core.json | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index b7f1f00..f0157d5 100644 --- a/block.c +++ b/block.c @@ -5699,6 +5699,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) info->granularity = bdrv_dirty_bitmap_granularity(bm); info->has_name = !!bm->name; info->name = g_strdup(bm->name); + info->frozen = bdrv_dirty_bitmap_frozen(bm); entry->value = info; *plist = entry; plist = &entry->next; diff --git a/qapi/block-core.json b/qapi/block-core.json index e275dc2..1c17224 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -336,10 +336,13 @@ # # @granularity: granularity of the dirty bitmap in bytes (since 1.4) # +# @frozen: whether the dirty bitmap is frozen (Since 2.4) +# # Since: 1.3 ## { 'type': 'BlockDirtyInfo', - 'data': {'*name': 'str', 'count': 'int', 'granularity': 'uint32'} } + 'data': {'*name': 'str', 'count': 'int', 'granularity': 'uint32', + 'frozen': 'bool'} } ## # @BlockInfo: -- cgit v1.1 From aa0c7ca506bb3f661be673b3d5c1320f37e52fdb Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:01 -0400 Subject: block: add BdrvDirtyBitmap documentation Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-14-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block.c b/block.c index f0157d5..4121929 100644 --- a/block.c +++ b/block.c @@ -61,11 +61,11 @@ * or enabled. A frozen bitmap can only abdicate() or reclaim(). */ struct BdrvDirtyBitmap { - HBitmap *bitmap; - BdrvDirtyBitmap *successor; - int64_t size; - char *name; - bool disabled; + HBitmap *bitmap; /* Dirty sector bitmap implementation */ + BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ + char *name; /* Optional non-empty unique ID */ + int64_t size; /* Size of the bitmap (Number of sectors) */ + bool disabled; /* Bitmap is read-only */ QLIST_ENTRY(BdrvDirtyBitmap) list; }; -- cgit v1.1 From 20dca81075e712ebcbc151eed9b1a02d4e5d08f5 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:02 -0400 Subject: block: Ensure consistent bitmap function prototypes We often don't need the BlockDriverState for functions that operate on bitmaps. Remove it. 
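The shape of the cleanup, as a minimal before/after sketch using one representative prototype taken from the hunks below:

    /* before */
    int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap);

    /* after */
    int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);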
Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-15-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 13 ++++++------- block/backup.c | 2 +- block/mirror.c | 26 ++++++++++---------------- blockdev.c | 2 +- include/block/block.h | 11 +++++------ migration/block.c | 7 +++---- 6 files changed, 26 insertions(+), 35 deletions(-) diff --git a/block.c b/block.c index 4121929..b7d6df5 100644 --- a/block.c +++ b/block.c @@ -5526,7 +5526,7 @@ BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) return NULL; } -void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); g_free(bitmap->name); @@ -5695,7 +5695,7 @@ BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); - info->count = bdrv_get_dirty_count(bs, bm); + info->count = bdrv_get_dirty_count(bm); info->granularity = bdrv_dirty_bitmap_granularity(bm); info->has_name = !!bm->name; info->name = g_strdup(bm->name); @@ -5742,20 +5742,19 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); } -void bdrv_dirty_iter_init(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) { hbitmap_iter_init(hbi, bitmap->bitmap, 0); } -void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); } -void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors) { assert(bdrv_dirty_bitmap_enabled(bitmap)); @@ -5801,7 +5800,7 @@ void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) hbitmap_iter_init(hbi, hbi->hb, offset); } -int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) { return hbitmap_count(bitmap->bitmap); } diff --git a/block/backup.c b/block/backup.c index ddf9027..d3f648d 100644 --- a/block/backup.c +++ b/block/backup.c @@ -284,7 +284,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); - bdrv_dirty_iter_init(bs, job->sync_bitmap, &hbi); + bdrv_dirty_iter_init(job->sync_bitmap, &hbi); /* Find the next dirty sector(s) */ while ((sector = hbitmap_iter_next(&hbi)) != -1) { diff --git a/block/mirror.c b/block/mirror.c index 898f8ce..d53e60e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -125,11 +125,9 @@ static void mirror_write_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, false, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret 
= ret; @@ -143,11 +141,9 @@ static void mirror_read_complete(void *opaque, int ret) MirrorOp *op = opaque; MirrorBlockJob *s = op->s; if (ret < 0) { - BlockDriverState *source = s->common.bs; BlockErrorAction action; - bdrv_set_dirty_bitmap(source, s->dirty_bitmap, op->sector_num, - op->nb_sectors); + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); action = mirror_error_action(s, true, -ret); if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { s->ret = ret; @@ -170,10 +166,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, - bdrv_get_dirty_count(source, s->dirty_bitmap)); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -288,8 +283,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) next_sector += sectors_per_chunk; } - bdrv_reset_dirty_bitmap(source, s->dirty_bitmap, sector_num, - nb_sectors); + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); /* Copy the dirty cluster. */ s->in_flight++; @@ -446,7 +440,7 @@ static void coroutine_fn mirror_run(void *opaque) assert(n > 0); if (ret == 1) { - bdrv_set_dirty_bitmap(bs, s->dirty_bitmap, sector_num, n); + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); sector_num = next; } else { sector_num += n; @@ -454,7 +448,7 @@ static void coroutine_fn mirror_run(void *opaque) } } - bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); for (;;) { uint64_t delay_ns = 0; @@ -466,7 +460,7 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); /* s->common.offset contains the number of bytes already processed so * far, cnt is the number of dirty sectors remaining and * s->sectors_in_flight is the number of sectors currently being @@ -516,7 +510,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } } @@ -531,7 +525,7 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain(bs); - cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); } ret = 0; diff --git a/blockdev.c b/blockdev.c index 5905946..5eaf77e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2072,7 +2072,7 @@ void qmp_block_dirty_bitmap_remove(const char *node, const char *name, name); goto out; } - bdrv_dirty_bitmap_make_anon(bs, bitmap); + bdrv_dirty_bitmap_make_anon(bitmap); bdrv_release_dirty_bitmap(bs, bitmap); out: diff --git a/include/block/block.h b/include/block/block.h index 12bf145..7d1a717 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -465,7 +465,7 @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, Error **errp); BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name); -void bdrv_dirty_bitmap_make_anon(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap); void bdrv_release_dirty_bitmap(BlockDriverState *bs, 
BdrvDirtyBitmap *bitmap); void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap); void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); @@ -475,15 +475,14 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); -void bdrv_set_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); -void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap); -void bdrv_dirty_iter_init(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi); void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset); -int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); void bdrv_enable_copy_on_read(BlockDriverState *bs); void bdrv_disable_copy_on_read(BlockDriverState *bs); diff --git a/migration/block.c b/migration/block.c index 02a7d26..ddb59cc 100644 --- a/migration/block.c +++ b/migration/block.c @@ -304,7 +304,7 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, nr_sectors, blk_mig_read_cb, blk); - bdrv_reset_dirty_bitmap(bs, bmds->dirty_bitmap, cur_sector, nr_sectors); + bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors); qemu_mutex_unlock_iothread(); bmds->cur_sector = cur_sector + nr_sectors; @@ -497,8 +497,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, g_free(blk); } - bdrv_reset_dirty_bitmap(bmds->bs, bmds->dirty_bitmap, sector, - nr_sectors); + bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors); break; } sector += BDRV_SECTORS_PER_DIRTY_CHUNK; @@ -584,7 +583,7 @@ static int64_t get_remaining_dirty(void) int64_t dirty = 0; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap); + dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); } return dirty << BDRV_SECTOR_BITS; -- cgit v1.1 From ce1ffea8cdcea41533bde87759b8390f0e3a9ad3 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:03 -0400 Subject: block: Resize bitmaps on bdrv_truncate Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-16-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 18 ++++++++++++++++++ include/qemu/hbitmap.h | 10 ++++++++++ util/hbitmap.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/block.c b/block.c index b7d6df5..ec23594 100644 --- a/block.c +++ b/block.c @@ -114,6 +114,7 @@ static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -3610,6 +3611,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset) ret = drv->bdrv_truncate(bs, offset); if (ret == 0) { ret = 
refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + bdrv_dirty_bitmap_truncate(bs); if (bs->blk) { blk_dev_resize_cb(bs->blk); } @@ -5659,6 +5661,22 @@ BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, return parent; } +/** + * Truncates _all_ bitmaps attached to a BDS. + */ +static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bitmap; + uint64_t size = bdrv_nb_sectors(bs); + + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (bdrv_dirty_bitmap_frozen(bitmap)) { + continue; + } + hbitmap_truncate(bitmap->bitmap, size); + } +} + void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { BdrvDirtyBitmap *bm, *next; diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index 6cb2d0e..f0a85f8 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -65,6 +65,16 @@ struct HBitmapIter { HBitmap *hbitmap_alloc(uint64_t size, int granularity); /** + * hbitmap_truncate: + * @hb: The bitmap to change the size of. + * @size: The number of elements to change the bitmap to accommodate. + * + * truncate or grow an existing bitmap to accommodate a new number of elements. + * This may invalidate existing HBitmapIterators. + */ +void hbitmap_truncate(HBitmap *hb, uint64_t size); + +/** * hbitmap_merge: * @a: The bitmap to store the result in. * @b: The bitmap to merge into @a. diff --git a/util/hbitmap.c b/util/hbitmap.c index 150d6e9..a10c7ae 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -400,6 +400,54 @@ HBitmap *hbitmap_alloc(uint64_t size, int granularity) return hb; } +void hbitmap_truncate(HBitmap *hb, uint64_t size) +{ + bool shrink; + unsigned i; + uint64_t num_elements = size; + uint64_t old; + + /* Size comes in as logical elements, adjust for granularity. */ + size = (size + (1ULL << hb->granularity) - 1) >> hb->granularity; + assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE)); + shrink = size < hb->size; + + /* bit sizes are identical; nothing to do. */ + if (size == hb->size) { + return; + } + + /* If we're losing bits, let's clear those bits before we invalidate all of + * our invariants. This helps keep the bitcount consistent, and will prevent + * us from carrying around garbage bits beyond the end of the map. + */ + if (shrink) { + /* Don't clear partial granularity groups; + * start at the first full one. */ + uint64_t start = QEMU_ALIGN_UP(num_elements, 1 << hb->granularity); + uint64_t fix_count = (hb->size << hb->granularity) - start; + + assert(fix_count); + hbitmap_reset(hb, start, fix_count); + } + + hb->size = size; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + size = MAX(BITS_TO_LONGS(size), 1); + if (hb->sizes[i] == size) { + break; + } + old = hb->sizes[i]; + hb->sizes[i] = size; + hb->levels[i] = g_realloc(hb->levels[i], size * sizeof(unsigned long)); + if (!shrink) { + memset(&hb->levels[i][old], 0x00, + (size - old) * sizeof(*hb->levels[i])); + } + } +} + + /** * Given HBitmaps A and B, let A := A (BITOR) B. * Bitmap B will not be modified. -- cgit v1.1 From a94e87c08cfff73ac4b179adc3d0d9c3b8d2ddef Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:04 -0400 Subject: hbitmap: truncate tests The general approach is to set bits close to the boundaries of where we are truncating and ensure that everything appears to have gone OK. 
We test growing and shrinking by different amounts: - Less than the granularity - Less than the granularity, but across a boundary - Less than sizeof(unsigned long) - Less than sizeof(unsigned long), but across a ulong boundary - More than sizeof(unsigned long) Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-17-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/test-hbitmap.c | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c index 8c902f2..9f41b5f 100644 --- a/tests/test-hbitmap.c +++ b/tests/test-hbitmap.c @@ -11,6 +11,8 @@ #include #include +#include +#include #include "qemu/hbitmap.h" #define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6) @@ -23,6 +25,7 @@ typedef struct TestHBitmapData { HBitmap *hb; unsigned long *bits; size_t size; + size_t old_size; int granularity; } TestHBitmapData; @@ -91,6 +94,44 @@ static void hbitmap_test_init(TestHBitmapData *data, } } +static inline size_t hbitmap_test_array_size(size_t bits) +{ + size_t n = (bits + BITS_PER_LONG - 1) / BITS_PER_LONG; + return n ? n : 1; +} + +static void hbitmap_test_truncate_impl(TestHBitmapData *data, + size_t size) +{ + size_t n; + size_t m; + data->old_size = data->size; + data->size = size; + + if (data->size == data->old_size) { + return; + } + + n = hbitmap_test_array_size(size); + m = hbitmap_test_array_size(data->old_size); + data->bits = g_realloc(data->bits, sizeof(unsigned long) * n); + if (n > m) { + memset(&data->bits[m], 0x00, sizeof(unsigned long) * (n - m)); + } + + /* If we shrink to an uneven multiple of sizeof(unsigned long), + * scrub the leftover memory. */ + if (data->size < data->old_size) { + m = size % (sizeof(unsigned long) * 8); + if (m) { + unsigned long mask = (1ULL << m) - 1; + data->bits[n-1] &= mask; + } + } + + hbitmap_truncate(data->hb, size); +} + static void hbitmap_test_teardown(TestHBitmapData *data, const void *unused) { @@ -369,6 +410,198 @@ static void test_hbitmap_iter_granularity(TestHBitmapData *data, g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0); } +static void hbitmap_test_set_boundary_bits(TestHBitmapData *data, ssize_t diff) +{ + size_t size = data->size; + + /* First bit */ + hbitmap_test_set(data, 0, 1); + if (diff < 0) { + /* Last bit in new, shortened map */ + hbitmap_test_set(data, size + diff - 1, 1); + + /* First bit to be truncated away */ + hbitmap_test_set(data, size + diff, 1); + } + /* Last bit */ + hbitmap_test_set(data, size - 1, 1); + if (data->granularity == 0) { + hbitmap_test_check_get(data); + } +} + +static void hbitmap_test_check_boundary_bits(TestHBitmapData *data) +{ + size_t size = MIN(data->size, data->old_size); + + if (data->granularity == 0) { + hbitmap_test_check_get(data); + hbitmap_test_check(data, 0); + } else { + /* If a granularity was set, note that every distinct + * (bit >> granularity) value that was set will increase + * the bit pop count by 2^granularity, not just 1. + * + * The hbitmap_test_check facility does not currently tolerate + * non-zero granularities, so test the boundaries and the population + * count manually. + */ + g_assert(hbitmap_get(data->hb, 0)); + g_assert(hbitmap_get(data->hb, size - 1)); + g_assert_cmpint(2 << data->granularity, ==, hbitmap_count(data->hb)); + } +} + +/* Generic truncate test. 
*/ +static void hbitmap_test_truncate(TestHBitmapData *data, + size_t size, + ssize_t diff, + int granularity) +{ + hbitmap_test_init(data, size, granularity); + hbitmap_test_set_boundary_bits(data, diff); + hbitmap_test_truncate_impl(data, size + diff); + hbitmap_test_check_boundary_bits(data); +} + +static void test_hbitmap_truncate_nop(TestHBitmapData *data, + const void *unused) +{ + hbitmap_test_truncate(data, L2, 0, 0); +} + +/** + * Grow by an amount smaller than the granularity, without crossing + * a granularity alignment boundary. Effectively a NOP. + */ +static void test_hbitmap_truncate_grow_negligible(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + size_t diff = 1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Shrink by an amount smaller than the granularity, without crossing + * a granularity alignment boundary. Effectively a NOP. + */ +static void test_hbitmap_truncate_shrink_negligible(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + ssize_t diff = -1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Grow by an amount smaller than the granularity, but crossing over + * a granularity alignment boundary. + */ +static void test_hbitmap_truncate_grow_tiny(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 2; + ssize_t diff = 1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Shrink by an amount smaller than the granularity, but crossing over + * a granularity alignment boundary. + */ +static void test_hbitmap_truncate_shrink_tiny(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + ssize_t diff = -1; + int granularity = 1; + + hbitmap_test_truncate(data, size, diff, granularity); +} + +/** + * Grow by an amount smaller than sizeof(long), and not crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_grow_small(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 + 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount smaller than sizeof(long), and not crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_shrink_small(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, -diff, 0); +} + +/** + * Grow by an amount smaller than sizeof(long), while crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_grow_medium(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 - 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount smaller than sizeof(long), while crossing over + * a sizeof(long) alignment boundary. + */ +static void test_hbitmap_truncate_shrink_medium(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2 + 1; + size_t diff = sizeof(long) / 2; + + hbitmap_test_truncate(data, size, -diff, 0); +} + +/** + * Grow by an amount larger than sizeof(long). + */ +static void test_hbitmap_truncate_grow_large(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = 8 * sizeof(long); + + hbitmap_test_truncate(data, size, diff, 0); +} + +/** + * Shrink by an amount larger than sizeof(long). 
+ */ +static void test_hbitmap_truncate_shrink_large(TestHBitmapData *data, + const void *unused) +{ + size_t size = L2; + size_t diff = 8 * sizeof(long); + + hbitmap_test_truncate(data, size, -diff, 0); +} + static void hbitmap_test_add(const char *testpath, void (*test_func)(TestHBitmapData *data, const void *user_data)) { @@ -395,6 +628,28 @@ int main(int argc, char **argv) hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty); hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset); hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity); + + hbitmap_test_add("/hbitmap/truncate/nop", test_hbitmap_truncate_nop); + hbitmap_test_add("/hbitmap/truncate/grow/negligible", + test_hbitmap_truncate_grow_negligible); + hbitmap_test_add("/hbitmap/truncate/shrink/negligible", + test_hbitmap_truncate_shrink_negligible); + hbitmap_test_add("/hbitmap/truncate/grow/tiny", + test_hbitmap_truncate_grow_tiny); + hbitmap_test_add("/hbitmap/truncate/shrink/tiny", + test_hbitmap_truncate_shrink_tiny); + hbitmap_test_add("/hbitmap/truncate/grow/small", + test_hbitmap_truncate_grow_small); + hbitmap_test_add("/hbitmap/truncate/shrink/small", + test_hbitmap_truncate_shrink_small); + hbitmap_test_add("/hbitmap/truncate/grow/medium", + test_hbitmap_truncate_grow_medium); + hbitmap_test_add("/hbitmap/truncate/shrink/medium", + test_hbitmap_truncate_shrink_medium); + hbitmap_test_add("/hbitmap/truncate/grow/large", + test_hbitmap_truncate_grow_large); + hbitmap_test_add("/hbitmap/truncate/shrink/large", + test_hbitmap_truncate_shrink_large); g_test_run(); return 0; -- cgit v1.1 From 9f7264f57c8307bca32e78427348b8b323d5db21 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:05 -0400 Subject: iotests: add invalid input incremental backup tests Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-18-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/124 | 104 +++++++++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/124.out | 5 +++ tests/qemu-iotests/group | 1 + 3 files changed, 110 insertions(+) create mode 100644 tests/qemu-iotests/124 create mode 100644 tests/qemu-iotests/124.out diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 new file mode 100644 index 0000000..85675ec --- /dev/null +++ b/tests/qemu-iotests/124 @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# +# Tests for incremental drive-backup +# +# Copyright (C) 2015 John Snow for Red Hat, Inc. +# +# Based on 056. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +import os +import iotests + + +def io_write_patterns(img, patterns): + for pattern in patterns: + iotests.qemu_io('-c', 'write -P%s %s %s' % pattern, img) + + +class TestIncrementalBackup(iotests.QMPTestCase): + def setUp(self): + self.bitmaps = list() + self.files = list() + self.drives = list() + self.vm = iotests.VM() + self.err_img = os.path.join(iotests.test_dir, 'err.%s' % iotests.imgfmt) + + # Create a base image with a distinctive patterning + drive0 = self.add_node('drive0') + self.img_create(drive0['file'], drive0['fmt']) + self.vm.add_drive(drive0['file']) + io_write_patterns(drive0['file'], (('0x41', 0, 512), + ('0xd5', '1M', '32k'), + ('0xdc', '32M', '124k'))) + self.vm.launch() + + + def add_node(self, node_id, fmt=iotests.imgfmt, path=None, backup=None): + if path is None: + path = os.path.join(iotests.test_dir, '%s.%s' % (node_id, fmt)) + if backup is None: + backup = os.path.join(iotests.test_dir, + '%s.full.backup.%s' % (node_id, fmt)) + + self.drives.append({ + 'id': node_id, + 'file': path, + 'backup': backup, + 'fmt': fmt }) + return self.drives[-1] + + + def img_create(self, img, fmt=iotests.imgfmt, size='64M', + parent=None, parentFormat=None): + if parent: + if parentFormat is None: + parentFormat = fmt + iotests.qemu_img('create', '-f', fmt, img, size, + '-b', parent, '-F', parentFormat) + else: + iotests.qemu_img('create', '-f', fmt, img, size) + self.files.append(img) + + def test_sync_dirty_bitmap_missing(self): + self.assert_no_active_block_jobs() + self.files.append(self.err_img) + result = self.vm.qmp('drive-backup', device=self.drives[0]['id'], + sync='dirty-bitmap', format=self.drives[0]['fmt'], + target=self.err_img) + self.assert_qmp(result, 'error/class', 'GenericError') + + + def test_sync_dirty_bitmap_not_found(self): + self.assert_no_active_block_jobs() + self.files.append(self.err_img) + result = self.vm.qmp('drive-backup', device=self.drives[0]['id'], + sync='dirty-bitmap', bitmap='unknown', + format=self.drives[0]['fmt'], target=self.err_img) + self.assert_qmp(result, 'error/class', 'GenericError') + + + def tearDown(self): + self.vm.shutdown() + for filename in self.files: + try: + os.remove(filename) + except OSError: + pass + + +if __name__ == '__main__': + iotests.main(supported_fmts=['qcow2']) diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out new file mode 100644 index 0000000..fbc63e6 --- /dev/null +++ b/tests/qemu-iotests/124.out @@ -0,0 +1,5 @@ +.. +---------------------------------------------------------------------- +Ran 2 tests + +OK diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 7c0d639..6ca3466 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -124,6 +124,7 @@ 121 rw auto 122 rw auto 123 rw auto quick +124 rw auto backing 128 rw auto quick 129 rw auto quick 130 rw auto quick -- cgit v1.1 From 7898f74e78a5900fc079868e255b65d807fa8a8f Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:06 -0400 Subject: iotests: add QMP event waiting queue A filter is added to allow callers to request very specific events to be pulled from the event queue, while leaving undesired events still in the stream. This allows us to poll for completion data for multiple asynchronous events in any arbitrary order. A new timeout context is added to the qmp pull_event method's wait parameter to allow tests to fail if they do not complete within some expected period of time. 
Also fixed is a bug in qmp.pull_event where we try to retrieve an event from an empty list if we attempt to retrieve an event with wait=False but no events have occurred. Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-19-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- scripts/qmp/qmp.py | 95 +++++++++++++++++++++++++++++-------------- tests/qemu-iotests/iotests.py | 38 +++++++++++++++++ 2 files changed, 103 insertions(+), 30 deletions(-) diff --git a/scripts/qmp/qmp.py b/scripts/qmp/qmp.py index 20b6ec7..1d38e3e 100644 --- a/scripts/qmp/qmp.py +++ b/scripts/qmp/qmp.py @@ -21,6 +21,9 @@ class QMPConnectError(QMPError): class QMPCapabilitiesError(QMPError): pass +class QMPTimeoutError(QMPError): + pass + class QEMUMonitorProtocol: def __init__(self, address, server=False): """ @@ -72,6 +75,44 @@ class QEMUMonitorProtocol: error = socket.error + def __get_events(self, wait=False): + """ + Check for new events in the stream and cache them in __events. + + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + """ + + # Check for new events regardless and pull them into the cache: + self.__sock.setblocking(0) + try: + self.__json_read() + except socket.error, err: + if err[0] == errno.EAGAIN: + # No data available + pass + self.__sock.setblocking(1) + + # Wait for new events, if needed. + # if wait is 0.0, this means "no wait" and is also implicitly false. + if not self.__events and wait: + if isinstance(wait, float): + self.__sock.settimeout(wait) + try: + ret = self.__json_read(only_event=True) + except socket.timeout: + raise QMPTimeoutError("Timeout waiting for event") + except: + raise QMPConnectError("Error while reading from socket") + if ret is None: + raise QMPConnectError("Error while reading from socket") + self.__sock.settimeout(None) + def connect(self, negotiate=True): """ Connect to the QMP Monitor and perform capabilities negotiation. @@ -140,43 +181,37 @@ class QEMUMonitorProtocol: """ Get and delete the first available QMP event. - @param wait: block until an event is available (bool) + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + + @return The first available QMP event, or None. """ - self.__sock.setblocking(0) - try: - self.__json_read() - except socket.error, err: - if err[0] == errno.EAGAIN: - # No data available - pass - self.__sock.setblocking(1) - if not self.__events and wait: - self.__json_read(only_event=True) - event = self.__events[0] - del self.__events[0] - return event + self.__get_events(wait) + + if self.__events: + return self.__events.pop(0) + return None def get_events(self, wait=False): """ Get a list of available QMP events. 
- @param wait: block until an event is available (bool) - """ - self.__sock.setblocking(0) - try: - self.__json_read() - except socket.error, err: - if err[0] == errno.EAGAIN: - # No data available - pass - self.__sock.setblocking(1) - if not self.__events and wait: - ret = self.__json_read(only_event=True) - if ret == None: - # We are in blocking mode, if don't get anything, something - # went wrong - raise QMPConnectError("Error while reading from socket") + @param wait (bool): block until an event is available. + @param wait (float): If wait is a float, treat it as a timeout value. + @raise QMPTimeoutError: If a timeout float is provided and the timeout + period elapses. + @raise QMPConnectError: If wait is True but no events could be retrieved + or if some other error occurred. + + @return The list of available QMP events. + """ + self.__get_events(wait) return self.__events def clear_events(self): diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 1402854..e93e623 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -78,6 +78,23 @@ def create_image(name, size): i = i + 512 file.close() +# Test if 'match' is a recursive subset of 'event' +def event_match(event, match=None): + if match is None: + return True + + for key in match: + if key in event: + if isinstance(event[key], dict): + if not event_match(event[key], match[key]): + return False + elif event[key] != match[key]: + return False + else: + return False + + return True + class VM(object): '''A QEMU VM''' @@ -92,6 +109,7 @@ class VM(object): '-machine', 'accel=qtest', '-display', 'none', '-vga', 'none'] self._num_drives = 0 + self._events = [] # This can be used to add an unused monitor instance. def add_monitor_telnet(self, ip, port): @@ -202,14 +220,34 @@ class VM(object): def get_qmp_event(self, wait=False): '''Poll for one queued QMP events and return it''' + if len(self._events) > 0: + return self._events.pop(0) return self._qmp.pull_event(wait=wait) def get_qmp_events(self, wait=False): '''Poll for queued QMP events and return a list of dicts''' events = self._qmp.get_events(wait=wait) + events.extend(self._events) + del self._events[:] self._qmp.clear_events() return events + def event_wait(self, name='BLOCK_JOB_COMPLETED', timeout=60.0, match=None): + # Search cached events + for event in self._events: + if (event['event'] == name) and event_match(event, match): + self._events.remove(event) + return event + + # Poll for new events + while True: + event = self._qmp.pull_event(wait=timeout) + if (event['event'] == name) and event_match(event, match): + return event + self._events.append(event) + + return None + index_re = re.compile(r'([^\[]+)\[([^\]]+)\]') class QMPTestCase(unittest.TestCase): -- cgit v1.1 From a3d715958c4456afea402e891288864fe4e51547 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:07 -0400 Subject: iotests: add simple incremental backup case Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-20-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/124 | 174 +++++++++++++++++++++++++++++++++++++++++++-- tests/qemu-iotests/124.out | 4 +- 2 files changed, 172 insertions(+), 6 deletions(-) diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 index 85675ec..5c3b434 100644 --- a/tests/qemu-iotests/124 +++ b/tests/qemu-iotests/124 @@ -29,6 +29,51 @@ def io_write_patterns(img, patterns): iotests.qemu_io('-c', 
'write -P%s %s %s' % pattern, img) +def try_remove(img): + try: + os.remove(img) + except OSError: + pass + + +class Bitmap: + def __init__(self, name, drive): + self.name = name + self.drive = drive + self.num = 0 + self.backups = list() + + def base_target(self): + return (self.drive['backup'], None) + + def new_target(self, num=None): + if num is None: + num = self.num + self.num = num + 1 + base = os.path.join(iotests.test_dir, + "%s.%s." % (self.drive['id'], self.name)) + suff = "%i.%s" % (num, self.drive['fmt']) + target = base + "inc" + suff + reference = base + "ref" + suff + self.backups.append((target, reference)) + return (target, reference) + + def last_target(self): + if self.backups: + return self.backups[-1] + return self.base_target() + + def del_target(self): + for image in self.backups.pop(): + try_remove(image) + self.num -= 1 + + def cleanup(self): + for backup in self.backups: + for image in backup: + try_remove(image) + + class TestIncrementalBackup(iotests.QMPTestCase): def setUp(self): self.bitmaps = list() @@ -73,6 +118,128 @@ class TestIncrementalBackup(iotests.QMPTestCase): iotests.qemu_img('create', '-f', fmt, img, size) self.files.append(img) + + def do_qmp_backup(self, error='Input/output error', **kwargs): + res = self.vm.qmp('drive-backup', **kwargs) + self.assert_qmp(res, 'return', {}) + + event = self.vm.event_wait(name="BLOCK_JOB_COMPLETED", + match={'data': {'device': kwargs['device']}}) + self.assertIsNotNone(event) + + try: + failure = self.dictpath(event, 'data/error') + except AssertionError: + # Backup succeeded. + self.assert_qmp(event, 'data/offset', event['data']['len']) + return True + else: + # Backup failed. + self.assert_qmp(event, 'data/error', error) + return False + + + def create_anchor_backup(self, drive=None): + if drive is None: + drive = self.drives[-1] + res = self.do_qmp_backup(device=drive['id'], sync='full', + format=drive['fmt'], target=drive['backup']) + self.assertTrue(res) + self.files.append(drive['backup']) + return drive['backup'] + + + def make_reference_backup(self, bitmap=None): + if bitmap is None: + bitmap = self.bitmaps[-1] + _, reference = bitmap.last_target() + res = self.do_qmp_backup(device=bitmap.drive['id'], sync='full', + format=bitmap.drive['fmt'], target=reference) + self.assertTrue(res) + + + def add_bitmap(self, name, drive): + bitmap = Bitmap(name, drive) + self.bitmaps.append(bitmap) + result = self.vm.qmp('block-dirty-bitmap-add', node=drive['id'], + name=bitmap.name) + self.assert_qmp(result, 'return', {}) + return bitmap + + + def prepare_backup(self, bitmap=None, parent=None): + if bitmap is None: + bitmap = self.bitmaps[-1] + if parent is None: + parent, _ = bitmap.last_target() + + target, _ = bitmap.new_target() + self.img_create(target, bitmap.drive['fmt'], parent=parent) + return target + + + def create_incremental(self, bitmap=None, parent=None, + parentFormat=None, validate=True): + if bitmap is None: + bitmap = self.bitmaps[-1] + if parent is None: + parent, _ = bitmap.last_target() + + target = self.prepare_backup(bitmap, parent) + res = self.do_qmp_backup(device=bitmap.drive['id'], + sync='dirty-bitmap', bitmap=bitmap.name, + format=bitmap.drive['fmt'], target=target, + mode='existing') + if not res: + bitmap.del_target(); + self.assertFalse(validate) + else: + self.make_reference_backup(bitmap) + return res + + + def check_backups(self): + for bitmap in self.bitmaps: + for incremental, reference in bitmap.backups: + self.assertTrue(iotests.compare_images(incremental, reference)) + last = 
bitmap.last_target()[0] + self.assertTrue(iotests.compare_images(last, bitmap.drive['file'])) + + + def hmp_io_writes(self, drive, patterns): + for pattern in patterns: + self.vm.hmp_qemu_io(drive, 'write -P%s %s %s' % pattern) + self.vm.hmp_qemu_io(drive, 'flush') + + + def test_incremental_simple(self): + ''' + Test: Create and verify three incremental backups. + + Create a bitmap and a full backup before VM execution begins, + then create a series of three incremental backups "during execution," + i.e.; after IO requests begin modifying the drive. + ''' + self.create_anchor_backup() + self.add_bitmap('bitmap0', self.drives[0]) + + # Sanity: Create a "hollow" incremental backup + self.create_incremental() + # Three writes: One complete overwrite, one new segment, + # and one partial overlap. + self.hmp_io_writes(self.drives[0]['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + self.create_incremental() + # Three more writes, one of each kind, like above + self.hmp_io_writes(self.drives[0]['id'], (('0x9a', 0, 512), + ('0x55', '8M', '352k'), + ('0x78', '15872k', '1M'))) + self.create_incremental() + self.vm.shutdown() + self.check_backups() + + def test_sync_dirty_bitmap_missing(self): self.assert_no_active_block_jobs() self.files.append(self.err_img) @@ -93,11 +260,10 @@ class TestIncrementalBackup(iotests.QMPTestCase): def tearDown(self): self.vm.shutdown() + for bitmap in self.bitmaps: + bitmap.cleanup() for filename in self.files: - try: - os.remove(filename) - except OSError: - pass + try_remove(filename) if __name__ == '__main__': diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out index fbc63e6..8d7e9967 100644 --- a/tests/qemu-iotests/124.out +++ b/tests/qemu-iotests/124.out @@ -1,5 +1,5 @@ -.. +... ---------------------------------------------------------------------- -Ran 2 tests +Ran 3 tests OK -- cgit v1.1 From 24618f5381da650bd50c78feea07b35cf82e7d6c Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:08 -0400 Subject: iotests: add incremental backup failure recovery test Test the failure case for incremental backups. Signed-off-by: John Snow Reviewed-by: Max Reitz Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-21-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/124 | 57 ++++++++++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/124.out | 4 ++-- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 index 5c3b434..95f6de5 100644 --- a/tests/qemu-iotests/124 +++ b/tests/qemu-iotests/124 @@ -240,6 +240,63 @@ class TestIncrementalBackup(iotests.QMPTestCase): self.check_backups() + def test_incremental_failure(self): + '''Test: Verify backups made after a failure are correct. + + Simulate a failure during an incremental backup block job, + emulate additional writes, then create another incremental backup + afterwards and verify that the backup created is correct. + ''' + + # Create a blkdebug interface to this img as 'drive1', + # but don't actually create a new image. 
+ drive1 = self.add_node('drive1', self.drives[0]['fmt'], + path=self.drives[0]['file'], + backup=self.drives[0]['backup']) + result = self.vm.qmp('blockdev-add', options={ + 'id': drive1['id'], + 'driver': drive1['fmt'], + 'file': { + 'driver': 'blkdebug', + 'image': { + 'driver': 'file', + 'filename': drive1['file'] + }, + 'set-state': [{ + 'event': 'flush_to_disk', + 'state': 1, + 'new_state': 2 + }], + 'inject-error': [{ + 'event': 'read_aio', + 'errno': 5, + 'state': 2, + 'immediately': False, + 'once': True + }], + } + }) + self.assert_qmp(result, 'return', {}) + + self.create_anchor_backup(self.drives[0]) + self.add_bitmap('bitmap0', drive1) + # Note: at this point, during a normal execution, + # Assume that the VM resumes and begins issuing IO requests here. + + self.hmp_io_writes(drive1['id'], (('0xab', 0, 512), + ('0xfe', '16M', '256k'), + ('0x64', '32736k', '64k'))) + + result = self.create_incremental(validate=False) + self.assertFalse(result) + self.hmp_io_writes(drive1['id'], (('0x9a', 0, 512), + ('0x55', '8M', '352k'), + ('0x78', '15872k', '1M'))) + self.create_incremental() + self.vm.shutdown() + self.check_backups() + + def test_sync_dirty_bitmap_missing(self): self.assert_no_active_block_jobs() self.files.append(self.err_img) diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out index 8d7e9967..89968f3 100644 --- a/tests/qemu-iotests/124.out +++ b/tests/qemu-iotests/124.out @@ -1,5 +1,5 @@ -... +.... ---------------------------------------------------------------------- -Ran 3 tests +Ran 4 tests OK -- cgit v1.1 From 59fc5d844fe192494308d0f07507b712ec395129 Mon Sep 17 00:00:00 2001 From: John Snow Date: Fri, 17 Apr 2015 19:50:09 -0400 Subject: iotests: add incremental backup granularity tests Test what happens if you fiddle with the granularity. Reviewed-by: Max Reitz Signed-off-by: John Snow Reviewed-by: Stefan Hajnoczi Message-id: 1429314609-29776-22-git-send-email-jsnow@redhat.com Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/qemu-iotests/124 | 58 +++++++++++++++++++++++++++++++++++++--------- tests/qemu-iotests/124.out | 4 ++-- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124 index 95f6de5..3ee78cd 100644 --- a/tests/qemu-iotests/124 +++ b/tests/qemu-iotests/124 @@ -158,11 +158,11 @@ class TestIncrementalBackup(iotests.QMPTestCase): self.assertTrue(res) - def add_bitmap(self, name, drive): + def add_bitmap(self, name, drive, **kwargs): bitmap = Bitmap(name, drive) self.bitmaps.append(bitmap) result = self.vm.qmp('block-dirty-bitmap-add', node=drive['id'], - name=bitmap.name) + name=bitmap.name, **kwargs) self.assert_qmp(result, 'return', {}) return bitmap @@ -212,16 +212,9 @@ class TestIncrementalBackup(iotests.QMPTestCase): self.vm.hmp_qemu_io(drive, 'flush') - def test_incremental_simple(self): - ''' - Test: Create and verify three incremental backups. - - Create a bitmap and a full backup before VM execution begins, - then create a series of three incremental backups "during execution," - i.e.; after IO requests begin modifying the drive. 
- ''' + def do_incremental_simple(self, **kwargs): self.create_anchor_backup() - self.add_bitmap('bitmap0', self.drives[0]) + self.add_bitmap('bitmap0', self.drives[0], **kwargs) # Sanity: Create a "hollow" incremental backup self.create_incremental() @@ -240,6 +233,37 @@ class TestIncrementalBackup(iotests.QMPTestCase): self.check_backups() + def test_incremental_simple(self): + ''' + Test: Create and verify three incremental backups. + + Create a bitmap and a full backup before VM execution begins, + then create a series of three incremental backups "during execution," + i.e.; after IO requests begin modifying the drive. + ''' + return self.do_incremental_simple() + + + def test_small_granularity(self): + ''' + Test: Create and verify backups made with a small granularity bitmap. + + Perform the same test as test_incremental_simple, but with a granularity + of only 32KiB instead of the present default of 64KiB. + ''' + return self.do_incremental_simple(granularity=32768) + + + def test_large_granularity(self): + ''' + Test: Create and verify backups made with a large granularity bitmap. + + Perform the same test as test_incremental_simple, but with a granularity + of 128KiB instead of the present default of 64KiB. + ''' + return self.do_incremental_simple(granularity=131072) + + def test_incremental_failure(self): '''Test: Verify backups made after a failure are correct. @@ -315,6 +339,18 @@ class TestIncrementalBackup(iotests.QMPTestCase): self.assert_qmp(result, 'error/class', 'GenericError') + def test_sync_dirty_bitmap_bad_granularity(self): + ''' + Test: Test what happens if we provide an improper granularity. + + The granularity must always be a power of 2. + ''' + self.assert_no_active_block_jobs() + self.assertRaises(AssertionError, self.add_bitmap, + 'bitmap0', self.drives[0], + granularity=64000) + + def tearDown(self): self.vm.shutdown() for bitmap in self.bitmaps: diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out index 89968f3..2f7d390 100644 --- a/tests/qemu-iotests/124.out +++ b/tests/qemu-iotests/124.out @@ -1,5 +1,5 @@ -.... +....... ---------------------------------------------------------------------- -Ran 4 tests +Ran 7 tests OK -- cgit v1.1 From 001c95b740b2ed3d8b486952f68b5f06e609f1f2 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 27 Apr 2015 13:07:31 +0200 Subject: block/mirror: Always call block_job_sleep_ns() The mirror block job is trying to take a clever shortcut if delay_ns is 0 and skips block_job_sleep_ns() in that case. But that function must be called in every block job iteration, because otherwise it is for example impossible to pause the job. 
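A minimal sketch of the loop shape after this change (the lines around the hunk are paraphrased rather than quoted, so treat names and placement as approximate):

    for (;;) {
        /* ... compute dirty count, dispatch copy operations ... */
        if (cnt != 0) {
            delay_ns = mirror_iteration(s);   /* may legitimately return 0 */
        }
        /* Now reached on every iteration, so pause and cancel requests
         * are honored even when there is no rate-limiting delay. */
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        /* ... */
    }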
Signed-off-by: Max Reitz Reviewed-by: Fam Zheng Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/mirror.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index d53e60e..58f391a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -482,9 +482,6 @@ static void coroutine_fn mirror_run(void *opaque) continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); - if (delay_ns == 0) { - continue; - } } } -- cgit v1.1 From 5505e8b76f86f925c35ecc2b2d311886bb36534c Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Mon, 27 Apr 2015 14:51:56 +0300 Subject: block/dmg: make it modular dmg can optionally utilize libbz2; make it modular. Signed-off-by: Michael Tokarev Reviewed-by: Paolo Bonzini Signed-off-by: Kevin Wolf --- block/Makefile.objs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/Makefile.objs b/block/Makefile.objs index db2933e..179e71d 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,4 +1,4 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o @@ -37,6 +37,7 @@ gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) archipelago.o-libs := $(ARCHIPELAGO_LIBS) +block-obj-m += dmg.o dmg.o-libs := $(BZIP2_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio -- cgit v1.1 From 7237aecd7e8fcc3ccf7fded77b6c127b4df5d3ac Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Mon, 27 Apr 2015 22:23:01 +0800 Subject: vmdk: Widen before shifting 32 bit header field Coverity spotted this. The field is 32 bits, so a 32-bit left shift of it can overflow; widen the value to 64 bits before shifting. Signed-off-by: Fam Zheng Reviewed-by: John Snow Signed-off-by: Kevin Wolf --- block/vmdk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/vmdk.c b/block/vmdk.c index fd94b8f..1c5e2ef 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -523,7 +523,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, } ret = vmdk_add_extent(bs, file, false, le32_to_cpu(header.disk_sectors), - le32_to_cpu(header.l1dir_offset) << 9, + (int64_t)le32_to_cpu(header.l1dir_offset) << 9, 0, le32_to_cpu(header.l1dir_size), 4096, -- cgit v1.1 From 4f5472cb2d3d37ec3282cc3829612f9d696c2df7 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 28 Apr 2015 14:27:49 +0100 Subject: block: replace bdrv_states iteration with bdrv_next() The bdrv_states list is a static variable in block.c. bdrv_drain_all() and bdrv_flush_all() use this variable to iterate over all drives. The next patch will move bdrv_drain_all() and bdrv_flush_all() out of block.c, so it's necessary to switch to the public bdrv_next() interface.
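The resulting iteration pattern, extracted as a standalone sketch (it mirrors the hunks below; the per-device body is elided):

    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        /* per-device work, e.g. bdrv_flush(bs) */
        aio_context_release(aio_context);
    }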
Reviewed-by: Alberto Garcia Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index ec23594..1f0a4e2 100644 --- a/block.c +++ b/block.c @@ -2051,9 +2051,9 @@ void bdrv_drain_all(void) { /* Always run first iteration so any pending completion BHs run */ bool busy = true; - BlockDriverState *bs; + BlockDriverState *bs = NULL; - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + while ((bs = bdrv_next(bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -2065,8 +2065,9 @@ void bdrv_drain_all(void) while (busy) { busy = false; + bs = NULL; - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + while ((bs = bdrv_next(bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -2075,7 +2076,8 @@ void bdrv_drain_all(void) } } - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bs = NULL; + while ((bs = bdrv_next(bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); @@ -4015,10 +4017,10 @@ int bdrv_get_flags(BlockDriverState *bs) int bdrv_flush_all(void) { - BlockDriverState *bs; + BlockDriverState *bs = NULL; int result = 0; - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + while ((bs = bdrv_next(bs))) { AioContext *aio_context = bdrv_get_aio_context(bs); int ret; -- cgit v1.1 From e0c47b6cb1de430fbc6f828f7acffa851c580840 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 28 Apr 2015 14:27:50 +0100 Subject: block: add bdrv_set_dirty()/bdrv_reset_dirty() to block_int.h The dirty bitmap functions are called from the block I/O processing code. Make them visible to block_int.h users so they can be used outside block.c. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 12 ++++-------- include/block/block_int.h | 4 ++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index 1f0a4e2..3b865ee 100644 --- a/block.c +++ b/block.c @@ -110,10 +110,6 @@ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -5787,8 +5783,8 @@ void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap) hbitmap_reset(bitmap->bitmap, 0, bitmap->size); } -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { @@ -5799,8 +5795,8 @@ static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, } } -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { diff --git a/include/block/block_int.h b/include/block/block_int.h index e0d5561..5ad20b2 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -626,4 +626,8 @@ bool blk_dev_is_tray_open(BlockBackend *blk); bool blk_dev_is_medium_locked(BlockBackend *blk); void 
blk_dev_resize_cb(BlockBackend *blk); +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors); + #endif /* BLOCK_INT_H */ -- cgit v1.1 From 0eb7217e49b84553bb30f97bc34380633fd846fe Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 28 Apr 2015 14:27:51 +0100 Subject: block: extract bdrv_setup_io_funcs() Move the code to install coroutine and aio emulation function pointers in a BlockDriver to its own function. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 7 ++++++- include/block/block_int.h | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 3b865ee..954d783 100644 --- a/block.c +++ b/block.c @@ -349,7 +349,7 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz, dest, sz, errp); } -void bdrv_register(BlockDriver *bdrv) +void bdrv_setup_io_funcs(BlockDriver *bdrv) { /* Block drivers without coroutine functions need emulation */ if (!bdrv->bdrv_co_readv) { @@ -365,6 +365,11 @@ void bdrv_register(BlockDriver *bdrv) bdrv->bdrv_aio_writev = bdrv_aio_writev_em; } } +} + +void bdrv_register(BlockDriver *bdrv) +{ + bdrv_setup_io_funcs(bdrv); QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); } diff --git a/include/block/block_int.h b/include/block/block_int.h index 5ad20b2..db29b74 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -439,6 +439,14 @@ extern BlockDriver bdrv_file; extern BlockDriver bdrv_raw; extern BlockDriver bdrv_qcow2; +/** + * bdrv_setup_io_funcs: + * + * Prepare a #BlockDriver for I/O request processing by populating + * unimplemented coroutine and AIO interfaces with generic wrapper functions + * that fall back to implemented interfaces. + */ +void bdrv_setup_io_funcs(BlockDriver *bdrv); int get_tmp_filename(char *filename, int size); BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, -- cgit v1.1 From 61007b316cd71ee7333ff7a0a749a8949527575f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 28 Apr 2015 14:27:52 +0100 Subject: block: move I/O request processing to block/io.c The block.c file has grown to over 6000 lines. It is time to split this file so there are fewer conflicts and the code is easier to maintain. Extract I/O request processing code: * Read * Write * Zero writes and making the image empty * Flush * Discard * ioctl * Tracked requests and queuing * Throttling and copy-on-read * Block status and allocated functions * Refreshing block limits * Reading/writing vmstate * qemu_blockalign() and friends The patch simply moves code from block.c into block/io.c. 
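Of the pieces listed above, the tracked-request machinery is the least
self-describing. As an orientation aid, here is a stand-alone reduction
of its overlap test (toy code mirroring tracked_request_overlaps() in
the hunks below; the real version works on BdrvTrackedRequest ranges
rather than bare offsets):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Two byte ranges conflict unless one ends at or before the point
     * where the other begins. */
    static bool ranges_overlap(int64_t off_a, uint64_t bytes_a,
                               int64_t off_b, uint64_t bytes_b)
    {
        /* aaaa bbbb */
        if (off_b >= off_a + (int64_t)bytes_a) {
            return false;
        }
        /* bbbb aaaa */
        if (off_a >= off_b + (int64_t)bytes_b) {
            return false;
        }
        return true;
    }

    int main(void)
    {
        printf("%d\n", ranges_overlap(0, 4096, 4096, 512)); /* 0: adjacent */
        printf("%d\n", ranges_overlap(0, 8192, 4096, 512)); /* 1: overlap */
        return 0;
    }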
Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 3394 +++++++-------------------------------------------- block/Makefile.objs | 2 +- block/io.c | 2540 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 2982 insertions(+), 2954 deletions(-) create mode 100644 block/io.c diff --git a/block.c b/block.c index 954d783..7904098 100644 --- a/block.c +++ b/block.c @@ -30,7 +30,6 @@ #include "qapi/qmp/qjson.h" #include "sysemu/block-backend.h" #include "sysemu/sysemu.h" -#include "sysemu/qtest.h" #include "qemu/notify.h" #include "block/coroutine.h" #include "block/qapi.h" @@ -71,36 +70,6 @@ struct BdrvDirtyBitmap { #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); -static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); - static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -134,110 +103,6 @@ int is_windows_drive(const char *filename) } #endif -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) -{ - int i; - - throttle_config(&bs->throttle_state, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; - - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; - } - } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - - bdrv_start_throttled_reqs(bs); - - throttle_destroy(&bs->throttle_state); -} - -static void bdrv_throttle_read_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[0]); -} - -static void bdrv_throttle_write_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[1]); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs) -{ - int clock_type = QEMU_CLOCK_REALTIME; - - if (qtest_enabled()) { - /* For testing block IO throttling only */ - clock_type = QEMU_CLOCK_VIRTUAL; - } - 
assert(!bs->io_limits_enabled); - throttle_init(&bs->throttle_state, - bdrv_get_aio_context(bs), - clock_type, - bdrv_throttle_read_timer_cb, - bdrv_throttle_write_timer_cb, - bs); - bs->io_limits_enabled = true; -} - -/* This function makes an IO wait if needed - * - * @nb_sectors: the number of sectors of the IO - * @is_write: is the IO a write - */ -static void bdrv_io_limits_intercept(BlockDriverState *bs, - unsigned int bytes, - bool is_write) -{ - /* does this io must wait */ - bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); - - /* if must wait or any request of this type throttled queue the IO */ - if (must_wait || - !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); - } - - /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, is_write, bytes); - - - /* if the next request must wait -> do nothing */ - if (throttle_schedule_timer(&bs->throttle_state, is_write)) { - return; - } - - /* else queue next request for execution */ - qemu_co_queue_next(&bs->throttled_reqs[is_write]); -} - size_t bdrv_opt_mem_align(BlockDriverState *bs) { if (!bs || !bs->drv) { @@ -349,24 +214,6 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz, dest, sz, errp); } -void bdrv_setup_io_funcs(BlockDriver *bdrv) -{ - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } -} - void bdrv_register(BlockDriver *bdrv) { bdrv_setup_io_funcs(bdrv); @@ -541,54 +388,6 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) return ret; } -void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BlockDriver *drv = bs->drv; - Error *local_err = NULL; - - memset(&bs->bl, 0, sizeof(bs->bl)); - - if (!drv) { - return; - } - - /* Take some limits from the children as a default */ - if (bs->file) { - bdrv_refresh_limits(bs->file, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; - bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; - } else { - bs->bl.opt_mem_alignment = 512; - } - - if (bs->backing_hd) { - bdrv_refresh_limits(bs->backing_hd, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing_hd->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing_hd->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing_hd->bl.opt_mem_alignment); - } - - /* Then let the driver override it */ - if (drv->bdrv_refresh_limits) { - drv->bdrv_refresh_limits(bs, errp); - } -} - /** * Try to get @bs's logical and physical block size. * On success, store them in @bsz struct and return 0. 
@@ -862,22 +661,6 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } -/** - * The copy-on-read flag is actually a reference count so multiple users may - * use the feature without worrying about clobbering its previous state. - * Copy-on-read stays enabled until all users have called to disable it. - */ -void bdrv_enable_copy_on_read(BlockDriverState *bs) -{ - bs->copy_on_read++; -} - -void bdrv_disable_copy_on_read(BlockDriverState *bs) -{ - assert(bs->copy_on_read > 0); - bs->copy_on_read--; -} - /* * Returns the flags that a temporary snapshot should get, based on the * originally requested flags (the originally requested image will have flags @@ -1987,108 +1770,6 @@ void bdrv_close_all(void) } } -/* Check if any requests are in-flight (including throttled requests) */ -static bool bdrv_requests_pending(BlockDriverState *bs) -{ - if (!QLIST_EMPTY(&bs->tracked_requests)) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } - if (bs->file && bdrv_requests_pending(bs->file)) { - return true; - } - if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { - return true; - } - return false; -} - -static bool bdrv_drain_one(BlockDriverState *bs) -{ - bool bs_busy; - - bdrv_flush_io_queue(bs); - bdrv_start_throttled_reqs(bs); - bs_busy = bdrv_requests_pending(bs); - bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); - return bs_busy; -} - -/* - * Wait for pending requests to complete on a single BlockDriverState subtree - * - * See the warning in bdrv_drain_all(). This function can only be called if - * you are sure nothing can generate I/O because you have op blockers - * installed. - * - * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState - * AioContext. - */ -void bdrv_drain(BlockDriverState *bs) -{ - while (bdrv_drain_one(bs)) { - /* Keep iterating */ - } -} - -/* - * Wait for pending requests to complete across all BlockDriverStates - * - * This function does not flush data to disk, use bdrv_flush_all() for that - * after calling this function. - * - * Note that completion of an asynchronous I/O operation can trigger any - * number of other I/O operations on other devices---for example a coroutine - * can be arbitrarily complex and a constant flow of I/O can come until the - * coroutine is complete. Because of this, it is not possible to have a - * function to drain a single device's I/O queue. - */ -void bdrv_drain_all(void) -{ - /* Always run first iteration so any pending completion BHs run */ - bool busy = true; - BlockDriverState *bs = NULL; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - if (bs->job) { - block_job_pause(bs->job); - } - aio_context_release(aio_context); - } - - while (busy) { - busy = false; - bs = NULL; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - busy |= bdrv_drain_one(bs); - aio_context_release(aio_context); - } - } - - bs = NULL; - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - if (bs->job) { - block_job_resume(bs->job); - } - aio_context_release(aio_context); - } -} - /* make a BlockDriverState anonymous by removing from bdrv_state and * graph_bdrv_state list. 
Also, NULL terminate the device_name to prevent double remove */ @@ -2410,152 +2091,6 @@ int bdrv_commit_all(void) return 0; } -/** - * Remove an active request from the tracked requests list - * - * This function should be called when a tracked request is completing. - */ -static void tracked_request_end(BdrvTrackedRequest *req) -{ - if (req->serialising) { - req->bs->serialising_in_flight--; - } - - QLIST_REMOVE(req, list); - qemu_co_queue_restart_all(&req->wait_queue); -} - -/** - * Add an active request to the tracked requests list - */ -static void tracked_request_begin(BdrvTrackedRequest *req, - BlockDriverState *bs, - int64_t offset, - unsigned int bytes, bool is_write) -{ - *req = (BdrvTrackedRequest){ - .bs = bs, - .offset = offset, - .bytes = bytes, - .is_write = is_write, - .co = qemu_coroutine_self(), - .serialising = false, - .overlap_offset = offset, - .overlap_bytes = bytes, - }; - - qemu_co_queue_init(&req->wait_queue); - - QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); -} - -static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) -{ - int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - - overlap_offset; - - if (!req->serialising) { - req->bs->serialising_in_flight++; - req->serialising = true; - } - - req->overlap_offset = MIN(req->overlap_offset, overlap_offset); - req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); -} - -/** - * Round a region to cluster boundaries - */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) -{ - BlockDriverInfo bdi; - - if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_sector_num = sector_num; - *cluster_nb_sectors = nb_sectors; - } else { - int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; - *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); - *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + - nb_sectors, c); - } -} - -static int bdrv_get_cluster_size(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - int ret; - - ret = bdrv_get_info(bs, &bdi); - if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; - } else { - return bdi.cluster_size; - } -} - -static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) -{ - /* aaaa bbbb */ - if (offset >= req->overlap_offset + req->overlap_bytes) { - return false; - } - /* bbbb aaaa */ - if (req->overlap_offset >= offset + bytes) { - return false; - } - return true; -} - -static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) -{ - BlockDriverState *bs = self->bs; - BdrvTrackedRequest *req; - bool retry; - bool waited = false; - - if (!bs->serialising_in_flight) { - return false; - } - - do { - retry = false; - QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self || (!req->serialising && !self->serialising)) { - continue; - } - if (tracked_request_overlaps(req, self->overlap_offset, - self->overlap_bytes)) - { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. - */ - assert(qemu_coroutine_self() != req->co); - - /* If the request is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). 
*/ - if (!req->waiting_for) { - self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); - self->waiting_for = NULL; - retry = true; - waited = true; - break; - } - } - } - } while (retry); - - return waited; -} - /* * Return values: * 0 - success @@ -2724,1127 +2259,254 @@ exit: return ret; } - -static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, - size_t size) +/** + * Truncate file to 'offset' bytes (needed only for file protocols) + */ +int bdrv_truncate(BlockDriverState *bs, int64_t offset) { - if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { - return -EIO; - } - - if (!bdrv_is_inserted(bs)) { + BlockDriver *drv = bs->drv; + int ret; + if (!drv) return -ENOMEDIUM; - } + if (!drv->bdrv_truncate) + return -ENOTSUP; + if (bs->read_only) + return -EACCES; - if (offset < 0) { - return -EIO; + ret = drv->bdrv_truncate(bs, offset); + if (ret == 0) { + ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + bdrv_dirty_bitmap_truncate(bs); + if (bs->blk) { + blk_dev_resize_cb(bs->blk); + } } - - return 0; + return ret; } -static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +/** + * Length of a allocated file in bytes. Sparse files are counted by actual + * allocated space. Return < 0 if error or unknown. + */ +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) { - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EIO; + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; } - - return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE); -} - -typedef struct RwCo { - BlockDriverState *bs; - int64_t offset; - QEMUIOVector *qiov; - bool is_write; - int ret; - BdrvRequestFlags flags; -} RwCo; - -static void coroutine_fn bdrv_rw_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + if (drv->bdrv_get_allocated_file_size) { + return drv->bdrv_get_allocated_file_size(bs); + } + if (bs->file) { + return bdrv_get_allocated_file_size(bs->file); } + return -ENOTSUP; } -/* - * Process a vectored synchronous request using coroutines +/** + * Return number of sectors on success, -errno on error. */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) +int64_t bdrv_nb_sectors(BlockDriverState *bs) { - Coroutine *co; - RwCo rwco = { - .bs = bs, - .offset = offset, - .qiov = qiov, - .is_write = is_write, - .ret = NOT_DONE, - .flags = flags, - }; - - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. 
- */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } + BlockDriver *drv = bs->drv; - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_rw_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); + if (!drv) + return -ENOMEDIUM; - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); + if (drv->has_variable_length) { + int ret = refresh_total_sectors(bs, bs->total_sectors); + if (ret < 0) { + return ret; } } - return rwco.ret; + return bs->total_sectors; } -/* - * Process a synchronous request using coroutines +/** + * Return length in bytes on success, -errno on error. + * The length is always a multiple of BDRV_SECTOR_SIZE. */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors, bool is_write, BdrvRequestFlags flags) +int64_t bdrv_getlength(BlockDriverState *bs) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, - &qiov, is_write, flags); -} + int64_t ret = bdrv_nb_sectors(bs); -/* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); + return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; } -/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ -int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +/* return 0 as number of sectors if no device present or error */ +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) { - bool enabled; - int ret; + int64_t nb_sectors = bdrv_nb_sectors(bs); - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; + *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; } -/* Return < 0 if error. Important errors are: - -EIO generic I/O error (may happen for all errors) - -ENOMEDIUM No media inserted. - -EINVAL Invalid sector number or nb_sectors - -EACCES Trying to write a read-only device -*/ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, + BlockdevOnError on_write_error) { - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); + bs->on_read_error = on_read_error; + bs->on_write_error = on_write_error; } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); + return is_read ? bs->on_read_error : bs->on_write_error; } -/* - * Completely zero out a block device with the help of bdrv_write_zeroes. - * The operation is sped up by checking the block status and only writing - * zeroes to the device if they currently do not return zeroes. 
Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). - * - * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). - */ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) { - int64_t target_sectors, ret, nb_sectors, sector_num = 0; - int n; - - target_sectors = bdrv_nb_sectors(bs); - if (target_sectors < 0) { - return target_sectors; - } + BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; - for (;;) { - nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); - if (nb_sectors <= 0) { - return 0; - } - ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); - if (ret < 0) { - error_report("error getting block status at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - if (ret & BDRV_BLOCK_ZERO) { - sector_num += n; - continue; - } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); - if (ret < 0) { - error_report("error writing zeroes at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - sector_num += n; + switch (on_err) { + case BLOCKDEV_ON_ERROR_ENOSPC: + return (error == ENOSPC) ? + BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_STOP: + return BLOCK_ERROR_ACTION_STOP; + case BLOCKDEV_ON_ERROR_REPORT: + return BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_IGNORE: + return BLOCK_ERROR_ACTION_IGNORE; + default: + abort(); } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +static void send_qmp_error_event(BlockDriverState *bs, + BlockErrorAction action, + bool is_read, int error) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = bytes, - }; - int ret; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } + IoOperationType optype; - return bytes; + optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; + qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, + bdrv_iostatus_is_enabled(bs), + error == ENOSPC, strerror(error), + &error_abort); } -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +/* This is done by device models because, while the block layer knows + * about the error, it does not know whether an operation comes from + * the device or the block layer (from a job, for example). + */ +void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, + bool is_read, int error) { - int ret; + assert(error >= 0); - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); - if (ret < 0) { - return ret; - } + if (action == BLOCK_ERROR_ACTION_STOP) { + /* First set the iostatus, so that "info block" returns an iostatus + * that matches the events raised so far (an additional error iostatus + * is fine, but not a lost one). + */ + bdrv_iostatus_set_err(bs, error); - return qiov->size; + /* Then raise the request to stop the VM and the event. + * qemu_system_vmstop_request_prepare has two effects. First, + * it ensures that the STOP event always comes after the + * BLOCK_IO_ERROR event. Second, it ensures that even if management + * can observe the STOP event and do a "cont" before the STOP + * event is issued, the VM will not stop. In this case, vm_start() + * also ensures that the STOP/RESUME pair of events is emitted. 
+ */ + qemu_system_vmstop_request_prepare(); + send_qmp_error_event(bs, action, is_read, error); + qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else { + send_qmp_error_event(bs, action, is_read, error); + } } -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) +int bdrv_is_read_only(BlockDriverState *bs) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = bytes, - }; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); + return bs->read_only; } -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. - */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) +int bdrv_is_sg(BlockDriverState *bs) { - int ret; - - ret = bdrv_pwrite(bs, offset, buf, count); - if (ret < 0) { - return ret; - } - - /* No flush needed for cache modes that already do it */ - if (bs->enable_write_cache) { - bdrv_flush(bs); - } - - return 0; + return bs->sg; } -static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +int bdrv_enable_write_cache(BlockDriverState *bs) { - /* Perform I/O through a temporary buffer so that users who scribble over - * their read buffer while the operation is in progress do not end up - * modifying the image file. This is critical for zero-copy guest I/O - * where anything might happen inside guest memory. - */ - void *bounce_buffer; - - BlockDriver *drv = bs->drv; - struct iovec iov; - QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; - size_t skip_bytes; - int ret; - - /* Cover entire cluster so no additional backing file I/O is required when - * allocating cluster in the image file. - */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); - - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); - - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); - if (bounce_buffer == NULL) { - ret = -ENOMEM; - goto err; - } - - qemu_iovec_init_external(&bounce_qiov, &iov, 1); - - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - if (ret < 0) { - goto err; - } - - if (drv->bdrv_co_write_zeroes && - buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); - } else { - /* This does not change the data on the disk, it is not necessary - * to flush even in cache=writethrough mode. - */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - } - - if (ret < 0) { - /* It might be okay to ignore write errors for guest requests. If this - * is a deliberate copy-on-read then we don't want to ignore the error. - * Simply report it in all cases. - */ - goto err; - } - - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); - -err: - qemu_vfree(bounce_buffer); - return ret; + return bs->enable_write_cache; } -/* - * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. 
- */ -static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - int64_t align, QEMUIOVector *qiov, int flags) +void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) { - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - /* Handle Copy on Read and associated serialisation */ - if (flags & BDRV_REQ_COPY_ON_READ) { - /* If we touch the same cluster it counts as an overlap. This - * guarantees that allocating writes will be serialized and not race - * with each other for the same cluster. For example, in copy-on-read - * it ensures that the CoR read and write operations are atomic and - * guest writes cannot interleave between them. */ - mark_request_serialising(req, bdrv_get_cluster_size(bs)); - } - - wait_serialising_requests(req); - - if (flags & BDRV_REQ_COPY_ON_READ) { - int pnum; - - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); - if (ret < 0) { - goto out; - } - - if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); - goto out; - } - } + bs->enable_write_cache = wce; - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + /* so a reopen() will preserve wce */ + if (wce) { + bs->open_flags |= BDRV_O_CACHE_WB; } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } - - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); - - qemu_iovec_destroy(&local_qiov); - } else { - ret = 0; - } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num + nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); - } + bs->open_flags &= ~BDRV_O_CACHE_WB; } - -out: - return ret; } -static inline uint64_t bdrv_get_align(BlockDriverState *bs) +int bdrv_is_encrypted(BlockDriverState *bs) { - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + if (bs->backing_hd && bs->backing_hd->encrypted) + return 1; + return bs->encrypted; } -static inline bool bdrv_req_is_aligned(BlockDriverState *bs, - int64_t offset, size_t bytes) +int bdrv_key_required(BlockDriverState *bs) { - int64_t align = bdrv_get_align(bs); - return !(offset & (align - 1) || (bytes & (align - 1))); + BlockDriverState *backing_hd = bs->backing_hd; + + if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) + return 1; + return (bs->encrypted && !bs->valid_key); } -/* - * Handle a read request in coroutine context - */ -static int coroutine_fn 
bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) +int bdrv_set_key(BlockDriverState *bs, const char *key) { - BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; - - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; int ret; - - if (!drv) { + if (bs->backing_hd && bs->backing_hd->encrypted) { + ret = bdrv_set_key(bs->backing_hd, key); + if (ret < 0) + return ret; + if (!bs->encrypted) + return 0; + } + if (!bs->encrypted) { + return -EINVAL; + } else if (!bs->drv || !bs->drv->bdrv_set_key) { return -ENOMEDIUM; } - - ret = bdrv_check_byte_request(bs, offset, bytes); + ret = bs->drv->bdrv_set_key(bs, key); if (ret < 0) { - return ret; - } - - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, false); + bs->valid_key = 0; + } else if (!bs->valid_key) { + bs->valid_key = 1; + if (bs->blk) { + /* call the change callback now, we skipped it on open */ + blk_dev_change_media_cb(bs->blk, true); + } } - - /* Align read if necessary by padding qiov */ - if (offset & (align - 1)) { - head_buf = qemu_blockalign(bs, align); - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - tail_buf = qemu_blockalign(bs, align); - qemu_iovec_add(&local_qiov, tail_buf, - align - ((offset + bytes) & (align - 1))); - - bytes = ROUND_UP(bytes, align); - } - - tracked_request_begin(&req, bs, offset, bytes, false); - ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, - use_local_qiov ? 
&local_qiov : qiov, - flags); - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - } - - return ret; -} - -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - QEMUIOVector qiov; - struct iovec iov = {0}; - int ret = 0; - - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); - - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; - - /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. - */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } - } - - /* limit request size */ - if (num > max_write_zeroes) { - num = max_write_zeroes; - } - - ret = -ENOTSUP; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); - } - - if (ret == -ENOTSUP) { - /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, - MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; - if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); - if (iov.iov_base == NULL) { - ret = -ENOMEM; - goto fail; - } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); - } - qemu_iovec_init_external(&qiov, &iov, 1); - - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); - - /* Keep bounce buffer around if it is big enough for all - * all future requests. - */ - if (num < max_xfer_len) { - qemu_vfree(iov.iov_base); - iov.iov_base = NULL; - } - } - - sector_num += num; - nb_sectors -= num; - } - -fail: - qemu_vfree(iov.iov_base); - return ret; -} + return ret; +} /* - * Forwards an already correctly aligned write request to the BlockDriver. + * Provide an encryption key for @bs. + * If @key is non-null: + * If @bs is not encrypted, fail. + * Else if the key is invalid, fail. 
+ * Else set @bs's key to @key, replacing the existing key, if any. + * If @key is null: + * If @bs is encrypted and still lacks a key, fail. + * Else do nothing. + * On failure, store an error object through @errp if non-null. */ -static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) +void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) { - BlockDriver *drv = bs->drv; - bool waited; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); - - if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && - qemu_iovec_is_zero(qiov)) { - flags |= BDRV_REQ_ZERO_WRITE; - if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { - flags |= BDRV_REQ_MAY_UNMAP; - } - } - - if (ret < 0) { - /* Do nothing, write notifier decided to fail this request */ - } else if (flags & BDRV_REQ_ZERO_WRITE) { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); - - if (ret == 0 && !bs->enable_write_cache) { - ret = bdrv_co_flush(bs); - } - - bdrv_set_dirty(bs, sector_num, nb_sectors); - - block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); - - if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); - } - - return ret; -} - -/* - * Handle a write request in coroutine context - */ -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BdrvTrackedRequest req; - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, true); - } - - /* - * Align write if necessary by performing a read-modify-write cycle. - * Pad qiov with the read parts and be sure to have a tracked request not - * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
- */ - tracked_request_begin(&req, bs, offset, bytes, true); - - if (offset & (align - 1)) { - QEMUIOVector head_qiov; - struct iovec head_iov; - - mark_request_serialising(&req, align); - wait_serialising_requests(&req); - - head_buf = qemu_blockalign(bs, align); - head_iov = (struct iovec) { - .iov_base = head_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&head_qiov, &head_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); - ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, - align, &head_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - QEMUIOVector tail_qiov; - struct iovec tail_iov; - size_t tail_bytes; - bool waited; - - mark_request_serialising(&req, align); - waited = wait_serialising_requests(&req); - assert(!waited || !use_local_qiov); - - tail_buf = qemu_blockalign(bs, align); - tail_iov = (struct iovec) { - .iov_base = tail_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); - ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, - align, &tail_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); - - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - - tail_bytes = (offset + bytes) & (align - 1); - qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); - - bytes = ROUND_UP(bytes, align); - } - - if (use_local_qiov) { - /* Local buffer may have non-zero data. */ - flags &= ~BDRV_REQ_ZERO_WRITE; - } - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, - use_local_qiov ? 
&local_qiov : qiov, - flags); - -fail: - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - } - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - - return ret; -} - -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_writev(bs, sector_num, nb_sectors); - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) -{ - int ret; - - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); - - if (!(bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS)) { - ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); - } else { - uint8_t *buf; - QEMUIOVector local_qiov; - size_t bytes = nb_sectors << BDRV_SECTOR_BITS; - - buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes); - memset(buf, 0, bytes); - qemu_iovec_init(&local_qiov, 1); - qemu_iovec_add(&local_qiov, buf, bytes); - - ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov, - BDRV_REQ_ZERO_WRITE | flags); - qemu_vfree(buf); - } - return ret; -} - -/** - * Truncate file to 'offset' bytes (needed only for file protocols) - */ -int bdrv_truncate(BlockDriverState *bs, int64_t offset) -{ - BlockDriver *drv = bs->drv; - int ret; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_truncate) - return -ENOTSUP; - if (bs->read_only) - return -EACCES; - - ret = drv->bdrv_truncate(bs, offset); - if (ret == 0) { - ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); - bdrv_dirty_bitmap_truncate(bs); - if (bs->blk) { - blk_dev_resize_cb(bs->blk); - } - } - return ret; -} - -/** - * Length of a allocated file in bytes. Sparse files are counted by actual - * allocated space. Return < 0 if error or unknown. - */ -int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_get_allocated_file_size) { - return drv->bdrv_get_allocated_file_size(bs); - } - if (bs->file) { - return bdrv_get_allocated_file_size(bs->file); - } - return -ENOTSUP; -} - -/** - * Return number of sectors on success, -errno on error. - */ -int64_t bdrv_nb_sectors(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - - if (!drv) - return -ENOMEDIUM; - - if (drv->has_variable_length) { - int ret = refresh_total_sectors(bs, bs->total_sectors); - if (ret < 0) { - return ret; - } - } - return bs->total_sectors; -} - -/** - * Return length in bytes on success, -errno on error. - * The length is always a multiple of BDRV_SECTOR_SIZE. - */ -int64_t bdrv_getlength(BlockDriverState *bs) -{ - int64_t ret = bdrv_nb_sectors(bs); - - return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; -} - -/* return 0 as number of sectors if no device present or error */ -void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) -{ - int64_t nb_sectors = bdrv_nb_sectors(bs); - - *nb_sectors_ptr = nb_sectors < 0 ? 
0 : nb_sectors; -} - -void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, - BlockdevOnError on_write_error) -{ - bs->on_read_error = on_read_error; - bs->on_write_error = on_write_error; -} - -BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) -{ - return is_read ? bs->on_read_error : bs->on_write_error; -} - -BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) -{ - BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; - - switch (on_err) { - case BLOCKDEV_ON_ERROR_ENOSPC: - return (error == ENOSPC) ? - BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_STOP: - return BLOCK_ERROR_ACTION_STOP; - case BLOCKDEV_ON_ERROR_REPORT: - return BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_IGNORE: - return BLOCK_ERROR_ACTION_IGNORE; - default: - abort(); - } -} - -static void send_qmp_error_event(BlockDriverState *bs, - BlockErrorAction action, - bool is_read, int error) -{ - IoOperationType optype; - - optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; - qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, - bdrv_iostatus_is_enabled(bs), - error == ENOSPC, strerror(error), - &error_abort); -} - -/* This is done by device models because, while the block layer knows - * about the error, it does not know whether an operation comes from - * the device or the block layer (from a job, for example). - */ -void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, - bool is_read, int error) -{ - assert(error >= 0); - - if (action == BLOCK_ERROR_ACTION_STOP) { - /* First set the iostatus, so that "info block" returns an iostatus - * that matches the events raised so far (an additional error iostatus - * is fine, but not a lost one). - */ - bdrv_iostatus_set_err(bs, error); - - /* Then raise the request to stop the VM and the event. - * qemu_system_vmstop_request_prepare has two effects. First, - * it ensures that the STOP event always comes after the - * BLOCK_IO_ERROR event. Second, it ensures that even if management - * can observe the STOP event and do a "cont" before the STOP - * event is issued, the VM will not stop. In this case, vm_start() - * also ensures that the STOP/RESUME pair of events is emitted. 
- */ - qemu_system_vmstop_request_prepare(); - send_qmp_error_event(bs, action, is_read, error); - qemu_system_vmstop_request(RUN_STATE_IO_ERROR); - } else { - send_qmp_error_event(bs, action, is_read, error); - } -} - -int bdrv_is_read_only(BlockDriverState *bs) -{ - return bs->read_only; -} - -int bdrv_is_sg(BlockDriverState *bs) -{ - return bs->sg; -} - -int bdrv_enable_write_cache(BlockDriverState *bs) -{ - return bs->enable_write_cache; -} - -void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) -{ - bs->enable_write_cache = wce; - - /* so a reopen() will preserve wce */ - if (wce) { - bs->open_flags |= BDRV_O_CACHE_WB; - } else { - bs->open_flags &= ~BDRV_O_CACHE_WB; - } -} - -int bdrv_is_encrypted(BlockDriverState *bs) -{ - if (bs->backing_hd && bs->backing_hd->encrypted) - return 1; - return bs->encrypted; -} - -int bdrv_key_required(BlockDriverState *bs) -{ - BlockDriverState *backing_hd = bs->backing_hd; - - if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) - return 1; - return (bs->encrypted && !bs->valid_key); -} - -int bdrv_set_key(BlockDriverState *bs, const char *key) -{ - int ret; - if (bs->backing_hd && bs->backing_hd->encrypted) { - ret = bdrv_set_key(bs->backing_hd, key); - if (ret < 0) - return ret; - if (!bs->encrypted) - return 0; - } - if (!bs->encrypted) { - return -EINVAL; - } else if (!bs->drv || !bs->drv->bdrv_set_key) { - return -ENOMEDIUM; - } - ret = bs->drv->bdrv_set_key(bs, key); - if (ret < 0) { - bs->valid_key = 0; - } else if (!bs->valid_key) { - bs->valid_key = 1; - if (bs->blk) { - /* call the change callback now, we skipped it on open */ - blk_dev_change_media_cb(bs->blk, true); - } - } - return ret; -} - -/* - * Provide an encryption key for @bs. - * If @key is non-null: - * If @bs is not encrypted, fail. - * Else if the key is invalid, fail. - * Else set @bs's key to @key, replacing the existing key, if any. - * If @key is null: - * If @bs is encrypted and still lacks a key, fail. - * Else do nothing. - * On failure, store an error object through @errp if non-null. - */ -void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) -{ - if (key) { - if (!bdrv_is_encrypted(bs)) { - error_setg(errp, "Node '%s' is not encrypted", - bdrv_get_device_or_node_name(bs)); - } else if (bdrv_set_key(bs, key) < 0) { - error_set(errp, QERR_INVALID_PASSWORD); + if (key) { + if (!bdrv_is_encrypted(bs)) { + error_setg(errp, "Node '%s' is not encrypted", + bdrv_get_device_or_node_name(bs)); + } else if (bdrv_set_key(bs, key) < 0) { + error_set(errp, QERR_INVALID_PASSWORD); } } else { if (bdrv_key_required(bs)) { @@ -3856,1334 +2518,409 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) } } -const char *bdrv_get_format_name(BlockDriverState *bs) -{ - return bs->drv ? 
bs->drv->format_name : NULL; -} - -static int qsort_strcmp(const void *a, const void *b) -{ - return strcmp(a, b); -} - -void bdrv_iterate_format(void (*it)(void *opaque, const char *name), - void *opaque) -{ - BlockDriver *drv; - int count = 0; - int i; - const char **formats = NULL; - - QLIST_FOREACH(drv, &bdrv_drivers, list) { - if (drv->format_name) { - bool found = false; - int i = count; - while (formats && i && !found) { - found = !strcmp(formats[--i], drv->format_name); - } - - if (!found) { - formats = g_renew(const char *, formats, count + 1); - formats[count++] = drv->format_name; - } - } - } - - qsort(formats, count, sizeof(formats[0]), qsort_strcmp); - - for (i = 0; i < count; i++) { - it(opaque, formats[i]); - } - - g_free(formats); -} - -/* This function is to find a node in the bs graph */ -BlockDriverState *bdrv_find_node(const char *node_name) -{ - BlockDriverState *bs; - - assert(node_name); - - QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { - if (!strcmp(node_name, bs->node_name)) { - return bs; - } - } - return NULL; -} - -/* Put this QMP function here so it can access the static graph_bdrv_states. */ -BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp) -{ - BlockDeviceInfoList *list, *entry; - BlockDriverState *bs; - - list = NULL; - QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { - BlockDeviceInfo *info = bdrv_block_device_info(bs, errp); - if (!info) { - qapi_free_BlockDeviceInfoList(list); - return NULL; - } - entry = g_malloc0(sizeof(*entry)); - entry->value = info; - entry->next = list; - list = entry; - } - - return list; -} - -BlockDriverState *bdrv_lookup_bs(const char *device, - const char *node_name, - Error **errp) -{ - BlockBackend *blk; - BlockDriverState *bs; - - if (device) { - blk = blk_by_name(device); - - if (blk) { - return blk_bs(blk); - } - } - - if (node_name) { - bs = bdrv_find_node(node_name); - - if (bs) { - return bs; - } - } - - error_setg(errp, "Cannot find device=%s nor node_name=%s", - device ? device : "", - node_name ? node_name : ""); - return NULL; -} - -/* If 'base' is in the same chain as 'top', return true. Otherwise, - * return false. If either argument is NULL, return false. */ -bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) -{ - while (top && top != base) { - top = top->backing_hd; - } - - return top != NULL; -} - -BlockDriverState *bdrv_next_node(BlockDriverState *bs) -{ - if (!bs) { - return QTAILQ_FIRST(&graph_bdrv_states); - } - return QTAILQ_NEXT(bs, node_list); -} - -BlockDriverState *bdrv_next(BlockDriverState *bs) -{ - if (!bs) { - return QTAILQ_FIRST(&bdrv_states); - } - return QTAILQ_NEXT(bs, device_list); -} - -const char *bdrv_get_node_name(const BlockDriverState *bs) -{ - return bs->node_name; -} - -/* TODO check what callers really want: bs->node_name or blk_name() */ -const char *bdrv_get_device_name(const BlockDriverState *bs) -{ - return bs->blk ? blk_name(bs->blk) : ""; -} - -/* This can be used to identify nodes that might not have a device - * name associated. Since node and device names live in the same - * namespace, the result is unambiguous. The exception is if both are - * absent, then this returns an empty (non-null) string. */ -const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) -{ - return bs->blk ? 
blk_name(bs->blk) : bs->node_name; -} - -int bdrv_get_flags(BlockDriverState *bs) -{ - return bs->open_flags; -} - -int bdrv_flush_all(void) -{ - BlockDriverState *bs = NULL; - int result = 0; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - int ret; - - aio_context_acquire(aio_context); - ret = bdrv_flush(bs); - if (ret < 0 && !result) { - result = ret; - } - aio_context_release(aio_context); - } - - return result; -} - -int bdrv_has_zero_init_1(BlockDriverState *bs) -{ - return 1; -} - -int bdrv_has_zero_init(BlockDriverState *bs) -{ - assert(bs->drv); - - /* If BS is a copy on write image, it is initialized to - the contents of the base image, which may not be zeroes. */ - if (bs->backing_hd) { - return 0; - } - if (bs->drv->bdrv_has_zero_init) { - return bs->drv->bdrv_has_zero_init(bs); - } - - /* safe default */ - return 0; -} - -bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - - if (bs->backing_hd) { - return false; - } - - if (bdrv_get_info(bs, &bdi) == 0) { - return bdi.unallocated_blocks_are_zero; - } - - return false; -} - -bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - - if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { - return false; - } - - if (bdrv_get_info(bs, &bdi) == 0) { - return bdi.can_write_zeroes_with_unmap; - } - - return false; -} - -typedef struct BdrvCoGetBlockStatusData { - BlockDriverState *bs; - BlockDriverState *base; - int64_t sector_num; - int nb_sectors; - int *pnum; - int64_t ret; - bool done; -} BdrvCoGetBlockStatusData; - -/* - * Returns the allocation status of the specified sectors. - * Drivers not implementing the functionality are assumed to not support - * backing files, hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. 
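- *
- * (Worked example with hypothetical numbers: if the first 16 sectors of
- * the image are unallocated and the caller queries sector 0 with
- * nb_sectors == 64, the function sets *pnum to 16 and returns a status
- * without BDRV_BLOCK_DATA set, assuming the driver reports the whole
- * unallocated run in one answer.)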
- */ -static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t total_sectors; - int64_t n; - int64_t ret, ret2; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - return total_sectors; - } - - if (sector_num >= total_sectors) { - *pnum = 0; - return 0; - } - - n = total_sectors - sector_num; - if (n < nb_sectors) { - nb_sectors = n; - } - - if (!bs->drv->bdrv_co_get_block_status) { - *pnum = nb_sectors; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; - if (bs->drv->protocol_name) { - ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); - } - return ret; - } - - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - *pnum = 0; - return ret; - } - - if (ret & BDRV_BLOCK_RAW) { - assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, pnum); - } - - if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { - ret |= BDRV_BLOCK_ALLOCATED; - } - - if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { - if (bdrv_unallocated_blocks_are_zero(bs)) { - ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing_hd) { - BlockDriverState *bs2 = bs->backing_hd; - int64_t nb_sectors2 = bdrv_nb_sectors(bs2); - if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { - ret |= BDRV_BLOCK_ZERO; - } - } - } - - if (bs->file && - (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && - (ret & BDRV_BLOCK_OFFSET_VALID)) { - int file_pnum; - - ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum); - if (ret2 >= 0) { - /* Ignore errors. This is just providing extra information, it - * is useful but not necessary. - */ - if (!file_pnum) { - /* !file_pnum indicates an offset at or beyond the EOF; it is - * perfectly valid for the format block driver to point to such - * offsets, so catch it and mark everything as zero */ - ret |= BDRV_BLOCK_ZERO; - } else { - /* Limit request to the range reported by the protocol driver */ - *pnum = file_pnum; - ret |= (ret2 & BDRV_BLOCK_ZERO); - } - } - } - - return ret; -} - -/* Coroutine wrapper for bdrv_get_block_status() */ -static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) -{ - BdrvCoGetBlockStatusData *data = opaque; - BlockDriverState *bs = data->bs; - - data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, - data->pnum); - data->done = true; -} - -/* - * Synchronous wrapper around bdrv_co_get_block_status(). - * - * See bdrv_co_get_block_status() for details. - */ -int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - Coroutine *co; - BdrvCoGetBlockStatusData data = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_get_block_status_co_entry(&data); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_get_block_status_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - aio_poll(aio_context, true); - } - } - return data.ret; -} - -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - return ret; - } - return !!(ret & BDRV_BLOCK_ALLOCATED); -} - -/* - * Given an image chain: ... 
-> [BASE] -> [INTER1] -> [INTER2] -> [TOP] - * - * Return true if the given sector is allocated in any image between - * BASE and TOP (inclusive). BASE can be NULL to check if the given - * sector is allocated in any image of the chain. Return false otherwise. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - */ -int bdrv_is_allocated_above(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *intermediate; - int ret, n = nb_sectors; - - intermediate = top; - while (intermediate && intermediate != base) { - int pnum_inter; - ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); - if (ret < 0) { - return ret; - } else if (ret) { - *pnum = pnum_inter; - return 1; - } - - /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. - */ - if (n > pnum_inter && - (intermediate == top || - sector_num + pnum_inter < intermediate->total_sectors)) { - n = pnum_inter; - } - - intermediate = intermediate->backing_hd; - } - - *pnum = n; - return 0; -} - -const char *bdrv_get_encrypted_filename(BlockDriverState *bs) -{ - if (bs->backing_hd && bs->backing_hd->encrypted) - return bs->backing_file; - else if (bs->encrypted) - return bs->filename; - else - return NULL; -} - -void bdrv_get_backing_filename(BlockDriverState *bs, - char *filename, int filename_size) -{ - pstrcpy(filename, filename_size, bs->backing_file); -} - -int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BlockDriver *drv = bs->drv; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - if (!drv->bdrv_write_compressed) { - return -ENOTSUP; - } - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - - return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); -} - -int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_get_info) - return -ENOTSUP; - memset(bdi, 0, sizeof(*bdi)); - return drv->bdrv_get_info(bs, bdi); -} - -ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_get_specific_info) { - return drv->bdrv_get_specific_info(bs); - } - return NULL; -} - -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - BlockDriver *drv = bs->drv; - - if (!drv) { - return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); - } else if (bs->file) { - return bdrv_writev_vmstate(bs->file, qiov, pos); - } - - return -ENOTSUP; -} - -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file, buf, pos, size); - return -ENOTSUP; -} - -void bdrv_debug_event(BlockDriverState *bs, 
BlkDebugEvent event) -{ - if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { - return; - } - - bs->drv->bdrv_debug_event(bs, event); -} - -int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, - const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { - return bs->drv->bdrv_debug_breakpoint(bs, event, tag); - } - - return -ENOTSUP; -} - -int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { - return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); - } - - return -ENOTSUP; -} - -int bdrv_debug_resume(BlockDriverState *bs, const char *tag) -{ - while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_resume) { - return bs->drv->bdrv_debug_resume(bs, tag); - } - - return -ENOTSUP; -} - -bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { - return bs->drv->bdrv_debug_is_suspended(bs, tag); - } - - return false; -} - -int bdrv_is_snapshot(BlockDriverState *bs) -{ - return !!(bs->open_flags & BDRV_O_SNAPSHOT); -} - -/* backing_file can either be relative, or absolute, or a protocol. If it is - * relative, it must be relative to the chain. So, passing in bs->filename - * from a BDS as backing_file should not be done, as that may be relative to - * the CWD rather than the chain. */ -BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, - const char *backing_file) -{ - char *filename_full = NULL; - char *backing_file_full = NULL; - char *filename_tmp = NULL; - int is_protocol = 0; - BlockDriverState *curr_bs = NULL; - BlockDriverState *retval = NULL; - - if (!bs || !bs->drv || !backing_file) { - return NULL; - } - - filename_full = g_malloc(PATH_MAX); - backing_file_full = g_malloc(PATH_MAX); - filename_tmp = g_malloc(PATH_MAX); - - is_protocol = path_has_protocol(backing_file); - - for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { - - /* If either of the filename paths is actually a protocol, then - * compare unmodified paths; otherwise make paths relative */ - if (is_protocol || path_has_protocol(curr_bs->backing_file)) { - if (strcmp(backing_file, curr_bs->backing_file) == 0) { - retval = curr_bs->backing_hd; - break; - } - } else { - /* If not an absolute filename path, make it relative to the current - * image's filename path */ - path_combine(filename_tmp, PATH_MAX, curr_bs->filename, - backing_file); - - /* We are going to compare absolute pathnames */ - if (!realpath(filename_tmp, filename_full)) { - continue; - } - - /* We need to make sure the backing filename we are comparing against - * is relative to the current image filename (or absolute) */ - path_combine(filename_tmp, PATH_MAX, curr_bs->filename, - curr_bs->backing_file); - - if (!realpath(filename_tmp, backing_file_full)) { - continue; - } - - if (strcmp(backing_file_full, filename_full) == 0) { - retval = curr_bs->backing_hd; - break; - } - } - } - - g_free(filename_full); - g_free(backing_file_full); - g_free(filename_tmp); - return retval; -} - -int bdrv_get_backing_file_depth(BlockDriverState *bs) -{ - if (!bs->drv) { - return 0; - } - - if (!bs->backing_hd) { - return 0; - } - - 
return 1 + bdrv_get_backing_file_depth(bs->backing_hd); -} - -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); -} - -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; - - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); - } - g_free(mcb->callbacks[i].free_qiov); - } -} - -static void multiwrite_cb(void *opaque, int ret) +const char *bdrv_get_format_name(BlockDriverState *bs) { - MultiwriteCB *mcb = opaque; - - trace_multiwrite_cb(mcb, ret); - - if (ret < 0 && !mcb->error) { - mcb->error = ret; - } - - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } + return bs->drv ? bs->drv->format_name : NULL; } -static int multiwrite_req_compare(const void *a, const void *b) +static int qsort_strcmp(const void *a, const void *b) { - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } + return strcmp(a, b); } -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque) { - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. 
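- // (Example with made-up sector numbers: if reqs[outidx] covers sectors
- // [0, 8), oldreq_last is 8; a request starting at sector 8 (exactly
- // sequential) or at sector 4 (overlapping) is merged, while one
- // starting at sector 9 would leave a gap and is not.)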
- if (reqs[i].sector <= oldreq_last) {
- merge = 1;
- }
+ BlockDriver *drv;
+ int count = 0;
+ int i;
+ const char **formats = NULL;

- if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
- merge = 0;
- }
+ QLIST_FOREACH(drv, &bdrv_drivers, list) {
+ if (drv->format_name) {
+ bool found = false;
+ int i = count;
+ while (formats && i && !found) {
+ found = !strcmp(formats[--i], drv->format_name);
+ }

- if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors +
- reqs[i].nb_sectors > bs->bl.max_transfer_length) {
- merge = 0;
+ if (!found) {
+ formats = g_renew(const char *, formats, count + 1);
+ formats[count++] = drv->format_name;
+ }
}
+ }

- if (merge) {
- size_t size;
- QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
- qemu_iovec_init(qiov,
- reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
-
- // Add the first request to the merged one. If the requests are
- // overlapping, drop the last sectors of the first request.
- size = (reqs[i].sector - reqs[outidx].sector) << 9;
- qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
+ qsort(formats, count, sizeof(formats[0]), qsort_strcmp);

- // We shouldn't need to add any zeros between the two requests
- assert (reqs[i].sector <= oldreq_last);
+ for (i = 0; i < count; i++) {
+ it(opaque, formats[i]);
+ }

- // Add the second request
- qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
+ g_free(formats);
+}

- // Add tail of first request, if necessary
- if (qiov->size < reqs[outidx].qiov->size) {
- qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
- reqs[outidx].qiov->size - qiov->size);
- }
+/* Find a node in the BDS graph by its node name */
+BlockDriverState *bdrv_find_node(const char *node_name)
+{
+ BlockDriverState *bs;

- reqs[outidx].nb_sectors = qiov->size >> 9;
- reqs[outidx].qiov = qiov;
+ assert(node_name);

- mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
- } else {
- outidx++;
- reqs[outidx].sector = reqs[i].sector;
- reqs[outidx].nb_sectors = reqs[i].nb_sectors;
- reqs[outidx].qiov = reqs[i].qiov;
+ QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
+ if (!strcmp(node_name, bs->node_name)) {
+ return bs;
}
}
-
- block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
-
- return outidx + 1;
+ return NULL;
}

-/*
- * Submit multiple AIO write requests at once.
- *
- * On success, the function returns 0 and all requests in the reqs array have
- * been submitted. In the error case this function returns -1, and any of the
- * requests may or may not be submitted yet. In particular, this means that the
- * callback will be called for some of the requests, for others it won't. The
- * caller must check the error field of the BlockRequest to wait for the right
- * callbacks (if error != 0, no callback will be called).
- *
- * The implementation may modify the contents of the reqs array, e.g. to merge
- * requests. However, the fields opaque and error are left unmodified as they
- * are used to signal failure for a single request to the caller.
- */
-int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+/* Put this QMP function here so it can access the static graph_bdrv_states. 
*/
+BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
{
- MultiwriteCB *mcb;
- int i;
+ BlockDeviceInfoList *list, *entry;
+ BlockDriverState *bs;

- /* don't submit writes if we don't have a medium */
- if (bs->drv == NULL) {
- for (i = 0; i < num_reqs; i++) {
- reqs[i].error = -ENOMEDIUM;
+ list = NULL;
+ QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
+ BlockDeviceInfo *info = bdrv_block_device_info(bs, errp);
+ if (!info) {
+ qapi_free_BlockDeviceInfoList(list);
+ return NULL;
}
- return -1;
+ entry = g_malloc0(sizeof(*entry));
+ entry->value = info;
+ entry->next = list;
+ list = entry;
}

- if (num_reqs == 0) {
- return 0;
- }
+ return list;
+}

- // Create MultiwriteCB structure
- mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
- mcb->num_requests = 0;
- mcb->num_callbacks = num_reqs;
+BlockDriverState *bdrv_lookup_bs(const char *device,
+ const char *node_name,
+ Error **errp)
+{
+ BlockBackend *blk;
+ BlockDriverState *bs;

- for (i = 0; i < num_reqs; i++) {
- mcb->callbacks[i].cb = reqs[i].cb;
- mcb->callbacks[i].opaque = reqs[i].opaque;
- }
+ if (device) {
+ blk = blk_by_name(device);

- // Check for mergeable requests
- num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+ if (blk) {
+ return blk_bs(blk);
+ }
+ }

- trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
+ if (node_name) {
+ bs = bdrv_find_node(node_name);

- /* Run the aio requests. */
- mcb->num_requests = num_reqs;
- for (i = 0; i < num_reqs; i++) {
- bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
- reqs[i].nb_sectors, reqs[i].flags,
- multiwrite_cb, mcb,
- true);
+ if (bs) {
+ return bs;
+ }
}

- return 0;
+ error_setg(errp, "Cannot find device=%s nor node_name=%s",
+ device ? device : "",
+ node_name ? node_name : "");
+ return NULL;
}

-void bdrv_aio_cancel(BlockAIOCB *acb)
+/* If 'base' is in the same chain as 'top', return true. Otherwise,
+ * return false. If either argument is NULL, return false. */
+bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
{
- qemu_aio_ref(acb);
- bdrv_aio_cancel_async(acb);
- while (acb->refcnt > 1) {
- if (acb->aiocb_info->get_aio_context) {
- aio_poll(acb->aiocb_info->get_aio_context(acb), true);
- } else if (acb->bs) {
- aio_poll(bdrv_get_aio_context(acb->bs), true);
- } else {
- abort();
- }
+ while (top && top != base) {
+ top = top->backing_hd;
}
- qemu_aio_unref(acb);
+
+ return top != NULL;
}

-/* Async version of aio cancel. The caller is not blocked if the acb implements
- * cancel_async, otherwise we do nothing and let the request normally complete.
- * In either case the completion callback must be called. 
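- * (Contrast with bdrv_aio_cancel() above, which also polls the
- * AioContext until the request's own reference is dropped, i.e. until
- * the completion callback has actually run.)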
*/ -void bdrv_aio_cancel_async(BlockAIOCB *acb) +BlockDriverState *bdrv_next_node(BlockDriverState *bs) { - if (acb->aiocb_info->cancel_async) { - acb->aiocb_info->cancel_async(acb); + if (!bs) { + return QTAILQ_FIRST(&graph_bdrv_states); } + return QTAILQ_NEXT(bs, node_list); } -/**************************************************************/ -/* async block device emulation */ - -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) +BlockDriverState *bdrv_next(BlockDriverState *bs) { - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + if (!bs) { + return QTAILQ_FIRST(&bdrv_states); } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); + return QTAILQ_NEXT(bs, device_list); } -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - +const char *bdrv_get_node_name(const BlockDriverState *bs) { - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; + return bs->node_name; } -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +/* TODO check what callers really want: bs->node_name or blk_name() */ +const char *bdrv_get_device_name(const BlockDriverState *bs) { - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + return bs->blk ? blk_name(bs->blk) : ""; } -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +/* This can be used to identify nodes that might not have a device + * name associated. Since node and device names live in the same + * namespace, the result is unambiguous. The exception is if both are + * absent, then this returns an empty (non-null) string. */ +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) { - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); + return bs->blk ? 
blk_name(bs->blk) : bs->node_name; } - -typedef struct BlockAIOCBCoroutine { - BlockAIOCB common; - BlockRequest req; - bool is_write; - bool need_bh; - bool *done; - QEMUBH* bh; -} BlockAIOCBCoroutine; - -static const AIOCBInfo bdrv_em_co_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBCoroutine), -}; - -static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +int bdrv_get_flags(BlockDriverState *bs) { - if (!acb->need_bh) { - acb->common.cb(acb->common.opaque, acb->req.error); - qemu_aio_unref(acb); - } + return bs->open_flags; } -static void bdrv_co_em_bh(void *opaque) +int bdrv_has_zero_init_1(BlockDriverState *bs) { - BlockAIOCBCoroutine *acb = opaque; - - assert(!acb->need_bh); - qemu_bh_delete(acb->bh); - bdrv_co_complete(acb); + return 1; } -static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +int bdrv_has_zero_init(BlockDriverState *bs) { - acb->need_bh = false; - if (acb->req.error != -EINPROGRESS) { - BlockDriverState *bs = acb->common.bs; + assert(bs->drv); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + /* If BS is a copy on write image, it is initialized to + the contents of the base image, which may not be zeroes. */ + if (bs->backing_hd) { + return 0; + } + if (bs->drv->bdrv_has_zero_init) { + return bs->drv->bdrv_has_zero_init(bs); } + + /* safe default */ + return 0; } -/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ -static void coroutine_fn bdrv_co_do_rw(void *opaque) +bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; + BlockDriverInfo bdi; - if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + if (bs->backing_hd) { + return false; + } + + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.unallocated_blocks_are_zero; } - bdrv_co_complete(acb); + return false; } -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) { - Coroutine *co; - BlockAIOCBCoroutine *acb; + BlockDriverInfo bdi; - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - acb->req.qiov = qiov; - acb->req.flags = flags; - acb->is_write = is_write; + if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { + return false; + } - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.can_write_zeroes_with_unmap; + } - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; + return false; } -static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +const char *bdrv_get_encrypted_filename(BlockDriverState *bs) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_flush(bs); - bdrv_co_complete(acb); + if (bs->backing_hd && bs->backing_hd->encrypted) + return bs->backing_file; + else if (bs->encrypted) + return bs->filename; + else + return NULL; } -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +void 
bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size) { - trace_bdrv_aio_flush(bs, opaque); - - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; + pstrcpy(filename, filename_size, bs->backing_file); +} - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_get_info) + return -ENOTSUP; + memset(bdi, 0, sizeof(*bdi)); + return drv->bdrv_get_info(bs, bdi); +} - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; +ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_get_specific_info) { + return drv->bdrv_get_specific_info(bs); + } + return NULL; } -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; + if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { + return; + } - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); - bdrv_co_complete(acb); + bs->drv->bdrv_debug_event(bs, event); } -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag) { - Coroutine *co; - BlockAIOCBCoroutine *acb; - - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { + bs = bs->file; + } - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); + if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { + return bs->drv->bdrv_debug_breakpoint(bs, event, tag); + } - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; + return -ENOTSUP; } -void bdrv_init(void) +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) { - module_call_init(MODULE_INIT_BLOCK); -} + while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { + bs = bs->file; + } -void bdrv_init_with_whitelist(void) -{ - use_bdrv_whitelist = 1; - bdrv_init(); + if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { + return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); + } + + return -ENOTSUP; } -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +int bdrv_debug_resume(BlockDriverState *bs, const char *tag) { - BlockAIOCB *acb; + while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { + bs = bs->file; + } - acb = g_slice_alloc(aiocb_info->aiocb_size); - acb->aiocb_info = aiocb_info; - acb->bs = bs; - acb->cb = cb; - acb->opaque = opaque; - acb->refcnt = 1; - return acb; -} + if (bs && bs->drv && bs->drv->bdrv_debug_resume) { + return bs->drv->bdrv_debug_resume(bs, tag); + } -void qemu_aio_ref(void *p) -{ - BlockAIOCB *acb = p; - acb->refcnt++; + return -ENOTSUP; } -void qemu_aio_unref(void *p) +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) { - BlockAIOCB *acb = p; - assert(acb->refcnt > 0); - if (--acb->refcnt == 0) { - 
g_slice_free1(acb->aiocb_info->aiocb_size, acb); + while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { + bs = bs->file; } -} -/**************************************************************/ -/* Coroutine block device emulation */ + if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { + return bs->drv->bdrv_debug_is_suspended(bs, tag); + } -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; - int ret; -} CoroutineIOCompletion; + return false; +} -static void bdrv_co_io_em_complete(void *opaque, int ret) +int bdrv_is_snapshot(BlockDriverState *bs) { - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); + return !!(bs->open_flags & BDRV_O_SNAPSHOT); } -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) +/* backing_file can either be relative, or absolute, or a protocol. If it is + * relative, it must be relative to the chain. So, passing in bs->filename + * from a BDS as backing_file should not be done, as that may be relative to + * the CWD rather than the chain. */ +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file) { - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; + char *filename_full = NULL; + char *backing_file_full = NULL; + char *filename_tmp = NULL; + int is_protocol = 0; + BlockDriverState *curr_bs = NULL; + BlockDriverState *retval = NULL; - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); + if (!bs || !bs->drv || !backing_file) { + return NULL; } - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); + filename_full = g_malloc(PATH_MAX); + backing_file_full = g_malloc(PATH_MAX); + filename_tmp = g_malloc(PATH_MAX); - return co.ret; -} + is_protocol = path_has_protocol(backing_file); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} + for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} + /* If either of the filename paths is actually a protocol, then + * compare unmodified paths; otherwise make paths relative */ + if (is_protocol || path_has_protocol(curr_bs->backing_file)) { + if (strcmp(backing_file, curr_bs->backing_file) == 0) { + retval = curr_bs->backing_hd; + break; + } + } else { + /* If not an absolute filename path, make it relative to the current + * image's filename path */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + backing_file); -static void coroutine_fn bdrv_flush_co_entry(void *opaque) -{ - RwCo *rwco = opaque; + /* We are going to compare absolute pathnames */ + if (!realpath(filename_tmp, filename_full)) { + continue; + } - rwco->ret = bdrv_co_flush(rwco->bs); -} + /* We need to make sure the backing filename we are comparing against + * is relative to the current image filename (or absolute) */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + curr_bs->backing_file); -int coroutine_fn 
bdrv_co_flush(BlockDriverState *bs) -{ - int ret; + if (!realpath(filename_tmp, backing_file_full)) { + continue; + } - if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - return 0; + if (strcmp(backing_file_full, filename_full) == 0) { + retval = curr_bs->backing_hd; + break; + } + } } - /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); - if (bs->drv->bdrv_co_flush_to_os) { - ret = bs->drv->bdrv_co_flush_to_os(bs); - if (ret < 0) { - return ret; - } + g_free(filename_full); + g_free(backing_file_full); + g_free(filename_tmp); + return retval; +} + +int bdrv_get_backing_file_depth(BlockDriverState *bs) +{ + if (!bs->drv) { + return 0; } - /* But don't actually force it to the disk with cache=unsafe */ - if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; + if (!bs->backing_hd) { + return 0; } - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); - if (bs->drv->bdrv_co_flush_to_disk) { - ret = bs->drv->bdrv_co_flush_to_disk(bs); - } else if (bs->drv->bdrv_aio_flush) { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; + return 1 + bdrv_get_backing_file_depth(bs->backing_hd); +} - acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } else { - /* - * Some block drivers always operate in either writethrough or unsafe - * mode and don't support bdrv_flush therefore. Usually qemu doesn't - * know how the server works (because the behaviour is hardcoded or - * depends on server-side configuration), so we can't ensure that - * everything is safe on disk. Returning an error doesn't work because - * that would break guests even if the server operates in writethrough - * mode. - * - * Let's hope the user knows what he's doing. - */ - ret = 0; - } - if (ret < 0) { - return ret; - } +void bdrv_init(void) +{ + module_call_init(MODULE_INIT_BLOCK); +} - /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH - * in the case of cache=unsafe, so there are no useless flushes. - */ -flush_parent: - return bdrv_co_flush(bs->file); +void bdrv_init_with_whitelist(void) +{ + use_bdrv_whitelist = 1; + bdrv_init(); } void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) @@ -5235,143 +2972,6 @@ void bdrv_invalidate_cache_all(Error **errp) } } -int bdrv_flush(BlockDriverState *bs) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - -typedef struct DiscardCo { - BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; - int ret; -} DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) -{ - DiscardCo *rwco = opaque; - - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); -} - -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - int max_discard, ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } else if (bs->read_only) { - return -EROFS; - } - - bdrv_reset_dirty(bs, sector_num, nb_sectors); - - /* Do nothing if disabled. 
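- * (The request still succeeds; the dirty bitmap was already reset
- * above, the sectors are simply never discarded by the driver.)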
*/ - if (!(bs->open_flags & BDRV_O_UNMAP)) { - return 0; - } - - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { - return 0; - } - - max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } - - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } - - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - return -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } - if (ret && ret != -ENOTSUP) { - return ret; - } - - sector_num += num; - nb_sectors -= num; - } - return 0; -} - -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) -{ - Coroutine *co; - DiscardCo rwco = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - /**************************************************************/ /* removable device support */ @@ -5437,87 +3037,11 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked) } } -/* needed for generic scsi interface */ - -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; -} - -BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) -{ - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); - return NULL; -} - void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { bs->guest_block_size = align; } -void *qemu_blockalign(BlockDriverState *bs, size_t size) -{ - return qemu_memalign(bdrv_opt_mem_align(bs), size); -} - -void *qemu_blockalign0(BlockDriverState *bs, size_t size) -{ - return memset(qemu_blockalign(bs, size), 0, size); -} - -void *qemu_try_blockalign(BlockDriverState *bs, size_t size) -{ - size_t align = bdrv_opt_mem_align(bs); - - /* Ensure that NULL is never returned on success */ - assert(align > 0); - if (size == 0) { - size = align; - } - - return qemu_try_memalign(align, size); -} - -void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) -{ - void *mem = qemu_try_blockalign(bs, size); - - if (mem) { - memset(mem, 0, size); - } - - return mem; -} - -/* - * Check if all memory in this vector is sector aligned. 
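- *
- * (For example, with a 512-byte memory alignment requirement, any iovec
- * entry whose iov_base or iov_len is not a multiple of 512 makes the
- * whole vector count as unaligned.)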
- */ -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) -{ - int i; - size_t alignment = bdrv_opt_mem_align(bs); - - for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % alignment) { - return false; - } - if (qiov->iov[i].iov_len % alignment) { - return false; - } - } - - return true; -} - BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { BdrvDirtyBitmap *bm; @@ -6239,12 +3763,6 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs, abort(); } -void bdrv_add_before_write_notifier(BlockDriverState *bs, - NotifierWithReturn *notifier) -{ - notifier_with_return_list_add(&bs->before_write_notifiers, notifier); -} - int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, BlockDriverAmendStatusCB *status_cb) { @@ -6345,36 +3863,6 @@ out: return to_replace_bs; } -void bdrv_io_plug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else if (bs->file) { - bdrv_io_plug(bs->file); - } -} - -void bdrv_io_unplug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file); - } -} - -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file); - } -} - static bool append_open_options(QDict *d, BlockDriverState *bs) { const QDictEntry *entry; diff --git a/block/Makefile.objs b/block/Makefile.objs index 179e71d..0d8c2a4 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o mirror.o +block-obj-y += null.o mirror.o io.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o diff --git a/block/io.c b/block/io.c new file mode 100644 index 0000000..1ce62c4 --- /dev/null +++ b/block/io.c @@ -0,0 +1,2540 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+
+#include "trace.h"
+#include "sysemu/qtest.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+
+#define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
+
+static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors,
+ QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+ int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
+static void coroutine_fn bdrv_co_do_rw(void *opaque);
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+
+/* throttling disk I/O limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+ ThrottleConfig *cfg)
+{
+ int i;
+
+ throttle_config(&bs->throttle_state, cfg);
+
+ for (i = 0; i < 2; i++) {
+ qemu_co_enter_next(&bs->throttled_reqs[i]);
+ }
+}
+
+/* this function drains all the throttled I/Os */
+static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
+{
+ bool drained = false;
+ bool enabled = bs->io_limits_enabled;
+ int i;
+
+ bs->io_limits_enabled = false;
+
+ for (i = 0; i < 2; i++) {
+ while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
+ drained = true;
+ }
+ }
+
+ bs->io_limits_enabled = enabled;
+
+ return drained;
+}
+
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+ bs->io_limits_enabled = false;
+
+ bdrv_start_throttled_reqs(bs);
+
+ throttle_destroy(&bs->throttle_state);
+}
+
+static void bdrv_throttle_read_timer_cb(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ qemu_co_enter_next(&bs->throttled_reqs[0]);
+}
+
+static void bdrv_throttle_write_timer_cb(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ qemu_co_enter_next(&bs->throttled_reqs[1]);
+}
+
+/* should be called before bdrv_set_io_limits if a limit is set */
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+ int clock_type = QEMU_CLOCK_REALTIME;
+
+ if (qtest_enabled()) {
+ /* For testing block IO throttling only */
+ clock_type = QEMU_CLOCK_VIRTUAL;
+ }
+ assert(!bs->io_limits_enabled);
+ throttle_init(&bs->throttle_state,
+ bdrv_get_aio_context(bs),
+ clock_type,
+ bdrv_throttle_read_timer_cb,
+ bdrv_throttle_write_timer_cb,
+ bs);
+ bs->io_limits_enabled = true;
+}
+
+/* This function makes an I/O request wait if needed
+ *
+ * @bytes: the size of the I/O in bytes
+ * @is_write: is the I/O a write
+ */
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+ unsigned int bytes,
+ bool is_write)
+{
+ /* does this I/O have to wait? */
+ bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
+
+ /* if it must wait, or any request of this type is already queued,
+ * queue this I/O */
+ if (must_wait ||
+ !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) 
{
+ qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+ }
+
+ /* the I/O will be executed, do the accounting */
+ throttle_account(&bs->throttle_state, is_write, bytes);
+
+
+ /* if the next request must wait -> do nothing */
+ if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+ return;
+ }
+
+ /* else queue next request for execution */
+ qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+}
+
+void bdrv_setup_io_funcs(BlockDriver *bdrv)
+{
+ /* Block drivers without coroutine functions need emulation */
+ if (!bdrv->bdrv_co_readv) {
+ bdrv->bdrv_co_readv = bdrv_co_readv_em;
+ bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+ /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
+ * the block driver lacks aio we need to emulate that too.
+ */
+ if (!bdrv->bdrv_aio_readv) {
+ /* add AIO emulation layer */
+ bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+ bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+ }
+ }
+}
+
+void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+ BlockDriver *drv = bs->drv;
+ Error *local_err = NULL;
+
+ memset(&bs->bl, 0, sizeof(bs->bl));
+
+ if (!drv) {
+ return;
+ }
+
+ /* Take some limits from the children as a default */
+ if (bs->file) {
+ bdrv_refresh_limits(bs->file, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
+ bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
+ bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+ } else {
+ bs->bl.opt_mem_alignment = 512;
+ }
+
+ if (bs->backing_hd) {
+ bdrv_refresh_limits(bs->backing_hd, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ bs->bl.opt_transfer_length =
+ MAX(bs->bl.opt_transfer_length,
+ bs->backing_hd->bl.opt_transfer_length);
+ bs->bl.max_transfer_length =
+ MIN_NON_ZERO(bs->bl.max_transfer_length,
+ bs->backing_hd->bl.max_transfer_length);
+ bs->bl.opt_mem_alignment =
+ MAX(bs->bl.opt_mem_alignment,
+ bs->backing_hd->bl.opt_mem_alignment);
+ }
+
+ /* Then let the driver override it */
+ if (drv->bdrv_refresh_limits) {
+ drv->bdrv_refresh_limits(bs, errp);
+ }
+}
+
+/**
+ * The copy-on-read flag is actually a reference count so multiple users may
+ * use the feature without worrying about clobbering its previous state.
+ * Copy-on-read stays enabled until all users have called to disable it.
+ */
+void bdrv_enable_copy_on_read(BlockDriverState *bs)
+{
+ bs->copy_on_read++;
+}
+
+void bdrv_disable_copy_on_read(BlockDriverState *bs)
+{
+ assert(bs->copy_on_read > 0);
+ bs->copy_on_read--;
+}
+
+/* Check if any requests are in-flight (including throttled requests) */
+static bool bdrv_requests_pending(BlockDriverState *bs)
+{
+ if (!QLIST_EMPTY(&bs->tracked_requests)) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
+ return true;
+ }
+ if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
+ return true;
+ }
+ if (bs->file && bdrv_requests_pending(bs->file)) {
+ return true;
+ }
+ if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
+ return true;
+ }
+ return false;
+}
+
+static bool bdrv_drain_one(BlockDriverState *bs)
+{
+ bool bs_busy;
+
+ bdrv_flush_io_queue(bs);
+ bdrv_start_throttled_reqs(bs);
+ bs_busy = bdrv_requests_pending(bs);
+ bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
+ return bs_busy;
+}
+
+/*
+ * Wait for pending requests to complete on a single BlockDriverState subtree
+ *
+ * See the warning in bdrv_drain_all(). 
This function can only be called if + * you are sure nothing can generate I/O because you have op blockers + * installed. + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + */ +void bdrv_drain(BlockDriverState *bs) +{ + while (bdrv_drain_one(bs)) { + /* Keep iterating */ + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + * + * Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a coroutine + * can be arbitrarily complex and a constant flow of I/O can come until the + * coroutine is complete. Because of this, it is not possible to have a + * function to drain a single device's I/O queue. + */ +void bdrv_drain_all(void) +{ + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; + BlockDriverState *bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + } + + while (busy) { + busy = false; + bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + busy |= bdrv_drain_one(bs); + aio_context_release(aio_context); + } + } + + bs = NULL; + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. 
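+ *
+ * (Usage sketch: an I/O path brackets the actual request as
+ *
+ * tracked_request_begin(&req, bs, offset, bytes, is_write);
+ * ... perform the read or write ...
+ * tracked_request_end(&req);
+ *
+ * so that the overlap checks in wait_serialising_requests() below can
+ * see the request while it is in flight.)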
+ */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). 
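+ * A request that is itself blocked in this loop has waiting_for set,
+ * which is what the check below relies on to avoid joining a wait
+ * cycle.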
*/ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) { + return -ENOMEDIUM; + } + + if (offset < 0) { + return -EIO; + } + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. + */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. 
Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ + int64_t target_sectors, ret, nb_sectors, sector_num = 0; + int n; + + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; + } + + for (;;) { + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + if (nb_sectors <= 0) { + return 0; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. 
This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. + */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. 
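[Illustration, not part of the patch.] The serialising granularity used below is the cluster size precisely because copy-on-read operates on whole clusters. The rounding done by bdrv_round_to_clusters(), and the skip_bytes offset used when copying the guest's slice back out of the bounce buffer, can be verified standalone (the cluster size is invented):

    #include <stdint.h>
    #include <stdio.h>

    #define SECTOR_SIZE 512   // BDRV_SECTOR_SIZE

    int main(void)
    {
        int64_t cluster_bytes = 65536;              // invented cluster size
        int64_t c = cluster_bytes / SECTOR_SIZE;    // 128 sectors per cluster
        int64_t sector_num = 200;
        int nb_sectors = 10;

        // bdrv_round_to_clusters(): grow [200, 210) outwards to [128, 256)
        int64_t cluster_sector_num = (sector_num / c) * c;
        int cluster_nb_sectors =
            (int)(((sector_num - cluster_sector_num + nb_sectors + c - 1) / c) * c);

        // The guest's data sits this far into the cluster-sized bounce buffer
        size_t skip_bytes = (size_t)(sector_num - cluster_sector_num) * SECTOR_SIZE;

        printf("COR request: sector %lld, %d sectors\n",
               (long long)cluster_sector_num, cluster_nb_sectors);
        printf("copy out from byte offset %zu\n", skip_bytes);
        return 0;
    }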
*/ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + wait_serialising_requests(req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!bs->zero_beyond_eof) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +static inline uint64_t bdrv_get_align(BlockDriverState *bs) +{ + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); +} + +static inline bool bdrv_req_is_aligned(BlockDriverState *bs, + int64_t offset, size_t bytes) +{ + int64_t align = bdrv_get_align(bs); + return !(offset & (align - 1) || (bytes & (align - 1))); +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = 
bdrv_aligned_preadv(bs, &req, offset, bytes, align,
+                              use_local_qiov ? &local_qiov : qiov,
+                              flags);
+    tracked_request_end(&req);
+
+    if (use_local_qiov) {
+        qemu_iovec_destroy(&local_qiov);
+        qemu_vfree(head_buf);
+        qemu_vfree(tail_buf);
+    }
+
+    return ret;
+}
+
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return -EINVAL;
+    }
+
+    return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS,
+                             nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
+{
+    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);
+
+    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+                            BDRV_REQ_COPY_ON_READ);
+}
+
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
+
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{
+    BlockDriver *drv = bs->drv;
+    QEMUIOVector qiov;
+    struct iovec iov = {0};
+    int ret = 0;
+
+    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
+                                        BDRV_REQUEST_MAX_SECTORS);
+
+    while (nb_sectors > 0 && !ret) {
+        int num = nb_sectors;
+
+        /* Align request.  Block drivers can expect the "bulk" of the request
+         * to be aligned.
+         */
+        if (bs->bl.write_zeroes_alignment
+            && num > bs->bl.write_zeroes_alignment) {
+            if (sector_num % bs->bl.write_zeroes_alignment != 0) {
+                /* Make a small request up to the first aligned sector. */
+                num = bs->bl.write_zeroes_alignment;
+                num -= sector_num % bs->bl.write_zeroes_alignment;
+            } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) {
+                /* Shorten the request to the last aligned sector.  num cannot
+                 * underflow because num > bs->bl.write_zeroes_alignment.
+                 */
+                num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
+            }
+        }
+
+        /* limit request size */
+        if (num > max_write_zeroes) {
+            num = max_write_zeroes;
+        }
+
+        ret = -ENOTSUP;
+        /* First try the efficient write zeroes operation */
+        if (drv->bdrv_co_write_zeroes) {
+            ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags);
+        }
+
+        if (ret == -ENOTSUP) {
+            /* Fall back to bounce buffer if write zeroes is unsupported */
+            int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
+                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
+            num = MIN(num, max_xfer_len);
+            iov.iov_len = num * BDRV_SECTOR_SIZE;
+            if (iov.iov_base == NULL) {
+                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
+                if (iov.iov_base == NULL) {
+                    ret = -ENOMEM;
+                    goto fail;
+                }
+                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
+            }
+            qemu_iovec_init_external(&qiov, &iov, 1);
+
+            ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
+
+            /* Keep the bounce buffer around if it is big enough for all
+             * future requests.
+             */
+            if (num < max_xfer_len) {
+                qemu_vfree(iov.iov_base);
+                iov.iov_base = NULL;
+            }
+        }
+
+        sector_num += num;
+        nb_sectors -= num;
+    }
+
+fail:
+    qemu_vfree(iov.iov_base);
+    return ret;
+}
+
+/*
+ * Forwards an already correctly aligned write request to the BlockDriver.
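[Illustration, not part of the patch.] The fragmentation loop above splits a zero request in three phases: a misaligned head becomes one small call, the bulk is trimmed to end on an aligned sector, and a short remainder mops up. A standalone rerun of the same splitting logic with invented limits:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t sector_num = 5;     // deliberately misaligned start
        int nb_sectors = 300;
        int align = 8;              // bs->bl.write_zeroes_alignment
        int max_write_zeroes = 128; // per-call limit

        while (nb_sectors > 0) {
            int num = nb_sectors;

            if (align && num > align) {
                if (sector_num % align != 0) {
                    // small head fragment up to the first aligned sector
                    num = align - sector_num % align;
                } else if ((sector_num + num) % align != 0) {
                    // trim so the bulk ends on an aligned sector
                    num -= (sector_num + num) % align;
                }
            }
            if (num > max_write_zeroes) {
                num = max_write_zeroes;
            }

            printf("write_zeroes(sector=%lld, num=%d)\n",
                   (long long)sector_num, num);
            sector_num += num;
            nb_sectors -= num;
        }
        return 0;
    }

With these numbers the loop emits fragments of 3, 128, 128, 40 and 1 sectors, 300 in total.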
+ */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + + if (ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
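[Illustration, not part of the patch.] The read-modify-write bookkeeping that follows is pure alignment arithmetic: an unaligned head and an unaligned tail each trigger one aligned read, and the final write covers the widened region. Sketched standalone with invented numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t align = 4096;          // bdrv_get_align() result, invented
        int64_t offset = 1500;          // unaligned guest write
        unsigned int bytes = 6000;

        if (offset & (align - 1)) {
            // head: read one aligned block covering the request's start
            printf("RMW head read at %lld\n",
                   (long long)(offset & ~(align - 1)));
            bytes += offset & (align - 1);
            offset &= ~(align - 1);
        }
        if ((offset + bytes) & (align - 1)) {
            // tail: read one aligned block covering the request's end
            printf("RMW tail read at %lld\n",
                   (long long)((offset + bytes) & ~(align - 1)));
            bytes = (unsigned int)((bytes + align - 1) & ~(align - 1));
        }
        printf("final aligned write: offset %lld, %u bytes\n",
               (long long)offset, bytes);
        return 0;
    }

Here the write [1500, 7500) costs a head read at 0, a tail read at 4096, and one aligned write of 8192 bytes at offset 0.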
+ */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + if (use_local_qiov) { + /* Local buffer may have non-zero data. */ + flags &= ~BDRV_REQ_ZERO_WRITE; + } + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? 
&local_qiov : qiov, + flags); + +fail: + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + + return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + int ret; + + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS)) { + ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE | flags); + } else { + uint8_t *buf; + QEMUIOVector local_qiov; + size_t bytes = nb_sectors << BDRV_SECTOR_BITS; + + buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes); + memset(buf, 0, bytes); + qemu_iovec_init(&local_qiov, 1); + qemu_iovec_add(&local_qiov, buf, bytes); + + ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov, + BDRV_REQ_ZERO_WRITE | flags); + qemu_vfree(buf); + } + return ret; +} + +int bdrv_flush_all(void) +{ + BlockDriverState *bs = NULL; + int result = 0; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; + + aio_context_acquire(aio_context); + ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + aio_context_release(aio_context); + } + + return result; +} + +typedef struct BdrvCoGetBlockStatusData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int64_t ret; + bool done; +} BdrvCoGetBlockStatusData; + +/* + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. 
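[Illustration, not part of the patch.] The value returned by this function doubles as a flag word and a byte offset: since the offset is always sector aligned, its low nine bits are free to carry BDRV_BLOCK_* flags, which is why the code below ORs in sector_num * BDRV_SECTOR_SIZE and later shifts it back out with >> BDRV_SECTOR_BITS. A standalone demonstration (the flag values below are illustrative, quoted from memory rather than from this patch):

    #include <stdint.h>
    #include <stdio.h>

    #define BDRV_SECTOR_BITS        9
    #define BDRV_BLOCK_DATA         0x01
    #define BDRV_BLOCK_ZERO         0x02
    #define BDRV_BLOCK_OFFSET_VALID 0x04
    #define BDRV_BLOCK_ALLOCATED    0x10

    int main(void)
    {
        int64_t sector_num = 1000;

        // Pack status flags into the low bits of a sector-aligned byte
        // offset, as the BDRV_BLOCK_OFFSET_VALID convention permits.
        int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED |
                      BDRV_BLOCK_OFFSET_VALID |
                      (sector_num << BDRV_SECTOR_BITS);

        if (ret & BDRV_BLOCK_OFFSET_VALID) {
            printf("data at file sector %lld\n",
                   (long long)(ret >> BDRV_SECTOR_BITS));
        }
        printf("allocated=%d zero=%d\n",
               !!(ret & BDRV_BLOCK_ALLOCATED), !!(ret & BDRV_BLOCK_ZERO));
        return 0;
    }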
+ */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; + } + + if (sector_num >= total_sectors) { + *pnum = 0; + return 0; + } + + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } + + if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; + + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } + + return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status() */ +static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) +{ + BdrvCoGetBlockStatusData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status(). + * + * See bdrv_co_get_block_status() for details. + */ +int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_co_entry(&data); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_get_block_status_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... 
-> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + *pnum = n; + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (!drv->bdrv_write_compressed) { + return -ENOTSUP; + } + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + 
trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. 
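+            // Worked example (values invented): if reqs[outidx] covers
+            // sectors [100, 110) and reqs[i] covers [105, 120), then size
+            // below is (105 - 100) << 9: only the first request's
+            // non-overlapping prefix is added, reqs[i] supplies [105, 120),
+            // and the merged qiov spans [100, 120). The tail copy further
+            // down only fires when reqs[i] is fully contained, e.g. a
+            // request [102, 106) inside [100, 110) leaves [106, 110) of
+            // the first request still to append.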
+            size = (reqs[i].sector - reqs[outidx].sector) << 9;
+            qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size);
+
+            // We shouldn't need to add any zeros between the two requests
+            assert(reqs[i].sector <= oldreq_last);
+
+            // Add the second request
+            qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
+
+            // Add tail of first request, if necessary
+            if (qiov->size < reqs[outidx].qiov->size) {
+                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
+                                  reqs[outidx].qiov->size - qiov->size);
+            }
+
+            reqs[outidx].nb_sectors = qiov->size >> 9;
+            reqs[outidx].qiov = qiov;
+
+            mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
+        } else {
+            outidx++;
+            reqs[outidx].sector = reqs[i].sector;
+            reqs[outidx].nb_sectors = reqs[i].nb_sectors;
+            reqs[outidx].qiov = reqs[i].qiov;
+        }
+    }
+
+    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
+    return outidx + 1;
+}
+
+/*
+ * Submit multiple AIO write requests at once.
+ *
+ * On success, the function returns 0 and all requests in the reqs array have
+ * been submitted.  In the error case this function returns -1, and any of the
+ * requests may or may not be submitted yet.  In particular, this means that
+ * the callback will be called for some of the requests, for others it won't.
+ * The caller must check the error field of the BlockRequest to wait for the
+ * right callbacks (if error != 0, no callback will be called).
+ *
+ * The implementation may modify the contents of the reqs array, e.g. to merge
+ * requests.  However, the fields opaque and error are left unmodified as they
+ * are used to signal failure for a single request to the caller.
+ */
+int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
+{
+    MultiwriteCB *mcb;
+    int i;
+
+    /* don't submit writes if we don't have a medium */
+    if (bs->drv == NULL) {
+        for (i = 0; i < num_reqs; i++) {
+            reqs[i].error = -ENOMEDIUM;
+        }
+        return -1;
+    }
+
+    if (num_reqs == 0) {
+        return 0;
+    }
+
+    // Create MultiwriteCB structure
+    mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
+    mcb->num_requests = 0;
+    mcb->num_callbacks = num_reqs;
+
+    for (i = 0; i < num_reqs; i++) {
+        mcb->callbacks[i].cb = reqs[i].cb;
+        mcb->callbacks[i].opaque = reqs[i].opaque;
+    }
+
+    // Check for mergeable requests
+    num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
+
+    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
+
+    /* Run the aio requests. */
+    mcb->num_requests = num_reqs;
+    for (i = 0; i < num_reqs; i++) {
+        bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov,
+                              reqs[i].nb_sectors, reqs[i].flags,
+                              multiwrite_cb, mcb,
+                              true);
+    }
+
+    return 0;
+}
+
+void bdrv_aio_cancel(BlockAIOCB *acb)
+{
+    qemu_aio_ref(acb);
+    bdrv_aio_cancel_async(acb);
+    while (acb->refcnt > 1) {
+        if (acb->aiocb_info->get_aio_context) {
+            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
+        } else if (acb->bs) {
+            aio_poll(bdrv_get_aio_context(acb->bs), true);
+        } else {
+            abort();
+        }
+    }
+    qemu_aio_unref(acb);
+}
+
+/* Async version of aio cancel. The caller is not blocked if the acb implements
+ * cancel_async, otherwise we do nothing and let the request normally complete.
+ * In either case the completion callback must be called.
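[Illustration, not part of the patch.] bdrv_aio_cancel() above leans on the ACB refcount for its termination condition: the extra reference keeps the ACB alive across aio_poll() calls, and refcnt falling back to 1 means only the caller's reference remains, i.e. the completion callback has run. A toy standalone model of that discipline (all names invented):

    #include <stdio.h>

    struct toy_acb {
        int refcnt;
        int steps_left;     // fake "work" the event loop still has to do
    };

    static void toy_aio_poll(struct toy_acb *acb)
    {
        if (--acb->steps_left == 0) {
            acb->refcnt--;  // completion path drops the request's reference
        }
    }

    int main(void)
    {
        struct toy_acb acb = { .refcnt = 1, .steps_left = 3 };

        acb.refcnt++;                   // qemu_aio_ref()
        while (acb.refcnt > 1) {
            toy_aio_poll(&acb);         // stands in for aio_poll(ctx, true)
        }
        acb.refcnt--;                   // qemu_aio_unref()
        printf("completed, refcnt back to %d\n", acb.refcnt);
        return 0;
    }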
*/ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool need_bh; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + assert(!acb->need_bh); + qemu_bh_delete(acb->bh); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + bdrv_co_complete(acb); +} + +static BlockAIOCB 
*bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + 
} + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + return bdrv_co_flush(bs->file); +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + DiscardCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + int max_discard, ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } else if (bs->read_only) { + return -EROFS; + } + + bdrv_reset_dirty(bs, sector_num, nb_sectors); + + /* Do nothing if disabled. 
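[Illustration, not part of the patch.] bdrv_co_flush() above is a fallback ladder: a native coroutine hook, then an AIO hook bridged through bdrv_co_io_em_complete(), then a deliberate "report success" for drivers with no flush at all, and finally the recursion into bs->file. The ladder reduced to a standalone sketch (stub hooks, invented names):

    #include <stdio.h>

    typedef int (*flush_fn)(void);

    static int co_flush_to_disk(void)
    {
        puts("driver bdrv_co_flush_to_disk");
        return 0;
    }

    int main(void)
    {
        // Set either pointer to NULL to watch the ladder fall through.
        flush_fn co_hook = co_flush_to_disk;   // drv->bdrv_co_flush_to_disk
        flush_fn aio_hook = NULL;              // drv->bdrv_aio_flush
        int ret;

        if (co_hook) {
            ret = co_hook();
        } else if (aio_hook) {
            ret = aio_hook();   // real code yields until the AIO callback fires
        } else {
            // No flush callback at all: report success, since failing here
            // would break guests running on writethrough backends.
            ret = 0;
        }
        printf("flush ret = %d\n", ret);
        return 0;
    }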
*/
+    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+        return 0;
+    }
+
+    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+        return 0;
+    }
+
+    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
+    while (nb_sectors > 0) {
+        int ret;
+        int num = nb_sectors;
+
+        /* align request */
+        if (bs->bl.discard_alignment &&
+            num >= bs->bl.discard_alignment &&
+            sector_num % bs->bl.discard_alignment) {
+            if (num > bs->bl.discard_alignment) {
+                num = bs->bl.discard_alignment;
+            }
+            num -= sector_num % bs->bl.discard_alignment;
+        }
+
+        /* limit request size */
+        if (num > max_discard) {
+            num = max_discard;
+        }
+
+        if (bs->drv->bdrv_co_discard) {
+            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+        } else {
+            BlockAIOCB *acb;
+            CoroutineIOCompletion co = {
+                .coroutine = qemu_coroutine_self(),
+            };
+
+            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
+                                            bdrv_co_io_em_complete, &co);
+            if (acb == NULL) {
+                return -EIO;
+            } else {
+                qemu_coroutine_yield();
+                ret = co.ret;
+            }
+        }
+        if (ret && ret != -ENOTSUP) {
+            return ret;
+        }
+
+        sector_num += num;
+        nb_sectors -= num;
+    }
+    return 0;
+}
+
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    Coroutine *co;
+    DiscardCo rwco = {
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .ret = NOT_DONE,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_discard_co_entry(&rwco);
+    } else {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        co = qemu_coroutine_create(bdrv_discard_co_entry);
+        qemu_coroutine_enter(co, &rwco);
+        while (rwco.ret == NOT_DONE) {
+            aio_poll(aio_context, true);
+        }
+    }
+
+    return rwco.ret;
+}
+
+/* needed for generic scsi interface */
+
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_ioctl) {
+        return drv->bdrv_ioctl(bs, req, buf);
+    }
+    return -ENOTSUP;
+}
+
+BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
+        unsigned long int req, void *buf,
+        BlockCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_aio_ioctl) {
+        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
+    }
+    return NULL;
+}
+
+void *qemu_blockalign(BlockDriverState *bs, size_t size)
+{
+    return qemu_memalign(bdrv_opt_mem_align(bs), size);
+}
+
+void *qemu_blockalign0(BlockDriverState *bs, size_t size)
+{
+    return memset(qemu_blockalign(bs, size), 0, size);
+}
+
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
+{
+    size_t align = bdrv_opt_mem_align(bs);
+
+    /* Ensure that NULL is never returned on success */
+    assert(align > 0);
+    if (size == 0) {
+        size = align;
+    }
+
+    return qemu_try_memalign(align, size);
+}
+
+void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
+{
+    void *mem = qemu_try_blockalign(bs, size);
+
+    if (mem) {
+        memset(mem, 0, size);
+    }
+
+    return mem;
+}
+
+/*
+ * Check if all memory in this vector is sector aligned.
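[Illustration, not part of the patch.] This predicate matters to O_DIRECT-style backends, which would otherwise have to bounce the buffer; both the base pointer and the length of every iovec element must be a multiple of the memory alignment. A standalone restatement (the alignment value is invented):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct vec { void *base; size_t len; };

    // Same predicate as bdrv_qiov_is_aligned(): every base pointer and
    // every length must be a multiple of the memory alignment.
    static int vec_is_aligned(const struct vec *v, int n, size_t alignment)
    {
        for (int i = 0; i < n; i++) {
            if ((uintptr_t)v[i].base % alignment || v[i].len % alignment) {
                return 0;
            }
        }
        return 1;
    }

    int main(void)
    {
        size_t align = 4096;                    // bdrv_opt_mem_align(), invented
        void *buf = aligned_alloc(align, 8192); // C11
        if (!buf) {
            return 1;
        }

        struct vec ok  = { buf, 8192 };
        struct vec bad = { (char *)buf + 1, 511 };
        printf("aligned: %d, misaligned: %d\n",
               vec_is_aligned(&ok, 1, align), vec_is_aligned(&bad, 1, align));
        free(buf);
        return 0;
    }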
+ */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_opt_mem_align(bs); + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { + return false; + } + } + + return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } +} -- cgit v1.1