18 files changed, 909 insertions, 514 deletions
diff --git a/block/blkreplay.c b/block/blkreplay.c
index 3368c8c..30f9d5f 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -114,11 +114,11 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
     return ret;
 }
 
-static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors)
+static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
+                                              int64_t offset, int count)
 {
     uint64_t reqid = request_id++;
-    int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors);
+    int ret = bdrv_co_pdiscard(bs->file->bs, offset, count);
     block_request_create(reqid, bs, qemu_coroutine_self());
     qemu_coroutine_yield();
 
@@ -148,7 +148,7 @@ static BlockDriver bdrv_blkreplay = {
     .bdrv_co_pwritev        = blkreplay_co_pwritev,
 
     .bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
-    .bdrv_co_discard        = blkreplay_co_discard,
+    .bdrv_co_pdiscard       = blkreplay_co_pdiscard,
     .bdrv_co_flush          = blkreplay_co_flush,
 };
 
diff --git a/block/block-backend.c b/block/block-backend.c
index f9cea1b..effa038 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1065,16 +1065,16 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk,
     return bdrv_aio_flush(blk_bs(blk), cb, opaque);
 }
 
-BlockAIOCB *blk_aio_discard(BlockBackend *blk,
-                            int64_t sector_num, int nb_sectors,
-                            BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
+                             int64_t offset, int count,
+                             BlockCompletionFunc *cb, void *opaque)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return blk_abort_aio_request(blk, cb, opaque, ret);
     }
 
-    return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque);
+    return bdrv_aio_pdiscard(blk_bs(blk), offset, count, cb, opaque);
 }
 
 void blk_aio_cancel(BlockAIOCB *acb)
@@ -1106,14 +1106,14 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
     return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque);
 }
 
-int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors);
+    return bdrv_co_pdiscard(blk_bs(blk), offset, count);
 }
 
 int blk_co_flush(BlockBackend *blk)
@@ -1504,14 +1504,14 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
     return bdrv_truncate(blk_bs(blk), offset);
 }
 
-int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-    int ret = blk_check_request(blk, sector_num, nb_sectors);
+    int ret = blk_check_byte_request(blk, offset, count);
     if (ret < 0) {
         return ret;
     }
 
-    return bdrv_discard(blk_bs(blk), sector_num, nb_sectors);
+    return bdrv_pdiscard(blk_bs(blk), offset, count);
 }
 
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 4902ca5..f2bfdcf 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -326,14 +326,14 @@ void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
 }
 
 void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                           int64_t cur_sector, int nr_sectors)
+                           int64_t cur_sector, int64_t nr_sectors)
 {
     assert(bdrv_dirty_bitmap_enabled(bitmap));
     hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors);
 }
 
 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
-                             int64_t cur_sector, int nr_sectors)
+                             int64_t cur_sector, int64_t nr_sectors)
 {
     assert(bdrv_dirty_bitmap_enabled(bitmap));
     hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
@@ -361,7 +361,7 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
 }
 
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
-                    int nr_sectors)
+                    int64_t nr_sectors)
 {
     BdrvDirtyBitmap *bitmap;
     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
diff --git a/block/gluster.c b/block/gluster.c
index 406c1e6..01b479f 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -11,7 +11,27 @@
 #include <glusterfs/api/glfs.h>
 #include "block/block_int.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
 #include "qemu/uri.h"
+#include "qemu/error-report.h"
+
+#define GLUSTER_OPT_FILENAME        "filename"
+#define GLUSTER_OPT_VOLUME          "volume"
+#define GLUSTER_OPT_PATH            "path"
+#define GLUSTER_OPT_TYPE            "type"
+#define GLUSTER_OPT_SERVER_PATTERN  "server."
+#define GLUSTER_OPT_HOST            "host"
+#define GLUSTER_OPT_PORT            "port"
+#define GLUSTER_OPT_TO              "to"
+#define GLUSTER_OPT_IPV4            "ipv4"
+#define GLUSTER_OPT_IPV6            "ipv6"
+#define GLUSTER_OPT_SOCKET          "socket"
+#define GLUSTER_OPT_DEBUG           "debug"
+#define GLUSTER_DEFAULT_PORT        24007
+#define GLUSTER_DEBUG_DEFAULT       4
+#define GLUSTER_DEBUG_MAX           9
+
+#define GERR_INDEX_HINT "hint: check in 'server' array index '%d'\n"
 
 typedef struct GlusterAIOCB {
     int64_t size;
@@ -28,27 +48,141 @@ typedef struct BDRVGlusterState {
     int debug_level;
 } BDRVGlusterState;
 
-typedef struct GlusterConf {
-    char *server;
-    int port;
-    char *volname;
-    char *image;
-    char *transport;
-    int debug_level;
-} GlusterConf;
+typedef struct BDRVGlusterReopenState {
+    struct glfs *glfs;
+    struct glfs_fd *fd;
+} BDRVGlusterReopenState;
 
-static void qemu_gluster_gconf_free(GlusterConf *gconf)
-{
-    if (gconf) {
-        g_free(gconf->server);
-        g_free(gconf->volname);
-        g_free(gconf->image);
-        g_free(gconf->transport);
-        g_free(gconf);
+
+static QemuOptsList qemu_gluster_create_opts = {
+    .name = "qemu-gluster-create-opts",
+    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
+    .desc = {
+        {
+            .name = BLOCK_OPT_SIZE,
+            .type = QEMU_OPT_SIZE,
+            .help = "Virtual disk size"
+        },
+        {
+            .name = BLOCK_OPT_PREALLOC,
+            .type = QEMU_OPT_STRING,
+            .help = "Preallocation mode (allowed values: off, full)"
+        },
+        {
+            .name = GLUSTER_OPT_DEBUG,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Gluster log level, valid range is 0-9",
+        },
+        { /* end of list */ }
     }
-}
+};
+
+static QemuOptsList runtime_opts = {
+    .name = "gluster",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = GLUSTER_OPT_FILENAME,
+            .type = QEMU_OPT_STRING,
+            .help = "URL to the gluster image",
+        },
+        {
+            .name = GLUSTER_OPT_DEBUG,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Gluster log level, valid range is 0-9",
+        },
+        { /* end of list */ }
+    },
+};
+
+static QemuOptsList runtime_json_opts = {
+    .name = "gluster_json",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_json_opts.head),
+    .desc = {
+        {
+            .name = GLUSTER_OPT_VOLUME,
+            .type = QEMU_OPT_STRING,
+            .help = "name of gluster volume where VM image resides",
+        },
+        {
+            .name = GLUSTER_OPT_PATH,
+            .type = QEMU_OPT_STRING,
+            .help = "absolute path to image file in gluster volume",
+        },
+        {
+            .name = GLUSTER_OPT_DEBUG,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Gluster log level, valid range is 0-9",
+        },
+        { /* end of list */ }
+    },
+};
+
+static QemuOptsList runtime_type_opts = {
+    .name = "gluster_type",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_type_opts.head),
+    .desc = {
+        {
+            .name = GLUSTER_OPT_TYPE,
+            .type = QEMU_OPT_STRING,
+            .help = "tcp|unix",
+        },
+        { /* end of list */ }
+    },
+};
+
+static QemuOptsList runtime_unix_opts = {
+    .name = "gluster_unix",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_unix_opts.head),
+    .desc = {
+        {
+            .name = GLUSTER_OPT_SOCKET,
+            .type = QEMU_OPT_STRING,
+            .help = "socket file path)",
+        },
+        { /* end of list */ }
+    },
+};
+
+static QemuOptsList runtime_tcp_opts = {
+    .name = "gluster_tcp",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head),
+    .desc = {
+        {
+            .name = GLUSTER_OPT_TYPE,
+            .type = QEMU_OPT_STRING,
+            .help = "tcp|unix",
+        },
+        {
+            .name = GLUSTER_OPT_HOST,
+            .type = QEMU_OPT_STRING,
+            .help = "host address (hostname/ipv4/ipv6 addresses)",
+        },
+        {
+            .name = GLUSTER_OPT_PORT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "port number on which glusterd is listening (default 24007)",
+        },
+        {
+            .name = "to",
+            .type = QEMU_OPT_NUMBER,
+            .help = "max port number, not supported by gluster",
+        },
+        {
+            .name = "ipv4",
+            .type = QEMU_OPT_BOOL,
+            .help = "ipv4 bool value, not supported by gluster",
+        },
+        {
+            .name = "ipv6",
+            .type = QEMU_OPT_BOOL,
+            .help = "ipv6 bool value, not supported by gluster",
+        },
+        { /* end of list */ }
+    },
+};
 
-static int parse_volume_options(GlusterConf *gconf, char *path)
+static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path)
 {
     char *p, *q;
 
@@ -62,31 +196,29 @@ static int parse_volume_options(GlusterConf *gconf, char *path)
     if (*p == '\0') {
         return -EINVAL;
     }
-    gconf->volname = g_strndup(q, p - q);
+    gconf->volume = g_strndup(q, p - q);
 
-    /* image */
+    /* path */
     p += strspn(p, "/");
     if (*p == '\0') {
         return -EINVAL;
     }
-    gconf->image = g_strdup(p);
+    gconf->path = g_strdup(p);
     return 0;
 }
 
 /*
- * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
+ * file=gluster[+transport]://[host[:port]]/volume/path[?socket=...]
  *
  * 'gluster' is the protocol.
  *
  * 'transport' specifies the transport type used to connect to gluster
  * management daemon (glusterd). Valid transport types are
- * tcp, unix and rdma. If a transport type isn't specified, then tcp
- * type is assumed.
+ * tcp or unix. If a transport type isn't specified, then tcp type is assumed.
  *
- * 'server' specifies the server where the volume file specification for
- * the given volume resides. This can be either hostname, ipv4 address
- * or ipv6 address. ipv6 address needs to be within square brackets [ ].
- * If transport type is 'unix', then 'server' field should not be specified.
+ * 'host' specifies the host where the volume file specification for
+ * the given volume resides. This can be either hostname or ipv4 address.
+ * If transport type is 'unix', then 'host' field should not be specified.
  * The 'socket' field needs to be populated with the path to unix domain
  * socket.
  *
@@ -95,23 +227,22 @@ static int parse_volume_options(GlusterConf *gconf, char *path)
  * default port. If the transport type is unix, then 'port' should not be
  * specified.
  *
- * 'volname' is the name of the gluster volume which contains the VM image.
+ * 'volume' is the name of the gluster volume which contains the VM image.
  *
- * 'image' is the path to the actual VM image that resides on gluster volume.
+ * 'path' is the path to the actual VM image that resides on gluster volume.
  *
  * Examples:
  *
  * file=gluster://1.2.3.4/testvol/a.img
  * file=gluster+tcp://1.2.3.4/testvol/a.img
  * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
- * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
- * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
- * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
+ * file=gluster+tcp://host.domain.com:24007/testvol/dir/a.img
  * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
- * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
  */
-static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
+static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
+                                  const char *filename)
 {
+    GlusterServer *gsconf;
     URI *uri;
     QueryParams *qp = NULL;
     bool is_unix = false;
@@ -122,16 +253,21 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
         return -EINVAL;
     }
 
+    gconf->server = g_new0(GlusterServerList, 1);
+    gconf->server->value = gsconf = g_new0(GlusterServer, 1);
+
     /* transport */
     if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
-        gconf->transport = g_strdup("tcp");
+        gsconf->type = GLUSTER_TRANSPORT_TCP;
     } else if (!strcmp(uri->scheme, "gluster+tcp")) {
-        gconf->transport = g_strdup("tcp");
+        gsconf->type = GLUSTER_TRANSPORT_TCP;
     } else if (!strcmp(uri->scheme, "gluster+unix")) {
-        gconf->transport = g_strdup("unix");
+        gsconf->type = GLUSTER_TRANSPORT_UNIX;
         is_unix = true;
     } else if (!strcmp(uri->scheme, "gluster+rdma")) {
-        gconf->transport = g_strdup("rdma");
+        gsconf->type = GLUSTER_TRANSPORT_TCP;
+        error_report("Warning: rdma feature is not supported, falling "
+                     "back to tcp");
     } else {
         ret = -EINVAL;
         goto out;
@@ -157,10 +293,14 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
             ret = -EINVAL;
             goto out;
         }
-        gconf->server = g_strdup(qp->p[0].value);
+        gsconf->u.q_unix.path = g_strdup(qp->p[0].value);
     } else {
-        gconf->server = g_strdup(uri->server ? uri->server : "localhost");
-        gconf->port = uri->port;
+        gsconf->u.tcp.host = g_strdup(uri->server ? uri->server : "localhost");
+        if (uri->port) {
+            gsconf->u.tcp.port = g_strdup_printf("%d", uri->port);
+        } else {
+            gsconf->u.tcp.port = g_strdup_printf("%d", GLUSTER_DEFAULT_PORT);
+        }
     }
 
 out:
@@ -171,30 +311,34 @@ out:
     return ret;
 }
 
-static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
-                                      Error **errp)
+static struct glfs *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
+                                           Error **errp)
 {
-    struct glfs *glfs = NULL;
+    struct glfs *glfs;
     int ret;
     int old_errno;
+    GlusterServerList *server;
 
-    ret = qemu_gluster_parseuri(gconf, filename);
-    if (ret < 0) {
-        error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/"
-                   "volname/image[?socket=...]");
-        errno = -ret;
-        goto out;
-    }
-
-    glfs = glfs_new(gconf->volname);
+    glfs = glfs_new(gconf->volume);
     if (!glfs) {
         goto out;
     }
 
-    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
-            gconf->port);
-    if (ret < 0) {
-        goto out;
+    for (server = gconf->server; server; server = server->next) {
+        if (server->value->type  == GLUSTER_TRANSPORT_UNIX) {
+            ret = glfs_set_volfile_server(glfs,
+                                   GlusterTransport_lookup[server->value->type],
+                                   server->value->u.q_unix.path, 0);
+        } else {
+            ret = glfs_set_volfile_server(glfs,
+                                   GlusterTransport_lookup[server->value->type],
+                                   server->value->u.tcp.host,
+                                   atoi(server->value->u.tcp.port));
+        }
+
+        if (ret < 0) {
+            goto out;
+        }
     }
 
     ret = glfs_set_logging(glfs, "-", gconf->debug_level);
@@ -204,15 +348,25 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
 
     ret = glfs_init(glfs);
     if (ret) {
-        error_setg_errno(errp, errno,
-                         "Gluster connection failed for server=%s port=%d "
-                         "volume=%s image=%s transport=%s", gconf->server,
-                         gconf->port, gconf->volname, gconf->image,
-                         gconf->transport);
+        error_setg(errp, "Gluster connection for volume %s, path %s failed"
+                         " to connect", gconf->volume, gconf->path);
+        for (server = gconf->server; server; server = server->next) {
+            if (server->value->type  == GLUSTER_TRANSPORT_UNIX) {
+                error_append_hint(errp, "hint: failed on socket %s ",
+                                  server->value->u.q_unix.path);
+            } else {
+                error_append_hint(errp, "hint: failed on host %s and port %s ",
+                                  server->value->u.tcp.host,
+                                  server->value->u.tcp.port);
+            }
+        }
+
+        error_append_hint(errp, "Please refer to gluster logs for more info\n");
 
         /* glfs_init sometimes doesn't set errno although docs suggest that */
-        if (errno == 0)
+        if (errno == 0) {
             errno = EINVAL;
+        }
 
         goto out;
     }
@@ -227,6 +381,226 @@ out:
     return NULL;
 }
 
+static int qapi_enum_parse(const char *opt)
+{
+    int i;
+
+    if (!opt) {
+        return GLUSTER_TRANSPORT__MAX;
+    }
+
+    for (i = 0; i < GLUSTER_TRANSPORT__MAX; i++) {
+        if (!strcmp(opt, GlusterTransport_lookup[i])) {
+            return i;
+        }
+    }
+
+    return i;
+}
+
+/*
+ * Convert the json formatted command line into qapi.
+*/
+static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
+                                  QDict *options, Error **errp)
+{
+    QemuOpts *opts;
+    GlusterServer *gsconf;
+    GlusterServerList *curr = NULL;
+    QDict *backing_options = NULL;
+    Error *local_err = NULL;
+    char *str = NULL;
+    const char *ptr;
+    size_t num_servers;
+    int i;
+
+    /* create opts info from runtime_json_opts list */
+    opts = qemu_opts_create(&runtime_json_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    num_servers = qdict_array_entries(options, GLUSTER_OPT_SERVER_PATTERN);
+    if (num_servers < 1) {
+        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
+        goto out;
+    }
+
+    ptr = qemu_opt_get(opts, GLUSTER_OPT_VOLUME);
+    if (!ptr) {
+        error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_VOLUME);
+        goto out;
+    }
+    gconf->volume = g_strdup(ptr);
+
+    ptr = qemu_opt_get(opts, GLUSTER_OPT_PATH);
+    if (!ptr) {
+        error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_PATH);
+        goto out;
+    }
+    gconf->path = g_strdup(ptr);
+    qemu_opts_del(opts);
+
+    for (i = 0; i < num_servers; i++) {
+        str = g_strdup_printf(GLUSTER_OPT_SERVER_PATTERN"%d.", i);
+        qdict_extract_subqdict(options, &backing_options, str);
+
+        /* create opts info from runtime_type_opts list */
+        opts = qemu_opts_create(&runtime_type_opts, NULL, 0, &error_abort);
+        qemu_opts_absorb_qdict(opts, backing_options, &local_err);
+        if (local_err) {
+            goto out;
+        }
+
+        ptr = qemu_opt_get(opts, GLUSTER_OPT_TYPE);
+        gsconf = g_new0(GlusterServer, 1);
+        gsconf->type = qapi_enum_parse(ptr);
+        if (!ptr) {
+            error_setg(&local_err, QERR_MISSING_PARAMETER, GLUSTER_OPT_TYPE);
+            error_append_hint(&local_err, GERR_INDEX_HINT, i);
+            goto out;
+
+        }
+        if (gsconf->type == GLUSTER_TRANSPORT__MAX) {
+            error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE,
+                       GLUSTER_OPT_TYPE, "tcp or unix");
+            error_append_hint(&local_err, GERR_INDEX_HINT, i);
+            goto out;
+        }
+        qemu_opts_del(opts);
+
+        if (gsconf->type == GLUSTER_TRANSPORT_TCP) {
+            /* create opts info from runtime_tcp_opts list */
+            opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
+            qemu_opts_absorb_qdict(opts, backing_options, &local_err);
+            if (local_err) {
+                goto out;
+            }
+
+            ptr = qemu_opt_get(opts, GLUSTER_OPT_HOST);
+            if (!ptr) {
+                error_setg(&local_err, QERR_MISSING_PARAMETER,
+                           GLUSTER_OPT_HOST);
+                error_append_hint(&local_err, GERR_INDEX_HINT, i);
+                goto out;
+            }
+            gsconf->u.tcp.host = g_strdup(ptr);
+            ptr = qemu_opt_get(opts, GLUSTER_OPT_PORT);
+            if (!ptr) {
+                error_setg(&local_err, QERR_MISSING_PARAMETER,
+                           GLUSTER_OPT_PORT);
+                error_append_hint(&local_err, GERR_INDEX_HINT, i);
+                goto out;
+            }
+            gsconf->u.tcp.port = g_strdup(ptr);
+
+            /* defend for unsupported fields in InetSocketAddress,
+             * i.e. @ipv4, @ipv6  and @to
+             */
+            ptr = qemu_opt_get(opts, GLUSTER_OPT_TO);
+            if (ptr) {
+                gsconf->u.tcp.has_to = true;
+            }
+            ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV4);
+            if (ptr) {
+                gsconf->u.tcp.has_ipv4 = true;
+            }
+            ptr = qemu_opt_get(opts, GLUSTER_OPT_IPV6);
+            if (ptr) {
+                gsconf->u.tcp.has_ipv6 = true;
+            }
+            if (gsconf->u.tcp.has_to) {
+                error_setg(&local_err, "Parameter 'to' not supported");
+                goto out;
+            }
+            if (gsconf->u.tcp.has_ipv4 || gsconf->u.tcp.has_ipv6) {
+                error_setg(&local_err, "Parameters 'ipv4/ipv6' not supported");
+                goto out;
+            }
+            qemu_opts_del(opts);
+        } else {
+            /* create opts info from runtime_unix_opts list */
+            opts = qemu_opts_create(&runtime_unix_opts, NULL, 0, &error_abort);
+            qemu_opts_absorb_qdict(opts, backing_options, &local_err);
+            if (local_err) {
+                goto out;
+            }
+
+            ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET);
+            if (!ptr) {
+                error_setg(&local_err, QERR_MISSING_PARAMETER,
+                           GLUSTER_OPT_SOCKET);
+                error_append_hint(&local_err, GERR_INDEX_HINT, i);
+                goto out;
+            }
+            gsconf->u.q_unix.path = g_strdup(ptr);
+            qemu_opts_del(opts);
+        }
+
+        if (gconf->server == NULL) {
+            gconf->server = g_new0(GlusterServerList, 1);
+            gconf->server->value = gsconf;
+            curr = gconf->server;
+        } else {
+            curr->next = g_new0(GlusterServerList, 1);
+            curr->next->value = gsconf;
+            curr = curr->next;
+        }
+
+        qdict_del(backing_options, str);
+        g_free(str);
+        str = NULL;
+    }
+
+    return 0;
+
+out:
+    error_propagate(errp, local_err);
+    qemu_opts_del(opts);
+    if (str) {
+        qdict_del(backing_options, str);
+        g_free(str);
+    }
+    errno = EINVAL;
+    return -errno;
+}
+
+static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
+                                      const char *filename,
+                                      QDict *options, Error **errp)
+{
+    int ret;
+    if (filename) {
+        ret = qemu_gluster_parse_uri(gconf, filename);
+        if (ret < 0) {
+            error_setg(errp, "invalid URI");
+            error_append_hint(errp, "Usage: file=gluster[+transport]://"
+                                    "[host[:port]]/volume/path[?socket=...]\n");
+            errno = -ret;
+            return NULL;
+        }
+    } else {
+        ret = qemu_gluster_parse_json(gconf, options, errp);
+        if (ret < 0) {
+            error_append_hint(errp, "Usage: "
+                             "-drive driver=qcow2,file.driver=gluster,"
+                             "file.volume=testvol,file.path=/path/a.qcow2"
+                             "[,file.debug=9],file.server.0.type=tcp,"
+                             "file.server.0.host=1.2.3.4,"
+                             "file.server.0.port=24007,"
+                             "file.server.1.transport=unix,"
+                             "file.server.1.socket=/var/run/glusterd.socket ..."
+                             "\n");
+            errno = -ret;
+            return NULL;
+        }
+
+    }
+
+    return qemu_gluster_glfs_init(gconf, errp);
+}
+
 static void qemu_gluster_complete_aio(void *opaque)
 {
     GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
@@ -255,30 +629,6 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
     qemu_bh_schedule(acb->bh);
 }
 
-#define GLUSTER_OPT_FILENAME "filename"
-#define GLUSTER_OPT_DEBUG "debug"
-#define GLUSTER_DEBUG_DEFAULT 4
-#define GLUSTER_DEBUG_MAX 9
-
-/* TODO Convert to fine grained options */
-static QemuOptsList runtime_opts = {
-    .name = "gluster",
-    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
-    .desc = {
-        {
-            .name = GLUSTER_OPT_FILENAME,
-            .type = QEMU_OPT_STRING,
-            .help = "URL to the gluster image",
-        },
-        {
-            .name = GLUSTER_OPT_DEBUG,
-            .type = QEMU_OPT_NUMBER,
-            .help = "Gluster log level, valid range is 0-9",
-        },
-        { /* end of list */ }
-    },
-};
-
 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
 {
     assert(open_flags != NULL);
@@ -324,7 +674,7 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
     BDRVGlusterState *s = bs->opaque;
     int open_flags = 0;
     int ret = 0;
-    GlusterConf *gconf = g_new0(GlusterConf, 1);
+    BlockdevOptionsGluster *gconf = NULL;
     QemuOpts *opts;
     Error *local_err = NULL;
     const char *filename;
@@ -347,8 +697,10 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
         s->debug_level = GLUSTER_DEBUG_MAX;
     }
 
+    gconf = g_new0(BlockdevOptionsGluster, 1);
     gconf->debug_level = s->debug_level;
-    s->glfs = qemu_gluster_init(gconf, filename, errp);
+    gconf->has_debug_level = true;
+    s->glfs = qemu_gluster_init(gconf, filename, options, errp);
     if (!s->glfs) {
         ret = -errno;
         goto out;
@@ -373,7 +725,7 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
 
     qemu_gluster_parse_flags(bdrv_flags, &open_flags);
 
-    s->fd = glfs_open(s->glfs, gconf->image, open_flags);
+    s->fd = glfs_open(s->glfs, gconf->path, open_flags);
     if (!s->fd) {
         ret = -errno;
     }
@@ -382,7 +734,7 @@ static int qemu_gluster_open(BlockDriverState *bs,  QDict *options,
 
 out:
     qemu_opts_del(opts);
-    qemu_gluster_gconf_free(gconf);
+    qapi_free_BlockdevOptionsGluster(gconf);
     if (!ret) {
         return ret;
     }
@@ -395,19 +747,13 @@ out:
     return ret;
 }
 
-typedef struct BDRVGlusterReopenState {
-    struct glfs *glfs;
-    struct glfs_fd *fd;
-} BDRVGlusterReopenState;
-
-
 static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
                                        BlockReopenQueue *queue, Error **errp)
 {
     int ret = 0;
     BDRVGlusterState *s;
     BDRVGlusterReopenState *reop_s;
-    GlusterConf *gconf = NULL;
+    BlockdevOptionsGluster *gconf;
     int open_flags = 0;
 
     assert(state != NULL);
@@ -420,10 +766,10 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
 
     qemu_gluster_parse_flags(state->flags, &open_flags);
 
-    gconf = g_new0(GlusterConf, 1);
-
+    gconf = g_new0(BlockdevOptionsGluster, 1);
     gconf->debug_level = s->debug_level;
-    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
+    gconf->has_debug_level = true;
+    reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, NULL, errp);
     if (reop_s->glfs == NULL) {
         ret = -errno;
         goto exit;
@@ -439,7 +785,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
     }
 #endif
 
-    reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags);
+    reop_s->fd = glfs_open(reop_s->glfs, gconf->path, open_flags);
     if (reop_s->fd == NULL) {
         /* reops->glfs will be cleaned up in _abort */
         ret = -errno;
@@ -448,7 +794,7 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
 
 exit:
     /* state->opaque will be freed in either the _abort or _commit */
-    qemu_gluster_gconf_free(gconf);
+    qapi_free_BlockdevOptionsGluster(gconf);
     return ret;
 }
 
@@ -501,7 +847,9 @@ static void qemu_gluster_reopen_abort(BDRVReopenState *state)
 
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
 static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
-        int64_t offset, int size, BdrvRequestFlags flags)
+                                                      int64_t offset,
+                                                      int size,
+                                                      BdrvRequestFlags flags)
 {
     int ret;
     GlusterAIOCB acb;
@@ -527,7 +875,7 @@ static inline bool gluster_supports_zerofill(void)
 }
 
 static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
-        int64_t size)
+                                        int64_t size)
 {
     return glfs_zerofill(fd, offset, size);
 }
@@ -539,7 +887,7 @@ static inline bool gluster_supports_zerofill(void)
 }
 
 static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
-        int64_t size)
+                                        int64_t size)
 {
     return 0;
 }
@@ -548,14 +896,15 @@ static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
 static int qemu_gluster_create(const char *filename,
                                QemuOpts *opts, Error **errp)
 {
+    BlockdevOptionsGluster *gconf;
     struct glfs *glfs;
     struct glfs_fd *fd;
     int ret = 0;
     int prealloc = 0;
     int64_t total_size = 0;
     char *tmp = NULL;
-    GlusterConf *gconf = g_new0(GlusterConf, 1);
 
+    gconf = g_new0(BlockdevOptionsGluster, 1);
     gconf->debug_level = qemu_opt_get_number_del(opts, GLUSTER_OPT_DEBUG,
                                                  GLUSTER_DEBUG_DEFAULT);
     if (gconf->debug_level < 0) {
@@ -563,8 +912,9 @@ static int qemu_gluster_create(const char *filename,
     } else if (gconf->debug_level > GLUSTER_DEBUG_MAX) {
         gconf->debug_level = GLUSTER_DEBUG_MAX;
     }
+    gconf->has_debug_level = true;
 
-    glfs = qemu_gluster_init(gconf, filename, errp);
+    glfs = qemu_gluster_init(gconf, filename, NULL, errp);
     if (!glfs) {
         ret = -errno;
         goto out;
@@ -576,19 +926,17 @@ static int qemu_gluster_create(const char *filename,
     tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
     if (!tmp || !strcmp(tmp, "off")) {
         prealloc = 0;
-    } else if (!strcmp(tmp, "full") &&
-               gluster_supports_zerofill()) {
+    } else if (!strcmp(tmp, "full") && gluster_supports_zerofill()) {
         prealloc = 1;
     } else {
         error_setg(errp, "Invalid preallocation mode: '%s'"
-            " or GlusterFS doesn't support zerofill API",
-            tmp);
+                         " or GlusterFS doesn't support zerofill API", tmp);
         ret = -EINVAL;
         goto out;
     }
 
-    fd = glfs_creat(glfs, gconf->image,
-        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+    fd = glfs_creat(glfs, gconf->path,
+                    O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
     if (!fd) {
         ret = -errno;
     } else {
@@ -606,7 +954,7 @@ static int qemu_gluster_create(const char *filename,
     }
 out:
     g_free(tmp);
-    qemu_gluster_gconf_free(gconf);
+    qapi_free_BlockdevOptionsGluster(gconf);
     if (glfs) {
         glfs_fini(glfs);
     }
@@ -614,7 +962,8 @@ out:
 }
 
 static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
+                                           int64_t sector_num, int nb_sectors,
+                                           QEMUIOVector *qiov, int write)
 {
     int ret;
     GlusterAIOCB acb;
@@ -629,10 +978,10 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
 
     if (write) {
         ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            gluster_finish_aiocb, &acb);
+                                 gluster_finish_aiocb, &acb);
     } else {
         ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            gluster_finish_aiocb, &acb);
+                                gluster_finish_aiocb, &acb);
     }
 
     if (ret < 0) {
@@ -657,13 +1006,17 @@ static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
 }
 
 static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+                                              int64_t sector_num,
+                                              int nb_sectors,
+                                              QEMUIOVector *qiov)
 {
     return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
 }
 
 static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+                                               int64_t sector_num,
+                                               int nb_sectors,
+                                               QEMUIOVector *qiov)
 {
     return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
 }
@@ -724,14 +1077,12 @@ error:
 }
 
 #ifdef CONFIG_GLUSTERFS_DISCARD
-static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors)
+static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
+                                                 int64_t offset, int size)
 {
     int ret;
     GlusterAIOCB acb;
     BDRVGlusterState *s = bs->opaque;
-    size_t size = nb_sectors * BDRV_SECTOR_SIZE;
-    off_t offset = sector_num * BDRV_SECTOR_SIZE;
 
     acb.size = 0;
     acb.ret = 0;
@@ -934,34 +1285,11 @@ static int64_t coroutine_fn qemu_gluster_co_get_block_status(
 }
 
 
-static QemuOptsList qemu_gluster_create_opts = {
-    .name = "qemu-gluster-create-opts",
-    .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head),
-    .desc = {
-        {
-            .name = BLOCK_OPT_SIZE,
-            .type = QEMU_OPT_SIZE,
-            .help = "Virtual disk size"
-        },
-        {
-            .name = BLOCK_OPT_PREALLOC,
-            .type = QEMU_OPT_STRING,
-            .help = "Preallocation mode (allowed values: off, full)"
-        },
-        {
-            .name = GLUSTER_OPT_DEBUG,
-            .type = QEMU_OPT_NUMBER,
-            .help = "Gluster log level, valid range is 0-9",
-        },
-        { /* end of list */ }
-    }
-};
-
 static BlockDriver bdrv_gluster = {
     .format_name                  = "gluster",
     .protocol_name                = "gluster",
     .instance_size                = sizeof(BDRVGlusterState),
-    .bdrv_needs_filename          = true,
+    .bdrv_needs_filename          = false,
     .bdrv_file_open               = qemu_gluster_open,
     .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
     .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
@@ -976,7 +1304,7 @@ static BlockDriver bdrv_gluster = {
     .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_co_discard              = qemu_gluster_co_discard,
+    .bdrv_co_pdiscard             = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
     .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
@@ -989,7 +1317,7 @@ static BlockDriver bdrv_gluster_tcp = {
     .format_name                  = "gluster",
     .protocol_name                = "gluster+tcp",
     .instance_size                = sizeof(BDRVGlusterState),
-    .bdrv_needs_filename          = true,
+    .bdrv_needs_filename          = false,
     .bdrv_file_open               = qemu_gluster_open,
     .bdrv_reopen_prepare          = qemu_gluster_reopen_prepare,
     .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
@@ -1004,7 +1332,7 @@ static BlockDriver bdrv_gluster_tcp = {
     .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_co_discard              = qemu_gluster_co_discard,
+    .bdrv_co_pdiscard             = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
     .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
@@ -1032,7 +1360,7 @@ static BlockDriver bdrv_gluster_unix = {
     .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_co_discard              = qemu_gluster_co_discard,
+    .bdrv_co_pdiscard             = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
     .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
@@ -1041,6 +1369,12 @@ static BlockDriver bdrv_gluster_unix = {
     .create_opts                  = &qemu_gluster_create_opts,
 };
 
+/* rdma is deprecated (actually never supported for volfile fetch).
+ * Let's maintain it for the protocol compatibility, to make sure things
+ * won't break immediately. For now, gluster+rdma will fall back to gluster+tcp
+ * protocol with a warning.
+ * TODO: remove gluster+rdma interface support
+ */
 static BlockDriver bdrv_gluster_rdma = {
     .format_name                  = "gluster",
     .protocol_name                = "gluster+rdma",
@@ -1060,7 +1394,7 @@ static BlockDriver bdrv_gluster_rdma = {
     .bdrv_co_flush_to_disk        = qemu_gluster_co_flush_to_disk,
     .bdrv_has_zero_init           = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-    .bdrv_co_discard              = qemu_gluster_co_discard,
+    .bdrv_co_pdiscard             = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
     .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
diff --git a/block/io.c b/block/io.c
index cfda714..7323f0f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -33,14 +33,13 @@
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
-                                         int64_t sector_num,
-                                         QEMUIOVector *qiov,
-                                         int nb_sectors,
-                                         BdrvRequestFlags flags,
-                                         BlockCompletionFunc *cb,
-                                         void *opaque,
-                                         bool is_write);
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+                                          int64_t offset,
+                                          QEMUIOVector *qiov,
+                                          BdrvRequestFlags flags,
+                                          BlockCompletionFunc *cb,
+                                          void *opaque,
+                                          bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int count, BdrvRequestFlags flags);
@@ -971,21 +970,25 @@ err:
 
 /*
  * Forwards an already correctly aligned request to the BlockDriver. This
- * handles copy on read and zeroing after EOF; any other features must be
- * implemented by the caller.
+ * handles copy on read, zeroing after EOF, and fragmentation of large
+ * reads; any other features must be implemented by the caller.
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
     int64_t align, QEMUIOVector *qiov, int flags)
 {
     int64_t total_bytes, max_bytes;
-    int ret;
+    int ret = 0;
+    uint64_t bytes_remaining = bytes;
+    int max_transfer;
 
     assert(is_power_of_2(align));
     assert((offset & (align - 1)) == 0);
     assert((bytes & (align - 1)) == 0);
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
+    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+                                   align);
 
     /* TODO: We would need a per-BDS .supported_read_flags and
      * potential fallback support, if we ever implement any read flags
@@ -1024,7 +1027,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         }
     }
 
-    /* Forward the request to the BlockDriver */
+    /* Forward the request to the BlockDriver, possibly fragmenting it */
     total_bytes = bdrv_getlength(bs);
     if (total_bytes < 0) {
         ret = total_bytes;
@@ -1032,30 +1035,39 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
     }
 
     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
-    if (bytes <= max_bytes) {
+    if (bytes <= max_bytes && bytes <= max_transfer) {
         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
-    } else if (max_bytes > 0) {
-        QEMUIOVector local_qiov;
+        goto out;
+    }
 
-        qemu_iovec_init(&local_qiov, qiov->niov);
-        qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
+    while (bytes_remaining) {
+        int num;
 
-        ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
+        if (max_bytes) {
+            QEMUIOVector local_qiov;
 
-        qemu_iovec_destroy(&local_qiov);
-    } else {
-        ret = 0;
-    }
+            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
+            assert(num);
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
 
-    /* Reading beyond end of file is supposed to produce zeroes */
-    if (ret == 0 && total_bytes < offset + bytes) {
-        uint64_t zero_offset = MAX(0, total_bytes - offset);
-        uint64_t zero_bytes = offset + bytes - zero_offset;
-        qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
+            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
+                                     num, &local_qiov, 0);
+            max_bytes -= num;
+            qemu_iovec_destroy(&local_qiov);
+        } else {
+            num = bytes_remaining;
+            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
+                                    bytes_remaining);
+        }
+        if (ret < 0) {
+            goto out;
+        }
+        bytes_remaining -= num;
     }
 
 out:
-    return ret;
+    return ret < 0 ? ret : 0;
 }
 
 /*
@@ -1256,7 +1268,8 @@ fail:
 }
 
 /*
- * Forwards an already correctly aligned write request to the BlockDriver.
+ * Forwards an already correctly aligned write request to the BlockDriver,
+ * after possibly fragmenting it.
  */
 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
@@ -1268,6 +1281,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 
     int64_t start_sector = offset >> BDRV_SECTOR_BITS;
     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
+    uint64_t bytes_remaining = bytes;
+    int max_transfer;
 
     assert(is_power_of_2(align));
     assert((offset & (align - 1)) == 0);
@@ -1275,6 +1290,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     assert(!qiov || bytes == qiov->size);
     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
     assert(!(flags & ~BDRV_REQ_MASK));
+    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+                                   align);
 
     waited = wait_serialising_requests(req);
     assert(!waited || !req->serialising);
@@ -1297,9 +1314,34 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
-    } else {
+    } else if (bytes <= max_transfer) {
         bdrv_debug_event(bs, BLKDBG_PWRITEV);
         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
+    } else {
+        bdrv_debug_event(bs, BLKDBG_PWRITEV);
+        while (bytes_remaining) {
+            int num = MIN(bytes_remaining, max_transfer);
+            QEMUIOVector local_qiov;
+            int local_flags = flags;
+
+            assert(num);
+            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
+                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
+                /* If FUA is going to be emulated by flush, we only
+                 * need to flush on the last iteration */
+                local_flags &= ~BDRV_REQ_FUA;
+            }
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
+
+            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
+                                      num, &local_qiov, local_flags);
+            qemu_iovec_destroy(&local_qiov);
+            if (ret < 0) {
+                break;
+            }
+            bytes_remaining -= num;
+        }
     }
     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
 
@@ -1312,6 +1354,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 
     if (ret >= 0) {
         bs->total_sectors = MAX(bs->total_sectors, end_sector);
+        ret = 0;
     }
 
     return ret;
@@ -1971,8 +2014,9 @@ BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
 {
     trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
 
-    return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
-                                 cb, opaque, false);
+    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+                                  0, cb, opaque, false);
 }
 
 BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
@@ -1981,8 +2025,9 @@ BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
 {
     trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
 
-    return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
-                                 cb, opaque, true);
+    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+                                  0, cb, opaque, true);
 }
 
 void bdrv_aio_cancel(BlockAIOCB *acb)
@@ -2018,8 +2063,8 @@ typedef struct BlockRequest {
     union {
         /* Used during read, write, trim */
         struct {
-            int64_t sector;
-            int nb_sectors;
+            int64_t offset;
+            int bytes;
             int flags;
             QEMUIOVector *qiov;
         };
@@ -2083,24 +2128,23 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque)
     BlockAIOCBCoroutine *acb = opaque;
 
     if (!acb->is_write) {
-        acb->req.error = bdrv_co_do_readv(acb->child, acb->req.sector,
-            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+        acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
+            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
     } else {
-        acb->req.error = bdrv_co_do_writev(acb->child, acb->req.sector,
-            acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+        acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
+            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
     }
 
     bdrv_co_complete(acb);
 }
 
-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
-                                         int64_t sector_num,
-                                         QEMUIOVector *qiov,
-                                         int nb_sectors,
-                                         BdrvRequestFlags flags,
-                                         BlockCompletionFunc *cb,
-                                         void *opaque,
-                                         bool is_write)
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+                                          int64_t offset,
+                                          QEMUIOVector *qiov,
+                                          BdrvRequestFlags flags,
+                                          BlockCompletionFunc *cb,
+                                          void *opaque,
+                                          bool is_write)
 {
     Coroutine *co;
     BlockAIOCBCoroutine *acb;
@@ -2109,8 +2153,7 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
     acb->child = child;
     acb->need_bh = true;
     acb->req.error = -EINPROGRESS;
-    acb->req.sector = sector_num;
-    acb->req.nb_sectors = nb_sectors;
+    acb->req.offset = offset;
     acb->req.qiov = qiov;
     acb->req.flags = flags;
     acb->is_write = is_write;
@@ -2150,30 +2193,29 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
     return &acb->common;
 }
 
-static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque)
 {
     BlockAIOCBCoroutine *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
 
-    acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+    acb->req.error = bdrv_co_pdiscard(bs, acb->req.offset, acb->req.bytes);
     bdrv_co_complete(acb);
 }
 
-BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors,
-        BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count,
+                              BlockCompletionFunc *cb, void *opaque)
 {
     Coroutine *co;
     BlockAIOCBCoroutine *acb;
 
-    trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+    trace_bdrv_aio_pdiscard(bs, offset, count, opaque);
 
     acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
     acb->need_bh = true;
     acb->req.error = -EINPROGRESS;
-    acb->req.sector = sector_num;
-    acb->req.nb_sectors = nb_sectors;
-    co = qemu_coroutine_create(bdrv_aio_discard_co_entry, acb);
+    acb->req.offset = offset;
+    acb->req.bytes = count;
+    co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb);
     qemu_coroutine_enter(co);
 
     bdrv_co_maybe_schedule_bh(acb);
@@ -2346,28 +2388,29 @@ int bdrv_flush(BlockDriverState *bs)
 
 typedef struct DiscardCo {
     BlockDriverState *bs;
-    int64_t sector_num;
-    int nb_sectors;
+    int64_t offset;
+    int count;
     int ret;
 } DiscardCo;
-static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
 {
     DiscardCo *rwco = opaque;
 
-    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
 }
 
-int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
-                                 int nb_sectors)
+int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
+                                  int count)
 {
     BdrvTrackedRequest req;
-    int max_discard, ret;
+    int max_pdiscard, ret;
+    int head, align;
 
     if (!bs->drv) {
         return -ENOMEDIUM;
     }
 
-    ret = bdrv_check_request(bs, sector_num, nb_sectors);
+    ret = bdrv_check_byte_request(bs, offset, count);
     if (ret < 0) {
         return ret;
     } else if (bs->read_only) {
@@ -2380,50 +2423,47 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
-    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
+        return 0;
+    }
+
+    /* Discard is advisory, so ignore any unaligned head or tail */
+    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
+    assert(is_power_of_2(align));
+    head = MIN(count, -offset & (align - 1));
+    if (head) {
+        count -= head;
+        offset += head;
+    }
+    count = QEMU_ALIGN_DOWN(count, align);
+    if (!count) {
         return 0;
     }
 
-    tracked_request_begin(&req, bs, sector_num << BDRV_SECTOR_BITS,
-                          nb_sectors << BDRV_SECTOR_BITS, BDRV_TRACKED_DISCARD);
+    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
     if (ret < 0) {
         goto out;
     }
 
-    max_discard = MIN_NON_ZERO(bs->bl.max_pdiscard >> BDRV_SECTOR_BITS,
-                               BDRV_REQUEST_MAX_SECTORS);
-    while (nb_sectors > 0) {
-        int ret;
-        int num = nb_sectors;
-        int discard_alignment = bs->bl.pdiscard_alignment >> BDRV_SECTOR_BITS;
-
-        /* align request */
-        if (discard_alignment &&
-            num >= discard_alignment &&
-            sector_num % discard_alignment) {
-            if (num > discard_alignment) {
-                num = discard_alignment;
-            }
-            num -= sector_num % discard_alignment;
-        }
+    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
+                                   align);
 
-        /* limit request size */
-        if (num > max_discard) {
-            num = max_discard;
-        }
+    while (count > 0) {
+        int ret;
+        int num = MIN(count, max_pdiscard);
 
-        if (bs->drv->bdrv_co_discard) {
-            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+        if (bs->drv->bdrv_co_pdiscard) {
+            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
         } else {
             BlockAIOCB *acb;
             CoroutineIOCompletion co = {
                 .coroutine = qemu_coroutine_self(),
             };
 
-            acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
-                                            bdrv_co_io_em_complete, &co);
+            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
+                                             bdrv_co_io_em_complete, &co);
             if (acb == NULL) {
                 ret = -EIO;
                 goto out;
@@ -2436,8 +2476,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
             goto out;
         }
 
-        sector_num += num;
-        nb_sectors -= num;
+        offset += num;
+        count -= num;
     }
     ret = 0;
 out:
@@ -2448,23 +2488,23 @@ out:
     return ret;
 }
 
-int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
     Coroutine *co;
     DiscardCo rwco = {
         .bs = bs,
-        .sector_num = sector_num,
-        .nb_sectors = nb_sectors,
+        .offset = offset,
+        .count = count,
         .ret = NOT_DONE,
     };
 
     if (qemu_in_coroutine()) {
         /* Fast-path if already in coroutine context */
-        bdrv_discard_co_entry(&rwco);
+        bdrv_pdiscard_co_entry(&rwco);
     } else {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
-        co = qemu_coroutine_create(bdrv_discard_co_entry, &rwco);
+        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
         qemu_coroutine_enter(co);
         while (rwco.ret == NOT_DONE) {
             aio_poll(aio_context, true);
diff --git a/block/iscsi.c b/block/iscsi.c
index 129c3af..95ce9e1 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -586,11 +586,8 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
         return -EINVAL;
     }
 
-    if (bs->bl.max_transfer &&
-        nb_sectors << BDRV_SECTOR_BITS > bs->bl.max_transfer) {
-        error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len "
-                     "of %" PRIu32 " bytes", nb_sectors, bs->bl.max_transfer);
-        return -EINVAL;
+    if (bs->bl.max_transfer) {
+        assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer);
     }
 
     lba = sector_qemu2lun(sector_num, iscsilun);
@@ -754,11 +751,8 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
         return -EINVAL;
     }
 
-    if (bs->bl.max_transfer &&
-        nb_sectors << BDRV_SECTOR_BITS > bs->bl.max_transfer) {
-        error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len "
-                     "of %" PRIu32 " bytes", nb_sectors, bs->bl.max_transfer);
-        return -EINVAL;
+    if (bs->bl.max_transfer) {
+        assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer);
     }
 
     /* if cache.direct is off and we have a valid entry in our allocation map
@@ -1048,29 +1042,26 @@ iscsi_getlength(BlockDriverState *bs)
 }
 
 static int
-coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
-                                   int nb_sectors)
+coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
     IscsiLun *iscsilun = bs->opaque;
     struct IscsiTask iTask;
     struct unmap_list list;
 
-    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        return -EINVAL;
-    }
+    assert(is_byte_request_lun_aligned(offset, count, iscsilun));
 
     if (!iscsilun->lbp.lbpu) {
         /* UNMAP is not supported by the target */
         return 0;
     }
 
-    list.lba = sector_qemu2lun(sector_num, iscsilun);
-    list.num = sector_qemu2lun(nb_sectors, iscsilun);
+    list.lba = offset / iscsilun->block_size;
+    list.num = count / iscsilun->block_size;
 
     iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
     if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
-                     iscsi_co_generic_cb, &iTask) == NULL) {
+                         iscsi_co_generic_cb, &iTask) == NULL) {
         return -ENOMEM;
     }
 
@@ -1100,7 +1091,8 @@ retry:
         return iTask.err_code;
     }
 
-    iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
+    iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
+                               count >> BDRV_SECTOR_BITS);
 
     return 0;
 }
@@ -2004,7 +1996,7 @@ static BlockDriver bdrv_iscsi = {
     .bdrv_refresh_limits = iscsi_refresh_limits,
 
     .bdrv_co_get_block_status = iscsi_co_get_block_status,
-    .bdrv_co_discard      = iscsi_co_discard,
+    .bdrv_co_pdiscard      = iscsi_co_pdiscard,
     .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
     .bdrv_co_readv         = iscsi_co_readv,
     .bdrv_co_writev_flags  = iscsi_co_writev_flags,
diff --git a/block/mirror.c b/block/mirror.c
index 9ae11e5..69a1a7c 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -58,9 +58,10 @@ typedef struct MirrorBlockJob {
     QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
     int buf_free_count;
 
+    uint64_t last_pause_ns;
     unsigned long *in_flight_bitmap;
     int in_flight;
-    int sectors_in_flight;
+    int64_t sectors_in_flight;
     int ret;
     bool unmap;
     bool waiting_for_io;
@@ -303,8 +304,9 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
     s->in_flight++;
     s->sectors_in_flight += nb_sectors;
     if (is_discard) {
-        blk_aio_discard(s->target, sector_num, op->nb_sectors,
-                        mirror_write_complete, op);
+        blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS,
+                         op->nb_sectors << BDRV_SECTOR_BITS,
+                         mirror_write_complete, op);
     } else {
         blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                               op->nb_sectors * BDRV_SECTOR_SIZE,
@@ -322,6 +324,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     int nb_chunks = 1;
     int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
     int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
 
     sector_num = hbitmap_iter_next(&s->hbi);
     if (sector_num < 0) {
@@ -372,7 +375,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
     while (nb_chunks > 0 && sector_num < end) {
         int ret;
-        int io_sectors;
+        int io_sectors, io_sectors_acct;
         BlockDriverState *file;
         enum MirrorMethod {
             MIRROR_METHOD_COPY,
@@ -405,16 +408,26 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
             }
         }
 
+        while (s->in_flight >= MAX_IN_FLIGHT) {
+            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+            mirror_wait_for_io(s);
+        }
+
         mirror_clip_sectors(s, sector_num, &io_sectors);
         switch (mirror_method) {
         case MIRROR_METHOD_COPY:
             io_sectors = mirror_do_read(s, sector_num, io_sectors);
+            io_sectors_acct = io_sectors;
             break;
         case MIRROR_METHOD_ZERO:
-            mirror_do_zero_or_discard(s, sector_num, io_sectors, false);
-            break;
         case MIRROR_METHOD_DISCARD:
-            mirror_do_zero_or_discard(s, sector_num, io_sectors, true);
+            mirror_do_zero_or_discard(s, sector_num, io_sectors,
+                                      mirror_method == MIRROR_METHOD_DISCARD);
+            if (write_zeroes_ok) {
+                io_sectors_acct = 0;
+            } else {
+                io_sectors_acct = io_sectors;
+            }
             break;
         default:
             abort();
@@ -423,7 +436,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
         sector_num += io_sectors;
         nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
         if (s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors);
+            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
         }
     }
     return delay_ns;
@@ -511,19 +524,94 @@ static void mirror_exit(BlockJob *job, void *opaque)
     bdrv_unref(src);
 }
 
+static void mirror_throttle(MirrorBlockJob *s)
+{
+    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+
+    if (now - s->last_pause_ns > SLICE_TIME) {
+        s->last_pause_ns = now;
+        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
+    } else {
+        block_job_pause_point(&s->common);
+    }
+}
+
+static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
+{
+    int64_t sector_num, end;
+    BlockDriverState *base = s->base;
+    BlockDriverState *bs = blk_bs(s->common.blk);
+    BlockDriverState *target_bs = blk_bs(s->target);
+    int ret, n;
+
+    end = s->bdev_length / BDRV_SECTOR_SIZE;
+
+    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
+        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
+            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
+            return 0;
+        }
+
+        for (sector_num = 0; sector_num < end; ) {
+            int nb_sectors = MIN(end - sector_num,
+                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);
+
+            mirror_throttle(s);
+
+            if (block_job_is_cancelled(&s->common)) {
+                return 0;
+            }
+
+            if (s->in_flight >= MAX_IN_FLIGHT) {
+                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
+                mirror_wait_for_io(s);
+                continue;
+            }
+
+            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
+            sector_num += nb_sectors;
+        }
+
+        mirror_drain(s);
+    }
+
+    /* First part, loop on the sectors and initialize the dirty bitmap.  */
+    for (sector_num = 0; sector_num < end; ) {
+        /* Just to make sure we are not exceeding int limit. */
+        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
+                             end - sector_num);
+
+        mirror_throttle(s);
+
+        if (block_job_is_cancelled(&s->common)) {
+            return 0;
+        }
+
+        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
+        if (ret < 0) {
+            return ret;
+        }
+
+        assert(n > 0);
+        if (ret == 1) {
+            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
+        }
+        sector_num += n;
+    }
+    return 0;
+}
+
 static void coroutine_fn mirror_run(void *opaque)
 {
     MirrorBlockJob *s = opaque;
     MirrorExitData *data;
     BlockDriverState *bs = blk_bs(s->common.blk);
     BlockDriverState *target_bs = blk_bs(s->target);
-    int64_t sector_num, end, length;
-    uint64_t last_pause_ns;
+    int64_t length;
     BlockDriverInfo bdi;
     char backing_filename[2]; /* we only need 2 characters because we are only
                                  checking for a NULL string */
     int ret = 0;
-    int n;
     int target_cluster_size = BDRV_SECTOR_SIZE;
 
     if (block_job_is_cancelled(&s->common)) {
@@ -565,7 +653,6 @@ static void coroutine_fn mirror_run(void *opaque)
     s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
     s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);
 
-    end = s->bdev_length / BDRV_SECTOR_SIZE;
     s->buf = qemu_try_blockalign(bs, s->buf_size);
     if (s->buf == NULL) {
         ret = -ENOMEM;
@@ -574,47 +661,18 @@ static void coroutine_fn mirror_run(void *opaque)
 
     mirror_free_init(s);
 
-    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     if (!s->is_none_mode) {
-        /* First part, loop on the sectors and initialize the dirty bitmap.  */
-        BlockDriverState *base = s->base;
-        bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(target_bs);
-
-        for (sector_num = 0; sector_num < end; ) {
-            /* Just to make sure we are not exceeding int limit. */
-            int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
-                                 end - sector_num);
-            int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-
-            if (now - last_pause_ns > SLICE_TIME) {
-                last_pause_ns = now;
-                block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
-            } else {
-                block_job_pause_point(&s->common);
-            }
-
-            if (block_job_is_cancelled(&s->common)) {
-                goto immediate_exit;
-            }
-
-            ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
-
-            if (ret < 0) {
-                goto immediate_exit;
-            }
-
-            assert(n > 0);
-            if (ret == 1 || mark_all_dirty) {
-                bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
-            }
-            sector_num += n;
+        ret = mirror_dirty_init(s);
+        if (ret < 0 || block_job_is_cancelled(&s->common)) {
+            goto immediate_exit;
         }
     }
 
     bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
     for (;;) {
         uint64_t delay_ns = 0;
-        int64_t cnt;
+        int64_t cnt, delta;
         bool should_complete;
 
         if (s->ret < 0) {
@@ -637,9 +695,10 @@ static void coroutine_fn mirror_run(void *opaque)
          * We do so every SLICE_TIME nanoseconds, or when there is an error,
          * or when the source is clean, whichever comes first.
          */
-        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME &&
+        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
+        if (delta < SLICE_TIME &&
             s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
-            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
+            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                 (cnt == 0 && s->in_flight > 0)) {
                 trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                 mirror_wait_for_io(s);
@@ -707,7 +766,7 @@ static void coroutine_fn mirror_run(void *opaque)
             s->common.cancelled = false;
             break;
         }
-        last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     }
 
 immediate_exit:
diff --git a/block/nbd-client.c b/block/nbd-client.c
index 4cc408d..2cf3237 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -116,7 +116,7 @@ static void nbd_restart_write(void *opaque)
 
 static int nbd_co_send_request(BlockDriverState *bs,
                                struct nbd_request *request,
-                               QEMUIOVector *qiov, int offset)
+                               QEMUIOVector *qiov)
 {
     NbdClientSession *s = nbd_get_client_session(bs);
     AioContext *aio_context;
@@ -149,8 +149,8 @@ static int nbd_co_send_request(BlockDriverState *bs,
         qio_channel_set_cork(s->ioc, true);
         rc = nbd_send_request(s->ioc, request);
         if (rc >= 0) {
-            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
-                               offset, request->len, 0);
+            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len,
+                               false);
             if (ret != request->len) {
                 rc = -EIO;
             }
@@ -167,8 +167,9 @@ static int nbd_co_send_request(BlockDriverState *bs,
 }
 
 static void nbd_co_receive_reply(NbdClientSession *s,
-    struct nbd_request *request, struct nbd_reply *reply,
-    QEMUIOVector *qiov, int offset)
+                                 struct nbd_request *request,
+                                 struct nbd_reply *reply,
+                                 QEMUIOVector *qiov)
 {
     int ret;
 
@@ -181,8 +182,8 @@ static void nbd_co_receive_reply(NbdClientSession *s,
         reply->error = EIO;
     } else {
         if (qiov && reply->error == 0) {
-            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
-                               offset, request->len, 1);
+            ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len,
+                               true);
             if (ret != request->len) {
                 reply->error = EIO;
             }
@@ -217,36 +218,41 @@ static void nbd_coroutine_end(NbdClientSession *s,
     }
 }
 
-static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors, QEMUIOVector *qiov,
-                          int offset)
+int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
+                         uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = { .type = NBD_CMD_READ };
+    struct nbd_request request = {
+        .type = NBD_CMD_READ,
+        .from = offset,
+        .len = bytes,
+    };
     struct nbd_reply reply;
     ssize_t ret;
 
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
+    assert(bytes <= NBD_MAX_BUFFER_SIZE);
+    assert(!flags);
 
     nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, NULL, 0);
+    ret = nbd_co_send_request(bs, &request, NULL);
     if (ret < 0) {
         reply.error = -ret;
     } else {
-        nbd_co_receive_reply(client, &request, &reply, qiov, offset);
+        nbd_co_receive_reply(client, &request, &reply, qiov);
     }
     nbd_coroutine_end(client, &request);
     return -reply.error;
-
 }
 
-static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
-                           int nb_sectors, QEMUIOVector *qiov,
-                           int offset, int flags)
+int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                          uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
     NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = { .type = NBD_CMD_WRITE };
+    struct nbd_request request = {
+        .type = NBD_CMD_WRITE,
+        .from = offset,
+        .len = bytes,
+    };
     struct nbd_reply reply;
     ssize_t ret;
 
@@ -255,55 +261,19 @@ static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
         request.type |= NBD_CMD_FLAG_FUA;
     }
 
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
+    assert(bytes <= NBD_MAX_BUFFER_SIZE);
 
     nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, qiov, offset);
+    ret = nbd_co_send_request(bs, &request, qiov);
     if (ret < 0) {
         reply.error = -ret;
     } else {
-        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+        nbd_co_receive_reply(client, &request, &reply, NULL);
     }
     nbd_coroutine_end(client, &request);
     return -reply.error;
 }
 
-int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
-                        int nb_sectors, QEMUIOVector *qiov)
-{
-    int offset = 0;
-    int ret;
-    while (nb_sectors > NBD_MAX_SECTORS) {
-        ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
-        if (ret < 0) {
-            return ret;
-        }
-        offset += NBD_MAX_SECTORS * 512;
-        sector_num += NBD_MAX_SECTORS;
-        nb_sectors -= NBD_MAX_SECTORS;
-    }
-    return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
-}
-
-int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov, int flags)
-{
-    int offset = 0;
-    int ret;
-    while (nb_sectors > NBD_MAX_SECTORS) {
-        ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset,
-                              flags);
-        if (ret < 0) {
-            return ret;
-        }
-        offset += NBD_MAX_SECTORS * 512;
-        sector_num += NBD_MAX_SECTORS;
-        nb_sectors -= NBD_MAX_SECTORS;
-    }
-    return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags);
-}
-
 int nbd_client_co_flush(BlockDriverState *bs)
 {
     NbdClientSession *client = nbd_get_client_session(bs);
@@ -319,36 +289,37 @@ int nbd_client_co_flush(BlockDriverState *bs)
     request.len = 0;
 
     nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, NULL, 0);
+    ret = nbd_co_send_request(bs, &request, NULL);
     if (ret < 0) {
         reply.error = -ret;
     } else {
-        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+        nbd_co_receive_reply(client, &request, &reply, NULL);
     }
     nbd_coroutine_end(client, &request);
     return -reply.error;
 }
 
-int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors)
+int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
     NbdClientSession *client = nbd_get_client_session(bs);
-    struct nbd_request request = { .type = NBD_CMD_TRIM };
+    struct nbd_request request = {
+        .type = NBD_CMD_TRIM,
+        .from = offset,
+        .len = count,
+    };
     struct nbd_reply reply;
     ssize_t ret;
 
     if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) {
         return 0;
     }
-    request.from = sector_num * 512;
-    request.len = nb_sectors * 512;
 
     nbd_coroutine_start(client, &request);
-    ret = nbd_co_send_request(bs, &request, NULL, 0);
+    ret = nbd_co_send_request(bs, &request, NULL);
     if (ret < 0) {
         reply.error = -ret;
     } else {
-        nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+        nbd_co_receive_reply(client, &request, &reply, NULL);
     }
     nbd_coroutine_end(client, &request);
     return -reply.error;
diff --git a/block/nbd-client.h b/block/nbd-client.h
index c618dad..fa9817b 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -44,13 +44,12 @@ int nbd_client_init(BlockDriverState *bs,
                     Error **errp);
 void nbd_client_close(BlockDriverState *bs);
 
-int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors);
+int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int nbd_client_co_flush(BlockDriverState *bs);
-int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
-                         int nb_sectors, QEMUIOVector *qiov, int flags);
-int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
-                        int nb_sectors, QEMUIOVector *qiov);
+int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                          uint64_t bytes, QEMUIOVector *qiov, int flags);
+int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
+                         uint64_t bytes, QEMUIOVector *qiov, int flags);
 
 void nbd_client_detach_aio_context(BlockDriverState *bs);
 void nbd_client_attach_aio_context(BlockDriverState *bs,
diff --git a/block/nbd.c b/block/nbd.c
index 08e5b67..8d57220 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -349,12 +349,6 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
     return ret;
 }
 
-static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
-                        int nb_sectors, QEMUIOVector *qiov)
-{
-    return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
-}
-
 static int nbd_co_flush(BlockDriverState *bs)
 {
     return nbd_client_co_flush(bs);
@@ -366,12 +360,6 @@ static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
     bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE;
 }
 
-static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
-                          int nb_sectors)
-{
-    return nbd_client_co_discard(bs, sector_num, nb_sectors);
-}
-
 static void nbd_close(BlockDriverState *bs)
 {
     nbd_client_close(bs);
@@ -450,11 +438,11 @@ static BlockDriver bdrv_nbd = {
     .instance_size              = sizeof(BDRVNBDState),
     .bdrv_parse_filename        = nbd_parse_filename,
     .bdrv_file_open             = nbd_open,
-    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev_flags       = nbd_client_co_writev,
+    .bdrv_co_preadv             = nbd_client_co_preadv,
+    .bdrv_co_pwritev            = nbd_client_co_pwritev,
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
-    .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
     .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
@@ -468,11 +456,11 @@ static BlockDriver bdrv_nbd_tcp = {
     .instance_size              = sizeof(BDRVNBDState),
     .bdrv_parse_filename        = nbd_parse_filename,
     .bdrv_file_open             = nbd_open,
-    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev_flags       = nbd_client_co_writev,
+    .bdrv_co_preadv             = nbd_client_co_preadv,
+    .bdrv_co_pwritev            = nbd_client_co_pwritev,
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
-    .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
     .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
@@ -486,11 +474,11 @@ static BlockDriver bdrv_nbd_unix = {
     .instance_size              = sizeof(BDRVNBDState),
     .bdrv_parse_filename        = nbd_parse_filename,
     .bdrv_file_open             = nbd_open,
-    .bdrv_co_readv              = nbd_co_readv,
-    .bdrv_co_writev_flags       = nbd_client_co_writev,
+    .bdrv_co_preadv             = nbd_client_co_preadv,
+    .bdrv_co_pwritev            = nbd_client_co_pwritev,
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
-    .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
     .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 49b6ce6..cbfb3fe 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -615,9 +615,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)
 
         /* Discard is optional, ignore the return value */
         if (ret >= 0) {
-            bdrv_discard(bs->file->bs,
-                         d->offset >> BDRV_SECTOR_BITS,
-                         d->bytes >> BDRV_SECTOR_BITS);
+            bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
         }
 
         g_free(d);
diff --git a/block/qcow2.c b/block/qcow2.c
index a6bca73..d620d0a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2479,15 +2479,15 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
     return ret;
 }
 
-static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors)
+static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
+                                          int64_t offset, int count)
 {
     int ret;
     BDRVQcow2State *s = bs->opaque;
 
     qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
-        nb_sectors, QCOW2_DISCARD_REQUEST, false);
+    ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS,
+                                 QCOW2_DISCARD_REQUEST, false);
     qemu_co_mutex_unlock(&s->lock);
     return ret;
 }
@@ -3410,7 +3410,7 @@ BlockDriver bdrv_qcow2 = {
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
 
     .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
-    .bdrv_co_discard        = qcow2_co_discard,
+    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
     .bdrv_truncate          = qcow2_truncate,
     .bdrv_write_compressed  = qcow2_write_compressed,
     .bdrv_make_empty        = qcow2_make_empty,
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 20f4d7a..6ed7547 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1214,7 +1214,7 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
 }
 
 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        int64_t offset, QEMUIOVector *qiov, int count,
         BlockCompletionFunc *cb, void *opaque, int type)
 {
     RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
@@ -1224,8 +1224,8 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
     acb->aio_type = type;
     acb->aio_fildes = fd;
 
-    acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
-    acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+    acb->aio_nbytes = count;
+    acb->aio_offset = offset;
 
     if (qiov) {
         acb->aio_iov = qiov->iov;
@@ -1233,7 +1233,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
         assert(qiov->size == acb->aio_nbytes);
     }
 
-    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+    trace_paio_submit(acb, opaque, offset, count, type);
     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
@@ -1786,13 +1786,13 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
     return ret | BDRV_BLOCK_OFFSET_VALID | start;
 }
 
-static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors,
+static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
+    int64_t offset, int count,
     BlockCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
 
-    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+    return paio_submit(bs, s->fd, offset, NULL, count,
                        cb, opaque, QEMU_AIO_DISCARD);
 }
 
@@ -1864,7 +1864,7 @@ BlockDriver bdrv_file = {
     .bdrv_co_preadv         = raw_co_preadv,
     .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_aio_flush = raw_aio_flush,
-    .bdrv_aio_discard = raw_aio_discard,
+    .bdrv_aio_pdiscard = raw_aio_pdiscard,
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
     .bdrv_io_unplug = raw_aio_unplug,
@@ -2203,8 +2203,8 @@ static int fd_open(BlockDriverState *bs)
     return -EIO;
 }
 
-static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors,
+static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
+    int64_t offset, int count,
     BlockCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
@@ -2212,7 +2212,7 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
     if (fd_open(bs) < 0) {
         return NULL;
     }
-    return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+    return paio_submit(bs, s->fd, offset, NULL, count,
                        cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }
 
@@ -2307,7 +2307,7 @@ static BlockDriver bdrv_host_device = {
     .bdrv_co_preadv         = raw_co_preadv,
     .bdrv_co_pwritev        = raw_co_pwritev,
     .bdrv_aio_flush	= raw_aio_flush,
-    .bdrv_aio_discard   = hdev_aio_discard,
+    .bdrv_aio_pdiscard   = hdev_aio_pdiscard,
     .bdrv_refresh_limits = raw_refresh_limits,
     .bdrv_io_plug = raw_aio_plug,
     .bdrv_io_unplug = raw_aio_unplug,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 9b813d9..56f45fe 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -142,7 +142,7 @@ static int aio_worker(void *arg)
 }
 
 static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        int64_t offset, QEMUIOVector *qiov, int count,
         BlockCompletionFunc *cb, void *opaque, int type)
 {
     RawWin32AIOData *acb = g_new(RawWin32AIOData, 1);
@@ -155,11 +155,12 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
     if (qiov) {
         acb->aio_iov = qiov->iov;
         acb->aio_niov = qiov->niov;
+        assert(qiov->size == count);
     }
-    acb->aio_nbytes = nb_sectors * 512;
-    acb->aio_offset = sector_num * 512;
+    acb->aio_nbytes = count;
+    acb->aio_offset = offset;
 
-    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+    trace_paio_submit(acb, opaque, offset, count, type);
     pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
     return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
@@ -378,9 +379,10 @@ static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
     BDRVRawState *s = bs->opaque;
     if (s->aio) {
         return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-                                nb_sectors, cb, opaque, QEMU_AIO_READ); 
+                                nb_sectors, cb, opaque, QEMU_AIO_READ);
     } else {
-        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
+                           nb_sectors << BDRV_SECTOR_BITS,
                            cb, opaque, QEMU_AIO_READ);
     }
 }
@@ -392,9 +394,10 @@ static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
     BDRVRawState *s = bs->opaque;
     if (s->aio) {
         return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-                                nb_sectors, cb, opaque, QEMU_AIO_WRITE); 
+                                nb_sectors, cb, opaque, QEMU_AIO_WRITE);
     } else {
-        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+        return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
+                           nb_sectors << BDRV_SECTOR_BITS,
                            cb, opaque, QEMU_AIO_WRITE);
     }
 }
diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 5f9dd29..588d408 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -50,33 +50,30 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state,
     return 0;
 }
 
-static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
-                                     int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
+                                      uint64_t bytes, QEMUIOVector *qiov,
+                                      int flags)
 {
     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-    return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov);
+    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
 
-static int coroutine_fn
-raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                    QEMUIOVector *qiov, int flags)
+static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                                       uint64_t bytes, QEMUIOVector *qiov,
+                                       int flags)
 {
     void *buf = NULL;
     BlockDriver *drv;
     QEMUIOVector local_qiov;
     int ret;
 
-    if (bs->probed && sector_num == 0) {
-        /* As long as these conditions are true, we can't get partial writes to
-         * the probe buffer and can just directly check the request. */
+    if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
+        /* Handling partial writes would be a pain - so we just
+         * require that guests have 512-byte request alignment if
+         * probing occurred */
         QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512);
         QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512);
-
-        if (nb_sectors == 0) {
-            /* qemu_iovec_to_buf() would fail, but we want to return success
-             * instead of -EINVAL in this case. */
-            return 0;
-        }
+        assert(offset == 0 && bytes >= BLOCK_PROBE_BUF_SIZE);
 
         buf = qemu_try_blockalign(bs->file->bs, 512);
         if (!buf) {
@@ -105,8 +102,7 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    ret = bdrv_co_pwritev(bs->file, sector_num * BDRV_SECTOR_SIZE,
-                          nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);
+    ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
 
 fail:
     if (qiov == &local_qiov) {
@@ -134,10 +130,10 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
     return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
 }
 
-static int coroutine_fn raw_co_discard(BlockDriverState *bs,
-                                       int64_t sector_num, int nb_sectors)
+static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
+                                        int64_t offset, int count)
 {
-    return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors);
+    return bdrv_co_pdiscard(bs->file->bs, offset, count);
 }
 
 static int64_t raw_getlength(BlockDriverState *bs)
@@ -150,6 +146,16 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     return bdrv_get_info(bs->file->bs, bdi);
 }
 
+static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    if (bs->probed) {
+        /* To make it easier to protect the first sector, any probed
+         * image is restricted to read-modify-write on sub-sector
+         * operations. */
+        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
+    }
+}
+
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
 {
     return bdrv_truncate(bs->file->bs, offset);
@@ -192,8 +198,10 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
     bs->sg = bs->file->bs->sg;
-    bs->supported_write_flags = BDRV_REQ_FUA;
-    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP;
+    bs->supported_write_flags = BDRV_REQ_FUA &
+        bs->file->bs->supported_write_flags;
+    bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+        bs->file->bs->supported_zero_flags;
 
     if (bs->probed && !bdrv_is_read_only(bs)) {
         fprintf(stderr,
@@ -238,15 +246,16 @@ BlockDriver bdrv_raw = {
     .bdrv_open            = &raw_open,
     .bdrv_close           = &raw_close,
     .bdrv_create          = &raw_create,
-    .bdrv_co_readv        = &raw_co_readv,
-    .bdrv_co_writev_flags = &raw_co_writev_flags,
+    .bdrv_co_preadv       = &raw_co_preadv,
+    .bdrv_co_pwritev      = &raw_co_pwritev,
     .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
-    .bdrv_co_discard      = &raw_co_discard,
+    .bdrv_co_pdiscard     = &raw_co_pdiscard,
     .bdrv_co_get_block_status = &raw_co_get_block_status,
     .bdrv_truncate        = &raw_truncate,
     .bdrv_getlength       = &raw_getlength,
     .has_variable_length  = true,
     .bdrv_get_info        = &raw_get_info,
+    .bdrv_refresh_limits  = &raw_refresh_limits,
     .bdrv_probe_blocksizes = &raw_probe_blocksizes,
     .bdrv_probe_geometry  = &raw_probe_geometry,
     .bdrv_media_changed   = &raw_media_changed,
diff --git a/block/rbd.c b/block/rbd.c
index 0a5840d..0106fea 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -649,9 +649,9 @@ static int rbd_aio_flush_wrapper(rbd_image_t image,
 }
 
 static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
-                                 int64_t sector_num,
+                                 int64_t off,
                                  QEMUIOVector *qiov,
-                                 int nb_sectors,
+                                 int64_t size,
                                  BlockCompletionFunc *cb,
                                  void *opaque,
                                  RBDAIOCmd cmd)
@@ -659,7 +659,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
     RBDAIOCB *acb;
     RADOSCB *rcb = NULL;
     rbd_completion_t c;
-    int64_t off, size;
     char *buf;
     int r;
 
@@ -668,6 +667,7 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
     acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
     acb->cmd = cmd;
     acb->qiov = qiov;
+    assert(!qiov || qiov->size == size);
     if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
         acb->bounce = NULL;
     } else {
@@ -687,9 +687,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
 
     buf = acb->bounce;
 
-    off = sector_num * BDRV_SECTOR_SIZE;
-    size = nb_sectors * BDRV_SECTOR_SIZE;
-
     rcb = g_new(RADOSCB, 1);
     rcb->acb = acb;
     rcb->buf = buf;
@@ -739,7 +736,8 @@ static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
                                       BlockCompletionFunc *cb,
                                       void *opaque)
 {
-    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
+    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
+                         nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
                          RBD_AIO_READ);
 }
 
@@ -750,7 +748,8 @@ static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
                                        BlockCompletionFunc *cb,
                                        void *opaque)
 {
-    return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
+    return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
+                         nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
                          RBD_AIO_WRITE);
 }
 
@@ -931,13 +930,13 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
 }
 
 #ifdef LIBRBD_SUPPORTS_DISCARD
-static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
-                                        int64_t sector_num,
-                                        int nb_sectors,
-                                        BlockCompletionFunc *cb,
-                                        void *opaque)
+static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
+                                         int64_t offset,
+                                         int count,
+                                         BlockCompletionFunc *cb,
+                                         void *opaque)
 {
-    return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque,
+    return rbd_start_aio(bs, offset, NULL, count, cb, opaque,
                          RBD_AIO_DISCARD);
 }
 #endif
@@ -1001,7 +1000,7 @@ static BlockDriver bdrv_rbd = {
 #endif
 
 #ifdef LIBRBD_SUPPORTS_DISCARD
-    .bdrv_aio_discard       = qemu_rbd_aio_discard,
+    .bdrv_aio_pdiscard      = qemu_rbd_aio_pdiscard,
 #endif
 
     .bdrv_snapshot_create   = qemu_rbd_snap_create,
diff --git a/block/sheepdog.c b/block/sheepdog.c
index e739c56..66e1cb2 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2800,8 +2800,8 @@ static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
 }
 
 
-static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
-                                      int nb_sectors)
+static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
+                                      int count)
 {
     SheepdogAIOCB *acb;
     BDRVSheepdogState *s = bs->opaque;
@@ -2811,7 +2811,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
     uint32_t zero = 0;
 
     if (!s->discard_supported) {
-            return 0;
+        return 0;
     }
 
     memset(&discard_iov, 0, sizeof(discard_iov));
@@ -2820,7 +2820,10 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
     iov.iov_len = sizeof(zero);
     discard_iov.iov = &iov;
     discard_iov.niov = 1;
-    acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors);
+    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((count & (BDRV_SECTOR_SIZE - 1)) == 0);
+    acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
+                       count >> BDRV_SECTOR_BITS);
     acb->aiocb_type = AIOCB_DISCARD_OBJ;
     acb->aio_done_func = sd_finish_aiocb;
 
@@ -2954,7 +2957,7 @@ static BlockDriver bdrv_sheepdog = {
     .bdrv_co_readv  = sd_co_readv,
     .bdrv_co_writev = sd_co_writev,
     .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-    .bdrv_co_discard = sd_co_discard,
+    .bdrv_co_pdiscard = sd_co_pdiscard,
     .bdrv_co_get_block_status = sd_co_get_block_status,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
@@ -2990,7 +2993,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
     .bdrv_co_readv  = sd_co_readv,
     .bdrv_co_writev = sd_co_writev,
     .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-    .bdrv_co_discard = sd_co_discard,
+    .bdrv_co_pdiscard = sd_co_pdiscard,
     .bdrv_co_get_block_status = sd_co_get_block_status,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
@@ -3026,7 +3029,7 @@ static BlockDriver bdrv_sheepdog_unix = {
     .bdrv_co_readv  = sd_co_readv,
     .bdrv_co_writev = sd_co_writev,
     .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-    .bdrv_co_discard = sd_co_discard,
+    .bdrv_co_pdiscard = sd_co_pdiscard,
     .bdrv_co_get_block_status = sd_co_get_block_status,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
diff --git a/block/trace-events b/block/trace-events
index 354967e..978ef4f 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -9,7 +9,7 @@ blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags
 blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"
 
 # block/io.c
-bdrv_aio_discard(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_aio_pdiscard(void *bs, int64_t offset, int count, void *opaque) "bs %p offset %"PRId64" count %d opaque %p"
 bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
 bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
@@ -58,7 +58,7 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p"
 # block/raw-win32.c
 # block/raw-posix.c
 paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d"
-paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d"
+paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
 
 # block/qcow2.c
 qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset %" PRIx64 " bytes %d"