From 421919d76b1755e1f9478f85cb71829ca5150ade Mon Sep 17 00:00:00 2001
From: Kevin Wolf
Date: Mon, 22 Jul 2019 17:22:00 +0200
Subject: block: Remove blk_pread_unthrottled()

The functionality offered by blk_pread_unthrottled() goes back to
commit 498e386c584. Back then, we couldn't perform I/O throttling with
synchronous requests because timers wouldn't be executed in polling
loops. So the commit automatically disabled I/O throttling as soon as a
synchronous request was issued.

However, for geometry detection during disk initialisation, we always
used (and still use) synchronous requests even if guest requests use
AIO later. Geometry detection was not supposed to disable I/O
throttling, so bdrv_pread_unthrottled() was introduced, which disabled
throttling only temporarily.

All of this isn't necessary any more because we do run timers in
polling loops and even synchronous requests now use coroutine
infrastructure internally. For this reason, commit 90c78624f already
removed the automatic disabling of I/O throttling.

It's time to get rid of the workaround for the removed code, and of its
abuse of blk_root_drained_begin()/end(), as well.

Signed-off-by: Kevin Wolf
Reviewed-by: Max Reitz
---
 block/block-backend.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'block')

diff --git a/block/block-backend.c b/block/block-backend.c
index 0056b52..fdd6b01 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1237,22 +1237,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
     return rwco.ret;
 }
 
-int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
-                          int count)
-{
-    int ret;
-
-    ret = blk_check_byte_request(blk, offset, count);
-    if (ret < 0) {
-        return ret;
-    }
-
-    blk_root_drained_begin(blk->root);
-    ret = blk_pread(blk, offset, buf, count);
-    blk_root_drained_end(blk->root, NULL);
-    return ret;
-}
-
 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, int bytes,
                       BdrvRequestFlags flags)
 {
--
cgit v1.1

From d2da5e288a2e71e82866c8fdefd41b5727300124 Mon Sep 17 00:00:00 2001
From: Kevin Wolf
Date: Mon, 22 Jul 2019 17:44:27 +0200
Subject: mirror: Keep mirror_top_bs drained after dropping permissions

mirror_top_bs is currently implicitly drained through its connection to
the source or the target node. However, the drain section for target_bs
ends early, after moving mirror_top_bs from src to target_bs, so
requests can already be restarted while mirror_top_bs is still present
in the chain but has dropped all permissions. This runs into an
assertion failure like this:

    qemu-system-x86_64: block/io.c:1634: bdrv_co_write_req_prepare:
    Assertion `child->perm & BLK_PERM_WRITE' failed.

Keep mirror_top_bs drained until all graph changes have completed.

Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf
Reviewed-by: Max Reitz
---
 block/mirror.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/mirror.c b/block/mirror.c
index 9f5c59e..642d657 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -656,7 +656,10 @@ static int mirror_exit_common(Job *job)
     s->target = NULL;
 
     /* We don't access the source any more. Dropping any WRITE/RESIZE is
-     * required before it could become a backing file of target_bs. */
+     * required before it could become a backing file of target_bs. Not
+     * having these permissions any more means that we can't allow any
+     * new requests on mirror_top_bs from now on, so keep it drained. */
+    bdrv_drained_begin(mirror_top_bs);
     bs_opaque->stop = true;
     bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
                              &error_abort);
@@ -724,6 +727,7 @@ static int mirror_exit_common(Job *job)
     bs_opaque->job = NULL;
 
     bdrv_drained_end(src);
+    bdrv_drained_end(mirror_top_bs);
     s->in_drain = false;
     bdrv_unref(mirror_top_bs);
     bdrv_unref(src);
--
cgit v1.1

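The ordering this patch establishes is worth spelling out, since the same
pattern recurs elsewhere in the block layer: any window in which a node has
lost permissions but is still wired into the graph must be covered by a
drained section. A condensed sketch of the resulting shape of
mirror_exit_common(), taken from the hunks above (the job-completion logic
in between is elided):

    /* The drained section spans the whole window in which mirror_top_bs
     * has dropped BLK_PERM_WRITE but is still part of the graph. */
    bdrv_drained_begin(mirror_top_bs);    /* block new requests */

    bs_opaque->stop = true;               /* the filter stops forwarding */
    bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
                             &error_abort);

    /* ... complete the job, replace mirror_top_bs in the graph ... */

    bdrv_drained_end(src);
    bdrv_drained_end(mirror_top_bs);      /* only now may requests resume */
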
From cf3129323f900ef5ddbccbe86e4fa801e88c566e Mon Sep 17 00:00:00 2001
From: Kevin Wolf
Date: Mon, 22 Jul 2019 17:46:23 +0200
Subject: block-backend: Queue requests while drained

This fixes devices like IDE that can still start new requests from I/O
handlers in the CPU thread while the block backend is drained.

The basic assumption is that in a drain section, no new requests should
be allowed through a BlockBackend (blk_drained_begin/end don't exist,
we get drain sections only on the node level). However, there are two
special cases where requests should not be queued:

1. Block jobs: We already make sure that block jobs are paused in a
   drain section, so they won't start new requests. However, if
   drain_begin is called on the job's BlockBackend first, we can
   deadlock because the job stays busy until it reaches a pause point -
   which it can't reach if its requests aren't processed any more.

   The proper solution here would be to send all requests through the
   job's filter node instead of using a BlockBackend. For now, just
   disabling request queuing on the job BlockBackend is simpler.

2. In test cases where making requests through bdrv_* would be
   cumbersome because we'd need a BdrvChild. Since case 1 already gives
   us the ability to disable request queuing, use it in tests, too, for
   convenience.

Signed-off-by: Kevin Wolf
Reviewed-by: Max Reitz
---
 block/backup.c        |  1 +
 block/block-backend.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++---
 block/commit.c        |  2 ++
 block/mirror.c        |  1 +
 4 files changed, 54 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/backup.c b/block/backup.c
index b26c22c..4743c8f 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -644,6 +644,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
     if (ret < 0) {
         goto error;
     }
+    blk_set_disable_request_queuing(job->target, true);
 
     job->on_source_error = on_source_error;
     job->on_target_error = on_target_error;
diff --git a/block/block-backend.c b/block/block-backend.c
index fdd6b01..c13c5c8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -79,6 +79,9 @@ struct BlockBackend {
     QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
 
     int quiesce_counter;
+    CoQueue queued_requests;
+    bool disable_request_queuing;
+
     VMChangeStateEntry *vmsh;
     bool force_allow_inactivate;
@@ -339,6 +342,7 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
 
     block_acct_init(&blk->stats);
 
+    qemu_co_queue_init(&blk->queued_requests);
     notifier_list_init(&blk->remove_bs_notifiers);
     notifier_list_init(&blk->insert_bs_notifiers);
     QLIST_INIT(&blk->aio_notifiers);
@@ -1096,6 +1100,11 @@ void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
     blk->allow_aio_context_change = allow;
 }
 
+void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
+{
+    blk->disable_request_queuing = disable;
+}
+
 static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
                                   size_t size)
 {
@@ -1127,13 +1136,24 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
     return 0;
 }
 
+static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
+{
+    if (blk->quiesce_counter && !blk->disable_request_queuing) {
+        qemu_co_queue_wait(&blk->queued_requests, NULL);
+    }
+}
+
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags)
 {
     int ret;
-    BlockDriverState *bs = blk_bs(blk);
+    BlockDriverState *bs;
 
+    blk_wait_while_drained(blk);
+
+    /* Call blk_bs() only after waiting, the graph may have changed */
+    bs = blk_bs(blk);
     trace_blk_co_preadv(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
@@ -1159,8 +1179,12 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                 BdrvRequestFlags flags)
 {
     int ret;
-    BlockDriverState *bs = blk_bs(blk);
+    BlockDriverState *bs;
 
+    blk_wait_while_drained(blk);
+
+    /* Call blk_bs() only after waiting, the graph may have changed */
+    bs = blk_bs(blk);
     trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
 
     ret = blk_check_byte_request(blk, offset, bytes);
@@ -1349,6 +1373,12 @@ static void blk_aio_read_entry(void *opaque)
     BlkRwCo *rwco = &acb->rwco;
     QEMUIOVector *qiov = rwco->iobuf;
 
+    if (rwco->blk->quiesce_counter) {
+        blk_dec_in_flight(rwco->blk);
+        blk_wait_while_drained(rwco->blk);
+        blk_inc_in_flight(rwco->blk);
+    }
+
     assert(qiov->size == acb->bytes);
     rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
                               qiov, rwco->flags);
@@ -1361,6 +1391,12 @@ static void blk_aio_write_entry(void *opaque)
     BlkRwCo *rwco = &acb->rwco;
     QEMUIOVector *qiov = rwco->iobuf;
 
+    if (rwco->blk->quiesce_counter) {
+        blk_dec_in_flight(rwco->blk);
+        blk_wait_while_drained(rwco->blk);
+        blk_inc_in_flight(rwco->blk);
+    }
+
     assert(!qiov || qiov->size == acb->bytes);
     rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
                                qiov, rwco->flags);
@@ -1482,6 +1518,8 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
 
 int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
+    blk_wait_while_drained(blk);
+
     if (!blk_is_available(blk)) {
         return -ENOMEDIUM;
     }
@@ -1522,7 +1560,11 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
 
 int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
 {
-    int ret = blk_check_byte_request(blk, offset, bytes);
+    int ret;
+
+    blk_wait_while_drained(blk);
+
+    ret = blk_check_byte_request(blk, offset, bytes);
     if (ret < 0) {
         return ret;
     }
@@ -1532,6 +1574,8 @@ int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
 
 int blk_co_flush(BlockBackend *blk)
 {
+    blk_wait_while_drained(blk);
+
     if (!blk_is_available(blk)) {
         return -ENOMEDIUM;
     }
@@ -2232,6 +2276,9 @@ static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
         if (blk->dev_ops && blk->dev_ops->drained_end) {
             blk->dev_ops->drained_end(blk->dev_opaque);
         }
+        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
+            /* Resume all queued requests */
+        }
     }
 }
diff --git a/block/commit.c b/block/commit.c
index 2c5a6d4..408ae15 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -350,6 +350,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     if (ret < 0) {
         goto fail;
     }
+    blk_set_disable_request_queuing(s->base, true);
     s->base_bs = base;
 
     /* Required permissions are already taken with block_job_add_bdrv() */
@@ -358,6 +359,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
     if (ret < 0) {
         goto fail;
     }
+    blk_set_disable_request_queuing(s->top, true);
 
     s->backing_file_str = g_strdup(backing_file_str);
     s->on_error = on_error;
diff --git a/block/mirror.c b/block/mirror.c
index 642d657..9b36391 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1636,6 +1636,7 @@ static BlockJob *mirror_start_job(
         blk_set_force_allow_inactivate(s->target);
     }
     blk_set_allow_aio_context_change(s->target, true);
+    blk_set_disable_request_queuing(s->target, true);
 
     s->replaces = g_strdup(replaces);
     s->on_source_error = on_source_error;
--
cgit v1.1

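The queuing mechanism added above is a plain coroutine queue: a request
parks itself while quiesce_counter is non-zero, and the drained_end callback
re-enters the parked coroutines one by one. The two halves, condensed from
the hunks above into a standalone sketch (the helper name
resume_queued_requests is invented here; in the patch the loop sits inline
in blk_root_drained_end()):

    /* Request side: called at the top of blk_co_preadv(), blk_co_pwritev(),
     * blk_co_flush(), blk_co_pdiscard() and blk_co_ioctl(). */
    static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
    {
        if (blk->quiesce_counter && !blk->disable_request_queuing) {
            /* Park this coroutine until the drained section ends. */
            qemu_co_queue_wait(&blk->queued_requests, NULL);
        }
    }

    /* Drain side: runs when the drained section ends. */
    static void resume_queued_requests(BlockBackend *blk)
    {
        /* Re-enter each parked coroutine, one at a time. */
        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
            /* Resume all queued requests */
        }
    }

Note that callers must re-read any graph state (such as blk_bs()) only after
waiting, because the graph may have changed while the coroutine was parked -
that is what the "Call blk_bs() only after waiting" hunks are about.
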
From a6b257a08e3d72219f03e461a52152672fec0612 Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Tue, 13 Aug 2019 21:21:03 +0300
Subject: file-posix: Handle undetectable alignment

In some cases buf_align or request_alignment cannot be detected:

1. With Gluster, buf_align cannot be detected since the actual I/O is
   done on the Gluster server, and qemu buffer alignment does not
   matter. Since we have no alignment requirement, buf_align=1 is the
   best value.

2. With a local XFS filesystem, buf_align cannot be detected if reading
   from an unallocated area. In this case we must align the buffer, but
   we don't know the correct size. Using the wrong alignment results in
   an I/O error.

3. With Gluster backed by XFS, request_alignment cannot be detected if
   reading from an unallocated area. In this case we need to use the
   correct alignment, and failing to do so results in I/O errors.

4. With NFS, the server does not use direct I/O, so neither buf_align
   nor request_alignment can be detected. In this case we don't need
   any alignment, so we can use buf_align=1 and request_alignment=1.

These cases seem to work when the storage sector size is 512 bytes,
because the current code starts checking at align=512. If the check
succeeds only because the alignment cannot be detected, we end up using
512. But this does not work for storage with a 4k sector size.

To determine if we can detect the alignment, we probe first with
align=1. If probing succeeds, maybe there is no alignment requirement
(cases 1, 4) or we are probing an unallocated area (cases 2, 3). Since
we don't have any way to tell, we treat this as undetectable alignment.
If probing with align=1 fails with EINVAL, but probing with one of the
expected alignments succeeds, we know that we found a working alignment.

Practically, the alignment requirements are the same for buffer
alignment, buffer length, and offset in the file. So in case we cannot
detect buf_align, we can use the request alignment. If we cannot detect
the request alignment, we can fall back to a safe value. To use this
logic, we probe the request alignment first, instead of buf_align.

Here is a table showing the behaviour with the current code (the value
in parentheses is the optimal value):
Case  Sector  buf_align (opt)  request_alignment (opt)  result
======================================================================
1     512     512   (1)        512   (512)              OK
1     4096    512   (1)        4096  (4096)             FAIL
----------------------------------------------------------------------
2     512     512   (512)      512   (512)              OK
2     4096    512   (4096)     4096  (4096)             FAIL
----------------------------------------------------------------------
3     512     512   (1)        512   (512)              OK
3     4096    512   (1)        512   (4096)             FAIL
----------------------------------------------------------------------
4     512     512   (1)        512   (1)                OK
4     4096    512   (1)        512   (1)                OK

Same cases with this change:

Case  Sector  buf_align (opt)  request_alignment (opt)  result
======================================================================
1     512     512   (1)        512   (512)              OK
1     4096    4096  (1)        4096  (4096)             OK
----------------------------------------------------------------------
2     512     512   (512)      512   (512)              OK
2     4096    4096  (4096)     4096  (4096)             OK
----------------------------------------------------------------------
3     512     4096  (1)        4096  (512)              OK
3     4096    4096  (1)        4096  (4096)             OK
----------------------------------------------------------------------
4     512     4096  (1)        4096  (1)                OK
4     4096    4096  (1)        4096  (1)                OK

I tested that provisioning VMs and copying disks on local XFS and
Gluster with a 4k sector size now work, resolving bugs [1], [2].
I also tested XFS, NFS, and Gluster with a 512-byte sector size.

[1] https://bugzilla.redhat.com/1737256
[2] https://bugzilla.redhat.com/1738657

Signed-off-by: Nir Soffer
Signed-off-by: Kevin Wolf
---
 block/file-posix.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

(limited to 'block')

diff --git a/block/file-posix.c b/block/file-posix.c
index 4479cc7..b8b4dad 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -323,6 +323,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
     BDRVRawState *s = bs->opaque;
     char *buf;
     size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
+    size_t alignments[] = {1, 512, 1024, 2048, 4096};
 
     /* For SCSI generic devices the alignment is not really used.
        With buffered I/O, we don't have any restrictions. */
@@ -349,25 +350,38 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
     }
 #endif
 
-    /* If we could not get the sizes so far, we can only guess them */
-    if (!s->buf_align) {
+    /*
+     * If we could not get the sizes so far, we can only guess them. First try
+     * to detect request alignment, since it is more likely to succeed. Then
+     * try to detect buf_align, which cannot be detected in some cases (e.g.
+     * Gluster). If buf_align cannot be detected, we fall back to the value of
+     * request_alignment.
+     */
+
+    if (!bs->bl.request_alignment) {
+        int i;
         size_t align;
-        buf = qemu_memalign(max_align, 2 * max_align);
-        for (align = 512; align <= max_align; align <<= 1) {
-            if (raw_is_io_aligned(fd, buf + align, max_align)) {
-                s->buf_align = align;
+        buf = qemu_memalign(max_align, max_align);
+        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+            align = alignments[i];
+            if (raw_is_io_aligned(fd, buf, align)) {
+                /* Fallback to safe value. */
+                bs->bl.request_alignment = (align != 1) ? align : max_align;
                 break;
             }
         }
         qemu_vfree(buf);
     }
 
-    if (!bs->bl.request_alignment) {
+    if (!s->buf_align) {
+        int i;
         size_t align;
-        buf = qemu_memalign(s->buf_align, max_align);
-        for (align = 512; align <= max_align; align <<= 1) {
-            if (raw_is_io_aligned(fd, buf, align)) {
-                bs->bl.request_alignment = align;
+        buf = qemu_memalign(max_align, 2 * max_align);
+        for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+            align = alignments[i];
+            if (raw_is_io_aligned(fd, buf + align, max_align)) {
+                /* Fall back to request_alignment. */
+                s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
                 break;
             }
         }
--
cgit v1.1
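
As a footnote to the file-posix patch, the underlying probing technique can
be shown outside of QEMU: under O_DIRECT, Linux fails a pread() with EINVAL
when the buffer, length, or offset violates the alignment requirement, so
walking a list of candidate alignments finds the smallest working one. The
sketch below is an illustration of that idea only, not QEMU code;
probe_request_alignment and the MAX_ALIGN fallback are invented for the
example, and the align=1 probe plays the same "undetectable" role as in the
patch:

    #define _GNU_SOURCE         /* for O_DIRECT */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define MAX_ALIGN 4096      /* safe fallback, like max_align above */

    /* EINVAL on an O_DIRECT pread() signals an alignment violation;
     * success or any other error means this alignment is acceptable. */
    static int is_io_aligned(int fd, void *buf, size_t len)
    {
        ssize_t ret = pread(fd, buf, len, 0);
        return ret >= 0 || errno != EINVAL;
    }

    static size_t probe_request_alignment(int fd)
    {
        static const size_t alignments[] = {1, 512, 1024, 2048, 4096};
        size_t i, result = MAX_ALIGN;
        void *buf;

        if (posix_memalign(&buf, MAX_ALIGN, MAX_ALIGN) != 0) {
            return MAX_ALIGN;
        }
        for (i = 0; i < sizeof(alignments) / sizeof(alignments[0]); i++) {
            if (is_io_aligned(fd, buf, alignments[i])) {
                /* align == 1 succeeding means "cannot detect": fall back. */
                result = alignments[i] != 1 ? alignments[i] : MAX_ALIGN;
                break;
            }
        }
        free(buf);
        return result;
    }

    int main(int argc, char **argv)
    {
        int fd;

        if (argc < 2) {
            fprintf(stderr, "usage: %s <file-on-target-filesystem>\n",
                    argv[0]);
            return 1;
        }
        fd = open(argv[1], O_RDONLY | O_DIRECT);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        printf("request_alignment: %zu\n", probe_request_alignment(fd));
        close(fd);
        return 0;
    }

Run against a file on the filesystem of interest; on a 4k-sector device the
align=1 and align=512 probes fail with EINVAL and the loop reports 4096,
which is exactly the behaviour the second table documents.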