aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block.c97
-rw-r--r--block/block-gen.h49
-rw-r--r--block/coroutines.h65
-rw-r--r--block/io.c337
-rw-r--r--block/meson.build8
-rw-r--r--block/nvme.c73
-rw-r--r--docs/devel/block-coroutine-wrapper.rst54
-rw-r--r--docs/devel/index.rst1
-rw-r--r--include/block/block.h36
-rw-r--r--include/qemu/vfio-helpers.h2
-rw-r--r--qemu-options.hx10
-rw-r--r--scripts/block-coroutine-wrapper.py167
-rw-r--r--tests/test-bdrv-drain.c2
-rw-r--r--util/vfio-helpers.c133
14 files changed, 607 insertions, 427 deletions
diff --git a/block.c b/block.c
index f4b6dd5..52b2e27 100644
--- a/block.c
+++ b/block.c
@@ -48,6 +48,7 @@
#include "qemu/timer.h"
#include "qemu/cutils.h"
#include "qemu/id.h"
+#include "block/coroutines.h"
#ifdef CONFIG_BSD
#include <sys/ioctl.h>
@@ -4676,8 +4677,8 @@ static void bdrv_delete(BlockDriverState *bs)
* free of errors) or -errno when an internal error occurred. The results of the
* check are stored in res.
*/
-static int coroutine_fn bdrv_co_check(BlockDriverState *bs,
- BdrvCheckResult *res, BdrvCheckMode fix)
+int coroutine_fn bdrv_co_check(BlockDriverState *bs,
+ BdrvCheckResult *res, BdrvCheckMode fix)
{
if (bs->drv == NULL) {
return -ENOMEDIUM;
@@ -4690,43 +4691,6 @@ static int coroutine_fn bdrv_co_check(BlockDriverState *bs,
return bs->drv->bdrv_co_check(bs, res, fix);
}
-typedef struct CheckCo {
- BlockDriverState *bs;
- BdrvCheckResult *res;
- BdrvCheckMode fix;
- int ret;
-} CheckCo;
-
-static void coroutine_fn bdrv_check_co_entry(void *opaque)
-{
- CheckCo *cco = opaque;
- cco->ret = bdrv_co_check(cco->bs, cco->res, cco->fix);
- aio_wait_kick();
-}
-
-int bdrv_check(BlockDriverState *bs,
- BdrvCheckResult *res, BdrvCheckMode fix)
-{
- Coroutine *co;
- CheckCo cco = {
- .bs = bs,
- .res = res,
- .ret = -EINPROGRESS,
- .fix = fix,
- };
-
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- bdrv_check_co_entry(&cco);
- } else {
- co = qemu_coroutine_create(bdrv_check_co_entry, &cco);
- bdrv_coroutine_enter(bs, co);
- BDRV_POLL_WHILE(bs, cco.ret == -EINPROGRESS);
- }
-
- return cco.ret;
-}
-
/*
* Return values:
* 0 - success
@@ -5781,8 +5745,7 @@ void bdrv_init_with_whitelist(void)
bdrv_init();
}
-static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs,
- Error **errp)
+int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
{
BdrvChild *child, *parent;
uint64_t perm, shared_perm;
@@ -5791,14 +5754,14 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs,
BdrvDirtyBitmap *bm;
if (!bs->drv) {
- return;
+ return -ENOMEDIUM;
}
QLIST_FOREACH(child, &bs->children, next) {
bdrv_co_invalidate_cache(child->bs, &local_err);
if (local_err) {
error_propagate(errp, local_err);
- return;
+ return -EINVAL;
}
}
@@ -5821,7 +5784,7 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs,
ret = bdrv_check_perm(bs, NULL, perm, shared_perm, NULL, NULL, errp);
if (ret < 0) {
bs->open_flags |= BDRV_O_INACTIVE;
- return;
+ return ret;
}
bdrv_set_perm(bs, perm, shared_perm);
@@ -5830,7 +5793,7 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs,
if (local_err) {
bs->open_flags |= BDRV_O_INACTIVE;
error_propagate(errp, local_err);
- return;
+ return -EINVAL;
}
}
@@ -5842,7 +5805,7 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs,
if (ret < 0) {
bs->open_flags |= BDRV_O_INACTIVE;
error_setg_errno(errp, -ret, "Could not refresh total sector count");
- return;
+ return ret;
}
}
@@ -5852,59 +5815,27 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs,
if (local_err) {
bs->open_flags |= BDRV_O_INACTIVE;
error_propagate(errp, local_err);
- return;
+ return -EINVAL;
}
}
}
-}
-
-typedef struct InvalidateCacheCo {
- BlockDriverState *bs;
- Error **errp;
- bool done;
-} InvalidateCacheCo;
-
-static void coroutine_fn bdrv_invalidate_cache_co_entry(void *opaque)
-{
- InvalidateCacheCo *ico = opaque;
- bdrv_co_invalidate_cache(ico->bs, ico->errp);
- ico->done = true;
- aio_wait_kick();
-}
-
-void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
-{
- Coroutine *co;
- InvalidateCacheCo ico = {
- .bs = bs,
- .done = false,
- .errp = errp
- };
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- bdrv_invalidate_cache_co_entry(&ico);
- } else {
- co = qemu_coroutine_create(bdrv_invalidate_cache_co_entry, &ico);
- bdrv_coroutine_enter(bs, co);
- BDRV_POLL_WHILE(bs, !ico.done);
- }
+ return 0;
}
void bdrv_invalidate_cache_all(Error **errp)
{
BlockDriverState *bs;
- Error *local_err = NULL;
BdrvNextIterator it;
for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
AioContext *aio_context = bdrv_get_aio_context(bs);
+ int ret;
aio_context_acquire(aio_context);
- bdrv_invalidate_cache(bs, &local_err);
+ ret = bdrv_invalidate_cache(bs, errp);
aio_context_release(aio_context);
- if (local_err) {
- error_propagate(errp, local_err);
+ if (ret < 0) {
bdrv_next_cleanup(&it);
return;
}
diff --git a/block/block-gen.h b/block/block-gen.h
new file mode 100644
index 0000000..f80cf48
--- /dev/null
+++ b/block/block-gen.h
@@ -0,0 +1,49 @@
+/*
+ * Block coroutine wrapping core, used by auto-generated block/block-gen.c
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2020 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_BLOCK_GEN_H
+#define BLOCK_BLOCK_GEN_H
+
+#include "block/block_int.h"
+
+/* Base structure for argument packing structures */
+typedef struct BdrvPollCo {
+ BlockDriverState *bs;
+ bool in_progress;
+ int ret;
+ Coroutine *co; /* Keep pointer here for debugging */
+} BdrvPollCo;
+
+static inline int bdrv_poll_co(BdrvPollCo *s)
+{
+ assert(!qemu_in_coroutine());
+
+ bdrv_coroutine_enter(s->bs, s->co);
+ BDRV_POLL_WHILE(s->bs, s->in_progress);
+
+ return s->ret;
+}
+
+#endif /* BLOCK_BLOCK_GEN_H */
diff --git a/block/coroutines.h b/block/coroutines.h
new file mode 100644
index 0000000..f69179f
--- /dev/null
+++ b/block/coroutines.h
@@ -0,0 +1,65 @@
+/*
+ * Block layer I/O functions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_COROUTINES_INT_H
+#define BLOCK_COROUTINES_INT_H
+
+#include "block/block_int.h"
+
+int coroutine_fn bdrv_co_check(BlockDriverState *bs,
+ BdrvCheckResult *res, BdrvCheckMode fix);
+int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp);
+
+int generated_co_wrapper
+bdrv_preadv(BdrvChild *child, int64_t offset, unsigned int bytes,
+ QEMUIOVector *qiov, BdrvRequestFlags flags);
+int generated_co_wrapper
+bdrv_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes,
+ QEMUIOVector *qiov, BdrvRequestFlags flags);
+
+int coroutine_fn
+bdrv_co_common_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ bool want_zero,
+ int64_t offset,
+ int64_t bytes,
+ int64_t *pnum,
+ int64_t *map,
+ BlockDriverState **file);
+int generated_co_wrapper
+bdrv_common_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ bool want_zero,
+ int64_t offset,
+ int64_t bytes,
+ int64_t *pnum,
+ int64_t *map,
+ BlockDriverState **file);
+
+int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
+ QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
+ QEMUIOVector *qiov, int64_t pos);
+
+#endif /* BLOCK_COROUTINES_INT_H */
diff --git a/block/io.c b/block/io.c
index 11df188..54f0968 100644
--- a/block/io.c
+++ b/block/io.c
@@ -29,6 +29,7 @@
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
+#include "block/coroutines.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
@@ -889,89 +890,10 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
return 0;
}
-typedef int coroutine_fn BdrvRequestEntry(void *opaque);
-typedef struct BdrvRunCo {
- BdrvRequestEntry *entry;
- void *opaque;
- int ret;
- bool done;
- Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */
-} BdrvRunCo;
-
-static void coroutine_fn bdrv_run_co_entry(void *opaque)
-{
- BdrvRunCo *arg = opaque;
-
- arg->ret = arg->entry(arg->opaque);
- arg->done = true;
- aio_wait_kick();
-}
-
-static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry,
- void *opaque)
-{
- if (qemu_in_coroutine()) {
- /* Fast-path if already in coroutine context */
- return entry(opaque);
- } else {
- BdrvRunCo s = { .entry = entry, .opaque = opaque };
-
- s.co = qemu_coroutine_create(bdrv_run_co_entry, &s);
- bdrv_coroutine_enter(bs, s.co);
-
- BDRV_POLL_WHILE(bs, !s.done);
-
- return s.ret;
- }
-}
-
-typedef struct RwCo {
- BdrvChild *child;
- int64_t offset;
- QEMUIOVector *qiov;
- bool is_write;
- BdrvRequestFlags flags;
-} RwCo;
-
-static int coroutine_fn bdrv_rw_co_entry(void *opaque)
-{
- RwCo *rwco = opaque;
-
- if (!rwco->is_write) {
- return bdrv_co_preadv(rwco->child, rwco->offset,
- rwco->qiov->size, rwco->qiov,
- rwco->flags);
- } else {
- return bdrv_co_pwritev(rwco->child, rwco->offset,
- rwco->qiov->size, rwco->qiov,
- rwco->flags);
- }
-}
-
-/*
- * Process a vectored synchronous request using coroutines
- */
-static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
- QEMUIOVector *qiov, bool is_write,
- BdrvRequestFlags flags)
-{
- RwCo rwco = {
- .child = child,
- .offset = offset,
- .qiov = qiov,
- .is_write = is_write,
- .flags = flags,
- };
-
- return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco);
-}
-
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
int bytes, BdrvRequestFlags flags)
{
- QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
-
- return bdrv_prwv_co(child, offset, &qiov, true,
+ return bdrv_pwritev(child, offset, bytes, NULL,
BDRV_REQ_ZERO_WRITE | flags);
}
@@ -1016,41 +938,19 @@ int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
}
}
-/* return < 0 if error. See bdrv_pwrite() for the return codes */
-int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
-{
- int ret;
-
- ret = bdrv_prwv_co(child, offset, qiov, false, 0);
- if (ret < 0) {
- return ret;
- }
-
- return qiov->size;
-}
-
/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
+ int ret;
QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
if (bytes < 0) {
return -EINVAL;
}
- return bdrv_preadv(child, offset, &qiov);
-}
-
-int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
-{
- int ret;
+ ret = bdrv_preadv(child, offset, bytes, &qiov, 0);
- ret = bdrv_prwv_co(child, offset, qiov, true, 0);
- if (ret < 0) {
- return ret;
- }
-
- return qiov->size;
+ return ret < 0 ? ret : bytes;
}
/* Return no. of bytes on success or < 0 on error. Important errors are:
@@ -1061,13 +961,16 @@ int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
+ int ret;
QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
if (bytes < 0) {
return -EINVAL;
}
- return bdrv_pwritev(child, offset, &qiov);
+ ret = bdrv_pwritev(child, offset, bytes, &qiov, 0);
+
+ return ret < 0 ? ret : bytes;
}
/*
@@ -2243,18 +2146,6 @@ int bdrv_flush_all(void)
return result;
}
-
-typedef struct BdrvCoBlockStatusData {
- BlockDriverState *bs;
- BlockDriverState *base;
- bool want_zero;
- int64_t offset;
- int64_t bytes;
- int64_t *pnum;
- int64_t *map;
- BlockDriverState **file;
-} BdrvCoBlockStatusData;
-
/*
* Returns the allocation status of the specified sectors.
* Drivers not implementing the functionality are assumed to not support
@@ -2449,14 +2340,15 @@ early_out:
return ret;
}
-static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
- BlockDriverState *base,
- bool want_zero,
- int64_t offset,
- int64_t bytes,
- int64_t *pnum,
- int64_t *map,
- BlockDriverState **file)
+int coroutine_fn
+bdrv_co_common_block_status_above(BlockDriverState *bs,
+ BlockDriverState *base,
+ bool want_zero,
+ int64_t offset,
+ int64_t bytes,
+ int64_t *pnum,
+ int64_t *map,
+ BlockDriverState **file)
{
BlockDriverState *p;
int ret = 0;
@@ -2489,43 +2381,6 @@ static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
return ret;
}
-/* Coroutine wrapper for bdrv_block_status_above() */
-static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
-{
- BdrvCoBlockStatusData *data = opaque;
-
- return bdrv_co_block_status_above(data->bs, data->base,
- data->want_zero,
- data->offset, data->bytes,
- data->pnum, data->map, data->file);
-}
-
-/*
- * Synchronous wrapper around bdrv_co_block_status_above().
- *
- * See bdrv_co_block_status_above() for details.
- */
-static int bdrv_common_block_status_above(BlockDriverState *bs,
- BlockDriverState *base,
- bool want_zero, int64_t offset,
- int64_t bytes, int64_t *pnum,
- int64_t *map,
- BlockDriverState **file)
-{
- BdrvCoBlockStatusData data = {
- .bs = bs,
- .base = base,
- .want_zero = want_zero,
- .offset = offset,
- .bytes = bytes,
- .pnum = pnum,
- .map = map,
- .file = file,
- };
-
- return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data);
-}
-
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
int64_t offset, int64_t bytes, int64_t *pnum,
int64_t *map, BlockDriverState **file)
@@ -2619,96 +2474,70 @@ int bdrv_is_allocated_above(BlockDriverState *top,
return 0;
}
-typedef struct BdrvVmstateCo {
- BlockDriverState *bs;
- QEMUIOVector *qiov;
- int64_t pos;
- bool is_read;
-} BdrvVmstateCo;
-
-static int coroutine_fn
-bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
- bool is_read)
+int coroutine_fn
+bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
BlockDriver *drv = bs->drv;
BlockDriverState *child_bs = bdrv_primary_bs(bs);
int ret = -ENOTSUP;
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
+
bdrv_inc_in_flight(bs);
- if (!drv) {
- ret = -ENOMEDIUM;
- } else if (drv->bdrv_load_vmstate) {
- if (is_read) {
- ret = drv->bdrv_load_vmstate(bs, qiov, pos);
- } else {
- ret = drv->bdrv_save_vmstate(bs, qiov, pos);
- }
+ if (drv->bdrv_load_vmstate) {
+ ret = drv->bdrv_load_vmstate(bs, qiov, pos);
} else if (child_bs) {
- ret = bdrv_co_rw_vmstate(child_bs, qiov, pos, is_read);
+ ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
}
bdrv_dec_in_flight(bs);
+
return ret;
}
-static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
+int coroutine_fn
+bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
- BdrvVmstateCo *co = opaque;
+ BlockDriver *drv = bs->drv;
+ BlockDriverState *child_bs = bdrv_primary_bs(bs);
+ int ret = -ENOTSUP;
- return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
-}
+ if (!drv) {
+ return -ENOMEDIUM;
+ }
-static inline int
-bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
- bool is_read)
-{
- BdrvVmstateCo data = {
- .bs = bs,
- .qiov = qiov,
- .pos = pos,
- .is_read = is_read,
- };
+ bdrv_inc_in_flight(bs);
- return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data);
+ if (drv->bdrv_save_vmstate) {
+ ret = drv->bdrv_save_vmstate(bs, qiov, pos);
+ } else if (child_bs) {
+ ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
+ }
+
+ bdrv_dec_in_flight(bs);
+
+ return ret;
}
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
int64_t pos, int size)
{
QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
- int ret;
+ int ret = bdrv_writev_vmstate(bs, &qiov, pos);
- ret = bdrv_writev_vmstate(bs, &qiov, pos);
- if (ret < 0) {
- return ret;
- }
-
- return size;
-}
-
-int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
-{
- return bdrv_rw_vmstate(bs, qiov, pos, false);
+ return ret < 0 ? ret : size;
}
int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
int64_t pos, int size)
{
QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
- int ret;
+ int ret = bdrv_readv_vmstate(bs, &qiov, pos);
- ret = bdrv_readv_vmstate(bs, &qiov, pos);
- if (ret < 0) {
- return ret;
- }
-
- return size;
-}
-
-int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
-{
- return bdrv_rw_vmstate(bs, qiov, pos, true);
+ return ret < 0 ? ret : size;
}
/**************************************************************/
@@ -2748,11 +2577,6 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb)
/**************************************************************/
/* Coroutine block device emulation */
-static int coroutine_fn bdrv_flush_co_entry(void *opaque)
-{
- return bdrv_co_flush(opaque);
-}
-
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
BdrvChild *primary_child = bdrv_primary_child(bs);
@@ -2876,24 +2700,6 @@ early_exit:
return ret;
}
-int bdrv_flush(BlockDriverState *bs)
-{
- return bdrv_run_co(bs, bdrv_flush_co_entry, bs);
-}
-
-typedef struct DiscardCo {
- BdrvChild *child;
- int64_t offset;
- int64_t bytes;
-} DiscardCo;
-
-static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
-{
- DiscardCo *rwco = opaque;
-
- return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
-}
-
int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
int64_t bytes)
{
@@ -3008,17 +2814,6 @@ out:
return ret;
}
-int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
-{
- DiscardCo rwco = {
- .child = child,
- .offset = offset,
- .bytes = bytes,
- };
-
- return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco);
-}
-
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
BlockDriver *drv = bs->drv;
@@ -3420,35 +3215,3 @@ out:
return ret;
}
-
-typedef struct TruncateCo {
- BdrvChild *child;
- int64_t offset;
- bool exact;
- PreallocMode prealloc;
- BdrvRequestFlags flags;
- Error **errp;
-} TruncateCo;
-
-static int coroutine_fn bdrv_truncate_co_entry(void *opaque)
-{
- TruncateCo *tco = opaque;
-
- return bdrv_co_truncate(tco->child, tco->offset, tco->exact,
- tco->prealloc, tco->flags, tco->errp);
-}
-
-int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
- PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
-{
- TruncateCo tco = {
- .child = child,
- .offset = offset,
- .exact = exact,
- .prealloc = prealloc,
- .flags = flags,
- .errp = errp,
- };
-
- return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco);
-}
diff --git a/block/meson.build b/block/meson.build
index 0b38dc3..78e8b25 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -107,6 +107,14 @@ module_block_h = custom_target('module_block.h',
command: [module_block_py, '@OUTPUT0@', modsrc])
block_ss.add(module_block_h)
+wrapper_py = find_program('../scripts/block-coroutine-wrapper.py')
+block_gen_c = custom_target('block-gen.c',
+ output: 'block-gen.c',
+ input: files('../include/block/block.h',
+ 'coroutines.h'),
+ command: [wrapper_py, '@OUTPUT@', '@INPUT@'])
+block_ss.add(block_gen_c)
+
block_ss.add(files('stream.c'))
softmmu_ss.add(files('qapi-sysemu.c'))
diff --git a/block/nvme.c b/block/nvme.c
index f4f27b6..b48f6f2 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -31,7 +31,7 @@
#define NVME_SQ_ENTRY_BYTES 64
#define NVME_CQ_ENTRY_BYTES 16
#define NVME_QUEUE_SIZE 128
-#define NVME_BAR_SIZE 8192
+#define NVME_DOORBELL_SIZE 4096
/*
* We have to leave one slot empty as that is the full queue case where
@@ -81,15 +81,6 @@ typedef struct {
QEMUBH *completion_bh;
} NVMeQueuePair;
-/* Memory mapped registers */
-typedef volatile struct {
- NvmeBar ctrl;
- struct {
- uint32_t sq_tail;
- uint32_t cq_head;
- } doorbells[];
-} NVMeRegs;
-
#define INDEX_ADMIN 0
#define INDEX_IO(n) (1 + n)
@@ -102,7 +93,11 @@ enum {
struct BDRVNVMeState {
AioContext *aio_context;
QEMUVFIOState *vfio;
- NVMeRegs *regs;
+ /* Memory mapped registers */
+ volatile struct {
+ uint32_t sq_tail;
+ uint32_t cq_head;
+ } *doorbells;
/* The submission/completion queue pairs.
* [0]: admin queue.
* [1..]: io queues.
@@ -247,14 +242,14 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
error_propagate(errp, local_err);
goto fail;
}
- q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail;
+ q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
if (local_err) {
error_propagate(errp, local_err);
goto fail;
}
- q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head;
+ q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
return q;
fail:
@@ -694,6 +689,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
uint64_t timeout_ms;
uint64_t deadline, now;
Error *local_err = NULL;
+ volatile NvmeBar *regs = NULL;
qemu_co_mutex_init(&s->dma_map_lock);
qemu_co_queue_init(&s->dma_flush_queue);
@@ -712,32 +708,32 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
goto out;
}
- s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
- if (!s->regs) {
+ regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
+ PROT_READ | PROT_WRITE, errp);
+ if (!regs) {
ret = -EINVAL;
goto out;
}
-
/* Perform initialize sequence as described in NVMe spec "7.6.1
* Initialization". */
- cap = le64_to_cpu(s->regs->ctrl.cap);
- if (!(cap & (1ULL << 37))) {
+ cap = le64_to_cpu(regs->cap);
+ if (!NVME_CAP_CSS(cap)) {
error_setg(errp, "Device doesn't support NVMe command set");
ret = -EINVAL;
goto out;
}
- s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
- s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
+ s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
+ s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
bs->bl.opt_mem_alignment = s->page_size;
- timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
+ timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
/* Reset device to get a clean state. */
- s->regs->ctrl.cc = cpu_to_le32(le32_to_cpu(s->regs->ctrl.cc) & 0xFE);
+ regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE);
/* Wait for CSTS.RDY = 0. */
deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
- while (le32_to_cpu(s->regs->ctrl.csts) & 0x1) {
+ while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
error_setg(errp, "Timeout while waiting for device to reset (%"
PRId64 " ms)",
@@ -747,6 +743,13 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
}
}
+ s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
+ NVME_DOORBELL_SIZE, PROT_WRITE, errp);
+ if (!s->doorbells) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/* Set up admin queue. */
s->queues = g_new(NVMeQueuePair *, 1);
s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
@@ -758,18 +761,19 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
}
s->nr_queues = 1;
QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
- s->regs->ctrl.aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
- s->regs->ctrl.asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
- s->regs->ctrl.acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
+ regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
+ (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
+ regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
+ regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
/* After setting up all control registers we can enable device now. */
- s->regs->ctrl.cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
- (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
- 0x1);
+ regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
+ (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
+ CC_EN_MASK);
/* Wait for CSTS.RDY = 1. */
now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
- deadline = now + timeout_ms * 1000000;
- while (!(le32_to_cpu(s->regs->ctrl.csts) & 0x1)) {
+ deadline = now + timeout_ms * SCALE_MS;
+ while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
error_setg(errp, "Timeout while waiting for device to start (%"
PRId64 " ms)",
@@ -800,6 +804,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
ret = -EIO;
}
out:
+ if (regs) {
+ qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
+ }
+
/* Cleaning up is done in nvme_file_open() upon error. */
return ret;
}
@@ -872,7 +880,8 @@ static void nvme_close(BlockDriverState *bs)
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, NULL, NULL);
event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
- qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
+ qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
+ sizeof(NvmeBar), NVME_DOORBELL_SIZE);
qemu_vfio_close(s->vfio);
g_free(s->device);
diff --git a/docs/devel/block-coroutine-wrapper.rst b/docs/devel/block-coroutine-wrapper.rst
new file mode 100644
index 0000000..4128519
--- /dev/null
+++ b/docs/devel/block-coroutine-wrapper.rst
@@ -0,0 +1,54 @@
+=======================
+block-coroutine-wrapper
+=======================
+
+A lot of functions in QEMU block layer (see ``block/*``) can only be
+called in coroutine context. Such functions are normally marked by the
+coroutine_fn specifier. Still, sometimes we need to call them from
+non-coroutine context; for this we need to start a coroutine, run the
+needed function from it and wait for the coroutine to finish in a
+BDRV_POLL_WHILE() loop. To run a coroutine we need a function with one
+void* argument. So for each coroutine_fn function which needs a
+non-coroutine interface, we should define a structure to pack the
+parameters, define a separate function to unpack the parameters and
+call the original function and finally define a new interface function
+with same list of arguments as original one, which will pack the
+parameters into a struct, create a coroutine, run it and wait in
+BDRV_POLL_WHILE() loop. It's boring to create such wrappers by hand,
+so we have a script to generate them.
+
+Usage
+=====
+
+Assume we have defined the ``coroutine_fn`` function
+``bdrv_co_foo(<some args>)`` and need a non-coroutine interface for it,
+called ``bdrv_foo(<same args>)``. In this case the script can help. To
+trigger the generation:
+
+1. You need ``bdrv_foo`` declaration somewhere (for example, in
+ ``block/coroutines.h``) with the ``generated_co_wrapper`` mark,
+ like this:
+
+.. code-block:: c
+
+ int generated_co_wrapper bdrv_foo(<some args>);
+
+2. You need to feed this declaration to block-coroutine-wrapper script.
+ For this, add the .h (or .c) file with the declaration to the
+ ``input: files(...)`` list of ``block_gen_c`` target declaration in
+ ``block/meson.build``
+
+You are done. During the build, coroutine wrappers will be generated in
+``<BUILD_DIR>/block/block-gen.c``.
+
+Links
+=====
+
+1. The script location is ``scripts/block-coroutine-wrapper.py``.
+
+2. Generic place for private ``generated_co_wrapper`` declarations is
+ ``block/coroutines.h``, for public declarations:
+ ``include/block/block.h``
+
+3. The core API of generated coroutine wrappers is placed in
+ (not generated) ``block/block-gen.h``
diff --git a/docs/devel/index.rst b/docs/devel/index.rst
index c34b43e..5fda2d3 100644
--- a/docs/devel/index.rst
+++ b/docs/devel/index.rst
@@ -32,3 +32,4 @@ Contents:
s390-dasd-ipl
clocks
qom
+ block-coroutine-wrapper
diff --git a/include/block/block.h b/include/block/block.h
index 981ab5b..ce2ac39 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -10,6 +10,16 @@
#include "block/blockjob.h"
#include "qemu/hbitmap.h"
+/*
+ * generated_co_wrapper
+ *
+ * Function specifier, which does nothing but mark functions to be
+ * generated by scripts/block-coroutine-wrapper.py
+ *
+ * Read more in docs/devel/block-coroutine-wrapper.rst
+ */
+#define generated_co_wrapper
+
/* block.c */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
@@ -294,7 +304,7 @@ enum BdrvChildRoleBits {
BDRV_CHILD_FILTERED = (1 << 2),
/*
- * Child from which to read all data that isn’t allocated in the
+ * Child from which to read all data that isn't allocated in the
* parent (i.e., the backing child); such data is copied to the
* parent through COW (and optionally COR).
* This field is mutually exclusive with DATA, METADATA, and
@@ -373,9 +383,7 @@ int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
int bytes, BdrvRequestFlags flags);
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags);
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes);
-int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov);
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes);
-int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov);
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
const void *buf, int count);
/*
@@ -393,8 +401,9 @@ void bdrv_refresh_filename(BlockDriverState *bs);
int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
PreallocMode prealloc, BdrvRequestFlags flags,
Error **errp);
-int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
- PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
+int generated_co_wrapper
+bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
+ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
int64_t bdrv_nb_sectors(BlockDriverState *bs);
int64_t bdrv_getlength(BlockDriverState *bs);
@@ -436,7 +445,8 @@ typedef enum {
BDRV_FIX_ERRORS = 2,
} BdrvCheckMode;
-int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix);
+int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix);
/* The units of offset and total_work_size may be chosen arbitrarily by the
* block driver; total_work_size may change during the course of the amendment
@@ -460,12 +470,13 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb);
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
/* Invalidate any cached metadata used by image formats */
-void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp);
+int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs,
+ Error **errp);
void bdrv_invalidate_cache_all(Error **errp);
int bdrv_inactivate_all(void);
/* Ensure contents are flushed to disk. */
-int bdrv_flush(BlockDriverState *bs);
+int generated_co_wrapper bdrv_flush(BlockDriverState *bs);
int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
int bdrv_flush_all(void);
void bdrv_close_all(void);
@@ -480,7 +491,8 @@ void bdrv_drain_all(void);
AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \
cond); })
-int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
+int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset,
+ int64_t bytes);
int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
int bdrv_has_zero_init_1(BlockDriverState *bs);
int bdrv_has_zero_init(BlockDriverState *bs);
@@ -560,8 +572,10 @@ int path_has_protocol(const char *path);
int path_is_absolute(const char *path);
char *path_combine(const char *base_path, const char *filename);
-int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
-int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+int generated_co_wrapper
+bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+int generated_co_wrapper
+bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
int64_t pos, int size);
diff --git a/include/qemu/vfio-helpers.h b/include/qemu/vfio-helpers.h
index 1f057c2..4491c8e 100644
--- a/include/qemu/vfio-helpers.h
+++ b/include/qemu/vfio-helpers.h
@@ -22,7 +22,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s);
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host);
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
- uint64_t offset, uint64_t size,
+ uint64_t offset, uint64_t size, int prot,
Error **errp);
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
uint64_t offset, uint64_t size);
diff --git a/qemu-options.hx b/qemu-options.hx
index 3564c23..1da52a2 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1053,7 +1053,8 @@ SRST
The path to the image file in the local filesystem
``aio``
- Specifies the AIO backend (threads/native, default: threads)
+ Specifies the AIO backend (threads/native/io_uring,
+ default: threads)
``locking``
Specifies whether the image file is protected with Linux OFD
@@ -1175,7 +1176,8 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
"-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
" [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
" [,snapshot=on|off][,rerror=ignore|stop|report]\n"
- " [,werror=ignore|stop|report|enospc][,id=name][,aio=threads|native]\n"
+ " [,werror=ignore|stop|report|enospc][,id=name]\n"
+ " [,aio=threads|native|io_uring]\n"
" [,readonly=on|off][,copy-on-read=on|off]\n"
" [,discard=ignore|unmap][,detect-zeroes=on|off|unmap]\n"
" [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n"
@@ -1247,8 +1249,8 @@ SRST
The default mode is ``cache=writeback``.
``aio=aio``
- aio is "threads", or "native" and selects between pthread based
- disk I/O and native Linux AIO.
+ aio is "threads", "native", or "io_uring" and selects between pthread
+ based disk I/O, native Linux AIO, or Linux io_uring API.
``format=format``
Specify which disk format will be used rather than detecting the
diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py
new file mode 100644
index 0000000..0461fd1
--- /dev/null
+++ b/scripts/block-coroutine-wrapper.py
@@ -0,0 +1,167 @@
+#! /usr/bin/env python3
+"""Generate coroutine wrappers for block subsystem.
+
+The program parses one or several concatenated c files from stdin,
+searches for functions with the 'generated_co_wrapper' specifier
+and generates corresponding wrappers on stdout.
+
+Usage: block-coroutine-wrapper.py generated-file.c FILE.[ch]...
+
+Copyright (c) 2020 Virtuozzo International GmbH.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import re
+from typing import Iterator
+
+
+def gen_header():
+ copyright = re.sub('^.*Copyright', 'Copyright', __doc__, flags=re.DOTALL)
+ copyright = re.sub('^(?=.)', ' * ', copyright.strip(), flags=re.MULTILINE)
+ copyright = re.sub('^$', ' *', copyright, flags=re.MULTILINE)
+ return f"""\
+/*
+ * File is generated by scripts/block-coroutine-wrapper.py
+ *
+{copyright}
+ */
+
+#include "qemu/osdep.h"
+#include "block/coroutines.h"
+#include "block/block-gen.h"
+#include "block/block_int.h"\
+"""
+
+
+class ParamDecl:
+ param_re = re.compile(r'(?P<decl>'
+ r'(?P<type>.*[ *])'
+ r'(?P<name>[a-z][a-z0-9_]*)'
+ r')')
+
+ def __init__(self, param_decl: str) -> None:
+ m = self.param_re.match(param_decl.strip())
+ if m is None:
+ raise ValueError(f'Wrong parameter declaration: "{param_decl}"')
+ self.decl = m.group('decl')
+ self.type = m.group('type')
+ self.name = m.group('name')
+
+
+class FuncDecl:
+ def __init__(self, return_type: str, name: str, args: str) -> None:
+ self.return_type = return_type.strip()
+ self.name = name.strip()
+ self.args = [ParamDecl(arg.strip()) for arg in args.split(',')]
+
+ def gen_list(self, format: str) -> str:
+ return ', '.join(format.format_map(arg.__dict__) for arg in self.args)
+
+ def gen_block(self, format: str) -> str:
+ return '\n'.join(format.format_map(arg.__dict__) for arg in self.args)
+
+
+# Match wrappers declared with a generated_co_wrapper mark
+func_decl_re = re.compile(r'^int\s*generated_co_wrapper\s*'
+ r'(?P<wrapper_name>[a-z][a-z0-9_]*)'
+ r'\((?P<args>[^)]*)\);$', re.MULTILINE)
+
+
+def func_decl_iter(text: str) -> Iterator:
+ for m in func_decl_re.finditer(text):
+ yield FuncDecl(return_type='int',
+ name=m.group('wrapper_name'),
+ args=m.group('args'))
+
+
+def snake_to_camel(func_name: str) -> str:
+ """
+ Convert underscore names like 'some_function_name' to camel-case like
+ 'SomeFunctionName'
+ """
+ words = func_name.split('_')
+ words = [w[0].upper() + w[1:] for w in words]
+ return ''.join(words)
+
+
+def gen_wrapper(func: FuncDecl) -> str:
+ assert func.name.startswith('bdrv_')
+ assert not func.name.startswith('bdrv_co_')
+ assert func.return_type == 'int'
+ assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *']
+
+ name = 'bdrv_co_' + func.name[5:]
+ bs = 'bs' if func.args[0].type == 'BlockDriverState *' else 'child->bs'
+ struct_name = snake_to_camel(name)
+
+ return f"""\
+/*
+ * Wrappers for {name}
+ */
+
+typedef struct {struct_name} {{
+ BdrvPollCo poll_state;
+{ func.gen_block(' {decl};') }
+}} {struct_name};
+
+static void coroutine_fn {name}_entry(void *opaque)
+{{
+ {struct_name} *s = opaque;
+
+ s->poll_state.ret = {name}({ func.gen_list('s->{name}') });
+ s->poll_state.in_progress = false;
+
+ aio_wait_kick();
+}}
+
+int {func.name}({ func.gen_list('{decl}') })
+{{
+ if (qemu_in_coroutine()) {{
+ return {name}({ func.gen_list('{name}') });
+ }} else {{
+ {struct_name} s = {{
+ .poll_state.bs = {bs},
+ .poll_state.in_progress = true,
+
+{ func.gen_block(' .{name} = {name},') }
+ }};
+
+ s.poll_state.co = qemu_coroutine_create({name}_entry, &s);
+
+ return bdrv_poll_co(&s.poll_state);
+ }}
+}}"""
+
+
+def gen_wrappers(input_code: str) -> str:
+ res = ''
+ for func in func_decl_iter(input_code):
+ res += '\n\n\n'
+ res += gen_wrapper(func)
+
+ return res
+
+
+if __name__ == '__main__':
+ if len(sys.argv) < 3:
+ exit(f'Usage: {sys.argv[0]} OUT_FILE.c IN_FILE.[ch]...')
+
+ with open(sys.argv[1], 'w', encoding='utf-8') as f_out:
+ f_out.write(gen_header())
+ for fname in sys.argv[2:]:
+ with open(fname, encoding='utf-8') as f_in:
+ f_out.write(gen_wrappers(f_in.read()))
+ f_out.write('\n')
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 1107271..1595bbc 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -1872,7 +1872,7 @@ static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs,
}
s->io_co = NULL;
- ret = bdrv_preadv(bs->backing, offset, qiov);
+ ret = bdrv_co_preadv(bs->backing, offset, bytes, qiov, 0);
s->has_read = true;
/* Wake up drain_co if it runs */
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 583bdfb..c469beb 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -40,6 +40,11 @@ typedef struct {
uint64_t iova;
} IOVAMapping;
+struct IOVARange {
+ uint64_t start;
+ uint64_t end;
+};
+
struct QEMUVFIOState {
QemuMutex lock;
@@ -49,6 +54,8 @@ struct QEMUVFIOState {
int device;
RAMBlockNotifier ram_notifier;
struct vfio_region_info config_region_info, bar_region_info[6];
+ struct IOVARange *usable_iova_ranges;
+ uint8_t nb_iova_ranges;
/* These fields are protected by @lock */
/* VFIO's IO virtual address space is managed by splitting into a few
@@ -146,13 +153,13 @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
* Map a PCI bar area.
*/
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
- uint64_t offset, uint64_t size,
+ uint64_t offset, uint64_t size, int prot,
Error **errp)
{
void *p;
assert_bar_index_valid(s, index);
p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
- PROT_READ | PROT_WRITE, MAP_SHARED,
+ prot, MAP_SHARED,
s->device, s->bar_region_info[index].offset + offset);
if (p == MAP_FAILED) {
error_setg_errno(errp, errno, "Failed to map BAR region");
@@ -236,6 +243,35 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
return ret == size ? 0 : -errno;
}
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
+{
+ struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
+ struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+ int i;
+
+ while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+ if (!cap->next) {
+ return;
+ }
+ cap = (struct vfio_info_cap_header *)(buf + cap->next);
+ }
+
+ cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+ s->nb_iova_ranges = cap_iova_range->nr_iovas;
+ if (s->nb_iova_ranges > 1) {
+ s->usable_iova_ranges =
+ g_realloc(s->usable_iova_ranges,
+ s->nb_iova_ranges * sizeof(struct IOVARange));
+ }
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+ s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+ }
+}
+
static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
Error **errp)
{
@@ -243,10 +279,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
int i;
uint16_t pci_cmd;
struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
- struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+ struct vfio_iommu_type1_info *iommu_info = NULL;
+ size_t iommu_info_size = sizeof(*iommu_info);
struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
char *group_file = NULL;
+ s->usable_iova_ranges = NULL;
+
/* Create a new container */
s->container = open("/dev/vfio/vfio", O_RDWR);
@@ -310,13 +349,35 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
goto fail;
}
+ iommu_info = g_malloc0(iommu_info_size);
+ iommu_info->argsz = iommu_info_size;
+
/* Get additional IOMMU info */
- if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
error_setg_errno(errp, errno, "Failed to get IOMMU info");
ret = -errno;
goto fail;
}
+ /*
+ * if the kernel does not report usable IOVA regions, choose
+ * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+ */
+ s->nb_iova_ranges = 1;
+ s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+ s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+ s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+ if (iommu_info->argsz > iommu_info_size) {
+ iommu_info_size = iommu_info->argsz;
+ iommu_info = g_realloc(iommu_info, iommu_info_size);
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ ret = -errno;
+ goto fail;
+ }
+ collect_usable_iova_ranges(s, iommu_info);
+ }
+
s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
if (s->device < 0) {
@@ -365,8 +426,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
if (ret) {
goto fail;
}
+ g_free(iommu_info);
return 0;
fail:
+ g_free(s->usable_iova_ranges);
+ s->usable_iova_ranges = NULL;
+ s->nb_iova_ranges = 0;
+ g_free(iommu_info);
close(s->group);
fail_container:
close(s->container);
@@ -601,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
return true;
}
+static int
+qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+ int i;
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+ continue;
+ }
+ s->low_water_mark =
+ MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+ if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
+ s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
+ *iova = s->low_water_mark;
+ s->low_water_mark += size;
+ return 0;
+ }
+ }
+ return -ENOMEM;
+}
+
+static int
+qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
+{
+ int i;
+
+ for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+ if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+ continue;
+ }
+ s->high_water_mark =
+ MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+ if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
+ s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
+ *iova = s->high_water_mark - size;
+ s->high_water_mark = *iova;
+ return 0;
+ }
+ }
+ return -ENOMEM;
+}
+
/* Map [host, host + size) area into a contiguous IOVA address space, and store
* the result in @iova if not NULL. The caller need to make sure the area is
* aligned to page size, and mustn't overlap with existing mapping areas (split
@@ -627,7 +737,11 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
goto out;
}
if (!temporary) {
- iova0 = s->low_water_mark;
+ if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
if (!mapping) {
ret = -ENOMEM;
@@ -639,15 +753,16 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
qemu_vfio_undo_mapping(s, mapping, NULL);
goto out;
}
- s->low_water_mark += size;
qemu_vfio_dump_mappings(s);
} else {
- iova0 = s->high_water_mark - size;
+ if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = qemu_vfio_do_mapping(s, host, size, iova0);
if (ret) {
goto out;
}
- s->high_water_mark -= size;
}
}
if (iova) {
@@ -716,6 +831,8 @@ void qemu_vfio_close(QEMUVFIOState *s)
qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
}
ram_block_notifier_remove(&s->ram_notifier);
+ g_free(s->usable_iova_ranges);
+ s->nb_iova_ranges = 0;
qemu_vfio_reset(s);
close(s->device);
close(s->group);