diff options
-rw-r--r-- | block.c | 97 | ||||
-rw-r--r-- | block/block-gen.h | 49 | ||||
-rw-r--r-- | block/coroutines.h | 65 | ||||
-rw-r--r-- | block/io.c | 337 | ||||
-rw-r--r-- | block/meson.build | 8 | ||||
-rw-r--r-- | block/nvme.c | 73 | ||||
-rw-r--r-- | docs/devel/block-coroutine-wrapper.rst | 54 | ||||
-rw-r--r-- | docs/devel/index.rst | 1 | ||||
-rw-r--r-- | include/block/block.h | 36 | ||||
-rw-r--r-- | include/qemu/vfio-helpers.h | 2 | ||||
-rw-r--r-- | qemu-options.hx | 10 | ||||
-rw-r--r-- | scripts/block-coroutine-wrapper.py | 167 | ||||
-rw-r--r-- | tests/test-bdrv-drain.c | 2 | ||||
-rw-r--r-- | util/vfio-helpers.c | 133 |
14 files changed, 607 insertions, 427 deletions
@@ -48,6 +48,7 @@ #include "qemu/timer.h" #include "qemu/cutils.h" #include "qemu/id.h" +#include "block/coroutines.h" #ifdef CONFIG_BSD #include <sys/ioctl.h> @@ -4676,8 +4677,8 @@ static void bdrv_delete(BlockDriverState *bs) * free of errors) or -errno when an internal error occurred. The results of the * check are stored in res. */ -static int coroutine_fn bdrv_co_check(BlockDriverState *bs, - BdrvCheckResult *res, BdrvCheckMode fix) +int coroutine_fn bdrv_co_check(BlockDriverState *bs, + BdrvCheckResult *res, BdrvCheckMode fix) { if (bs->drv == NULL) { return -ENOMEDIUM; @@ -4690,43 +4691,6 @@ static int coroutine_fn bdrv_co_check(BlockDriverState *bs, return bs->drv->bdrv_co_check(bs, res, fix); } -typedef struct CheckCo { - BlockDriverState *bs; - BdrvCheckResult *res; - BdrvCheckMode fix; - int ret; -} CheckCo; - -static void coroutine_fn bdrv_check_co_entry(void *opaque) -{ - CheckCo *cco = opaque; - cco->ret = bdrv_co_check(cco->bs, cco->res, cco->fix); - aio_wait_kick(); -} - -int bdrv_check(BlockDriverState *bs, - BdrvCheckResult *res, BdrvCheckMode fix) -{ - Coroutine *co; - CheckCo cco = { - .bs = bs, - .res = res, - .ret = -EINPROGRESS, - .fix = fix, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_check_co_entry(&cco); - } else { - co = qemu_coroutine_create(bdrv_check_co_entry, &cco); - bdrv_coroutine_enter(bs, co); - BDRV_POLL_WHILE(bs, cco.ret == -EINPROGRESS); - } - - return cco.ret; -} - /* * Return values: * 0 - success @@ -5781,8 +5745,7 @@ void bdrv_init_with_whitelist(void) bdrv_init(); } -static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, - Error **errp) +int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp) { BdrvChild *child, *parent; uint64_t perm, shared_perm; @@ -5791,14 +5754,14 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, BdrvDirtyBitmap *bm; if (!bs->drv) { - return; + return -ENOMEDIUM; } QLIST_FOREACH(child, &bs->children, next) { bdrv_co_invalidate_cache(child->bs, &local_err); if (local_err) { error_propagate(errp, local_err); - return; + return -EINVAL; } } @@ -5821,7 +5784,7 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, ret = bdrv_check_perm(bs, NULL, perm, shared_perm, NULL, NULL, errp); if (ret < 0) { bs->open_flags |= BDRV_O_INACTIVE; - return; + return ret; } bdrv_set_perm(bs, perm, shared_perm); @@ -5830,7 +5793,7 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, if (local_err) { bs->open_flags |= BDRV_O_INACTIVE; error_propagate(errp, local_err); - return; + return -EINVAL; } } @@ -5842,7 +5805,7 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, if (ret < 0) { bs->open_flags |= BDRV_O_INACTIVE; error_setg_errno(errp, -ret, "Could not refresh total sector count"); - return; + return ret; } } @@ -5852,59 +5815,27 @@ static void coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, if (local_err) { bs->open_flags |= BDRV_O_INACTIVE; error_propagate(errp, local_err); - return; + return -EINVAL; } } } -} - -typedef struct InvalidateCacheCo { - BlockDriverState *bs; - Error **errp; - bool done; -} InvalidateCacheCo; - -static void coroutine_fn bdrv_invalidate_cache_co_entry(void *opaque) -{ - InvalidateCacheCo *ico = opaque; - bdrv_co_invalidate_cache(ico->bs, ico->errp); - ico->done = true; - aio_wait_kick(); -} - -void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) -{ - Coroutine *co; - InvalidateCacheCo ico = { - .bs = bs, - .done = false, - .errp = errp - }; - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_invalidate_cache_co_entry(&ico); - } else { - co = qemu_coroutine_create(bdrv_invalidate_cache_co_entry, &ico); - bdrv_coroutine_enter(bs, co); - BDRV_POLL_WHILE(bs, !ico.done); - } + return 0; } void bdrv_invalidate_cache_all(Error **errp) { BlockDriverState *bs; - Error *local_err = NULL; BdrvNextIterator it; for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; aio_context_acquire(aio_context); - bdrv_invalidate_cache(bs, &local_err); + ret = bdrv_invalidate_cache(bs, errp); aio_context_release(aio_context); - if (local_err) { - error_propagate(errp, local_err); + if (ret < 0) { bdrv_next_cleanup(&it); return; } diff --git a/block/block-gen.h b/block/block-gen.h new file mode 100644 index 0000000..f80cf48 --- /dev/null +++ b/block/block-gen.h @@ -0,0 +1,49 @@ +/* + * Block coroutine wrapping core, used by auto-generated block/block-gen.c + * + * Copyright (c) 2003 Fabrice Bellard + * Copyright (c) 2020 Virtuozzo International GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_BLOCK_GEN_H +#define BLOCK_BLOCK_GEN_H + +#include "block/block_int.h" + +/* Base structure for argument packing structures */ +typedef struct BdrvPollCo { + BlockDriverState *bs; + bool in_progress; + int ret; + Coroutine *co; /* Keep pointer here for debugging */ +} BdrvPollCo; + +static inline int bdrv_poll_co(BdrvPollCo *s) +{ + assert(!qemu_in_coroutine()); + + bdrv_coroutine_enter(s->bs, s->co); + BDRV_POLL_WHILE(s->bs, s->in_progress); + + return s->ret; +} + +#endif /* BLOCK_BLOCK_GEN_H */ diff --git a/block/coroutines.h b/block/coroutines.h new file mode 100644 index 0000000..f69179f --- /dev/null +++ b/block/coroutines.h @@ -0,0 +1,65 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_COROUTINES_INT_H +#define BLOCK_COROUTINES_INT_H + +#include "block/block_int.h" + +int coroutine_fn bdrv_co_check(BlockDriverState *bs, + BdrvCheckResult *res, BdrvCheckMode fix); +int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp); + +int generated_co_wrapper +bdrv_preadv(BdrvChild *child, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags); +int generated_co_wrapper +bdrv_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, BdrvRequestFlags flags); + +int coroutine_fn +bdrv_co_common_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file); +int generated_co_wrapper +bdrv_common_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file); + +int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs, + QEMUIOVector *qiov, int64_t pos); +int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs, + QEMUIOVector *qiov, int64_t pos); + +#endif /* BLOCK_COROUTINES_INT_H */ @@ -29,6 +29,7 @@ #include "block/blockjob.h" #include "block/blockjob_int.h" #include "block/block_int.h" +#include "block/coroutines.h" #include "qemu/cutils.h" #include "qapi/error.h" #include "qemu/error-report.h" @@ -889,89 +890,10 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, return 0; } -typedef int coroutine_fn BdrvRequestEntry(void *opaque); -typedef struct BdrvRunCo { - BdrvRequestEntry *entry; - void *opaque; - int ret; - bool done; - Coroutine *co; /* Coroutine, running bdrv_run_co_entry, for debugging */ -} BdrvRunCo; - -static void coroutine_fn bdrv_run_co_entry(void *opaque) -{ - BdrvRunCo *arg = opaque; - - arg->ret = arg->entry(arg->opaque); - arg->done = true; - aio_wait_kick(); -} - -static int bdrv_run_co(BlockDriverState *bs, BdrvRequestEntry *entry, - void *opaque) -{ - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - return entry(opaque); - } else { - BdrvRunCo s = { .entry = entry, .opaque = opaque }; - - s.co = qemu_coroutine_create(bdrv_run_co_entry, &s); - bdrv_coroutine_enter(bs, s.co); - - BDRV_POLL_WHILE(bs, !s.done); - - return s.ret; - } -} - -typedef struct RwCo { - BdrvChild *child; - int64_t offset; - QEMUIOVector *qiov; - bool is_write; - BdrvRequestFlags flags; -} RwCo; - -static int coroutine_fn bdrv_rw_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - if (!rwco->is_write) { - return bdrv_co_preadv(rwco->child, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } else { - return bdrv_co_pwritev(rwco->child, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } -} - -/* - * Process a vectored synchronous request using coroutines - */ -static int bdrv_prwv_co(BdrvChild *child, int64_t offset, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) -{ - RwCo rwco = { - .child = child, - .offset = offset, - .qiov = qiov, - .is_write = is_write, - .flags = flags, - }; - - return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco); -} - int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, int bytes, BdrvRequestFlags flags) { - QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes); - - return bdrv_prwv_co(child, offset, &qiov, true, + return bdrv_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); } @@ -1016,41 +938,19 @@ int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags) } } -/* return < 0 if error. See bdrv_pwrite() for the return codes */ -int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) -{ - int ret; - - ret = bdrv_prwv_co(child, offset, qiov, false, 0); - if (ret < 0) { - return ret; - } - - return qiov->size; -} - /* See bdrv_pwrite() for the return codes */ int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes) { + int ret; QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); if (bytes < 0) { return -EINVAL; } - return bdrv_preadv(child, offset, &qiov); -} - -int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) -{ - int ret; + ret = bdrv_preadv(child, offset, bytes, &qiov, 0); - ret = bdrv_prwv_co(child, offset, qiov, true, 0); - if (ret < 0) { - return ret; - } - - return qiov->size; + return ret < 0 ? ret : bytes; } /* Return no. of bytes on success or < 0 on error. Important errors are: @@ -1061,13 +961,16 @@ int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov) */ int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes) { + int ret; QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes); if (bytes < 0) { return -EINVAL; } - return bdrv_pwritev(child, offset, &qiov); + ret = bdrv_pwritev(child, offset, bytes, &qiov, 0); + + return ret < 0 ? ret : bytes; } /* @@ -2243,18 +2146,6 @@ int bdrv_flush_all(void) return result; } - -typedef struct BdrvCoBlockStatusData { - BlockDriverState *bs; - BlockDriverState *base; - bool want_zero; - int64_t offset; - int64_t bytes; - int64_t *pnum; - int64_t *map; - BlockDriverState **file; -} BdrvCoBlockStatusData; - /* * Returns the allocation status of the specified sectors. * Drivers not implementing the functionality are assumed to not support @@ -2449,14 +2340,15 @@ early_out: return ret; } -static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, - BlockDriverState *base, - bool want_zero, - int64_t offset, - int64_t bytes, - int64_t *pnum, - int64_t *map, - BlockDriverState **file) +int coroutine_fn +bdrv_co_common_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + bool want_zero, + int64_t offset, + int64_t bytes, + int64_t *pnum, + int64_t *map, + BlockDriverState **file) { BlockDriverState *p; int ret = 0; @@ -2489,43 +2381,6 @@ static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, return ret; } -/* Coroutine wrapper for bdrv_block_status_above() */ -static int coroutine_fn bdrv_block_status_above_co_entry(void *opaque) -{ - BdrvCoBlockStatusData *data = opaque; - - return bdrv_co_block_status_above(data->bs, data->base, - data->want_zero, - data->offset, data->bytes, - data->pnum, data->map, data->file); -} - -/* - * Synchronous wrapper around bdrv_co_block_status_above(). - * - * See bdrv_co_block_status_above() for details. - */ -static int bdrv_common_block_status_above(BlockDriverState *bs, - BlockDriverState *base, - bool want_zero, int64_t offset, - int64_t bytes, int64_t *pnum, - int64_t *map, - BlockDriverState **file) -{ - BdrvCoBlockStatusData data = { - .bs = bs, - .base = base, - .want_zero = want_zero, - .offset = offset, - .bytes = bytes, - .pnum = pnum, - .map = map, - .file = file, - }; - - return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data); -} - int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) @@ -2619,96 +2474,70 @@ int bdrv_is_allocated_above(BlockDriverState *top, return 0; } -typedef struct BdrvVmstateCo { - BlockDriverState *bs; - QEMUIOVector *qiov; - int64_t pos; - bool is_read; -} BdrvVmstateCo; - -static int coroutine_fn -bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, - bool is_read) +int coroutine_fn +bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BlockDriver *drv = bs->drv; BlockDriverState *child_bs = bdrv_primary_bs(bs); int ret = -ENOTSUP; + if (!drv) { + return -ENOMEDIUM; + } + bdrv_inc_in_flight(bs); - if (!drv) { - ret = -ENOMEDIUM; - } else if (drv->bdrv_load_vmstate) { - if (is_read) { - ret = drv->bdrv_load_vmstate(bs, qiov, pos); - } else { - ret = drv->bdrv_save_vmstate(bs, qiov, pos); - } + if (drv->bdrv_load_vmstate) { + ret = drv->bdrv_load_vmstate(bs, qiov, pos); } else if (child_bs) { - ret = bdrv_co_rw_vmstate(child_bs, qiov, pos, is_read); + ret = bdrv_co_readv_vmstate(child_bs, qiov, pos); } bdrv_dec_in_flight(bs); + return ret; } -static int coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque) +int coroutine_fn +bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - BdrvVmstateCo *co = opaque; + BlockDriver *drv = bs->drv; + BlockDriverState *child_bs = bdrv_primary_bs(bs); + int ret = -ENOTSUP; - return bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read); -} + if (!drv) { + return -ENOMEDIUM; + } -static inline int -bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, - bool is_read) -{ - BdrvVmstateCo data = { - .bs = bs, - .qiov = qiov, - .pos = pos, - .is_read = is_read, - }; + bdrv_inc_in_flight(bs); - return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data); + if (drv->bdrv_save_vmstate) { + ret = drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (child_bs) { + ret = bdrv_co_writev_vmstate(child_bs, qiov, pos); + } + + bdrv_dec_in_flight(bs); + + return ret; } int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int64_t pos, int size) { QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); - int ret; + int ret = bdrv_writev_vmstate(bs, &qiov, pos); - ret = bdrv_writev_vmstate(bs, &qiov, pos); - if (ret < 0) { - return ret; - } - - return size; -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - return bdrv_rw_vmstate(bs, qiov, pos, false); + return ret < 0 ? ret : size; } int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos, int size) { QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size); - int ret; + int ret = bdrv_readv_vmstate(bs, &qiov, pos); - ret = bdrv_readv_vmstate(bs, &qiov, pos); - if (ret < 0) { - return ret; - } - - return size; -} - -int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - return bdrv_rw_vmstate(bs, qiov, pos, true); + return ret < 0 ? ret : size; } /**************************************************************/ @@ -2748,11 +2577,6 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb) /**************************************************************/ /* Coroutine block device emulation */ -static int coroutine_fn bdrv_flush_co_entry(void *opaque) -{ - return bdrv_co_flush(opaque); -} - int coroutine_fn bdrv_co_flush(BlockDriverState *bs) { BdrvChild *primary_child = bdrv_primary_child(bs); @@ -2876,24 +2700,6 @@ early_exit: return ret; } -int bdrv_flush(BlockDriverState *bs) -{ - return bdrv_run_co(bs, bdrv_flush_co_entry, bs); -} - -typedef struct DiscardCo { - BdrvChild *child; - int64_t offset; - int64_t bytes; -} DiscardCo; - -static int coroutine_fn bdrv_pdiscard_co_entry(void *opaque) -{ - DiscardCo *rwco = opaque; - - return bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes); -} - int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) { @@ -3008,17 +2814,6 @@ out: return ret; } -int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes) -{ - DiscardCo rwco = { - .child = child, - .offset = offset, - .bytes = bytes, - }; - - return bdrv_run_co(child->bs, bdrv_pdiscard_co_entry, &rwco); -} - int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf) { BlockDriver *drv = bs->drv; @@ -3420,35 +3215,3 @@ out: return ret; } - -typedef struct TruncateCo { - BdrvChild *child; - int64_t offset; - bool exact; - PreallocMode prealloc; - BdrvRequestFlags flags; - Error **errp; -} TruncateCo; - -static int coroutine_fn bdrv_truncate_co_entry(void *opaque) -{ - TruncateCo *tco = opaque; - - return bdrv_co_truncate(tco->child, tco->offset, tco->exact, - tco->prealloc, tco->flags, tco->errp); -} - -int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, - PreallocMode prealloc, BdrvRequestFlags flags, Error **errp) -{ - TruncateCo tco = { - .child = child, - .offset = offset, - .exact = exact, - .prealloc = prealloc, - .flags = flags, - .errp = errp, - }; - - return bdrv_run_co(child->bs, bdrv_truncate_co_entry, &tco); -} diff --git a/block/meson.build b/block/meson.build index 0b38dc3..78e8b25 100644 --- a/block/meson.build +++ b/block/meson.build @@ -107,6 +107,14 @@ module_block_h = custom_target('module_block.h', command: [module_block_py, '@OUTPUT0@', modsrc]) block_ss.add(module_block_h) +wrapper_py = find_program('../scripts/block-coroutine-wrapper.py') +block_gen_c = custom_target('block-gen.c', + output: 'block-gen.c', + input: files('../include/block/block.h', + 'coroutines.h'), + command: [wrapper_py, '@OUTPUT@', '@INPUT@']) +block_ss.add(block_gen_c) + block_ss.add(files('stream.c')) softmmu_ss.add(files('qapi-sysemu.c')) diff --git a/block/nvme.c b/block/nvme.c index f4f27b6..b48f6f2 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -31,7 +31,7 @@ #define NVME_SQ_ENTRY_BYTES 64 #define NVME_CQ_ENTRY_BYTES 16 #define NVME_QUEUE_SIZE 128 -#define NVME_BAR_SIZE 8192 +#define NVME_DOORBELL_SIZE 4096 /* * We have to leave one slot empty as that is the full queue case where @@ -81,15 +81,6 @@ typedef struct { QEMUBH *completion_bh; } NVMeQueuePair; -/* Memory mapped registers */ -typedef volatile struct { - NvmeBar ctrl; - struct { - uint32_t sq_tail; - uint32_t cq_head; - } doorbells[]; -} NVMeRegs; - #define INDEX_ADMIN 0 #define INDEX_IO(n) (1 + n) @@ -102,7 +93,11 @@ enum { struct BDRVNVMeState { AioContext *aio_context; QEMUVFIOState *vfio; - NVMeRegs *regs; + /* Memory mapped registers */ + volatile struct { + uint32_t sq_tail; + uint32_t cq_head; + } *doorbells; /* The submission/completion queue pairs. * [0]: admin queue. * [1..]: io queues. @@ -247,14 +242,14 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, error_propagate(errp, local_err); goto fail; } - q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail; + q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail; nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); if (local_err) { error_propagate(errp, local_err); goto fail; } - q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head; + q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head; return q; fail: @@ -694,6 +689,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, uint64_t timeout_ms; uint64_t deadline, now; Error *local_err = NULL; + volatile NvmeBar *regs = NULL; qemu_co_mutex_init(&s->dma_map_lock); qemu_co_queue_init(&s->dma_flush_queue); @@ -712,32 +708,32 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, goto out; } - s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp); - if (!s->regs) { + regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar), + PROT_READ | PROT_WRITE, errp); + if (!regs) { ret = -EINVAL; goto out; } - /* Perform initialize sequence as described in NVMe spec "7.6.1 * Initialization". */ - cap = le64_to_cpu(s->regs->ctrl.cap); - if (!(cap & (1ULL << 37))) { + cap = le64_to_cpu(regs->cap); + if (!NVME_CAP_CSS(cap)) { error_setg(errp, "Device doesn't support NVMe command set"); ret = -EINVAL; goto out; } - s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF))); - s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t); + s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap)); + s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); bs->bl.opt_mem_alignment = s->page_size; - timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000); + timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000); /* Reset device to get a clean state. */ - s->regs->ctrl.cc = cpu_to_le32(le32_to_cpu(s->regs->ctrl.cc) & 0xFE); + regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE); /* Wait for CSTS.RDY = 0. */ deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS; - while (le32_to_cpu(s->regs->ctrl.csts) & 0x1) { + while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { error_setg(errp, "Timeout while waiting for device to reset (%" PRId64 " ms)", @@ -747,6 +743,13 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, } } + s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar), + NVME_DOORBELL_SIZE, PROT_WRITE, errp); + if (!s->doorbells) { + ret = -EINVAL; + goto out; + } + /* Set up admin queue. */ s->queues = g_new(NVMeQueuePair *, 1); s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0, @@ -758,18 +761,19 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, } s->nr_queues = 1; QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000); - s->regs->ctrl.aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE); - s->regs->ctrl.asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova); - s->regs->ctrl.acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova); + regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) | + (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT)); + regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova); + regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova); /* After setting up all control registers we can enable device now. */ - s->regs->ctrl.cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) | - (ctz32(NVME_SQ_ENTRY_BYTES) << 16) | - 0x1); + regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) | + (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) | + CC_EN_MASK); /* Wait for CSTS.RDY = 1. */ now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - deadline = now + timeout_ms * 1000000; - while (!(le32_to_cpu(s->regs->ctrl.csts) & 0x1)) { + deadline = now + timeout_ms * SCALE_MS; + while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { error_setg(errp, "Timeout while waiting for device to start (%" PRId64 " ms)", @@ -800,6 +804,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, ret = -EIO; } out: + if (regs) { + qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar)); + } + /* Cleaning up is done in nvme_file_open() upon error. */ return ret; } @@ -872,7 +880,8 @@ static void nvme_close(BlockDriverState *bs) &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, NULL, NULL); event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); - qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE); + qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells, + sizeof(NvmeBar), NVME_DOORBELL_SIZE); qemu_vfio_close(s->vfio); g_free(s->device); diff --git a/docs/devel/block-coroutine-wrapper.rst b/docs/devel/block-coroutine-wrapper.rst new file mode 100644 index 0000000..4128519 --- /dev/null +++ b/docs/devel/block-coroutine-wrapper.rst @@ -0,0 +1,54 @@ +======================= +block-coroutine-wrapper +======================= + +A lot of functions in QEMU block layer (see ``block/*``) can only be +called in coroutine context. Such functions are normally marked by the +coroutine_fn specifier. Still, sometimes we need to call them from +non-coroutine context; for this we need to start a coroutine, run the +needed function from it and wait for the coroutine to finish in a +BDRV_POLL_WHILE() loop. To run a coroutine we need a function with one +void* argument. So for each coroutine_fn function which needs a +non-coroutine interface, we should define a structure to pack the +parameters, define a separate function to unpack the parameters and +call the original function and finally define a new interface function +with same list of arguments as original one, which will pack the +parameters into a struct, create a coroutine, run it and wait in +BDRV_POLL_WHILE() loop. It's boring to create such wrappers by hand, +so we have a script to generate them. + +Usage +===== + +Assume we have defined the ``coroutine_fn`` function +``bdrv_co_foo(<some args>)`` and need a non-coroutine interface for it, +called ``bdrv_foo(<same args>)``. In this case the script can help. To +trigger the generation: + +1. You need ``bdrv_foo`` declaration somewhere (for example, in + ``block/coroutines.h``) with the ``generated_co_wrapper`` mark, + like this: + +.. code-block:: c + + int generated_co_wrapper bdrv_foo(<some args>); + +2. You need to feed this declaration to block-coroutine-wrapper script. + For this, add the .h (or .c) file with the declaration to the + ``input: files(...)`` list of ``block_gen_c`` target declaration in + ``block/meson.build`` + +You are done. During the build, coroutine wrappers will be generated in +``<BUILD_DIR>/block/block-gen.c``. + +Links +===== + +1. The script location is ``scripts/block-coroutine-wrapper.py``. + +2. Generic place for private ``generated_co_wrapper`` declarations is + ``block/coroutines.h``, for public declarations: + ``include/block/block.h`` + +3. The core API of generated coroutine wrappers is placed in + (not generated) ``block/block-gen.h`` diff --git a/docs/devel/index.rst b/docs/devel/index.rst index c34b43e..5fda2d3 100644 --- a/docs/devel/index.rst +++ b/docs/devel/index.rst @@ -32,3 +32,4 @@ Contents: s390-dasd-ipl clocks qom + block-coroutine-wrapper diff --git a/include/block/block.h b/include/block/block.h index 981ab5b..ce2ac39 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -10,6 +10,16 @@ #include "block/blockjob.h" #include "qemu/hbitmap.h" +/* + * generated_co_wrapper + * + * Function specifier, which does nothing but mark functions to be + * generated by scripts/block-coroutine-wrapper.py + * + * Read more in docs/devel/block-coroutine-wrapper.rst + */ +#define generated_co_wrapper + /* block.c */ typedef struct BlockDriver BlockDriver; typedef struct BdrvChild BdrvChild; @@ -294,7 +304,7 @@ enum BdrvChildRoleBits { BDRV_CHILD_FILTERED = (1 << 2), /* - * Child from which to read all data that isn’t allocated in the + * Child from which to read all data that isn't allocated in the * parent (i.e., the backing child); such data is copied to the * parent through COW (and optionally COR). * This field is mutually exclusive with DATA, METADATA, and @@ -373,9 +383,7 @@ int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, int bytes, BdrvRequestFlags flags); int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes); -int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov); int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes); -int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov); int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, const void *buf, int count); /* @@ -393,8 +401,9 @@ void bdrv_refresh_filename(BlockDriverState *bs); int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); -int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, - PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); +int generated_co_wrapper +bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); int64_t bdrv_nb_sectors(BlockDriverState *bs); int64_t bdrv_getlength(BlockDriverState *bs); @@ -436,7 +445,8 @@ typedef enum { BDRV_FIX_ERRORS = 2, } BdrvCheckMode; -int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix); +int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); /* The units of offset and total_work_size may be chosen arbitrarily by the * block driver; total_work_size may change during the course of the amendment @@ -460,12 +470,13 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb); int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); /* Invalidate any cached metadata used by image formats */ -void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp); +int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs, + Error **errp); void bdrv_invalidate_cache_all(Error **errp); int bdrv_inactivate_all(void); /* Ensure contents are flushed to disk. */ -int bdrv_flush(BlockDriverState *bs); +int generated_co_wrapper bdrv_flush(BlockDriverState *bs); int coroutine_fn bdrv_co_flush(BlockDriverState *bs); int bdrv_flush_all(void); void bdrv_close_all(void); @@ -480,7 +491,8 @@ void bdrv_drain_all(void); AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \ cond); }) -int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); +int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset, + int64_t bytes); int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); int bdrv_has_zero_init_1(BlockDriverState *bs); int bdrv_has_zero_init(BlockDriverState *bs); @@ -560,8 +572,10 @@ int path_has_protocol(const char *path); int path_is_absolute(const char *path); char *path_combine(const char *base_path, const char *filename); -int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int generated_co_wrapper +bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int generated_co_wrapper +bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int64_t pos, int size); diff --git a/include/qemu/vfio-helpers.h b/include/qemu/vfio-helpers.h index 1f057c2..4491c8e 100644 --- a/include/qemu/vfio-helpers.h +++ b/include/qemu/vfio-helpers.h @@ -22,7 +22,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s); void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host); void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, - uint64_t offset, uint64_t size, + uint64_t offset, uint64_t size, int prot, Error **errp); void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, uint64_t offset, uint64_t size); diff --git a/qemu-options.hx b/qemu-options.hx index 3564c23..1da52a2 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1053,7 +1053,8 @@ SRST The path to the image file in the local filesystem ``aio`` - Specifies the AIO backend (threads/native, default: threads) + Specifies the AIO backend (threads/native/io_uring, + default: threads) ``locking`` Specifies whether the image file is protected with Linux OFD @@ -1175,7 +1176,8 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n" " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" " [,snapshot=on|off][,rerror=ignore|stop|report]\n" - " [,werror=ignore|stop|report|enospc][,id=name][,aio=threads|native]\n" + " [,werror=ignore|stop|report|enospc][,id=name]\n" + " [,aio=threads|native|io_uring]\n" " [,readonly=on|off][,copy-on-read=on|off]\n" " [,discard=ignore|unmap][,detect-zeroes=on|off|unmap]\n" " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n" @@ -1247,8 +1249,8 @@ SRST The default mode is ``cache=writeback``. ``aio=aio`` - aio is "threads", or "native" and selects between pthread based - disk I/O and native Linux AIO. + aio is "threads", "native", or "io_uring" and selects between pthread + based disk I/O, native Linux AIO, or Linux io_uring API. ``format=format`` Specify which disk format will be used rather than detecting the diff --git a/scripts/block-coroutine-wrapper.py b/scripts/block-coroutine-wrapper.py new file mode 100644 index 0000000..0461fd1 --- /dev/null +++ b/scripts/block-coroutine-wrapper.py @@ -0,0 +1,167 @@ +#! /usr/bin/env python3 +"""Generate coroutine wrappers for block subsystem. + +The program parses one or several concatenated c files from stdin, +searches for functions with the 'generated_co_wrapper' specifier +and generates corresponding wrappers on stdout. + +Usage: block-coroutine-wrapper.py generated-file.c FILE.[ch]... + +Copyright (c) 2020 Virtuozzo International GmbH. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +""" + +import sys +import re +from typing import Iterator + + +def gen_header(): + copyright = re.sub('^.*Copyright', 'Copyright', __doc__, flags=re.DOTALL) + copyright = re.sub('^(?=.)', ' * ', copyright.strip(), flags=re.MULTILINE) + copyright = re.sub('^$', ' *', copyright, flags=re.MULTILINE) + return f"""\ +/* + * File is generated by scripts/block-coroutine-wrapper.py + * +{copyright} + */ + +#include "qemu/osdep.h" +#include "block/coroutines.h" +#include "block/block-gen.h" +#include "block/block_int.h"\ +""" + + +class ParamDecl: + param_re = re.compile(r'(?P<decl>' + r'(?P<type>.*[ *])' + r'(?P<name>[a-z][a-z0-9_]*)' + r')') + + def __init__(self, param_decl: str) -> None: + m = self.param_re.match(param_decl.strip()) + if m is None: + raise ValueError(f'Wrong parameter declaration: "{param_decl}"') + self.decl = m.group('decl') + self.type = m.group('type') + self.name = m.group('name') + + +class FuncDecl: + def __init__(self, return_type: str, name: str, args: str) -> None: + self.return_type = return_type.strip() + self.name = name.strip() + self.args = [ParamDecl(arg.strip()) for arg in args.split(',')] + + def gen_list(self, format: str) -> str: + return ', '.join(format.format_map(arg.__dict__) for arg in self.args) + + def gen_block(self, format: str) -> str: + return '\n'.join(format.format_map(arg.__dict__) for arg in self.args) + + +# Match wrappers declared with a generated_co_wrapper mark +func_decl_re = re.compile(r'^int\s*generated_co_wrapper\s*' + r'(?P<wrapper_name>[a-z][a-z0-9_]*)' + r'\((?P<args>[^)]*)\);$', re.MULTILINE) + + +def func_decl_iter(text: str) -> Iterator: + for m in func_decl_re.finditer(text): + yield FuncDecl(return_type='int', + name=m.group('wrapper_name'), + args=m.group('args')) + + +def snake_to_camel(func_name: str) -> str: + """ + Convert underscore names like 'some_function_name' to camel-case like + 'SomeFunctionName' + """ + words = func_name.split('_') + words = [w[0].upper() + w[1:] for w in words] + return ''.join(words) + + +def gen_wrapper(func: FuncDecl) -> str: + assert func.name.startswith('bdrv_') + assert not func.name.startswith('bdrv_co_') + assert func.return_type == 'int' + assert func.args[0].type in ['BlockDriverState *', 'BdrvChild *'] + + name = 'bdrv_co_' + func.name[5:] + bs = 'bs' if func.args[0].type == 'BlockDriverState *' else 'child->bs' + struct_name = snake_to_camel(name) + + return f"""\ +/* + * Wrappers for {name} + */ + +typedef struct {struct_name} {{ + BdrvPollCo poll_state; +{ func.gen_block(' {decl};') } +}} {struct_name}; + +static void coroutine_fn {name}_entry(void *opaque) +{{ + {struct_name} *s = opaque; + + s->poll_state.ret = {name}({ func.gen_list('s->{name}') }); + s->poll_state.in_progress = false; + + aio_wait_kick(); +}} + +int {func.name}({ func.gen_list('{decl}') }) +{{ + if (qemu_in_coroutine()) {{ + return {name}({ func.gen_list('{name}') }); + }} else {{ + {struct_name} s = {{ + .poll_state.bs = {bs}, + .poll_state.in_progress = true, + +{ func.gen_block(' .{name} = {name},') } + }}; + + s.poll_state.co = qemu_coroutine_create({name}_entry, &s); + + return bdrv_poll_co(&s.poll_state); + }} +}}""" + + +def gen_wrappers(input_code: str) -> str: + res = '' + for func in func_decl_iter(input_code): + res += '\n\n\n' + res += gen_wrapper(func) + + return res + + +if __name__ == '__main__': + if len(sys.argv) < 3: + exit(f'Usage: {sys.argv[0]} OUT_FILE.c IN_FILE.[ch]...') + + with open(sys.argv[1], 'w', encoding='utf-8') as f_out: + f_out.write(gen_header()) + for fname in sys.argv[2:]: + with open(fname, encoding='utf-8') as f_in: + f_out.write(gen_wrappers(f_in.read())) + f_out.write('\n') diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index 1107271..1595bbc 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -1872,7 +1872,7 @@ static int coroutine_fn bdrv_replace_test_co_preadv(BlockDriverState *bs, } s->io_co = NULL; - ret = bdrv_preadv(bs->backing, offset, qiov); + ret = bdrv_co_preadv(bs->backing, offset, bytes, qiov, 0); s->has_read = true; /* Wake up drain_co if it runs */ diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 583bdfb..c469beb 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -40,6 +40,11 @@ typedef struct { uint64_t iova; } IOVAMapping; +struct IOVARange { + uint64_t start; + uint64_t end; +}; + struct QEMUVFIOState { QemuMutex lock; @@ -49,6 +54,8 @@ struct QEMUVFIOState { int device; RAMBlockNotifier ram_notifier; struct vfio_region_info config_region_info, bar_region_info[6]; + struct IOVARange *usable_iova_ranges; + uint8_t nb_iova_ranges; /* These fields are protected by @lock */ /* VFIO's IO virtual address space is managed by splitting into a few @@ -146,13 +153,13 @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) * Map a PCI bar area. */ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, - uint64_t offset, uint64_t size, + uint64_t offset, uint64_t size, int prot, Error **errp) { void *p; assert_bar_index_valid(s, index); p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), - PROT_READ | PROT_WRITE, MAP_SHARED, + prot, MAP_SHARED, s->device, s->bar_region_info[index].offset + offset); if (p == MAP_FAILED) { error_setg_errno(errp, errno, "Failed to map BAR region"); @@ -236,6 +243,35 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int return ret == size ? 0 : -errno; } +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf) +{ + struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf; + struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset; + struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range; + int i; + + while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) { + if (!cap->next) { + return; + } + cap = (struct vfio_info_cap_header *)(buf + cap->next); + } + + cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap; + + s->nb_iova_ranges = cap_iova_range->nr_iovas; + if (s->nb_iova_ranges > 1) { + s->usable_iova_ranges = + g_realloc(s->usable_iova_ranges, + s->nb_iova_ranges * sizeof(struct IOVARange)); + } + + for (i = 0; i < s->nb_iova_ranges; i++) { + s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start; + s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end; + } +} + static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, Error **errp) { @@ -243,10 +279,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, int i; uint16_t pci_cmd; struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; - struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; + struct vfio_iommu_type1_info *iommu_info = NULL; + size_t iommu_info_size = sizeof(*iommu_info); struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; char *group_file = NULL; + s->usable_iova_ranges = NULL; + /* Create a new container */ s->container = open("/dev/vfio/vfio", O_RDWR); @@ -310,13 +349,35 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, goto fail; } + iommu_info = g_malloc0(iommu_info_size); + iommu_info->argsz = iommu_info_size; + /* Get additional IOMMU info */ - if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) { + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) { error_setg_errno(errp, errno, "Failed to get IOMMU info"); ret = -errno; goto fail; } + /* + * if the kernel does not report usable IOVA regions, choose + * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region + */ + s->nb_iova_ranges = 1; + s->usable_iova_ranges = g_new0(struct IOVARange, 1); + s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN; + s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1; + + if (iommu_info->argsz > iommu_info_size) { + iommu_info_size = iommu_info->argsz; + iommu_info = g_realloc(iommu_info, iommu_info_size); + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) { + ret = -errno; + goto fail; + } + collect_usable_iova_ranges(s, iommu_info); + } + s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); if (s->device < 0) { @@ -365,8 +426,13 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, if (ret) { goto fail; } + g_free(iommu_info); return 0; fail: + g_free(s->usable_iova_ranges); + s->usable_iova_ranges = NULL; + s->nb_iova_ranges = 0; + g_free(iommu_info); close(s->group); fail_container: close(s->container); @@ -601,6 +667,50 @@ static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) return true; } +static int +qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova) +{ + int i; + + for (i = 0; i < s->nb_iova_ranges; i++) { + if (s->usable_iova_ranges[i].end < s->low_water_mark) { + continue; + } + s->low_water_mark = + MAX(s->low_water_mark, s->usable_iova_ranges[i].start); + + if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size || + s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) { + *iova = s->low_water_mark; + s->low_water_mark += size; + return 0; + } + } + return -ENOMEM; +} + +static int +qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova) +{ + int i; + + for (i = s->nb_iova_ranges - 1; i >= 0; i--) { + if (s->usable_iova_ranges[i].start > s->high_water_mark) { + continue; + } + s->high_water_mark = + MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1); + + if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size || + s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) { + *iova = s->high_water_mark - size; + s->high_water_mark = *iova; + return 0; + } + } + return -ENOMEM; +} + /* Map [host, host + size) area into a contiguous IOVA address space, and store * the result in @iova if not NULL. The caller need to make sure the area is * aligned to page size, and mustn't overlap with existing mapping areas (split @@ -627,7 +737,11 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, goto out; } if (!temporary) { - iova0 = s->low_water_mark; + if (qemu_vfio_find_fixed_iova(s, size, &iova0)) { + ret = -ENOMEM; + goto out; + } + mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); if (!mapping) { ret = -ENOMEM; @@ -639,15 +753,16 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, qemu_vfio_undo_mapping(s, mapping, NULL); goto out; } - s->low_water_mark += size; qemu_vfio_dump_mappings(s); } else { - iova0 = s->high_water_mark - size; + if (qemu_vfio_find_temp_iova(s, size, &iova0)) { + ret = -ENOMEM; + goto out; + } ret = qemu_vfio_do_mapping(s, host, size, iova0); if (ret) { goto out; } - s->high_water_mark -= size; } } if (iova) { @@ -716,6 +831,8 @@ void qemu_vfio_close(QEMUVFIOState *s) qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); } ram_block_notifier_remove(&s->ram_notifier); + g_free(s->usable_iova_ranges); + s->nb_iova_ranges = 0; qemu_vfio_reset(s); close(s->device); close(s->group); |