-rw-r--r--  block/block-backend.c                         |  12
-rw-r--r--  block/file-posix.c                            |  29
-rw-r--r--  block/io.c                                    |   4
-rw-r--r--  block/io_uring.c                              |  25
-rw-r--r--  block/linux-aio.c                             |  25
-rw-r--r--  block/snapshot.c                              |   1
-rw-r--r--  hw/block/virtio-blk.c                         | 132
-rw-r--r--  hw/ide/core.c                                 |   3
-rw-r--r--  hw/ide/macio.c                                |   3
-rw-r--r--  hw/scsi/scsi-bus.c                            | 121
-rw-r--r--  hw/scsi/scsi-disk.c                           |  24
-rw-r--r--  hw/scsi/virtio-scsi-dataplane.c               | 103
-rw-r--r--  hw/scsi/virtio-scsi.c                         | 502
-rw-r--r--  hw/virtio/iothread-vq-mapping.c               | 131
-rw-r--r--  hw/virtio/meson.build                         |   1
-rw-r--r--  include/block/aio.h                           |   5
-rw-r--r--  include/block/raw-aio.h                       |  19
-rw-r--r--  include/hw/scsi/scsi.h                        |   8
-rw-r--r--  include/hw/virtio/iothread-vq-mapping.h       |  45
-rw-r--r--  include/hw/virtio/virtio-scsi.h               |  15
-rw-r--r--  include/system/block-backend-global-state.h   |   1
-rw-r--r--  include/system/dma.h                          |   3
-rw-r--r--  meson.build                                   |   8
-rwxr-xr-x  scripts/qcow2-to-stdout.py                    | 449
-rw-r--r--  system/dma-helpers.c                          |   8
-rw-r--r--  tests/qemu-iotests/051.pc.out                 |   2
-rwxr-xr-x  tests/qemu-iotests/tests/qsd-migrate          |   2
-rw-r--r--  util/aio-posix.c                              | 114
-rw-r--r--  util/aio-posix.h                              |   1
-rw-r--r--  util/async.c                                  |   1
30 files changed, 1286 insertions(+), 511 deletions(-)
diff --git a/block/block-backend.c b/block/block-backend.c
index 9288f7e..a402db1 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2357,18 +2357,6 @@ void *blk_blockalign(BlockBackend *blk, size_t size)
return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
}
-bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
-{
- BlockDriverState *bs = blk_bs(blk);
- GLOBAL_STATE_CODE();
- GRAPH_RDLOCK_GUARD_MAINLOOP();
-
- if (!bs) {
- return false;
- }
-
- return bdrv_op_is_blocked(bs, op, errp);
-}
/**
* Return BB's current AioContext. Note that this context may change
diff --git a/block/file-posix.c b/block/file-posix.c
index 44e16dd..56d1972 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -194,6 +194,7 @@ static int fd_open(BlockDriverState *bs)
}
static int64_t raw_getlength(BlockDriverState *bs);
+static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs);
typedef struct RawPosixAIOData {
BlockDriverState *bs;
@@ -804,6 +805,13 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
#endif
s->needs_alignment = raw_needs_alignment(bs);
+ bs->supported_write_flags = BDRV_REQ_FUA;
+ if (s->use_linux_aio && !laio_has_fua()) {
+ bs->supported_write_flags &= ~BDRV_REQ_FUA;
+ } else if (s->use_linux_io_uring && !luring_has_fua()) {
+ bs->supported_write_flags &= ~BDRV_REQ_FUA;
+ }
+
bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
if (S_ISREG(st.st_mode)) {
/* When extending regular files, we get zeros from the OS */
@@ -2477,7 +2485,8 @@ static inline bool raw_check_linux_aio(BDRVRawState *s)
#endif
static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
- uint64_t bytes, QEMUIOVector *qiov, int type)
+ uint64_t bytes, QEMUIOVector *qiov, int type,
+ int flags)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
@@ -2508,13 +2517,13 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
#ifdef CONFIG_LINUX_IO_URING
} else if (raw_check_linux_io_uring(s)) {
assert(qiov->size == bytes);
- ret = luring_co_submit(bs, s->fd, offset, qiov, type);
+ ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
goto out;
#endif
#ifdef CONFIG_LINUX_AIO
} else if (raw_check_linux_aio(s)) {
assert(qiov->size == bytes);
- ret = laio_co_submit(s->fd, offset, qiov, type,
+ ret = laio_co_submit(s->fd, offset, qiov, type, flags,
s->aio_max_batch);
goto out;
#endif
@@ -2534,6 +2543,10 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
assert(qiov->size == bytes);
ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
+ if (ret == 0 && (flags & BDRV_REQ_FUA)) {
+ /* TODO Use pwritev2() instead if it's available */
+ ret = raw_co_flush_to_disk(bs);
+ }
goto out; /* Avoid the compiler err of unused label */
out:
@@ -2571,14 +2584,14 @@ static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
- return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);
+ return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ, flags);
}
static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
- return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);
+ return raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE, flags);
}
static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
@@ -2600,12 +2613,12 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
#ifdef CONFIG_LINUX_IO_URING
if (raw_check_linux_io_uring(s)) {
- return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
+ return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
}
#endif
#ifdef CONFIG_LINUX_AIO
if (s->has_laio_fdsync && raw_check_linux_aio(s)) {
- return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
+ return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0, 0);
}
#endif
return raw_thread_pool_submit(handle_aiocb_flush, &acb);
@@ -3540,7 +3553,7 @@ static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
}
trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
- return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
+ return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND, 0);
}
#endif
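
For reference, the TODO in the thread-pool path above can be satisfied with pwritev2() and RWF_DSYNC, which is what the aio backends below use. A minimal standalone sketch (not QEMU code; write_fua() and its fallback are illustrative), assuming Linux 4.7+/glibc 2.26+ for RWF_DSYNC:

    #define _GNU_SOURCE
    #include <sys/uio.h>
    #include <unistd.h>

    /* Write iov at offset with FUA semantics: the data (and the metadata needed
     * to retrieve it) reaches stable storage before the call returns. */
    static ssize_t write_fua(int fd, const struct iovec *iov, int iovcnt,
                             off_t offset)
    {
    #ifdef RWF_DSYNC
        /* Per-request O_DSYNC, no separate flush needed */
        return pwritev2(fd, iov, iovcnt, offset, RWF_DSYNC);
    #else
        /* Fallback mirrors raw_co_prw(): plain write followed by a flush */
        ssize_t ret = pwritev(fd, iov, iovcnt, offset);
        if (ret >= 0 && fdatasync(fd) < 0) {
            return -1;
        }
        return ret;
    #endif
    }
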
diff --git a/block/io.c b/block/io.c
index d369b99..1ba8d1a 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1058,6 +1058,10 @@ bdrv_driver_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
return -ENOMEDIUM;
}
+ if (bs->open_flags & BDRV_O_NO_FLUSH) {
+ flags &= ~BDRV_REQ_FUA;
+ }
+
if ((flags & BDRV_REQ_FUA) &&
(~bs->supported_write_flags & BDRV_REQ_FUA)) {
flags &= ~BDRV_REQ_FUA;
diff --git a/block/io_uring.c b/block/io_uring.c
index f52b66b..dd4f304 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -335,15 +335,24 @@ static void luring_deferred_fn(void *opaque)
*
*/
static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
- uint64_t offset, int type)
+ uint64_t offset, int type, BdrvRequestFlags flags)
{
int ret;
struct io_uring_sqe *sqes = &luringcb->sqeq;
switch (type) {
case QEMU_AIO_WRITE:
+#ifdef HAVE_IO_URING_PREP_WRITEV2
+ {
+ int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
+ io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov,
+ luringcb->qiov->niov, offset, luring_flags);
+ }
+#else
+ assert(flags == 0);
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
+#endif
break;
case QEMU_AIO_ZONE_APPEND:
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
@@ -380,7 +389,8 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
}
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
- QEMUIOVector *qiov, int type)
+ QEMUIOVector *qiov, int type,
+ BdrvRequestFlags flags)
{
int ret;
AioContext *ctx = qemu_get_current_aio_context();
@@ -393,7 +403,7 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
};
trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
type);
- ret = luring_do_submit(fd, &luringcb, s, offset, type);
+ ret = luring_do_submit(fd, &luringcb, s, offset, type, flags);
if (ret < 0) {
return ret;
@@ -448,3 +458,12 @@ void luring_cleanup(LuringState *s)
trace_luring_cleanup_state(s);
g_free(s);
}
+
+bool luring_has_fua(void)
+{
+#ifdef HAVE_IO_URING_PREP_WRITEV2
+ return true;
+#else
+ return false;
+#endif
+}
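
The io_uring path above reaches the same effect through io_uring_prep_writev2(). A short standalone liburing sketch (assumes a liburing release that provides io_uring_prep_writev2(); write_fua_uring() is an illustrative helper with minimal error handling):

    #define _GNU_SOURCE
    #include <liburing.h>
    #include <sys/uio.h>

    static int write_fua_uring(int fd, struct iovec *iov, unsigned niov,
                               off_t offset)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int ret;

        if (io_uring_queue_init(8, &ring, 0) < 0) {
            return -1;
        }

        sqe = io_uring_get_sqe(&ring);
        /* RWF_DSYNC requests O_DSYNC semantics for this write only */
        io_uring_prep_writev2(sqe, fd, iov, niov, offset, RWF_DSYNC);
        io_uring_submit(&ring);

        ret = io_uring_wait_cqe(&ring, &cqe);
        if (ret == 0) {
            ret = cqe->res;                /* bytes written, or negative errno */
            io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_queue_exit(&ring);
        return ret;
    }
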
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 194c8f4..407369f 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -368,7 +368,8 @@ static void laio_deferred_fn(void *opaque)
}
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
- int type, uint64_t dev_max_batch)
+ int type, BdrvRequestFlags flags,
+ uint64_t dev_max_batch)
{
LinuxAioState *s = laiocb->ctx;
struct iocb *iocbs = &laiocb->iocb;
@@ -376,7 +377,15 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
switch (type) {
case QEMU_AIO_WRITE:
+#ifdef HAVE_IO_PREP_PWRITEV2
+ {
+ int laio_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
+ io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov, offset, laio_flags);
+ }
+#else
+ assert(flags == 0);
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+#endif
break;
case QEMU_AIO_ZONE_APPEND:
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
@@ -409,7 +418,8 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
}
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
- int type, uint64_t dev_max_batch)
+ int type, BdrvRequestFlags flags,
+ uint64_t dev_max_batch)
{
int ret;
AioContext *ctx = qemu_get_current_aio_context();
@@ -422,7 +432,7 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
.qiov = qiov,
};
- ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
+ ret = laio_do_submit(fd, &laiocb, offset, type, flags, dev_max_batch);
if (ret < 0) {
return ret;
}
@@ -505,3 +515,12 @@ bool laio_has_fdsync(int fd)
io_destroy(ctx);
return (ret == -EINVAL) ? false : true;
}
+
+bool laio_has_fua(void)
+{
+#ifdef HAVE_IO_PREP_PWRITEV2
+ return true;
+#else
+ return false;
+#endif
+}
diff --git a/block/snapshot.c b/block/snapshot.c
index 9c44780..22567f1 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -296,6 +296,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
bdrv_graph_wrunlock();
ret = bdrv_snapshot_goto(fallback_bs, snapshot_id, errp);
+ memset(bs->opaque, 0, drv->instance_size);
open_ret = drv->bdrv_open(bs, options, bs->open_flags, &local_err);
qobject_unref(options);
if (open_ret < 0) {
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 5135b4d..5077793 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -33,6 +33,7 @@
#endif
#include "hw/virtio/virtio-bus.h"
#include "migration/qemu-file-types.h"
+#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-access.h"
#include "hw/virtio/virtio-blk-common.h"
#include "qemu/coroutine.h"
@@ -1423,128 +1424,6 @@ static const BlockDevOps virtio_block_ops = {
.drained_end = virtio_blk_drained_end,
};
-static bool
-validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list,
- uint16_t num_queues, Error **errp)
-{
- g_autofree unsigned long *vqs = bitmap_new(num_queues);
- g_autoptr(GHashTable) iothreads =
- g_hash_table_new(g_str_hash, g_str_equal);
-
- for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
- const char *name = node->value->iothread;
- uint16List *vq;
-
- if (!iothread_by_id(name)) {
- error_setg(errp, "IOThread \"%s\" object does not exist", name);
- return false;
- }
-
- if (!g_hash_table_add(iothreads, (gpointer)name)) {
- error_setg(errp,
- "duplicate IOThread name \"%s\" in iothread-vq-mapping",
- name);
- return false;
- }
-
- if (node != list) {
- if (!!node->value->vqs != !!list->value->vqs) {
- error_setg(errp, "either all items in iothread-vq-mapping "
- "must have vqs or none of them must have it");
- return false;
- }
- }
-
- for (vq = node->value->vqs; vq; vq = vq->next) {
- if (vq->value >= num_queues) {
- error_setg(errp, "vq index %u for IOThread \"%s\" must be "
- "less than num_queues %u in iothread-vq-mapping",
- vq->value, name, num_queues);
- return false;
- }
-
- if (test_and_set_bit(vq->value, vqs)) {
- error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
- "because it is already assigned", vq->value, name);
- return false;
- }
- }
- }
-
- if (list->value->vqs) {
- for (uint16_t i = 0; i < num_queues; i++) {
- if (!test_bit(i, vqs)) {
- error_setg(errp,
- "missing vq %u IOThread assignment in iothread-vq-mapping",
- i);
- return false;
- }
- }
- }
-
- return true;
-}
-
-/**
- * apply_iothread_vq_mapping:
- * @iothread_vq_mapping_list: The mapping of virtqueues to IOThreads.
- * @vq_aio_context: The array of AioContext pointers to fill in.
- * @num_queues: The length of @vq_aio_context.
- * @errp: If an error occurs, a pointer to the area to store the error.
- *
- * Fill in the AioContext for each virtqueue in the @vq_aio_context array given
- * the iothread-vq-mapping parameter in @iothread_vq_mapping_list.
- *
- * Returns: %true on success, %false on failure.
- **/
-static bool apply_iothread_vq_mapping(
- IOThreadVirtQueueMappingList *iothread_vq_mapping_list,
- AioContext **vq_aio_context,
- uint16_t num_queues,
- Error **errp)
-{
- IOThreadVirtQueueMappingList *node;
- size_t num_iothreads = 0;
- size_t cur_iothread = 0;
-
- if (!validate_iothread_vq_mapping_list(iothread_vq_mapping_list,
- num_queues, errp)) {
- return false;
- }
-
- for (node = iothread_vq_mapping_list; node; node = node->next) {
- num_iothreads++;
- }
-
- for (node = iothread_vq_mapping_list; node; node = node->next) {
- IOThread *iothread = iothread_by_id(node->value->iothread);
- AioContext *ctx = iothread_get_aio_context(iothread);
-
- /* Released in virtio_blk_vq_aio_context_cleanup() */
- object_ref(OBJECT(iothread));
-
- if (node->value->vqs) {
- uint16List *vq;
-
- /* Explicit vq:IOThread assignment */
- for (vq = node->value->vqs; vq; vq = vq->next) {
- assert(vq->value < num_queues);
- vq_aio_context[vq->value] = ctx;
- }
- } else {
- /* Round-robin vq:IOThread assignment */
- for (unsigned i = cur_iothread; i < num_queues;
- i += num_iothreads) {
- vq_aio_context[i] = ctx;
- }
- }
-
- cur_iothread++;
- }
-
- return true;
-}
-
/* Context: BQL held */
static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
{
@@ -1577,7 +1456,7 @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp)
s->vq_aio_context = g_new(AioContext *, conf->num_queues);
if (conf->iothread_vq_mapping_list) {
- if (!apply_iothread_vq_mapping(conf->iothread_vq_mapping_list,
+ if (!iothread_vq_mapping_apply(conf->iothread_vq_mapping_list,
s->vq_aio_context,
conf->num_queues,
errp)) {
@@ -1611,12 +1490,7 @@ static void virtio_blk_vq_aio_context_cleanup(VirtIOBlock *s)
assert(!s->ioeventfd_started);
if (conf->iothread_vq_mapping_list) {
- IOThreadVirtQueueMappingList *node;
-
- for (node = conf->iothread_vq_mapping_list; node; node = node->next) {
- IOThread *iothread = iothread_by_id(node->value->iothread);
- object_unref(OBJECT(iothread));
- }
+ iothread_vq_mapping_cleanup(conf->iothread_vq_mapping_list);
}
if (conf->iothread) {
diff --git a/hw/ide/core.c b/hw/ide/core.c
index f9baba5..b14983e 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -968,8 +968,7 @@ static void ide_dma_cb(void *opaque, int ret)
BDRV_SECTOR_SIZE, ide_dma_cb, s);
break;
case IDE_DMA_TRIM:
- s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk),
- &s->sg, offset, BDRV_SECTOR_SIZE,
+ s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, BDRV_SECTOR_SIZE,
ide_issue_trim, s, ide_dma_cb, s,
DMA_DIRECTION_TO_DEVICE);
break;
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index 5fe764b..c8e8e44 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -187,8 +187,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
pmac_ide_transfer_cb, io);
break;
case IDE_DMA_TRIM:
- s->bus->dma->aiocb = dma_blk_io(blk_get_aio_context(s->blk), &s->sg,
- offset, 0x1, ide_issue_trim, s,
+ s->bus->dma->aiocb = dma_blk_io(&s->sg, offset, 0x1, ide_issue_trim, s,
pmac_ide_transfer_cb, io,
DMA_DIRECTION_TO_DEVICE);
break;
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 7d45468..ece1107 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -100,8 +100,15 @@ static void scsi_device_for_each_req_sync(SCSIDevice *s,
assert(!runstate_is_running());
assert(qemu_in_main_thread());
- QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
- fn(req, opaque);
+ /*
+ * Locking is not necessary because the guest is stopped and no other
+ * threads can be accessing the requests list, but take the lock for
+ * consistency.
+ */
+ WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
+ QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
+ fn(req, opaque);
+ }
}
}
@@ -115,21 +122,29 @@ static void scsi_device_for_each_req_async_bh(void *opaque)
{
g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
SCSIDevice *s = data->s;
- AioContext *ctx;
- SCSIRequest *req;
- SCSIRequest *next;
+ g_autoptr(GList) reqs = NULL;
/*
- * The BB cannot have changed contexts between this BH being scheduled and
- * now: BBs' AioContexts, when they have a node attached, can only be
- * changed via bdrv_try_change_aio_context(), in a drained section. While
- * we have the in-flight counter incremented, that drain must block.
+ * Build a list of requests in this AioContext so fn() can be invoked later
+ * outside requests_lock.
*/
- ctx = blk_get_aio_context(s->conf.blk);
- assert(ctx == qemu_get_current_aio_context());
+ WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
+ AioContext *ctx = qemu_get_current_aio_context();
+ SCSIRequest *req;
+ SCSIRequest *next;
+
+ QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
+ if (req->ctx == ctx) {
+ scsi_req_ref(req); /* dropped after calling fn() */
+ reqs = g_list_prepend(reqs, req);
+ }
+ }
+ }
- QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
- data->fn(req, data->fn_opaque);
+ /* Call fn() on each request */
+ for (GList *elem = g_list_first(reqs); elem; elem = g_list_next(elem)) {
+ data->fn(elem->data, data->fn_opaque);
+ scsi_req_unref(elem->data);
}
/* Drop the reference taken by scsi_device_for_each_req_async() */
@@ -139,9 +154,35 @@ static void scsi_device_for_each_req_async_bh(void *opaque)
blk_dec_in_flight(s->conf.blk);
}
+static void scsi_device_for_each_req_async_do_ctx(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ AioContext *ctx = key;
+ SCSIDeviceForEachReqAsyncData *params = user_data;
+ SCSIDeviceForEachReqAsyncData *data;
+
+ data = g_new(SCSIDeviceForEachReqAsyncData, 1);
+ data->s = params->s;
+ data->fn = params->fn;
+ data->fn_opaque = params->fn_opaque;
+
+ /*
+ * Hold a reference to the SCSIDevice until
+ * scsi_device_for_each_req_async_bh() finishes.
+ */
+ object_ref(OBJECT(data->s));
+
+ /* Paired with scsi_device_for_each_req_async_bh() */
+ blk_inc_in_flight(data->s->conf.blk);
+
+ aio_bh_schedule_oneshot(ctx, scsi_device_for_each_req_async_bh, data);
+}
+
/*
* Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
- * runs in the AioContext that is executing the request.
+ * must be thread-safe because it runs concurrently in each AioContext that is
+ * executing a request.
+ *
* Keeps the BlockBackend's in-flight counter incremented until everything is
* done, so draining it will settle all scheduled @fn() calls.
*/
@@ -151,24 +192,26 @@ static void scsi_device_for_each_req_async(SCSIDevice *s,
{
assert(qemu_in_main_thread());
- SCSIDeviceForEachReqAsyncData *data =
- g_new(SCSIDeviceForEachReqAsyncData, 1);
-
- data->s = s;
- data->fn = fn;
- data->fn_opaque = opaque;
-
- /*
- * Hold a reference to the SCSIDevice until
- * scsi_device_for_each_req_async_bh() finishes.
- */
- object_ref(OBJECT(s));
+ /* The set of AioContexts where the requests are being processed */
+ g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
+ WITH_QEMU_LOCK_GUARD(&s->requests_lock) {
+ SCSIRequest *req;
+ QTAILQ_FOREACH(req, &s->requests, next) {
+ g_hash_table_add(aio_contexts, req->ctx);
+ }
+ }
- /* Paired with blk_dec_in_flight() in scsi_device_for_each_req_async_bh() */
- blk_inc_in_flight(s->conf.blk);
- aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk),
- scsi_device_for_each_req_async_bh,
- data);
+ /* Schedule a BH for each AioContext */
+ SCSIDeviceForEachReqAsyncData params = {
+ .s = s,
+ .fn = fn,
+ .fn_opaque = opaque,
+ };
+ g_hash_table_foreach(
+ aio_contexts,
+ scsi_device_for_each_req_async_do_ctx,
+ &params
+ );
}
static void scsi_device_realize(SCSIDevice *s, Error **errp)
@@ -349,6 +392,7 @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp)
dev->lun = lun;
}
+ qemu_mutex_init(&dev->requests_lock);
QTAILQ_INIT(&dev->requests);
scsi_device_realize(dev, &local_err);
if (local_err) {
@@ -369,6 +413,8 @@ static void scsi_qdev_unrealize(DeviceState *qdev)
scsi_device_purge_requests(dev, SENSE_CODE(NO_SENSE));
+ qemu_mutex_destroy(&dev->requests_lock);
+
scsi_device_unrealize(dev);
blockdev_mark_auto_del(dev->conf.blk);
@@ -868,6 +914,7 @@ invalid_opcode:
}
}
+ req->ctx = qemu_get_current_aio_context();
req->cmd = cmd;
req->residual = req->cmd.xfer;
@@ -964,7 +1011,10 @@ static void scsi_req_enqueue_internal(SCSIRequest *req)
req->sg = NULL;
}
req->enqueued = true;
- QTAILQ_INSERT_TAIL(&req->dev->requests, req, next);
+
+ WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) {
+ QTAILQ_INSERT_TAIL(&req->dev->requests, req, next);
+ }
}
int32_t scsi_req_enqueue(SCSIRequest *req)
@@ -984,7 +1034,9 @@ static void scsi_req_dequeue(SCSIRequest *req)
trace_scsi_req_dequeue(req->dev->id, req->lun, req->tag);
req->retry = false;
if (req->enqueued) {
- QTAILQ_REMOVE(&req->dev->requests, req, next);
+ WITH_QEMU_LOCK_GUARD(&req->dev->requests_lock) {
+ QTAILQ_REMOVE(&req->dev->requests, req, next);
+ }
req->enqueued = false;
scsi_req_unref(req);
}
@@ -1961,8 +2013,7 @@ static void scsi_device_class_init(ObjectClass *klass, void *data)
static void scsi_dev_instance_init(Object *obj)
{
- DeviceState *dev = DEVICE(obj);
- SCSIDevice *s = SCSI_DEVICE(dev);
+ SCSIDevice *s = SCSI_DEVICE(obj);
device_add_bootindex_property(obj, &s->conf.bootindex,
"bootindex", NULL,
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 7c87b20..8da1d5a 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -106,7 +106,6 @@ struct SCSIDiskState {
uint64_t max_unmap_size;
uint64_t max_io_size;
uint32_t quirks;
- QEMUBH *bh;
char *version;
char *serial;
char *vendor;
@@ -329,9 +328,8 @@ static void scsi_aio_complete(void *opaque, int ret)
SCSIDiskReq *r = (SCSIDiskReq *)opaque;
SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
- /* The request must only run in the BlockBackend's AioContext */
- assert(blk_get_aio_context(s->qdev.conf.blk) ==
- qemu_get_current_aio_context());
+ /* The request must run in its AioContext */
+ assert(r->req.ctx == qemu_get_current_aio_context());
assert(r->req.aiocb != NULL);
r->req.aiocb = NULL;
@@ -431,12 +429,10 @@ static void scsi_dma_complete(void *opaque, int ret)
static void scsi_read_complete_noio(SCSIDiskReq *r, int ret)
{
- SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n;
- /* The request must only run in the BlockBackend's AioContext */
- assert(blk_get_aio_context(s->qdev.conf.blk) ==
- qemu_get_current_aio_context());
+ /* The request must run in its AioContext */
+ assert(r->req.ctx == qemu_get_current_aio_context());
assert(r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, ret > 0)) {
@@ -488,8 +484,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret)
if (r->req.sg) {
dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ);
r->req.residual -= r->req.sg->size;
- r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
- r->req.sg, r->sector << BDRV_SECTOR_BITS,
+ r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS,
BDRV_SECTOR_SIZE,
sdc->dma_readv, r, scsi_dma_complete, r,
DMA_DIRECTION_FROM_DEVICE);
@@ -564,12 +559,10 @@ static void scsi_read_data(SCSIRequest *req)
static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
{
- SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
uint32_t n;
- /* The request must only run in the BlockBackend's AioContext */
- assert(blk_get_aio_context(s->qdev.conf.blk) ==
- qemu_get_current_aio_context());
+ /* The request must run in its AioContext */
+ assert(r->req.ctx == qemu_get_current_aio_context());
assert (r->req.aiocb == NULL);
if (scsi_disk_req_check_error(r, ret, ret > 0)) {
@@ -651,8 +644,7 @@ static void scsi_write_data(SCSIRequest *req)
if (r->req.sg) {
dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_WRITE);
r->req.residual -= r->req.sg->size;
- r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk),
- r->req.sg, r->sector << BDRV_SECTOR_BITS,
+ r->req.aiocb = dma_blk_io(r->req.sg, r->sector << BDRV_SECTOR_BITS,
BDRV_SECTOR_SIZE,
sdc->dma_writev, r, scsi_dma_complete, r,
DMA_DIRECTION_TO_DEVICE);
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index f49ab98..95f13fb 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -18,6 +18,7 @@
#include "system/block-backend.h"
#include "hw/scsi/scsi.h"
#include "scsi/constants.h"
+#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-bus.h"
/* Context: BQL held */
@@ -28,7 +29,14 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
- if (vs->conf.iothread) {
+ if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) {
+ error_setg(errp,
+ "iothread and iothread-vq-mapping properties cannot be set "
+ "at the same time");
+ return;
+ }
+
+ if (vs->conf.iothread || vs->conf.iothread_vq_mapping_list) {
if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
error_setg(errp,
"device is incompatible with iothread "
@@ -39,15 +47,64 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
error_setg(errp, "ioeventfd is required for iothread");
return;
}
- s->ctx = iothread_get_aio_context(vs->conf.iothread);
- } else {
- if (!virtio_device_ioeventfd_enabled(vdev)) {
+ }
+
+ s->vq_aio_context = g_new(AioContext *, vs->conf.num_queues +
+ VIRTIO_SCSI_VQ_NUM_FIXED);
+
+ /*
+ * Handle the ctrl virtqueue in the main loop thread where device resets
+ * can be performed.
+ */
+ s->vq_aio_context[0] = qemu_get_aio_context();
+
+ /*
+ * Handle the event virtqueue in the main loop thread where its no_poll
+ * behavior won't stop IOThread polling.
+ */
+ s->vq_aio_context[1] = qemu_get_aio_context();
+
+ if (vs->conf.iothread_vq_mapping_list) {
+ if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list,
+ &s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED],
+ vs->conf.num_queues, errp)) {
+ g_free(s->vq_aio_context);
+ s->vq_aio_context = NULL;
return;
}
- s->ctx = qemu_get_aio_context();
+ } else if (vs->conf.iothread) {
+ AioContext *ctx = iothread_get_aio_context(vs->conf.iothread);
+ for (uint16_t i = 0; i < vs->conf.num_queues; i++) {
+ s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx;
+ }
+
+ /* Released in virtio_scsi_dataplane_cleanup() */
+ object_ref(OBJECT(vs->conf.iothread));
+ } else {
+ AioContext *ctx = qemu_get_aio_context();
+ for (unsigned i = 0; i < vs->conf.num_queues; i++) {
+ s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i] = ctx;
+ }
}
}
+/* Context: BQL held */
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s)
+{
+ VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+
+ if (vs->conf.iothread_vq_mapping_list) {
+ iothread_vq_mapping_cleanup(vs->conf.iothread_vq_mapping_list);
+ }
+
+ if (vs->conf.iothread) {
+ object_unref(OBJECT(vs->conf.iothread));
+ }
+
+ g_free(s->vq_aio_context);
+ s->vq_aio_context = NULL;
+}
+
static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
{
BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
@@ -66,31 +123,20 @@ static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
}
/* Context: BH in IOThread */
-static void virtio_scsi_dataplane_stop_bh(void *opaque)
+static void virtio_scsi_dataplane_stop_vq_bh(void *opaque)
{
- VirtIOSCSI *s = opaque;
- VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+ AioContext *ctx = qemu_get_current_aio_context();
+ VirtQueue *vq = opaque;
EventNotifier *host_notifier;
- int i;
- virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx);
- host_notifier = virtio_queue_get_host_notifier(vs->ctrl_vq);
+ virtio_queue_aio_detach_host_notifier(vq, ctx);
+ host_notifier = virtio_queue_get_host_notifier(vq);
/*
* Test and clear notifier after disabling event, in case poll callback
* didn't have time to run.
*/
virtio_queue_host_notifier_read(host_notifier);
-
- virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx);
- host_notifier = virtio_queue_get_host_notifier(vs->event_vq);
- virtio_queue_host_notifier_read(host_notifier);
-
- for (i = 0; i < vs->conf.num_queues; i++) {
- virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx);
- host_notifier = virtio_queue_get_host_notifier(vs->cmd_vqs[i]);
- virtio_queue_host_notifier_read(host_notifier);
- }
}
/* Context: BQL held */
@@ -154,11 +200,14 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
smp_wmb(); /* paired with aio_notify_accept() */
if (s->bus.drain_count == 0) {
- virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx);
- virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx);
+ virtio_queue_aio_attach_host_notifier(vs->ctrl_vq,
+ s->vq_aio_context[0]);
+ virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq,
+ s->vq_aio_context[1]);
for (i = 0; i < vs->conf.num_queues; i++) {
- virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx);
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+ virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], ctx);
}
}
return 0;
@@ -207,7 +256,11 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
s->dataplane_stopping = true;
if (s->bus.drain_count == 0) {
- aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s);
+ for (i = 0; i < vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; i++) {
+ VirtQueue *vq = virtio_get_queue(&vs->parent_obj, i);
+ AioContext *ctx = s->vq_aio_context[i];
+ aio_wait_bh_oneshot(ctx, virtio_scsi_dataplane_stop_vq_bh, vq);
+ }
}
blk_drain_all(); /* ensure there are no in-flight requests */
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 7d094e1..f5a3aa2 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -27,6 +27,7 @@
#include "hw/qdev-properties.h"
#include "hw/scsi/scsi.h"
#include "scsi/constants.h"
+#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "trace.h"
@@ -47,7 +48,7 @@ typedef struct VirtIOSCSIReq {
/* Used for two-stage request submission and TMFs deferred to BH */
QTAILQ_ENTRY(VirtIOSCSIReq) next;
- /* Used for cancellation of request during TMFs */
+ /* Used for cancellation of request during TMFs. Atomic. */
int remaining;
SCSIRequest *sreq;
@@ -102,13 +103,18 @@ static void virtio_scsi_free_req(VirtIOSCSIReq *req)
g_free(req);
}
-static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
+static void virtio_scsi_complete_req(VirtIOSCSIReq *req, QemuMutex *vq_lock)
{
VirtIOSCSI *s = req->dev;
VirtQueue *vq = req->vq;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
qemu_iovec_from_buf(&req->resp_iov, 0, &req->resp, req->resp_size);
+
+ if (vq_lock) {
+ qemu_mutex_lock(vq_lock);
+ }
+
virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size);
if (s->dataplane_started && !s->dataplane_fenced) {
virtio_notify_irqfd(vdev, vq);
@@ -116,6 +122,10 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
virtio_notify(vdev, vq);
}
+ if (vq_lock) {
+ qemu_mutex_unlock(vq_lock);
+ }
+
if (req->sreq) {
req->sreq->hba_private = NULL;
scsi_req_unref(req->sreq);
@@ -123,34 +133,20 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
virtio_scsi_free_req(req);
}
-static void virtio_scsi_complete_req_bh(void *opaque)
+static void virtio_scsi_bad_req(VirtIOSCSIReq *req, QemuMutex *vq_lock)
{
- VirtIOSCSIReq *req = opaque;
+ virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers");
- virtio_scsi_complete_req(req);
-}
+ if (vq_lock) {
+ qemu_mutex_lock(vq_lock);
+ }
-/*
- * Called from virtio_scsi_do_one_tmf_bh() in main loop thread. The main loop
- * thread cannot touch the virtqueue since that could race with an IOThread.
- */
-static void virtio_scsi_complete_req_from_main_loop(VirtIOSCSIReq *req)
-{
- VirtIOSCSI *s = req->dev;
+ virtqueue_detach_element(req->vq, &req->elem, 0);
- if (!s->ctx || s->ctx == qemu_get_aio_context()) {
- /* No need to schedule a BH when there is no IOThread */
- virtio_scsi_complete_req(req);
- } else {
- /* Run request completion in the IOThread */
- aio_wait_bh_oneshot(s->ctx, virtio_scsi_complete_req_bh, req);
+ if (vq_lock) {
+ qemu_mutex_unlock(vq_lock);
}
-}
-static void virtio_scsi_bad_req(VirtIOSCSIReq *req)
-{
- virtio_error(VIRTIO_DEVICE(req->dev), "wrong size for virtio-scsi headers");
- virtqueue_detach_element(req->vq, &req->elem, 0);
virtio_scsi_free_req(req);
}
@@ -235,12 +231,21 @@ static int virtio_scsi_parse_req(VirtIOSCSIReq *req,
return 0;
}
-static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq)
+static VirtIOSCSIReq *virtio_scsi_pop_req(VirtIOSCSI *s, VirtQueue *vq, QemuMutex *vq_lock)
{
VirtIOSCSICommon *vs = (VirtIOSCSICommon *)s;
VirtIOSCSIReq *req;
+ if (vq_lock) {
+ qemu_mutex_lock(vq_lock);
+ }
+
req = virtqueue_pop(vq, sizeof(VirtIOSCSIReq) + vs->cdb_size);
+
+ if (vq_lock) {
+ qemu_mutex_unlock(vq_lock);
+ }
+
if (!req) {
return NULL;
}
@@ -294,136 +299,157 @@ typedef struct {
VirtIOSCSIReq *tmf_req;
} VirtIOSCSICancelNotifier;
+static void virtio_scsi_tmf_dec_remaining(VirtIOSCSIReq *tmf)
+{
+ if (qatomic_fetch_dec(&tmf->remaining) == 1) {
+ trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(tmf->req.tmf.lun),
+ tmf->req.tmf.tag, tmf->resp.tmf.response);
+
+ virtio_scsi_complete_req(tmf, &tmf->dev->ctrl_lock);
+ }
+}
+
static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
{
VirtIOSCSICancelNotifier *n = container_of(notifier,
VirtIOSCSICancelNotifier,
notifier);
- if (--n->tmf_req->remaining == 0) {
- VirtIOSCSIReq *req = n->tmf_req;
-
- trace_virtio_scsi_tmf_resp(virtio_scsi_get_lun(req->req.tmf.lun),
- req->req.tmf.tag, req->resp.tmf.response);
- virtio_scsi_complete_req(req);
- }
+ virtio_scsi_tmf_dec_remaining(n->tmf_req);
g_free(n);
}
-static inline void virtio_scsi_ctx_check(VirtIOSCSI *s, SCSIDevice *d)
+static void virtio_scsi_tmf_cancel_req(VirtIOSCSIReq *tmf, SCSIRequest *r)
{
- if (s->dataplane_started && d && blk_is_available(d->conf.blk)) {
- assert(blk_get_aio_context(d->conf.blk) == s->ctx);
- }
+ VirtIOSCSICancelNotifier *notifier;
+
+ assert(r->ctx == qemu_get_current_aio_context());
+
+ /* Decremented in virtio_scsi_cancel_notify() */
+ qatomic_inc(&tmf->remaining);
+
+ notifier = g_new(VirtIOSCSICancelNotifier, 1);
+ notifier->notifier.notify = virtio_scsi_cancel_notify;
+ notifier->tmf_req = tmf;
+ scsi_req_cancel_async(r, &notifier->notifier);
}
-static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req)
+/* Execute a TMF on the requests in the current AioContext */
+static void virtio_scsi_do_tmf_aio_context(void *opaque)
{
- VirtIOSCSI *s = req->dev;
- SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun);
- BusChild *kid;
- int target;
+ AioContext *ctx = qemu_get_current_aio_context();
+ VirtIOSCSIReq *tmf = opaque;
+ VirtIOSCSI *s = tmf->dev;
+ SCSIDevice *d = virtio_scsi_device_get(s, tmf->req.tmf.lun);
+ SCSIRequest *r;
+ bool match_tag;
- switch (req->req.tmf.subtype) {
- case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
- if (!d) {
- req->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET;
- goto out;
- }
- if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
- req->resp.tmf.response = VIRTIO_SCSI_S_INCORRECT_LUN;
- goto out;
- }
- qatomic_inc(&s->resetting);
- device_cold_reset(&d->qdev);
- qatomic_dec(&s->resetting);
+ if (!d) {
+ tmf->resp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET;
+ virtio_scsi_tmf_dec_remaining(tmf);
+ return;
+ }
+
+ /*
+ * This function could handle other subtypes that need to be processed in
+ * the request's AioContext in the future, but for now only request
+ * cancelation subtypes are performed here.
+ */
+ switch (tmf->req.tmf.subtype) {
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK:
+ match_tag = true;
break;
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
+ case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
+ match_tag = false;
+ break;
+ default:
+ g_assert_not_reached();
+ }
- case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
- target = req->req.tmf.lun[1];
- qatomic_inc(&s->resetting);
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
+ QTAILQ_FOREACH(r, &d->requests, next) {
+ VirtIOSCSIReq *cmd_req = r->hba_private;
+ assert(cmd_req); /* request has hba_private while enqueued */
- rcu_read_lock();
- QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
- SCSIDevice *d1 = SCSI_DEVICE(kid->child);
- if (d1->channel == 0 && d1->id == target) {
- device_cold_reset(&d1->qdev);
+ if (r->ctx != ctx) {
+ continue;
+ }
+ if (match_tag && cmd_req->req.cmd.tag != tmf->req.tmf.tag) {
+ continue;
}
+ virtio_scsi_tmf_cancel_req(tmf, r);
}
- rcu_read_unlock();
-
- qatomic_dec(&s->resetting);
- break;
-
- default:
- g_assert_not_reached();
}
-out:
- object_unref(OBJECT(d));
- virtio_scsi_complete_req_from_main_loop(req);
+ /* Incremented by virtio_scsi_do_tmf() */
+ virtio_scsi_tmf_dec_remaining(tmf);
+
+ object_unref(d);
}
-/* Some TMFs must be processed from the main loop thread */
-static void virtio_scsi_do_tmf_bh(void *opaque)
+static void dummy_bh(void *opaque)
{
- VirtIOSCSI *s = opaque;
- QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
- VirtIOSCSIReq *req;
- VirtIOSCSIReq *tmp;
+ /* Do nothing */
+}
+/*
+ * Wait for pending virtio_scsi_defer_tmf_to_aio_context() BHs.
+ */
+static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s)
+{
GLOBAL_STATE_CODE();
- WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) {
- QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) {
- QTAILQ_REMOVE(&s->tmf_bh_list, req, next);
- QTAILQ_INSERT_TAIL(&reqs, req, next);
- }
+ assert(!s->dataplane_started);
- qemu_bh_delete(s->tmf_bh);
- s->tmf_bh = NULL;
- }
+ for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
- QTAILQ_FOREACH_SAFE(req, &reqs, next, tmp) {
- QTAILQ_REMOVE(&reqs, req, next);
- virtio_scsi_do_one_tmf_bh(req);
+ /* Our BH only runs after previously scheduled BHs */
+ aio_wait_bh_oneshot(ctx, dummy_bh, NULL);
}
}
-static void virtio_scsi_reset_tmf_bh(VirtIOSCSI *s)
+/*
+ * Run the TMF in a specific AioContext, handling only requests in that
+ * AioContext. This is necessary because requests can run in different
+ * AioContexts and it is only possible to cancel them from the AioContext where
+ * they are running.
+ */
+static void virtio_scsi_defer_tmf_to_aio_context(VirtIOSCSIReq *tmf,
+ AioContext *ctx)
{
- VirtIOSCSIReq *req;
- VirtIOSCSIReq *tmp;
+ /* Decremented in virtio_scsi_do_tmf_aio_context() */
+ qatomic_inc(&tmf->remaining);
- GLOBAL_STATE_CODE();
-
- /* Called after ioeventfd has been stopped, so tmf_bh_lock is not needed */
- if (s->tmf_bh) {
- qemu_bh_delete(s->tmf_bh);
- s->tmf_bh = NULL;
- }
-
- QTAILQ_FOREACH_SAFE(req, &s->tmf_bh_list, next, tmp) {
- QTAILQ_REMOVE(&s->tmf_bh_list, req, next);
-
- /* SAM-6 6.3.2 Hard reset */
- req->resp.tmf.response = VIRTIO_SCSI_S_TARGET_FAILURE;
- virtio_scsi_complete_req(req);
- }
+ /* See virtio_scsi_flush_defer_tmf_to_aio_context() cleanup during reset */
+ aio_bh_schedule_oneshot(ctx, virtio_scsi_do_tmf_aio_context, tmf);
}
-static void virtio_scsi_defer_tmf_to_bh(VirtIOSCSIReq *req)
+/*
+ * Returns the AioContext for a given TMF's tag field or NULL. Note that the
+ * request identified by the tag may have completed by the time you can execute
+ * a BH in the AioContext, so don't assume the request still exists in your BH.
+ */
+static AioContext *find_aio_context_for_tmf_tag(SCSIDevice *d,
+ VirtIOSCSIReq *tmf)
{
- VirtIOSCSI *s = req->dev;
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
+ SCSIRequest *r;
+ SCSIRequest *next;
+
+ QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
+ VirtIOSCSIReq *cmd_req = r->hba_private;
- WITH_QEMU_LOCK_GUARD(&s->tmf_bh_lock) {
- QTAILQ_INSERT_TAIL(&s->tmf_bh_list, req, next);
+ /* hba_private is non-NULL while the request is enqueued */
+ assert(cmd_req);
- if (!s->tmf_bh) {
- s->tmf_bh = qemu_bh_new(virtio_scsi_do_tmf_bh, s);
- qemu_bh_schedule(s->tmf_bh);
+ if (cmd_req->req.cmd.tag == tmf->req.tmf.tag) {
+ return r->ctx;
+ }
}
}
+ return NULL;
}
/* Return 0 if the request is ready to be completed and return to guest;
@@ -433,9 +459,9 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
{
SCSIDevice *d = virtio_scsi_device_get(s, req->req.tmf.lun);
SCSIRequest *r, *next;
+ AioContext *ctx;
int ret = 0;
- virtio_scsi_ctx_check(s, d);
/* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */
req->resp.tmf.response = VIRTIO_SCSI_S_OK;
@@ -450,7 +476,22 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
req->req.tmf.tag, req->req.tmf.subtype);
switch (req->req.tmf.subtype) {
- case VIRTIO_SCSI_T_TMF_ABORT_TASK:
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK: {
+ if (!d) {
+ goto fail;
+ }
+ if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
+ goto incorrect_lun;
+ }
+
+ ctx = find_aio_context_for_tmf_tag(d, req);
+ if (ctx) {
+ virtio_scsi_defer_tmf_to_aio_context(req, ctx);
+ ret = -EINPROGRESS;
+ }
+ break;
+ }
+
case VIRTIO_SCSI_T_TMF_QUERY_TASK:
if (!d) {
goto fail;
@@ -458,44 +499,82 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
goto incorrect_lun;
}
- QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
- VirtIOSCSIReq *cmd_req = r->hba_private;
- if (cmd_req && cmd_req->req.cmd.tag == req->req.tmf.tag) {
- break;
+
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
+ QTAILQ_FOREACH(r, &d->requests, next) {
+ VirtIOSCSIReq *cmd_req = r->hba_private;
+ assert(cmd_req); /* request has hba_private while enqueued */
+
+ if (cmd_req->req.cmd.tag == req->req.tmf.tag) {
+ /*
+ * "If the specified command is present in the task set,
+ * then return a service response set to FUNCTION
+ * SUCCEEDED".
+ */
+ req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
+ }
}
}
- if (r) {
- /*
- * Assert that the request has not been completed yet, we
- * check for it in the loop above.
- */
- assert(r->hba_private);
- if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK) {
- /* "If the specified command is present in the task set, then
- * return a service response set to FUNCTION SUCCEEDED".
- */
- req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
- } else {
- VirtIOSCSICancelNotifier *notifier;
-
- req->remaining = 1;
- notifier = g_new(VirtIOSCSICancelNotifier, 1);
- notifier->tmf_req = req;
- notifier->notifier.notify = virtio_scsi_cancel_notify;
- scsi_req_cancel_async(r, &notifier->notifier);
- ret = -EINPROGRESS;
+ break;
+
+ case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
+ if (!d) {
+ goto fail;
+ }
+ if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
+ goto incorrect_lun;
+ }
+ qatomic_inc(&s->resetting);
+ device_cold_reset(&d->qdev);
+ qatomic_dec(&s->resetting);
+ break;
+
+ case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: {
+ BusChild *kid;
+ int target = req->req.tmf.lun[1];
+ qatomic_inc(&s->resetting);
+
+ rcu_read_lock();
+ QTAILQ_FOREACH_RCU(kid, &s->bus.qbus.children, sibling) {
+ SCSIDevice *d1 = SCSI_DEVICE(kid->child);
+ if (d1->channel == 0 && d1->id == target) {
+ device_cold_reset(&d1->qdev);
}
}
+ rcu_read_unlock();
+
+ qatomic_dec(&s->resetting);
break;
+ }
- case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
- case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET:
- virtio_scsi_defer_tmf_to_bh(req);
+ case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
+ case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: {
+ g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
+
+ if (!d) {
+ goto fail;
+ }
+ if (d->lun != virtio_scsi_get_lun(req->req.tmf.lun)) {
+ goto incorrect_lun;
+ }
+
+ qatomic_inc(&req->remaining);
+
+ for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
+ ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+
+ if (!g_hash_table_add(aio_contexts, ctx)) {
+ continue; /* skip previously added AioContext */
+ }
+
+ virtio_scsi_defer_tmf_to_aio_context(req, ctx);
+ }
+
+ virtio_scsi_tmf_dec_remaining(req);
ret = -EINPROGRESS;
break;
+ }
- case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
- case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET:
case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET:
if (!d) {
goto fail;
@@ -504,34 +583,19 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
goto incorrect_lun;
}
- /* Add 1 to "remaining" until virtio_scsi_do_tmf returns.
- * This way, if the bus starts calling back to the notifiers
- * even before we finish the loop, virtio_scsi_cancel_notify
- * will not complete the TMF too early.
- */
- req->remaining = 1;
- QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
- if (r->hba_private) {
- if (req->req.tmf.subtype == VIRTIO_SCSI_T_TMF_QUERY_TASK_SET) {
- /* "If there is any command present in the task set, then
- * return a service response set to FUNCTION SUCCEEDED".
- */
- req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
- break;
- } else {
- VirtIOSCSICancelNotifier *notifier;
-
- req->remaining++;
- notifier = g_new(VirtIOSCSICancelNotifier, 1);
- notifier->notifier.notify = virtio_scsi_cancel_notify;
- notifier->tmf_req = req;
- scsi_req_cancel_async(r, &notifier->notifier);
- }
+ WITH_QEMU_LOCK_GUARD(&d->requests_lock) {
+ QTAILQ_FOREACH_SAFE(r, &d->requests, next, next) {
+ /* Request has hba_private while enqueued */
+ assert(r->hba_private);
+
+ /*
+ * "If there is any command present in the task set, then
+ * return a service response set to FUNCTION SUCCEEDED".
+ */
+ req->resp.tmf.response = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
+ break;
}
}
- if (--req->remaining > 0) {
- ret = -EINPROGRESS;
- }
break;
case VIRTIO_SCSI_T_TMF_CLEAR_ACA:
@@ -562,7 +626,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
if (iov_to_buf(req->elem.out_sg, req->elem.out_num, 0,
&type, sizeof(type)) < sizeof(type)) {
- virtio_scsi_bad_req(req);
+ virtio_scsi_bad_req(req, &s->ctrl_lock);
return;
}
@@ -570,7 +634,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
if (type == VIRTIO_SCSI_T_TMF) {
if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlTMFReq),
sizeof(VirtIOSCSICtrlTMFResp)) < 0) {
- virtio_scsi_bad_req(req);
+ virtio_scsi_bad_req(req, &s->ctrl_lock);
return;
} else {
r = virtio_scsi_do_tmf(s, req);
@@ -580,7 +644,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
type == VIRTIO_SCSI_T_AN_SUBSCRIBE) {
if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICtrlANReq),
sizeof(VirtIOSCSICtrlANResp)) < 0) {
- virtio_scsi_bad_req(req);
+ virtio_scsi_bad_req(req, &s->ctrl_lock);
return;
} else {
req->req.an.event_requested =
@@ -600,7 +664,7 @@ static void virtio_scsi_handle_ctrl_req(VirtIOSCSI *s, VirtIOSCSIReq *req)
type == VIRTIO_SCSI_T_AN_SUBSCRIBE)
trace_virtio_scsi_an_resp(virtio_scsi_get_lun(req->req.an.lun),
req->resp.an.response);
- virtio_scsi_complete_req(req);
+ virtio_scsi_complete_req(req, &s->ctrl_lock);
} else {
assert(r == -EINPROGRESS);
}
@@ -610,7 +674,7 @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
{
VirtIOSCSIReq *req;
- while ((req = virtio_scsi_pop_req(s, vq))) {
+ while ((req = virtio_scsi_pop_req(s, vq, &s->ctrl_lock))) {
virtio_scsi_handle_ctrl_req(s, req);
}
}
@@ -625,9 +689,12 @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
*/
static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s)
{
- if (!s->ctx || s->dataplane_started) {
+ if (s->dataplane_started) {
return false;
}
+ if (s->vq_aio_context[0] == qemu_get_aio_context()) {
+ return false; /* not using IOThreads */
+ }
virtio_device_start_ioeventfd(&s->parent_obj.parent_obj);
return !s->dataplane_fenced;
@@ -654,7 +721,7 @@ static void virtio_scsi_complete_cmd_req(VirtIOSCSIReq *req)
* in virtio_scsi_command_complete.
*/
req->resp_size = sizeof(VirtIOSCSICmdResp);
- virtio_scsi_complete_req(req);
+ virtio_scsi_complete_req(req, NULL);
}
static void virtio_scsi_command_failed(SCSIRequest *r)
@@ -788,7 +855,7 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
virtio_scsi_fail_cmd_req(req);
return -ENOTSUP;
} else {
- virtio_scsi_bad_req(req);
+ virtio_scsi_bad_req(req, NULL);
return -EINVAL;
}
}
@@ -801,7 +868,6 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
virtio_scsi_complete_cmd_req(req);
return -ENOENT;
}
- virtio_scsi_ctx_check(s, d);
req->sreq = scsi_req_new(d, req->req.cmd.tag,
virtio_scsi_get_lun(req->req.cmd.lun),
req->req.cmd.cdb, vs->cdb_size, req);
@@ -843,7 +909,7 @@ static void virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
virtio_queue_set_notification(vq, 0);
}
- while ((req = virtio_scsi_pop_req(s, vq))) {
+ while ((req = virtio_scsi_pop_req(s, vq, NULL))) {
ret = virtio_scsi_handle_cmd_req_prepare(s, req);
if (!ret) {
QTAILQ_INSERT_TAIL(&reqs, req, next);
@@ -936,7 +1002,7 @@ static void virtio_scsi_reset(VirtIODevice *vdev)
assert(!s->dataplane_started);
- virtio_scsi_reset_tmf_bh(s);
+ virtio_scsi_flush_defer_tmf_to_aio_context(s);
qatomic_inc(&s->resetting);
bus_cold_reset(BUS(&s->bus));
@@ -944,7 +1010,10 @@ static void virtio_scsi_reset(VirtIODevice *vdev)
vs->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE;
vs->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE;
- s->events_dropped = false;
+
+ WITH_QEMU_LOCK_GUARD(&s->event_lock) {
+ s->events_dropped = false;
+ }
}
typedef struct {
@@ -973,19 +1042,21 @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
return;
}
- req = virtio_scsi_pop_req(s, vs->event_vq);
- if (!req) {
- s->events_dropped = true;
- return;
- }
+ req = virtio_scsi_pop_req(s, vs->event_vq, &s->event_lock);
+ WITH_QEMU_LOCK_GUARD(&s->event_lock) {
+ if (!req) {
+ s->events_dropped = true;
+ return;
+ }
- if (s->events_dropped) {
- event |= VIRTIO_SCSI_T_EVENTS_MISSED;
- s->events_dropped = false;
+ if (s->events_dropped) {
+ event |= VIRTIO_SCSI_T_EVENTS_MISSED;
+ s->events_dropped = false;
+ }
}
if (virtio_scsi_parse_req(req, 0, sizeof(VirtIOSCSIEvent))) {
- virtio_scsi_bad_req(req);
+ virtio_scsi_bad_req(req, &s->event_lock);
return;
}
@@ -1005,12 +1076,18 @@ static void virtio_scsi_push_event(VirtIOSCSI *s,
}
trace_virtio_scsi_event(virtio_scsi_get_lun(evt->lun), event, reason);
- virtio_scsi_complete_req(req);
+ virtio_scsi_complete_req(req, &s->event_lock);
}
static void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
{
- if (s->events_dropped) {
+ bool events_dropped;
+
+ WITH_QEMU_LOCK_GUARD(&s->event_lock) {
+ events_dropped = s->events_dropped;
+ }
+
+ if (events_dropped) {
VirtIOSCSIEventInfo info = {
.event = VIRTIO_SCSI_T_NO_EVENT,
};
@@ -1061,14 +1138,16 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
{
VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
VirtIOSCSI *s = VIRTIO_SCSI(vdev);
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED];
SCSIDevice *sd = SCSI_DEVICE(dev);
- int ret;
- if (s->ctx && !s->dataplane_fenced) {
- ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
- if (ret < 0) {
- return;
- }
+ if (ctx != qemu_get_aio_context() && !s->dataplane_fenced) {
+ /*
+ * Try to make the BlockBackend's AioContext match ours. Ignore failure
+ * because I/O will still work although block jobs and other users
+ * might be slower when multiple AioContexts use a BlockBackend.
+ */
+ blk_set_aio_context(sd->conf.blk, ctx, NULL);
}
if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
@@ -1103,7 +1182,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
- if (s->ctx) {
+ if (s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED] != qemu_get_aio_context()) {
/* If other users keep the BlockBackend in the iothread, that's ok */
blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
}
@@ -1137,7 +1216,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus)
for (uint32_t i = 0; i < total_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
- virtio_queue_aio_detach_host_notifier(vq, s->ctx);
+ virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]);
}
}
@@ -1163,10 +1242,12 @@ static void virtio_scsi_drained_end(SCSIBus *bus)
for (uint32_t i = 0; i < total_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
+ AioContext *ctx = s->vq_aio_context[i];
+
if (vq == vs->event_vq) {
- virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx);
+ virtio_queue_aio_attach_host_notifier_no_poll(vq, ctx);
} else {
- virtio_queue_aio_attach_host_notifier(vq, s->ctx);
+ virtio_queue_aio_attach_host_notifier(vq, ctx);
}
}
}
@@ -1235,8 +1316,8 @@ static void virtio_scsi_device_realize(DeviceState *dev, Error **errp)
VirtIOSCSI *s = VIRTIO_SCSI(dev);
Error *err = NULL;
- QTAILQ_INIT(&s->tmf_bh_list);
- qemu_mutex_init(&s->tmf_bh_lock);
+ qemu_mutex_init(&s->ctrl_lock);
+ qemu_mutex_init(&s->event_lock);
virtio_scsi_common_realize(dev,
virtio_scsi_handle_ctrl,
@@ -1271,15 +1352,16 @@ void virtio_scsi_common_unrealize(DeviceState *dev)
virtio_cleanup(vdev);
}
+/* main loop */
static void virtio_scsi_device_unrealize(DeviceState *dev)
{
VirtIOSCSI *s = VIRTIO_SCSI(dev);
- virtio_scsi_reset_tmf_bh(s);
-
+ virtio_scsi_dataplane_cleanup(s);
qbus_set_hotplug_handler(BUS(&s->bus), NULL);
virtio_scsi_common_unrealize(dev);
- qemu_mutex_destroy(&s->tmf_bh_lock);
+ qemu_mutex_destroy(&s->event_lock);
+ qemu_mutex_destroy(&s->ctrl_lock);
}
static const Property virtio_scsi_properties[] = {
@@ -1299,6 +1381,8 @@ static const Property virtio_scsi_properties[] = {
VIRTIO_SCSI_F_CHANGE, true),
DEFINE_PROP_LINK("iothread", VirtIOSCSI, parent_obj.conf.iothread,
TYPE_IOTHREAD, IOThread *),
+ DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOSCSI,
+ parent_obj.conf.iothread_vq_mapping_list),
};
static const VMStateDescription vmstate_virtio_scsi = {
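
The TMF handling above relies on an atomic "remaining" reference count: the submitter holds one reference, each outstanding cancellation holds another, and whichever thread drops the count to zero completes the request. A minimal standalone sketch of this fan-out/fan-in pattern using C11 atomics in place of QEMU's qatomic helpers (tmf_dec_remaining() mirrors virtio_scsi_tmf_dec_remaining()):

    #include <stdatomic.h>
    #include <stdio.h>

    struct tmf {
        atomic_int remaining;
    };

    static void tmf_complete(struct tmf *t)
    {
        printf("complete TMF\n");       /* virtio_scsi_complete_req() in QEMU */
    }

    static void tmf_dec_remaining(struct tmf *t)
    {
        /* fetch_sub returns the old value; the caller that saw 1 completes */
        if (atomic_fetch_sub(&t->remaining, 1) == 1) {
            tmf_complete(t);
        }
    }

    int main(void)
    {
        struct tmf t = { .remaining = 1 };  /* submitter's own reference */

        for (int i = 0; i < 3; i++) {       /* one per outstanding cancellation */
            atomic_fetch_add(&t.remaining, 1);
        }
        for (int i = 0; i < 3; i++) {       /* cancellations finish (any thread) */
            tmf_dec_remaining(&t);
        }
        tmf_dec_remaining(&t);              /* submitter drops its reference last */
        return 0;
    }
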
diff --git a/hw/virtio/iothread-vq-mapping.c b/hw/virtio/iothread-vq-mapping.c
new file mode 100644
index 0000000..15909eb
--- /dev/null
+++ b/hw/virtio/iothread-vq-mapping.c
@@ -0,0 +1,131 @@
+/*
+ * IOThread Virtqueue Mapping
+ *
+ * Copyright Red Hat, Inc
+ *
+ * SPDX-License-Identifier: GPL-2.0-only
+ */
+
+#include "qemu/osdep.h"
+#include "system/iothread.h"
+#include "hw/virtio/iothread-vq-mapping.h"
+
+static bool
+iothread_vq_mapping_validate(IOThreadVirtQueueMappingList *list, uint16_t
+ num_queues, Error **errp)
+{
+ g_autofree unsigned long *vqs = bitmap_new(num_queues);
+ g_autoptr(GHashTable) iothreads =
+ g_hash_table_new(g_str_hash, g_str_equal);
+
+ for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) {
+ const char *name = node->value->iothread;
+ uint16List *vq;
+
+ if (!iothread_by_id(name)) {
+ error_setg(errp, "IOThread \"%s\" object does not exist", name);
+ return false;
+ }
+
+ if (!g_hash_table_add(iothreads, (gpointer)name)) {
+ error_setg(errp,
+ "duplicate IOThread name \"%s\" in iothread-vq-mapping",
+ name);
+ return false;
+ }
+
+ if (node != list) {
+ if (!!node->value->vqs != !!list->value->vqs) {
+ error_setg(errp, "either all items in iothread-vq-mapping "
+ "must have vqs or none of them must have it");
+ return false;
+ }
+ }
+
+ for (vq = node->value->vqs; vq; vq = vq->next) {
+ if (vq->value >= num_queues) {
+ error_setg(errp, "vq index %u for IOThread \"%s\" must be "
+ "less than num_queues %u in iothread-vq-mapping",
+ vq->value, name, num_queues);
+ return false;
+ }
+
+ if (test_and_set_bit(vq->value, vqs)) {
+ error_setg(errp, "cannot assign vq %u to IOThread \"%s\" "
+ "because it is already assigned", vq->value, name);
+ return false;
+ }
+ }
+ }
+
+ if (list->value->vqs) {
+ for (uint16_t i = 0; i < num_queues; i++) {
+ if (!test_bit(i, vqs)) {
+ error_setg(errp,
+ "missing vq %u IOThread assignment in iothread-vq-mapping",
+ i);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool iothread_vq_mapping_apply(
+ IOThreadVirtQueueMappingList *list,
+ AioContext **vq_aio_context,
+ uint16_t num_queues,
+ Error **errp)
+{
+ IOThreadVirtQueueMappingList *node;
+ size_t num_iothreads = 0;
+ size_t cur_iothread = 0;
+
+ if (!iothread_vq_mapping_validate(list, num_queues, errp)) {
+ return false;
+ }
+
+ for (node = list; node; node = node->next) {
+ num_iothreads++;
+ }
+
+ for (node = list; node; node = node->next) {
+ IOThread *iothread = iothread_by_id(node->value->iothread);
+ AioContext *ctx = iothread_get_aio_context(iothread);
+
+ /* Released in virtio_blk_vq_aio_context_cleanup() */
+ object_ref(OBJECT(iothread));
+
+ if (node->value->vqs) {
+ uint16List *vq;
+
+ /* Explicit vq:IOThread assignment */
+ for (vq = node->value->vqs; vq; vq = vq->next) {
+ assert(vq->value < num_queues);
+ vq_aio_context[vq->value] = ctx;
+ }
+ } else {
+ /* Round-robin vq:IOThread assignment */
+ for (unsigned i = cur_iothread; i < num_queues;
+ i += num_iothreads) {
+ vq_aio_context[i] = ctx;
+ }
+ }
+
+ cur_iothread++;
+ }
+
+ return true;
+}
+
+void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list)
+{
+ IOThreadVirtQueueMappingList *node;
+
+ for (node = list; node; node = node->next) {
+ IOThread *iothread = iothread_by_id(node->value->iothread);
+ object_unref(OBJECT(iothread));
+ }
+}
+
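
For context, the two callers in this series use the helper roughly as follows (a condensed, non-runnable fragment based on the virtio-blk and virtio-scsi hunks above, not a complete device): iothread_vq_mapping_apply() fills one AioContext pointer per virtqueue and takes an IOThread reference that iothread_vq_mapping_cleanup() drops at unrealize time. On the command line the property is normally supplied through the JSON form of -device as a list of {"iothread": ..., "vqs": [...]} entries.

    /* Realize: */
    AioContext **vq_aio_context = g_new(AioContext *, num_queues);

    if (conf->iothread_vq_mapping_list) {
        if (!iothread_vq_mapping_apply(conf->iothread_vq_mapping_list,
                                       vq_aio_context, num_queues, errp)) {
            g_free(vq_aio_context);
            return false;
        }
    } else {
        for (uint16_t i = 0; i < num_queues; i++) {
            vq_aio_context[i] = qemu_get_aio_context();  /* main loop default */
        }
    }

    /* Unrealize: release the IOThread references taken by _apply() */
    if (conf->iothread_vq_mapping_list) {
        iothread_vq_mapping_cleanup(conf->iothread_vq_mapping_list);
    }
    g_free(vq_aio_context);
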
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index a5f9f79..19b04c4 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -1,5 +1,6 @@
system_virtio_ss = ss.source_set()
system_virtio_ss.add(files('virtio-bus.c'))
+system_virtio_ss.add(files('iothread-vq-mapping.c'))
system_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('virtio-pci.c'))
system_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c'))
system_virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c'))
diff --git a/include/block/aio.h b/include/block/aio.h
index b2ab351..99ff484 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -123,6 +123,10 @@ struct BHListSlice {
typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+typedef struct AioPolledEvent {
+ int64_t ns; /* current polling time in nanoseconds */
+} AioPolledEvent;
+
struct AioContext {
GSource source;
@@ -229,7 +233,6 @@ struct AioContext {
int poll_disable_cnt;
/* Polling mode parameters */
- int64_t poll_ns; /* current polling time in nanoseconds */
int64_t poll_max_ns; /* maximum polling time in nanoseconds */
int64_t poll_grow; /* polling time growth factor */
int64_t poll_shrink; /* polling time shrink factor */
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 6267068..6570244 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -17,6 +17,7 @@
#define QEMU_RAW_AIO_H
#include "block/aio.h"
+#include "block/block-common.h"
#include "qemu/iov.h"
/* AIO request types */
@@ -58,11 +59,18 @@ void laio_cleanup(LinuxAioState *s);
/* laio_co_submit: submit I/O requests in the thread's current AioContext. */
int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
- int type, uint64_t dev_max_batch);
+ int type, BdrvRequestFlags flags,
+ uint64_t dev_max_batch);
bool laio_has_fdsync(int);
+bool laio_has_fua(void);
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
+#else
+static inline bool laio_has_fua(void)
+{
+ return false;
+}
#endif
/* io_uring.c - Linux io_uring implementation */
#ifdef CONFIG_LINUX_IO_URING
@@ -71,9 +79,16 @@ void luring_cleanup(LuringState *s);
/* luring_co_submit: submit I/O requests in the thread's current AioContext. */
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
- QEMUIOVector *qiov, int type);
+ QEMUIOVector *qiov, int type,
+ BdrvRequestFlags flags);
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
+bool luring_has_fua(void);
+#else
+static inline bool luring_has_fua(void)
+{
+ return false;
+}
#endif
#ifdef _WIN32
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index c3d5e17..90ee192 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -24,6 +24,7 @@ struct SCSIRequest {
SCSIBus *bus;
SCSIDevice *dev;
const SCSIReqOps *ops;
+ AioContext *ctx;
uint32_t refcount;
uint32_t tag;
uint32_t lun;
@@ -48,6 +49,8 @@ struct SCSIRequest {
bool dma_started;
BlockAIOCB *aiocb;
QEMUSGList *sg;
+
+ /* Protected by SCSIDevice->requests_lock */
QTAILQ_ENTRY(SCSIRequest) next;
};
@@ -76,10 +79,7 @@ struct SCSIDevice
uint8_t sense[SCSI_SENSE_BUF_SIZE];
uint32_t sense_len;
- /*
- * The requests list is only accessed from the AioContext that executes
- * requests or from the main loop when IOThread processing is stopped.
- */
+ QemuMutex requests_lock; /* protects the requests list */
QTAILQ_HEAD(, SCSIRequest) requests;
uint32_t channel;
diff --git a/include/hw/virtio/iothread-vq-mapping.h b/include/hw/virtio/iothread-vq-mapping.h
new file mode 100644
index 0000000..57335c3
--- /dev/null
+++ b/include/hw/virtio/iothread-vq-mapping.h
@@ -0,0 +1,45 @@
+/*
+ * IOThread Virtqueue Mapping
+ *
+ * Copyright Red Hat, Inc
+ *
+ * SPDX-License-Identifier: GPL-2.0-only
+ */
+
+#ifndef HW_VIRTIO_IOTHREAD_VQ_MAPPING_H
+#define HW_VIRTIO_IOTHREAD_VQ_MAPPING_H
+
+#include "qapi/error.h"
+#include "qapi/qapi-types-virtio.h"
+
+/**
+ * iothread_vq_mapping_apply:
+ * @list: The mapping of virtqueues to IOThreads.
+ * @vq_aio_context: The array of AioContext pointers to fill in.
+ * @num_queues: The length of @vq_aio_context.
+ * @errp: If an error occurs, a pointer to the area to store the error.
+ *
+ * Fill in the AioContext for each virtqueue in the @vq_aio_context array given
+ * the iothread-vq-mapping parameter in @list.
+ *
+ * iothread_vq_mapping_cleanup() must be called to free IOThread object
+ * references after this function returns success.
+ *
+ * Returns: %true on success, %false on failure.
+ **/
+bool iothread_vq_mapping_apply(
+ IOThreadVirtQueueMappingList *list,
+ AioContext **vq_aio_context,
+ uint16_t num_queues,
+ Error **errp);
+
+/**
+ * iothread_vq_mapping_cleanup:
+ * @list: The mapping of virtqueues to IOThreads.
+ *
+ * Release IOThread object references that were acquired by
+ * iothread_vq_mapping_apply().
+ */
+void iothread_vq_mapping_cleanup(IOThreadVirtQueueMappingList *list);
+
+#endif /* HW_VIRTIO_IOTHREAD_VQ_MAPPING_H */
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index be230cd..31e852e 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -22,6 +22,7 @@
#include "hw/virtio/virtio.h"
#include "hw/scsi/scsi.h"
#include "chardev/char-fe.h"
+#include "qapi/qapi-types-virtio.h"
#include "system/iothread.h"
#define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common"
@@ -60,6 +61,7 @@ struct VirtIOSCSIConf {
CharBackend chardev;
uint32_t boot_tpgt;
IOThread *iothread;
+ IOThreadVirtQueueMappingList *iothread_vq_mapping_list;
};
struct VirtIOSCSI;
@@ -82,18 +84,14 @@ struct VirtIOSCSI {
SCSIBus bus;
int resetting; /* written from main loop thread, read from any thread */
+
+ QemuMutex event_lock; /* protects event_vq and events_dropped */
bool events_dropped;
- /*
- * TMFs deferred to main loop BH. These fields are protected by
- * tmf_bh_lock.
- */
- QemuMutex tmf_bh_lock;
- QEMUBH *tmf_bh;
- QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list;
+ QemuMutex ctrl_lock; /* protects ctrl_vq */
/* Fields for dataplane below */
- AioContext *ctx; /* one iothread per virtio-scsi-pci for now */
+ AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */
bool dataplane_started;
bool dataplane_starting;
@@ -111,6 +109,7 @@ void virtio_scsi_common_realize(DeviceState *dev,
void virtio_scsi_common_unrealize(DeviceState *dev);
void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp);
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s);
int virtio_scsi_dataplane_start(VirtIODevice *s);
void virtio_scsi_dataplane_stop(VirtIODevice *s);
diff --git a/include/system/block-backend-global-state.h b/include/system/block-backend-global-state.h
index 9cc9b00..35b5e83 100644
--- a/include/system/block-backend-global-state.h
+++ b/include/system/block-backend-global-state.h
@@ -86,7 +86,6 @@ bool blk_supports_write_perm(BlockBackend *blk);
bool blk_is_sg(BlockBackend *blk);
void blk_set_enable_write_cache(BlockBackend *blk, bool wce);
int blk_get_flags(BlockBackend *blk);
-bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp);
int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
Error **errp);
void blk_add_aio_context_notifier(BlockBackend *blk,
diff --git a/include/system/dma.h b/include/system/dma.h
index 5a49a30..e142f7e 100644
--- a/include/system/dma.h
+++ b/include/system/dma.h
@@ -290,8 +290,7 @@ typedef BlockAIOCB *DMAIOFunc(int64_t offset, QEMUIOVector *iov,
BlockCompletionFunc *cb, void *cb_opaque,
void *opaque);
-BlockAIOCB *dma_blk_io(AioContext *ctx,
- QEMUSGList *sg, uint64_t offset, uint32_t align,
+BlockAIOCB *dma_blk_io(QEMUSGList *sg, uint64_t offset, uint32_t align,
DMAIOFunc *io_func, void *io_func_opaque,
BlockCompletionFunc *cb, void *opaque, DMADirection dir);
BlockAIOCB *dma_blk_read(BlockBackend *blk,
diff --git a/meson.build b/meson.build
index 9d9c117..2f43fd8 100644
--- a/meson.build
+++ b/meson.build
@@ -2727,6 +2727,14 @@ config_host_data.set('HAVE_OPTRESET',
cc.has_header_symbol('getopt.h', 'optreset'))
config_host_data.set('HAVE_IPPROTO_MPTCP',
cc.has_header_symbol('netinet/in.h', 'IPPROTO_MPTCP'))
+if libaio.found()
+ config_host_data.set('HAVE_IO_PREP_PWRITEV2',
+ cc.has_header_symbol('libaio.h', 'io_prep_pwritev2'))
+endif
+if linux_io_uring.found()
+ config_host_data.set('HAVE_IO_URING_PREP_WRITEV2',
+ cc.has_header_symbol('liburing.h', 'io_uring_prep_writev2'))
+endif
# has_member
config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
diff --git a/scripts/qcow2-to-stdout.py b/scripts/qcow2-to-stdout.py
new file mode 100755
index 0000000..06b7c13
--- /dev/null
+++ b/scripts/qcow2-to-stdout.py
@@ -0,0 +1,449 @@
+#!/usr/bin/env python3
+
+# This tool reads a disk image in any format and converts it to qcow2,
+# writing the result directly to stdout.
+#
+# Copyright (C) 2024 Igalia, S.L.
+#
+# Authors: Alberto Garcia <berto@igalia.com>
+# Madeeha Javed <javed@igalia.com>
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# qcow2 files produced by this script are always arranged like this:
+#
+# - qcow2 header
+# - refcount table
+# - refcount blocks
+# - L1 table
+# - L2 tables
+# - Data clusters
+#
+# A note about variable names: in qcow2 there is one refcount table
+# and one (active) L1 table, although each can occupy several
+# clusters. For the sake of simplicity the code sometimes talks about
+# refcount tables and L1 tables when referring to those clusters.
+
+import argparse
+import errno
+import math
+import os
+import signal
+import struct
+import subprocess
+import sys
+import tempfile
+import time
+from contextlib import contextmanager
+
+QCOW2_DEFAULT_CLUSTER_SIZE = 65536
+QCOW2_DEFAULT_REFCOUNT_BITS = 16
+QCOW2_FEATURE_NAME_TABLE = 0x6803F857
+QCOW2_DATA_FILE_NAME_STRING = 0x44415441
+QCOW2_V3_HEADER_LENGTH = 112 # Header length in QEMU 9.0. Must be a multiple of 8
+QCOW2_INCOMPAT_DATA_FILE_BIT = 2
+QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT = 1
+QCOW_OFLAG_COPIED = 1 << 63
+QEMU_STORAGE_DAEMON = "qemu-storage-daemon"
+
+
+def bitmap_set(bitmap, idx):
+ bitmap[idx // 8] |= 1 << (idx % 8)
+
+
+def bitmap_is_set(bitmap, idx):
+ return (bitmap[idx // 8] & (1 << (idx % 8))) != 0
+
+
+def bitmap_iterator(bitmap, length):
+ for idx in range(length):
+ if bitmap_is_set(bitmap, idx):
+ yield idx
+
+
+def align_up(num, d):
+ return d * math.ceil(num / d)
+
+
+# Holes in the input file contain only zeroes so we can skip them and
+# save time. This function returns the indexes of the clusters that
+# are known to contain data. Those are the ones that we need to read.
+def clusters_with_data(fd, cluster_size):
+ data_to = 0
+ while True:
+ try:
+ data_from = os.lseek(fd, data_to, os.SEEK_DATA)
+ data_to = align_up(os.lseek(fd, data_from, os.SEEK_HOLE), cluster_size)
+ for idx in range(data_from // cluster_size, data_to // cluster_size):
+ yield idx
+ except OSError as err:
+ if err.errno == errno.ENXIO: # End of file reached
+ break
+ raise err
+
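+# Example (illustrative, not exercised by the tool): with cluster_size = 65536
+# and a sparse input whose only data extent covers bytes 131072..196607,
+# SEEK_DATA from offset 0 returns 131072, SEEK_HOLE from there returns 196608,
+# and the generator yields cluster index 2 before the next SEEK_DATA raises
+# ENXIO and ends the loop. Filesystems may report holes conservatively, so a
+# yielded cluster can still be all zeroes; such clusters are filtered out later
+# by the zero_cluster comparison in write_qcow2_content().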
+
+# write_qcow2_content() expects a raw input file. If we have a different
+# format we can use qemu-storage-daemon to make it appear as raw.
+@contextmanager
+def get_input_as_raw_file(input_file, input_format):
+ if input_format == "raw":
+ yield input_file
+ return
+ try:
+ temp_dir = tempfile.mkdtemp()
+ pid_file = os.path.join(temp_dir, "pid")
+ raw_file = os.path.join(temp_dir, "raw")
+ open(raw_file, "wb").close()
+ ret = subprocess.run(
+ [
+ QEMU_STORAGE_DAEMON,
+ "--daemonize",
+ "--pidfile", pid_file,
+ "--blockdev", f"driver=file,node-name=file0,driver=file,filename={input_file},read-only=on",
+ "--blockdev", f"driver={input_format},node-name=disk0,file=file0,read-only=on",
+ "--export", f"type=fuse,id=export0,node-name=disk0,mountpoint={raw_file},writable=off",
+ ],
+ capture_output=True,
+ )
+ if ret.returncode != 0:
+ sys.exit("[Error] Could not start the qemu-storage-daemon:\n" +
+ ret.stderr.decode().rstrip('\n'))
+ yield raw_file
+ finally:
+ # Kill the storage daemon on exit
+ # and remove all temporary files
+ if os.path.exists(pid_file):
+ with open(pid_file, "r") as f:
+ pid = int(f.readline())
+ os.kill(pid, signal.SIGTERM)
+ while os.path.exists(pid_file):
+ time.sleep(0.1)
+ os.unlink(raw_file)
+ os.rmdir(temp_dir)
+
+
+def write_features(cluster, offset, data_file_name):
+ if data_file_name is not None:
+ encoded_name = data_file_name.encode("utf-8")
+ padded_name_len = align_up(len(encoded_name), 8)
+ struct.pack_into(f">II{padded_name_len}s", cluster, offset,
+ QCOW2_DATA_FILE_NAME_STRING,
+ len(encoded_name),
+ encoded_name)
+ offset += 8 + padded_name_len
+
+ qcow2_features = [
+ # Incompatible
+ (0, 0, "dirty bit"),
+ (0, 1, "corrupt bit"),
+ (0, 2, "external data file"),
+ (0, 3, "compression type"),
+ (0, 4, "extended L2 entries"),
+ # Compatible
+ (1, 0, "lazy refcounts"),
+ # Autoclear
+ (2, 0, "bitmaps"),
+ (2, 1, "raw external data"),
+ ]
+ struct.pack_into(">I", cluster, offset, QCOW2_FEATURE_NAME_TABLE)
+ struct.pack_into(">I", cluster, offset + 4, len(qcow2_features) * 48)
+ offset += 8
+ for feature_type, feature_bit, feature_name in qcow2_features:
+ struct.pack_into(">BB46s", cluster, offset,
+ feature_type, feature_bit, feature_name.encode("ascii"))
+ offset += 48
+
+
+def write_qcow2_content(input_file, cluster_size, refcount_bits, data_file_name, data_file_raw):
+ # Some basic values
+ l1_entries_per_table = cluster_size // 8
+ l2_entries_per_table = cluster_size // 8
+ refcounts_per_table = cluster_size // 8
+ refcounts_per_block = cluster_size * 8 // refcount_bits
+
+ # Virtual disk size, number of data clusters and L1 entries
+ disk_size = align_up(os.path.getsize(input_file), 512)
+ total_data_clusters = math.ceil(disk_size / cluster_size)
+ l1_entries = math.ceil(total_data_clusters / l2_entries_per_table)
+ allocated_l1_tables = math.ceil(l1_entries / l1_entries_per_table)
+
+ # Max L1 table size is 32 MB (QCOW_MAX_L1_SIZE in block/qcow2.h)
+ if (l1_entries * 8) > (32 * 1024 * 1024):
+ sys.exit("[Error] The image size is too large. Try using a larger cluster size.")
+
+ # Two bitmaps indicating which L1 and L2 entries are set
+ l1_bitmap = bytearray(allocated_l1_tables * l1_entries_per_table // 8)
+ l2_bitmap = bytearray(l1_entries * l2_entries_per_table // 8)
+ allocated_l2_tables = 0
+ allocated_data_clusters = 0
+
+ if data_file_raw:
+ # If data_file_raw is set then all clusters are allocated and
+ # we don't need to read the input file at all.
+ allocated_l2_tables = l1_entries
+ for idx in range(l1_entries):
+ bitmap_set(l1_bitmap, idx)
+ for idx in range(total_data_clusters):
+ bitmap_set(l2_bitmap, idx)
+ else:
+ # Open the input file for reading
+ fd = os.open(input_file, os.O_RDONLY)
+ zero_cluster = bytes(cluster_size)
+ # Read all the clusters that contain data
+ for idx in clusters_with_data(fd, cluster_size):
+ cluster = os.pread(fd, cluster_size, cluster_size * idx)
+ # If the last cluster is smaller than cluster_size pad it with zeroes
+ if len(cluster) < cluster_size:
+ cluster += bytes(cluster_size - len(cluster))
+ # If a cluster has non-zero data then it must be allocated
+ # in the output file and its L2 entry must be set
+ if cluster != zero_cluster:
+ bitmap_set(l2_bitmap, idx)
+ allocated_data_clusters += 1
+ # Allocated data clusters also need their corresponding L1 entry and L2 table
+ l1_idx = math.floor(idx / l2_entries_per_table)
+ if not bitmap_is_set(l1_bitmap, l1_idx):
+ bitmap_set(l1_bitmap, l1_idx)
+ allocated_l2_tables += 1
+
+ # Total amount of allocated clusters excluding the refcount blocks and table
+ total_allocated_clusters = 1 + allocated_l1_tables + allocated_l2_tables
+ if data_file_name is None:
+ total_allocated_clusters += allocated_data_clusters
+
+ # Clusters allocated for the refcount blocks and table
+ allocated_refcount_blocks = math.ceil(total_allocated_clusters / refcounts_per_block)
+ allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table)
+
+ # Now we have a problem because allocated_refcount_blocks and allocated_refcount_tables...
+ # (a) increase total_allocated_clusters, and
+ # (b) need to be recalculated when total_allocated_clusters is increased
+ # So we need to repeat the calculation as long as the numbers change
+ while True:
+ new_total_allocated_clusters = total_allocated_clusters + allocated_refcount_tables + allocated_refcount_blocks
+ new_allocated_refcount_blocks = math.ceil(new_total_allocated_clusters / refcounts_per_block)
+ if new_allocated_refcount_blocks > allocated_refcount_blocks:
+ allocated_refcount_blocks = new_allocated_refcount_blocks
+ allocated_refcount_tables = math.ceil(allocated_refcount_blocks / refcounts_per_table)
+ else:
+ break
+
+ # Now that we have the final numbers we can update total_allocated_clusters
+ total_allocated_clusters += allocated_refcount_tables + allocated_refcount_blocks
+
+ # At this point we have the exact number of clusters that the output
+ # image is going to use so we can calculate all the offsets.
+ current_cluster_idx = 1
+
+ refcount_table_offset = current_cluster_idx * cluster_size
+ current_cluster_idx += allocated_refcount_tables
+
+ refcount_block_offset = current_cluster_idx * cluster_size
+ current_cluster_idx += allocated_refcount_blocks
+
+ l1_table_offset = current_cluster_idx * cluster_size
+ current_cluster_idx += allocated_l1_tables
+
+ l2_table_offset = current_cluster_idx * cluster_size
+ current_cluster_idx += allocated_l2_tables
+
+ data_clusters_offset = current_cluster_idx * cluster_size
+
+ # Calculate some values used in the qcow2 header
+ if allocated_l1_tables == 0:
+ l1_table_offset = 0
+
+ hdr_cluster_bits = int(math.log2(cluster_size))
+ hdr_refcount_bits = int(math.log2(refcount_bits))
+ hdr_length = QCOW2_V3_HEADER_LENGTH
+ hdr_incompat_features = 0
+ if data_file_name is not None:
+ hdr_incompat_features |= 1 << QCOW2_INCOMPAT_DATA_FILE_BIT
+ hdr_autoclear_features = 0
+ if data_file_raw:
+ hdr_autoclear_features |= 1 << QCOW2_AUTOCLEAR_DATA_FILE_RAW_BIT
+
+ ### Write qcow2 header
+ cluster = bytearray(cluster_size)
+ struct.pack_into(">4sIQIIQIIQQIIQQQQII", cluster, 0,
+ b"QFI\xfb", # QCOW magic string
+ 3, # version
+ 0, # backing file offset
+                     0,                      # backing file size
+ hdr_cluster_bits,
+ disk_size,
+ 0, # encryption method
+ l1_entries,
+ l1_table_offset,
+ refcount_table_offset,
+ allocated_refcount_tables,
+ 0, # number of snapshots
+ 0, # snapshot table offset
+ hdr_incompat_features,
+ 0, # compatible features
+ hdr_autoclear_features,
+ hdr_refcount_bits,
+ hdr_length,
+ )
+
+ write_features(cluster, hdr_length, data_file_name)
+
+ sys.stdout.buffer.write(cluster)
+
+ ### Write refcount table
+ cur_offset = refcount_block_offset
+ remaining_refcount_table_entries = allocated_refcount_blocks # Each entry is a pointer to a refcount block
+ while remaining_refcount_table_entries > 0:
+ cluster = bytearray(cluster_size)
+ to_write = min(remaining_refcount_table_entries, refcounts_per_table)
+ remaining_refcount_table_entries -= to_write
+ for idx in range(to_write):
+ struct.pack_into(">Q", cluster, idx * 8, cur_offset)
+ cur_offset += cluster_size
+ sys.stdout.buffer.write(cluster)
+
+ ### Write refcount blocks
+ remaining_refcount_block_entries = total_allocated_clusters # One entry for each allocated cluster
+ for tbl in range(allocated_refcount_blocks):
+ cluster = bytearray(cluster_size)
+ to_write = min(remaining_refcount_block_entries, refcounts_per_block)
+ remaining_refcount_block_entries -= to_write
+ # All refcount entries contain the number 1. The only difference
+ # is their bit width, defined when the image is created.
+ for idx in range(to_write):
+ if refcount_bits == 64:
+ struct.pack_into(">Q", cluster, idx * 8, 1)
+ elif refcount_bits == 32:
+ struct.pack_into(">L", cluster, idx * 4, 1)
+ elif refcount_bits == 16:
+ struct.pack_into(">H", cluster, idx * 2, 1)
+ elif refcount_bits == 8:
+ cluster[idx] = 1
+ elif refcount_bits == 4:
+ cluster[idx // 2] |= 1 << ((idx % 2) * 4)
+ elif refcount_bits == 2:
+ cluster[idx // 4] |= 1 << ((idx % 4) * 2)
+ elif refcount_bits == 1:
+ cluster[idx // 8] |= 1 << (idx % 8)
+ sys.stdout.buffer.write(cluster)
+
+ ### Write L1 table
+ cur_offset = l2_table_offset
+ for tbl in range(allocated_l1_tables):
+ cluster = bytearray(cluster_size)
+ for idx in range(l1_entries_per_table):
+ l1_idx = tbl * l1_entries_per_table + idx
+ if bitmap_is_set(l1_bitmap, l1_idx):
+ struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED)
+ cur_offset += cluster_size
+ sys.stdout.buffer.write(cluster)
+
+ ### Write L2 tables
+ cur_offset = data_clusters_offset
+ for tbl in range(l1_entries):
+ # Skip the empty L2 tables. We can identify them because
+ # there is no L1 entry pointing at them.
+ if bitmap_is_set(l1_bitmap, tbl):
+ cluster = bytearray(cluster_size)
+ for idx in range(l2_entries_per_table):
+ l2_idx = tbl * l2_entries_per_table + idx
+ if bitmap_is_set(l2_bitmap, l2_idx):
+ if data_file_name is None:
+ struct.pack_into(">Q", cluster, idx * 8, cur_offset | QCOW_OFLAG_COPIED)
+ cur_offset += cluster_size
+ else:
+ struct.pack_into(">Q", cluster, idx * 8, (l2_idx * cluster_size) | QCOW_OFLAG_COPIED)
+ sys.stdout.buffer.write(cluster)
+
+ ### Write data clusters
+ if data_file_name is None:
+ for idx in bitmap_iterator(l2_bitmap, total_data_clusters):
+ cluster = os.pread(fd, cluster_size, cluster_size * idx)
+ # If the last cluster is smaller than cluster_size pad it with zeroes
+ if len(cluster) < cluster_size:
+ cluster += bytes(cluster_size - len(cluster))
+ sys.stdout.buffer.write(cluster)
+
+ if not data_file_raw:
+ os.close(fd)
+
+
+def main():
+ # Command-line arguments
+ parser = argparse.ArgumentParser(
+ description="This program converts a QEMU disk image to qcow2 "
+ "and writes it to the standard output"
+ )
+ parser.add_argument("input_file", help="name of the input file")
+ parser.add_argument(
+ "-f",
+ dest="input_format",
+ metavar="input_format",
+ help="format of the input file (default: raw)",
+ default="raw",
+ )
+ parser.add_argument(
+ "-c",
+ dest="cluster_size",
+ metavar="cluster_size",
+ help=f"qcow2 cluster size (default: {QCOW2_DEFAULT_CLUSTER_SIZE})",
+ default=QCOW2_DEFAULT_CLUSTER_SIZE,
+ type=int,
+ choices=[1 << x for x in range(9, 22)],
+ )
+ parser.add_argument(
+ "-r",
+ dest="refcount_bits",
+ metavar="refcount_bits",
+ help=f"width of the reference count entries (default: {QCOW2_DEFAULT_REFCOUNT_BITS})",
+ default=QCOW2_DEFAULT_REFCOUNT_BITS,
+ type=int,
+ choices=[1 << x for x in range(7)],
+ )
+ parser.add_argument(
+ "-d",
+ dest="data_file",
+ help="create an image with input_file as an external data file",
+ action="store_true",
+ )
+ parser.add_argument(
+ "-R",
+ dest="data_file_raw",
+ help="enable data_file_raw on the generated image (implies -d)",
+ action="store_true",
+ )
+ args = parser.parse_args()
+
+ if args.data_file_raw:
+ args.data_file = True
+
+ if not os.path.isfile(args.input_file):
+ sys.exit(f"[Error] {args.input_file} does not exist or is not a regular file.")
+
+ if args.data_file and args.input_format != "raw":
+ sys.exit("[Error] External data files can only be used with raw input images")
+
+ # A 512 byte header is too small for the data file name extension
+ if args.data_file and args.cluster_size == 512:
+ sys.exit("[Error] External data files require a larger cluster size")
+
+ if sys.stdout.isatty():
+ sys.exit("[Error] Refusing to write to a tty. Try redirecting stdout.")
+
+ if args.data_file:
+ data_file_name = args.input_file
+ else:
+ data_file_name = None
+
+ with get_input_as_raw_file(args.input_file, args.input_format) as raw_file:
+ write_qcow2_content(
+ raw_file,
+ args.cluster_size,
+ args.refcount_bits,
+ data_file_name,
+ args.data_file_raw,
+ )
+
+
+if __name__ == "__main__":
+ main()
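The while loop in write_qcow2_content() that sizes the refcount metadata is the
subtle part of the layout calculation: refcount blocks and refcount tables are
clusters themselves, so adding them increases the total they must cover, and the
computation has to be repeated until it stops growing. A self-contained sketch of
that fixed point (the helper name and the example figures are illustrative,
assuming 64 KiB clusters and 16-bit refcount entries):

import math

def refcount_metadata_clusters(base_clusters, cluster_size=65536, refcount_bits=16):
    """Iterate the refcount block/table sizing until it converges.

    base_clusters counts the header, L1, L2 and data clusters, i.e. everything
    except the refcount metadata itself, mirroring total_allocated_clusters
    before the while loop in write_qcow2_content().
    """
    refcounts_per_block = cluster_size * 8 // refcount_bits   # 32768 entries
    refcounts_per_table = cluster_size // 8                   # 8192 entries
    blocks = math.ceil(base_clusters / refcounts_per_block)
    tables = math.ceil(blocks / refcounts_per_table)
    while True:
        total = base_clusters + tables + blocks
        new_blocks = math.ceil(total / refcounts_per_block)
        if new_blocks > blocks:
            blocks = new_blocks
            tables = math.ceil(blocks / refcounts_per_table)
        else:
            return tables, blocks

# A fully allocated 2 TiB image has 33554432 data clusters plus 4096 L2 tables,
# one L1 table and the header; the fixed point settles at one refcount table
# cluster and 1025 refcount blocks.
print(refcount_metadata_clusters(33554432 + 4096 + 1 + 1))   # -> (1, 1025)

Because every allocated cluster has a refcount of exactly 1, the refcount blocks
written afterwards only ever contain the value 1, repeated once per allocated
cluster at the configured bit width.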
diff --git a/system/dma-helpers.c b/system/dma-helpers.c
index f640324..6bad758 100644
--- a/system/dma-helpers.c
+++ b/system/dma-helpers.c
@@ -211,7 +211,7 @@ static const AIOCBInfo dma_aiocb_info = {
.cancel_async = dma_aio_cancel,
};
-BlockAIOCB *dma_blk_io(AioContext *ctx,
+BlockAIOCB *dma_blk_io(
QEMUSGList *sg, uint64_t offset, uint32_t align,
DMAIOFunc *io_func, void *io_func_opaque,
BlockCompletionFunc *cb,
@@ -223,7 +223,7 @@ BlockAIOCB *dma_blk_io(AioContext *ctx,
dbs->acb = NULL;
dbs->sg = sg;
- dbs->ctx = ctx;
+ dbs->ctx = qemu_get_current_aio_context();
dbs->offset = offset;
dbs->align = align;
dbs->sg_cur_index = 0;
@@ -251,7 +251,7 @@ BlockAIOCB *dma_blk_read(BlockBackend *blk,
QEMUSGList *sg, uint64_t offset, uint32_t align,
void (*cb)(void *opaque, int ret), void *opaque)
{
- return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+ return dma_blk_io(sg, offset, align,
dma_blk_read_io_func, blk, cb, opaque,
DMA_DIRECTION_FROM_DEVICE);
}
@@ -269,7 +269,7 @@ BlockAIOCB *dma_blk_write(BlockBackend *blk,
QEMUSGList *sg, uint64_t offset, uint32_t align,
void (*cb)(void *opaque, int ret), void *opaque)
{
- return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+ return dma_blk_io(sg, offset, align,
dma_blk_write_io_func, blk, cb, opaque,
DMA_DIRECTION_TO_DEVICE);
}
diff --git a/tests/qemu-iotests/051.pc.out b/tests/qemu-iotests/051.pc.out
index 7e10c5f..f19b532 100644
--- a/tests/qemu-iotests/051.pc.out
+++ b/tests/qemu-iotests/051.pc.out
@@ -181,7 +181,7 @@ QEMU X.Y.Z monitor - type 'help' for more information
Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-scsi,id=virtio-scsi1 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on
QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) QEMU_PROG: -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on: Cannot change iothread of active block backend
+(qemu) quit
Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-blk-pci,drive=disk,iothread=thread0,share-rw=on
QEMU X.Y.Z monitor - type 'help' for more information
diff --git a/tests/qemu-iotests/tests/qsd-migrate b/tests/qemu-iotests/tests/qsd-migrate
index de17562..a4c6592 100755
--- a/tests/qemu-iotests/tests/qsd-migrate
+++ b/tests/qemu-iotests/tests/qsd-migrate
@@ -22,7 +22,7 @@ import iotests
from iotests import filter_qemu_io, filter_qtest
-iotests.script_initialize(supported_fmts=['generic'],
+iotests.script_initialize(supported_fmts=['qcow2', 'qed', 'raw'],
supported_protocols=['file'],
supported_platforms=['linux'])
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 06bf9f4..2e0a5da 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -28,6 +28,9 @@
/* Stop userspace polling on a handler if it isn't active for some time */
#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
+ int64_t block_ns);
+
bool aio_poll_disabled(AioContext *ctx)
{
return qatomic_read(&ctx->poll_disable_cnt);
@@ -392,7 +395,8 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
* scanning all handlers with aio_dispatch_handlers().
*/
static bool aio_dispatch_ready_handlers(AioContext *ctx,
- AioHandlerList *ready_list)
+ AioHandlerList *ready_list,
+ int64_t block_ns)
{
bool progress = false;
AioHandler *node;
@@ -400,6 +404,14 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
while ((node = QLIST_FIRST(ready_list))) {
QLIST_REMOVE(node, node_ready);
progress = aio_dispatch_handler(ctx, node) || progress;
+
+ /*
+ * Adjust polling time only after aio_dispatch_handler(), which can
+ * add the handler to ctx->poll_aio_handlers.
+ */
+ if (ctx->poll_max_ns && QLIST_IS_INSERTED(node, node_poll)) {
+ adjust_polling_time(ctx, &node->poll, block_ns);
+ }
}
return progress;
@@ -579,13 +591,19 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
int64_t *timeout)
{
+ AioHandler *node;
int64_t max_ns;
if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
return false;
}
- max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+ max_ns = 0;
+ QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+ max_ns = MAX(max_ns, node->poll.ns);
+ }
+ max_ns = qemu_soonest_timeout(*timeout, max_ns);
+
if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
/*
* Enable poll mode. It pairs with the poll_set_started() in
@@ -600,6 +618,46 @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
return false;
}
+static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
+ int64_t block_ns)
+{
+ if (block_ns <= poll->ns) {
+ /* This is the sweet spot, no adjustment needed */
+ } else if (block_ns > ctx->poll_max_ns) {
+ /* We'd have to poll for too long, poll less */
+ int64_t old = poll->ns;
+
+ if (ctx->poll_shrink) {
+ poll->ns /= ctx->poll_shrink;
+ } else {
+ poll->ns = 0;
+ }
+
+ trace_poll_shrink(ctx, old, poll->ns);
+ } else if (poll->ns < ctx->poll_max_ns &&
+ block_ns < ctx->poll_max_ns) {
+ /* There is room to grow, poll longer */
+ int64_t old = poll->ns;
+ int64_t grow = ctx->poll_grow;
+
+ if (grow == 0) {
+ grow = 2;
+ }
+
+ if (poll->ns) {
+ poll->ns *= grow;
+ } else {
+ poll->ns = 4000; /* start polling at 4 microseconds */
+ }
+
+ if (poll->ns > ctx->poll_max_ns) {
+ poll->ns = ctx->poll_max_ns;
+ }
+
+ trace_poll_grow(ctx, old, poll->ns);
+ }
+}
+
bool aio_poll(AioContext *ctx, bool blocking)
{
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
@@ -607,6 +665,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
bool use_notify_me;
int64_t timeout;
int64_t start = 0;
+ int64_t block_ns = 0;
/*
* There cannot be two concurrent aio_poll calls for the same AioContext (or
@@ -679,49 +738,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
aio_notify_accept(ctx);
- /* Adjust polling time */
+ /* Calculate blocked time for adaptive polling */
if (ctx->poll_max_ns) {
- int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
-
- if (block_ns <= ctx->poll_ns) {
- /* This is the sweet spot, no adjustment needed */
- } else if (block_ns > ctx->poll_max_ns) {
- /* We'd have to poll for too long, poll less */
- int64_t old = ctx->poll_ns;
-
- if (ctx->poll_shrink) {
- ctx->poll_ns /= ctx->poll_shrink;
- } else {
- ctx->poll_ns = 0;
- }
-
- trace_poll_shrink(ctx, old, ctx->poll_ns);
- } else if (ctx->poll_ns < ctx->poll_max_ns &&
- block_ns < ctx->poll_max_ns) {
- /* There is room to grow, poll longer */
- int64_t old = ctx->poll_ns;
- int64_t grow = ctx->poll_grow;
-
- if (grow == 0) {
- grow = 2;
- }
-
- if (ctx->poll_ns) {
- ctx->poll_ns *= grow;
- } else {
- ctx->poll_ns = 4000; /* start polling at 4 microseconds */
- }
-
- if (ctx->poll_ns > ctx->poll_max_ns) {
- ctx->poll_ns = ctx->poll_max_ns;
- }
-
- trace_poll_grow(ctx, old, ctx->poll_ns);
- }
+ block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
}
progress |= aio_bh_poll(ctx);
- progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
+ progress |= aio_dispatch_ready_handlers(ctx, &ready_list, block_ns);
aio_free_deleted_handlers(ctx);
@@ -767,11 +790,18 @@ void aio_context_use_g_source(AioContext *ctx)
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
int64_t grow, int64_t shrink, Error **errp)
{
+ AioHandler *node;
+
+ qemu_lockcnt_inc(&ctx->list_lock);
+ QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+ node->poll.ns = 0;
+ }
+ qemu_lockcnt_dec(&ctx->list_lock);
+
/* No thread synchronization here, it doesn't matter if an incorrect value
* is used once.
*/
ctx->poll_max_ns = max_ns;
- ctx->poll_ns = 0;
ctx->poll_grow = grow;
ctx->poll_shrink = shrink;
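The aio-posix.c changes above move the adaptive polling state from the AioContext
into a per-handler AioPolledEvent, and adjust_polling_time() now tunes each
handler's window from how long the preceding aio_poll() call actually blocked. A
minimal Python model of that grow/shrink rule (illustrative only; the constants
and the blocking times are assumed values, not taken from QEMU):

POLL_MAX_NS = 32_000   # ctx->poll_max_ns, configured via poll-max-ns
POLL_GROW = 0          # 0 means "double", matching the C code's fallback
POLL_SHRINK = 0        # 0 means "stop polling entirely"

def adjust_polling_time(poll_ns, block_ns):
    """Return the new per-handler polling window after one aio_poll() pass."""
    if block_ns <= poll_ns:
        return poll_ns                                   # sweet spot, keep it
    if block_ns > POLL_MAX_NS:
        # We'd have to poll for too long, poll less (or stop).
        return poll_ns // POLL_SHRINK if POLL_SHRINK else 0
    if poll_ns < POLL_MAX_NS and block_ns < POLL_MAX_NS:
        grow = POLL_GROW or 2
        poll_ns = poll_ns * grow if poll_ns else 4000    # start at 4 microseconds
        return min(poll_ns, POLL_MAX_NS)
    return poll_ns

# A handler that keeps finding work shortly after its window expires grows
# towards the cap, while one long block collapses the window back to zero:
ns = 0
for blocked in (10_000, 10_000, 10_000, 50_000):
    ns = adjust_polling_time(ns, blocked)
    print(ns)   # 4000, 8000, 16000, 0

Since each handler now owns its window, try_poll_mode() polls for the maximum of
the individual windows, so one busy handler no longer forces long polling on
behalf of unrelated file descriptors on the same AioContext.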
diff --git a/util/aio-posix.h b/util/aio-posix.h
index 4264c51..82a0201 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -38,6 +38,7 @@ struct AioHandler {
#endif
int64_t poll_idle_timeout; /* when to stop userspace polling */
bool poll_ready; /* has polling detected an event? */
+ AioPolledEvent poll;
};
/* Add a handler to a ready list */
diff --git a/util/async.c b/util/async.c
index 47e3d35..863416d 100644
--- a/util/async.c
+++ b/util/async.c
@@ -609,7 +609,6 @@ AioContext *aio_context_new(Error **errp)
qemu_rec_mutex_init(&ctx->lock);
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
- ctx->poll_ns = 0;
ctx->poll_max_ns = 0;
ctx->poll_grow = 0;
ctx->poll_shrink = 0;