From dea97c1fbd4e1bc36d776ff949ce568b7a435e71 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 25 May 2023 14:47:02 +0200 Subject: block-coroutine-wrapper: Take AioContext lock in no_co_wrappers All of the functions that currently take a BlockDriverState, BdrvChild or BlockBackend as their first parameter expect the associated AioContext to be locked when they are called. In the case of no_co_wrappers, they are called from bottom halves directly in the main loop, so no other caller can be expected to take the lock for them. This can result in assertion failures because a lock that isn't taken is released in nested event loops. Looking at the first parameter is already done by co_wrappers to decide where the coroutine should run, so doing the same in no_co_wrappers is only consistent. Take the lock in the generated bottom halves to fix the problem. Signed-off-by: Kevin Wolf Message-Id: <20230525124713.401149-2-kwolf@redhat.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- include/block/block-common.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/block/block-common.h b/include/block/block-common.h index 9319622..e15395f 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -65,6 +65,9 @@ * scheduling a BH in the bottom half that runs the respective non-coroutine * function. The coroutine yields after scheduling the BH and is reentered when * the wrapped function returns. + * + * If the first parameter of the function is a BlockDriverState, BdrvChild or + * BlockBackend pointer, the AioContext lock for it is taken in the wrapper. */ #define no_co_wrapper -- cgit v1.1 From 26462a700c8c5d30802c2254a35b5064762e00f0 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:20 -0400 Subject: hw/qdev: introduce qdev_is_realized() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper function to check whether the device is realized without requiring the Big QEMU Lock. The next patch adds a second caller. The goal is to avoid spreading DeviceState field accesses throughout the code. Suggested-by: Philippe Mathieu-Daudé Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Message-Id: <20230516190238.8401-3-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/hw/qdev-core.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 7623703..f1070d6 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -1,6 +1,7 @@ #ifndef QDEV_CORE_H #define QDEV_CORE_H +#include "qemu/atomic.h" #include "qemu/queue.h" #include "qemu/bitmap.h" #include "qemu/rcu.h" @@ -168,9 +169,6 @@ typedef struct { /** * DeviceState: - * @realized: Indicates whether the device has been fully constructed. - * When accessed outside big qemu lock, must be accessed with - * qatomic_load_acquire() * @reset: ResettableState for the device; handled by Resettable interface. * * This structure should not be accessed directly. We declare it here @@ -340,6 +338,19 @@ DeviceState *qdev_new(const char *name); DeviceState *qdev_try_new(const char *name); /** + * qdev_is_realized: + * @dev: The device to check. + * + * May be called outside big qemu lock. + * + * Returns: %true% if the device has been fully constructed, %false% otherwise. + */ +static inline bool qdev_is_realized(DeviceState *dev) +{ + return qatomic_load_acquire(&dev->realized); +} + +/** * qdev_realize: Realize @dev. * @dev: device to realize * @bus: bus to plug it into (may be NULL) -- cgit v1.1 From 75d33e852536361367c8460abd8b04e3fe9921ee Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:23 -0400 Subject: util/vhost-user-server: rename refcount to in_flight counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VuServer object has a refcount field and ref/unref APIs. The name is confusing because it's actually an in-flight request counter instead of a refcount. Normally a refcount destroys the object upon reaching zero. The VuServer counter is used to wake up the vhost-user coroutine when there are no more requests. Avoid confusing by renaming refcount and ref/unref to in_flight and inc/dec. Reviewed-by: Paolo Bonzini Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Message-Id: <20230516190238.8401-6-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/qemu/vhost-user-server.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/qemu/vhost-user-server.h b/include/qemu/vhost-user-server.h index 25c7243..bc0ac9d 100644 --- a/include/qemu/vhost-user-server.h +++ b/include/qemu/vhost-user-server.h @@ -41,7 +41,7 @@ typedef struct { const VuDevIface *vu_iface; /* Protected by ctx lock */ - unsigned int refcount; + unsigned int in_flight; bool wait_idle; VuDev vu_dev; QIOChannel *ioc; /* The I/O channel with the client */ @@ -60,8 +60,8 @@ bool vhost_user_server_start(VuServer *server, void vhost_user_server_stop(VuServer *server); -void vhost_user_server_ref(VuServer *server); -void vhost_user_server_unref(VuServer *server); +void vhost_user_server_inc_in_flight(VuServer *server); +void vhost_user_server_dec_in_flight(VuServer *server); void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx); void vhost_user_server_detach_aio_context(VuServer *server); -- cgit v1.1 From 8f5e9a8ee189b44ffa90cc6db61e25499b9d786a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:24 -0400 Subject: block/export: wait for vhost-user-blk requests when draining Each vhost-user-blk request runs in a coroutine. When the BlockBackend enters a drained section we need to enter a quiescent state. Currently any in-flight requests race with bdrv_drained_begin() because it is unaware of vhost-user-blk requests. When blk_co_preadv/pwritev()/etc returns it wakes the bdrv_drained_begin() thread but vhost-user-blk request processing has not yet finished. The request coroutine continues executing while the main loop thread thinks it is in a drained section. One example where this is unsafe is for blk_set_aio_context() where bdrv_drained_begin() is called before .aio_context_detached() and .aio_context_attach(). If request coroutines are still running after bdrv_drained_begin(), then the AioContext could change underneath them and they race with new requests processed in the new AioContext. This could lead to virtqueue corruption, for example. (This example is theoretical, I came across this while reading the code and have not tried to reproduce it.) It's easy to make bdrv_drained_begin() wait for in-flight requests: add a .drained_poll() callback that checks the VuServer's in-flight counter. VuServer just needs an API that returns true when there are requests in flight. The in-flight counter needs to be atomic. Signed-off-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Message-Id: <20230516190238.8401-7-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/qemu/vhost-user-server.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/qemu/vhost-user-server.h b/include/qemu/vhost-user-server.h index bc0ac9d..b1c1cda 100644 --- a/include/qemu/vhost-user-server.h +++ b/include/qemu/vhost-user-server.h @@ -40,8 +40,9 @@ typedef struct { int max_queues; const VuDevIface *vu_iface; + unsigned int in_flight; /* atomic */ + /* Protected by ctx lock */ - unsigned int in_flight; bool wait_idle; VuDev vu_dev; QIOChannel *ioc; /* The I/O channel with the client */ @@ -62,6 +63,7 @@ void vhost_user_server_stop(VuServer *server); void vhost_user_server_inc_in_flight(VuServer *server); void vhost_user_server_dec_in_flight(VuServer *server); +bool vhost_user_server_has_in_flight(VuServer *server); void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx); void vhost_user_server_detach_aio_context(VuServer *server); -- cgit v1.1 From ff82b7835b2fbbd0a17d616f6929601a97a6497d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:27 -0400 Subject: block: add blk_in_drain() API The BlockBackend quiesce_counter is greater than zero during drained sections. Add an API to check whether the BlockBackend is in a drained section. The next patch will use this API. Signed-off-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Message-Id: <20230516190238.8401-10-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/sysemu/block-backend-global-state.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sysemu/block-backend-global-state.h b/include/sysemu/block-backend-global-state.h index fa83f93..184e667 100644 --- a/include/sysemu/block-backend-global-state.h +++ b/include/sysemu/block-backend-global-state.h @@ -81,6 +81,7 @@ void blk_activate(BlockBackend *blk, Error **errp); int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags); void blk_aio_cancel(BlockAIOCB *acb); int blk_commit_all(void); +bool blk_in_drain(BlockBackend *blk); void blk_drain(BlockBackend *blk); void blk_drain_all(void); void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, -- cgit v1.1 From ab61335025b1274bd7042219203524045b23e0d3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:28 -0400 Subject: block: drain from main loop thread in bdrv_co_yield_to_drain() For simplicity, always run BlockDevOps .drained_begin/end/poll() callbacks in the main loop thread. This makes it easier to implement the callbacks and avoids extra locks. Move the function pointer declarations from the I/O Code section to the Global State section for BlockDevOps, BdrvChildClass, and BlockDriver. Narrow IO_OR_GS_CODE() to GLOBAL_STATE_CODE() where appropriate. The test-bdrv-drain test case calls bdrv_drain() from an IOThread. This is now only allowed from coroutine context, so update the test case to run in a coroutine. Signed-off-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Message-Id: <20230516190238.8401-11-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/block/block_int-common.h | 72 +++++++++++++++++------------------ include/sysemu/block-backend-common.h | 25 ++++++------ 2 files changed, 49 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 6492a1e..b1cbc1e 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -364,6 +364,21 @@ struct BlockDriver { AioContext *new_context); /** + * bdrv_drain_begin is called if implemented in the beginning of a + * drain operation to drain and stop any internal sources of requests in + * the driver. + * bdrv_drain_end is called if implemented at the end of the drain. + * + * They should be used by the driver to e.g. manage scheduled I/O + * requests, or toggle an internal state. After the end of the drain new + * requests will continue normally. + * + * Implementations of both functions must not call aio_poll(). + */ + void (*bdrv_drain_begin)(BlockDriverState *bs); + void (*bdrv_drain_end)(BlockDriverState *bs); + + /** * Try to get @bs's logical and physical block size. * On success, store them in @bsz and return zero. * On failure, return negative errno. @@ -758,21 +773,6 @@ struct BlockDriver { void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)( BlockDriverState *bs); - /** - * bdrv_drain_begin is called if implemented in the beginning of a - * drain operation to drain and stop any internal sources of requests in - * the driver. - * bdrv_drain_end is called if implemented at the end of the drain. - * - * They should be used by the driver to e.g. manage scheduled I/O - * requests, or toggle an internal state. After the end of the drain new - * requests will continue normally. - * - * Implementations of both functions must not call aio_poll(). - */ - void (*bdrv_drain_begin)(BlockDriverState *bs); - void (*bdrv_drain_end)(BlockDriverState *bs); - bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs); bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)( @@ -956,6 +956,27 @@ struct BdrvChildClass { void GRAPH_WRLOCK_PTR (*detach)(BdrvChild *child); /* + * If this pair of functions is implemented, the parent doesn't issue new + * requests after returning from .drained_begin() until .drained_end() is + * called. + * + * These functions must not change the graph (and therefore also must not + * call aio_poll(), which could change the graph indirectly). + * + * Note that this can be nested. If drained_begin() was called twice, new + * I/O is allowed only after drained_end() was called twice, too. + */ + void (*drained_begin)(BdrvChild *child); + void (*drained_end)(BdrvChild *child); + + /* + * Returns whether the parent has pending requests for the child. This + * callback is polled after .drained_begin() has been called until all + * activity on the child has stopped. + */ + bool (*drained_poll)(BdrvChild *child); + + /* * Notifies the parent that the filename of its child has changed (e.g. * because the direct child was removed from the backing chain), so that it * can update its reference. @@ -984,27 +1005,6 @@ struct BdrvChildClass { const char *(*get_name)(BdrvChild *child); AioContext *(*get_parent_aio_context)(BdrvChild *child); - - /* - * If this pair of functions is implemented, the parent doesn't issue new - * requests after returning from .drained_begin() until .drained_end() is - * called. - * - * These functions must not change the graph (and therefore also must not - * call aio_poll(), which could change the graph indirectly). - * - * Note that this can be nested. If drained_begin() was called twice, new - * I/O is allowed only after drained_end() was called twice, too. - */ - void (*drained_begin)(BdrvChild *child); - void (*drained_end)(BdrvChild *child); - - /* - * Returns whether the parent has pending requests for the child. This - * callback is polled after .drained_begin() has been called until all - * activity on the child has stopped. - */ - bool (*drained_poll)(BdrvChild *child); }; extern const BdrvChildClass child_of_bds; diff --git a/include/sysemu/block-backend-common.h b/include/sysemu/block-backend-common.h index 2391679..780cea7 100644 --- a/include/sysemu/block-backend-common.h +++ b/include/sysemu/block-backend-common.h @@ -60,6 +60,19 @@ typedef struct BlockDevOps { bool (*is_medium_locked)(void *opaque); /* + * Runs when the backend receives a drain request. + */ + void (*drained_begin)(void *opaque); + /* + * Runs when the backend's last drain request ends. + */ + void (*drained_end)(void *opaque); + /* + * Is the device still busy? + */ + bool (*drained_poll)(void *opaque); + + /* * I/O API functions. These functions are thread-safe. * * See include/block/block-io.h for more information about @@ -76,18 +89,6 @@ typedef struct BlockDevOps { * Runs when the size changed (e.g. monitor command block_resize) */ void (*resize_cb)(void *opaque); - /* - * Runs when the backend receives a drain request. - */ - void (*drained_begin)(void *opaque); - /* - * Runs when the backend's last drain request ends. - */ - void (*drained_end)(void *opaque); - /* - * Is the device still busy? - */ - bool (*drained_poll)(void *opaque); } BlockDevOps; /* -- cgit v1.1 From 3d499a43a25571dcacb3a25330357af5e0be9bca Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:32 -0400 Subject: block/export: don't require AioContext lock around blk_exp_ref/unref() The FUSE export calls blk_exp_ref/unref() without the AioContext lock. Instead of fixing the FUSE export, adjust blk_exp_ref/unref() so they work without the AioContext lock. This way it's less error-prone. Suggested-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Message-Id: <20230516190238.8401-15-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/block/export.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/block/export.h b/include/block/export.h index 7feb02e..f2fe0f8 100644 --- a/include/block/export.h +++ b/include/block/export.h @@ -57,6 +57,8 @@ struct BlockExport { * Reference count for this block export. This includes strong references * both from the owner (qemu-nbd or the monitor) and clients connected to * the export. + * + * Use atomics to access this field. */ int refcount; -- cgit v1.1 From 766aa2de0f29b657148e04599320d771c36fd126 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:36 -0400 Subject: virtio-scsi: implement BlockDevOps->drained_begin() The virtio-scsi Host Bus Adapter provides access to devices on a SCSI bus. Those SCSI devices typically have a BlockBackend. When the BlockBackend enters a drained section, the SCSI device must temporarily stop submitting new I/O requests. Implement this behavior by temporarily stopping virtio-scsi virtqueue processing when one of the SCSI devices enters a drained section. The new scsi_device_drained_begin() API allows scsi-disk to message the virtio-scsi HBA. scsi_device_drained_begin() uses a drain counter so that multiple SCSI devices can have overlapping drained sections. The HBA only sees one pair of .drained_begin/end() calls. After this commit, virtio-scsi no longer depends on hw/virtio's ioeventfd aio_set_event_notifier(is_external=true). This commit is a step towards removing the aio_disable_external() API. Signed-off-by: Stefan Hajnoczi Message-Id: <20230516190238.8401-19-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/hw/scsi/scsi.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 6f23a7a..e2bb1a2 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -133,6 +133,16 @@ struct SCSIBusInfo { void (*save_request)(QEMUFile *f, SCSIRequest *req); void *(*load_request)(QEMUFile *f, SCSIRequest *req); void (*free_request)(SCSIBus *bus, void *priv); + + /* + * Temporarily stop submitting new requests between drained_begin() and + * drained_end(). Called from the main loop thread with the BQL held. + * + * Implement these callbacks if request processing is triggered by a file + * descriptor like an EventNotifier. Otherwise set them to NULL. + */ + void (*drained_begin)(SCSIBus *bus); + void (*drained_end)(SCSIBus *bus); }; #define TYPE_SCSI_BUS "SCSI" @@ -144,6 +154,8 @@ struct SCSIBus { SCSISense unit_attention; const SCSIBusInfo *info; + + int drain_count; /* protected by BQL */ }; /** @@ -213,6 +225,8 @@ void scsi_req_cancel_complete(SCSIRequest *req); void scsi_req_cancel(SCSIRequest *req); void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier); void scsi_req_retry(SCSIRequest *req); +void scsi_device_drained_begin(SCSIDevice *sdev); +void scsi_device_drained_end(SCSIDevice *sdev); void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense); void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense); void scsi_device_report_change(SCSIDevice *dev, SCSISense sense); -- cgit v1.1 From 60f782b6b78211c125970768be726c9f380dbd61 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 16 May 2023 15:02:38 -0400 Subject: aio: remove aio_disable_external() API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All callers now pass is_external=false to aio_set_fd_handler() and aio_set_event_notifier(). The aio_disable_external() API that temporarily disables fd handlers that were registered is_external=true is therefore dead code. Remove aio_disable_external(), aio_enable_external(), and the is_external arguments to aio_set_fd_handler() and aio_set_event_notifier(). The entire test-fdmon-epoll test is removed because its sole purpose was testing aio_disable_external(). Parts of this patch were generated using the following coccinelle (https://coccinelle.lip6.fr/) semantic patch: @@ expression ctx, fd, is_external, io_read, io_write, io_poll, io_poll_ready, opaque; @@ - aio_set_fd_handler(ctx, fd, is_external, io_read, io_write, io_poll, io_poll_ready, opaque) + aio_set_fd_handler(ctx, fd, io_read, io_write, io_poll, io_poll_ready, opaque) @@ expression ctx, notifier, is_external, io_read, io_poll, io_poll_ready; @@ - aio_set_event_notifier(ctx, notifier, is_external, io_read, io_poll, io_poll_ready) + aio_set_event_notifier(ctx, notifier, io_read, io_poll, io_poll_ready) Reviewed-by: Juan Quintela Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefan Hajnoczi Message-Id: <20230516190238.8401-21-stefanha@redhat.com> Signed-off-by: Kevin Wolf --- include/block/aio.h | 57 ----------------------------------------------------- 1 file changed, 57 deletions(-) (limited to 'include') diff --git a/include/block/aio.h b/include/block/aio.h index 89bbc53..32042e8 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -225,8 +225,6 @@ struct AioContext { */ QEMUTimerListGroup tlg; - int external_disable_cnt; - /* Number of AioHandlers without .io_poll() */ int poll_disable_cnt; @@ -481,7 +479,6 @@ bool aio_poll(AioContext *ctx, bool blocking); */ void aio_set_fd_handler(AioContext *ctx, int fd, - bool is_external, IOHandler *io_read, IOHandler *io_write, AioPollFn *io_poll, @@ -497,7 +494,6 @@ void aio_set_fd_handler(AioContext *ctx, */ void aio_set_event_notifier(AioContext *ctx, EventNotifier *notifier, - bool is_external, EventNotifierHandler *io_read, AioPollFn *io_poll, EventNotifierHandler *io_poll_ready); @@ -627,59 +623,6 @@ static inline void aio_timer_init(AioContext *ctx, int64_t aio_compute_timeout(AioContext *ctx); /** - * aio_disable_external: - * @ctx: the aio context - * - * Disable the further processing of external clients. - */ -static inline void aio_disable_external(AioContext *ctx) -{ - qatomic_inc(&ctx->external_disable_cnt); -} - -/** - * aio_enable_external: - * @ctx: the aio context - * - * Enable the processing of external clients. - */ -static inline void aio_enable_external(AioContext *ctx) -{ - int old; - - old = qatomic_fetch_dec(&ctx->external_disable_cnt); - assert(old > 0); - if (old == 1) { - /* Kick event loop so it re-arms file descriptors */ - aio_notify(ctx); - } -} - -/** - * aio_external_disabled: - * @ctx: the aio context - * - * Return true if the external clients are disabled. - */ -static inline bool aio_external_disabled(AioContext *ctx) -{ - return qatomic_read(&ctx->external_disable_cnt); -} - -/** - * aio_node_check: - * @ctx: the aio context - * @is_external: Whether or not the checked node is an external event source. - * - * Check if the node's is_external flag is okay to be polled by the ctx at this - * moment. True means green light. - */ -static inline bool aio_node_check(AioContext *ctx, bool is_external) -{ - return !is_external || !qatomic_read(&ctx->external_disable_cnt); -} - -/** * aio_co_schedule: * @ctx: the aio context * @co: the coroutine -- cgit v1.1