diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2021-11-03 00:32:56 -0400 |
---|---|---|
committer | Richard Henderson <richard.henderson@linaro.org> | 2021-11-03 00:32:56 -0400 |
commit | 741bdeb1d5a4024a2c54c6abb2de493a27b61953 (patch) | |
tree | e2af8a1c355cbf56913499e9ea32814d5b86ad47 /block | |
parent | 22d5760cb43e2fe73e61fda145a98f3217ca47bf (diff) | |
parent | a8951438946d72d74c9bdbdb38fce95aa2973a88 (diff) | |
download | qemu-741bdeb1d5a4024a2c54c6abb2de493a27b61953.zip qemu-741bdeb1d5a4024a2c54c6abb2de493a27b61953.tar.gz qemu-741bdeb1d5a4024a2c54c6abb2de493a27b61953.tar.bz2 |
Merge remote-tracking branch 'remotes/kwolf/tags/for-upstream' into staging
Block layer patches
- Fail gracefully when blockdev-snapshot creates loops
- ide: Fix IDENTIFY DEVICE for disks > 128 GiB
- file-posix: Fix return value translation for AIO discards
- file-posix: add 'aio-max-batch' option
- rbd: implement bdrv_co_block_status
- Code cleanups and build fixes
# gpg: Signature made Tue 02 Nov 2021 12:04:02 PM EDT
# gpg: using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg: issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [full]
* remotes/kwolf/tags/for-upstream:
block/nvme: Extract nvme_free_queue() from nvme_free_queue_pair()
block/nvme: Display CQ/SQ pointer in nvme_free_queue_pair()
block/nvme: Automatically free qemu_memalign() with QEMU_AUTO_VFREE
block-backend: Silence clang -m32 compiler warning
linux-aio: add `dev_max_batch` parameter to laio_io_unplug()
linux-aio: add `dev_max_batch` parameter to laio_co_submit()
file-posix: add `aio-max-batch` option
block/export/fuse.c: fix musl build
ide: Cap LBA28 capacity announcement to 2^28-1
block/rbd: implement bdrv_co_block_status
block: Fail gracefully when blockdev-snapshot creates loops
block/file-posix: Fix return value translation for AIO discards
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'block')
-rw-r--r-- | block/block-backend.c | 2 | ||||
-rw-r--r-- | block/export/fuse.c | 4 | ||||
-rw-r--r-- | block/file-posix.c | 18 | ||||
-rw-r--r-- | block/linux-aio.c | 38 | ||||
-rw-r--r-- | block/nvme.c | 22 | ||||
-rw-r--r-- | block/rbd.c | 112 | ||||
-rw-r--r-- | block/trace-events | 2 |
7 files changed, 171 insertions, 27 deletions
diff --git a/block/block-backend.c b/block/block-backend.c index 39cd99d..12ef80e 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1540,7 +1540,7 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, QEMUIOVector *qiov, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { - assert(qiov->size <= INT64_MAX); + assert((uint64_t)qiov->size <= INT64_MAX); return blk_aio_prwv(blk, offset, qiov->size, qiov, blk_aio_write_entry, flags, cb, opaque); } diff --git a/block/export/fuse.c b/block/export/fuse.c index 2e3bf82..823c126 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -31,6 +31,10 @@ #include <fuse.h> #include <fuse_lowlevel.h> +#if defined(CONFIG_FALLOCATE_ZERO_RANGE) +#include <linux/falloc.h> +#endif + #ifdef __linux__ #include <linux/fs.h> #endif diff --git a/block/file-posix.c b/block/file-posix.c index 53be0bd..7a27c83 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -150,6 +150,8 @@ typedef struct BDRVRawState { uint64_t locked_perm; uint64_t locked_shared_perm; + uint64_t aio_max_batch; + int perm_change_fd; int perm_change_flags; BDRVReopenState *reopen_state; @@ -531,6 +533,11 @@ static QemuOptsList raw_runtime_opts = { .help = "host AIO implementation (threads, native, io_uring)", }, { + .name = "aio-max-batch", + .type = QEMU_OPT_NUMBER, + .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)", + }, + { .name = "locking", .type = QEMU_OPT_STRING, .help = "file locking mode (on/off/auto, default: auto)", @@ -609,6 +616,8 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); #endif + s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0); + locking = qapi_enum_parse(&OnOffAuto_lookup, qemu_opt_get(opts, "locking"), ON_OFF_AUTO_AUTO, &local_err); @@ -1807,7 +1816,7 @@ static int handle_aiocb_copy_range(void *opaque) static int handle_aiocb_discard(void *opaque) { RawPosixAIOData *aiocb = opaque; - int ret = -EOPNOTSUPP; + int ret = -ENOTSUP; BDRVRawState *s = aiocb->bs->opaque; if (!s->has_discard) { @@ -1829,7 +1838,7 @@ static int handle_aiocb_discard(void *opaque) #ifdef CONFIG_FALLOCATE_PUNCH_HOLE ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, aiocb->aio_offset, aiocb->aio_nbytes); - ret = translate_err(-errno); + ret = translate_err(ret); #elif defined(__APPLE__) && (__MACH__) fpunchhole_t fpunchhole; fpunchhole.fp_flags = 0; @@ -2057,7 +2066,8 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, } else if (s->use_linux_aio) { LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); assert(qiov->size == bytes); - return laio_co_submit(bs, aio, s->fd, offset, qiov, type); + return laio_co_submit(bs, aio, s->fd, offset, qiov, type, + s->aio_max_batch); #endif } @@ -2115,7 +2125,7 @@ static void raw_aio_unplug(BlockDriverState *bs) #ifdef CONFIG_LINUX_AIO if (s->use_linux_aio) { LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); - laio_io_unplug(bs, aio); + laio_io_unplug(bs, aio, s->aio_max_batch); } #endif #ifdef CONFIG_LINUX_IO_URING diff --git a/block/linux-aio.c b/block/linux-aio.c index 0dab507..f53ae72 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -334,30 +334,45 @@ static void ioq_submit(LinuxAioState *s) } } +static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch) +{ + uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH; + + /* + * AIO context can be shared between multiple block devices, so + * `dev_max_batch` allows reducing the batch size for latency-sensitive + * devices. + */ + max_batch = MIN_NON_ZERO(dev_max_batch, max_batch); + + /* limit the batch with the number of available events */ + max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch); + + return max_batch; +} + void laio_io_plug(BlockDriverState *bs, LinuxAioState *s) { s->io_q.plugged++; } -void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s) +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, + uint64_t dev_max_batch) { assert(s->io_q.plugged); - if (--s->io_q.plugged == 0 && - !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { + if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) || + (--s->io_q.plugged == 0 && + !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending))) { ioq_submit(s); } } static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, - int type) + int type, uint64_t dev_max_batch) { LinuxAioState *s = laiocb->ctx; struct iocb *iocbs = &laiocb->iocb; QEMUIOVector *qiov = laiocb->qiov; - int64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH; - - /* limit the batch with the number of available events */ - max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch); switch (type) { case QEMU_AIO_WRITE: @@ -378,7 +393,7 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, s->io_q.in_queue++; if (!s->io_q.blocked && (!s->io_q.plugged || - s->io_q.in_queue >= max_batch)) { + s->io_q.in_queue >= laio_max_batch(s, dev_max_batch))) { ioq_submit(s); } @@ -386,7 +401,8 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, } int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, - uint64_t offset, QEMUIOVector *qiov, int type) + uint64_t offset, QEMUIOVector *qiov, int type, + uint64_t dev_max_batch) { int ret; struct qemu_laiocb laiocb = { @@ -398,7 +414,7 @@ int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, .qiov = qiov, }; - ret = laio_do_submit(fd, &laiocb, offset, type); + ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch); if (ret < 0) { return ret; } diff --git a/block/nvme.c b/block/nvme.c index 1cc7b62..e4f336d 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -183,15 +183,20 @@ static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, return r == 0; } +static void nvme_free_queue(NVMeQueue *q) +{ + qemu_vfree(q->queue); +} + static void nvme_free_queue_pair(NVMeQueuePair *q) { - trace_nvme_free_queue_pair(q->index, q); + trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq); if (q->completion_bh) { qemu_bh_delete(q->completion_bh); } + nvme_free_queue(&q->sq); + nvme_free_queue(&q->cq); qemu_vfree(q->prp_list_pages); - qemu_vfree(q->sq.queue); - qemu_vfree(q->cq.queue); qemu_mutex_destroy(&q->lock); g_free(q); } @@ -514,10 +519,10 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; bool ret = false; - union { + QEMU_AUTO_VFREE union { NvmeIdCtrl ctrl; NvmeIdNs ns; - } *id; + } *id = NULL; NvmeLBAF *lbaf; uint16_t oncs; int r; @@ -595,7 +600,6 @@ static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) s->blkshift = lbaf->ds; out: qemu_vfio_dma_unmap(s->vfio, id); - qemu_vfree(id); return ret; } @@ -1219,7 +1223,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, { BDRVNVMeState *s = bs->opaque; int r; - uint8_t *buf = NULL; + QEMU_AUTO_VFREE uint8_t *buf = NULL; QEMUIOVector local_qiov; size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size); assert(QEMU_IS_ALIGNED(offset, s->page_size)); @@ -1246,7 +1250,6 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, if (!r && !is_write) { qemu_iovec_from_buf(qiov, 0, buf, bytes); } - qemu_vfree(buf); return r; } @@ -1365,7 +1368,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, BDRVNVMeState *s = bs->opaque; NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; NVMeRequest *req; - NvmeDsmRange *buf; + QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL; QEMUIOVector local_qiov; int ret; @@ -1440,7 +1443,6 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, trace_nvme_dsm_done(s, offset, bytes, ret); out: qemu_iovec_destroy(&local_qiov); - qemu_vfree(buf); return ret; } diff --git a/block/rbd.c b/block/rbd.c index 701fbf2..def9629 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -97,6 +97,12 @@ typedef struct RBDTask { int64_t ret; } RBDTask; +typedef struct RBDDiffIterateReq { + uint64_t offs; + uint64_t bytes; + bool exists; +} RBDDiffIterateReq; + static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, BlockdevOptionsRbd *opts, bool cache, const char *keypairs, const char *secretid, @@ -1259,6 +1265,111 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs, return spec_info; } +/* + * rbd_diff_iterate2 allows to interrupt the exection by returning a negative + * value in the callback routine. Choose a value that does not conflict with + * an existing exitcode and return it if we want to prematurely stop the + * execution because we detected a change in the allocation status. + */ +#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000 + +static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len, + int exists, void *opaque) +{ + RBDDiffIterateReq *req = opaque; + + assert(req->offs + req->bytes <= offs); + /* + * we do not diff against a snapshot so we should never receive a callback + * for a hole. + */ + assert(exists); + + if (!req->exists && offs > req->offs) { + /* + * we started in an unallocated area and hit the first allocated + * block. req->bytes must be set to the length of the unallocated area + * before the allocated area. stop further processing. + */ + req->bytes = offs - req->offs; + return QEMU_RBD_EXIT_DIFF_ITERATE2; + } + + if (req->exists && offs > req->offs + req->bytes) { + /* + * we started in an allocated area and jumped over an unallocated area, + * req->bytes contains the length of the allocated area before the + * unallocated area. stop further processing. + */ + return QEMU_RBD_EXIT_DIFF_ITERATE2; + } + + req->bytes += len; + req->exists = true; + + return 0; +} + +static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, + BlockDriverState **file) +{ + BDRVRBDState *s = bs->opaque; + int status, r; + RBDDiffIterateReq req = { .offs = offset }; + uint64_t features, flags; + + assert(offset + bytes <= s->image_size); + + /* default to all sectors allocated */ + status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; + *map = offset; + *file = bs; + *pnum = bytes; + + /* check if RBD image supports fast-diff */ + r = rbd_get_features(s->image, &features); + if (r < 0) { + return status; + } + if (!(features & RBD_FEATURE_FAST_DIFF)) { + return status; + } + + /* check if RBD fast-diff result is valid */ + r = rbd_get_flags(s->image, &flags); + if (r < 0) { + return status; + } + if (flags & RBD_FLAG_FAST_DIFF_INVALID) { + return status; + } + + r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true, + qemu_rbd_diff_iterate_cb, &req); + if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) { + return status; + } + assert(req.bytes <= bytes); + if (!req.exists) { + if (r == 0) { + /* + * rbd_diff_iterate2 does not invoke callbacks for unallocated + * areas. This here catches the case where no callback was + * invoked at all (req.bytes == 0). + */ + assert(req.bytes == 0); + req.bytes = bytes; + } + status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID; + } + + *pnum = req.bytes; + return status; +} + static int64_t qemu_rbd_getlength(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; @@ -1494,6 +1605,7 @@ static BlockDriver bdrv_rbd = { #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES .bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes, #endif + .bdrv_co_block_status = qemu_rbd_co_block_status, .bdrv_snapshot_create = qemu_rbd_snap_create, .bdrv_snapshot_delete = qemu_rbd_snap_remove, diff --git a/block/trace-events b/block/trace-events index ab56eda..549090d 100644 --- a/block/trace-events +++ b/block/trace-events @@ -157,7 +157,7 @@ nvme_dsm_done(void *s, int64_t offset, int64_t bytes, int ret) "s %p offset 0x%" nvme_dma_map_flush(void *s) "s %p" nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u" nvme_create_queue_pair(unsigned q_index, void *q, size_t size, void *aio_context, int fd) "index %u q %p size %zu aioctx %p fd %d" -nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p" +nvme_free_queue_pair(unsigned q_index, void *q, void *cq, void *sq) "index %u q %p cq %p sq %p" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" |