From 61007b316cd71ee7333ff7a0a749a8949527575f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 28 Apr 2015 14:27:52 +0100 Subject: block: move I/O request processing to block/io.c The block.c file has grown to over 6000 lines. It is time to split this file so there are fewer conflicts and the code is easier to maintain. Extract I/O request processing code: * Read * Write * Zero writes and making the image empty * Flush * Discard * ioctl * Tracked requests and queuing * Throttling and copy-on-read * Block status and allocated functions * Refreshing block limits * Reading/writing vmstate * qemu_blockalign() and friends The patch simply moves code from block.c into block/io.c. Signed-off-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 3394 +++++++-------------------------------------------- block/Makefile.objs | 2 +- block/io.c | 2540 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 2982 insertions(+), 2954 deletions(-) create mode 100644 block/io.c diff --git a/block.c b/block.c index 954d783..7904098 100644 --- a/block.c +++ b/block.c @@ -30,7 +30,6 @@ #include "qapi/qmp/qjson.h" #include "sysemu/block-backend.h" #include "sysemu/sysemu.h" -#include "sysemu/qtest.h" #include "qemu/notify.h" #include "block/coroutine.h" #include "block/qapi.h" @@ -71,36 +70,6 @@ struct BdrvDirtyBitmap { #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int 
coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); -static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); - static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -134,110 +103,6 @@ int is_windows_drive(const char *filename) } #endif -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) -{ - int i; - - throttle_config(&bs->throttle_state, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; - - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; - } - } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - - bdrv_start_throttled_reqs(bs); - - throttle_destroy(&bs->throttle_state); -} - -static void bdrv_throttle_read_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[0]); -} - -static void bdrv_throttle_write_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[1]); -} - -/* should be called before 
bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs) -{ - int clock_type = QEMU_CLOCK_REALTIME; - - if (qtest_enabled()) { - /* For testing block IO throttling only */ - clock_type = QEMU_CLOCK_VIRTUAL; - } - assert(!bs->io_limits_enabled); - throttle_init(&bs->throttle_state, - bdrv_get_aio_context(bs), - clock_type, - bdrv_throttle_read_timer_cb, - bdrv_throttle_write_timer_cb, - bs); - bs->io_limits_enabled = true; -} - -/* This function makes an IO wait if needed - * - * @nb_sectors: the number of sectors of the IO - * @is_write: is the IO a write - */ -static void bdrv_io_limits_intercept(BlockDriverState *bs, - unsigned int bytes, - bool is_write) -{ - /* does this io must wait */ - bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); - - /* if must wait or any request of this type throttled queue the IO */ - if (must_wait || - !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); - } - - /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, is_write, bytes); - - - /* if the next request must wait -> do nothing */ - if (throttle_schedule_timer(&bs->throttle_state, is_write)) { - return; - } - - /* else queue next request for execution */ - qemu_co_queue_next(&bs->throttled_reqs[is_write]); -} - size_t bdrv_opt_mem_align(BlockDriverState *bs) { if (!bs || !bs->drv) { @@ -349,24 +214,6 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz, dest, sz, errp); } -void bdrv_setup_io_funcs(BlockDriver *bdrv) -{ - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. 
- */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } -} - void bdrv_register(BlockDriver *bdrv) { bdrv_setup_io_funcs(bdrv); @@ -541,54 +388,6 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) return ret; } -void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BlockDriver *drv = bs->drv; - Error *local_err = NULL; - - memset(&bs->bl, 0, sizeof(bs->bl)); - - if (!drv) { - return; - } - - /* Take some limits from the children as a default */ - if (bs->file) { - bdrv_refresh_limits(bs->file, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; - bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; - } else { - bs->bl.opt_mem_alignment = 512; - } - - if (bs->backing_hd) { - bdrv_refresh_limits(bs->backing_hd, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing_hd->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing_hd->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing_hd->bl.opt_mem_alignment); - } - - /* Then let the driver override it */ - if (drv->bdrv_refresh_limits) { - drv->bdrv_refresh_limits(bs, errp); - } -} - /** * Try to get @bs's logical and physical block size. * On success, store them in @bsz struct and return 0. @@ -862,22 +661,6 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } -/** - * The copy-on-read flag is actually a reference count so multiple users may - * use the feature without worrying about clobbering its previous state. - * Copy-on-read stays enabled until all users have called to disable it. 
- */ -void bdrv_enable_copy_on_read(BlockDriverState *bs) -{ - bs->copy_on_read++; -} - -void bdrv_disable_copy_on_read(BlockDriverState *bs) -{ - assert(bs->copy_on_read > 0); - bs->copy_on_read--; -} - /* * Returns the flags that a temporary snapshot should get, based on the * originally requested flags (the originally requested image will have flags @@ -1987,108 +1770,6 @@ void bdrv_close_all(void) } } -/* Check if any requests are in-flight (including throttled requests) */ -static bool bdrv_requests_pending(BlockDriverState *bs) -{ - if (!QLIST_EMPTY(&bs->tracked_requests)) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } - if (bs->file && bdrv_requests_pending(bs->file)) { - return true; - } - if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { - return true; - } - return false; -} - -static bool bdrv_drain_one(BlockDriverState *bs) -{ - bool bs_busy; - - bdrv_flush_io_queue(bs); - bdrv_start_throttled_reqs(bs); - bs_busy = bdrv_requests_pending(bs); - bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); - return bs_busy; -} - -/* - * Wait for pending requests to complete on a single BlockDriverState subtree - * - * See the warning in bdrv_drain_all(). This function can only be called if - * you are sure nothing can generate I/O because you have op blockers - * installed. - * - * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState - * AioContext. - */ -void bdrv_drain(BlockDriverState *bs) -{ - while (bdrv_drain_one(bs)) { - /* Keep iterating */ - } -} - -/* - * Wait for pending requests to complete across all BlockDriverStates - * - * This function does not flush data to disk, use bdrv_flush_all() for that - * after calling this function. 
- * - * Note that completion of an asynchronous I/O operation can trigger any - * number of other I/O operations on other devices---for example a coroutine - * can be arbitrarily complex and a constant flow of I/O can come until the - * coroutine is complete. Because of this, it is not possible to have a - * function to drain a single device's I/O queue. - */ -void bdrv_drain_all(void) -{ - /* Always run first iteration so any pending completion BHs run */ - bool busy = true; - BlockDriverState *bs = NULL; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - if (bs->job) { - block_job_pause(bs->job); - } - aio_context_release(aio_context); - } - - while (busy) { - busy = false; - bs = NULL; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - busy |= bdrv_drain_one(bs); - aio_context_release(aio_context); - } - } - - bs = NULL; - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - if (bs->job) { - block_job_resume(bs->job); - } - aio_context_release(aio_context); - } -} - /* make a BlockDriverState anonymous by removing from bdrv_state and * graph_bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ @@ -2410,152 +2091,6 @@ int bdrv_commit_all(void) return 0; } -/** - * Remove an active request from the tracked requests list - * - * This function should be called when a tracked request is completing. 
- */ -static void tracked_request_end(BdrvTrackedRequest *req) -{ - if (req->serialising) { - req->bs->serialising_in_flight--; - } - - QLIST_REMOVE(req, list); - qemu_co_queue_restart_all(&req->wait_queue); -} - -/** - * Add an active request to the tracked requests list - */ -static void tracked_request_begin(BdrvTrackedRequest *req, - BlockDriverState *bs, - int64_t offset, - unsigned int bytes, bool is_write) -{ - *req = (BdrvTrackedRequest){ - .bs = bs, - .offset = offset, - .bytes = bytes, - .is_write = is_write, - .co = qemu_coroutine_self(), - .serialising = false, - .overlap_offset = offset, - .overlap_bytes = bytes, - }; - - qemu_co_queue_init(&req->wait_queue); - - QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); -} - -static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) -{ - int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - - overlap_offset; - - if (!req->serialising) { - req->bs->serialising_in_flight++; - req->serialising = true; - } - - req->overlap_offset = MIN(req->overlap_offset, overlap_offset); - req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); -} - -/** - * Round a region to cluster boundaries - */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) -{ - BlockDriverInfo bdi; - - if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_sector_num = sector_num; - *cluster_nb_sectors = nb_sectors; - } else { - int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; - *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); - *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + - nb_sectors, c); - } -} - -static int bdrv_get_cluster_size(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - int ret; - - ret = bdrv_get_info(bs, &bdi); - if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; - } else { - 
return bdi.cluster_size; - } -} - -static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) -{ - /* aaaa bbbb */ - if (offset >= req->overlap_offset + req->overlap_bytes) { - return false; - } - /* bbbb aaaa */ - if (req->overlap_offset >= offset + bytes) { - return false; - } - return true; -} - -static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) -{ - BlockDriverState *bs = self->bs; - BdrvTrackedRequest *req; - bool retry; - bool waited = false; - - if (!bs->serialising_in_flight) { - return false; - } - - do { - retry = false; - QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self || (!req->serialising && !self->serialising)) { - continue; - } - if (tracked_request_overlaps(req, self->overlap_offset, - self->overlap_bytes)) - { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. - */ - assert(qemu_coroutine_self() != req->co); - - /* If the request is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). 
*/ - if (!req->waiting_for) { - self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); - self->waiting_for = NULL; - retry = true; - waited = true; - break; - } - } - } - } while (retry); - - return waited; -} - /* * Return values: * 0 - success @@ -2724,1127 +2259,254 @@ exit: return ret; } - -static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, - size_t size) +/** + * Truncate file to 'offset' bytes (needed only for file protocols) + */ +int bdrv_truncate(BlockDriverState *bs, int64_t offset) { - if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { - return -EIO; - } - - if (!bdrv_is_inserted(bs)) { + BlockDriver *drv = bs->drv; + int ret; + if (!drv) return -ENOMEDIUM; - } + if (!drv->bdrv_truncate) + return -ENOTSUP; + if (bs->read_only) + return -EACCES; - if (offset < 0) { - return -EIO; + ret = drv->bdrv_truncate(bs, offset); + if (ret == 0) { + ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + bdrv_dirty_bitmap_truncate(bs); + if (bs->blk) { + blk_dev_resize_cb(bs->blk); + } } - - return 0; + return ret; } -static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) +/** + * Length of a allocated file in bytes. Sparse files are counted by actual + * allocated space. Return < 0 if error or unknown. 
+ */ +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) { - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EIO; + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; } - - return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE); -} - -typedef struct RwCo { - BlockDriverState *bs; - int64_t offset; - QEMUIOVector *qiov; - bool is_write; - int ret; - BdrvRequestFlags flags; -} RwCo; - -static void coroutine_fn bdrv_rw_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); + if (drv->bdrv_get_allocated_file_size) { + return drv->bdrv_get_allocated_file_size(bs); + } + if (bs->file) { + return bdrv_get_allocated_file_size(bs->file); } + return -ENOTSUP; } -/* - * Process a vectored synchronous request using coroutines +/** + * Return number of sectors on success, -errno on error. */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) +int64_t bdrv_nb_sectors(BlockDriverState *bs) { - Coroutine *co; - RwCo rwco = { - .bs = bs, - .offset = offset, - .qiov = qiov, - .is_write = is_write, - .ret = NOT_DONE, - .flags = flags, - }; - - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. 
- */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } + BlockDriver *drv = bs->drv; - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_rw_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); + if (!drv) + return -ENOMEDIUM; - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); + if (drv->has_variable_length) { + int ret = refresh_total_sectors(bs, bs->total_sectors); + if (ret < 0) { + return ret; } } - return rwco.ret; + return bs->total_sectors; } -/* - * Process a synchronous request using coroutines +/** + * Return length in bytes on success, -errno on error. + * The length is always a multiple of BDRV_SECTOR_SIZE. */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors, bool is_write, BdrvRequestFlags flags) +int64_t bdrv_getlength(BlockDriverState *bs) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, - &qiov, is_write, flags); -} + int64_t ret = bdrv_nb_sectors(bs); -/* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); + return ret < 0 ? 
ret : ret * BDRV_SECTOR_SIZE; } -/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ -int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) +/* return 0 as number of sectors if no device present or error */ +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) { - bool enabled; - int ret; + int64_t nb_sectors = bdrv_nb_sectors(bs); - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; + *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; } -/* Return < 0 if error. Important errors are: - -EIO generic I/O error (may happen for all errors) - -ENOMEDIUM No media inserted. - -EINVAL Invalid sector number or nb_sectors - -EACCES Trying to write a read-only device -*/ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, + BlockdevOnError on_write_error) { - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); + bs->on_read_error = on_read_error; + bs->on_write_error = on_write_error; } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); + return is_read ? bs->on_read_error : bs->on_write_error; } -/* - * Completely zero out a block device with the help of bdrv_write_zeroes. - * The operation is sped up by checking the block status and only writing - * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). - * - * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
- */ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) { - int64_t target_sectors, ret, nb_sectors, sector_num = 0; - int n; - - target_sectors = bdrv_nb_sectors(bs); - if (target_sectors < 0) { - return target_sectors; - } + BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; - for (;;) { - nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); - if (nb_sectors <= 0) { - return 0; - } - ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); - if (ret < 0) { - error_report("error getting block status at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - if (ret & BDRV_BLOCK_ZERO) { - sector_num += n; - continue; - } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); - if (ret < 0) { - error_report("error writing zeroes at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - sector_num += n; + switch (on_err) { + case BLOCKDEV_ON_ERROR_ENOSPC: + return (error == ENOSPC) ? + BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_STOP: + return BLOCK_ERROR_ACTION_STOP; + case BLOCKDEV_ON_ERROR_REPORT: + return BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_IGNORE: + return BLOCK_ERROR_ACTION_IGNORE; + default: + abort(); } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +static void send_qmp_error_event(BlockDriverState *bs, + BlockErrorAction action, + bool is_read, int error) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = bytes, - }; - int ret; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } + IoOperationType optype; - return bytes; + optype = is_read ? 
IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; + qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, + bdrv_iostatus_is_enabled(bs), + error == ENOSPC, strerror(error), + &error_abort); } -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +/* This is done by device models because, while the block layer knows + * about the error, it does not know whether an operation comes from + * the device or the block layer (from a job, for example). + */ +void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, + bool is_read, int error) { - int ret; + assert(error >= 0); - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); - if (ret < 0) { - return ret; - } + if (action == BLOCK_ERROR_ACTION_STOP) { + /* First set the iostatus, so that "info block" returns an iostatus + * that matches the events raised so far (an additional error iostatus + * is fine, but not a lost one). + */ + bdrv_iostatus_set_err(bs, error); - return qiov->size; + /* Then raise the request to stop the VM and the event. + * qemu_system_vmstop_request_prepare has two effects. First, + * it ensures that the STOP event always comes after the + * BLOCK_IO_ERROR event. Second, it ensures that even if management + * can observe the STOP event and do a "cont" before the STOP + * event is issued, the VM will not stop. In this case, vm_start() + * also ensures that the STOP/RESUME pair of events is emitted. 
+ */ + qemu_system_vmstop_request_prepare(); + send_qmp_error_event(bs, action, is_read, error); + qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else { + send_qmp_error_event(bs, action, is_read, error); + } } -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) +int bdrv_is_read_only(BlockDriverState *bs) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = bytes, - }; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); + return bs->read_only; } -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. - */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) +int bdrv_is_sg(BlockDriverState *bs) { - int ret; - - ret = bdrv_pwrite(bs, offset, buf, count); - if (ret < 0) { - return ret; - } - - /* No flush needed for cache modes that already do it */ - if (bs->enable_write_cache) { - bdrv_flush(bs); - } - - return 0; + return bs->sg; } -static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +int bdrv_enable_write_cache(BlockDriverState *bs) { - /* Perform I/O through a temporary buffer so that users who scribble over - * their read buffer while the operation is in progress do not end up - * modifying the image file. This is critical for zero-copy guest I/O - * where anything might happen inside guest memory. - */ - void *bounce_buffer; - - BlockDriver *drv = bs->drv; - struct iovec iov; - QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; - size_t skip_bytes; - int ret; - - /* Cover entire cluster so no additional backing file I/O is required when - * allocating cluster in the image file. 
- */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); - - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); - - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); - if (bounce_buffer == NULL) { - ret = -ENOMEM; - goto err; - } - - qemu_iovec_init_external(&bounce_qiov, &iov, 1); - - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - if (ret < 0) { - goto err; - } - - if (drv->bdrv_co_write_zeroes && - buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); - } else { - /* This does not change the data on the disk, it is not necessary - * to flush even in cache=writethrough mode. - */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - } - - if (ret < 0) { - /* It might be okay to ignore write errors for guest requests. If this - * is a deliberate copy-on-read then we don't want to ignore the error. - * Simply report it in all cases. - */ - goto err; - } - - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); - -err: - qemu_vfree(bounce_buffer); - return ret; + return bs->enable_write_cache; } -/* - * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. 
- */ -static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - int64_t align, QEMUIOVector *qiov, int flags) +void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) { - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - /* Handle Copy on Read and associated serialisation */ - if (flags & BDRV_REQ_COPY_ON_READ) { - /* If we touch the same cluster it counts as an overlap. This - * guarantees that allocating writes will be serialized and not race - * with each other for the same cluster. For example, in copy-on-read - * it ensures that the CoR read and write operations are atomic and - * guest writes cannot interleave between them. */ - mark_request_serialising(req, bdrv_get_cluster_size(bs)); - } - - wait_serialising_requests(req); - - if (flags & BDRV_REQ_COPY_ON_READ) { - int pnum; - - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); - if (ret < 0) { - goto out; - } - - if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); - goto out; - } - } + bs->enable_write_cache = wce; - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + /* so a reopen() will preserve wce */ + if (wce) { + bs->open_flags |= BDRV_O_CACHE_WB; } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } - - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if 
(max_nb_sectors > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); - - qemu_iovec_destroy(&local_qiov); - } else { - ret = 0; - } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num + nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); - } + bs->open_flags &= ~BDRV_O_CACHE_WB; } - -out: - return ret; } -static inline uint64_t bdrv_get_align(BlockDriverState *bs) +int bdrv_is_encrypted(BlockDriverState *bs) { - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + if (bs->backing_hd && bs->backing_hd->encrypted) + return 1; + return bs->encrypted; } -static inline bool bdrv_req_is_aligned(BlockDriverState *bs, - int64_t offset, size_t bytes) +int bdrv_key_required(BlockDriverState *bs) { - int64_t align = bdrv_get_align(bs); - return !(offset & (align - 1) || (bytes & (align - 1))); + BlockDriverState *backing_hd = bs->backing_hd; + + if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) + return 1; + return (bs->encrypted && !bs->valid_key); } -/* - * Handle a read request in coroutine context - */ -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) +int bdrv_set_key(BlockDriverState *bs, const char *key) { - BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; - - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; int ret; - - if (!drv) { + if (bs->backing_hd && bs->backing_hd->encrypted) 
{ + ret = bdrv_set_key(bs->backing_hd, key); + if (ret < 0) + return ret; + if (!bs->encrypted) + return 0; + } + if (!bs->encrypted) { + return -EINVAL; + } else if (!bs->drv || !bs->drv->bdrv_set_key) { return -ENOMEDIUM; } - - ret = bdrv_check_byte_request(bs, offset, bytes); + ret = bs->drv->bdrv_set_key(bs, key); if (ret < 0) { - return ret; - } - - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, false); + bs->valid_key = 0; + } else if (!bs->valid_key) { + bs->valid_key = 1; + if (bs->blk) { + /* call the change callback now, we skipped it on open */ + blk_dev_change_media_cb(bs->blk, true); + } } - - /* Align read if necessary by padding qiov */ - if (offset & (align - 1)) { - head_buf = qemu_blockalign(bs, align); - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - tail_buf = qemu_blockalign(bs, align); - qemu_iovec_add(&local_qiov, tail_buf, - align - ((offset + bytes) & (align - 1))); - - bytes = ROUND_UP(bytes, align); - } - - tracked_request_begin(&req, bs, offset, bytes, false); - ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, - use_local_qiov ? 
&local_qiov : qiov, - flags); - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - } - - return ret; -} - -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - QEMUIOVector qiov; - struct iovec iov = {0}; - int ret = 0; - - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); - - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; - - /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. - */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. 
*/ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } - } - - /* limit request size */ - if (num > max_write_zeroes) { - num = max_write_zeroes; - } - - ret = -ENOTSUP; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); - } - - if (ret == -ENOTSUP) { - /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, - MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; - if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); - if (iov.iov_base == NULL) { - ret = -ENOMEM; - goto fail; - } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); - } - qemu_iovec_init_external(&qiov, &iov, 1); - - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); - - /* Keep bounce buffer around if it is big enough for all - * all future requests. - */ - if (num < max_xfer_len) { - qemu_vfree(iov.iov_base); - iov.iov_base = NULL; - } - } - - sector_num += num; - nb_sectors -= num; - } - -fail: - qemu_vfree(iov.iov_base); - return ret; -} + return ret; +} /* - * Forwards an already correctly aligned write request to the BlockDriver. + * Provide an encryption key for @bs. + * If @key is non-null: + * If @bs is not encrypted, fail. + * Else if the key is invalid, fail. + * Else set @bs's key to @key, replacing the existing key, if any. + * If @key is null: + * If @bs is encrypted and still lacks a key, fail. + * Else do nothing. + * On failure, store an error object through @errp if non-null. 
*/ -static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) +void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) { - BlockDriver *drv = bs->drv; - bool waited; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); - - if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && - qemu_iovec_is_zero(qiov)) { - flags |= BDRV_REQ_ZERO_WRITE; - if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { - flags |= BDRV_REQ_MAY_UNMAP; - } - } - - if (ret < 0) { - /* Do nothing, write notifier decided to fail this request */ - } else if (flags & BDRV_REQ_ZERO_WRITE) { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); - - if (ret == 0 && !bs->enable_write_cache) { - ret = bdrv_co_flush(bs); - } - - bdrv_set_dirty(bs, sector_num, nb_sectors); - - block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); - - if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); - } - - return ret; -} - -/* - * Handle a write request in coroutine context - */ -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, 
QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BdrvTrackedRequest req; - uint64_t align = bdrv_get_align(bs); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, true); - } - - /* - * Align write if necessary by performing a read-modify-write cycle. - * Pad qiov with the read parts and be sure to have a tracked request not - * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. - */ - tracked_request_begin(&req, bs, offset, bytes, true); - - if (offset & (align - 1)) { - QEMUIOVector head_qiov; - struct iovec head_iov; - - mark_request_serialising(&req, align); - wait_serialising_requests(&req); - - head_buf = qemu_blockalign(bs, align); - head_iov = (struct iovec) { - .iov_base = head_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&head_qiov, &head_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); - ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, - align, &head_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - QEMUIOVector tail_qiov; - struct iovec tail_iov; - size_t tail_bytes; - bool waited; - - mark_request_serialising(&req, align); - waited = wait_serialising_requests(&req); - assert(!waited || !use_local_qiov); - - tail_buf = qemu_blockalign(bs, align); - tail_iov = (struct iovec) { - .iov_base = tail_buf, - 
.iov_len = align, - }; - qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); - ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, - align, &tail_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); - - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - - tail_bytes = (offset + bytes) & (align - 1); - qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); - - bytes = ROUND_UP(bytes, align); - } - - if (use_local_qiov) { - /* Local buffer may have non-zero data. */ - flags &= ~BDRV_REQ_ZERO_WRITE; - } - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, - use_local_qiov ? &local_qiov : qiov, - flags); - -fail: - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - } - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - - return ret; -} - -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_writev(bs, sector_num, nb_sectors); - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) -{ - int ret; - - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); - - if (!(bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS)) { 
- ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); - } else { - uint8_t *buf; - QEMUIOVector local_qiov; - size_t bytes = nb_sectors << BDRV_SECTOR_BITS; - - buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes); - memset(buf, 0, bytes); - qemu_iovec_init(&local_qiov, 1); - qemu_iovec_add(&local_qiov, buf, bytes); - - ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov, - BDRV_REQ_ZERO_WRITE | flags); - qemu_vfree(buf); - } - return ret; -} - -/** - * Truncate file to 'offset' bytes (needed only for file protocols) - */ -int bdrv_truncate(BlockDriverState *bs, int64_t offset) -{ - BlockDriver *drv = bs->drv; - int ret; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_truncate) - return -ENOTSUP; - if (bs->read_only) - return -EACCES; - - ret = drv->bdrv_truncate(bs, offset); - if (ret == 0) { - ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); - bdrv_dirty_bitmap_truncate(bs); - if (bs->blk) { - blk_dev_resize_cb(bs->blk); - } - } - return ret; -} - -/** - * Length of a allocated file in bytes. Sparse files are counted by actual - * allocated space. Return < 0 if error or unknown. - */ -int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_get_allocated_file_size) { - return drv->bdrv_get_allocated_file_size(bs); - } - if (bs->file) { - return bdrv_get_allocated_file_size(bs->file); - } - return -ENOTSUP; -} - -/** - * Return number of sectors on success, -errno on error. - */ -int64_t bdrv_nb_sectors(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - - if (!drv) - return -ENOMEDIUM; - - if (drv->has_variable_length) { - int ret = refresh_total_sectors(bs, bs->total_sectors); - if (ret < 0) { - return ret; - } - } - return bs->total_sectors; -} - -/** - * Return length in bytes on success, -errno on error. - * The length is always a multiple of BDRV_SECTOR_SIZE. 
- */ -int64_t bdrv_getlength(BlockDriverState *bs) -{ - int64_t ret = bdrv_nb_sectors(bs); - - return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; -} - -/* return 0 as number of sectors if no device present or error */ -void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) -{ - int64_t nb_sectors = bdrv_nb_sectors(bs); - - *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; -} - -void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, - BlockdevOnError on_write_error) -{ - bs->on_read_error = on_read_error; - bs->on_write_error = on_write_error; -} - -BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) -{ - return is_read ? bs->on_read_error : bs->on_write_error; -} - -BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) -{ - BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; - - switch (on_err) { - case BLOCKDEV_ON_ERROR_ENOSPC: - return (error == ENOSPC) ? - BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_STOP: - return BLOCK_ERROR_ACTION_STOP; - case BLOCKDEV_ON_ERROR_REPORT: - return BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_IGNORE: - return BLOCK_ERROR_ACTION_IGNORE; - default: - abort(); - } -} - -static void send_qmp_error_event(BlockDriverState *bs, - BlockErrorAction action, - bool is_read, int error) -{ - IoOperationType optype; - - optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; - qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, - bdrv_iostatus_is_enabled(bs), - error == ENOSPC, strerror(error), - &error_abort); -} - -/* This is done by device models because, while the block layer knows - * about the error, it does not know whether an operation comes from - * the device or the block layer (from a job, for example). 
- */ -void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, - bool is_read, int error) -{ - assert(error >= 0); - - if (action == BLOCK_ERROR_ACTION_STOP) { - /* First set the iostatus, so that "info block" returns an iostatus - * that matches the events raised so far (an additional error iostatus - * is fine, but not a lost one). - */ - bdrv_iostatus_set_err(bs, error); - - /* Then raise the request to stop the VM and the event. - * qemu_system_vmstop_request_prepare has two effects. First, - * it ensures that the STOP event always comes after the - * BLOCK_IO_ERROR event. Second, it ensures that even if management - * can observe the STOP event and do a "cont" before the STOP - * event is issued, the VM will not stop. In this case, vm_start() - * also ensures that the STOP/RESUME pair of events is emitted. - */ - qemu_system_vmstop_request_prepare(); - send_qmp_error_event(bs, action, is_read, error); - qemu_system_vmstop_request(RUN_STATE_IO_ERROR); - } else { - send_qmp_error_event(bs, action, is_read, error); - } -} - -int bdrv_is_read_only(BlockDriverState *bs) -{ - return bs->read_only; -} - -int bdrv_is_sg(BlockDriverState *bs) -{ - return bs->sg; -} - -int bdrv_enable_write_cache(BlockDriverState *bs) -{ - return bs->enable_write_cache; -} - -void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) -{ - bs->enable_write_cache = wce; - - /* so a reopen() will preserve wce */ - if (wce) { - bs->open_flags |= BDRV_O_CACHE_WB; - } else { - bs->open_flags &= ~BDRV_O_CACHE_WB; - } -} - -int bdrv_is_encrypted(BlockDriverState *bs) -{ - if (bs->backing_hd && bs->backing_hd->encrypted) - return 1; - return bs->encrypted; -} - -int bdrv_key_required(BlockDriverState *bs) -{ - BlockDriverState *backing_hd = bs->backing_hd; - - if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) - return 1; - return (bs->encrypted && !bs->valid_key); -} - -int bdrv_set_key(BlockDriverState *bs, const char *key) -{ - int ret; - if (bs->backing_hd 
&& bs->backing_hd->encrypted) { - ret = bdrv_set_key(bs->backing_hd, key); - if (ret < 0) - return ret; - if (!bs->encrypted) - return 0; - } - if (!bs->encrypted) { - return -EINVAL; - } else if (!bs->drv || !bs->drv->bdrv_set_key) { - return -ENOMEDIUM; - } - ret = bs->drv->bdrv_set_key(bs, key); - if (ret < 0) { - bs->valid_key = 0; - } else if (!bs->valid_key) { - bs->valid_key = 1; - if (bs->blk) { - /* call the change callback now, we skipped it on open */ - blk_dev_change_media_cb(bs->blk, true); - } - } - return ret; -} - -/* - * Provide an encryption key for @bs. - * If @key is non-null: - * If @bs is not encrypted, fail. - * Else if the key is invalid, fail. - * Else set @bs's key to @key, replacing the existing key, if any. - * If @key is null: - * If @bs is encrypted and still lacks a key, fail. - * Else do nothing. - * On failure, store an error object through @errp if non-null. - */ -void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) -{ - if (key) { - if (!bdrv_is_encrypted(bs)) { - error_setg(errp, "Node '%s' is not encrypted", - bdrv_get_device_or_node_name(bs)); - } else if (bdrv_set_key(bs, key) < 0) { - error_set(errp, QERR_INVALID_PASSWORD); + if (key) { + if (!bdrv_is_encrypted(bs)) { + error_setg(errp, "Node '%s' is not encrypted", + bdrv_get_device_or_node_name(bs)); + } else if (bdrv_set_key(bs, key) < 0) { + error_set(errp, QERR_INVALID_PASSWORD); } } else { if (bdrv_key_required(bs)) { @@ -3856,1334 +2518,409 @@ void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp) } } -const char *bdrv_get_format_name(BlockDriverState *bs) -{ - return bs->drv ? 
bs->drv->format_name : NULL; -} - -static int qsort_strcmp(const void *a, const void *b) -{ - return strcmp(a, b); -} - -void bdrv_iterate_format(void (*it)(void *opaque, const char *name), - void *opaque) -{ - BlockDriver *drv; - int count = 0; - int i; - const char **formats = NULL; - - QLIST_FOREACH(drv, &bdrv_drivers, list) { - if (drv->format_name) { - bool found = false; - int i = count; - while (formats && i && !found) { - found = !strcmp(formats[--i], drv->format_name); - } - - if (!found) { - formats = g_renew(const char *, formats, count + 1); - formats[count++] = drv->format_name; - } - } - } - - qsort(formats, count, sizeof(formats[0]), qsort_strcmp); - - for (i = 0; i < count; i++) { - it(opaque, formats[i]); - } - - g_free(formats); -} - -/* This function is to find a node in the bs graph */ -BlockDriverState *bdrv_find_node(const char *node_name) -{ - BlockDriverState *bs; - - assert(node_name); - - QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { - if (!strcmp(node_name, bs->node_name)) { - return bs; - } - } - return NULL; -} - -/* Put this QMP function here so it can access the static graph_bdrv_states. */ -BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp) -{ - BlockDeviceInfoList *list, *entry; - BlockDriverState *bs; - - list = NULL; - QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { - BlockDeviceInfo *info = bdrv_block_device_info(bs, errp); - if (!info) { - qapi_free_BlockDeviceInfoList(list); - return NULL; - } - entry = g_malloc0(sizeof(*entry)); - entry->value = info; - entry->next = list; - list = entry; - } - - return list; -} - -BlockDriverState *bdrv_lookup_bs(const char *device, - const char *node_name, - Error **errp) -{ - BlockBackend *blk; - BlockDriverState *bs; - - if (device) { - blk = blk_by_name(device); - - if (blk) { - return blk_bs(blk); - } - } - - if (node_name) { - bs = bdrv_find_node(node_name); - - if (bs) { - return bs; - } - } - - error_setg(errp, "Cannot find device=%s nor node_name=%s", - device ? 
device : "", - node_name ? node_name : ""); - return NULL; -} - -/* If 'base' is in the same chain as 'top', return true. Otherwise, - * return false. If either argument is NULL, return false. */ -bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) -{ - while (top && top != base) { - top = top->backing_hd; - } - - return top != NULL; -} - -BlockDriverState *bdrv_next_node(BlockDriverState *bs) -{ - if (!bs) { - return QTAILQ_FIRST(&graph_bdrv_states); - } - return QTAILQ_NEXT(bs, node_list); -} - -BlockDriverState *bdrv_next(BlockDriverState *bs) -{ - if (!bs) { - return QTAILQ_FIRST(&bdrv_states); - } - return QTAILQ_NEXT(bs, device_list); -} - -const char *bdrv_get_node_name(const BlockDriverState *bs) -{ - return bs->node_name; -} - -/* TODO check what callers really want: bs->node_name or blk_name() */ -const char *bdrv_get_device_name(const BlockDriverState *bs) -{ - return bs->blk ? blk_name(bs->blk) : ""; -} - -/* This can be used to identify nodes that might not have a device - * name associated. Since node and device names live in the same - * namespace, the result is unambiguous. The exception is if both are - * absent, then this returns an empty (non-null) string. */ -const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) -{ - return bs->blk ? 
blk_name(bs->blk) : bs->node_name; -} - -int bdrv_get_flags(BlockDriverState *bs) -{ - return bs->open_flags; -} - -int bdrv_flush_all(void) -{ - BlockDriverState *bs = NULL; - int result = 0; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - int ret; - - aio_context_acquire(aio_context); - ret = bdrv_flush(bs); - if (ret < 0 && !result) { - result = ret; - } - aio_context_release(aio_context); - } - - return result; -} - -int bdrv_has_zero_init_1(BlockDriverState *bs) -{ - return 1; -} - -int bdrv_has_zero_init(BlockDriverState *bs) -{ - assert(bs->drv); - - /* If BS is a copy on write image, it is initialized to - the contents of the base image, which may not be zeroes. */ - if (bs->backing_hd) { - return 0; - } - if (bs->drv->bdrv_has_zero_init) { - return bs->drv->bdrv_has_zero_init(bs); - } - - /* safe default */ - return 0; -} - -bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - - if (bs->backing_hd) { - return false; - } - - if (bdrv_get_info(bs, &bdi) == 0) { - return bdi.unallocated_blocks_are_zero; - } - - return false; -} - -bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - - if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { - return false; - } - - if (bdrv_get_info(bs, &bdi) == 0) { - return bdi.can_write_zeroes_with_unmap; - } - - return false; -} - -typedef struct BdrvCoGetBlockStatusData { - BlockDriverState *bs; - BlockDriverState *base; - int64_t sector_num; - int nb_sectors; - int *pnum; - int64_t ret; - bool done; -} BdrvCoGetBlockStatusData; - -/* - * Returns the allocation status of the specified sectors. - * Drivers not implementing the functionality are assumed to not support - * backing files, hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. 
- * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. - */ -static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t total_sectors; - int64_t n; - int64_t ret, ret2; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - return total_sectors; - } - - if (sector_num >= total_sectors) { - *pnum = 0; - return 0; - } - - n = total_sectors - sector_num; - if (n < nb_sectors) { - nb_sectors = n; - } - - if (!bs->drv->bdrv_co_get_block_status) { - *pnum = nb_sectors; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; - if (bs->drv->protocol_name) { - ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); - } - return ret; - } - - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - *pnum = 0; - return ret; - } - - if (ret & BDRV_BLOCK_RAW) { - assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, pnum); - } - - if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { - ret |= BDRV_BLOCK_ALLOCATED; - } - - if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { - if (bdrv_unallocated_blocks_are_zero(bs)) { - ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing_hd) { - BlockDriverState *bs2 = bs->backing_hd; - int64_t nb_sectors2 = bdrv_nb_sectors(bs2); - if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { - ret |= BDRV_BLOCK_ZERO; - } - } - } - - if (bs->file && - (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && - (ret & BDRV_BLOCK_OFFSET_VALID)) { - int file_pnum; - - ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum); - if (ret2 >= 0) { - /* Ignore errors. 
This is just providing extra information, it - * is useful but not necessary. - */ - if (!file_pnum) { - /* !file_pnum indicates an offset at or beyond the EOF; it is - * perfectly valid for the format block driver to point to such - * offsets, so catch it and mark everything as zero */ - ret |= BDRV_BLOCK_ZERO; - } else { - /* Limit request to the range reported by the protocol driver */ - *pnum = file_pnum; - ret |= (ret2 & BDRV_BLOCK_ZERO); - } - } - } - - return ret; -} - -/* Coroutine wrapper for bdrv_get_block_status() */ -static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) -{ - BdrvCoGetBlockStatusData *data = opaque; - BlockDriverState *bs = data->bs; - - data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, - data->pnum); - data->done = true; -} - -/* - * Synchronous wrapper around bdrv_co_get_block_status(). - * - * See bdrv_co_get_block_status() for details. - */ -int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - Coroutine *co; - BdrvCoGetBlockStatusData data = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_get_block_status_co_entry(&data); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_get_block_status_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - aio_poll(aio_context, true); - } - } - return data.ret; -} - -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - return ret; - } - return !!(ret & BDRV_BLOCK_ALLOCATED); -} - -/* - * Given an image chain: ... 
-> [BASE] -> [INTER1] -> [INTER2] -> [TOP] - * - * Return true if the given sector is allocated in any image between - * BASE and TOP (inclusive). BASE can be NULL to check if the given - * sector is allocated in any image of the chain. Return false otherwise. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - */ -int bdrv_is_allocated_above(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *intermediate; - int ret, n = nb_sectors; - - intermediate = top; - while (intermediate && intermediate != base) { - int pnum_inter; - ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); - if (ret < 0) { - return ret; - } else if (ret) { - *pnum = pnum_inter; - return 1; - } - - /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. 
- */ - if (n > pnum_inter && - (intermediate == top || - sector_num + pnum_inter < intermediate->total_sectors)) { - n = pnum_inter; - } - - intermediate = intermediate->backing_hd; - } - - *pnum = n; - return 0; -} - -const char *bdrv_get_encrypted_filename(BlockDriverState *bs) -{ - if (bs->backing_hd && bs->backing_hd->encrypted) - return bs->backing_file; - else if (bs->encrypted) - return bs->filename; - else - return NULL; -} - -void bdrv_get_backing_filename(BlockDriverState *bs, - char *filename, int filename_size) -{ - pstrcpy(filename, filename_size, bs->backing_file); -} - -int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BlockDriver *drv = bs->drv; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - if (!drv->bdrv_write_compressed) { - return -ENOTSUP; - } - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - - return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); -} - -int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_get_info) - return -ENOTSUP; - memset(bdi, 0, sizeof(*bdi)); - return drv->bdrv_get_info(bs, bdi); -} - -ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_get_specific_info) { - return drv->bdrv_get_specific_info(bs); - } - return NULL; -} - -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - BlockDriver *drv = bs->drv; - - if (!drv) { - return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return 
drv->bdrv_save_vmstate(bs, qiov, pos); - } else if (bs->file) { - return bdrv_writev_vmstate(bs->file, qiov, pos); - } - - return -ENOTSUP; -} - -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file, buf, pos, size); - return -ENOTSUP; -} - -void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) -{ - if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { - return; - } - - bs->drv->bdrv_debug_event(bs, event); -} - -int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, - const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { - return bs->drv->bdrv_debug_breakpoint(bs, event, tag); - } - - return -ENOTSUP; -} - -int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { - return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); - } - - return -ENOTSUP; -} - -int bdrv_debug_resume(BlockDriverState *bs, const char *tag) -{ - while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_resume) { - return bs->drv->bdrv_debug_resume(bs, tag); - } - - return -ENOTSUP; -} - -bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { - return bs->drv->bdrv_debug_is_suspended(bs, tag); - } - - return false; -} - -int bdrv_is_snapshot(BlockDriverState *bs) -{ - return !!(bs->open_flags & BDRV_O_SNAPSHOT); -} - -/* backing_file can either be 
relative, or absolute, or a protocol. If it is - * relative, it must be relative to the chain. So, passing in bs->filename - * from a BDS as backing_file should not be done, as that may be relative to - * the CWD rather than the chain. */ -BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, - const char *backing_file) -{ - char *filename_full = NULL; - char *backing_file_full = NULL; - char *filename_tmp = NULL; - int is_protocol = 0; - BlockDriverState *curr_bs = NULL; - BlockDriverState *retval = NULL; - - if (!bs || !bs->drv || !backing_file) { - return NULL; - } - - filename_full = g_malloc(PATH_MAX); - backing_file_full = g_malloc(PATH_MAX); - filename_tmp = g_malloc(PATH_MAX); - - is_protocol = path_has_protocol(backing_file); - - for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { - - /* If either of the filename paths is actually a protocol, then - * compare unmodified paths; otherwise make paths relative */ - if (is_protocol || path_has_protocol(curr_bs->backing_file)) { - if (strcmp(backing_file, curr_bs->backing_file) == 0) { - retval = curr_bs->backing_hd; - break; - } - } else { - /* If not an absolute filename path, make it relative to the current - * image's filename path */ - path_combine(filename_tmp, PATH_MAX, curr_bs->filename, - backing_file); - - /* We are going to compare absolute pathnames */ - if (!realpath(filename_tmp, filename_full)) { - continue; - } - - /* We need to make sure the backing filename we are comparing against - * is relative to the current image filename (or absolute) */ - path_combine(filename_tmp, PATH_MAX, curr_bs->filename, - curr_bs->backing_file); - - if (!realpath(filename_tmp, backing_file_full)) { - continue; - } - - if (strcmp(backing_file_full, filename_full) == 0) { - retval = curr_bs->backing_hd; - break; - } - } - } - - g_free(filename_full); - g_free(backing_file_full); - g_free(filename_tmp); - return retval; -} - -int bdrv_get_backing_file_depth(BlockDriverState *bs) -{ - if 
(!bs->drv) { - return 0; - } - - if (!bs->backing_hd) { - return 0; - } - - return 1 + bdrv_get_backing_file_depth(bs->backing_hd); -} - -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); -} - -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; - - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); - } - g_free(mcb->callbacks[i].free_qiov); - } -} - -static void multiwrite_cb(void *opaque, int ret) +const char *bdrv_get_format_name(BlockDriverState *bs) { - MultiwriteCB *mcb = opaque; - - trace_multiwrite_cb(mcb, ret); - - if (ret < 0 && !mcb->error) { - mcb->error = ret; - } - - mcb->num_requests--; - if (mcb->num_requests 
== 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } + return bs->drv ? bs->drv->format_name : NULL; } -static int multiwrite_req_compare(const void *a, const void *b) +static int qsort_strcmp(const void *a, const void *b) { - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } + return strcmp(a, b); } -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque) { - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. 
- if (reqs[i].sector <= oldreq_last) { - merge = 1; - } + BlockDriver *drv; + int count = 0; + int i; + const char **formats = NULL; - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { - merge = 0; - } + QLIST_FOREACH(drv, &bdrv_drivers, list) { + if (drv->format_name) { + bool found = false; + int i = count; + while (formats && i && !found) { + found = !strcmp(formats[--i], drv->format_name); + } - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; + if (!found) { + formats = g_renew(const char *, formats, count + 1); + formats[count++] = drv->format_name; + } } + } - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. - size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + qsort(formats, count, sizeof(formats[0]), qsort_strcmp); - // We should need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); + for (i = 0; i < count; i++) { + it(opaque, formats[i]); + } - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + g_free(formats); +} - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } +/* This function is to find a node in the bs graph */ +BlockDriverState *bdrv_find_node(const char *node_name) +{ + BlockDriverState *bs; - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; + assert(node_name); - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - 
reqs[outidx].qiov = reqs[i].qiov; + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + if (!strcmp(node_name, bs->node_name)) { + return bs; } } - - block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); - - return outidx + 1; + return NULL; } -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. - */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +/* Put this QMP function here so it can access the static graph_bdrv_states. 
*/ +BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp) { - MultiwriteCB *mcb; - int i; + BlockDeviceInfoList *list, *entry; + BlockDriverState *bs; - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; + list = NULL; + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + BlockDeviceInfo *info = bdrv_block_device_info(bs, errp); + if (!info) { + qapi_free_BlockDeviceInfoList(list); + return NULL; } - return -1; + entry = g_malloc0(sizeof(*entry)); + entry->value = info; + entry->next = list; + list = entry; } - if (num_reqs == 0) { - return 0; - } + return list; +} - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp) +{ + BlockBackend *blk; + BlockDriverState *bs; - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } + if (device) { + blk = blk_by_name(device); - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + if (blk) { + return blk_bs(blk); + } + } - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + if (node_name) { + bs = bdrv_find_node(node_name); - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); + if (bs) { + return bs; + } } - return 0; + error_setg(errp, "Cannot find device=%s nor node_name=%s", + device ? device : "", + node_name ? node_name : ""); + return NULL; } -void bdrv_aio_cancel(BlockAIOCB *acb) +/* If 'base' is in the same chain as 'top', return true. Otherwise, + * return false. If either argument is NULL, return false. 
*/ +bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) { - qemu_aio_ref(acb); - bdrv_aio_cancel_async(acb); - while (acb->refcnt > 1) { - if (acb->aiocb_info->get_aio_context) { - aio_poll(acb->aiocb_info->get_aio_context(acb), true); - } else if (acb->bs) { - aio_poll(bdrv_get_aio_context(acb->bs), true); - } else { - abort(); - } + while (top && top != base) { + top = top->backing_hd; } - qemu_aio_unref(acb); + + return top != NULL; } -/* Async version of aio cancel. The caller is not blocked if the acb implements - * cancel_async, otherwise we do nothing and let the request normally complete. - * In either case the completion callback must be called. */ -void bdrv_aio_cancel_async(BlockAIOCB *acb) +BlockDriverState *bdrv_next_node(BlockDriverState *bs) { - if (acb->aiocb_info->cancel_async) { - acb->aiocb_info->cancel_async(acb); + if (!bs) { + return QTAILQ_FIRST(&graph_bdrv_states); } + return QTAILQ_NEXT(bs, node_list); } -/**************************************************************/ -/* async block device emulation */ - -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) +BlockDriverState *bdrv_next(BlockDriverState *bs) { - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + if (!bs) { + return QTAILQ_FIRST(&bdrv_states); } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); + return QTAILQ_NEXT(bs, device_list); } -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int 
is_write) - +const char *bdrv_get_node_name(const BlockDriverState *bs) { - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; + return bs->node_name; } -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +/* TODO check what callers really want: bs->node_name or blk_name() */ +const char *bdrv_get_device_name(const BlockDriverState *bs) { - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + return bs->blk ? blk_name(bs->blk) : ""; } -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +/* This can be used to identify nodes that might not have a device + * name associated. Since node and device names live in the same + * namespace, the result is unambiguous. The exception is if both are + * absent, then this returns an empty (non-null) string. */ +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs) { - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); + return bs->blk ? 
blk_name(bs->blk) : bs->node_name; } - -typedef struct BlockAIOCBCoroutine { - BlockAIOCB common; - BlockRequest req; - bool is_write; - bool need_bh; - bool *done; - QEMUBH* bh; -} BlockAIOCBCoroutine; - -static const AIOCBInfo bdrv_em_co_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBCoroutine), -}; - -static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +int bdrv_get_flags(BlockDriverState *bs) { - if (!acb->need_bh) { - acb->common.cb(acb->common.opaque, acb->req.error); - qemu_aio_unref(acb); - } + return bs->open_flags; } -static void bdrv_co_em_bh(void *opaque) +int bdrv_has_zero_init_1(BlockDriverState *bs) { - BlockAIOCBCoroutine *acb = opaque; - - assert(!acb->need_bh); - qemu_bh_delete(acb->bh); - bdrv_co_complete(acb); + return 1; } -static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +int bdrv_has_zero_init(BlockDriverState *bs) { - acb->need_bh = false; - if (acb->req.error != -EINPROGRESS) { - BlockDriverState *bs = acb->common.bs; + assert(bs->drv); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + /* If BS is a copy on write image, it is initialized to + the contents of the base image, which may not be zeroes. 
*/ + if (bs->backing_hd) { + return 0; + } + if (bs->drv->bdrv_has_zero_init) { + return bs->drv->bdrv_has_zero_init(bs); } + + /* safe default */ + return 0; } -/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ -static void coroutine_fn bdrv_co_do_rw(void *opaque) +bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; + BlockDriverInfo bdi; - if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + if (bs->backing_hd) { + return false; + } + + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.unallocated_blocks_are_zero; } - bdrv_co_complete(acb); + return false; } -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) { - Coroutine *co; - BlockAIOCBCoroutine *acb; + BlockDriverInfo bdi; - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - acb->req.qiov = qiov; - acb->req.flags = flags; - acb->is_write = is_write; + if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { + return false; + } - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.can_write_zeroes_with_unmap; + } - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; + return false; } -static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +const char *bdrv_get_encrypted_filename(BlockDriverState *bs) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - 
acb->req.error = bdrv_co_flush(bs); - bdrv_co_complete(acb); + if (bs->backing_hd && bs->backing_hd->encrypted) + return bs->backing_file; + else if (bs->encrypted) + return bs->filename; + else + return NULL; } -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +void bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size) { - trace_bdrv_aio_flush(bs, opaque); - - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; + pstrcpy(filename, filename_size, bs->backing_file); +} - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_get_info) + return -ENOTSUP; + memset(bdi, 0, sizeof(*bdi)); + return drv->bdrv_get_info(bs, bdi); +} - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; +ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_get_specific_info) { + return drv->bdrv_get_specific_info(bs); + } + return NULL; } -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; + if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { + return; + } - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); - bdrv_co_complete(acb); + bs->drv->bdrv_debug_event(bs, event); } -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag) { - Coroutine *co; - BlockAIOCBCoroutine *acb; - - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, 
opaque); + while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { + bs = bs->file; + } - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); + if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { + return bs->drv->bdrv_debug_breakpoint(bs, event, tag); + } - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; + return -ENOTSUP; } -void bdrv_init(void) +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) { - module_call_init(MODULE_INIT_BLOCK); -} + while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { + bs = bs->file; + } -void bdrv_init_with_whitelist(void) -{ - use_bdrv_whitelist = 1; - bdrv_init(); + if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { + return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); + } + + return -ENOTSUP; } -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +int bdrv_debug_resume(BlockDriverState *bs, const char *tag) { - BlockAIOCB *acb; + while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { + bs = bs->file; + } - acb = g_slice_alloc(aiocb_info->aiocb_size); - acb->aiocb_info = aiocb_info; - acb->bs = bs; - acb->cb = cb; - acb->opaque = opaque; - acb->refcnt = 1; - return acb; -} + if (bs && bs->drv && bs->drv->bdrv_debug_resume) { + return bs->drv->bdrv_debug_resume(bs, tag); + } -void qemu_aio_ref(void *p) -{ - BlockAIOCB *acb = p; - acb->refcnt++; + return -ENOTSUP; } -void qemu_aio_unref(void *p) +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) { - BlockAIOCB *acb = p; - assert(acb->refcnt > 0); - if (--acb->refcnt == 0) { - g_slice_free1(acb->aiocb_info->aiocb_size, acb); + while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { + bs = bs->file; } -} 
-/**************************************************************/ -/* Coroutine block device emulation */ + if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { + return bs->drv->bdrv_debug_is_suspended(bs, tag); + } -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; - int ret; -} CoroutineIOCompletion; + return false; +} -static void bdrv_co_io_em_complete(void *opaque, int ret) +int bdrv_is_snapshot(BlockDriverState *bs) { - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); + return !!(bs->open_flags & BDRV_O_SNAPSHOT); } -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) +/* backing_file can either be relative, or absolute, or a protocol. If it is + * relative, it must be relative to the chain. So, passing in bs->filename + * from a BDS as backing_file should not be done, as that may be relative to + * the CWD rather than the chain. */ +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file) { - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; + char *filename_full = NULL; + char *backing_file_full = NULL; + char *filename_tmp = NULL; + int is_protocol = 0; + BlockDriverState *curr_bs = NULL; + BlockDriverState *retval = NULL; - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); + if (!bs || !bs->drv || !backing_file) { + return NULL; } - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); + filename_full = g_malloc(PATH_MAX); + backing_file_full = g_malloc(PATH_MAX); + filename_tmp = g_malloc(PATH_MAX); - return co.ret; -} + is_protocol = path_has_protocol(backing_file); -static int coroutine_fn 
bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} + for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} + /* If either of the filename paths is actually a protocol, then + * compare unmodified paths; otherwise make paths relative */ + if (is_protocol || path_has_protocol(curr_bs->backing_file)) { + if (strcmp(backing_file, curr_bs->backing_file) == 0) { + retval = curr_bs->backing_hd; + break; + } + } else { + /* If not an absolute filename path, make it relative to the current + * image's filename path */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + backing_file); -static void coroutine_fn bdrv_flush_co_entry(void *opaque) -{ - RwCo *rwco = opaque; + /* We are going to compare absolute pathnames */ + if (!realpath(filename_tmp, filename_full)) { + continue; + } - rwco->ret = bdrv_co_flush(rwco->bs); -} + /* We need to make sure the backing filename we are comparing against + * is relative to the current image filename (or absolute) */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + curr_bs->backing_file); -int coroutine_fn bdrv_co_flush(BlockDriverState *bs) -{ - int ret; + if (!realpath(filename_tmp, backing_file_full)) { + continue; + } - if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - return 0; + if (strcmp(backing_file_full, filename_full) == 0) { + retval = curr_bs->backing_hd; + break; + } + } } - /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); - if (bs->drv->bdrv_co_flush_to_os) { - ret = bs->drv->bdrv_co_flush_to_os(bs); - if (ret < 0) { - return ret; - } + g_free(filename_full); + g_free(backing_file_full); + 
g_free(filename_tmp); + return retval; +} + +int bdrv_get_backing_file_depth(BlockDriverState *bs) +{ + if (!bs->drv) { + return 0; } - /* But don't actually force it to the disk with cache=unsafe */ - if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; + if (!bs->backing_hd) { + return 0; } - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); - if (bs->drv->bdrv_co_flush_to_disk) { - ret = bs->drv->bdrv_co_flush_to_disk(bs); - } else if (bs->drv->bdrv_aio_flush) { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; + return 1 + bdrv_get_backing_file_depth(bs->backing_hd); +} - acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } else { - /* - * Some block drivers always operate in either writethrough or unsafe - * mode and don't support bdrv_flush therefore. Usually qemu doesn't - * know how the server works (because the behaviour is hardcoded or - * depends on server-side configuration), so we can't ensure that - * everything is safe on disk. Returning an error doesn't work because - * that would break guests even if the server operates in writethrough - * mode. - * - * Let's hope the user knows what he's doing. - */ - ret = 0; - } - if (ret < 0) { - return ret; - } +void bdrv_init(void) +{ + module_call_init(MODULE_INIT_BLOCK); +} - /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH - * in the case of cache=unsafe, so there are no useless flushes. 
- */ -flush_parent: - return bdrv_co_flush(bs->file); +void bdrv_init_with_whitelist(void) +{ + use_bdrv_whitelist = 1; + bdrv_init(); } void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) @@ -5235,143 +2972,6 @@ void bdrv_invalidate_cache_all(Error **errp) } } -int bdrv_flush(BlockDriverState *bs) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - -typedef struct DiscardCo { - BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; - int ret; -} DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) -{ - DiscardCo *rwco = opaque; - - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); -} - -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - int max_discard, ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } else if (bs->read_only) { - return -EROFS; - } - - bdrv_reset_dirty(bs, sector_num, nb_sectors); - - /* Do nothing if disabled. 
*/ - if (!(bs->open_flags & BDRV_O_UNMAP)) { - return 0; - } - - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { - return 0; - } - - max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } - - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } - - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - return -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } - if (ret && ret != -ENOTSUP) { - return ret; - } - - sector_num += num; - nb_sectors -= num; - } - return 0; -} - -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) -{ - Coroutine *co; - DiscardCo rwco = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - /**************************************************************/ /* removable device support */ @@ -5437,87 +3037,11 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked) } } -/* needed for generic scsi interface */ - -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, 
void *buf) -{ - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; -} - -BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) -{ - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); - return NULL; -} - void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { bs->guest_block_size = align; } -void *qemu_blockalign(BlockDriverState *bs, size_t size) -{ - return qemu_memalign(bdrv_opt_mem_align(bs), size); -} - -void *qemu_blockalign0(BlockDriverState *bs, size_t size) -{ - return memset(qemu_blockalign(bs, size), 0, size); -} - -void *qemu_try_blockalign(BlockDriverState *bs, size_t size) -{ - size_t align = bdrv_opt_mem_align(bs); - - /* Ensure that NULL is never returned on success */ - assert(align > 0); - if (size == 0) { - size = align; - } - - return qemu_try_memalign(align, size); -} - -void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) -{ - void *mem = qemu_try_blockalign(bs, size); - - if (mem) { - memset(mem, 0, size); - } - - return mem; -} - -/* - * Check if all memory in this vector is sector aligned. 
- */ -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) -{ - int i; - size_t alignment = bdrv_opt_mem_align(bs); - - for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % alignment) { - return false; - } - if (qiov->iov[i].iov_len % alignment) { - return false; - } - } - - return true; -} - BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { BdrvDirtyBitmap *bm; @@ -6239,12 +3763,6 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs, abort(); } -void bdrv_add_before_write_notifier(BlockDriverState *bs, - NotifierWithReturn *notifier) -{ - notifier_with_return_list_add(&bs->before_write_notifiers, notifier); -} - int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, BlockDriverAmendStatusCB *status_cb) { @@ -6345,36 +3863,6 @@ out: return to_replace_bs; } -void bdrv_io_plug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else if (bs->file) { - bdrv_io_plug(bs->file); - } -} - -void bdrv_io_unplug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file); - } -} - -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file); - } -} - static bool append_open_options(QDict *d, BlockDriverState *bs) { const QDictEntry *entry; diff --git a/block/Makefile.objs b/block/Makefile.objs index 179e71d..0d8c2a4 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -9,7 +9,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o mirror.o +block-obj-y += null.o mirror.o io.o block-obj-y += nbd.o 
nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o diff --git a/block/io.c b/block/io.c new file mode 100644 index 0000000..1ce62c4 --- /dev/null +++ b/block/io.c @@ -0,0 +1,2540 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "trace.h" +#include "sysemu/qtest.h" +#include "block/blockjob.h" +#include "block/block_int.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) +{ + int i; + + throttle_config(&bs->throttle_state, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; + + bs->io_limits_enabled = false; + + for (i = 0; i < 2; i++) { + while 
(qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } + } + + bs->io_limits_enabled = enabled; + + return drained; +} + +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + bdrv_start_throttled_reqs(bs); + + throttle_destroy(&bs->throttle_state); +} + +static void bdrv_throttle_read_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[0]); +} + +static void bdrv_throttle_write_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[1]); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + int clock_type = QEMU_CLOCK_REALTIME; + + if (qtest_enabled()) { + /* For testing block IO throttling only */ + clock_type = QEMU_CLOCK_VIRTUAL; + } + assert(!bs->io_limits_enabled); + throttle_init(&bs->throttle_state, + bdrv_get_aio_context(bs), + clock_type, + bdrv_throttle_read_timer_cb, + bdrv_throttle_write_timer_cb, + bs); + bs->io_limits_enabled = true; +} + +/* This function makes an IO wait if needed + * + * @bytes: the number of bytes of the IO + * @is_write: is the IO a write + */ +static void bdrv_io_limits_intercept(BlockDriverState *bs, + unsigned int bytes, + bool is_write) +{ + /* must this IO wait? */ + bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); + + /* if must wait or any request of this type throttled queue the IO */ + if (must_wait || + !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + } + + /* the IO will be executed, do the accounting */ + throttle_account(&bs->throttle_state, is_write, bytes); + + + /* if the next request must wait -> do nothing */ + if (throttle_schedule_timer(&bs->throttle_state, is_write)) { + return; + } + + /* else queue next request for execution */ + qemu_co_queue_next(&bs->throttled_reqs[is_write]); +} + +void
bdrv_setup_io_funcs(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. + */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } +} + +void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BlockDriver *drv = bs->drv; + Error *local_err = NULL; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.opt_mem_alignment = 512; + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.max_transfer_length = + MIN_NON_ZERO(bs->bl.max_transfer_length, + bs->backing_hd->bl.max_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + drv->bdrv_refresh_limits(bs, errp); + } +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it.
+ */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + if (bs->file && bdrv_requests_pending(bs->file)) { + return true; + } + if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { + return true; + } + return false; +} + +static bool bdrv_drain_one(BlockDriverState *bs) +{ + bool bs_busy; + + bdrv_flush_io_queue(bs); + bdrv_start_throttled_reqs(bs); + bs_busy = bdrv_requests_pending(bs); + bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); + return bs_busy; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * See the warning in bdrv_drain_all(). This function can only be called if + * you are sure nothing can generate I/O because you have op blockers + * installed. + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + */ +void bdrv_drain(BlockDriverState *bs) +{ + while (bdrv_drain_one(bs)) { + /* Keep iterating */ + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + * + * Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a coroutine + * can be arbitrarily complex and a constant flow of I/O can come until the + * coroutine is complete. Because of this, it is not possible to have a + * function to drain a single device's I/O queue. 
+ */ +void bdrv_drain_all(void) +{ + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; + BlockDriverState *bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + } + + while (busy) { + busy = false; + bs = NULL; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + busy |= bdrv_drain_one(bs); + aio_context_release(aio_context); + } + } + + bs = NULL; + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { 
+ req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. 
+ */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) { + return -ENOMEDIUM; + } + + if (offset < 0) { + return -EIO; + } + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function 
has to be disabled here + * if it has been enabled. + */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. 
+ -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ + int64_t target_sectors, ret, nb_sectors, sector_num = 0; + int n; + + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; + } + + for (;;) { + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + if (nb_sectors <= 0) { + return 0; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = 
bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. 
+ */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. 
+ */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. */ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + wait_serialising_requests(req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!bs->zero_beyond_eof) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = 
drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +static inline uint64_t bdrv_get_align(BlockDriverState *bs) +{ + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + return MAX(BDRV_SECTOR_SIZE, bs->request_alignment); +} + +static inline bool bdrv_req_is_aligned(BlockDriverState *bs, + int64_t offset, size_t bytes) +{ + int64_t align = bdrv_get_align(bs); + return !(offset & (align - 1) || (bytes & (align - 1))); +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 
1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov = {0}; + int ret = 0; + + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, + BDRV_REQUEST_MAX_SECTORS); + + while (nb_sectors > 0 && !ret) { + int num = 
nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } + } + + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + MAX_WRITE_ZEROES_BOUNCE_BUFFER); + num = MIN(num, max_xfer_len); + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for + * all future requests. + */ + if (num < max_xfer_len) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } + +fail: + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver.
+ */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + + if (ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + 
uint64_t align = bdrv_get_align(bs); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. + */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + 
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
+                                  align, &tail_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+        if (!use_local_qiov) {
+            qemu_iovec_init(&local_qiov, qiov->niov + 1);
+            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
+            use_local_qiov = true;
+        }
+
+        tail_bytes = (offset + bytes) & (align - 1);
+        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
+
+        bytes = ROUND_UP(bytes, align);
+    }
+
+    if (use_local_qiov) {
+        /* Local buffer may have non-zero data. */
+        flags &= ~BDRV_REQ_ZERO_WRITE;
+    }
+    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
+                               use_local_qiov ? &local_qiov : qiov,
+                               flags);
+
+fail:
+    tracked_request_end(&req);
+
+    if (use_local_qiov) {
+        qemu_iovec_destroy(&local_qiov);
+    }
+    qemu_vfree(head_buf);
+    qemu_vfree(tail_buf);
+
+    return ret;
+}
+
+/*
+ * Sector-based write entry point.
+ *
+ * Rejects a negative sector count or one above BDRV_REQUEST_MAX_SECTORS with
+ * -EINVAL, then converts sectors to bytes and forwards the request to
+ * bdrv_co_do_pwritev().
+ */
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    BdrvRequestFlags flags)
+{
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return -EINVAL;
+    }
+
+    return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS,
+                              nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
+}
+
+/*
+ * Write 'nb_sectors' sectors from 'qiov' starting at 'sector_num'.
+ * Traces the call and issues the write with no request flags set.
+ */
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
+{
+    trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0);
+}
+
+/*
+ * Write zeroes to 'nb_sectors' sectors starting at 'sector_num'.
+ *
+ * BDRV_REQ_MAY_UNMAP is dropped from 'flags' unless the image was opened with
+ * BDRV_O_UNMAP.  BDRV_REQ_ZERO_WRITE is always added.
+ *
+ * A request that bdrv_req_is_aligned() accepts is issued without a data
+ * buffer (NULL qiov).  Otherwise a zero-filled bounce buffer is supplied,
+ * because the read-modify-write padding in bdrv_co_do_pwritev() dereferences
+ * the qiov it is given.
+ *
+ * Returns 0 on success or a negative errno; -EINVAL if 'nb_sectors' is
+ * negative or larger than BDRV_REQUEST_MAX_SECTORS.
+ */
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors,
+                                      BdrvRequestFlags flags)
+{
+    int ret;
+
+    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
+
+    /* Same range check bdrv_co_do_writev() applies; done here as well so that
+     * the byte count below is never computed (or allocated) from an invalid
+     * sector count. */
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return -EINVAL;
+    }
+
+    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+        flags &= ~BDRV_REQ_MAY_UNMAP;
+    }
+    if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
+                            nb_sectors << BDRV_SECTOR_BITS)) {
+        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+                                BDRV_REQ_ZERO_WRITE | flags);
+    } else {
+        uint8_t *buf;
+        QEMUIOVector local_qiov;
+        size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
+
+        buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
+        memset(buf, 0, bytes);
+        qemu_iovec_init(&local_qiov, 1);
+        qemu_iovec_add(&local_qiov, buf, bytes);
+
+        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
+                                BDRV_REQ_ZERO_WRITE | flags);
+        /* qemu_iovec_init() allocated the iovec array; release it before the
+         * buffer it points at. */
+        qemu_iovec_destroy(&local_qiov);
+        qemu_vfree(buf);
+    }
+    return ret;
+}
+
+/*
+ * Flush every BlockDriverState returned by bdrv_next(), holding each one's
+ * AioContext for the duration of its flush.
+ *
+ * A failure does not stop the iteration; the first negative return value seen
+ * is what the function returns.  Returns 0 if every flush succeeded.
+ */
+int bdrv_flush_all(void)
+{
+    BlockDriverState *bs = NULL;
+    int result = 0;
+
+    while ((bs = bdrv_next(bs))) {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+        int ret;
+
+        aio_context_acquire(aio_context);
+        ret = bdrv_flush(bs);
+        if (ret < 0 && !result) {
+            result = ret;
+        }
+        aio_context_release(aio_context);
+    }
+
+    return result;
+}
+
+typedef struct BdrvCoGetBlockStatusData {
+    BlockDriverState *bs;
+    BlockDriverState *base;
+    int64_t sector_num;
+    int nb_sectors;
+    int *pnum;
+    int64_t ret;
+    bool done;
+} BdrvCoGetBlockStatusData;
+
+/*
+ * Returns the allocation status of the specified sectors.
+ * Drivers not implementing the functionality are assumed to not support
+ * backing files, hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; + } + + if (sector_num >= total_sectors) { + *pnum = 0; + return 0; + } + + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } + + if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; + + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. 
+ */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } + + return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status() */ +static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) +{ + BdrvCoGetBlockStatusData *data = opaque; + BlockDriverState *bs = data->bs; + + data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status(). + * + * See bdrv_co_get_block_status() for details. + */ +int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_co_entry(&data); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_get_block_status_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). 
BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = intermediate->backing_hd; + } + + *pnum = n; + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (!drv->bdrv_write_compressed) { + return -ENOTSUP; + } + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return 
-ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if 
(mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. 
If the requests are + // overlapping, drop the last sectors of the first request. + size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We shouldn't need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + // Add tail of first request, if necessary + if (qiov->size < reqs[outidx].qiov->size) { + qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, + reqs[outidx].qiov->size - qiov->size); + } + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. 
+ */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); + } + + return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request normally complete. + * In either case the completion callback must be called. 
*/ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, 
QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool need_bh; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + assert(!acb->need_bh); + qemu_bh_delete(acb->bh); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + bdrv_co_complete(acb); +} + +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = 
qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 
1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + 
BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. 
+     */
+flush_parent:
+    return bdrv_co_flush(bs->file);
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_flush().
+ *
+ * Runs the flush directly when already in coroutine context; otherwise
+ * starts a coroutine and polls the BlockDriverState's AioContext until the
+ * result is no longer NOT_DONE.
+ */
+int bdrv_flush(BlockDriverState *bs)
+{
+    Coroutine *co;
+    RwCo rwco = {
+        .bs = bs,
+        .ret = NOT_DONE,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_flush_co_entry(&rwco);
+    } else {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        co = qemu_coroutine_create(bdrv_flush_co_entry);
+        qemu_coroutine_enter(co, &rwco);
+        while (rwco.ret == NOT_DONE) {
+            aio_poll(aio_context, true);
+        }
+    }
+
+    return rwco.ret;
+}
+
+/* Arguments and result slot handed to bdrv_discard_co_entry() */
+typedef struct DiscardCo {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    int ret;
+} DiscardCo;
+static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+{
+    DiscardCo *rwco = opaque;
+
+    rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+}
+
+/*
+ * Discard 'nb_sectors' sectors starting at 'sector_num'.
+ *
+ * Returns -ENOMEDIUM without a driver, the error from bdrv_check_request()
+ * for an invalid range, and -EROFS on a read-only device.  The dirty state of
+ * the range is reset before anything else is decided.  If the image was not
+ * opened with BDRV_O_UNMAP, or the driver has neither a coroutine nor an AIO
+ * discard callback, the function returns 0 without issuing a request.
+ *
+ * The range is submitted in chunks: a misaligned head is shortened so the
+ * next chunk starts on a bs->bl.discard_alignment boundary, and no chunk
+ * exceeds bs->bl.max_discard (or BDRV_REQUEST_MAX_SECTORS when that is 0).
+ * -ENOTSUP from the driver is ignored; any other non-zero result aborts the
+ * loop and is returned.
+ */
+int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
+                                 int nb_sectors)
+{
+    int max_discard, ret;
+
+    if (!bs->drv) {
+        return -ENOMEDIUM;
+    }
+
+    ret = bdrv_check_request(bs, sector_num, nb_sectors);
+    if (ret < 0) {
+        return ret;
+    } else if (bs->read_only) {
+        return -EROFS;
+    }
+
+    bdrv_reset_dirty(bs, sector_num, nb_sectors);
+
+    /* Do nothing if disabled. */
+    if (!(bs->open_flags & BDRV_O_UNMAP)) {
+        return 0;
+    }
+
+    if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+        return 0;
+    }
+
+    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
+    while (nb_sectors > 0) {
+        int num = nb_sectors;
+
+        /* align request */
+        if (bs->bl.discard_alignment &&
+            num >= bs->bl.discard_alignment &&
+            sector_num % bs->bl.discard_alignment) {
+            if (num > bs->bl.discard_alignment) {
+                num = bs->bl.discard_alignment;
+            }
+            num -= sector_num % bs->bl.discard_alignment;
+        }
+
+        /* limit request size */
+        if (num > max_discard) {
+            num = max_discard;
+        }
+
+        if (bs->drv->bdrv_co_discard) {
+            ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+        } else {
+            BlockAIOCB *acb;
+            CoroutineIOCompletion co = {
+                .coroutine = qemu_coroutine_self(),
+            };
+
+            /* Submit only this chunk ('num'), exactly like the coroutine
+             * path above; passing the whole remainder would bypass the
+             * alignment and size limits computed for this iteration. */
+            acb = bs->drv->bdrv_aio_discard(bs, sector_num, num,
+                                            bdrv_co_io_em_complete, &co);
+            if (acb == NULL) {
+                return -EIO;
+            } else {
+                qemu_coroutine_yield();
+                ret = co.ret;
+            }
+        }
+        if (ret && ret != -ENOTSUP) {
+            return ret;
+        }
+
+        sector_num += num;
+        nb_sectors -= num;
+    }
+    return 0;
+}
+
+/*
+ * Synchronous wrapper around bdrv_co_discard().
+ *
+ * Runs the discard directly when already in coroutine context; otherwise
+ * starts a coroutine and polls the BlockDriverState's AioContext until the
+ * result is no longer NOT_DONE.
+ */
+int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    Coroutine *co;
+    DiscardCo rwco = {
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .ret = NOT_DONE,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_discard_co_entry(&rwco);
+    } else {
+        AioContext *aio_context = bdrv_get_aio_context(bs);
+
+        co = qemu_coroutine_create(bdrv_discard_co_entry);
+        qemu_coroutine_enter(co, &rwco);
+        while (rwco.ret == NOT_DONE) {
+            aio_poll(aio_context, true);
+        }
+    }
+
+    return rwco.ret;
+}
+
+/* needed for generic scsi interface */
+
+/*
+ * Forward an ioctl to the driver's bdrv_ioctl callback.
+ * Returns -ENOTSUP when there is no driver or the driver has no callback.
+ */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BlockDriver *drv = bs->drv;
+
+    if (drv && drv->bdrv_ioctl) {
+        return drv->bdrv_ioctl(bs, req, buf);
+    }
+    return -ENOTSUP;
+}
+
+BlockAIOCB *bdrv_aio_ioctl(BlockDriverState
*bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_aio_ioctl) + return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + return NULL; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ + return qemu_memalign(bdrv_opt_mem_align(bs), size); +} + +void *qemu_blockalign0(BlockDriverState *bs, size_t size) +{ + return memset(qemu_blockalign(bs, size), 0, size); +} + +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +{ + void *mem = qemu_try_blockalign(bs, size); + + if (mem) { + memset(mem, 0, size); + } + + return mem; +} + +/* + * Check if all memory in this vector is sector aligned. + */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_opt_mem_align(bs); + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { + return false; + } + } + + return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + 
drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } +} -- cgit v1.1