diff options
author | Anthony Liguori <aliguori@amazon.com> | 2014-01-24 15:43:30 -0800 |
---|---|---|
committer | Anthony Liguori <aliguori@amazon.com> | 2014-01-24 15:43:30 -0800 |
commit | 0d688cf7d8d71bce2aab83173552a784e96b6729 (patch) | |
tree | 543f7cc92a36e0157cf35e474ee4eb74b8f6156e /block.c | |
parent | 732c66ce641c69702a7e7fdb73b68f0c1b583ab5 (diff) | |
parent | d5103588aa39157c8eea3bb5fb6780bbd8be21b7 (diff) | |
download | qemu-0d688cf7d8d71bce2aab83173552a784e96b6729.zip qemu-0d688cf7d8d71bce2aab83173552a784e96b6729.tar.gz qemu-0d688cf7d8d71bce2aab83173552a784e96b6729.tar.bz2 |
Merge remote-tracking branch 'kwolf/tags/for-anthony' into staging
Block patches
# gpg: Signature made Fri 24 Jan 2014 08:40:53 AM PST using RSA key ID C88F2FD6
# gpg: Can't check signature: public key not found
* kwolf/tags/for-anthony: (93 commits)
block: Switch bdrv_io_limits_intercept() to byte granularity
qemu-iotests: Test pwritev RMW logic
qemu-io: New command 'sleep'
blkdebug: Make required alignment configurable
iscsi: Set bs->request_alignment
block: Make bdrv_pwrite() a bdrv_prwv_co() wrapper
block: Make bdrv_pread() a bdrv_prwv_co() wrapper
block: Change coroutine wrapper to byte granularity
block: Assert serialisation assumptions in pwritev
block: Align requests in bdrv_co_do_pwritev()
block: Allow wait_serialising_requests() at any point
block: Make overlap range for serialisation dynamic
block: Generalise and optimise COR serialisation
block: Make zero-after-EOF work with larger alignment
block: Allow waiting for overlapping requests between begin/end
block: Switch BdrvTrackedRequest to byte granularity
block: Introduce bdrv_co_do_pwritev()
block: write: Handle COR dependency after I/O throttling
block: Introduce bdrv_aligned_pwritev()
block: Introduce bdrv_co_do_preadv()
...
Message-id: 1390584136-24703-1-git-send-email-kwolf@redhat.com
Signed-off-by: Anthony Liguori <aliguori@amazon.com>
Diffstat (limited to 'block.c')
-rw-r--r-- | block.c | 1024 |
1 file changed, 778 insertions, 246 deletions
@@ -32,6 +32,7 @@ #include "sysemu/sysemu.h" #include "qemu/notify.h" #include "block/coroutine.h" +#include "block/qapi.h" #include "qmp-commands.h" #include "qemu/timer.h" @@ -69,11 +70,11 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, @@ -90,6 +91,9 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); +static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = + QTAILQ_HEAD_INITIALIZER(graph_bdrv_states); + static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); @@ -188,7 +192,7 @@ void bdrv_io_limits_enable(BlockDriverState *bs) * @is_write: is the IO a write */ static void bdrv_io_limits_intercept(BlockDriverState *bs, - int nb_sectors, + unsigned int bytes, bool is_write) { /* does this io must wait */ @@ -201,9 +205,8 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs, } /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, - is_write, - nb_sectors * BDRV_SECTOR_SIZE); + throttle_account(&bs->throttle_state, is_write, bytes); + /* if the next request must wait -> do nothing */ if 
(throttle_schedule_timer(&bs->throttle_state, is_write)) { @@ -214,6 +217,16 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs, qemu_co_queue_next(&bs->throttled_reqs[is_write]); } +size_t bdrv_opt_mem_align(BlockDriverState *bs) +{ + if (!bs || !bs->drv) { + /* 4k should be on the safe side */ + return 4096; + } + + return bs->bl.opt_mem_alignment; +} + /* check if the path starts with "<protocol>:" */ static int path_has_protocol(const char *path) { @@ -327,7 +340,7 @@ BlockDriverState *bdrv_new(const char *device_name) QLIST_INIT(&bs->dirty_bitmaps); pstrcpy(bs->device_name, sizeof(bs->device_name), device_name); if (device_name[0] != '\0') { - QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); + QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list); } bdrv_iostatus_disable(bs); notifier_list_init(&bs->close_notifiers); @@ -479,6 +492,43 @@ int bdrv_create_file(const char* filename, QEMUOptionParameter *options, return ret; } +int bdrv_refresh_limits(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return 0; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file); + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.opt_mem_alignment = 512; + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd); + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + return drv->bdrv_refresh_limits(bs); + } + + return 0; +} + /* * Create a uniquely-named empty temporary file. * Return 0 upon success, otherwise a negative errno value. 
@@ -732,6 +782,33 @@ static int bdrv_open_flags(BlockDriverState *bs, int flags) return open_flags; } +static int bdrv_assign_node_name(BlockDriverState *bs, + const char *node_name, + Error **errp) +{ + if (!node_name) { + return 0; + } + + /* empty string node name is invalid */ + if (node_name[0] == '\0') { + error_setg(errp, "Empty node name"); + return -EINVAL; + } + + /* takes care of avoiding duplicates node names */ + if (bdrv_find_node(node_name)) { + error_setg(errp, "Duplicate node name"); + return -EINVAL; + } + + /* copy node name into the bs and insert it into the graph list */ + pstrcpy(bs->node_name, sizeof(bs->node_name), node_name); + QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list); + + return 0; +} + /* * Common part for opening disk images and files * @@ -742,6 +819,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, { int ret, open_flags; const char *filename; + const char *node_name = NULL; Error *local_err = NULL; assert(drv != NULL); @@ -756,6 +834,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name); + node_name = qdict_get_try_str(options, "node-name"); + ret = bdrv_assign_node_name(bs, node_name, errp); + if (ret < 0) { + return ret; + } + qdict_del(options, "node-name"); + /* bdrv_open() with directly using a protocol as drv. This layer is already * opened, so assign it to bs (while file becomes a closed BlockDriverState) * and return immediately. 
*/ @@ -765,7 +850,8 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } bs->open_flags = flags; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; + bs->request_alignment = 512; bs->zero_beyond_eof = true; open_flags = bdrv_open_flags(bs, flags); bs->read_only = !(open_flags & BDRV_O_RDWR); @@ -833,6 +919,10 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, goto free_and_fail; } + bdrv_refresh_limits(bs); + assert(bdrv_opt_mem_align(bs) != 0); + assert(bs->request_alignment != 0); + #ifndef _WIN32 if (bs->is_temporary) { assert(bs->filename[0] != '\0'); @@ -858,9 +948,10 @@ free_and_fail: * dictionary, it needs to use QINCREF() before calling bdrv_file_open. */ int bdrv_file_open(BlockDriverState **pbs, const char *filename, - QDict *options, int flags, Error **errp) + const char *reference, QDict *options, int flags, + Error **errp) { - BlockDriverState *bs; + BlockDriverState *bs = NULL; BlockDriver *drv; const char *drvname; bool allow_protocol_prefix = false; @@ -872,6 +963,24 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, options = qdict_new(); } + if (reference) { + if (filename || qdict_size(options)) { + error_setg(errp, "Cannot reference an existing block device with " + "additional options or a new filename"); + return -EINVAL; + } + QDECREF(options); + + bs = bdrv_find(reference); + if (!bs) { + error_setg(errp, "Cannot find block device '%s'", reference); + return -ENODEV; + } + bdrv_ref(bs); + *pbs = bs; + return 0; + } + bs = bdrv_new(""); bs->options = options; options = qdict_clone_shallow(options); @@ -929,14 +1038,19 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, goto fail; } - ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err); + if (!drv->bdrv_file_open) { + ret = bdrv_open(bs, filename, options, flags, drv, &local_err); + options = NULL; + } else { + ret = bdrv_open_common(bs, NULL, options, flags, drv, &local_err); + } if 
(ret < 0) { error_propagate(errp, local_err); goto fail; } /* Check if any unknown options were used */ - if (qdict_size(options) != 0) { + if (options && (qdict_size(options) != 0)) { const QDictEntry *entry = qdict_first(options); error_setg(errp, "Block protocol '%s' doesn't support the option '%s'", drv->format_name, entry->key); @@ -1016,12 +1130,92 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) error_free(local_err); return ret; } - pstrcpy(bs->backing_file, sizeof(bs->backing_file), - bs->backing_hd->file->filename); + + if (bs->backing_hd->file) { + pstrcpy(bs->backing_file, sizeof(bs->backing_file), + bs->backing_hd->file->filename); + } + + /* Recalculate the BlockLimits with the backing file */ + bdrv_refresh_limits(bs); + return 0; } /* + * Opens a disk image whose options are given as BlockdevRef in another block + * device's options. + * + * If force_raw is true, bdrv_file_open() will be used, thereby preventing any + * image format auto-detection. If it is false and a filename is given, + * bdrv_open() will be used for auto-detection. + * + * If allow_none is true, no image will be opened if filename is false and no + * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned. + * + * bdrev_key specifies the key for the image's BlockdevRef in the options QDict. + * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict + * itself, all options starting with "${bdref_key}." are considered part of the + * BlockdevRef. + * + * The BlockdevRef will be removed from the options QDict. 
+ */ +int bdrv_open_image(BlockDriverState **pbs, const char *filename, + QDict *options, const char *bdref_key, int flags, + bool force_raw, bool allow_none, Error **errp) +{ + QDict *image_options; + int ret; + char *bdref_key_dot; + const char *reference; + + bdref_key_dot = g_strdup_printf("%s.", bdref_key); + qdict_extract_subqdict(options, &image_options, bdref_key_dot); + g_free(bdref_key_dot); + + reference = qdict_get_try_str(options, bdref_key); + if (!filename && !reference && !qdict_size(image_options)) { + if (allow_none) { + ret = 0; + } else { + error_setg(errp, "A block device must be specified for \"%s\"", + bdref_key); + ret = -EINVAL; + } + goto done; + } + + if (filename && !force_raw) { + /* If a filename is given and the block driver should be detected + automatically (instead of using none), use bdrv_open() in order to do + that auto-detection. */ + BlockDriverState *bs; + + if (reference) { + error_setg(errp, "Cannot reference an existing block device while " + "giving a filename"); + ret = -EINVAL; + goto done; + } + + bs = bdrv_new(""); + ret = bdrv_open(bs, filename, image_options, flags, NULL, errp); + if (ret < 0) { + bdrv_unref(bs); + } else { + *pbs = bs; + } + } else { + ret = bdrv_file_open(pbs, filename, reference, image_options, flags, + errp); + } + +done: + qdict_del(options, bdref_key); + return ret; +} + +/* * Opens a disk image (raw, qcow2, vmdk, ...) * * options is a QDict of options to pass to the block drivers, or NULL for an @@ -1036,7 +1230,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. 
*/ char tmp_filename[PATH_MAX + 1]; BlockDriverState *file = NULL; - QDict *file_options = NULL; const char *drvname; Error *local_err = NULL; @@ -1122,10 +1315,9 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, flags |= BDRV_O_ALLOW_RDWR; } - qdict_extract_subqdict(options, &file_options, "file."); - - ret = bdrv_file_open(&file, filename, file_options, - bdrv_open_flags(bs, flags | BDRV_O_UNMAP), &local_err); + ret = bdrv_open_image(&file, filename, options, "file", + bdrv_open_flags(bs, flags | BDRV_O_UNMAP), true, true, + &local_err); if (ret < 0) { goto fail; } @@ -1143,7 +1335,13 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, } if (!drv) { - ret = find_image_format(file, filename, &drv, &local_err); + if (file) { + ret = find_image_format(file, filename, &drv, &local_err); + } else { + error_setg(errp, "Must specify either driver or file"); + ret = -EINVAL; + goto unlink_and_fail; + } } if (!drv) { @@ -1156,7 +1354,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, goto unlink_and_fail; } - if (bs->file != file) { + if (file && (bs->file != file)) { bdrv_unref(file); file = NULL; } @@ -1427,6 +1625,8 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state) reopen_state->bs->enable_write_cache = !!(reopen_state->flags & BDRV_O_CACHE_WB); reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); + + bdrv_refresh_limits(reopen_state->bs); } /* @@ -1501,7 +1701,7 @@ void bdrv_close_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_close(bs); } } @@ -1530,7 +1730,7 @@ static bool bdrv_requests_pending(BlockDriverState *bs) static bool bdrv_requests_pending_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (bdrv_requests_pending(bs)) { return true; } @@ -1557,7 +1757,7 @@ void bdrv_drain_all(void) 
BlockDriverState *bs; while (busy) { - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_start_throttled_reqs(bs); } @@ -1566,14 +1766,19 @@ void bdrv_drain_all(void) } } -/* make a BlockDriverState anonymous by removing from bdrv_state list. +/* make a BlockDriverState anonymous by removing from bdrv_state and + * graph_bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ void bdrv_make_anon(BlockDriverState *bs) { if (bs->device_name[0] != '\0') { - QTAILQ_REMOVE(&bdrv_states, bs, list); + QTAILQ_REMOVE(&bdrv_states, bs, device_list); } bs->device_name[0] = '\0'; + if (bs->node_name[0] != '\0') { + QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list); + } + bs->node_name[0] = '\0'; } static void bdrv_rebind(BlockDriverState *bs) @@ -1593,7 +1798,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, bs_dest->dev_ops = bs_src->dev_ops; bs_dest->dev_opaque = bs_src->dev_opaque; bs_dest->dev = bs_src->dev; - bs_dest->buffer_alignment = bs_src->buffer_alignment; + bs_dest->guest_block_size = bs_src->guest_block_size; bs_dest->copy_on_read = bs_src->copy_on_read; bs_dest->enable_write_cache = bs_src->enable_write_cache; @@ -1627,7 +1832,12 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, /* keep the same entry in bdrv_states */ pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), bs_src->device_name); - bs_dest->list = bs_src->list; + bs_dest->device_list = bs_src->device_list; + + /* keep the same entry in graph_bdrv_states + * We do want to swap name but don't want to swap linked list entries + */ + bs_dest->node_list = bs_src->node_list; } /* @@ -1745,7 +1955,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev) bs->dev = NULL; bs->dev_ops = NULL; bs->dev_opaque = NULL; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; } /* TODO change to return DeviceState * when all users are qdevified */ @@ -1876,10 +2086,10 @@ int 
bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) int bdrv_commit(BlockDriverState *bs) { BlockDriver *drv = bs->drv; - int64_t sector, total_sectors; + int64_t sector, total_sectors, length, backing_length; int n, ro, open_flags; int ret = 0; - uint8_t *buf; + uint8_t *buf = NULL; char filename[PATH_MAX]; if (!drv) @@ -1904,7 +2114,29 @@ int bdrv_commit(BlockDriverState *bs) } } - total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + length = bdrv_getlength(bs); + if (length < 0) { + ret = length; + goto ro_cleanup; + } + + backing_length = bdrv_getlength(bs->backing_hd); + if (backing_length < 0) { + ret = backing_length; + goto ro_cleanup; + } + + /* If our top snapshot is larger than the backing file image, + * grow the backing file image if possible. If not possible, + * we must return an error */ + if (length > backing_length) { + ret = bdrv_truncate(bs->backing_hd, length); + if (ret < 0) { + goto ro_cleanup; + } + } + + total_sectors = length >> BDRV_SECTOR_BITS; buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); for (sector = 0; sector < total_sectors; sector += n) { @@ -1913,13 +2145,13 @@ int bdrv_commit(BlockDriverState *bs) goto ro_cleanup; } if (ret) { - if (bdrv_read(bs, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_read(bs, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } - if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_write(bs->backing_hd, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } } @@ -1927,6 +2159,9 @@ int bdrv_commit(BlockDriverState *bs) if (drv->bdrv_make_empty) { ret = drv->bdrv_make_empty(bs); + if (ret < 0) { + goto ro_cleanup; + } bdrv_flush(bs); } @@ -1934,9 +2169,11 @@ int bdrv_commit(BlockDriverState *bs) * Make sure all data we wrote to the backing device is actually * stable on disk. 
*/ - if (bs->backing_hd) + if (bs->backing_hd) { bdrv_flush(bs->backing_hd); + } + ret = 0; ro_cleanup: g_free(buf); @@ -1952,7 +2189,7 @@ int bdrv_commit_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (bs->drv && bs->backing_hd) { int ret = bdrv_commit(bs); if (ret < 0) { @@ -1970,6 +2207,10 @@ int bdrv_commit_all(void) */ static void tracked_request_end(BdrvTrackedRequest *req) { + if (req->serialising) { + req->bs->serialising_in_flight--; + } + QLIST_REMOVE(req, list); qemu_co_queue_restart_all(&req->wait_queue); } @@ -1979,15 +2220,18 @@ static void tracked_request_end(BdrvTrackedRequest *req) */ static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, bool is_write) + int64_t offset, + unsigned int bytes, bool is_write) { *req = (BdrvTrackedRequest){ .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .is_write = is_write, - .co = qemu_coroutine_self(), + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, }; qemu_co_queue_init(&req->wait_queue); @@ -1995,6 +2239,21 @@ static void tracked_request_begin(BdrvTrackedRequest *req, QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); } +static void mark_request_serialising(BdrvTrackedRequest *req, size_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + /** * Round a region to cluster boundaries */ @@ -2016,53 +2275,75 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } +static int 
bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t sector_num, int nb_sectors) { + int64_t offset, unsigned int bytes) +{ /* aaaa bbbb */ - if (sector_num >= req->sector_num + req->nb_sectors) { + if (offset >= req->overlap_offset + req->overlap_bytes) { return false; } /* bbbb aaaa */ - if (req->sector_num >= sector_num + nb_sectors) { + if (req->overlap_offset >= offset + bytes) { return false; } return true; } -static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) { + BlockDriverState *bs = self->bs; BdrvTrackedRequest *req; - int64_t cluster_sector_num; - int cluster_nb_sectors; bool retry; + bool waited = false; - /* If we touch the same cluster it counts as an overlap. This guarantees - * that allocating writes will be serialized and not race with each other - * for the same cluster. For example, in copy-on-read it ensures that the - * CoR read and write operations are atomic and guest writes cannot - * interleave between them. - */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + if (!bs->serialising_in_flight) { + return false; + } do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (tracked_request_overlaps(req, cluster_sector_num, - cluster_nb_sectors)) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { /* Hitting this means there was a reentrant request, for * example, a block driver issuing nested requests. This must * never happen since it means deadlock. 
*/ assert(qemu_coroutine_self() != req->co); - qemu_co_queue_wait(&req->wait_queue); - retry = true; - break; + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } } } } while (retry); + + return waited; } /* @@ -2224,6 +2505,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, } new_top_bs->backing_hd = base_bs; + bdrv_refresh_limits(new_top_bs); QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { /* so that bdrv_close() does not recursively close the chain */ @@ -2271,8 +2553,7 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, typedef struct RwCo { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; QEMUIOVector *qiov; bool is_write; int ret; @@ -2284,34 +2565,32 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } /* * Process a vectored synchronous request using coroutines */ -static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { .bs = bs, - .sector_num = 
sector_num, - .nb_sectors = qiov->size >> BDRV_SECTOR_BITS, + .offset = offset, .qiov = qiov, .is_write = is_write, .ret = NOT_DONE, .flags = flags, }; - assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); /** * In sync call context, when the vcpu is blocked, this throttling timer @@ -2350,7 +2629,8 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, }; qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ @@ -2386,11 +2666,6 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) -{ - return bdrv_rwv_co(bs, sector_num, qiov, true, 0); -} - int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { @@ -2440,117 +2715,53 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, - void *buf, int count1) +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; int ret; - count = count1; - /* first read to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len); - count -= len; - if (count == 0) - return count1; - sector_num++; - buf += len; - } - - /* read the sectors "in place" */ - nb_sectors = count >> 
BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) - return ret; - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - buf += len; - count -= len; + if (bytes < 0) { + return -EINVAL; } - /* add data from the last sector */ - if (count > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf, count); + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; } - return count1; + + return bytes; } int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; int ret; - count = qiov->size; - - /* first write to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), - len); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - count -= len; - if (count == 0) - return qiov->size; - sector_num++; - } - - /* write the sectors "in place" */ - nb_sectors = count >> BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - QEMUIOVector qiov_inplace; - - qemu_iovec_init(&qiov_inplace, qiov->niov); - qemu_iovec_concat(&qiov_inplace, qiov, len, - nb_sectors << BDRV_SECTOR_BITS); - ret = bdrv_writev(bs, sector_num, &qiov_inplace); - qemu_iovec_destroy(&qiov_inplace); - if (ret < 0) { - return ret; - } - - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - count -= len; + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; } - /* add data from the last sector */ - if (count > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, qiov->size 
- count, tmp_buf, count); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - } return qiov->size; } int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int count1) + const void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { .iov_base = (void *) buf, - .iov_len = count1, + .iov_len = bytes, }; + if (bytes < 0) { + return -EINVAL; + } + qemu_iovec_init_external(&qiov, &iov, 1); return bdrv_pwritev(bs, offset, &qiov); } @@ -2646,40 +2857,34 @@ err: } /* - * Handle a read request in coroutine context + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. */ -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; int ret; - if (!drv) { - return -ENOMEDIUM; - } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } - if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight++; - } + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, false); + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. 
This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. */ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); } - tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + wait_serialising_requests(req); if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; @@ -2695,6 +2900,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } } + /* Forward the request to the BlockDriver */ if (!(bs->zero_beyond_eof && bs->growable)) { ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); } else { @@ -2708,7 +2914,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); - max_nb_sectors = MAX(0, total_sectors - sector_num); + max_nb_sectors = MAX(0, ROUND_UP(total_sectors - sector_num, + align >> BDRV_SECTOR_BITS)); if (max_nb_sectors > 0) { ret = drv->bdrv_co_readv(bs, sector_num, MIN(nb_sectors, max_nb_sectors), qiov); @@ -2726,15 +2933,95 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } out: + return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (bdrv_check_byte_request(bs, offset, bytes)) { + return -EIO; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if 
(bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); tracked_request_end(&req); - if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight--; + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); } return ret; } +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -2828,46 +3115,37 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, } /* - * Handle a write request in coroutine context + * Forwards an already correctly aligned write request to the BlockDriver. 
*/ -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; + bool waited; int ret; - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, nb_sectors, true); - } + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); if (ret == 0 && !bs->enable_write_cache) { ret = bdrv_co_flush(bs); @@ -2882,11 +3160,143 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); } + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + if (bdrv_check_byte_request(bs, offset, bytes)) { + return -EIO; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. + */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = 
wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? &local_qiov : qiov, + flags); + +fail: tracked_request_end(&req); + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + return ret; } +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -3110,11 +3520,12 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), } } +/* This function is to find block backend bs */ BlockDriverState *bdrv_find(const char *name) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (!strcmp(name, bs->device_name)) { return bs; } @@ -3122,19 +3533,83 @@ BlockDriverState *bdrv_find(const char *name) return NULL; } +/* 
This function is to find a node in the bs graph */ +BlockDriverState *bdrv_find_node(const char *node_name) +{ + BlockDriverState *bs; + + assert(node_name); + + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + if (!strcmp(node_name, bs->node_name)) { + return bs; + } + } + return NULL; +} + +/* Put this QMP function here so it can access the static graph_bdrv_states. */ +BlockDeviceInfoList *bdrv_named_nodes_list(void) +{ + BlockDeviceInfoList *list, *entry; + BlockDriverState *bs; + + list = NULL; + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + entry = g_malloc0(sizeof(*entry)); + entry->value = bdrv_block_device_info(bs); + entry->next = list; + list = entry; + } + + return list; +} + +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp) +{ + BlockDriverState *bs = NULL; + + if ((!device && !node_name) || (device && node_name)) { + error_setg(errp, "Use either device or node-name but not both"); + return NULL; + } + + if (device) { + bs = bdrv_find(device); + + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return NULL; + } + + return bs; + } + + bs = bdrv_find_node(node_name); + + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, node_name); + return NULL; + } + + return bs; +} + BlockDriverState *bdrv_next(BlockDriverState *bs) { if (!bs) { return QTAILQ_FIRST(&bdrv_states); } - return QTAILQ_NEXT(bs, list); + return QTAILQ_NEXT(bs, device_list); } void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { it(opaque, bs); } } @@ -3154,7 +3629,7 @@ int bdrv_flush_all(void) BlockDriverState *bs; int result = 0; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { int ret = bdrv_flush(bs); if (ret < 0 && !result) { result = ret; @@ -4278,7 +4753,7 @@ void bdrv_invalidate_cache_all(void) { BlockDriverState *bs; - 
QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_invalidate_cache(bs); } } @@ -4287,7 +4762,7 @@ void bdrv_clear_incoming_migration_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); } } @@ -4314,9 +4789,15 @@ int bdrv_flush(BlockDriverState *bs) return rwco.ret; } +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; static void coroutine_fn bdrv_discard_co_entry(void *opaque) { - RwCo *rwco = opaque; + DiscardCo *rwco = opaque; rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); } @@ -4400,7 +4881,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { Coroutine *co; - RwCo rwco = { + DiscardCo rwco = { .bs = bs, .sector_num = sector_num, .nb_sectors = nb_sectors, @@ -4505,14 +4986,14 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, return NULL; } -void bdrv_set_buffer_alignment(BlockDriverState *bs, int align) +void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { - bs->buffer_alignment = align; + bs->guest_block_size = align; } void *qemu_blockalign(BlockDriverState *bs, size_t size) { - return qemu_memalign((bs && bs->buffer_alignment) ? 
bs->buffer_alignment : 512, size); + return qemu_memalign(bdrv_opt_mem_align(bs), size); } /* @@ -4521,9 +5002,13 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) { int i; + size_t alignment = bdrv_opt_mem_align(bs); for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { return false; } } @@ -4875,21 +5360,68 @@ int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options) return bs->drv->bdrv_amend_options(bs, options); } -ExtSnapshotPerm bdrv_check_ext_snapshot(BlockDriverState *bs) +/* Used to recurse on single child block filters. + * Single child block filter will store their child in bs->file. + */ +bool bdrv_generic_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) { - if (bs->drv->bdrv_check_ext_snapshot) { - return bs->drv->bdrv_check_ext_snapshot(bs); + if (!bs->drv) { + return false; } - if (bs->file && bs->file->drv && bs->file->drv->bdrv_check_ext_snapshot) { - return bs->file->drv->bdrv_check_ext_snapshot(bs); + if (!bs->drv->authorizations[BS_IS_A_FILTER]) { + if (bs == candidate) { + return true; + } else { + return false; + } + } + + if (!bs->drv->authorizations[BS_FILTER_PASS_DOWN]) { + return false; + } + + if (!bs->file) { + return false; } - /* external snapshots are allowed by default */ - return EXT_SNAPSHOT_ALLOWED; + return bdrv_recurse_is_first_non_filter(bs->file, candidate); } -ExtSnapshotPerm bdrv_check_ext_snapshot_forbidden(BlockDriverState *bs) +bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + if (bs->drv && bs->drv->bdrv_recurse_is_first_non_filter) { + return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate); + } + + return bdrv_generic_is_first_non_filter(bs, candidate); +} + +/* This function checks if 
the candidate is the first non filter bs down it's + * bs chain. Since we don't have pointers to parents it explore all bs chains + * from the top. Some filters can choose not to pass down the recursion. + */ +bool bdrv_is_first_non_filter(BlockDriverState *candidate) { - return EXT_SNAPSHOT_FORBIDDEN; + BlockDriverState *bs; + + /* walk down the bs forest recursively */ + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bool perm; + + if (!bs->file) { + continue; + } + + perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + + /* candidate is the first non filter */ + if (perm) { + return true; + } + } + + return false; } |