diff options
Diffstat (limited to 'block/io.c')
-rw-r--r-- | block/io.c | 162 |
1 files changed, 108 insertions, 54 deletions
@@ -38,10 +38,14 @@ #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "system/replay.h" +#include "qemu/units.h" /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS) +/* Maximum read size for checking if data reads as zero, in bytes */ +#define MAX_ZERO_CHECK_BUFFER (128 * KiB) + static void coroutine_fn GRAPH_RDLOCK bdrv_parent_cb_resize(BlockDriverState *bs); @@ -409,7 +413,6 @@ static void bdrv_do_drained_end(BlockDriverState *bs, BdrvChild *parent) /* At this point, we should be always running in the main loop. */ GLOBAL_STATE_CODE(); assert(bs->quiesce_counter > 0); - GLOBAL_STATE_CODE(); /* Re-enable things in child-to-parent order */ old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter); @@ -2364,10 +2367,8 @@ int bdrv_flush_all(void) * Drivers not implementing the functionality are assumed to not support * backing files, hence all their sectors are reported as allocated. * - * If 'want_zero' is true, the caller is querying for mapping - * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and - * _ZERO where possible; otherwise, the result favors larger 'pnum', - * with a focus on accurate BDRV_BLOCK_ALLOCATED. + * 'mode' serves as a hint as to which results are favored; see the + * BDRV_WANT_* macros for details. * * If 'offset' is beyond the end of the disk image the return value is * BDRV_BLOCK_EOF and 'pnum' is set to 0. @@ -2387,7 +2388,7 @@ int bdrv_flush_all(void) * set to the host mapping and BDS corresponding to the guest offset. */ static int coroutine_fn GRAPH_RDLOCK -bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, +bdrv_co_do_block_status(BlockDriverState *bs, unsigned int mode, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) { @@ -2476,7 +2477,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, local_file = bs; local_map = aligned_offset; } else { - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, + ret = bs->drv->bdrv_co_block_status(bs, mode, aligned_offset, aligned_bytes, pnum, &local_map, &local_file); @@ -2488,10 +2489,10 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, * the cache requires an RCU update, so double check here to avoid * such an update if possible. * - * Check want_zero, because we only want to update the cache when we + * Check mode, because we only want to update the cache when we * have accurate information about what is zero and what is data. */ - if (want_zero && + if (mode == BDRV_WANT_PRECISE && ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && QLIST_EMPTY(&bs->children)) { @@ -2548,7 +2549,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, if (ret & BDRV_BLOCK_RAW) { assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file); - ret = bdrv_co_do_block_status(local_file, want_zero, local_map, + ret = bdrv_co_do_block_status(local_file, mode, local_map, *pnum, pnum, &local_map, &local_file); goto out; } @@ -2560,7 +2561,7 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, if (!cow_bs) { ret |= BDRV_BLOCK_ZERO; - } else if (want_zero) { + } else if (mode == BDRV_WANT_PRECISE) { int64_t size2 = bdrv_co_getlength(cow_bs); if (size2 >= 0 && offset >= size2) { @@ -2569,14 +2570,14 @@ bdrv_co_do_block_status(BlockDriverState *bs, bool want_zero, } } - if (want_zero && ret & BDRV_BLOCK_RECURSE && + if (mode == BDRV_WANT_PRECISE && ret & BDRV_BLOCK_RECURSE && local_file && local_file != bs && (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && (ret & BDRV_BLOCK_OFFSET_VALID)) { int64_t file_pnum; int ret2; - ret2 = bdrv_co_do_block_status(local_file, want_zero, local_map, + ret2 = bdrv_co_do_block_status(local_file, mode, local_map, *pnum, &file_pnum, NULL, NULL); if (ret2 >= 0) { /* Ignore errors. This is just providing extra information, it @@ -2627,7 +2628,7 @@ int coroutine_fn bdrv_co_common_block_status_above(BlockDriverState *bs, BlockDriverState *base, bool include_base, - bool want_zero, + unsigned int mode, int64_t offset, int64_t bytes, int64_t *pnum, @@ -2654,7 +2655,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs, return 0; } - ret = bdrv_co_do_block_status(bs, want_zero, offset, bytes, pnum, + ret = bdrv_co_do_block_status(bs, mode, offset, bytes, pnum, map, file); ++*depth; if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) { @@ -2671,7 +2672,7 @@ bdrv_co_common_block_status_above(BlockDriverState *bs, for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base; p = bdrv_filter_or_cow_bs(p)) { - ret = bdrv_co_do_block_status(p, want_zero, offset, bytes, pnum, + ret = bdrv_co_do_block_status(p, mode, offset, bytes, pnum, map, file); ++*depth; if (ret < 0) { @@ -2734,7 +2735,8 @@ int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, BlockDriverState **file) { IO_CODE(); - return bdrv_co_common_block_status_above(bs, base, false, true, offset, + return bdrv_co_common_block_status_above(bs, base, false, + BDRV_WANT_PRECISE, offset, bytes, pnum, map, file, NULL); } @@ -2752,27 +2754,89 @@ int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, int64_t offset, * by @offset and @bytes is known to read as zeroes. * Return 1 if that is the case, 0 otherwise and -errno on error. * This test is meant to be fast rather than accurate so returning 0 - * does not guarantee non-zero data. + * does not guarantee non-zero data; but a return of 1 is reliable. */ int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, int64_t bytes) { int ret; - int64_t pnum = bytes; + int64_t pnum; IO_CODE(); - if (!bytes) { - return 1; + while (bytes) { + ret = bdrv_co_common_block_status_above(bs, NULL, false, + BDRV_WANT_ZERO, offset, bytes, + &pnum, NULL, NULL, NULL); + + if (ret < 0) { + return ret; + } + if (!(ret & BDRV_BLOCK_ZERO)) { + return 0; + } + offset += pnum; + bytes -= pnum; } - ret = bdrv_co_common_block_status_above(bs, NULL, false, false, offset, - bytes, &pnum, NULL, NULL, NULL); + return 1; +} + +/* + * Check @bs (and its backing chain) to see if the entire image is known + * to read as zeroes. + * Return 1 if that is the case, 0 otherwise and -errno on error. + * This test is meant to be fast rather than accurate so returning 0 + * does not guarantee non-zero data; however, a return of 1 is reliable, + * and this function can report 1 in more cases than bdrv_co_is_zero_fast. + */ +int coroutine_fn bdrv_co_is_all_zeroes(BlockDriverState *bs) +{ + int ret; + int64_t pnum, bytes; + char *buf; + QEMUIOVector local_qiov; + IO_CODE(); + + bytes = bdrv_co_getlength(bs); + if (bytes < 0) { + return bytes; + } + /* First probe - see if the entire image reads as zero */ + ret = bdrv_co_common_block_status_above(bs, NULL, false, BDRV_WANT_ZERO, + 0, bytes, &pnum, NULL, NULL, + NULL); if (ret < 0) { return ret; } + if (ret & BDRV_BLOCK_ZERO) { + return bdrv_co_is_zero_fast(bs, pnum, bytes - pnum); + } - return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO); + /* + * Because of the way 'blockdev-create' works, raw files tend to + * be created with a non-sparse region at the front to make + * alignment probing easier. If the block starts with only a + * small allocated region, it is still worth the effort to see if + * the rest of the image is still sparse, coupled with manually + * reading the first region to see if it reads zero after all. + */ + if (pnum > MAX_ZERO_CHECK_BUFFER) { + return 0; + } + ret = bdrv_co_is_zero_fast(bs, pnum, bytes - pnum); + if (ret <= 0) { + return ret; + } + /* Only the head of the image is unknown, and it's small. Read it. */ + buf = qemu_blockalign(bs, pnum); + qemu_iovec_init_buf(&local_qiov, buf, pnum); + ret = bdrv_driver_preadv(bs, 0, pnum, &local_qiov, 0, 0); + if (ret >= 0) { + ret = buffer_is_zero(buf, pnum); + } + qemu_vfree(buf); + return ret; } int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, @@ -2782,9 +2846,9 @@ int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t offset, int64_t dummy; IO_CODE(); - ret = bdrv_co_common_block_status_above(bs, bs, true, false, offset, - bytes, pnum ? pnum : &dummy, NULL, - NULL, NULL); + ret = bdrv_co_common_block_status_above(bs, bs, true, BDRV_WANT_ALLOCATED, + offset, bytes, pnum ? pnum : &dummy, + NULL, NULL, NULL); if (ret < 0) { return ret; } @@ -2817,7 +2881,8 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *bs, int ret; IO_CODE(); - ret = bdrv_co_common_block_status_above(bs, base, include_base, false, + ret = bdrv_co_common_block_status_above(bs, base, include_base, + BDRV_WANT_ALLOCATED, offset, bytes, pnum, NULL, NULL, &depth); if (ret < 0) { @@ -3102,18 +3167,19 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, return 0; } - if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) { + if (!bs->drv->bdrv_co_pdiscard) { return 0; } /* Invalidate the cached block-status data range if this discard overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); - /* Discard is advisory, but some devices track and coalesce + /* + * Discard is advisory, but some devices track and coalesce * unaligned requests, so we must pass everything down rather than - * round here. Still, most devices will just silently ignore - * unaligned requests (by returning -ENOTSUP), so we must fragment - * the request accordingly. */ + * round here. Still, most devices reject unaligned requests with + * -EINVAL or -ENOTSUP, so we must fragment the request accordingly. + */ align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment); assert(align % bs->bl.request_alignment == 0); head = offset % align; @@ -3161,27 +3227,15 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, ret = -ENOMEDIUM; goto out; } - if (bs->drv->bdrv_co_pdiscard) { - ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - goto out; + + ret = bs->drv->bdrv_co_pdiscard(bs, offset, num); + if (ret && ret != -ENOTSUP) { + if (ret == -EINVAL && (offset % align != 0 || num % align != 0)) { + /* Silently skip rejected unaligned head/tail requests */ } else { - qemu_coroutine_yield(); - ret = co.ret; + goto out; /* bail out */ } } - if (ret && ret != -ENOTSUP) { - goto out; - } offset += num; bytes -= num; @@ -3709,8 +3763,8 @@ bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes, } int coroutine_fn -bdrv_co_snapshot_block_status(BlockDriverState *bs, - bool want_zero, int64_t offset, int64_t bytes, +bdrv_co_snapshot_block_status(BlockDriverState *bs, unsigned int mode, + int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) { @@ -3728,7 +3782,7 @@ bdrv_co_snapshot_block_status(BlockDriverState *bs, } bdrv_inc_in_flight(bs); - ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes, + ret = drv->bdrv_co_snapshot_block_status(bs, mode, offset, bytes, pnum, map, file); bdrv_dec_in_flight(bs); |