Diffstat (limited to 'block/io.c')
-rw-r--r-- | block/io.c | 131
1 files changed, 89 insertions, 42 deletions
@@ -34,6 +34,9 @@
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
+/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
+#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
+
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags);
 
@@ -945,68 +948,114 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
     BlockDriver *drv = bs->drv;
     struct iovec iov;
-    QEMUIOVector bounce_qiov;
+    QEMUIOVector local_qiov;
     int64_t cluster_offset;
     unsigned int cluster_bytes;
     size_t skip_bytes;
     int ret;
+    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
+                                    BDRV_REQUEST_MAX_BYTES);
+    unsigned int progress = 0;
 
     /* FIXME We cannot require callers to have write permissions when all they
      * are doing is a read request. If we did things right, write permissions
      * would be obtained anyway, but internally by the copy-on-read code. As
-     * long as it is implemented here rather than in a separat filter driver,
+     * long as it is implemented here rather than in a separate filter driver,
      * the copy-on-read code doesn't have its own BdrvChild, however, for which
      * it could request permissions. Therefore we have to bypass the permission
      * system for the moment. */
     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
 
     /* Cover entire cluster so no additional backing file I/O is required when
-     * allocating cluster in the image file.
+     * allocating cluster in the image file. Note that this value may exceed
+     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
+     * is one reason we loop rather than doing it all at once.
      */
     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
+    skip_bytes = offset - cluster_offset;
 
     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                    cluster_offset, cluster_bytes);
 
-    iov.iov_len = cluster_bytes;
-    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
+    bounce_buffer = qemu_try_blockalign(bs,
+                                        MIN(MIN(max_transfer, cluster_bytes),
+                                            MAX_BOUNCE_BUFFER));
     if (bounce_buffer == NULL) {
         ret = -ENOMEM;
         goto err;
     }
 
-    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+    while (cluster_bytes) {
+        int64_t pnum;
 
-    ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
-                             &bounce_qiov, 0);
-    if (ret < 0) {
-        goto err;
-    }
+        ret = bdrv_is_allocated(bs, cluster_offset,
+                                MIN(cluster_bytes, max_transfer), &pnum);
+        if (ret < 0) {
+            /* Safe to treat errors in querying allocation as if
+             * unallocated; we'll probably fail again soon on the
+             * read, but at least that will set a decent errno.
+             */
+            pnum = MIN(cluster_bytes, max_transfer);
+        }
 
-    if (drv->bdrv_co_pwrite_zeroes &&
-        buffer_is_zero(bounce_buffer, iov.iov_len)) {
-        /* FIXME: Should we (perhaps conditionally) be setting
-         * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
-         * that still correctly reads as zero? */
-        ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
-    } else {
-        /* This does not change the data on the disk, it is not necessary
-         * to flush even in cache=writethrough mode.
-         */
-        ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
-                                  &bounce_qiov, 0);
-    }
+        assert(skip_bytes < pnum);
 
-    if (ret < 0) {
-        /* It might be okay to ignore write errors for guest requests. If this
-         * is a deliberate copy-on-read then we don't want to ignore the error.
-         * Simply report it in all cases.
-         */
-        goto err;
-    }
+        if (ret <= 0) {
+            /* Must copy-on-read; use the bounce buffer */
+            iov.iov_base = bounce_buffer;
+            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
+            qemu_iovec_init_external(&local_qiov, &iov, 1);
 
-    skip_bytes = offset - cluster_offset;
-    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
+            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
+                                     &local_qiov, 0);
+            if (ret < 0) {
+                goto err;
+            }
+
+            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
+            if (drv->bdrv_co_pwrite_zeroes &&
+                buffer_is_zero(bounce_buffer, pnum)) {
+                /* FIXME: Should we (perhaps conditionally) be setting
+                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
+                 * that still correctly reads as zero? */
+                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
+            } else {
+                /* This does not change the data on the disk, it is not
+                 * necessary to flush even in cache=writethrough mode.
+                 */
+                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
+                                          &local_qiov, 0);
+            }
+
+            if (ret < 0) {
+                /* It might be okay to ignore write errors for guest
+                 * requests. If this is a deliberate copy-on-read
+                 * then we don't want to ignore the error. Simply
+                 * report it in all cases.
+                 */
+                goto err;
+            }
+
+            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
+        } else {
+            /* Read directly into the destination */
+            qemu_iovec_init(&local_qiov, qiov->niov);
+            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
+            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
+            qemu_iovec_destroy(&local_qiov);
+            if (ret < 0) {
+                goto err;
+            }
+        }
+
+        cluster_offset += pnum;
+        cluster_bytes -= pnum;
+        progress += pnum - skip_bytes;
+        skip_bytes = 0;
+    }
+    ret = 0;
 
 err:
     qemu_vfree(bounce_buffer);
@@ -1212,9 +1261,6 @@ int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
     return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
 }
 
-/* Maximum buffer for write zeroes fallback, in bytes */
-#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
-
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int64_t offset, int bytes, BdrvRequestFlags flags)
 {
@@ -1229,8 +1275,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                         bs->bl.request_alignment);
-    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
-                                    MAX_WRITE_ZEROES_BOUNCE_BUFFER);
+    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
 
     assert(alignment % bs->bl.request_alignment == 0);
     head = offset % alignment;
@@ -1334,7 +1379,6 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
     bool waited;
     int ret;
 
-    int64_t start_sector = offset >> BDRV_SECTOR_BITS;
     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
     uint64_t bytes_remaining = bytes;
     int max_transfer;
@@ -1409,7 +1453,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
 
     atomic_inc(&bs->write_gen);
-    bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
+    bdrv_set_dirty(bs, offset, bytes);
 
     stat64_max(&bs->wr_highest_offset, offset + bytes);
 
@@ -1778,6 +1822,10 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
         *pnum = 0;
         return BDRV_BLOCK_EOF;
     }
+    if (!nb_sectors) {
+        *pnum = 0;
+        return 0;
+    }
 
     n = total_sectors - sector_num;
     if (n < nb_sectors) {
@@ -2438,8 +2486,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
     ret = 0;
 out:
     atomic_inc(&bs->write_gen);
-    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
-                   req.bytes >> BDRV_SECTOR_BITS);
+    bdrv_set_dirty(bs, req.offset, req.bytes);
     tracked_request_end(&req);
     bdrv_dec_in_flight(bs);
     return ret;
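For readers who want the shape of the new copy-on-read loop without the QEMU plumbing, here is a minimal, runnable sketch of the chunking pattern the patch introduces in bdrv_co_do_copy_on_readv(). It is an illustration under stated assumptions, not QEMU code: MAX_CHUNK stands in for MAX_BOUNCE_BUFFER/max_transfer, the toy is_allocated() stands in for bdrv_is_allocated(), and the real patch additionally bounces data through bdrv_driver_preadv()/bdrv_driver_pwritev() and tracks skip_bytes/progress into the caller's QEMUIOVector.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_CHUNK (64 * 1024)  /* hypothetical stand-in for MAX_BOUNCE_BUFFER */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Toy allocation query: pretend every other MAX_CHUNK-sized piece of the
 * image is already allocated (the patch asks bdrv_is_allocated() instead).
 * Reports via *pnum how many contiguous bytes share that status. */
static bool is_allocated(uint64_t offset, uint64_t bytes, uint64_t *pnum)
{
    *pnum = MIN(bytes, MAX_CHUNK - offset % MAX_CHUNK);
    return (offset / MAX_CHUNK) % 2 == 0;
}

int main(void)
{
    uint64_t cluster_offset = 0;
    uint64_t cluster_bytes = 5 * MAX_CHUNK / 2;  /* 2.5 chunks, for the demo */

    /* Same control flow as the new while loop: never handle more than one
     * bounded chunk per iteration, so the bounce buffer stays small even
     * when the cluster-aligned region exceeds the request limits. */
    while (cluster_bytes) {
        uint64_t pnum;
        bool allocated = is_allocated(cluster_offset,
                                      MIN(cluster_bytes, MAX_CHUNK), &pnum);
        if (!allocated) {
            /* Unallocated: the patch reads this piece into the bounce
             * buffer and writes it (or zeroes) back into the image. */
            printf("copy   %" PRIu64 "..%" PRIu64 "\n",
                   cluster_offset, cluster_offset + pnum);
        } else {
            /* Already allocated: read straight into the destination;
             * no copy-down is needed. */
            printf("direct %" PRIu64 "..%" PRIu64 "\n",
                   cluster_offset, cluster_offset + pnum);
        }
        cluster_offset += pnum;
        cluster_bytes -= pnum;
    }
    return 0;
}

Compiled and run, the sketch prints direct/copy/direct for the three pieces, mirroring how the patched loop lets an already-allocated chunk bypass the bounce buffer while an unallocated chunk is copied down.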