aboutsummaryrefslogtreecommitdiff
path: root/block/io.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/io.c')
-rw-r--r--block/io.c131
1 files changed, 89 insertions, 42 deletions
diff --git a/block/io.c b/block/io.c
index 4378ae4..8e41907 100644
--- a/block/io.c
+++ b/block/io.c
@@ -34,6 +34,9 @@
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
+/* Maximum size of the bounce buffer for copy-on-read and write zeroes, in bytes */
+#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
+
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int bytes, BdrvRequestFlags flags);
@@ -945,68 +948,114 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
BlockDriver *drv = bs->drv;
struct iovec iov;
- QEMUIOVector bounce_qiov;
+ QEMUIOVector local_qiov;
int64_t cluster_offset;
unsigned int cluster_bytes;
size_t skip_bytes;
int ret;
+ int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
+ BDRV_REQUEST_MAX_BYTES);
+ unsigned int progress = 0;
/* FIXME We cannot require callers to have write permissions when all they
* are doing is a read request. If we did things right, write permissions
* would be obtained anyway, but internally by the copy-on-read code. As
- * long as it is implemented here rather than in a separat filter driver,
+ * long as it is implemented here rather than in a separate filter driver,
* the copy-on-read code doesn't have its own BdrvChild, however, for which
* it could request permissions. Therefore we have to bypass the permission
* system for the moment. */
// assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
/* Cover entire cluster so no additional backing file I/O is required when
- * allocating cluster in the image file.
+ * allocating cluster in the image file. Note that this value may exceed
+ * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
+ * is one reason we loop rather than doing it all at once.
*/
bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
+ skip_bytes = offset - cluster_offset;
trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
cluster_offset, cluster_bytes);
- iov.iov_len = cluster_bytes;
- iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
+ bounce_buffer = qemu_try_blockalign(bs,
+ MIN(MIN(max_transfer, cluster_bytes),
+ MAX_BOUNCE_BUFFER));
if (bounce_buffer == NULL) {
ret = -ENOMEM;
goto err;
}
- qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+ while (cluster_bytes) {
+ int64_t pnum;
- ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
- &bounce_qiov, 0);
- if (ret < 0) {
- goto err;
- }
+ ret = bdrv_is_allocated(bs, cluster_offset,
+ MIN(cluster_bytes, max_transfer), &pnum);
+ if (ret < 0) {
+ /* Safe to treat errors in querying allocation as if
+ * unallocated; we'll probably fail again soon on the
+ * read, but at least that will set a decent errno.
+ */
+ pnum = MIN(cluster_bytes, max_transfer);
+ }
- if (drv->bdrv_co_pwrite_zeroes &&
- buffer_is_zero(bounce_buffer, iov.iov_len)) {
- /* FIXME: Should we (perhaps conditionally) be setting
- * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
- * that still correctly reads as zero? */
- ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
- } else {
- /* This does not change the data on the disk, it is not necessary
- * to flush even in cache=writethrough mode.
- */
- ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
- &bounce_qiov, 0);
- }
+ assert(skip_bytes < pnum);
- if (ret < 0) {
- /* It might be okay to ignore write errors for guest requests. If this
- * is a deliberate copy-on-read then we don't want to ignore the error.
- * Simply report it in all cases.
- */
- goto err;
- }
+ if (ret <= 0) {
+ /* Must copy-on-read; use the bounce buffer */
+ iov.iov_base = bounce_buffer;
+ iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
+ qemu_iovec_init_external(&local_qiov, &iov, 1);
- skip_bytes = offset - cluster_offset;
- qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
+ ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
+ &local_qiov, 0);
+ if (ret < 0) {
+ goto err;
+ }
+
+ bdrv_debug_event(bs, BLKDBG_COR_WRITE);
+ if (drv->bdrv_co_pwrite_zeroes &&
+ buffer_is_zero(bounce_buffer, pnum)) {
+ /* FIXME: Should we (perhaps conditionally) be setting
+ * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
+ * that still correctly reads as zero? */
+ ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
+ } else {
+ /* This does not change the data on the disk, it is not
+ * necessary to flush even in cache=writethrough mode.
+ */
+ ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
+ &local_qiov, 0);
+ }
+
+ if (ret < 0) {
+ /* It might be okay to ignore write errors for guest
+ * requests. If this is a deliberate copy-on-read
+ * then we don't want to ignore the error. Simply
+ * report it in all cases.
+ */
+ goto err;
+ }
+
+ qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
+ pnum - skip_bytes);
+ } else {
+ /* Read directly into the destination */
+ qemu_iovec_init(&local_qiov, qiov->niov);
+ qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
+ ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
+ &local_qiov, 0);
+ qemu_iovec_destroy(&local_qiov);
+ if (ret < 0) {
+ goto err;
+ }
+ }
+
+ cluster_offset += pnum;
+ cluster_bytes -= pnum;
+ progress += pnum - skip_bytes;
+ skip_bytes = 0;
+ }
+ ret = 0;
err:
qemu_vfree(bounce_buffer);
@@ -1212,9 +1261,6 @@ int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
}
-/* Maximum buffer for write zeroes fallback, in bytes */
-#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
-
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int bytes, BdrvRequestFlags flags)
{
@@ -1229,8 +1275,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
bs->bl.request_alignment);
- int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
- MAX_WRITE_ZEROES_BOUNCE_BUFFER);
+ int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
assert(alignment % bs->bl.request_alignment == 0);
head = offset % alignment;
@@ -1334,7 +1379,6 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
bool waited;
int ret;
- int64_t start_sector = offset >> BDRV_SECTOR_BITS;
int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
uint64_t bytes_remaining = bytes;
int max_transfer;
@@ -1409,7 +1453,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
atomic_inc(&bs->write_gen);
- bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
+ bdrv_set_dirty(bs, offset, bytes);
stat64_max(&bs->wr_highest_offset, offset + bytes);
@@ -1778,6 +1822,10 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
*pnum = 0;
return BDRV_BLOCK_EOF;
}
+ if (!nb_sectors) {
+ *pnum = 0;
+ return 0;
+ }
n = total_sectors - sector_num;
if (n < nb_sectors) {
@@ -2438,8 +2486,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
ret = 0;
out:
atomic_inc(&bs->write_gen);
- bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
- req.bytes >> BDRV_SECTOR_BITS);
+ bdrv_set_dirty(bs, req.offset, req.bytes);
tracked_request_end(&req);
bdrv_dec_in_flight(bs);
return ret;