diff options
Diffstat (limited to 'block/qed.c')
-rw-r--r-- | block/qed.c | 628 |
1 files changed, 626 insertions, 2 deletions
diff --git a/block/qed.c b/block/qed.c index cd1bead..8e65d18 100644 --- a/block/qed.c +++ b/block/qed.c @@ -12,8 +12,26 @@ * */ +#include "trace.h" #include "qed.h" +static void qed_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QEDAIOCB *acb = (QEDAIOCB *)blockacb; + bool finished = false; + + /* Wait for the request to finish */ + acb->finished = &finished; + while (!finished) { + qemu_aio_wait(); + } +} + +static AIOPool qed_aio_pool = { + .aiocb_size = sizeof(QEDAIOCB), + .cancel = qed_aio_cancel, +}; + static int bdrv_qed_probe(const uint8_t *buf, int buf_size, const char *filename) { @@ -155,6 +173,24 @@ static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, return 0; } +/** + * Allocate new clusters + * + * @s: QED state + * @n: Number of contiguous clusters to allocate + * @ret: Offset of first allocated cluster + * + * This function only produces the offset where the new clusters should be + * written. It updates BDRVQEDState but does not make any changes to the image + * file. + */ +static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) +{ + uint64_t offset = s->file_size; + s->file_size += n * s->header.cluster_size; + return offset; +} + QEDTable *qed_alloc_table(BDRVQEDState *s) { /* Honor O_DIRECT memory alignment requirements */ @@ -162,6 +198,23 @@ QEDTable *qed_alloc_table(BDRVQEDState *s) s->header.cluster_size * s->header.table_size); } +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ + CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + + l2_table->table = qed_alloc_table(s); + l2_table->offset = qed_alloc_clusters(s, s->header.table_size); + + memset(l2_table->table->offsets, 0, + s->header.cluster_size * s->header.table_size); + return l2_table; +} + +static void qed_aio_next_io(void *opaque, int ret); + static int bdrv_qed_open(BlockDriverState *bs, int flags) { BDRVQEDState *s = bs->opaque; @@ -170,6 +223,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags) int ret; s->bs = bs; + QSIMPLEQ_INIT(&s->allocating_write_reqs); ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); if (ret < 0) { @@ -431,13 +485,583 @@ static int bdrv_qed_make_empty(BlockDriverState *bs) return -ENOTSUP; } +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ + return acb->common.bs->opaque; +} + +/** + * Read from the backing file or zero-fill if no backing file + * + * @s: QED state + * @pos: Byte position in device + * @qiov: Destination I/O vector + * @cb: Completion function + * @opaque: User data for completion function + * + * This function reads qiov->size bytes starting at pos from the backing file. + * If there is no backing file then zeroes are read. + */ +static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, + QEMUIOVector *qiov, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BlockDriverAIOCB *aiocb; + uint64_t backing_length = 0; + size_t size; + + /* If there is a backing file, get its length. Treat the absence of a + * backing file like a zero length backing file. + */ + if (s->bs->backing_hd) { + int64_t l = bdrv_getlength(s->bs->backing_hd); + if (l < 0) { + cb(opaque, l); + return; + } + backing_length = l; + } + + /* Zero all sectors if reading beyond the end of the backing file */ + if (pos >= backing_length || + pos + qiov->size > backing_length) { + qemu_iovec_memset(qiov, 0, qiov->size); + } + + /* Complete now if there are no backing file sectors to read */ + if (pos >= backing_length) { + cb(opaque, 0); + return; + } + + /* If the read straddles the end of the backing file, shorten it */ + size = MIN((uint64_t)backing_length - pos, qiov->size); + + BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING); + aiocb = bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, + qiov, size / BDRV_SECTOR_SIZE, cb, opaque); + if (!aiocb) { + cb(opaque, -EIO); + } +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEMUIOVector qiov; + struct iovec iov; + uint64_t offset; +} CopyFromBackingFileCB; + +static void qed_copy_from_backing_file_cb(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + qemu_vfree(copy_cb->iov.iov_base); + gencb_complete(©_cb->gencb, ret); +} + +static void qed_copy_from_backing_file_write(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + BDRVQEDState *s = copy_cb->s; + BlockDriverAIOCB *aiocb; + + if (ret) { + qed_copy_from_backing_file_cb(copy_cb, ret); + return; + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); + aiocb = bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, + ©_cb->qiov, + copy_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_copy_from_backing_file_cb, copy_cb); + if (!aiocb) { + qed_copy_from_backing_file_cb(copy_cb, -EIO); + } +} + +/** + * Copy data from backing file into the image + * + * @s: QED state + * @pos: Byte position in device + * @len: Number of bytes + * @offset: Byte offset in image file + * @cb: Completion function + * @opaque: User data for completion function + */ +static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, + uint64_t len, uint64_t offset, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + CopyFromBackingFileCB *copy_cb; + + /* Skip copy entirely if there is no work to do */ + if (len == 0) { + cb(opaque, 0); + return; + } + + copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); + copy_cb->s = s; + copy_cb->offset = offset; + copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); + copy_cb->iov.iov_len = len; + qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); + + qed_read_backing_file(s, pos, ©_cb->qiov, + qed_copy_from_backing_file_write, copy_cb); +} + +/** + * Link one or more contiguous clusters into a table + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Number of contiguous clusters + * @cluster: First cluster byte offset in image file + */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, + unsigned int n, uint64_t cluster) +{ + int i; + for (i = index; i < index + n; i++) { + table->offsets[i] = cluster; + cluster += s->header.cluster_size; + } +} + +static void qed_aio_complete_bh(void *opaque) +{ + QEDAIOCB *acb = opaque; + BlockDriverCompletionFunc *cb = acb->common.cb; + void *user_opaque = acb->common.opaque; + int ret = acb->bh_ret; + bool *finished = acb->finished; + + qemu_bh_delete(acb->bh); + qemu_aio_release(acb); + + /* Invoke callback */ + cb(user_opaque, ret); + + /* Signal cancel completion */ + if (finished) { + *finished = true; + } +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ + BDRVQEDState *s = acb_to_s(acb); + + trace_qed_aio_complete(s, acb, ret); + + /* Free resources */ + qemu_iovec_destroy(&acb->cur_qiov); + qed_unref_l2_cache_entry(acb->request.l2_table); + + /* Arrange for a bh to invoke the completion function */ + acb->bh_ret = ret; + acb->bh = qemu_bh_new(qed_aio_complete_bh, acb); + qemu_bh_schedule(acb->bh); + + /* Start next allocating write request waiting behind this one. Note that + * requests enqueue themselves when they first hit an unallocated cluster + * but they wait until the entire request is finished before waking up the + * next request in the queue. This ensures that we don't cycle through + * requests multiple times but rather finish one at a time completely. + */ + if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } + } +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + CachedL2Table *l2_table = acb->request.l2_table; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry to the + * cache. + */ + acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, + l2_table->offset); + assert(acb->request.l2_table != NULL); + + qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + int index; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + index = qed_l1_index(s, acb->cur_pos); + s->l1_table->offsets[index] = acb->request.l2_table->offset; + + qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; + int index; + + if (ret) { + goto err; + } + + if (need_alloc) { + qed_unref_l2_cache_entry(acb->request.l2_table); + acb->request.l2_table = qed_new_l2_table(s); + } + + index = qed_l2_index(s, acb->cur_pos); + qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, + acb->cur_cluster); + + if (need_alloc) { + /* Write out the whole new L2 table */ + qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, + qed_aio_write_l1_update, acb); + } else { + /* Write out only the updated part of the L2 table */ + qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, + qed_aio_next_io, acb); + } + return; + +err: + qed_aio_complete(acb, ret); +} + +/** + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use. A crash during an + * allocating write could result in empty clusters in the image. If the write + * only touched a subregion of the cluster, then backing image sectors have + * been lost in the untouched region. The solution is to flush after writing a + * new data cluster and before updating the L2 table. + */ +static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + + if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update, opaque)) { + qed_aio_complete(acb, -EIO); + } +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos); + BlockDriverCompletionFunc *next_fn; + BlockDriverAIOCB *file_acb; + + trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { + next_fn = qed_aio_next_io; + } else { + if (s->bs->backing_hd) { + next_fn = qed_aio_write_flush_before_l2_update; + } else { + next_fn = qed_aio_write_l2_update; + } + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); + file_acb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, + acb->cur_qiov.size / BDRV_SECTOR_SIZE, + next_fn, acb); + if (!file_acb) { + qed_aio_complete(acb, -EIO); + } +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = acb->cur_pos + acb->cur_qiov.size; + uint64_t len = + qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos) + + acb->cur_qiov.size; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + trace_qed_aio_write_postfill(s, acb, start, len, offset); + qed_copy_from_backing_file(s, start, len, offset, + qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = qed_start_of_cluster(s, acb->cur_pos); + uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); + qed_copy_from_backing_file(s, start, len, acb->cur_cluster, + qed_aio_write_postfill, acb); +} + +/** + * Write new data cluster + * + * @acb: Write request + * @len: Length in bytes + * + * This path is taken when writing to previously unallocated clusters. + */ +static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +{ + BDRVQEDState *s = acb_to_s(acb); + + /* Freeze this request if another allocating write is in progress */ + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); + } + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + return; /* wait for existing request to finish */ + } + + acb->cur_nclusters = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, acb->cur_pos) + len); + acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); + qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Write new cluster */ + qed_aio_write_prefill(acb, 0); +} + +/** + * Write data cluster in place + * + * @acb: Write request + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * This path is taken when writing to already allocated clusters. + */ +static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +{ + /* Calculate the I/O vector */ + acb->cur_cluster = offset; + qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Do the actual write */ + qed_aio_write_main(acb, 0); +} + +/** + * Write data cluster + * + * @opaque: Write request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + + trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); + + acb->find_cluster_ret = ret; + + switch (ret) { + case QED_CLUSTER_FOUND: + qed_aio_write_inplace(acb, offset, len); + break; + + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + qed_aio_write_alloc(acb, len); + break; + + default: + qed_aio_complete(acb, ret); + break; + } +} + +/** + * Read data cluster + * + * @opaque: Read request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_read_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + BlockDriverState *bs = acb->common.bs; + BlockDriverAIOCB *file_acb; + + /* Adjust offset into cluster */ + offset += qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_read_data(s, acb, ret, offset, len); + + if (ret < 0) { + goto err; + } + + qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Handle backing file and unallocated sparse hole reads */ + if (ret != QED_CLUSTER_FOUND) { + qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + qed_aio_next_io, acb); + return; + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + file_acb = bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, + acb->cur_qiov.size / BDRV_SECTOR_SIZE, + qed_aio_next_io, acb); + if (!file_acb) { + ret = -EIO; + goto err; + } + return; + +err: + qed_aio_complete(acb, ret); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + QEDFindClusterFunc *io_fn = + acb->is_write ? qed_aio_write_data : qed_aio_read_data; + + trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + + /* Handle I/O error */ + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + acb->qiov_offset += acb->cur_qiov.size; + acb->cur_pos += acb->cur_qiov.size; + qemu_iovec_reset(&acb->cur_qiov); + + /* Complete request */ + if (acb->cur_pos >= acb->end_pos) { + qed_aio_complete(acb, 0); + return; + } + + /* Find next cluster and start I/O */ + qed_find_cluster(s, &acb->request, + acb->cur_pos, acb->end_pos - acb->cur_pos, + io_fn, acb); +} + +static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, bool is_write) +{ + QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque); + + trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, + opaque, is_write); + + acb->is_write = is_write; + acb->finished = NULL; + acb->qiov = qiov; + acb->qiov_offset = 0; + acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; + acb->request.l2_table = NULL; + qemu_iovec_init(&acb->cur_qiov, qiov->niov); + + /* Start request */ + qed_aio_next_io(acb, 0); + return &acb->common; +} + static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - return NULL; + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, false); } static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, @@ -446,7 +1070,7 @@ static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { - return NULL; + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, true); } static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs, |