diff options
80 files changed, 2860 insertions, 993 deletions
diff --git a/aio-win32.c b/aio-win32.c index 61e3d2d..d81313b 100644 --- a/aio-win32.c +++ b/aio-win32.c @@ -283,9 +283,9 @@ bool aio_poll(AioContext *ctx, bool blocking) int count; int timeout; - if (aio_prepare(ctx)) { + have_select_revents = aio_prepare(ctx); + if (have_select_revents) { blocking = false; - have_select_revents = true; } was_dispatching = ctx->dispatching; @@ -335,6 +335,7 @@ bool aio_poll(AioContext *ctx, bool blocking) event = NULL; if ((DWORD) (ret - WAIT_OBJECT_0) < count) { event = events[ret - WAIT_OBJECT_0]; + events[ret - WAIT_OBJECT_0] = events[--count]; } else if (!have_select_revents) { break; } @@ -343,9 +344,6 @@ bool aio_poll(AioContext *ctx, bool blocking) blocking = false; progress |= aio_dispatch_handlers(ctx, event); - - /* Try again, but only call each handler once. */ - events[ret - WAIT_OBJECT_0] = events[--count]; } progress |= timerlistgroup_run_timers(&ctx->tlg); @@ -289,18 +289,24 @@ static void aio_rfifolock_cb(void *opaque) aio_notify(opaque); } -AioContext *aio_context_new(void) +AioContext *aio_context_new(Error **errp) { + int ret; AioContext *ctx; ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext)); + ret = event_notifier_init(&ctx->notifier, false); + if (ret < 0) { + g_source_destroy(&ctx->source); + error_setg_errno(errp, -ret, "Failed to initialize event notifier"); + return NULL; + } + aio_set_event_notifier(ctx, &ctx->notifier, + (EventNotifierHandler *) + event_notifier_test_and_clear); ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); ctx->thread_pool = NULL; qemu_mutex_init(&ctx->bh_lock); rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx); - event_notifier_init(&ctx->notifier, false); - aio_set_event_notifier(ctx, &ctx->notifier, - (EventNotifierHandler *) - event_notifier_test_and_clear); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); return ctx; @@ -4640,7 +4640,28 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) void bdrv_aio_cancel(BlockDriverAIOCB *acb) { - acb->aiocb_info->cancel(acb); + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request normally complete. + * In either case the completion callback must be called. */ +void bdrv_aio_cancel_async(BlockDriverAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } } /**************************************************************/ @@ -4656,18 +4677,8 @@ typedef struct BlockDriverAIOCBSync { int is_write; } BlockDriverAIOCBSync; -static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) -{ - BlockDriverAIOCBSync *acb = - container_of(blockacb, BlockDriverAIOCBSync, common); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_release(acb); -} - static const AIOCBInfo bdrv_em_aiocb_info = { .aiocb_size = sizeof(BlockDriverAIOCBSync), - .cancel = bdrv_aio_cancel_em, }; static void bdrv_aio_bh_cb(void *opaque) @@ -4681,7 +4692,7 @@ static void bdrv_aio_bh_cb(void *opaque) acb->common.cb(acb->common.opaque, acb->ret); qemu_bh_delete(acb->bh); acb->bh = NULL; - qemu_aio_release(acb); + qemu_aio_unref(acb); } static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, @@ -4738,22 +4749,8 @@ typedef struct BlockDriverAIOCBCoroutine { QEMUBH* bh; } BlockDriverAIOCBCoroutine; -static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb) -{ - AioContext *aio_context = bdrv_get_aio_context(blockacb->bs); - BlockDriverAIOCBCoroutine *acb = - container_of(blockacb, BlockDriverAIOCBCoroutine, common); - bool done = false; - - acb->done = &done; - while (!done) { - aio_poll(aio_context, true); - } -} - static const AIOCBInfo bdrv_em_co_aiocb_info = { .aiocb_size = sizeof(BlockDriverAIOCBCoroutine), - .cancel = bdrv_aio_co_cancel_em, }; static void bdrv_co_em_bh(void *opaque) @@ -4762,12 +4759,8 @@ static void bdrv_co_em_bh(void *opaque) acb->common.cb(acb->common.opaque, acb->req.error); - if (acb->done) { - *acb->done = true; - } - qemu_bh_delete(acb->bh); - qemu_aio_release(acb); + qemu_aio_unref(acb); } /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ @@ -4806,7 +4799,6 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, acb->req.qiov = qiov; acb->req.flags = flags; acb->is_write = is_write; - acb->done = NULL; co = qemu_coroutine_create(bdrv_co_do_rw); qemu_coroutine_enter(co, acb); @@ -4833,7 +4825,6 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, BlockDriverAIOCBCoroutine *acb; acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->done = NULL; co = qemu_coroutine_create(bdrv_aio_flush_co_entry); qemu_coroutine_enter(co, acb); @@ -4863,7 +4854,6 @@ BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); acb->req.sector = sector_num; acb->req.nb_sectors = nb_sectors; - acb->done = NULL; co = qemu_coroutine_create(bdrv_aio_discard_co_entry); qemu_coroutine_enter(co, acb); @@ -4891,13 +4881,23 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, acb->bs = bs; acb->cb = cb; acb->opaque = opaque; + acb->refcnt = 1; return acb; } -void qemu_aio_release(void *p) +void qemu_aio_ref(void *p) +{ + BlockDriverAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) { BlockDriverAIOCB *acb = p; - g_slice_free1(acb->aiocb_info->aiocb_size, acb); + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } } /**************************************************************/ diff --git a/block/Makefile.objs b/block/Makefile.objs index c9c8bbb..a833ed5 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,4 +1,4 @@ -block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o @@ -9,6 +9,7 @@ block-obj-y += snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-y += null.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o diff --git a/block/archipelago.c b/block/archipelago.c index 93fb7c0..73d91a4 100644 --- a/block/archipelago.c +++ b/block/archipelago.c @@ -91,7 +91,6 @@ typedef struct ArchipelagoAIOCB { struct BDRVArchipelagoState *s; QEMUIOVector *qiov; ARCHIPCmd cmd; - bool cancelled; int status; int64_t size; int64_t ret; @@ -318,9 +317,7 @@ static void qemu_archipelago_complete_aio(void *opaque) aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); aio_cb->status = 0; - if (!aio_cb->cancelled) { - qemu_aio_release(aio_cb); - } + qemu_aio_unref(aio_cb); g_free(reqdata); } @@ -725,19 +722,8 @@ static int qemu_archipelago_create(const char *filename, return ret; } -static void qemu_archipelago_aio_cancel(BlockDriverAIOCB *blockacb) -{ - ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) blockacb; - aio_cb->cancelled = true; - while (aio_cb->status == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(aio_cb->common.bs), true); - } - qemu_aio_release(aio_cb); -} - static const AIOCBInfo archipelago_aiocb_info = { .aiocb_size = sizeof(ArchipelagoAIOCB), - .cancel = qemu_archipelago_aio_cancel, }; static int archipelago_submit_request(BDRVArchipelagoState *s, @@ -889,7 +875,6 @@ static BlockDriverAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs, aio_cb->ret = 0; aio_cb->s = s; - aio_cb->cancelled = false; aio_cb->status = -EINPROGRESS; off = sector_num * BDRV_SECTOR_SIZE; @@ -905,7 +890,7 @@ static BlockDriverAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs, err_exit: error_report("qemu_archipelago_aio_rw(): I/O Error\n"); - qemu_aio_release(aio_cb); + qemu_aio_unref(aio_cb); return NULL; } @@ -1008,7 +993,7 @@ static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset) req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); if (!req) { archipelagolog("Cannot get XSEG request\n"); - return err_exit2; + goto err_exit2; } ret = xseg_prep_request(s->xseg, req, targetlen, 0); diff --git a/block/blkdebug.c b/block/blkdebug.c index 69b330e..ced0b60 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -52,11 +52,8 @@ typedef struct BlkdebugSuspendedReq { QLIST_ENTRY(BlkdebugSuspendedReq) next; } BlkdebugSuspendedReq; -static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb); - static const AIOCBInfo blkdebug_aiocb_info = { - .aiocb_size = sizeof(BlkdebugAIOCB), - .cancel = blkdebug_aio_cancel, + .aiocb_size = sizeof(BlkdebugAIOCB), }; enum { @@ -447,17 +444,7 @@ static void error_callback_bh(void *opaque) struct BlkdebugAIOCB *acb = opaque; qemu_bh_delete(acb->bh); acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_release(acb); -} - -static void blkdebug_aio_cancel(BlockDriverAIOCB *blockacb) -{ - BlkdebugAIOCB *acb = container_of(blockacb, BlkdebugAIOCB, common); - if (acb->bh) { - qemu_bh_delete(acb->bh); - acb->bh = NULL; - } - qemu_aio_release(acb); + qemu_aio_unref(acb); } static BlockDriverAIOCB *inject_error(BlockDriverState *bs, diff --git a/block/blkverify.c b/block/blkverify.c index 163064c..7d64a23 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -29,7 +29,6 @@ struct BlkverifyAIOCB { int ret; /* first completed request's result */ unsigned int done; /* completion counter */ - bool *finished; /* completion signal for cancel */ QEMUIOVector *qiov; /* user I/O vector */ QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */ @@ -38,22 +37,8 @@ struct BlkverifyAIOCB { void (*verify)(BlkverifyAIOCB *acb); }; -static void blkverify_aio_cancel(BlockDriverAIOCB *blockacb) -{ - BlkverifyAIOCB *acb = (BlkverifyAIOCB *)blockacb; - AioContext *aio_context = bdrv_get_aio_context(blockacb->bs); - bool finished = false; - - /* Wait until request completes, invokes its callback, and frees itself */ - acb->finished = &finished; - while (!finished) { - aio_poll(aio_context, true); - } -} - static const AIOCBInfo blkverify_aiocb_info = { .aiocb_size = sizeof(BlkverifyAIOCB), - .cancel = blkverify_aio_cancel, }; static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, @@ -194,7 +179,6 @@ static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, acb->qiov = qiov; acb->buf = NULL; acb->verify = NULL; - acb->finished = NULL; return acb; } @@ -208,10 +192,7 @@ static void blkverify_aio_bh(void *opaque) qemu_vfree(acb->buf); } acb->common.cb(acb->common.opaque, acb->ret); - if (acb->finished) { - *acb->finished = true; - } - qemu_aio_release(acb); + qemu_aio_unref(acb); } static void blkverify_aio_cb(void *opaque, int ret) diff --git a/block/cow.c b/block/cow.c deleted file mode 100644 index c3769fe..0000000 --- a/block/cow.c +++ /dev/null @@ -1,433 +0,0 @@ -/* - * Block driver for the COW format - * - * Copyright (c) 2004 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu-common.h" -#include "block/block_int.h" -#include "qemu/module.h" - -/**************************************************************/ -/* COW block driver using file system holes */ - -/* user mode linux compatible COW file */ -#define COW_MAGIC 0x4f4f4f4d /* MOOO */ -#define COW_VERSION 2 - -struct cow_header_v2 { - uint32_t magic; - uint32_t version; - char backing_file[1024]; - int32_t mtime; - uint64_t size; - uint32_t sectorsize; -}; - -typedef struct BDRVCowState { - CoMutex lock; - int64_t cow_sectors_offset; -} BDRVCowState; - -static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - const struct cow_header_v2 *cow_header = (const void *)buf; - - if (buf_size >= sizeof(struct cow_header_v2) && - be32_to_cpu(cow_header->magic) == COW_MAGIC && - be32_to_cpu(cow_header->version) == COW_VERSION) - return 100; - else - return 0; -} - -static int cow_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVCowState *s = bs->opaque; - struct cow_header_v2 cow_header; - int bitmap_size; - int64_t size; - int ret; - - /* see if it is a cow image */ - ret = bdrv_pread(bs->file, 0, &cow_header, sizeof(cow_header)); - if (ret < 0) { - goto fail; - } - - if (be32_to_cpu(cow_header.magic) != COW_MAGIC) { - error_setg(errp, "Image not in COW format"); - ret = -EINVAL; - goto fail; - } - - if (be32_to_cpu(cow_header.version) != COW_VERSION) { - char version[64]; - snprintf(version, sizeof(version), - "COW version %" PRIu32, cow_header.version); - error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "cow", version); - ret = -ENOTSUP; - goto fail; - } - - /* cow image found */ - size = be64_to_cpu(cow_header.size); - bs->total_sectors = size / 512; - - pstrcpy(bs->backing_file, sizeof(bs->backing_file), - cow_header.backing_file); - - bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header); - s->cow_sectors_offset = (bitmap_size + 511) & ~511; - qemu_co_mutex_init(&s->lock); - return 0; - fail: - return ret; -} - -static inline void cow_set_bits(uint8_t *bitmap, int start, int64_t nb_sectors) -{ - int64_t bitnum = start, last = start + nb_sectors; - while (bitnum < last) { - if ((bitnum & 7) == 0 && bitnum + 8 <= last) { - bitmap[bitnum / 8] = 0xFF; - bitnum += 8; - continue; - } - bitmap[bitnum/8] |= (1 << (bitnum % 8)); - bitnum++; - } -} - -#define BITS_PER_BITMAP_SECTOR (512 * 8) - -/* Cannot use bitmap.c on big-endian machines. */ -static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap) -{ - return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0; -} - -static int cow_find_streak(const uint8_t *bitmap, int value, int start, int nb_sectors) -{ - int streak_value = value ? 0xFF : 0; - int last = MIN(start + nb_sectors, BITS_PER_BITMAP_SECTOR); - int bitnum = start; - while (bitnum < last) { - if ((bitnum & 7) == 0 && bitmap[bitnum / 8] == streak_value) { - bitnum += 8; - continue; - } - if (cow_test_bit(bitnum, bitmap) == value) { - bitnum++; - continue; - } - break; - } - return MIN(bitnum, last) - start; -} - -/* Return true if first block has been changed (ie. current version is - * in COW file). Set the number of continuous blocks for which that - * is true. */ -static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *num_same) -{ - int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; - uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; - bool first = true; - int changed = 0, same = 0; - - do { - int ret; - uint8_t bitmap[BDRV_SECTOR_SIZE]; - - bitnum &= BITS_PER_BITMAP_SECTOR - 1; - int sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; - } - - if (first) { - changed = cow_test_bit(bitnum, bitmap); - first = false; - } - - same += cow_find_streak(bitmap, changed, bitnum, nb_sectors); - - bitnum += sector_bits; - nb_sectors -= sector_bits; - offset += BDRV_SECTOR_SIZE; - } while (nb_sectors); - - *num_same = same; - return changed; -} - -static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *num_same) -{ - BDRVCowState *s = bs->opaque; - int ret = cow_co_is_allocated(bs, sector_num, nb_sectors, num_same); - int64_t offset = s->cow_sectors_offset + (sector_num << BDRV_SECTOR_BITS); - if (ret < 0) { - return ret; - } - return (ret ? BDRV_BLOCK_DATA : 0) | offset | BDRV_BLOCK_OFFSET_VALID; -} - -static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; - uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; - bool first = true; - int sector_bits; - - for ( ; nb_sectors; - bitnum += sector_bits, - nb_sectors -= sector_bits, - offset += BDRV_SECTOR_SIZE) { - int ret, set; - uint8_t bitmap[BDRV_SECTOR_SIZE]; - - bitnum &= BITS_PER_BITMAP_SECTOR - 1; - sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; - } - - /* Skip over any already set bits */ - set = cow_find_streak(bitmap, 1, bitnum, sector_bits); - bitnum += set; - sector_bits -= set; - nb_sectors -= set; - if (!sector_bits) { - continue; - } - - if (first) { - ret = bdrv_flush(bs->file); - if (ret < 0) { - return ret; - } - first = false; - } - - cow_set_bits(bitmap, bitnum, sector_bits); - - ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; - } - } - - return 0; -} - -static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVCowState *s = bs->opaque; - int ret, n; - - while (nb_sectors > 0) { - ret = cow_co_is_allocated(bs, sector_num, nb_sectors, &n); - if (ret < 0) { - return ret; - } - if (ret) { - ret = bdrv_pread(bs->file, - s->cow_sectors_offset + sector_num * 512, - buf, n * 512); - if (ret < 0) { - return ret; - } - } else { - if (bs->backing_hd) { - /* read from the base image */ - ret = bdrv_read(bs->backing_hd, sector_num, buf, n); - if (ret < 0) { - return ret; - } - } else { - memset(buf, 0, n * 512); - } - } - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - return 0; -} - -static coroutine_fn int cow_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVCowState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = cow_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int cow_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BDRVCowState *s = bs->opaque; - int ret; - - ret = bdrv_pwrite(bs->file, s->cow_sectors_offset + sector_num * 512, - buf, nb_sectors * 512); - if (ret < 0) { - return ret; - } - - return cow_update_bitmap(bs, sector_num, nb_sectors); -} - -static coroutine_fn int cow_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVCowState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = cow_write(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static void cow_close(BlockDriverState *bs) -{ -} - -static int cow_create(const char *filename, QemuOpts *opts, Error **errp) -{ - struct cow_header_v2 cow_header; - struct stat st; - int64_t image_sectors = 0; - char *image_filename = NULL; - Error *local_err = NULL; - int ret; - BlockDriverState *cow_bs = NULL; - - /* Read out options */ - image_sectors = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - image_filename = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto exit; - } - - ret = bdrv_open(&cow_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto exit; - } - - memset(&cow_header, 0, sizeof(cow_header)); - cow_header.magic = cpu_to_be32(COW_MAGIC); - cow_header.version = cpu_to_be32(COW_VERSION); - if (image_filename) { - /* Note: if no file, we put a dummy mtime */ - cow_header.mtime = cpu_to_be32(0); - - if (stat(image_filename, &st) != 0) { - goto mtime_fail; - } - cow_header.mtime = cpu_to_be32(st.st_mtime); - mtime_fail: - pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file), - image_filename); - } - cow_header.sectorsize = cpu_to_be32(512); - cow_header.size = cpu_to_be64(image_sectors * 512); - ret = bdrv_pwrite(cow_bs, 0, &cow_header, sizeof(cow_header)); - if (ret < 0) { - goto exit; - } - - /* resize to include at least all the bitmap */ - ret = bdrv_truncate(cow_bs, - sizeof(cow_header) + ((image_sectors + 7) >> 3)); - if (ret < 0) { - goto exit; - } - -exit: - g_free(image_filename); - if (cow_bs) { - bdrv_unref(cow_bs); - } - return ret; -} - -static QemuOptsList cow_create_opts = { - .name = "cow-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(cow_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = QEMU_OPT_STRING, - .help = "File name of a base image" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_cow = { - .format_name = "cow", - .instance_size = sizeof(BDRVCowState), - - .bdrv_probe = cow_probe, - .bdrv_open = cow_open, - .bdrv_close = cow_close, - .bdrv_create = cow_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .supports_backing = true, - - .bdrv_read = cow_co_read, - .bdrv_write = cow_co_write, - .bdrv_co_get_block_status = cow_co_get_block_status, - - .create_opts = &cow_create_opts, -}; - -static void bdrv_cow_init(void) -{ - bdrv_register(&bdrv_cow); -} - -block_init(bdrv_cow_init); diff --git a/block/curl.c b/block/curl.c index 938f9d9..225407c 100644 --- a/block/curl.c +++ b/block/curl.c @@ -212,7 +212,7 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start, acb->end - acb->start); acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); + qemu_aio_unref(acb); s->acb[i] = NULL; } } @@ -304,7 +304,7 @@ static void curl_multi_check_completion(BDRVCURLState *s) } acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_release(acb); + qemu_aio_unref(acb); state->acb[i] = NULL; } } @@ -613,14 +613,8 @@ out_noclean: return -EINVAL; } -static void curl_aio_cancel(BlockDriverAIOCB *blockacb) -{ - // Do we have to implement canceling? Seems to work without... -} - static const AIOCBInfo curl_aiocb_info = { .aiocb_size = sizeof(CURLAIOCB), - .cancel = curl_aio_cancel, }; @@ -642,7 +636,7 @@ static void curl_readv_bh_cb(void *p) // we can just call the callback and be done. switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) { case FIND_RET_OK: - qemu_aio_release(acb); + qemu_aio_unref(acb); // fall through case FIND_RET_WAIT: return; @@ -654,7 +648,7 @@ static void curl_readv_bh_cb(void *p) state = curl_init_state(acb->common.bs, s); if (!state) { acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_release(acb); + qemu_aio_unref(acb); return; } @@ -670,7 +664,7 @@ static void curl_readv_bh_cb(void *p) if (state->buf_len && state->orig_buf == NULL) { curl_clean_state(state); acb->common.cb(acb->common.opaque, -ENOMEM); - qemu_aio_release(acb); + qemu_aio_unref(acb); return; } state->acb[0] = acb; diff --git a/block/iscsi.c b/block/iscsi.c index 84bcae8..3c1b416 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -88,7 +88,6 @@ typedef struct IscsiAIOCB { struct scsi_task *task; uint8_t *buf; int status; - int canceled; int64_t sector_num; int nb_sectors; #ifdef __linux__ @@ -120,16 +119,14 @@ iscsi_bh_cb(void *p) g_free(acb->buf); acb->buf = NULL; - if (acb->canceled == 0) { - acb->common.cb(acb->common.opaque, acb->status); - } + acb->common.cb(acb->common.opaque, acb->status); if (acb->task != NULL) { scsi_free_scsi_task(acb->task); acb->task = NULL; } - qemu_aio_release(acb); + qemu_aio_unref(acb); } static void @@ -240,20 +237,15 @@ iscsi_aio_cancel(BlockDriverAIOCB *blockacb) return; } - acb->canceled = 1; - /* send a task mgmt call to the target to cancel the task on the target */ iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task, iscsi_abort_task_cb, acb); - while (acb->status == -EINPROGRESS) { - aio_poll(iscsilun->aio_context, true); - } } static const AIOCBInfo iscsi_aiocb_info = { .aiocb_size = sizeof(IscsiAIOCB), - .cancel = iscsi_aio_cancel, + .cancel_async = iscsi_aio_cancel, }; @@ -638,10 +630,6 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, g_free(acb->buf); acb->buf = NULL; - if (acb->canceled != 0) { - return; - } - acb->status = 0; if (status < 0) { error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s", @@ -683,7 +671,6 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); acb->iscsilun = iscsilun; - acb->canceled = 0; acb->bh = NULL; acb->status = -EINPROGRESS; acb->buf = NULL; @@ -693,7 +680,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi command. %s", iscsi_get_error(iscsi)); - qemu_aio_release(acb); + qemu_aio_unref(acb); return NULL; } memset(acb->task, 0, sizeof(struct scsi_task)); @@ -731,7 +718,7 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, (data.size > 0) ? &data : NULL, acb) != 0) { scsi_free_scsi_task(acb->task); - qemu_aio_release(acb); + qemu_aio_unref(acb); return NULL; } diff --git a/block/linux-aio.c b/block/linux-aio.c index 9aca758..f4baba2 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -85,11 +85,10 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, ret = -EINVAL; } } - - laiocb->common.cb(laiocb->common.opaque, ret); } + laiocb->common.cb(laiocb->common.opaque, ret); - qemu_aio_release(laiocb); + qemu_aio_unref(laiocb); } /* The completion BH fetches completed I/O requests and invokes their @@ -153,35 +152,22 @@ static void laio_cancel(BlockDriverAIOCB *blockacb) struct io_event event; int ret; - if (laiocb->ret != -EINPROGRESS) + if (laiocb->ret != -EINPROGRESS) { return; - - /* - * Note that as of Linux 2.6.31 neither the block device code nor any - * filesystem implements cancellation of AIO request. - * Thus the polling loop below is the normal code path. - */ + } ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event); - if (ret == 0) { - laiocb->ret = -ECANCELED; + laiocb->ret = -ECANCELED; + if (ret != 0) { + /* iocb is not cancelled, cb will be called by the event loop later */ return; } - /* - * We have to wait for the iocb to finish. - * - * The only way to get the iocb status update is by polling the io context. - * We might be able to do this slightly more optimal by removing the - * O_NONBLOCK flag. - */ - while (laiocb->ret == -EINPROGRESS) { - qemu_laio_completion_cb(&laiocb->ctx->e); - } + laiocb->common.cb(laiocb->common.opaque, laiocb->ret); } static const AIOCBInfo laio_aiocb_info = { .aiocb_size = sizeof(struct qemu_laiocb), - .cancel = laio_cancel, + .cancel_async = laio_cancel, }; static void ioq_init(LaioQueue *io_q) @@ -300,7 +286,7 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, return &laiocb->common; out_free_aiocb: - qemu_aio_release(laiocb); + qemu_aio_unref(laiocb); return NULL; } diff --git a/block/null.c b/block/null.c new file mode 100644 index 0000000..8284419 --- /dev/null +++ b/block/null.c @@ -0,0 +1,168 @@ +/* + * Null block driver + * + * Authors: + * Fam Zheng <famz@redhat.com> + * + * Copyright (C) 2014 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "block/block_int.h" + +typedef struct { + int64_t length; +} BDRVNullState; + +static QemuOptsList runtime_opts = { + .name = "null", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "", + }, + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "size of the null block", + }, + { /* end of list */ } + }, +}; + +static int null_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + QemuOpts *opts; + BDRVNullState *s = bs->opaque; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &error_abort); + s->length = + qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); + qemu_opts_del(opts); + return 0; +} + +static void null_close(BlockDriverState *bs) +{ +} + +static int64_t null_getlength(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + return s->length; +} + +static coroutine_fn int null_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + return 0; +} + +static coroutine_fn int null_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + return 0; +} + +static coroutine_fn int null_co_flush(BlockDriverState *bs) +{ + return 0; +} + +typedef struct { + BlockDriverAIOCB common; + QEMUBH *bh; +} NullAIOCB; + +static const AIOCBInfo null_aiocb_info = { + .aiocb_size = sizeof(NullAIOCB), +}; + +static void null_bh_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + qemu_bh_delete(acb->bh); + qemu_aio_unref(acb); +} + +static inline BlockDriverAIOCB *null_aio_common(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + NullAIOCB *acb; + + acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); + qemu_bh_schedule(acb->bh); + return &acb->common; +} + +static BlockDriverAIOCB *null_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return null_aio_common(bs, cb, opaque); +} + +static BlockDriverAIOCB *null_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return null_aio_common(bs, cb, opaque); +} + +static BlockDriverAIOCB *null_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + return null_aio_common(bs, cb, opaque); +} + +static BlockDriver bdrv_null_co = { + .format_name = "null-co", + .protocol_name = "null-co", + .instance_size = sizeof(BDRVNullState), + + .bdrv_file_open = null_file_open, + .bdrv_close = null_close, + .bdrv_getlength = null_getlength, + + .bdrv_co_readv = null_co_readv, + .bdrv_co_writev = null_co_writev, + .bdrv_co_flush_to_disk = null_co_flush, +}; + +static BlockDriver bdrv_null_aio = { + .format_name = "null-aio", + .protocol_name = "null-aio", + .instance_size = sizeof(BDRVNullState), + + .bdrv_file_open = null_file_open, + .bdrv_close = null_close, + .bdrv_getlength = null_getlength, + + .bdrv_aio_readv = null_aio_readv, + .bdrv_aio_writev = null_aio_writev, + .bdrv_aio_flush = null_aio_flush, +}; + +static void bdrv_null_init(void) +{ + bdrv_register(&bdrv_null_co); + bdrv_register(&bdrv_null_aio); +} + +block_init(bdrv_null_init); diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 735f687..f7dd8c0 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -486,6 +486,13 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, goto out; } + if (offset_into_cluster(s, l2_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 + " unaligned (L1 index: %#" PRIx64 ")", + l2_offset, l1_index); + return -EIO; + } + /* load the l2 table in memory */ ret = l2_load(bs, l2_offset, &l2_table); @@ -508,8 +515,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, break; case QCOW2_CLUSTER_ZERO: if (s->qcow_version < 3) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - return -EIO; + qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found" + " in pre-v3 image (L2 offset: %#" PRIx64 + ", L2 index: %#x)", l2_offset, l2_index); + ret = -EIO; + goto fail; } c = count_contiguous_clusters(nb_clusters, s->cluster_size, &l2_table[l2_index], QCOW_OFLAG_ZERO); @@ -525,6 +535,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, c = count_contiguous_clusters(nb_clusters, s->cluster_size, &l2_table[l2_index], QCOW_OFLAG_ZERO); *cluster_offset &= L2E_OFFSET_MASK; + if (offset_into_cluster(s, *cluster_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#" + PRIx64 " unaligned (L2 offset: %#" PRIx64 + ", L2 index: %#x)", *cluster_offset, + l2_offset, l2_index); + ret = -EIO; + goto fail; + } break; default: abort(); @@ -541,6 +559,10 @@ out: *num = nb_available - index_in_cluster; return ret; + +fail: + qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + return ret; } /* @@ -576,6 +598,12 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, assert(l1_index < s->l1_size); l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + if (offset_into_cluster(s, l2_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 + " unaligned (L1 index: %#" PRIx64 ")", + l2_offset, l1_index); + return -EIO; + } /* seek the l2 table of the given l2 offset */ @@ -948,6 +976,15 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, bool offset_matches = (cluster_offset & L2E_OFFSET_MASK) == *host_offset; + if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { + qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " + "%#llx unaligned (guest offset: %#" PRIx64 + ")", cluster_offset & L2E_OFFSET_MASK, + guest_offset); + ret = -EIO; + goto out; + } + if (*host_offset != 0 && !offset_matches) { *bytes = 0; ret = 0; @@ -979,7 +1016,7 @@ out: /* Only return a host offset if we actually made progress. Otherwise we * would make requirements for handle_alloc() that it can't fulfill */ - if (ret) { + if (ret > 0) { *host_offset = (cluster_offset & L2E_OFFSET_MASK) + offset_into_cluster(s, guest_offset); } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 43665b8..2bcaaf9 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -26,8 +26,6 @@ #include "block/block_int.h" #include "block/qcow2.h" #include "qemu/range.h" -#include "qapi/qmp/types.h" -#include "qapi-event.h" static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, @@ -110,6 +108,13 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index) if (!refcount_block_offset) return 0; + if (offset_into_cluster(s, refcount_block_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64 + " unaligned (reftable index: %#" PRIx64 ")", + refcount_block_offset, refcount_table_index); + return -EIO; + } + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, (void**) &refcount_block); if (ret < 0) { @@ -183,6 +188,14 @@ static int alloc_refcount_block(BlockDriverState *bs, /* If it's already there, we're done */ if (refcount_block_offset) { + if (offset_into_cluster(s, refcount_block_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" + PRIx64 " unaligned (reftable index: " + "%#x)", refcount_block_offset, + refcount_table_index); + return -EIO; + } + return load_refcount_block(bs, refcount_block_offset, (void**) refcount_block); } @@ -838,8 +851,14 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, case QCOW2_CLUSTER_NORMAL: case QCOW2_CLUSTER_ZERO: if (l2_entry & L2E_OFFSET_MASK) { - qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, - nb_clusters << s->cluster_bits, type); + if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) { + qcow2_signal_corruption(bs, false, -1, -1, + "Cannot free unaligned cluster %#llx", + l2_entry & L2E_OFFSET_MASK); + } else { + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits, type); + } } break; case QCOW2_CLUSTER_UNALLOCATED: @@ -903,6 +922,14 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, old_l2_offset = l2_offset; l2_offset &= L1E_OFFSET_MASK; + if (offset_into_cluster(s, l2_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" + PRIx64 " unaligned (L1 index: %#x)", + l2_offset, i); + ret = -EIO; + goto fail; + } + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) &l2_table); if (ret < 0) { @@ -935,6 +962,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, case QCOW2_CLUSTER_NORMAL: case QCOW2_CLUSTER_ZERO: + if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) { + qcow2_signal_corruption(bs, true, -1, -1, "Data " + "cluster offset %#llx " + "unaligned (L2 offset: %#" + PRIx64 ", L2 index: %#x)", + offset & L2E_OFFSET_MASK, + l2_offset, j); + ret = -EIO; + goto fail; + } + cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; if (!cluster_index) { /* unallocated */ @@ -1838,26 +1876,11 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, return ret; } else if (ret > 0) { int metadata_ol_bitnr = ffs(ret) - 1; - char *message; - assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); - fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps " - "with %s); image marked as corrupt.\n", - metadata_ol_names[metadata_ol_bitnr]); - message = g_strdup_printf("Prevented %s overwrite", - metadata_ol_names[metadata_ol_bitnr]); - qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), - message, - true, - offset, - true, - size, - &error_abort); - g_free(message); - - qcow2_mark_corrupt(bs); - bs->drv = NULL; /* make BDS unusable */ + qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " + "write on metadata (overlaps with %s)", + metadata_ol_names[metadata_ol_bitnr]); return -EIO; } diff --git a/block/qcow2.c b/block/qcow2.c index 0daf25c..778fc1e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -31,6 +31,8 @@ #include "qapi/qmp/qerror.h" #include "qapi/qmp/qbool.h" #include "qapi/util.h" +#include "qapi/qmp/types.h" +#include "qapi-event.h" #include "trace.h" #include "qemu/option_int.h" @@ -404,6 +406,12 @@ static QemuOptsList qcow2_runtime_opts = { "templates (none, constant, cached, all)", }, { + .name = QCOW2_OPT_OVERLAP_TEMPLATE, + .type = QEMU_OPT_STRING, + .help = "Selects which overlap checks to perform from a range of " + "templates (none, constant, cached, all)", + }, + { .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, .type = QEMU_OPT_BOOL, .help = "Check for unintended writes into the main qcow2 header", @@ -536,11 +544,11 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, unsigned int len, i; int ret = 0; QCowHeader header; - QemuOpts *opts; + QemuOpts *opts = NULL; Error *local_err = NULL; uint64_t ext_end; uint64_t l1_vm_state_index; - const char *opt_overlap_check; + const char *opt_overlap_check, *opt_overlap_check_template; int overlap_check_template = 0; uint64_t l2_cache_size, refcount_cache_size; @@ -920,7 +928,21 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, s->discard_passthrough[QCOW2_DISCARD_OTHER] = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); - opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached"; + opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP); + opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE); + if (opt_overlap_check_template && opt_overlap_check && + strcmp(opt_overlap_check_template, opt_overlap_check)) + { + error_setg(errp, "Conflicting values for qcow2 options '" + QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE + "' ('%s')", opt_overlap_check, opt_overlap_check_template); + ret = -EINVAL; + goto fail; + } + if (!opt_overlap_check) { + opt_overlap_check = opt_overlap_check_template ?: "cached"; + } + if (!strcmp(opt_overlap_check, "none")) { overlap_check_template = 0; } else if (!strcmp(opt_overlap_check, "constant")) { @@ -933,7 +955,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "Unsupported value '%s' for qcow2 option " "'overlap-check'. Allowed are either of the following: " "none, constant, cached, all", opt_overlap_check); - qemu_opts_del(opts); ret = -EINVAL; goto fail; } @@ -948,6 +969,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, } qemu_opts_del(opts); + opts = NULL; if (s->use_lazy_refcounts && s->qcow_version < 3) { error_setg(errp, "Lazy refcounts require a qcow2 image with at least " @@ -965,6 +987,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, return ret; fail: + qemu_opts_del(opts); g_free(s->unknown_header_fields); cleanup_unknown_header_ext(bs); qcow2_free_snapshots(bs); @@ -2529,6 +2552,52 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts) return 0; } +/* + * If offset or size are negative, respectively, they will not be included in + * the BLOCK_IMAGE_CORRUPTED event emitted. + * fatal will be ignored for read-only BDS; corruptions found there will always + * be considered non-fatal. + */ +void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, + int64_t size, const char *message_format, ...) +{ + BDRVQcowState *s = bs->opaque; + char *message; + va_list ap; + + fatal = fatal && !bs->read_only; + + if (s->signaled_corruption && + (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT))) + { + return; + } + + va_start(ap, message_format); + message = g_strdup_vprintf(message_format, ap); + va_end(ap); + + if (fatal) { + fprintf(stderr, "qcow2: Marking image as corrupt: %s; further " + "corruption events will be suppressed\n", message); + } else { + fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal " + "corruption events will be suppressed\n", message); + } + + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), message, + offset >= 0, offset, size >= 0, size, + fatal, &error_abort); + g_free(message); + + if (fatal) { + qcow2_mark_corrupt(bs); + bs->drv = NULL; /* make BDS unusable */ + } + + s->signaled_corruption = true; +} + static QemuOptsList qcow2_create_opts = { .name = "qcow2-create-opts", .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head), diff --git a/block/qcow2.h b/block/qcow2.h index 6aeb7ea..7d61e61 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -83,6 +83,7 @@ #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" #define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template" #define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" #define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" #define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" @@ -261,6 +262,7 @@ typedef struct BDRVQcowState { bool discard_passthrough[QCOW2_DISCARD_MAX]; int overlap_check; /* bitmask of Qcow2MetadataOverlap values */ + bool signaled_corruption; uint64_t incompatible_features; uint64_t compatible_features; @@ -477,6 +479,10 @@ int qcow2_mark_corrupt(BlockDriverState *bs); int qcow2_mark_consistent(BlockDriverState *bs); int qcow2_update_header(BlockDriverState *bs); +void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, + int64_t size, const char *message_format, ...) + GCC_FMT_ATTR(5, 6); + /* qcow2-refcount.c functions */ int qcow2_refcount_init(BlockDriverState *bs); void qcow2_refcount_close(BlockDriverState *bs); diff --git a/block/qed.c b/block/qed.c index f8d9e12..e2316d7 100644 --- a/block/qed.c +++ b/block/qed.c @@ -18,22 +18,8 @@ #include "qapi/qmp/qerror.h" #include "migration/migration.h" -static void qed_aio_cancel(BlockDriverAIOCB *blockacb) -{ - QEDAIOCB *acb = (QEDAIOCB *)blockacb; - AioContext *aio_context = bdrv_get_aio_context(blockacb->bs); - bool finished = false; - - /* Wait for the request to finish */ - acb->finished = &finished; - while (!finished) { - aio_poll(aio_context, true); - } -} - static const AIOCBInfo qed_aiocb_info = { .aiocb_size = sizeof(QEDAIOCB), - .cancel = qed_aio_cancel, }; static int bdrv_qed_probe(const uint8_t *buf, int buf_size, @@ -919,18 +905,12 @@ static void qed_aio_complete_bh(void *opaque) BlockDriverCompletionFunc *cb = acb->common.cb; void *user_opaque = acb->common.opaque; int ret = acb->bh_ret; - bool *finished = acb->finished; qemu_bh_delete(acb->bh); - qemu_aio_release(acb); + qemu_aio_unref(acb); /* Invoke callback */ cb(user_opaque, ret); - - /* Signal cancel completion */ - if (finished) { - *finished = true; - } } static void qed_aio_complete(QEDAIOCB *acb, int ret) @@ -1397,7 +1377,6 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs, opaque, flags); acb->flags = flags; - acb->finished = NULL; acb->qiov = qiov; acb->qiov_offset = 0; acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; diff --git a/block/quorum.c b/block/quorum.c index 093382e..7687466 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -138,16 +138,15 @@ static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) /* cancel all callbacks */ for (i = 0; i < s->num_children; i++) { - bdrv_aio_cancel(acb->qcrs[i].aiocb); + if (acb->qcrs[i].aiocb) { + bdrv_aio_cancel_async(acb->qcrs[i].aiocb); + } } - - g_free(acb->qcrs); - qemu_aio_release(acb); } static AIOCBInfo quorum_aiocb_info = { .aiocb_size = sizeof(QuorumAIOCB), - .cancel = quorum_aio_cancel, + .cancel_async = quorum_aio_cancel, }; static void quorum_aio_finalize(QuorumAIOCB *acb) @@ -169,7 +168,7 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) } g_free(acb->qcrs); - qemu_aio_release(acb); + qemu_aio_unref(acb); } static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) diff --git a/block/rbd.c b/block/rbd.c index b7f7d5f..96947e3 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -77,7 +77,6 @@ typedef struct RBDAIOCB { int64_t sector_num; int error; struct BDRVRBDState *s; - int cancelled; int status; } RBDAIOCB; @@ -408,9 +407,7 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); acb->status = 0; - if (!acb->cancelled) { - qemu_aio_release(acb); - } + qemu_aio_unref(acb); } /* TODO Convert to fine grained options */ @@ -539,25 +536,8 @@ static void qemu_rbd_close(BlockDriverState *bs) rados_shutdown(s->cluster); } -/* - * Cancel aio. Since we don't reference acb in a non qemu threads, - * it is safe to access it here. - */ -static void qemu_rbd_aio_cancel(BlockDriverAIOCB *blockacb) -{ - RBDAIOCB *acb = (RBDAIOCB *) blockacb; - acb->cancelled = 1; - - while (acb->status == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(acb->common.bs), true); - } - - qemu_aio_release(acb); -} - static const AIOCBInfo rbd_aiocb_info = { .aiocb_size = sizeof(RBDAIOCB), - .cancel = qemu_rbd_aio_cancel, }; static void rbd_finish_bh(void *opaque) @@ -640,7 +620,6 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, acb->ret = 0; acb->error = 0; acb->s = s; - acb->cancelled = 0; acb->bh = NULL; acb->status = -EINPROGRESS; @@ -692,7 +671,7 @@ failed_completion: failed: g_free(rcb); qemu_vfree(acb->bounce); - qemu_aio_release(acb); + qemu_aio_unref(acb); return NULL; } diff --git a/block/sheepdog.c b/block/sheepdog.c index 7da36e1..2d78ef9 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -315,7 +315,6 @@ struct SheepdogAIOCB { void (*aio_done_func)(SheepdogAIOCB *); bool cancelable; - bool *finished; int nr_pending; }; @@ -446,10 +445,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) { qemu_coroutine_enter(acb->coroutine, NULL); - if (acb->finished) { - *acb->finished = true; - } - qemu_aio_release(acb); + qemu_aio_unref(acb); } /* @@ -482,36 +478,33 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb) SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; BDRVSheepdogState *s = acb->common.bs->opaque; AIOReq *aioreq, *next; - bool finished = false; - - acb->finished = &finished; - while (!finished) { - if (sd_acb_cancelable(acb)) { - /* Remove outstanding requests from pending and failed queues. */ - QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings, - next) { - if (aioreq->aiocb == acb) { - free_aio_req(s, aioreq); - } + + if (sd_acb_cancelable(acb)) { + /* Remove outstanding requests from pending and failed queues. */ + QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); } - QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, - next) { - if (aioreq->aiocb == acb) { - free_aio_req(s, aioreq); - } + } + QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); } + } - assert(acb->nr_pending == 0); - sd_finish_aiocb(acb); - return; + assert(acb->nr_pending == 0); + if (acb->common.cb) { + acb->common.cb(acb->common.opaque, -ECANCELED); } - aio_poll(s->aio_context, true); + sd_finish_aiocb(acb); } } static const AIOCBInfo sd_aiocb_info = { - .aiocb_size = sizeof(SheepdogAIOCB), - .cancel = sd_aio_cancel, + .aiocb_size = sizeof(SheepdogAIOCB), + .cancel_async = sd_aio_cancel, }; static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, @@ -528,7 +521,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->aio_done_func = NULL; acb->cancelable = true; - acb->finished = NULL; acb->coroutine = qemu_coroutine_self(); acb->ret = 0; acb->nr_pending = 0; @@ -2138,7 +2130,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, ret = sd_co_rw_vector(acb); if (ret <= 0) { - qemu_aio_release(acb); + qemu_aio_unref(acb); return ret; } @@ -2159,7 +2151,7 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, ret = sd_co_rw_vector(acb); if (ret <= 0) { - qemu_aio_release(acb); + qemu_aio_unref(acb); return ret; } @@ -2518,7 +2510,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, ret = sd_co_rw_vector(acb); if (ret <= 0) { - qemu_aio_release(acb); + qemu_aio_unref(acb); return ret; } diff --git a/block/vhdx.c b/block/vhdx.c index 796b7bd..b046448 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -99,7 +99,8 @@ static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d, /* Each parent type must have a valid GUID; this is for parent images * of type 'VHDX'. If we were to allow e.g. a QCOW2 parent, we would * need to make up our own QCOW2 GUID type */ -static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7, +static const MSGUID parent_vhdx_guid __attribute__((unused)) + = { .data1 = 0xb04aefb7, .data2 = 0xd19e, .data3 = 0x4a81, .data4 = { 0xb7, 0x89, 0x25, 0xb8, @@ -1407,6 +1408,12 @@ exit: return ret; } +#define VHDX_METADATA_ENTRY_BUFFER_SIZE \ + (sizeof(VHDXFileParameters) +\ + sizeof(VHDXVirtualDiskSize) +\ + sizeof(VHDXPage83Data) +\ + sizeof(VHDXVirtualDiskLogicalSectorSize) +\ + sizeof(VHDXVirtualDiskPhysicalSectorSize)) /* * Create the Metadata entries. @@ -1445,11 +1452,7 @@ static int vhdx_create_new_metadata(BlockDriverState *bs, VHDXVirtualDiskLogicalSectorSize *mt_log_sector_size; VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size; - entry_buffer = g_malloc0(sizeof(VHDXFileParameters) + - sizeof(VHDXVirtualDiskSize) + - sizeof(VHDXPage83Data) + - sizeof(VHDXVirtualDiskLogicalSectorSize) + - sizeof(VHDXVirtualDiskPhysicalSectorSize)); + entry_buffer = g_malloc0(VHDX_METADATA_ENTRY_BUFFER_SIZE); mt_file_params = entry_buffer; offset += sizeof(VHDXFileParameters); @@ -1530,7 +1533,7 @@ static int vhdx_create_new_metadata(BlockDriverState *bs, } ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer, - VHDX_HEADER_BLOCK_SIZE); + VHDX_METADATA_ENTRY_BUFFER_SIZE); if (ret < 0) { goto exit; } @@ -1593,7 +1596,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, bdrv_has_zero_init(bs) == 0) { /* for a fixed file, the default BAT entry is not zero */ s->bat = g_try_malloc0(length); - if (length && s->bat != NULL) { + if (length && s->bat == NULL) { ret = -ENOMEM; goto exit; } @@ -1725,7 +1728,6 @@ static int vhdx_create_new_region_table(BlockDriverState *bs, goto exit; } - exit: g_free(s); g_free(buffer); @@ -1876,7 +1878,6 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) } - delete_and_exit: bdrv_unref(bs); exit: diff --git a/block/win32-aio.c b/block/win32-aio.c index 5030e32..9323c1a 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -88,7 +88,7 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s, waiocb->common.cb(waiocb->common.opaque, ret); - qemu_aio_release(waiocb); + qemu_aio_unref(waiocb); } static void win32_aio_completion_cb(EventNotifier *e) @@ -106,22 +106,8 @@ static void win32_aio_completion_cb(EventNotifier *e) } } -static void win32_aio_cancel(BlockDriverAIOCB *blockacb) -{ - QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb; - - /* - * CancelIoEx is only supported in Vista and newer. For now, just - * wait for completion. - */ - while (!HasOverlappedIoCompleted(&waiocb->ov)) { - aio_poll(bdrv_get_aio_context(blockacb->bs), true); - } -} - static const AIOCBInfo win32_aiocb_info = { .aiocb_size = sizeof(QEMUWin32AIOCB), - .cancel = win32_aio_cancel, }; BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, @@ -172,7 +158,7 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, out_dec_count: aio->count--; out: - qemu_aio_release(waiocb); + qemu_aio_unref(waiocb); return NULL; } @@ -327,7 +327,6 @@ glusterfs="" glusterfs_discard="no" glusterfs_zerofill="no" archipelago="" -virtio_blk_data_plane="" gtk="" gtkabi="" vte="" @@ -1093,9 +1092,8 @@ for opt do ;; --enable-archipelago) archipelago="yes" ;; - --disable-virtio-blk-data-plane) virtio_blk_data_plane="no" - ;; - --enable-virtio-blk-data-plane) virtio_blk_data_plane="yes" + --disable-virtio-blk-data-plane|--enable-virtio-blk-data-plane) + echo "$0: $opt is obsolete, virtio-blk data-plane is always on" >&2 ;; --disable-gtk) gtk="no" ;; @@ -1350,7 +1348,7 @@ Advanced options (experts only): --enable-linux-aio enable Linux AIO support --disable-cap-ng disable libcap-ng support --enable-cap-ng enable libcap-ng support - --disable-attr disables attr and xattr support + --disable-attr disable attr and xattr support --enable-attr enable attr and xattr support --disable-blobs disable installing provided firmware blobs --enable-docs enable documentation build @@ -1381,7 +1379,7 @@ Advanced options (experts only): --with-vss-sdk=SDK-path enable Windows VSS support in QEMU Guest Agent --with-win-sdk=SDK-path path to Windows Platform SDK (to build VSS .tlb) --disable-seccomp disable seccomp support - --enable-seccomp enables seccomp support + --enable-seccomp enable seccomp support --with-coroutine=BACKEND coroutine backend. Supported options: gthread, ucontext, sigaltstack, windows --disable-coroutine-pool disable coroutine freelist (worse performance) @@ -1396,7 +1394,7 @@ Advanced options (experts only): --enable-tpm enable TPM support --disable-libssh2 disable ssh block device support --enable-libssh2 enable ssh block device support - --disable-vhdx disables support for the Microsoft VHDX image format + --disable-vhdx disable support for the Microsoft VHDX image format --enable-vhdx enable support for the Microsoft VHDX image format --disable-quorum disable quorum block filter support --enable-quorum enable quorum block filter support @@ -2944,16 +2942,6 @@ else fi ########################################## -# adjust virtio-blk-data-plane based on linux-aio - -if test "$virtio_blk_data_plane" = "yes" -a \ - "$linux_aio" != "yes" ; then - error_exit "virtio-blk-data-plane requires Linux AIO, please try --enable-linux-aio" -elif test -z "$virtio_blk_data_plane" ; then - virtio_blk_data_plane=$linux_aio -fi - -########################################## # attr probe if test "$attr" != "no" ; then @@ -4327,7 +4315,6 @@ echo "coroutine backend $coroutine" echo "coroutine pool $coroutine_pool" echo "GlusterFS support $glusterfs" echo "Archipelago support $archipelago" -echo "virtio-blk-data-plane $virtio_blk_data_plane" echo "gcov $gcov_tool" echo "gcov enabled $gcov" echo "TPM support $tpm" @@ -4789,10 +4776,6 @@ if test "$quorum" = "yes" ; then echo "CONFIG_QUORUM=y" >> $config_host_mak fi -if test "$virtio_blk_data_plane" = "yes" ; then - echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=$(CONFIG_VIRTIO)' >> $config_host_mak -fi - if test "$vhdx" = "yes" ; then echo "CONFIG_VHDX=y" >> $config_host_mak fi diff --git a/dma-helpers.c b/dma-helpers.c index f6811fa..7f86e18 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -73,7 +73,6 @@ typedef struct { QEMUSGList *sg; uint64_t sector_num; DMADirection dir; - bool in_cancel; int sg_cur_index; dma_addr_t sg_cur_byte; QEMUIOVector iov; @@ -125,12 +124,7 @@ static void dma_complete(DMAAIOCB *dbs, int ret) qemu_bh_delete(dbs->bh); dbs->bh = NULL; } - if (!dbs->in_cancel) { - /* Requests may complete while dma_aio_cancel is in progress. In - * this case, the AIOCB should not be released because it is still - * referenced by dma_aio_cancel. */ - qemu_aio_release(dbs); - } + qemu_aio_unref(dbs); } static void dma_bdrv_cb(void *opaque, int ret) @@ -186,19 +180,14 @@ static void dma_aio_cancel(BlockDriverAIOCB *acb) trace_dma_aio_cancel(dbs); if (dbs->acb) { - BlockDriverAIOCB *acb = dbs->acb; - dbs->acb = NULL; - dbs->in_cancel = true; - bdrv_aio_cancel(acb); - dbs->in_cancel = false; + bdrv_aio_cancel_async(dbs->acb); } - dbs->common.cb = NULL; - dma_complete(dbs, 0); } + static const AIOCBInfo dma_aiocb_info = { .aiocb_size = sizeof(DMAAIOCB), - .cancel = dma_aio_cancel, + .cancel_async = dma_aio_cancel, }; BlockDriverAIOCB *dma_bdrv_io( @@ -217,7 +206,6 @@ BlockDriverAIOCB *dma_bdrv_io( dbs->sg_cur_index = 0; dbs->sg_cur_byte = 0; dbs->dir = dir; - dbs->in_cancel = false; dbs->io_func = io_func; dbs->bh = NULL; qemu_iovec_init(&dbs->iov, sg->nsg); diff --git a/docs/image-fuzzer.txt b/docs/image-fuzzer.txt index 0d0005d..3e23ebe 100644 --- a/docs/image-fuzzer.txt +++ b/docs/image-fuzzer.txt @@ -125,7 +125,8 @@ If a fuzzer configuration is specified, then it has the next interpretation: will be always fuzzed for every test. This case is useful for regression testing. -For now only header fields, header extensions and L1/L2 tables are generated. +The generator can create header fields, header extensions, L1/L2 tables and +refcount table and blocks. Module interfaces ----------------- diff --git a/docs/rdma.txt b/docs/rdma.txt index 1f5d9e9..2bdd0a5 100644 --- a/docs/rdma.txt +++ b/docs/rdma.txt @@ -18,7 +18,7 @@ Contents: * RDMA Migration Protocol Description * Versioning and Capabilities * QEMUFileRDMA Interface -* Migration of pc.ram +* Migration of VM's ram * Error handling * TODO @@ -149,7 +149,7 @@ The only difference between a SEND message and an RDMA message is that SEND messages cause notifications to be posted to the completion queue (CQ) on the infiniband receiver side, whereas RDMA messages (used -for pc.ram) do not (to behave like an actual DMA). +for VM's ram) do not (to behave like an actual DMA). Messages in infiniband require two things: @@ -355,7 +355,7 @@ If the buffer is empty, then we follow the same steps listed above and issue another "QEMU File" protocol command, asking for a new SEND message to re-fill the buffer. -Migration of pc.ram: +Migration of VM's ram: ==================== At the beginning of the migration, (migration-rdma.c), @@ -679,6 +679,8 @@ void hmp_info_block_jobs(Monitor *mon, const QDict *qdict) } list = list->next; } + + qapi_free_BlockJobInfoList(list); } void hmp_info_tpm(Monitor *mon, const QDict *qdict) diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs index bf46f03..d4c3ab7 100644 --- a/hw/block/Makefile.objs +++ b/hw/block/Makefile.objs @@ -12,4 +12,4 @@ common-obj-$(CONFIG_NVME_PCI) += nvme.o obj-$(CONFIG_SH4) += tc58128.o obj-$(CONFIG_VIRTIO) += virtio-blk.o -obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += dataplane/ +obj-$(CONFIG_VIRTIO) += dataplane/ diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 38ad38f..45e0c8f 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -18,10 +18,8 @@ #include "hw/block/block.h" #include "sysemu/blockdev.h" #include "hw/virtio/virtio-blk.h" -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE -# include "dataplane/virtio-blk.h" -# include "migration/migration.h" -#endif +#include "dataplane/virtio-blk.h" +#include "migration/migration.h" #include "block/scsi.h" #ifdef __linux__ # include <scsi/sg.h> @@ -435,7 +433,6 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) .num_writes = 0, }; -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start * dataplane here instead of waiting for .set_status(). */ @@ -443,7 +440,6 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) virtio_blk_data_plane_start(s->dataplane); return; } -#endif while ((req = virtio_blk_get_request(s))) { virtio_blk_handle_request(req, &mrb); @@ -500,11 +496,9 @@ static void virtio_blk_reset(VirtIODevice *vdev) { VirtIOBlock *s = VIRTIO_BLK(vdev); -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE if (s->dataplane) { virtio_blk_data_plane_stop(s->dataplane); } -#endif /* * This should cancel pending requests, but can't do nicely until there @@ -594,12 +588,10 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status) VirtIOBlock *s = VIRTIO_BLK(vdev); uint32_t features; -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE if (s->dataplane && !(status & (VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK))) { virtio_blk_data_plane_stop(s->dataplane); } -#endif if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { return; @@ -694,7 +686,6 @@ static const BlockDevOps virtio_block_ops = { .resize_cb = virtio_blk_resize, }; -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE /* Disable dataplane thread during live migration since it does not * update the dirty memory bitmap yet. */ @@ -725,7 +716,6 @@ static void virtio_blk_migration_state_changed(Notifier *notifier, void *data) } } } -#endif /* CONFIG_VIRTIO_BLK_DATA_PLANE */ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) { @@ -762,7 +752,6 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) s->vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output); s->complete_request = virtio_blk_complete_request; -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE virtio_blk_data_plane_create(vdev, blk, &s->dataplane, &err); if (err != NULL) { error_propagate(errp, err); @@ -771,7 +760,6 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) } s->migration_state_notifier.notify = virtio_blk_migration_state_changed; add_migration_state_change_notifier(&s->migration_state_notifier); -#endif s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s); register_savevm(dev, "virtio-blk", virtio_blk_id++, 2, @@ -789,11 +777,9 @@ static void virtio_blk_device_unrealize(DeviceState *dev, Error **errp) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBlock *s = VIRTIO_BLK(dev); -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE remove_migration_state_change_notifier(&s->migration_state_notifier); virtio_blk_data_plane_destroy(s->dataplane); s->dataplane = NULL; -#endif qemu_del_vm_change_state_handler(s->change); unregister_savevm(dev, "virtio-blk", s); blockdev_mark_auto_del(s->bs); @@ -818,9 +804,7 @@ static Property virtio_blk_properties[] = { #ifdef __linux__ DEFINE_PROP_BIT("scsi", VirtIOBlock, blk.scsi, 0, true), #endif -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE DEFINE_PROP_BIT("x-data-plane", VirtIOBlock, blk.data_plane, 0, false), -#endif DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 2c2e9dc..82a7daa 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -75,7 +75,7 @@ /* Leave a chunk of memory at the top of RAM for the BIOS ACPI tables * (128K) and other BIOS datastructures (less than 4K reported to be used at * the moment, 32K should be enough for a while). */ -unsigned acpi_data_size = 0x20000 + 0x8000; +static unsigned acpi_data_size = 0x20000 + 0x8000; void pc_set_legacy_acpi_data_size(void) { acpi_data_size = 0x10000; diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index ba69de3..8978643 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -78,8 +78,7 @@ static uint32_t ahci_port_read(AHCIState *s, int port, int offset) val = pr->cmd; break; case PORT_TFDATA: - val = ((uint16_t)s->dev[port].port.ifs[0].error << 8) | - s->dev[port].port.ifs[0].status; + val = pr->tfdata; break; case PORT_SIG: val = pr->sig; @@ -251,14 +250,13 @@ static void ahci_port_write(AHCIState *s, int port, int offset, uint32_t val) check_cmd(s, port); break; case PORT_TFDATA: - s->dev[port].port.ifs[0].error = (val >> 8) & 0xff; - s->dev[port].port.ifs[0].status = val & 0xff; + /* Read Only. */ break; case PORT_SIG: - pr->sig = val; + /* Read Only */ break; case PORT_SCR_STAT: - pr->scr_stat = val; + /* Read Only */ break; case PORT_SCR_CTL: if (((pr->scr_ctl & AHCI_SCR_SCTL_DET) == 1) && @@ -497,6 +495,8 @@ static void ahci_reset_port(AHCIState *s, int port) pr->scr_stat = 0; pr->scr_err = 0; pr->scr_act = 0; + pr->tfdata = 0x7F; + pr->sig = 0xFFFFFFFF; d->busy_slot = -1; d->init_d2h_sent = false; @@ -528,16 +528,16 @@ static void ahci_reset_port(AHCIState *s, int port) s->dev[port].port_state = STATE_RUN; if (!ide_state->bs) { - s->dev[port].port_regs.sig = 0; + pr->sig = 0; ide_state->status = SEEK_STAT | WRERR_STAT; } else if (ide_state->drive_kind == IDE_CD) { - s->dev[port].port_regs.sig = SATA_SIGNATURE_CDROM; + pr->sig = SATA_SIGNATURE_CDROM; ide_state->lcyl = 0x14; ide_state->hcyl = 0xeb; DPRINTF(port, "set lcyl = %d\n", ide_state->lcyl); ide_state->status = SEEK_STAT | WRERR_STAT | READY_STAT; } else { - s->dev[port].port_regs.sig = SATA_SIGNATURE_DISK; + pr->sig = SATA_SIGNATURE_DISK; ide_state->status = SEEK_STAT | WRERR_STAT; } @@ -563,7 +563,8 @@ static void debug_print_fis(uint8_t *fis, int cmd_len) static void ahci_write_fis_sdb(AHCIState *s, int port, uint32_t finished) { - AHCIPortRegs *pr = &s->dev[port].port_regs; + AHCIDevice *ad = &s->dev[port]; + AHCIPortRegs *pr = &ad->port_regs; IDEState *ide_state; uint8_t *sdb_fis; @@ -572,8 +573,8 @@ static void ahci_write_fis_sdb(AHCIState *s, int port, uint32_t finished) return; } - sdb_fis = &s->dev[port].res_fis[RES_FIS_SDBFIS]; - ide_state = &s->dev[port].port.ifs[0]; + sdb_fis = &ad->res_fis[RES_FIS_SDBFIS]; + ide_state = &ad->port.ifs[0]; /* clear memory */ *(uint32_t*)sdb_fis = 0; @@ -582,9 +583,14 @@ static void ahci_write_fis_sdb(AHCIState *s, int port, uint32_t finished) sdb_fis[0] = ide_state->error; sdb_fis[2] = ide_state->status & 0x77; s->dev[port].finished |= finished; - *(uint32_t*)(sdb_fis + 4) = cpu_to_le32(s->dev[port].finished); + *(uint32_t*)(sdb_fis + 4) = cpu_to_le32(ad->finished); + + /* Update shadow registers (except BSY 0x80 and DRQ 0x08) */ + pr->tfdata = (ad->port.ifs[0].error << 8) | + (ad->port.ifs[0].status & 0x77) | + (pr->tfdata & 0x88); - ahci_trigger_irq(s, &s->dev[port], PORT_IRQ_SDB_FIS); + ahci_trigger_irq(s, ad, PORT_IRQ_SDB_FIS); } static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len) @@ -642,6 +648,10 @@ static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t len) pio_fis[18] = 0; pio_fis[19] = 0; + /* Update shadow registers: */ + pr->tfdata = (ad->port.ifs[0].error << 8) | + ad->port.ifs[0].status; + if (pio_fis[2] & ERR_STAT) { ahci_trigger_irq(ad->hba, ad, PORT_IRQ_TF_ERR); } @@ -693,6 +703,10 @@ static void ahci_write_fis_d2h(AHCIDevice *ad, uint8_t *cmd_fis) d2h_fis[i] = 0; } + /* Update shadow registers: */ + pr->tfdata = (ad->port.ifs[0].error << 8) | + ad->port.ifs[0].status; + if (d2h_fis[2] & ERR_STAT) { ahci_trigger_irq(ad->hba, ad, PORT_IRQ_TF_ERR); } @@ -791,6 +805,9 @@ static void ncq_cb(void *opaque, int ret) NCQTransferState *ncq_tfs = (NCQTransferState *)opaque; IDEState *ide_state = &ncq_tfs->drive->port.ifs[0]; + if (ret == -ECANCELED) { + return; + } /* Clear bit for this tag in SActive */ ncq_tfs->drive->port_regs.scr_act &= ~(1 << ncq_tfs->tag); diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c index 6d52cda..10218df 100644 --- a/hw/ide/atapi.c +++ b/hw/ide/atapi.c @@ -136,6 +136,7 @@ void ide_atapi_cmd_ok(IDEState *s) s->error = 0; s->status = READY_STAT | SEEK_STAT; s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD; + ide_transfer_stop(s); ide_set_irq(s->bus); } @@ -149,6 +150,7 @@ void ide_atapi_cmd_error(IDEState *s, int sense_key, int asc) s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD; s->sense_key = sense_key; s->asc = asc; + ide_transfer_stop(s); ide_set_irq(s->bus); } @@ -176,9 +178,7 @@ void ide_atapi_cmd_reply_end(IDEState *s) #endif if (s->packet_transfer_size <= 0) { /* end of transfer */ - s->status = READY_STAT | SEEK_STAT; - s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD; - ide_transfer_stop(s); + ide_atapi_cmd_ok(s); ide_set_irq(s->bus); #ifdef DEBUG_IDE_ATAPI printf("status=0x%x\n", s->status); @@ -188,7 +188,6 @@ void ide_atapi_cmd_reply_end(IDEState *s) if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) { ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size); if (ret < 0) { - ide_transfer_stop(s); ide_atapi_io_error(s, ret); return; } diff --git a/hw/ide/core.c b/hw/ide/core.c index 6fba056..190700a 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -372,23 +372,21 @@ static void trim_aio_cancel(BlockDriverAIOCB *acb) { TrimAIOCB *iocb = container_of(acb, TrimAIOCB, common); - /* Exit the loop in case bdrv_aio_cancel calls ide_issue_trim_cb again. */ + /* Exit the loop so ide_issue_trim_cb will not continue */ iocb->j = iocb->qiov->niov - 1; iocb->i = (iocb->qiov->iov[iocb->j].iov_len / 8) - 1; - /* Tell ide_issue_trim_cb not to trigger the completion, too. */ - qemu_bh_delete(iocb->bh); - iocb->bh = NULL; + iocb->ret = -ECANCELED; if (iocb->aiocb) { - bdrv_aio_cancel(iocb->aiocb); + bdrv_aio_cancel_async(iocb->aiocb); + iocb->aiocb = NULL; } - qemu_aio_release(iocb); } static const AIOCBInfo trim_aiocb_info = { .aiocb_size = sizeof(TrimAIOCB), - .cancel = trim_aio_cancel, + .cancel_async = trim_aio_cancel, }; static void ide_trim_bh_cb(void *opaque) @@ -399,7 +397,7 @@ static void ide_trim_bh_cb(void *opaque) qemu_bh_delete(iocb->bh); iocb->bh = NULL; - qemu_aio_release(iocb); + qemu_aio_unref(iocb); } static void ide_issue_trim_cb(void *opaque, int ret) @@ -568,6 +566,9 @@ static void ide_sector_read_cb(void *opaque, int ret) s->pio_aiocb = NULL; s->status &= ~BUSY_STAT; + if (ret == -ECANCELED) { + return; + } block_acct_done(bdrv_get_stats(s->bs), &s->acct); if (ret != 0) { if (ide_handle_rw_error(s, -ret, IDE_RETRY_PIO | @@ -678,6 +679,9 @@ void ide_dma_cb(void *opaque, int ret) int64_t sector_num; bool stay_active = false; + if (ret == -ECANCELED) { + return; + } if (ret < 0) { int op = IDE_RETRY_DMA; @@ -803,6 +807,9 @@ static void ide_sector_write_cb(void *opaque, int ret) IDEState *s = opaque; int n; + if (ret == -ECANCELED) { + return; + } block_acct_done(bdrv_get_stats(s->bs), &s->acct); s->pio_aiocb = NULL; @@ -882,6 +889,9 @@ static void ide_flush_cb(void *opaque, int ret) s->pio_aiocb = NULL; + if (ret == -ECANCELED) { + return; + } if (ret < 0) { /* XXX: What sector number to set here? */ if (ide_handle_rw_error(s, -ret, IDE_RETRY_FLUSH)) { diff --git a/hw/ide/ich.c b/hw/ide/ich.c index a2f1639..8eb77a1 100644 --- a/hw/ide/ich.c +++ b/hw/ide/ich.c @@ -71,6 +71,7 @@ #include <hw/ide/pci.h> #include <hw/ide/ahci.h> +#define ICH9_MSI_CAP_OFFSET 0x80 #define ICH9_SATA_CAP_OFFSET 0xA8 #define ICH9_IDP_BAR 4 @@ -115,7 +116,6 @@ static int pci_ich9_ahci_init(PCIDevice *dev) /* XXX Software should program this register */ dev->config[0x90] = 1 << 6; /* Address Map Register - AHCI mode */ - msi_init(dev, 0x50, 1, true, false); d->ahci.irq = pci_allocate_irq(dev); pci_register_bar(dev, ICH9_IDP_BAR, PCI_BASE_ADDRESS_SPACE_IO, @@ -135,6 +135,11 @@ static int pci_ich9_ahci_init(PCIDevice *dev) (ICH9_IDP_BAR + 0x4) | (ICH9_IDP_INDEX_LOG2 << 4)); d->ahci.idp_offset = ICH9_IDP_INDEX; + /* Although the AHCI 1.3 specification states that the first capability + * should be PMCAP, the Intel ICH9 data sheet specifies that the ICH9 + * AHCI device puts the MSI capability first, pointing to 0x80. */ + msi_init(dev, ICH9_MSI_CAP_OFFSET, 1, true, false); + return 0; } diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 77bb93e..4e3a061 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -28,7 +28,6 @@ #include <sys/socket.h> #include <linux/kvm.h> #include <fcntl.h> -#include <linux/virtio_ring.h> #include <netpacket/packet.h> #include <net/ethernet.h> #include <net/if.h> @@ -36,6 +35,7 @@ #include <stdio.h> +#include "hw/virtio/virtio_ring.h" #include "hw/virtio/vhost.h" #include "hw/virtio/virtio-bus.h" diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 2ab4460..2becc9f 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -541,7 +541,7 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base, _FDT((fdt_property_cell(fdt, "rtas-error-log-max", RTAS_ERROR_LOG_MAX))); /* - * According to PAPR, rtas ibm,os-term, does not gaurantee a return + * According to PAPR, rtas ibm,os-term does not guarantee a return * back to the guest cpu. * * While an additional ibm,extended-os-term property indicates that diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index ec9e855..d21c397 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -2,7 +2,7 @@ common-obj-y += virtio-rng.o common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o common-obj-y += virtio-bus.o common-obj-y += virtio-mmio.o -common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += dataplane/ +common-obj-$(CONFIG_VIRTIO) += dataplane/ obj-y += virtio.o virtio-balloon.o obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c index 67cb2b8..372706a 100644 --- a/hw/virtio/dataplane/vring.c +++ b/hw/virtio/dataplane/vring.c @@ -181,7 +181,8 @@ static int get_desc(Vring *vring, VirtQueueElement *elem, /* Stop for now if there are not enough iovecs available. */ if (*num >= VIRTQUEUE_MAX_SIZE) { - return -ENOBUFS; + error_report("Invalid SG num: %u", *num); + return -EFAULT; } /* TODO handle non-contiguous memory across region boundaries */ diff --git a/include/block/aio.h b/include/block/aio.h index 4603c0f..1562721 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -26,7 +26,8 @@ typedef struct BlockDriverAIOCB BlockDriverAIOCB; typedef void BlockDriverCompletionFunc(void *opaque, int ret); typedef struct AIOCBInfo { - void (*cancel)(BlockDriverAIOCB *acb); + void (*cancel_async)(BlockDriverAIOCB *acb); + AioContext *(*get_aio_context)(BlockDriverAIOCB *acb); size_t aiocb_size; } AIOCBInfo; @@ -35,11 +36,13 @@ struct BlockDriverAIOCB { BlockDriverState *bs; BlockDriverCompletionFunc *cb; void *opaque; + int refcnt; }; void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque); -void qemu_aio_release(void *p); +void qemu_aio_unref(void *p); +void qemu_aio_ref(void *p); typedef struct AioHandler AioHandler; typedef void QEMUBHFunc(void *opaque); @@ -99,7 +102,7 @@ void aio_set_dispatching(AioContext *ctx, bool dispatching); * They also provide bottom halves, a service to execute a piece of code * as soon as possible. */ -AioContext *aio_context_new(void); +AioContext *aio_context_new(Error **errp); /** * aio_context_ref: diff --git a/include/block/block.h b/include/block/block.h index 07d6d8e..3318f0d 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -337,6 +337,7 @@ BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); void bdrv_aio_cancel(BlockDriverAIOCB *acb); +void bdrv_aio_cancel_async(BlockDriverAIOCB *acb); typedef struct BlockRequest { /* Fields to be filled by multiwrite caller */ diff --git a/include/hw/virtio/dataplane/vring.h b/include/hw/virtio/dataplane/vring.h index af73ee2..d3e086a 100644 --- a/include/hw/virtio/dataplane/vring.h +++ b/include/hw/virtio/dataplane/vring.h @@ -17,8 +17,8 @@ #ifndef VRING_H #define VRING_H -#include <linux/virtio_ring.h> #include "qemu-common.h" +#include "hw/virtio/virtio_ring.h" #include "hw/virtio/virtio.h" typedef struct { diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index cf61154..f3853f2 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -132,10 +132,8 @@ typedef struct VirtIOBlock { VMChangeStateEntry *change; /* Function to push to vq and notify guest */ void (*complete_request)(struct VirtIOBlockReq *req, unsigned char status); -#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE Notifier migration_state_notifier; struct VirtIOBlockDataPlane *dataplane; -#endif } VirtIOBlock; typedef struct MultiReqBuffer { diff --git a/include/hw/virtio/virtio_ring.h b/include/hw/virtio/virtio_ring.h new file mode 100644 index 0000000..8f58bc9 --- /dev/null +++ b/include/hw/virtio/virtio_ring.h @@ -0,0 +1,167 @@ +#ifndef _LINUX_VIRTIO_RING_H +#define _LINUX_VIRTIO_RING_H +/* + * This file is copied from /usr/include/linux while converting __uNN types + * to uXX_t, __inline__ to inline, and tab to spaces. + * */ + +/* An interface for efficient virtio implementation, currently for use by KVM + * and lguest, but hopefully others soon. Do NOT change this since it will + * break existing servers and clients. + * + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright Rusty Russell IBM Corporation 2007. */ + +/* This marks a buffer as continuing via the next field. */ +#define VRING_DESC_F_NEXT 1 +/* This marks a buffer as write-only (otherwise read-only). */ +#define VRING_DESC_F_WRITE 2 +/* This means the buffer contains a list of buffer descriptors. */ +#define VRING_DESC_F_INDIRECT 4 + +/* The Host uses this in used->flags to advise the Guest: don't kick me when + * you add a buffer. It's unreliable, so it's simply an optimization. Guest + * will still kick if it's out of buffers. */ +#define VRING_USED_F_NO_NOTIFY 1 +/* The Guest uses this in avail->flags to advise the Host: don't interrupt me + * when you consume a buffer. It's unreliable, so it's simply an + * optimization. */ +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +/* We support indirect buffer descriptors */ +#define VIRTIO_RING_F_INDIRECT_DESC 28 + +/* The Guest publishes the used index for which it expects an interrupt + * at the end of the avail ring. Host should ignore the avail->flags field. */ +/* The Host publishes the avail index for which it expects a kick + * at the end of the used ring. Guest should ignore the used->flags field. */ +#define VIRTIO_RING_F_EVENT_IDX 29 + +/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ +struct vring_desc { + /* Address (guest-physical). */ + uint64_t addr; + /* Length. */ + uint32_t len; + /* The flags as indicated above. */ + uint16_t flags; + /* We chain unused descriptors via this, too */ + uint16_t next; +}; + +struct vring_avail { + uint16_t flags; + uint16_t idx; + uint16_t ring[]; +}; + +/* u32 is used here for ids for padding reasons. */ +struct vring_used_elem { + /* Index of start of used descriptor chain. */ + uint32_t id; + /* Total length of the descriptor chain which was used (written to) */ + uint32_t len; +}; + +struct vring_used { + uint16_t flags; + uint16_t idx; + struct vring_used_elem ring[]; +}; + +struct vring { + unsigned int num; + + struct vring_desc *desc; + + struct vring_avail *avail; + + struct vring_used *used; +}; + +/* The standard layout for the ring is a continuous chunk of memory which looks + * like this. We assume num is a power of 2. + * + * struct vring + * { + * // The actual descriptors (16 bytes each) + * struct vring_desc desc[num]; + * + * // A ring of available descriptor heads with free-running index. + * uint16_t avail_flags; + * uint16_t avail_idx; + * uint16_t available[num]; + * uint16_t used_event_idx; + * + * // Padding to the next align boundary. + * char pad[]; + * + * // A ring of used descriptor heads with free-running index. + * uint16_t used_flags; + * uint16_t used_idx; + * struct vring_used_elem used[num]; + * uint16_t avail_event_idx; + * }; + */ +/* We publish the used event index at the end of the available ring, and vice + * versa. They are at the end for backwards compatibility. */ +#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num]) +#define vring_avail_event(vr) (*(uint16_t *)&(vr)->used->ring[(vr)->num]) + +static inline void vring_init(struct vring *vr, unsigned int num, void *p, + unsigned long align) +{ + vr->num = num; + vr->desc = p; + vr->avail = p + num*sizeof(struct vring_desc); + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(uint16_t) + + align-1) & ~(align - 1)); +} + +static inline unsigned vring_size(unsigned int num, unsigned long align) +{ + return ((sizeof(struct vring_desc) * num + sizeof(uint16_t) * (3 + num) + + align - 1) & ~(align - 1)) + + sizeof(uint16_t) * 3 + sizeof(struct vring_used_elem) * num; +} + +/* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */ +/* Assuming a given event_idx value from the other size, if + * we have just incremented index from old to new_idx, + * should we trigger an event? */ +static inline int vring_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) +{ + /* Note: Xen has similar logic for notification hold-off + * in include/xen/interface/io/ring.h with req_event and req_prod + * corresponding to event_idx + 1 and new_idx respectively. + * Note also that req_event and req_prod in Xen start at 1, + * event indexes in virtio start at 0. */ + return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old); +} + +#endif /* _LINUX_VIRTIO_RING_H */ diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 6f0200a..62c68c0 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -42,7 +42,7 @@ * * In the case of QEMU tools, this will also start/initialize timers. */ -int qemu_init_main_loop(void); +int qemu_init_main_loop(Error **errp); /** * main_loop_wait: Run one iteration of the main loop. diff --git a/include/sysemu/arch_init.h b/include/sysemu/arch_init.h index 8939233..769ec06 100644 --- a/include/sysemu/arch_init.h +++ b/include/sysemu/arch_init.h @@ -6,23 +6,23 @@ enum { QEMU_ARCH_ALL = -1, - QEMU_ARCH_ALPHA = 1, - QEMU_ARCH_ARM = 2, - QEMU_ARCH_CRIS = 4, - QEMU_ARCH_I386 = 8, - QEMU_ARCH_M68K = 16, - QEMU_ARCH_LM32 = 32, - QEMU_ARCH_MICROBLAZE = 64, - QEMU_ARCH_MIPS = 128, - QEMU_ARCH_PPC = 256, - QEMU_ARCH_S390X = 512, - QEMU_ARCH_SH4 = 1024, - QEMU_ARCH_SPARC = 2048, - QEMU_ARCH_XTENSA = 4096, - QEMU_ARCH_OPENRISC = 8192, - QEMU_ARCH_UNICORE32 = 0x4000, - QEMU_ARCH_MOXIE = 0x8000, - QEMU_ARCH_TRICORE = 0x10000, + QEMU_ARCH_ALPHA = (1 << 0), + QEMU_ARCH_ARM = (1 << 1), + QEMU_ARCH_CRIS = (1 << 2), + QEMU_ARCH_I386 = (1 << 3), + QEMU_ARCH_M68K = (1 << 4), + QEMU_ARCH_LM32 = (1 << 5), + QEMU_ARCH_MICROBLAZE = (1 << 6), + QEMU_ARCH_MIPS = (1 << 7), + QEMU_ARCH_PPC = (1 << 8), + QEMU_ARCH_S390X = (1 << 9), + QEMU_ARCH_SH4 = (1 << 10), + QEMU_ARCH_SPARC = (1 << 11), + QEMU_ARCH_XTENSA = (1 << 12), + QEMU_ARCH_OPENRISC = (1 << 13), + QEMU_ARCH_UNICORE32 = (1 << 14), + QEMU_ARCH_MOXIE = (1 << 15), + QEMU_ARCH_TRICORE = (1 << 16), }; extern const uint32_t arch_type; @@ -17,6 +17,7 @@ #include "block/aio.h" #include "sysemu/iothread.h" #include "qmp-commands.h" +#include "qemu/error-report.h" #define IOTHREADS_PATH "/objects" @@ -53,6 +54,9 @@ static void iothread_instance_finalize(Object *obj) { IOThread *iothread = IOTHREAD(obj); + if (!iothread->ctx) { + return; + } iothread->stopping = true; aio_notify(iothread->ctx); qemu_thread_join(&iothread->thread); @@ -63,11 +67,16 @@ static void iothread_instance_finalize(Object *obj) static void iothread_complete(UserCreatable *obj, Error **errp) { + Error *local_error = NULL; IOThread *iothread = IOTHREAD(obj); iothread->stopping = false; - iothread->ctx = aio_context_new(); iothread->thread_id = -1; + iothread->ctx = aio_context_new(&local_error); + if (!iothread->ctx) { + error_propagate(errp, local_error); + return; + } qemu_mutex_init(&iothread->init_done_lock); qemu_cond_init(&iothread->init_done_cond); diff --git a/libcacard/vcard_emul_nss.c b/libcacard/vcard_emul_nss.c index f1bba57..07b4464 100644 --- a/libcacard/vcard_emul_nss.c +++ b/libcacard/vcard_emul_nss.c @@ -286,10 +286,10 @@ vcard_emul_rsa_op(VCard *card, VCardKey *key, } } if ((i < buffer_size) && (buffer[i] == 0)) { - /* yes, we have a properly formated PKCS #1 signature */ + /* yes, we have a properly formatted PKCS #1 signature */ /* * NOTE: even if we accidentally got an encrypt buffer, which - * through shear luck started with 00, 01, ff, 00, it won't matter + * through sheer luck started with 00, 01, ff, 00, it won't matter * because the resulting Sign operation will effectively decrypt * the real buffer. */ diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index c656f61..bb5df43 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -14,7 +14,7 @@ #include <linux/ioctl.h> #include <linux/virtio_config.h> -#include <linux/virtio_ring.h> +#include "hw/virtio/virtio_ring.h" struct vhost_vring_state { unsigned int index; diff --git a/main-loop.c b/main-loop.c index 3cc79f8..53393a4 100644 --- a/main-loop.c +++ b/main-loop.c @@ -126,10 +126,11 @@ void qemu_notify_event(void) static GArray *gpollfds; -int qemu_init_main_loop(void) +int qemu_init_main_loop(Error **errp) { int ret; GSource *src; + Error *local_error = NULL; init_clocks(); @@ -138,8 +139,12 @@ int qemu_init_main_loop(void) return ret; } + qemu_aio_context = aio_context_new(&local_error); + if (!qemu_aio_context) { + error_propagate(errp, local_error); + return -EMFILE; + } gpollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); - qemu_aio_context = aio_context_new(); src = aio_get_g_source(qemu_aio_context); g_source_attach(src, NULL); g_source_unref(src); diff --git a/migration-rdma.c b/migration-rdma.c index d99812c..b32dbdf 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -2523,7 +2523,7 @@ static void *qemu_rdma_data_init(const char *host_port, Error **errp) /* * QEMUFile interface to the control channel. * SEND messages for control only. - * pc.ram is handled with regular RDMA messages. + * VM's ram is handled with regular RDMA messages. */ static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size) @@ -2539,7 +2539,7 @@ static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, /* * Push out any writes that - * we're queued up for pc.ram. + * we're queued up for VM's ram. */ ret = qemu_rdma_write_flush(f, rdma); if (ret < 0) { diff --git a/qapi/block-core.json b/qapi/block-core.json index 95dcd81..fa2d1b7 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -197,7 +197,7 @@ # 'file', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device', # 'host_floppy', 'http', 'https', 'nbd', 'parallels', 'qcow', # 'qcow2', 'raw', 'tftp', 'vdi', 'vmdk', 'vpc', 'vvfat' -# 2.2: 'archipelago' +# 2.2: 'archipelago' added, 'cow' dropped # # @backing_file: #optional the name of the backing file (for copy-on-write) # @@ -1148,10 +1148,11 @@ # Since: 2.0 ## { 'enum': 'BlockdevDriver', - 'data': [ 'archipelago', 'file', 'host_device', 'host_cdrom', 'host_floppy', - 'http', 'https', 'ftp', 'ftps', 'tftp', 'vvfat', 'blkdebug', - 'blkverify', 'bochs', 'cloop', 'cow', 'dmg', 'parallels', 'qcow', - 'qcow2', 'qed', 'raw', 'vdi', 'vhdx', 'vmdk', 'vpc', 'quorum' ] } + 'data': [ 'archipelago', 'blkdebug', 'blkverify', 'bochs', 'cloop', + 'dmg', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device', + 'host_floppy', 'http', 'https', 'null-aio', 'null-co', 'parallels', + 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'tftp', 'vdi', 'vhdx', + 'vmdk', 'vpc', 'vvfat' ] } ## # @BlockdevOptionsBase @@ -1204,6 +1205,18 @@ 'data': { 'filename': 'str' } } ## +# @BlockdevOptionsNull +# +# Driver specific block device options for the null backend. +# +# @size: #optional size of the device in bytes. +# +# Since: 2.2 +## +{ 'type': 'BlockdevOptionsNull', + 'data': { '*size': 'int' } } + +## # @BlockdevOptionsVVFAT # # Driver specific block device options for the vvfat protocol. @@ -1251,6 +1264,67 @@ 'data': { '*backing': 'BlockdevRef' } } ## +# @Qcow2OverlapCheckMode +# +# General overlap check modes. +# +# @none: Do not perform any checks +# +# @constant: Perform only checks which can be done in constant time and +# without reading anything from disk +# +# @cached: Perform only checks which can be done without reading anything +# from disk +# +# @all: Perform all available overlap checks +# +# Since: 2.2 +## +{ 'enum': 'Qcow2OverlapCheckMode', + 'data': [ 'none', 'constant', 'cached', 'all' ] } + +## +# @Qcow2OverlapCheckFlags +# +# Structure of flags for each metadata structure. Setting a field to 'true' +# makes qemu guard that structure against unintended overwriting. The default +# value is chosen according to the template given. +# +# @template: Specifies a template mode which can be adjusted using the other +# flags, defaults to 'cached' +# +# Since: 2.2 +## +{ 'type': 'Qcow2OverlapCheckFlags', + 'data': { '*template': 'Qcow2OverlapCheckMode', + '*main-header': 'bool', + '*active-l1': 'bool', + '*active-l2': 'bool', + '*refcount-table': 'bool', + '*refcount-block': 'bool', + '*snapshot-table': 'bool', + '*inactive-l1': 'bool', + '*inactive-l2': 'bool' } } + +## +# @Qcow2OverlapChecks +# +# Specifies which metadata structures should be guarded against unintended +# overwriting. +# +# @flags: set of flags for separate specification of each metadata structure +# type +# +# @mode: named mode which chooses a specific set of flags +# +# Since: 2.2 +## +{ 'union': 'Qcow2OverlapChecks', + 'discriminator': {}, + 'data': { 'flags': 'Qcow2OverlapCheckFlags', + 'mode': 'Qcow2OverlapCheckMode' } } + +## # @BlockdevOptionsQcow2 # # Driver specific block device options for qcow2. @@ -1269,6 +1343,18 @@ # should be issued on other occasions where a cluster # gets freed # +# @overlap-check: #optional which overlap checks to perform for writes +# to the image, defaults to 'cached' (since 2.2) +# +# @cache-size: #optional the maximum total size of the L2 table and +# refcount block caches in bytes (since 2.2) +# +# @l2-cache-size: #optional the maximum size of the L2 table cache in +# bytes (since 2.2) +# +# @refcount-cache-size: #optional the maximum size of the refcount block cache +# in bytes (since 2.2) +# # Since: 1.7 ## { 'type': 'BlockdevOptionsQcow2', @@ -1276,7 +1362,11 @@ 'data': { '*lazy-refcounts': 'bool', '*pass-discard-request': 'bool', '*pass-discard-snapshot': 'bool', - '*pass-discard-other': 'bool' } } + '*pass-discard-other': 'bool', + '*overlap-check': 'Qcow2OverlapChecks', + '*cache-size': 'int', + '*l2-cache-size': 'int', + '*refcount-cache-size': 'int' } } ## @@ -1471,39 +1561,40 @@ 'discriminator': 'driver', 'data': { 'archipelago':'BlockdevOptionsArchipelago', + 'blkdebug': 'BlockdevOptionsBlkdebug', + 'blkverify': 'BlockdevOptionsBlkverify', + 'bochs': 'BlockdevOptionsGenericFormat', + 'cloop': 'BlockdevOptionsGenericFormat', + 'dmg': 'BlockdevOptionsGenericFormat', 'file': 'BlockdevOptionsFile', - 'host_device':'BlockdevOptionsFile', + 'ftp': 'BlockdevOptionsFile', + 'ftps': 'BlockdevOptionsFile', +# TODO gluster: Wait for structured options 'host_cdrom': 'BlockdevOptionsFile', + 'host_device':'BlockdevOptionsFile', 'host_floppy':'BlockdevOptionsFile', 'http': 'BlockdevOptionsFile', 'https': 'BlockdevOptionsFile', - 'ftp': 'BlockdevOptionsFile', - 'ftps': 'BlockdevOptionsFile', - 'tftp': 'BlockdevOptionsFile', -# TODO gluster: Wait for structured options # TODO iscsi: Wait for structured options # TODO nbd: Should take InetSocketAddress for 'host'? # TODO nfs: Wait for structured options -# TODO rbd: Wait for structured options -# TODO sheepdog: Wait for structured options -# TODO ssh: Should take InetSocketAddress for 'host'? - 'vvfat': 'BlockdevOptionsVVFAT', - 'blkdebug': 'BlockdevOptionsBlkdebug', - 'blkverify': 'BlockdevOptionsBlkverify', - 'bochs': 'BlockdevOptionsGenericFormat', - 'cloop': 'BlockdevOptionsGenericFormat', - 'cow': 'BlockdevOptionsGenericCOWFormat', - 'dmg': 'BlockdevOptionsGenericFormat', + 'null-aio': 'BlockdevOptionsNull', + 'null-co': 'BlockdevOptionsNull', 'parallels': 'BlockdevOptionsGenericFormat', - 'qcow': 'BlockdevOptionsGenericCOWFormat', 'qcow2': 'BlockdevOptionsQcow2', + 'qcow': 'BlockdevOptionsGenericCOWFormat', 'qed': 'BlockdevOptionsGenericCOWFormat', + 'quorum': 'BlockdevOptionsQuorum', 'raw': 'BlockdevOptionsGenericFormat', +# TODO rbd: Wait for structured options +# TODO sheepdog: Wait for structured options +# TODO ssh: Should take InetSocketAddress for 'host'? + 'tftp': 'BlockdevOptionsFile', 'vdi': 'BlockdevOptionsGenericFormat', 'vhdx': 'BlockdevOptionsGenericFormat', 'vmdk': 'BlockdevOptionsGenericCOWFormat', 'vpc': 'BlockdevOptionsGenericFormat', - 'quorum': 'BlockdevOptionsQuorum' + 'vvfat': 'BlockdevOptionsVVFAT' } } ## @@ -1555,7 +1646,7 @@ ## # @BLOCK_IMAGE_CORRUPTED # -# Emitted when a disk image is being marked corrupt +# Emitted when a corruption has been detected in a disk image # # @device: device name # @@ -1569,13 +1660,18 @@ # @size: #optional, if the corruption resulted from an image access, this is # the access size # +# fatal: if set, the image is marked corrupt and therefore unusable after this +# event and must be repaired (Since 2.2; before, every +# BLOCK_IMAGE_CORRUPTED event was fatal) +# # Since: 1.7 ## { 'event': 'BLOCK_IMAGE_CORRUPTED', 'data': { 'device' : 'str', 'msg' : 'str', '*offset': 'int', - '*size' : 'int' } } + '*size' : 'int', + 'fatal' : 'bool' } } ## # @BLOCK_IO_ERROR diff --git a/qdev-monitor.c b/qdev-monitor.c index fb9ee24..5ec6606 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -206,7 +206,7 @@ int qdev_device_help(QemuOpts *opts) } prop_list = qmp_device_list_properties(driver, &local_err); - if (!prop_list) { + if (local_err) { error_printf("%s\n", error_get_pretty(local_err)); error_free(local_err); return 1; diff --git a/qemu-char.c b/qemu-char.c index 2a3cb9f..8623c70 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -1017,6 +1017,7 @@ static CharDriverState *qemu_chr_open_pipe(ChardevHostdev *opts) /* init terminal so that we can grab keys */ static struct termios oldtty; static int old_fd0_flags; +static bool stdio_in_use; static bool stdio_allow_signal; static void term_exit(void) @@ -1060,8 +1061,15 @@ static CharDriverState *qemu_chr_open_stdio(ChardevStdio *opts) error_report("cannot use stdio with -daemonize"); return NULL; } + + if (stdio_in_use) { + error_report("cannot use stdio by multiple character devices"); + exit(1); + } + + stdio_in_use = true; old_fd0_flags = fcntl(0, F_GETFL); - tcgetattr (0, &oldtty); + tcgetattr(0, &oldtty); qemu_set_nonblock(0); atexit(term_exit); diff --git a/qemu-doc.texi b/qemu-doc.texi index ef3be72..9973090 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -654,15 +654,6 @@ File name of a base image (see @option{create} subcommand) If this option is set to @code{on}, the image is encrypted. @end table -@item cow -User Mode Linux Copy On Write image format. It is supported only for -compatibility with previous versions. -Supported options: -@table @code -@item backing_file -File name of a base image (see @option{create} subcommand) -@end table - @item vdi VirtualBox 1.1 compatible image format. Supported options: @@ -2879,6 +2879,7 @@ int main(int argc, char **argv) { const img_cmd_t *cmd; const char *cmdname; + Error *local_error = NULL; int c; static const struct option long_options[] = { {"help", no_argument, 0, 'h'}, @@ -2893,7 +2894,12 @@ int main(int argc, char **argv) error_set_progname(argv[0]); qemu_init_exec_dir(argv[0]); - qemu_init_main_loop(); + if (qemu_init_main_loop(&local_error)) { + error_report("%s", error_get_pretty(local_error)); + error_free(local_error); + exit(EXIT_FAILURE); + } + bdrv_init(); if (argc < 2) { error_exit("Not enough arguments"); diff --git a/qemu-img.texi b/qemu-img.texi index 50d2cca..e3ef4a0 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -228,8 +228,8 @@ compression is read-only. It means that if a compressed sector is rewritten, then it is rewritten as uncompressed data. Image conversion is also useful to get smaller image when using a -growable format such as @code{qcow} or @code{cow}: the empty sectors -are detected and suppressed from the destination image. +growable format such as @code{qcow}: the empty sectors are detected and +suppressed from the destination image. @var{sparse_size} indicates the consecutive number of bytes (defaults to 4k) that must contain only zeros for qemu-img to create a sparse image during @@ -379,6 +379,7 @@ int main(int argc, char **argv) int c; int opt_index = 0; int flags = BDRV_O_UNMAP; + Error *local_error = NULL; #ifdef CONFIG_POSIX signal(SIGPIPE, SIG_IGN); @@ -444,7 +445,11 @@ int main(int argc, char **argv) exit(1); } - qemu_init_main_loop(); + if (qemu_init_main_loop(&local_error)) { + error_report("%s", error_get_pretty(local_error)); + error_free(local_error); + exit(1); + } bdrv_init(); /* initialize commands */ @@ -674,7 +674,11 @@ int main(int argc, char **argv) snprintf(sockpath, 128, SOCKET_PATH, basename(device)); } - qemu_init_main_loop(); + if (qemu_init_main_loop(&local_err)) { + error_report("%s", error_get_pretty(local_err)); + error_free(local_err); + exit(EXIT_FAILURE); + } bdrv_init(); atexit(bdrv_close_all); diff --git a/qmp-commands.hx b/qmp-commands.hx index 7658d4b..76656cc 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -2081,7 +2081,7 @@ Each json-object contain the following: - "file": device file name (json-string) - "ro": true if read-only, false otherwise (json-bool) - "drv": driver format name (json-string) - - Possible values: "blkdebug", "bochs", "cloop", "cow", "dmg", + - Possible values: "blkdebug", "bochs", "cloop", "dmg", "file", "file", "ftp", "ftps", "host_cdrom", "host_device", "host_floppy", "http", "https", "nbd", "parallels", "qcow", "qcow2", "raw", diff --git a/scripts/qapi-types.py b/scripts/qapi-types.py index b463232..d2f815b 100644 --- a/scripts/qapi-types.py +++ b/scripts/qapi-types.py @@ -177,6 +177,8 @@ const int %(name)s_qtypes[QTYPE_MAX] = { qtype = "QTYPE_QDICT" elif find_union(qapi_type): qtype = "QTYPE_QDICT" + elif find_enum(qapi_type): + qtype = "QTYPE_QSTRING" else: assert False, "Invalid anonymous union member" diff --git a/scripts/qapi-visit.py b/scripts/qapi-visit.py index c129697..df9f7fb 100644 --- a/scripts/qapi-visit.py +++ b/scripts/qapi-visit.py @@ -263,7 +263,8 @@ void visit_type_%(name)s(Visitor *m, %(name)s **obj, const char *name, Error **e for key in members: assert (members[key] in builtin_types or find_struct(members[key]) - or find_union(members[key])), "Invalid anonymous union member" + or find_union(members[key]) + or find_enum(members[key])), "Invalid anonymous union member" enum_full_value = generate_enum_full_value(disc_type, key) ret += mcgen(''' diff --git a/tests/Makefile b/tests/Makefile index a5e3d0c..f5de29c 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -132,6 +132,7 @@ check-qtest-i386-y = tests/endianness-test$(EXESUF) check-qtest-i386-y += tests/fdc-test$(EXESUF) gcov-files-i386-y = hw/block/fdc.c check-qtest-i386-y += tests/ide-test$(EXESUF) +check-qtest-i386-y += tests/ahci-test$(EXESUF) check-qtest-i386-y += tests/hd-geo-test$(EXESUF) gcov-files-i386-y += hw/block/hd-geometry.c check-qtest-i386-y += tests/boot-order-test$(EXESUF) @@ -307,6 +308,7 @@ tests/endianness-test$(EXESUF): tests/endianness-test.o tests/spapr-phb-test$(EXESUF): tests/spapr-phb-test.o $(libqos-obj-y) tests/fdc-test$(EXESUF): tests/fdc-test.o tests/ide-test$(EXESUF): tests/ide-test.o $(libqos-pc-obj-y) +tests/ahci-test$(EXESUF): tests/ahci-test.o $(libqos-pc-obj-y) tests/hd-geo-test$(EXESUF): tests/hd-geo-test.o tests/boot-order-test$(EXESUF): tests/boot-order-test.o $(libqos-obj-y) tests/bios-tables-test$(EXESUF): tests/bios-tables-test.o $(libqos-obj-y) diff --git a/tests/ahci-test.c b/tests/ahci-test.c new file mode 100644 index 0000000..4c77ebe --- /dev/null +++ b/tests/ahci-test.c @@ -0,0 +1,1561 @@ +/* + * AHCI test cases + * + * Copyright (c) 2014 John Snow <jsnow@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <getopt.h> +#include <glib.h> + +#include "libqtest.h" +#include "libqos/pci-pc.h" +#include "libqos/malloc-pc.h" + +#include "qemu-common.h" +#include "qemu/host-utils.h" + +#include "hw/pci/pci_ids.h" +#include "hw/pci/pci_regs.h" + +/* Test-specific defines. */ +#define TEST_IMAGE_SIZE (64 * 1024 * 1024) + +/*** Supplementary PCI Config Space IDs & Masks ***/ +#define PCI_DEVICE_ID_INTEL_Q35_AHCI (0x2922) +#define PCI_MSI_FLAGS_RESERVED (0xFF00) +#define PCI_PM_CTRL_RESERVED (0xFC) +#define PCI_BCC(REG32) ((REG32) >> 24) +#define PCI_PI(REG32) (((REG32) >> 8) & 0xFF) +#define PCI_SCC(REG32) (((REG32) >> 16) & 0xFF) + +/*** Recognized AHCI Device Types ***/ +#define AHCI_INTEL_ICH9 (PCI_DEVICE_ID_INTEL_Q35_AHCI << 16 | \ + PCI_VENDOR_ID_INTEL) + +/*** AHCI/HBA Register Offsets and Bitmasks ***/ +#define AHCI_CAP (0) +#define AHCI_CAP_NP (0x1F) +#define AHCI_CAP_SXS (0x20) +#define AHCI_CAP_EMS (0x40) +#define AHCI_CAP_CCCS (0x80) +#define AHCI_CAP_NCS (0x1F00) +#define AHCI_CAP_PSC (0x2000) +#define AHCI_CAP_SSC (0x4000) +#define AHCI_CAP_PMD (0x8000) +#define AHCI_CAP_FBSS (0x10000) +#define AHCI_CAP_SPM (0x20000) +#define AHCI_CAP_SAM (0x40000) +#define AHCI_CAP_RESERVED (0x80000) +#define AHCI_CAP_ISS (0xF00000) +#define AHCI_CAP_SCLO (0x1000000) +#define AHCI_CAP_SAL (0x2000000) +#define AHCI_CAP_SALP (0x4000000) +#define AHCI_CAP_SSS (0x8000000) +#define AHCI_CAP_SMPS (0x10000000) +#define AHCI_CAP_SSNTF (0x20000000) +#define AHCI_CAP_SNCQ (0x40000000) +#define AHCI_CAP_S64A (0x80000000) + +#define AHCI_GHC (1) +#define AHCI_GHC_HR (0x01) +#define AHCI_GHC_IE (0x02) +#define AHCI_GHC_MRSM (0x04) +#define AHCI_GHC_RESERVED (0x7FFFFFF8) +#define AHCI_GHC_AE (0x80000000) + +#define AHCI_IS (2) +#define AHCI_PI (3) +#define AHCI_VS (4) + +#define AHCI_CCCCTL (5) +#define AHCI_CCCCTL_EN (0x01) +#define AHCI_CCCCTL_RESERVED (0x06) +#define AHCI_CCCCTL_CC (0xFF00) +#define AHCI_CCCCTL_TV (0xFFFF0000) + +#define AHCI_CCCPORTS (6) +#define AHCI_EMLOC (7) + +#define AHCI_EMCTL (8) +#define AHCI_EMCTL_STSMR (0x01) +#define AHCI_EMCTL_CTLTM (0x100) +#define AHCI_EMCTL_CTLRST (0x200) +#define AHCI_EMCTL_RESERVED (0xF0F0FCFE) + +#define AHCI_CAP2 (9) +#define AHCI_CAP2_BOH (0x01) +#define AHCI_CAP2_NVMP (0x02) +#define AHCI_CAP2_APST (0x04) +#define AHCI_CAP2_RESERVED (0xFFFFFFF8) + +#define AHCI_BOHC (10) +#define AHCI_RESERVED (11) +#define AHCI_NVMHCI (24) +#define AHCI_VENDOR (40) +#define AHCI_PORTS (64) + +/*** Port Memory Offsets & Bitmasks ***/ +#define AHCI_PX_CLB (0) +#define AHCI_PX_CLB_RESERVED (0x1FF) + +#define AHCI_PX_CLBU (1) + +#define AHCI_PX_FB (2) +#define AHCI_PX_FB_RESERVED (0xFF) + +#define AHCI_PX_FBU (3) + +#define AHCI_PX_IS (4) +#define AHCI_PX_IS_DHRS (0x1) +#define AHCI_PX_IS_PSS (0x2) +#define AHCI_PX_IS_DSS (0x4) +#define AHCI_PX_IS_SDBS (0x8) +#define AHCI_PX_IS_UFS (0x10) +#define AHCI_PX_IS_DPS (0x20) +#define AHCI_PX_IS_PCS (0x40) +#define AHCI_PX_IS_DMPS (0x80) +#define AHCI_PX_IS_RESERVED (0x23FFF00) +#define AHCI_PX_IS_PRCS (0x400000) +#define AHCI_PX_IS_IPMS (0x800000) +#define AHCI_PX_IS_OFS (0x1000000) +#define AHCI_PX_IS_INFS (0x4000000) +#define AHCI_PX_IS_IFS (0x8000000) +#define AHCI_PX_IS_HBDS (0x10000000) +#define AHCI_PX_IS_HBFS (0x20000000) +#define AHCI_PX_IS_TFES (0x40000000) +#define AHCI_PX_IS_CPDS (0x80000000) + +#define AHCI_PX_IE (5) +#define AHCI_PX_IE_DHRE (0x1) +#define AHCI_PX_IE_PSE (0x2) +#define AHCI_PX_IE_DSE (0x4) +#define AHCI_PX_IE_SDBE (0x8) +#define AHCI_PX_IE_UFE (0x10) +#define AHCI_PX_IE_DPE (0x20) +#define AHCI_PX_IE_PCE (0x40) +#define AHCI_PX_IE_DMPE (0x80) +#define AHCI_PX_IE_RESERVED (0x23FFF00) +#define AHCI_PX_IE_PRCE (0x400000) +#define AHCI_PX_IE_IPME (0x800000) +#define AHCI_PX_IE_OFE (0x1000000) +#define AHCI_PX_IE_INFE (0x4000000) +#define AHCI_PX_IE_IFE (0x8000000) +#define AHCI_PX_IE_HBDE (0x10000000) +#define AHCI_PX_IE_HBFE (0x20000000) +#define AHCI_PX_IE_TFEE (0x40000000) +#define AHCI_PX_IE_CPDE (0x80000000) + +#define AHCI_PX_CMD (6) +#define AHCI_PX_CMD_ST (0x1) +#define AHCI_PX_CMD_SUD (0x2) +#define AHCI_PX_CMD_POD (0x4) +#define AHCI_PX_CMD_CLO (0x8) +#define AHCI_PX_CMD_FRE (0x10) +#define AHCI_PX_CMD_RESERVED (0xE0) +#define AHCI_PX_CMD_CCS (0x1F00) +#define AHCI_PX_CMD_MPSS (0x2000) +#define AHCI_PX_CMD_FR (0x4000) +#define AHCI_PX_CMD_CR (0x8000) +#define AHCI_PX_CMD_CPS (0x10000) +#define AHCI_PX_CMD_PMA (0x20000) +#define AHCI_PX_CMD_HPCP (0x40000) +#define AHCI_PX_CMD_MPSP (0x80000) +#define AHCI_PX_CMD_CPD (0x100000) +#define AHCI_PX_CMD_ESP (0x200000) +#define AHCI_PX_CMD_FBSCP (0x400000) +#define AHCI_PX_CMD_APSTE (0x800000) +#define AHCI_PX_CMD_ATAPI (0x1000000) +#define AHCI_PX_CMD_DLAE (0x2000000) +#define AHCI_PX_CMD_ALPE (0x4000000) +#define AHCI_PX_CMD_ASP (0x8000000) +#define AHCI_PX_CMD_ICC (0xF0000000) + +#define AHCI_PX_RES1 (7) + +#define AHCI_PX_TFD (8) +#define AHCI_PX_TFD_STS (0xFF) +#define AHCI_PX_TFD_STS_ERR (0x01) +#define AHCI_PX_TFD_STS_CS1 (0x06) +#define AHCI_PX_TFD_STS_DRQ (0x08) +#define AHCI_PX_TFD_STS_CS2 (0x70) +#define AHCI_PX_TFD_STS_BSY (0x80) +#define AHCI_PX_TFD_ERR (0xFF00) +#define AHCI_PX_TFD_RESERVED (0xFFFF0000) + +#define AHCI_PX_SIG (9) +#define AHCI_PX_SIG_SECTOR_COUNT (0xFF) +#define AHCI_PX_SIG_LBA_LOW (0xFF00) +#define AHCI_PX_SIG_LBA_MID (0xFF0000) +#define AHCI_PX_SIG_LBA_HIGH (0xFF000000) + +#define AHCI_PX_SSTS (10) +#define AHCI_PX_SSTS_DET (0x0F) +#define AHCI_PX_SSTS_SPD (0xF0) +#define AHCI_PX_SSTS_IPM (0xF00) +#define AHCI_PX_SSTS_RESERVED (0xFFFFF000) +#define SSTS_DET_NO_DEVICE (0x00) +#define SSTS_DET_PRESENT (0x01) +#define SSTS_DET_ESTABLISHED (0x03) +#define SSTS_DET_OFFLINE (0x04) + +#define AHCI_PX_SCTL (11) + +#define AHCI_PX_SERR (12) +#define AHCI_PX_SERR_ERR (0xFFFF) +#define AHCI_PX_SERR_DIAG (0xFFFF0000) +#define AHCI_PX_SERR_DIAG_X (0x04000000) + +#define AHCI_PX_SACT (13) +#define AHCI_PX_CI (14) +#define AHCI_PX_SNTF (15) + +#define AHCI_PX_FBS (16) +#define AHCI_PX_FBS_EN (0x1) +#define AHCI_PX_FBS_DEC (0x2) +#define AHCI_PX_FBS_SDE (0x4) +#define AHCI_PX_FBS_DEV (0xF00) +#define AHCI_PX_FBS_ADO (0xF000) +#define AHCI_PX_FBS_DWE (0xF0000) +#define AHCI_PX_FBS_RESERVED (0xFFF000F8) + +#define AHCI_PX_RES2 (17) +#define AHCI_PX_VS (28) + +#define HBA_DATA_REGION_SIZE (256) +#define HBA_PORT_DATA_SIZE (128) +#define HBA_PORT_NUM_REG (HBA_PORT_DATA_SIZE/4) + +#define AHCI_VERSION_0_95 (0x00000905) +#define AHCI_VERSION_1_0 (0x00010000) +#define AHCI_VERSION_1_1 (0x00010100) +#define AHCI_VERSION_1_2 (0x00010200) +#define AHCI_VERSION_1_3 (0x00010300) + +/*** Structures ***/ + +/** + * Generic FIS structure. + */ +typedef struct FIS { + uint8_t fis_type; + uint8_t flags; + char data[0]; +} __attribute__((__packed__)) FIS; + +/** + * Register device-to-host FIS structure. + */ +typedef struct RegD2HFIS { + /* DW0 */ + uint8_t fis_type; + uint8_t flags; + uint8_t status; + uint8_t error; + /* DW1 */ + uint8_t lba_low; + uint8_t lba_mid; + uint8_t lba_high; + uint8_t device; + /* DW2 */ + uint8_t lba3; + uint8_t lba4; + uint8_t lba5; + uint8_t res1; + /* DW3 */ + uint16_t count; + uint8_t res2; + uint8_t res3; + /* DW4 */ + uint16_t res4; + uint16_t res5; +} __attribute__((__packed__)) RegD2HFIS; + +/** + * Register host-to-device FIS structure. + */ +typedef struct RegH2DFIS { + /* DW0 */ + uint8_t fis_type; + uint8_t flags; + uint8_t command; + uint8_t feature_low; + /* DW1 */ + uint8_t lba_low; + uint8_t lba_mid; + uint8_t lba_high; + uint8_t device; + /* DW2 */ + uint8_t lba3; + uint8_t lba4; + uint8_t lba5; + uint8_t feature_high; + /* DW3 */ + uint16_t count; + uint8_t icc; + uint8_t control; + /* DW4 */ + uint32_t aux; +} __attribute__((__packed__)) RegH2DFIS; + +/** + * Command List entry structure. + * The command list contains between 1-32 of these structures. + */ +typedef struct AHCICommand { + uint8_t b1; + uint8_t b2; + uint16_t prdtl; /* Phys Region Desc. Table Length */ + uint32_t prdbc; /* Phys Region Desc. Byte Count */ + uint32_t ctba; /* Command Table Descriptor Base Address */ + uint32_t ctbau; /* '' Upper */ + uint32_t res[4]; +} __attribute__((__packed__)) AHCICommand; + +/** + * Physical Region Descriptor; pointed to by the Command List Header, + * struct ahci_command. + */ +typedef struct PRD { + uint32_t dba; /* Data Base Address */ + uint32_t dbau; /* Data Base Address Upper */ + uint32_t res; /* Reserved */ + uint32_t dbc; /* Data Byte Count (0-indexed) & Interrupt Flag (bit 2^31) */ +} PRD; + +typedef struct HBACap { + uint32_t cap; + uint32_t cap2; +} HBACap; + +/*** Globals ***/ +static QGuestAllocator *guest_malloc; +static QPCIBus *pcibus; +static uint64_t barsize; +static char tmp_path[] = "/tmp/qtest.XXXXXX"; +static bool ahci_pedantic; +static uint32_t ahci_fingerprint; + +/*** Macro Utilities ***/ +#define BITANY(data, mask) (((data) & (mask)) != 0) +#define BITSET(data, mask) (((data) & (mask)) == (mask)) +#define BITCLR(data, mask) (((data) & (mask)) == 0) +#define ASSERT_BIT_SET(data, mask) g_assert_cmphex((data) & (mask), ==, (mask)) +#define ASSERT_BIT_CLEAR(data, mask) g_assert_cmphex((data) & (mask), ==, 0) + +/*** IO macros for the AHCI memory registers. ***/ +#define AHCI_READ(OFST) qpci_io_readl(ahci, hba_base + (OFST)) +#define AHCI_WRITE(OFST, VAL) qpci_io_writel(ahci, hba_base + (OFST), (VAL)) +#define AHCI_RREG(regno) AHCI_READ(4 * (regno)) +#define AHCI_WREG(regno, val) AHCI_WRITE(4 * (regno), (val)) +#define AHCI_SET(regno, mask) AHCI_WREG((regno), AHCI_RREG(regno) | (mask)) +#define AHCI_CLR(regno, mask) AHCI_WREG((regno), AHCI_RREG(regno) & ~(mask)) + +/*** IO macros for port-specific offsets inside of AHCI memory. ***/ +#define PX_OFST(port, regno) (HBA_PORT_NUM_REG * (port) + AHCI_PORTS + (regno)) +#define PX_RREG(port, regno) AHCI_RREG(PX_OFST((port), (regno))) +#define PX_WREG(port, regno, val) AHCI_WREG(PX_OFST((port), (regno)), (val)) +#define PX_SET(port, reg, mask) PX_WREG((port), (reg), \ + PX_RREG((port), (reg)) | (mask)); +#define PX_CLR(port, reg, mask) PX_WREG((port), (reg), \ + PX_RREG((port), (reg)) & ~(mask)); + +/* For calculating how big the PRD table needs to be: */ +#define CMD_TBL_SIZ(n) ((0x80 + ((n) * sizeof(PRD)) + 0x7F) & ~0x7F) + + +/*** Function Declarations ***/ +static QPCIDevice *get_ahci_device(void); +static QPCIDevice *start_ahci_device(QPCIDevice *dev, void **hba_base); +static void free_ahci_device(QPCIDevice *dev); +static void ahci_test_port_spec(QPCIDevice *ahci, void *hba_base, + HBACap *hcap, uint8_t port); +static void ahci_test_pci_spec(QPCIDevice *ahci); +static void ahci_test_pci_caps(QPCIDevice *ahci, uint16_t header, + uint8_t offset); +static void ahci_test_satacap(QPCIDevice *ahci, uint8_t offset); +static void ahci_test_msicap(QPCIDevice *ahci, uint8_t offset); +static void ahci_test_pmcap(QPCIDevice *ahci, uint8_t offset); + +/*** Utilities ***/ + +static void string_bswap16(uint16_t *s, size_t bytes) +{ + g_assert_cmphex((bytes & 1), ==, 0); + bytes /= 2; + + while (bytes--) { + *s = bswap16(*s); + s++; + } +} + +/** + * Locate, verify, and return a handle to the AHCI device. + */ +static QPCIDevice *get_ahci_device(void) +{ + QPCIDevice *ahci; + + pcibus = qpci_init_pc(); + + /* Find the AHCI PCI device and verify it's the right one. */ + ahci = qpci_device_find(pcibus, QPCI_DEVFN(0x1F, 0x02)); + g_assert(ahci != NULL); + + ahci_fingerprint = qpci_config_readl(ahci, PCI_VENDOR_ID); + + switch (ahci_fingerprint) { + case AHCI_INTEL_ICH9: + break; + default: + /* Unknown device. */ + g_assert_not_reached(); + } + + return ahci; +} + +static void free_ahci_device(QPCIDevice *ahci) +{ + /* libqos doesn't have a function for this, so free it manually */ + g_free(ahci); + + if (pcibus) { + qpci_free_pc(pcibus); + pcibus = NULL; + } + + /* Clear our cached barsize information. */ + barsize = 0; +} + +/*** Test Setup & Teardown ***/ + +/** + * Launch QEMU with the given command line, + * and then set up interrupts and our guest malloc interface. + */ +static void qtest_boot(const char *cmdline_fmt, ...) +{ + va_list ap; + char *cmdline; + + va_start(ap, cmdline_fmt); + cmdline = g_strdup_vprintf(cmdline_fmt, ap); + va_end(ap); + + qtest_start(cmdline); + qtest_irq_intercept_in(global_qtest, "ioapic"); + guest_malloc = pc_alloc_init(); + + g_free(cmdline); +} + +/** + * Tear down the QEMU instance. + */ +static void qtest_shutdown(void) +{ + g_free(guest_malloc); + guest_malloc = NULL; + qtest_end(); +} + +/** + * Start a Q35 machine and bookmark a handle to the AHCI device. + */ +static QPCIDevice *ahci_boot(void) +{ + qtest_boot("-drive if=none,id=drive0,file=%s,cache=writeback,serial=%s" + " -M q35 " + "-device ide-hd,drive=drive0 " + "-global ide-hd.ver=%s", + tmp_path, "testdisk", "version"); + + /* Verify that we have an AHCI device present. */ + return get_ahci_device(); +} + +/** + * Clean up the PCI device, then terminate the QEMU instance. + */ +static void ahci_shutdown(QPCIDevice *ahci) +{ + free_ahci_device(ahci); + qtest_shutdown(); +} + +/*** Logical Device Initialization ***/ + +/** + * Start the PCI device and sanity-check default operation. + */ +static void ahci_pci_enable(QPCIDevice *ahci, void **hba_base) +{ + uint8_t reg; + + start_ahci_device(ahci, hba_base); + + switch (ahci_fingerprint) { + case AHCI_INTEL_ICH9: + /* ICH9 has a register at PCI 0x92 that + * acts as a master port enabler mask. */ + reg = qpci_config_readb(ahci, 0x92); + reg |= 0x3F; + qpci_config_writeb(ahci, 0x92, reg); + /* 0...0111111b -- bit significant, ports 0-5 enabled. */ + ASSERT_BIT_SET(qpci_config_readb(ahci, 0x92), 0x3F); + break; + } + +} + +/** + * Map BAR5/ABAR, and engage the PCI device. + */ +static QPCIDevice *start_ahci_device(QPCIDevice *ahci, void **hba_base) +{ + /* Map AHCI's ABAR (BAR5) */ + *hba_base = qpci_iomap(ahci, 5, &barsize); + + /* turns on pci.cmd.iose, pci.cmd.mse and pci.cmd.bme */ + qpci_device_enable(ahci); + + return ahci; +} + +/** + * Test and initialize the AHCI's HBA memory areas. + * Initialize and start any ports with devices attached. + * Bring the HBA into the idle state. + */ +static void ahci_hba_enable(QPCIDevice *ahci, void *hba_base) +{ + /* Bits of interest in this section: + * GHC.AE Global Host Control / AHCI Enable + * PxCMD.ST Port Command: Start + * PxCMD.SUD "Spin Up Device" + * PxCMD.POD "Power On Device" + * PxCMD.FRE "FIS Receive Enable" + * PxCMD.FR "FIS Receive Running" + * PxCMD.CR "Command List Running" + */ + + g_assert(ahci != NULL); + g_assert(hba_base != NULL); + + uint32_t reg, ports_impl, clb, fb; + uint16_t i; + uint8_t num_cmd_slots; + + g_assert(hba_base != 0); + + /* Set GHC.AE to 1 */ + AHCI_SET(AHCI_GHC, AHCI_GHC_AE); + reg = AHCI_RREG(AHCI_GHC); + ASSERT_BIT_SET(reg, AHCI_GHC_AE); + + /* Read CAP.NCS, how many command slots do we have? */ + reg = AHCI_RREG(AHCI_CAP); + num_cmd_slots = ((reg & AHCI_CAP_NCS) >> ctzl(AHCI_CAP_NCS)) + 1; + g_test_message("Number of Command Slots: %u", num_cmd_slots); + + /* Determine which ports are implemented. */ + ports_impl = AHCI_RREG(AHCI_PI); + + for (i = 0; ports_impl; ports_impl >>= 1, ++i) { + if (!(ports_impl & 0x01)) { + continue; + } + + g_test_message("Initializing port %u", i); + + reg = PX_RREG(i, AHCI_PX_CMD); + if (BITCLR(reg, AHCI_PX_CMD_ST | AHCI_PX_CMD_CR | + AHCI_PX_CMD_FRE | AHCI_PX_CMD_FR)) { + g_test_message("port is idle"); + } else { + g_test_message("port needs to be idled"); + PX_CLR(i, AHCI_PX_CMD, (AHCI_PX_CMD_ST | AHCI_PX_CMD_FRE)); + /* The port has 500ms to disengage. */ + usleep(500000); + reg = PX_RREG(i, AHCI_PX_CMD); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_CR); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_FR); + g_test_message("port is now idle"); + /* The spec does allow for possibly needing a PORT RESET + * or HBA reset if we fail to idle the port. */ + } + + /* Allocate Memory for the Command List Buffer & FIS Buffer */ + /* PxCLB space ... 0x20 per command, as in 4.2.2 p 36 */ + clb = guest_alloc(guest_malloc, num_cmd_slots * 0x20); + g_test_message("CLB: 0x%08x", clb); + PX_WREG(i, AHCI_PX_CLB, clb); + g_assert_cmphex(clb, ==, PX_RREG(i, AHCI_PX_CLB)); + + /* PxFB space ... 0x100, as in 4.2.1 p 35 */ + fb = guest_alloc(guest_malloc, 0x100); + g_test_message("FB: 0x%08x", fb); + PX_WREG(i, AHCI_PX_FB, fb); + g_assert_cmphex(fb, ==, PX_RREG(i, AHCI_PX_FB)); + + /* Clear PxSERR, PxIS, then IS.IPS[x] by writing '1's. */ + PX_WREG(i, AHCI_PX_SERR, 0xFFFFFFFF); + PX_WREG(i, AHCI_PX_IS, 0xFFFFFFFF); + AHCI_WREG(AHCI_IS, (1 << i)); + + /* Verify Interrupts Cleared */ + reg = PX_RREG(i, AHCI_PX_SERR); + g_assert_cmphex(reg, ==, 0); + + reg = PX_RREG(i, AHCI_PX_IS); + g_assert_cmphex(reg, ==, 0); + + reg = AHCI_RREG(AHCI_IS); + ASSERT_BIT_CLEAR(reg, (1 << i)); + + /* Enable All Interrupts: */ + PX_WREG(i, AHCI_PX_IE, 0xFFFFFFFF); + reg = PX_RREG(i, AHCI_PX_IE); + g_assert_cmphex(reg, ==, ~((uint32_t)AHCI_PX_IE_RESERVED)); + + /* Enable the FIS Receive Engine. */ + PX_SET(i, AHCI_PX_CMD, AHCI_PX_CMD_FRE); + reg = PX_RREG(i, AHCI_PX_CMD); + ASSERT_BIT_SET(reg, AHCI_PX_CMD_FR); + + /* AHCI 1.3 spec: if !STS.BSY, !STS.DRQ and PxSSTS.DET indicates + * physical presence, a device is present and may be started. However, + * PxSERR.DIAG.X /may/ need to be cleared a priori. */ + reg = PX_RREG(i, AHCI_PX_SERR); + if (BITSET(reg, AHCI_PX_SERR_DIAG_X)) { + PX_SET(i, AHCI_PX_SERR, AHCI_PX_SERR_DIAG_X); + } + + reg = PX_RREG(i, AHCI_PX_TFD); + if (BITCLR(reg, AHCI_PX_TFD_STS_BSY | AHCI_PX_TFD_STS_DRQ)) { + reg = PX_RREG(i, AHCI_PX_SSTS); + if ((reg & AHCI_PX_SSTS_DET) == SSTS_DET_ESTABLISHED) { + /* Device Found: set PxCMD.ST := 1 */ + PX_SET(i, AHCI_PX_CMD, AHCI_PX_CMD_ST); + ASSERT_BIT_SET(PX_RREG(i, AHCI_PX_CMD), AHCI_PX_CMD_CR); + g_test_message("Started Device %u", i); + } else if ((reg & AHCI_PX_SSTS_DET)) { + /* Device present, but in some unknown state. */ + g_assert_not_reached(); + } + } + } + + /* Enable GHC.IE */ + AHCI_SET(AHCI_GHC, AHCI_GHC_IE); + reg = AHCI_RREG(AHCI_GHC); + ASSERT_BIT_SET(reg, AHCI_GHC_IE); + + /* TODO: The device should now be idling and waiting for commands. + * In the future, a small test-case to inspect the Register D2H FIS + * and clear the initial interrupts might be good. */ +} + +/*** Specification Adherence Tests ***/ + +/** + * Implementation for test_pci_spec. Ensures PCI configuration space is sane. + */ +static void ahci_test_pci_spec(QPCIDevice *ahci) +{ + uint8_t datab; + uint16_t data; + uint32_t datal; + + /* Most of these bits should start cleared until we turn them on. */ + data = qpci_config_readw(ahci, PCI_COMMAND); + ASSERT_BIT_CLEAR(data, PCI_COMMAND_MEMORY); + ASSERT_BIT_CLEAR(data, PCI_COMMAND_MASTER); + ASSERT_BIT_CLEAR(data, PCI_COMMAND_SPECIAL); /* Reserved */ + ASSERT_BIT_CLEAR(data, PCI_COMMAND_VGA_PALETTE); /* Reserved */ + ASSERT_BIT_CLEAR(data, PCI_COMMAND_PARITY); + ASSERT_BIT_CLEAR(data, PCI_COMMAND_WAIT); /* Reserved */ + ASSERT_BIT_CLEAR(data, PCI_COMMAND_SERR); + ASSERT_BIT_CLEAR(data, PCI_COMMAND_FAST_BACK); + ASSERT_BIT_CLEAR(data, PCI_COMMAND_INTX_DISABLE); + ASSERT_BIT_CLEAR(data, 0xF800); /* Reserved */ + + data = qpci_config_readw(ahci, PCI_STATUS); + ASSERT_BIT_CLEAR(data, 0x01 | 0x02 | 0x04); /* Reserved */ + ASSERT_BIT_CLEAR(data, PCI_STATUS_INTERRUPT); + ASSERT_BIT_SET(data, PCI_STATUS_CAP_LIST); /* must be set */ + ASSERT_BIT_CLEAR(data, PCI_STATUS_UDF); /* Reserved */ + ASSERT_BIT_CLEAR(data, PCI_STATUS_PARITY); + ASSERT_BIT_CLEAR(data, PCI_STATUS_SIG_TARGET_ABORT); + ASSERT_BIT_CLEAR(data, PCI_STATUS_REC_TARGET_ABORT); + ASSERT_BIT_CLEAR(data, PCI_STATUS_REC_MASTER_ABORT); + ASSERT_BIT_CLEAR(data, PCI_STATUS_SIG_SYSTEM_ERROR); + ASSERT_BIT_CLEAR(data, PCI_STATUS_DETECTED_PARITY); + + /* RID occupies the low byte, CCs occupy the high three. */ + datal = qpci_config_readl(ahci, PCI_CLASS_REVISION); + if (ahci_pedantic) { + /* AHCI 1.3 specifies that at-boot, the RID should reset to 0x00, + * Though in practice this is likely seldom true. */ + ASSERT_BIT_CLEAR(datal, 0xFF); + } + + /* BCC *must* equal 0x01. */ + g_assert_cmphex(PCI_BCC(datal), ==, 0x01); + if (PCI_SCC(datal) == 0x01) { + /* IDE */ + ASSERT_BIT_SET(0x80000000, datal); + ASSERT_BIT_CLEAR(0x60000000, datal); + } else if (PCI_SCC(datal) == 0x04) { + /* RAID */ + g_assert_cmphex(PCI_PI(datal), ==, 0); + } else if (PCI_SCC(datal) == 0x06) { + /* AHCI */ + g_assert_cmphex(PCI_PI(datal), ==, 0x01); + } else { + g_assert_not_reached(); + } + + datab = qpci_config_readb(ahci, PCI_CACHE_LINE_SIZE); + g_assert_cmphex(datab, ==, 0); + + datab = qpci_config_readb(ahci, PCI_LATENCY_TIMER); + g_assert_cmphex(datab, ==, 0); + + /* Only the bottom 7 bits must be off. */ + datab = qpci_config_readb(ahci, PCI_HEADER_TYPE); + ASSERT_BIT_CLEAR(datab, 0x7F); + + /* BIST is optional, but the low 7 bits must always start off regardless. */ + datab = qpci_config_readb(ahci, PCI_BIST); + ASSERT_BIT_CLEAR(datab, 0x7F); + + /* BARS 0-4 do not have a boot spec, but ABAR/BAR5 must be clean. */ + datal = qpci_config_readl(ahci, PCI_BASE_ADDRESS_5); + g_assert_cmphex(datal, ==, 0); + + qpci_config_writel(ahci, PCI_BASE_ADDRESS_5, 0xFFFFFFFF); + datal = qpci_config_readl(ahci, PCI_BASE_ADDRESS_5); + /* ABAR must be 32-bit, memory mapped, non-prefetchable and + * must be >= 512 bytes. To that end, bits 0-8 must be off. */ + ASSERT_BIT_CLEAR(datal, 0xFF); + + /* Capability list MUST be present, */ + datal = qpci_config_readl(ahci, PCI_CAPABILITY_LIST); + /* But these bits are reserved. */ + ASSERT_BIT_CLEAR(datal, ~0xFF); + g_assert_cmphex(datal, !=, 0); + + /* Check specification adherence for capability extenstions. */ + data = qpci_config_readw(ahci, datal); + + switch (ahci_fingerprint) { + case AHCI_INTEL_ICH9: + /* Intel ICH9 Family Datasheet 14.1.19 p.550 */ + g_assert_cmphex((data & 0xFF), ==, PCI_CAP_ID_MSI); + break; + default: + /* AHCI 1.3, Section 2.1.14 -- CAP must point to PMCAP. */ + g_assert_cmphex((data & 0xFF), ==, PCI_CAP_ID_PM); + } + + ahci_test_pci_caps(ahci, data, (uint8_t)datal); + + /* Reserved. */ + datal = qpci_config_readl(ahci, PCI_CAPABILITY_LIST + 4); + g_assert_cmphex(datal, ==, 0); + + /* IPIN might vary, but ILINE must be off. */ + datab = qpci_config_readb(ahci, PCI_INTERRUPT_LINE); + g_assert_cmphex(datab, ==, 0); +} + +/** + * Test PCI capabilities for AHCI specification adherence. + */ +static void ahci_test_pci_caps(QPCIDevice *ahci, uint16_t header, + uint8_t offset) +{ + uint8_t cid = header & 0xFF; + uint8_t next = header >> 8; + + g_test_message("CID: %02x; next: %02x", cid, next); + + switch (cid) { + case PCI_CAP_ID_PM: + ahci_test_pmcap(ahci, offset); + break; + case PCI_CAP_ID_MSI: + ahci_test_msicap(ahci, offset); + break; + case PCI_CAP_ID_SATA: + ahci_test_satacap(ahci, offset); + break; + + default: + g_test_message("Unknown CAP 0x%02x", cid); + } + + if (next) { + ahci_test_pci_caps(ahci, qpci_config_readw(ahci, next), next); + } +} + +/** + * Test SATA PCI capabilitity for AHCI specification adherence. + */ +static void ahci_test_satacap(QPCIDevice *ahci, uint8_t offset) +{ + uint16_t dataw; + uint32_t datal; + + g_test_message("Verifying SATACAP"); + + /* Assert that the SATACAP version is 1.0, And reserved bits are empty. */ + dataw = qpci_config_readw(ahci, offset + 2); + g_assert_cmphex(dataw, ==, 0x10); + + /* Grab the SATACR1 register. */ + datal = qpci_config_readw(ahci, offset + 4); + + switch (datal & 0x0F) { + case 0x04: /* BAR0 */ + case 0x05: /* BAR1 */ + case 0x06: + case 0x07: + case 0x08: + case 0x09: /* BAR5 */ + case 0x0F: /* Immediately following SATACR1 in PCI config space. */ + break; + default: + /* Invalid BARLOC for the Index Data Pair. */ + g_assert_not_reached(); + } + + /* Reserved. */ + g_assert_cmphex((datal >> 24), ==, 0x00); +} + +/** + * Test MSI PCI capability for AHCI specification adherence. + */ +static void ahci_test_msicap(QPCIDevice *ahci, uint8_t offset) +{ + uint16_t dataw; + uint32_t datal; + + g_test_message("Verifying MSICAP"); + + dataw = qpci_config_readw(ahci, offset + PCI_MSI_FLAGS); + ASSERT_BIT_CLEAR(dataw, PCI_MSI_FLAGS_ENABLE); + ASSERT_BIT_CLEAR(dataw, PCI_MSI_FLAGS_QSIZE); + ASSERT_BIT_CLEAR(dataw, PCI_MSI_FLAGS_RESERVED); + + datal = qpci_config_readl(ahci, offset + PCI_MSI_ADDRESS_LO); + g_assert_cmphex(datal, ==, 0); + + if (dataw & PCI_MSI_FLAGS_64BIT) { + g_test_message("MSICAP is 64bit"); + datal = qpci_config_readl(ahci, offset + PCI_MSI_ADDRESS_HI); + g_assert_cmphex(datal, ==, 0); + dataw = qpci_config_readw(ahci, offset + PCI_MSI_DATA_64); + g_assert_cmphex(dataw, ==, 0); + } else { + g_test_message("MSICAP is 32bit"); + dataw = qpci_config_readw(ahci, offset + PCI_MSI_DATA_32); + g_assert_cmphex(dataw, ==, 0); + } +} + +/** + * Test Power Management PCI capability for AHCI specification adherence. + */ +static void ahci_test_pmcap(QPCIDevice *ahci, uint8_t offset) +{ + uint16_t dataw; + + g_test_message("Verifying PMCAP"); + + dataw = qpci_config_readw(ahci, offset + PCI_PM_PMC); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CAP_PME_CLOCK); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CAP_RESERVED); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CAP_D1); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CAP_D2); + + dataw = qpci_config_readw(ahci, offset + PCI_PM_CTRL); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CTRL_STATE_MASK); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CTRL_RESERVED); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CTRL_DATA_SEL_MASK); + ASSERT_BIT_CLEAR(dataw, PCI_PM_CTRL_DATA_SCALE_MASK); +} + +static void ahci_test_hba_spec(QPCIDevice *ahci, void *hba_base) +{ + HBACap hcap; + unsigned i; + uint32_t cap, cap2, reg; + uint32_t ports; + uint8_t nports_impl; + uint8_t maxports; + + g_assert(ahci != 0); + g_assert(hba_base != 0); + + /* + * Note that the AHCI spec does expect the BIOS to set up a few things: + * CAP.SSS - Support for staggered spin-up (t/f) + * CAP.SMPS - Support for mechanical presence switches (t/f) + * PI - Ports Implemented (1-32) + * PxCMD.HPCP - Hot Plug Capable Port + * PxCMD.MPSP - Mechanical Presence Switch Present + * PxCMD.CPD - Cold Presence Detection support + * + * Additional items are touched if CAP.SSS is on, see AHCI 10.1.1 p.97: + * Foreach Port Implemented: + * -PxCMD.ST, PxCMD.CR, PxCMD.FRE, PxCMD.FR, PxSCTL.DET are 0 + * -PxCLB/U and PxFB/U are set to valid regions in memory + * -PxSUD is set to 1. + * -PxSSTS.DET is polled for presence; if detected, we continue: + * -PxSERR is cleared with 1's. + * -If PxTFD.STS.BSY, PxTFD.STS.DRQ, and PxTFD.STS.ERR are all zero, + * the device is ready. + */ + + /* 1 CAP - Capabilities Register */ + cap = AHCI_RREG(AHCI_CAP); + ASSERT_BIT_CLEAR(cap, AHCI_CAP_RESERVED); + + /* 2 GHC - Global Host Control */ + reg = AHCI_RREG(AHCI_GHC); + ASSERT_BIT_CLEAR(reg, AHCI_GHC_HR); + ASSERT_BIT_CLEAR(reg, AHCI_GHC_IE); + ASSERT_BIT_CLEAR(reg, AHCI_GHC_MRSM); + if (BITSET(cap, AHCI_CAP_SAM)) { + g_test_message("Supports AHCI-Only Mode: GHC_AE is Read-Only."); + ASSERT_BIT_SET(reg, AHCI_GHC_AE); + } else { + g_test_message("Supports AHCI/Legacy mix."); + ASSERT_BIT_CLEAR(reg, AHCI_GHC_AE); + } + + /* 3 IS - Interrupt Status */ + reg = AHCI_RREG(AHCI_IS); + g_assert_cmphex(reg, ==, 0); + + /* 4 PI - Ports Implemented */ + ports = AHCI_RREG(AHCI_PI); + /* Ports Implemented must be non-zero. */ + g_assert_cmphex(ports, !=, 0); + /* Ports Implemented must be <= Number of Ports. */ + nports_impl = ctpopl(ports); + g_assert_cmpuint(((AHCI_CAP_NP & cap) + 1), >=, nports_impl); + + g_assert_cmphex(barsize, >, 0); + /* Ports must be within the proper range. Given a mapping of SIZE, + * 256 bytes are used for global HBA control, and the rest is used + * for ports data, at 0x80 bytes each. */ + maxports = (barsize - HBA_DATA_REGION_SIZE) / HBA_PORT_DATA_SIZE; + /* e.g, 30 ports for 4K of memory. (4096 - 256) / 128 = 30 */ + g_assert_cmphex((reg >> maxports), ==, 0); + + /* 5 AHCI Version */ + reg = AHCI_RREG(AHCI_VS); + switch (reg) { + case AHCI_VERSION_0_95: + case AHCI_VERSION_1_0: + case AHCI_VERSION_1_1: + case AHCI_VERSION_1_2: + case AHCI_VERSION_1_3: + break; + default: + g_assert_not_reached(); + } + + /* 6 Command Completion Coalescing Control: depends on CAP.CCCS. */ + reg = AHCI_RREG(AHCI_CCCCTL); + if (BITSET(cap, AHCI_CAP_CCCS)) { + ASSERT_BIT_CLEAR(reg, AHCI_CCCCTL_EN); + ASSERT_BIT_CLEAR(reg, AHCI_CCCCTL_RESERVED); + ASSERT_BIT_SET(reg, AHCI_CCCCTL_CC); + ASSERT_BIT_SET(reg, AHCI_CCCCTL_TV); + } else { + g_assert_cmphex(reg, ==, 0); + } + + /* 7 CCC_PORTS */ + reg = AHCI_RREG(AHCI_CCCPORTS); + /* Must be zeroes initially regardless of CAP.CCCS */ + g_assert_cmphex(reg, ==, 0); + + /* 8 EM_LOC */ + reg = AHCI_RREG(AHCI_EMLOC); + if (BITCLR(cap, AHCI_CAP_EMS)) { + g_assert_cmphex(reg, ==, 0); + } + + /* 9 EM_CTL */ + reg = AHCI_RREG(AHCI_EMCTL); + if (BITSET(cap, AHCI_CAP_EMS)) { + ASSERT_BIT_CLEAR(reg, AHCI_EMCTL_STSMR); + ASSERT_BIT_CLEAR(reg, AHCI_EMCTL_CTLTM); + ASSERT_BIT_CLEAR(reg, AHCI_EMCTL_CTLRST); + ASSERT_BIT_CLEAR(reg, AHCI_EMCTL_RESERVED); + } else { + g_assert_cmphex(reg, ==, 0); + } + + /* 10 CAP2 -- Capabilities Extended */ + cap2 = AHCI_RREG(AHCI_CAP2); + ASSERT_BIT_CLEAR(cap2, AHCI_CAP2_RESERVED); + + /* 11 BOHC -- Bios/OS Handoff Control */ + reg = AHCI_RREG(AHCI_BOHC); + g_assert_cmphex(reg, ==, 0); + + /* 12 -- 23: Reserved */ + g_test_message("Verifying HBA reserved area is empty."); + for (i = AHCI_RESERVED; i < AHCI_NVMHCI; ++i) { + reg = AHCI_RREG(i); + g_assert_cmphex(reg, ==, 0); + } + + /* 24 -- 39: NVMHCI */ + if (BITCLR(cap2, AHCI_CAP2_NVMP)) { + g_test_message("Verifying HBA/NVMHCI area is empty."); + for (i = AHCI_NVMHCI; i < AHCI_VENDOR; ++i) { + reg = AHCI_RREG(i); + g_assert_cmphex(reg, ==, 0); + } + } + + /* 40 -- 63: Vendor */ + g_test_message("Verifying HBA/Vendor area is empty."); + for (i = AHCI_VENDOR; i < AHCI_PORTS; ++i) { + reg = AHCI_RREG(i); + g_assert_cmphex(reg, ==, 0); + } + + /* 64 -- XX: Port Space */ + hcap.cap = cap; + hcap.cap2 = cap2; + for (i = 0; ports || (i < maxports); ports >>= 1, ++i) { + if (BITSET(ports, 0x1)) { + g_test_message("Testing port %u for spec", i); + ahci_test_port_spec(ahci, hba_base, &hcap, i); + } else { + uint16_t j; + uint16_t low = AHCI_PORTS + (32 * i); + uint16_t high = AHCI_PORTS + (32 * (i + 1)); + g_test_message("Asserting unimplemented port %u " + "(reg [%u-%u]) is empty.", + i, low, high - 1); + for (j = low; j < high; ++j) { + reg = AHCI_RREG(j); + g_assert_cmphex(reg, ==, 0); + } + } + } +} + +/** + * Test the memory space for one port for specification adherence. + */ +static void ahci_test_port_spec(QPCIDevice *ahci, void *hba_base, + HBACap *hcap, uint8_t port) +{ + uint32_t reg; + unsigned i; + + /* (0) CLB */ + reg = PX_RREG(port, AHCI_PX_CLB); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CLB_RESERVED); + + /* (1) CLBU */ + if (BITCLR(hcap->cap, AHCI_CAP_S64A)) { + reg = PX_RREG(port, AHCI_PX_CLBU); + g_assert_cmphex(reg, ==, 0); + } + + /* (2) FB */ + reg = PX_RREG(port, AHCI_PX_FB); + ASSERT_BIT_CLEAR(reg, AHCI_PX_FB_RESERVED); + + /* (3) FBU */ + if (BITCLR(hcap->cap, AHCI_CAP_S64A)) { + reg = PX_RREG(port, AHCI_PX_FBU); + g_assert_cmphex(reg, ==, 0); + } + + /* (4) IS */ + reg = PX_RREG(port, AHCI_PX_IS); + g_assert_cmphex(reg, ==, 0); + + /* (5) IE */ + reg = PX_RREG(port, AHCI_PX_IE); + g_assert_cmphex(reg, ==, 0); + + /* (6) CMD */ + reg = PX_RREG(port, AHCI_PX_CMD); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_FRE); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_RESERVED); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_CCS); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_FR); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_CR); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_PMA); /* And RW only if CAP.SPM */ + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_APSTE); /* RW only if CAP2.APST */ + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_ATAPI); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_DLAE); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_ALPE); /* RW only if CAP.SALP */ + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_ASP); /* RW only if CAP.SALP */ + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_ICC); + /* If CPDetect support does not exist, CPState must be off. */ + if (BITCLR(reg, AHCI_PX_CMD_CPD)) { + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_CPS); + } + /* If MPSPresence is not set, MPSState must be off. */ + if (BITCLR(reg, AHCI_PX_CMD_MPSP)) { + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_MPSS); + } + /* If we do not support MPS, MPSS and MPSP must be off. */ + if (BITCLR(hcap->cap, AHCI_CAP_SMPS)) { + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_MPSS); + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_MPSP); + } + /* If, via CPD or MPSP we detect a drive, HPCP must be on. */ + if (BITANY(reg, AHCI_PX_CMD_CPD || AHCI_PX_CMD_MPSP)) { + ASSERT_BIT_SET(reg, AHCI_PX_CMD_HPCP); + } + /* HPCP and ESP cannot both be active. */ + g_assert(!BITSET(reg, AHCI_PX_CMD_HPCP | AHCI_PX_CMD_ESP)); + /* If CAP.FBSS is not set, FBSCP must not be set. */ + if (BITCLR(hcap->cap, AHCI_CAP_FBSS)) { + ASSERT_BIT_CLEAR(reg, AHCI_PX_CMD_FBSCP); + } + + /* (7) RESERVED */ + reg = PX_RREG(port, AHCI_PX_RES1); + g_assert_cmphex(reg, ==, 0); + + /* (8) TFD */ + reg = PX_RREG(port, AHCI_PX_TFD); + /* At boot, prior to an FIS being received, the TFD register should be 0x7F, + * which breaks down as follows, as seen in AHCI 1.3 sec 3.3.8, p. 27. */ + ASSERT_BIT_SET(reg, AHCI_PX_TFD_STS_ERR); + ASSERT_BIT_SET(reg, AHCI_PX_TFD_STS_CS1); + ASSERT_BIT_SET(reg, AHCI_PX_TFD_STS_DRQ); + ASSERT_BIT_SET(reg, AHCI_PX_TFD_STS_CS2); + ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_STS_BSY); + ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_ERR); + ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_RESERVED); + + /* (9) SIG */ + /* Though AHCI specifies the boot value should be 0xFFFFFFFF, + * Even when GHC.ST is zero, the AHCI HBA may receive the initial + * D2H register FIS and update the signature asynchronously, + * so we cannot expect a value here. AHCI 1.3, sec 3.3.9, pp 27-28 */ + + /* (10) SSTS / SCR0: SStatus */ + reg = PX_RREG(port, AHCI_PX_SSTS); + ASSERT_BIT_CLEAR(reg, AHCI_PX_SSTS_RESERVED); + /* Even though the register should be 0 at boot, it is asynchronous and + * prone to change, so we cannot test any well known value. */ + + /* (11) SCTL / SCR2: SControl */ + reg = PX_RREG(port, AHCI_PX_SCTL); + g_assert_cmphex(reg, ==, 0); + + /* (12) SERR / SCR1: SError */ + reg = PX_RREG(port, AHCI_PX_SERR); + g_assert_cmphex(reg, ==, 0); + + /* (13) SACT / SCR3: SActive */ + reg = PX_RREG(port, AHCI_PX_SACT); + g_assert_cmphex(reg, ==, 0); + + /* (14) CI */ + reg = PX_RREG(port, AHCI_PX_CI); + g_assert_cmphex(reg, ==, 0); + + /* (15) SNTF */ + reg = PX_RREG(port, AHCI_PX_SNTF); + g_assert_cmphex(reg, ==, 0); + + /* (16) FBS */ + reg = PX_RREG(port, AHCI_PX_FBS); + ASSERT_BIT_CLEAR(reg, AHCI_PX_FBS_EN); + ASSERT_BIT_CLEAR(reg, AHCI_PX_FBS_DEC); + ASSERT_BIT_CLEAR(reg, AHCI_PX_FBS_SDE); + ASSERT_BIT_CLEAR(reg, AHCI_PX_FBS_DEV); + ASSERT_BIT_CLEAR(reg, AHCI_PX_FBS_DWE); + ASSERT_BIT_CLEAR(reg, AHCI_PX_FBS_RESERVED); + if (BITSET(hcap->cap, AHCI_CAP_FBSS)) { + /* if Port-Multiplier FIS-based switching avail, ADO must >= 2 */ + g_assert((reg & AHCI_PX_FBS_ADO) >> ctzl(AHCI_PX_FBS_ADO) >= 2); + } + + /* [17 -- 27] RESERVED */ + for (i = AHCI_PX_RES2; i < AHCI_PX_VS; ++i) { + reg = PX_RREG(port, i); + g_assert_cmphex(reg, ==, 0); + } + + /* [28 -- 31] Vendor-Specific */ + for (i = AHCI_PX_VS; i < 32; ++i) { + reg = PX_RREG(port, i); + if (reg) { + g_test_message("INFO: Vendor register %u non-empty", i); + } + } +} + +/** + * Utilizing an initialized AHCI HBA, issue an IDENTIFY command to the first + * device we see, then read and check the response. + */ +static void ahci_test_identify(QPCIDevice *ahci, void *hba_base) +{ + RegD2HFIS *d2h = g_malloc0(0x20); + RegD2HFIS *pio = g_malloc0(0x20); + RegH2DFIS fis; + AHCICommand cmd; + PRD prd; + uint32_t ports, reg, clb, table, fb, data_ptr; + uint16_t buff[256]; + unsigned i; + int rc; + + g_assert(ahci != NULL); + g_assert(hba_base != NULL); + + /* We need to: + * (1) Create a Command Table Buffer and update the Command List Slot #0 + * to point to this buffer. + * (2) Construct an FIS host-to-device command structure, and write it to + * the top of the command table buffer. + * (3) Create a data buffer for the IDENTIFY response to be sent to + * (4) Create a Physical Region Descriptor that points to the data buffer, + * and write it to the bottom (offset 0x80) of the command table. + * (5) Now, PxCLB points to the command list, command 0 points to + * our table, and our table contains an FIS instruction and a + * PRD that points to our rx buffer. + * (6) We inform the HBA via PxCI that there is a command ready in slot #0. + */ + + /* Pick the first implemented and running port */ + ports = AHCI_RREG(AHCI_PI); + for (i = 0; i < 32; ports >>= 1, ++i) { + if (ports == 0) { + i = 32; + } + + if (!(ports & 0x01)) { + continue; + } + + reg = PX_RREG(i, AHCI_PX_CMD); + if (BITSET(reg, AHCI_PX_CMD_ST)) { + break; + } + } + g_assert_cmphex(i, <, 32); + g_test_message("Selected port %u for test", i); + + /* Clear out this port's interrupts (ignore the init register d2h fis) */ + reg = PX_RREG(i, AHCI_PX_IS); + PX_WREG(i, AHCI_PX_IS, reg); + g_assert_cmphex(PX_RREG(i, AHCI_PX_IS), ==, 0); + + /* Wipe the FIS-Recieve Buffer */ + fb = PX_RREG(i, AHCI_PX_FB); + g_assert_cmphex(fb, !=, 0); + qmemset(fb, 0x00, 0x100); + + /* Create a Command Table buffer. 0x80 is the smallest with a PRDTL of 0. */ + /* We need at least one PRD, so round up to the nearest 0x80 multiple. */ + table = guest_alloc(guest_malloc, CMD_TBL_SIZ(1)); + g_assert(table); + ASSERT_BIT_CLEAR(table, 0x7F); + + /* Create a data buffer ... where we will dump the IDENTIFY data to. */ + data_ptr = guest_alloc(guest_malloc, 512); + g_assert(data_ptr); + + /* Grab the Command List Buffer pointer */ + clb = PX_RREG(i, AHCI_PX_CLB); + g_assert(clb); + + /* Copy the existing Command #0 structure from the CLB into local memory, + * and build a new command #0. */ + memread(clb, &cmd, sizeof(cmd)); + cmd.b1 = 5; /* reg_h2d_fis is 5 double-words long */ + cmd.b2 = 0x04; /* clear PxTFD.STS.BSY when done */ + cmd.prdtl = cpu_to_le16(1); /* One PRD table entry. */ + cmd.prdbc = 0; + cmd.ctba = cpu_to_le32(table); + cmd.ctbau = 0; + + /* Construct our PRD, noting that DBC is 0-indexed. */ + prd.dba = cpu_to_le32(data_ptr); + prd.dbau = 0; + prd.res = 0; + /* 511+1 bytes, request DPS interrupt */ + prd.dbc = cpu_to_le32(511 | 0x80000000); + + /* Construct our Command FIS, Based on http://wiki.osdev.org/AHCI */ + memset(&fis, 0x00, sizeof(fis)); + fis.fis_type = 0x27; /* Register Host-to-Device FIS */ + fis.command = 0xEC; /* IDENTIFY */ + fis.device = 0; + fis.flags = 0x80; /* Indicate this is a command FIS */ + + /* We've committed nothing yet, no interrupts should be posted yet. */ + g_assert_cmphex(PX_RREG(i, AHCI_PX_IS), ==, 0); + + /* Commit the Command FIS to the Command Table */ + memwrite(table, &fis, sizeof(fis)); + + /* Commit the PRD entry to the Command Table */ + memwrite(table + 0x80, &prd, sizeof(prd)); + + /* Commit Command #0, pointing to the Table, to the Command List Buffer. */ + memwrite(clb, &cmd, sizeof(cmd)); + + /* Everything is in place, but we haven't given the go-ahead yet. */ + g_assert_cmphex(PX_RREG(i, AHCI_PX_IS), ==, 0); + + /* Issue Command #0 via PxCI */ + PX_WREG(i, AHCI_PX_CI, (1 << 0)); + while (BITSET(PX_RREG(i, AHCI_PX_TFD), AHCI_PX_TFD_STS_BSY)) { + usleep(50); + } + + /* Check for expected interrupts */ + reg = PX_RREG(i, AHCI_PX_IS); + ASSERT_BIT_SET(reg, AHCI_PX_IS_DHRS); + ASSERT_BIT_SET(reg, AHCI_PX_IS_PSS); + /* BUG: we expect AHCI_PX_IS_DPS to be set. */ + ASSERT_BIT_CLEAR(reg, AHCI_PX_IS_DPS); + + /* Clear expected interrupts and assert all interrupts now cleared. */ + PX_WREG(i, AHCI_PX_IS, AHCI_PX_IS_DHRS | AHCI_PX_IS_PSS | AHCI_PX_IS_DPS); + g_assert_cmphex(PX_RREG(i, AHCI_PX_IS), ==, 0); + + /* Check for errors. */ + reg = PX_RREG(i, AHCI_PX_SERR); + g_assert_cmphex(reg, ==, 0); + reg = PX_RREG(i, AHCI_PX_TFD); + ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_STS_ERR); + ASSERT_BIT_CLEAR(reg, AHCI_PX_TFD_ERR); + + /* Investigate CMD #0, assert that we read 512 bytes */ + memread(clb, &cmd, sizeof(cmd)); + g_assert_cmphex(512, ==, le32_to_cpu(cmd.prdbc)); + + /* Investigate FIS responses */ + memread(fb + 0x20, pio, 0x20); + memread(fb + 0x40, d2h, 0x20); + g_assert_cmphex(pio->fis_type, ==, 0x5f); + g_assert_cmphex(d2h->fis_type, ==, 0x34); + g_assert_cmphex(pio->flags, ==, d2h->flags); + g_assert_cmphex(pio->status, ==, d2h->status); + g_assert_cmphex(pio->error, ==, d2h->error); + + reg = PX_RREG(i, AHCI_PX_TFD); + g_assert_cmphex((reg & AHCI_PX_TFD_ERR), ==, pio->error); + g_assert_cmphex((reg & AHCI_PX_TFD_STS), ==, pio->status); + /* The PIO Setup FIS contains a "bytes read" field, which is a + * 16-bit value. The Physical Region Descriptor Byte Count is + * 32-bit, but for small transfers using one PRD, it should match. */ + g_assert_cmphex(le16_to_cpu(pio->res4), ==, le32_to_cpu(cmd.prdbc)); + + /* Last, but not least: Investigate the IDENTIFY response data. */ + memread(data_ptr, &buff, 512); + + /* Check serial number/version in the buffer */ + /* NB: IDENTIFY strings are packed in 16bit little endian chunks. + * Since we copy byte-for-byte in ahci-test, on both LE and BE, we need to + * unchunk this data. By contrast, ide-test copies 2 bytes at a time, and + * as a consequence, only needs to unchunk the data on LE machines. */ + string_bswap16(&buff[10], 20); + rc = memcmp(&buff[10], "testdisk ", 20); + g_assert_cmphex(rc, ==, 0); + + string_bswap16(&buff[23], 8); + rc = memcmp(&buff[23], "version ", 8); + g_assert_cmphex(rc, ==, 0); + + g_free(d2h); + g_free(pio); +} + +/******************************************************************************/ +/* Test Interfaces */ +/******************************************************************************/ + +/** + * Basic sanity test to boot a machine, find an AHCI device, and shutdown. + */ +static void test_sanity(void) +{ + QPCIDevice *ahci; + ahci = ahci_boot(); + ahci_shutdown(ahci); +} + +/** + * Ensure that the PCI configuration space for the AHCI device is in-line with + * the AHCI 1.3 specification for initial values. + */ +static void test_pci_spec(void) +{ + QPCIDevice *ahci; + ahci = ahci_boot(); + ahci_test_pci_spec(ahci); + ahci_shutdown(ahci); +} + +/** + * Engage the PCI AHCI device and sanity check the response. + * Perform additional PCI config space bringup for the HBA. + */ +static void test_pci_enable(void) +{ + QPCIDevice *ahci; + void *hba_base; + ahci = ahci_boot(); + ahci_pci_enable(ahci, &hba_base); + ahci_shutdown(ahci); +} + +/** + * Investigate the memory mapped regions of the HBA, + * and test them for AHCI specification adherence. + */ +static void test_hba_spec(void) +{ + QPCIDevice *ahci; + void *hba_base; + + ahci = ahci_boot(); + ahci_pci_enable(ahci, &hba_base); + ahci_test_hba_spec(ahci, hba_base); + ahci_shutdown(ahci); +} + +/** + * Engage the HBA functionality of the AHCI PCI device, + * and bring it into a functional idle state. + */ +static void test_hba_enable(void) +{ + QPCIDevice *ahci; + void *hba_base; + + ahci = ahci_boot(); + ahci_pci_enable(ahci, &hba_base); + ahci_hba_enable(ahci, hba_base); + ahci_shutdown(ahci); +} + +/** + * Bring up the device and issue an IDENTIFY command. + * Inspect the state of the HBA device and the data returned. + */ +static void test_identify(void) +{ + QPCIDevice *ahci; + void *hba_base; + + ahci = ahci_boot(); + ahci_pci_enable(ahci, &hba_base); + ahci_hba_enable(ahci, hba_base); + ahci_test_identify(ahci, hba_base); + ahci_shutdown(ahci); +} + +/******************************************************************************/ + +int main(int argc, char **argv) +{ + const char *arch; + int fd; + int ret; + int c; + + static struct option long_options[] = { + {"pedantic", no_argument, 0, 'p' }, + {0, 0, 0, 0}, + }; + + /* Should be first to utilize g_test functionality, So we can see errors. */ + g_test_init(&argc, &argv, NULL); + + while (1) { + c = getopt_long(argc, argv, "", long_options, NULL); + if (c == -1) { + break; + } + switch (c) { + case -1: + break; + case 'p': + ahci_pedantic = 1; + break; + default: + fprintf(stderr, "Unrecognized ahci_test option.\n"); + g_assert_not_reached(); + } + } + + /* Check architecture */ + arch = qtest_get_arch(); + if (strcmp(arch, "i386") && strcmp(arch, "x86_64")) { + g_test_message("Skipping test for non-x86"); + return 0; + } + + /* Create a temporary raw image */ + fd = mkstemp(tmp_path); + g_assert(fd >= 0); + ret = ftruncate(fd, TEST_IMAGE_SIZE); + g_assert(ret == 0); + close(fd); + + /* Run the tests */ + qtest_add_func("/ahci/sanity", test_sanity); + qtest_add_func("/ahci/pci_spec", test_pci_spec); + qtest_add_func("/ahci/pci_enable", test_pci_enable); + qtest_add_func("/ahci/hba_spec", test_hba_spec); + qtest_add_func("/ahci/hba_enable", test_hba_enable); + qtest_add_func("/ahci/identify", test_identify); + + ret = g_test_run(); + + /* Cleanup */ + unlink(tmp_path); + + return ret; +} diff --git a/tests/image-fuzzer/qcow2/fuzz.py b/tests/image-fuzzer/qcow2/fuzz.py index 57527f9..20eba6b 100644 --- a/tests/image-fuzzer/qcow2/fuzz.py +++ b/tests/image-fuzzer/qcow2/fuzz.py @@ -18,8 +18,8 @@ import random - UINT8 = 0xff +UINT16 = 0xffff UINT32 = 0xffffffff UINT64 = 0xffffffffffffffff # Most significant bit orders @@ -28,6 +28,8 @@ UINT64_M = 63 # Fuzz vectors UINT8_V = [0, 0x10, UINT8/4, UINT8/2 - 1, UINT8/2, UINT8/2 + 1, UINT8 - 1, UINT8] +UINT16_V = [0, 0x100, 0x1000, UINT16/4, UINT16/2 - 1, UINT16/2, UINT16/2 + 1, + UINT16 - 1, UINT16] UINT32_V = [0, 0x100, 0x1000, 0x10000, 0x100000, UINT32/4, UINT32/2 - 1, UINT32/2, UINT32/2 + 1, UINT32 - 1, UINT32] UINT64_V = UINT32_V + [0x1000000, 0x10000000, 0x100000000, UINT64/4, @@ -332,9 +334,8 @@ def l1_entry(current): constraints = UINT64_V # Reserved bits are ignored # Added a possibility when only flags are fuzzed - offset = 0x7fffffffffffffff & random.choice([selector(current, - constraints), - current]) + offset = 0x7fffffffffffffff & \ + random.choice([selector(current, constraints), current]) is_cow = random.randint(0, 1) return offset + (is_cow << UINT64_M) @@ -344,12 +345,23 @@ def l2_entry(current): constraints = UINT64_V # Reserved bits are ignored # Add a possibility when only flags are fuzzed - offset = 0x3ffffffffffffffe & random.choice([selector(current, - constraints), - current]) + offset = 0x3ffffffffffffffe & \ + random.choice([selector(current, constraints), current]) is_compressed = random.randint(0, 1) is_cow = random.randint(0, 1) is_zero = random.randint(0, 1) value = offset + (is_cow << UINT64_M) + \ (is_compressed << UINT64_M - 1) + is_zero return value + + +def refcount_table_entry(current): + """Fuzz an entry of the refcount table.""" + constraints = UINT64_V + return selector(current, constraints) + + +def refcount_block_entry(current): + """Fuzz an entry of a refcount block.""" + constraints = UINT16_V + return selector(current, constraints) diff --git a/tests/image-fuzzer/qcow2/layout.py b/tests/image-fuzzer/qcow2/layout.py index 730c771..63e801f 100644 --- a/tests/image-fuzzer/qcow2/layout.py +++ b/tests/image-fuzzer/qcow2/layout.py @@ -102,6 +102,8 @@ class Image(object): self.end_of_extension_area = FieldsList() self.l2_tables = FieldsList() self.l1_table = FieldsList() + self.refcount_table = FieldsList() + self.refcount_blocks = FieldsList() self.ext_offset = 0 self.create_header(cluster_bits, backing_file_name) self.set_backing_file_name(backing_file_name) @@ -113,7 +115,8 @@ class Image(object): def __iter__(self): return chain(self.header, self.backing_file_format, self.feature_name_table, self.end_of_extension_area, - self.backing_file_name, self.l1_table, self.l2_tables) + self.backing_file_name, self.l1_table, self.l2_tables, + self.refcount_table, self.refcount_blocks) def create_header(self, cluster_bits, backing_file_name=None): """Generate a random valid header.""" @@ -330,6 +333,138 @@ class Image(object): float(self.cluster_size**2))) self.header['l1_table_offset'][0].value = l1_offset + def create_refcount_structures(self): + """Generate random refcount blocks and refcount table.""" + def allocate_rfc_blocks(data, size): + """Return indices of clusters allocated for refcount blocks.""" + cluster_ids = set() + diff = block_ids = set([x / size for x in data]) + while len(diff) != 0: + # Allocate all yet not allocated clusters + new = self._get_available_clusters(data | cluster_ids, + len(diff)) + # Indices of new refcount blocks necessary to cover clusters + # in 'new' + diff = set([x / size for x in new]) - block_ids + cluster_ids |= new + block_ids |= diff + return cluster_ids, block_ids + + def allocate_rfc_table(data, init_blocks, block_size): + """Return indices of clusters allocated for the refcount table + and updated indices of clusters allocated for blocks and indices + of blocks. + """ + blocks = set(init_blocks) + clusters = set() + # Number of entries in one cluster of the refcount table + size = self.cluster_size / UINT64_S + # Number of clusters necessary for the refcount table based on + # the current number of refcount blocks + table_size = int(ceil((max(blocks) + 1) / float(size))) + # Index of the first cluster of the refcount table + table_start = self._get_adjacent_clusters(data, table_size + 1) + # Clusters allocated for the current length of the refcount table + table_clusters = set(range(table_start, table_start + table_size)) + # Clusters allocated for the refcount table including + # last optional one for potential l1 growth + table_clusters_allocated = set(range(table_start, table_start + + table_size + 1)) + # New refcount blocks necessary for clusters occupied by the + # refcount table + diff = set([c / block_size for c in table_clusters]) - blocks + blocks |= diff + while len(diff) != 0: + # Allocate clusters for new refcount blocks + new = self._get_available_clusters((data | clusters) | + table_clusters_allocated, + len(diff)) + # Indices of new refcount blocks necessary to cover + # clusters in 'new' + diff = set([x / block_size for x in new]) - blocks + clusters |= new + blocks |= diff + # Check if the refcount table needs one more cluster + if int(ceil((max(blocks) + 1) / float(size))) > table_size: + new_block_id = (table_start + table_size) / block_size + # Check if the additional table cluster needs + # one more refcount block + if new_block_id not in blocks: + diff.add(new_block_id) + table_clusters.add(table_start + table_size) + table_size += 1 + return table_clusters, blocks, clusters + + def create_table_entry(table_offset, block_cluster, block_size, + cluster): + """Generate a refcount table entry.""" + offset = table_offset + UINT64_S * (cluster / block_size) + return ['>Q', offset, block_cluster * self.cluster_size, + 'refcount_table_entry'] + + def create_block_entry(block_cluster, block_size, cluster): + """Generate a list of entries for the current block.""" + entry_size = self.cluster_size / block_size + offset = block_cluster * self.cluster_size + entry_offset = offset + entry_size * (cluster % block_size) + # While snapshots are not supported all refcounts are set to 1 + return ['>H', entry_offset, 1, 'refcount_block_entry'] + # Size of a block entry in bits + refcount_bits = 1 << self.header['refcount_order'][0].value + # Number of refcount entries per refcount block + # Convert self.cluster_size from bytes to bits to have the same + # base for the numerator and denominator + block_size = self.cluster_size * 8 / refcount_bits + meta_data = self._get_metadata() + if len(self.data_clusters) == 0: + # All metadata for an empty guest image needs 4 clusters: + # header, rfc table, rfc block, L1 table. + # Header takes cluster #0, other clusters ##1-3 can be used + block_clusters = set([random.choice(list(set(range(1, 4)) - + meta_data))]) + block_ids = set([0]) + table_clusters = set([random.choice(list(set(range(1, 4)) - + meta_data - + block_clusters))]) + else: + block_clusters, block_ids = \ + allocate_rfc_blocks(self.data_clusters | + meta_data, block_size) + table_clusters, block_ids, new_clusters = \ + allocate_rfc_table(self.data_clusters | + meta_data | + block_clusters, + block_ids, + block_size) + block_clusters |= new_clusters + + meta_data |= block_clusters | table_clusters + table_offset = min(table_clusters) * self.cluster_size + block_id = None + # Clusters allocated for refcount blocks + block_clusters = list(block_clusters) + # Indices of refcount blocks + block_ids = list(block_ids) + # Refcount table entries + rfc_table = [] + # Refcount entries + rfc_blocks = [] + + for cluster in sorted(self.data_clusters | meta_data): + if cluster / block_size != block_id: + block_id = cluster / block_size + block_cluster = block_clusters[block_ids.index(block_id)] + rfc_table.append(create_table_entry(table_offset, + block_cluster, + block_size, cluster)) + rfc_blocks.append(create_block_entry(block_cluster, block_size, + cluster)) + self.refcount_table = FieldsList(rfc_table) + self.refcount_blocks = FieldsList(rfc_blocks) + + self.header['refcount_table_offset'][0].value = table_offset + self.header['refcount_table_clusters'][0].value = len(table_clusters) + def fuzz(self, fields_to_fuzz=None): """Fuzz an image by corrupting values of a random subset of its fields. @@ -471,6 +606,7 @@ def create_image(test_img_path, backing_file_name=None, backing_file_fmt=None, image.create_feature_name_table() image.set_end_of_extension_area() image.create_l_structures() + image.create_refcount_structures() image.fuzz(fields_to_fuzz) image.write(test_img_path) return image.image_size diff --git a/tests/image-fuzzer/runner.py b/tests/image-fuzzer/runner.py index c903c8a..0a8743e 100755 --- a/tests/image-fuzzer/runner.py +++ b/tests/image-fuzzer/runner.py @@ -70,7 +70,7 @@ def run_app(fd, q_args): """Exception for signal.alarm events.""" pass - def handler(*arg): + def handler(*args): """Notify that an alarm event occurred.""" raise Alarm @@ -134,8 +134,8 @@ class TestEnv(object): self.init_path = os.getcwd() self.work_dir = work_dir self.current_dir = os.path.join(work_dir, 'test-' + test_id) - self.qemu_img = os.environ.get('QEMU_IMG', 'qemu-img')\ - .strip().split(' ') + self.qemu_img = \ + os.environ.get('QEMU_IMG', 'qemu-img').strip().split(' ') self.qemu_io = os.environ.get('QEMU_IO', 'qemu-io').strip().split(' ') self.commands = [['qemu-img', 'check', '-f', 'qcow2', '$test_img'], ['qemu-img', 'info', '-f', 'qcow2', '$test_img'], @@ -150,8 +150,7 @@ class TestEnv(object): 'discard $off $len'], ['qemu-io', '$test_img', '-c', 'truncate $off']] - for fmt in ['raw', 'vmdk', 'vdi', 'cow', 'qcow2', 'file', - 'qed', 'vpc']: + for fmt in ['raw', 'vmdk', 'vdi', 'qcow2', 'file', 'qed', 'vpc']: self.commands.append( ['qemu-img', 'convert', '-f', 'qcow2', '-O', fmt, '$test_img', 'converted_image.' + fmt]) @@ -178,7 +177,7 @@ class TestEnv(object): by 'qemu-img create'. """ # All formats supported by the 'qemu-img create' command. - backing_file_fmt = random.choice(['raw', 'vmdk', 'vdi', 'cow', 'qcow2', + backing_file_fmt = random.choice(['raw', 'vmdk', 'vdi', 'qcow2', 'file', 'qed', 'vpc']) backing_file_name = 'backing_img.' + backing_file_fmt backing_file_size = random.randint(MIN_BACKING_FILE_SIZE, @@ -212,10 +211,8 @@ class TestEnv(object): os.chdir(self.current_dir) backing_file_name, backing_file_fmt = self._create_backing_file() - img_size = image_generator.create_image('test.img', - backing_file_name, - backing_file_fmt, - fuzz_config) + img_size = image_generator.create_image( + 'test.img', backing_file_name, backing_file_fmt, fuzz_config) for item in commands: shutil.copy('test.img', 'copy.img') # 'off' and 'len' are multiple of the sector size @@ -228,7 +225,7 @@ class TestEnv(object): elif item[0] == 'qemu-io': current_cmd = list(self.qemu_io) else: - multilog("Warning: test command '%s' is not defined.\n" \ + multilog("Warning: test command '%s' is not defined.\n" % item[0], sys.stderr, self.log, self.parent_log) continue # Replace all placeholders with their real values @@ -244,29 +241,28 @@ class TestEnv(object): "Backing file: %s\n" \ % (self.seed, " ".join(current_cmd), self.current_dir, backing_file_name) - temp_log = StringIO.StringIO() try: retcode = run_app(temp_log, current_cmd) except OSError, e: - multilog(test_summary + "Error: Start of '%s' failed. " \ - "Reason: %s\n\n" % (os.path.basename( - current_cmd[0]), e[1]), + multilog("%sError: Start of '%s' failed. Reason: %s\n\n" + % (test_summary, os.path.basename(current_cmd[0]), + e[1]), sys.stderr, self.log, self.parent_log) raise TestException if retcode < 0: self.log.write(temp_log.getvalue()) - multilog(test_summary + "FAIL: Test terminated by signal " + - "%s\n\n" % str_signal(-retcode), sys.stderr, self.log, - self.parent_log) + multilog("%sFAIL: Test terminated by signal %s\n\n" + % (test_summary, str_signal(-retcode)), + sys.stderr, self.log, self.parent_log) self.failed = True else: if self.log_all: self.log.write(temp_log.getvalue()) - multilog(test_summary + "PASS: Application exited with" + - " the code '%d'\n\n" % retcode, sys.stdout, - self.log, self.parent_log) + multilog("%sPASS: Application exited with the code " \ + "'%d'\n\n" % (test_summary, retcode), + sys.stdout, self.log, self.parent_log) temp_log.close() os.remove('copy.img') @@ -286,8 +282,9 @@ if __name__ == '__main__': Set up test environment in TEST_DIR and run a test in it. A module for test image generation should be specified via IMG_GENERATOR. + Example: - runner.py -c '[["qemu-img", "info", "$test_img"]]' /tmp/test qcow2 + runner.py -c '[["qemu-img", "info", "$test_img"]]' /tmp/test qcow2 Optional arguments: -h, --help display this help and exit @@ -305,20 +302,22 @@ if __name__ == '__main__': '--command' accepts a JSON array of commands. Each command presents an application under test with all its paramaters as a list of strings, - e.g. - ["qemu-io", "$test_img", "-c", "write $off $len"] + e.g. ["qemu-io", "$test_img", "-c", "write $off $len"]. Supported application aliases: 'qemu-img' and 'qemu-io'. + Supported argument aliases: $test_img for the fuzzed image, $off for an offset, $len for length. Values for $off and $len will be generated based on the virtual disk - size of the fuzzed image + size of the fuzzed image. + Paths to 'qemu-img' and 'qemu-io' are retrevied from 'QEMU_IMG' and - 'QEMU_IO' environment variables + 'QEMU_IO' environment variables. '--config' accepts a JSON array of fields to be fuzzed, e.g. - '[["header"], ["header", "version"]]' + '[["header"], ["header", "version"]]'. + Each of the list elements can consist of a complex image element only as ["header"] or ["feature_name_table"] or an exact field as ["header", "version"]. In the first case random portion of the element @@ -368,7 +367,6 @@ if __name__ == '__main__': seed = None config = None duration = None - for opt, arg in opts: if opt in ('-h', '--help'): usage() diff --git a/tests/libqos/pci.c b/tests/libqos/pci.c index d5ce683..4e630c2 100644 --- a/tests/libqos/pci.c +++ b/tests/libqos/pci.c @@ -71,6 +71,12 @@ void qpci_device_enable(QPCIDevice *dev) cmd = qpci_config_readw(dev, PCI_COMMAND); cmd |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER; qpci_config_writew(dev, PCI_COMMAND, cmd); + + /* Verify the bits are now set. */ + cmd = qpci_config_readw(dev, PCI_COMMAND); + g_assert_cmphex(cmd & PCI_COMMAND_IO, ==, PCI_COMMAND_IO); + g_assert_cmphex(cmd & PCI_COMMAND_MEMORY, ==, PCI_COMMAND_MEMORY); + g_assert_cmphex(cmd & PCI_COMMAND_MASTER, ==, PCI_COMMAND_MASTER); } uint8_t qpci_find_capability(QPCIDevice *dev, uint8_t id) diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060 index 830386f..2355567 100755 --- a/tests/qemu-iotests/060 +++ b/tests/qemu-iotests/060 @@ -169,9 +169,61 @@ echo "=== Testing unallocated image header ===" echo _make_test_img 64M # Create L1/L2 -$QEMU_IO -c "$OPEN_RW" -c "write 0 64k" | _filter_qemu_io +$QEMU_IO -c "write 0 64k" "$TEST_IMG" | _filter_qemu_io poke_file "$TEST_IMG" "$rb_offset" "\x00\x00" -$QEMU_IO -c "$OPEN_RW" -c "write 64k 64k" | _filter_qemu_io +$QEMU_IO -c "write 64k 64k" "$TEST_IMG" | _filter_qemu_io + +echo +echo "=== Testing unaligned L1 entry ===" +echo +_make_test_img 64M +$QEMU_IO -c "write 0 64k" "$TEST_IMG" | _filter_qemu_io +# This will be masked with ~(512 - 1) = ~0x1ff, so whether the lower 9 bits are +# aligned or not does not matter +poke_file "$TEST_IMG" "$l1_offset" "\x80\x00\x00\x00\x00\x04\x2a\x00" +$QEMU_IO -c "read 0 64k" "$TEST_IMG" | _filter_qemu_io + +echo +echo "=== Testing unaligned L2 entry ===" +echo +_make_test_img 64M +$QEMU_IO -c "write 0 64k" "$TEST_IMG" | _filter_qemu_io +poke_file "$TEST_IMG" "$l2_offset" "\x80\x00\x00\x00\x00\x05\x2a\x00" +$QEMU_IO -c "read 0 64k" "$TEST_IMG" | _filter_qemu_io + +echo +echo "=== Testing unaligned reftable entry ===" +echo +_make_test_img 64M +poke_file "$TEST_IMG" "$rt_offset" "\x00\x00\x00\x00\x00\x02\x2a\x00" +$QEMU_IO -c "write 0 64k" "$TEST_IMG" | _filter_qemu_io + +echo +echo "=== Testing non-fatal corruption on freeing ===" +echo +_make_test_img 64M +$QEMU_IO -c "write 0 64k" "$TEST_IMG" | _filter_qemu_io +poke_file "$TEST_IMG" "$l2_offset" "\x80\x00\x00\x00\x00\x05\x2a\x00" +$QEMU_IO -c "discard 0 64k" "$TEST_IMG" | _filter_qemu_io + +echo +echo "=== Testing read-only corruption report ===" +echo +_make_test_img 64M +$QEMU_IO -c "write 0 64k" "$TEST_IMG" | _filter_qemu_io +poke_file "$TEST_IMG" "$l2_offset" "\x80\x00\x00\x00\x00\x05\x2a\x00" +# Should only emit a single error message +$QEMU_IO -c "$OPEN_RO" -c "read 0 64k" -c "read 0 64k" | _filter_qemu_io + +echo +echo "=== Testing non-fatal and then fatal corruption report ===" +echo +_make_test_img 64M +$QEMU_IO -c "write 0 128k" "$TEST_IMG" | _filter_qemu_io +poke_file "$TEST_IMG" "$l2_offset" "\x80\x00\x00\x00\x00\x05\x2a\x00" +poke_file "$TEST_IMG" "$(($l2_offset+8))" "\x80\x00\x00\x00\x00\x06\x2a\x00" +# Should emit two error messages +$QEMU_IO -c "discard 0 64k" -c "read 64k 64k" "$TEST_IMG" | _filter_qemu_io # success, all done echo "*** done" diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out index c27c952..4f0c6d0 100644 --- a/tests/qemu-iotests/060.out +++ b/tests/qemu-iotests/060.out @@ -8,7 +8,7 @@ ERROR cluster 3 refcount=1 reference=3 1 errors were found on the image. Data may be corrupted, or further writes to the image may corrupt it. incompatible_features 0x0 -qcow2: Preventing invalid write on metadata (overlaps with active L1 table); image marked as corrupt. +qcow2: Marking image as corrupt: Preventing invalid write on metadata (overlaps with active L1 table); further corruption events will be suppressed write failed: Input/output error incompatible_features 0x2 qemu-io: can't open device TEST_DIR/t.IMGFMT: IMGFMT: Image is corrupt; cannot be opened read/write @@ -24,7 +24,7 @@ ERROR cluster 2 refcount=1 reference=2 2 errors were found on the image. Data may be corrupted, or further writes to the image may corrupt it. incompatible_features 0x0 -qcow2: Preventing invalid write on metadata (overlaps with refcount block); image marked as corrupt. +qcow2: Marking image as corrupt: Preventing invalid write on metadata (overlaps with refcount block); further corruption events will be suppressed write failed: Input/output error incompatible_features 0x2 Repairing refcount block 0 refcount=2 @@ -56,7 +56,7 @@ Data may be corrupted, or further writes to the image may corrupt it. 1 leaked clusters were found on the image. This means waste of disk space, but no harm to data. incompatible_features 0x0 -qcow2: Preventing invalid write on metadata (overlaps with inactive L2 table); image marked as corrupt. +qcow2: Marking image as corrupt: Preventing invalid write on metadata (overlaps with inactive L2 table); further corruption events will be suppressed write failed: Input/output error incompatible_features 0x2 Repairing cluster 4 refcount=1 reference=2 @@ -88,7 +88,7 @@ wrote 65536/65536 bytes at offset 536870912 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) discard 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -qcow2: Preventing invalid write on metadata (overlaps with active L2 table); image marked as corrupt. +qcow2: Marking image as corrupt: Preventing invalid write on metadata (overlaps with active L2 table); further corruption events will be suppressed blkdebug: Suspended request '0' write failed: Input/output error blkdebug: Resuming request '0' @@ -99,6 +99,57 @@ aio_write failed: No medium found Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 wrote 65536/65536 bytes at offset 0 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -qcow2: Preventing invalid write on metadata (overlaps with qcow2_header); image marked as corrupt. +qcow2: Marking image as corrupt: Preventing invalid write on metadata (overlaps with qcow2_header); further corruption events will be suppressed write failed: Input/output error + +=== Testing unaligned L1 entry === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qcow2: Marking image as corrupt: L2 table offset 0x42a00 unaligned (L1 index: 0); further corruption events will be suppressed +read failed: Input/output error + +=== Testing unaligned L2 entry === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qcow2: Marking image as corrupt: Data cluster offset 0x52a00 unaligned (L2 offset: 0x40000, L2 index: 0); further corruption events will be suppressed +read failed: Input/output error + +=== Testing unaligned reftable entry === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +qcow2: Marking image as corrupt: Refblock offset 0x22a00 unaligned (reftable index: 0); further corruption events will be suppressed +write failed: Input/output error + +=== Testing non-fatal corruption on freeing === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qcow2: Image is corrupt: Cannot free unaligned cluster 0x52a00; further non-fatal corruption events will be suppressed +discard 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +=== Testing read-only corruption report === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qcow2: Image is corrupt: Data cluster offset 0x52a00 unaligned (L2 offset: 0x40000, L2 index: 0); further non-fatal corruption events will be suppressed +read failed: Input/output error +read failed: Input/output error + +=== Testing non-fatal and then fatal corruption report === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 131072/131072 bytes at offset 0 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qcow2: Image is corrupt: Cannot free unaligned cluster 0x52a00; further non-fatal corruption events will be suppressed +qcow2: Marking image as corrupt: Data cluster offset 0x62a00 unaligned (L2 offset: 0x40000, L2 index: 0x1); further corruption events will be suppressed +discard 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read failed: Input/output error *** done diff --git a/tests/qemu-iotests/069 b/tests/qemu-iotests/069 index e661598..ce9e054 100755 --- a/tests/qemu-iotests/069 +++ b/tests/qemu-iotests/069 @@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 . ./common.rc . ./common.filter -_supported_fmt cow qed qcow qcow2 vmdk +_supported_fmt qed qcow qcow2 vmdk _supported_proto file _supported_os Linux _unsupported_imgopts "subformat=monolithicFlat" "subformat=twoGbMaxExtentFlat" diff --git a/tests/qemu-iotests/072 b/tests/qemu-iotests/072 index 58faa8b..e4a723d 100755 --- a/tests/qemu-iotests/072 +++ b/tests/qemu-iotests/072 @@ -38,7 +38,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 . ./common.rc . ./common.filter -_supported_fmt vpc vmdk vhdx vdi qed qcow2 qcow cow +_supported_fmt vpc vmdk vhdx vdi qed qcow2 qcow _supported_proto file _supported_os Linux diff --git a/tests/qemu-iotests/099 b/tests/qemu-iotests/099 index a26d3d2..ffc7ea7 100755 --- a/tests/qemu-iotests/099 +++ b/tests/qemu-iotests/099 @@ -41,7 +41,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 # Basically all formats, but "raw" has issues with _filter_imgfmt regarding the # raw comparison image for blkverify; also, all images have to support creation -_supported_fmt cow qcow qcow2 qed vdi vhdx vmdk vpc +_supported_fmt qcow qcow2 qed vdi vhdx vmdk vpc _supported_proto file _supported_os Linux diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common index 70df659..89c6dde 100644 --- a/tests/qemu-iotests/common +++ b/tests/qemu-iotests/common @@ -136,7 +136,6 @@ common options check options -raw test raw (default) -bochs test bochs - -cow test cow -cloop test cloop -parallels test parallels -qcow test qcow @@ -182,11 +181,6 @@ testlist options xpand=false ;; - -cow) - IMGFMT=cow - xpand=false - ;; - -cloop) IMGFMT=cloop IMGFMT_GENERIC=false diff --git a/tests/test-aio.c b/tests/test-aio.c index c6a8713..a7cb5c9 100644 --- a/tests/test-aio.c +++ b/tests/test-aio.c @@ -14,6 +14,7 @@ #include "block/aio.h" #include "qemu/timer.h" #include "qemu/sockets.h" +#include "qemu/error-report.h" static AioContext *ctx; @@ -810,11 +811,18 @@ static void test_source_timer_schedule(void) int main(int argc, char **argv) { + Error *local_error = NULL; GSource *src; init_clocks(); - ctx = aio_context_new(); + ctx = aio_context_new(&local_error); + if (!ctx) { + error_report("Failed to create AIO Context: '%s'", + error_get_pretty(local_error)); + error_free(local_error); + exit(1); + } src = aio_get_g_source(ctx); g_source_attach(src, NULL); g_source_unref(src); diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c index f40b7fc..8c4d68b 100644 --- a/tests/test-thread-pool.c +++ b/tests/test-thread-pool.c @@ -4,6 +4,7 @@ #include "block/thread-pool.h" #include "block/block.h" #include "qemu/timer.h" +#include "qemu/error-report.h" static AioContext *ctx; static ThreadPool *pool; @@ -33,7 +34,7 @@ static int long_cb(void *opaque) static void done_cb(void *opaque, int ret) { WorkerTestData *data = opaque; - g_assert_cmpint(data->ret, ==, -EINPROGRESS); + g_assert(data->ret == -EINPROGRESS || data->ret == -ECANCELED); data->ret = ret; data->aiocb = NULL; @@ -132,7 +133,7 @@ static void test_submit_many(void) } } -static void test_cancel(void) +static void do_test_cancel(bool sync) { WorkerTestData data[100]; int num_canceled; @@ -170,18 +171,25 @@ static void test_cancel(void) for (i = 0; i < 100; i++) { if (atomic_cmpxchg(&data[i].n, 0, 3) == 0) { data[i].ret = -ECANCELED; - bdrv_aio_cancel(data[i].aiocb); - active--; + if (sync) { + bdrv_aio_cancel(data[i].aiocb); + } else { + bdrv_aio_cancel_async(data[i].aiocb); + } num_canceled++; } } g_assert_cmpint(active, >, 0); g_assert_cmpint(num_canceled, <, 100); - /* Canceling the others will be a blocking operation. */ for (i = 0; i < 100; i++) { if (data[i].aiocb && data[i].n != 3) { - bdrv_aio_cancel(data[i].aiocb); + if (sync) { + /* Canceling the others will be a blocking operation. */ + bdrv_aio_cancel(data[i].aiocb); + } else { + bdrv_aio_cancel_async(data[i].aiocb); + } } } @@ -193,22 +201,39 @@ static void test_cancel(void) for (i = 0; i < 100; i++) { if (data[i].n == 3) { g_assert_cmpint(data[i].ret, ==, -ECANCELED); - g_assert(data[i].aiocb != NULL); + g_assert(data[i].aiocb == NULL); } else { g_assert_cmpint(data[i].n, ==, 2); - g_assert_cmpint(data[i].ret, ==, 0); + g_assert(data[i].ret == 0 || data[i].ret == -ECANCELED); g_assert(data[i].aiocb == NULL); } } } +static void test_cancel(void) +{ + do_test_cancel(true); +} + +static void test_cancel_async(void) +{ + do_test_cancel(false); +} + int main(int argc, char **argv) { int ret; + Error *local_error = NULL; init_clocks(); - ctx = aio_context_new(); + ctx = aio_context_new(&local_error); + if (!ctx) { + error_report("Failed to create AIO Context: '%s'", + error_get_pretty(local_error)); + error_free(local_error); + exit(1); + } pool = aio_get_thread_pool(ctx); g_test_init(&argc, &argv, NULL); @@ -217,6 +242,7 @@ int main(int argc, char **argv) g_test_add_func("/thread-pool/submit-co", test_submit_co); g_test_add_func("/thread-pool/submit-many", test_submit_many); g_test_add_func("/thread-pool/cancel", test_cancel); + g_test_add_func("/thread-pool/cancel-async", test_cancel_async); ret = g_test_run(); diff --git a/tests/test-throttle.c b/tests/test-throttle.c index 000ae31..d8ba415 100644 --- a/tests/test-throttle.c +++ b/tests/test-throttle.c @@ -14,6 +14,7 @@ #include <math.h> #include "block/aio.h" #include "qemu/throttle.h" +#include "qemu/error-report.h" static AioContext *ctx; static LeakyBucket bkt; @@ -492,10 +493,17 @@ static void test_accounting(void) int main(int argc, char **argv) { GSource *src; + Error *local_error = NULL; init_clocks(); - ctx = aio_context_new(); + ctx = aio_context_new(&local_error); + if (!ctx) { + error_report("Failed to create AIO Context: '%s'", + error_get_pretty(local_error)); + error_free(local_error); + exit(1); + } src = aio_get_g_source(ctx); g_source_attach(src, NULL); g_source_unref(src); diff --git a/thread-pool.c b/thread-pool.c index bc07d7a..86fa4a8 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -31,7 +31,6 @@ enum ThreadState { THREAD_QUEUED, THREAD_ACTIVE, THREAD_DONE, - THREAD_CANCELED, }; struct ThreadPoolElement { @@ -58,7 +57,6 @@ struct ThreadPool { AioContext *ctx; QEMUBH *completion_bh; QemuMutex lock; - QemuCond check_cancel; QemuCond worker_stopped; QemuSemaphore sem; int max_threads; @@ -73,7 +71,6 @@ struct ThreadPool { int idle_threads; int new_threads; /* backlog of threads we need to create */ int pending_threads; /* threads created but not running yet */ - int pending_cancellations; /* whether we need a cond_broadcast */ bool stopping; }; @@ -113,9 +110,6 @@ static void *worker_thread(void *opaque) req->state = THREAD_DONE; qemu_mutex_lock(&pool->lock); - if (pool->pending_cancellations) { - qemu_cond_broadcast(&pool->check_cancel); - } qemu_bh_schedule(pool->completion_bh); } @@ -173,7 +167,7 @@ static void thread_pool_completion_bh(void *opaque) restart: QLIST_FOREACH_SAFE(elem, &pool->head, all, next) { - if (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { + if (elem->state != THREAD_DONE) { continue; } if (elem->state == THREAD_DONE) { @@ -191,12 +185,12 @@ restart: qemu_bh_schedule(pool->completion_bh); elem->common.cb(elem->common.opaque, elem->ret); - qemu_aio_release(elem); + qemu_aio_unref(elem); goto restart; } else { /* remove the request */ QLIST_REMOVE(elem, all); - qemu_aio_release(elem); + qemu_aio_unref(elem); } } } @@ -217,22 +211,26 @@ static void thread_pool_cancel(BlockDriverAIOCB *acb) */ qemu_sem_timedwait(&pool->sem, 0) == 0) { QTAILQ_REMOVE(&pool->request_list, elem, reqs); - elem->state = THREAD_CANCELED; qemu_bh_schedule(pool->completion_bh); - } else { - pool->pending_cancellations++; - while (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { - qemu_cond_wait(&pool->check_cancel, &pool->lock); - } - pool->pending_cancellations--; + + elem->state = THREAD_DONE; + elem->ret = -ECANCELED; } + qemu_mutex_unlock(&pool->lock); - thread_pool_completion_bh(pool); +} + +static AioContext *thread_pool_get_aio_context(BlockDriverAIOCB *acb) +{ + ThreadPoolElement *elem = (ThreadPoolElement *)acb; + ThreadPool *pool = elem->pool; + return pool->ctx; } static const AIOCBInfo thread_pool_aiocb_info = { .aiocb_size = sizeof(ThreadPoolElement), - .cancel = thread_pool_cancel, + .cancel_async = thread_pool_cancel, + .get_aio_context = thread_pool_get_aio_context, }; BlockDriverAIOCB *thread_pool_submit_aio(ThreadPool *pool, @@ -299,7 +297,6 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) pool->ctx = ctx; pool->completion_bh = aio_bh_new(ctx, thread_pool_completion_bh, pool); qemu_mutex_init(&pool->lock); - qemu_cond_init(&pool->check_cancel); qemu_cond_init(&pool->worker_stopped); qemu_sem_init(&pool->sem, 0); pool->max_threads = 64; @@ -342,7 +339,6 @@ void thread_pool_free(ThreadPool *pool) qemu_bh_delete(pool->completion_bh); qemu_sem_destroy(&pool->sem); - qemu_cond_destroy(&pool->check_cancel); qemu_cond_destroy(&pool->worker_stopped); qemu_mutex_destroy(&pool->lock); g_free(pool); @@ -857,7 +857,7 @@ QemuCocoaView *cocoaView; [op setPrompt:@"Boot image"]; [op setMessage:@"Select the disk image you want to boot.\n\nHit the \"Cancel\" button to quit"]; NSArray *filetypes = [NSArray arrayWithObjects:@"img", @"iso", @"dmg", - @"qcow", @"qcow2", @"cow", @"cloop", @"vmdk", nil]; + @"qcow", @"qcow2", @"cloop", @"vmdk", nil]; #if (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) [op setAllowedFileTypes:filetypes]; [op beginSheetModalForWindow:normalWindow @@ -2968,6 +2968,7 @@ int main(int argc, char **argv, char **envp) ram_addr_t maxram_size = default_ram_size; uint64_t ram_slots = 0; FILE *vmstate_dump_file = NULL; + Error *main_loop_err = NULL; atexit(qemu_run_exit_notifiers); error_set_progname(argv[0]); @@ -3358,34 +3359,34 @@ int main(int argc, char **argv, char **envp) sz = qemu_opt_get_size(opts, "maxmem", 0); if (sz < ram_size) { - fprintf(stderr, "qemu: invalid -m option value: maxmem " - "(%" PRIu64 ") <= initial memory (" - RAM_ADDR_FMT ")\n", sz, ram_size); + error_report("invalid -m option value: maxmem " + "(0x%" PRIx64 ") <= initial memory (0x" + RAM_ADDR_FMT ")", sz, ram_size); exit(EXIT_FAILURE); } slots = qemu_opt_get_number(opts, "slots", 0); if ((sz > ram_size) && !slots) { - fprintf(stderr, "qemu: invalid -m option value: maxmem " - "(%" PRIu64 ") more than initial memory (" + error_report("invalid -m option value: maxmem " + "(0x%" PRIx64 ") more than initial memory (0x" RAM_ADDR_FMT ") but no hotplug slots where " - "specified\n", sz, ram_size); + "specified", sz, ram_size); exit(EXIT_FAILURE); } if ((sz <= ram_size) && slots) { - fprintf(stderr, "qemu: invalid -m option value: %" + error_report("invalid -m option value: %" PRIu64 " hotplug slots where specified but " - "maxmem (%" PRIu64 ") <= initial memory (" - RAM_ADDR_FMT ")\n", slots, sz, ram_size); + "maxmem (0x%" PRIx64 ") <= initial memory (0x" + RAM_ADDR_FMT ")", slots, sz, ram_size); exit(EXIT_FAILURE); } maxram_size = sz; ram_slots = slots; } else if ((!maxmem_str && slots_str) || (maxmem_str && !slots_str)) { - fprintf(stderr, "qemu: invalid -m option value: missing " - "'%s' option\n", slots_str ? "maxmem" : "slots"); + error_report("invalid -m option value: missing " + "'%s' option", slots_str ? "maxmem" : "slots"); exit(EXIT_FAILURE); } break; @@ -3998,8 +3999,8 @@ int main(int argc, char **argv, char **envp) os_daemonize(); - if (qemu_init_main_loop()) { - fprintf(stderr, "qemu_init_main_loop failed\n"); + if (qemu_init_main_loop(&main_loop_err)) { + error_report("%s", error_get_pretty(main_loop_err)); exit(1); } @@ -979,6 +979,7 @@ static void xen_wakeup_notifier(Notifier *notifier, void *data) xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 0); } +/* return 0 means OK, or -1 means critical issue -- will exit(1) */ int xen_hvm_init(ram_addr_t *below_4g_mem_size, ram_addr_t *above_4g_mem_size, MemoryRegion **ram_memory) { @@ -992,15 +993,13 @@ int xen_hvm_init(ram_addr_t *below_4g_mem_size, ram_addr_t *above_4g_mem_size, state->xce_handle = xen_xc_evtchn_open(NULL, 0); if (state->xce_handle == XC_HANDLER_INITIAL_VALUE) { perror("xen: event channel open"); - g_free(state); - return -errno; + return -1; } state->xenstore = xs_daemon_open(); if (state->xenstore == NULL) { perror("xen: xenstore open"); - g_free(state); - return -errno; + return -1; } state->exit.notify = xen_exit_notifier; @@ -1070,7 +1069,7 @@ int xen_hvm_init(ram_addr_t *below_4g_mem_size, ram_addr_t *above_4g_mem_size, /* Initialize backend core & drivers */ if (xen_be_init() != 0) { fprintf(stderr, "%s: xen backend core setup failed\n", __FUNCTION__); - exit(1); + return -1; } xen_be_register("console", &xen_console_ops); xen_be_register("vkbd", &xen_kbdmouse_ops); |