diff options
30 files changed, 2100 insertions, 52 deletions
@@ -7982,6 +7982,25 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs, return; } + /* + * Non-zoned block drivers do not follow zoned storage constraints + * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned + * drivers in a graph. + */ + if (!parent_bs->drv->supports_zoned_children && + child_bs->bl.zoned == BLK_Z_HM) { + /* + * The host-aware model allows zoned storage constraints and random + * write. Allow mixing host-aware and non-zoned drivers. Using + * host-aware device as a regular device. + */ + error_setg(errp, "Cannot add a %s child to a %s parent", + child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned", + parent_bs->drv->supports_zoned_children ? + "support zoned children" : "not support zoned children"); + return; + } + if (!QLIST_EMPTY(&child_bs->parents)) { error_setg(errp, "The node %s already has a parent", child_bs->node_name); diff --git a/block/block-backend.c b/block/block-backend.c index e37d55d..ca537cd 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1845,6 +1845,204 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) return ret; } +static void coroutine_fn blk_aio_zone_report_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset, + (unsigned int*)(uintptr_t)acb->bytes, + rwco->iobuf); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones, + BlockCompletionFunc *cb, void *opaque) +{ + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .iobuf = zones, + .ret = NOT_DONE, + }; + acb->bytes = (int64_t)(uintptr_t)nr_zones, + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_report_entry, acb); + aio_co_enter(blk_get_aio_context(blk), co); + + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + +static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_mgmt(rwco->blk, + (BlockZoneOp)(uintptr_t)rwco->iobuf, + rwco->offset, acb->bytes); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len, + BlockCompletionFunc *cb, void *opaque) { + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .iobuf = (void *)(uintptr_t)op, + .ret = NOT_DONE, + }; + acb->bytes = len; + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb); + aio_co_enter(blk_get_aio_context(blk), co); + + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + +static void coroutine_fn blk_aio_zone_append_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes, + rwco->iobuf, rwco->flags); + blk_aio_complete(acb); +} + +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) { + BlkAioEmAIOCB *acb; + Coroutine *co; + IO_CODE(); + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .ret = NOT_DONE, + .flags = flags, + .iobuf = qiov, + }; + acb->bytes = (int64_t)(uintptr_t)offset; + acb->has_returned = false; + + co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); + aio_co_enter(blk_get_aio_context(blk), co); + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), + blk_aio_complete_bh, acb); + } + + return &acb->common; +} + +/* + * Send a zone_report command. + * offset is a byte offset from the start of the device. No alignment + * required for offset. + * nr_zones represents IN maximum and OUT actual. + */ +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) +{ + int ret; + IO_CODE(); + + blk_inc_in_flight(blk); /* increase before waiting */ + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + if (!blk_is_available(blk)) { + blk_dec_in_flight(blk); + return -ENOMEDIUM; + } + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); + blk_dec_in_flight(blk); + return ret; +} + +/* + * Send a zone_management command. + * op is the zone operation; + * offset is the byte offset from the start of the zoned device; + * len is the maximum number of bytes the command should operate on. It + * should be aligned with the device zone size. + */ +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len) +{ + int ret; + IO_CODE(); + + blk_inc_in_flight(blk); + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + ret = blk_check_byte_request(blk, offset, len); + if (ret < 0) { + blk_dec_in_flight(blk); + return ret; + } + + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); + blk_dec_in_flight(blk); + return ret; +} + +/* + * Send a zone_append command. + */ +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, BdrvRequestFlags flags) +{ + int ret; + IO_CODE(); + + blk_inc_in_flight(blk); + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + if (!blk_is_available(blk)) { + blk_dec_in_flight(blk); + return -ENOMEDIUM; + } + + ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); + blk_dec_in_flight(blk); + return ret; +} + void blk_drain(BlockBackend *blk) { BlockDriverState *bs = blk_bs(blk); diff --git a/block/file-posix.c b/block/file-posix.c index c7b7233..0ab158e 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -68,6 +68,9 @@ #include <sys/param.h> #include <sys/syscall.h> #include <sys/vfs.h> +#if defined(CONFIG_BLKZONED) +#include <linux/blkzoned.h> +#endif #include <linux/cdrom.h> #include <linux/fd.h> #include <linux/fs.h> @@ -157,6 +160,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool use_linux_aio:1; bool use_linux_io_uring:1; + int64_t *offset; /* offset of zone append operation */ int page_cache_inconsistent; /* errno from fdatasync failure */ bool has_fallocate; bool needs_alignment; @@ -216,6 +220,13 @@ typedef struct RawPosixAIOData { PreallocMode prealloc; Error **errp; } truncate; + struct { + unsigned int *nr_zones; + BlockZoneDescriptor *zones; + } zone_report; + struct { + unsigned long op; + } zone_mgmt; }; } RawPosixAIOData; @@ -766,6 +777,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } } +#ifdef CONFIG_BLKZONED + /* + * The kernel page cache does not reliably work for writes to SWR zones + * of zoned block device because it can not guarantee the order of writes. + */ + if ((bs->bl.zoned != BLK_Z_NONE) && + (!(s->open_flags & O_DIRECT))) { + error_setg(errp, "The driver supports zoned devices, and it requires " + "cache.direct=on, which was not specified."); + return -EINVAL; /* No host kernel page cache */ + } +#endif if (S_ISBLK(st.st_mode)) { #ifdef __linux__ @@ -1202,15 +1225,91 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st) #endif } -static int hdev_get_max_segments(int fd, struct stat *st) +/* + * Get a sysfs attribute value as character string. + */ +#ifdef CONFIG_LINUX +static int get_sysfs_str_val(struct stat *st, const char *attribute, + char **val) { + g_autofree char *sysfspath = NULL; + int ret; + size_t len; + + if (!S_ISBLK(st->st_mode)) { + return -ENOTSUP; + } + + sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s", + major(st->st_rdev), minor(st->st_rdev), + attribute); + ret = g_file_get_contents(sysfspath, val, &len, NULL); + if (ret == -1) { + return -ENOENT; + } + + /* The file is ended with '\n' */ + char *p; + p = *val; + if (*(p + len - 1) == '\n') { + *(p + len - 1) = '\0'; + } + return ret; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned) { + g_autofree char *val = NULL; + int ret; + + ret = get_sysfs_str_val(st, "zoned", &val); + if (ret < 0) { + return ret; + } + + if (strcmp(val, "host-managed") == 0) { + *zoned = BLK_Z_HM; + } else if (strcmp(val, "host-aware") == 0) { + *zoned = BLK_Z_HA; + } else if (strcmp(val, "none") == 0) { + *zoned = BLK_Z_NONE; + } else { + return -ENOTSUP; + } + return 0; +} +#endif /* defined(CONFIG_BLKZONED) */ + +/* + * Get a sysfs attribute value as a long integer. + */ #ifdef CONFIG_LINUX - char buf[32]; +static long get_sysfs_long_val(struct stat *st, const char *attribute) +{ + g_autofree char *str = NULL; const char *end; - char *sysfspath = NULL; + long val; + int ret; + + ret = get_sysfs_str_val(st, attribute, &str); + if (ret < 0) { + return ret; + } + + /* The file is ended with '\n', pass 'end' to accept that. */ + ret = qemu_strtol(str, &end, 10, &val); + if (ret == 0 && end && *end == '\0') { + ret = val; + } + return ret; +} +#endif + +static int hdev_get_max_segments(int fd, struct stat *st) +{ +#ifdef CONFIG_LINUX int ret; - int sysfd = -1; - long max_segments; if (S_ISCHR(st->st_mode)) { if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) { @@ -1218,43 +1317,175 @@ static int hdev_get_max_segments(int fd, struct stat *st) } return -ENOTSUP; } + return get_sysfs_long_val(st, "max_segments"); +#else + return -ENOTSUP; +#endif +} - if (!S_ISBLK(st->st_mode)) { - return -ENOTSUP; +#if defined(CONFIG_BLKZONED) +/* + * If the reset_all flag is true, then the wps of zone whose state is + * not readonly or offline should be all reset to the start sector. + * Else, take the real wp of the device. + */ +static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset, + unsigned int nrz, bool reset_all) +{ + struct blk_zone *blkz; + size_t rep_size; + uint64_t sector = offset >> BDRV_SECTOR_BITS; + BlockZoneWps *wps = bs->wps; + unsigned int j = offset / bs->bl.zone_size; + unsigned int n = 0, i = 0; + int ret; + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = NULL; + + rep = g_malloc(rep_size); + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = nrz - n; + + do { + ret = ioctl(fd, BLKREPORTZONE, rep); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", + fd, offset, errno); + return -errno; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) { + /* + * The wp tracking cares only about sequential writes required and + * sequential write preferred zones so that the wp can advance to + * the right location. + * Use the most significant bit of the wp location to indicate the + * zone type: 0 for SWR/SWP zones and 1 for conventional zones. + */ + if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { + wps->wp[j] |= 1ULL << 63; + } else { + switch(blkz[i].cond) { + case BLK_ZONE_COND_FULL: + case BLK_ZONE_COND_READONLY: + /* Zone not writable */ + wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS; + break; + case BLK_ZONE_COND_OFFLINE: + /* Zone not writable nor readable */ + wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS; + break; + default: + if (reset_all) { + wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS; + } else { + wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS; + } + break; + } + } + } + sector = blkz[i - 1].start + blkz[i - 1].len; } - sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments", - major(st->st_rdev), minor(st->st_rdev)); - sysfd = open(sysfspath, O_RDONLY); - if (sysfd == -1) { - ret = -errno; - goto out; + return 0; +} + +static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset, + unsigned int nrz) +{ + if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) { + error_report("update zone wp failed"); + } +} + +static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + BlockZoneModel zoned; + int ret; + + bs->bl.zoned = BLK_Z_NONE; + + ret = get_sysfs_zoned_model(st, &zoned); + if (ret < 0 || zoned == BLK_Z_NONE) { + return; + } + bs->bl.zoned = zoned; + + ret = get_sysfs_long_val(st, "max_open_zones"); + if (ret >= 0) { + bs->bl.max_open_zones = ret; + } + + ret = get_sysfs_long_val(st, "max_active_zones"); + if (ret >= 0) { + bs->bl.max_active_zones = ret; } - ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1)); + + /* + * The zoned device must at least have zone size and nr_zones fields. + */ + ret = get_sysfs_long_val(st, "chunk_sectors"); if (ret < 0) { - ret = -errno; - goto out; - } else if (ret == 0) { - ret = -EIO; - goto out; + error_setg_errno(errp, -ret, "Unable to read chunk_sectors " + "sysfs attribute"); + return; + } else if (!ret) { + error_setg(errp, "Read 0 from chunk_sectors sysfs attribute"); + return; } - buf[ret] = 0; - /* The file is ended with '\n', pass 'end' to accept that. */ - ret = qemu_strtol(buf, &end, 10, &max_segments); - if (ret == 0 && end && *end == '\n') { - ret = max_segments; + bs->bl.zone_size = ret << BDRV_SECTOR_BITS; + + ret = get_sysfs_long_val(st, "nr_zones"); + if (ret < 0) { + error_setg_errno(errp, -ret, "Unable to read nr_zones " + "sysfs attribute"); + return; + } else if (!ret) { + error_setg(errp, "Read 0 from nr_zones sysfs attribute"); + return; } + bs->bl.nr_zones = ret; -out: - if (sysfd != -1) { - close(sysfd); + ret = get_sysfs_long_val(st, "zone_append_max_bytes"); + if (ret > 0) { + bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS; } - g_free(sysfspath); - return ret; -#else - return -ENOTSUP; -#endif + + ret = get_sysfs_long_val(st, "physical_block_size"); + if (ret >= 0) { + bs->bl.write_granularity = ret; + } + + /* The refresh_limits() function can be called multiple times. */ + g_free(bs->wps); + bs->wps = g_malloc(sizeof(BlockZoneWps) + + sizeof(int64_t) * bs->bl.nr_zones); + ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, "report wps failed"); + bs->wps = NULL; + return; + } + qemu_co_mutex_init(&bs->wps->colock); } +#else /* !defined(CONFIG_BLKZONED) */ +static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st, + Error **errp) +{ + bs->bl.zoned = BLK_Z_NONE; +} +#endif /* !defined(CONFIG_BLKZONED) */ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { @@ -1297,6 +1528,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.max_hw_iov = ret; } } + + raw_refresh_zoned_limits(bs, &st, errp); } static int check_for_dasd(int fd) @@ -1320,9 +1553,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) BDRVRawState *s = bs->opaque; int ret; - /* If DASD, get blocksizes */ + /* If DASD or zoned devices, get blocksizes */ if (check_for_dasd(s->fd) < 0) { - return -ENOTSUP; + /* zoned devices are not DASD */ + if (bs->bl.zoned == BLK_Z_NONE) { + return -ENOTSUP; + } } ret = probe_logical_blocksize(s->fd, &bsz->log); if (ret < 0) { @@ -1463,7 +1699,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) ssize_t len; len = RETRY_ON_EINTR( - (aiocb->aio_type & QEMU_AIO_WRITE) ? + (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ? qemu_pwritev(aiocb->aio_fildes, aiocb->io.iov, aiocb->io.niov, @@ -1492,7 +1728,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) ssize_t len; while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type & QEMU_AIO_WRITE) { + if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { len = pwrite(aiocb->aio_fildes, (const char *)buf + offset, aiocb->aio_nbytes - offset, @@ -1585,7 +1821,7 @@ static int handle_aiocb_rw(void *opaque) } nbytes = handle_aiocb_rw_linear(aiocb, buf); - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { + if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) { char *p = buf; size_t count = aiocb->aio_nbytes, copy; int i; @@ -1790,6 +2026,147 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, } #endif +/* + * parse_zone - Fill a zone descriptor + */ +#if defined(CONFIG_BLKZONED) +static inline int parse_zone(struct BlockZoneDescriptor *zone, + const struct blk_zone *blkz) { + zone->start = blkz->start << BDRV_SECTOR_BITS; + zone->length = blkz->len << BDRV_SECTOR_BITS; + zone->wp = blkz->wp << BDRV_SECTOR_BITS; + +#ifdef HAVE_BLK_ZONE_REP_CAPACITY + zone->cap = blkz->capacity << BDRV_SECTOR_BITS; +#else + zone->cap = blkz->len << BDRV_SECTOR_BITS; +#endif + + switch (blkz->type) { + case BLK_ZONE_TYPE_SEQWRITE_REQ: + zone->type = BLK_ZT_SWR; + break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: + zone->type = BLK_ZT_SWP; + break; + case BLK_ZONE_TYPE_CONVENTIONAL: + zone->type = BLK_ZT_CONV; + break; + default: + error_report("Unsupported zone type: 0x%x", blkz->type); + return -ENOTSUP; + } + + switch (blkz->cond) { + case BLK_ZONE_COND_NOT_WP: + zone->state = BLK_ZS_NOT_WP; + break; + case BLK_ZONE_COND_EMPTY: + zone->state = BLK_ZS_EMPTY; + break; + case BLK_ZONE_COND_IMP_OPEN: + zone->state = BLK_ZS_IOPEN; + break; + case BLK_ZONE_COND_EXP_OPEN: + zone->state = BLK_ZS_EOPEN; + break; + case BLK_ZONE_COND_CLOSED: + zone->state = BLK_ZS_CLOSED; + break; + case BLK_ZONE_COND_READONLY: + zone->state = BLK_ZS_RDONLY; + break; + case BLK_ZONE_COND_FULL: + zone->state = BLK_ZS_FULL; + break; + case BLK_ZONE_COND_OFFLINE: + zone->state = BLK_ZS_OFFLINE; + break; + default: + error_report("Unsupported zone state: 0x%x", blkz->cond); + return -ENOTSUP; + } + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_report(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + unsigned int *nr_zones = aiocb->zone_report.nr_zones; + BlockZoneDescriptor *zones = aiocb->zone_report.zones; + /* zoned block devices use 512-byte sectors */ + uint64_t sector = aiocb->aio_offset / 512; + + struct blk_zone *blkz; + size_t rep_size; + unsigned int nrz; + int ret; + unsigned int n = 0, i = 0; + + nrz = *nr_zones; + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone); + g_autofree struct blk_zone_report *rep = NULL; + rep = g_malloc(rep_size); + + blkz = (struct blk_zone *)(rep + 1); + while (n < nrz) { + memset(rep, 0, rep_size); + rep->sector = sector; + rep->nr_zones = nrz - n; + + do { + ret = ioctl(fd, BLKREPORTZONE, rep); + } while (ret != 0 && errno == EINTR); + if (ret != 0) { + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", + fd, sector, errno); + return -errno; + } + + if (!rep->nr_zones) { + break; + } + + for (i = 0; i < rep->nr_zones; i++, n++) { + ret = parse_zone(&zones[n], &blkz[i]); + if (ret != 0) { + return ret; + } + + /* The next report should start after the last zone reported */ + sector = blkz[i].start + blkz[i].len; + } + } + + *nr_zones = n; + return 0; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int handle_aiocb_zone_mgmt(void *opaque) +{ + RawPosixAIOData *aiocb = opaque; + int fd = aiocb->aio_fildes; + uint64_t sector = aiocb->aio_offset / 512; + int64_t nr_sectors = aiocb->aio_nbytes / 512; + struct blk_zone_range range; + int ret; + + /* Execute the operation */ + range.sector = sector; + range.nr_sectors = nr_sectors; + do { + ret = ioctl(fd, aiocb->zone_mgmt.op, &range); + } while (ret != 0 && errno == EINTR); + + return ret < 0 ? -errno : ret; +} +#endif + static int handle_aiocb_copy_range(void *opaque) { RawPosixAIOData *aiocb = opaque; @@ -2072,9 +2449,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, { BDRVRawState *s = bs->opaque; RawPosixAIOData acb; + int ret; if (fd_open(bs) < 0) return -EIO; +#if defined(CONFIG_BLKZONED) + if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) { + qemu_co_mutex_lock(&bs->wps->colock); + if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) { + int index = offset / bs->bl.zone_size; + offset = bs->wps->wp[index]; + } + } +#endif /* * When using O_DIRECT, the request must be aligned to be able to use @@ -2087,12 +2474,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, #ifdef CONFIG_LINUX_IO_URING } else if (s->use_linux_io_uring) { assert(qiov->size == bytes); - return luring_co_submit(bs, s->fd, offset, qiov, type); + ret = luring_co_submit(bs, s->fd, offset, qiov, type); + goto out; #endif #ifdef CONFIG_LINUX_AIO } else if (s->use_linux_aio) { assert(qiov->size == bytes); - return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch); + ret = laio_co_submit(s->fd, offset, qiov, type, + s->aio_max_batch); + goto out; #endif } @@ -2109,7 +2499,41 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, }; assert(qiov->size == bytes); - return raw_thread_pool_submit(handle_aiocb_rw, &acb); + ret = raw_thread_pool_submit(handle_aiocb_rw, &acb); + goto out; /* Avoid the compiler err of unused label */ + +out: +#if defined(CONFIG_BLKZONED) +{ + BlockZoneWps *wps = bs->wps; + if (ret == 0) { + if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) + && wps && bs->bl.zone_size) { + uint64_t *wp = &wps->wp[offset / bs->bl.zone_size]; + if (!BDRV_ZT_IS_CONV(*wp)) { + if (type & QEMU_AIO_ZONE_APPEND) { + *s->offset = *wp; + trace_zbd_zone_append_complete(bs, *s->offset + >> BDRV_SECTOR_BITS); + } + /* Advance the wp if needed */ + if (offset + bytes > *wp) { + *wp = offset + bytes; + } + } + } + } else { + if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) { + update_zones_wp(bs, s->fd, 0, 1); + } + } + + if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) { + qemu_co_mutex_unlock(&wps->colock); + } +} +#endif + return ret; } static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset, @@ -2212,6 +2636,9 @@ static void raw_close(BlockDriverState *bs) BDRVRawState *s = bs->opaque; if (s->fd >= 0) { +#if defined(CONFIG_BLKZONED) + g_free(bs->wps); +#endif qemu_close(s->fd); s->fd = -1; } @@ -2969,6 +3396,171 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret) } } +/* + * zone report - Get a zone block device's information in the form + * of an array of zone descriptors. + * zones is an array of zone descriptors to hold zone information on reply; + * offset can be any byte within the entire size of the device; + * nr_zones is the maxium number of sectors the command should operate on. + */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_REPORT, + .aio_offset = offset, + .zone_report = { + .nr_zones = nr_zones, + .zones = zones, + }, + }; + + trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS); + return raw_thread_pool_submit(handle_aiocb_zone_report, &acb); +} +#endif + +/* + * zone management operations - Execute an operation on a zone + */ +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) { + BDRVRawState *s = bs->opaque; + RawPosixAIOData acb; + int64_t zone_size, zone_size_mask; + const char *op_name; + unsigned long zo; + int ret; + BlockZoneWps *wps = bs->wps; + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; + + zone_size = bs->bl.zone_size; + zone_size_mask = zone_size - 1; + if (offset & zone_size_mask) { + error_report("sector offset %" PRId64 " is not aligned to zone size " + "%" PRId64 "", offset / 512, zone_size / 512); + return -EINVAL; + } + + if (((offset + len) < capacity && len & zone_size_mask) || + offset + len > capacity) { + error_report("number of sectors %" PRId64 " is not aligned to zone size" + " %" PRId64 "", len / 512, zone_size / 512); + return -EINVAL; + } + + uint32_t i = offset / bs->bl.zone_size; + uint32_t nrz = len / bs->bl.zone_size; + uint64_t *wp = &wps->wp[i]; + if (BDRV_ZT_IS_CONV(*wp) && len != capacity) { + error_report("zone mgmt operations are not allowed for conventional zones"); + return -EIO; + } + + switch (op) { + case BLK_ZO_OPEN: + op_name = "BLKOPENZONE"; + zo = BLKOPENZONE; + break; + case BLK_ZO_CLOSE: + op_name = "BLKCLOSEZONE"; + zo = BLKCLOSEZONE; + break; + case BLK_ZO_FINISH: + op_name = "BLKFINISHZONE"; + zo = BLKFINISHZONE; + break; + case BLK_ZO_RESET: + op_name = "BLKRESETZONE"; + zo = BLKRESETZONE; + break; + default: + error_report("Unsupported zone op: 0x%x", op); + return -ENOTSUP; + } + + acb = (RawPosixAIOData) { + .bs = bs, + .aio_fildes = s->fd, + .aio_type = QEMU_AIO_ZONE_MGMT, + .aio_offset = offset, + .aio_nbytes = len, + .zone_mgmt = { + .op = zo, + }, + }; + + trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS, + len >> BDRV_SECTOR_BITS); + ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb); + if (ret != 0) { + update_zones_wp(bs, s->fd, offset, i); + error_report("ioctl %s failed %d", op_name, ret); + return ret; + } + + if (zo == BLKRESETZONE && len == capacity) { + ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1); + if (ret < 0) { + error_report("reporting single wp failed"); + return ret; + } + } else if (zo == BLKRESETZONE) { + for (unsigned int j = 0; j < nrz; ++j) { + wp[j] = offset + j * zone_size; + } + } else if (zo == BLKFINISHZONE) { + for (unsigned int j = 0; j < nrz; ++j) { + /* The zoned device allows the last zone smaller that the + * zone size. */ + wp[j] = MIN(offset + (j + 1) * zone_size, offset + len); + } + } + + return ret; +} +#endif + +#if defined(CONFIG_BLKZONED) +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, + int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags) { + assert(flags == 0); + int64_t zone_size_mask = bs->bl.zone_size - 1; + int64_t iov_len = 0; + int64_t len = 0; + BDRVRawState *s = bs->opaque; + s->offset = offset; + + if (*offset & zone_size_mask) { + error_report("sector offset %" PRId64 " is not aligned to zone size " + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); + return -EINVAL; + } + + int64_t wg = bs->bl.write_granularity; + int64_t wg_mask = wg - 1; + for (int i = 0; i < qiov->niov; i++) { + iov_len = qiov->iov[i].iov_len; + if (iov_len & wg_mask) { + error_report("len of IOVector[%d] %" PRId64 " is not aligned to " + "block size %" PRId64 "", i, iov_len, wg); + return -EINVAL; + } + len += iov_len; + } + + trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS); + return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND); +} +#endif + static coroutine_fn int raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, bool blkdev) @@ -3724,6 +4316,14 @@ static BlockDriver bdrv_host_device = { #ifdef __linux__ .bdrv_co_ioctl = hdev_co_ioctl, #endif + + /* zoned device */ +#if defined(CONFIG_BLKZONED) + /* zone management operations */ + .bdrv_co_zone_report = raw_co_zone_report, + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, + .bdrv_co_zone_append = raw_co_zone_append, +#endif }; #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) @@ -3113,6 +3113,74 @@ out: return co.ret; } +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + int ret; + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + IO_CODE(); + + ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL); + if (ret < 0) { + return ret; + } + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) { + co.ret = -ENOTSUP; + goto out; + } + co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + void *qemu_blockalign(BlockDriverState *bs, size_t size) { IO_CODE(); diff --git a/block/io_uring.c b/block/io_uring.c index 989f9a9..82cab6a 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -350,6 +350,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, luringcb->qiov->niov, offset); break; + case QEMU_AIO_ZONE_APPEND: + io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, + luringcb->qiov->niov, offset); + break; case QEMU_AIO_READ: io_uring_prep_readv(sqes, fd, luringcb->qiov->iov, luringcb->qiov->niov, offset); diff --git a/block/linux-aio.c b/block/linux-aio.c index fc50cdd..442c862 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -394,6 +394,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset, case QEMU_AIO_WRITE: io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); break; + case QEMU_AIO_ZONE_APPEND: + io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); + break; case QEMU_AIO_READ: io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset); break; diff --git a/block/qapi-sysemu.c b/block/qapi-sysemu.c index 7bd7554..cec3c1a 100644 --- a/block/qapi-sysemu.c +++ b/block/qapi-sysemu.c @@ -517,6 +517,7 @@ void qmp_block_latency_histogram_set( bool has_boundaries, uint64List *boundaries, bool has_boundaries_read, uint64List *boundaries_read, bool has_boundaries_write, uint64List *boundaries_write, + bool has_boundaries_append, uint64List *boundaries_append, bool has_boundaries_flush, uint64List *boundaries_flush, Error **errp) { @@ -557,6 +558,16 @@ void qmp_block_latency_histogram_set( } } + if (has_boundaries || has_boundaries_append) { + ret = block_latency_histogram_set( + stats, BLOCK_ACCT_ZONE_APPEND, + has_boundaries_append ? boundaries_append : boundaries); + if (ret) { + error_setg(errp, "Device '%s' set append write boundaries fail", id); + return; + } + } + if (has_boundaries || has_boundaries_flush) { ret = block_latency_histogram_set( stats, BLOCK_ACCT_FLUSH, diff --git a/block/qapi.c b/block/qapi.c index 71f2751..f34f95e 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -533,27 +533,36 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; + ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND]; ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP]; ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; + ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND]; ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP]; ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; + ds->failed_zone_append_operations = + stats->failed_ops[BLOCK_ACCT_ZONE_APPEND]; ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP]; ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; + ds->invalid_zone_append_operations = + stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND]; ds->invalid_flush_operations = stats->invalid_ops[BLOCK_ACCT_FLUSH]; ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP]; ds->rd_merged = stats->merged[BLOCK_ACCT_READ]; ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; + ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND]; ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP]; ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; + ds->zone_append_total_time_ns = + stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND]; ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP]; @@ -571,6 +580,7 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; + TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND]; TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; dev_stats->interval_length = ts->interval_length; @@ -583,6 +593,10 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) dev_stats->max_wr_latency_ns = timed_average_max(wr); dev_stats->avg_wr_latency_ns = timed_average_avg(wr); + dev_stats->min_zone_append_latency_ns = timed_average_min(zap); + dev_stats->max_zone_append_latency_ns = timed_average_max(zap); + dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap); + dev_stats->min_flush_latency_ns = timed_average_min(fl); dev_stats->max_flush_latency_ns = timed_average_max(fl); dev_stats->avg_flush_latency_ns = timed_average_avg(fl); @@ -591,6 +605,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) block_acct_queue_depth(ts, BLOCK_ACCT_READ); dev_stats->avg_wr_queue_depth = block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); + dev_stats->avg_zone_append_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND); QAPI_LIST_PREPEND(ds->timed_stats, dev_stats); } @@ -600,6 +616,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]); ds->wr_latency_histogram = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]); + ds->zone_append_latency_histogram + = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]); ds->flush_latency_histogram = bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]); } diff --git a/block/raw-format.c b/block/raw-format.c index fd9e61f..3a39462 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -317,6 +317,28 @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes) return bdrv_co_pdiscard(bs->file, offset, bytes); } +static int coroutine_fn GRAPH_RDLOCK +raw_co_zone_report(BlockDriverState *bs, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones) +{ + return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones); +} + +static int coroutine_fn GRAPH_RDLOCK +raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len) +{ + return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len); +} + +static int coroutine_fn GRAPH_RDLOCK +raw_co_zone_append(BlockDriverState *bs,int64_t *offset, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags); +} + static int64_t coroutine_fn GRAPH_RDLOCK raw_co_getlength(BlockDriverState *bs) { @@ -608,6 +630,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c, BlockDriver bdrv_raw = { .format_name = "raw", .instance_size = sizeof(BDRVRawState), + .supports_zoned_children = true, .bdrv_probe = &raw_probe, .bdrv_reopen_prepare = &raw_reopen_prepare, .bdrv_reopen_commit = &raw_reopen_commit, @@ -619,6 +642,9 @@ BlockDriver bdrv_raw = { .bdrv_co_pwritev = &raw_co_pwritev, .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes, .bdrv_co_pdiscard = &raw_co_pdiscard, + .bdrv_co_zone_report = &raw_co_zone_report, + .bdrv_co_zone_mgmt = &raw_co_zone_mgmt, + .bdrv_co_zone_append = &raw_co_zone_append, .bdrv_co_block_status = &raw_co_block_status, .bdrv_co_copy_range_from = &raw_co_copy_range_from, .bdrv_co_copy_range_to = &raw_co_copy_range_to, diff --git a/block/trace-events b/block/trace-events index 48dbf10..3266515 100644 --- a/block/trace-events +++ b/block/trace-events @@ -209,6 +209,10 @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s" file_setup_cdrom(const char *partition) "Using %s as optical disc" file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d" file_flush_fdatasync_failed(int err) "errno %d" +zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 "" +zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors" +zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 "" +zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 "" # ssh.c sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)" diff --git a/docs/devel/index-api.rst b/docs/devel/index-api.rst index 60c0d74..7108821 100644 --- a/docs/devel/index-api.rst +++ b/docs/devel/index-api.rst @@ -12,3 +12,4 @@ generated from in-code annotations to function prototypes. memory modules ui + zoned-storage diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst new file mode 100644 index 0000000..30296d3 --- /dev/null +++ b/docs/devel/zoned-storage.rst @@ -0,0 +1,62 @@ +============= +zoned-storage +============= + +Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones +that are larger than the LBA size. They can only allow sequential writes, which +can reduce write amplification in SSDs, and potentially lead to higher +throughput and increased capacity. More details about ZBDs can be found at: + +https://zonedstorage.io/docs/introduction/zoned-storage + +1. Block layer APIs for zoned storage +------------------------------------- +QEMU block layer supports three zoned storage models: +- BLK_Z_HM: The host-managed zoned model only allows sequential writes access +to zones. It supports ZBD-specific I/O commands that can be used by a host to +manage the zones of a device. +- BLK_Z_HA: The host-aware zoned model allows random write operations in +zones, making it backward compatible with regular block devices. +- BLK_Z_NONE: The non-zoned model has no zones support. It includes both +regular and drive-managed ZBD devices. ZBD-specific I/O commands are not +supported. + +The block device information resides inside BlockDriverState. QEMU uses +BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the +block layer while processing I/O requests. A BlockBackend has a root pointer to +a BlockDriverState graph(for example, raw format on top of file-posix). The +zoned storage information can be propagated from the leaf BlockDriverState all +the way up to the BlockBackend. If the zoned storage model in file-posix is +set to BLK_Z_HM, then block drivers will declare support for zoned host device. + +The block layer APIs support commands needed for zoned storage devices, +including report zones, four zone operations, and zone append. + +2. Emulating zoned storage controllers +-------------------------------------- +When the BlockBackend's BlockLimits model reports a zoned storage device, users +like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer +APIs for zoned storage emulation or testing. + +For example, to test zone_report on a null_blk device using qemu-io is:: + + $ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones" + +To expose the host's zoned block device through virtio-blk, the command line +can be (includes the -device parameter):: + + -blockdev node-name=drive0,driver=host_device,filename=/dev/nullb0,cache.direct=on \ + -device virtio-blk-pci,drive=drive0 + +Or only use the -drive parameter:: + + -driver driver=host_device,file=/dev/nullb0,if=virtio,cache.direct=on + +Additionally, QEMU has several ways of supporting zoned storage, including: +(1) Using virtio-scsi: --device scsi-block allows for the passing through of +SCSI ZBC devices, enabling the attachment of ZBC or ZAC HDDs to QEMU. +(2) PCI device pass-through: While NVMe ZNS emulation is available for testing +purposes, it cannot yet pass through a zoned device from the host. To pass on +the NVMe ZNS device to the guest, use VFIO PCI pass the entire NVMe PCI adapter +through to the guest. Likewise, an HDD HBA can be passed on to QEMU all HDDs +attached to the HBA. diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-drivers.rst.inc index dfe5d22..105cb96 100644 --- a/docs/system/qemu-block-drivers.rst.inc +++ b/docs/system/qemu-block-drivers.rst.inc @@ -430,6 +430,12 @@ Hard disks you may corrupt your host data (use the ``-snapshot`` command line option or modify the device permissions accordingly). +Zoned block devices + Zoned block devices can be passed through to the guest if the emulated storage + controller supports zoned storage. Use ``--blockdev host_device, + node-name=drive0,filename=/dev/nullb0,cache.direct=on`` to pass through + ``/dev/nullb0`` as ``drive0``. + Windows ^^^^^^^ diff --git a/hw/block/trace-events b/hw/block/trace-events index 2c45a62..34be8b9 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -44,9 +44,16 @@ pflash_write_unknown(const char *name, uint8_t cmd) "%s: unknown command 0x%02x" # virtio-blk.c virtio_blk_req_complete(void *vdev, void *req, int status) "vdev %p req %p status %d" virtio_blk_rw_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d" +virtio_blk_zone_report_complete(void *vdev, void *req, unsigned int nr_zones, int ret) "vdev %p req %p nr_zones %u ret %d" +virtio_blk_zone_mgmt_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d" +virtio_blk_zone_append_complete(void *vdev, void *req, int64_t sector, int ret) "vdev %p req %p, append sector 0x%" PRIx64 " ret %d" virtio_blk_handle_write(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu" virtio_blk_handle_read(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu" virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint64_t offset, size_t size, bool is_write) "vdev %p mrb %p start %d num_reqs %d offset %"PRIu64" size %zu is_write %d" +virtio_blk_handle_zone_report(void *vdev, void *req, int64_t sector, unsigned int nr_zones) "vdev %p req %p sector 0x%" PRIx64 " nr_zones %u" +virtio_blk_handle_zone_mgmt(void *vdev, void *req, uint8_t op, int64_t sector, int64_t len) "vdev %p req %p op 0x%x sector 0x%" PRIx64 " len 0x%" PRIx64 "" +virtio_blk_handle_zone_reset_all(void *vdev, void *req, int64_t sector, int64_t len) "vdev %p req %p sector 0x%" PRIx64 " cap 0x%" PRIx64 "" +virtio_blk_handle_zone_append(void *vdev, void *req, int64_t sector) "vdev %p req %p, append sector 0x%" PRIx64 "" # hd-geometry.c hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d" diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c index ac52d7c..e2f8e2f 100644 --- a/hw/block/virtio-blk-common.c +++ b/hw/block/virtio-blk-common.c @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = { .end = endof(struct virtio_blk_config, discard_sector_alignment)}, {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES, .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)}, + {.flags = 1ULL << VIRTIO_BLK_F_ZONED, + .end = endof(struct virtio_blk_config, zoned)}, {} }; diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index cefca93..8f65ea4 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -17,6 +17,7 @@ #include "qemu/module.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" +#include "block/block_int.h" #include "trace.h" #include "hw/block/block.h" #include "hw/qdev-properties.h" @@ -601,6 +602,351 @@ err: return err_status; } +typedef struct ZoneCmdData { + VirtIOBlockReq *req; + struct iovec *in_iov; + unsigned in_num; + union { + struct { + unsigned int nr_zones; + BlockZoneDescriptor *zones; + } zone_report_data; + struct { + int64_t offset; + } zone_append_data; + }; +} ZoneCmdData; + +/* + * check zoned_request: error checking before issuing requests. If all checks + * passed, return true. + * append: true if only zone append requests issued. + */ +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len, + bool append, uint8_t *status) { + BlockDriverState *bs = blk_bs(s->blk); + int index; + + if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) { + *status = VIRTIO_BLK_S_UNSUPP; + return false; + } + + if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS) + || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) { + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + return false; + } + + if (append) { + if (bs->bl.write_granularity) { + if ((offset % bs->bl.write_granularity) != 0) { + *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP; + return false; + } + } + + index = offset / bs->bl.zone_size; + if (BDRV_ZT_IS_CONV(bs->wps->wp[index])) { + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + return false; + } + + if (len / 512 > bs->bl.max_append_sectors) { + if (bs->bl.max_append_sectors == 0) { + *status = VIRTIO_BLK_S_UNSUPP; + } else { + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + } + return false; + } + } + return true; +} + +static void virtio_blk_zone_report_complete(void *opaque, int ret) +{ + ZoneCmdData *data = opaque; + VirtIOBlockReq *req = data->req; + VirtIOBlock *s = req->dev; + VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); + struct iovec *in_iov = data->in_iov; + unsigned in_num = data->in_num; + int64_t zrp_size, n, j = 0; + int64_t nz = data->zone_report_data.nr_zones; + int8_t err_status = VIRTIO_BLK_S_OK; + + trace_virtio_blk_zone_report_complete(vdev, req, nz, ret); + if (ret) { + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + goto out; + } + + struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) { + .nr_zones = cpu_to_le64(nz), + }; + zrp_size = sizeof(struct virtio_blk_zone_report) + + sizeof(struct virtio_blk_zone_descriptor) * nz; + n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr)); + if (n != sizeof(zrp_hdr)) { + virtio_error(vdev, "Driver provided input buffer that is too small!"); + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + goto out; + } + + for (size_t i = sizeof(zrp_hdr); i < zrp_size; + i += sizeof(struct virtio_blk_zone_descriptor), ++j) { + struct virtio_blk_zone_descriptor desc = + (struct virtio_blk_zone_descriptor) { + .z_start = cpu_to_le64(data->zone_report_data.zones[j].start + >> BDRV_SECTOR_BITS), + .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap + >> BDRV_SECTOR_BITS), + .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp + >> BDRV_SECTOR_BITS), + }; + + switch (data->zone_report_data.zones[j].type) { + case BLK_ZT_CONV: + desc.z_type = VIRTIO_BLK_ZT_CONV; + break; + case BLK_ZT_SWR: + desc.z_type = VIRTIO_BLK_ZT_SWR; + break; + case BLK_ZT_SWP: + desc.z_type = VIRTIO_BLK_ZT_SWP; + break; + default: + g_assert_not_reached(); + } + + switch (data->zone_report_data.zones[j].state) { + case BLK_ZS_RDONLY: + desc.z_state = VIRTIO_BLK_ZS_RDONLY; + break; + case BLK_ZS_OFFLINE: + desc.z_state = VIRTIO_BLK_ZS_OFFLINE; + break; + case BLK_ZS_EMPTY: + desc.z_state = VIRTIO_BLK_ZS_EMPTY; + break; + case BLK_ZS_CLOSED: + desc.z_state = VIRTIO_BLK_ZS_CLOSED; + break; + case BLK_ZS_FULL: + desc.z_state = VIRTIO_BLK_ZS_FULL; + break; + case BLK_ZS_EOPEN: + desc.z_state = VIRTIO_BLK_ZS_EOPEN; + break; + case BLK_ZS_IOPEN: + desc.z_state = VIRTIO_BLK_ZS_IOPEN; + break; + case BLK_ZS_NOT_WP: + desc.z_state = VIRTIO_BLK_ZS_NOT_WP; + break; + default: + g_assert_not_reached(); + } + + /* TODO: it takes O(n^2) time complexity. Optimizations required. */ + n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc)); + if (n != sizeof(desc)) { + virtio_error(vdev, "Driver provided input buffer " + "for descriptors that is too small!"); + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + } + } + +out: + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + virtio_blk_req_complete(req, err_status); + virtio_blk_free_request(req); + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); + g_free(data->zone_report_data.zones); + g_free(data); +} + +static void virtio_blk_handle_zone_report(VirtIOBlockReq *req, + struct iovec *in_iov, + unsigned in_num) +{ + VirtIOBlock *s = req->dev; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + unsigned int nr_zones; + ZoneCmdData *data; + int64_t zone_size, offset; + uint8_t err_status; + + if (req->in_len < sizeof(struct virtio_blk_inhdr) + + sizeof(struct virtio_blk_zone_report) + + sizeof(struct virtio_blk_zone_descriptor)) { + virtio_error(vdev, "in buffer too small for zone report"); + return; + } + + /* start byte offset of the zone report */ + offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS; + if (!check_zoned_request(s, offset, 0, false, &err_status)) { + goto out; + } + nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) - + sizeof(struct virtio_blk_zone_report)) / + sizeof(struct virtio_blk_zone_descriptor); + trace_virtio_blk_handle_zone_report(vdev, req, + offset >> BDRV_SECTOR_BITS, nr_zones); + + zone_size = sizeof(BlockZoneDescriptor) * nr_zones; + data = g_malloc(sizeof(ZoneCmdData)); + data->req = req; + data->in_iov = in_iov; + data->in_num = in_num; + data->zone_report_data.nr_zones = nr_zones; + data->zone_report_data.zones = g_malloc(zone_size), + + blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones, + data->zone_report_data.zones, + virtio_blk_zone_report_complete, data); + return; +out: + virtio_blk_req_complete(req, err_status); + virtio_blk_free_request(req); +} + +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) +{ + VirtIOBlockReq *req = opaque; + VirtIOBlock *s = req->dev; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + int8_t err_status = VIRTIO_BLK_S_OK; + trace_virtio_blk_zone_mgmt_complete(vdev, req,ret); + + if (ret) { + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + } + + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + virtio_blk_req_complete(req, err_status); + virtio_blk_free_request(req); + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); +} + +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) +{ + VirtIOBlock *s = req->dev; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + BlockDriverState *bs = blk_bs(s->blk); + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS; + uint64_t len; + uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS; + uint8_t err_status = VIRTIO_BLK_S_OK; + + uint32_t type = virtio_ldl_p(vdev, &req->out.type); + if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) { + /* Entire drive capacity */ + offset = 0; + len = capacity; + trace_virtio_blk_handle_zone_reset_all(vdev, req, 0, + bs->total_sectors); + } else { + if (bs->bl.zone_size > capacity - offset) { + /* The zoned device allows the last smaller zone. */ + len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1); + } else { + len = bs->bl.zone_size; + } + trace_virtio_blk_handle_zone_mgmt(vdev, req, op, + offset >> BDRV_SECTOR_BITS, + len >> BDRV_SECTOR_BITS); + } + + if (!check_zoned_request(s, offset, len, false, &err_status)) { + goto out; + } + + blk_aio_zone_mgmt(s->blk, op, offset, len, + virtio_blk_zone_mgmt_complete, req); + + return 0; +out: + virtio_blk_req_complete(req, err_status); + virtio_blk_free_request(req); + return err_status; +} + +static void virtio_blk_zone_append_complete(void *opaque, int ret) +{ + ZoneCmdData *data = opaque; + VirtIOBlockReq *req = data->req; + VirtIOBlock *s = req->dev; + VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); + int64_t append_sector, n; + uint8_t err_status = VIRTIO_BLK_S_OK; + + if (ret) { + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + goto out; + } + + virtio_stq_p(vdev, &append_sector, + data->zone_append_data.offset >> BDRV_SECTOR_BITS); + n = iov_from_buf(data->in_iov, data->in_num, 0, &append_sector, + sizeof(append_sector)); + if (n != sizeof(append_sector)) { + virtio_error(vdev, "Driver provided input buffer less than size of " + "append_sector"); + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; + goto out; + } + trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret); + +out: + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + virtio_blk_req_complete(req, err_status); + virtio_blk_free_request(req); + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); + g_free(data); +} + +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req, + struct iovec *out_iov, + struct iovec *in_iov, + uint64_t out_num, + unsigned in_num) { + VirtIOBlock *s = req->dev; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + uint8_t err_status = VIRTIO_BLK_S_OK; + + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS; + int64_t len = iov_size(out_iov, out_num); + + trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS); + if (!check_zoned_request(s, offset, len, true, &err_status)) { + goto out; + } + + ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); + data->req = req; + data->in_iov = in_iov; + data->in_num = in_num; + data->zone_append_data.offset = offset; + qemu_iovec_init_external(&req->qiov, out_iov, out_num); + + block_acct_start(blk_get_stats(s->blk), &req->acct, len, + BLOCK_ACCT_ZONE_APPEND); + + blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0, + virtio_blk_zone_append_complete, data); + return 0; + +out: + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); + virtio_blk_req_complete(req, err_status); + virtio_blk_free_request(req); + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); + return err_status; +} + static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) { uint32_t type; @@ -687,6 +1033,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) case VIRTIO_BLK_T_FLUSH: virtio_blk_handle_flush(req, mrb); break; + case VIRTIO_BLK_T_ZONE_REPORT: + virtio_blk_handle_zone_report(req, in_iov, in_num); + break; + case VIRTIO_BLK_T_ZONE_OPEN: + virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN); + break; + case VIRTIO_BLK_T_ZONE_CLOSE: + virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE); + break; + case VIRTIO_BLK_T_ZONE_FINISH: + virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH); + break; + case VIRTIO_BLK_T_ZONE_RESET: + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET); + break; + case VIRTIO_BLK_T_ZONE_RESET_ALL: + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET); + break; case VIRTIO_BLK_T_SCSI_CMD: virtio_blk_handle_scsi(req); break; @@ -705,6 +1069,14 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb) virtio_blk_free_request(req); break; } + case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT: + /* + * Passing out_iov/out_num and in_iov/in_num is not safe + * to access req->elem.out_sg directly because it may be + * modified by virtio_blk_handle_request(). + */ + virtio_blk_handle_zone_append(req, out_iov, in_iov, out_num, in_num); + break; /* * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement, @@ -890,6 +1262,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) { VirtIOBlock *s = VIRTIO_BLK(vdev); BlockConf *conf = &s->conf.conf; + BlockDriverState *bs = blk_bs(s->blk); struct virtio_blk_config blkcfg; uint64_t capacity; int64_t length; @@ -954,6 +1327,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config) blkcfg.write_zeroes_may_unmap = 1; virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1); } + if (bs->bl.zoned != BLK_Z_NONE) { + switch (bs->bl.zoned) { + case BLK_Z_HM: + blkcfg.zoned.model = VIRTIO_BLK_Z_HM; + break; + case BLK_Z_HA: + blkcfg.zoned.model = VIRTIO_BLK_Z_HA; + break; + default: + g_assert_not_reached(); + } + + virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors, + bs->bl.zone_size / 512); + virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones, + bs->bl.max_active_zones); + virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones, + bs->bl.max_open_zones); + virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size); + virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors, + bs->bl.max_append_sectors); + } else { + blkcfg.zoned.model = VIRTIO_BLK_Z_NONE; + } memcpy(config, &blkcfg, s->config_size); } @@ -1163,6 +1560,14 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) return; } + BlockDriverState *bs = blk_bs(conf->conf.blk); + if (bs->bl.zoned != BLK_Z_NONE) { + virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED); + if (bs->bl.zoned == BLK_Z_HM) { + virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD); + } + } + if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) && (!conf->max_discard_sectors || conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) { diff --git a/hw/virtio/virtio-qmp.c b/hw/virtio/virtio-qmp.c index b70148a..e84316d 100644 --- a/hw/virtio/virtio-qmp.c +++ b/hw/virtio/virtio-qmp.c @@ -176,6 +176,8 @@ static const qmp_virtio_feature_map_t virtio_blk_feature_map[] = { "VIRTIO_BLK_F_DISCARD: Discard command supported"), FEATURE_ENTRY(VIRTIO_BLK_F_WRITE_ZEROES, \ "VIRTIO_BLK_F_WRITE_ZEROES: Write zeroes command supported"), + FEATURE_ENTRY(VIRTIO_BLK_F_ZONED, \ + "VIRTIO_BLK_F_ZONED: Zoned block devices"), #ifndef VIRTIO_BLK_NO_LEGACY FEATURE_ENTRY(VIRTIO_BLK_F_BARRIER, \ "VIRTIO_BLK_F_BARRIER: Request barriers supported"), diff --git a/include/block/accounting.h b/include/block/accounting.h index b9caad6..a59e39f 100644 --- a/include/block/accounting.h +++ b/include/block/accounting.h @@ -37,6 +37,7 @@ enum BlockAcctType { BLOCK_ACCT_READ, BLOCK_ACCT_WRITE, BLOCK_ACCT_FLUSH, + BLOCK_ACCT_ZONE_APPEND, BLOCK_ACCT_UNMAP, BLOCK_MAX_IOTYPE, }; diff --git a/include/block/block-common.h b/include/block/block-common.h index b5122ef..9319622 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -75,6 +75,57 @@ typedef struct BlockDriver BlockDriver; typedef struct BdrvChild BdrvChild; typedef struct BdrvChildClass BdrvChildClass; +typedef enum BlockZoneOp { + BLK_ZO_OPEN, + BLK_ZO_CLOSE, + BLK_ZO_FINISH, + BLK_ZO_RESET, +} BlockZoneOp; + +typedef enum BlockZoneModel { + BLK_Z_NONE = 0x0, /* Regular block device */ + BLK_Z_HM = 0x1, /* Host-managed zoned block device */ + BLK_Z_HA = 0x2, /* Host-aware zoned block device */ +} BlockZoneModel; + +typedef enum BlockZoneState { + BLK_ZS_NOT_WP = 0x0, + BLK_ZS_EMPTY = 0x1, + BLK_ZS_IOPEN = 0x2, + BLK_ZS_EOPEN = 0x3, + BLK_ZS_CLOSED = 0x4, + BLK_ZS_RDONLY = 0xD, + BLK_ZS_FULL = 0xE, + BLK_ZS_OFFLINE = 0xF, +} BlockZoneState; + +typedef enum BlockZoneType { + BLK_ZT_CONV = 0x1, /* Conventional random writes supported */ + BLK_ZT_SWR = 0x2, /* Sequential writes required */ + BLK_ZT_SWP = 0x3, /* Sequential writes preferred */ +} BlockZoneType; + +/* + * Zone descriptor data structure. + * Provides information on a zone with all position and size values in bytes. + */ +typedef struct BlockZoneDescriptor { + uint64_t start; + uint64_t length; + uint64_t cap; + uint64_t wp; + BlockZoneType type; + BlockZoneState state; +} BlockZoneDescriptor; + +/* + * Track write pointers of a zone in bytes. + */ +typedef struct BlockZoneWps { + CoMutex colock; + uint64_t wp[]; +} BlockZoneWps; + typedef struct BlockDriverInfo { /* in bytes, 0 if irrelevant */ int cluster_size; @@ -197,6 +248,12 @@ typedef enum { #define BDRV_SECTOR_BITS 9 #define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS) +/* + * Get the first most significant bit of wp. If it is zero, then + * the zone type is SWR. + */ +#define BDRV_ZT_IS_CONV(wp) (wp & (1ULL << 63)) + #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \ INT_MAX >> BDRV_SECTOR_BITS) #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) diff --git a/include/block/block-io.h b/include/block/block-io.h index 1f612ec..a27e471 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -114,6 +114,19 @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs); int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); +/* Report zone information of zone block device. */ +int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs, + int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs, + BlockZoneOp op, + int64_t offset, int64_t len); +int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_append(BlockDriverState *bs, + int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); + bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 4909876..dbec0e3 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -138,6 +138,11 @@ struct BlockDriver { bool is_format; /* + * Set to true if the BlockDriver supports zoned children. + */ + bool supports_zoned_children; + + /* * Drivers not implementing bdrv_parse_filename nor bdrv_open should have * this field set to true, except ones that are defined only by their * child's bs. @@ -713,6 +718,15 @@ struct BlockDriver { int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_load_vmstate)( BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, + int64_t offset, unsigned int *nr_zones, + BlockZoneDescriptor *zones); + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op, + int64_t offset, int64_t len); + int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs, + int64_t *offset, QEMUIOVector *qiov, + BdrvRequestFlags flags); + /* removable device specific */ bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)( BlockDriverState *bs); @@ -862,6 +876,26 @@ typedef struct BlockLimits { * an explicit monitor command to load the disk inside the guest). */ bool has_variable_length; + + /* device zone model */ + BlockZoneModel zoned; + + /* zone size expressed in bytes */ + uint32_t zone_size; + + /* total number of zones */ + uint32_t nr_zones; + + /* maximum sectors of a zone append write operation */ + uint32_t max_append_sectors; + + /* maximum number of open zones */ + uint32_t max_open_zones; + + /* maximum number of active zones */ + uint32_t max_active_zones; + + uint32_t write_granularity; } BlockLimits; typedef struct BdrvOpBlocker BdrvOpBlocker; @@ -1223,6 +1257,9 @@ struct BlockDriverState { CoMutex bsc_modify_lock; /* Always non-NULL, but must only be dereferenced under an RCU read guard */ BdrvBlockStatusCache *block_status_cache; + + /* array of write pointers' location of each zone in the zoned device. */ + BlockZoneWps *wps; }; struct BlockBackendRootState { diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index e46a29c..0fe85ad 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -28,6 +28,9 @@ #define QEMU_AIO_WRITE_ZEROES 0x0020 #define QEMU_AIO_COPY_RANGE 0x0040 #define QEMU_AIO_TRUNCATE 0x0080 +#define QEMU_AIO_ZONE_REPORT 0x0100 +#define QEMU_AIO_ZONE_MGMT 0x0200 +#define QEMU_AIO_ZONE_APPEND 0x0400 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ | \ QEMU_AIO_WRITE | \ @@ -36,7 +39,10 @@ QEMU_AIO_DISCARD | \ QEMU_AIO_WRITE_ZEROES | \ QEMU_AIO_COPY_RANGE | \ - QEMU_AIO_TRUNCATE) + QEMU_AIO_TRUNCATE | \ + QEMU_AIO_ZONE_REPORT | \ + QEMU_AIO_ZONE_MGMT | \ + QEMU_AIO_ZONE_APPEND) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index 851a44d..d62a7ee 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -46,6 +46,16 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset, BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_flush(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque); BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes, BlockCompletionFunc *cb, void *opaque); void blk_aio_cancel_async(BlockAIOCB *acb); @@ -191,6 +201,23 @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, int64_t bytes, BdrvRequestFlags flags); +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset, + unsigned int *nr_zones, + BlockZoneDescriptor *zones); +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); +int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, + int64_t offset, int64_t len); +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); +int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset, + QEMUIOVector *qiov, + BdrvRequestFlags flags); + int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes); int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, diff --git a/meson.build b/meson.build index d3cf489..25a4b9f 100644 --- a/meson.build +++ b/meson.build @@ -2025,6 +2025,8 @@ if rdma.found() endif # has_header_symbol +config_host_data.set('CONFIG_BLKZONED', + cc.has_header_symbol('linux/blkzoned.h', 'BLKOPENZONE')) config_host_data.set('CONFIG_EPOLL_CREATE1', cc.has_header_symbol('sys/epoll.h', 'epoll_create1')) config_host_data.set('CONFIG_FALLOCATE_PUNCH_HOLE', @@ -2060,6 +2062,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID', config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM', cc.has_member('struct stat', 'st_atim', prefix: '#include <sys/stat.h>')) +config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY', + cc.has_member('struct blk_zone', 'capacity', + prefix: '#include <linux/blkzoned.h>')) # has_type config_host_data.set('CONFIG_IOVEC', diff --git a/qapi/block-core.json b/qapi/block-core.json index 187e35d..98d9116 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -854,6 +854,10 @@ # @min_wr_latency_ns: Minimum latency of write operations in the # defined interval, in nanoseconds. # +# @min_zone_append_latency_ns: Minimum latency of zone append operations +# in the defined interval, in nanoseconds +# (since 8.1) +# # @min_flush_latency_ns: Minimum latency of flush operations in the # defined interval, in nanoseconds. # @@ -863,6 +867,10 @@ # @max_wr_latency_ns: Maximum latency of write operations in the # defined interval, in nanoseconds. # +# @max_zone_append_latency_ns: Maximum latency of zone append operations +# in the defined interval, in nanoseconds +# (since 8.1) +# # @max_flush_latency_ns: Maximum latency of flush operations in the # defined interval, in nanoseconds. # @@ -872,6 +880,10 @@ # @avg_wr_latency_ns: Average latency of write operations in the # defined interval, in nanoseconds. # +# @avg_zone_append_latency_ns: Average latency of zone append operations +# in the defined interval, in nanoseconds +# (since 8.1) +# # @avg_flush_latency_ns: Average latency of flush operations in the # defined interval, in nanoseconds. # @@ -881,15 +893,23 @@ # @avg_wr_queue_depth: Average number of pending write operations in # the defined interval. # +# @avg_zone_append_queue_depth: Average number of pending zone append +# operations in the defined interval +# (since 8.1). +# # Since: 2.5 ## { 'struct': 'BlockDeviceTimedStats', 'data': { 'interval_length': 'int', 'min_rd_latency_ns': 'int', 'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int', 'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int', - 'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int', - 'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int', - 'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } } + 'avg_wr_latency_ns': 'int', 'min_zone_append_latency_ns': 'int', + 'max_zone_append_latency_ns': 'int', + 'avg_zone_append_latency_ns': 'int', + 'min_flush_latency_ns': 'int', 'max_flush_latency_ns': 'int', + 'avg_flush_latency_ns': 'int', 'avg_rd_queue_depth': 'number', + 'avg_wr_queue_depth': 'number', + 'avg_zone_append_queue_depth': 'number' } } ## # @BlockDeviceStats: @@ -900,6 +920,9 @@ # # @wr_bytes: The number of bytes written by the device. # +# @zone_append_bytes: The number of bytes appended by the zoned devices +# (since 8.1) +# # @unmap_bytes: The number of bytes unmapped by the device (Since 4.2) # # @rd_operations: The number of read operations performed by the @@ -908,6 +931,9 @@ # @wr_operations: The number of write operations performed by the # device. # +# @zone_append_operations: The number of zone append operations performed +# by the zoned devices (since 8.1) +# # @flush_operations: The number of cache flush operations performed by # the device (since 0.15) # @@ -920,6 +946,9 @@ # @wr_total_time_ns: Total time spent on writes in nanoseconds (since # 0.15). # +# @zone_append_total_time_ns: Total time spent on zone append writes +# in nanoseconds (since 8.1) +# # @flush_total_time_ns: Total time spent on cache flushes in # nanoseconds (since 0.15). # @@ -937,6 +966,9 @@ # @wr_merged: Number of write requests that have been merged into # another request (Since 2.3). # +# @zone_append_merged: Number of zone append requests that have been merged +# into another request (since 8.1) +# # @unmap_merged: Number of unmap requests that have been merged into # another request (Since 4.2) # @@ -950,6 +982,10 @@ # @failed_wr_operations: The number of failed write operations # performed by the device (Since 2.5) # +# @failed_zone_append_operations: The number of failed zone append write +# operations performed by the zoned devices +# (since 8.1) +# # @failed_flush_operations: The number of failed flush operations # performed by the device (Since 2.5) # @@ -962,6 +998,9 @@ # @invalid_wr_operations: The number of invalid write operations # performed by the device (Since 2.5) # +# @invalid_zone_append_operations: The number of invalid zone append operations +# performed by the zoned device (since 8.1) +# # @invalid_flush_operations: The number of invalid flush operations # performed by the device (Since 2.5) # @@ -981,27 +1020,34 @@ # # @wr_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0) # +# @zone_append_latency_histogram: @BlockLatencyHistogramInfo. (since 8.1) +# # @flush_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0) # # Since: 0.14 ## { 'struct': 'BlockDeviceStats', - 'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'unmap_bytes' : 'int', - 'rd_operations': 'int', 'wr_operations': 'int', + 'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'zone_append_bytes': 'int', + 'unmap_bytes' : 'int', 'rd_operations': 'int', + 'wr_operations': 'int', 'zone_append_operations': 'int', 'flush_operations': 'int', 'unmap_operations': 'int', 'rd_total_time_ns': 'int', 'wr_total_time_ns': 'int', - 'flush_total_time_ns': 'int', 'unmap_total_time_ns': 'int', - 'wr_highest_offset': 'int', - 'rd_merged': 'int', 'wr_merged': 'int', 'unmap_merged': 'int', - '*idle_time_ns': 'int', + 'zone_append_total_time_ns': 'int', 'flush_total_time_ns': 'int', + 'unmap_total_time_ns': 'int', 'wr_highest_offset': 'int', + 'rd_merged': 'int', 'wr_merged': 'int', 'zone_append_merged': 'int', + 'unmap_merged': 'int', '*idle_time_ns': 'int', 'failed_rd_operations': 'int', 'failed_wr_operations': 'int', - 'failed_flush_operations': 'int', 'failed_unmap_operations': 'int', - 'invalid_rd_operations': 'int', 'invalid_wr_operations': 'int', + 'failed_zone_append_operations': 'int', + 'failed_flush_operations': 'int', + 'failed_unmap_operations': 'int', 'invalid_rd_operations': 'int', + 'invalid_wr_operations': 'int', + 'invalid_zone_append_operations': 'int', 'invalid_flush_operations': 'int', 'invalid_unmap_operations': 'int', 'account_invalid': 'bool', 'account_failed': 'bool', 'timed_stats': ['BlockDeviceTimedStats'], '*rd_latency_histogram': 'BlockLatencyHistogramInfo', '*wr_latency_histogram': 'BlockLatencyHistogramInfo', + '*zone_append_latency_histogram': 'BlockLatencyHistogramInfo', '*flush_latency_histogram': 'BlockLatencyHistogramInfo' } } ## diff --git a/qapi/block.json b/qapi/block.json index a1e1659..0f25ce3 100644 --- a/qapi/block.json +++ b/qapi/block.json @@ -534,6 +534,9 @@ # @boundaries-write: list of interval boundary values for write # latency histogram. # +# @boundaries-zap: list of interval boundary values for zone append write +# latency histogram. +# # @boundaries-flush: list of interval boundary values for flush # latency histogram. # @@ -587,5 +590,6 @@ '*boundaries': ['uint64'], '*boundaries-read': ['uint64'], '*boundaries-write': ['uint64'], + '*boundaries-zap': ['uint64'], '*boundaries-flush': ['uint64'] }, 'allow-preconfig': true } diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index e7a02f5..3f75d2f 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -1730,6 +1730,224 @@ static const cmdinfo_t flush_cmd = { .oneline = "flush all in-core file state to disk", }; +static inline int64_t tosector(int64_t bytes) +{ + return bytes >> BDRV_SECTOR_BITS; +} + +static int zone_report_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset; + unsigned int nr_zones; + + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + nr_zones = cvtnum(argv[optind]); + + g_autofree BlockZoneDescriptor *zones = NULL; + zones = g_new(BlockZoneDescriptor, nr_zones); + ret = blk_zone_report(blk, offset, &nr_zones, zones); + if (ret < 0) { + printf("zone report failed: %s\n", strerror(-ret)); + } else { + for (int i = 0; i < nr_zones; ++i) { + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " + "cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", " + "zcond:%u, [type: %u]\n", + tosector(zones[i].start), tosector(zones[i].length), + tosector(zones[i].cap), tosector(zones[i].wp), + zones[i].state, zones[i].type); + } + } + return ret; +} + +static const cmdinfo_t zone_report_cmd = { + .name = "zone_report", + .altname = "zrp", + .cfunc = zone_report_f, + .argmin = 2, + .argmax = 2, + .args = "offset number", + .oneline = "report zone information", +}; + +static int zone_open_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); + if (ret < 0) { + printf("zone open failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_open_cmd = { + .name = "zone_open", + .altname = "zo", + .cfunc = zone_open_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "explicit open a range of zones in zone block device", +}; + +static int zone_close_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); + if (ret < 0) { + printf("zone close failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_close_cmd = { + .name = "zone_close", + .altname = "zc", + .cfunc = zone_close_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "close a range of zones in zone block device", +}; + +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); + if (ret < 0) { + printf("zone finish failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_finish_cmd = { + .name = "zone_finish", + .altname = "zf", + .cfunc = zone_finish_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "finish a range of zones in zone block device", +}; + +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + int64_t offset, len; + ++optind; + offset = cvtnum(argv[optind]); + ++optind; + len = cvtnum(argv[optind]); + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); + if (ret < 0) { + printf("zone reset failed: %s\n", strerror(-ret)); + } + return ret; +} + +static const cmdinfo_t zone_reset_cmd = { + .name = "zone_reset", + .altname = "zrs", + .cfunc = zone_reset_f, + .argmin = 2, + .argmax = 2, + .args = "offset len", + .oneline = "reset a zone write pointer in zone block device", +}; + +static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov, + int64_t *offset, int flags, int *total) +{ + int async_ret = NOT_DONE; + + blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret); + while (async_ret == NOT_DONE) { + main_loop_wait(false); + } + + *total = qiov->size; + return async_ret < 0 ? async_ret : 1; +} + +static int zone_append_f(BlockBackend *blk, int argc, char **argv) +{ + int ret; + bool pflag = false; + int flags = 0; + int total = 0; + int64_t offset; + char *buf; + int c, nr_iov; + int pattern = 0xcd; + QEMUIOVector qiov; + + if (optind > argc - 3) { + return -EINVAL; + } + + if ((c = getopt(argc, argv, "p")) != -1) { + pflag = true; + } + + offset = cvtnum(argv[optind]); + if (offset < 0) { + print_cvtnum_err(offset, argv[optind]); + return offset; + } + optind++; + nr_iov = argc - optind; + buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern, + flags & BDRV_REQ_REGISTERED_BUF); + if (buf == NULL) { + return -EINVAL; + } + ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total); + if (ret < 0) { + printf("zone append failed: %s\n", strerror(-ret)); + goto out; + } + + if (pflag) { + printf("After zap done, the append sector is 0x%" PRIx64 "\n", + tosector(offset)); + } + +out: + qemu_io_free(blk, buf, qiov.size, + flags & BDRV_REQ_REGISTERED_BUF); + qemu_iovec_destroy(&qiov); + return ret; +} + +static const cmdinfo_t zone_append_cmd = { + .name = "zone_append", + .altname = "zap", + .cfunc = zone_append_f, + .argmin = 3, + .argmax = 4, + .args = "offset len [len..]", + .oneline = "append write a number of bytes at a specified offset", +}; + static int truncate_f(BlockBackend *blk, int argc, char **argv); static const cmdinfo_t truncate_cmd = { .name = "truncate", @@ -2523,6 +2741,12 @@ static void __attribute((constructor)) init_qemuio_commands(void) qemuio_add_command(&aio_write_cmd); qemuio_add_command(&aio_flush_cmd); qemuio_add_command(&flush_cmd); + qemuio_add_command(&zone_report_cmd); + qemuio_add_command(&zone_open_cmd); + qemuio_add_command(&zone_close_cmd); + qemuio_add_command(&zone_finish_cmd); + qemuio_add_command(&zone_reset_cmd); + qemuio_add_command(&zone_append_cmd); qemuio_add_command(&truncate_cmd); qemuio_add_command(&length_cmd); qemuio_add_command(&info_cmd); diff --git a/tests/qemu-iotests/227.out b/tests/qemu-iotests/227.out index 378c1b8..a947b1a 100644 --- a/tests/qemu-iotests/227.out +++ b/tests/qemu-iotests/227.out @@ -17,6 +17,7 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio "stats": { "unmap_operations": 0, "unmap_merged": 0, + "failed_zone_append_operations": 0, "flush_total_time_ns": 0, "wr_highest_offset": 0, "wr_total_time_ns": 0, @@ -27,6 +28,7 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio "timed_stats": [ ], "failed_unmap_operations": 0, + "zone_append_merged": 0, "failed_flush_operations": 0, "account_invalid": true, "rd_total_time_ns": 0, @@ -39,7 +41,11 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio "unmap_total_time_ns": 0, "invalid_flush_operations": 0, "account_failed": true, + "zone_append_total_time_ns": 0, + "zone_append_operations": 0, "rd_operations": 0, + "zone_append_bytes": 0, + "invalid_zone_append_operations": 0, "invalid_wr_operations": 0, "invalid_rd_operations": 0 }, @@ -82,6 +88,7 @@ Testing: -drive driver=null-co,if=none "stats": { "unmap_operations": 0, "unmap_merged": 0, + "failed_zone_append_operations": 0, "flush_total_time_ns": 0, "wr_highest_offset": 0, "wr_total_time_ns": 0, @@ -92,6 +99,7 @@ Testing: -drive driver=null-co,if=none "timed_stats": [ ], "failed_unmap_operations": 0, + "zone_append_merged": 0, "failed_flush_operations": 0, "account_invalid": true, "rd_total_time_ns": 0, @@ -104,7 +112,11 @@ Testing: -drive driver=null-co,if=none "unmap_total_time_ns": 0, "invalid_flush_operations": 0, "account_failed": true, + "zone_append_total_time_ns": 0, + "zone_append_operations": 0, "rd_operations": 0, + "zone_append_bytes": 0, + "invalid_zone_append_operations": 0, "invalid_wr_operations": 0, "invalid_rd_operations": 0 }, @@ -177,6 +189,7 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b "stats": { "unmap_operations": 0, "unmap_merged": 0, + "failed_zone_append_operations": 0, "flush_total_time_ns": 0, "wr_highest_offset": 0, "wr_total_time_ns": 0, @@ -187,6 +200,7 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b "timed_stats": [ ], "failed_unmap_operations": 0, + "zone_append_merged": 0, "failed_flush_operations": 0, "account_invalid": true, "rd_total_time_ns": 0, @@ -199,7 +213,11 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b "unmap_total_time_ns": 0, "invalid_flush_operations": 0, "account_failed": true, + "zone_append_total_time_ns": 0, + "zone_append_operations": 0, "rd_operations": 0, + "zone_append_bytes": 0, + "invalid_zone_append_operations": 0, "invalid_wr_operations": 0, "invalid_rd_operations": 0 }, diff --git a/tests/qemu-iotests/tests/zoned b/tests/qemu-iotests/tests/zoned new file mode 100755 index 0000000..3d23ce9 --- /dev/null +++ b/tests/qemu-iotests/tests/zoned @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +# +# Test zone management operations. +# + +seq="$(basename $0)" +echo "QA output created by $seq" +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img + sudo -n rmmod null_blk +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ../common.rc +. ../common.filter +. ../common.qemu + +# This test only runs on Linux hosts with raw image files. +_supported_fmt raw +_supported_proto file +_supported_os Linux + +sudo -n true || \ + _notrun 'Password-less sudo required' + +IMG="--image-opts -n driver=host_device,filename=/dev/nullb0" +QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT + +echo "Testing a null_blk device:" +echo "case 1: if the operations work" +sudo -n modprobe null_blk nr_devices=1 zoned=1 +sudo -n chmod 0666 /dev/nullb0 + +echo "(1) report the first zone:" +$QEMU_IO $IMG -c "zrp 0 1" +echo +echo "report the first 10 zones" +$QEMU_IO $IMG -c "zrp 0 10" +echo +echo "report the last zone:" +$QEMU_IO $IMG -c "zrp 0x3e70000000 2" # 0x3e70000000 / 512 = 0x1f380000 +echo +echo +echo "(2) opening the first zone" +$QEMU_IO $IMG -c "zo 0 268435456" # 268435456 / 512 = 524288 +echo "report after:" +$QEMU_IO $IMG -c "zrp 0 1" +echo +echo "opening the second zone" +$QEMU_IO $IMG -c "zo 268435456 268435456" # +echo "report after:" +$QEMU_IO $IMG -c "zrp 268435456 1" +echo +echo "opening the last zone" +$QEMU_IO $IMG -c "zo 0x3e70000000 268435456" +echo "report after:" +$QEMU_IO $IMG -c "zrp 0x3e70000000 2" +echo +echo +echo "(3) closing the first zone" +$QEMU_IO $IMG -c "zc 0 268435456" +echo "report after:" +$QEMU_IO $IMG -c "zrp 0 1" +echo +echo "closing the last zone" +$QEMU_IO $IMG -c "zc 0x3e70000000 268435456" +echo "report after:" +$QEMU_IO $IMG -c "zrp 0x3e70000000 2" +echo +echo +echo "(4) finishing the second zone" +$QEMU_IO $IMG -c "zf 268435456 268435456" +echo "After finishing a zone:" +$QEMU_IO $IMG -c "zrp 268435456 1" +echo +echo +echo "(5) resetting the second zone" +$QEMU_IO $IMG -c "zrs 268435456 268435456" +echo "After resetting a zone:" +$QEMU_IO $IMG -c "zrp 268435456 1" +echo +echo +echo "(6) append write" # the physical block size of the device is 4096 +$QEMU_IO $IMG -c "zrp 0 1" +$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000" +echo "After appending the first zone firstly:" +$QEMU_IO $IMG -c "zrp 0 1" +$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000" +echo "After appending the first zone secondly:" +$QEMU_IO $IMG -c "zrp 0 1" +$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000" +echo "After appending the second zone firstly:" +$QEMU_IO $IMG -c "zrp 268435456 1" +$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000" +echo "After appending the second zone secondly:" +$QEMU_IO $IMG -c "zrp 268435456 1" + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out new file mode 100644 index 0000000..fe53ba4 --- /dev/null +++ b/tests/qemu-iotests/tests/zoned.out @@ -0,0 +1,69 @@ +QA output created by zoned +Testing a null_blk device: +case 1: if the operations work +(1) report the first zone: +start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2] + +report the first 10 zones +start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2] +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2] +start: 0x100000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:1, [type: 2] +start: 0x180000, len 0x80000, cap 0x80000, wptr 0x180000, zcond:1, [type: 2] +start: 0x200000, len 0x80000, cap 0x80000, wptr 0x200000, zcond:1, [type: 2] +start: 0x280000, len 0x80000, cap 0x80000, wptr 0x280000, zcond:1, [type: 2] +start: 0x300000, len 0x80000, cap 0x80000, wptr 0x300000, zcond:1, [type: 2] +start: 0x380000, len 0x80000, cap 0x80000, wptr 0x380000, zcond:1, [type: 2] +start: 0x400000, len 0x80000, cap 0x80000, wptr 0x400000, zcond:1, [type: 2] +start: 0x480000, len 0x80000, cap 0x80000, wptr 0x480000, zcond:1, [type: 2] + +report the last zone: +start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2] + + +(2) opening the first zone +report after: +start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:3, [type: 2] + +opening the second zone +report after: +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:3, [type: 2] + +opening the last zone +report after: +start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:3, [type: 2] + + +(3) closing the first zone +report after: +start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2] + +closing the last zone +report after: +start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2] + + +(4) finishing the second zone +After finishing a zone: +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2] + + +(5) resetting the second zone +After resetting a zone: +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2] + + +(6) append write +start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2] +After zap done, the append sector is 0x0 +After appending the first zone firstly: +start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2] +After zap done, the append sector is 0x18 +After appending the first zone secondly: +start: 0x0, len 0x80000, cap 0x80000, wptr 0x30, zcond:2, [type: 2] +After zap done, the append sector is 0x80000 +After appending the second zone firstly: +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2] +After zap done, the append sector is 0x80018 +After appending the second zone secondly: +start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80030, zcond:2, [type: 2] +*** done |