diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2021-06-30 21:09:27 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2021-06-30 21:09:27 +0100 |
commit | 1ec2cd0ce2ca94292ce237becc2c21b4eb9edca0 (patch) | |
tree | 2c2d10818a01841b37a2c0be156a4f56c91cc022 /hw/nvme/ctrl.c | |
parent | d940d468e29bff5eb5669c0dd8f3de0c3de17bfb (diff) | |
parent | 176c0a4973d3ca5d46b05d0edb439b154363d29f (diff) | |
download | qemu-1ec2cd0ce2ca94292ce237becc2c21b4eb9edca0.zip qemu-1ec2cd0ce2ca94292ce237becc2c21b4eb9edca0.tar.gz qemu-1ec2cd0ce2ca94292ce237becc2c21b4eb9edca0.tar.bz2 |
Merge remote-tracking branch 'remotes/nvme/tags/nvme-next-pull-request' into staging
hw/nvme patches
* namespace eui64 support (Heinrich)
* aiocb refactoring (Klaus)
* controller parameter for auto zone transitioning (Niklas)
* misc fixes and additions (Gollu, Klaus, Keith)
# gpg: Signature made Tue 29 Jun 2021 19:46:55 BST
# gpg: using RSA key 522833AA75E2DCE6A24766C04DE1AF316D4F0DE9
# gpg: Good signature from "Klaus Jensen <its@irrelevant.dk>" [unknown]
# gpg: aka "Klaus Jensen <k.jensen@samsung.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DDCA 4D9C 9EF9 31CC 3468 4272 63D5 6FC5 E55D A838
# Subkey fingerprint: 5228 33AA 75E2 DCE6 A247 66C0 4DE1 AF31 6D4F 0DE9
* remotes/nvme/tags/nvme-next-pull-request: (23 commits)
hw/nvme: add 'zoned.zasl' to documentation
hw/nvme: fix pin-based interrupt behavior (again)
hw/nvme: fix missing check for PMR capability
hw/nvme: documentation fix
hw/nvme: fix endianess conversion and add controller list
Partially revert "hw/block/nvme: drain namespaces on sq deletion"
hw/nvme: reimplement format nvm to allow cancellation
hw/nvme: reimplement zone reset to allow cancellation
hw/nvme: reimplement the copy command to allow aio cancellation
hw/nvme: add dw0/1 to the req completion trace event
hw/nvme: use prinfo directly in nvme_check_prinfo and nvme_dif_check
hw/nvme: remove assert from nvme_get_zone_by_slba
hw/nvme: save reftag when generating pi
hw/nvme: reimplement dsm to allow cancellation
hw/nvme: add nvme_block_status_all helper
hw/nvme: reimplement flush to allow cancellation
hw/nvme: default for namespace EUI-64
hw/nvme: namespace parameter for EUI-64
hw/nvme: fix csi field for cns 0x00 and 0x11
hw/nvme: add param to control auto zone transitioning to zone state closed
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw/nvme/ctrl.c')
-rw-r--r-- | hw/nvme/ctrl.c | 1951 |
1 files changed, 1125 insertions, 826 deletions
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 0bcaf71..629b0d3 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -34,6 +34,7 @@ * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \ * mdts=<N[optional]>,vsl=<N[optional]>, \ * zoned.zasl=<N[optional]>, \ + * zoned.auto_transition=<on|off[optional]>, \ * subsys=<subsys_id> * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\ * zoned=<true|false[optional]>, \ @@ -100,6 +101,11 @@ * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e. * defaulting to the value of `mdts`). * + * - `zoned.auto_transition` + * Indicates if zones in zone state implicitly opened can be automatically + * transitioned to zone state closed for resource management purposes. + * Defaults to 'on'. + * * nvme namespace device parameters * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * - `shared` @@ -114,7 +120,7 @@ * This parameter is only valid together with the `subsys` parameter. If left * at the default value (`false/off`), the namespace will be attached to all * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the - * namespace will be be available in the subsystem not not attached to any + * namespace will be available in the subsystem but not attached to any * controllers. * * Setting `zoned` to true selects Zoned Command Set at the namespace. @@ -467,7 +473,9 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) return; } else { assert(cq->vector < 32); - n->irq_status &= ~(1 << cq->vector); + if (!n->cq_pending) { + n->irq_status &= ~(1 << cq->vector); + } nvme_irq_check(n); } } @@ -1004,16 +1012,12 @@ static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) { NvmeNamespace *ns = req->ns; NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; - uint16_t ctrl = le16_to_cpu(rw->control); + bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps); + bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT); size_t len = nvme_l2b(ns, nlb); uint16_t status; - if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && - (ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) { - goto out; - } - - if (nvme_ns_ext(ns)) { + if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) { NvmeSg sg; len += nvme_m2b(ns, nlb); @@ -1030,7 +1034,6 @@ static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req) return NVME_SUCCESS; } -out: return nvme_map_dptr(n, &req->sg, len, &req->cmd); } @@ -1189,10 +1192,10 @@ uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len, { NvmeNamespace *ns = req->ns; NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; - uint16_t ctrl = le16_to_cpu(rw->control); + bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps); + bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT); - if (nvme_ns_ext(ns) && - !(ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) { + if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) { return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz, ns->lbaf.ms, 0, dir); } @@ -1252,6 +1255,7 @@ static void nvme_post_cqes(void *opaque) NvmeCQueue *cq = opaque; NvmeCtrl *n = cq->ctrl; NvmeRequest *req, *next; + bool pending = cq->head != cq->tail; int ret; QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) { @@ -1281,6 +1285,10 @@ static void nvme_post_cqes(void *opaque) QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } if (cq->tail != cq->head) { + if (cq->irq_enabled && !pending) { + n->cq_pending++; + } + nvme_irq_assert(n, cq); } } @@ -1289,6 +1297,8 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) { assert(cq->cqid == req->sq->cqid); trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid, + le32_to_cpu(req->cqe.result), + le32_to_cpu(req->cqe.dw1), req->status); if (req->status) { @@ -1432,18 +1442,15 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, return NVME_SUCCESS; } -static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, - uint32_t nlb) +static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb, int flags) { BlockDriverState *bs = blk_bs(ns->blkconf.blk); int64_t pnum = 0, bytes = nvme_l2b(ns, nlb); int64_t offset = nvme_l2b(ns, slba); - bool zeroed; int ret; - Error *local_err = NULL; - /* * `pnum` holds the number of bytes after offset that shares the same * allocation status as the byte at offset. If `pnum` is different from @@ -1455,23 +1462,41 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL); if (ret < 0) { - error_setg_errno(&local_err, -ret, "unable to get block status"); - error_report_err(local_err); - - return NVME_INTERNAL_DEV_ERROR; + return ret; } - zeroed = !!(ret & BDRV_BLOCK_ZERO); - trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed); + trace_pci_nvme_block_status(offset, bytes, pnum, ret, + !!(ret & BDRV_BLOCK_ZERO)); - if (zeroed) { - return NVME_DULB; + if (!(ret & flags)) { + return 1; } offset += pnum; } while (pnum != bytes); + return 0; +} + +static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) +{ + int ret; + Error *err = NULL; + + ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA); + if (ret) { + if (ret < 0) { + error_setg_errno(&err, -ret, "unable to get block status"); + error_report_err(err); + + return NVME_INTERNAL_DEV_ERROR; + } + + return NVME_DULB; + } + return NVME_SUCCESS; } @@ -1521,7 +1546,10 @@ static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba) { uint32_t zone_idx = nvme_zone_idx(ns, slba); - assert(zone_idx < ns->num_zones); + if (zone_idx >= ns->num_zones) { + return NULL; + } + return &ns->zone_array[zone_idx]; } @@ -1598,11 +1626,16 @@ static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, uint32_t nlb) { - NvmeZone *zone = nvme_get_zone_by_slba(ns, slba); - uint64_t bndry = nvme_zone_rd_boundary(ns, zone); - uint64_t end = slba + nlb; + NvmeZone *zone; + uint64_t bndry, end; uint16_t status; + zone = nvme_get_zone_by_slba(ns, slba); + assert(zone); + + bndry = nvme_zone_rd_boundary(ns, zone); + end = slba + nlb; + status = nvme_check_zone_state_for_read(zone); if (status) { ; @@ -1665,6 +1698,29 @@ static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone) } } +static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone) +{ + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fallthrough */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fallthrough */ + case NVME_ZONE_STATE_FULL: + zone->w_ptr = zone->d.zslba; + zone->d.wp = zone->w_ptr; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); + /* fallthrough */ + case NVME_ZONE_STATE_EMPTY: + return NVME_SUCCESS; + + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns) { NvmeZone *zone; @@ -1686,8 +1742,8 @@ enum { NVME_ZRM_AUTO = 1 << 0, }; -static uint16_t nvme_zrm_open_flags(NvmeNamespace *ns, NvmeZone *zone, - int flags) +static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns, + NvmeZone *zone, int flags) { int act = 0; uint16_t status; @@ -1699,7 +1755,9 @@ static uint16_t nvme_zrm_open_flags(NvmeNamespace *ns, NvmeZone *zone, /* fallthrough */ case NVME_ZONE_STATE_CLOSED: - nvme_zrm_auto_transition_zone(ns); + if (n->params.auto_transition_zones) { + nvme_zrm_auto_transition_zone(ns); + } status = nvme_aor_check(ns, act, 1); if (status) { return status; @@ -1735,14 +1793,16 @@ static uint16_t nvme_zrm_open_flags(NvmeNamespace *ns, NvmeZone *zone, } } -static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone) +static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns, + NvmeZone *zone) { - return nvme_zrm_open_flags(ns, zone, NVME_ZRM_AUTO); + return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO); } -static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone) +static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns, + NvmeZone *zone) { - return nvme_zrm_open_flags(ns, zone, 0); + return nvme_zrm_open_flags(n, ns, zone, 0); } static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, @@ -1765,6 +1825,7 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) slba = le64_to_cpu(rw->slba); nlb = le16_to_cpu(rw->nlb) + 1; zone = nvme_get_zone_by_slba(ns, slba); + assert(zone); nvme_advance_zone_wp(ns, zone, nlb); } @@ -1778,22 +1839,19 @@ static inline bool nvme_is_write(NvmeRequest *req) rw->opcode == NVME_CMD_WRITE_ZEROES; } +static AioContext *nvme_get_aio_context(BlockAIOCB *acb) +{ + return qemu_get_aio_context(); +} + static void nvme_misc_cb(void *opaque, int ret) { NvmeRequest *req = opaque; - NvmeNamespace *ns = req->ns; - BlockBackend *blk = ns->blkconf.blk; - BlockAcctCookie *acct = &req->acct; - BlockAcctStats *stats = blk_get_stats(blk); - - trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk)); + trace_pci_nvme_misc_cb(nvme_cid(req)); if (ret) { - block_acct_failed(stats, acct); nvme_aio_err(req, ret); - } else { - block_acct_done(stats, acct); } nvme_enqueue_req_completion(nvme_cq(req), req); @@ -1873,77 +1931,6 @@ out: nvme_rw_complete_cb(req, ret); } -struct nvme_aio_format_ctx { - NvmeRequest *req; - NvmeNamespace *ns; - - /* number of outstanding write zeroes for this namespace */ - int *count; -}; - -static void nvme_aio_format_cb(void *opaque, int ret) -{ - struct nvme_aio_format_ctx *ctx = opaque; - NvmeRequest *req = ctx->req; - NvmeNamespace *ns = ctx->ns; - uintptr_t *num_formats = (uintptr_t *)&req->opaque; - int *count = ctx->count; - - g_free(ctx); - - if (ret) { - nvme_aio_err(req, ret); - } - - if (--(*count)) { - return; - } - - g_free(count); - ns->status = 0x0; - - if (--(*num_formats)) { - return; - } - - nvme_enqueue_req_completion(nvme_cq(req), req); -} - -struct nvme_aio_flush_ctx { - NvmeRequest *req; - NvmeNamespace *ns; - BlockAcctCookie acct; -}; - -static void nvme_aio_flush_cb(void *opaque, int ret) -{ - struct nvme_aio_flush_ctx *ctx = opaque; - NvmeRequest *req = ctx->req; - uintptr_t *num_flushes = (uintptr_t *)&req->opaque; - - BlockBackend *blk = ctx->ns->blkconf.blk; - BlockAcctCookie *acct = &ctx->acct; - BlockAcctStats *stats = blk_get_stats(blk); - - trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk)); - - if (!ret) { - block_acct_done(stats, acct); - } else { - block_acct_failed(stats, acct); - nvme_aio_err(req, ret); - } - - (*num_flushes)--; - g_free(ctx); - - if (*num_flushes) { - return; - } - - nvme_enqueue_req_completion(nvme_cq(req), req); -} - static void nvme_verify_cb(void *opaque, int ret) { NvmeBounceContext *ctx = opaque; @@ -1954,14 +1941,13 @@ static void nvme_verify_cb(void *opaque, int ret) BlockAcctStats *stats = blk_get_stats(blk); NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; uint64_t slba = le64_to_cpu(rw->slba); - uint16_t ctrl = le16_to_cpu(rw->control); + uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); uint16_t apptag = le16_to_cpu(rw->apptag); uint16_t appmask = le16_to_cpu(rw->appmask); uint32_t reftag = le32_to_cpu(rw->reftag); uint16_t status; - trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag, - appmask, reftag); + trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag); if (ret) { block_acct_failed(stats, acct); @@ -1981,7 +1967,7 @@ static void nvme_verify_cb(void *opaque, int ret) req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, ctx->mdata.bounce, ctx->mdata.iov.size, - ctrl, slba, apptag, appmask, reftag); + prinfo, slba, apptag, appmask, &reftag); } out: @@ -2028,326 +2014,6 @@ out: nvme_verify_cb(ctx, ret); } -static void nvme_aio_discard_cb(void *opaque, int ret) -{ - NvmeRequest *req = opaque; - uintptr_t *discards = (uintptr_t *)&req->opaque; - - trace_pci_nvme_aio_discard_cb(nvme_cid(req)); - - if (ret) { - nvme_aio_err(req, ret); - } - - (*discards)--; - - if (*discards) { - return; - } - - nvme_enqueue_req_completion(nvme_cq(req), req); -} - -struct nvme_zone_reset_ctx { - NvmeRequest *req; - NvmeZone *zone; -}; - -static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret) -{ - struct nvme_zone_reset_ctx *ctx = opaque; - NvmeRequest *req = ctx->req; - NvmeNamespace *ns = req->ns; - NvmeZone *zone = ctx->zone; - uintptr_t *resets = (uintptr_t *)&req->opaque; - - if (ret) { - nvme_aio_err(req, ret); - goto out; - } - - switch (nvme_get_zone_state(zone)) { - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_aor_dec_open(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_dec_active(ns); - /* fall through */ - case NVME_ZONE_STATE_FULL: - zone->w_ptr = zone->d.zslba; - zone->d.wp = zone->w_ptr; - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); - /* fall through */ - default: - break; - } - -out: - g_free(ctx); - - (*resets)--; - - if (*resets) { - return; - } - - nvme_enqueue_req_completion(nvme_cq(req), req); -} - -static void nvme_aio_zone_reset_cb(void *opaque, int ret) -{ - struct nvme_zone_reset_ctx *ctx = opaque; - NvmeRequest *req = ctx->req; - NvmeNamespace *ns = req->ns; - NvmeZone *zone = ctx->zone; - - trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba); - - if (ret) { - goto out; - } - - if (ns->lbaf.ms) { - int64_t offset = nvme_moff(ns, zone->d.zslba); - - blk_aio_pwrite_zeroes(ns->blkconf.blk, offset, - nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, - nvme_aio_zone_reset_complete_cb, ctx); - return; - } - -out: - nvme_aio_zone_reset_complete_cb(opaque, ret); -} - -struct nvme_copy_ctx { - int copies; - uint8_t *bounce; - uint8_t *mbounce; - uint32_t nlb; - NvmeCopySourceRange *ranges; -}; - -struct nvme_copy_in_ctx { - NvmeRequest *req; - QEMUIOVector iov; - NvmeCopySourceRange *range; -}; - -static void nvme_copy_complete_cb(void *opaque, int ret) -{ - NvmeRequest *req = opaque; - NvmeNamespace *ns = req->ns; - struct nvme_copy_ctx *ctx = req->opaque; - - if (ret) { - block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); - nvme_aio_err(req, ret); - goto out; - } - - block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); - -out: - if (ns->params.zoned) { - NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; - uint64_t sdlba = le64_to_cpu(copy->sdlba); - NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); - - nvme_advance_zone_wp(ns, zone, ctx->nlb); - } - - g_free(ctx->bounce); - g_free(ctx->mbounce); - g_free(ctx); - - nvme_enqueue_req_completion(nvme_cq(req), req); -} - -static void nvme_copy_cb(void *opaque, int ret) -{ - NvmeRequest *req = opaque; - NvmeNamespace *ns = req->ns; - struct nvme_copy_ctx *ctx = req->opaque; - - trace_pci_nvme_copy_cb(nvme_cid(req)); - - if (ret) { - goto out; - } - - if (ns->lbaf.ms) { - NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; - uint64_t sdlba = le64_to_cpu(copy->sdlba); - int64_t offset = nvme_moff(ns, sdlba); - - qemu_iovec_reset(&req->sg.iov); - qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb)); - - req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0, - nvme_copy_complete_cb, req); - return; - } - -out: - nvme_copy_complete_cb(opaque, ret); -} - -static void nvme_copy_in_complete(NvmeRequest *req) -{ - NvmeNamespace *ns = req->ns; - NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; - struct nvme_copy_ctx *ctx = req->opaque; - uint64_t sdlba = le64_to_cpu(copy->sdlba); - uint16_t status; - - trace_pci_nvme_copy_in_complete(nvme_cid(req)); - - block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); - - if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { - uint16_t prinfor = (copy->control[0] >> 4) & 0xf; - uint16_t prinfow = (copy->control[2] >> 2) & 0xf; - uint16_t nr = copy->nr + 1; - NvmeCopySourceRange *range; - uint64_t slba; - uint32_t nlb; - uint16_t apptag, appmask; - uint32_t reftag; - uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce; - size_t len, mlen; - int i; - - /* - * The dif helpers expects prinfo to be similar to the control field of - * the NvmeRwCmd, so shift by 10 to fake it. - */ - prinfor = prinfor << 10; - prinfow = prinfow << 10; - - for (i = 0; i < nr; i++) { - range = &ctx->ranges[i]; - slba = le64_to_cpu(range->slba); - nlb = le16_to_cpu(range->nlb) + 1; - len = nvme_l2b(ns, nlb); - mlen = nvme_m2b(ns, nlb); - apptag = le16_to_cpu(range->apptag); - appmask = le16_to_cpu(range->appmask); - reftag = le32_to_cpu(range->reftag); - - status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba, - apptag, appmask, reftag); - if (status) { - goto invalid; - } - - buf += len; - mbuf += mlen; - } - - apptag = le16_to_cpu(copy->apptag); - appmask = le16_to_cpu(copy->appmask); - reftag = le32_to_cpu(copy->reftag); - - if (prinfow & NVME_RW_PRINFO_PRACT) { - size_t len = nvme_l2b(ns, ctx->nlb); - size_t mlen = nvme_m2b(ns, ctx->nlb); - - status = nvme_check_prinfo(ns, prinfow, sdlba, reftag); - if (status) { - goto invalid; - } - - nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce, - mlen, apptag, reftag); - } else { - status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen, - prinfow, sdlba, apptag, appmask, reftag); - if (status) { - goto invalid; - } - } - } - - status = nvme_check_bounds(ns, sdlba, ctx->nlb); - if (status) { - goto invalid; - } - - if (ns->params.zoned) { - NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); - - status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb); - if (status) { - goto invalid; - } - - status = nvme_zrm_auto(ns, zone); - if (status) { - goto invalid; - } - - zone->w_ptr += ctx->nlb; - } - - qemu_iovec_init(&req->sg.iov, 1); - qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb)); - - block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_WRITE); - - req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba), - &req->sg.iov, 0, nvme_copy_cb, req); - - return; - -invalid: - req->status = status; - - g_free(ctx->bounce); - g_free(ctx); - - nvme_enqueue_req_completion(nvme_cq(req), req); -} - -static void nvme_aio_copy_in_cb(void *opaque, int ret) -{ - struct nvme_copy_in_ctx *in_ctx = opaque; - NvmeRequest *req = in_ctx->req; - NvmeNamespace *ns = req->ns; - struct nvme_copy_ctx *ctx = req->opaque; - - qemu_iovec_destroy(&in_ctx->iov); - g_free(in_ctx); - - trace_pci_nvme_aio_copy_in_cb(nvme_cid(req)); - - if (ret) { - nvme_aio_err(req, ret); - } - - ctx->copies--; - - if (ctx->copies) { - return; - } - - if (req->status) { - block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); - - g_free(ctx->bounce); - g_free(ctx->mbounce); - g_free(ctx); - - nvme_enqueue_req_completion(nvme_cq(req), req); - - return; - } - - nvme_copy_in_complete(req); -} - struct nvme_compare_ctx { struct { QEMUIOVector iov; @@ -2366,7 +2032,7 @@ static void nvme_compare_mdata_cb(void *opaque, int ret) NvmeNamespace *ns = req->ns; NvmeCtrl *n = nvme_ctrl(req); NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; - uint16_t ctrl = le16_to_cpu(rw->control); + uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); uint16_t apptag = le16_to_cpu(rw->apptag); uint16_t appmask = le16_to_cpu(rw->appmask); uint32_t reftag = le32_to_cpu(rw->reftag); @@ -2402,8 +2068,8 @@ static void nvme_compare_mdata_cb(void *opaque, int ret) int16_t pil = 0; status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size, - ctx->mdata.bounce, ctx->mdata.iov.size, ctrl, - slba, apptag, appmask, reftag); + ctx->mdata.bounce, ctx->mdata.iov.size, prinfo, + slba, apptag, appmask, &reftag); if (status) { req->status = status; goto out; @@ -2508,75 +2174,182 @@ out: nvme_enqueue_req_completion(nvme_cq(req), req); } -static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) +typedef struct NvmeDSMAIOCB { + BlockAIOCB common; + BlockAIOCB *aiocb; + NvmeRequest *req; + QEMUBH *bh; + int ret; + + NvmeDsmRange *range; + unsigned int nr; + unsigned int idx; +} NvmeDSMAIOCB; + +static void nvme_dsm_cancel(BlockAIOCB *aiocb) { + NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common); + + /* break nvme_dsm_cb loop */ + iocb->idx = iocb->nr; + iocb->ret = -ECANCELED; + + if (iocb->aiocb) { + blk_aio_cancel_async(iocb->aiocb); + iocb->aiocb = NULL; + } else { + /* + * We only reach this if nvme_dsm_cancel() has already been called or + * the command ran to completion and nvme_dsm_bh is scheduled to run. + */ + assert(iocb->idx == iocb->nr); + } +} + +static const AIOCBInfo nvme_dsm_aiocb_info = { + .aiocb_size = sizeof(NvmeDSMAIOCB), + .cancel_async = nvme_dsm_cancel, +}; + +static void nvme_dsm_bh(void *opaque) +{ + NvmeDSMAIOCB *iocb = opaque; + + iocb->common.cb(iocb->common.opaque, iocb->ret); + + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; + qemu_aio_unref(iocb); +} + +static void nvme_dsm_cb(void *opaque, int ret); + +static void nvme_dsm_md_cb(void *opaque, int ret) +{ + NvmeDSMAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; NvmeNamespace *ns = req->ns; - NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; + NvmeDsmRange *range; + uint64_t slba; + uint32_t nlb; - uint32_t attr = le32_to_cpu(dsm->attributes); - uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; + if (ret < 0) { + iocb->ret = ret; + goto done; + } - uint16_t status = NVME_SUCCESS; + if (!ns->lbaf.ms) { + nvme_dsm_cb(iocb, 0); + return; + } - trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr); + range = &iocb->range[iocb->idx - 1]; + slba = le64_to_cpu(range->slba); + nlb = le32_to_cpu(range->nlb); - if (attr & NVME_DSMGMT_AD) { - int64_t offset; - size_t len; - NvmeDsmRange range[nr]; - uintptr_t *discards = (uintptr_t *)&req->opaque; + /* + * Check that all block were discarded (zeroed); otherwise we do not zero + * the metadata. + */ - status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req); - if (status) { - return status; + ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO); + if (ret) { + if (ret < 0) { + iocb->ret = ret; + goto done; } - /* - * AIO callbacks may be called immediately, so initialize discards to 1 - * to make sure the the callback does not complete the request before - * all discards have been issued. - */ - *discards = 1; + nvme_dsm_cb(iocb, 0); + } - for (int i = 0; i < nr; i++) { - uint64_t slba = le64_to_cpu(range[i].slba); - uint32_t nlb = le32_to_cpu(range[i].nlb); + iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba), + nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP, + nvme_dsm_cb, iocb); + return; - if (nvme_check_bounds(ns, slba, nlb)) { - continue; - } +done: + iocb->aiocb = NULL; + qemu_bh_schedule(iocb->bh); +} - trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba, - nlb); +static void nvme_dsm_cb(void *opaque, int ret) +{ + NvmeDSMAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeCtrl *n = nvme_ctrl(req); + NvmeNamespace *ns = req->ns; + NvmeDsmRange *range; + uint64_t slba; + uint32_t nlb; - if (nlb > n->dmrsl) { - trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl); - } + if (ret < 0) { + iocb->ret = ret; + goto done; + } - offset = nvme_l2b(ns, slba); - len = nvme_l2b(ns, nlb); +next: + if (iocb->idx == iocb->nr) { + goto done; + } - while (len) { - size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); + range = &iocb->range[iocb->idx++]; + slba = le64_to_cpu(range->slba); + nlb = le32_to_cpu(range->nlb); - (*discards)++; + trace_pci_nvme_dsm_deallocate(slba, nlb); - blk_aio_pdiscard(ns->blkconf.blk, offset, bytes, - nvme_aio_discard_cb, req); + if (nlb > n->dmrsl) { + trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl); + goto next; + } - offset += bytes; - len -= bytes; - } - } + if (nvme_check_bounds(ns, slba, nlb)) { + trace_pci_nvme_err_invalid_lba_range(slba, nlb, + ns->id_ns.nsze); + goto next; + } - /* account for the 1-initialization */ - (*discards)--; + iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba), + nvme_l2b(ns, nlb), + nvme_dsm_md_cb, iocb); + return; - if (*discards) { - status = NVME_NO_COMPLETE; - } else { - status = req->status; +done: + iocb->aiocb = NULL; + qemu_bh_schedule(iocb->bh); +} + +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns = req->ns; + NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; + uint32_t attr = le32_to_cpu(dsm->attributes); + uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; + uint16_t status = NVME_SUCCESS; + + trace_pci_nvme_dsm(nr, attr); + + if (attr & NVME_DSMGMT_AD) { + NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk, + nvme_misc_cb, req); + + iocb->req = req; + iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb); + iocb->ret = 0; + iocb->range = g_new(NvmeDsmRange, nr); + iocb->nr = nr; + iocb->idx = 0; + + status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr, + req); + if (status) { + return status; } + + req->aiocb = &iocb->common; + nvme_dsm_cb(iocb, 0); + + return NVME_NO_COMPLETE; } return status; @@ -2591,7 +2364,7 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) uint32_t nlb = le16_to_cpu(rw->nlb) + 1; size_t len = nvme_l2b(ns, nlb); int64_t offset = nvme_l2b(ns, slba); - uint16_t ctrl = le16_to_cpu(rw->control); + uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); uint32_t reftag = le32_to_cpu(rw->reftag); NvmeBounceContext *ctx = NULL; uint16_t status; @@ -2599,12 +2372,12 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb); if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { - status = nvme_check_prinfo(ns, ctrl, slba, reftag); + status = nvme_check_prinfo(ns, prinfo, slba, reftag); if (status) { return status; } - if (ctrl & NVME_RW_PRINFO_PRACT) { + if (prinfo & NVME_PRINFO_PRACT) { return NVME_INVALID_PROT_INFO | NVME_DNR; } } @@ -2641,158 +2414,433 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } -static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) +typedef struct NvmeCopyAIOCB { + BlockAIOCB common; + BlockAIOCB *aiocb; + NvmeRequest *req; + QEMUBH *bh; + int ret; + + NvmeCopySourceRange *ranges; + int nr; + int idx; + + uint8_t *bounce; + QEMUIOVector iov; + struct { + BlockAcctCookie read; + BlockAcctCookie write; + } acct; + + uint32_t reftag; + uint64_t slba; + + NvmeZone *zone; +} NvmeCopyAIOCB; + +static void nvme_copy_cancel(BlockAIOCB *aiocb) +{ + NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common); + + iocb->ret = -ECANCELED; + + if (iocb->aiocb) { + blk_aio_cancel_async(iocb->aiocb); + iocb->aiocb = NULL; + } +} + +static const AIOCBInfo nvme_copy_aiocb_info = { + .aiocb_size = sizeof(NvmeCopyAIOCB), + .cancel_async = nvme_copy_cancel, +}; + +static void nvme_copy_bh(void *opaque) { + NvmeCopyAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; NvmeNamespace *ns = req->ns; - NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk); - uint16_t nr = copy->nr + 1; - uint8_t format = copy->control[0] & 0xf; + if (iocb->idx != iocb->nr) { + req->cqe.result = cpu_to_le32(iocb->idx); + } - /* - * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the - * NVME_RW_PRINFO constants. - */ - uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10; - uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10; + qemu_iovec_destroy(&iocb->iov); + g_free(iocb->bounce); - uint32_t nlb = 0; - uint8_t *bounce = NULL, *bouncep = NULL; - uint8_t *mbounce = NULL, *mbouncep = NULL; - struct nvme_copy_ctx *ctx; - uint16_t status; - int i; + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; - trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); + if (iocb->ret < 0) { + block_acct_failed(stats, &iocb->acct.read); + block_acct_failed(stats, &iocb->acct.write); + } else { + block_acct_done(stats, &iocb->acct.read); + block_acct_done(stats, &iocb->acct.write); + } - if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && - ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) { - return NVME_INVALID_FIELD | NVME_DNR; + iocb->common.cb(iocb->common.opaque, iocb->ret); + qemu_aio_unref(iocb); +} + +static void nvme_copy_cb(void *opaque, int ret); + +static void nvme_copy_out_completed_cb(void *opaque, int ret) +{ + NvmeCopyAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + NvmeCopySourceRange *range = &iocb->ranges[iocb->idx]; + uint32_t nlb = le32_to_cpu(range->nlb) + 1; + + if (ret < 0) { + iocb->ret = ret; + goto out; + } else if (iocb->ret < 0) { + goto out; } - if (!(n->id_ctrl.ocfs & (1 << format))) { - trace_pci_nvme_err_copy_invalid_format(format); - return NVME_INVALID_FIELD | NVME_DNR; + if (ns->params.zoned) { + nvme_advance_zone_wp(ns, iocb->zone, nlb); } - if (nr > ns->id_ns.msrc + 1) { - return NVME_CMD_SIZE_LIMIT | NVME_DNR; + iocb->idx++; + iocb->slba += nlb; +out: + nvme_copy_cb(iocb, iocb->ret); +} + +static void nvme_copy_out_cb(void *opaque, int ret) +{ + NvmeCopyAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + NvmeCopySourceRange *range; + uint32_t nlb; + size_t mlen; + uint8_t *mbounce; + + if (ret < 0) { + iocb->ret = ret; + goto out; + } else if (iocb->ret < 0) { + goto out; } - ctx = g_new(struct nvme_copy_ctx, 1); - ctx->ranges = g_new(NvmeCopySourceRange, nr); + if (!ns->lbaf.ms) { + nvme_copy_out_completed_cb(iocb, 0); + return; + } - status = nvme_h2c(n, (uint8_t *)ctx->ranges, - nr * sizeof(NvmeCopySourceRange), req); - if (status) { + range = &iocb->ranges[iocb->idx]; + nlb = le32_to_cpu(range->nlb) + 1; + + mlen = nvme_m2b(ns, nlb); + mbounce = iocb->bounce + nvme_l2b(ns, nlb); + + qemu_iovec_reset(&iocb->iov); + qemu_iovec_add(&iocb->iov, mbounce, mlen); + + iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba), + &iocb->iov, 0, nvme_copy_out_completed_cb, + iocb); + + return; + +out: + nvme_copy_cb(iocb, ret); +} + +static void nvme_copy_in_completed_cb(void *opaque, int ret) +{ + NvmeCopyAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + NvmeCopySourceRange *range; + uint32_t nlb; + size_t len; + uint16_t status; + + if (ret < 0) { + iocb->ret = ret; + goto out; + } else if (iocb->ret < 0) { goto out; } - for (i = 0; i < nr; i++) { - uint64_t slba = le64_to_cpu(ctx->ranges[i].slba); - uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1; + range = &iocb->ranges[iocb->idx]; + nlb = le32_to_cpu(range->nlb) + 1; + len = nvme_l2b(ns, nlb); - if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) { - status = NVME_CMD_SIZE_LIMIT | NVME_DNR; - goto out; - } + trace_pci_nvme_copy_out(iocb->slba, nlb); - status = nvme_check_bounds(ns, slba, _nlb); + if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + + uint16_t prinfor = ((copy->control[0] >> 4) & 0xf); + uint16_t prinfow = ((copy->control[2] >> 2) & 0xf); + + uint16_t apptag = le16_to_cpu(range->apptag); + uint16_t appmask = le16_to_cpu(range->appmask); + uint32_t reftag = le32_to_cpu(range->reftag); + + uint64_t slba = le64_to_cpu(range->slba); + size_t mlen = nvme_m2b(ns, nlb); + uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb); + + status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor, + slba, apptag, appmask, &reftag); if (status) { - goto out; + goto invalid; } - if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { - status = nvme_check_dulbe(ns, slba, _nlb); + apptag = le16_to_cpu(copy->apptag); + appmask = le16_to_cpu(copy->appmask); + + if (prinfow & NVME_PRINFO_PRACT) { + status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag); if (status) { - goto out; + goto invalid; } - } - if (ns->params.zoned) { - status = nvme_check_zone_read(ns, slba, _nlb); + nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen, + apptag, &iocb->reftag); + } else { + status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, + prinfow, iocb->slba, apptag, appmask, + &iocb->reftag); if (status) { - goto out; + goto invalid; } } + } - nlb += _nlb; + status = nvme_check_bounds(ns, iocb->slba, nlb); + if (status) { + goto invalid; } - if (nlb > le32_to_cpu(ns->id_ns.mcl)) { - status = NVME_CMD_SIZE_LIMIT | NVME_DNR; - goto out; + if (ns->params.zoned) { + status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb); + if (status) { + goto invalid; + } + + iocb->zone->w_ptr += nlb; } - bounce = bouncep = g_malloc(nvme_l2b(ns, nlb)); - if (ns->lbaf.ms) { - mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb)); + qemu_iovec_reset(&iocb->iov); + qemu_iovec_add(&iocb->iov, iocb->bounce, len); + + iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba), + &iocb->iov, 0, nvme_copy_out_cb, iocb); + + return; + +invalid: + req->status = status; + iocb->aiocb = NULL; + if (iocb->bh) { + qemu_bh_schedule(iocb->bh); } - block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_READ); + return; - ctx->bounce = bounce; - ctx->mbounce = mbounce; - ctx->nlb = nlb; - ctx->copies = 1; +out: + nvme_copy_cb(iocb, ret); +} - req->opaque = ctx; +static void nvme_copy_in_cb(void *opaque, int ret) +{ + NvmeCopyAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + NvmeCopySourceRange *range; + uint64_t slba; + uint32_t nlb; - for (i = 0; i < nr; i++) { - uint64_t slba = le64_to_cpu(ctx->ranges[i].slba); - uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1; + if (ret < 0) { + iocb->ret = ret; + goto out; + } else if (iocb->ret < 0) { + goto out; + } - size_t len = nvme_l2b(ns, nlb); - int64_t offset = nvme_l2b(ns, slba); + if (!ns->lbaf.ms) { + nvme_copy_in_completed_cb(iocb, 0); + return; + } - trace_pci_nvme_copy_source_range(slba, nlb); + range = &iocb->ranges[iocb->idx]; + slba = le64_to_cpu(range->slba); + nlb = le32_to_cpu(range->nlb) + 1; - struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1); - in_ctx->req = req; + qemu_iovec_reset(&iocb->iov); + qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb), + nvme_m2b(ns, nlb)); - qemu_iovec_init(&in_ctx->iov, 1); - qemu_iovec_add(&in_ctx->iov, bouncep, len); + iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba), + &iocb->iov, 0, nvme_copy_in_completed_cb, + iocb); + return; - ctx->copies++; +out: + nvme_copy_cb(iocb, iocb->ret); +} - blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, - nvme_aio_copy_in_cb, in_ctx); +static void nvme_copy_cb(void *opaque, int ret) +{ + NvmeCopyAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + NvmeCopySourceRange *range; + uint64_t slba; + uint32_t nlb; + size_t len; + uint16_t status; - bouncep += len; + if (ret < 0) { + iocb->ret = ret; + goto done; + } else if (iocb->ret < 0) { + goto done; + } - if (ns->lbaf.ms) { - len = nvme_m2b(ns, nlb); - offset = nvme_moff(ns, slba); + if (iocb->idx == iocb->nr) { + goto done; + } - in_ctx = g_new(struct nvme_copy_in_ctx, 1); - in_ctx->req = req; + range = &iocb->ranges[iocb->idx]; + slba = le64_to_cpu(range->slba); + nlb = le32_to_cpu(range->nlb) + 1; + len = nvme_l2b(ns, nlb); - qemu_iovec_init(&in_ctx->iov, 1); - qemu_iovec_add(&in_ctx->iov, mbouncep, len); + trace_pci_nvme_copy_source_range(slba, nlb); - ctx->copies++; + if (nlb > le16_to_cpu(ns->id_ns.mssrl)) { + status = NVME_CMD_SIZE_LIMIT | NVME_DNR; + goto invalid; + } - blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, - nvme_aio_copy_in_cb, in_ctx); + status = nvme_check_bounds(ns, slba, nlb); + if (status) { + goto invalid; + } - mbouncep += len; + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, nlb); + if (status) { + goto invalid; } } - /* account for the 1-initialization */ - ctx->copies--; + if (ns->params.zoned) { + status = nvme_check_zone_read(ns, slba, nlb); + if (status) { + goto invalid; + } + } + + qemu_iovec_reset(&iocb->iov); + qemu_iovec_add(&iocb->iov, iocb->bounce, len); - if (!ctx->copies) { - nvme_copy_in_complete(req); + iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba), + &iocb->iov, 0, nvme_copy_in_cb, iocb); + return; + +invalid: + req->status = status; +done: + iocb->aiocb = NULL; + if (iocb->bh) { + qemu_bh_schedule(iocb->bh); } +} - return NVME_NO_COMPLETE; -out: - g_free(ctx->ranges); - g_free(ctx); +static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns = req->ns; + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk, + nvme_misc_cb, req); + uint16_t nr = copy->nr + 1; + uint8_t format = copy->control[0] & 0xf; + uint16_t prinfor = ((copy->control[0] >> 4) & 0xf); + uint16_t prinfow = ((copy->control[2] >> 2) & 0xf); + uint16_t status; + + trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); + + iocb->ranges = NULL; + iocb->zone = NULL; + + if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && + ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) { + status = NVME_INVALID_FIELD | NVME_DNR; + goto invalid; + } + + if (!(n->id_ctrl.ocfs & (1 << format))) { + trace_pci_nvme_err_copy_invalid_format(format); + status = NVME_INVALID_FIELD | NVME_DNR; + goto invalid; + } + + if (nr > ns->id_ns.msrc + 1) { + status = NVME_CMD_SIZE_LIMIT | NVME_DNR; + goto invalid; + } + + iocb->ranges = g_new(NvmeCopySourceRange, nr); + + status = nvme_h2c(n, (uint8_t *)iocb->ranges, + sizeof(NvmeCopySourceRange) * nr, req); + if (status) { + goto invalid; + } + + iocb->slba = le64_to_cpu(copy->sdlba); + + if (ns->params.zoned) { + iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba); + if (!iocb->zone) { + status = NVME_LBA_RANGE | NVME_DNR; + goto invalid; + } + + status = nvme_zrm_auto(n, ns, iocb->zone); + if (status) { + goto invalid; + } + } + + iocb->req = req; + iocb->bh = qemu_bh_new(nvme_copy_bh, iocb); + iocb->ret = 0; + iocb->nr = nr; + iocb->idx = 0; + iocb->reftag = le32_to_cpu(copy->reftag); + iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl), + ns->lbasz + ns->lbaf.ms); + + qemu_iovec_init(&iocb->iov, 1); + + block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0, + BLOCK_ACCT_READ); + block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0, + BLOCK_ACCT_WRITE); + + req->aiocb = &iocb->common; + nvme_copy_cb(iocb, 0); + + return NVME_NO_COMPLETE; + +invalid: + g_free(iocb->ranges); + qemu_aio_unref(iocb); return status; } @@ -2803,7 +2851,7 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) BlockBackend *blk = ns->blkconf.blk; uint64_t slba = le64_to_cpu(rw->slba); uint32_t nlb = le16_to_cpu(rw->nlb) + 1; - uint16_t ctrl = le16_to_cpu(rw->control); + uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); size_t data_len = nvme_l2b(ns, nlb); size_t len = data_len; int64_t offset = nvme_l2b(ns, slba); @@ -2812,7 +2860,7 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb); - if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) { + if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) { return NVME_INVALID_PROT_INFO | NVME_DNR; } @@ -2858,57 +2906,139 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } -static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) -{ - uint32_t nsid = le32_to_cpu(req->cmd.nsid); - uintptr_t *num_flushes = (uintptr_t *)&req->opaque; - uint16_t status; - struct nvme_aio_flush_ctx *ctx; +typedef struct NvmeFlushAIOCB { + BlockAIOCB common; + BlockAIOCB *aiocb; + NvmeRequest *req; + QEMUBH *bh; + int ret; + NvmeNamespace *ns; + uint32_t nsid; + bool broadcast; +} NvmeFlushAIOCB; - trace_pci_nvme_flush(nvme_cid(req), nsid); +static void nvme_flush_cancel(BlockAIOCB *acb) +{ + NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common); - if (nsid != NVME_NSID_BROADCAST) { - req->ns = nvme_ns(n, nsid); - if (unlikely(!req->ns)) { - return NVME_INVALID_FIELD | NVME_DNR; - } + iocb->ret = -ECANCELED; - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_FLUSH); - req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req); - return NVME_NO_COMPLETE; + if (iocb->aiocb) { + blk_aio_cancel_async(iocb->aiocb); } +} - /* 1-initialize; see comment in nvme_dsm */ - *num_flushes = 1; +static const AIOCBInfo nvme_flush_aiocb_info = { + .aiocb_size = sizeof(NvmeFlushAIOCB), + .cancel_async = nvme_flush_cancel, + .get_aio_context = nvme_get_aio_context, +}; - for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) { - ns = nvme_ns(n, i); - if (!ns) { - continue; - } +static void nvme_flush_ns_cb(void *opaque, int ret) +{ + NvmeFlushAIOCB *iocb = opaque; + NvmeNamespace *ns = iocb->ns; - ctx = g_new(struct nvme_aio_flush_ctx, 1); - ctx->req = req; - ctx->ns = ns; + if (ret < 0) { + iocb->ret = ret; + goto out; + } else if (iocb->ret < 0) { + goto out; + } - (*num_flushes)++; + if (ns) { + trace_pci_nvme_flush_ns(iocb->nsid); - block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, - BLOCK_ACCT_FLUSH); - blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); + iocb->ns = NULL; + iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb); + return; } - /* account for the 1-initialization */ - (*num_flushes)--; +out: + iocb->aiocb = NULL; + qemu_bh_schedule(iocb->bh); +} - if (*num_flushes) { - status = NVME_NO_COMPLETE; - } else { - status = req->status; +static void nvme_flush_bh(void *opaque) +{ + NvmeFlushAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeCtrl *n = nvme_ctrl(req); + int i; + + if (iocb->ret < 0) { + goto done; } + if (iocb->broadcast) { + for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) { + iocb->ns = nvme_ns(n, i); + if (iocb->ns) { + iocb->nsid = i; + break; + } + } + } + + if (!iocb->ns) { + goto done; + } + + nvme_flush_ns_cb(iocb, 0); + return; + +done: + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; + + iocb->common.cb(iocb->common.opaque, iocb->ret); + + qemu_aio_unref(iocb); + + return; +} + +static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeFlushAIOCB *iocb; + uint32_t nsid = le32_to_cpu(req->cmd.nsid); + uint16_t status; + + iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req); + + iocb->req = req; + iocb->bh = qemu_bh_new(nvme_flush_bh, iocb); + iocb->ret = 0; + iocb->ns = NULL; + iocb->nsid = 0; + iocb->broadcast = (nsid == NVME_NSID_BROADCAST); + + if (!iocb->broadcast) { + if (!nvme_nsid_valid(n, nsid)) { + status = NVME_INVALID_NSID | NVME_DNR; + goto out; + } + + iocb->ns = nvme_ns(n, nsid); + if (!iocb->ns) { + status = NVME_INVALID_FIELD | NVME_DNR; + goto out; + } + + iocb->nsid = nsid; + } + + req->aiocb = &iocb->common; + qemu_bh_schedule(iocb->bh); + + return NVME_NO_COMPLETE; + +out: + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; + qemu_aio_unref(iocb); + return status; } @@ -2918,7 +3048,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) NvmeNamespace *ns = req->ns; uint64_t slba = le64_to_cpu(rw->slba); uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; - uint16_t ctrl = le16_to_cpu(rw->control); + uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control)); uint64_t data_size = nvme_l2b(ns, nlb); uint64_t mapped_size = data_size; uint64_t data_offset; @@ -2929,7 +3059,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) mapped_size += nvme_m2b(ns, nlb); if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { - bool pract = ctrl & NVME_RW_PRINFO_PRACT; + bool pract = prinfo & NVME_PRINFO_PRACT; if (pract && ns->lbaf.ms == 8) { mapped_size = data_size; @@ -2993,6 +3123,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, uint64_t slba = le64_to_cpu(rw->slba); uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint16_t ctrl = le16_to_cpu(rw->control); + uint8_t prinfo = NVME_RW_PRINFO(ctrl); uint64_t data_size = nvme_l2b(ns, nlb); uint64_t mapped_size = data_size; uint64_t data_offset; @@ -3005,7 +3136,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, mapped_size += nvme_m2b(ns, nlb); if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) { - bool pract = ctrl & NVME_RW_PRINFO_PRACT; + bool pract = prinfo & NVME_PRINFO_PRACT; if (pract && ns->lbaf.ms == 8) { mapped_size -= nvme_m2b(ns, nlb); @@ -3030,6 +3161,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, if (ns->params.zoned) { zone = nvme_get_zone_by_slba(ns, slba); + assert(zone); if (append) { bool piremap = !!(ctrl & NVME_RW_PIREMAP); @@ -3080,7 +3212,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, goto invalid; } - status = nvme_zrm_auto(ns, zone); + status = nvme_zrm_auto(n, ns, zone); if (status) { goto invalid; } @@ -3169,7 +3301,7 @@ enum NvmeZoneProcessingMask { static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - return nvme_zrm_open(ns, zone); + return nvme_zrm_open(nvme_ctrl(req), ns, zone); } static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, @@ -3184,41 +3316,6 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, return nvme_zrm_finish(ns, zone); } -static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, - NvmeZoneState state, NvmeRequest *req) -{ - uintptr_t *resets = (uintptr_t *)&req->opaque; - struct nvme_zone_reset_ctx *ctx; - - switch (state) { - case NVME_ZONE_STATE_EMPTY: - return NVME_SUCCESS; - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - case NVME_ZONE_STATE_CLOSED: - case NVME_ZONE_STATE_FULL: - break; - default: - return NVME_ZONE_INVAL_TRANSITION; - } - - /* - * The zone reset aio callback needs to know the zone that is being reset - * in order to transition the zone on completion. - */ - ctx = g_new(struct nvme_zone_reset_ctx, 1); - ctx->req = req; - ctx->zone = zone; - - (*resets)++; - - blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba), - nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, - nvme_aio_zone_reset_cb, ctx); - - return NVME_NO_COMPLETE; -} - static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { @@ -3347,12 +3444,144 @@ out: return status; } +typedef struct NvmeZoneResetAIOCB { + BlockAIOCB common; + BlockAIOCB *aiocb; + NvmeRequest *req; + QEMUBH *bh; + int ret; + + bool all; + int idx; + NvmeZone *zone; +} NvmeZoneResetAIOCB; + +static void nvme_zone_reset_cancel(BlockAIOCB *aiocb) +{ + NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common); + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + + iocb->idx = ns->num_zones; + + iocb->ret = -ECANCELED; + + if (iocb->aiocb) { + blk_aio_cancel_async(iocb->aiocb); + iocb->aiocb = NULL; + } +} + +static const AIOCBInfo nvme_zone_reset_aiocb_info = { + .aiocb_size = sizeof(NvmeZoneResetAIOCB), + .cancel_async = nvme_zone_reset_cancel, +}; + +static void nvme_zone_reset_bh(void *opaque) +{ + NvmeZoneResetAIOCB *iocb = opaque; + + iocb->common.cb(iocb->common.opaque, iocb->ret); + + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; + qemu_aio_unref(iocb); +} + +static void nvme_zone_reset_cb(void *opaque, int ret); + +static void nvme_zone_reset_epilogue_cb(void *opaque, int ret) +{ + NvmeZoneResetAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + int64_t moff; + int count; + + if (ret < 0) { + nvme_zone_reset_cb(iocb, ret); + return; + } + + if (!ns->lbaf.ms) { + nvme_zone_reset_cb(iocb, 0); + return; + } + + moff = nvme_moff(ns, iocb->zone->d.zslba); + count = nvme_m2b(ns, ns->zone_size); + + iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count, + BDRV_REQ_MAY_UNMAP, + nvme_zone_reset_cb, iocb); + return; +} + +static void nvme_zone_reset_cb(void *opaque, int ret) +{ + NvmeZoneResetAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = req->ns; + + if (ret < 0) { + iocb->ret = ret; + goto done; + } + + if (iocb->zone) { + nvme_zrm_reset(ns, iocb->zone); + + if (!iocb->all) { + goto done; + } + } + + while (iocb->idx < ns->num_zones) { + NvmeZone *zone = &ns->zone_array[iocb->idx++]; + + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EMPTY: + if (!iocb->all) { + goto done; + } + + continue; + + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_FULL: + iocb->zone = zone; + break; + + default: + continue; + } + + trace_pci_nvme_zns_zone_reset(zone->d.zslba); + + iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, + nvme_l2b(ns, zone->d.zslba), + nvme_l2b(ns, ns->zone_size), + BDRV_REQ_MAY_UNMAP, + nvme_zone_reset_epilogue_cb, + iocb); + return; + } + +done: + iocb->aiocb = NULL; + if (iocb->bh) { + qemu_bh_schedule(iocb->bh); + } +} + static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) { NvmeCmd *cmd = (NvmeCmd *)&req->cmd; NvmeNamespace *ns = req->ns; NvmeZone *zone; - uintptr_t *resets; + NvmeZoneResetAIOCB *iocb; uint8_t *zd_ext; uint32_t dw13 = le32_to_cpu(cmd->cdw13); uint64_t slba = 0; @@ -3363,7 +3592,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; action = dw13 & 0xff; - all = dw13 & 0x100; + all = !!(dw13 & 0x100); req->status = NVME_SUCCESS; @@ -3407,21 +3636,22 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) break; case NVME_ZONE_ACTION_RESET: - resets = (uintptr_t *)&req->opaque; - - if (all) { - proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | - NVME_PROC_FULL_ZONES; - } trace_pci_nvme_reset_zone(slba, zone_idx, all); - *resets = 1; + iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk, + nvme_misc_cb, req); - status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req); + iocb->req = req; + iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb); + iocb->ret = 0; + iocb->all = all; + iocb->idx = zone_idx; + iocb->zone = NULL; - (*resets)--; + req->aiocb = &iocb->common; + nvme_zone_reset_cb(iocb, 0); - return *resets ? NVME_NO_COMPLETE : req->status; + return NVME_NO_COMPLETE; case NVME_ZONE_ACTION_OFFLINE: if (all) { @@ -3695,7 +3925,6 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) NvmeSQueue *sq; NvmeCQueue *cq; uint16_t qid = le16_to_cpu(c->qid); - uint32_t nsid; if (unlikely(!qid || nvme_check_sqid(n, qid))) { trace_pci_nvme_err_invalid_del_sq(qid); @@ -3707,22 +3936,8 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) sq = n->sq[qid]; while (!QTAILQ_EMPTY(&sq->out_req_list)) { r = QTAILQ_FIRST(&sq->out_req_list); - if (r->aiocb) { - blk_aio_cancel(r->aiocb); - } - } - - /* - * Drain all namespaces if there are still outstanding requests that we - * could not cancel explicitly. - */ - if (!QTAILQ_EMPTY(&sq->out_req_list)) { - for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { - NvmeNamespace *ns = nvme_ns(n, nsid); - if (ns) { - nvme_ns_drain(ns); - } - } + assert(r->aiocb); + blk_aio_cancel(r->aiocb); } assert(QTAILQ_EMPTY(&sq->out_req_list)); @@ -4089,6 +4304,11 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_err_invalid_del_cq_notempty(qid); return NVME_INVALID_QUEUE_DEL; } + + if (cq->irq_enabled && cq->tail != cq->head) { + n->cq_pending--; + } + nvme_irq_deassert(n, cq); trace_pci_nvme_del_cq(qid); nvme_free_cq(cq, n); @@ -4178,16 +4398,6 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) return nvme_c2h(n, id, sizeof(id), req); } -static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns) -{ - switch (ns->csi) { - case NVME_CSI_NVM: - case NVME_CSI_ZONED: - return true; - } - return false; -} - static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_identify_ctrl(); @@ -4244,16 +4454,18 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) } } - if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { + if (active || ns->csi == NVME_CSI_NVM) { return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); } return NVME_INVALID_CMD_SET | NVME_DNR; } -static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req, + bool attached) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + uint32_t nsid = le32_to_cpu(c->nsid); uint16_t min_id = le16_to_cpu(c->ctrlid); uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; uint16_t *ids = &list[1]; @@ -4261,15 +4473,21 @@ static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) NvmeCtrl *ctrl; int cntlid, nr_ids = 0; - trace_pci_nvme_identify_ns_attached_list(min_id); + trace_pci_nvme_identify_ctrl_list(c->cns, min_id); - if (c->nsid == NVME_NSID_BROADCAST) { + if (!n->subsys) { return NVME_INVALID_FIELD | NVME_DNR; } - ns = nvme_subsys_ns(n->subsys, c->nsid); - if (!ns) { - return NVME_INVALID_FIELD | NVME_DNR; + if (attached) { + if (nsid == NVME_NSID_BROADCAST) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ns = nvme_subsys_ns(n->subsys, nsid); + if (!ns) { + return NVME_INVALID_FIELD | NVME_DNR; + } } for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) { @@ -4278,7 +4496,7 @@ static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) continue; } - if (!nvme_ns(ctrl, c->nsid)) { + if (attached && !nvme_ns(ctrl, nsid)) { continue; } @@ -4291,7 +4509,7 @@ static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) } static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, - bool active) + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -4315,7 +4533,7 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, } } - if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { + if (c->csi == NVME_CSI_NVM) { return nvme_rpt_empty_id_struct(n, req); } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), @@ -4326,7 +4544,7 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, } static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, - bool active) + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -4373,7 +4591,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, } static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req, - bool active) + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -4426,19 +4644,19 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; - - struct data { - struct { - NvmeIdNsDescr hdr; - uint8_t v[NVME_NIDL_UUID]; - } uuid; - struct { - NvmeIdNsDescr hdr; - uint8_t v; - } csi; - }; - - struct data *ns_descrs = (struct data *)list; + uint8_t *pos = list; + struct { + NvmeIdNsDescr hdr; + uint8_t v[NVME_NIDL_UUID]; + } QEMU_PACKED uuid; + struct { + NvmeIdNsDescr hdr; + uint64_t v; + } QEMU_PACKED eui64; + struct { + NvmeIdNsDescr hdr; + uint8_t v; + } QEMU_PACKED csi; trace_pci_nvme_identify_ns_descr_list(nsid); @@ -4452,17 +4670,29 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) } /* - * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data - * structure, a Namespace UUID (nidt = 3h) must be reported in the - * Namespace Identification Descriptor. Add the namespace UUID here. + * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must + * provide a valid Namespace UUID in the Namespace Identification Descriptor + * data structure. QEMU does not yet support setting NGUID. */ - ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID; - ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID; - memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); - - ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI; - ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI; - ns_descrs->csi.v = ns->csi; + uuid.hdr.nidt = NVME_NIDT_UUID; + uuid.hdr.nidl = NVME_NIDL_UUID; + memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); + memcpy(pos, &uuid, sizeof(uuid)); + pos += sizeof(uuid); + + if (ns->params.eui64) { + eui64.hdr.nidt = NVME_NIDT_EUI64; + eui64.hdr.nidl = NVME_NIDL_EUI64; + eui64.v = cpu_to_be64(ns->params.eui64); + memcpy(pos, &eui64, sizeof(eui64)); + pos += sizeof(eui64); + } + + csi.hdr.nidt = NVME_NIDT_CSI; + csi.hdr.nidl = NVME_NIDL_CSI; + csi.v = ns->csi; + memcpy(pos, &csi, sizeof(csi)); + pos += sizeof(csi); return nvme_c2h(n, list, sizeof(list), req); } @@ -4493,7 +4723,9 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) case NVME_ID_CNS_NS_PRESENT: return nvme_identify_ns(n, req, false); case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST: - return nvme_identify_ns_attached_list(n, req); + return nvme_identify_ctrl_list(n, req, true); + case NVME_ID_CNS_CTRL_LIST: + return nvme_identify_ctrl_list(n, req, false); case NVME_ID_CNS_CS_NS: return nvme_identify_ns_csi(n, req, true); case NVME_ID_CNS_CS_NS_PRESENT: @@ -5011,138 +5243,195 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) return NVME_SUCCESS; } -static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf, - uint8_t mset, uint8_t pi, uint8_t pil, - NvmeRequest *req) -{ - int64_t len, offset; - struct nvme_aio_format_ctx *ctx; - BlockBackend *blk = ns->blkconf.blk; - uint16_t ms; - uintptr_t *num_formats = (uintptr_t *)&req->opaque; - int *count; - - if (ns->params.zoned) { - return NVME_INVALID_FORMAT | NVME_DNR; - } +typedef struct NvmeFormatAIOCB { + BlockAIOCB common; + BlockAIOCB *aiocb; + QEMUBH *bh; + NvmeRequest *req; + int ret; - trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil); + NvmeNamespace *ns; + uint32_t nsid; + bool broadcast; + int64_t offset; +} NvmeFormatAIOCB; - if (lbaf > ns->id_ns.nlbaf) { - return NVME_INVALID_FORMAT | NVME_DNR; - } +static void nvme_format_bh(void *opaque); - ms = ns->id_ns.lbaf[lbaf].ms; +static void nvme_format_cancel(BlockAIOCB *aiocb) +{ + NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common); - if (pi && (ms < sizeof(NvmeDifTuple))) { - return NVME_INVALID_FORMAT | NVME_DNR; + if (iocb->aiocb) { + blk_aio_cancel_async(iocb->aiocb); } +} - if (pi && pi > NVME_ID_NS_DPS_TYPE_3) { - return NVME_INVALID_FIELD | NVME_DNR; - } +static const AIOCBInfo nvme_format_aiocb_info = { + .aiocb_size = sizeof(NvmeFormatAIOCB), + .cancel_async = nvme_format_cancel, + .get_aio_context = nvme_get_aio_context, +}; + +static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd) +{ + uint32_t dw10 = le32_to_cpu(cmd->cdw10); + uint8_t lbaf = dw10 & 0xf; + uint8_t pi = (dw10 >> 5) & 0x7; + uint8_t mset = (dw10 >> 4) & 0x1; + uint8_t pil = (dw10 >> 8) & 0x1; - nvme_ns_drain(ns); - nvme_ns_shutdown(ns); - nvme_ns_cleanup(ns); + trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil); ns->id_ns.dps = (pil << 3) | pi; ns->id_ns.flbas = lbaf | (mset << 4); nvme_ns_init_format(ns); +} - ns->status = NVME_FORMAT_IN_PROGRESS; +static void nvme_format_ns_cb(void *opaque, int ret) +{ + NvmeFormatAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeNamespace *ns = iocb->ns; + int bytes; - len = ns->size; - offset = 0; + if (ret < 0) { + iocb->ret = ret; + goto done; + } - count = g_new(int, 1); - *count = 1; + assert(ns); - (*num_formats)++; + if (iocb->offset < ns->size) { + bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset); - while (len) { - ctx = g_new(struct nvme_aio_format_ctx, 1); - ctx->req = req; - ctx->ns = ns; - ctx->count = count; + iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset, + bytes, BDRV_REQ_MAY_UNMAP, + nvme_format_ns_cb, iocb); - size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); + iocb->offset += bytes; + return; + } - (*count)++; + nvme_format_set(ns, &req->cmd); + ns->status = 0x0; + iocb->ns = NULL; + iocb->offset = 0; - blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP, - nvme_aio_format_cb, ctx); +done: + iocb->aiocb = NULL; + qemu_bh_schedule(iocb->bh); +} - offset += bytes; - len -= bytes; +static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi) +{ + if (ns->params.zoned) { + return NVME_INVALID_FORMAT | NVME_DNR; + } + if (lbaf > ns->id_ns.nlbaf) { + return NVME_INVALID_FORMAT | NVME_DNR; } - if (--(*count)) { - return NVME_NO_COMPLETE; + if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) { + return NVME_INVALID_FORMAT | NVME_DNR; } - g_free(count); - ns->status = 0x0; - (*num_formats)--; + if (pi && pi > NVME_ID_NS_DPS_TYPE_3) { + return NVME_INVALID_FIELD | NVME_DNR; + } return NVME_SUCCESS; } -static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req) +static void nvme_format_bh(void *opaque) { - NvmeNamespace *ns; + NvmeFormatAIOCB *iocb = opaque; + NvmeRequest *req = iocb->req; + NvmeCtrl *n = nvme_ctrl(req); uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); - uint32_t nsid = le32_to_cpu(req->cmd.nsid); uint8_t lbaf = dw10 & 0xf; - uint8_t mset = (dw10 >> 4) & 0x1; uint8_t pi = (dw10 >> 5) & 0x7; - uint8_t pil = (dw10 >> 8) & 0x1; - uintptr_t *num_formats = (uintptr_t *)&req->opaque; uint16_t status; int i; - trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil); - - /* 1-initialize; see the comment in nvme_dsm */ - *num_formats = 1; + if (iocb->ret < 0) { + goto done; + } - if (nsid != NVME_NSID_BROADCAST) { - if (!nvme_nsid_valid(n, nsid)) { - return NVME_INVALID_NSID | NVME_DNR; + if (iocb->broadcast) { + for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) { + iocb->ns = nvme_ns(n, i); + if (iocb->ns) { + iocb->nsid = i; + break; + } } + } - ns = nvme_ns(n, nsid); - if (!ns) { - return NVME_INVALID_FIELD | NVME_DNR; - } + if (!iocb->ns) { + goto done; + } - status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req); - if (status && status != NVME_NO_COMPLETE) { - req->status = status; + status = nvme_format_check(iocb->ns, lbaf, pi); + if (status) { + req->status = status; + goto done; + } + + iocb->ns->status = NVME_FORMAT_IN_PROGRESS; + nvme_format_ns_cb(iocb, 0); + return; + +done: + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; + + iocb->common.cb(iocb->common.opaque, iocb->ret); + + qemu_aio_unref(iocb); +} + +static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeFormatAIOCB *iocb; + uint32_t nsid = le32_to_cpu(req->cmd.nsid); + uint16_t status; + + iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req); + + iocb->req = req; + iocb->bh = qemu_bh_new(nvme_format_bh, iocb); + iocb->ret = 0; + iocb->ns = NULL; + iocb->nsid = 0; + iocb->broadcast = (nsid == NVME_NSID_BROADCAST); + iocb->offset = 0; + + if (!iocb->broadcast) { + if (!nvme_nsid_valid(n, nsid)) { + status = NVME_INVALID_NSID | NVME_DNR; + goto out; } - } else { - for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { - ns = nvme_ns(n, i); - if (!ns) { - continue; - } - status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req); - if (status && status != NVME_NO_COMPLETE) { - req->status = status; - break; - } + iocb->ns = nvme_ns(n, nsid); + if (!iocb->ns) { + status = NVME_INVALID_FIELD | NVME_DNR; + goto out; } } - /* account for the 1-initialization */ - if (--(*num_formats)) { - return NVME_NO_COMPLETE; - } + req->aiocb = &iocb->common; + qemu_bh_schedule(iocb->bh); - return req->status; + return NVME_NO_COMPLETE; + +out: + qemu_bh_delete(iocb->bh); + iocb->bh = NULL; + qemu_aio_unref(iocb); + return status; } static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) @@ -5583,6 +5872,10 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, "invalid write to PMRCAP register, ignored"); return; case 0xe04: /* PMRCTL */ + if (!NVME_CAP_PMRS(n->bar.cap)) { + return; + } + n->bar.pmrctl = data; if (NVME_PMRCTL_EN(data)) { memory_region_set_enabled(&n->pmr.dev->mr, true); @@ -5758,6 +6051,10 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) } if (cq->tail == cq->head) { + if (cq->irq_enabled) { + n->cq_pending--; + } + nvme_irq_deassert(n, cq); } } else { @@ -6259,6 +6556,8 @@ static Property nvme_props[] = { DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), + DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl, + params.auto_transition_zones, true), DEFINE_PROP_END_OF_LIST(), }; |