aboutsummaryrefslogtreecommitdiff
path: root/hw/nvme
diff options
context:
space:
mode:
authorKlaus Jensen <k.jensen@samsung.com>2021-06-17 21:06:49 +0200
committerKlaus Jensen <k.jensen@samsung.com>2021-06-29 07:16:25 +0200
commitd7d1474fd85d6c3fce46fa2edd051a6ff3f17cd1 (patch)
tree73e83d36e050a2c184814ade2a581ce5e9bb8d42 /hw/nvme
parentff0ac2c8b89dfcfbb4186d8ae5469d4e80a10da9 (diff)
downloadqemu-d7d1474fd85d6c3fce46fa2edd051a6ff3f17cd1.zip
qemu-d7d1474fd85d6c3fce46fa2edd051a6ff3f17cd1.tar.gz
qemu-d7d1474fd85d6c3fce46fa2edd051a6ff3f17cd1.tar.bz2
hw/nvme: reimplement dsm to allow cancellation
Prior to this patch, a loop was used to issue multiple "fire and forget" aios for each range in the command. Without a reference to the aiocb returned from the blk_aio_pdiscard calls, the aios cannot be canceled. Fix this by processing the ranges one after another. As a bonus, this fixes how metadata is cleared (i.e. we only zero it out if the data was succesfully discarded). Signed-off-by: Klaus Jensen <k.jensen@samsung.com> Reviewed-by: Keith Busch <kbusch@kernel.org>
Diffstat (limited to 'hw/nvme')
-rw-r--r--hw/nvme/ctrl.c223
-rw-r--r--hw/nvme/trace-events4
2 files changed, 157 insertions, 70 deletions
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 5777c7e..bb9ee93 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2015,26 +2015,6 @@ out:
nvme_verify_cb(ctx, ret);
}
-static void nvme_aio_discard_cb(void *opaque, int ret)
-{
- NvmeRequest *req = opaque;
- uintptr_t *discards = (uintptr_t *)&req->opaque;
-
- trace_pci_nvme_aio_discard_cb(nvme_cid(req));
-
- if (ret) {
- nvme_aio_err(req, ret);
- }
-
- (*discards)--;
-
- if (*discards) {
- return;
- }
-
- nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
struct nvme_zone_reset_ctx {
NvmeRequest *req;
NvmeZone *zone;
@@ -2495,75 +2475,182 @@ out:
nvme_enqueue_req_completion(nvme_cq(req), req);
}
-static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+typedef struct NvmeDSMAIOCB {
+ BlockAIOCB common;
+ BlockAIOCB *aiocb;
+ NvmeRequest *req;
+ QEMUBH *bh;
+ int ret;
+
+ NvmeDsmRange *range;
+ unsigned int nr;
+ unsigned int idx;
+} NvmeDSMAIOCB;
+
+static void nvme_dsm_cancel(BlockAIOCB *aiocb)
{
+ NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
+
+ /* break nvme_dsm_cb loop */
+ iocb->idx = iocb->nr;
+ iocb->ret = -ECANCELED;
+
+ if (iocb->aiocb) {
+ blk_aio_cancel_async(iocb->aiocb);
+ iocb->aiocb = NULL;
+ } else {
+ /*
+ * We only reach this if nvme_dsm_cancel() has already been called or
+ * the command ran to completion and nvme_dsm_bh is scheduled to run.
+ */
+ assert(iocb->idx == iocb->nr);
+ }
+}
+
+static const AIOCBInfo nvme_dsm_aiocb_info = {
+ .aiocb_size = sizeof(NvmeDSMAIOCB),
+ .cancel_async = nvme_dsm_cancel,
+};
+
+static void nvme_dsm_bh(void *opaque)
+{
+ NvmeDSMAIOCB *iocb = opaque;
+
+ iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+ qemu_bh_delete(iocb->bh);
+ iocb->bh = NULL;
+ qemu_aio_unref(iocb);
+}
+
+static void nvme_dsm_cb(void *opaque, int ret);
+
+static void nvme_dsm_md_cb(void *opaque, int ret)
+{
+ NvmeDSMAIOCB *iocb = opaque;
+ NvmeRequest *req = iocb->req;
NvmeNamespace *ns = req->ns;
- NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+ NvmeDsmRange *range;
+ uint64_t slba;
+ uint32_t nlb;
- uint32_t attr = le32_to_cpu(dsm->attributes);
- uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+ if (ret < 0) {
+ iocb->ret = ret;
+ goto done;
+ }
- uint16_t status = NVME_SUCCESS;
+ if (!ns->lbaf.ms) {
+ nvme_dsm_cb(iocb, 0);
+ return;
+ }
- trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+ range = &iocb->range[iocb->idx - 1];
+ slba = le64_to_cpu(range->slba);
+ nlb = le32_to_cpu(range->nlb);
- if (attr & NVME_DSMGMT_AD) {
- int64_t offset;
- size_t len;
- NvmeDsmRange range[nr];
- uintptr_t *discards = (uintptr_t *)&req->opaque;
+ /*
+ * Check that all block were discarded (zeroed); otherwise we do not zero
+ * the metadata.
+ */
- status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
- if (status) {
- return status;
+ ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
+ if (ret) {
+ if (ret < 0) {
+ iocb->ret = ret;
+ goto done;
}
- /*
- * AIO callbacks may be called immediately, so initialize discards to 1
- * to make sure the the callback does not complete the request before
- * all discards have been issued.
- */
- *discards = 1;
+ nvme_dsm_cb(iocb, 0);
+ }
- for (int i = 0; i < nr; i++) {
- uint64_t slba = le64_to_cpu(range[i].slba);
- uint32_t nlb = le32_to_cpu(range[i].nlb);
+ iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
+ nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
+ nvme_dsm_cb, iocb);
+ return;
- if (nvme_check_bounds(ns, slba, nlb)) {
- continue;
- }
+done:
+ iocb->aiocb = NULL;
+ qemu_bh_schedule(iocb->bh);
+}
- trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
- nlb);
+static void nvme_dsm_cb(void *opaque, int ret)
+{
+ NvmeDSMAIOCB *iocb = opaque;
+ NvmeRequest *req = iocb->req;
+ NvmeCtrl *n = nvme_ctrl(req);
+ NvmeNamespace *ns = req->ns;
+ NvmeDsmRange *range;
+ uint64_t slba;
+ uint32_t nlb;
- if (nlb > n->dmrsl) {
- trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
- }
+ if (ret < 0) {
+ iocb->ret = ret;
+ goto done;
+ }
- offset = nvme_l2b(ns, slba);
- len = nvme_l2b(ns, nlb);
+next:
+ if (iocb->idx == iocb->nr) {
+ goto done;
+ }
- while (len) {
- size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+ range = &iocb->range[iocb->idx++];
+ slba = le64_to_cpu(range->slba);
+ nlb = le32_to_cpu(range->nlb);
- (*discards)++;
+ trace_pci_nvme_dsm_deallocate(slba, nlb);
- blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
- nvme_aio_discard_cb, req);
+ if (nlb > n->dmrsl) {
+ trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
+ goto next;
+ }
- offset += bytes;
- len -= bytes;
- }
- }
+ if (nvme_check_bounds(ns, slba, nlb)) {
+ trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+ ns->id_ns.nsze);
+ goto next;
+ }
- /* account for the 1-initialization */
- (*discards)--;
+ iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
+ nvme_l2b(ns, nlb),
+ nvme_dsm_md_cb, iocb);
+ return;
- if (*discards) {
- status = NVME_NO_COMPLETE;
- } else {
- status = req->status;
+done:
+ iocb->aiocb = NULL;
+ qemu_bh_schedule(iocb->bh);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns = req->ns;
+ NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+ uint32_t attr = le32_to_cpu(dsm->attributes);
+ uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+ uint16_t status = NVME_SUCCESS;
+
+ trace_pci_nvme_dsm(nr, attr);
+
+ if (attr & NVME_DSMGMT_AD) {
+ NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
+ nvme_misc_cb, req);
+
+ iocb->req = req;
+ iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
+ iocb->ret = 0;
+ iocb->range = g_new(NvmeDsmRange, nr);
+ iocb->nr = nr;
+ iocb->idx = 0;
+
+ status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
+ req);
+ if (status) {
+ return status;
}
+
+ req->aiocb = &iocb->common;
+ nvme_dsm_cb(iocb, 0);
+
+ return NVME_NO_COMPLETE;
}
return status;
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ce6b6ff..eea4e31 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -37,8 +37,8 @@ pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" bl
pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
-pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
-pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_dsm(uint32_t nr, uint32_t attr) "nr %"PRIu32" attr 0x%"PRIx32""
+pci_nvme_dsm_deallocate(uint64_t slba, uint32_t nlb) "slba %"PRIu64" nlb %"PRIu32""
pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""