author | Klaus Jensen <k.jensen@samsung.com> | 2019-04-12 20:53:16 +0200
committer | Klaus Jensen <k.jensen@samsung.com> | 2020-10-27 07:24:47 +0100
commit | cba0a8a344fea94aa2212e105611b8e099343cb1 (patch)
tree | a8aef1b3fe98019092cac3f7eeb91d6be6a8017e /hw/block
parent | 6e0ac3a03f3ab0d0b69d086e3226bac77a20a468 (diff)
hw/block/nvme: add support for scatter gather lists
For now, support the Data Block, Segment and Last Segment descriptor
types.
See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
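For reference, the mapping code added below decodes the 16-byte SGL descriptor defined in NVM Express 1.3d, Section 4.4. The sketch that follows is illustrative only: the layout follows the spec and mirrors the NvmeSglDescriptor field accesses in the patch, but the names used here (SglDescriptorSketch, sgl_type) are invented for this note and are not QEMU's own definitions.

#include <stdint.h>

/* Illustrative sketch of an NVMe 1.3d SGL descriptor (16 bytes). */
typedef struct {
    uint64_t addr;     /* bytes 0-7:   address of a data block or of the next segment */
    uint32_t len;      /* bytes 8-11:  length in bytes of that block or segment */
    uint8_t  rsvd[3];  /* bytes 12-14: reserved */
    uint8_t  type;     /* byte 15:     SGL identifier; descriptor type in bits 7:4 */
} SglDescriptorSketch;

/* Descriptor type codes handled by this patch (upper nibble of 'type'). */
enum {
    SGL_TYPE_DATA_BLOCK   = 0x0,
    SGL_TYPE_SEGMENT      = 0x2,
    SGL_TYPE_LAST_SEGMENT = 0x3,
};

/* Extract the descriptor type nibble, in the spirit of NVME_SGL_TYPE() below. */
static inline uint8_t sgl_type(uint8_t id)
{
    return id >> 4;
}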
Diffstat (limited to 'hw/block')
-rw-r--r-- | hw/block/nvme.c | 329
-rw-r--r-- | hw/block/trace-events | 4
2 files changed, 276 insertions, 57 deletions
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c0f1f8c..63d0a17 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -413,13 +413,262 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-                             uint64_t prp1, uint64_t prp2, DMADirection dir,
+/*
+ * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
+ * number of bytes mapped in len.
+ */
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+                                  QEMUIOVector *iov,
+                                  NvmeSglDescriptor *segment, uint64_t nsgld,
+                                  size_t *len, NvmeRequest *req)
+{
+    dma_addr_t addr, trans_len;
+    uint32_t dlen;
+    uint16_t status;
+
+    for (int i = 0; i < nsgld; i++) {
+        uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+        switch (type) {
+        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+            break;
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+        default:
+            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+        }
+
+        dlen = le32_to_cpu(segment[i].len);
+        if (!dlen) {
+            continue;
+        }
+
+        if (*len == 0) {
+            /*
+             * All data has been mapped, but the SGL contains additional
+             * segments and/or descriptors. The controller might accept
+             * ignoring the rest of the SGL.
+             */
+            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
+            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+                break;
+            }
+
+            trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req));
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        trans_len = MIN(*len, dlen);
+        addr = le64_to_cpu(segment[i].addr);
+
+        if (UINT64_MAX - addr < dlen) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+        if (status) {
+            return status;
+        }
+
+        *len -= trans_len;
+    }
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+                             NvmeSglDescriptor sgl, size_t len,
                              NvmeRequest *req)
 {
+    /*
+     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+     * dynamically allocating a potentially huge SGL. The spec allows the SGL
+     * to be larger (as in number of bytes required to describe the SGL
+     * descriptors and segment chain) than the command transfer size, so it is
+     * not bounded by MDTS.
+     */
+    const int SEG_CHUNK_SIZE = 256;
+
+    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+    uint64_t nsgld;
+    uint32_t seg_len;
+    uint16_t status;
+    bool sgl_in_cmb = false;
+    hwaddr addr;
+    int ret;
+
+    sgld = &sgl;
+    addr = le64_to_cpu(sgl.addr);
+
+    trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len);
+
+    /*
+     * If the entire transfer can be described with a single data block it can
+     * be mapped directly.
+     */
+    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        goto out;
+    }
+
+    /*
+     * If the segment is located in the CMB, the submission queue of the
+     * request must also reside there.
+     */
+    if (nvme_addr_is_cmb(n, addr)) {
+        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+        }
+
+        sgl_in_cmb = true;
+    }
+
+    for (;;) {
+        switch (NVME_SGL_TYPE(sgld->type)) {
+        case NVME_SGL_DESCR_TYPE_SEGMENT:
+        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+            break;
+        default:
+            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+        }
+
+        seg_len = le32_to_cpu(sgld->len);
+
+        /* check the length of the (Last) Segment descriptor */
+        if (!seg_len || seg_len & 0xf) {
+            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+        }
+
+        if (UINT64_MAX - addr < seg_len) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        nsgld = seg_len / sizeof(NvmeSglDescriptor);
+
+        while (nsgld > SEG_CHUNK_SIZE) {
+            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
+                trace_pci_nvme_err_addr_read(addr);
+                status = NVME_DATA_TRAS_ERROR;
+                goto unmap;
+            }
+
+            status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE,
+                                       &len, req);
+            if (status) {
+                goto unmap;
+            }
+
+            nsgld -= SEG_CHUNK_SIZE;
+            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
+        }
+
+        ret = nvme_addr_read(n, addr, segment, nsgld *
+                             sizeof(NvmeSglDescriptor));
+        if (ret) {
+            trace_pci_nvme_err_addr_read(addr);
+            status = NVME_DATA_TRAS_ERROR;
+            goto unmap;
+        }
+
+        last_sgld = &segment[nsgld - 1];
+
+        /* if the segment ends with a Data Block, then we are done */
+        if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+            status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
+            if (status) {
+                goto unmap;
+            }
+
+            goto out;
+        }
+
+        /*
+         * If the last descriptor was not a Data Block, then the current
+         * segment must not be a Last Segment.
+         */
+        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
+            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+            goto unmap;
+        }
+
+        sgld = last_sgld;
+        addr = le64_to_cpu(sgld->addr);
+
+        /*
+         * Do not map the last descriptor; it will be a Segment or Last Segment
+         * descriptor and is handled by the next iteration.
+         */
+        status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        /*
+         * If the next segment is in the CMB, make sure that the sgl was
+         * already located there.
+         */
+        if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
+            status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
+            goto unmap;
+        }
+    }
+
+out:
+    /* if there is any residual left in len, the SGL was too short */
+    if (len) {
+        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        goto unmap;
+    }
+
+    return NVME_SUCCESS;
+
+unmap:
+    if (iov->iov) {
+        qemu_iovec_destroy(iov);
+    }
+
+    if (qsg->sg) {
+        qemu_sglist_destroy(qsg);
+    }
+
+    return status;
+}
+
+static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req)
+{
+    uint64_t prp1, prp2;
+
+    switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
+    case NVME_PSDT_PRP:
+        prp1 = le64_to_cpu(req->cmd.dptr.prp1);
+        prp2 = le64_to_cpu(req->cmd.dptr.prp2);
+
+        return nvme_map_prp(n, prp1, prp2, len, req);
+    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
+    case NVME_PSDT_SGL_MPTR_SGL:
+        /* SGLs shall not be used for Admin commands in NVMe over PCIe */
+        if (!req->sq->sqid) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len,
+                            req);
+    default:
+        return NVME_INVALID_FIELD;
+    }
+}
+
+static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+                         DMADirection dir, NvmeRequest *req)
+{
     uint16_t status = NVME_SUCCESS;
 
-    status = nvme_map_prp(n, prp1, prp2, len, req);
+    status = nvme_map_dptr(n, len, req);
     if (status) {
         return status;
     }
@@ -458,15 +707,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     return status;
 }
 
-static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req)
-{
-    NvmeCmd *cmd = &req->cmd;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-
-    return nvme_map_prp(n, prp1, prp2, len, req);
-}
-
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -929,10 +1169,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
 {
-    NvmeCmd *cmd = &req->cmd;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-    uint32_t nsid = le32_to_cpu(cmd->nsid);
+    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
     uint32_t trans_len;
     time_t current_ms;
 
@@ -981,17 +1218,14 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
         nvme_clear_events(n, NVME_AER_TYPE_SMART);
     }
 
-    return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
-                        DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *) &smart + off, trans_len,
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
                                  NvmeRequest *req)
 {
     uint32_t trans_len;
-    NvmeCmd *cmd = &req->cmd;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
     NvmeFwSlotInfoLog fw_log = {
         .afi = 0x1,
     };
@@ -1004,17 +1238,14 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
 
     trans_len = MIN(sizeof(fw_log) - off, buf_len);
 
-    return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
-                        DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len,
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
 {
     uint32_t trans_len;
-    NvmeCmd *cmd = &req->cmd;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
     NvmeErrorLog errlog;
 
     if (!rae) {
@@ -1029,8 +1260,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
 
     trans_len = MIN(sizeof(errlog) - off, buf_len);
 
-    return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2,
-                        DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *)&errlog, trans_len,
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
@@ -1190,14 +1421,10 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 {
-    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
-
     trace_pci_nvme_identify_ctrl();
 
-    return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
-                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
@@ -1205,8 +1432,6 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
     NvmeNamespace *ns;
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
     uint32_t nsid = le32_to_cpu(c->nsid);
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
 
     trace_pci_nvme_identify_ns(nsid);
 
@@ -1217,8 +1442,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
 
     ns = &n->namespaces[nsid - 1];
 
-    return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
-                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
@@ -1226,8 +1451,6 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
     static const int data_len = NVME_IDENTIFY_DATA_SIZE;
     uint32_t min_nsid = le32_to_cpu(c->nsid);
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
     uint32_t *list;
     uint16_t ret;
     int i, j = 0;
@@ -1254,8 +1477,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
             break;
         }
     }
-    ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2,
-                       DMA_DIRECTION_FROM_DEVICE, req);
+    ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
+                   req);
     g_free(list);
     return ret;
 }
@@ -1264,8 +1487,6 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
     uint32_t nsid = le32_to_cpu(c->nsid);
-    uint64_t prp1 = le64_to_cpu(c->prp1);
-    uint64_t prp2 = le64_to_cpu(c->prp2);
 
     uint8_t list[NVME_IDENTIFY_DATA_SIZE];
 
@@ -1297,8 +1518,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
     ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
     stl_be_p(&ns_descrs->uuid.v, nsid);
 
-    return nvme_dma_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2,
-                        DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
@@ -1369,14 +1590,10 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
 
 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
 {
-    NvmeCmd *cmd = &req->cmd;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-
     uint64_t timestamp = nvme_get_timestamp(n);
 
-    return nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
-                        prp2, DMA_DIRECTION_FROM_DEVICE, req);
+    return nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
+                    DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
@@ -1505,12 +1722,9 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
 {
     uint16_t ret;
     uint64_t timestamp;
-    NvmeCmd *cmd = &req->cmd;
-    uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
-    uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
 
-    ret = nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
-                       prp2, DMA_DIRECTION_TO_DEVICE, req);
+    ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
+                   DMA_DIRECTION_TO_DEVICE, req);
     if (ret != NVME_SUCCESS) {
         return ret;
     }
@@ -2437,6 +2651,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
                            NVME_ONCS_FEATURES);
+    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
 
     subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial);
     strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0');
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 9a6c5fb..22ea635 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -36,6 +36,7 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
+pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64""
 pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
 pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
 pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
@@ -91,6 +92,9 @@ pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_err_cfs(void) "controller fatal status"
 pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu16" err '%s' status 0x%"PRIx16""
+pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
+pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
+pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16""
 pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
 pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
 pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
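To make the chain walk in nvme_map_sgl() above concrete, here is a minimal, hypothetical sketch of the guest-side layout it consumes: the command's data pointer carries a Segment descriptor, the segment it references ends in a Last Segment descriptor, and the walk terminates at the final Data Block. The type, helper name, addresses, and sizes below are invented for illustration; real descriptors are little-endian on the wire, which is ignored here for brevity.

#include <stdint.h>

/* Same illustrative 16-byte descriptor layout as in the sketch above. */
typedef struct {
    uint64_t addr;
    uint32_t len;
    uint8_t  rsvd[3];
    uint8_t  type;    /* descriptor type in bits 7:4 */
} SglDescriptorSketch;

#define SGL_DATA_BLOCK   (0x0 << 4)
#define SGL_SEGMENT      (0x2 << 4)
#define SGL_LAST_SEGMENT (0x3 << 4)

/*
 * Hypothetical layout for an 8 KiB transfer split over two buffers:
 * the command's SGL1 holds one Segment descriptor, the segment it points
 * to holds a Data Block plus a Last Segment descriptor, and the last
 * segment holds the final Data Block.
 */
static void build_example_sgl(SglDescriptorSketch *dptr_sgl1,
                              SglDescriptorSketch seg1[2],
                              SglDescriptorSketch seg2[1],
                              uint64_t seg1_addr, uint64_t seg2_addr,
                              uint64_t buf0, uint64_t buf1)
{
    /* first segment: 4 KiB data block, then a pointer to the last segment */
    seg1[0] = (SglDescriptorSketch){ .addr = buf0, .len = 4096,
                                     .type = SGL_DATA_BLOCK };
    seg1[1] = (SglDescriptorSketch){ .addr = seg2_addr,
                                     .len = sizeof(seg2[0]),
                                     .type = SGL_LAST_SEGMENT };

    /* last segment: the remaining 4 KiB data block */
    seg2[0] = (SglDescriptorSketch){ .addr = buf1, .len = 4096,
                                     .type = SGL_DATA_BLOCK };

    /* SGL1 in the SQE: a Segment descriptor covering seg1's two entries */
    *dptr_sgl1 = (SglDescriptorSketch){ .addr = seg1_addr,
                                        .len = 2 * sizeof(seg1[0]),
                                        .type = SGL_SEGMENT };
}

With this shape, the loop in nvme_map_sgl() reads the 32-byte first segment, maps the 4 KiB data block, follows the Last Segment descriptor, reads the final 16-byte segment, and maps the remaining 4 KiB before checking that no residual length is left.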