diff options
-rw-r--r-- | MAINTAINERS | 2 | ||||
-rw-r--r-- | accel/kvm/kvm-all.c | 6 | ||||
-rw-r--r-- | block/nvme.c | 209 | ||||
-rw-r--r-- | block/trace-events | 30 | ||||
-rw-r--r-- | include/block/nvme.h | 18 | ||||
-rw-r--r-- | softmmu/memory.c | 11 | ||||
-rw-r--r-- | util/trace-events | 10 | ||||
-rw-r--r-- | util/vfio-helpers.c | 43 |
8 files changed, 195 insertions, 134 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 2e018a0..e47bf8c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1882,6 +1882,7 @@ M: Klaus Jensen <its@irrelevant.dk> L: qemu-block@nongnu.org S: Supported F: hw/block/nvme* +F: include/block/nvme.h F: tests/qtest/nvme-test.c F: docs/specs/nvme.txt T: git git://git.infradead.org/qemu-nvme.git nvme-next @@ -2972,6 +2973,7 @@ R: Fam Zheng <fam@euphon.net> L: qemu-block@nongnu.org S: Supported F: block/nvme* +F: include/block/nvme.h T: git https://github.com/stefanha/qemu.git block Bootdevice diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 9ef5daf..baaa542 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -2239,8 +2239,10 @@ static int kvm_init(MachineState *ms) kvm_memory_listener_register(s, &s->memory_listener, &address_space_memory, 0); - memory_listener_register(&kvm_io_listener, - &address_space_io); + if (kvm_eventfds_allowed) { + memory_listener_register(&kvm_io_listener, + &address_space_io); + } memory_listener_register(&kvm_coalesced_pio_listener, &address_space_io); diff --git a/block/nvme.c b/block/nvme.c index 739a0a7..a06a188 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -41,6 +41,16 @@ typedef struct BDRVNVMeState BDRVNVMeState; +/* Same index is used for queues and IRQs */ +#define INDEX_ADMIN 0 +#define INDEX_IO(n) (1 + n) + +/* This driver shares a single MSIX IRQ for the admin and I/O queues */ +enum { + MSIX_SHARED_IRQ_IDX = 0, + MSIX_IRQ_COUNT = 1 +}; + typedef struct { int32_t head, tail; uint8_t *queue; @@ -81,18 +91,10 @@ typedef struct { QEMUBH *completion_bh; } NVMeQueuePair; -#define INDEX_ADMIN 0 -#define INDEX_IO(n) (1 + n) - -/* This driver shares a single MSIX IRQ for the admin and I/O queues */ -enum { - MSIX_SHARED_IRQ_IDX = 0, - MSIX_IRQ_COUNT = 1 -}; - struct BDRVNVMeState { AioContext *aio_context; QEMUVFIOState *vfio; + void *bar0_wo_map; /* Memory mapped registers */ volatile struct { uint32_t sq_tail; @@ -103,7 +105,7 @@ struct BDRVNVMeState { * [1..]: io queues. */ NVMeQueuePair **queues; - int nr_queues; + unsigned queue_count; size_t page_size; /* How many uint32_t elements does each doorbell entry take. */ size_t doorbell_scale; @@ -159,28 +161,32 @@ static QemuOptsList runtime_opts = { }, }; -static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, - int nentries, int entry_bytes, Error **errp) +/* Returns true on success, false on failure. */ +static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, + unsigned nentries, size_t entry_bytes, Error **errp) { size_t bytes; int r; - bytes = ROUND_UP(nentries * entry_bytes, s->page_size); + bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size); q->head = q->tail = 0; - q->queue = qemu_try_memalign(s->page_size, bytes); + q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes); if (!q->queue) { error_setg(errp, "Cannot allocate queue"); - return; + return false; } memset(q->queue, 0, bytes); r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova); if (r) { error_setg(errp, "Cannot map queue"); + return false; } + return true; } static void nvme_free_queue_pair(NVMeQueuePair *q) { + trace_nvme_free_queue_pair(q->index, q); if (q->completion_bh) { qemu_bh_delete(q->completion_bh); } @@ -204,31 +210,33 @@ static void nvme_free_req_queue_cb(void *opaque) static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, AioContext *aio_context, - int idx, int size, + unsigned idx, size_t size, Error **errp) { int i, r; - Error *local_err = NULL; NVMeQueuePair *q; uint64_t prp_list_iova; + size_t bytes; q = g_try_new0(NVMeQueuePair, 1); if (!q) { return NULL; } - q->prp_list_pages = qemu_try_memalign(s->page_size, - s->page_size * NVME_NUM_REQS); + trace_nvme_create_queue_pair(idx, q, size, aio_context, + event_notifier_get_fd(s->irq_notifier)); + bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS, + qemu_real_host_page_size); + q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes); if (!q->prp_list_pages) { goto fail; } - memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS); + memset(q->prp_list_pages, 0, bytes); qemu_mutex_init(&q->lock); q->s = s; q->index = idx; qemu_co_queue_init(&q->free_req_queue); q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q); - r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, - s->page_size * NVME_NUM_REQS, + r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes, false, &prp_list_iova); if (r) { goto fail; @@ -243,16 +251,12 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, req->prp_list_iova = prp_list_iova + i * s->page_size; } - nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) { goto fail; } q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail; - nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) { goto fail; } q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head; @@ -292,7 +296,7 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) while (q->free_req_head == -1) { if (qemu_in_coroutine()) { - trace_nvme_free_req_queue_wait(q); + trace_nvme_free_req_queue_wait(q->s, q->index); qemu_co_queue_wait(&q->free_req_queue, &q->lock); } else { qemu_mutex_unlock(&q->lock); @@ -399,8 +403,8 @@ static bool nvme_process_completion(NVMeQueuePair *q) } cid = le16_to_cpu(c->cid); if (cid == 0 || cid > NVME_QUEUE_SIZE) { - fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n", - cid); + warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", " + "queue size: %u", cid, NVME_QUEUE_SIZE); continue; } trace_nvme_complete_command(s, q->index, cid); @@ -465,7 +469,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, assert(!req->cb); req->cb = cb; req->opaque = opaque; - cmd->cid = cpu_to_le32(req->cid); + cmd->cid = cpu_to_le16(req->cid); trace_nvme_submit_command(q->s, q->index, req->cid); nvme_trace_command(cmd); @@ -479,16 +483,17 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, qemu_mutex_unlock(&q->lock); } -static void nvme_cmd_sync_cb(void *opaque, int ret) +static void nvme_admin_cmd_sync_cb(void *opaque, int ret) { int *pret = opaque; *pret = ret; aio_wait_kick(); } -static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, - NvmeCmd *cmd) +static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd) { + BDRVNVMeState *s = bs->opaque; + NVMeQueuePair *q = s->queues[INDEX_ADMIN]; AioContext *aio_context = bdrv_get_aio_context(bs); NVMeRequest *req; int ret = -EINPROGRESS; @@ -496,15 +501,17 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, if (!req) { return -EBUSY; } - nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret); + nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret); AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); return ret; } -static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) +/* Returns true on success, false on failure. */ +static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; + bool ret = false; union { NvmeIdCtrl ctrl; NvmeIdNs ns; @@ -517,21 +524,22 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) .opcode = NVME_ADM_CMD_IDENTIFY, .cdw10 = cpu_to_le32(0x1), }; + size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size); - id = qemu_try_memalign(s->page_size, sizeof(*id)); + id = qemu_try_memalign(qemu_real_host_page_size, id_size); if (!id) { error_setg(errp, "Cannot allocate buffer for identify response"); goto out; } - r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova); + r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova); if (r) { error_setg(errp, "Cannot map buffer for DMA"); goto out; } - memset(id, 0, sizeof(*id)); + memset(id, 0, id_size); cmd.dptr.prp1 = cpu_to_le64(iova); - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + if (nvme_admin_cmd_sync(bs, &cmd)) { error_setg(errp, "Failed to identify controller"); goto out; } @@ -551,10 +559,10 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); s->supports_discard = !!(oncs & NVME_ONCS_DSM); - memset(id, 0, sizeof(*id)); + memset(id, 0, id_size); cmd.cdw10 = 0; cmd.nsid = cpu_to_le32(namespace); - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { + if (nvme_admin_cmd_sync(bs, &cmd)) { error_setg(errp, "Failed to identify namespace"); goto out; } @@ -581,10 +589,13 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) goto out; } + ret = true; s->blkshift = lbaf->ds; out: qemu_vfio_dma_unmap(s->vfio, id); qemu_vfree(id); + + return ret; } static bool nvme_poll_queue(NVMeQueuePair *q) @@ -594,6 +605,7 @@ static bool nvme_poll_queue(NVMeQueuePair *q) const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; + trace_nvme_poll_queue(q->s, q->index); /* * Do an early check for completions. q->lock isn't needed because * nvme_process_completion() only runs in the event loop thread and @@ -618,7 +630,7 @@ static bool nvme_poll_queues(BDRVNVMeState *s) bool progress = false; int i; - for (i = 0; i < s->nr_queues; i++) { + for (i = 0; i < s->queue_count; i++) { if (nvme_poll_queue(s->queues[i])) { progress = true; } @@ -639,11 +651,12 @@ static void nvme_handle_event(EventNotifier *n) static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) { BDRVNVMeState *s = bs->opaque; - int n = s->nr_queues; + unsigned n = s->queue_count; NVMeQueuePair *q; NvmeCmd cmd; - int queue_size = NVME_QUEUE_SIZE; + unsigned queue_size = NVME_QUEUE_SIZE; + assert(n <= UINT16_MAX); q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), n, queue_size, errp); if (!q) { @@ -652,26 +665,26 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_CQ, .dptr.prp1 = cpu_to_le64(q->cq.iova), - .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), - .cdw11 = cpu_to_le32(0x3), + .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), + .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC), }; - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { - error_setg(errp, "Failed to create CQ io queue [%d]", n); + if (nvme_admin_cmd_sync(bs, &cmd)) { + error_setg(errp, "Failed to create CQ io queue [%u]", n); goto out_error; } cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_SQ, .dptr.prp1 = cpu_to_le64(q->sq.iova), - .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), - .cdw11 = cpu_to_le32(0x1 | (n << 16)), + .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), + .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)), }; - if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) { - error_setg(errp, "Failed to create SQ io queue [%d]", n); + if (nvme_admin_cmd_sync(bs, &cmd)) { + error_setg(errp, "Failed to create SQ io queue [%u]", n); goto out_error; } s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); s->queues[n] = q; - s->nr_queues++; + s->queue_count++; return true; out_error: nvme_free_queue_pair(q); @@ -684,7 +697,6 @@ static bool nvme_poll_cb(void *opaque) BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier[MSIX_SHARED_IRQ_IDX]); - trace_nvme_poll_cb(s); return nvme_poll_queues(s); } @@ -692,12 +704,12 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, Error **errp) { BDRVNVMeState *s = bs->opaque; + NVMeQueuePair *q; AioContext *aio_context = bdrv_get_aio_context(bs); int ret; uint64_t cap; uint64_t timeout_ms; uint64_t deadline, now; - Error *local_err = NULL; volatile NvmeBar *regs = NULL; qemu_co_mutex_init(&s->dma_map_lock); @@ -727,15 +739,29 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, * Initialization". */ cap = le64_to_cpu(regs->cap); + trace_nvme_controller_capability_raw(cap); + trace_nvme_controller_capability("Maximum Queue Entries Supported", + 1 + NVME_CAP_MQES(cap)); + trace_nvme_controller_capability("Contiguous Queues Required", + NVME_CAP_CQR(cap)); + trace_nvme_controller_capability("Doorbell Stride", + 2 << (2 + NVME_CAP_DSTRD(cap))); + trace_nvme_controller_capability("Subsystem Reset Supported", + NVME_CAP_NSSRS(cap)); + trace_nvme_controller_capability("Memory Page Size Minimum", + 1 << (12 + NVME_CAP_MPSMIN(cap))); + trace_nvme_controller_capability("Memory Page Size Maximum", + 1 << (12 + NVME_CAP_MPSMAX(cap))); if (!NVME_CAP_CSS(cap)) { error_setg(errp, "Device doesn't support NVMe command set"); ret = -EINVAL; goto out; } - s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap)); + s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap)); s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); bs->bl.opt_mem_alignment = s->page_size; + bs->bl.request_alignment = s->page_size; timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000); /* Reset device to get a clean state. */ @@ -752,8 +778,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, } } - s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar), - NVME_DOORBELL_SIZE, PROT_WRITE, errp); + s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0, + sizeof(NvmeBar) + NVME_DOORBELL_SIZE, + PROT_WRITE, errp); + s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar)); if (!s->doorbells) { ret = -EINVAL; goto out; @@ -761,19 +789,18 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, /* Set up admin queue. */ s->queues = g_new(NVMeQueuePair *, 1); - s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0, - NVME_QUEUE_SIZE, - errp); - if (!s->queues[INDEX_ADMIN]) { + q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp); + if (!q) { ret = -EINVAL; goto out; } - s->nr_queues = 1; - QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000); - regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) | - (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT)); - regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova); - regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova); + s->queues[INDEX_ADMIN] = q; + s->queue_count = 1; + QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000); + regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) | + ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT)); + regs->asq = cpu_to_le64(q->sq.iova); + regs->acq = cpu_to_le64(q->cq.iova); /* After setting up all control registers we can enable device now. */ regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) | @@ -801,9 +828,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); - nvme_identify(bs, namespace, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!nvme_identify(bs, namespace, errp)) { ret = -EIO; goto out; } @@ -869,7 +894,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00), }; - ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd); + ret = nvme_admin_cmd_sync(bs, &cmd); if (ret) { error_setg(errp, "Failed to configure NVMe write cache"); } @@ -878,10 +903,9 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, static void nvme_close(BlockDriverState *bs) { - int i; BDRVNVMeState *s = bs->opaque; - for (i = 0; i < s->nr_queues; ++i) { + for (unsigned i = 0; i < s->queue_count; ++i) { nvme_free_queue_pair(s->queues[i]); } g_free(s->queues); @@ -889,8 +913,8 @@ static void nvme_close(BlockDriverState *bs) &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, NULL, NULL); event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); - qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells, - sizeof(NvmeBar), NVME_DOORBELL_SIZE); + qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map, + 0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE); qemu_vfio_close(s->vfio); g_free(s->device); @@ -994,11 +1018,12 @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd, for (i = 0; i < qiov->niov; ++i) { bool retry = true; uint64_t iova; + size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len, + qemu_real_host_page_size); try_map: r = qemu_vfio_dma_map(s->vfio, qiov->iov[i].iov_base, - qiov->iov[i].iov_len, - true, &iova); + len, true, &iova); if (r == -ENOMEM && retry) { retry = false; trace_nvme_dma_flush_queue_wait(s); @@ -1106,7 +1131,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, }; trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov); - assert(s->nr_queues > 1); + assert(s->queue_count > 1); req = nvme_get_free_req(ioq); assert(req); @@ -1142,8 +1167,9 @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs, BDRVNVMeState *s = bs->opaque; for (i = 0; i < qiov->niov; ++i) { - if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) || - !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) { + if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, + qemu_real_host_page_size) || + !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) { trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base, qiov->iov[i].iov_len, s->page_size); return false; @@ -1159,7 +1185,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, int r; uint8_t *buf = NULL; QEMUIOVector local_qiov; - + size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size); assert(QEMU_IS_ALIGNED(offset, s->page_size)); assert(QEMU_IS_ALIGNED(bytes, s->page_size)); assert(bytes <= s->max_transfer); @@ -1169,7 +1195,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, } s->stats.unaligned_accesses++; trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); - buf = qemu_try_memalign(s->page_size, bytes); + buf = qemu_try_memalign(qemu_real_host_page_size, len); if (!buf) { return -ENOMEM; @@ -1216,7 +1242,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs) .ret = -EINPROGRESS, }; - assert(s->nr_queues > 1); + assert(s->queue_count > 1); req = nvme_get_free_req(ioq); assert(req); nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); @@ -1268,7 +1294,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, cmd.cdw12 = cpu_to_le32(cdw12); trace_nvme_write_zeroes(s, offset, bytes, flags); - assert(s->nr_queues > 1); + assert(s->queue_count > 1); req = nvme_get_free_req(ioq); assert(req); @@ -1311,7 +1337,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, return -ENOTSUP; } - assert(s->nr_queues > 1); + assert(s->queue_count > 1); buf = qemu_try_memalign(s->page_size, s->page_size); if (!buf) { @@ -1391,7 +1417,7 @@ static void nvme_detach_aio_context(BlockDriverState *bs) { BDRVNVMeState *s = bs->opaque; - for (int i = 0; i < s->nr_queues; i++) { + for (unsigned i = 0; i < s->queue_count; i++) { NVMeQueuePair *q = s->queues[i]; qemu_bh_delete(q->completion_bh); @@ -1412,7 +1438,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs, aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], false, nvme_handle_event, nvme_poll_cb); - for (int i = 0; i < s->nr_queues; i++) { + for (unsigned i = 0; i < s->queue_count; i++) { NVMeQueuePair *q = s->queues[i]; q->completion_bh = @@ -1429,11 +1455,10 @@ static void nvme_aio_plug(BlockDriverState *bs) static void nvme_aio_unplug(BlockDriverState *bs) { - int i; BDRVNVMeState *s = bs->opaque; assert(s->plugged); s->plugged = false; - for (i = INDEX_IO(0); i < s->nr_queues; i++) { + for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) { NVMeQueuePair *q = s->queues[i]; qemu_mutex_lock(&q->lock); nvme_kick(q); diff --git a/block/trace-events b/block/trace-events index 0e351c3..8368f4a 100644 --- a/block/trace-events +++ b/block/trace-events @@ -134,25 +134,29 @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" # nvme.c -nvme_kick(void *s, int queue) "s %p queue %d" +nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64 +nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64 +nvme_kick(void *s, unsigned q_index) "s %p q #%u" nvme_dma_flush_queue_wait(void *s) "s %p" nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" -nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" -nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d" -nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" -nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" +nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d" +nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u" +nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d" +nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d" nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" nvme_handle_event(void *s) "s %p" -nvme_poll_cb(void *s) "s %p" -nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d" -nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d" +nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u" +nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d" +nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d" nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x" -nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d" -nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d" -nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64"" -nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d" +nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d" +nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d" +nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64"" +nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d" nvme_dma_map_flush(void *s) "s %p" -nvme_free_req_queue_wait(void *q) "q %p" +nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u" +nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d" +nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p" nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" diff --git a/include/block/nvme.h b/include/block/nvme.h index 8a46d9c..3e02d9c 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -501,6 +501,11 @@ typedef struct QEMU_PACKED NvmeCreateCq { #define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) #define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) +enum NvmeFlagsCq { + NVME_CQ_PC = 1, + NVME_CQ_IEN = 2, +}; + typedef struct QEMU_PACKED NvmeCreateSq { uint8_t opcode; uint8_t flags; @@ -518,12 +523,13 @@ typedef struct QEMU_PACKED NvmeCreateSq { #define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) #define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3) -enum NvmeQueueFlags { - NVME_Q_PC = 1, - NVME_Q_PRIO_URGENT = 0, - NVME_Q_PRIO_HIGH = 1, - NVME_Q_PRIO_NORMAL = 2, - NVME_Q_PRIO_LOW = 3, +enum NvmeFlagsSq { + NVME_SQ_PC = 1, + + NVME_SQ_PRIO_URGENT = 0, + NVME_SQ_PRIO_HIGH = 1, + NVME_SQ_PRIO_NORMAL = 2, + NVME_SQ_PRIO_LOW = 3, }; typedef struct QEMU_PACKED NvmeIdentify { diff --git a/softmmu/memory.c b/softmmu/memory.c index aa393f1..11ca94d 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -205,8 +205,15 @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a, static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a, MemoryRegionIoeventfd *b) { - return !memory_region_ioeventfd_before(a, b) - && !memory_region_ioeventfd_before(b, a); + if (int128_eq(a->addr.start, b->addr.start) && + (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) || + (int128_eq(a->addr.size, b->addr.size) && + (a->match_data == b->match_data) && + ((a->match_data && (a->data == b->data)) || !a->match_data) && + (a->e == b->e)))) + return true; + + return false; } /* Range of memory in the global map. Addresses are absolute. */ diff --git a/util/trace-events b/util/trace-events index 24c3180..61e0d4b 100644 --- a/util/trace-events +++ b/util/trace-events @@ -80,8 +80,14 @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex qemu_vfio_dma_reset_temporary(void *s) "s %p" qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx" qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx" qemu_vfio_find_mapping(void *s, void *p) "s %p host %p" qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64 -qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64 -qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p" +qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx" +qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p" +qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx" qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" +qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" +qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" +qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 +qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index 2bec48e..97dfa3f 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -137,6 +137,7 @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) { + g_autofree char *barname = NULL; assert_bar_index_valid(s, index); s->bar_region_info[index] = (struct vfio_region_info) { .index = VFIO_PCI_BAR0_REGION_INDEX + index, @@ -146,6 +147,10 @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) error_setg_errno(errp, errno, "Failed to get BAR region info"); return -errno; } + barname = g_strdup_printf("bar[%d]", index); + trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset, + s->bar_region_info[index].size, + s->bar_region_info[index].cap_offset); return 0; } @@ -158,10 +163,13 @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, Error **errp) { void *p; + assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size)); assert_bar_index_valid(s, index); p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), prot, MAP_SHARED, s->device, s->bar_region_info[index].offset + offset); + trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset , + size, offset, p); if (p == MAP_FAILED) { error_setg_errno(errp, errno, "Failed to map BAR region"); p = NULL; @@ -228,6 +236,10 @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, { int ret; + trace_qemu_vfio_pci_read_config(buf, ofs, size, + s->config_region_info.offset, + s->config_region_info.size); + assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); do { ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); } while (ret == -1 && errno == EINTR); @@ -238,6 +250,10 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int { int ret; + trace_qemu_vfio_pci_write_config(buf, ofs, size, + s->config_region_info.offset, + s->config_region_info.size); + assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); do { ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); } while (ret == -1 && errno == EINTR); @@ -301,7 +317,7 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, } if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { - error_setg_errno(errp, errno, "VFIO IOMMU check failed"); + error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported"); ret = -EINVAL; goto fail_container; } @@ -409,6 +425,9 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, ret = -errno; goto fail; } + trace_qemu_vfio_region_info("config", s->config_region_info.offset, + s->config_region_info.size, + s->config_region_info.cap_offset); for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) { ret = qemu_vfio_pci_init_bar(s, i, errp); @@ -516,23 +535,12 @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) return s; } -static void qemu_vfio_dump_mapping(IOVAMapping *m) -{ - if (QEMU_VFIO_DEBUG) { - printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host, - (uint64_t)m->size, (uint64_t)m->iova); - } -} - static void qemu_vfio_dump_mappings(QEMUVFIOState *s) { - int i; - - if (QEMU_VFIO_DEBUG) { - printf("vfio mappings\n"); - for (i = 0; i < s->nr_mappings; ++i) { - qemu_vfio_dump_mapping(&s->mappings[i]); - } + for (int i = 0; i < s->nr_mappings; ++i) { + trace_qemu_vfio_dump_mapping(s->mappings[i].host, + s->mappings[i].iova, + s->mappings[i].size); } } @@ -622,7 +630,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, .vaddr = (uintptr_t)host, .size = size, }; - trace_qemu_vfio_do_mapping(s, host, size, iova); + trace_qemu_vfio_do_mapping(s, host, iova, size); if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { error_report("VFIO_MAP_DMA failed: %s", strerror(errno)); @@ -778,6 +786,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, } } } + trace_qemu_vfio_dma_mapped(s, host, iova0, size); if (iova) { *iova = iova0; } |