aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2021-04-12 10:41:11 +0100
committerPeter Maydell <peter.maydell@linaro.org>2021-04-12 10:41:11 +0100
commitf2afdc2ad94b99cc98371791cabc308b547e4add (patch)
treeec61626791e6de659028d7d81dcdbd2adb183672
parent555249a59e9cdd6b58da103aba5cf3a2d45c899f (diff)
parent98f84f5a4eca5c03e32fff20f246d9b4b96d6422 (diff)
downloadqemu-f2afdc2ad94b99cc98371791cabc308b547e4add.zip
qemu-f2afdc2ad94b99cc98371791cabc308b547e4add.tar.gz
qemu-f2afdc2ad94b99cc98371791cabc308b547e4add.tar.bz2
Merge remote-tracking branch 'remotes/nvme/tags/nvme-fixes-20210412-pull-request' into staging
emulated nvme docs and fixes for -rc3 - documentation - fixes # gpg: Signature made Mon 12 Apr 2021 07:56:09 BST # gpg: using RSA key 522833AA75E2DCE6A24766C04DE1AF316D4F0DE9 # gpg: Good signature from "Klaus Jensen <its@irrelevant.dk>" [unknown] # gpg: aka "Klaus Jensen <k.jensen@samsung.com>" [unknown] # gpg: WARNING: This key is not certified with a trusted signature! # gpg: There is no indication that the signature belongs to the owner. # Primary key fingerprint: DDCA 4D9C 9EF9 31CC 3468 4272 63D5 6FC5 E55D A838 # Subkey fingerprint: 5228 33AA 75E2 DCE6 A247 66C0 4DE1 AF31 6D4F 0DE9 * remotes/nvme/tags/nvme-fixes-20210412-pull-request: hw/block/nvme: drain namespaces on sq deletion hw/block/nvme: store aiocb in compare hw/block/nvme: map prp fix if prp2 contains non-zero offset docs: add nvme emulation documentation Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--MAINTAINERS2
-rw-r--r--docs/specs/nvme.txt23
-rw-r--r--docs/system/index.rst1
-rw-r--r--docs/system/nvme.rst225
-rw-r--r--hw/block/nvme.c38
5 files changed, 259 insertions, 30 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 58f3421..04beb34 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1974,7 +1974,7 @@ S: Supported
F: hw/block/nvme*
F: include/block/nvme.h
F: tests/qtest/nvme-test.c
-F: docs/specs/nvme.txt
+F: docs/system/nvme.rst
T: git git://git.infradead.org/qemu-nvme.git nvme-next
megasas
diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
deleted file mode 100644
index 56d3938..0000000
--- a/docs/specs/nvme.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-NVM Express Controller
-======================
-
-The nvme device (-device nvme) emulates an NVM Express Controller.
-
-
-Reference Specifications
-------------------------
-
-The device currently implements most mandatory features of NVMe v1.3d, see
-
- https://nvmexpress.org/resources/specifications/
-
-for the specification.
-
-
-Known issues
-------------
-
-* The accounting numbers in the SMART/Health are reset across power cycles
-
-* Interrupt Coalescing is not supported and is disabled by default in volation
- of the specification.
diff --git a/docs/system/index.rst b/docs/system/index.rst
index 02d0707..b05af71 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -23,6 +23,7 @@ Contents:
net
virtio-net-failover
usb
+ nvme
ivshmem
linuxboot
generic-loader
diff --git a/docs/system/nvme.rst b/docs/system/nvme.rst
new file mode 100644
index 0000000..f7f63d6
--- /dev/null
+++ b/docs/system/nvme.rst
@@ -0,0 +1,225 @@
+==============
+NVMe Emulation
+==============
+
+QEMU provides NVMe emulation through the ``nvme``, ``nvme-ns`` and
+``nvme-subsys`` devices.
+
+See the following sections for specific information on
+
+ * `Adding NVMe Devices`_, `additional namespaces`_ and `NVM subsystems`_.
+ * Configuration of `Optional Features`_ such as `Controller Memory Buffer`_,
+ `Simple Copy`_, `Zoned Namespaces`_, `metadata`_ and `End-to-End Data
+ Protection`_,
+
+Adding NVMe Devices
+===================
+
+Controller Emulation
+--------------------
+
+The QEMU emulated NVMe controller implements version 1.4 of the NVM Express
+specification. All mandatory features are implement with a couple of exceptions
+and limitations:
+
+ * Accounting numbers in the SMART/Health log page are reset when the device
+ is power cycled.
+ * Interrupt Coalescing is not supported and is disabled by default.
+
+The simplest way to attach an NVMe controller on the QEMU PCI bus is to add the
+following parameters:
+
+.. code-block:: console
+
+ -drive file=nvm.img,if=none,id=nvm
+ -device nvme,serial=deadbeef,drive=nvm
+
+There are a number of optional general parameters for the ``nvme`` device. Some
+are mentioned here, but see ``-device nvme,help`` to list all possible
+parameters.
+
+``max_ioqpairs=UINT32`` (default: ``64``)
+ Set the maximum number of allowed I/O queue pairs. This replaces the
+ deprecated ``num_queues`` parameter.
+
+``msix_qsize=UINT16`` (default: ``65``)
+ The number of MSI-X vectors that the device should support.
+
+``mdts=UINT8`` (default: ``7``)
+ Set the Maximum Data Transfer Size of the device.
+
+``use-intel-id`` (default: ``off``)
+ Since QEMU 5.2, the device uses a QEMU allocated "Red Hat" PCI Device and
+ Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
+ previously used.
+
+Additional Namespaces
+---------------------
+
+In the simplest possible invocation sketched above, the device only support a
+single namespace with the namespace identifier ``1``. To support multiple
+namespaces and additional features, the ``nvme-ns`` device must be used.
+
+.. code-block:: console
+
+ -device nvme,id=nvme-ctrl-0,serial=deadbeef
+ -drive file=nvm-1.img,if=none,id=nvm-1
+ -device nvme-ns,drive=nvm-1
+ -drive file=nvm-2.img,if=none,id=nvm-2
+ -device nvme-ns,drive=nvm-2
+
+The namespaces defined by the ``nvme-ns`` device will attach to the most
+recently defined ``nvme-bus`` that is created by the ``nvme`` device. Namespace
+identifers are allocated automatically, starting from ``1``.
+
+There are a number of parameters available:
+
+``nsid`` (default: ``0``)
+ Explicitly set the namespace identifier.
+
+``uuid`` (default: *autogenerated*)
+ Set the UUID of the namespace. This will be reported as a "Namespace UUID"
+ descriptor in the Namespace Identification Descriptor List.
+
+``bus``
+ If there are more ``nvme`` devices defined, this parameter may be used to
+ attach the namespace to a specific ``nvme`` device (identified by an ``id``
+ parameter on the controller device).
+
+NVM Subsystems
+--------------
+
+Additional features becomes available if the controller device (``nvme``) is
+linked to an NVM Subsystem device (``nvme-subsys``).
+
+The NVM Subsystem emulation allows features such as shared namespaces and
+multipath I/O.
+
+.. code-block:: console
+
+ -device nvme-subsys,id=nvme-subsys-0,nqn=subsys0
+ -device nvme,serial=a,subsys=nvme-subsys-0
+ -device nvme,serial=b,subsys=nvme-subsys-0
+
+This will create an NVM subsystem with two controllers. Having controllers
+linked to an ``nvme-subsys`` device allows additional ``nvme-ns`` parameters:
+
+``shared`` (default: ``off``)
+ Specifies that the namespace will be attached to all controllers in the
+ subsystem. If set to ``off`` (the default), the namespace will remain a
+ private namespace and may only be attached to a single controller at a time.
+
+``detached`` (default: ``off``)
+ If set to ``on``, the namespace will be be available in the subsystem, but
+ not attached to any controllers initially.
+
+Thus, adding
+
+.. code-block:: console
+
+ -drive file=nvm-1.img,if=none,id=nvm-1
+ -device nvme-ns,drive=nvm-1,nsid=1,shared=on
+ -drive file=nvm-2.img,if=none,id=nvm-2
+ -device nvme-ns,drive=nvm-2,nsid=3,detached=on
+
+will cause NSID 1 will be a shared namespace (due to ``shared=on``) that is
+initially attached to both controllers. NSID 3 will be a private namespace
+(i.e. only attachable to a single controller at a time) and will not be
+attached to any controller initially (due to ``detached=on``).
+
+Optional Features
+=================
+
+Controller Memory Buffer
+------------------------
+
+``nvme`` device parameters related to the Controller Memory Buffer support:
+
+``cmb_size_mb=UINT32`` (default: ``0``)
+ This adds a Controller Memory Buffer of the given size at offset zero in BAR
+ 2.
+
+``legacy-cmb`` (default: ``off``)
+ By default, the device uses the "v1.4 scheme" for the Controller Memory
+ Buffer support (i.e, the CMB is initially disabled and must be explicitly
+ enabled by the host). Set this to ``on`` to behave as a v1.3 device wrt. the
+ CMB.
+
+Simple Copy
+-----------
+
+The device includes support for TP 4065 ("Simple Copy Command"). A number of
+additional ``nvme-ns`` device parameters may be used to control the Copy
+command limits:
+
+``mssrl=UINT16`` (default: ``128``)
+ Set the Maximum Single Source Range Length (``MSSRL``). This is the maximum
+ number of logical blocks that may be specified in each source range.
+
+``mcl=UINT32`` (default: ``128``)
+ Set the Maximum Copy Length (``MCL``). This is the maximum number of logical
+ blocks that may be specified in a Copy command (the total for all source
+ ranges).
+
+``msrc=UINT8`` (default: ``127``)
+ Set the Maximum Source Range Count (``MSRC``). This is the maximum number of
+ source ranges that may be used in a Copy command. This is a 0's based value.
+
+Zoned Namespaces
+----------------
+
+A namespaces may be "Zoned" as defined by TP 4053 ("Zoned Namespaces"). Set
+``zoned=on`` on an ``nvme-ns`` device to configure it as a zoned namespace.
+
+The namespace may be configured with additional parameters
+
+``zoned.zone_size=SIZE`` (default: ``128MiB``)
+ Define the zone size (``ZSZE``).
+
+``zoned.zone_capacity=SIZE`` (default: ``0``)
+ Define the zone capacity (``ZCAP``). If left at the default (``0``), the zone
+ capacity will equal the zone size.
+
+``zoned.descr_ext_size=UINT32`` (default: ``0``)
+ Set the Zone Descriptor Extension Size (``ZDES``). Must be a multiple of 64
+ bytes.
+
+``zoned.cross_read=BOOL`` (default: ``off``)
+ Set to ``on`` to allow reads to cross zone boundaries.
+
+``zoned.max_active=UINT32`` (default: ``0``)
+ Set the maximum number of active resources (``MAR``). The default (``0``)
+ allows all zones to be active.
+
+``zoned.max_open=UINT32`` (default: ``0``)
+ Set the maximum number of open resources (``MOR``). The default (``0``)
+ allows all zones to be open. If ``zoned.max_active`` is specified, this value
+ must be less than or equal to that.
+
+Metadata
+--------
+
+The virtual namespace device supports LBA metadata in the form separate
+metadata (``MPTR``-based) and extended LBAs.
+
+``ms=UINT16`` (default: ``0``)
+ Defines the number of metadata bytes per LBA.
+
+``mset=UINT8`` (default: ``0``)
+ Set to ``1`` to enable extended LBAs.
+
+End-to-End Data Protection
+--------------------------
+
+The virtual namespace device supports DIF- and DIX-based protection information
+(depending on ``mset``).
+
+``pi=UINT8`` (default: ``0``)
+ Enable protection information of the specified type (type ``1``, ``2`` or
+ ``3``).
+
+``pil=UINT8`` (default: ``0``)
+ Controls the location of the protection information within the metadata. Set
+ to ``1`` to transfer protection information as the first eight bytes of
+ metadata. Otherwise, the protection information is transferred as the last
+ eight bytes.
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6b1f056..624a143 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -470,6 +470,7 @@ static void nvme_req_clear(NvmeRequest *req)
{
req->ns = NULL;
req->opaque = NULL;
+ req->aiocb = NULL;
memset(&req->cqe, 0x0, sizeof(req->cqe));
req->status = NVME_SUCCESS;
}
@@ -655,7 +656,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
uint32_t nents, prp_trans;
int i = 0;
- nents = (len + n->page_size - 1) >> n->page_bits;
+ /*
+ * The first PRP list entry, pointed to by PRP2 may contain offset.
+ * Hence, we need to calculate the number of entries in based on
+ * that offset.
+ */
+ nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
if (ret) {
@@ -666,7 +672,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
while (len != 0) {
uint64_t prp_ent = le64_to_cpu(prp_list[i]);
- if (i == n->max_prp_ents - 1 && len > n->page_size) {
+ if (i == nents - 1 && len > n->page_size) {
if (unlikely(prp_ent & (n->page_size - 1))) {
trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
@@ -675,7 +681,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
i = 0;
nents = (len + n->page_size - 1) >> n->page_bits;
- prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
+ nents = MIN(nents, n->max_prp_ents);
+ prp_trans = nents * sizeof(uint64_t);
ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
prp_trans);
if (ret) {
@@ -2837,7 +2844,8 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
block_acct_start(blk_get_stats(blk), &req->acct, data_len,
BLOCK_ACCT_READ);
- blk_aio_preadv(blk, offset, &ctx->data.iov, 0, nvme_compare_data_cb, req);
+ req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
+ nvme_compare_data_cb, req);
return NVME_NO_COMPLETE;
}
@@ -3680,6 +3688,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
NvmeSQueue *sq;
NvmeCQueue *cq;
uint16_t qid = le16_to_cpu(c->qid);
+ uint32_t nsid;
if (unlikely(!qid || nvme_check_sqid(n, qid))) {
trace_pci_nvme_err_invalid_del_sq(qid);
@@ -3691,9 +3700,26 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
sq = n->sq[qid];
while (!QTAILQ_EMPTY(&sq->out_req_list)) {
r = QTAILQ_FIRST(&sq->out_req_list);
- assert(r->aiocb);
- blk_aio_cancel(r->aiocb);
+ if (r->aiocb) {
+ blk_aio_cancel(r->aiocb);
+ }
+ }
+
+ /*
+ * Drain all namespaces if there are still outstanding requests that we
+ * could not cancel explicitly.
+ */
+ if (!QTAILQ_EMPTY(&sq->out_req_list)) {
+ for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
+ NvmeNamespace *ns = nvme_ns(n, nsid);
+ if (ns) {
+ nvme_ns_drain(ns);
+ }
+ }
}
+
+ assert(QTAILQ_EMPTY(&sq->out_req_list));
+
if (!nvme_check_cqid(n, sq->cqid)) {
cq = n->cq[sq->cqid];
QTAILQ_REMOVE(&cq->sq_list, sq, entry);