aboutsummaryrefslogtreecommitdiff
path: root/hw/rdma
diff options
context:
space:
mode:
authorYuval Shaia <yuval.shaia@oracle.com>2018-12-21 16:40:19 +0200
committerMarcel Apfelbaum <marcel.apfelbaum@gmail.com>2018-12-22 11:09:56 +0200
commit605ec1663b51722a73046fed5453cb5efb994d85 (patch)
tree7c35720630b5b7782eb79a91877a2e3e77955d1d /hw/rdma
parent305bdd7a57a52cf87f9bf3e85316b0f62fe7167c (diff)
downloadqemu-605ec1663b51722a73046fed5453cb5efb994d85.zip
qemu-605ec1663b51722a73046fed5453cb5efb994d85.tar.gz
qemu-605ec1663b51722a73046fed5453cb5efb994d85.tar.bz2
hw/rdma: Add support for MAD packets
MAD (Management Datagram) packets are widely used by various modules, both in the kernel and in user space. For example, the rdma_* API, which is used to create and maintain a "connection" layer on top of RDMA, uses several types of MAD packets. For more information, please refer to chapter 13.4 in Volume 1 of the Architecture Specification, Release 1.1, available here: https://www.infinibandta.org/ibta-specifications-download/ To support MAD packets, the device uses an external utility (contrib/rdmacm-mux) to relay packets to and from the guest driver. Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com> Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Diffstat (limited to 'hw/rdma')
-rw-r--r--hw/rdma/rdma_backend.c250
-rw-r--r--hw/rdma/rdma_backend.h4
-rw-r--r--hw/rdma/rdma_backend_defs.h10
-rw-r--r--hw/rdma/vmw/pvrdma.h2
-rw-r--r--hw/rdma/vmw/pvrdma_main.c4
5 files changed, 260 insertions, 10 deletions
diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 1e14839..c6dedda 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -16,8 +16,13 @@
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qnum.h"
#include <infiniband/verbs.h>
+#include <infiniband/umad_types.h>
+#include <infiniband/umad.h>
+#include <rdma/rdma_user_cm.h>
#include "trace.h"
#include "rdma_utils.h"
@@ -33,16 +38,25 @@
#define VENDOR_ERR_MAD_SEND 0x206
#define VENDOR_ERR_INVLKEY 0x207
#define VENDOR_ERR_MR_SMALL 0x208
+#define VENDOR_ERR_INV_MAD_BUFF 0x209
+#define VENDOR_ERR_INV_NUM_SGE 0x210
#define THR_NAME_LEN 16
#define THR_POLL_TO 5000
+#define MAD_HDR_SIZE sizeof(struct ibv_grh)
+
typedef struct BackendCtx {
- uint64_t req_id;
void *up_ctx;
bool is_tx_req;
+ struct ibv_sge sge; /* Used to save MAD recv buffer */
} BackendCtx;
+struct backend_umad { /* Message written to / read from the MAD chardev */
+ struct ib_user_mad hdr; /* Header: payload length and addressing info */
+ char mad[RDMA_MAX_PRIVATE_DATA]; /* Payload; hdr.length bytes are used */
+};
+
static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
@@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
return 0;
}
+/*
+ * Relay a MAD that the guest posted on QP1 (GSI) to the MAD chardev.
+ *
+ * Exactly two SGEs are expected; their contents are concatenated (sge[0]
+ * first) into the payload of a backend_umad message, which is then written
+ * to backend_dev->mad_chr_be.
+ *
+ * Returns 0 on success, -EINVAL when num_sge is not 2, -ENOMEM when the
+ * payload does not fit into the message or a guest buffer cannot be mapped,
+ * and -EIO when the chardev did not accept the whole message.
+ */
+static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
+                    uint32_t num_sge)
+{
+    struct backend_umad umad = {0};
+    char *hdr, *msg;
+    int ret;
+
+    pr_dbg("num_sge=%d\n", num_sge);
+
+    if (num_sge != 2) {
+        return -EINVAL;
+    }
+
+    /*
+     * Both lengths are supplied by the guest.  Check them one at a time so
+     * that a wrapped sum can never pass the capacity test and turn the
+     * copies below into a stack buffer overflow.
+     */
+    if (sge[0].length > sizeof(umad.mad) ||
+        sge[1].length > sizeof(umad.mad) - sge[0].length) {
+        pr_dbg("MAD too big (%d + %d)\n", sge[0].length, sge[1].length);
+        return -ENOMEM;
+    }
+
+    umad.hdr.length = sge[0].length + sge[1].length;
+    pr_dbg("msg_len=%d\n", umad.hdr.length);
+
+    umad.hdr.addr.qpn = htobe32(1);
+    umad.hdr.addr.grh_present = 1;
+    umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
+    memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
+    umad.hdr.addr.hop_limit = 0xFF;
+
+    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
+    if (!hdr) {
+        pr_dbg("Fail to map to sge[0]\n");
+        return -ENOMEM;
+    }
+    msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+    if (!msg) {
+        pr_dbg("Fail to map to sge[1]\n");
+        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
+        return -ENOMEM;
+    }
+
+    pr_dbg_buf("mad_hdr", hdr, sge[0].length);
+    pr_dbg_buf("mad_data", msg, sge[1].length);
+
+    memcpy(&umad.mad[0], hdr, sge[0].length);
+    memcpy(&umad.mad[sge[0].length], msg, sge[1].length);
+
+    /* Unmap in reverse order of mapping */
+    rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
+    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
+
+    /* The whole fixed-size message is sent, not just hdr.length bytes */
+    ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
+                            sizeof(umad));
+
+    pr_dbg("qemu_chr_fe_write=%d\n", ret);
+
+    return ret == sizeof(umad) ? 0 : -EIO;
+}
+
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
RdmaBackendQP *qp, uint8_t qp_type,
struct ibv_sge *sge, uint32_t num_sge,
@@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
} else if (qp_type == IBV_QPT_GSI) {
pr_dbg("QP1\n");
- comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+ rc = mad_send(backend_dev, sge, num_sge);
+ if (rc) {
+ comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+ } else {
+ comp_handler(IBV_WC_SUCCESS, 0, ctx);
+ }
}
- pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
return;
}
@@ -370,6 +443,48 @@ out_free_bctx:
g_free(bctx);
}
+/*
+ * Remember a receive buffer the guest posted on QP1 so that mad_read() can
+ * later copy an incoming MAD into it.
+ *
+ * The buffer must be described by a single SGE that is large enough for
+ * RDMA_MAX_PRIVATE_DATA bytes plus a GRH.  A BackendCtx holding the SGE and
+ * the caller's @ctx is allocated, registered as a cqe ctx and its id is
+ * appended to recv_mads_list.
+ *
+ * Returns 0 on success, otherwise one of the VENDOR_ERR_* codes.
+ */
+static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
+                                         struct ibv_sge *sge, uint32_t num_sge,
+                                         void *ctx)
+{
+    BackendCtx *recv_ctx;
+    uint32_t recv_ctx_id;
+
+    if (num_sge != 1) {
+        pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
+        return VENDOR_ERR_INV_NUM_SGE;
+    }
+
+    if (sge->length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
+        pr_dbg("Too small buffer for MAD\n");
+        return VENDOR_ERR_INV_MAD_BUFF;
+    }
+
+    pr_dbg("addr=0x%" PRIx64"\n", sge->addr);
+    pr_dbg("length=%d\n", sge->length);
+    pr_dbg("lkey=%d\n", sge->lkey);
+
+    recv_ctx = g_malloc0(sizeof(*recv_ctx));
+
+    if (unlikely(rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res,
+                                       &recv_ctx_id, recv_ctx))) {
+        g_free(recv_ctx);
+        pr_dbg("Fail to allocate cqe_ctx\n");
+        return VENDOR_ERR_NOMEM;
+    }
+
+    pr_dbg("bctx_id %d, bctx %p, ctx %p\n", recv_ctx_id, recv_ctx, ctx);
+    recv_ctx->sge = sge[0];
+    recv_ctx->up_ctx = ctx;
+
+    /* The id only becomes reachable by mad_read() once it is on the list */
+    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
+    qlist_append_int(backend_dev->recv_mads_list.list, recv_ctx_id);
+    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
+
+    return 0;
+}
+
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
RdmaDeviceResources *rdma_dev_res,
RdmaBackendQP *qp, uint8_t qp_type,
@@ -388,7 +503,10 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
}
if (qp_type == IBV_QPT_GSI) {
pr_dbg("QP1\n");
- comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+ rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
+ if (rc) {
+ comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
+ }
}
return;
}
@@ -517,7 +635,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
switch (qp_type) {
case IBV_QPT_GSI:
- pr_dbg("QP1 unsupported\n");
return 0;
case IBV_QPT_RC:
@@ -748,11 +865,122 @@ static int init_device_caps(RdmaBackendDev *backend_dev,
return 0;
}
+/*
+ * Fill in the GRH that precedes a MAD delivered to the guest: @sgid becomes
+ * the source GID, @my_gid the destination GID and @paylen is stored in
+ * network byte order.  Other GRH fields are left untouched.
+ */
+static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
+                                 union ibv_gid *my_gid, int paylen)
+{
+    memcpy(&grh->sgid, sgid, sizeof(grh->sgid));
+    memcpy(&grh->dgid, my_gid, sizeof(grh->dgid));
+    grh->paylen = htons(paylen);
+
+    pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen);
+    pr_dbg("my_gid=0x%llx\n", my_gid->global.interface_id);
+    pr_dbg("gid=0x%llx\n", sgid->global.interface_id);
+}
+
+/*
+ * Callback registered together with mad_read() in mad_init(); always reports
+ * the size of one complete backend_umad message.  @opaque is unused.
+ */
+static inline int mad_can_receieve(void *opaque)
+{
+    const int one_msg = sizeof(struct backend_umad);
+
+    return one_msg;
+}
+
+/*
+ * Chardev read handler: deliver one MAD received on the MAD chardev to the
+ * guest.
+ *
+ * @opaque is the RdmaBackendDev registered in mad_init(); @buf holds @size
+ * bytes laid out as a struct backend_umad.
+ *
+ * The payload is copied, preceded by a GRH built with build_mad_hdr(), into
+ * a receive buffer whose cqe-ctx id is popped from recv_mads_list, and the
+ * completion handler is called with that buffer's upper-layer context.
+ * Malformed messages, and messages that arrive while no receive buffer is
+ * queued, are dropped.
+ */
+static void mad_read(void *opaque, const uint8_t *buf, int size)
+{
+    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
+    const size_t hdr_size = offsetof(struct backend_umad, mad);
+    QObject *o_ctx_id;
+    unsigned long cqe_ctx_id;
+    BackendCtx *bctx;
+    char *mad;
+    struct backend_umad *umad;
+
+    /* Do not look at header fields that were not actually received */
+    if (size < 0 || (size_t)size < hdr_size) {
+        pr_dbg("Truncated MAD message (%d bytes), dropping\n", size);
+        return;
+    }
+    umad = (struct backend_umad *)buf;
+
+    pr_dbg("Got %d bytes\n", size);
+    pr_dbg("umad->hdr.length=%d\n", umad->hdr.length);
+
+    /*
+     * hdr.length comes from the peer and drives the memcpy() below, so it
+     * must stay inside both the payload array and the bytes received.
+     */
+    if (umad->hdr.length > sizeof(umad->mad) ||
+        umad->hdr.length > (size_t)size - hdr_size) {
+        pr_dbg("Invalid MAD length %d, dropping\n", umad->hdr.length);
+        return;
+    }
+
+#ifdef PVRDMA_DEBUG
+    struct umad_hdr *hdr = (struct umad_hdr *)umad->mad;
+    pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n",
+           hdr->base_version, hdr->mgmt_class, hdr->class_version,
+           hdr->method, hdr->status, be64toh(hdr->tid),
+           hdr->attr_id, hdr->attr_mod);
+#endif
+
+    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
+    o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list);
+    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
+    if (!o_ctx_id) {
+        pr_dbg("No more free MADs buffers, waiting for a while\n");
+        /*
+         * NOTE(review): THR_POLL_TO (5000) is presumably in milliseconds;
+         * passing it to sleep() would stall for 5000 seconds.  Confirm the
+         * unit, and whether blocking in this handler is wanted at all.
+         */
+        g_usleep(THR_POLL_TO * 1000);
+        return;
+    }
+
+    cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id));
+    /* The popped element is no longer owned by the list; drop our ref */
+    qobject_unref(o_ctx_id);
+
+    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
+    if (unlikely(!bctx)) {
+        pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id);
+        return;
+    }
+
+    pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx);
+
+    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
+                           bctx->sge.length);
+    if (!mad || bctx->sge.length < umad->hdr.length + MAD_HDR_SIZE) {
+        if (mad) {
+            /* Mapped but too small: release the mapping before failing */
+            rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);
+        }
+        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
+                     bctx->up_ctx);
+    } else {
+        memset(mad, 0, bctx->sge.length);
+        build_mad_hdr((struct ibv_grh *)mad,
+                      (union ibv_gid *)&umad->hdr.addr.gid,
+                      &backend_dev->gid, umad->hdr.length);
+        memcpy(&mad[MAD_HDR_SIZE], umad->mad, umad->hdr.length);
+        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);
+
+        comp_handler(IBV_WC_SUCCESS, 0, bctx->up_ctx);
+    }
+
+    /* The receive buffer is consumed whether or not the copy succeeded */
+    g_free(bctx);
+    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
+}
+
+/*
+ * Set up MAD support for @backend_dev.
+ *
+ * Requires a connected MAD chardev.  Creates the list (and its lock) of
+ * pending receive buffers, installs mad_can_receieve()/mad_read() as the
+ * chardev handlers and announces this device's GID by writing a
+ * backend_umad header to the chardev.
+ *
+ * Returns 0 on success or -EIO when the chardev is not connected.
+ */
+static int mad_init(RdmaBackendDev *backend_dev)
+{
+    struct backend_umad umad = {0};
+    int ret;
+
+    if (!qemu_chr_fe_backend_connected(backend_dev->mad_chr_be)) {
+        pr_dbg("Missing chardev for MAD multiplexer\n");
+        return -EIO;
+    }
+
+    /*
+     * mad_read() locks recv_mads_list.lock and pops from the list, so both
+     * have to exist before the read handler is installed.
+     */
+    qemu_mutex_init(&backend_dev->recv_mads_list.lock);
+    backend_dev->recv_mads_list.list = qlist_new();
+
+    qemu_chr_fe_set_handlers(backend_dev->mad_chr_be, mad_can_receieve,
+                             mad_read, NULL, NULL, backend_dev, NULL, true);
+
+    /* Register ourself */
+    memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
+    ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
+                            sizeof(umad.hdr));
+    if (ret != sizeof(umad.hdr)) {
+        /*
+         * NOTE(review): a failed registration is only logged and init still
+         * succeeds; confirm that this best-effort behaviour is intended.
+         */
+        pr_dbg("Fail to register to rdma_umadmux (%d)\n", ret);
+    }
+
+    return 0;
+}
+
+/*
+ * Undo mad_init(): destroy the list of pending MAD receive buffers and the
+ * mutex that protected it.
+ */
+static void mad_fini(RdmaBackendDev *backend_dev)
+{
+    QList *pending = backend_dev->recv_mads_list.list;
+
+    qlist_destroy_obj(QOBJECT(pending));
+    qemu_mutex_destroy(&backend_dev->recv_mads_list.lock);
+}
+
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
RdmaDeviceResources *rdma_dev_res,
const char *backend_device_name, uint8_t port_num,
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
- Error **errp)
+ CharBackend *mad_chr_be, Error **errp)
{
int i;
int ret = 0;
@@ -763,7 +991,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
memset(backend_dev, 0, sizeof(*backend_dev));
backend_dev->dev = pdev;
-
+ backend_dev->mad_chr_be = mad_chr_be;
backend_dev->backend_gid_idx = backend_gid_idx;
backend_dev->port_num = port_num;
backend_dev->rdma_dev_res = rdma_dev_res;
@@ -854,6 +1082,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
pr_dbg("interface_id=0x%" PRIx64 "\n",
be64_to_cpu(backend_dev->gid.global.interface_id));
+ ret = mad_init(backend_dev);
+ if (ret) {
+ error_setg(errp, "Fail to initialize mad");
+ ret = -EIO;
+ goto out_destroy_comm_channel;
+ }
+
backend_dev->comp_thread.run = false;
backend_dev->comp_thread.is_running = false;
@@ -890,6 +1125,7 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev)
void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
rdma_backend_stop(backend_dev);
+ mad_fini(backend_dev);
g_hash_table_destroy(ah_hash);
ibv_destroy_comp_channel(backend_dev->channel);
ibv_close_device(backend_dev->context);
diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
index 3ccc9a2..fc83330 100644
--- a/hw/rdma/rdma_backend.h
+++ b/hw/rdma/rdma_backend.h
@@ -17,6 +17,8 @@
#define RDMA_BACKEND_H
#include "qapi/error.h"
+#include "chardev/char-fe.h"
+
#include "rdma_rm_defs.h"
#include "rdma_backend_defs.h"
@@ -50,7 +52,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
RdmaDeviceResources *rdma_dev_res,
const char *backend_device_name, uint8_t port_num,
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
- Error **errp);
+ CharBackend *mad_chr_be, Error **errp);
void rdma_backend_fini(RdmaBackendDev *backend_dev);
void rdma_backend_start(RdmaBackendDev *backend_dev);
void rdma_backend_stop(RdmaBackendDev *backend_dev);
diff --git a/hw/rdma/rdma_backend_defs.h b/hw/rdma/rdma_backend_defs.h
index 7404f64..2a7e667 100644
--- a/hw/rdma/rdma_backend_defs.h
+++ b/hw/rdma/rdma_backend_defs.h
@@ -16,8 +16,9 @@
#ifndef RDMA_BACKEND_DEFS_H
#define RDMA_BACKEND_DEFS_H
-#include <infiniband/verbs.h>
#include "qemu/thread.h"
+#include "chardev/char-fe.h"
+#include <infiniband/verbs.h>
typedef struct RdmaDeviceResources RdmaDeviceResources;
@@ -28,6 +29,11 @@ typedef struct RdmaBackendThread {
bool is_running; /* Set by the thread to report its status */
} RdmaBackendThread;
+typedef struct RecvMadList { /* Pending MAD receive buffers of a backend */
+ QemuMutex lock; /* Taken around appends to and pops from list */
+ QList *list; /* Holds integer cqe-ctx ids of queued buffers */
+} RecvMadList;
+
typedef struct RdmaBackendDev {
struct ibv_device_attr dev_attr;
RdmaBackendThread comp_thread;
@@ -39,6 +45,8 @@ typedef struct RdmaBackendDev {
struct ibv_comp_channel *channel;
uint8_t port_num;
uint8_t backend_gid_idx;
+ RecvMadList recv_mads_list;
+ CharBackend *mad_chr_be;
} RdmaBackendDev;
typedef struct RdmaBackendPD {
diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index e2d9f93..e3742d8 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -19,6 +19,7 @@
#include "qemu/units.h"
#include "hw/pci/pci.h"
#include "hw/pci/msix.h"
+#include "chardev/char-fe.h"
#include "../rdma_backend_defs.h"
#include "../rdma_rm_defs.h"
@@ -83,6 +84,7 @@ typedef struct PVRDMADev {
uint8_t backend_port_num;
RdmaBackendDev backend_dev;
RdmaDeviceResources rdma_dev_res;
+ CharBackend mad_chr;
} PVRDMADev;
#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index ca5fa8d..6c8c015 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -51,6 +51,7 @@ static Property pvrdma_dev_properties[] = {
DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev,
dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM),
DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH),
+ DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr),
DEFINE_PROP_END_OF_LIST(),
};
@@ -613,7 +614,8 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res,
dev->backend_device_name, dev->backend_port_num,
- dev->backend_gid_idx, &dev->dev_attr, errp);
+ dev->backend_gid_idx, &dev->dev_attr, &dev->mad_chr,
+ errp);
if (rc) {
goto out;
}