diff options
Diffstat (limited to 'hw/vfio-user/proxy.c')
-rw-r--r-- | hw/vfio-user/proxy.c | 1356 |
1 files changed, 1356 insertions, 0 deletions
diff --git a/hw/vfio-user/proxy.c b/hw/vfio-user/proxy.c new file mode 100644 index 0000000..c418954 --- /dev/null +++ b/hw/vfio-user/proxy.c @@ -0,0 +1,1356 @@ +/* + * vfio protocol over a UNIX socket. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> + +#include "hw/vfio/vfio-device.h" +#include "hw/vfio-user/proxy.h" +#include "hw/vfio-user/trace.h" +#include "qapi/error.h" +#include "qobject/qbool.h" +#include "qobject/qdict.h" +#include "qobject/qjson.h" +#include "qobject/qnum.h" +#include "qemu/error-report.h" +#include "qemu/lockable.h" +#include "qemu/main-loop.h" +#include "qemu/thread.h" +#include "system/iothread.h" + +static IOThread *vfio_user_iothread; + +static void vfio_user_shutdown(VFIOUserProxy *proxy); +static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds); +static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg); + +static void vfio_user_recv(void *opaque); +static void vfio_user_send(void *opaque); +static void vfio_user_cb(void *opaque); + +static void vfio_user_request(void *opaque); + +static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err) +{ + hdr->flags |= VFIO_USER_ERROR; + hdr->error_reply = err; +} + +/* + * Functions called by main, CPU, or iothread threads + */ + +static void vfio_user_shutdown(VFIOUserProxy *proxy) +{ + qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL); + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, NULL, + proxy->ctx, NULL, NULL); +} + +/* + * Same return values as qio_channel_writev_full(): + * + * QIO_CHANNEL_ERR_BLOCK: *errp not set + * -1: *errp will be populated + * otherwise: bytes written + */ +static ssize_t vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg, + Error **errp) +{ + VFIOUserFDs *fds = msg->fds; + struct iovec iov = { + .iov_base = msg->hdr, + .iov_len = msg->hdr->size, + }; + size_t numfds = 0; + int *fdp = NULL; + ssize_t ret; + + if (fds != NULL && fds->send_fds != 0) { + numfds = fds->send_fds; + fdp = fds->fds; + } + + ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, 0, errp); + + if (ret == -1) { + vfio_user_set_error(msg->hdr, EIO); + vfio_user_shutdown(proxy); + } + trace_vfio_user_send_write(msg->hdr->id, ret); + + return ret; +} + +static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds) +{ + VFIOUserMsg *msg; + + msg = QTAILQ_FIRST(&proxy->free); + if (msg != NULL) { + QTAILQ_REMOVE(&proxy->free, msg, next); + } else { + msg = g_malloc0(sizeof(*msg)); + qemu_cond_init(&msg->cv); + } + + msg->hdr = hdr; + msg->fds = fds; + return msg; +} + +/* + * Recycle a message list entry to the free list. + */ +static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg) +{ + if (msg->type == VFIO_MSG_NONE) { + error_printf("vfio_user_recycle - freeing free msg\n"); + return; + } + + /* free msg buffer if no one is waiting to consume the reply */ + if (msg->type == VFIO_MSG_NOWAIT || msg->type == VFIO_MSG_ASYNC) { + g_free(msg->hdr); + if (msg->fds != NULL) { + g_free(msg->fds); + } + } + + msg->type = VFIO_MSG_NONE; + msg->hdr = NULL; + msg->fds = NULL; + msg->complete = false; + msg->pending = false; + QTAILQ_INSERT_HEAD(&proxy->free, msg, next); +} + +VFIOUserFDs *vfio_user_getfds(int numfds) +{ + VFIOUserFDs *fds = g_malloc0(sizeof(*fds) + (numfds * sizeof(int))); + + fds->fds = (int *)((char *)fds + sizeof(*fds)); + + return fds; +} + +/* + * Functions only called by iothread + */ + +/* + * Process a received message. + */ +static void vfio_user_process(VFIOUserProxy *proxy, VFIOUserMsg *msg, + bool isreply) +{ + + /* + * Replies signal a waiter, if none just check for errors + * and free the message buffer. + * + * Requests get queued for the BH. + */ + if (isreply) { + msg->complete = true; + if (msg->type == VFIO_MSG_WAIT) { + qemu_cond_signal(&msg->cv); + } else { + if (msg->hdr->flags & VFIO_USER_ERROR) { + error_printf("vfio_user_process: error reply on async "); + error_printf("request command %x error %s\n", + msg->hdr->command, + strerror(msg->hdr->error_reply)); + } + /* youngest nowait msg has been ack'd */ + if (proxy->last_nowait == msg) { + proxy->last_nowait = NULL; + } + vfio_user_recycle(proxy, msg); + } + } else { + QTAILQ_INSERT_TAIL(&proxy->incoming, msg, next); + qemu_bh_schedule(proxy->req_bh); + } +} + +/* + * Complete a partial message read + */ +static int vfio_user_complete(VFIOUserProxy *proxy, Error **errp) +{ + VFIOUserMsg *msg = proxy->part_recv; + size_t msgleft = proxy->recv_left; + bool isreply; + char *data; + int ret; + + data = (char *)msg->hdr + (msg->hdr->size - msgleft); + while (msgleft > 0) { + ret = qio_channel_read(proxy->ioc, data, msgleft, errp); + + /* error or would block */ + if (ret <= 0) { + /* try for rest on next iternation */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + proxy->recv_left = msgleft; + } + return ret; + } + trace_vfio_user_recv_read(msg->hdr->id, ret); + + msgleft -= ret; + data += ret; + } + + /* + * Read complete message, process it. + */ + proxy->part_recv = NULL; + proxy->recv_left = 0; + isreply = (msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REPLY; + vfio_user_process(proxy, msg, isreply); + + /* return positive value */ + return 1; +} + +/* + * Receive and process one incoming message. + * + * For replies, find matching outgoing request and wake any waiters. + * For requests, queue in incoming list and run request BH. + */ +static int vfio_user_recv_one(VFIOUserProxy *proxy, Error **errp) +{ + VFIOUserMsg *msg = NULL; + g_autofree int *fdp = NULL; + VFIOUserFDs *reqfds; + VFIOUserHdr hdr; + struct iovec iov = { + .iov_base = &hdr, + .iov_len = sizeof(hdr), + }; + bool isreply = false; + int i, ret; + size_t msgleft, numfds = 0; + char *data = NULL; + char *buf = NULL; + + /* + * Complete any partial reads + */ + if (proxy->part_recv != NULL) { + ret = vfio_user_complete(proxy, errp); + + /* still not complete, try later */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return ret; + } + + if (ret <= 0) { + goto fatal; + } + /* else fall into reading another msg */ + } + + /* + * Read header + */ + ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds, 0, + errp); + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return ret; + } + + /* read error or other side closed connection */ + if (ret <= 0) { + goto fatal; + } + + if (ret < sizeof(hdr)) { + error_setg(errp, "short read of header"); + goto fatal; + } + + /* + * Validate header + */ + if (hdr.size < sizeof(VFIOUserHdr)) { + error_setg(errp, "bad header size"); + goto fatal; + } + switch (hdr.flags & VFIO_USER_TYPE) { + case VFIO_USER_REQUEST: + isreply = false; + break; + case VFIO_USER_REPLY: + isreply = true; + break; + default: + error_setg(errp, "unknown message type"); + goto fatal; + } + trace_vfio_user_recv_hdr(proxy->sockname, hdr.id, hdr.command, hdr.size, + hdr.flags); + + /* + * For replies, find the matching pending request. + * For requests, reap incoming FDs. + */ + if (isreply) { + QTAILQ_FOREACH(msg, &proxy->pending, next) { + if (hdr.id == msg->id) { + break; + } + } + if (msg == NULL) { + error_setg(errp, "unexpected reply"); + goto err; + } + QTAILQ_REMOVE(&proxy->pending, msg, next); + + /* + * Process any received FDs + */ + if (numfds != 0) { + if (msg->fds == NULL || msg->fds->recv_fds < numfds) { + error_setg(errp, "unexpected FDs"); + goto err; + } + msg->fds->recv_fds = numfds; + memcpy(msg->fds->fds, fdp, numfds * sizeof(int)); + } + } else { + if (numfds != 0) { + reqfds = vfio_user_getfds(numfds); + memcpy(reqfds->fds, fdp, numfds * sizeof(int)); + } else { + reqfds = NULL; + } + } + + /* + * Put the whole message into a single buffer. + */ + if (isreply) { + if (hdr.size > msg->rsize) { + error_setg(errp, "reply larger than recv buffer"); + goto err; + } + *msg->hdr = hdr; + data = (char *)msg->hdr + sizeof(hdr); + } else { + if (hdr.size > proxy->max_xfer_size + sizeof(VFIOUserDMARW)) { + error_setg(errp, "vfio_user_recv request larger than max"); + goto err; + } + buf = g_malloc0(hdr.size); + memcpy(buf, &hdr, sizeof(hdr)); + data = buf + sizeof(hdr); + msg = vfio_user_getmsg(proxy, (VFIOUserHdr *)buf, reqfds); + msg->type = VFIO_MSG_REQ; + } + + /* + * Read rest of message. + */ + msgleft = hdr.size - sizeof(hdr); + while (msgleft > 0) { + ret = qio_channel_read(proxy->ioc, data, msgleft, errp); + + /* prepare to complete read on next iternation */ + if (ret == QIO_CHANNEL_ERR_BLOCK) { + proxy->part_recv = msg; + proxy->recv_left = msgleft; + return ret; + } + + if (ret <= 0) { + goto fatal; + } + trace_vfio_user_recv_read(hdr.id, ret); + + msgleft -= ret; + data += ret; + } + + vfio_user_process(proxy, msg, isreply); + return 0; + + /* + * fatal means the other side closed or we don't trust the stream + * err means this message is corrupt + */ +fatal: + vfio_user_shutdown(proxy); + proxy->state = VFIO_PROXY_ERROR; + + /* set error if server side closed */ + if (ret == 0) { + error_setg(errp, "server closed socket"); + } + +err: + for (i = 0; i < numfds; i++) { + close(fdp[i]); + } + if (isreply && msg != NULL) { + /* force an error to keep sending thread from hanging */ + vfio_user_set_error(msg->hdr, EINVAL); + msg->complete = true; + qemu_cond_signal(&msg->cv); + } + return -1; +} + +static void vfio_user_recv(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + if (proxy->state == VFIO_PROXY_CONNECTED) { + Error *local_err = NULL; + + while (vfio_user_recv_one(proxy, &local_err) == 0) { + ; + } + + if (local_err != NULL) { + error_report_err(local_err); + } + } +} + +/* + * Send a single message, same return semantics as vfio_user_send_qio(). + * + * Sent async messages are freed, others are moved to pending queue. + */ +static ssize_t vfio_user_send_one(VFIOUserProxy *proxy, Error **errp) +{ + VFIOUserMsg *msg; + ssize_t ret; + + msg = QTAILQ_FIRST(&proxy->outgoing); + ret = vfio_user_send_qio(proxy, msg, errp); + if (ret < 0) { + return ret; + } + + QTAILQ_REMOVE(&proxy->outgoing, msg, next); + proxy->num_outgoing--; + if (msg->type == VFIO_MSG_ASYNC) { + vfio_user_recycle(proxy, msg); + } else { + QTAILQ_INSERT_TAIL(&proxy->pending, msg, next); + msg->pending = true; + } + + return ret; +} + +/* + * Send messages from outgoing queue when the socket buffer has space. + * If we deplete 'outgoing', remove ourselves from the poll list. + */ +static void vfio_user_send(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + if (proxy->state == VFIO_PROXY_CONNECTED) { + while (!QTAILQ_EMPTY(&proxy->outgoing)) { + Error *local_err = NULL; + int ret; + + ret = vfio_user_send_one(proxy, &local_err); + + if (ret == QIO_CHANNEL_ERR_BLOCK) { + return; + } else if (ret == -1) { + error_report_err(local_err); + return; + } + } + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, NULL, NULL, proxy); + + /* queue empty - send any pending multi write msgs */ + if (proxy->wr_multi != NULL) { + vfio_user_flush_multi(proxy); + } + } +} + +static void vfio_user_cb(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + + QEMU_LOCK_GUARD(&proxy->lock); + + proxy->state = VFIO_PROXY_CLOSED; + qemu_cond_signal(&proxy->close_cv); +} + + +/* + * Functions called by main or CPU threads + */ + +/* + * Process incoming requests. + * + * The bus-specific callback has the form: + * request(opaque, msg) + * where 'opaque' was specified in vfio_user_set_handler + * and 'msg' is the inbound message. + * + * The callback is responsible for disposing of the message buffer, + * usually by re-using it when calling vfio_send_reply or vfio_send_error, + * both of which free their message buffer when the reply is sent. + * + * If the callback uses a new buffer, it needs to free the old one. + */ +static void vfio_user_request(void *opaque) +{ + VFIOUserProxy *proxy = opaque; + VFIOUserMsgQ new, free; + VFIOUserMsg *msg, *m1; + + /* reap all incoming */ + QTAILQ_INIT(&new); + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + QTAILQ_FOREACH_SAFE(msg, &proxy->incoming, next, m1) { + QTAILQ_REMOVE(&proxy->incoming, msg, next); + QTAILQ_INSERT_TAIL(&new, msg, next); + } + } + + /* process list */ + QTAILQ_INIT(&free); + QTAILQ_FOREACH_SAFE(msg, &new, next, m1) { + QTAILQ_REMOVE(&new, msg, next); + trace_vfio_user_recv_request(msg->hdr->command); + proxy->request(proxy->req_arg, msg); + QTAILQ_INSERT_HEAD(&free, msg, next); + } + + /* free list */ + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + QTAILQ_FOREACH_SAFE(msg, &free, next, m1) { + vfio_user_recycle(proxy, msg); + } + } +} + +/* + * Messages are queued onto the proxy's outgoing list. + * + * It handles 3 types of messages: + * + * async messages - replies and posted writes + * + * There will be no reply from the server, so message + * buffers are freed after they're sent. + * + * nowait messages - map/unmap during address space transactions + * + * These are also sent async, but a reply is expected so that + * vfio_wait_reqs() can wait for the youngest nowait request. + * They transition from the outgoing list to the pending list + * when sent, and are freed when the reply is received. + * + * wait messages - all other requests + * + * The reply to these messages is waited for by their caller. + * They also transition from outgoing to pending when sent, but + * the message buffer is returned to the caller with the reply + * contents. The caller is responsible for freeing these messages. + * + * As an optimization, if the outgoing list and the socket send + * buffer are empty, the message is sent inline instead of being + * added to the outgoing list. The rest of the transitions are + * unchanged. + */ +static bool vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg, + Error **errp) +{ + int ret; + + /* older coalesced writes go first */ + if (proxy->wr_multi != NULL && + ((msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REQUEST)) { + vfio_user_flush_multi(proxy); + } + + /* + * Unsent outgoing msgs - add to tail + */ + if (!QTAILQ_EMPTY(&proxy->outgoing)) { + QTAILQ_INSERT_TAIL(&proxy->outgoing, msg, next); + proxy->num_outgoing++; + return true; + } + + /* + * Try inline - if blocked, queue it and kick send poller + */ + if (proxy->flags & VFIO_PROXY_FORCE_QUEUED) { + ret = QIO_CHANNEL_ERR_BLOCK; + } else { + ret = vfio_user_send_qio(proxy, msg, errp); + } + + if (ret == QIO_CHANNEL_ERR_BLOCK) { + QTAILQ_INSERT_HEAD(&proxy->outgoing, msg, next); + proxy->num_outgoing = 1; + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, proxy->ctx, + vfio_user_send, proxy); + return true; + } + if (ret == -1) { + return false; + } + + /* + * Sent - free async, add others to pending + */ + if (msg->type == VFIO_MSG_ASYNC) { + vfio_user_recycle(proxy, msg); + } else { + QTAILQ_INSERT_TAIL(&proxy->pending, msg, next); + msg->pending = true; + } + + return true; +} + +/* + * nowait send - vfio_wait_reqs() can wait for it later + * + * Returns false if we did not successfully receive a reply message, in which + * case @errp will be populated. + * + * In either case, ownership of @hdr and @fds is taken, and the caller must + * *not* free them itself. + */ +bool vfio_user_send_nowait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize, Error **errp) +{ + VFIOUserMsg *msg; + + QEMU_LOCK_GUARD(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = rsize ? rsize : hdr->size; + msg->type = VFIO_MSG_NOWAIT; + + if (hdr->flags & VFIO_USER_NO_REPLY) { + error_setg_errno(errp, EINVAL, "%s on NO_REPLY message", __func__); + vfio_user_recycle(proxy, msg); + return false; + } + + if (!vfio_user_send_queued(proxy, msg, errp)) { + vfio_user_recycle(proxy, msg); + return false; + } + + proxy->last_nowait = msg; + + return true; +} + +/* + * Returns false if we did not successfully receive a reply message, in which + * case @errp will be populated. + * + * In either case, the caller must free @hdr and @fds if needed. + */ +bool vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, int rsize, Error **errp) +{ + VFIOUserMsg *msg; + bool ok = false; + + if (hdr->flags & VFIO_USER_NO_REPLY) { + error_setg_errno(errp, EINVAL, "%s on NO_REPLY message", __func__); + return false; + } + + qemu_mutex_lock(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = rsize ? rsize : hdr->size; + msg->type = VFIO_MSG_WAIT; + + ok = vfio_user_send_queued(proxy, msg, errp); + + if (ok) { + while (!msg->complete) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, + proxy->wait_time)) { + VFIOUserMsgQ *list; + + list = msg->pending ? &proxy->pending : &proxy->outgoing; + QTAILQ_REMOVE(list, msg, next); + error_setg_errno(errp, ETIMEDOUT, + "timed out waiting for reply"); + ok = false; + break; + } + } + } + + vfio_user_recycle(proxy, msg); + + qemu_mutex_unlock(&proxy->lock); + + return ok; +} + +/* + * async send - msg can be queued, but will be freed when sent + * + * Returns false on failure, in which case @errp will be populated. + * + * In either case, ownership of @hdr and @fds is taken, and the caller must + * *not* free them itself. + */ +bool vfio_user_send_async(VFIOUserProxy *proxy, VFIOUserHdr *hdr, + VFIOUserFDs *fds, Error **errp) +{ + VFIOUserMsg *msg; + + QEMU_LOCK_GUARD(&proxy->lock); + + msg = vfio_user_getmsg(proxy, hdr, fds); + msg->id = hdr->id; + msg->rsize = 0; + msg->type = VFIO_MSG_ASYNC; + + if (!(hdr->flags & (VFIO_USER_NO_REPLY | VFIO_USER_REPLY))) { + error_setg_errno(errp, EINVAL, "%s on sync message", __func__); + vfio_user_recycle(proxy, msg); + return false; + } + + if (!vfio_user_send_queued(proxy, msg, errp)) { + vfio_user_recycle(proxy, msg); + return false; + } + + return true; +} + +void vfio_user_wait_reqs(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg; + + /* + * Any DMA map/unmap requests sent in the middle + * of a memory region transaction were sent nowait. + * Wait for them here. + */ + qemu_mutex_lock(&proxy->lock); + if (proxy->last_nowait != NULL) { + /* + * Change type to WAIT to wait for reply + */ + msg = proxy->last_nowait; + msg->type = VFIO_MSG_WAIT; + proxy->last_nowait = NULL; + while (!msg->complete) { + if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, + proxy->wait_time)) { + VFIOUserMsgQ *list; + + list = msg->pending ? &proxy->pending : &proxy->outgoing; + QTAILQ_REMOVE(list, msg, next); + error_printf("vfio_wait_reqs - timed out\n"); + break; + } + } + + if (msg->hdr->flags & VFIO_USER_ERROR) { + error_printf("vfio_user_wait_reqs - error reply on async "); + error_printf("request: command %x error %s\n", msg->hdr->command, + strerror(msg->hdr->error_reply)); + } + + /* + * Change type back to NOWAIT to free + */ + msg->type = VFIO_MSG_NOWAIT; + vfio_user_recycle(proxy, msg); + } + + qemu_mutex_unlock(&proxy->lock); +} + +/* + * Reply to an incoming request. + */ +void vfio_user_send_reply(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int size) +{ + Error *local_err = NULL; + + if (size < sizeof(VFIOUserHdr)) { + error_printf("%s: size too small", __func__); + g_free(hdr); + return; + } + + /* + * convert header to associated reply + */ + hdr->flags = VFIO_USER_REPLY; + hdr->size = size; + + if (!vfio_user_send_async(proxy, hdr, NULL, &local_err)) { + error_report_err(local_err); + } +} + +/* + * Send an error reply to an incoming request. + */ +void vfio_user_send_error(VFIOUserProxy *proxy, VFIOUserHdr *hdr, int error) +{ + Error *local_err = NULL; + + /* + * convert header to associated reply + */ + hdr->flags = VFIO_USER_REPLY; + hdr->flags |= VFIO_USER_ERROR; + hdr->error_reply = error; + hdr->size = sizeof(*hdr); + + if (!vfio_user_send_async(proxy, hdr, NULL, &local_err)) { + error_report_err(local_err); + } +} + +/* + * Close FDs erroneously received in an incoming request. + */ +void vfio_user_putfds(VFIOUserMsg *msg) +{ + VFIOUserFDs *fds = msg->fds; + int i; + + for (i = 0; i < fds->recv_fds; i++) { + close(fds->fds[i]); + } + g_free(fds); + msg->fds = NULL; +} + +void +vfio_user_disable_posted_writes(VFIOUserProxy *proxy) +{ + WITH_QEMU_LOCK_GUARD(&proxy->lock) { + proxy->flags |= VFIO_PROXY_NO_POST; + } +} + +static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets = + QLIST_HEAD_INITIALIZER(vfio_user_sockets); + +VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp) +{ + VFIOUserProxy *proxy; + QIOChannelSocket *sioc; + QIOChannel *ioc; + char *sockname; + + if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) { + error_setg(errp, "vfio_user_connect - bad address family"); + return NULL; + } + sockname = addr->u.q_unix.path; + + sioc = qio_channel_socket_new(); + ioc = QIO_CHANNEL(sioc); + if (qio_channel_socket_connect_sync(sioc, addr, errp)) { + object_unref(OBJECT(ioc)); + return NULL; + } + qio_channel_set_blocking(ioc, false, NULL); + + proxy = g_malloc0(sizeof(VFIOUserProxy)); + proxy->sockname = g_strdup_printf("unix:%s", sockname); + proxy->ioc = ioc; + + /* init defaults */ + proxy->max_xfer_size = VFIO_USER_DEF_MAX_XFER; + proxy->max_send_fds = VFIO_USER_DEF_MAX_FDS; + proxy->max_dma = VFIO_USER_DEF_MAP_MAX; + proxy->dma_pgsizes = VFIO_USER_DEF_PGSIZE; + proxy->max_bitmap = VFIO_USER_DEF_MAX_BITMAP; + proxy->migr_pgsize = VFIO_USER_DEF_PGSIZE; + + proxy->flags = VFIO_PROXY_CLIENT; + proxy->state = VFIO_PROXY_CONNECTED; + + qemu_mutex_init(&proxy->lock); + qemu_cond_init(&proxy->close_cv); + + if (vfio_user_iothread == NULL) { + vfio_user_iothread = iothread_create("VFIO user", errp); + } + + proxy->ctx = iothread_get_aio_context(vfio_user_iothread); + proxy->req_bh = qemu_bh_new(vfio_user_request, proxy); + + QTAILQ_INIT(&proxy->outgoing); + QTAILQ_INIT(&proxy->incoming); + QTAILQ_INIT(&proxy->free); + QTAILQ_INIT(&proxy->pending); + QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next); + + return proxy; +} + +void vfio_user_set_handler(VFIODevice *vbasedev, + void (*handler)(void *opaque, VFIOUserMsg *msg), + void *req_arg) +{ + VFIOUserProxy *proxy = vbasedev->proxy; + + proxy->request = handler; + proxy->req_arg = req_arg; + qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, + vfio_user_recv, NULL, NULL, proxy); +} + +void vfio_user_disconnect(VFIOUserProxy *proxy) +{ + VFIOUserMsg *r1, *r2; + + qemu_mutex_lock(&proxy->lock); + + /* our side is quitting */ + if (proxy->state == VFIO_PROXY_CONNECTED) { + vfio_user_shutdown(proxy); + if (!QTAILQ_EMPTY(&proxy->pending)) { + error_printf("vfio_user_disconnect: outstanding requests\n"); + } + } + object_unref(OBJECT(proxy->ioc)); + proxy->ioc = NULL; + qemu_bh_delete(proxy->req_bh); + proxy->req_bh = NULL; + + proxy->state = VFIO_PROXY_CLOSING; + QTAILQ_FOREACH_SAFE(r1, &proxy->outgoing, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->outgoing, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->incoming, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->incoming, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->pending, r1, next); + g_free(r1); + } + QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) { + qemu_cond_destroy(&r1->cv); + QTAILQ_REMOVE(&proxy->free, r1, next); + g_free(r1); + } + + /* + * Make sure the iothread isn't blocking anywhere + * with a ref to this proxy by waiting for a BH + * handler to run after the proxy fd handlers were + * deleted above. + */ + aio_bh_schedule_oneshot(proxy->ctx, vfio_user_cb, proxy); + qemu_cond_wait(&proxy->close_cv, &proxy->lock); + + /* we now hold the only ref to proxy */ + qemu_mutex_unlock(&proxy->lock); + qemu_cond_destroy(&proxy->close_cv); + qemu_mutex_destroy(&proxy->lock); + + QLIST_REMOVE(proxy, next); + if (QLIST_EMPTY(&vfio_user_sockets)) { + iothread_destroy(vfio_user_iothread); + vfio_user_iothread = NULL; + } + + g_free(proxy->sockname); + g_free(proxy); +} + +void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd, + uint32_t size, uint32_t flags) +{ + static uint16_t next_id; + + hdr->id = qatomic_fetch_inc(&next_id); + hdr->command = cmd; + hdr->size = size; + hdr->flags = (flags & ~VFIO_USER_TYPE) | VFIO_USER_REQUEST; + hdr->error_reply = 0; +} + +struct cap_entry { + const char *name; + bool (*check)(VFIOUserProxy *proxy, QObject *qobj, Error **errp); +}; + +static bool caps_parse(VFIOUserProxy *proxy, QDict *qdict, + struct cap_entry caps[], Error **errp) +{ + QObject *qobj; + struct cap_entry *p; + + for (p = caps; p->name != NULL; p++) { + qobj = qdict_get(qdict, p->name); + if (qobj != NULL) { + if (!p->check(proxy, qobj, errp)) { + return false; + } + qdict_del(qdict, p->name); + } + } + + /* warning, for now */ + if (qdict_size(qdict) != 0) { + warn_report("spurious capabilities"); + } + return true; +} + +static bool check_migr_pgsize(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t pgsize; + + if (qn == NULL || !qnum_get_try_uint(qn, &pgsize)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZE); + return false; + } + + /* must be larger than default */ + if (pgsize & (VFIO_USER_DEF_PGSIZE - 1)) { + error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsize); + return false; + } + + proxy->migr_pgsize = pgsize; + return true; +} + +static bool check_bitmap(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t bitmap_size; + + if (qn == NULL || !qnum_get_try_uint(qn, &bitmap_size)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_BITMAP); + return false; + } + + /* can only lower it */ + if (bitmap_size > VFIO_USER_DEF_MAX_BITMAP) { + error_setg(errp, "%s too large", VFIO_USER_CAP_MAX_BITMAP); + return false; + } + + proxy->max_bitmap = bitmap_size; + return true; +} + +static struct cap_entry caps_migr[] = { + { VFIO_USER_CAP_PGSIZE, check_migr_pgsize }, + { VFIO_USER_CAP_MAX_BITMAP, check_bitmap }, + { NULL } +}; + +static bool check_max_fds(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_send_fds; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_send_fds) || + max_send_fds > VFIO_USER_MAX_MAX_FDS) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS); + return false; + } + proxy->max_send_fds = max_send_fds; + return true; +} + +static bool check_max_xfer(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_xfer_size; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_xfer_size) || + max_xfer_size > VFIO_USER_MAX_MAX_XFER) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_XFER); + return false; + } + proxy->max_xfer_size = max_xfer_size; + return true; +} + +static bool check_pgsizes(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t pgsizes; + + if (qn == NULL || !qnum_get_try_uint(qn, &pgsizes)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZES); + return false; + } + + /* must be larger than default */ + if (pgsizes & (VFIO_USER_DEF_PGSIZE - 1)) { + error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsizes); + return false; + } + + proxy->dma_pgsizes = pgsizes; + return true; +} + +static bool check_max_dma(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QNum *qn = qobject_to(QNum, qobj); + uint64_t max_dma; + + if (qn == NULL || !qnum_get_try_uint(qn, &max_dma)) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAP_MAX); + return false; + } + + /* can only lower it */ + if (max_dma > VFIO_USER_DEF_MAP_MAX) { + error_setg(errp, "%s too large", VFIO_USER_CAP_MAP_MAX); + return false; + } + + proxy->max_dma = max_dma; + return true; +} + +static bool check_migr(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QDict *qdict = qobject_to(QDict, qobj); + + if (qdict == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS); + return true; + } + return caps_parse(proxy, qdict, caps_migr, errp); +} + +static bool check_multi(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QBool *qb = qobject_to(QBool, qobj); + + if (qb == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP_MULTI); + return false; + } + if (qbool_get_bool(qb)) { + proxy->flags |= VFIO_PROXY_USE_MULTI; + } + return true; +} + +static struct cap_entry caps_cap[] = { + { VFIO_USER_CAP_MAX_FDS, check_max_fds }, + { VFIO_USER_CAP_MAX_XFER, check_max_xfer }, + { VFIO_USER_CAP_PGSIZES, check_pgsizes }, + { VFIO_USER_CAP_MAP_MAX, check_max_dma }, + { VFIO_USER_CAP_MIGR, check_migr }, + { VFIO_USER_CAP_MULTI, check_multi }, + { NULL } +}; + +static bool check_cap(VFIOUserProxy *proxy, QObject *qobj, Error **errp) +{ + QDict *qdict = qobject_to(QDict, qobj); + + if (qdict == NULL) { + error_setg(errp, "malformed %s", VFIO_USER_CAP); + return false; + } + return caps_parse(proxy, qdict, caps_cap, errp); +} + +static struct cap_entry ver_0_0[] = { + { VFIO_USER_CAP, check_cap }, + { NULL } +}; + +static bool caps_check(VFIOUserProxy *proxy, int minor, const char *caps, + Error **errp) +{ + QObject *qobj; + QDict *qdict; + bool ret; + + qobj = qobject_from_json(caps, NULL); + if (qobj == NULL) { + error_setg(errp, "malformed capabilities %s", caps); + return false; + } + qdict = qobject_to(QDict, qobj); + if (qdict == NULL) { + error_setg(errp, "capabilities %s not an object", caps); + qobject_unref(qobj); + return false; + } + ret = caps_parse(proxy, qdict, ver_0_0, errp); + + qobject_unref(qobj); + return ret; +} + +static GString *caps_json(void) +{ + QDict *dict = qdict_new(); + QDict *capdict = qdict_new(); + QDict *migdict = qdict_new(); + GString *str; + + qdict_put_int(migdict, VFIO_USER_CAP_PGSIZE, VFIO_USER_DEF_PGSIZE); + qdict_put_int(migdict, VFIO_USER_CAP_MAX_BITMAP, VFIO_USER_DEF_MAX_BITMAP); + qdict_put_obj(capdict, VFIO_USER_CAP_MIGR, QOBJECT(migdict)); + + qdict_put_int(capdict, VFIO_USER_CAP_MAX_FDS, VFIO_USER_MAX_MAX_FDS); + qdict_put_int(capdict, VFIO_USER_CAP_MAX_XFER, VFIO_USER_DEF_MAX_XFER); + qdict_put_int(capdict, VFIO_USER_CAP_PGSIZES, VFIO_USER_DEF_PGSIZE); + qdict_put_int(capdict, VFIO_USER_CAP_MAP_MAX, VFIO_USER_DEF_MAP_MAX); + qdict_put_bool(capdict, VFIO_USER_CAP_MULTI, true); + + qdict_put_obj(dict, VFIO_USER_CAP, QOBJECT(capdict)); + + str = qobject_to_json(QOBJECT(dict)); + qobject_unref(dict); + return str; +} + +bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp) +{ + g_autofree VFIOUserVersion *msgp = NULL; + GString *caps; + char *reply; + int size, caplen; + + caps = caps_json(); + caplen = caps->len + 1; + size = sizeof(*msgp) + caplen; + msgp = g_malloc0(size); + + vfio_user_request_msg(&msgp->hdr, VFIO_USER_VERSION, size, 0); + msgp->major = VFIO_USER_MAJOR_VER; + msgp->minor = VFIO_USER_MINOR_VER; + memcpy(&msgp->capabilities, caps->str, caplen); + g_string_free(caps, true); + trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities); + + if (!vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0, errp)) { + return false; + } + + if (msgp->hdr.flags & VFIO_USER_ERROR) { + error_setg_errno(errp, msgp->hdr.error_reply, "version reply"); + return false; + } + + if (msgp->major != VFIO_USER_MAJOR_VER || + msgp->minor > VFIO_USER_MINOR_VER) { + error_setg(errp, "incompatible server version"); + return false; + } + + reply = msgp->capabilities; + if (reply[msgp->hdr.size - sizeof(*msgp) - 1] != '\0') { + error_setg(errp, "corrupt version reply"); + return false; + } + + if (!caps_check(proxy, msgp->minor, reply, errp)) { + return false; + } + + trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities); + return true; +} + +void vfio_user_flush_multi(VFIOUserProxy *proxy) +{ + VFIOUserMsg *msg; + VFIOUserWRMulti *wm = proxy->wr_multi; + Error *local_err = NULL; + + proxy->wr_multi = NULL; + + /* adjust size for actual # of writes */ + wm->hdr.size -= (VFIO_USER_MULTI_MAX - wm->wr_cnt) * sizeof(VFIOUserWROne); + + msg = vfio_user_getmsg(proxy, &wm->hdr, NULL); + msg->id = wm->hdr.id; + msg->rsize = 0; + msg->type = VFIO_MSG_ASYNC; + trace_vfio_user_wrmulti("flush", wm->wr_cnt); + + if (!vfio_user_send_queued(proxy, msg, &local_err)) { + error_report_err(local_err); + vfio_user_recycle(proxy, msg); + } +} + +void vfio_user_create_multi(VFIOUserProxy *proxy) +{ + VFIOUserWRMulti *wm; + + wm = g_malloc0(sizeof(*wm)); + vfio_user_request_msg(&wm->hdr, VFIO_USER_REGION_WRITE_MULTI, + sizeof(*wm), VFIO_USER_NO_REPLY); + proxy->wr_multi = wm; +} + +void vfio_user_add_multi(VFIOUserProxy *proxy, uint8_t index, + off_t offset, uint32_t count, void *data) +{ + VFIOUserWRMulti *wm = proxy->wr_multi; + VFIOUserWROne *w1 = &wm->wrs[wm->wr_cnt]; + + w1->offset = offset; + w1->region = index; + w1->count = count; + memcpy(&w1->data, data, count); + + wm->wr_cnt++; + trace_vfio_user_wrmulti("add", wm->wr_cnt); + if (wm->wr_cnt == VFIO_USER_MULTI_MAX || + proxy->num_outgoing < VFIO_USER_OUT_LOW) { + vfio_user_flush_multi(proxy); + } +} |