aboutsummaryrefslogtreecommitdiff
path: root/lib/tran_sock.c
diff options
context:
space:
mode:
authorJohn Levon <john.levon@nutanix.com>2020-11-19 09:23:10 +0000
committerGitHub <noreply@github.com>2020-11-19 09:23:10 +0000
commit58b89f9e0f7cd7847606fb22d2c0b9a38735cd62 (patch)
tree2fe5e0f56a1bb02572d8cd95cbd3c7f91a3ee9db /lib/tran_sock.c
parent57777a738b6945c203f28f7ee5829d04e8824f8c (diff)
downloadlibvfio-user-58b89f9e0f7cd7847606fb22d2c0b9a38735cd62.zip
libvfio-user-58b89f9e0f7cd7847606fb22d2c0b9a38735cd62.tar.gz
libvfio-user-58b89f9e0f7cd7847606fb22d2c0b9a38735cd62.tar.bz2
refactor socket code into lib/tran_sock.[ch] (#97)
Diffstat (limited to 'lib/tran_sock.c')
-rw-r--r--lib/tran_sock.c575
1 files changed, 575 insertions, 0 deletions
diff --git a/lib/tran_sock.c b/lib/tran_sock.c
new file mode 100644
index 0000000..856d296
--- /dev/null
+++ b/lib/tran_sock.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2020 Nutanix Inc. All rights reserved.
+ *
+ * Authors: Thanos Makatos <thanos@nutanix.com>
+ * Swapnil Ingle <swapnil.ingle@nutanix.com>
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Nutanix nor the names of its contributors may be
+ * used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "muser.h"
+#include "muser_priv.h"
+#include "tran_sock.h"
+
+#define MAX_FDS 8
+
+static inline int
+recv_blocking(int sock, void *buf, size_t len, int flags)
+{
+ int f = fcntl(sock, F_GETFL, 0);
+ int ret, fret;
+
+ fret = fcntl(sock, F_SETFL, f & ~O_NONBLOCK);
+ assert(fret != -1);
+
+ ret = recv(sock, buf, len, flags);
+
+ fret = fcntl(sock, F_SETFL, f);
+ assert(fret != -1);
+
+ return ret;
+}
+
+static int
+init_sock(lm_ctx_t *lm_ctx)
+{
+ struct sockaddr_un addr = { .sun_family = AF_UNIX };
+ int ret, unix_sock;
+ mode_t mode;
+
+ assert(lm_ctx != NULL);
+
+ /* FIXME SPDK can't easily run as non-root */
+ mode = umask(0000);
+
+ if ((unix_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (lm_ctx->flags & LM_FLAG_ATTACH_NB) {
+ ret = fcntl(unix_sock, F_SETFL,
+ fcntl(unix_sock, F_GETFL, 0) | O_NONBLOCK);
+ if (ret < 0) {
+ ret = -errno;
+ goto out;
+ }
+ lm_ctx->sock_flags = MSG_DONTWAIT | MSG_WAITALL;
+ } else {
+ lm_ctx->sock_flags = 0;
+ }
+
+ ret = snprintf(addr.sun_path, sizeof addr.sun_path, "%s", lm_ctx->uuid);
+ if (ret >= (int)sizeof addr.sun_path) {
+ ret = -ENAMETOOLONG;
+ }
+ if (ret < 0) {
+ goto out;
+ }
+
+ /* start listening business */
+ ret = bind(unix_sock, (struct sockaddr*)&addr, sizeof(addr));
+ if (ret < 0) {
+ ret = errno;
+ goto out;
+ }
+
+ ret = listen(unix_sock, 0);
+ if (ret < 0) {
+ ret = -errno;
+ }
+
+out:
+ umask(mode);
+ if (ret != 0) {
+ close(unix_sock);
+ return ret;
+ }
+ return unix_sock;
+}
+
+int
+_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *fds, int count, int err)
+{
+ int ret;
+ struct vfio_user_header hdr = {.msg_id = msg_id};
+ struct msghdr msg;
+ size_t i;
+
+ if (nr_iovecs == 0) {
+ iovecs = alloca(sizeof(*iovecs));
+ nr_iovecs = 1;
+ }
+
+ memset(&msg, 0, sizeof(msg));
+
+ if (is_reply) {
+ // FIXME: SPEC: should the reply include the command? I'd say yes?
+ hdr.flags.type = VFIO_USER_F_TYPE_REPLY;
+ if (err != 0) {
+ hdr.flags.error = 1U;
+ hdr.error_no = err;
+ }
+ } else {
+ hdr.cmd = cmd;
+ hdr.flags.type = VFIO_USER_F_TYPE_COMMAND;
+ }
+
+ iovecs[0].iov_base = &hdr;
+ iovecs[0].iov_len = sizeof(hdr);
+
+ for (i = 0; i < nr_iovecs; i++) {
+ hdr.msg_size += iovecs[i].iov_len;
+ }
+
+ msg.msg_iovlen = nr_iovecs;
+ msg.msg_iov = iovecs;
+
+ if (fds != NULL) {
+ size_t size = count * sizeof *fds;
+ char *buf = alloca(CMSG_SPACE(size));
+
+ msg.msg_control = buf;
+ msg.msg_controllen = CMSG_SPACE(size);
+
+ struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(size);
+ memcpy(CMSG_DATA(cmsg), fds, size);
+ }
+
+ // FIXME: this doesn't check the entire data was sent?
+ ret = sendmsg(sock, &msg, 0);
+ if (ret == -1) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+int
+send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply,
+ enum vfio_user_command cmd,
+ void *data, size_t data_len,
+ int *fds, size_t count)
+{
+ /* [0] is for the header. */
+ struct iovec iovecs[2] = {
+ [1] = {
+ .iov_base = data,
+ .iov_len = data_len
+ }
+ };
+ return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs,
+ ARRAY_SIZE(iovecs), fds, count, 0);
+}
+
+/*
+ * Receive a vfio-user message. If "len" is set to non-zero, the message should
+ * include data of that length, which is stored in the pre-allocated "data"
+ * pointer.
+ *
+ * FIXME: in general, sort out negative err returns - they should only be used
+ * when we're going to return > 0 on success, and even then "errno" might be
+ * better.
+ */
+int
+recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply,
+ uint16_t *msg_id, void *data, size_t *len)
+{
+ int ret;
+
+ /* FIXME if ret == -1 then fcntl can overwrite recv's errno */
+
+ ret = recv_blocking(sock, hdr, sizeof(*hdr), 0);
+ if (ret == -1) {
+ return -errno;
+ }
+ if (ret < (int)sizeof(*hdr)) {
+ return -EINVAL;
+ }
+
+ if (is_reply) {
+ if (hdr->msg_id != *msg_id) {
+ return -EINVAL;
+ }
+
+ if (hdr->flags.type != VFIO_USER_F_TYPE_REPLY) {
+ return -EINVAL;
+ }
+
+ if (hdr->flags.error == 1U) {
+ if (hdr->error_no <= 0) {
+ hdr->error_no = EINVAL;
+ }
+ return -hdr->error_no;
+ }
+ } else {
+ if (hdr->flags.type != VFIO_USER_F_TYPE_COMMAND) {
+ return -EINVAL;
+ }
+ *msg_id = hdr->msg_id;
+ }
+
+ if (len != NULL && *len > 0 && hdr->msg_size > sizeof *hdr) {
+ ret = recv_blocking(sock, data, MIN(hdr->msg_size - sizeof *hdr, *len),
+ 0);
+ if (ret < 0) {
+ return ret;
+ }
+ if (*len != (size_t)ret) { /* FIXME we should allow receiving less */
+ return -EINVAL;
+ }
+ *len = ret;
+ }
+ return 0;
+}
+
+/*
+ * Like recv_vfio_user_msg(), but will automatically allocate reply data.
+ *
+ * FIXME: this does an unconstrained alloc of client-supplied data.
+ */
+int
+recv_vfio_user_msg_alloc(int sock, struct vfio_user_header *hdr, bool is_reply,
+ uint16_t *msg_id, void **datap, size_t *lenp)
+{
+ void *data;
+ size_t len;
+ int ret;
+
+ ret = recv_vfio_user_msg(sock, hdr, is_reply, msg_id, NULL, NULL);
+
+ if (ret != 0) {
+ return ret;
+ }
+
+ assert(hdr->msg_size >= sizeof (*hdr));
+
+ len = hdr->msg_size - sizeof (*hdr);
+
+ if (len == 0) {
+ *datap = NULL;
+ *lenp = 0;
+ return 0;
+ }
+
+ data = calloc(1, len);
+
+ if (data == NULL) {
+ return -errno;
+ }
+
+ ret = recv_blocking(sock, data, len, 0);
+ if (ret < 0) {
+ free(data);
+ return -errno;
+ }
+
+ if (len != (size_t)ret) {
+ free(data);
+ return -EINVAL;
+ }
+
+ *datap = data;
+ *lenp = len;
+ return 0;
+}
+
+int
+_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ struct iovec *iovecs, size_t nr_iovecs,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len)
+{
+ int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs,
+ send_fds, fd_count, 0);
+ if (ret < 0) {
+ return ret;
+ }
+ if (hdr == NULL) {
+ hdr = alloca(sizeof *hdr);
+ }
+ return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len);
+}
+
+int
+send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd,
+ void *send_data, size_t send_len,
+ int *send_fds, size_t fd_count,
+ struct vfio_user_header *hdr,
+ void *recv_data, size_t recv_len)
+{
+ /* [0] is for the header. */
+ struct iovec iovecs[2] = {
+ [1] = {
+ .iov_base = send_data,
+ .iov_len = send_len
+ }
+ };
+ return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs,
+ ARRAY_SIZE(iovecs), send_fds, fd_count,
+ hdr, recv_data, recv_len);
+}
+
+int
+recv_version(lm_ctx_t *lm_ctx, int sock, uint16_t *msg_idp,
+ struct vfio_user_version **versionp)
+{
+ struct vfio_user_version *cversion;
+ struct vfio_user_header hdr;
+ size_t vlen;
+ int ret;
+
+ *versionp = NULL;
+
+ ret = recv_vfio_user_msg_alloc(sock, &hdr, false, msg_idp,
+ (void **)&cversion, &vlen);
+
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to receive version: %s", strerror(-ret));
+ goto out;
+ }
+
+ if (hdr.cmd != VFIO_USER_VERSION) {
+ lm_log(lm_ctx, LM_ERR, "msg%hx: invalid cmd %hu (expected %hu)",
+ *msg_idp, hdr.cmd, VFIO_USER_VERSION);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (vlen < sizeof (*cversion)) {
+ lm_log(lm_ctx, LM_ERR, "msg%hx (VFIO_USER_VERSION): invalid size %lu",
+ *msg_idp, vlen);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* FIXME: oracle qemu code has major of 1 currently */
+#if 0
+ if (cversion->major != LIB_MUSER_VFIO_USER_VERS_MJ) {
+ lm_log(lm_ctx, LM_ERR, "unsupported client major %hu (must be %hu)",
+ cversion->major, LIB_MUSER_VFIO_USER_VERS_MJ);
+ ret = -ENOTSUP;
+ goto out;
+ }
+#endif
+
+ /*
+ * FIXME: incorrect, if the client doesn't give a pgsize value, it means "no
+ * migration support", handle this
+ */
+ lm_ctx->migration.pgsize = sysconf(_SC_PAGESIZE);
+ lm_ctx->client_max_fds = 1;
+
+ if (vlen > sizeof (*cversion)) {
+ lm_log(lm_ctx, LM_DBG, "ignoring JSON \"%s\"", cversion->data);
+ // FIXME: don't ignore it.
+ lm_ctx->client_max_fds = 128;
+ }
+
+out:
+ if (ret != 0) {
+ free(cversion);
+ cversion = NULL;
+ }
+
+ *versionp = cversion;
+ return ret;
+}
+
+int
+send_version(lm_ctx_t *lm_ctx, int sock, uint16_t msg_id,
+ struct vfio_user_version *cversion)
+{
+ struct vfio_user_version *sversion = NULL;
+ char *server_caps = NULL;
+ size_t len;
+ int ret;
+
+ ret = asprintf(&server_caps, "{"
+ "\"capabilities\":{"
+ "\"max_fds\":%u,"
+ "\"migration\":{"
+ "\"pgsize\":%zu"
+ "}"
+ "}"
+ "}", MAX_FDS, lm_ctx->migration.pgsize);
+
+ if (ret == -1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ len = sizeof (*sversion) + ret + 1;
+ sversion = calloc(1, len);
+
+ if (sversion == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ // FIXME: we should save the client minor here, and check that before trying
+ // to send unsupported things.
+ sversion->major = LIB_MUSER_VFIO_USER_VERS_MJ;
+ sversion->minor = MIN(cversion->minor, LIB_MUSER_VFIO_USER_VERS_MN);
+
+ // FIXME: strcpy sucks
+ strcpy(((char *)sversion) + offsetof(struct vfio_user_version, data),
+ server_caps);
+
+ ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_VERSION, sversion,
+ len, NULL, 0);
+
+out:
+ free(server_caps);
+ free(sversion);
+ return ret;
+}
+
+static int
+negotiate(lm_ctx_t *lm_ctx, int sock)
+{
+ struct vfio_user_version *client_version = NULL;
+ uint16_t msg_id;
+ int ret;
+
+ ret = recv_version(lm_ctx, sock, &msg_id, &client_version);
+
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to recv version: %s", strerror(-ret));
+ return ret;
+ }
+
+ ret = send_version(lm_ctx, sock, msg_id, client_version);
+
+ free(client_version);
+
+ if (ret < 0) {
+ lm_log(lm_ctx, LM_ERR, "failed to send version: %s", strerror(-ret));
+ }
+
+ return ret;
+}
+
+/**
+ * lm_ctx: libmuser context
+ * FIXME: this shouldn't be happening as part of lm_ctx_create().
+ */
+static int
+open_sock(lm_ctx_t *lm_ctx)
+{
+ int ret;
+ int conn_fd;
+
+ assert(lm_ctx != NULL);
+
+ conn_fd = accept(lm_ctx->fd, NULL, NULL);
+ if (conn_fd == -1) {
+ return conn_fd;
+ }
+
+ ret = negotiate(lm_ctx, conn_fd);
+ if (ret < 0) {
+ return ret;
+ }
+
+ lm_ctx->conn_fd = conn_fd;
+ return conn_fd;
+}
+
+static int
+close_sock(lm_ctx_t *lm_ctx)
+{
+ return close(lm_ctx->conn_fd);
+}
+
+static int
+get_request_sock(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr,
+ int *fds, int *nr_fds)
+{
+ int ret;
+ struct iovec iov = {.iov_base = hdr, .iov_len = sizeof *hdr};
+ struct msghdr msg = {.msg_iov = &iov, .msg_iovlen = 1};
+ struct cmsghdr *cmsg;
+
+ msg.msg_controllen = CMSG_SPACE(sizeof(int) * *nr_fds);
+ msg.msg_control = alloca(msg.msg_controllen);
+
+ /*
+ * TODO ideally we should set O_NONBLOCK on the fd so that the syscall is
+ * faster (?). I tried that and get short reads, so we need to store the
+ * partially received buffer somewhere and retry.
+ */
+ ret = recvmsg(lm_ctx->conn_fd, &msg, lm_ctx->sock_flags);
+ if (ret == -1) {
+ return -errno;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+ if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) {
+ continue;
+ }
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
+ return -EINVAL;
+ }
+ int size = cmsg->cmsg_len - CMSG_LEN(0);
+ if (size % sizeof(int) != 0) {
+ return -EINVAL;
+ }
+ *nr_fds = (int)(size / sizeof(int));
+ memcpy(fds, CMSG_DATA(cmsg), *nr_fds * sizeof(int));
+ break;
+ }
+
+ return ret;
+}
+
+struct transport_ops sock_transport_ops = {
+ .init = init_sock,
+ .attach = open_sock,
+ .detach = close_sock,
+ .get_request = get_request_sock,
+};
+
+/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */