diff options
Diffstat (limited to 'lib/tran_sock.c')
-rw-r--r-- | lib/tran_sock.c | 575 |
1 files changed, 575 insertions, 0 deletions
diff --git a/lib/tran_sock.c b/lib/tran_sock.c new file mode 100644 index 0000000..856d296 --- /dev/null +++ b/lib/tran_sock.c @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2020 Nutanix Inc. All rights reserved. + * + * Authors: Thanos Makatos <thanos@nutanix.com> + * Swapnil Ingle <swapnil.ingle@nutanix.com> + * Felipe Franciosi <felipe@nutanix.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Nutanix nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <stdio.h> +#include <string.h> +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/un.h> + +#include "muser.h" +#include "muser_priv.h" +#include "tran_sock.h" + +#define MAX_FDS 8 + +static inline int +recv_blocking(int sock, void *buf, size_t len, int flags) +{ + int f = fcntl(sock, F_GETFL, 0); + int ret, fret; + + fret = fcntl(sock, F_SETFL, f & ~O_NONBLOCK); + assert(fret != -1); + + ret = recv(sock, buf, len, flags); + + fret = fcntl(sock, F_SETFL, f); + assert(fret != -1); + + return ret; +} + +static int +init_sock(lm_ctx_t *lm_ctx) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + int ret, unix_sock; + mode_t mode; + + assert(lm_ctx != NULL); + + /* FIXME SPDK can't easily run as non-root */ + mode = umask(0000); + + if ((unix_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + ret = -errno; + goto out; + } + + if (lm_ctx->flags & LM_FLAG_ATTACH_NB) { + ret = fcntl(unix_sock, F_SETFL, + fcntl(unix_sock, F_GETFL, 0) | O_NONBLOCK); + if (ret < 0) { + ret = -errno; + goto out; + } + lm_ctx->sock_flags = MSG_DONTWAIT | MSG_WAITALL; + } else { + lm_ctx->sock_flags = 0; + } + + ret = snprintf(addr.sun_path, sizeof addr.sun_path, "%s", lm_ctx->uuid); + if (ret >= (int)sizeof addr.sun_path) { + ret = -ENAMETOOLONG; + } + if (ret < 0) { + goto out; + } + + /* start listening business */ + ret = bind(unix_sock, (struct sockaddr*)&addr, sizeof(addr)); + if (ret < 0) { + ret = errno; + goto out; + } + + ret = listen(unix_sock, 0); + if (ret < 0) { + ret = -errno; + } + +out: + umask(mode); + if (ret != 0) { + close(unix_sock); + return ret; + } + return unix_sock; +} + +int +_send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *fds, int count, int err) +{ + int ret; + struct vfio_user_header hdr = {.msg_id = msg_id}; + struct msghdr msg; + size_t i; + + if (nr_iovecs == 0) { + iovecs = alloca(sizeof(*iovecs)); + nr_iovecs = 1; + } + + memset(&msg, 0, sizeof(msg)); + + if (is_reply) { + // FIXME: SPEC: should the reply include the command? I'd say yes? + hdr.flags.type = VFIO_USER_F_TYPE_REPLY; + if (err != 0) { + hdr.flags.error = 1U; + hdr.error_no = err; + } + } else { + hdr.cmd = cmd; + hdr.flags.type = VFIO_USER_F_TYPE_COMMAND; + } + + iovecs[0].iov_base = &hdr; + iovecs[0].iov_len = sizeof(hdr); + + for (i = 0; i < nr_iovecs; i++) { + hdr.msg_size += iovecs[i].iov_len; + } + + msg.msg_iovlen = nr_iovecs; + msg.msg_iov = iovecs; + + if (fds != NULL) { + size_t size = count * sizeof *fds; + char *buf = alloca(CMSG_SPACE(size)); + + msg.msg_control = buf; + msg.msg_controllen = CMSG_SPACE(size); + + struct cmsghdr * cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(size); + memcpy(CMSG_DATA(cmsg), fds, size); + } + + // FIXME: this doesn't check the entire data was sent? + ret = sendmsg(sock, &msg, 0); + if (ret == -1) { + return -errno; + } + + return 0; +} + +int +send_vfio_user_msg(int sock, uint16_t msg_id, bool is_reply, + enum vfio_user_command cmd, + void *data, size_t data_len, + int *fds, size_t count) +{ + /* [0] is for the header. */ + struct iovec iovecs[2] = { + [1] = { + .iov_base = data, + .iov_len = data_len + } + }; + return _send_vfio_user_msg(sock, msg_id, is_reply, cmd, iovecs, + ARRAY_SIZE(iovecs), fds, count, 0); +} + +/* + * Receive a vfio-user message. If "len" is set to non-zero, the message should + * include data of that length, which is stored in the pre-allocated "data" + * pointer. + * + * FIXME: in general, sort out negative err returns - they should only be used + * when we're going to return > 0 on success, and even then "errno" might be + * better. + */ +int +recv_vfio_user_msg(int sock, struct vfio_user_header *hdr, bool is_reply, + uint16_t *msg_id, void *data, size_t *len) +{ + int ret; + + /* FIXME if ret == -1 then fcntl can overwrite recv's errno */ + + ret = recv_blocking(sock, hdr, sizeof(*hdr), 0); + if (ret == -1) { + return -errno; + } + if (ret < (int)sizeof(*hdr)) { + return -EINVAL; + } + + if (is_reply) { + if (hdr->msg_id != *msg_id) { + return -EINVAL; + } + + if (hdr->flags.type != VFIO_USER_F_TYPE_REPLY) { + return -EINVAL; + } + + if (hdr->flags.error == 1U) { + if (hdr->error_no <= 0) { + hdr->error_no = EINVAL; + } + return -hdr->error_no; + } + } else { + if (hdr->flags.type != VFIO_USER_F_TYPE_COMMAND) { + return -EINVAL; + } + *msg_id = hdr->msg_id; + } + + if (len != NULL && *len > 0 && hdr->msg_size > sizeof *hdr) { + ret = recv_blocking(sock, data, MIN(hdr->msg_size - sizeof *hdr, *len), + 0); + if (ret < 0) { + return ret; + } + if (*len != (size_t)ret) { /* FIXME we should allow receiving less */ + return -EINVAL; + } + *len = ret; + } + return 0; +} + +/* + * Like recv_vfio_user_msg(), but will automatically allocate reply data. + * + * FIXME: this does an unconstrained alloc of client-supplied data. + */ +int +recv_vfio_user_msg_alloc(int sock, struct vfio_user_header *hdr, bool is_reply, + uint16_t *msg_id, void **datap, size_t *lenp) +{ + void *data; + size_t len; + int ret; + + ret = recv_vfio_user_msg(sock, hdr, is_reply, msg_id, NULL, NULL); + + if (ret != 0) { + return ret; + } + + assert(hdr->msg_size >= sizeof (*hdr)); + + len = hdr->msg_size - sizeof (*hdr); + + if (len == 0) { + *datap = NULL; + *lenp = 0; + return 0; + } + + data = calloc(1, len); + + if (data == NULL) { + return -errno; + } + + ret = recv_blocking(sock, data, len, 0); + if (ret < 0) { + free(data); + return -errno; + } + + if (len != (size_t)ret) { + free(data); + return -EINVAL; + } + + *datap = data; + *lenp = len; + return 0; +} + +int +_send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + struct iovec *iovecs, size_t nr_iovecs, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + int ret = _send_vfio_user_msg(sock, msg_id, false, cmd, iovecs, nr_iovecs, + send_fds, fd_count, 0); + if (ret < 0) { + return ret; + } + if (hdr == NULL) { + hdr = alloca(sizeof *hdr); + } + return recv_vfio_user_msg(sock, hdr, true, &msg_id, recv_data, &recv_len); +} + +int +send_recv_vfio_user_msg(int sock, uint16_t msg_id, enum vfio_user_command cmd, + void *send_data, size_t send_len, + int *send_fds, size_t fd_count, + struct vfio_user_header *hdr, + void *recv_data, size_t recv_len) +{ + /* [0] is for the header. */ + struct iovec iovecs[2] = { + [1] = { + .iov_base = send_data, + .iov_len = send_len + } + }; + return _send_recv_vfio_user_msg(sock, msg_id, cmd, iovecs, + ARRAY_SIZE(iovecs), send_fds, fd_count, + hdr, recv_data, recv_len); +} + +int +recv_version(lm_ctx_t *lm_ctx, int sock, uint16_t *msg_idp, + struct vfio_user_version **versionp) +{ + struct vfio_user_version *cversion; + struct vfio_user_header hdr; + size_t vlen; + int ret; + + *versionp = NULL; + + ret = recv_vfio_user_msg_alloc(sock, &hdr, false, msg_idp, + (void **)&cversion, &vlen); + + if (ret < 0) { + lm_log(lm_ctx, LM_ERR, "failed to receive version: %s", strerror(-ret)); + goto out; + } + + if (hdr.cmd != VFIO_USER_VERSION) { + lm_log(lm_ctx, LM_ERR, "msg%hx: invalid cmd %hu (expected %hu)", + *msg_idp, hdr.cmd, VFIO_USER_VERSION); + ret = -EINVAL; + goto out; + } + + if (vlen < sizeof (*cversion)) { + lm_log(lm_ctx, LM_ERR, "msg%hx (VFIO_USER_VERSION): invalid size %lu", + *msg_idp, vlen); + ret = -EINVAL; + goto out; + } + + /* FIXME: oracle qemu code has major of 1 currently */ +#if 0 + if (cversion->major != LIB_MUSER_VFIO_USER_VERS_MJ) { + lm_log(lm_ctx, LM_ERR, "unsupported client major %hu (must be %hu)", + cversion->major, LIB_MUSER_VFIO_USER_VERS_MJ); + ret = -ENOTSUP; + goto out; + } +#endif + + /* + * FIXME: incorrect, if the client doesn't give a pgsize value, it means "no + * migration support", handle this + */ + lm_ctx->migration.pgsize = sysconf(_SC_PAGESIZE); + lm_ctx->client_max_fds = 1; + + if (vlen > sizeof (*cversion)) { + lm_log(lm_ctx, LM_DBG, "ignoring JSON \"%s\"", cversion->data); + // FIXME: don't ignore it. + lm_ctx->client_max_fds = 128; + } + +out: + if (ret != 0) { + free(cversion); + cversion = NULL; + } + + *versionp = cversion; + return ret; +} + +int +send_version(lm_ctx_t *lm_ctx, int sock, uint16_t msg_id, + struct vfio_user_version *cversion) +{ + struct vfio_user_version *sversion = NULL; + char *server_caps = NULL; + size_t len; + int ret; + + ret = asprintf(&server_caps, "{" + "\"capabilities\":{" + "\"max_fds\":%u," + "\"migration\":{" + "\"pgsize\":%zu" + "}" + "}" + "}", MAX_FDS, lm_ctx->migration.pgsize); + + if (ret == -1) { + ret = -ENOMEM; + goto out; + } + + len = sizeof (*sversion) + ret + 1; + sversion = calloc(1, len); + + if (sversion == NULL) { + ret = -ENOMEM; + goto out; + } + + // FIXME: we should save the client minor here, and check that before trying + // to send unsupported things. + sversion->major = LIB_MUSER_VFIO_USER_VERS_MJ; + sversion->minor = MIN(cversion->minor, LIB_MUSER_VFIO_USER_VERS_MN); + + // FIXME: strcpy sucks + strcpy(((char *)sversion) + offsetof(struct vfio_user_version, data), + server_caps); + + ret = send_vfio_user_msg(sock, msg_id, true, VFIO_USER_VERSION, sversion, + len, NULL, 0); + +out: + free(server_caps); + free(sversion); + return ret; +} + +static int +negotiate(lm_ctx_t *lm_ctx, int sock) +{ + struct vfio_user_version *client_version = NULL; + uint16_t msg_id; + int ret; + + ret = recv_version(lm_ctx, sock, &msg_id, &client_version); + + if (ret < 0) { + lm_log(lm_ctx, LM_ERR, "failed to recv version: %s", strerror(-ret)); + return ret; + } + + ret = send_version(lm_ctx, sock, msg_id, client_version); + + free(client_version); + + if (ret < 0) { + lm_log(lm_ctx, LM_ERR, "failed to send version: %s", strerror(-ret)); + } + + return ret; +} + +/** + * lm_ctx: libmuser context + * FIXME: this shouldn't be happening as part of lm_ctx_create(). + */ +static int +open_sock(lm_ctx_t *lm_ctx) +{ + int ret; + int conn_fd; + + assert(lm_ctx != NULL); + + conn_fd = accept(lm_ctx->fd, NULL, NULL); + if (conn_fd == -1) { + return conn_fd; + } + + ret = negotiate(lm_ctx, conn_fd); + if (ret < 0) { + return ret; + } + + lm_ctx->conn_fd = conn_fd; + return conn_fd; +} + +static int +close_sock(lm_ctx_t *lm_ctx) +{ + return close(lm_ctx->conn_fd); +} + +static int +get_request_sock(lm_ctx_t *lm_ctx, struct vfio_user_header *hdr, + int *fds, int *nr_fds) +{ + int ret; + struct iovec iov = {.iov_base = hdr, .iov_len = sizeof *hdr}; + struct msghdr msg = {.msg_iov = &iov, .msg_iovlen = 1}; + struct cmsghdr *cmsg; + + msg.msg_controllen = CMSG_SPACE(sizeof(int) * *nr_fds); + msg.msg_control = alloca(msg.msg_controllen); + + /* + * TODO ideally we should set O_NONBLOCK on the fd so that the syscall is + * faster (?). I tried that and get short reads, so we need to store the + * partially received buffer somewhere and retry. + */ + ret = recvmsg(lm_ctx->conn_fd, &msg, lm_ctx->sock_flags); + if (ret == -1) { + return -errno; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) { + continue; + } + if (cmsg->cmsg_len < CMSG_LEN(sizeof(int))) { + return -EINVAL; + } + int size = cmsg->cmsg_len - CMSG_LEN(0); + if (size % sizeof(int) != 0) { + return -EINVAL; + } + *nr_fds = (int)(size / sizeof(int)); + memcpy(fds, CMSG_DATA(cmsg), *nr_fds * sizeof(int)); + break; + } + + return ret; +} + +struct transport_ops sock_transport_ops = { + .init = init_sock, + .attach = open_sock, + .detach = close_sock, + .get_request = get_request_sock, +}; + +/* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */ |