diff options
Diffstat (limited to 'tools/virtiofsd')
-rw-r--r-- | tools/virtiofsd/fuse_common.h | 9 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_i.h | 7 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_lowlevel.c | 180 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_lowlevel.h | 13 | ||||
-rw-r--r-- | tools/virtiofsd/helper.c | 1 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_ll.c | 467 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_seccomp.c | 1 |
7 files changed, 622 insertions, 56 deletions
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h index 0c2665b..bf46954 100644 --- a/tools/virtiofsd/fuse_common.h +++ b/tools/virtiofsd/fuse_common.h @@ -378,6 +378,11 @@ struct fuse_file_info { #define FUSE_CAP_SETXATTR_EXT (1 << 29) /** + * Indicates that file server supports creating file security context + */ +#define FUSE_CAP_SECURITY_CTX (1ULL << 32) + +/** * Ioctl flags * * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine @@ -439,7 +444,7 @@ struct fuse_conn_info { /** * Capability flags that the kernel supports (read-only) */ - unsigned capable; + uint64_t capable; /** * Capability flags that the filesystem wants to enable. @@ -447,7 +452,7 @@ struct fuse_conn_info { * libfuse attempts to initialize this field with * reasonable default values before calling the init() handler. */ - unsigned want; + uint64_t want; /** * Maximum number of pending "background" requests. A diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h index 492e002..a5572fa 100644 --- a/tools/virtiofsd/fuse_i.h +++ b/tools/virtiofsd/fuse_i.h @@ -15,6 +15,12 @@ struct fv_VuDev; struct fv_QueueInfo; +struct fuse_security_context { + const char *name; + uint32_t ctxlen; + const void *ctx; +}; + struct fuse_req { struct fuse_session *se; uint64_t unique; @@ -35,6 +41,7 @@ struct fuse_req { } u; struct fuse_req *next; struct fuse_req *prev; + struct fuse_security_context secctx; }; struct fuse_notify_req { diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c index e4679c7..7529287 100644 --- a/tools/virtiofsd/fuse_lowlevel.c +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -886,11 +886,63 @@ static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, } } +static int parse_secctx_fill_req(fuse_req_t req, struct fuse_mbuf_iter *iter) +{ + struct fuse_secctx_header *fsecctx_header; + struct fuse_secctx *fsecctx; + const void *secctx; + const char *name; + + fsecctx_header = fuse_mbuf_iter_advance(iter, sizeof(*fsecctx_header)); + if (!fsecctx_header) { + return -EINVAL; + } + + /* + * As of now maximum of one security context is supported. It can + * change in future though. + */ + if (fsecctx_header->nr_secctx > 1) { + return -EINVAL; + } + + /* No security context sent. Maybe no LSM supports it */ + if (!fsecctx_header->nr_secctx) { + return 0; + } + + fsecctx = fuse_mbuf_iter_advance(iter, sizeof(*fsecctx)); + if (!fsecctx) { + return -EINVAL; + } + + /* struct fsecctx with zero sized context is not expected */ + if (!fsecctx->size) { + return -EINVAL; + } + name = fuse_mbuf_iter_advance_str(iter); + if (!name) { + return -EINVAL; + } + + secctx = fuse_mbuf_iter_advance(iter, fsecctx->size); + if (!secctx) { + return -EINVAL; + } + + req->secctx.name = name; + req->secctx.ctx = secctx; + req->secctx.ctxlen = fsecctx->size; + return 0; +} + static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, struct fuse_mbuf_iter *iter) { struct fuse_mknod_in *arg; const char *name; + bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX; + int err; arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); name = fuse_mbuf_iter_advance_str(iter); @@ -901,6 +953,14 @@ static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, req->ctx.umask = arg->umask; + if (secctx_enabled) { + err = parse_secctx_fill_req(req, iter); + if (err) { + fuse_reply_err(req, -err); + return; + } + } + if (req->se->op.mknod) { req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); } else { @@ -913,6 +973,8 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, { struct fuse_mkdir_in *arg; const char *name; + bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX; + int err; arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); name = fuse_mbuf_iter_advance_str(iter); @@ -923,6 +985,14 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, req->ctx.umask = arg->umask; + if (secctx_enabled) { + err = parse_secctx_fill_req(req, iter); + if (err) { + fuse_reply_err(req, err); + return; + } + } + if (req->se->op.mkdir) { req->se->op.mkdir(req, nodeid, name, arg->mode); } else { @@ -969,12 +1039,22 @@ static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, { const char *name = fuse_mbuf_iter_advance_str(iter); const char *linkname = fuse_mbuf_iter_advance_str(iter); + bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX; + int err; if (!name || !linkname) { fuse_reply_err(req, EINVAL); return; } + if (secctx_enabled) { + err = parse_secctx_fill_req(req, iter); + if (err) { + fuse_reply_err(req, err); + return; + } + } + if (req->se->op.symlink) { req->se->op.symlink(req, linkname, nodeid, name); } else { @@ -1048,6 +1128,8 @@ static void do_link(fuse_req_t req, fuse_ino_t nodeid, static void do_create(fuse_req_t req, fuse_ino_t nodeid, struct fuse_mbuf_iter *iter) { + bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX; + if (req->se->op.create) { struct fuse_create_in *arg; struct fuse_file_info fi; @@ -1060,6 +1142,15 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid, return; } + if (secctx_enabled) { + int err; + err = parse_secctx_fill_req(req, iter); + if (err) { + fuse_reply_err(req, err); + return; + } + } + memset(&fi, 0, sizeof(fi)); fi.flags = arg->flags; fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; @@ -1876,15 +1967,30 @@ static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, } } +static void do_syncfs(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + if (req->se->op.syncfs) { + req->se->op.syncfs(req, nodeid); + } else { + fuse_reply_err(req, ENOSYS); + } +} + static void do_init(fuse_req_t req, fuse_ino_t nodeid, struct fuse_mbuf_iter *iter) { size_t compat_size = offsetof(struct fuse_init_in, max_readahead); + size_t compat2_size = offsetof(struct fuse_init_in, flags) + + sizeof(uint32_t); + /* Fuse structure extended with minor version 36 */ + size_t compat3_size = endof(struct fuse_init_in, unused); struct fuse_init_in *arg; struct fuse_init_out outarg; struct fuse_session *se = req->se; size_t bufsize = se->bufsize; size_t outargsize = sizeof(outarg); + uint64_t flags = 0; (void)nodeid; @@ -1897,15 +2003,29 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, /* ...and now consume the new fields. */ if (arg->major == 7 && arg->minor >= 6) { - if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) { + if (!fuse_mbuf_iter_advance(iter, compat2_size - compat_size)) { + fuse_reply_err(req, EINVAL); + return; + } + flags |= arg->flags; + } + + /* + * fuse_init_in was extended again with minor version 36. Just read + * current known size of fuse_init so that future extension and + * header rebase does not cause breakage. + */ + if (sizeof(*arg) > compat2_size && (arg->flags & FUSE_INIT_EXT)) { + if (!fuse_mbuf_iter_advance(iter, compat3_size - compat2_size)) { fuse_reply_err(req, EINVAL); return; } + flags |= (uint64_t) arg->flags2 << 32; } fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); if (arg->major == 7 && arg->minor >= 6) { - fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); + fuse_log(FUSE_LOG_DEBUG, "flags=0x%016llx\n", flags); fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead); } se->conn.proto_major = arg->major; @@ -1933,70 +2053,73 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, if (arg->max_readahead < se->conn.max_readahead) { se->conn.max_readahead = arg->max_readahead; } - if (arg->flags & FUSE_ASYNC_READ) { + if (flags & FUSE_ASYNC_READ) { se->conn.capable |= FUSE_CAP_ASYNC_READ; } - if (arg->flags & FUSE_POSIX_LOCKS) { + if (flags & FUSE_POSIX_LOCKS) { se->conn.capable |= FUSE_CAP_POSIX_LOCKS; } - if (arg->flags & FUSE_ATOMIC_O_TRUNC) { + if (flags & FUSE_ATOMIC_O_TRUNC) { se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; } - if (arg->flags & FUSE_EXPORT_SUPPORT) { + if (flags & FUSE_EXPORT_SUPPORT) { se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; } - if (arg->flags & FUSE_DONT_MASK) { + if (flags & FUSE_DONT_MASK) { se->conn.capable |= FUSE_CAP_DONT_MASK; } - if (arg->flags & FUSE_FLOCK_LOCKS) { + if (flags & FUSE_FLOCK_LOCKS) { se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; } - if (arg->flags & FUSE_AUTO_INVAL_DATA) { + if (flags & FUSE_AUTO_INVAL_DATA) { se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; } - if (arg->flags & FUSE_DO_READDIRPLUS) { + if (flags & FUSE_DO_READDIRPLUS) { se->conn.capable |= FUSE_CAP_READDIRPLUS; } - if (arg->flags & FUSE_READDIRPLUS_AUTO) { + if (flags & FUSE_READDIRPLUS_AUTO) { se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; } - if (arg->flags & FUSE_ASYNC_DIO) { + if (flags & FUSE_ASYNC_DIO) { se->conn.capable |= FUSE_CAP_ASYNC_DIO; } - if (arg->flags & FUSE_WRITEBACK_CACHE) { + if (flags & FUSE_WRITEBACK_CACHE) { se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; } - if (arg->flags & FUSE_NO_OPEN_SUPPORT) { + if (flags & FUSE_NO_OPEN_SUPPORT) { se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; } - if (arg->flags & FUSE_PARALLEL_DIROPS) { + if (flags & FUSE_PARALLEL_DIROPS) { se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; } - if (arg->flags & FUSE_POSIX_ACL) { + if (flags & FUSE_POSIX_ACL) { se->conn.capable |= FUSE_CAP_POSIX_ACL; } - if (arg->flags & FUSE_HANDLE_KILLPRIV) { + if (flags & FUSE_HANDLE_KILLPRIV) { se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; } - if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { + if (flags & FUSE_NO_OPENDIR_SUPPORT) { se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; } - if (!(arg->flags & FUSE_MAX_PAGES)) { + if (!(flags & FUSE_MAX_PAGES)) { size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + FUSE_BUFFER_HEADER_SIZE; if (bufsize > max_bufsize) { bufsize = max_bufsize; } } - if (arg->flags & FUSE_SUBMOUNTS) { + if (flags & FUSE_SUBMOUNTS) { se->conn.capable |= FUSE_CAP_SUBMOUNTS; } - if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + if (flags & FUSE_HANDLE_KILLPRIV_V2) { se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2; } - if (arg->flags & FUSE_SETXATTR_EXT) { + if (flags & FUSE_SETXATTR_EXT) { se->conn.capable |= FUSE_CAP_SETXATTR_EXT; } + if (flags & FUSE_SECURITY_CTX) { + se->conn.capable |= FUSE_CAP_SECURITY_CTX; + } #ifdef HAVE_SPLICE #ifdef HAVE_VMSPLICE se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; @@ -2051,7 +2174,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, if (se->conn.want & (~se->conn.capable)) { fuse_log(FUSE_LOG_ERR, "fuse: error: filesystem requested capabilities " - "0x%x that are not supported by kernel, aborting.\n", + "0x%llx that are not supported by kernel, aborting.\n", se->conn.want & (~se->conn.capable)); fuse_reply_err(req, EPROTO); se->error = -EPROTO; @@ -2062,7 +2185,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; } - if (arg->flags & FUSE_MAX_PAGES) { + if (flags & FUSE_MAX_PAGES) { outarg.flags |= FUSE_MAX_PAGES; outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; } @@ -2136,8 +2259,14 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, outarg.flags |= FUSE_SETXATTR_EXT; } + if (se->conn.want & FUSE_CAP_SECURITY_CTX) { + /* bits 32..63 get shifted down 32 bits into the flags2 field */ + outarg.flags2 |= FUSE_SECURITY_CTX >> 32; + } + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); - fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); + fuse_log(FUSE_LOG_DEBUG, " flags2=0x%08x flags=0x%08x\n", outarg.flags2, + outarg.flags); fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background); @@ -2280,6 +2409,7 @@ static struct { [FUSE_RENAME2] = { do_rename2, "RENAME2" }, [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, [FUSE_LSEEK] = { do_lseek, "LSEEK" }, + [FUSE_SYNCFS] = { do_syncfs, "SYNCFS" }, }; #define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h index c55c0ca..b889dae 100644 --- a/tools/virtiofsd/fuse_lowlevel.h +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -1226,6 +1226,19 @@ struct fuse_lowlevel_ops { */ void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, struct fuse_file_info *fi); + + /** + * Synchronize file system content + * + * If this request is answered with an error code of ENOSYS, + * this is treated as success and future calls to syncfs() will + * succeed automatically without being sent to the filesystem + * process. + * + * @param req request handle + * @param ino the inode number + */ + void (*syncfs)(fuse_req_t req, fuse_ino_t ino); }; /** diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index a8295d9..e226fc5 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -187,6 +187,7 @@ void fuse_cmdline_help(void) " default: no_allow_direct_io\n" " -o announce_submounts Announce sub-mount points to the guest\n" " -o posix_acl/no_posix_acl Enable/Disable posix_acl. (default: disabled)\n" + " -o security_label/no_security_label Enable/Disable security label. (default: disabled)\n" ); } diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index b3d0674..dfa2fc2 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -173,10 +173,15 @@ struct lo_data { /* An O_PATH file descriptor to /proc/self/fd/ */ int proc_self_fd; + /* An O_PATH file descriptor to /proc/self/task/ */ + int proc_self_task; int user_killpriv_v2, killpriv_v2; /* If set, virtiofsd is responsible for setting umask during creation */ bool change_umask; int user_posix_acl, posix_acl; + /* Keeps track if /proc/<pid>/attr/fscreate should be used or not */ + bool use_fscreate; + int user_security_label; }; static const struct fuse_opt lo_opts[] = { @@ -211,6 +216,8 @@ static const struct fuse_opt lo_opts[] = { { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 }, { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 }, { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 }, + { "security_label", offsetof(struct lo_data, user_security_label), 1 }, + { "no_security_label", offsetof(struct lo_data, user_security_label), 0 }, FUSE_OPT_END }; static bool use_syslog = false; @@ -230,6 +237,11 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st, static int xattr_map_client(const struct lo_data *lo, const char *client_name, char **out_name); +#define FCHDIR_NOFAIL(fd) do { \ + int fchdir_res = fchdir(fd); \ + assert(fchdir_res == 0); \ + } while (0) + static bool is_dot_or_dotdot(const char *name) { return name[0] == '.' && @@ -257,6 +269,70 @@ static struct lo_data *lo_data(fuse_req_t req) } /* + * Tries to figure out if /proc/<pid>/attr/fscreate is usable or not. With + * selinux=0, read from fscreate returns -EINVAL. + * + * TODO: Link with libselinux and use is_selinux_enabled() instead down + * the line. It probably will be more reliable indicator. + */ +static bool is_fscreate_usable(struct lo_data *lo) +{ + char procname[64]; + int fscreate_fd; + size_t bytes_read; + + sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid)); + fscreate_fd = openat(lo->proc_self_task, procname, O_RDWR); + if (fscreate_fd == -1) { + return false; + } + + bytes_read = read(fscreate_fd, procname, 64); + close(fscreate_fd); + if (bytes_read == -1) { + return false; + } + return true; +} + +/* Helpers to set/reset fscreate */ +static int open_set_proc_fscreate(struct lo_data *lo, const void *ctx, + size_t ctxlen, int *fd) +{ + char procname[64]; + int fscreate_fd, err = 0; + size_t written; + + sprintf(procname, "%ld/attr/fscreate", syscall(SYS_gettid)); + fscreate_fd = openat(lo->proc_self_task, procname, O_WRONLY); + err = fscreate_fd == -1 ? errno : 0; + if (err) { + return err; + } + + written = write(fscreate_fd, ctx, ctxlen); + err = written == -1 ? errno : 0; + if (err) { + goto out; + } + + *fd = fscreate_fd; + return 0; +out: + close(fscreate_fd); + return err; +} + +static void close_reset_proc_fscreate(int fd) +{ + if ((write(fd, NULL, 0)) == -1) { + fuse_log(FUSE_LOG_WARNING, "Failed to reset fscreate. err=%d\n", errno); + } + close(fd); + return; +} + +/* * Load capng's state from our saved state if the current thread * hadn't previously been loaded. * returns 0 on success @@ -735,6 +811,17 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n"); conn->want &= ~FUSE_CAP_POSIX_ACL; } + + if (lo->user_security_label == 1) { + if (!(conn->capable & FUSE_CAP_SECURITY_CTX)) { + fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable security label." + " kernel does not support FUSE_SECURITY_CTX capability.\n"); + } + conn->want |= FUSE_CAP_SECURITY_CTX; + } else { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling security label\n"); + conn->want &= ~FUSE_CAP_SECURITY_CTX; + } } static void lo_getattr(fuse_req_t req, fuse_ino_t ino, @@ -1284,16 +1371,103 @@ static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask, } } +static int do_mknod_symlink_secctx(fuse_req_t req, struct lo_inode *dir, + const char *name, const char *secctx_name) +{ + int path_fd, err; + char procname[64]; + struct lo_data *lo = lo_data(req); + + if (!req->secctx.ctxlen) { + return 0; + } + + /* Open newly created element with O_PATH */ + path_fd = openat(dir->fd, name, O_PATH | O_NOFOLLOW); + err = path_fd == -1 ? errno : 0; + if (err) { + return err; + } + sprintf(procname, "%i", path_fd); + FCHDIR_NOFAIL(lo->proc_self_fd); + /* Set security context. This is not atomic w.r.t file creation */ + err = setxattr(procname, secctx_name, req->secctx.ctx, req->secctx.ctxlen, + 0); + if (err) { + err = errno; + } + FCHDIR_NOFAIL(lo->root.fd); + close(path_fd); + return err; +} + +static int do_mknod_symlink(fuse_req_t req, struct lo_inode *dir, + const char *name, mode_t mode, dev_t rdev, + const char *link) +{ + int err, fscreate_fd = -1; + const char *secctx_name = req->secctx.name; + struct lo_cred old = {}; + struct lo_data *lo = lo_data(req); + char *mapped_name = NULL; + bool secctx_enabled = req->secctx.ctxlen; + bool do_fscreate = false; + + if (secctx_enabled && lo->xattrmap) { + err = xattr_map_client(lo, req->secctx.name, &mapped_name); + if (err < 0) { + return -err; + } + secctx_name = mapped_name; + } + + /* + * If security xattr has not been remapped and selinux is enabled on + * host, set fscreate and no need to do a setxattr() after file creation + */ + if (secctx_enabled && !mapped_name && lo->use_fscreate) { + do_fscreate = true; + err = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen, + &fscreate_fd); + if (err) { + goto out; + } + } + + err = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode)); + if (err) { + goto out; + } + + err = mknod_wrapper(dir->fd, name, link, mode, rdev); + err = err == -1 ? errno : 0; + lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode)); + if (err) { + goto out; + } + + if (!do_fscreate) { + err = do_mknod_symlink_secctx(req, dir, name, secctx_name); + if (err) { + unlinkat(dir->fd, name, S_ISDIR(mode) ? AT_REMOVEDIR : 0); + } + } +out: + if (fscreate_fd != -1) { + close_reset_proc_fscreate(fscreate_fd); + } + g_free(mapped_name); + return err; +} + static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, const char *name, mode_t mode, dev_t rdev, const char *link) { - int res; int saverr; struct lo_data *lo = lo_data(req); struct lo_inode *dir; struct fuse_entry_param e; - struct lo_cred old = {}; if (is_empty(name)) { fuse_reply_err(req, ENOENT); @@ -1311,21 +1485,11 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, return; } - saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode)); + saverr = do_mknod_symlink(req, dir, name, mode, rdev, link); if (saverr) { goto out; } - res = mknod_wrapper(dir->fd, name, link, mode, rdev); - - saverr = errno; - - lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode)); - - if (res == -1) { - goto out; - } - saverr = lo_do_lookup(req, parent, name, &e, NULL); if (saverr) { goto out; @@ -2001,6 +2165,190 @@ static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, return 0; } +static int do_create_nosecctx(fuse_req_t req, struct lo_inode *parent_inode, + const char *name, mode_t mode, + struct fuse_file_info *fi, int *open_fd, + bool tmpfile) +{ + int err, fd; + struct lo_cred old = {}; + struct lo_data *lo = lo_data(req); + int flags; + + if (tmpfile) { + flags = fi->flags | O_TMPFILE; + /* + * Don't use O_EXCL as we want to link file later. Also reset O_CREAT + * otherwise openat() returns -EINVAL. + */ + flags &= ~(O_CREAT | O_EXCL); + + /* O_TMPFILE needs either O_RDWR or O_WRONLY */ + if ((flags & O_ACCMODE) == O_RDONLY) { + flags |= O_RDWR; + } + } else { + flags = fi->flags | O_CREAT | O_EXCL; + } + + err = lo_change_cred(req, &old, lo->change_umask); + if (err) { + return err; + } + + /* Try to create a new file but don't open existing files */ + fd = openat(parent_inode->fd, name, flags, mode); + err = fd == -1 ? errno : 0; + lo_restore_cred(&old, lo->change_umask); + if (!err) { + *open_fd = fd; + } + return err; +} + +static int do_create_secctx_fscreate(fuse_req_t req, + struct lo_inode *parent_inode, + const char *name, mode_t mode, + struct fuse_file_info *fi, int *open_fd) +{ + int err = 0, fd = -1, fscreate_fd = -1; + struct lo_data *lo = lo_data(req); + + err = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen, + &fscreate_fd); + if (err) { + return err; + } + + err = do_create_nosecctx(req, parent_inode, name, mode, fi, &fd, false); + + close_reset_proc_fscreate(fscreate_fd); + if (!err) { + *open_fd = fd; + } + return err; +} + +static int do_create_secctx_tmpfile(fuse_req_t req, + struct lo_inode *parent_inode, + const char *name, mode_t mode, + struct fuse_file_info *fi, + const char *secctx_name, int *open_fd) +{ + int err, fd = -1; + struct lo_data *lo = lo_data(req); + char procname[64]; + + err = do_create_nosecctx(req, parent_inode, ".", mode, fi, &fd, true); + if (err) { + return err; + } + + err = fsetxattr(fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen, 0); + if (err) { + err = errno; + goto out; + } + + /* Security context set on file. Link it in place */ + sprintf(procname, "%d", fd); + FCHDIR_NOFAIL(lo->proc_self_fd); + err = linkat(AT_FDCWD, procname, parent_inode->fd, name, + AT_SYMLINK_FOLLOW); + err = err == -1 ? errno : 0; + FCHDIR_NOFAIL(lo->root.fd); + +out: + if (!err) { + *open_fd = fd; + } else if (fd != -1) { + close(fd); + } + return err; +} + +static int do_create_secctx_noatomic(fuse_req_t req, + struct lo_inode *parent_inode, + const char *name, mode_t mode, + struct fuse_file_info *fi, + const char *secctx_name, int *open_fd) +{ + int err = 0, fd = -1; + + err = do_create_nosecctx(req, parent_inode, name, mode, fi, &fd, false); + if (err) { + goto out; + } + + /* Set security context. This is not atomic w.r.t file creation */ + err = fsetxattr(fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen, 0); + err = err == -1 ? errno : 0; +out: + if (!err) { + *open_fd = fd; + } else { + if (fd != -1) { + close(fd); + unlinkat(parent_inode->fd, name, 0); + } + } + return err; +} + +static int do_lo_create(fuse_req_t req, struct lo_inode *parent_inode, + const char *name, mode_t mode, + struct fuse_file_info *fi, int *open_fd) +{ + struct lo_data *lo = lo_data(req); + char *mapped_name = NULL; + int err; + const char *ctxname = req->secctx.name; + bool secctx_enabled = req->secctx.ctxlen; + + if (secctx_enabled && lo->xattrmap) { + err = xattr_map_client(lo, req->secctx.name, &mapped_name); + if (err < 0) { + return -err; + } + + ctxname = mapped_name; + } + + if (secctx_enabled) { + /* + * If security.selinux has not been remapped and selinux is enabled, + * use fscreate to set context before file creation. If not, use + * tmpfile method for regular files. Otherwise fallback to + * non-atomic method of file creation and xattr settting. + */ + if (!mapped_name && lo->use_fscreate) { + err = do_create_secctx_fscreate(req, parent_inode, name, mode, fi, + open_fd); + goto out; + } else if (S_ISREG(mode)) { + err = do_create_secctx_tmpfile(req, parent_inode, name, mode, fi, + ctxname, open_fd); + /* + * If filesystem does not support O_TMPFILE, fallback to non-atomic + * method. + */ + if (!err || err != EOPNOTSUPP) { + goto out; + } + } + + err = do_create_secctx_noatomic(req, parent_inode, name, mode, fi, + ctxname, open_fd); + } else { + err = do_create_nosecctx(req, parent_inode, name, mode, fi, open_fd, + false); + } + +out: + g_free(mapped_name); + return err; +} + static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, mode_t mode, struct fuse_file_info *fi) { @@ -2010,7 +2358,6 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, struct lo_inode *inode = NULL; struct fuse_entry_param e; int err; - struct lo_cred old = {}; fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)" " kill_priv=%d\n", parent, name, fi->kill_priv); @@ -2026,18 +2373,9 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, return; } - err = lo_change_cred(req, &old, lo->change_umask); - if (err) { - goto out; - } - update_open_flags(lo->writeback, lo->allow_direct_io, fi); - /* Try to create a new file but don't open existing files */ - fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode); - err = fd == -1 ? errno : 0; - - lo_restore_cred(&old, lo->change_umask); + err = do_lo_create(req, parent_inode, name, mode, fi, &fd); /* Ignore the error if file exists and O_EXCL was not given */ if (err && (err != EEXIST || (fi->flags & O_EXCL))) { @@ -2467,6 +2805,15 @@ static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, int res; (void)ino; + if (!(op & LOCK_NB)) { + /* + * Blocking flock can deadlock as there is only one thread + * serving the queue. + */ + fuse_reply_err(req, EOPNOTSUPP); + return; + } + res = flock(lo_fi_fd(req, fi), op); fuse_reply_err(req, res == -1 ? errno : 0); @@ -2842,11 +3189,6 @@ static int xattr_map_server(const struct lo_data *lo, const char *server_name, return -ENODATA; } -#define FCHDIR_NOFAIL(fd) do { \ - int fchdir_res = fchdir(fd); \ - assert(fchdir_res == 0); \ - } while (0) - static bool block_xattr(struct lo_data *lo, const char *name) { /* @@ -3357,6 +3699,49 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, } } +static int lo_do_syncfs(struct lo_data *lo, struct lo_inode *inode) +{ + int fd, ret = 0; + + fuse_log(FUSE_LOG_DEBUG, "lo_do_syncfs(ino=%" PRIu64 ")\n", + inode->fuse_ino); + + fd = lo_inode_open(lo, inode, O_RDONLY); + if (fd < 0) { + return -fd; + } + + if (syncfs(fd) < 0) { + ret = errno; + } + + close(fd); + return ret; +} + +static void lo_syncfs(fuse_req_t req, fuse_ino_t ino) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode = lo_inode(req, ino); + int err; + + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + err = lo_do_syncfs(lo, inode); + lo_inode_put(lo, &inode); + + /* + * If submounts aren't announced, the client only sends a request to + * sync the root inode. TODO: Track submounts internally and iterate + * over them as well. + */ + + fuse_reply_err(req, err); +} + static void lo_destroy(void *userdata) { struct lo_data *lo = (struct lo_data *)userdata; @@ -3417,6 +3802,7 @@ static struct fuse_lowlevel_ops lo_oper = { .copy_file_range = lo_copy_file_range, #endif .lseek = lo_lseek, + .syncfs = lo_syncfs, .destroy = lo_destroy, }; @@ -3508,6 +3894,15 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) exit(1); } + /* Get the /proc/self/task descriptor */ + lo->proc_self_task = open("/proc/self/task/", O_PATH); + if (lo->proc_self_task == -1) { + fuse_log(FUSE_LOG_ERR, "open(/proc/self/task, O_PATH): %m\n"); + exit(1); + } + + lo->use_fscreate = is_fscreate_usable(lo); + /* * We only need /proc/self/fd. Prevent ".." from accessing parent * directories of /proc/self/fd by bind-mounting it over /proc. Since / was @@ -3724,6 +4119,14 @@ static void setup_chroot(struct lo_data *lo) exit(1); } + lo->proc_self_task = open("/proc/self/task", O_PATH); + if (lo->proc_self_fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/task\", O_PATH): %m\n"); + exit(1); + } + + lo->use_fscreate = is_fscreate_usable(lo); + /* * Make the shared directory the file system root so that FUSE_OPEN * (lo_open()) cannot escape the shared directory by opening a symlink. @@ -3909,6 +4312,10 @@ static void fuse_lo_data_cleanup(struct lo_data *lo) close(lo->proc_self_fd); } + if (lo->proc_self_task >= 0) { + close(lo->proc_self_task); + } + if (lo->root.fd >= 0) { close(lo->root.fd); } @@ -3936,8 +4343,10 @@ int main(int argc, char *argv[]) .posix_lock = 0, .allow_direct_io = 0, .proc_self_fd = -1, + .proc_self_task = -1, .user_killpriv_v2 = -1, .user_posix_acl = -1, + .user_security_label = -1, }; struct lo_map_elem *root_elem; struct lo_map_elem *reserve_elem; diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c index 2bc0127..888295c 100644 --- a/tools/virtiofsd/passthrough_seccomp.c +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -111,6 +111,7 @@ static const int syscall_allowlist[] = { SCMP_SYS(set_robust_list), SCMP_SYS(setxattr), SCMP_SYS(symlinkat), + SCMP_SYS(syncfs), SCMP_SYS(time), /* Rarely needed, except on static builds */ SCMP_SYS(tgkill), SCMP_SYS(unlinkat), |