aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2020-05-01 23:10:22 +0100
committerPeter Maydell <peter.maydell@linaro.org>2020-05-01 23:10:22 +0100
commit6897541d902218b31eef2e8eabc74fa56618f9b3 (patch)
tree1b8d5416976ccc3841d3bd4c4170e7a255958c27
parent1c47613588ccff44422d4bdeea0dc36a0a308ec7 (diff)
parent66502bbca37ca7a3bfa57e82cfc03b89a7a11eae (diff)
downloadqemu-6897541d902218b31eef2e8eabc74fa56618f9b3.zip
qemu-6897541d902218b31eef2e8eabc74fa56618f9b3.tar.gz
qemu-6897541d902218b31eef2e8eabc74fa56618f9b3.tar.bz2
Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20200501' into staging
virtiofsd: Pull 2020-05-01 (includes CVE fix) This set includes a security fix, other fixes and improvements. Security fix: The security fix is for CVE-2020-10717 where, on low RAM hosts, the guest can potentially exceed the maximum fd limit. This fix adds some more configuration so that the user can explicitly set the limit. Fixes: Recursive mounting of the exported directory is now used in the sandbox, such that if there was a mount underneath present at the time the virtiofsd was started, that mount is also visible to the guest; in the existing code, only mounts that happened after startup were visible. Security improvements: The jailing for /proc/self/fd is improved - but it's something that shouldn't be accessible anyway. Most capabilities are now dropped at startup; again this shouldn't change any behaviour but is extra protection. # gpg: Signature made Fri 01 May 2020 20:06:46 BST # gpg: using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7 # gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full] # Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7 * remotes/dgilbert-gitlab/tags/pull-virtiofs-20200501: virtiofsd: drop all capabilities in the wait parent process virtiofsd: only retain file system capabilities virtiofsd: Show submounts virtiofsd: jail lo->proc_self_fd virtiofsd: stay below fs.file-max sysctl value (CVE-2020-10717) virtiofsd: add --rlimit-nofile=NUM option Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--tools/virtiofsd/fuse_lowlevel.h1
-rw-r--r--tools/virtiofsd/helper.c47
-rw-r--r--tools/virtiofsd/passthrough_ll.c102
3 files changed, 133 insertions, 17 deletions
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 8f6d705..562fd52 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts {
int syslog;
int log_level;
unsigned int max_idle_threads;
+ unsigned long rlimit_nofile;
};
/**
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
index 819c2bc..00a1ef6 100644
--- a/tools/virtiofsd/helper.c
+++ b/tools/virtiofsd/helper.c
@@ -23,6 +23,8 @@
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/resource.h>
#include <unistd.h>
#define FUSE_HELPER_OPT(t, p) \
@@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = {
FUSE_HELPER_OPT("subtype=", nodefault_subtype),
FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP),
FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads),
+ FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile),
FUSE_HELPER_OPT("--syslog", syslog),
FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG),
FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO),
@@ -171,6 +174,10 @@ void fuse_cmdline_help(void)
" default: no_writeback\n"
" -o xattr|no_xattr enable/disable xattr\n"
" default: no_xattr\n"
+ " --rlimit-nofile=<num> set maximum number of file descriptors\n"
+ " (0 leaves rlimit unchanged)\n"
+ " default: min(1000000, fs.file-max - 16384)\n"
+ " if the current rlimit is lower\n"
);
}
@@ -191,11 +198,51 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key,
}
}
+static unsigned long get_default_rlimit_nofile(void)
+{
+ g_autofree gchar *file_max_str = NULL;
+ const rlim_t reserved_fds = 16384; /* leave at least this many fds free */
+ rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */
+ rlim_t file_max;
+ struct rlimit rlim;
+
+ /*
+ * Reduce max_fds below the system-wide maximum, if necessary. This
+ * ensures there are fds available for other processes so we don't
+ * cause resource exhaustion.
+ */
+ if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str,
+ NULL, NULL)) {
+ fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n");
+ exit(1);
+ }
+ file_max = g_ascii_strtoull(file_max_str, NULL, 10);
+ if (file_max < 2 * reserved_fds) {
+ fuse_log(FUSE_LOG_ERR,
+ "The fs.file-max sysctl is too low (%lu) to allow a "
+ "reasonable number of open files.\n",
+ (unsigned long)file_max);
+ exit(1);
+ }
+ max_fds = MIN(file_max - reserved_fds, max_fds);
+
+ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
+ fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
+ exit(1);
+ }
+
+ if (rlim.rlim_cur >= max_fds) {
+ return 0; /* we have more fds available than required! */
+ }
+ return max_fds;
+}
+
int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts)
{
memset(opts, 0, sizeof(struct fuse_cmdline_opts));
opts->max_idle_threads = 10;
+ opts->rlimit_nofile = get_default_rlimit_nofile();
opts->foreground = 1;
if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) ==
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 4c35c95..3ba1d90 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2531,11 +2531,24 @@ static void print_capabilities(void)
}
/*
+ * Drop all Linux capabilities because the wait parent process only needs to
+ * sit in waitpid(2) and terminate.
+ */
+static void setup_wait_parent_capabilities(void)
+{
+ capng_setpid(syscall(SYS_gettid));
+ capng_clear(CAPNG_SELECT_BOTH);
+ capng_apply(CAPNG_SELECT_BOTH);
+}
+
+/*
* Move to a new mount, net, and pid namespaces to isolate this process.
*/
static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
{
pid_t child;
+ char template[] = "virtiofsd-XXXXXX";
+ char *tmpdir;
/*
* Create a new pid namespace for *child* processes. We'll have to
@@ -2561,6 +2574,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
pid_t waited;
int wstatus;
+ setup_wait_parent_capabilities();
+
/* The parent waits for the child */
do {
waited = waitpid(child, &wstatus, 0);
@@ -2597,12 +2612,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
exit(1);
}
+ tmpdir = mkdtemp(template);
+ if (!tmpdir) {
+ fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template);
+ exit(1);
+ }
+
+ if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n",
+ tmpdir);
+ exit(1);
+ }
+
/* Now we can get our /proc/self/fd directory file descriptor */
- lo->proc_self_fd = open("/proc/self/fd", O_PATH);
+ lo->proc_self_fd = open(tmpdir, O_PATH);
if (lo->proc_self_fd == -1) {
- fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
+ fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir);
exit(1);
}
+
+ if (umount2(tmpdir, MNT_DETACH) < 0) {
+ fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir);
+ exit(1);
+ }
+
+ if (rmdir(tmpdir) < 0) {
+ fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir);
+ }
}
/*
@@ -2643,7 +2679,7 @@ static void setup_mounts(const char *source)
int oldroot;
int newroot;
- if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
+ if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
exit(1);
}
@@ -2696,6 +2732,43 @@ static void setup_mounts(const char *source)
}
/*
+ * Only keep whitelisted capabilities that are needed for file system operation
+ */
+static void setup_capabilities(void)
+{
+ pthread_mutex_lock(&cap.mutex);
+ capng_restore_state(&cap.saved);
+
+ /*
+ * Whitelist file system-related capabilities that are needed for a file
+ * server to act like root. Drop everything else like networking and
+ * sysadmin capabilities.
+ *
+ * Exclusions:
+ * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
+ * and we don't support that.
+ * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
+ * used by the Smack LSM. Omit it until there is demand for it.
+ */
+ capng_setpid(syscall(SYS_gettid));
+ capng_clear(CAPNG_SELECT_BOTH);
+ capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
+ CAP_CHOWN,
+ CAP_DAC_OVERRIDE,
+ CAP_DAC_READ_SEARCH,
+ CAP_FOWNER,
+ CAP_FSETID,
+ CAP_SETGID,
+ CAP_SETUID,
+ CAP_MKNOD,
+ CAP_SETFCAP);
+ capng_apply(CAPNG_SELECT_BOTH);
+
+ cap.saved = capng_save_state();
+ pthread_mutex_unlock(&cap.mutex);
+}
+
+/*
* Lock down this process to prevent access to other processes or files outside
* source directory. This reduces the impact of arbitrary code execution bugs.
*/
@@ -2705,26 +2778,21 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
setup_namespaces(lo, se);
setup_mounts(lo->source);
setup_seccomp(enable_syslog);
+ setup_capabilities();
}
-/* Raise the maximum number of open file descriptors */
-static void setup_nofile_rlimit(void)
+/* Set the maximum number of open file descriptors */
+static void setup_nofile_rlimit(unsigned long rlimit_nofile)
{
- const rlim_t max_fds = 1000000;
- struct rlimit rlim;
-
- if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
- fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
- exit(1);
- }
+ struct rlimit rlim = {
+ .rlim_cur = rlimit_nofile,
+ .rlim_max = rlimit_nofile,
+ };
- if (rlim.rlim_cur >= max_fds) {
+ if (rlimit_nofile == 0) {
return; /* nothing to do */
}
- rlim.rlim_cur = max_fds;
- rlim.rlim_max = max_fds;
-
if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
/* Ignore SELinux denials */
if (errno == EPERM) {
@@ -2977,7 +3045,7 @@ int main(int argc, char *argv[])
fuse_daemonize(opts.foreground);
- setup_nofile_rlimit();
+ setup_nofile_rlimit(opts.rlimit_nofile);
/* Must be before sandbox since it wants /proc */
setup_capng();