aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2021-07-05 12:45:24 +0100
committerPeter Maydell <peter.maydell@linaro.org>2021-07-05 12:45:24 +0100
commit715167a36c2b152f6511cff690180c1254ae039f (patch)
tree71907df095a9c0c4364baf565168a1429aa9bd6d
parent4fb2820854a796ab75ffb2ec896b67268281ecde (diff)
parente5f607913cee3f3b486eb024dbc7079b51f6da57 (diff)
downloadqemu-715167a36c2b152f6511cff690180c1254ae039f.zip
qemu-715167a36c2b152f6511cff690180c1254ae039f.tar.gz
qemu-715167a36c2b152f6511cff690180c1254ae039f.tar.bz2
Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-migration-20210705a' into staging
Migration and virtiofs pull 2021-07-01 v2 Dropped Peter Xu's migration-test fix to reenable most of the migration tests when uffd isn't available; we're seeing at least one seg in github CI (on qemu-system-i386) and Peter Maydell is reporting a hang on Openbsd. Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com> # gpg: Signature made Mon 05 Jul 2021 11:01:35 BST # gpg: using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7 # gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full] # Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7 * remotes/dgilbert-gitlab/tags/pull-migration-20210705a: migration/rdma: Use error_report to suppress errno message tests/migration: fix "downtime_limit" type when "migrate-set-parameters" tests/migration: parse the thread-id key of CpuInfoFast virtiofsd: Add an option to enable/disable posix acls virtiofsd: Switch creds, drop FSETID for system.posix_acl_access xattr virtiofsd: Add capability to change/restore umask virtiofsd: Add umask to seccom allow list virtiofsd: Add support for extended setxattr virtiofsd: Fix xattr operations overwriting errno virtiofsd: Fix fuse setxattr() API change issue virtiofsd: Don't allow file creation with FUSE_OPEN docs: describe the security considerations with virtiofsd xattr mapping virtiofsd: use GDateTime for formatting timestamp for debug messages migration: failover: continue to wait card unplug on error migration: move wait-unplug loop to its own function migration: Allow reset of postcopy_recover_triggered when failed migration: Move yank outside qemu_start_incoming_migration() migration: fix the memory overwriting risk in add_to_iovec tests: migration-test: Add dirty ring test Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--docs/tools/virtiofsd.rst58
-rw-r--r--migration/migration.c89
-rw-r--r--migration/qemu-file.c5
-rw-r--r--migration/rdma.c4
-rw-r--r--tests/migration/guestperf/engine.py4
-rw-r--r--tests/qtest/migration-test.c58
-rw-r--r--tools/virtiofsd/fuse_common.h5
-rw-r--r--tools/virtiofsd/fuse_lowlevel.c24
-rw-r--r--tools/virtiofsd/fuse_lowlevel.h3
-rw-r--r--tools/virtiofsd/helper.c1
-rw-r--r--tools/virtiofsd/passthrough_ll.c254
-rw-r--r--tools/virtiofsd/passthrough_seccomp.c1
12 files changed, 423 insertions, 83 deletions
diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst
index 4911e79..c4ac7fd 100644
--- a/docs/tools/virtiofsd.rst
+++ b/docs/tools/virtiofsd.rst
@@ -101,6 +101,9 @@ Options
Enable/disable extended attributes (xattr) on files and directories. The
default is ``no_xattr``.
+ * posix_acl|no_posix_acl -
+ Enable/disable posix acl support. Posix ACLs are disabled by default`.
+
.. option:: --socket-path=PATH
Listen on vhost-user UNIX domain socket at PATH.
@@ -127,8 +130,8 @@ Options
timeout. ``always`` sets a long cache lifetime at the expense of coherency.
The default is ``auto``.
-xattr-mapping
--------------
+Extended attribute (xattr) mapping
+----------------------------------
By default the name of xattr's used by the client are passed through to the server
file system. This can be a problem where either those xattr names are used
@@ -136,6 +139,9 @@ by something on the server (e.g. selinux client/server confusion) or if the
virtiofsd is running in a container with restricted privileges where it cannot
access some attributes.
+Mapping syntax
+~~~~~~~~~~~~~~
+
A mapping of xattr names can be made using -o xattrmap=mapping where the ``mapping``
string consists of a series of rules.
@@ -232,8 +238,48 @@ Note: When the 'security.capability' xattr is remapped, the daemon has to do
extra work to remove it during many operations, which the host kernel normally
does itself.
-xattr-mapping Examples
-----------------------
+Security considerations
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Operating systems typically partition the xattr namespace using
+well defined name prefixes. Each partition may have different
+access controls applied. For example, on Linux there are multiple
+partitions
+
+ * ``system.*`` - access varies depending on attribute & filesystem
+ * ``security.*`` - only processes with CAP_SYS_ADMIN
+ * ``trusted.*`` - only processes with CAP_SYS_ADMIN
+ * ``user.*`` - any process granted by file permissions / ownership
+
+While other OS such as FreeBSD have different name prefixes
+and access control rules.
+
+When remapping attributes on the host, it is important to
+ensure that the remapping does not allow a guest user to
+evade the guest access control rules.
+
+Consider if ``trusted.*`` from the guest was remapped to
+``user.virtiofs.trusted*`` in the host. An unprivileged
+user in a Linux guest has the ability to write to xattrs
+under ``user.*``. Thus the user can evade the access
+control restriction on ``trusted.*`` by instead writing
+to ``user.virtiofs.trusted.*``.
+
+As noted above, the partitions used and access controls
+applied, will vary across guest OS, so it is not wise to
+try to predict what the guest OS will use.
+
+The simplest way to avoid an insecure configuration is
+to remap all xattrs at once, to a given fixed prefix.
+This is shown in example (1) below.
+
+If selectively mapping only a subset of xattr prefixes,
+then rules must be added to explicitly block direct
+access to the target of the remapping. This is shown
+in example (2) below.
+
+Mapping examples
+~~~~~~~~~~~~~~~~
1) Prefix all attributes with 'user.virtiofs.'
@@ -271,7 +317,9 @@ stripping of 'user.virtiofs.'.
The second rule hides unprefixed 'trusted.' attributes
on the host.
The third rule stops a guest from explicitly setting
-the 'user.virtiofs.' path directly.
+the 'user.virtiofs.' path directly to prevent access
+control bypass on the target of the earlier prefix
+remapping.
Finally, the fourth rule lets all remaining attributes
through.
diff --git a/migration/migration.c b/migration/migration.c
index 4228635..5ff7ba9 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -456,10 +456,6 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp)
{
const char *p = NULL;
- if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
- return;
- }
-
qapi_event_send_migration(MIGRATION_STATUS_SETUP);
if (strstart(uri, "tcp:", &p) ||
strstart(uri, "unix:", NULL) ||
@@ -474,7 +470,6 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp)
} else if (strstart(uri, "fd:", &p)) {
fd_start_incoming_migration(p, errp);
} else {
- yank_unregister_instance(MIGRATION_YANK_INSTANCE);
error_setg(errp, "unknown migration protocol: %s", uri);
}
}
@@ -2083,9 +2078,14 @@ void qmp_migrate_incoming(const char *uri, Error **errp)
return;
}
+ if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
+ return;
+ }
+
qemu_start_incoming_migration(uri, &local_err);
if (local_err) {
+ yank_unregister_instance(MIGRATION_YANK_INSTANCE);
error_propagate(errp, local_err);
return;
}
@@ -2097,6 +2097,13 @@ void qmp_migrate_recover(const char *uri, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
+ /*
+ * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
+ * callers (no one should ignore a recover failure); if there is, it's a
+ * programming error.
+ */
+ assert(errp);
+
if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
error_setg(errp, "Migrate recover can only be run "
"when postcopy is paused.");
@@ -2114,8 +2121,13 @@ void qmp_migrate_recover(const char *uri, Error **errp)
* only re-setup the migration stream and poke existing migration
* to continue using that newly established channel.
*/
- yank_unregister_instance(MIGRATION_YANK_INSTANCE);
qemu_start_incoming_migration(uri, errp);
+
+ /* Safe to dereference with the assert above */
+ if (*errp) {
+ /* Reset the flag so user could still retry */
+ qatomic_set(&mis->postcopy_recover_triggered, false);
+ }
}
void qmp_migrate_pause(Error **errp)
@@ -3665,6 +3677,39 @@ bool migration_rate_limit(void)
}
/*
+ * if failover devices are present, wait they are completely
+ * unplugged
+ */
+
+static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
+ int new_state)
+{
+ if (qemu_savevm_state_guest_unplug_pending()) {
+ migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
+
+ while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
+ qemu_savevm_state_guest_unplug_pending()) {
+ qemu_sem_timedwait(&s->wait_unplug_sem, 250);
+ }
+ if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
+ int timeout = 120; /* 30 seconds */
+ /*
+ * migration has been canceled
+ * but as we have started an unplug we must wait the end
+ * to be able to plug back the card
+ */
+ while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
+ qemu_sem_timedwait(&s->wait_unplug_sem, 250);
+ }
+ }
+
+ migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
+ } else {
+ migrate_set_state(&s->state, old_state, new_state);
+ }
+}
+
+/*
* Master migration thread on the source VM.
* It drives the migration and pumps the data down the outgoing channel.
*/
@@ -3710,22 +3755,10 @@ static void *migration_thread(void *opaque)
qemu_savevm_state_setup(s->to_dst_file);
- if (qemu_savevm_state_guest_unplug_pending()) {
- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
- MIGRATION_STATUS_WAIT_UNPLUG);
-
- while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
- qemu_savevm_state_guest_unplug_pending()) {
- qemu_sem_timedwait(&s->wait_unplug_sem, 250);
- }
-
- migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
- MIGRATION_STATUS_ACTIVE);
- }
+ qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_ACTIVE);
s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
- MIGRATION_STATUS_ACTIVE);
trace_migration_thread_setup_complete();
@@ -3833,21 +3866,9 @@ static void *bg_migration_thread(void *opaque)
qemu_savevm_state_header(s->to_dst_file);
qemu_savevm_state_setup(s->to_dst_file);
- if (qemu_savevm_state_guest_unplug_pending()) {
- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
- MIGRATION_STATUS_WAIT_UNPLUG);
-
- while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
- qemu_savevm_state_guest_unplug_pending()) {
- qemu_sem_timedwait(&s->wait_unplug_sem, 250);
- }
+ qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_ACTIVE);
- migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG,
- MIGRATION_STATUS_ACTIVE);
- } else {
- migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
- MIGRATION_STATUS_ACTIVE);
- }
s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
trace_migration_thread_setup_complete();
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index d6e03db..1eacf9e 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -416,6 +416,11 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
{
f->iov[f->iovcnt - 1].iov_len += size;
} else {
+ if (f->iovcnt >= MAX_IOV_SIZE) {
+ /* Should only happen if a previous fflush failed */
+ assert(f->shutdown || !qemu_file_is_writable(f));
+ return 1;
+ }
if (may_free) {
set_bit(f->iovcnt, f->may_free);
}
diff --git a/migration/rdma.c b/migration/rdma.c
index d90b29a..b6cc4be 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1006,7 +1006,7 @@ route:
if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
ERROR(errp, "result not equal to event_addr_resolved %s",
rdma_event_str(cm_event->event));
- perror("rdma_resolve_addr");
+ error_report("rdma_resolve_addr");
rdma_ack_cm_event(cm_event);
ret = -EINVAL;
goto err_resolve_get_addr;
@@ -2544,7 +2544,7 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
}
if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
- perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
+ error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
ERROR(errp, "connecting to destination!");
rdma_ack_cm_event(cm_event);
goto err_rdma_source_connect;
diff --git a/tests/migration/guestperf/engine.py b/tests/migration/guestperf/engine.py
index 208e095..7c991c4 100644
--- a/tests/migration/guestperf/engine.py
+++ b/tests/migration/guestperf/engine.py
@@ -113,7 +113,7 @@ class Engine(object):
vcpus = src.command("query-cpus-fast")
src_threads = []
for vcpu in vcpus:
- src_threads.append(vcpu["thread_id"])
+ src_threads.append(vcpu["thread-id"])
# XXX how to get dst timings on remote host ?
@@ -153,7 +153,7 @@ class Engine(object):
max_bandwidth=scenario._bandwidth * 1024 * 1024)
resp = src.command("migrate-set-parameters",
- downtime_limit=scenario._downtime / 1024.0)
+ downtime_limit=scenario._downtime)
if scenario._compression_mt:
resp = src.command("migrate-set-capabilities",
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 2b028df..328d6db 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -27,6 +27,10 @@
#include "migration-helpers.h"
#include "tests/migration/migration-test.h"
+#if defined(__linux__)
+#include "linux/kvm.h"
+#endif
+
/* TODO actually test the results and get rid of this */
#define qtest_qmp_discard_response(...) qobject_unref(qtest_qmp(__VA_ARGS__))
@@ -467,6 +471,8 @@ typedef struct {
bool use_shmem;
/* only launch the target process */
bool only_target;
+ /* Use dirty ring if true; dirty logging otherwise */
+ bool use_dirty_ring;
char *opts_source;
char *opts_target;
} MigrateStart;
@@ -573,11 +579,13 @@ static int test_migrate_start(QTestState **from, QTestState **to,
shmem_opts = g_strdup("");
}
- cmd_source = g_strdup_printf("-accel kvm -accel tcg%s%s "
+ cmd_source = g_strdup_printf("-accel kvm%s -accel tcg%s%s "
"-name source,debug-threads=on "
"-m %s "
"-serial file:%s/src_serial "
"%s %s %s %s",
+ args->use_dirty_ring ?
+ ",dirty-ring-size=4096" : "",
machine_opts ? " -machine " : "",
machine_opts ? machine_opts : "",
memory_size, tmpfs,
@@ -587,12 +595,14 @@ static int test_migrate_start(QTestState **from, QTestState **to,
*from = qtest_init(cmd_source);
}
- cmd_target = g_strdup_printf("-accel kvm -accel tcg%s%s "
+ cmd_target = g_strdup_printf("-accel kvm%s -accel tcg%s%s "
"-name target,debug-threads=on "
"-m %s "
"-serial file:%s/dest_serial "
"-incoming %s "
"%s %s %s %s",
+ args->use_dirty_ring ?
+ ",dirty-ring-size=4096" : "",
machine_opts ? " -machine " : "",
machine_opts ? machine_opts : "",
memory_size, tmpfs, uri,
@@ -785,12 +795,14 @@ static void test_baddest(void)
test_migrate_end(from, to, false);
}
-static void test_precopy_unix(void)
+static void test_precopy_unix_common(bool dirty_ring)
{
g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
MigrateStart *args = migrate_start_new();
QTestState *from, *to;
+ args->use_dirty_ring = dirty_ring;
+
if (test_migrate_start(&from, &to, uri, args)) {
return;
}
@@ -825,6 +837,18 @@ static void test_precopy_unix(void)
test_migrate_end(from, to, true);
}
+static void test_precopy_unix(void)
+{
+ /* Using default dirty logging */
+ test_precopy_unix_common(false);
+}
+
+static void test_precopy_unix_dirty_ring(void)
+{
+ /* Using dirty ring tracking */
+ test_precopy_unix_common(true);
+}
+
#if 0
/* Currently upset on aarch64 TCG */
static void test_ignore_shared(void)
@@ -1369,6 +1393,29 @@ static void test_multifd_tcp_cancel(void)
test_migrate_end(from, to2, true);
}
+static bool kvm_dirty_ring_supported(void)
+{
+#if defined(__linux__)
+ int ret, kvm_fd = open("/dev/kvm", O_RDONLY);
+
+ if (kvm_fd < 0) {
+ return false;
+ }
+
+ ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DIRTY_LOG_RING);
+ close(kvm_fd);
+
+ /* We test with 4096 slots */
+ if (ret < 4096) {
+ return false;
+ }
+
+ return true;
+#else
+ return false;
+#endif
+}
+
int main(int argc, char **argv)
{
char template[] = "/tmp/migration-test-XXXXXX";
@@ -1439,6 +1486,11 @@ int main(int argc, char **argv)
qtest_add_func("/migration/multifd/tcp/zstd", test_multifd_tcp_zstd);
#endif
+ if (kvm_dirty_ring_supported()) {
+ qtest_add_func("/migration/dirty_ring",
+ test_precopy_unix_dirty_ring);
+ }
+
ret = g_test_run();
g_assert_cmpint(ret, ==, 0);
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h
index fa96718..0c2665b 100644
--- a/tools/virtiofsd/fuse_common.h
+++ b/tools/virtiofsd/fuse_common.h
@@ -373,6 +373,11 @@ struct fuse_file_info {
#define FUSE_CAP_HANDLE_KILLPRIV_V2 (1 << 28)
/**
+ * Indicates that file server supports extended struct fuse_setxattr_in
+ */
+#define FUSE_CAP_SETXATTR_EXT (1 << 29)
+
+/**
* Ioctl flags
*
* FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index 7fe2cef..e4679c7 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -1084,6 +1084,12 @@ static void do_open(fuse_req_t req, fuse_ino_t nodeid,
return;
}
+ /* File creation is handled by do_create() or do_mknod() */
+ if (arg->flags & (O_CREAT | O_TMPFILE)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
memset(&fi, 0, sizeof(fi));
fi.flags = arg->flags;
fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID;
@@ -1419,8 +1425,13 @@ static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid,
struct fuse_setxattr_in *arg;
const char *name;
const char *value;
+ bool setxattr_ext = req->se->conn.want & FUSE_CAP_SETXATTR_EXT;
- arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ if (setxattr_ext) {
+ arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+ } else {
+ arg = fuse_mbuf_iter_advance(iter, FUSE_COMPAT_SETXATTR_IN_SIZE);
+ }
name = fuse_mbuf_iter_advance_str(iter);
if (!arg || !name) {
fuse_reply_err(req, EINVAL);
@@ -1434,7 +1445,9 @@ static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid,
}
if (req->se->op.setxattr) {
- req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags);
+ uint32_t setxattr_flags = setxattr_ext ? arg->setxattr_flags : 0;
+ req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags,
+ setxattr_flags);
} else {
fuse_reply_err(req, ENOSYS);
}
@@ -1981,6 +1994,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) {
se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2;
}
+ if (arg->flags & FUSE_SETXATTR_EXT) {
+ se->conn.capable |= FUSE_CAP_SETXATTR_EXT;
+ }
#ifdef HAVE_SPLICE
#ifdef HAVE_VMSPLICE
se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
@@ -2116,6 +2132,10 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
outarg.flags |= FUSE_HANDLE_KILLPRIV_V2;
}
+ if (se->conn.want & FUSE_CAP_SETXATTR_EXT) {
+ outarg.flags |= FUSE_SETXATTR_EXT;
+ }
+
fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor);
fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags);
fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead);
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 3bf786b..4b4e8c9 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -798,7 +798,8 @@ struct fuse_lowlevel_ops {
* fuse_reply_err
*/
void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name,
- const char *value, size_t size, int flags);
+ const char *value, size_t size, int flags,
+ uint32_t setxattr_flags);
/**
* Get an extended attribute
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
index 5e98ed7..a8295d9 100644
--- a/tools/virtiofsd/helper.c
+++ b/tools/virtiofsd/helper.c
@@ -186,6 +186,7 @@ void fuse_cmdline_help(void)
" to virtiofsd from guest applications.\n"
" default: no_allow_direct_io\n"
" -o announce_submounts Announce sub-mount points to the guest\n"
+ " -o posix_acl/no_posix_acl Enable/Disable posix_acl. (default: disabled)\n"
);
}
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 49c21fd..38b2af8 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -122,6 +122,7 @@ struct lo_inode {
struct lo_cred {
uid_t euid;
gid_t egid;
+ mode_t umask;
};
enum {
@@ -172,6 +173,9 @@ struct lo_data {
/* An O_PATH file descriptor to /proc/self/fd/ */
int proc_self_fd;
int user_killpriv_v2, killpriv_v2;
+ /* If set, virtiofsd is responsible for setting umask during creation */
+ bool change_umask;
+ int user_posix_acl, posix_acl;
};
static const struct fuse_opt lo_opts[] = {
@@ -204,6 +208,8 @@ static const struct fuse_opt lo_opts[] = {
{ "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
{ "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
{ "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
+ { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 },
+ { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 },
FUSE_OPT_END
};
static bool use_syslog = false;
@@ -702,6 +708,32 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
lo->killpriv_v2 = 0;
}
+
+ if (lo->user_posix_acl == 1) {
+ /*
+ * User explicitly asked for this option. Enable it unconditionally.
+ * If connection does not have this capability, print error message
+ * now. It will fail later in fuse_lowlevel.c
+ */
+ if (!(conn->capable & FUSE_CAP_POSIX_ACL) ||
+ !(conn->capable & FUSE_CAP_DONT_MASK) ||
+ !(conn->capable & FUSE_CAP_SETXATTR_EXT)) {
+ fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl."
+ " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK"
+ " or FUSE_SETXATTR_EXT capability.\n");
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n");
+ }
+
+ conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK |
+ FUSE_CAP_SETXATTR_EXT;
+ lo->change_umask = true;
+ lo->posix_acl = true;
+ } else {
+ /* User either did not specify anything or wants it disabled */
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n");
+ conn->want &= ~FUSE_CAP_POSIX_ACL;
+ }
}
static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
@@ -1134,7 +1166,8 @@ static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
* ownership of caller.
* TODO: What about selinux context?
*/
-static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
+static int lo_change_cred(fuse_req_t req, struct lo_cred *old,
+ bool change_umask)
{
int res;
@@ -1154,11 +1187,14 @@ static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
return errno_save;
}
+ if (change_umask) {
+ old->umask = umask(req->ctx.umask);
+ }
return 0;
}
/* Regain Privileges */
-static void lo_restore_cred(struct lo_cred *old)
+static void lo_restore_cred(struct lo_cred *old, bool restore_umask)
{
int res;
@@ -1173,6 +1209,54 @@ static void lo_restore_cred(struct lo_cred *old)
fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
exit(1);
}
+
+ if (restore_umask)
+ umask(old->umask);
+}
+
+/*
+ * A helper to change cred and drop capability. Returns 0 on success and
+ * errno on error
+ */
+static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old,
+ bool change_umask, const char *cap_name,
+ bool *cap_dropped)
+{
+ int ret;
+ bool __cap_dropped;
+
+ assert(cap_name);
+
+ ret = drop_effective_cap(cap_name, &__cap_dropped);
+ if (ret) {
+ return ret;
+ }
+
+ ret = lo_change_cred(req, old, change_umask);
+ if (ret) {
+ if (__cap_dropped) {
+ if (gain_effective_cap(cap_name)) {
+ fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
+ }
+ }
+ }
+
+ if (cap_dropped) {
+ *cap_dropped = __cap_dropped;
+ }
+ return ret;
+}
+
+static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask,
+ const char *cap_name)
+{
+ assert(cap_name);
+
+ lo_restore_cred(old, restore_umask);
+
+ if (gain_effective_cap(cap_name)) {
+ fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
+ }
}
static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
@@ -1202,7 +1286,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
return;
}
- saverr = lo_change_cred(req, &old);
+ saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode));
if (saverr) {
goto out;
}
@@ -1211,7 +1295,7 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
saverr = errno;
- lo_restore_cred(&old);
+ lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode));
if (res == -1) {
goto out;
@@ -1917,7 +2001,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
return;
}
- err = lo_change_cred(req, &old);
+ err = lo_change_cred(req, &old, lo->change_umask);
if (err) {
goto out;
}
@@ -1928,7 +2012,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode);
err = fd == -1 ? errno : 0;
- lo_restore_cred(&old);
+ lo_restore_cred(&old, lo->change_umask);
/* Ignore the error if file exists and O_EXCL was not given */
if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
@@ -2727,6 +2811,63 @@ static int xattr_map_server(const struct lo_data *lo, const char *server_name,
assert(fchdir_res == 0); \
} while (0)
+static bool block_xattr(struct lo_data *lo, const char *name)
+{
+ /*
+ * If user explicitly enabled posix_acl or did not provide any option,
+ * do not block acl. Otherwise block system.posix_acl_access and
+ * system.posix_acl_default xattrs.
+ */
+ if (lo->user_posix_acl) {
+ return false;
+ }
+ if (!strcmp(name, "system.posix_acl_access") ||
+ !strcmp(name, "system.posix_acl_default"))
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns number of bytes in xattr_list after filtering on success. This
+ * could be zero as well if nothing is left after filtering.
+ *
+ * Returns negative error code on failure.
+ * xattr_list is modified in place.
+ */
+static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list,
+ unsigned in_size)
+{
+ size_t out_index, in_index;
+
+ /*
+ * As of now we only filter out acl xattrs. If acls are enabled or
+ * they have not been explicitly disabled, there is nothing to
+ * filter.
+ */
+ if (lo->user_posix_acl) {
+ return in_size;
+ }
+
+ out_index = 0;
+ in_index = 0;
+ while (in_index < in_size) {
+ char *in_ptr = xattr_list + in_index;
+
+ /* Length of current attribute name */
+ size_t in_len = strlen(xattr_list + in_index) + 1;
+
+ if (!block_xattr(lo, in_ptr)) {
+ if (in_index != out_index) {
+ memmove(xattr_list + out_index, xattr_list + in_index, in_len);
+ }
+ out_index += in_len;
+ }
+ in_index += in_len;
+ }
+ return out_index;
+}
+
static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
size_t size)
{
@@ -2740,6 +2881,11 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
int saverr;
int fd = -1;
+ if (block_xattr(lo, in_name)) {
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
+
mapped_name = NULL;
name = in_name;
if (lo->xattrmap) {
@@ -2791,15 +2937,17 @@ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
goto out_err;
}
ret = fgetxattr(fd, name, value, size);
+ saverr = ret == -1 ? errno : 0;
} else {
/* fchdir should not fail here */
FCHDIR_NOFAIL(lo->proc_self_fd);
ret = getxattr(procname, name, value, size);
+ saverr = ret == -1 ? errno : 0;
FCHDIR_NOFAIL(lo->root.fd);
}
if (ret == -1) {
- goto out_err;
+ goto out;
}
if (size) {
saverr = 0;
@@ -2864,15 +3012,17 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
goto out_err;
}
ret = flistxattr(fd, value, size);
+ saverr = ret == -1 ? errno : 0;
} else {
/* fchdir should not fail here */
FCHDIR_NOFAIL(lo->proc_self_fd);
ret = listxattr(procname, value, size);
+ saverr = ret == -1 ? errno : 0;
FCHDIR_NOFAIL(lo->root.fd);
}
if (ret == -1) {
- goto out_err;
+ goto out;
}
if (size) {
saverr = 0;
@@ -2926,6 +3076,12 @@ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
goto out;
}
}
+
+ ret = remove_blocked_xattrs(lo, value, ret);
+ if (ret <= 0) {
+ saverr = -ret;
+ goto out;
+ }
fuse_reply_buf(req, value, ret);
} else {
/*
@@ -2951,7 +3107,8 @@ out:
}
static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
- const char *value, size_t size, int flags)
+ const char *value, size_t size, int flags,
+ uint32_t extra_flags)
{
char procname[64];
const char *name;
@@ -2961,6 +3118,14 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
ssize_t ret;
int saverr;
int fd = -1;
+ bool switched_creds = false;
+ bool cap_fsetid_dropped = false;
+ struct lo_cred old = {};
+
+ if (block_xattr(lo, in_name)) {
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
mapped_name = NULL;
name = in_name;
@@ -2991,6 +3156,26 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
", name=%s value=%s size=%zd)\n", ino, name, value, size);
sprintf(procname, "%i", inode->fd);
+ /*
+ * If we are setting posix access acl and if SGID needs to be
+ * cleared, then switch to caller's gid and drop CAP_FSETID
+ * and that should make sure host kernel clears SGID.
+ *
+ * This probably will not work when we support idmapped mounts.
+ * In that case we will need to find a non-root gid and switch
+ * to it. (Instead of gid in request). Fix it when we support
+ * idmapped mounts.
+ */
+ if (lo->posix_acl && !strcmp(name, "system.posix_acl_access")
+ && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) {
+ ret = lo_drop_cap_change_cred(req, &old, false, "FSETID",
+ &cap_fsetid_dropped);
+ if (ret) {
+ saverr = ret;
+ goto out;
+ }
+ switched_creds = true;
+ }
if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
fd = openat(lo->proc_self_fd, procname, O_RDONLY);
if (fd < 0) {
@@ -2998,14 +3183,20 @@ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
goto out;
}
ret = fsetxattr(fd, name, value, size, flags);
+ saverr = ret == -1 ? errno : 0;
} else {
/* fchdir should not fail here */
FCHDIR_NOFAIL(lo->proc_self_fd);
ret = setxattr(procname, name, value, size, flags);
+ saverr = ret == -1 ? errno : 0;
FCHDIR_NOFAIL(lo->root.fd);
}
-
- saverr = ret == -1 ? errno : 0;
+ if (switched_creds) {
+ if (cap_fsetid_dropped)
+ lo_restore_cred_gain_cap(&old, false, "FSETID");
+ else
+ lo_restore_cred(&old, false);
+ }
out:
if (fd >= 0) {
@@ -3028,6 +3219,11 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
int saverr;
int fd = -1;
+ if (block_xattr(lo, in_name)) {
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
+
mapped_name = NULL;
name = in_name;
if (lo->xattrmap) {
@@ -3064,15 +3260,15 @@ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
goto out;
}
ret = fremovexattr(fd, name);
+ saverr = ret == -1 ? errno : 0;
} else {
/* fchdir should not fail here */
FCHDIR_NOFAIL(lo->proc_self_fd);
ret = removexattr(procname, name);
+ saverr = ret == -1 ? errno : 0;
FCHDIR_NOFAIL(lo->root.fd);
}
- saverr = ret == -1 ? errno : 0;
-
out:
if (fd >= 0) {
close(fd);
@@ -3559,10 +3755,6 @@ static void setup_nofile_rlimit(unsigned long rlimit_nofile)
static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
{
g_autofree char *localfmt = NULL;
- struct timespec ts;
- struct tm tm;
- char sec_fmt[sizeof "2020-12-07 18:17:54"];
- char zone_fmt[sizeof "+0100"];
if (current_log_level < level) {
return;
@@ -3574,23 +3766,10 @@ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
fmt);
} else {
- /* try formatting a broken-down timestamp */
- if (clock_gettime(CLOCK_REALTIME, &ts) != -1 &&
- localtime_r(&ts.tv_sec, &tm) != NULL &&
- strftime(sec_fmt, sizeof sec_fmt, "%Y-%m-%d %H:%M:%S",
- &tm) != 0 &&
- strftime(zone_fmt, sizeof zone_fmt, "%z", &tm) != 0) {
- localfmt = g_strdup_printf("[%s.%02ld%s] [ID: %08ld] %s",
- sec_fmt,
- ts.tv_nsec / (10L * 1000 * 1000),
- zone_fmt, syscall(__NR_gettid),
- fmt);
- } else {
- /* fall back to a flat timestamp */
- localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
- get_clock(), syscall(__NR_gettid),
- fmt);
- }
+ g_autoptr(GDateTime) now = g_date_time_new_now_utc();
+ g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z");
+ localfmt = g_strdup_printf("[%s] [ID: %08ld] %s",
+ nowstr, syscall(__NR_gettid), fmt);
}
fmt = localfmt;
}
@@ -3722,6 +3901,7 @@ int main(int argc, char *argv[])
.allow_direct_io = 0,
.proc_self_fd = -1,
.user_killpriv_v2 = -1,
+ .user_posix_acl = -1,
};
struct lo_map_elem *root_elem;
struct lo_map_elem *reserve_elem;
@@ -3850,6 +4030,12 @@ int main(int argc, char *argv[])
exit(1);
}
+ if (lo.user_posix_acl == 1 && !lo.xattr) {
+ fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled."
+ "\n");
+ exit(1);
+ }
+
lo.use_statx = true;
se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c
index 62441cf..f49ed94 100644
--- a/tools/virtiofsd/passthrough_seccomp.c
+++ b/tools/virtiofsd/passthrough_seccomp.c
@@ -114,6 +114,7 @@ static const int syscall_allowlist[] = {
SCMP_SYS(utimensat),
SCMP_SYS(write),
SCMP_SYS(writev),
+ SCMP_SYS(umask),
};
/* Syscalls used when --syslog is enabled */