/* * Copyright (c) Meta Platforms, Inc. and affiliates * * Authors: Mattias Nissler * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Nutanix nor the names of its contributors may be * used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LINUX_KCMP_H #include #endif #include "btree.h" #include "common.h" #include "fd_cache.h" /* * The file descriptor cache is a global B-tree, protected by a mutex. * * Note that libvfio-user's general approach to thread safety is to assume * a given vfu_ctx_t is operated on a single thread only or protected by * appropriate locking by the caller (with a noteworthy exception for DMA via * some vfu_sgl_* APIs). However, since file descriptors are a per-process * resource, we maintain a process-global file descriptor cache and protect it * with a mutex in order to keep keep multiple vfu_ctx_t independent and avoid * imposing locking requirements across contexts. */ static btree_t cache_tree; static pthread_mutex_t cache_mutex = PTHREAD_MUTEX_INITIALIZER; /* Not static so the unit test can access these for fallback testing. */ bool kcmp_checked; bool kcmp_available; struct fd_cache_entry { int fd; dev_t dev; ino_t ino; int flags; int refcount; }; #ifdef HAVE_LINUX_KCMP_H static int kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) { return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2); } /* Note that this is called under cache_mutex, thus guaranteeing once-only init. */ static bool is_kcmp_available(int fd) { if (!kcmp_checked) { pid_t pid = getpid(); kcmp_available = kcmp(pid, pid, KCMP_FILE, fd, fd) == 0; kcmp_checked = true; } return kcmp_available; } #else #define is_kcmp_available(fd) (false) #endif static int is_same_file(const struct fd_cache_entry *entry1, const struct fd_cache_entry *entry2) { #ifdef HAVE_LINUX_KCMP_H if (is_kcmp_available(entry1->fd)) { pid_t pid = getpid(); return kcmp(pid, pid, KCMP_FILE, entry1->fd, entry2->fd); } #endif /* * We prefer using kcmp for detecting file descriptor duplicates. * * Unfortunately, it's not available on all kernel versions/configurations, * and Docker's default seccomp policy blocks it. Thus we fall back to * comparing device and inode numbers as well as flags bits. This considers * file descriptors equivalent if they are referring to the same file on * disk (as per device/inode numbers) and were opened in identical mode * (such as read/write, as determined by flags). * * Note that this fallback isn't a perfect replacement for kcmp, as it will * consider file descriptors identical even though they originate from * separate open() calls as long as their parameters match. That's OK * though, as we really only care about the targets. Specifically, the file * positions aren't of relevance due to using pread/pwrite with explicit * offsets. */ return !(entry1->dev == entry2->dev && entry1->ino == entry2->ino && entry1->flags == entry2->flags); } static int fd_cache_entry_init(int fd, struct fd_cache_entry *entry, bool *accurate) { struct stat st; int flags; if (fstat(fd, &st) != 0 || (flags = fcntl(fd, F_GETFL)) == -1) { return -1; } entry->fd = fd; entry->dev = st.st_dev; entry->ino = st.st_ino; entry->flags = flags; entry->refcount = 0; /* * Indicate whether fd equivalence testing will work accurately for the fd. * This is trivially true when kcmp is available. When operating in * fallback mode, declare that non-regular files can't be accurately * compared. This is to exclude most "exotic" file descriptors such as * eventfd, etc. The reason is that some of these don't have unique inode * numbers, which would cause false positives in (dev, inode)-based * duplicate detection. */ *accurate = is_kcmp_available(fd) || S_ISREG(st.st_mode); return 0; } static uintptr_t fd_cache_entry_compute_key(const struct fd_cache_entry *entry) { /* * Compute a key from device and inode numbers. Note that the key value * isn't unique per file descriptor, for example in case a file gets * open()-ed multiple times. The value is expected to be unique per file * file though, as long as the computed keys don't happen to collide. * Either way, identical keys are tolerable thanks to the cache B-Tree * being able to handle multiple entries with the same key, at least as * long as the number of identical keys doesn't grow too large. * * Given that device and inode numbers are usually only using a lower * portion of a 64 bit integer's range, we byte-swap one of the numbers in * an effort to reduce the likelihood of key collisions. */ return entry->ino ^ (bswap_64(entry->dev) >> ((sizeof(uint64_t) - sizeof(uintptr_t)) * CHAR_BIT)); } static int fd_cache_lookup(const struct fd_cache_entry *query, btree_iter_t *iter, struct fd_cache_entry **entry) { uintptr_t query_key = fd_cache_entry_compute_key(query); uintptr_t key; int ret; /* * Note that the key is not guaranteed to be unique: Collisions can happen * if we have multiple independent file descriptors (not just duplicates) * for a given file (e.g. from multiple open() calls with different * read/write access modes), or when the cache keys are colliding * accidentally (due to unfortunate device/inode number values). * * Thus, we iterate over all entries matching the target key and rely on * `is_same_file` to identify actually matching entries. */ for (btree_iter_init(&cache_tree, query_key, iter); (*entry = btree_iter_get(iter, &key)) != NULL && query_key == key; btree_iter_next(iter)) { ret = is_same_file(query, *entry); if (ret <= 0) { return ret; } } *entry = NULL; return 0; } static int fd_cache_get_locked(int fd) { struct fd_cache_entry *entry; struct fd_cache_entry query; btree_iter_t iter; uintptr_t key; bool accurate; if (fd_cache_entry_init(fd, &query, &accurate) != 0) { return -1; } /* * When we can't make accurate comparisons, bypass the cache and * pretend to the caller that everything is fine. */ if (!accurate) { return fd; } if (fd_cache_lookup(&query, &iter, &entry) != 0) { return -1; } if (entry != NULL) { close_safely(&fd); ++entry->refcount; return entry->fd; } entry = calloc(1, sizeof(*entry)); if (entry == NULL) { errno = ENOMEM; return -1; } *entry = query; entry->refcount = 1; key = fd_cache_entry_compute_key(entry); if (btree_iter_insert(&iter, key, entry) != 0) { free(entry); return -1; } return entry->fd; } static int fd_cache_put_locked(int *fd) { struct fd_cache_entry *entry; struct fd_cache_entry query; btree_iter_t iter; bool accurate; if (*fd == -1) { return 0; } if (fd_cache_entry_init(*fd, &query, &accurate) != 0) { return -1; } if (!accurate) { /* * This file descriptor isn't eligible for de-duplication and thus * fd_cache_get hasn't created a cache entry. Just close the * descriptor to provide consistent semantics to the caller. */ close_safely(fd); return 0; } if (fd_cache_lookup(&query, &iter, &entry) != 0) { return -1; } if (entry == NULL || entry->fd != *fd) { errno = ENOENT; return -1; } assert(entry->refcount > 0); --entry->refcount; if (entry->refcount == 0) { btree_iter_remove(&iter); close_safely(&entry->fd); free(entry); } *fd = -1; return 0; } int fd_cache_get(int fd) { int ret; pthread_mutex_lock(&cache_mutex); ret = fd_cache_get_locked(fd); pthread_mutex_unlock(&cache_mutex); return ret; } int fd_cache_put(int *fd) { int ret; pthread_mutex_lock(&cache_mutex); ret = fd_cache_put_locked(fd); pthread_mutex_unlock(&cache_mutex); return ret; } int fd_cache_is_same_file(int fd1, int fd2) { struct fd_cache_entry entry1; struct fd_cache_entry entry2; bool accurate; int ret = -1; if (fd1 == fd2) { return 0; } else if (fd1 == -1 || fd2 == -1) { return 1; } pthread_mutex_lock(&cache_mutex); if (fd_cache_entry_init(fd1, &entry1, &accurate) == 0 && fd_cache_entry_init(fd2, &entry2, &accurate) == 0) { ret = is_same_file(&entry1, &entry2); } pthread_mutex_unlock(&cache_mutex); return ret; } /* ex: set tabstop=4 shiftwidth=4 softtabstop=4 expandtab: */