aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Hajnoczi <stefanha@redhat.com>2022-10-30 18:31:59 -0400
committerStefan Hajnoczi <stefanha@redhat.com>2022-10-30 18:31:59 -0400
commit7208429223963c405c62fa2611398f1aa8033593 (patch)
tree4e4bb0a31f0b2be625b9901d811e0c522d16688d
parent5eff7badae58fdfb8ff179e72cd5546425cf8a6f (diff)
parentbd77c30df984faefa85e6a402939b485d6e05f05 (diff)
downloadqemu-7208429223963c405c62fa2611398f1aa8033593.zip
qemu-7208429223963c405c62fa2611398f1aa8033593.tar.gz
qemu-7208429223963c405c62fa2611398f1aa8033593.tar.bz2
Merge tag 'mem-2022-10-28' of https://github.com/davidhildenbrand/qemu into staging
Hi, "Host Memory Backends" and "Memory devices" queue ("mem"): - Fix NVDIMM error message - Add ThreadContext user-creatable object and wire it up for NUMA-aware hostmem preallocation # -----BEGIN PGP SIGNATURE----- # # iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmNbpHARHGRhdmlkQHJl # ZGhhdC5jb20ACgkQTd4Q9wD/g1pDpw//bG9cyIlzTzDnU5pbQiXyLm0nF9tW/tli # npGPSbFFYz/72XD9VJSVLhbNHoQSmFcMK5m/DA4WAMdOc5zF7lP3XdZcj72pDyxu # 31hJRvuRhxNb09jhEdWRfX5+Jg9UyYXuIvtKXHSWgrtaYDtHBdTXq/ojZlvlo/rr # 36v0jaVaTNRs7dKQL2oaN+DSMiPXHxBzA6FABqYmJNNwuMJT0kkX8pfz0OFwkRn+ # iqf9uRhM6b/fNNB0+ReA7FfGL+hzU6Uv8AvAL3orXUqjwPMRe9Fz2gE7HpFnE6DD # dOP4Xk2iSSJ5XQA8HwtvrQfrGPh4gPYE80ziK/+8boy3alVeGYbYbvWVtdsNju41 # Cq9kM1wDyjZf6SSUIAbjOrNPdbhwyK4GviVBR1zh+/gA3uF5MhrDtZh4h3mWX2if # ijmT9mfte4NwF3K1MvckAl7IHRb8nxmr7wjjhJ26JwpD+76lfAcmXC2YOlFGHCMi # 028mjvThf3HW7BD2LjlQSX4UkHmM2vUBrgMGQKyeMham1VmMfSK32wzvUNfF7xSz # o9k0loBh7unGcUsv3EbqUGswV5F6AgjK3vWRkDql8dNrdIoapDfaejPCd58kVM98 # 5N/aEoha4bAeJ6NGIKzD+4saiMxUqJ0y2NjSrE8iO4HszXgZW5e1Gbkn4Ae6d37D # QSSqyfasVHY= # =bLuc # -----END PGP SIGNATURE----- # gpg: Signature made Fri 28 Oct 2022 05:44:16 EDT # gpg: using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A # gpg: issuer "david@redhat.com" # gpg: Good signature from "David Hildenbrand <david@redhat.com>" [unknown] # gpg: aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full] # gpg: aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown] # gpg: WARNING: The key's User ID is not certified with a trusted signature! # gpg: There is no indication that the signature belongs to the owner. # Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D FCCA 4DDE 10F7 00FF 835A * tag 'mem-2022-10-28' of https://github.com/davidhildenbrand/qemu: vl: Allow ThreadContext objects to be created before the sandbox option hostmem: Allow for specifying a ThreadContext for preallocation util: Make qemu_prealloc_mem() optionally consume a ThreadContext util: Add write-only "node-affinity" property for ThreadContext util: Introduce ThreadContext user-creatable object util: Introduce qemu_thread_set_affinity() and qemu_thread_get_affinity() util: Cleanup and rename os_mem_prealloc() hw/mem/nvdimm: fix error message for 'unarmed' flag Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r--backends/hostmem.c13
-rw-r--r--hw/mem/nvdimm.c2
-rw-r--r--hw/virtio/virtio-mem.c2
-rw-r--r--include/qemu/osdep.h19
-rw-r--r--include/qemu/thread-context.h57
-rw-r--r--include/qemu/thread.h4
-rw-r--r--include/sysemu/hostmem.h2
-rw-r--r--meson.build16
-rw-r--r--qapi/qom.json28
-rw-r--r--softmmu/cpus.c2
-rw-r--r--softmmu/vl.c36
-rw-r--r--util/meson.build1
-rw-r--r--util/oslib-posix.c39
-rw-r--r--util/oslib-win32.c8
-rw-r--r--util/qemu-thread-posix.c70
-rw-r--r--util/qemu-thread-win32.c12
-rw-r--r--util/thread-context.c362
17 files changed, 642 insertions, 31 deletions
diff --git a/backends/hostmem.c b/backends/hostmem.c
index 4428e06..8640294 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -232,7 +232,8 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
void *ptr = memory_region_get_ram_ptr(&backend->mr);
uint64_t sz = memory_region_size(&backend->mr);
- os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, &local_err);
+ qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
+ backend->prealloc_context, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
@@ -383,8 +384,9 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
* specified NUMA policy in place.
*/
if (backend->prealloc) {
- os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
- backend->prealloc_threads, &local_err);
+ qemu_prealloc_mem(memory_region_get_fd(&backend->mr), ptr, sz,
+ backend->prealloc_threads,
+ backend->prealloc_context, &local_err);
if (local_err) {
goto out;
}
@@ -492,6 +494,11 @@ host_memory_backend_class_init(ObjectClass *oc, void *data)
NULL, NULL);
object_class_property_set_description(oc, "prealloc-threads",
"Number of CPU threads to use for prealloc");
+ object_class_property_add_link(oc, "prealloc-context",
+ TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context),
+ object_property_allow_set_link, OBJ_PROP_LINK_STRONG);
+ object_class_property_set_description(oc, "prealloc-context",
+ "Context to use for creating CPU threads for preallocation");
object_class_property_add(oc, "size", "int",
host_memory_backend_get_size,
host_memory_backend_set_size,
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 7c7d777..31080c2 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -149,7 +149,7 @@ static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
if (!nvdimm->unarmed && memory_region_is_rom(mr)) {
HostMemoryBackend *hostmem = dimm->hostmem;
- error_setg(errp, "'unarmed' property must be off since memdev %s "
+ error_setg(errp, "'unarmed' property must be 'on' since memdev %s "
"is read-only",
object_get_canonical_path_component(OBJECT(hostmem)));
return;
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index 30d03e9..ed170de 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -467,7 +467,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
- os_mem_prealloc(fd, area, size, 1, &local_err);
+ qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
if (local_err) {
static bool warned;
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 2276094..b9c4307 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -576,8 +576,23 @@ unsigned long qemu_getauxval(unsigned long type);
void qemu_set_tty_echo(int fd, bool echo);
-void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus,
- Error **errp);
+typedef struct ThreadContext ThreadContext;
+
+/**
+ * qemu_prealloc_mem:
+ * @fd: the fd mapped into the area, -1 for anonymous memory
+ * @area: start address of the are to preallocate
+ * @sz: the size of the area to preallocate
+ * @max_threads: maximum number of threads to use
+ * @errp: returns an error if this function fails
+ *
+ * Preallocate memory (populate/prefault page tables writable) for the virtual
+ * memory area starting at @area with the size of @sz. After a successful call,
+ * each page in the area was faulted in writable at least once, for example,
+ * after allocating file blocks for mapped files.
+ */
+void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
+ ThreadContext *tc, Error **errp);
/**
* qemu_get_pid_name:
diff --git a/include/qemu/thread-context.h b/include/qemu/thread-context.h
new file mode 100644
index 0000000..2ebd6b7
--- /dev/null
+++ b/include/qemu/thread-context.h
@@ -0,0 +1,57 @@
+/*
+ * QEMU Thread Context
+ *
+ * Copyright Red Hat Inc., 2022
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef SYSEMU_THREAD_CONTEXT_H
+#define SYSEMU_THREAD_CONTEXT_H
+
+#include "qapi/qapi-types-machine.h"
+#include "qemu/thread.h"
+#include "qom/object.h"
+
+#define TYPE_THREAD_CONTEXT "thread-context"
+OBJECT_DECLARE_TYPE(ThreadContext, ThreadContextClass,
+ THREAD_CONTEXT)
+
+struct ThreadContextClass {
+ ObjectClass parent_class;
+};
+
+struct ThreadContext {
+ /* private */
+ Object parent;
+
+ /* private */
+ unsigned int thread_id;
+ QemuThread thread;
+
+ /* Semaphore to wait for context thread action. */
+ QemuSemaphore sem;
+ /* Semaphore to wait for action in context thread. */
+ QemuSemaphore sem_thread;
+ /* Mutex to synchronize requests. */
+ QemuMutex mutex;
+
+ /* Commands for the thread to execute. */
+ int thread_cmd;
+ void *thread_cmd_data;
+
+ /* CPU affinity bitmap used for initialization. */
+ unsigned long *init_cpu_bitmap;
+ int init_cpu_nbits;
+};
+
+void thread_context_create_thread(ThreadContext *tc, QemuThread *thread,
+ const char *name,
+ void *(*start_routine)(void *), void *arg,
+ int mode);
+
+#endif /* SYSEMU_THREAD_CONTEXT_H */
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index 20641e5..7c6703b 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -185,6 +185,10 @@ void qemu_event_destroy(QemuEvent *ev);
void qemu_thread_create(QemuThread *thread, const char *name,
void *(*start_routine)(void *),
void *arg, int mode);
+int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus,
+ unsigned long nbits);
+int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus,
+ unsigned long *nbits);
void *qemu_thread_join(QemuThread *thread);
void qemu_thread_get_self(QemuThread *thread);
bool qemu_thread_is_self(QemuThread *thread);
diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
index 9ff5c16..39326f1 100644
--- a/include/sysemu/hostmem.h
+++ b/include/sysemu/hostmem.h
@@ -18,6 +18,7 @@
#include "qom/object.h"
#include "exec/memory.h"
#include "qemu/bitmap.h"
+#include "qemu/thread-context.h"
#define TYPE_MEMORY_BACKEND "memory-backend"
OBJECT_DECLARE_TYPE(HostMemoryBackend, HostMemoryBackendClass,
@@ -66,6 +67,7 @@ struct HostMemoryBackend {
bool merge, dump, use_canonical_path;
bool prealloc, is_mapped, share, reserve;
uint32_t prealloc_threads;
+ ThreadContext *prealloc_context;
DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
HostMemPolicy policy;
diff --git a/meson.build b/meson.build
index 7d39756..15dff0a 100644
--- a/meson.build
+++ b/meson.build
@@ -2130,7 +2130,23 @@ config_host_data.set('CONFIG_PTHREAD_CONDATTR_SETCLOCK', cc.links(gnu_source_pre
pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
return 0;
}''', dependencies: threads))
+config_host_data.set('CONFIG_PTHREAD_AFFINITY_NP', cc.links(gnu_source_prefix + '''
+ #include <pthread.h>
+ static void *f(void *p) { return NULL; }
+ int main(void)
+ {
+ int setsize = CPU_ALLOC_SIZE(64);
+ pthread_t thread;
+ cpu_set_t *cpuset;
+ pthread_create(&thread, 0, f, 0);
+ cpuset = CPU_ALLOC(64);
+ CPU_ZERO_S(setsize, cpuset);
+ pthread_setaffinity_np(thread, setsize, cpuset);
+ pthread_getaffinity_np(thread, setsize, cpuset);
+ CPU_FREE(cpuset);
+ return 0;
+ }''', dependencies: threads))
config_host_data.set('CONFIG_SIGNALFD', cc.links(gnu_source_prefix + '''
#include <sys/signalfd.h>
#include <stddef.h>
diff --git a/qapi/qom.json b/qapi/qom.json
index 80dd419..87fcad2 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -578,6 +578,9 @@
#
# @prealloc-threads: number of CPU threads to use for prealloc (default: 1)
#
+# @prealloc-context: thread context to use for creation of preallocation threads
+# (default: none) (since 7.2)
+#
# @share: if false, the memory is private to QEMU; if true, it is shared
# (default: false)
#
@@ -608,6 +611,7 @@
'*policy': 'HostMemPolicy',
'*prealloc': 'bool',
'*prealloc-threads': 'uint32',
+ '*prealloc-context': 'str',
'*share': 'bool',
'*reserve': 'bool',
'size': 'size',
@@ -831,6 +835,28 @@
'*kernel-hashes': 'bool' } }
##
+# @ThreadContextProperties:
+#
+# Properties for thread context objects.
+#
+# @cpu-affinity: the list of host CPU numbers used as CPU affinity for all
+# threads created in the thread context (default: QEMU main
+# thread CPU affinity)
+#
+# @node-affinity: the list of host node numbers that will be resolved to a
+# list of host CPU numbers used as CPU affinity. This is a
+# shortcut for specifying the list of host CPU numbers
+# belonging to the host nodes manually by setting
+# @cpu-affinity. (default: QEMU main thread affinity)
+#
+# Since: 7.2
+##
+{ 'struct': 'ThreadContextProperties',
+ 'data': { '*cpu-affinity': ['uint16'],
+ '*node-affinity': ['uint16'] } }
+
+
+##
# @ObjectType:
#
# Features:
@@ -882,6 +908,7 @@
{ 'name': 'secret_keyring',
'if': 'CONFIG_SECRET_KEYRING' },
'sev-guest',
+ 'thread-context',
's390-pv-guest',
'throttle-group',
'tls-creds-anon',
@@ -948,6 +975,7 @@
'secret_keyring': { 'type': 'SecretKeyringProperties',
'if': 'CONFIG_SECRET_KEYRING' },
'sev-guest': 'SevGuestProperties',
+ 'thread-context': 'ThreadContextProperties',
'throttle-group': 'ThrottleGroupProperties',
'tls-creds-anon': 'TlsCredsAnonProperties',
'tls-creds-psk': 'TlsCredsPskProperties',
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 61b27ff..01c94fd 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -354,7 +354,7 @@ static void qemu_init_sigbus(void)
/*
* ALERT: when modifying this, take care that SIGBUS forwarding in
- * os_mem_prealloc() will continue working as expected.
+ * qemu_prealloc_mem() will continue working as expected.
*/
memset(&action, 0, sizeof(action));
action.sa_flags = SA_SIGINFO;
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 99fb49c..5115221 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -1760,6 +1760,27 @@ static void object_option_parse(const char *optarg)
}
/*
+ * Very early object creation, before the sandbox options have been activated.
+ */
+static bool object_create_pre_sandbox(const char *type)
+{
+ /*
+ * Objects should in general not get initialized "too early" without
+ * a reason. If you add one, state the reason in a comment!
+ */
+
+ /*
+ * Reason: -sandbox on,resourcecontrol=deny disallows setting CPU
+ * affinity of threads.
+ */
+ if (g_str_equal(type, "thread-context")) {
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Initial object creation happens before all other
* QEMU data types are created. The majority of objects
* can be created at this point. The rng-egd object
@@ -1773,6 +1794,11 @@ static bool object_create_early(const char *type)
* add one, state the reason in a comment!
*/
+ /* Reason: already created. */
+ if (object_create_pre_sandbox(type)) {
+ return false;
+ }
+
/* Reason: property "chardev" */
if (g_str_equal(type, "rng-egd") ||
g_str_equal(type, "qtest")) {
@@ -1895,7 +1921,7 @@ static void qemu_create_early_backends(void)
*/
static bool object_create_late(const char *type)
{
- return !object_create_early(type);
+ return !object_create_early(type) && !object_create_pre_sandbox(type);
}
static void qemu_create_late_backends(void)
@@ -2351,6 +2377,11 @@ static int process_runstate_actions(void *opaque, QemuOpts *opts, Error **errp)
static void qemu_process_early_options(void)
{
+ qemu_opts_foreach(qemu_find_opts("name"),
+ parse_name, NULL, &error_fatal);
+
+ object_option_foreach_add(object_create_pre_sandbox);
+
#ifdef CONFIG_SECCOMP
QemuOptsList *olist = qemu_find_opts_err("sandbox", NULL);
if (olist) {
@@ -2358,9 +2389,6 @@ static void qemu_process_early_options(void)
}
#endif
- qemu_opts_foreach(qemu_find_opts("name"),
- parse_name, NULL, &error_fatal);
-
if (qemu_opts_foreach(qemu_find_opts("action"),
process_runstate_actions, NULL, &error_fatal)) {
exit(1);
diff --git a/util/meson.build b/util/meson.build
index 5e28213..c0a7bc5 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -1,4 +1,5 @@
util_ss.add(files('osdep.c', 'cutils.c', 'unicode.c', 'qemu-timer-common.c'))
+util_ss.add(files('thread-context.c'), numa)
if not config_host_data.get('CONFIG_ATOMIC64')
util_ss.add(files('atomic64.c'))
endif
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 827a7aa..59a891b 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -42,6 +42,7 @@
#include "qemu/cutils.h"
#include "qemu/compiler.h"
#include "qemu/units.h"
+#include "qemu/thread-context.h"
#ifdef CONFIG_LINUX
#include <sys/syscall.h>
@@ -329,7 +330,7 @@ static void sigbus_handler(int signal)
return;
}
#endif /* CONFIG_LINUX */
- warn_report("os_mem_prealloc: unrelated SIGBUS detected and ignored");
+ warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored");
}
static void *do_touch_pages(void *arg)
@@ -399,13 +400,13 @@ static void *do_madv_populate_write_pages(void *arg)
}
static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
- int smp_cpus)
+ int max_threads)
{
long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
int ret = 1;
if (host_procs > 0) {
- ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
+ ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads);
}
/* Especially with gigantic pages, don't create more threads than pages. */
@@ -418,11 +419,12 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
}
static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
- int smp_cpus, bool use_madv_populate_write)
+ int max_threads, ThreadContext *tc,
+ bool use_madv_populate_write)
{
static gsize initialized = 0;
MemsetContext context = {
- .num_threads = get_memset_num_threads(hpagesize, numpages, smp_cpus),
+ .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
};
size_t numpages_per_thread, leftover;
void *(*touch_fn)(void *);
@@ -457,9 +459,16 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
context.threads[i].numpages = numpages_per_thread + (i < leftover);
context.threads[i].hpagesize = hpagesize;
context.threads[i].context = &context;
- qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
- touch_fn, &context.threads[i],
- QEMU_THREAD_JOINABLE);
+ if (tc) {
+ thread_context_create_thread(tc, &context.threads[i].pgthread,
+ "touch_pages",
+ touch_fn, &context.threads[i],
+ QEMU_THREAD_JOINABLE);
+ } else {
+ qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
+ touch_fn, &context.threads[i],
+ QEMU_THREAD_JOINABLE);
+ }
addr += context.threads[i].numpages * hpagesize;
}
@@ -494,13 +503,13 @@ static bool madv_populate_write_possible(char *area, size_t pagesize)
errno != EINVAL;
}
-void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
- Error **errp)
+void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
+ ThreadContext *tc, Error **errp)
{
static gsize initialized;
int ret;
size_t hpagesize = qemu_fd_getpagesize(fd);
- size_t numpages = DIV_ROUND_UP(memory, hpagesize);
+ size_t numpages = DIV_ROUND_UP(sz, hpagesize);
bool use_madv_populate_write;
struct sigaction act;
@@ -530,24 +539,24 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
if (ret) {
qemu_mutex_unlock(&sigbus_mutex);
error_setg_errno(errp, errno,
- "os_mem_prealloc: failed to install signal handler");
+ "qemu_prealloc_mem: failed to install signal handler");
return;
}
}
/* touch pages simultaneously */
- ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
+ ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
use_madv_populate_write);
if (ret) {
error_setg_errno(errp, -ret,
- "os_mem_prealloc: preallocating memory failed");
+ "qemu_prealloc_mem: preallocating memory failed");
}
if (!use_madv_populate_write) {
ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
if (ret) {
/* Terminate QEMU since it can't recover from error */
- perror("os_mem_prealloc: failed to reinstall signal handler");
+ perror("qemu_prealloc_mem: failed to reinstall signal handler");
exit(1);
}
qemu_mutex_unlock(&sigbus_mutex);
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 5723d3e..a67cb38 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -268,14 +268,14 @@ int getpagesize(void)
return system_info.dwPageSize;
}
-void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
- Error **errp)
+void qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
+ ThreadContext *tc, Error **errp)
{
int i;
size_t pagesize = qemu_real_host_page_size();
- memory = (memory + pagesize - 1) & -pagesize;
- for (i = 0; i < memory / pagesize; i++) {
+ sz = (sz + pagesize - 1) & -pagesize;
+ for (i = 0; i < sz / pagesize; i++) {
memset(area + pagesize * i, 0, 1);
}
}
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index ac1d56e..bae938c 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -16,6 +16,7 @@
#include "qemu/notify.h"
#include "qemu-thread-common.h"
#include "qemu/tsan.h"
+#include "qemu/bitmap.h"
static bool name_threads;
@@ -552,6 +553,75 @@ void qemu_thread_create(QemuThread *thread, const char *name,
pthread_attr_destroy(&attr);
}
+int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus,
+ unsigned long nbits)
+{
+#if defined(CONFIG_PTHREAD_AFFINITY_NP)
+ const size_t setsize = CPU_ALLOC_SIZE(nbits);
+ unsigned long value;
+ cpu_set_t *cpuset;
+ int err;
+
+ cpuset = CPU_ALLOC(nbits);
+ g_assert(cpuset);
+
+ CPU_ZERO_S(setsize, cpuset);
+ value = find_first_bit(host_cpus, nbits);
+ while (value < nbits) {
+ CPU_SET_S(value, setsize, cpuset);
+ value = find_next_bit(host_cpus, nbits, value + 1);
+ }
+
+ err = pthread_setaffinity_np(thread->thread, setsize, cpuset);
+ CPU_FREE(cpuset);
+ return err;
+#else
+ return -ENOSYS;
+#endif
+}
+
+int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus,
+ unsigned long *nbits)
+{
+#if defined(CONFIG_PTHREAD_AFFINITY_NP)
+ unsigned long tmpbits;
+ cpu_set_t *cpuset;
+ size_t setsize;
+ int i, err;
+
+ tmpbits = CPU_SETSIZE;
+ while (true) {
+ setsize = CPU_ALLOC_SIZE(tmpbits);
+ cpuset = CPU_ALLOC(tmpbits);
+ g_assert(cpuset);
+
+ err = pthread_getaffinity_np(thread->thread, setsize, cpuset);
+ if (err) {
+ CPU_FREE(cpuset);
+ if (err != -EINVAL) {
+ return err;
+ }
+ tmpbits *= 2;
+ } else {
+ break;
+ }
+ }
+
+ /* Convert the result into a proper bitmap. */
+ *nbits = tmpbits;
+ *host_cpus = bitmap_new(tmpbits);
+ for (i = 0; i < tmpbits; i++) {
+ if (CPU_ISSET(i, cpuset)) {
+ set_bit(i, *host_cpus);
+ }
+ }
+ CPU_FREE(cpuset);
+ return 0;
+#else
+ return -ENOSYS;
+#endif
+}
+
void qemu_thread_get_self(QemuThread *thread)
{
thread->thread = pthread_self();
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
index b9a467d..69db254 100644
--- a/util/qemu-thread-win32.c
+++ b/util/qemu-thread-win32.c
@@ -477,6 +477,18 @@ void qemu_thread_create(QemuThread *thread, const char *name,
thread->data = data;
}
+int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus,
+ unsigned long nbits)
+{
+ return -ENOSYS;
+}
+
+int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus,
+ unsigned long *nbits)
+{
+ return -ENOSYS;
+}
+
void qemu_thread_get_self(QemuThread *thread)
{
thread->data = qemu_thread_data;
diff --git a/util/thread-context.c b/util/thread-context.c
new file mode 100644
index 0000000..4138245
--- /dev/null
+++ b/util/thread-context.c
@@ -0,0 +1,362 @@
+/*
+ * QEMU Thread Context
+ *
+ * Copyright Red Hat Inc., 2022
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/thread-context.h"
+#include "qapi/error.h"
+#include "qapi/qapi-builtin-visit.h"
+#include "qapi/visitor.h"
+#include "qemu/config-file.h"
+#include "qapi/qapi-builtin-visit.h"
+#include "qom/object_interfaces.h"
+#include "qemu/module.h"
+#include "qemu/bitmap.h"
+
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#endif
+
+enum {
+ TC_CMD_NONE = 0,
+ TC_CMD_STOP,
+ TC_CMD_NEW,
+};
+
+typedef struct ThreadContextCmdNew {
+ QemuThread *thread;
+ const char *name;
+ void *(*start_routine)(void *);
+ void *arg;
+ int mode;
+} ThreadContextCmdNew;
+
+static void *thread_context_run(void *opaque)
+{
+ ThreadContext *tc = opaque;
+
+ tc->thread_id = qemu_get_thread_id();
+ qemu_sem_post(&tc->sem);
+
+ while (true) {
+ /*
+ * Threads inherit the CPU affinity of the creating thread. For this
+ * reason, we create new (especially short-lived) threads from our
+ * persistent context thread.
+ *
+ * Especially when QEMU is not allowed to set the affinity itself,
+ * management tools can simply set the affinity of the context thread
+ * after creating the context, to have new threads created via
+ * the context inherit the CPU affinity automatically.
+ */
+ switch (tc->thread_cmd) {
+ case TC_CMD_NONE:
+ break;
+ case TC_CMD_STOP:
+ tc->thread_cmd = TC_CMD_NONE;
+ qemu_sem_post(&tc->sem);
+ return NULL;
+ case TC_CMD_NEW: {
+ ThreadContextCmdNew *cmd_new = tc->thread_cmd_data;
+
+ qemu_thread_create(cmd_new->thread, cmd_new->name,
+ cmd_new->start_routine, cmd_new->arg,
+ cmd_new->mode);
+ tc->thread_cmd = TC_CMD_NONE;
+ tc->thread_cmd_data = NULL;
+ qemu_sem_post(&tc->sem);
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+ qemu_sem_wait(&tc->sem_thread);
+ }
+}
+
+static void thread_context_set_cpu_affinity(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ ThreadContext *tc = THREAD_CONTEXT(obj);
+ uint16List *l, *host_cpus = NULL;
+ unsigned long *bitmap = NULL;
+ int nbits = 0, ret;
+ Error *err = NULL;
+
+ if (tc->init_cpu_bitmap) {
+ error_setg(errp, "Mixing CPU and node affinity not supported");
+ return;
+ }
+
+ visit_type_uint16List(v, name, &host_cpus, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+
+ if (!host_cpus) {
+ error_setg(errp, "CPU list is empty");
+ goto out;
+ }
+
+ for (l = host_cpus; l; l = l->next) {
+ nbits = MAX(nbits, l->value + 1);
+ }
+ bitmap = bitmap_new(nbits);
+ for (l = host_cpus; l; l = l->next) {
+ set_bit(l->value, bitmap);
+ }
+
+ if (tc->thread_id != -1) {
+ /*
+ * Note: we won't be adjusting the affinity of any thread that is still
+ * around, but only the affinity of the context thread.
+ */
+ ret = qemu_thread_set_affinity(&tc->thread, bitmap, nbits);
+ if (ret) {
+ error_setg(errp, "Setting CPU affinity failed: %s", strerror(ret));
+ }
+ } else {
+ tc->init_cpu_bitmap = bitmap;
+ bitmap = NULL;
+ tc->init_cpu_nbits = nbits;
+ }
+out:
+ g_free(bitmap);
+ qapi_free_uint16List(host_cpus);
+}
+
+static void thread_context_get_cpu_affinity(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ unsigned long *bitmap, nbits, value;
+ ThreadContext *tc = THREAD_CONTEXT(obj);
+ uint16List *host_cpus = NULL;
+ uint16List **tail = &host_cpus;
+ int ret;
+
+ if (tc->thread_id == -1) {
+ error_setg(errp, "Object not initialized yet");
+ return;
+ }
+
+ ret = qemu_thread_get_affinity(&tc->thread, &bitmap, &nbits);
+ if (ret) {
+ error_setg(errp, "Getting CPU affinity failed: %s", strerror(ret));
+ return;
+ }
+
+ value = find_first_bit(bitmap, nbits);
+ while (value < nbits) {
+ QAPI_LIST_APPEND(tail, value);
+
+ value = find_next_bit(bitmap, nbits, value + 1);
+ }
+ g_free(bitmap);
+
+ visit_type_uint16List(v, name, &host_cpus, errp);
+ qapi_free_uint16List(host_cpus);
+}
+
+static void thread_context_set_node_affinity(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+#ifdef CONFIG_NUMA
+ const int nbits = numa_num_possible_cpus();
+ ThreadContext *tc = THREAD_CONTEXT(obj);
+ uint16List *l, *host_nodes = NULL;
+ unsigned long *bitmap = NULL;
+ struct bitmask *tmp_cpus;
+ Error *err = NULL;
+ int ret, i;
+
+ if (tc->init_cpu_bitmap) {
+ error_setg(errp, "Mixing CPU and node affinity not supported");
+ return;
+ }
+
+ visit_type_uint16List(v, name, &host_nodes, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+
+ if (!host_nodes) {
+ error_setg(errp, "Node list is empty");
+ goto out;
+ }
+
+ bitmap = bitmap_new(nbits);
+ tmp_cpus = numa_allocate_cpumask();
+ for (l = host_nodes; l; l = l->next) {
+ numa_bitmask_clearall(tmp_cpus);
+ ret = numa_node_to_cpus(l->value, tmp_cpus);
+ if (ret) {
+ /* We ignore any errors, such as impossible nodes. */
+ continue;
+ }
+ for (i = 0; i < nbits; i++) {
+ if (numa_bitmask_isbitset(tmp_cpus, i)) {
+ set_bit(i, bitmap);
+ }
+ }
+ }
+ numa_free_cpumask(tmp_cpus);
+
+ if (bitmap_empty(bitmap, nbits)) {
+ error_setg(errp, "The nodes select no CPUs");
+ goto out;
+ }
+
+ if (tc->thread_id != -1) {
+ /*
+ * Note: we won't be adjusting the affinity of any thread that is still
+ * around for now, but only the affinity of the context thread.
+ */
+ ret = qemu_thread_set_affinity(&tc->thread, bitmap, nbits);
+ if (ret) {
+ error_setg(errp, "Setting CPU affinity failed: %s", strerror(ret));
+ }
+ } else {
+ tc->init_cpu_bitmap = bitmap;
+ bitmap = NULL;
+ tc->init_cpu_nbits = nbits;
+ }
+out:
+ g_free(bitmap);
+ qapi_free_uint16List(host_nodes);
+#else
+ error_setg(errp, "NUMA node affinity is not supported by this QEMU");
+#endif
+}
+
+static void thread_context_get_thread_id(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ ThreadContext *tc = THREAD_CONTEXT(obj);
+ uint64_t value = tc->thread_id;
+
+ visit_type_uint64(v, name, &value, errp);
+}
+
+static void thread_context_instance_complete(UserCreatable *uc, Error **errp)
+{
+ ThreadContext *tc = THREAD_CONTEXT(uc);
+ char *thread_name;
+ int ret;
+
+ thread_name = g_strdup_printf("TC %s",
+ object_get_canonical_path_component(OBJECT(uc)));
+ qemu_thread_create(&tc->thread, thread_name, thread_context_run, tc,
+ QEMU_THREAD_JOINABLE);
+ g_free(thread_name);
+
+ /* Wait until initialization of the thread is done. */
+ while (tc->thread_id == -1) {
+ qemu_sem_wait(&tc->sem);
+ }
+
+ if (tc->init_cpu_bitmap) {
+ ret = qemu_thread_set_affinity(&tc->thread, tc->init_cpu_bitmap,
+ tc->init_cpu_nbits);
+ if (ret) {
+ error_setg(errp, "Setting CPU affinity failed: %s", strerror(ret));
+ }
+ g_free(tc->init_cpu_bitmap);
+ tc->init_cpu_bitmap = NULL;
+ }
+}
+
+static void thread_context_class_init(ObjectClass *oc, void *data)
+{
+ UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
+
+ ucc->complete = thread_context_instance_complete;
+ object_class_property_add(oc, "thread-id", "int",
+ thread_context_get_thread_id, NULL, NULL,
+ NULL);
+ object_class_property_add(oc, "cpu-affinity", "int",
+ thread_context_get_cpu_affinity,
+ thread_context_set_cpu_affinity, NULL, NULL);
+ object_class_property_add(oc, "node-affinity", "int", NULL,
+ thread_context_set_node_affinity, NULL, NULL);
+}
+
+static void thread_context_instance_init(Object *obj)
+{
+ ThreadContext *tc = THREAD_CONTEXT(obj);
+
+ tc->thread_id = -1;
+ qemu_sem_init(&tc->sem, 0);
+ qemu_sem_init(&tc->sem_thread, 0);
+ qemu_mutex_init(&tc->mutex);
+}
+
+static void thread_context_instance_finalize(Object *obj)
+{
+ ThreadContext *tc = THREAD_CONTEXT(obj);
+
+ if (tc->thread_id != -1) {
+ tc->thread_cmd = TC_CMD_STOP;
+ qemu_sem_post(&tc->sem_thread);
+ qemu_thread_join(&tc->thread);
+ }
+ qemu_sem_destroy(&tc->sem);
+ qemu_sem_destroy(&tc->sem_thread);
+ qemu_mutex_destroy(&tc->mutex);
+}
+
+static const TypeInfo thread_context_info = {
+ .name = TYPE_THREAD_CONTEXT,
+ .parent = TYPE_OBJECT,
+ .class_init = thread_context_class_init,
+ .instance_size = sizeof(ThreadContext),
+ .instance_init = thread_context_instance_init,
+ .instance_finalize = thread_context_instance_finalize,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void thread_context_register_types(void)
+{
+ type_register_static(&thread_context_info);
+}
+type_init(thread_context_register_types)
+
+void thread_context_create_thread(ThreadContext *tc, QemuThread *thread,
+ const char *name,
+ void *(*start_routine)(void *), void *arg,
+ int mode)
+{
+ ThreadContextCmdNew data = {
+ .thread = thread,
+ .name = name,
+ .start_routine = start_routine,
+ .arg = arg,
+ .mode = mode,
+ };
+
+ qemu_mutex_lock(&tc->mutex);
+ tc->thread_cmd = TC_CMD_NEW;
+ tc->thread_cmd_data = &data;
+ qemu_sem_post(&tc->sem_thread);
+
+ while (tc->thread_cmd != TC_CMD_NONE) {
+ qemu_sem_wait(&tc->sem);
+ }
+ qemu_mutex_unlock(&tc->mutex);
+}