diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2024-02-08 11:59:13 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2024-02-08 11:59:13 +0000 |
commit | 8ab67b6ebc721041b22aa0e60f5422166dada234 (patch) | |
tree | c5668fe7f5ae98622af2f69ac4f2a97ff219a97a | |
parent | 39a6e4f87e7b75a45b08d6dc8b8b7c2954c87440 (diff) | |
parent | 04accf43df83aa10f06f7dbda3ecf0db97f0c5a6 (diff) | |
download | qemu-8ab67b6ebc721041b22aa0e60f5422166dada234.zip qemu-8ab67b6ebc721041b22aa0e60f5422166dada234.tar.gz qemu-8ab67b6ebc721041b22aa0e60f5422166dada234.tar.bz2 |
Merge tag 'mem-2024-02-06-v3' of https://github.com/davidhildenbrand/qemu into staging
Hi,
"Host Memory Backends" and "Memory devices" queue ("mem"):
- Reintroduce memory region size checks for memory devices; the removal
lead to some undesired side effects
- Preallocate memory of memory backends in selected configurations
asynchronously (so we preallocate concurrently), to speed up QEMU
startup time.
# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmXB3LcRHGRhdmlkQHJl
# ZGhhdC5jb20ACgkQTd4Q9wD/g1plRA/+N8y4aJB+qEwacl5scIpiWShqeBA0aybS
# Rp3796djgjkqozkv7AFGHrOIGiLDtCh4W1JYuML7kLN7IvuJoHSY+AHzfhDiae1l
# eluX/Cs/5rgEninwT9M0yEkgvUybA8+kx+z96hBJgkfJOrdbETc7YVbU5iP/sOOF
# UtfEVWGwT1RJOun0qrgEhHiJCTMcHyJjSEy8D867ymC+knu3OZIz22+axcmpHz6i
# QJFgY40OCP1yxBvPVLR3K/Z0se/FkxG55LwM58j7N/m+VDv4IqZCTbkZb5BTJVla
# 5vKgIrZfZ+XFqrenyMsBnBLgQuyCmDJIDFfxM0A9gOvJbwtf8T4DhL9FoRvVZMDD
# SHBl/EZcViXFDDKVHjotBSA5JoNbjHac5J5jCFu7pRq+2DbzxWHmW6xV7sY9gkSO
# +SdW9hcmF/vF5MKHfoQR2kVLLJ2/EKHiN/xVVsha0+RQDctucrhg1Y9MS2obJV3u
# u2udaVk5UNcfNPuVPwkG8YQ0sIyuDYXOTThwNtsj0tyZ+tGVQmMIlou/GAsrc9PF
# xmqzkCXXyrILrPMQJrYBcdwasBLuEcJMW59BqgxHCVP9NiAQgsNVzXFg4mr3+mVF
# xTrt8wioTvAPoDvXe+BPoaH6AsIY2TqE8j7IqA1Q/IFNf+KLYkPcHknZfzfxSkdW
# woRHVtjrkMo=
# =lW5h
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 06 Feb 2024 07:16:07 GMT
# gpg: using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A
# gpg: issuer "david@redhat.com"
# gpg: Good signature from "David Hildenbrand <david@redhat.com>" [marginal]
# gpg: aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full]
# gpg: aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown]
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D FCCA 4DDE 10F7 00FF 835A
* tag 'mem-2024-02-06-v3' of https://github.com/davidhildenbrand/qemu:
oslib-posix: initialize backend memory objects in parallel
memory-device: reintroduce memory region size check
hv-balloon: use get_min_alignment() to express 32 GiB alignment
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | backends/hostmem.c | 7 | ||||
-rw-r--r-- | hw/hyperv/hv-balloon.c | 37 | ||||
-rw-r--r-- | hw/mem/memory-device.c | 14 | ||||
-rw-r--r-- | hw/virtio/virtio-mem.c | 4 | ||||
-rw-r--r-- | include/hw/qdev-core.h | 5 | ||||
-rw-r--r-- | include/qemu/osdep.h | 18 | ||||
-rw-r--r-- | system/vl.c | 9 | ||||
-rw-r--r-- | util/oslib-posix.c | 131 | ||||
-rw-r--r-- | util/oslib-win32.c | 8 |
9 files changed, 180 insertions, 53 deletions
diff --git a/backends/hostmem.c b/backends/hostmem.c index 987f6f5..81a72ce 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -20,6 +20,7 @@ #include "qom/object_interfaces.h" #include "qemu/mmap-alloc.h" #include "qemu/madvise.h" +#include "hw/qdev-core.h" #ifdef CONFIG_NUMA #include <numaif.h> @@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value, uint64_t sz = memory_region_size(&backend->mr); if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads, - backend->prealloc_context, errp)) { + backend->prealloc_context, false, errp)) { return; } backend->prealloc = true; @@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc); void *ptr; uint64_t sz; + bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED); if (!bc->alloc) { return; @@ -402,7 +404,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr), ptr, sz, backend->prealloc_threads, - backend->prealloc_context, errp)) { + backend->prealloc_context, + async, errp)) { return; } } diff --git a/hw/hyperv/hv-balloon.c b/hw/hyperv/hv-balloon.c index 0238365..ade2833 100644 --- a/hw/hyperv/hv-balloon.c +++ b/hw/hyperv/hv-balloon.c @@ -1477,22 +1477,7 @@ static void hv_balloon_ensure_mr(HvBalloon *balloon) balloon->mr = g_new0(MemoryRegion, 1); memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON, memory_region_size(hostmem_mr)); - - /* - * The VM can indicate an alignment up to 32 GiB. Memory device core can - * usually only handle/guarantee 1 GiB alignment. The user will have to - * specify a larger maxmem eventually. - * - * The memory device core will warn the user in case maxmem might have to be - * increased and will fail plugging the device if there is not sufficient - * space after alignment. - * - * TODO: we could do the alignment ourselves in a slightly bigger region. - * But this feels better, although the warning might be annoying. Maybe - * we can optimize that in the future (e.g., with such a device on the - * cmdline place/size the device memory region differently. - */ - balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr)); + balloon->mr->align = memory_region_get_alignment(hostmem_mr); } static void hv_balloon_free_mr(HvBalloon *balloon) @@ -1654,6 +1639,25 @@ static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md, return balloon->mr; } +static uint64_t hv_balloon_md_get_min_alignment(const MemoryDeviceState *md) +{ + /* + * The VM can indicate an alignment up to 32 GiB. Memory device core can + * usually only handle/guarantee 1 GiB alignment. The user will have to + * specify a larger maxmem eventually. + * + * The memory device core will warn the user in case maxmem might have to be + * increased and will fail plugging the device if there is not sufficient + * space after alignment. + * + * TODO: we could do the alignment ourselves in a slightly bigger region. + * But this feels better, although the warning might be annoying. Maybe + * we can optimize that in the future (e.g., with such a device on the + * cmdline place/size the device memory region differently. + */ + return 32 * GiB; +} + static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md, MemoryDeviceInfo *info) { @@ -1766,5 +1770,6 @@ static void hv_balloon_class_init(ObjectClass *klass, void *data) mdc->get_memory_region = hv_balloon_md_get_memory_region; mdc->decide_memslots = hv_balloon_decide_memslots; mdc->get_memslots = hv_balloon_get_memslots; + mdc->get_min_alignment = hv_balloon_md_get_min_alignment; mdc->fill_device_info = hv_balloon_md_fill_device_info; } diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c index a1b1af2..e098585 100644 --- a/hw/mem/memory-device.c +++ b/hw/mem/memory-device.c @@ -374,6 +374,20 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, goto out; } + /* + * We always want the memory region size to be multiples of the memory + * region alignment: for example, DIMMs with 1G+1byte size don't make + * any sense. Note that we don't check that the size is multiples + * of any additional alignment requirements the memory device might + * have when it comes to the address in physical address space. + */ + if (!QEMU_IS_ALIGNED(memory_region_size(mr), + memory_region_get_alignment(mr))) { + error_setg(errp, "backend memory size must be multiple of 0x%" + PRIx64, memory_region_get_alignment(mr)); + return; + } + if (legacy_align) { align = *legacy_align; } else { diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 99ab989..ffd119e 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, int fd = memory_region_get_fd(&vmem->memdev->mr); Error *local_err = NULL; - if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) { + if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) { static bool warned; /* @@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg, int fd = memory_region_get_fd(&vmem->memdev->mr); Error *local_err = NULL; - if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) { + if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) { error_report_err(local_err); return -ENOMEM; } diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index d47536e..9228e96 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -1084,6 +1084,11 @@ typedef enum MachineInitPhase { PHASE_ACCEL_CREATED, /* + * Late backend objects have been created and initialized. + */ + PHASE_LATE_BACKENDS_CREATED, + + /* * machine_class->init has been called, thus creating any embedded * devices and validating machine properties. Devices created at * this time are considered to be cold-plugged. diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index c9692cc..7d359da 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext; * @area: start address of the are to preallocate * @sz: the size of the area to preallocate * @max_threads: maximum number of threads to use + * @tc: prealloc context threads pointer, NULL if not in use + * @async: request asynchronous preallocation, requires @tc * @errp: returns an error if this function fails * * Preallocate memory (populate/prefault page tables writable) for the virtual @@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext; * each page in the area was faulted in writable at least once, for example, * after allocating file blocks for mapped files. * + * When setting @async, allocation might be performed asynchronously. + * qemu_finish_async_prealloc_mem() must be called to finish any asynchronous + * preallocation. + * * Return: true on success, else false setting @errp with error. */ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, - ThreadContext *tc, Error **errp); + ThreadContext *tc, bool async, Error **errp); + +/** + * qemu_finish_async_prealloc_mem: + * @errp: returns an error if this function fails + * + * Finish all outstanding asynchronous memory preallocation. + * + * Return: true on success, else false setting @errp with error. + */ +bool qemu_finish_async_prealloc_mem(Error **errp); /** * qemu_get_pid_name: diff --git a/system/vl.c b/system/vl.c index bb959cb..2a0bd08 100644 --- a/system/vl.c +++ b/system/vl.c @@ -2013,6 +2013,14 @@ static void qemu_create_late_backends(void) object_option_foreach_add(object_create_late); + /* + * Wait for any outstanding memory prealloc from created memory + * backends to complete. + */ + if (!qemu_finish_async_prealloc_mem(&error_fatal)) { + exit(1); + } + if (tpm_init() < 0) { exit(1); } @@ -3699,6 +3707,7 @@ void qemu_init(int argc, char **argv) * over memory-backend-file objects). */ qemu_create_late_backends(); + phase_advance(PHASE_LATE_BACKENDS_CREATED); /* * Note: creates a QOM object, must run only after global and diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 7c29700..3c379f9 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -42,6 +42,7 @@ #include "qemu/cutils.h" #include "qemu/units.h" #include "qemu/thread-context.h" +#include "qemu/main-loop.h" #ifdef CONFIG_LINUX #include <sys/syscall.h> @@ -63,11 +64,15 @@ struct MemsetThread; +static QLIST_HEAD(, MemsetContext) memset_contexts = + QLIST_HEAD_INITIALIZER(memset_contexts); + typedef struct MemsetContext { bool all_threads_created; bool any_thread_failed; struct MemsetThread *threads; int num_threads; + QLIST_ENTRY(MemsetContext) next; } MemsetContext; struct MemsetThread { @@ -412,19 +417,44 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages, return ret; } +static int wait_and_free_mem_prealloc_context(MemsetContext *context) +{ + int i, ret = 0, tmp; + + for (i = 0; i < context->num_threads; i++) { + tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread); + + if (tmp) { + ret = tmp; + } + } + g_free(context->threads); + g_free(context); + return ret; +} + static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, - int max_threads, ThreadContext *tc, + int max_threads, ThreadContext *tc, bool async, bool use_madv_populate_write) { static gsize initialized = 0; - MemsetContext context = { - .num_threads = get_memset_num_threads(hpagesize, numpages, max_threads), - }; + MemsetContext *context = g_malloc0(sizeof(MemsetContext)); size_t numpages_per_thread, leftover; void *(*touch_fn)(void *); - int ret = 0, i = 0; + int ret, i = 0; char *addr = area; + /* + * Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE + * and prealloc context for thread placement. + */ + if (!use_madv_populate_write || !tc) { + async = false; + } + + context->num_threads = + get_memset_num_threads(hpagesize, numpages, max_threads); + if (g_once_init_enter(&initialized)) { qemu_mutex_init(&page_mutex); qemu_cond_init(&page_cond); @@ -432,8 +462,11 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, } if (use_madv_populate_write) { - /* Avoid creating a single thread for MADV_POPULATE_WRITE */ - if (context.num_threads == 1) { + /* + * Avoid creating a single thread for MADV_POPULATE_WRITE when + * preallocating synchronously. + */ + if (context->num_threads == 1 && !async) { if (qemu_madvise(area, hpagesize * numpages, QEMU_MADV_POPULATE_WRITE)) { return -errno; @@ -445,50 +478,86 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, touch_fn = do_touch_pages; } - context.threads = g_new0(MemsetThread, context.num_threads); - numpages_per_thread = numpages / context.num_threads; - leftover = numpages % context.num_threads; - for (i = 0; i < context.num_threads; i++) { - context.threads[i].addr = addr; - context.threads[i].numpages = numpages_per_thread + (i < leftover); - context.threads[i].hpagesize = hpagesize; - context.threads[i].context = &context; + context->threads = g_new0(MemsetThread, context->num_threads); + numpages_per_thread = numpages / context->num_threads; + leftover = numpages % context->num_threads; + for (i = 0; i < context->num_threads; i++) { + context->threads[i].addr = addr; + context->threads[i].numpages = numpages_per_thread + (i < leftover); + context->threads[i].hpagesize = hpagesize; + context->threads[i].context = context; if (tc) { - thread_context_create_thread(tc, &context.threads[i].pgthread, + thread_context_create_thread(tc, &context->threads[i].pgthread, "touch_pages", - touch_fn, &context.threads[i], + touch_fn, &context->threads[i], QEMU_THREAD_JOINABLE); } else { - qemu_thread_create(&context.threads[i].pgthread, "touch_pages", - touch_fn, &context.threads[i], + qemu_thread_create(&context->threads[i].pgthread, "touch_pages", + touch_fn, &context->threads[i], QEMU_THREAD_JOINABLE); } - addr += context.threads[i].numpages * hpagesize; + addr += context->threads[i].numpages * hpagesize; + } + + if (async) { + /* + * async requests currently require the BQL. Add it to the list and kick + * preallocation off during qemu_finish_async_prealloc_mem(). + */ + assert(bql_locked()); + QLIST_INSERT_HEAD(&memset_contexts, context, next); + return 0; } if (!use_madv_populate_write) { - sigbus_memset_context = &context; + sigbus_memset_context = context; } qemu_mutex_lock(&page_mutex); - context.all_threads_created = true; + context->all_threads_created = true; qemu_cond_broadcast(&page_cond); qemu_mutex_unlock(&page_mutex); - for (i = 0; i < context.num_threads; i++) { - int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread); + ret = wait_and_free_mem_prealloc_context(context); + if (!use_madv_populate_write) { + sigbus_memset_context = NULL; + } + return ret; +} + +bool qemu_finish_async_prealloc_mem(Error **errp) +{ + int ret = 0, tmp; + MemsetContext *context, *next_context; + + /* Waiting for preallocation requires the BQL. */ + assert(bql_locked()); + if (QLIST_EMPTY(&memset_contexts)) { + return true; + } + + qemu_mutex_lock(&page_mutex); + QLIST_FOREACH(context, &memset_contexts, next) { + context->all_threads_created = true; + } + qemu_cond_broadcast(&page_cond); + qemu_mutex_unlock(&page_mutex); + + QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) { + QLIST_REMOVE(context, next); + tmp = wait_and_free_mem_prealloc_context(context); if (tmp) { ret = tmp; } } - if (!use_madv_populate_write) { - sigbus_memset_context = NULL; + if (ret) { + error_setg_errno(errp, -ret, + "qemu_prealloc_mem: preallocating memory failed"); + return false; } - g_free(context.threads); - - return ret; + return true; } static bool madv_populate_write_possible(char *area, size_t pagesize) @@ -498,7 +567,7 @@ static bool madv_populate_write_possible(char *area, size_t pagesize) } bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, - ThreadContext *tc, Error **errp) + ThreadContext *tc, bool async, Error **errp) { static gsize initialized; int ret; @@ -540,7 +609,7 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, } /* touch pages simultaneously */ - ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, + ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async, use_madv_populate_write); if (ret) { error_setg_errno(errp, -ret, diff --git a/util/oslib-win32.c b/util/oslib-win32.c index c4a5f05..b623830 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -265,7 +265,7 @@ int getpagesize(void) } bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, - ThreadContext *tc, Error **errp) + ThreadContext *tc, bool async, Error **errp) { int i; size_t pagesize = qemu_real_host_page_size(); @@ -278,6 +278,12 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads, return true; } +bool qemu_finish_async_prealloc_mem(Error **errp) +{ + /* async prealloc not supported, there is nothing to finish */ + return true; +} + char *qemu_get_pid_name(pid_t pid) { /* XXX Implement me */ |