diff options
author | Stefan Hajnoczi <stefanha@redhat.com> | 2023-10-16 12:34:17 -0400 |
---|---|---|
committer | Stefan Hajnoczi <stefanha@redhat.com> | 2023-10-16 12:34:17 -0400 |
commit | bc2b89b38582b1cc7198428c9174fbbbf31245ad (patch) | |
tree | d15cdfa911a8b6d6eecec0d0ecaa1d1eed346075 /hw | |
parent | 63011373ad22c794a013da69663c03f1297a5c56 (diff) | |
parent | ee6398d862c108f8136a26d93d26680f3d222a3a (diff) | |
download | qemu-bc2b89b38582b1cc7198428c9174fbbbf31245ad.zip qemu-bc2b89b38582b1cc7198428c9174fbbbf31245ad.tar.gz qemu-bc2b89b38582b1cc7198428c9174fbbbf31245ad.tar.bz2 |
Merge tag 'mem-2023-10-12' of https://github.com/davidhildenbrand/qemu into staging
Hi,
"Host Memory Backends" and "Memory devices" queue ("mem"):
- Support memory devices with multiple memslots
- Support memory devices that dynamically consume memslots
- Support memory devices that can automatically decide on the number of
memslots to use
- virtio-mem support for exposing memory dynamically via multiple
memslots
- Some required cleanups/refactorings
# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmUn+XMRHGRhdmlkQHJl
# ZGhhdC5jb20ACgkQTd4Q9wD/g1qDHA//T01suTa+uzrcoJHoMWN11S47WnAmbuTo
# vVakucLBPMJAa9xZeCy3OavXaVGpHkw+t6g3OFknof0LfQ5/j9iE3Q1PxURN7g5j
# SJ2WJXCoceM6T4TMhPvVvgEaYjFmESqZB5FZgedMT0QRyhAxMuF9pCkWhk1O3OAV
# JqQKqLFiGcv60AEuBYGZGzgiOUv8EJ5gKwRF4VOdyHIxqZDw1aZXzlcd4TzFZBQ7
# rwW/3ef+sFmUJdmfrSrqcIlQSRrqZ2w95xATDzLTIEEUT3SWqh/E95EZWIz1M0oQ
# NgWgFiLCR1KOj7bWFhLXT7IfyLh0mEysD+P/hY6QwQ4RewWG7EW5UK+JFswssdcZ
# rEj5XpHZzev/wx7hM4bWsoQ+VIvrH7j3uYGyWkcgYRbdDEkWDv2rsT23lwGYNhht
# oBsrdEBELRw6v4C8doq/+sCmHmuxUMqTGwbArCQVnB1XnLxOEkuqlnfq5MORkzNF
# fxbIRx+LRluOllC0HVaDQd8qxRq1+UC5WIpAcDcrouy4HGgi1onWKrXpgjIAbVyH
# M6cENkK7rnRk96gpeXdmrf0h9HqRciAOY8oUsFsvLyKBOCPBWDrLyOQEY5UoSdtD
# m4QpEVgywCy2z1uU/UObeT/UxJy/9EL/Zb+DHoEK06iEhwONoUJjEBYMJD38RMkk
# mwPTB4UAk9g=
# =s69t
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 12 Oct 2023 09:49:39 EDT
# gpg: using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A
# gpg: issuer "david@redhat.com"
# gpg: Good signature from "David Hildenbrand <david@redhat.com>" [unknown]
# gpg: aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full]
# gpg: aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D FCCA 4DDE 10F7 00FF 835A
* tag 'mem-2023-10-12' of https://github.com/davidhildenbrand/qemu:
virtio-mem: Mark memslot alias memory regions unmergeable
memory,vhost: Allow for marking memory device memory regions unmergeable
virtio-mem: Expose device memory dynamically via multiple memslots if enabled
virtio-mem: Update state to match bitmap as soon as it's been migrated
virtio-mem: Pass non-const VirtIOMEM via virtio_mem_range_cb
memory: Clarify mapping requirements for RamDiscardManager
memory-device,vhost: Support automatic decision on the number of memslots
vhost: Add vhost_get_max_memslots()
kvm: Add stub for kvm_get_max_memslots()
memory-device,vhost: Support memory devices that dynamically consume memslots
memory-device: Track required and actually used memslots in DeviceMemoryState
stubs: Rename qmp_memory_device.c to memory_device.c
memory-device: Support memory devices with multiple memslots
vhost: Return number of free memslots
kvm: Return number of free memslots
softmmu/physmem: Fixup qemu_ram_block_from_host() documentation
vhost: Remove vhost_backend_can_merge() callback
vhost: Rework memslot filtering and fix "used_memslot" tracking
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Diffstat (limited to 'hw')
-rw-r--r-- | hw/mem/memory-device.c | 196 | ||||
-rw-r--r-- | hw/virtio/vhost-stub.c | 9 | ||||
-rw-r--r-- | hw/virtio/vhost-user.c | 21 | ||||
-rw-r--r-- | hw/virtio/vhost-vdpa.c | 1 | ||||
-rw-r--r-- | hw/virtio/vhost.c | 103 | ||||
-rw-r--r-- | hw/virtio/virtio-mem-pci.c | 21 | ||||
-rw-r--r-- | hw/virtio/virtio-mem.c | 330 |
7 files changed, 624 insertions, 57 deletions
diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c index 667d56b..ae38f48 100644 --- a/hw/mem/memory-device.c +++ b/hw/mem/memory-device.c @@ -52,19 +52,135 @@ static int memory_device_build_list(Object *obj, void *opaque) return 0; } -static void memory_device_check_addable(MachineState *ms, MemoryRegion *mr, - Error **errp) +static unsigned int memory_device_get_memslots(MemoryDeviceState *md) { + const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); + + if (mdc->get_memslots) { + return mdc->get_memslots(md); + } + return 1; +} + +/* + * Memslots that are reserved by memory devices (required but still reported + * as free from KVM / vhost). + */ +static unsigned int get_reserved_memslots(MachineState *ms) +{ + if (ms->device_memory->used_memslots > + ms->device_memory->required_memslots) { + /* This is unexpected, and we warned already in the memory notifier. */ + return 0; + } + return ms->device_memory->required_memslots - + ms->device_memory->used_memslots; +} + +unsigned int memory_devices_get_reserved_memslots(void) +{ + if (!current_machine->device_memory) { + return 0; + } + return get_reserved_memslots(current_machine); +} + +bool memory_devices_memslot_auto_decision_active(void) +{ + if (!current_machine->device_memory) { + return false; + } + + return current_machine->device_memory->memslot_auto_decision_active; +} + +static unsigned int memory_device_memslot_decision_limit(MachineState *ms, + MemoryRegion *mr) +{ + const unsigned int reserved = get_reserved_memslots(ms); + const uint64_t size = memory_region_size(mr); + unsigned int max = vhost_get_max_memslots(); + unsigned int free = vhost_get_free_memslots(); + uint64_t available_space; + unsigned int memslots; + + if (kvm_enabled()) { + max = MIN(max, kvm_get_max_memslots()); + free = MIN(free, kvm_get_free_memslots()); + } + + /* + * If we only have less overall memslots than what we consider reasonable, + * just keep it to a minimum. + */ + if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) { + return 1; + } + + /* + * Consider our soft-limit across all memory devices. We don't really + * expect to exceed this limit in reasonable configurations. + */ + if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <= + ms->device_memory->required_memslots) { + return 1; + } + memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT - + ms->device_memory->required_memslots; + + /* + * Consider the actually still free memslots. This is only relevant if + * other memslot consumers would consume *significantly* more memslots than + * what we prepared for (> 253). Unlikely, but let's just handle it + * cleanly. + */ + memslots = MIN(memslots, free - reserved); + if (memslots < 1 || unlikely(free < reserved)) { + return 1; + } + + /* We cannot have any other memory devices? So give all to this device. */ + if (size == ms->maxram_size - ms->ram_size) { + return memslots; + } + + /* + * Simple heuristic: equally distribute the memslots over the space + * still available for memory devices. + */ + available_space = ms->maxram_size - ms->ram_size - + ms->device_memory->used_region_size; + memslots = (double)memslots * size / available_space; + return memslots < 1 ? 1 : memslots; +} + +static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md, + MemoryRegion *mr, Error **errp) +{ + const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); const uint64_t used_region_size = ms->device_memory->used_region_size; const uint64_t size = memory_region_size(mr); + const unsigned int reserved_memslots = get_reserved_memslots(ms); + unsigned int required_memslots, memslot_limit; + + /* + * Instruct the device to decide how many memslots to use, if applicable, + * before we query the number of required memslots the first time. + */ + if (mdc->decide_memslots) { + memslot_limit = memory_device_memslot_decision_limit(ms, mr); + mdc->decide_memslots(md, memslot_limit); + } + required_memslots = memory_device_get_memslots(md); - /* we will need a new memory slot for kvm and vhost */ - if (kvm_enabled() && !kvm_has_free_slot(ms)) { - error_setg(errp, "hypervisor has no free memory slots left"); + /* we will need memory slots for kvm and vhost */ + if (kvm_enabled() && + kvm_get_free_memslots() < required_memslots + reserved_memslots) { + error_setg(errp, "hypervisor has not enough free memory slots left"); return; } - if (!vhost_has_free_slot()) { - error_setg(errp, "a used vhost backend has no free memory slots left"); + if (vhost_get_free_memslots() < required_memslots + reserved_memslots) { + error_setg(errp, "a used vhost backend has not enough free memory slots left"); return; } @@ -233,7 +349,7 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, goto out; } - memory_device_check_addable(ms, mr, &local_err); + memory_device_check_addable(ms, md, mr, &local_err); if (local_err) { goto out; } @@ -264,6 +380,7 @@ out: void memory_device_plug(MemoryDeviceState *md, MachineState *ms) { const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); + const unsigned int memslots = memory_device_get_memslots(md); const uint64_t addr = mdc->get_addr(md); MemoryRegion *mr; @@ -275,6 +392,11 @@ void memory_device_plug(MemoryDeviceState *md, MachineState *ms) g_assert(ms->device_memory); ms->device_memory->used_region_size += memory_region_size(mr); + ms->device_memory->required_memslots += memslots; + if (mdc->decide_memslots && memslots > 1) { + ms->device_memory->memslot_auto_decision_active++; + } + memory_region_add_subregion(&ms->device_memory->mr, addr - ms->device_memory->base, mr); trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr); @@ -283,6 +405,7 @@ void memory_device_plug(MemoryDeviceState *md, MachineState *ms) void memory_device_unplug(MemoryDeviceState *md, MachineState *ms) { const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); + const unsigned int memslots = memory_device_get_memslots(md); MemoryRegion *mr; /* @@ -293,7 +416,12 @@ void memory_device_unplug(MemoryDeviceState *md, MachineState *ms) g_assert(ms->device_memory); memory_region_del_subregion(&ms->device_memory->mr, mr); + + if (mdc->decide_memslots && memslots > 1) { + ms->device_memory->memslot_auto_decision_active--; + } ms->device_memory->used_region_size -= memory_region_size(mr); + ms->device_memory->required_memslots -= memslots; trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "", mdc->get_addr(md)); } @@ -313,6 +441,50 @@ uint64_t memory_device_get_region_size(const MemoryDeviceState *md, return memory_region_size(mr); } +static void memory_devices_region_mod(MemoryListener *listener, + MemoryRegionSection *mrs, bool add) +{ + DeviceMemoryState *dms = container_of(listener, DeviceMemoryState, + listener); + + if (!memory_region_is_ram(mrs->mr)) { + warn_report("Unexpected memory region mapped into device memory region."); + return; + } + + /* + * The expectation is that each distinct RAM memory region section in + * our region for memory devices consumes exactly one memslot in KVM + * and in vhost. For vhost, this is true, except: + * * ROM memory regions don't consume a memslot. These get used very + * rarely for memory devices (R/O NVDIMMs). + * * Memslots without a fd (memory-backend-ram) don't necessarily + * consume a memslot. Such setups are quite rare and possibly bogus: + * the memory would be inaccessible by such vhost devices. + * + * So for vhost, in corner cases we might over-estimate the number of + * memslots that are currently used or that might still be reserved + * (required - used). + */ + dms->used_memslots += add ? 1 : -1; + + if (dms->used_memslots > dms->required_memslots) { + warn_report("Memory devices use more memory slots than indicated as required."); + } +} + +static void memory_devices_region_add(MemoryListener *listener, + MemoryRegionSection *mrs) +{ + return memory_devices_region_mod(listener, mrs, true); +} + +static void memory_devices_region_del(MemoryListener *listener, + MemoryRegionSection *mrs) +{ + return memory_devices_region_mod(listener, mrs, false); +} + void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size) { g_assert(size); @@ -322,8 +494,16 @@ void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size) memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory", size); + address_space_init(&ms->device_memory->as, &ms->device_memory->mr, + "device-memory"); memory_region_add_subregion(get_system_memory(), ms->device_memory->base, &ms->device_memory->mr); + + /* Track the number of memslots used by memory devices. */ + ms->device_memory->listener.region_add = memory_devices_region_add; + ms->device_memory->listener.region_del = memory_devices_region_del; + memory_listener_register(&ms->device_memory->listener, + &ms->device_memory->as); } static const TypeInfo memory_device_info = { diff --git a/hw/virtio/vhost-stub.c b/hw/virtio/vhost-stub.c index aa858ef..52d42ad 100644 --- a/hw/virtio/vhost-stub.c +++ b/hw/virtio/vhost-stub.c @@ -2,9 +2,14 @@ #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-user.h" -bool vhost_has_free_slot(void) +unsigned int vhost_get_max_memslots(void) { - return true; + return UINT_MAX; +} + +unsigned int vhost_get_free_memslots(void) +{ + return UINT_MAX; } bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 3766b41..68eb1f0 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -2327,19 +2327,6 @@ static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) return -ENOTSUP; } -static bool vhost_user_can_merge(struct vhost_dev *dev, - uint64_t start1, uint64_t size1, - uint64_t start2, uint64_t size2) -{ - ram_addr_t offset; - int mfd, rfd; - - (void)vhost_user_get_mr_data(start1, &offset, &mfd); - (void)vhost_user_get_mr_data(start2, &offset, &rfd); - - return mfd == rfd; -} - static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu) { VhostUserMsg msg; @@ -2622,10 +2609,9 @@ vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id) return 0; } -static bool vhost_user_mem_section_filter(struct vhost_dev *dev, - MemoryRegionSection *section) +static bool vhost_user_no_private_memslots(struct vhost_dev *dev) { - return memory_region_get_fd(section->mr) >= 0; + return true; } static int vhost_user_get_inflight_fd(struct vhost_dev *dev, @@ -2868,6 +2854,7 @@ const VhostOps user_ops = { .vhost_backend_init = vhost_user_backend_init, .vhost_backend_cleanup = vhost_user_backend_cleanup, .vhost_backend_memslots_limit = vhost_user_memslots_limit, + .vhost_backend_no_private_memslots = vhost_user_no_private_memslots, .vhost_set_log_base = vhost_user_set_log_base, .vhost_set_mem_table = vhost_user_set_mem_table, .vhost_set_vring_addr = vhost_user_set_vring_addr, @@ -2886,7 +2873,6 @@ const VhostOps user_ops = { .vhost_set_vring_enable = vhost_user_set_vring_enable, .vhost_requires_shm_log = vhost_user_requires_shm_log, .vhost_migration_done = vhost_user_migration_done, - .vhost_backend_can_merge = vhost_user_can_merge, .vhost_net_set_mtu = vhost_user_net_set_mtu, .vhost_set_iotlb_callback = vhost_user_set_iotlb_callback, .vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg, @@ -2894,7 +2880,6 @@ const VhostOps user_ops = { .vhost_set_config = vhost_user_set_config, .vhost_crypto_create_session = vhost_user_crypto_create_session, .vhost_crypto_close_session = vhost_user_crypto_close_session, - .vhost_backend_mem_section_filter = vhost_user_mem_section_filter, .vhost_get_inflight_fd = vhost_user_get_inflight_fd, .vhost_set_inflight_fd = vhost_user_set_inflight_fd, .vhost_dev_start = vhost_user_dev_start, diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 89ff02a..819b2d8 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -1512,7 +1512,6 @@ const VhostOps vdpa_ops = { .vhost_set_config = vhost_vdpa_set_config, .vhost_requires_shm_log = NULL, .vhost_migration_done = NULL, - .vhost_backend_can_merge = NULL, .vhost_net_set_mtu = NULL, .vhost_set_iotlb_callback = NULL, .vhost_send_device_iotlb_msg = NULL, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 9cfac40..9f37206 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -23,6 +23,7 @@ #include "qemu/log.h" #include "standard-headers/linux/vhost_types.h" #include "hw/virtio/virtio-bus.h" +#include "hw/mem/memory-device.h" #include "migration/blocker.h" #include "migration/qemu-file-types.h" #include "sysemu/dma.h" @@ -45,20 +46,44 @@ static struct vhost_log *vhost_log; static struct vhost_log *vhost_log_shm; +/* Memslots used by backends that support private memslots (without an fd). */ static unsigned int used_memslots; + +/* Memslots used by backends that only support shared memslots (with an fd). */ +static unsigned int used_shared_memslots; + static QLIST_HEAD(, vhost_dev) vhost_devices = QLIST_HEAD_INITIALIZER(vhost_devices); -bool vhost_has_free_slot(void) +unsigned int vhost_get_max_memslots(void) +{ + unsigned int max = UINT_MAX; + struct vhost_dev *hdev; + + QLIST_FOREACH(hdev, &vhost_devices, entry) { + max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev)); + } + return max; +} + +unsigned int vhost_get_free_memslots(void) { - unsigned int slots_limit = ~0U; + unsigned int free = UINT_MAX; struct vhost_dev *hdev; QLIST_FOREACH(hdev, &vhost_devices, entry) { unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); - slots_limit = MIN(slots_limit, r); + unsigned int cur_free; + + if (hdev->vhost_ops->vhost_backend_no_private_memslots && + hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { + cur_free = r - used_shared_memslots; + } else { + cur_free = r - used_memslots; + } + free = MIN(free, cur_free); } - return slots_limit > used_memslots; + return free; } static void vhost_dev_sync_region(struct vhost_dev *dev, @@ -474,8 +499,7 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev, * vhost_section: identify sections needed for vhost access * * We only care about RAM sections here (where virtqueue and guest - * internals accessed by virtio might live). If we find one we still - * allow the backend to potentially filter it out of our list. + * internals accessed by virtio might live). */ static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) { @@ -502,8 +526,16 @@ static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) return false; } - if (dev->vhost_ops->vhost_backend_mem_section_filter && - !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) { + /* + * Some backends (like vhost-user) can only handle memory regions + * that have an fd (can be mapped into a different process). Filter + * the ones without an fd out, if requested. + * + * TODO: we might have to limit to MAP_SHARED as well. + */ + if (memory_region_get_fd(section->mr) < 0 && + dev->vhost_ops->vhost_backend_no_private_memslots && + dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { trace_vhost_reject_section(mr->name, 2); return false; } @@ -568,7 +600,14 @@ static void vhost_commit(MemoryListener *listener) dev->n_mem_sections * sizeof dev->mem->regions[0]; dev->mem = g_realloc(dev->mem, regions_size); dev->mem->nregions = dev->n_mem_sections; - used_memslots = dev->mem->nregions; + + if (dev->vhost_ops->vhost_backend_no_private_memslots && + dev->vhost_ops->vhost_backend_no_private_memslots(dev)) { + used_shared_memslots = dev->mem->nregions; + } else { + used_memslots = dev->mem->nregions; + } + for (i = 0; i < dev->n_mem_sections; i++) { struct vhost_memory_region *cur_vmr = dev->mem->regions + i; struct MemoryRegionSection *mrs = dev->mem_sections + i; @@ -668,7 +707,7 @@ static void vhost_region_add_section(struct vhost_dev *dev, mrs_size, mrs_host); } - if (dev->n_tmp_sections) { + if (dev->n_tmp_sections && !section->unmergeable) { /* Since we already have at least one section, lets see if * this extends it; since we're scanning in order, we only * have to look at the last one, and the FlatView that calls @@ -701,11 +740,7 @@ static void vhost_region_add_section(struct vhost_dev *dev, size_t offset = mrs_gpa - prev_gpa_start; if (prev_host_start + offset == mrs_host && - section->mr == prev_sec->mr && - (!dev->vhost_ops->vhost_backend_can_merge || - dev->vhost_ops->vhost_backend_can_merge(dev, - mrs_host, mrs_size, - prev_host_start, prev_size))) { + section->mr == prev_sec->mr && !prev_sec->unmergeable) { uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); need_add = false; prev_sec->offset_within_address_space = @@ -1400,6 +1435,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, VhostBackendType backend_type, uint32_t busyloop_timeout, Error **errp) { + unsigned int used, reserved, limit; uint64_t features; int i, r, n_initialized_vqs = 0; @@ -1426,6 +1462,19 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, goto fail; } + limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); + if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS && + memory_devices_memslot_auto_decision_active()) { + error_setg(errp, "some memory device (like virtio-mem)" + " decided how many memory slots to use based on the overall" + " number of memory slots; this vhost backend would further" + " restricts the overall number of memory slots"); + error_append_hint(errp, "Try plugging this vhost backend before" + " plugging such memory devices.\n"); + r = -EINVAL; + goto fail; + } + for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) { r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); if (r < 0) { @@ -1495,9 +1544,27 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, memory_listener_register(&hdev->memory_listener, &address_space_memory); QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); - if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) { - error_setg(errp, "vhost backend memory slots limit is less" - " than current number of present memory slots"); + /* + * The listener we registered properly updated the corresponding counter. + * So we can trust that these values are accurate. + */ + if (hdev->vhost_ops->vhost_backend_no_private_memslots && + hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) { + used = used_shared_memslots; + } else { + used = used_memslots; + } + /* + * We assume that all reserved memslots actually require a real memslot + * in our vhost backend. This might not be true, for example, if the + * memslot would be ROM. If ever relevant, we can optimize for that -- + * but we'll need additional information about the reservations. + */ + reserved = memory_devices_get_reserved_memslots(); + if (used + reserved > limit) { + error_setg(errp, "vhost backend memory slots limit (%d) is less" + " than current number of used (%d) and reserved (%d)" + " memory slots for memory devices.", limit, used, reserved); r = -EINVAL; goto fail_busyloop; } diff --git a/hw/virtio/virtio-mem-pci.c b/hw/virtio/virtio-mem-pci.c index c4597e0..1b4e9a3 100644 --- a/hw/virtio/virtio-mem-pci.c +++ b/hw/virtio/virtio-mem-pci.c @@ -48,6 +48,25 @@ static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md, return vmc->get_memory_region(vmem, errp); } +static void virtio_mem_pci_decide_memslots(MemoryDeviceState *md, + unsigned int limit) +{ + VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md); + VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev); + VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem); + + vmc->decide_memslots(vmem, limit); +} + +static unsigned int virtio_mem_pci_get_memslots(MemoryDeviceState *md) +{ + VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md); + VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev); + VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem); + + return vmc->get_memslots(vmem); +} + static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md, Error **errp) { @@ -150,6 +169,8 @@ static void virtio_mem_pci_class_init(ObjectClass *klass, void *data) mdc->set_addr = virtio_mem_pci_set_addr; mdc->get_plugged_size = virtio_mem_pci_get_plugged_size; mdc->get_memory_region = virtio_mem_pci_get_memory_region; + mdc->decide_memslots = virtio_mem_pci_decide_memslots; + mdc->get_memslots = virtio_mem_pci_get_memslots; mdc->fill_device_info = virtio_mem_pci_fill_device_info; mdc->get_min_alignment = virtio_mem_pci_get_min_alignment; diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index da5b09c..9dc3c61 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -67,6 +67,13 @@ static uint32_t virtio_mem_default_thp_size(void) } /* + * The minimum memslot size depends on this setting ("sane default"), the + * device block size, and the memory backend page size. The last (or single) + * memslot might be smaller than this constant. + */ +#define VIRTIO_MEM_MIN_MEMSLOT_SIZE (1 * GiB) + +/* * We want to have a reasonable default block size such that * 1. We avoid splitting THPs when unplugging memory, which degrades * performance. @@ -177,10 +184,10 @@ static bool virtio_mem_is_busy(void) return migration_in_incoming_postcopy() || !migration_is_idle(); } -typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg, +typedef int (*virtio_mem_range_cb)(VirtIOMEM *vmem, void *arg, uint64_t offset, uint64_t size); -static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg, +static int virtio_mem_for_each_unplugged_range(VirtIOMEM *vmem, void *arg, virtio_mem_range_cb cb) { unsigned long first_zero_bit, last_zero_bit; @@ -204,7 +211,7 @@ static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg, return ret; } -static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg, +static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg, virtio_mem_range_cb cb) { unsigned long first_bit, last_bit; @@ -483,6 +490,96 @@ static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa, return true; } +static void virtio_mem_activate_memslot(VirtIOMEM *vmem, unsigned int idx) +{ + const uint64_t memslot_offset = idx * vmem->memslot_size; + + assert(vmem->memslots); + + /* + * Instead of enabling/disabling memslots, we add/remove them. This should + * make address space updates faster, because we don't have to loop over + * many disabled subregions. + */ + if (memory_region_is_mapped(&vmem->memslots[idx])) { + return; + } + memory_region_add_subregion(vmem->mr, memslot_offset, &vmem->memslots[idx]); +} + +static void virtio_mem_deactivate_memslot(VirtIOMEM *vmem, unsigned int idx) +{ + assert(vmem->memslots); + + if (!memory_region_is_mapped(&vmem->memslots[idx])) { + return; + } + memory_region_del_subregion(vmem->mr, &vmem->memslots[idx]); +} + +static void virtio_mem_activate_memslots_to_plug(VirtIOMEM *vmem, + uint64_t offset, uint64_t size) +{ + const unsigned int start_idx = offset / vmem->memslot_size; + const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) / + vmem->memslot_size; + unsigned int idx; + + if (!vmem->dynamic_memslots) { + return; + } + + /* Activate all involved memslots in a single transaction. */ + memory_region_transaction_begin(); + for (idx = start_idx; idx < end_idx; idx++) { + virtio_mem_activate_memslot(vmem, idx); + } + memory_region_transaction_commit(); +} + +static void virtio_mem_deactivate_unplugged_memslots(VirtIOMEM *vmem, + uint64_t offset, + uint64_t size) +{ + const uint64_t region_size = memory_region_size(&vmem->memdev->mr); + const unsigned int start_idx = offset / vmem->memslot_size; + const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) / + vmem->memslot_size; + unsigned int idx; + + if (!vmem->dynamic_memslots) { + return; + } + + /* Deactivate all memslots with unplugged blocks in a single transaction. */ + memory_region_transaction_begin(); + for (idx = start_idx; idx < end_idx; idx++) { + const uint64_t memslot_offset = idx * vmem->memslot_size; + uint64_t memslot_size = vmem->memslot_size; + + /* The size of the last memslot might be smaller. */ + if (idx == vmem->nb_memslots - 1) { + memslot_size = region_size - memslot_offset; + } + + /* + * Partially covered memslots might still have some blocks plugged and + * have to remain active if that's the case. + */ + if (offset > memslot_offset || + offset + size < memslot_offset + memslot_size) { + const uint64_t gpa = vmem->addr + memslot_offset; + + if (!virtio_mem_is_range_unplugged(vmem, gpa, memslot_size)) { + continue; + } + } + + virtio_mem_deactivate_memslot(vmem, idx); + } + memory_region_transaction_commit(); +} + static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, uint64_t size, bool plug) { @@ -500,6 +597,8 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, } virtio_mem_notify_unplug(vmem, offset, size); virtio_mem_set_range_unplugged(vmem, start_gpa, size); + /* Deactivate completely unplugged memslots after updating the state. */ + virtio_mem_deactivate_unplugged_memslots(vmem, offset, size); return 0; } @@ -527,7 +626,20 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, } if (!ret) { + /* + * Activate before notifying and rollback in case of any errors. + * + * When activating a yet inactive memslot, memory notifiers will get + * notified about the added memory region and can register with the + * RamDiscardManager; this will traverse all plugged blocks and skip the + * blocks we are plugging here. The following notification will inform + * registered listeners about the blocks we're plugging. + */ + virtio_mem_activate_memslots_to_plug(vmem, offset, size); ret = virtio_mem_notify_plug(vmem, offset, size); + if (ret) { + virtio_mem_deactivate_unplugged_memslots(vmem, offset, size); + } } if (ret) { /* Could be preallocation or a notifier populated memory. */ @@ -620,6 +732,7 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem, static int virtio_mem_unplug_all(VirtIOMEM *vmem) { + const uint64_t region_size = memory_region_size(&vmem->memdev->mr); RAMBlock *rb = vmem->memdev->mr.ram_block; if (vmem->size) { @@ -634,6 +747,9 @@ static int virtio_mem_unplug_all(VirtIOMEM *vmem) bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size); vmem->size = 0; notifier_list_notify(&vmem->size_change_notifiers, &vmem->size); + + /* Deactivate all memslots after updating the state. */ + virtio_mem_deactivate_unplugged_memslots(vmem, 0, region_size); } trace_virtio_mem_unplugged_all(); @@ -790,6 +906,49 @@ static void virtio_mem_system_reset(void *opaque) virtio_mem_unplug_all(vmem); } +static void virtio_mem_prepare_mr(VirtIOMEM *vmem) +{ + const uint64_t region_size = memory_region_size(&vmem->memdev->mr); + + assert(!vmem->mr && vmem->dynamic_memslots); + vmem->mr = g_new0(MemoryRegion, 1); + memory_region_init(vmem->mr, OBJECT(vmem), "virtio-mem", + region_size); + vmem->mr->align = memory_region_get_alignment(&vmem->memdev->mr); +} + +static void virtio_mem_prepare_memslots(VirtIOMEM *vmem) +{ + const uint64_t region_size = memory_region_size(&vmem->memdev->mr); + unsigned int idx; + + g_assert(!vmem->memslots && vmem->nb_memslots && vmem->dynamic_memslots); + vmem->memslots = g_new0(MemoryRegion, vmem->nb_memslots); + + /* Initialize our memslots, but don't map them yet. */ + for (idx = 0; idx < vmem->nb_memslots; idx++) { + const uint64_t memslot_offset = idx * vmem->memslot_size; + uint64_t memslot_size = vmem->memslot_size; + char name[20]; + + /* The size of the last memslot might be smaller. */ + if (idx == vmem->nb_memslots - 1) { + memslot_size = region_size - memslot_offset; + } + + snprintf(name, sizeof(name), "memslot-%u", idx); + memory_region_init_alias(&vmem->memslots[idx], OBJECT(vmem), name, + &vmem->memdev->mr, memslot_offset, + memslot_size); + /* + * We want to be able to atomically and efficiently activate/deactivate + * individual memslots without affecting adjacent memslots in memory + * notifiers. + */ + memory_region_set_unmergeable(&vmem->memslots[idx], true); + } +} + static void virtio_mem_device_realize(DeviceState *dev, Error **errp) { MachineState *ms = MACHINE(qdev_get_machine()); @@ -861,6 +1020,14 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) vmem->unplugged_inaccessible = ON_OFF_AUTO_ON; #endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */ + if (vmem->dynamic_memslots && + vmem->unplugged_inaccessible != ON_OFF_AUTO_ON) { + error_setg(errp, "'%s' property set to 'on' requires '%s' to be 'on'", + VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, + VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP); + return; + } + /* * If the block size wasn't configured by the user, use a sane default. This * allows using hugetlbfs backends of any page size without manual @@ -930,6 +1097,25 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config)); vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request); + /* + * With "dynamic-memslots=off" (old behavior) we always map the whole + * RAM memory region directly. + */ + if (vmem->dynamic_memslots) { + if (!vmem->mr) { + virtio_mem_prepare_mr(vmem); + } + if (vmem->nb_memslots <= 1) { + vmem->nb_memslots = 1; + vmem->memslot_size = memory_region_size(&vmem->memdev->mr); + } + if (!vmem->memslots) { + virtio_mem_prepare_memslots(vmem); + } + } else { + assert(!vmem->mr && !vmem->nb_memslots && !vmem->memslots); + } + host_memory_backend_set_mapped(vmem->memdev, true); vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem)); if (vmem->early_migration) { @@ -969,7 +1155,7 @@ static void virtio_mem_device_unrealize(DeviceState *dev) ram_block_coordinated_discard_require(false); } -static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg, +static int virtio_mem_discard_range_cb(VirtIOMEM *vmem, void *arg, uint64_t offset, uint64_t size) { RAMBlock *rb = vmem->memdev->mr.ram_block; @@ -984,13 +1170,32 @@ static int virtio_mem_restore_unplugged(VirtIOMEM *vmem) virtio_mem_discard_range_cb); } -static int virtio_mem_post_load(void *opaque, int version_id) +static int virtio_mem_activate_memslot_range_cb(VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size) +{ + virtio_mem_activate_memslots_to_plug(vmem, offset, size); + return 0; +} + +static int virtio_mem_post_load_bitmap(VirtIOMEM *vmem) { - VirtIOMEM *vmem = VIRTIO_MEM(opaque); RamDiscardListener *rdl; int ret; /* + * We restored the bitmap and updated the requested size; activate all + * memslots (so listeners register) before notifying about plugged blocks. + */ + if (vmem->dynamic_memslots) { + /* + * We don't expect any active memslots at this point to deactivate: no + * memory was plugged on the migration destination. + */ + virtio_mem_for_each_plugged_range(vmem, NULL, + virtio_mem_activate_memslot_range_cb); + } + + /* * We started out with all memory discarded and our memory region is mapped * into an address space. Replay, now that we updated the bitmap. */ @@ -1001,6 +1206,20 @@ static int virtio_mem_post_load(void *opaque, int version_id) return ret; } } + return 0; +} + +static int virtio_mem_post_load(void *opaque, int version_id) +{ + VirtIOMEM *vmem = VIRTIO_MEM(opaque); + int ret; + + if (!vmem->early_migration) { + ret = virtio_mem_post_load_bitmap(vmem); + if (ret) { + return ret; + } + } /* * If shared RAM is migrated using the file content and not using QEMU, @@ -1021,7 +1240,7 @@ static int virtio_mem_post_load(void *opaque, int version_id) return virtio_mem_restore_unplugged(vmem); } -static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg, +static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg, uint64_t offset, uint64_t size) { void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset; @@ -1043,7 +1262,7 @@ static int virtio_mem_post_load_early(void *opaque, int version_id) int ret; if (!vmem->prealloc) { - return 0; + goto post_load_bitmap; } /* @@ -1051,7 +1270,7 @@ static int virtio_mem_post_load_early(void *opaque, int version_id) * don't mess with preallocation and postcopy. */ if (migrate_ram_is_ignored(rb)) { - return 0; + goto post_load_bitmap; } /* @@ -1084,7 +1303,10 @@ static int virtio_mem_post_load_early(void *opaque, int version_id) return -EBUSY; } } - return 0; + +post_load_bitmap: + /* Finally, update any other state to be consistent with the new bitmap. */ + return virtio_mem_post_load_bitmap(vmem); } typedef struct VirtIOMEMMigSanityChecks { @@ -1235,11 +1457,79 @@ static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp) if (!vmem->memdev) { error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP); return NULL; + } else if (vmem->dynamic_memslots) { + if (!vmem->mr) { + virtio_mem_prepare_mr(vmem); + } + return vmem->mr; } return &vmem->memdev->mr; } +static void virtio_mem_decide_memslots(VirtIOMEM *vmem, unsigned int limit) +{ + uint64_t region_size, memslot_size, min_memslot_size; + unsigned int memslots; + RAMBlock *rb; + + if (!vmem->dynamic_memslots) { + return; + } + + /* We're called exactly once, before realizing the device. */ + assert(!vmem->nb_memslots); + + /* If realizing the device will fail, just assume a single memslot. */ + if (limit <= 1 || !vmem->memdev || !vmem->memdev->mr.ram_block) { + vmem->nb_memslots = 1; + return; + } + + rb = vmem->memdev->mr.ram_block; + region_size = memory_region_size(&vmem->memdev->mr); + + /* + * Determine the default block size now, to determine the minimum memslot + * size. We want the minimum slot size to be at least the device block size. + */ + if (!vmem->block_size) { + vmem->block_size = virtio_mem_default_block_size(rb); + } + /* If realizing the device will fail, just assume a single memslot. */ + if (vmem->block_size < qemu_ram_pagesize(rb) || + !QEMU_IS_ALIGNED(region_size, vmem->block_size)) { + vmem->nb_memslots = 1; + return; + } + + /* + * All memslots except the last one have a reasonable minimum size, and + * and all memslot sizes are aligned to the device block size. + */ + memslot_size = QEMU_ALIGN_UP(region_size / limit, vmem->block_size); + min_memslot_size = MAX(vmem->block_size, VIRTIO_MEM_MIN_MEMSLOT_SIZE); + memslot_size = MAX(memslot_size, min_memslot_size); + + memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size; + if (memslots != 1) { + vmem->memslot_size = memslot_size; + } + vmem->nb_memslots = memslots; +} + +static unsigned int virtio_mem_get_memslots(VirtIOMEM *vmem) +{ + if (!vmem->dynamic_memslots) { + /* Exactly one static RAM memory region. */ + return 1; + } + + /* We're called after instructed to make a decision. */ + g_assert(vmem->nb_memslots); + return vmem->nb_memslots; +} + static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem, Notifier *notifier) { @@ -1377,6 +1667,21 @@ static void virtio_mem_instance_init(Object *obj) NULL, NULL); } +static void virtio_mem_instance_finalize(Object *obj) +{ + VirtIOMEM *vmem = VIRTIO_MEM(obj); + + /* + * Note: the core already dropped the references on all memory regions + * (it's passed as the owner to memory_region_init_*()) and finalized + * these objects. We can simply free the memory. + */ + g_free(vmem->memslots); + vmem->memslots = NULL; + g_free(vmem->mr); + vmem->mr = NULL; +} + static Property virtio_mem_properties[] = { DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0), DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0), @@ -1389,6 +1694,8 @@ static Property virtio_mem_properties[] = { #endif DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM, early_migration, true), + DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM, + dynamic_memslots, false), DEFINE_PROP_END_OF_LIST(), }; @@ -1556,6 +1863,8 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data) vmc->fill_device_info = virtio_mem_fill_device_info; vmc->get_memory_region = virtio_mem_get_memory_region; + vmc->decide_memslots = virtio_mem_decide_memslots; + vmc->get_memslots = virtio_mem_get_memslots; vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier; vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier; vmc->unplug_request_check = virtio_mem_unplug_request_check; @@ -1573,6 +1882,7 @@ static const TypeInfo virtio_mem_info = { .parent = TYPE_VIRTIO_DEVICE, .instance_size = sizeof(VirtIOMEM), .instance_init = virtio_mem_instance_init, + .instance_finalize = virtio_mem_instance_finalize, .class_init = virtio_mem_class_init, .class_size = sizeof(VirtIOMEMClass), .interfaces = (InterfaceInfo[]) { |