about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- MAINTAINERS 1
-rw-r--r-- accel/kvm/kvm-all.c 35
-rw-r--r-- accel/stubs/kvm-stub.c 9
-rw-r--r-- hw/mem/memory-device.c 196
-rw-r--r-- hw/virtio/vhost-stub.c 9
-rw-r--r-- hw/virtio/vhost-user.c 21
-rw-r--r-- hw/virtio/vhost-vdpa.c 1
-rw-r--r-- hw/virtio/vhost.c 103
-rw-r--r-- hw/virtio/virtio-mem-pci.c 21
-rw-r--r-- hw/virtio/virtio-mem.c 330
-rw-r--r-- include/exec/cpu-common.h 15
-rw-r--r-- include/exec/memory.h 27
-rw-r--r-- include/hw/boards.h 14
-rw-r--r-- include/hw/mem/memory-device.h 57
-rw-r--r-- include/hw/virtio/vhost-backend.h 9
-rw-r--r-- include/hw/virtio/vhost.h 3
-rw-r--r-- include/hw/virtio/virtio-mem.h 32
-rw-r--r-- include/sysemu/kvm.h 4
-rw-r--r-- include/sysemu/kvm_int.h 1
-rw-r--r-- stubs/memory_device.c (renamed from stubs/qmp_memory_device.c) 10
-rw-r--r-- stubs/meson.build 2
-rw-r--r-- system/memory.c 35
-rw-r--r-- system/physmem.c 17
23 files changed, 839 insertions, 113 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index ceea4c2..04e87c20 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2891,6 +2891,7 @@ F: hw/mem/pc-dimm.c
F: include/hw/mem/memory-device.h
F: include/hw/mem/nvdimm.h
F: include/hw/mem/pc-dimm.h
+F: stubs/memory_device.c
F: docs/nvdimm.txt
SPICE
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 72e1d11..3f7eafe 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -174,13 +174,31 @@ void kvm_resample_fd_notify(int gsi)
}
}
-int kvm_get_max_memslots(void)
+unsigned int kvm_get_max_memslots(void)
{
KVMState *s = KVM_STATE(current_accel());
return s->nr_slots;
}
+unsigned int kvm_get_free_memslots(void)
+{
+ unsigned int used_slots = 0;
+ KVMState *s = kvm_state;
+ int i;
+
+ kvm_slots_lock();
+ for (i = 0; i < s->nr_as; i++) {
+ if (!s->as[i].ml) {
+ continue;
+ }
+ used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots);
+ }
+ kvm_slots_unlock();
+
+ return s->nr_slots - used_slots;
+}
+
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
@@ -196,19 +214,6 @@ static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
return NULL;
}
-bool kvm_has_free_slot(MachineState *ms)
-{
- KVMState *s = KVM_STATE(ms->accelerator);
- bool result;
- KVMMemoryListener *kml = &s->memory_listener;
-
- kvm_slots_lock();
- result = !!kvm_get_free_slot(kml);
- kvm_slots_unlock();
-
- return result;
-}
-
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
@@ -1387,6 +1392,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
}
start_addr += slot_size;
size -= slot_size;
+ kml->nr_used_slots--;
} while (size);
return;
}
@@ -1412,6 +1418,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
ram_start_offset += slot_size;
ram += slot_size;
size -= slot_size;
+ kml->nr_used_slots++;
} while (size);
}
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 235dc66..51f522e 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -109,9 +109,14 @@ int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
return -ENOSYS;
}
-bool kvm_has_free_slot(MachineState *ms)
+unsigned int kvm_get_max_memslots(void)
{
- return false;
+ return 0;
+}
+
+unsigned int kvm_get_free_memslots(void)
+{
+ return 0;
}
void kvm_init_cpu_signals(CPUState *cpu)
diff --git a/hw/mem/memory-device.c b/hw/mem/memory-device.c
index 667d56b..ae38f48 100644
--- a/hw/mem/memory-device.c
+++ b/hw/mem/memory-device.c
@@ -52,19 +52,135 @@ static int memory_device_build_list(Object *obj, void *opaque)
return 0;
}
-static void memory_device_check_addable(MachineState *ms, MemoryRegion *mr,
- Error **errp)
+static unsigned int memory_device_get_memslots(MemoryDeviceState *md)
{
+ const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
+
+ if (mdc->get_memslots) {
+ return mdc->get_memslots(md);
+ }
+ return 1;
+}
+
+/*
+ * Memslots that are reserved by memory devices (required but still reported
+ * as free from KVM / vhost).
+ */
+static unsigned int get_reserved_memslots(MachineState *ms)
+{
+ if (ms->device_memory->used_memslots >
+ ms->device_memory->required_memslots) {
+ /* This is unexpected, and we warned already in the memory notifier. */
+ return 0;
+ }
+ return ms->device_memory->required_memslots -
+ ms->device_memory->used_memslots;
+}
+
+unsigned int memory_devices_get_reserved_memslots(void)
+{
+ if (!current_machine->device_memory) {
+ return 0;
+ }
+ return get_reserved_memslots(current_machine);
+}
+
+bool memory_devices_memslot_auto_decision_active(void)
+{
+ if (!current_machine->device_memory) {
+ return false;
+ }
+
+ return current_machine->device_memory->memslot_auto_decision_active;
+}
+
+static unsigned int memory_device_memslot_decision_limit(MachineState *ms,
+ MemoryRegion *mr)
+{
+ const unsigned int reserved = get_reserved_memslots(ms);
+ const uint64_t size = memory_region_size(mr);
+ unsigned int max = vhost_get_max_memslots();
+ unsigned int free = vhost_get_free_memslots();
+ uint64_t available_space;
+ unsigned int memslots;
+
+ if (kvm_enabled()) {
+ max = MIN(max, kvm_get_max_memslots());
+ free = MIN(free, kvm_get_free_memslots());
+ }
+
+ /*
+ * If we have fewer overall memslots than what we consider reasonable,
+ * just keep it to a minimum.
+ */
+ if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) {
+ return 1;
+ }
+
+ /*
+ * Consider our soft-limit across all memory devices. We don't really
+ * expect to exceed this limit in reasonable configurations.
+ */
+ if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <=
+ ms->device_memory->required_memslots) {
+ return 1;
+ }
+ memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT -
+ ms->device_memory->required_memslots;
+
+ /*
+ * Consider the actually still free memslots. This is only relevant if
+ * other memslot consumers would consume *significantly* more memslots than
+ * what we prepared for (> 253). Unlikely, but let's just handle it
+ * cleanly.
+ */
+ memslots = MIN(memslots, free - reserved);
+ if (memslots < 1 || unlikely(free < reserved)) {
+ return 1;
+ }
+
+ /* We cannot have any other memory devices? So give all to this device. */
+ if (size == ms->maxram_size - ms->ram_size) {
+ return memslots;
+ }
+
+ /*
+ * Simple heuristic: equally distribute the memslots over the space
+ * still available for memory devices.
+ */
+ available_space = ms->maxram_size - ms->ram_size -
+ ms->device_memory->used_region_size;
+ memslots = (double)memslots * size / available_space;
+ return memslots < 1 ? 1 : memslots;
+}
+
+static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md,
+ MemoryRegion *mr, Error **errp)
+{
+ const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
const uint64_t used_region_size = ms->device_memory->used_region_size;
const uint64_t size = memory_region_size(mr);
+ const unsigned int reserved_memslots = get_reserved_memslots(ms);
+ unsigned int required_memslots, memslot_limit;
+
+ /*
+ * Instruct the device to decide how many memslots to use, if applicable,
+ * before we query the number of required memslots the first time.
+ */
+ if (mdc->decide_memslots) {
+ memslot_limit = memory_device_memslot_decision_limit(ms, mr);
+ mdc->decide_memslots(md, memslot_limit);
+ }
+ required_memslots = memory_device_get_memslots(md);
- /* we will need a new memory slot for kvm and vhost */
- if (kvm_enabled() && !kvm_has_free_slot(ms)) {
- error_setg(errp, "hypervisor has no free memory slots left");
+ /* we will need memory slots for kvm and vhost */
+ if (kvm_enabled() &&
+ kvm_get_free_memslots() < required_memslots + reserved_memslots) {
+ error_setg(errp, "hypervisor has not enough free memory slots left");
return;
}
- if (!vhost_has_free_slot()) {
- error_setg(errp, "a used vhost backend has no free memory slots left");
+ if (vhost_get_free_memslots() < required_memslots + reserved_memslots) {
+ error_setg(errp, "a used vhost backend has not enough free memory slots left");
return;
}
@@ -233,7 +349,7 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
goto out;
}
- memory_device_check_addable(ms, mr, &local_err);
+ memory_device_check_addable(ms, md, mr, &local_err);
if (local_err) {
goto out;
}
@@ -264,6 +380,7 @@ out:
void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
{
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
+ const unsigned int memslots = memory_device_get_memslots(md);
const uint64_t addr = mdc->get_addr(md);
MemoryRegion *mr;
@@ -275,6 +392,11 @@ void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
g_assert(ms->device_memory);
ms->device_memory->used_region_size += memory_region_size(mr);
+ ms->device_memory->required_memslots += memslots;
+ if (mdc->decide_memslots && memslots > 1) {
+ ms->device_memory->memslot_auto_decision_active++;
+ }
+
memory_region_add_subregion(&ms->device_memory->mr,
addr - ms->device_memory->base, mr);
trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr);
@@ -283,6 +405,7 @@ void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
{
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
+ const unsigned int memslots = memory_device_get_memslots(md);
MemoryRegion *mr;
/*
@@ -293,7 +416,12 @@ void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
g_assert(ms->device_memory);
memory_region_del_subregion(&ms->device_memory->mr, mr);
+
+ if (mdc->decide_memslots && memslots > 1) {
+ ms->device_memory->memslot_auto_decision_active--;
+ }
ms->device_memory->used_region_size -= memory_region_size(mr);
+ ms->device_memory->required_memslots -= memslots;
trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "",
mdc->get_addr(md));
}
@@ -313,6 +441,50 @@ uint64_t memory_device_get_region_size(const MemoryDeviceState *md,
return memory_region_size(mr);
}
+static void memory_devices_region_mod(MemoryListener *listener,
+ MemoryRegionSection *mrs, bool add)
+{
+ DeviceMemoryState *dms = container_of(listener, DeviceMemoryState,
+ listener);
+
+ if (!memory_region_is_ram(mrs->mr)) {
+ warn_report("Unexpected memory region mapped into device memory region.");
+ return;
+ }
+
+ /*
+ * The expectation is that each distinct RAM memory region section in
+ * our region for memory devices consumes exactly one memslot in KVM
+ * and in vhost. For vhost, this is true, except:
+ * * ROM memory regions don't consume a memslot. These get used very
+ * rarely for memory devices (R/O NVDIMMs).
+ * * Memslots without a fd (memory-backend-ram) don't necessarily
+ * consume a memslot. Such setups are quite rare and possibly bogus:
+ * the memory would be inaccessible by such vhost devices.
+ *
+ * So for vhost, in corner cases we might over-estimate the number of
+ * memslots that are currently used or that might still be reserved
+ * (required - used).
+ */
+ dms->used_memslots += add ? 1 : -1;
+
+ if (dms->used_memslots > dms->required_memslots) {
+ warn_report("Memory devices use more memory slots than indicated as required.");
+ }
+}
+
+static void memory_devices_region_add(MemoryListener *listener,
+ MemoryRegionSection *mrs)
+{
+ return memory_devices_region_mod(listener, mrs, true);
+}
+
+static void memory_devices_region_del(MemoryListener *listener,
+ MemoryRegionSection *mrs)
+{
+ return memory_devices_region_mod(listener, mrs, false);
+}
+
void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
{
g_assert(size);
@@ -322,8 +494,16 @@ void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory",
size);
+ address_space_init(&ms->device_memory->as, &ms->device_memory->mr,
+ "device-memory");
memory_region_add_subregion(get_system_memory(), ms->device_memory->base,
&ms->device_memory->mr);
+
+ /* Track the number of memslots used by memory devices. */
+ ms->device_memory->listener.region_add = memory_devices_region_add;
+ ms->device_memory->listener.region_del = memory_devices_region_del;
+ memory_listener_register(&ms->device_memory->listener,
+ &ms->device_memory->as);
}
static const TypeInfo memory_device_info = {
diff --git a/hw/virtio/vhost-stub.c b/hw/virtio/vhost-stub.c
index aa858ef..52d42ad 100644
--- a/hw/virtio/vhost-stub.c
+++ b/hw/virtio/vhost-stub.c
@@ -2,9 +2,14 @@
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-user.h"
-bool vhost_has_free_slot(void)
+unsigned int vhost_get_max_memslots(void)
{
- return true;
+ return UINT_MAX;
+}
+
+unsigned int vhost_get_free_memslots(void)
+{
+ return UINT_MAX;
}
bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 3766b41..68eb1f0 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -2327,19 +2327,6 @@ static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
return -ENOTSUP;
}
-static bool vhost_user_can_merge(struct vhost_dev *dev,
- uint64_t start1, uint64_t size1,
- uint64_t start2, uint64_t size2)
-{
- ram_addr_t offset;
- int mfd, rfd;
-
- (void)vhost_user_get_mr_data(start1, &offset, &mfd);
- (void)vhost_user_get_mr_data(start2, &offset, &rfd);
-
- return mfd == rfd;
-}
-
static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu)
{
VhostUserMsg msg;
@@ -2622,10 +2609,9 @@ vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id)
return 0;
}
-static bool vhost_user_mem_section_filter(struct vhost_dev *dev,
- MemoryRegionSection *section)
+static bool vhost_user_no_private_memslots(struct vhost_dev *dev)
{
- return memory_region_get_fd(section->mr) >= 0;
+ return true;
}
static int vhost_user_get_inflight_fd(struct vhost_dev *dev,
@@ -2868,6 +2854,7 @@ const VhostOps user_ops = {
.vhost_backend_init = vhost_user_backend_init,
.vhost_backend_cleanup = vhost_user_backend_cleanup,
.vhost_backend_memslots_limit = vhost_user_memslots_limit,
+ .vhost_backend_no_private_memslots = vhost_user_no_private_memslots,
.vhost_set_log_base = vhost_user_set_log_base,
.vhost_set_mem_table = vhost_user_set_mem_table,
.vhost_set_vring_addr = vhost_user_set_vring_addr,
@@ -2886,7 +2873,6 @@ const VhostOps user_ops = {
.vhost_set_vring_enable = vhost_user_set_vring_enable,
.vhost_requires_shm_log = vhost_user_requires_shm_log,
.vhost_migration_done = vhost_user_migration_done,
- .vhost_backend_can_merge = vhost_user_can_merge,
.vhost_net_set_mtu = vhost_user_net_set_mtu,
.vhost_set_iotlb_callback = vhost_user_set_iotlb_callback,
.vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg,
@@ -2894,7 +2880,6 @@ const VhostOps user_ops = {
.vhost_set_config = vhost_user_set_config,
.vhost_crypto_create_session = vhost_user_crypto_create_session,
.vhost_crypto_close_session = vhost_user_crypto_close_session,
- .vhost_backend_mem_section_filter = vhost_user_mem_section_filter,
.vhost_get_inflight_fd = vhost_user_get_inflight_fd,
.vhost_set_inflight_fd = vhost_user_set_inflight_fd,
.vhost_dev_start = vhost_user_dev_start,
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 89ff02a..819b2d8 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -1512,7 +1512,6 @@ const VhostOps vdpa_ops = {
.vhost_set_config = vhost_vdpa_set_config,
.vhost_requires_shm_log = NULL,
.vhost_migration_done = NULL,
- .vhost_backend_can_merge = NULL,
.vhost_net_set_mtu = NULL,
.vhost_set_iotlb_callback = NULL,
.vhost_send_device_iotlb_msg = NULL,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 9cfac40..9f37206 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -23,6 +23,7 @@
#include "qemu/log.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
+#include "hw/mem/memory-device.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "sysemu/dma.h"
@@ -45,20 +46,44 @@
static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;
+/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;
+
+/* Memslots used by backends that only support shared memslots (with an fd). */
+static unsigned int used_shared_memslots;
+
static QLIST_HEAD(, vhost_dev) vhost_devices =
QLIST_HEAD_INITIALIZER(vhost_devices);
-bool vhost_has_free_slot(void)
+unsigned int vhost_get_max_memslots(void)
+{
+ unsigned int max = UINT_MAX;
+ struct vhost_dev *hdev;
+
+ QLIST_FOREACH(hdev, &vhost_devices, entry) {
+ max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
+ }
+ return max;
+}
+
+unsigned int vhost_get_free_memslots(void)
{
- unsigned int slots_limit = ~0U;
+ unsigned int free = UINT_MAX;
struct vhost_dev *hdev;
QLIST_FOREACH(hdev, &vhost_devices, entry) {
unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
- slots_limit = MIN(slots_limit, r);
+ unsigned int cur_free;
+
+ if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
+ hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
+ cur_free = r - used_shared_memslots;
+ } else {
+ cur_free = r - used_memslots;
+ }
+ free = MIN(free, cur_free);
}
- return slots_limit > used_memslots;
+ return free;
}
static void vhost_dev_sync_region(struct vhost_dev *dev,
@@ -474,8 +499,7 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,
* vhost_section: identify sections needed for vhost access
*
* We only care about RAM sections here (where virtqueue and guest
- * internals accessed by virtio might live). If we find one we still
- * allow the backend to potentially filter it out of our list.
+ * internals accessed by virtio might live).
*/
static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
{
@@ -502,8 +526,16 @@ static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
return false;
}
- if (dev->vhost_ops->vhost_backend_mem_section_filter &&
- !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
+ /*
+ * Some backends (like vhost-user) can only handle memory regions
+ * that have an fd (can be mapped into a different process). Filter
+ * the ones without an fd out, if requested.
+ *
+ * TODO: we might have to limit to MAP_SHARED as well.
+ */
+ if (memory_region_get_fd(section->mr) < 0 &&
+ dev->vhost_ops->vhost_backend_no_private_memslots &&
+ dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
trace_vhost_reject_section(mr->name, 2);
return false;
}
@@ -568,7 +600,14 @@ static void vhost_commit(MemoryListener *listener)
dev->n_mem_sections * sizeof dev->mem->regions[0];
dev->mem = g_realloc(dev->mem, regions_size);
dev->mem->nregions = dev->n_mem_sections;
- used_memslots = dev->mem->nregions;
+
+ if (dev->vhost_ops->vhost_backend_no_private_memslots &&
+ dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
+ used_shared_memslots = dev->mem->nregions;
+ } else {
+ used_memslots = dev->mem->nregions;
+ }
+
for (i = 0; i < dev->n_mem_sections; i++) {
struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
struct MemoryRegionSection *mrs = dev->mem_sections + i;
@@ -668,7 +707,7 @@ static void vhost_region_add_section(struct vhost_dev *dev,
mrs_size, mrs_host);
}
- if (dev->n_tmp_sections) {
+ if (dev->n_tmp_sections && !section->unmergeable) {
/* Since we already have at least one section, lets see if
* this extends it; since we're scanning in order, we only
* have to look at the last one, and the FlatView that calls
@@ -701,11 +740,7 @@ static void vhost_region_add_section(struct vhost_dev *dev,
size_t offset = mrs_gpa - prev_gpa_start;
if (prev_host_start + offset == mrs_host &&
- section->mr == prev_sec->mr &&
- (!dev->vhost_ops->vhost_backend_can_merge ||
- dev->vhost_ops->vhost_backend_can_merge(dev,
- mrs_host, mrs_size,
- prev_host_start, prev_size))) {
+ section->mr == prev_sec->mr && !prev_sec->unmergeable) {
uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
need_add = false;
prev_sec->offset_within_address_space =
@@ -1400,6 +1435,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
VhostBackendType backend_type, uint32_t busyloop_timeout,
Error **errp)
{
+ unsigned int used, reserved, limit;
uint64_t features;
int i, r, n_initialized_vqs = 0;
@@ -1426,6 +1462,19 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
goto fail;
}
+ limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
+ if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
+ memory_devices_memslot_auto_decision_active()) {
+ error_setg(errp, "some memory device (like virtio-mem)"
+ " decided how many memory slots to use based on the overall"
+ " number of memory slots; this vhost backend would further"
+ " restricts the overall number of memory slots");
+ error_append_hint(errp, "Try plugging this vhost backend before"
+ " plugging such memory devices.\n");
+ r = -EINVAL;
+ goto fail;
+ }
+
for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
if (r < 0) {
@@ -1495,9 +1544,27 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
memory_listener_register(&hdev->memory_listener, &address_space_memory);
QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
- if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
- error_setg(errp, "vhost backend memory slots limit is less"
- " than current number of present memory slots");
+ /*
+ * The listener we registered properly updated the corresponding counter.
+ * So we can trust that these values are accurate.
+ */
+ if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
+ hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
+ used = used_shared_memslots;
+ } else {
+ used = used_memslots;
+ }
+ /*
+ * We assume that all reserved memslots actually require a real memslot
+ * in our vhost backend. This might not be true, for example, if the
+ * memslot would be ROM. If ever relevant, we can optimize for that --
+ * but we'll need additional information about the reservations.
+ */
+ reserved = memory_devices_get_reserved_memslots();
+ if (used + reserved > limit) {
+ error_setg(errp, "vhost backend memory slots limit (%d) is less"
+ " than current number of used (%d) and reserved (%d)"
+ " memory slots for memory devices.", limit, used, reserved);
r = -EINVAL;
goto fail_busyloop;
}
diff --git a/hw/virtio/virtio-mem-pci.c b/hw/virtio/virtio-mem-pci.c
index c4597e0..1b4e9a3 100644
--- a/hw/virtio/virtio-mem-pci.c
+++ b/hw/virtio/virtio-mem-pci.c
@@ -48,6 +48,25 @@ static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md,
return vmc->get_memory_region(vmem, errp);
}
+static void virtio_mem_pci_decide_memslots(MemoryDeviceState *md,
+ unsigned int limit)
+{
+ VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
+ VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
+
+ vmc->decide_memslots(vmem, limit);
+}
+
+static unsigned int virtio_mem_pci_get_memslots(MemoryDeviceState *md)
+{
+ VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
+ VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
+
+ return vmc->get_memslots(vmem);
+}
+
static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md,
Error **errp)
{
@@ -150,6 +169,8 @@ static void virtio_mem_pci_class_init(ObjectClass *klass, void *data)
mdc->set_addr = virtio_mem_pci_set_addr;
mdc->get_plugged_size = virtio_mem_pci_get_plugged_size;
mdc->get_memory_region = virtio_mem_pci_get_memory_region;
+ mdc->decide_memslots = virtio_mem_pci_decide_memslots;
+ mdc->get_memslots = virtio_mem_pci_get_memslots;
mdc->fill_device_info = virtio_mem_pci_fill_device_info;
mdc->get_min_alignment = virtio_mem_pci_get_min_alignment;
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index da5b09c..9dc3c61 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -67,6 +67,13 @@ static uint32_t virtio_mem_default_thp_size(void)
}
/*
+ * The minimum memslot size depends on this setting ("sane default"), the
+ * device block size, and the memory backend page size. The last (or single)
+ * memslot might be smaller than this constant.
+ */
+#define VIRTIO_MEM_MIN_MEMSLOT_SIZE (1 * GiB)
+
+/*
* We want to have a reasonable default block size such that
* 1. We avoid splitting THPs when unplugging memory, which degrades
* performance.
@@ -177,10 +184,10 @@ static bool virtio_mem_is_busy(void)
return migration_in_incoming_postcopy() || !migration_is_idle();
}
-typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg,
+typedef int (*virtio_mem_range_cb)(VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size);
-static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
+static int virtio_mem_for_each_unplugged_range(VirtIOMEM *vmem, void *arg,
virtio_mem_range_cb cb)
{
unsigned long first_zero_bit, last_zero_bit;
@@ -204,7 +211,7 @@ static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
return ret;
}
-static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg,
+static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg,
virtio_mem_range_cb cb)
{
unsigned long first_bit, last_bit;
@@ -483,6 +490,96 @@ static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
return true;
}
+static void virtio_mem_activate_memslot(VirtIOMEM *vmem, unsigned int idx)
+{
+ const uint64_t memslot_offset = idx * vmem->memslot_size;
+
+ assert(vmem->memslots);
+
+ /*
+ * Instead of enabling/disabling memslots, we add/remove them. This should
+ * make address space updates faster, because we don't have to loop over
+ * many disabled subregions.
+ */
+ if (memory_region_is_mapped(&vmem->memslots[idx])) {
+ return;
+ }
+ memory_region_add_subregion(vmem->mr, memslot_offset, &vmem->memslots[idx]);
+}
+
+static void virtio_mem_deactivate_memslot(VirtIOMEM *vmem, unsigned int idx)
+{
+ assert(vmem->memslots);
+
+ if (!memory_region_is_mapped(&vmem->memslots[idx])) {
+ return;
+ }
+ memory_region_del_subregion(vmem->mr, &vmem->memslots[idx]);
+}
+
+static void virtio_mem_activate_memslots_to_plug(VirtIOMEM *vmem,
+ uint64_t offset, uint64_t size)
+{
+ const unsigned int start_idx = offset / vmem->memslot_size;
+ const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
+ vmem->memslot_size;
+ unsigned int idx;
+
+ if (!vmem->dynamic_memslots) {
+ return;
+ }
+
+ /* Activate all involved memslots in a single transaction. */
+ memory_region_transaction_begin();
+ for (idx = start_idx; idx < end_idx; idx++) {
+ virtio_mem_activate_memslot(vmem, idx);
+ }
+ memory_region_transaction_commit();
+}
+
+static void virtio_mem_deactivate_unplugged_memslots(VirtIOMEM *vmem,
+ uint64_t offset,
+ uint64_t size)
+{
+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
+ const unsigned int start_idx = offset / vmem->memslot_size;
+ const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
+ vmem->memslot_size;
+ unsigned int idx;
+
+ if (!vmem->dynamic_memslots) {
+ return;
+ }
+
+ /* Deactivate all memslots with unplugged blocks in a single transaction. */
+ memory_region_transaction_begin();
+ for (idx = start_idx; idx < end_idx; idx++) {
+ const uint64_t memslot_offset = idx * vmem->memslot_size;
+ uint64_t memslot_size = vmem->memslot_size;
+
+ /* The size of the last memslot might be smaller. */
+ if (idx == vmem->nb_memslots - 1) {
+ memslot_size = region_size - memslot_offset;
+ }
+
+ /*
+ * Partially covered memslots might still have some blocks plugged and
+ * have to remain active if that's the case.
+ */
+ if (offset > memslot_offset ||
+ offset + size < memslot_offset + memslot_size) {
+ const uint64_t gpa = vmem->addr + memslot_offset;
+
+ if (!virtio_mem_is_range_unplugged(vmem, gpa, memslot_size)) {
+ continue;
+ }
+ }
+
+ virtio_mem_deactivate_memslot(vmem, idx);
+ }
+ memory_region_transaction_commit();
+}
+
static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
uint64_t size, bool plug)
{
@@ -500,6 +597,8 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
}
virtio_mem_notify_unplug(vmem, offset, size);
virtio_mem_set_range_unplugged(vmem, start_gpa, size);
+ /* Deactivate completely unplugged memslots after updating the state. */
+ virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
return 0;
}
@@ -527,7 +626,20 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
}
if (!ret) {
+ /*
+ * Activate before notifying and rollback in case of any errors.
+ *
+ * When activating a not-yet-active memslot, memory notifiers will get
+ * notified about the added memory region and can register with the
+ * RamDiscardManager; this will traverse all plugged blocks and skip the
+ * blocks we are plugging here. The following notification will inform
+ * registered listeners about the blocks we're plugging.
+ */
+ virtio_mem_activate_memslots_to_plug(vmem, offset, size);
ret = virtio_mem_notify_plug(vmem, offset, size);
+ if (ret) {
+ virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
+ }
}
if (ret) {
/* Could be preallocation or a notifier populated memory. */
@@ -620,6 +732,7 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
static int virtio_mem_unplug_all(VirtIOMEM *vmem)
{
+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
RAMBlock *rb = vmem->memdev->mr.ram_block;
if (vmem->size) {
@@ -634,6 +747,9 @@ static int virtio_mem_unplug_all(VirtIOMEM *vmem)
bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
vmem->size = 0;
notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
+
+ /* Deactivate all memslots after updating the state. */
+ virtio_mem_deactivate_unplugged_memslots(vmem, 0, region_size);
}
trace_virtio_mem_unplugged_all();
@@ -790,6 +906,49 @@ static void virtio_mem_system_reset(void *opaque)
virtio_mem_unplug_all(vmem);
}
+static void virtio_mem_prepare_mr(VirtIOMEM *vmem)
+{
+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
+
+ assert(!vmem->mr && vmem->dynamic_memslots);
+ vmem->mr = g_new0(MemoryRegion, 1);
+ memory_region_init(vmem->mr, OBJECT(vmem), "virtio-mem",
+ region_size);
+ vmem->mr->align = memory_region_get_alignment(&vmem->memdev->mr);
+}
+
+static void virtio_mem_prepare_memslots(VirtIOMEM *vmem)
+{
+ const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
+ unsigned int idx;
+
+ g_assert(!vmem->memslots && vmem->nb_memslots && vmem->dynamic_memslots);
+ vmem->memslots = g_new0(MemoryRegion, vmem->nb_memslots);
+
+ /* Initialize our memslots, but don't map them yet. */
+ for (idx = 0; idx < vmem->nb_memslots; idx++) {
+ const uint64_t memslot_offset = idx * vmem->memslot_size;
+ uint64_t memslot_size = vmem->memslot_size;
+ char name[20];
+
+ /* The size of the last memslot might be smaller. */
+ if (idx == vmem->nb_memslots - 1) {
+ memslot_size = region_size - memslot_offset;
+ }
+
+ snprintf(name, sizeof(name), "memslot-%u", idx);
+ memory_region_init_alias(&vmem->memslots[idx], OBJECT(vmem), name,
+ &vmem->memdev->mr, memslot_offset,
+ memslot_size);
+ /*
+ * We want to be able to atomically and efficiently activate/deactivate
+ * individual memslots without affecting adjacent memslots in memory
+ * notifiers.
+ */
+ memory_region_set_unmergeable(&vmem->memslots[idx], true);
+ }
+}
+
static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
{
MachineState *ms = MACHINE(qdev_get_machine());
@@ -861,6 +1020,14 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
+ if (vmem->dynamic_memslots &&
+ vmem->unplugged_inaccessible != ON_OFF_AUTO_ON) {
+ error_setg(errp, "'%s' property set to 'on' requires '%s' to be 'on'",
+ VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP,
+ VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
+ return;
+ }
+
/*
* If the block size wasn't configured by the user, use a sane default. This
* allows using hugetlbfs backends of any page size without manual
@@ -930,6 +1097,25 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
+ /*
+ * With "dynamic-memslots=off" (old behavior) we always map the whole
+ * RAM memory region directly.
+ */
+ if (vmem->dynamic_memslots) {
+ if (!vmem->mr) {
+ virtio_mem_prepare_mr(vmem);
+ }
+ if (vmem->nb_memslots <= 1) {
+ vmem->nb_memslots = 1;
+ vmem->memslot_size = memory_region_size(&vmem->memdev->mr);
+ }
+ if (!vmem->memslots) {
+ virtio_mem_prepare_memslots(vmem);
+ }
+ } else {
+ assert(!vmem->mr && !vmem->nb_memslots && !vmem->memslots);
+ }
+
host_memory_backend_set_mapped(vmem->memdev, true);
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
if (vmem->early_migration) {
@@ -969,7 +1155,7 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
ram_block_coordinated_discard_require(false);
}
-static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg,
+static int virtio_mem_discard_range_cb(VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size)
{
RAMBlock *rb = vmem->memdev->mr.ram_block;
@@ -984,13 +1170,32 @@ static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
virtio_mem_discard_range_cb);
}
-static int virtio_mem_post_load(void *opaque, int version_id)
+static int virtio_mem_activate_memslot_range_cb(VirtIOMEM *vmem, void *arg,
+ uint64_t offset, uint64_t size)
+{
+ virtio_mem_activate_memslots_to_plug(vmem, offset, size);
+ return 0;
+}
+
+static int virtio_mem_post_load_bitmap(VirtIOMEM *vmem)
{
- VirtIOMEM *vmem = VIRTIO_MEM(opaque);
RamDiscardListener *rdl;
int ret;
/*
+ * We restored the bitmap and updated the requested size; activate all
+ * memslots (so listeners register) before notifying about plugged blocks.
+ */
+ if (vmem->dynamic_memslots) {
+ /*
+ * We don't expect any active memslots at this point to deactivate: no
+ * memory was plugged on the migration destination.
+ */
+ virtio_mem_for_each_plugged_range(vmem, NULL,
+ virtio_mem_activate_memslot_range_cb);
+ }
+
+ /*
* We started out with all memory discarded and our memory region is mapped
* into an address space. Replay, now that we updated the bitmap.
*/
@@ -1001,6 +1206,20 @@ static int virtio_mem_post_load(void *opaque, int version_id)
return ret;
}
}
+ return 0;
+}
+
+static int virtio_mem_post_load(void *opaque, int version_id)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(opaque);
+ int ret;
+
+ if (!vmem->early_migration) {
+ ret = virtio_mem_post_load_bitmap(vmem);
+ if (ret) {
+ return ret;
+ }
+ }
/*
* If shared RAM is migrated using the file content and not using QEMU,
@@ -1021,7 +1240,7 @@ static int virtio_mem_post_load(void *opaque, int version_id)
return virtio_mem_restore_unplugged(vmem);
}
-static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg,
+static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size)
{
void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
@@ -1043,7 +1262,7 @@ static int virtio_mem_post_load_early(void *opaque, int version_id)
int ret;
if (!vmem->prealloc) {
- return 0;
+ goto post_load_bitmap;
}
/*
@@ -1051,7 +1270,7 @@ static int virtio_mem_post_load_early(void *opaque, int version_id)
* don't mess with preallocation and postcopy.
*/
if (migrate_ram_is_ignored(rb)) {
- return 0;
+ goto post_load_bitmap;
}
/*
@@ -1084,7 +1303,10 @@ static int virtio_mem_post_load_early(void *opaque, int version_id)
return -EBUSY;
}
}
- return 0;
+
+post_load_bitmap:
+ /* Finally, update any other state to be consistent with the new bitmap. */
+ return virtio_mem_post_load_bitmap(vmem);
}
typedef struct VirtIOMEMMigSanityChecks {
@@ -1235,11 +1457,79 @@ static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
if (!vmem->memdev) {
error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
return NULL;
+ } else if (vmem->dynamic_memslots) {
+ if (!vmem->mr) {
+ virtio_mem_prepare_mr(vmem);
+ }
+ return vmem->mr;
}
return &vmem->memdev->mr;
}
+static void virtio_mem_decide_memslots(VirtIOMEM *vmem, unsigned int limit)
+{
+ uint64_t region_size, memslot_size, min_memslot_size;
+ unsigned int memslots;
+ RAMBlock *rb;
+
+ if (!vmem->dynamic_memslots) {
+ return;
+ }
+
+ /* We're called exactly once, before realizing the device. */
+ assert(!vmem->nb_memslots);
+
+ /* If realizing the device will fail, just assume a single memslot. */
+ if (limit <= 1 || !vmem->memdev || !vmem->memdev->mr.ram_block) {
+ vmem->nb_memslots = 1;
+ return;
+ }
+
+ rb = vmem->memdev->mr.ram_block;
+ region_size = memory_region_size(&vmem->memdev->mr);
+
+ /*
+ * Determine the default block size now, to determine the minimum memslot
+ * size. We want the minimum slot size to be at least the device block size.
+ */
+ if (!vmem->block_size) {
+ vmem->block_size = virtio_mem_default_block_size(rb);
+ }
+ /* If realizing the device will fail, just assume a single memslot. */
+ if (vmem->block_size < qemu_ram_pagesize(rb) ||
+ !QEMU_IS_ALIGNED(region_size, vmem->block_size)) {
+ vmem->nb_memslots = 1;
+ return;
+ }
+
+ /*
+ * All memslots except the last one have a reasonable minimum size, and
+ * all memslot sizes are aligned to the device block size.
+ */
+ memslot_size = QEMU_ALIGN_UP(region_size / limit, vmem->block_size);
+ min_memslot_size = MAX(vmem->block_size, VIRTIO_MEM_MIN_MEMSLOT_SIZE);
+ memslot_size = MAX(memslot_size, min_memslot_size);
+
+ memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;
+ if (memslots != 1) {
+ vmem->memslot_size = memslot_size;
+ }
+ vmem->nb_memslots = memslots;
+}
+
+static unsigned int virtio_mem_get_memslots(VirtIOMEM *vmem)
+{
+ if (!vmem->dynamic_memslots) {
+ /* Exactly one static RAM memory region. */
+ return 1;
+ }
+
+ /* We're called after being instructed to make a decision. */
+ g_assert(vmem->nb_memslots);
+ return vmem->nb_memslots;
+}
+
static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
Notifier *notifier)
{
@@ -1377,6 +1667,21 @@ static void virtio_mem_instance_init(Object *obj)
NULL, NULL);
}
+static void virtio_mem_instance_finalize(Object *obj)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(obj);
+
+ /*
+ * Note: the core already dropped the references on all memory regions
+ * (it's passed as the owner to memory_region_init_*()) and finalized
+ * these objects. We can simply free the memory.
+ */
+ g_free(vmem->memslots);
+ vmem->memslots = NULL;
+ g_free(vmem->mr);
+ vmem->mr = NULL;
+}
+
static Property virtio_mem_properties[] = {
DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
@@ -1389,6 +1694,8 @@ static Property virtio_mem_properties[] = {
#endif
DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
early_migration, true),
+ DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM,
+ dynamic_memslots, false),
DEFINE_PROP_END_OF_LIST(),
};
@@ -1556,6 +1863,8 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data)
vmc->fill_device_info = virtio_mem_fill_device_info;
vmc->get_memory_region = virtio_mem_get_memory_region;
+ vmc->decide_memslots = virtio_mem_decide_memslots;
+ vmc->get_memslots = virtio_mem_get_memslots;
vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
vmc->unplug_request_check = virtio_mem_unplug_request_check;
@@ -1573,6 +1882,7 @@ static const TypeInfo virtio_mem_info = {
.parent = TYPE_VIRTIO_DEVICE,
.instance_size = sizeof(VirtIOMEM),
.instance_init = virtio_mem_instance_init,
+ .instance_finalize = virtio_mem_instance_finalize,
.class_init = virtio_mem_class_init,
.class_size = sizeof(VirtIOMEMClass),
.interfaces = (InterfaceInfo[]) {
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 605b160..30c376a 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -83,6 +83,21 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
ram_addr_t qemu_ram_addr_from_host(void *ptr);
ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
RAMBlock *qemu_ram_block_by_name(const char *name);
+
+/*
+ * Translates a host ptr back to a RAMBlock and an offset in that RAMBlock.
+ *
+ * @ptr: The host pointer to translate.
+ * @round_offset: Whether to round the offset down to a target page boundary
+ * @offset: Will be set to the offset within the returned RAMBlock.
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ *
+ * By the time this function returns, the returned pointer is not protected
+ * by RCU anymore. If the caller is not within an RCU critical section and
+ * does not hold the iothread lock, it must have other means of protecting the
+ * pointer, such as a reference to the memory region that owns the RAMBlock.
+ */
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
ram_addr_t *offset);
ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index c99842d..653a32e 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -95,6 +95,7 @@ struct ReservedRegion {
* relative to the region's address space
* @readonly: writes to this section are ignored
* @nonvolatile: this section is non-volatile
+ * @unmergeable: this section should not get merged with adjacent sections
*/
struct MemoryRegionSection {
Int128 size;
@@ -104,6 +105,7 @@ struct MemoryRegionSection {
hwaddr offset_within_address_space;
bool readonly;
bool nonvolatile;
+ bool unmergeable;
};
typedef struct IOMMUTLBEntry IOMMUTLBEntry;
@@ -599,8 +601,9 @@ typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque);
* populated (consuming memory), to be used/accessed by the VM.
*
* A #RamDiscardManager can only be set for a RAM #MemoryRegion while the
- * #MemoryRegion isn't mapped yet; it cannot change while the #MemoryRegion is
- * mapped.
+ * #MemoryRegion isn't mapped into an address space yet (either directly
+ * or via an alias); it cannot change while the #MemoryRegion is
+ * mapped into an address space.
*
* The #RamDiscardManager is intended to be used by technologies that are
* incompatible with discarding of RAM (e.g., VFIO, which may pin all
@@ -772,6 +775,7 @@ struct MemoryRegion {
bool nonvolatile;
bool rom_device;
bool flush_coalesced_mmio;
+ bool unmergeable;
uint8_t dirty_log_mask;
bool is_iommu;
RAMBlock *ram_block;
@@ -2350,6 +2354,25 @@ void memory_region_set_size(MemoryRegion *mr, uint64_t size);
void memory_region_set_alias_offset(MemoryRegion *mr,
hwaddr offset);
+/*
+ * memory_region_set_unmergeable: Set a memory region unmergeable
+ *
+ * Mark a memory region unmergeable, resulting in the memory region (or
+ * everything contained in a memory region container) not getting merged when
+ * simplifying the address space and notifying memory listeners. Consequently,
+ * memory listeners will never get notified about ranges that are larger than
+ * the original memory regions.
+ *
+ * This is primarily useful when multiple aliases to a RAM memory region are
+ * mapped into a memory region container, and updates (e.g., enable/disable or
+ * map/unmap) of individual memory region aliases are not supposed to affect
+ * other memory regions in the same container.
+ *
+ * @mr: the #MemoryRegion to be updated
+ * @unmergeable: whether to mark the #MemoryRegion unmergeable
+ */
+void memory_region_set_unmergeable(MemoryRegion *mr, bool unmergeable);
+
/**
* memory_region_present: checks if an address relative to a @container
* translates into #MemoryRegion within @container
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 55a64a1..43a56dc 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -297,15 +297,27 @@ struct MachineClass {
* DeviceMemoryState:
* @base: address in guest physical address space where the memory
* address space for memory devices starts
- * @mr: address space container for memory devices
+ * @mr: memory region container for memory devices
+ * @as: address space for memory devices
+ * @listener: memory listener used to track used memslots in the address space
* @dimm_size: the sum of plugged DIMMs' sizes
* @used_region_size: the part of @mr already used by memory devices
+ * @required_memslots: the number of memslots required by memory devices
+ * @used_memslots: the number of memslots currently used by memory devices
+ * @memslot_auto_decision_active: whether any plugged memory device
+ * automatically decided to use more than
+ * one memslot
*/
typedef struct DeviceMemoryState {
hwaddr base;
MemoryRegion mr;
+ AddressSpace as;
+ MemoryListener listener;
uint64_t dimm_size;
uint64_t used_region_size;
+ unsigned int required_memslots;
+ unsigned int used_memslots;
+ unsigned int memslot_auto_decision_active;
} DeviceMemoryState;
/**
diff --git a/include/hw/mem/memory-device.h b/include/hw/mem/memory-device.h
index 48d2611..3354d6c 100644
--- a/include/hw/mem/memory-device.h
+++ b/include/hw/mem/memory-device.h
@@ -14,6 +14,7 @@
#define MEMORY_DEVICE_H
#include "hw/qdev-core.h"
+#include "qemu/typedefs.h"
#include "qapi/qapi-types-machine.h"
#include "qom/object.h"
@@ -41,6 +42,17 @@ typedef struct MemoryDeviceState MemoryDeviceState;
* successive memory regions are used, a covering memory region has to
* be provided. Scattered memory regions are not supported for single
* devices.
+ *
+ * The device memory region returned via @get_memory_region may either be a
+ * single RAM memory region or a memory region container with subregions
+ * that are RAM memory regions or aliases to RAM memory regions. Other
+ * memory regions or subregions are not supported.
+ *
+ * If the device memory region returned via @get_memory_region is a
+ * memory region container, subregions may be (un)mapped dynamically
+ * as long as the number of memslots returned by @get_memslots() won't
+ * be exceeded and as long as all memory regions are of the same kind (e.g.,
+ * all RAM or all ROM).
*/
struct MemoryDeviceClass {
/* private */
@@ -89,6 +101,28 @@ struct MemoryDeviceClass {
MemoryRegion *(*get_memory_region)(MemoryDeviceState *md, Error **errp);
/*
+ * Optional: Instruct the memory device to decide how many memory slots
+ * it requires, not exceeding the given limit.
+ *
+ * Called exactly once when pre-plugging the memory device, before
+ * querying the number of memslots using @get_memslots the first time.
+ */
+ void (*decide_memslots)(MemoryDeviceState *md, unsigned int limit);
+
+ /*
+ * Optional for memory devices that require only a single memslot,
+ * required for all other memory devices: Return the number of memslots
+ * (distinct RAM memory regions in the device memory region) that are
+ * required by the device.
+ *
+ * If this function is not implemented, the assumption is "1".
+ *
+ * Called when (un)plugging the memory device, to check if the requirements
+ * can be satisfied, and to do proper accounting.
+ */
+ unsigned int (*get_memslots)(MemoryDeviceState *md);
+
+ /*
* Optional: Return the desired minimum alignment of the device in guest
* physical address space. The final alignment is computed based on this
* alignment and the alignment requirements of the memory region.
@@ -105,8 +139,31 @@ struct MemoryDeviceClass {
MemoryDeviceInfo *info);
};
+/*
+ * Traditionally, KVM/vhost in many setups supported 509 memslots, whereby
+ * 253 memslots were "reserved" for boot memory and other devices (such
+ * as PCI BARs, which can get mapped dynamically) and 256 memslots were
+ * dedicated for DIMMs. These magic numbers worked reliably in the past.
+ *
+ * Further, using many memslots can negatively affect performance, so setting
+ * the soft-limit of memslots used by memory devices to the traditional
+ * DIMM limit of 256 sounds reasonable.
+ *
+ * If we have fewer than 509 memslots, we will instruct memory devices that
+ * support automatically deciding how many memslots to use to only use a single
+ * one.
+ *
+ * Hotplugging vhost devices with at least 509 memslots is not expected to
+ * cause problems, not even when devices automatically decided how many memslots
+ * to use.
+ */
+#define MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT 256
+#define MEMORY_DEVICES_SAFE_MAX_MEMSLOTS 509
+
MemoryDeviceInfoList *qmp_memory_device_list(void);
uint64_t get_plugged_memory_size(void);
+unsigned int memory_devices_get_reserved_memslots(void);
+bool memory_devices_memslot_auto_decision_active(void);
void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
const uint64_t *legacy_align, Error **errp);
void memory_device_plug(MemoryDeviceState *md, MachineState *ms);
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index 1860b54..96ccc18 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -86,9 +86,6 @@ typedef int (*vhost_set_vring_enable_op)(struct vhost_dev *dev,
typedef bool (*vhost_requires_shm_log_op)(struct vhost_dev *dev);
typedef int (*vhost_migration_done_op)(struct vhost_dev *dev,
char *mac_addr);
-typedef bool (*vhost_backend_can_merge_op)(struct vhost_dev *dev,
- uint64_t start1, uint64_t size1,
- uint64_t start2, uint64_t size2);
typedef int (*vhost_vsock_set_guest_cid_op)(struct vhost_dev *dev,
uint64_t guest_cid);
typedef int (*vhost_vsock_set_running_op)(struct vhost_dev *dev, int start);
@@ -108,8 +105,7 @@ typedef int (*vhost_crypto_create_session_op)(struct vhost_dev *dev,
typedef int (*vhost_crypto_close_session_op)(struct vhost_dev *dev,
uint64_t session_id);
-typedef bool (*vhost_backend_mem_section_filter_op)(struct vhost_dev *dev,
- MemoryRegionSection *section);
+typedef bool (*vhost_backend_no_private_memslots_op)(struct vhost_dev *dev);
typedef int (*vhost_get_inflight_fd_op)(struct vhost_dev *dev,
uint16_t queue_size,
@@ -138,6 +134,7 @@ typedef struct VhostOps {
vhost_backend_init vhost_backend_init;
vhost_backend_cleanup vhost_backend_cleanup;
vhost_backend_memslots_limit vhost_backend_memslots_limit;
+ vhost_backend_no_private_memslots_op vhost_backend_no_private_memslots;
vhost_net_set_backend_op vhost_net_set_backend;
vhost_net_set_mtu_op vhost_net_set_mtu;
vhost_scsi_set_endpoint_op vhost_scsi_set_endpoint;
@@ -163,7 +160,6 @@ typedef struct VhostOps {
vhost_set_vring_enable_op vhost_set_vring_enable;
vhost_requires_shm_log_op vhost_requires_shm_log;
vhost_migration_done_op vhost_migration_done;
- vhost_backend_can_merge_op vhost_backend_can_merge;
vhost_vsock_set_guest_cid_op vhost_vsock_set_guest_cid;
vhost_vsock_set_running_op vhost_vsock_set_running;
vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
@@ -172,7 +168,6 @@ typedef struct VhostOps {
vhost_set_config_op vhost_set_config;
vhost_crypto_create_session_op vhost_crypto_create_session;
vhost_crypto_close_session_op vhost_crypto_close_session;
- vhost_backend_mem_section_filter_op vhost_backend_mem_section_filter;
vhost_get_inflight_fd_op vhost_get_inflight_fd;
vhost_set_inflight_fd_op vhost_set_inflight_fd;
vhost_dev_start_op vhost_dev_start;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 6a173cb..c7e54676 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -315,7 +315,8 @@ uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
*/
void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
uint64_t features);
-bool vhost_has_free_slot(void);
+unsigned int vhost_get_max_memslots(void);
+unsigned int vhost_get_free_memslots(void);
int vhost_net_set_backend(struct vhost_dev *hdev,
struct vhost_vring_file *file);
diff --git a/include/hw/virtio/virtio-mem.h b/include/hw/virtio/virtio-mem.h
index ab0fe2b..5f5b02b 100644
--- a/include/hw/virtio/virtio-mem.h
+++ b/include/hw/virtio/virtio-mem.h
@@ -33,6 +33,7 @@ OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass,
#define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "unplugged-inaccessible"
#define VIRTIO_MEM_EARLY_MIGRATION_PROP "x-early-migration"
#define VIRTIO_MEM_PREALLOC_PROP "prealloc"
+#define VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP "dynamic-memslots"
struct VirtIOMEM {
VirtIODevice parent_obj;
@@ -44,7 +45,28 @@ struct VirtIOMEM {
int32_t bitmap_size;
unsigned long *bitmap;
- /* assigned memory backend and memory region */
+ /*
+ * With "dynamic-memslots=on": Device memory region in which we dynamically
+ * map the memslots.
+ */
+ MemoryRegion *mr;
+
+ /*
+ * With "dynamic-memslots=on": The individual memslots (aliases into the
+ * memory backend).
+ */
+ MemoryRegion *memslots;
+
+ /* With "dynamic-memslots=on": The total number of memslots. */
+ uint16_t nb_memslots;
+
+ /*
+ * With "dynamic-memslots=on": Size of one memslot (the size of the
+ * last one can differ).
+ */
+ uint64_t memslot_size;
+
+ /* Assigned memory backend with the RAM memory region. */
HostMemoryBackend *memdev;
/* NUMA node */
@@ -82,6 +104,12 @@ struct VirtIOMEM {
*/
bool early_migration;
+ /*
+ * Whether we dynamically map (multiple, if possible) memslots instead of
+ * statically mapping the whole RAM memory region.
+ */
+ bool dynamic_memslots;
+
/* notifiers to notify when "size" changes */
NotifierList size_change_notifiers;
@@ -96,6 +124,8 @@ struct VirtIOMEMClass {
/* public */
void (*fill_device_info)(const VirtIOMEM *vmen, VirtioMEMDeviceInfo *vi);
MemoryRegion *(*get_memory_region)(VirtIOMEM *vmem, Error **errp);
+ void (*decide_memslots)(VirtIOMEM *vmem, unsigned int limit);
+ unsigned int (*get_memslots)(VirtIOMEM *vmem);
void (*add_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
void (*remove_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
void (*unplug_request_check)(VirtIOMEM *vmem, Error **errp);
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index ee9025f..97a8a4f 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -215,7 +215,8 @@ typedef struct KVMRouteChange {
/* external API */
-bool kvm_has_free_slot(MachineState *ms);
+unsigned int kvm_get_max_memslots(void);
+unsigned int kvm_get_free_memslots(void);
bool kvm_has_sync_mmu(void);
int kvm_has_vcpu_events(void);
int kvm_has_robust_singlestep(void);
@@ -552,7 +553,6 @@ int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source);
*/
int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target);
struct ppc_radix_page_info *kvm_get_radix_page_info(void);
-int kvm_get_max_memslots(void);
/* Notify resamplefd for EOI of specific interrupts. */
void kvm_resample_fd_notify(int gsi);
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index a5b9122..075939a 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -40,6 +40,7 @@ typedef struct KVMMemoryUpdate {
typedef struct KVMMemoryListener {
MemoryListener listener;
KVMSlot *slots;
+ unsigned int nr_used_slots;
int as_id;
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
diff --git a/stubs/qmp_memory_device.c b/stubs/memory_device.c
index e75cac6..15fd93f 100644
--- a/stubs/qmp_memory_device.c
+++ b/stubs/memory_device.c
@@ -10,3 +10,13 @@ uint64_t get_plugged_memory_size(void)
{
return (uint64_t)-1;
}
+
+unsigned int memory_devices_get_reserved_memslots(void)
+{
+ return 0;
+}
+
+bool memory_devices_memslot_auto_decision_active(void)
+{
+ return false;
+}
diff --git a/stubs/meson.build b/stubs/meson.build
index ef6e39a..cde4497 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -32,7 +32,7 @@ stub_ss.add(files('monitor.c'))
stub_ss.add(files('monitor-core.c'))
stub_ss.add(files('physmem.c'))
stub_ss.add(files('qemu-timer-notify-cb.c'))
-stub_ss.add(files('qmp_memory_device.c'))
+stub_ss.add(files('memory_device.c'))
stub_ss.add(files('qmp-command-available.c'))
stub_ss.add(files('qmp-quit.c'))
stub_ss.add(files('qtest.c'))
diff --git a/system/memory.c b/system/memory.c
index fa1c99f..a800fbc 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -224,6 +224,7 @@ struct FlatRange {
bool romd_mode;
bool readonly;
bool nonvolatile;
+ bool unmergeable;
};
#define FOR_EACH_FLAT_RANGE(var, view) \
@@ -240,6 +241,7 @@ section_from_flat_range(FlatRange *fr, FlatView *fv)
.offset_within_address_space = int128_get64(fr->addr.start),
.readonly = fr->readonly,
.nonvolatile = fr->nonvolatile,
+ .unmergeable = fr->unmergeable,
};
}
@@ -250,7 +252,8 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b)
&& a->offset_in_region == b->offset_in_region
&& a->romd_mode == b->romd_mode
&& a->readonly == b->readonly
- && a->nonvolatile == b->nonvolatile;
+ && a->nonvolatile == b->nonvolatile
+ && a->unmergeable == b->unmergeable;
}
static FlatView *flatview_new(MemoryRegion *mr_root)
@@ -323,7 +326,8 @@ static bool can_merge(FlatRange *r1, FlatRange *r2)
&& r1->dirty_log_mask == r2->dirty_log_mask
&& r1->romd_mode == r2->romd_mode
&& r1->readonly == r2->readonly
- && r1->nonvolatile == r2->nonvolatile;
+ && r1->nonvolatile == r2->nonvolatile
+ && !r1->unmergeable && !r2->unmergeable;
}
/* Attempt to simplify a view by merging adjacent ranges */
@@ -599,7 +603,8 @@ static void render_memory_region(FlatView *view,
Int128 base,
AddrRange clip,
bool readonly,
- bool nonvolatile)
+ bool nonvolatile,
+ bool unmergeable)
{
MemoryRegion *subregion;
unsigned i;
@@ -616,6 +621,7 @@ static void render_memory_region(FlatView *view,
int128_addto(&base, int128_make64(mr->addr));
readonly |= mr->readonly;
nonvolatile |= mr->nonvolatile;
+ unmergeable |= mr->unmergeable;
tmp = addrrange_make(base, mr->size);
@@ -629,14 +635,14 @@ static void render_memory_region(FlatView *view,
int128_subfrom(&base, int128_make64(mr->alias->addr));
int128_subfrom(&base, int128_make64(mr->alias_offset));
render_memory_region(view, mr->alias, base, clip,
- readonly, nonvolatile);
+ readonly, nonvolatile, unmergeable);
return;
}
/* Render subregions in priority order. */
QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
render_memory_region(view, subregion, base, clip,
- readonly, nonvolatile);
+ readonly, nonvolatile, unmergeable);
}
if (!mr->terminates) {
@@ -652,6 +658,7 @@ static void render_memory_region(FlatView *view,
fr.romd_mode = mr->romd_mode;
fr.readonly = readonly;
fr.nonvolatile = nonvolatile;
+ fr.unmergeable = unmergeable;
/* Render the region itself into any gaps left by the current view. */
for (i = 0; i < view->nr && int128_nz(remain); ++i) {
@@ -753,7 +760,7 @@ static FlatView *generate_memory_topology(MemoryRegion *mr)
if (mr) {
render_memory_region(view, mr, int128_zero(),
addrrange_make(int128_zero(), int128_2_64()),
- false, false);
+ false, false, false);
}
flatview_simplify(view);
@@ -2085,7 +2092,7 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr)
RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr)
{
- if (!memory_region_is_mapped(mr) || !memory_region_is_ram(mr)) {
+ if (!memory_region_is_ram(mr)) {
return NULL;
}
return mr->rdm;
@@ -2094,7 +2101,7 @@ RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr)
void memory_region_set_ram_discard_manager(MemoryRegion *mr,
RamDiscardManager *rdm)
{
- g_assert(memory_region_is_ram(mr) && !memory_region_is_mapped(mr));
+ g_assert(memory_region_is_ram(mr));
g_assert(!rdm || !mr->rdm);
mr->rdm = rdm;
}
@@ -2755,6 +2762,18 @@ void memory_region_set_alias_offset(MemoryRegion *mr, hwaddr offset)
memory_region_transaction_commit();
}
+void memory_region_set_unmergeable(MemoryRegion *mr, bool unmergeable)
+{
+ if (unmergeable == mr->unmergeable) {
+ return;
+ }
+
+ memory_region_transaction_begin();
+ mr->unmergeable = unmergeable;
+ memory_region_update_pending |= mr->enabled;
+ memory_region_transaction_commit();
+}
+
uint64_t memory_region_get_alignment(const MemoryRegion *mr)
{
return mr->align;
diff --git a/system/physmem.c b/system/physmem.c
index edc3ed8..fc2b0fe 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2221,23 +2221,6 @@ ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
return res;
}
-/*
- * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
- * in that RAMBlock.
- *
- * ptr: Host pointer to look up
- * round_offset: If true round the result offset down to a page boundary
- * *ram_addr: set to result ram_addr
- * *offset: set to result offset within the RAMBlock
- *
- * Returns: RAMBlock (or NULL if not found)
- *
- * By the time this function returns, the returned pointer is not protected
- * by RCU anymore. If the caller is not within an RCU critical section and
- * does not hold the iothread lock, it must have other means of protecting the
- * pointer, such as a reference to the region that includes the incoming
- * ram_addr_t.
- */
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
ram_addr_t *offset)
{