aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Hajnoczi <stefanha@redhat.com>2023-09-19 13:22:18 -0400
committerStefan Hajnoczi <stefanha@redhat.com>2023-09-19 13:22:19 -0400
commit4907644841e3200aea6475c0f72d3d987e9f3d93 (patch)
tree962232f0bc6da95c938bcb3088e5b45ef64cb706
parent1361bba536ccb49b20ce0b915e7f552d61717f51 (diff)
parent544cff46c018036cd66e98ffb224dd9f098065c8 (diff)
downloadqemu-4907644841e3200aea6475c0f72d3d987e9f3d93.zip
qemu-4907644841e3200aea6475c0f72d3d987e9f3d93.tar.gz
qemu-4907644841e3200aea6475c0f72d3d987e9f3d93.tar.bz2
Merge tag 'mem-2023-09-19' of https://github.com/davidhildenbrand/qemu into staging
Hi, "Host Memory Backends" and "Memory devices" queue ("mem"): - Support and document VM templating with R/O files using a new "rom" parameter for memory-backend-file - Some cleanups and fixes around NVDIMMs and R/O file handling for guest RAM - Optimize ioeventfd updates by skipping address spaces that are not applicable # -----BEGIN PGP SIGNATURE----- # # iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmUJdykRHGRhdmlkQHJl # ZGhhdC5jb20ACgkQTd4Q9wD/g1pf2w//akOUoYMuamySGjXtKLVyMKZkjIys+Ama # k2C0xzsWAHBP572ezwHi8uxf5j9kzAjsw6GxDZ7FAamD9MhiohkEvkecloBx6f/c # q3fVHblBNkG7v2urtf4+6PJtJvhzOST2SFXfWeYhO/vaA04AYCDgexv82JN3gA6B # OS8WyOX62b8wILPSY2GLZ8IqpE9XnOYZwzVBn6YB1yo7ZkYEfXO6cA8nykNuNcOE # vppqDo7uVIX6317FWj8ygxmzFfOaj0WT2MT2XFzEIDfg8BInQN8HC4mTn0hcVKMa # N1y+eZH733CQKT+uNBRZ5YOeljOi4d6gEEyvkkA/L7e5D3Qg9hIdvHb4uryCFSWX # Vt07OP1XLBwCZFobOC6sg+2gtTZJxxYK89e6ZzEd0454S24w5bnEteRAaCGOP0XL # ww9xYULqhtZs55UC4rvZHJwdUAk1fIY4VqynwkeQXegvz6BxedNeEkJiiEU0Tizx # N2VpsxAJ7H/LLSFeZoCRESo4azrH6U4n7S/eS1tkCniFqibfe2yIQCDoJVfb42ec # gfg/vThCrDwHkIHzkMmoV8NndA7Q7SIkyMfYeEEBeZMeg8JzYll4DJEw/jQCacxh # KRUa+AZvGlTJUq0mkvyOVfLki+iaehoIUuY1yvMrmdWijPO8n3YybmP9Ljhr8VdR # 9MSYZe+I2v8= # =iraT # -----END PGP SIGNATURE----- # gpg: Signature made Tue 19 Sep 2023 06:25:45 EDT # gpg: using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A # gpg: issuer "david@redhat.com" # gpg: Good signature from "David Hildenbrand <david@redhat.com>" [unknown] # gpg: aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full] # gpg: aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown] # gpg: WARNING: The key's User ID is not certified with a trusted signature! # gpg: There is no indication that the signature belongs to the owner. 
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D FCCA 4DDE 10F7 00FF 835A * tag 'mem-2023-09-19' of https://github.com/davidhildenbrand/qemu: memory: avoid updating ioeventfds for some address_space machine: Improve error message when using default RAM backend id softmmu/physmem: Hint that "readonly=on,rom=off" exists when opening file R/W for private mapping fails docs: Start documenting VM templating docs: Don't mention "-mem-path" in multi-process.rst softmmu/physmem: Never return directories from file_ram_open() softmmu/physmem: Fail creation of new files in file_ram_open() with readonly=true softmmu/physmem: Bail out early in ram_block_discard_range() with readonly files softmmu/physmem: Remap with proper protection in qemu_ram_remap() backends/hostmem-file: Add "rom" property to support VM templating with R/O files softmmu/physmem: Distinguish between file access mode and mmap protection nvdimm: Reject writing label data to ROM instead of crashing QEMU Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r--MAINTAINERS1
-rw-r--r--backends/hostmem-file.c61
-rw-r--r--docs/devel/multi-process.rst5
-rw-r--r--docs/system/index.rst1
-rw-r--r--docs/system/vm-templating.rst125
-rw-r--r--hw/acpi/nvdimm.c11
-rw-r--r--hw/core/machine.c11
-rw-r--r--hw/mem/nvdimm.c10
-rw-r--r--hw/ppc/spapr_nvdimm.c3
-rw-r--r--include/exec/memory.h15
-rw-r--r--include/exec/ram_addr.h8
-rw-r--r--include/hw/mem/nvdimm.h6
-rw-r--r--qapi/qom.json17
-rw-r--r--qemu-options.hx16
-rw-r--r--softmmu/memory.c20
-rw-r--r--softmmu/physmem.c93
16 files changed, 354 insertions, 49 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index d5773b1..355b196 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2961,6 +2961,7 @@ M: Igor Mammedov <imammedo@redhat.com>
S: Maintained
F: backends/hostmem*.c
F: include/sysemu/hostmem.h
+F: docs/system/vm-templating.rst
T: git https://gitlab.com/ehabkost/qemu.git machine-next
Cryptodev Backends
diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index b4335a8..361d4a8 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -18,6 +18,8 @@
#include "sysemu/hostmem.h"
#include "qom/object_interfaces.h"
#include "qom/object.h"
+#include "qapi/visitor.h"
+#include "qapi/qapi-visit-common.h"
OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendFile, MEMORY_BACKEND_FILE)
@@ -31,6 +33,7 @@ struct HostMemoryBackendFile {
bool discard_data;
bool is_pmem;
bool readonly;
+ OnOffAuto rom;
};
static void
@@ -53,15 +56,39 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
return;
}
+ switch (fb->rom) {
+ case ON_OFF_AUTO_AUTO:
+ /* Traditionally, opening the file readonly always resulted in ROM. */
+ fb->rom = fb->readonly ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+ break;
+ case ON_OFF_AUTO_ON:
+ if (!fb->readonly) {
+ error_setg(errp, "property 'rom' = 'on' is not supported with"
+ " 'readonly' = 'off'");
+ return;
+ }
+ break;
+ case ON_OFF_AUTO_OFF:
+ if (fb->readonly && backend->share) {
+ error_setg(errp, "property 'rom' = 'off' is incompatible with"
+ " 'readonly' = 'on' and 'share' = 'on'");
+ return;
+ }
+ break;
+ default:
+ assert(false);
+ }
+
name = host_memory_backend_get_name(backend);
ram_flags = backend->share ? RAM_SHARED : 0;
+ ram_flags |= fb->readonly ? RAM_READONLY_FD : 0;
+ ram_flags |= fb->rom == ON_OFF_AUTO_ON ? RAM_READONLY : 0;
ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
ram_flags |= fb->is_pmem ? RAM_PMEM : 0;
ram_flags |= RAM_NAMED_FILE;
memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), name,
backend->size, fb->align, ram_flags,
- fb->mem_path, fb->offset, fb->readonly,
- errp);
+ fb->mem_path, fb->offset, errp);
g_free(name);
#endif
}
@@ -201,6 +228,32 @@ static void file_memory_backend_set_readonly(Object *obj, bool value,
fb->readonly = value;
}
+static void file_memory_backend_get_rom(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(obj);
+ OnOffAuto rom = fb->rom;
+
+ visit_type_OnOffAuto(v, name, &rom, errp);
+}
+
+static void file_memory_backend_set_rom(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+ HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(obj);
+
+ if (host_memory_backend_mr_inited(backend)) {
+ error_setg(errp, "cannot change property '%s' of %s.", name,
+ object_get_typename(obj));
+ return;
+ }
+
+ visit_type_OnOffAuto(v, name, &fb->rom, errp);
+}
+
static void file_backend_unparent(Object *obj)
{
HostMemoryBackend *backend = MEMORY_BACKEND(obj);
@@ -243,6 +296,10 @@ file_backend_class_init(ObjectClass *oc, void *data)
object_class_property_add_bool(oc, "readonly",
file_memory_backend_get_readonly,
file_memory_backend_set_readonly);
+ object_class_property_add(oc, "rom", "OnOffAuto",
+ file_memory_backend_get_rom, file_memory_backend_set_rom, NULL, NULL);
+ object_class_property_set_description(oc, "rom",
+ "Whether to create Read Only Memory (ROM)");
}
static void file_backend_instance_finalize(Object *o)
diff --git a/docs/devel/multi-process.rst b/docs/devel/multi-process.rst
index e480175..4ef539c 100644
--- a/docs/devel/multi-process.rst
+++ b/docs/devel/multi-process.rst
@@ -409,8 +409,9 @@ the initial messages sent to the emulation process is a guest memory
table. Each entry in this table consists of a file descriptor and size
that the emulation process can ``mmap()`` to directly access guest
memory, similar to ``vhost_user_set_mem_table()``. Note guest memory
-must be backed by file descriptors, such as when QEMU is given the
-*-mem-path* command line option.
+must be backed by shared file-backed memory, for example, using
+*-object memory-backend-file,share=on* and setting that memory backend
+as RAM for the machine.
IOMMU operations
^^^^^^^^^^^^^^^^
diff --git a/docs/system/index.rst b/docs/system/index.rst
index 45bf1f1..c21065e 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -38,3 +38,4 @@ or Hypervisor.Framework.
security
multi-process
confidential-guest-support
+ vm-templating
diff --git a/docs/system/vm-templating.rst b/docs/system/vm-templating.rst
new file mode 100644
index 0000000..28905a1
--- /dev/null
+++ b/docs/system/vm-templating.rst
@@ -0,0 +1,125 @@
+QEMU VM templating
+==================
+
+This document explains how to use VM templating in QEMU.
+
+For now, the focus is on VM memory aspects, and not on how to save and
+restore other VM state (i.e., migrate-to-file with ``x-ignore-shared``).
+
+Overview
+--------
+
+With VM templating, a single template VM serves as the starting point for
+new VMs. This allows for fast and efficient replication of VMs, resulting
+in fast startup times and reduced memory consumption.
+
+Conceptually, the VM state is frozen, to then be used as a basis for new
+VMs. The Copy-On-Write mechanism in the operating system makes sure that
+new VMs are able to read template VM memory; however, any modifications
+stay private and don't modify the original template VM or any other
+created VM.
+
+!!! Security Alert !!!
+----------------------
+
+When effectively cloning VMs by VM templating, hardware identifiers
+(such as UUIDs and NIC MAC addresses), and similar data in the guest OS
+(such as machine IDs, SSH keys, certificates) that are supposed to be
+*unique* are no longer unique, which can be a security concern.
+
+Please be aware of these implications and how to mitigate them for your
+use case, which might involve vmgenid, hot(un)plug of NIC, etc.
+
+Memory configuration
+--------------------
+
+In order to create the template VM, we have to make sure that VM memory
+ends up in a file, from where it can be reused for the new VMs:
+
+Supply VM RAM via memory-backend-file, with ``share=on`` (modifications go
+to the file) and ``readonly=off`` (open the file writable). Note that
+``readonly=off`` is implicit.
+
+In the following command-line example, a 2GB VM is created, whereby VM RAM
+is to be stored in the ``template`` file.
+
+.. parsed-literal::
+
+ |qemu_system| [...] -m 2g \\
+ -object memory-backend-file,id=pc.ram,mem-path=template,size=2g,share=on,... \\
+ -machine q35,memory-backend=pc.ram
+
+If multiple memory backends are used (vNUMA, DIMMs), configure all
+memory backends accordingly.
+
+Once the VM is in the desired state, stop the VM and save other VM state,
+leaving the current state of VM RAM in the file.
+
+In order to have a new VM be based on a template VM, we have to
+configure VM RAM to be based on a template VM RAM file; however, the VM
+should not be able to modify file content.
+
+Supply VM RAM via memory-backend-file, with ``share=off`` (modifications
+stay private), ``readonly=on`` (open the file readonly) and ``rom=off``
+(don't make the memory readonly for the VM). Note that ``share=off`` is
+implicit and that other VM state has to be restored separately.
+
+In the following command-line example, a 2GB VM is created based on the
+existing 2GB file ``template``.
+
+.. parsed-literal::
+
+ |qemu_system| [...] -m 2g \\
+ -object memory-backend-file,id=pc.ram,mem-path=template,size=2g,readonly=on,rom=off,... \\
+ -machine q35,memory-backend=pc.ram
+
+If multiple memory backends are used (vNUMA, DIMMs), configure all
+memory backends accordingly.
+
+Note that ``-mem-path`` cannot be used for VM templating when creating the
+template VM or when starting new VMs based on a template VM.
+
+Incompatible features
+---------------------
+
+Some features are incompatible with VM templating, as the underlying file
+cannot be modified to discard VM RAM, or to actually share memory with
+another process.
+
+vhost-user and multi-process QEMU
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+vhost-user and multi-process QEMU are incompatible with VM templating.
+These technologies rely on shared memory; however, the template VMs
+don't actually share memory (``share=off``), even though they are
+file-based.
+
+virtio-balloon
+~~~~~~~~~~~~~~
+
+virtio-balloon inflation and "free page reporting" cannot discard VM RAM
+and will repeatedly report errors. While virtio-balloon can be used
+for template VMs (e.g., report VM RAM stats), "free page reporting"
+should be disabled and the balloon should not be inflated.
+
+virtio-mem
+~~~~~~~~~~
+
+virtio-mem cannot discard VM RAM that is managed by the virtio-mem
+device. virtio-mem will fail early when realizing the device. To use
+VM templating with virtio-mem, either hotplug virtio-mem devices to the
+new VM, or don't supply any memory to the template VM using virtio-mem
+(requested-size=0), not using a template VM file as memory backend for the
+virtio-mem device.
+
+VM migration
+~~~~~~~~~~~~
+
+For VM migration, "x-release-ram" similarly relies on discarding of VM
+RAM on the migration source to free up migrated RAM, and will
+repeatedly report errors.
+
+Postcopy live migration fails early when discarding VM RAM on the
+migration destination, and postcopy live migration is not activated. Note
+that postcopy live migration usually only works on selected filesystems
+(shmem/tmpfs, hugetlbfs) either way.
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index a3b25a9..3cbd416 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -670,7 +670,8 @@ static void nvdimm_dsm_label_size(NVDIMMDevice *nvdimm, hwaddr dsm_mem_addr)
}
static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm,
- uint32_t offset, uint32_t length)
+ uint32_t offset, uint32_t length,
+ bool is_write)
{
uint32_t ret = NVDIMM_DSM_RET_STATUS_INVALID;
@@ -690,6 +691,10 @@ static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm,
return ret;
}
+ if (is_write && nvdimm->readonly) {
+ return NVDIMM_DSM_RET_STATUS_UNSUPPORT;
+ }
+
return NVDIMM_DSM_RET_STATUS_SUCCESS;
}
@@ -713,7 +718,7 @@ static void nvdimm_dsm_get_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in,
get_label_data->length);
status = nvdimm_rw_label_data_check(nvdimm, get_label_data->offset,
- get_label_data->length);
+ get_label_data->length, false);
if (status != NVDIMM_DSM_RET_STATUS_SUCCESS) {
nvdimm_dsm_no_payload(status, dsm_mem_addr);
return;
@@ -752,7 +757,7 @@ static void nvdimm_dsm_set_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in,
set_label_data->length);
status = nvdimm_rw_label_data_check(nvdimm, set_label_data->offset,
- set_label_data->length);
+ set_label_data->length, true);
if (status != NVDIMM_DSM_RET_STATUS_SUCCESS) {
nvdimm_dsm_no_payload(status, dsm_mem_addr);
return;
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 230aab8..59208e6 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -1359,6 +1359,7 @@ out:
void machine_run_board_init(MachineState *machine, const char *mem_path, Error **errp)
{
+ ERRP_GUARD();
MachineClass *machine_class = MACHINE_GET_CLASS(machine);
ObjectClass *oc = object_class_by_name(machine->cpu_type);
CPUClass *cc;
@@ -1387,9 +1388,13 @@ void machine_run_board_init(MachineState *machine, const char *mem_path, Error *
numa_uses_legacy_mem()) {
if (object_property_find(object_get_objects_root(),
machine_class->default_ram_id)) {
- error_setg(errp, "object name '%s' is reserved for the default"
- " RAM backend, it can't be used for any other purposes."
- " Change the object's 'id' to something else",
+ error_setg(errp, "object's id '%s' is reserved for the default"
+ " RAM backend, it can't be used for any other purposes",
+ machine_class->default_ram_id);
+ error_append_hint(errp,
+ "Change the object's 'id' to something else or disable"
+ " automatic creation of the default RAM backend by setting"
+ " 'memory-backend=%s' with '-machine'.\n",
machine_class->default_ram_id);
return;
}
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 31080c2..1631a7d 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -154,6 +154,9 @@ static void nvdimm_prepare_memory_region(NVDIMMDevice *nvdimm, Error **errp)
object_get_canonical_path_component(OBJECT(hostmem)));
return;
}
+ if (memory_region_is_rom(mr)) {
+ nvdimm->readonly = true;
+ }
nvdimm->nvdimm_mr = g_new(MemoryRegion, 1);
memory_region_init_alias(nvdimm->nvdimm_mr, OBJECT(dimm),
@@ -207,15 +210,16 @@ static void nvdimm_unrealize(PCDIMMDevice *dimm)
* label read/write functions.
*/
static void nvdimm_validate_rw_label_data(NVDIMMDevice *nvdimm, uint64_t size,
- uint64_t offset)
+ uint64_t offset, bool is_write)
{
assert((nvdimm->label_size >= size + offset) && (offset + size > offset));
+ assert(!is_write || !nvdimm->readonly);
}
static void nvdimm_read_label_data(NVDIMMDevice *nvdimm, void *buf,
uint64_t size, uint64_t offset)
{
- nvdimm_validate_rw_label_data(nvdimm, size, offset);
+ nvdimm_validate_rw_label_data(nvdimm, size, offset, false);
memcpy(buf, nvdimm->label_data + offset, size);
}
@@ -229,7 +233,7 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, const void *buf,
"pmem", NULL);
uint64_t backend_offset;
- nvdimm_validate_rw_label_data(nvdimm, size, offset);
+ nvdimm_validate_rw_label_data(nvdimm, size, offset, true);
if (!is_pmem) {
memcpy(nvdimm->label_data + offset, buf, size);
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index a868824..60d6d0a 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -320,7 +320,8 @@ static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
nvdimm = NVDIMM(drc->dev);
if ((offset + len < offset) ||
- (nvdimm->label_size < len + offset)) {
+ (nvdimm->label_size < len + offset) ||
+ nvdimm->readonly) {
return H_P2;
}
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 6828442..ef23d65 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -235,6 +235,12 @@ typedef struct IOMMUTLBEvent {
/* RAM is an mmap-ed named file */
#define RAM_NAMED_FILE (1 << 9)
+/* RAM is mmap-ed read-only */
+#define RAM_READONLY (1 << 10)
+
+/* RAM FD is opened read-only */
+#define RAM_READONLY_FD (1 << 11)
+
static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
IOMMUNotifierFlag flags,
hwaddr start, hwaddr end,
@@ -1089,6 +1095,7 @@ struct AddressSpace {
struct FlatView *current_map;
int ioeventfd_nb;
+ int ioeventfd_notifiers;
struct MemoryRegionIoeventfd *ioeventfds;
QTAILQ_HEAD(, MemoryListener) listeners;
QTAILQ_ENTRY(AddressSpace) address_spaces_link;
@@ -1331,10 +1338,10 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr,
* @align: alignment of the region base address; if 0, the default alignment
* (getpagesize()) will be used.
* @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM,
- * RAM_NORESERVE,
+ * RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY,
+ * RAM_READONLY_FD
* @path: the path in which to allocate the RAM.
* @offset: offset within the file referenced by path
- * @readonly: true to open @path for reading, false for read/write.
* @errp: pointer to Error*, to store an error if it happens.
*
* Note that this function does not do anything to cause the data in the
@@ -1348,7 +1355,6 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
uint32_t ram_flags,
const char *path,
ram_addr_t offset,
- bool readonly,
Error **errp);
/**
@@ -1360,7 +1366,8 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
* @name: the name of the region.
* @size: size of the region.
* @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM,
- * RAM_NORESERVE, RAM_PROTECTED.
+ * RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY,
+ * RAM_READONLY_FD
* @fd: the fd to mmap.
* @offset: offset within the file referenced by fd
* @errp: pointer to Error*, to store an error if it happens.
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 9f2e389..9067609 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -108,10 +108,10 @@ long qemu_maxrampagesize(void);
* @size: the size in bytes of the ram block
* @mr: the memory region where the ram block is
* @ram_flags: RamBlock flags. Supported flags: RAM_SHARED, RAM_PMEM,
- * RAM_NORESERVE.
+ * RAM_NORESERVE, RAM_PROTECTED, RAM_NAMED_FILE, RAM_READONLY,
+ * RAM_READONLY_FD
* @mem_path or @fd: specify the backing file or device
* @offset: Offset into target file
- * @readonly: true to open @path for reading, false for read/write.
* @errp: pointer to Error*, to store an error if it happens
*
* Return:
@@ -120,10 +120,10 @@ long qemu_maxrampagesize(void);
*/
RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
uint32_t ram_flags, const char *mem_path,
- off_t offset, bool readonly, Error **errp);
+ off_t offset, Error **errp);
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
uint32_t ram_flags, int fd, off_t offset,
- bool readonly, Error **errp);
+ Error **errp);
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
MemoryRegion *mr, Error **errp);
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index acf887c..d3b7634 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -78,6 +78,12 @@ struct NVDIMMDevice {
bool unarmed;
/*
+ * Whether our DIMM is backed by ROM, and even label data cannot be
+ * written. If set, implies that "unarmed" is also set.
+ */
+ bool readonly;
+
+ /*
* The PPC64 - spapr requires each nvdimm device have a uuid.
*/
QemuUUID uuid;
diff --git a/qapi/qom.json b/qapi/qom.json
index fa3e88c..c53ef97 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -668,6 +668,20 @@
# @readonly: if true, the backing file is opened read-only; if false,
# it is opened read-write. (default: false)
#
+# @rom: whether to create Read Only Memory (ROM) that cannot be modified
+# by the VM. Any write attempts to such ROM will be denied. Most
+# use cases want writable RAM instead of ROM. However, selected use
+# cases, like R/O NVDIMMs, can benefit from ROM. If set to 'on',
+# create ROM; if set to 'off', create writable RAM; if set to
+# 'auto', the value of the @readonly property is used. This
+# property is primarily helpful when we want to have writable RAM in
+# configurations that would traditionally create ROM before this
+# property was introduced: VM templating, where we want to open a
+# file readonly (@readonly set to true) and mark the memory to be
+# private for QEMU (@share set to false). For this use case, we need
+# writable RAM instead of ROM, and want to set this property to 'off'.
+# (default: auto, since 8.2)
+#
# Since: 2.1
##
{ 'struct': 'MemoryBackendFileProperties',
@@ -677,7 +691,8 @@
'*discard-data': 'bool',
'mem-path': 'str',
'*pmem': { 'type': 'bool', 'if': 'CONFIG_LIBPMEM' },
- '*readonly': 'bool' } }
+ '*readonly': 'bool',
+ '*rom': 'OnOffAuto' } }
##
# @MemoryBackendMemfdProperties:
diff --git a/qemu-options.hx b/qemu-options.hx
index 2bcf7e4..bcd7725 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -5063,7 +5063,7 @@ SRST
they are specified. Note that the 'id' property must be set. These
objects are placed in the '/objects' path.
- ``-object memory-backend-file,id=id,size=size,mem-path=dir,share=on|off,discard-data=on|off,merge=on|off,dump=on|off,prealloc=on|off,host-nodes=host-nodes,policy=default|preferred|bind|interleave,align=align,offset=offset,readonly=on|off``
+ ``-object memory-backend-file,id=id,size=size,mem-path=dir,share=on|off,discard-data=on|off,merge=on|off,dump=on|off,prealloc=on|off,host-nodes=host-nodes,policy=default|preferred|bind|interleave,align=align,offset=offset,readonly=on|off,rom=on|off|auto``
Creates a memory file backend object, which can be used to back
the guest RAM with huge pages.
@@ -5153,6 +5153,20 @@ SRST
The ``readonly`` option specifies whether the backing file is opened
read-only or read-write (default).
+ The ``rom`` option specifies whether to create Read Only Memory
+ (ROM) that cannot be modified by the VM. Any write attempts to such
+ ROM will be denied. Most use cases want writable RAM instead of ROM.
+ However, selected use cases, like R/O NVDIMMs, can benefit from
+ ROM. If set to ``on``, create ROM; if set to ``off``, create
+ writable RAM; if set to ``auto`` (default), the value of the
+ ``readonly`` option is used. This option is primarily helpful when
+ we want to have writable RAM in configurations that would
+ traditionally create ROM before the ``rom`` option was introduced:
+ VM templating, where we want to open a file readonly
+ (``readonly=on``) and mark the memory to be private for QEMU
+ (``share=off``). For this use case, we need writable RAM instead
+ of ROM, and want to also set ``rom=off``.
+
``-object memory-backend-ram,id=id,merge=on|off,dump=on|off,share=on|off,prealloc=on|off,size=size,host-nodes=host-nodes,policy=default|preferred|bind|interleave``
Creates a memory backend object, which can be used to back the
guest RAM. Memory backend objects offer more control than the
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 7d9494c..c0383a1 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -842,6 +842,10 @@ static void address_space_update_ioeventfds(AddressSpace *as)
AddrRange tmp;
unsigned i;
+ if (!as->ioeventfd_notifiers) {
+ return;
+ }
+
/*
* It is likely that the number of ioeventfds hasn't changed much, so use
* the previous size as the starting value, with some headroom to avoid
@@ -1620,18 +1624,17 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
uint32_t ram_flags,
const char *path,
ram_addr_t offset,
- bool readonly,
Error **errp)
{
Error *err = NULL;
memory_region_init(mr, owner, name, size);
mr->ram = true;
- mr->readonly = readonly;
+ mr->readonly = !!(ram_flags & RAM_READONLY);
mr->terminates = true;
mr->destructor = memory_region_destructor_ram;
mr->align = align;
mr->ram_block = qemu_ram_alloc_from_file(size, mr, ram_flags, path,
- offset, readonly, &err);
+ offset, &err);
if (err) {
mr->size = int128_zero();
object_unparent(OBJECT(mr));
@@ -1651,10 +1654,11 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
Error *err = NULL;
memory_region_init(mr, owner, name, size);
mr->ram = true;
+ mr->readonly = !!(ram_flags & RAM_READONLY);
mr->terminates = true;
mr->destructor = memory_region_destructor_ram;
mr->ram_block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset,
- false, &err);
+ &err);
if (err) {
mr->size = int128_zero();
object_unparent(OBJECT(mr));
@@ -3075,6 +3079,10 @@ void memory_listener_register(MemoryListener *listener, AddressSpace *as)
}
listener_add_address_space(listener, as);
+
+ if (listener->eventfd_add || listener->eventfd_del) {
+ as->ioeventfd_notifiers++;
+ }
}
void memory_listener_unregister(MemoryListener *listener)
@@ -3083,6 +3091,10 @@ void memory_listener_unregister(MemoryListener *listener)
return;
}
+ if (listener->eventfd_add || listener->eventfd_del) {
+ listener->address_space->ioeventfd_notifiers--;
+ }
+
listener_del_address_space(listener, listener->address_space);
QTAILQ_REMOVE(&memory_listeners, listener, link);
QTAILQ_REMOVE(&listener->address_space->listeners, listener, link_as);
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 18277dd..4f6ca65 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -1288,8 +1288,7 @@ static int64_t get_file_align(int fd)
static int file_ram_open(const char *path,
const char *region_name,
bool readonly,
- bool *created,
- Error **errp)
+ bool *created)
{
char *filename;
char *sanitized_name;
@@ -1300,10 +1299,33 @@ static int file_ram_open(const char *path,
for (;;) {
fd = open(path, readonly ? O_RDONLY : O_RDWR);
if (fd >= 0) {
+ /*
+ * open(O_RDONLY) won't fail with EISDIR. Check manually if we
+ * opened a directory and fail similarly to how we fail ENOENT
+ * in readonly mode. Note that mkstemp() would imply O_RDWR.
+ */
+ if (readonly) {
+ struct stat file_stat;
+
+ if (fstat(fd, &file_stat)) {
+ close(fd);
+ if (errno == EINTR) {
+ continue;
+ }
+ return -errno;
+ } else if (S_ISDIR(file_stat.st_mode)) {
+ close(fd);
+ return -EISDIR;
+ }
+ }
/* @path names an existing file, use it */
break;
}
if (errno == ENOENT) {
+ if (readonly) {
+ /* Refuse to create new, readonly files. */
+ return -ENOENT;
+ }
/* @path names a file that doesn't exist, create it */
fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
if (fd >= 0) {
@@ -1333,10 +1355,7 @@ static int file_ram_open(const char *path,
g_free(filename);
}
if (errno != EEXIST && errno != EINTR) {
- error_setg_errno(errp, errno,
- "can't open backing store %s for guest RAM",
- path);
- return -1;
+ return -errno;
}
/*
* Try again on EINTR and EEXIST. The latter happens when
@@ -1350,7 +1369,6 @@ static int file_ram_open(const char *path,
static void *file_ram_alloc(RAMBlock *block,
ram_addr_t memory,
int fd,
- bool readonly,
bool truncate,
off_t offset,
Error **errp)
@@ -1408,7 +1426,7 @@ static void *file_ram_alloc(RAMBlock *block,
perror("ftruncate");
}
- qemu_map_flags = readonly ? QEMU_MAP_READONLY : 0;
+ qemu_map_flags = (block->flags & RAM_READONLY) ? QEMU_MAP_READONLY : 0;
qemu_map_flags |= (block->flags & RAM_SHARED) ? QEMU_MAP_SHARED : 0;
qemu_map_flags |= (block->flags & RAM_PMEM) ? QEMU_MAP_SYNC : 0;
qemu_map_flags |= (block->flags & RAM_NORESERVE) ? QEMU_MAP_NORESERVE : 0;
@@ -1876,7 +1894,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
#ifdef CONFIG_POSIX
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
uint32_t ram_flags, int fd, off_t offset,
- bool readonly, Error **errp)
+ Error **errp)
{
RAMBlock *new_block;
Error *local_err = NULL;
@@ -1884,7 +1902,8 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
/* Just support these ram flags by now. */
assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
- RAM_PROTECTED | RAM_NAMED_FILE)) == 0);
+ RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
+ RAM_READONLY_FD)) == 0);
if (xen_enabled()) {
error_setg(errp, "-mem-path not supported with Xen");
@@ -1919,8 +1938,8 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
new_block->used_length = size;
new_block->max_length = size;
new_block->flags = ram_flags;
- new_block->host = file_ram_alloc(new_block, size, fd, readonly,
- !file_size, offset, errp);
+ new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset,
+ errp);
if (!new_block->host) {
g_free(new_block);
return NULL;
@@ -1939,20 +1958,40 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
uint32_t ram_flags, const char *mem_path,
- off_t offset, bool readonly, Error **errp)
+ off_t offset, Error **errp)
{
int fd;
bool created;
RAMBlock *block;
- fd = file_ram_open(mem_path, memory_region_name(mr), readonly, &created,
- errp);
+ fd = file_ram_open(mem_path, memory_region_name(mr),
+ !!(ram_flags & RAM_READONLY_FD), &created);
if (fd < 0) {
+ error_setg_errno(errp, -fd, "can't open backing store %s for guest RAM",
+ mem_path);
+ if (!(ram_flags & RAM_READONLY_FD) && !(ram_flags & RAM_SHARED) &&
+ fd == -EACCES) {
+ /*
+ * If we can open the file R/O (note: will never create a new file)
+ * and we are dealing with a private mapping, there are still ways
+ * to consume such files and get RAM instead of ROM.
+ */
+ fd = file_ram_open(mem_path, memory_region_name(mr), true,
+ &created);
+ if (fd < 0) {
+ return NULL;
+ }
+ assert(!created);
+ close(fd);
+ error_append_hint(errp, "Consider opening the backing store"
+ " read-only but still creating writable RAM using"
+ " '-object memory-backend-file,readonly=on,rom=off...'"
+ " (see \"VM templating\" documentation)\n");
+ }
return NULL;
}
- block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, readonly,
- errp);
+ block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, errp);
if (!block) {
if (created) {
unlink(mem_path);
@@ -2070,6 +2109,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
ram_addr_t offset;
int flags;
void *area, *vaddr;
+ int prot;
RAMBLOCK_FOREACH(block) {
offset = addr - block->offset;
@@ -2084,13 +2124,14 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
flags |= block->flags & RAM_SHARED ?
MAP_SHARED : MAP_PRIVATE;
flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
+ prot = PROT_READ;
+ prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
if (block->fd >= 0) {
- area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
- flags, block->fd, offset + block->fd_offset);
+ area = mmap(vaddr, length, prot, flags, block->fd,
+ offset + block->fd_offset);
} else {
flags |= MAP_ANONYMOUS;
- area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
- flags, -1, 0);
+ area = mmap(vaddr, length, prot, flags, -1, 0);
}
if (area != vaddr) {
error_report("Could not remap addr: "
@@ -3481,6 +3522,16 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
*/
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
/*
+ * fallocate() will fail with readonly files. Let's print a
+ * proper error message.
+ */
+ if (rb->flags & RAM_READONLY_FD) {
+ error_report("ram_block_discard_range: Discarding RAM"
+ " with readonly files is not supported");
+ goto err;
+
+ }
+ /*
* We'll discard data from the actual file, even though we only
* have a MAP_PRIVATE mapping, possibly messing with other
* MAP_PRIVATE/MAP_SHARED mappings. There is no easy way to