From f47a672a72acd6e2712031f0bc4d4f3ae4b6302c Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 12 Jun 2025 16:27:42 +0800 Subject: memory: Export a helper to get intersection of a MemoryRegionSection with a given range Rename the helper to memory_region_section_intersect_range() to make it more generic. Meanwhile, define the @end as Int128 and replace the related operations with Int128_* format since the helper is exported as a wider API. Suggested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Reviewed-by: Pankaj Gupta Reviewed-by: David Hildenbrand Reviewed-by: Zhao Liu Reviewed-by: Xiaoyao Li Signed-off-by: Chenyi Qiang Link: https://lore.kernel.org/r/20250612082747.51539-2-chenyi.qiang@intel.com Signed-off-by: Peter Xu --- include/system/memory.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/system/memory.h b/include/system/memory.h index 0848690..da97753 100644 --- a/include/system/memory.h +++ b/include/system/memory.h @@ -1212,6 +1212,36 @@ MemoryRegionSection *memory_region_section_new_copy(MemoryRegionSection *s); void memory_region_section_free_copy(MemoryRegionSection *s); /** + * memory_region_section_intersect_range: Adjust the memory section to cover + * the intersection with the given range. + * + * @s: the #MemoryRegionSection to be adjusted + * @offset: the offset of the given range in the memory region + * @size: the size of the given range + * + * Returns false if the intersection is empty, otherwise returns true. + */ +static inline bool memory_region_section_intersect_range(MemoryRegionSection *s, + uint64_t offset, + uint64_t size) +{ + uint64_t start = MAX(s->offset_within_region, offset); + Int128 end = int128_min(int128_add(int128_make64(s->offset_within_region), + s->size), + int128_add(int128_make64(offset), + int128_make64(size))); + + if (int128_le(end, int128_make64(start))) { + return false; + } + + s->offset_within_address_space += start - s->offset_within_region; + s->offset_within_region = start; + s->size = int128_sub(end, int128_make64(start)); + return true; +} + +/** * memory_region_init: Initialize a memory region * * The region typically acts as a container for other memory regions. Use -- cgit v1.1 From ff1211154c45c9f7f82116ae9a8c72a848e4a8b5 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 12 Jun 2025 16:27:43 +0800 Subject: memory: Change memory_region_set_ram_discard_manager() to return the result Modify memory_region_set_ram_discard_manager() to return -EBUSY if a RamDiscardManager is already set in the MemoryRegion. The caller must handle this failure, such as having virtio-mem undo its actions and fail the realize() process. Opportunistically move the call earlier to avoid complex error handling. This change is beneficial when introducing a new RamDiscardManager instance besides virtio-mem. After ram_block_coordinated_discard_require(true) unlocks all RamDiscardManager instances, only one instance is allowed to be set for one MemoryRegion at present. Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Reviewed-by: Xiaoyao Li Signed-off-by: Chenyi Qiang Link: https://lore.kernel.org/r/20250612082747.51539-3-chenyi.qiang@intel.com Signed-off-by: Peter Xu --- include/system/memory.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/system/memory.h b/include/system/memory.h index da97753..60983d4 100644 --- a/include/system/memory.h +++ b/include/system/memory.h @@ -2499,13 +2499,13 @@ static inline bool memory_region_has_ram_discard_manager(MemoryRegion *mr) * * This function must not be called for a mapped #MemoryRegion, a #MemoryRegion * that does not cover RAM, or a #MemoryRegion that already has a - * #RamDiscardManager assigned. + * #RamDiscardManager assigned. Return 0 if the rdm is set successfully. * * @mr: the #MemoryRegion * @rdm: #RamDiscardManager to set */ -void memory_region_set_ram_discard_manager(MemoryRegion *mr, - RamDiscardManager *rdm); +int memory_region_set_ram_discard_manager(MemoryRegion *mr, + RamDiscardManager *rdm); /** * memory_region_find: translate an address/size relative to a -- cgit v1.1 From 2205b8466733f8c6e3306c964f31c5a7cac69dfa Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 12 Jun 2025 16:27:44 +0800 Subject: memory: Unify the definiton of ReplayRamPopulate() and ReplayRamDiscard() Update ReplayRamDiscard() function to return the result and unify the ReplayRamPopulate() and ReplayRamDiscard() to ReplayRamDiscardState() at the same time due to their identical definitions. This unification simplifies related structures, such as VirtIOMEMReplayData, which makes it cleaner. Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Xiaoyao Li Signed-off-by: Chenyi Qiang Link: https://lore.kernel.org/r/20250612082747.51539-4-chenyi.qiang@intel.com Signed-off-by: Peter Xu --- include/system/memory.h | 74 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/system/memory.h b/include/system/memory.h index 60983d4..46248d4 100644 --- a/include/system/memory.h +++ b/include/system/memory.h @@ -576,8 +576,20 @@ static inline void ram_discard_listener_init(RamDiscardListener *rdl, rdl->double_discard_supported = double_discard_supported; } -typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque); -typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque); +/** + * typedef ReplayRamDiscardState: + * + * The callback handler for #RamDiscardManagerClass.replay_populated/ + * #RamDiscardManagerClass.replay_discarded to invoke on populated/discarded + * parts. + * + * @section: the #MemoryRegionSection of populated/discarded part + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if failed. + */ +typedef int (*ReplayRamDiscardState)(MemoryRegionSection *section, + void *opaque); /* * RamDiscardManagerClass: @@ -651,36 +663,38 @@ struct RamDiscardManagerClass { /** * @replay_populated: * - * Call the #ReplayRamPopulate callback for all populated parts within the - * #MemoryRegionSection via the #RamDiscardManager. + * Call the #ReplayRamDiscardState callback for all populated parts within + * the #MemoryRegionSection via the #RamDiscardManager. * * In case any call fails, no further calls are made. * * @rdm: the #RamDiscardManager * @section: the #MemoryRegionSection - * @replay_fn: the #ReplayRamPopulate callback + * @replay_fn: the #ReplayRamDiscardState callback * @opaque: pointer to forward to the callback * * Returns 0 on success, or a negative error if any notification failed. */ int (*replay_populated)(const RamDiscardManager *rdm, MemoryRegionSection *section, - ReplayRamPopulate replay_fn, void *opaque); + ReplayRamDiscardState replay_fn, void *opaque); /** * @replay_discarded: * - * Call the #ReplayRamDiscard callback for all discarded parts within the - * #MemoryRegionSection via the #RamDiscardManager. + * Call the #ReplayRamDiscardState callback for all discarded parts within + * the #MemoryRegionSection via the #RamDiscardManager. * * @rdm: the #RamDiscardManager * @section: the #MemoryRegionSection - * @replay_fn: the #ReplayRamDiscard callback + * @replay_fn: the #ReplayRamDiscardState callback * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. */ - void (*replay_discarded)(const RamDiscardManager *rdm, - MemoryRegionSection *section, - ReplayRamDiscard replay_fn, void *opaque); + int (*replay_discarded)(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, void *opaque); /** * @register_listener: @@ -721,15 +735,41 @@ uint64_t ram_discard_manager_get_min_granularity(const RamDiscardManager *rdm, bool ram_discard_manager_is_populated(const RamDiscardManager *rdm, const MemoryRegionSection *section); +/** + * ram_discard_manager_replay_populated: + * + * A wrapper to call the #RamDiscardManagerClass.replay_populated callback + * of the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ int ram_discard_manager_replay_populated(const RamDiscardManager *rdm, MemoryRegionSection *section, - ReplayRamPopulate replay_fn, + ReplayRamDiscardState replay_fn, void *opaque); -void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, - MemoryRegionSection *section, - ReplayRamDiscard replay_fn, - void *opaque); +/** + * ram_discard_manager_replay_discarded: + * + * A wrapper to call the #RamDiscardManagerClass.replay_discarded callback + * of the #RamDiscardManager. + * + * @rdm: the #RamDiscardManager + * @section: the #MemoryRegionSection + * @replay_fn: the #ReplayRamDiscardState callback + * @opaque: pointer to forward to the callback + * + * Returns 0 on success, or a negative error if any notification failed. + */ +int ram_discard_manager_replay_discarded(const RamDiscardManager *rdm, + MemoryRegionSection *section, + ReplayRamDiscardState replay_fn, + void *opaque); void ram_discard_manager_register_listener(RamDiscardManager *rdm, RamDiscardListener *rdl, -- cgit v1.1 From 5d6483edaa9232d8f3709f68c8eab4bc2033fb70 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 12 Jun 2025 16:27:45 +0800 Subject: ram-block-attributes: Introduce RamBlockAttributes to manage RAMBlock with guest_memfd Commit 852f0048f3 ("RAMBlock: make guest_memfd require uncoordinated discard") highlighted that subsystems like VFIO may disable RAM block discard. However, guest_memfd relies on discard operations for page conversion between private and shared memory, potentially leading to the stale IOMMU mapping issue when assigning hardware devices to confidential VMs via shared memory. To address this and allow shared device assignement, it is crucial to ensure the VFIO system refreshes its IOMMU mappings. RamDiscardManager is an existing interface (used by virtio-mem) to adjust VFIO mappings in relation to VM page assignment. Effectively page conversion is similar to hot-removing a page in one mode and adding it back in the other. Therefore, similar actions are required for page conversion events. Introduce the RamDiscardManager to guest_memfd to facilitate this process. Since guest_memfd is not an object, it cannot directly implement the RamDiscardManager interface. Implementing it in HostMemoryBackend is not appropriate because guest_memfd is per RAMBlock, and some RAMBlocks have a memory backend while others do not. Notably, virtual BIOS RAMBlocks using memory_region_init_ram_guest_memfd() do not have a backend. To manage RAMBlocks with guest_memfd, define a new object named RamBlockAttributes to implement the RamDiscardManager interface. This object can store the guest_memfd information such as the bitmap for shared memory and the registered listeners for event notifications. A new state_change() helper function is provided to notify listeners, such as VFIO, allowing VFIO to do dynamically DMA map and unmap for the shared memory according to conversion events. Note that in the current context of RamDiscardManager for guest_memfd, the shared state is analogous to being populated, while the private state can be considered discarded for simplicity. In the future, it would be more complicated if considering more states like private/shared/discarded at the same time. In current implementation, memory state tracking is performed at the host page size granularity, as the minimum conversion size can be one page per request. Additionally, VFIO expected the DMA mapping for a specific IOVA to be mapped and unmapped with the same granularity. Confidential VMs may perform partial conversions, such as conversions on small regions within a larger one. To prevent such invalid cases and until support for DMA mapping cut operations is available, all operations are performed with 4K granularity. In addition, memory conversion failures cause QEMU to quit rather than resuming the guest or retrying the operation at present. It would be future work to add more error handling or rollback mechanisms once conversion failures are allowed. For example, in-place conversion of guest_memfd could retry the unmap operation during the conversion from shared to private. For now, keep the complex error handling out of the picture as it is not required. Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Reviewed-by: Pankaj Gupta Signed-off-by: Chenyi Qiang Link: https://lore.kernel.org/r/20250612082747.51539-5-chenyi.qiang@intel.com [peterx: squash fixup from Chenyi to fix builds] Signed-off-by: Peter Xu --- include/system/ramblock.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/system/ramblock.h b/include/system/ramblock.h index d8a116b..1bab9e2 100644 --- a/include/system/ramblock.h +++ b/include/system/ramblock.h @@ -22,6 +22,10 @@ #include "exec/cpu-common.h" #include "qemu/rcu.h" #include "exec/ramlist.h" +#include "system/hostmem.h" + +#define TYPE_RAM_BLOCK_ATTRIBUTES "ram-block-attributes" +OBJECT_DECLARE_SIMPLE_TYPE(RamBlockAttributes, RAM_BLOCK_ATTRIBUTES) struct RAMBlock { struct rcu_head rcu; @@ -91,4 +95,21 @@ struct RAMBlock { ram_addr_t postcopy_length; }; +struct RamBlockAttributes { + Object parent; + + RAMBlock *ram_block; + + /* 1-setting of the bitmap represents ram is populated (shared) */ + unsigned bitmap_size; + unsigned long *bitmap; + + QLIST_HEAD(, RamDiscardListener) rdl_list; +}; + +RamBlockAttributes *ram_block_attributes_create(RAMBlock *ram_block); +void ram_block_attributes_destroy(RamBlockAttributes *attr); +int ram_block_attributes_state_change(RamBlockAttributes *attr, uint64_t offset, + uint64_t size, bool to_discard); + #endif -- cgit v1.1 From 2fde3fb916079ee0ff0fc26d9446c813b1d5cc28 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 12 Jun 2025 16:27:46 +0800 Subject: physmem: Support coordinated discarding of RAM with guest_memfd A new field, attributes, was introduced in RAMBlock to link to a RamBlockAttributes object, which centralizes all guest_memfd related information (such as fd and status bitmap) within a RAMBlock. Create and initialize the RamBlockAttributes object upon ram_block_add(). Meanwhile, register the object in the target RAMBlock's MemoryRegion. After that, guest_memfd-backed RAMBlock is associated with the RamDiscardManager interface, and the users can execute RamDiscardManager specific handling. For example, VFIO will register the RamDiscardListener and get notifications when the state_change() helper invokes. As coordinate discarding of RAM with guest_memfd is now supported, only block uncoordinated discard. Tested-by: Alexey Kardashevskiy Reviewed-by: Alexey Kardashevskiy Acked-by: David Hildenbrand Signed-off-by: Chenyi Qiang Link: https://lore.kernel.org/r/20250612082747.51539-6-chenyi.qiang@intel.com Signed-off-by: Peter Xu --- include/system/ramblock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/system/ramblock.h b/include/system/ramblock.h index 1bab9e2..87e847e 100644 --- a/include/system/ramblock.h +++ b/include/system/ramblock.h @@ -46,6 +46,7 @@ struct RAMBlock { int fd; uint64_t fd_offset; int guest_memfd; + RamBlockAttributes *attributes; size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; -- cgit v1.1