Diffstat (limited to 'system/physmem.c')
-rw-r--r-- | system/physmem.c | 637
1 file changed, 466 insertions(+), 171 deletions(-)
diff --git a/system/physmem.c b/system/physmem.c index 9a3b3a7..ff0ca40 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -28,31 +28,34 @@ #include "qemu/lockable.h" #ifdef CONFIG_TCG -#include "hw/core/tcg-cpu-ops.h" +#include "accel/tcg/cpu-ops.h" +#include "accel/tcg/iommu.h" #endif /* CONFIG_TCG */ -#include "exec/exec-all.h" +#include "exec/cputlb.h" #include "exec/page-protection.h" #include "exec/target_page.h" +#include "exec/translation-block.h" #include "hw/qdev-core.h" #include "hw/qdev-properties.h" #include "hw/boards.h" -#include "sysemu/xen.h" -#include "sysemu/kvm.h" -#include "sysemu/tcg.h" -#include "sysemu/qtest.h" +#include "system/xen.h" +#include "system/kvm.h" +#include "system/tcg.h" +#include "system/qtest.h" #include "qemu/timer.h" #include "qemu/config-file.h" #include "qemu/error-report.h" #include "qemu/qemu-print.h" #include "qemu/log.h" #include "qemu/memalign.h" -#include "exec/memory.h" -#include "exec/ioport.h" -#include "sysemu/dma.h" -#include "sysemu/hostmem.h" -#include "sysemu/hw_accel.h" -#include "sysemu/xen-mapcache.h" +#include "qemu/memfd.h" +#include "system/memory.h" +#include "system/ioport.h" +#include "system/dma.h" +#include "system/hostmem.h" +#include "system/hw_accel.h" +#include "system/xen-mapcache.h" #include "trace.h" #ifdef CONFIG_FALLOCATE_PUNCH_HOLE @@ -61,14 +64,16 @@ #include "qemu/rcu_queue.h" #include "qemu/main-loop.h" -#include "exec/translate-all.h" -#include "sysemu/replay.h" +#include "system/replay.h" -#include "exec/memory-internal.h" -#include "exec/ram_addr.h" +#include "system/ram_addr.h" #include "qemu/pmem.h" +#include "qapi/qapi-types-migration.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/options.h" #include "migration/vmstate.h" #include "qemu/range.h" @@ -82,6 +87,8 @@ #include <daxctl/libdaxctl.h> #endif +#include "memory-internal.h" + //#define DEBUG_SUBPAGE /* ram_list is read under rcu_read_lock()/rcu_read_unlock(). 
Writes @@ -152,6 +159,7 @@ static void io_mem_init(void); static void memory_map_init(void); static void tcg_log_global_after_sync(MemoryListener *listener); static void tcg_commit(MemoryListener *listener); +static bool ram_is_cpr_compatible(RAMBlock *rb); /** * CPUAddressSpace: all the information a CPU needs about an AddressSpace @@ -571,7 +579,7 @@ MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, is_write, true, &as, attrs); mr = section.mr; - if (xen_enabled() && memory_access_is_direct(mr, is_write)) { + if (xen_enabled() && memory_access_is_direct(mr, is_write, attrs)) { hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr; *plen = MIN(page, *plen); } @@ -579,6 +587,8 @@ MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, return mr; } +#ifdef CONFIG_TCG + typedef struct TCGIOMMUNotifier { IOMMUNotifier n; MemoryRegion *mr; @@ -738,6 +748,33 @@ translate_fail: return &d->map.sections[PHYS_SECTION_UNASSIGNED]; } +MemoryRegionSection *iotlb_to_section(CPUState *cpu, + hwaddr index, MemTxAttrs attrs) +{ + int asidx = cpu_asidx_from_attrs(cpu, attrs); + CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx]; + AddressSpaceDispatch *d = cpuas->memory_dispatch; + int section_index = index & ~TARGET_PAGE_MASK; + MemoryRegionSection *ret; + + assert(section_index < d->map.sections_nb); + ret = d->map.sections + section_index; + assert(ret->mr); + assert(ret->mr->ops); + + return ret; +} + +/* Called from RCU critical section */ +hwaddr memory_region_section_get_iotlb(CPUState *cpu, + MemoryRegionSection *section) +{ + AddressSpaceDispatch *d = flatview_to_dispatch(section->fv); + return section - d->map.sections; +} + +#endif /* CONFIG_TCG */ + void cpu_address_space_init(CPUState *cpu, int asidx, const char *prefix, MemoryRegion *mr) { @@ -763,6 +800,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (!cpu->cpu_ases) { cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); + cpu->cpu_ases_count = cpu->num_ases; } newas = &cpu->cpu_ases[asidx]; @@ -776,6 +814,34 @@ void cpu_address_space_init(CPUState *cpu, int asidx, } } +void cpu_address_space_destroy(CPUState *cpu, int asidx) +{ + CPUAddressSpace *cpuas; + + assert(cpu->cpu_ases); + assert(asidx >= 0 && asidx < cpu->num_ases); + /* KVM cannot currently support multiple address spaces. 
*/ + assert(asidx == 0 || !kvm_enabled()); + + cpuas = &cpu->cpu_ases[asidx]; + if (tcg_enabled()) { + memory_listener_unregister(&cpuas->tcg_as_listener); + } + + address_space_destroy(cpuas->as); + g_free_rcu(cpuas->as, rcu); + + if (asidx == 0) { + /* reset the convenience alias for address space 0 */ + cpu->as = NULL; + } + + if (--cpu->cpu_ases_count == 0) { + g_free(cpu->cpu_ases); + cpu->cpu_ases = NULL; + } +} + AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) { /* Return the AddressSpace corresponding to the specified index */ @@ -894,13 +960,19 @@ DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client) { DirtyMemoryBlocks *blocks; - ram_addr_t start = memory_region_get_ram_addr(mr) + offset; + ram_addr_t start, first, last; unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL); - ram_addr_t first = QEMU_ALIGN_DOWN(start, align); - ram_addr_t last = QEMU_ALIGN_UP(start + length, align); DirtyBitmapSnapshot *snap; unsigned long page, end, dest; + start = memory_region_get_ram_addr(mr); + /* We know we're only called for RAM MemoryRegions */ + assert(start != RAM_ADDR_INVALID); + start += offset; + + first = QEMU_ALIGN_DOWN(start, align); + last = QEMU_ALIGN_UP(start + length, align); + snap = g_malloc0(sizeof(*snap) + ((last - first) >> (TARGET_PAGE_BITS + 3))); snap->start = first; @@ -959,14 +1031,6 @@ bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, return false; } -/* Called from RCU critical section */ -hwaddr memory_region_section_get_iotlb(CPUState *cpu, - MemoryRegionSection *section) -{ - AddressSpaceDispatch *d = flatview_to_dispatch(section->fv); - return section - d->map.sections; -} - static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end, uint16_t section); static subpage_t *subpage_init(FlatView *fv, hwaddr base); @@ -1200,7 +1264,7 @@ long qemu_maxrampagesize(void) return pagesize; } -#ifdef CONFIG_POSIX +#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN) static int64_t get_file_size(int fd) { int64_t size; @@ -1499,18 +1563,6 @@ static ram_addr_t find_ram_offset(ram_addr_t size) return offset; } -static unsigned long last_ram_page(void) -{ - RAMBlock *block; - ram_addr_t last = 0; - - RCU_READ_LOCK_GUARD(); - RAMBLOCK_FOREACH(block) { - last = MAX(last, block->offset + block->max_length); - } - return last >> TARGET_PAGE_BITS; -} - static void qemu_ram_setup_dump(void *addr, ram_addr_t size) { int ret; @@ -1637,6 +1689,18 @@ void qemu_ram_unset_idstr(RAMBlock *block) } } +static char *cpr_name(MemoryRegion *mr) +{ + const char *mr_name = memory_region_name(mr); + g_autofree char *id = mr->dev ? 
qdev_get_dev_path(mr->dev) : NULL; + + if (id) { + return g_strdup_printf("%s/%s", id, mr_name); + } else { + return g_strdup(mr_name); + } +} + size_t qemu_ram_pagesize(RAMBlock *rb) { return rb->page_size; @@ -1764,13 +1828,11 @@ void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length) } /* Called with ram_list.mutex held */ -static void dirty_memory_extend(ram_addr_t old_ram_size, - ram_addr_t new_ram_size) +static void dirty_memory_extend(ram_addr_t new_ram_size) { - ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size, - DIRTY_MEMORY_BLOCK_SIZE); - ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size, - DIRTY_MEMORY_BLOCK_SIZE); + unsigned int old_num_blocks = ram_list.num_dirty_blocks; + unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size, + DIRTY_MEMORY_BLOCK_SIZE); int i; /* Only need to extend if block count increased */ @@ -1802,6 +1864,8 @@ static void dirty_memory_extend(ram_addr_t old_ram_size, g_free_rcu(old_blocks, rcu); } } + + ram_list.num_dirty_blocks = new_num_blocks; } static void ram_block_add(RAMBlock *new_block, Error **errp) @@ -1811,11 +1875,9 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) RAMBlock *block; RAMBlock *last_block = NULL; bool free_on_error = false; - ram_addr_t old_ram_size, new_ram_size; + ram_addr_t ram_size; Error *err = NULL; - old_ram_size = last_ram_page(); - qemu_mutex_lock_ramlist(); new_block->offset = find_ram_offset(new_block->max_length); @@ -1847,10 +1909,14 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) if (new_block->flags & RAM_GUEST_MEMFD) { int ret; - assert(kvm_enabled()); + if (!kvm_enabled()) { + error_setg(errp, "cannot set up private guest memory for %s: KVM required", + object_get_typename(OBJECT(current_machine->cgs))); + goto out_free; + } assert(new_block->guest_memfd < 0); - ret = ram_block_discard_require(true); + ret = ram_block_coordinated_discard_require(true); if (ret < 0) { error_setg_errno(errp, -ret, "cannot set up private guest memory: discard currently blocked"); @@ -1864,13 +1930,41 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_mutex_unlock_ramlist(); goto out_free; } - } - new_ram_size = MAX(old_ram_size, - (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); - if (new_ram_size > old_ram_size) { - dirty_memory_extend(old_ram_size, new_ram_size); + /* + * The attribute bitmap of the RamBlockAttributes is default to + * discarded, which mimics the behavior of kvm_set_phys_mem() when it + * calls kvm_set_memory_attributes_private(). This leads to a brief + * period of inconsistency between the creation of the RAMBlock and its + * mapping into the physical address space. However, this is not + * problematic, as no users rely on the attribute status to perform + * any actions during this interval. + */ + new_block->attributes = ram_block_attributes_create(new_block); + if (!new_block->attributes) { + error_setg(errp, "Failed to create ram block attribute"); + close(new_block->guest_memfd); + ram_block_coordinated_discard_require(false); + qemu_mutex_unlock_ramlist(); + goto out_free; + } + + /* + * Add a specific guest_memfd blocker if a generic one would not be + * added by ram_block_add_cpr_blocker. 
+ */ + if (ram_is_cpr_compatible(new_block)) { + error_setg(&new_block->cpr_blocker, + "Memory region %s uses guest_memfd, " + "which is not supported with CPR.", + memory_region_name(new_block->mr)); + migrate_add_blocker_modes(&new_block->cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1); + } } + + ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS; + dirty_memory_extend(ram_size); /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ, * QLIST (which has an RCU-friendly variant) does not have insertion at * tail, so save the last element in last_block. @@ -1923,19 +2017,28 @@ out_free: } } -#ifdef CONFIG_POSIX -RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, +#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN) +RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size, + qemu_ram_resize_cb resized, MemoryRegion *mr, uint32_t ram_flags, int fd, off_t offset, + bool grow, Error **errp) { + ERRP_GUARD(); RAMBlock *new_block; Error *local_err = NULL; - int64_t file_size, file_align; + int64_t file_size, file_align, share_flags; + + share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED); + assert(share_flags != (RAM_SHARED | RAM_PRIVATE)); + ram_flags &= ~RAM_PRIVATE; /* Just support these ram flags by now. */ assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE | RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY | - RAM_READONLY_FD | RAM_GUEST_MEMFD)) == 0); + RAM_READONLY_FD | RAM_GUEST_MEMFD | + RAM_RESIZEABLE)) == 0); + assert(max_size >= size); if (xen_enabled()) { error_setg(errp, "-mem-path not supported with Xen"); @@ -1950,12 +2053,16 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, size = TARGET_PAGE_ALIGN(size); size = REAL_HOST_PAGE_ALIGN(size); + max_size = TARGET_PAGE_ALIGN(max_size); + max_size = REAL_HOST_PAGE_ALIGN(max_size); file_size = get_file_size(fd); - if (file_size > offset && file_size < (offset + size)) { - error_setg(errp, "backing store size 0x%" PRIx64 - " does not match 'size' option 0x" RAM_ADDR_FMT, - file_size, size); + if (file_size && file_size < offset + max_size && !grow) { + error_setg(errp, "%s backing store size 0x%" PRIx64 + " is too small for 'size' option 0x" RAM_ADDR_FMT + " plus 'offset' option 0x%" PRIx64, + memory_region_name(mr), file_size, max_size, + (uint64_t)offset); return NULL; } @@ -1970,11 +2077,13 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->used_length = size; - new_block->max_length = size; + new_block->max_length = max_size; + new_block->resized = resized; new_block->flags = ram_flags; new_block->guest_memfd = -1; - new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset, - errp); + new_block->host = file_ram_alloc(new_block, max_size, fd, + file_size < offset + max_size, + offset, errp); if (!new_block->host) { g_free(new_block); return NULL; @@ -2026,7 +2135,8 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, return NULL; } - block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, errp); + block = qemu_ram_alloc_from_fd(size, size, NULL, mr, ram_flags, fd, offset, + false, errp); if (!block) { if (created) { unlink(mem_path); @@ -2039,21 +2149,98 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, } #endif +#ifdef CONFIG_POSIX +/* + * Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor, so it can be + * shared with another process if CPR is being used. 
Use memfd if available + * because it has no size limits, else use POSIX shm. + */ +static int qemu_ram_get_shared_fd(const char *name, bool *reused, Error **errp) +{ + int fd = cpr_find_fd(name, 0); + + if (fd >= 0) { + *reused = true; + return fd; + } + + if (qemu_memfd_check(0)) { + fd = qemu_memfd_create(name, 0, 0, 0, 0, errp); + } else { + fd = qemu_shm_alloc(0, errp); + } + + if (fd >= 0) { + cpr_save_fd(name, 0, fd); + } + *reused = false; + return fd; +} +#endif + static RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, - void (*resized)(const char*, - uint64_t length, - void *host), + qemu_ram_resize_cb resized, void *host, uint32_t ram_flags, MemoryRegion *mr, Error **errp) { RAMBlock *new_block; Error *local_err = NULL; - int align; + int align, share_flags; + + share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED); + assert(share_flags != (RAM_SHARED | RAM_PRIVATE)); + ram_flags &= ~RAM_PRIVATE; assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC | RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0); assert(!host ^ (ram_flags & RAM_PREALLOC)); + assert(max_size >= size); + + /* ignore RAM_SHARED for Windows and emscripten*/ +#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN) + if (!host) { + if (!share_flags && current_machine->aux_ram_share) { + ram_flags |= RAM_SHARED; + } + if (ram_flags & RAM_SHARED) { + bool reused; + g_autofree char *name = cpr_name(mr); + int fd = qemu_ram_get_shared_fd(name, &reused, errp); + + if (fd < 0) { + return NULL; + } + + /* Use same alignment as qemu_anon_ram_alloc */ + mr->align = QEMU_VMALLOC_ALIGN; + + /* + * This can fail if the shm mount size is too small, or alloc from + * fd is not supported, but previous QEMU versions that called + * qemu_anon_ram_alloc for anonymous shared memory could have + * succeeded. Quietly fail and fall back. + * + * After cpr-transfer, new QEMU could create a memory region + * with a larger max size than old, so pass reused to grow the + * region if necessary. The extra space will be usable after a + * guest reset. 
+ */ + new_block = qemu_ram_alloc_from_fd(size, max_size, resized, mr, + ram_flags, fd, 0, reused, NULL); + if (new_block) { + trace_qemu_ram_alloc_shared(name, new_block->used_length, + new_block->max_length, fd, + new_block->host); + return new_block; + } + + cpr_delete_fd(name, 0); + close(fd); + /* fall back to anon allocation */ + } + } +#endif align = qemu_real_host_page_size(); align = MAX(align, TARGET_PAGE_SIZE); @@ -2065,7 +2252,6 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, new_block->resized = resized; new_block->used_length = size; new_block->max_length = max_size; - assert(max_size >= size); new_block->fd = -1; new_block->guest_memfd = -1; new_block->page_size = qemu_real_host_page_size(); @@ -2090,15 +2276,14 @@ RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags, MemoryRegion *mr, Error **errp) { - assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0); + assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD | + RAM_PRIVATE)) == 0); return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp); } RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz, - void (*resized)(const char*, - uint64_t length, - void *host), - MemoryRegion *mr, Error **errp) + qemu_ram_resize_cb resized, + MemoryRegion *mr, Error **errp) { return qemu_ram_alloc_internal(size, maxsz, resized, NULL, RAM_RESIZEABLE, mr, errp); @@ -2110,7 +2295,7 @@ static void reclaim_ramblock(RAMBlock *block) ; } else if (xen_enabled()) { xen_invalidate_map_cache_entry(block->host); -#ifndef _WIN32 +#if !defined(_WIN32) && !defined(EMSCRIPTEN) } else if (block->fd >= 0) { qemu_ram_munmap(block->fd, block->host, block->max_length); close(block->fd); @@ -2120,8 +2305,9 @@ static void reclaim_ramblock(RAMBlock *block) } if (block->guest_memfd >= 0) { + ram_block_attributes_destroy(block->attributes); close(block->guest_memfd); - ram_block_discard_require(false); + ram_block_coordinated_discard_require(false); } g_free(block); @@ -2129,6 +2315,8 @@ static void reclaim_ramblock(RAMBlock *block) void qemu_ram_free(RAMBlock *block) { + g_autofree char *name = NULL; + if (!block) { return; } @@ -2139,6 +2327,8 @@ void qemu_ram_free(RAMBlock *block) } qemu_mutex_lock_ramlist(); + name = cpr_name(block->mr); + cpr_delete_fd(name, 0); QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; /* Write list before version */ @@ -2149,45 +2339,80 @@ void qemu_ram_free(RAMBlock *block) } #ifndef _WIN32 -void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) +/* Simply remap the given VM memory location from start to start+length */ +static int qemu_ram_remap_mmap(RAMBlock *block, uint64_t start, size_t length) +{ + int flags, prot; + void *area; + void *host_startaddr = block->host + start; + + assert(block->fd < 0); + flags = MAP_FIXED | MAP_ANONYMOUS; + flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE; + flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0; + prot = PROT_READ; + prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE; + area = mmap(host_startaddr, length, prot, flags, -1, 0); + return area != host_startaddr ? -errno : 0; +} + +/* + * qemu_ram_remap - remap a single RAM page + * + * @addr: address in ram_addr_t address space. + * + * This function will try remapping a single page of guest RAM identified by + * @addr, essentially discarding memory to recover from previously poisoned + * memory (MCE). 
The page size depends on the RAMBlock (i.e., hugetlb). @addr + * does not have to point at the start of the page. + * + * This function is only to be used during system resets; it will kill the + * VM if remapping failed. + */ +void qemu_ram_remap(ram_addr_t addr) { RAMBlock *block; - ram_addr_t offset; - int flags; - void *area, *vaddr; - int prot; + uint64_t offset; + void *vaddr; + size_t page_size; RAMBLOCK_FOREACH(block) { offset = addr - block->offset; if (offset < block->max_length) { + /* Respect the pagesize of our RAMBlock */ + page_size = qemu_ram_pagesize(block); + offset = QEMU_ALIGN_DOWN(offset, page_size); + vaddr = ramblock_ptr(block, offset); if (block->flags & RAM_PREALLOC) { ; } else if (xen_enabled()) { abort(); } else { - flags = MAP_FIXED; - flags |= block->flags & RAM_SHARED ? - MAP_SHARED : MAP_PRIVATE; - flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0; - prot = PROT_READ; - prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE; - if (block->fd >= 0) { - area = mmap(vaddr, length, prot, flags, block->fd, - offset + block->fd_offset); - } else { - flags |= MAP_ANONYMOUS; - area = mmap(vaddr, length, prot, flags, -1, 0); - } - if (area != vaddr) { - error_report("Could not remap addr: " - RAM_ADDR_FMT "@" RAM_ADDR_FMT "", - length, addr); - exit(1); + if (ram_block_discard_range(block, offset, page_size) != 0) { + /* + * Fall back to using mmap() only for anonymous mapping, + * as if a backing file is associated we may not be able + * to recover the memory in all cases. + * So don't take the risk of using only mmap and fail now. + */ + if (block->fd >= 0) { + error_report("Could not remap RAM %s:%" PRIx64 "+%" + PRIx64 " +%zx", block->idstr, offset, + block->fd_offset, page_size); + exit(1); + } + if (qemu_ram_remap_mmap(block, offset, page_size) != 0) { + error_report("Could not remap RAM %s:%" PRIx64 " +%zx", + block->idstr, offset, page_size); + exit(1); + } } - memory_try_enable_merging(vaddr, length); - qemu_ram_setup_dump(vaddr, length); + memory_try_enable_merging(vaddr, page_size); + qemu_ram_setup_dump(vaddr, page_size); } + + break; } } } @@ -2485,23 +2710,6 @@ static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr) return phys_section_add(map, §ion); } -MemoryRegionSection *iotlb_to_section(CPUState *cpu, - hwaddr index, MemTxAttrs attrs) -{ - int asidx = cpu_asidx_from_attrs(cpu, attrs); - CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx]; - AddressSpaceDispatch *d = cpuas->memory_dispatch; - int section_index = index & ~TARGET_PAGE_MASK; - MemoryRegionSection *ret; - - assert(section_index < d->map.sections_nb); - ret = d->map.sections + section_index; - assert(ret->mr); - assert(ret->mr->ops); - - return ret; -} - static void io_mem_init(void) { memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL, @@ -2630,7 +2838,11 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, hwaddr length) { uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr); - addr += memory_region_get_ram_addr(mr); + ram_addr_t ramaddr = memory_region_get_ram_addr(mr); + + /* We know we're only called for RAM MemoryRegions */ + assert(ramaddr != RAM_ADDR_INVALID); + addr += ramaddr; /* No early return if dirty_log_mask is or becomes 0, because * cpu_physical_memory_set_dirty_range will still call @@ -2642,7 +2854,7 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, } if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) { assert(tcg_enabled()); - tb_invalidate_phys_range(addr, addr + length - 1); + 
tb_invalidate_phys_range(NULL, addr, addr + length - 1); dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE); } cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask); @@ -2723,7 +2935,7 @@ static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs, if (memory_region_is_ram(mr)) { return true; } - qemu_log_mask(LOG_GUEST_ERROR, + qemu_log_mask(LOG_INVALID_MEM, "Invalid access to non-RAM device at " "addr 0x%" HWADDR_PRIX ", size %" HWADDR_PRIu ", " "region '%s'\n", addr, len, memory_region_name(mr)); @@ -2739,7 +2951,7 @@ static MemTxResult flatview_write_continue_step(MemTxAttrs attrs, return MEMTX_ACCESS_ERROR; } - if (!memory_access_is_direct(mr, true)) { + if (!memory_access_is_direct(mr, true, attrs)) { uint64_t val; MemTxResult result; bool release_lock = prepare_mmio_access(mr); @@ -2835,7 +3047,7 @@ static MemTxResult flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf, return MEMTX_ACCESS_ERROR; } - if (!memory_access_is_direct(mr, false)) { + if (!memory_access_is_direct(mr, false, attrs)) { /* I/O case */ uint64_t val; MemTxResult result; @@ -3007,8 +3219,7 @@ static inline MemTxResult address_space_write_rom_internal(AddressSpace *as, l = len; mr = address_space_translate(as, addr, &addr1, &l, true, attrs); - if (!(memory_region_is_ram(mr) || - memory_region_is_romd(mr))) { + if (!memory_region_supports_direct_access(mr)) { l = memory_access_size(mr, l, addr1); } else { /* ROM/RAM case */ @@ -3056,6 +3267,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len) NULL, len, FLUSH_CACHE); } +/* + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used + * to detect illegal pointers passed to address_space_unmap. + */ +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed + +typedef struct { + uint64_t magic; + MemoryRegion *mr; + hwaddr addr; + size_t len; + uint8_t buffer[]; +} BounceBuffer; + static void address_space_unregister_map_client_do(AddressSpaceMapClient *client) { @@ -3081,9 +3306,9 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh) QEMU_LOCK_GUARD(&as->map_client_list_lock); client->bh = bh; QLIST_INSERT_HEAD(&as->map_client_list, client, link); - /* Write map_client_list before reading in_use. */ + /* Write map_client_list before reading bounce_buffer_size. 
*/ smp_mb(); - if (!qatomic_read(&as->bounce.in_use)) { + if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) { address_space_notify_map_clients_locked(as); } } @@ -3131,7 +3356,7 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, while (len > 0) { l = len; mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); - if (!memory_access_is_direct(mr, is_write)) { + if (!memory_access_is_direct(mr, is_write, attrs)) { l = memory_access_size(mr, l, addr); if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) { return false; @@ -3211,29 +3436,41 @@ void *address_space_map(AddressSpace *as, fv = address_space_to_flatview(as); mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); - if (!memory_access_is_direct(mr, is_write)) { - if (qatomic_xchg(&as->bounce.in_use, true)) { + if (!memory_access_is_direct(mr, is_write, attrs)) { + size_t used = qatomic_read(&as->bounce_buffer_size); + for (;;) { + hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); + size_t new_size = used + alloc; + size_t actual = + qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size); + if (actual == used) { + l = alloc; + break; + } + used = actual; + } + + if (l == 0) { *plen = 0; return NULL; } - /* Avoid unbounded allocations */ - l = MIN(l, TARGET_PAGE_SIZE); - as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); - as->bounce.addr = addr; - as->bounce.len = l; + BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer)); + bounce->magic = BOUNCE_BUFFER_MAGIC; memory_region_ref(mr); - as->bounce.mr = mr; + bounce->mr = mr; + bounce->addr = addr; + bounce->len = l; + if (!is_write) { - flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, - as->bounce.buffer, l); + flatview_read(fv, addr, attrs, + bounce->buffer, l); } *plen = l; - return as->bounce.buffer; + return bounce->buffer; } - memory_region_ref(mr); *plen = flatview_extend_translation(fv, addr, len, mr, xlat, l, is_write, attrs); @@ -3248,12 +3485,11 @@ void *address_space_map(AddressSpace *as, void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, bool is_write, hwaddr access_len) { - if (buffer != as->bounce.buffer) { - MemoryRegion *mr; - ram_addr_t addr1; + MemoryRegion *mr; + ram_addr_t addr1; - mr = memory_region_from_host(buffer, &addr1); - assert(mr != NULL); + mr = memory_region_from_host(buffer, &addr1); + if (mr != NULL) { if (is_write) { invalidate_and_set_dirty(mr, addr1, access_len); } @@ -3263,15 +3499,22 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, memory_region_unref(mr); return; } + + + BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer); + assert(bounce->magic == BOUNCE_BUFFER_MAGIC); + if (is_write) { - address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED, - as->bounce.buffer, access_len); - } - qemu_vfree(as->bounce.buffer); - as->bounce.buffer = NULL; - memory_region_unref(as->bounce.mr); - /* Clear in_use before reading map_client_list. */ - qatomic_set_mb(&as->bounce.in_use, false); + address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED, + bounce->buffer, access_len); + } + + qatomic_sub(&as->bounce_buffer_size, bounce->len); + bounce->magic = ~BOUNCE_BUFFER_MAGIC; + memory_region_unref(bounce->mr); + g_free(bounce); + /* Write bounce_buffer_size before reading map_client_list. 
*/ + smp_mb(); address_space_notify_map_clients(as); } @@ -3326,7 +3569,7 @@ int64_t address_space_cache_init(MemoryRegionCache *cache, mr = cache->mrs.mr; memory_region_ref(mr); - if (memory_access_is_direct(mr, is_write)) { + if (memory_access_is_direct(mr, is_write, MEMTXATTRS_UNSPECIFIED)) { /* We don't care about the memory attributes here as we're only * doing this if we found actual RAM, which behaves the same * regardless of attributes; so UNSPECIFIED is fine. @@ -3519,13 +3762,8 @@ int cpu_memory_rw_debug(CPUState *cpu, vaddr addr, if (l > len) l = len; phys_addr += (addr & ~TARGET_PAGE_MASK); - if (is_write) { - res = address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr, - attrs, buf, l); - } else { - res = address_space_read(cpu->cpu_ases[asidx].as, phys_addr, - attrs, buf, l); - } + res = address_space_rw(cpu->cpu_ases[asidx].as, phys_addr, attrs, buf, + l, is_write); if (res != MEMTX_OK) { return -1; } @@ -3635,18 +3873,19 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) } ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - start, length); + start + rb->fd_offset, length); if (ret) { ret = -errno; - error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)", - __func__, rb->idstr, start, length, ret); + error_report("%s: Failed to fallocate %s:%" PRIx64 "+%" PRIx64 + " +%zx (%d)", __func__, rb->idstr, start, + rb->fd_offset, length, ret); goto err; } #else ret = -ENOSYS; error_report("%s: fallocate not available/file" - "%s:%" PRIx64 " +%zx (%d)", - __func__, rb->idstr, start, length, ret); + "%s:%" PRIx64 "+%" PRIx64 " +%zx (%d)", __func__, + rb->idstr, start, rb->fd_offset, length, ret); goto err; #endif } @@ -3693,6 +3932,7 @@ int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t start, int ret = -1; #ifdef CONFIG_FALLOCATE_PUNCH_HOLE + /* ignore fd_offset with guest_memfd */ ret = fallocate(rb->guest_memfd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start, length); @@ -3897,3 +4137,58 @@ bool ram_block_discard_is_required(void) return qatomic_read(&ram_block_discard_required_cnt) || qatomic_read(&ram_block_coordinated_discard_required_cnt); } + +/* + * Return true if ram is compatible with CPR. Do not exclude rom, + * because the rom file could change in new QEMU. + */ +static bool ram_is_cpr_compatible(RAMBlock *rb) +{ + MemoryRegion *mr = rb->mr; + + if (!mr || !memory_region_is_ram(mr)) { + return true; + } + + /* Ram device is remapped in new QEMU */ + if (memory_region_is_ram_device(mr)) { + return true; + } + + /* + * A file descriptor is passed to new QEMU and remapped, or its backing + * file is reopened and mapped. It must be shared to avoid COW. + */ + if (rb->fd >= 0 && qemu_ram_is_shared(rb)) { + return true; + } + + return false; +} + +/* + * Add a blocker for each volatile ram block. This function should only be + * called after we know that the block is migratable. Non-migratable blocks + * are either re-created in new QEMU, or are handled specially, or are covered + * by a device-level CPR blocker. + */ +void ram_block_add_cpr_blocker(RAMBlock *rb, Error **errp) +{ + assert(qemu_ram_is_migratable(rb)); + + if (ram_is_cpr_compatible(rb)) { + return; + } + + error_setg(&rb->cpr_blocker, + "Memory region %s is not compatible with CPR. 
share=on is " + "required for memory-backend objects, and aux-ram-share=on is " + "required.", memory_region_name(rb->mr)); + migrate_add_blocker_modes(&rb->cpr_blocker, errp, MIG_MODE_CPR_TRANSFER, + -1); +} + +void ram_block_del_cpr_blocker(RAMBlock *rb) +{ + migrate_del_blocker(&rb->cpr_blocker); +} |
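
The largest behavioural change in this diff replaces the single per-AddressSpace bounce buffer guarded by bounce.in_use with a byte budget: address_space_map() reserves part of bounce_buffer_size (bounded by max_bounce_buffer_size) through a compare-and-swap loop, and address_space_unmap() gives the bytes back with qatomic_sub before waking map clients. The standalone C11 sketch below only models that reservation pattern; the helper names, the 4 KiB budget, and the demo values are illustrative, not QEMU APIs.

/*
 * Illustrative sketch (not QEMU code): the lock-free budget reservation
 * pattern used by the new bounce-buffer accounting. A shared counter tracks
 * bytes currently consumed; each request reserves as much of the remaining
 * budget as it needs via CAS and releases it again when the mapping is torn
 * down. Compile with a C11 compiler.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_BOUNCE_BYTES 4096            /* stand-in for max_bounce_buffer_size */

static _Atomic size_t bounce_bytes_in_use;   /* stand-in for bounce_buffer_size */

/* Reserve up to 'want' bytes from the budget; returns bytes granted (0 if full). */
static size_t bounce_reserve(size_t want)
{
    size_t used = atomic_load(&bounce_bytes_in_use);

    for (;;) {
        size_t avail = MAX_BOUNCE_BYTES - used;      /* never underflows: used <= MAX */
        size_t grant = want < avail ? want : avail;

        /* Publish the new total; on failure 'used' is refreshed and we retry. */
        if (atomic_compare_exchange_weak(&bounce_bytes_in_use, &used,
                                         used + grant)) {
            return grant;
        }
    }
}

/* Release a previously granted reservation (the unmap side). */
static void bounce_release(size_t len)
{
    atomic_fetch_sub(&bounce_bytes_in_use, len);
}

int main(void)
{
    size_t a = bounce_reserve(3000);   /* granted in full */
    size_t b = bounce_reserve(3000);   /* truncated to the remaining 1096 bytes */
    size_t c = bounce_reserve(100);    /* budget exhausted: granted 0 */

    printf("a=%zu b=%zu c=%zu in_use=%zu\n", a, b, c,
           atomic_load(&bounce_bytes_in_use));

    bounce_release(a);
    bounce_release(b);
    return 0;
}

Because a request that cannot be satisfied simply gets 0 bytes (the diff's "if (l == 0) { *plen = 0; return NULL; }" path), concurrent mappings no longer serialize on a single in_use flag, and unmap only has to subtract its own length before notifying waiters.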
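The other notable addition is the shared backing-fd path in qemu_ram_alloc_internal(): when aux-ram-share is set, anonymous RAM is backed by a descriptor from qemu_ram_get_shared_fd(), preferring memfd over POSIX shm because memfd is not constrained by the shm mount size, so the mapping can be re-shared with the new process during CPR. A minimal sketch of that fd strategy, assuming Linux with glibc 2.27+ and using hypothetical helper names, could look like this:

/*
 * Illustrative sketch (not QEMU code): allocate a shareable RAM backing fd,
 * preferring memfd and falling back to POSIX shm. The fd backs a MAP_SHARED
 * mapping, so another process mapping the same fd sees the same guest RAM
 * without copying. Link with -lrt on older glibc for shm_open().
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static int get_shared_ram_fd(const char *name, size_t size)
{
    int fd = -1;

#ifdef __linux__
    fd = memfd_create(name, 0);               /* anonymous, fd-only object */
#endif
    if (fd < 0) {
        /* Fallback: POSIX shm (limited by the shm mount size). Unlink right
         * away so only the descriptor keeps the object alive. */
        fd = shm_open("/ram-sketch", O_RDWR | O_CREAT | O_EXCL, 0600);
        if (fd < 0) {
            return -1;
        }
        shm_unlink("/ram-sketch");
    }
    if (ftruncate(fd, size) < 0) {             /* size the backing object */
        close(fd);
        return -1;
    }
    return fd;
}

int main(void)
{
    size_t len = 4096;
    int fd = get_shared_ram_fd("guest-ram", len);
    if (fd < 0) {
        fprintf(stderr, "alloc failed: %s\n", strerror(errno));
        return 1;
    }

    /* MAP_SHARED: a second process mapping this fd would see the same bytes. */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) {
        close(fd);
        return 1;
    }
    strcpy(p, "hello from the old process");
    printf("%s\n", p);
    munmap(p, len);
    close(fd);
    return 0;
}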