Diffstat (limited to 'accel/kvm/kvm-all.c')
-rw-r--r-- | accel/kvm/kvm-all.c | 622
1 file changed, 420 insertions(+), 202 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 64bf47a..d095d1b 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -28,13 +28,14 @@
 #include "hw/pci/msix.h"
 #include "hw/s390x/adapter.h"
 #include "gdbstub/enums.h"
-#include "sysemu/kvm_int.h"
-#include "sysemu/runstate.h"
-#include "sysemu/cpus.h"
-#include "sysemu/accel-blocker.h"
+#include "system/kvm_int.h"
+#include "system/runstate.h"
+#include "system/cpus.h"
+#include "system/accel-blocker.h"
 #include "qemu/bswap.h"
-#include "exec/memory.h"
-#include "exec/ram_addr.h"
+#include "exec/tswap.h"
+#include "system/memory.h"
+#include "system/ram_addr.h"
 #include "qemu/event_notifier.h"
 #include "qemu/main-loop.h"
 #include "trace.h"
@@ -42,21 +43,26 @@
 #include "qapi/visitor.h"
 #include "qapi/qapi-types-common.h"
 #include "qapi/qapi-visit-common.h"
-#include "sysemu/reset.h"
+#include "system/reset.h"
 #include "qemu/guest-random.h"
-#include "sysemu/hw_accel.h"
+#include "system/hw_accel.h"
 #include "kvm-cpus.h"
-#include "sysemu/dirtylimit.h"
+#include "system/dirtylimit.h"
 #include "qemu/range.h"
 #include "hw/boards.h"
-#include "sysemu/stats.h"
+#include "system/stats.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
 
+#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
+# define KVM_HAVE_MCE_INJECTION 1
+#endif
+
+
 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
  * need to use the real host PAGE_SIZE, as that's what KVM will use.
  */
@@ -69,6 +75,11 @@
 #define KVM_GUESTDBG_BLOCKIRQ 0
 #endif
 
+/* Default number of memslots to be allocated when VM starts */
+#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16
+/* Default max allowed memslots if kernel reported nothing */
+#define KVM_MEMSLOTS_NR_MAX_DEFAULT 32
+
 struct KVMParkedVcpu {
     unsigned long vcpu_id;
     int kvm_fd;
@@ -88,6 +99,7 @@ bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
 bool kvm_vm_attributes_allowed;
 bool kvm_msi_use_devid;
+bool kvm_pre_fault_memory_supported;
 static bool kvm_has_guest_debug;
 static int kvm_sstep_flags;
 static bool kvm_immediate_exit;
@@ -165,11 +177,62 @@ void kvm_resample_fd_notify(int gsi)
     }
 }
 
+/**
+ * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
+ *
+ * @kml: The KVMMemoryListener* to grow the slots[] array
+ * @nr_slots_new: The new size of slots[] array
+ *
+ * Returns: True if the array grows larger, false otherwise.
+ */
+static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
+{
+    unsigned int i, cur = kml->nr_slots_allocated;
+    KVMSlot *slots;
+
+    if (nr_slots_new > kvm_state->nr_slots_max) {
+        nr_slots_new = kvm_state->nr_slots_max;
+    }
+
+    if (cur >= nr_slots_new) {
+        /* Big enough, no need to grow, or we reached max */
+        return false;
+    }
+
+    if (cur == 0) {
+        slots = g_new0(KVMSlot, nr_slots_new);
+    } else {
+        assert(kml->slots);
+        slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
+        /*
+         * g_renew() doesn't initialize extended buffers, however kvm
+         * memslots require fields to be zero-initialized. E.g. pointers,
+         * memory_size field, etc.
+         */
+        memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
+    }
+
+    for (i = cur; i < nr_slots_new; i++) {
+        slots[i].slot = i;
+    }
+
+    kml->slots = slots;
+    kml->nr_slots_allocated = nr_slots_new;
+    trace_kvm_slots_grow(cur, nr_slots_new);
+
+    return true;
+}
+
+static bool kvm_slots_double(KVMMemoryListener *kml)
+{
+    return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
+}
+
 unsigned int kvm_get_max_memslots(void)
 {
     KVMState *s = KVM_STATE(current_accel());
 
-    return s->nr_slots;
+    return s->nr_slots_max;
 }
 
 unsigned int kvm_get_free_memslots(void)
@@ -183,25 +246,36 @@ unsigned int kvm_get_free_memslots(void)
         if (!s->as[i].ml) {
             continue;
         }
-        used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots);
+        used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
     }
     kvm_slots_unlock();
 
-    return s->nr_slots - used_slots;
+    return s->nr_slots_max - used_slots;
 }
 
 /* Called with KVMMemoryListener.slots_lock held */
 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
 {
-    KVMState *s = kvm_state;
+    unsigned int n;
     int i;
 
-    for (i = 0; i < s->nr_slots; i++) {
+    for (i = 0; i < kml->nr_slots_allocated; i++) {
         if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }
 
+    /*
+     * If no free slots, try to grow first by doubling. Cache the old size
+     * here to avoid another round of search: if the grow succeeded, it
+     * means slots[] now must have the existing "n" slots occupied,
+     * followed by one or more free slots starting from slots[n].
+     */
+    n = kml->nr_slots_allocated;
+    if (kvm_slots_double(kml)) {
+        return &kml->slots[n];
+    }
+
     return NULL;
 }
 
@@ -222,10 +296,9 @@ static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                          hwaddr start_addr,
                                          hwaddr size)
 {
-    KVMState *s = kvm_state;
     int i;
 
-    for (i = 0; i < s->nr_slots; i++) {
+    for (i = 0; i < kml->nr_slots_allocated; i++) {
         KVMSlot *mem = &kml->slots[i];
 
         if (start_addr == mem->start_addr && size == mem->memory_size) {
@@ -267,7 +340,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
     int i, ret = 0;
 
     kvm_slots_lock();
-    for (i = 0; i < s->nr_slots; i++) {
+    for (i = 0; i < kml->nr_slots_allocated; i++) {
         KVMSlot *mem = &kml->slots[i];
 
         if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
@@ -340,14 +413,95 @@ err:
     return ret;
 }
 
+void kvm_park_vcpu(CPUState *cpu)
+{
+    struct KVMParkedVcpu *vcpu;
+
+    trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
+
+    vcpu = g_malloc0(sizeof(*vcpu));
+    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
+    vcpu->kvm_fd = cpu->kvm_fd;
+    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+}
+
+int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+    struct KVMParkedVcpu *cpu;
+    int kvm_fd = -ENOENT;
+
+    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+        if (cpu->vcpu_id == vcpu_id) {
+            QLIST_REMOVE(cpu, node);
+            kvm_fd = cpu->kvm_fd;
+            g_free(cpu);
+            break;
+        }
+    }
+
+    trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");
+
+    return kvm_fd;
+}
+
+static void kvm_reset_parked_vcpus(KVMState *s)
+{
+    struct KVMParkedVcpu *cpu;
+
+    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+        kvm_arch_reset_parked_vcpu(cpu->vcpu_id, cpu->kvm_fd);
+    }
+}
+
+int kvm_create_vcpu(CPUState *cpu)
+{
+    unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
+    KVMState *s = kvm_state;
+    int kvm_fd;
+
+    /* check if the KVM vCPU already exists but is parked */
+    kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
+    if (kvm_fd < 0) {
+        /* vCPU not parked: create a new KVM vCPU */
+        kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
+        if (kvm_fd < 0) {
+            error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
+            return kvm_fd;
+        }
+    }
+
+    cpu->kvm_fd = kvm_fd;
+    cpu->kvm_state = s;
+    if (!s->guest_state_protected) {
+        cpu->vcpu_dirty = true;
+    }
+    cpu->dirty_pages = 0;
+    cpu->throttle_us_per_full = 0;
+
+    trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);
+
+    return 0;
+}
+
+int kvm_create_and_park_vcpu(CPUState *cpu)
+{
+    int ret = 0;
+
+    ret = kvm_create_vcpu(cpu);
+    if (!ret) {
+        kvm_park_vcpu(cpu);
+    }
+
+    return ret;
+}
+
 static int do_kvm_destroy_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
-    long mmap_size;
-    struct KVMParkedVcpu *vcpu = NULL;
+    int mmap_size;
     int ret = 0;
 
-    trace_kvm_destroy_vcpu();
+    trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
 
     ret = kvm_arch_destroy_vcpu(cpu);
     if (ret < 0) {
@@ -373,10 +527,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
         }
     }
 
-    vcpu = g_malloc0(sizeof(*vcpu));
-    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
-    vcpu->kvm_fd = cpu->kvm_fd;
-    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+    kvm_park_vcpu(cpu);
 err:
     return ret;
 }
@@ -389,44 +540,26 @@ void kvm_destroy_vcpu(CPUState *cpu)
     }
 }
 
-static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
-{
-    struct KVMParkedVcpu *cpu;
-
-    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
-        if (cpu->vcpu_id == vcpu_id) {
-            int kvm_fd;
-
-            QLIST_REMOVE(cpu, node);
-            kvm_fd = cpu->kvm_fd;
-            g_free(cpu);
-            return kvm_fd;
-        }
-    }
-
-    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
-}
-
 int kvm_init_vcpu(CPUState *cpu, Error **errp)
 {
     KVMState *s = kvm_state;
-    long mmap_size;
+    int mmap_size;
     int ret;
 
     trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
 
-    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
+    ret = kvm_arch_pre_create_vcpu(cpu, errp);
     if (ret < 0) {
-        error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
-                         kvm_arch_vcpu_id(cpu));
         goto err;
     }
 
-    cpu->kvm_fd = ret;
-    cpu->kvm_state = s;
-    cpu->vcpu_dirty = true;
-    cpu->dirty_pages = 0;
-    cpu->throttle_us_per_full = 0;
+    ret = kvm_create_vcpu(cpu);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret,
+                         "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
+                         kvm_arch_vcpu_id(cpu));
+        goto err;
+    }
 
     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
     if (mmap_size < 0) {
@@ -1027,7 +1160,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
 
     kvm_slots_lock();
 
-    for (i = 0; i < s->nr_slots; i++) {
+    for (i = 0; i < kml->nr_slots_allocated; i++) {
         mem = &kml->slots[i];
         /* Discard slots that are empty or do not overlap the section */
         if (!mem->memory_size ||
@@ -1168,7 +1301,7 @@ static void kvm_unpoison_all(void *param)
 
     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
         QLIST_REMOVE(page, list);
-        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
+        qemu_ram_remap(page->ram_addr);
         g_free(page);
     }
 }
@@ -1194,21 +1327,22 @@ bool kvm_hwpoisoned_mem(void)
 
 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
 {
-#if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
-    /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
-     * endianness, but the memory core hands them in target endianness.
-     * For example, PPC is always treated as big-endian even if running
-     * on KVM and on PPC64LE. Correct here.
-     */
-    switch (size) {
-    case 2:
-        val = bswap16(val);
-        break;
-    case 4:
-        val = bswap32(val);
-        break;
+    if (target_needs_bswap()) {
+        /*
+         * The kernel expects ioeventfd values in HOST_BIG_ENDIAN
+         * endianness, but the memory core hands them in target endianness.
+         * For example, PPC is always treated as big-endian even if running
+         * on KVM and on PPC64LE. Correct here, swapping back.
+         */
+        switch (size) {
+        case 2:
+            val = bswap16(val);
+            break;
+        case 4:
+            val = bswap32(val);
+            break;
+        }
     }
-#endif
     return val;
 }
 
@@ -1406,7 +1540,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
             }
             start_addr += slot_size;
             size -= slot_size;
-            kml->nr_used_slots--;
+            kml->nr_slots_used--;
         } while (size);
         return;
     }
@@ -1445,7 +1579,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
         ram_start_offset += slot_size;
         ram += slot_size;
         size -= slot_size;
-        kml->nr_used_slots++;
+        kml->nr_slots_used++;
     } while (size);
 }
 
@@ -1481,11 +1615,7 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
         r->reaper_iteration++;
     }
 
-    trace_kvm_dirty_ring_reaper("exit");
-
-    rcu_unregister_thread();
-
-    return NULL;
+    g_assert_not_reached();
 }
 
 static void kvm_dirty_ring_reaper_init(KVMState *s)
@@ -1675,12 +1805,8 @@ static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
     /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
     kvm_dirty_ring_flush();
 
-    /*
-     * TODO: make this faster when nr_slots is big while there are
-     * only a few used slots (small VMs).
-     */
     kvm_slots_lock();
-    for (i = 0; i < s->nr_slots; i++) {
+    for (i = 0; i < kml->nr_slots_allocated; i++) {
         mem = &kml->slots[i];
         if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
             kvm_slot_sync_dirty_pages(mem);
@@ -1795,12 +1921,9 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
 {
     int i;
 
-    kml->slots = g_new0(KVMSlot, s->nr_slots);
     kml->as_id = as_id;
 
-    for (i = 0; i < s->nr_slots; i++) {
-        kml->slots[i].slot = i;
-    }
+    kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
 
     QSIMPLEQ_INIT(&kml->transaction_add);
     QSIMPLEQ_INIT(&kml->transaction_del);
@@ -2311,7 +2434,7 @@ static int kvm_recommended_vcpus(KVMState *s)
 
 static int kvm_max_vcpus(KVMState *s)
 {
-    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
+    int ret = kvm_vm_check_extension(s, KVM_CAP_MAX_VCPUS);
 
     return (ret) ? ret : kvm_recommended_vcpus(s);
 }
@@ -2341,6 +2464,109 @@ uint32_t kvm_dirty_ring_size(void)
     return kvm_state->kvm_dirty_ring_size;
 }
 
+static int do_kvm_create_vm(MachineState *ms, int type)
+{
+    KVMState *s;
+    int ret;
+
+    s = KVM_STATE(ms->accelerator);
+
+    do {
+        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
+    } while (ret == -EINTR);
+
+    if (ret < 0) {
+        error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));
+
+#ifdef TARGET_S390X
+        if (ret == -EINVAL) {
+            error_printf("Host kernel setup problem detected."
+                         " Please verify:\n");
+            error_printf("- for kernels supporting the"
+                         " switch_amode or user_mode parameters, whether");
+            error_printf(" user space is running in primary address space\n");
+            error_printf("- for kernels supporting the vm.allocate_pgste"
+                         " sysctl, whether it is enabled\n");
+        }
+#elif defined(TARGET_PPC)
+        if (ret == -EINVAL) {
+            error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
                         (type == 2) ? "pr" : "hv");
+        }
+#endif
+    }
+
+    return ret;
+}
+
+static int find_kvm_machine_type(MachineState *ms)
+{
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
+    int type;
+
+    if (object_property_find(OBJECT(current_machine), "kvm-type")) {
+        g_autofree char *kvm_type;
+
+        kvm_type = object_property_get_str(OBJECT(current_machine),
+                                           "kvm-type",
+                                           &error_abort);
+        type = mc->kvm_type(ms, kvm_type);
+    } else if (mc->kvm_type) {
+        type = mc->kvm_type(ms, NULL);
+    } else {
+        type = kvm_arch_get_default_type(ms);
+    }
+    return type;
+}
+
+static int kvm_setup_dirty_ring(KVMState *s)
+{
+    uint64_t dirty_log_manual_caps;
+    int ret;
+
+    /*
+     * Enable KVM dirty ring if supported, otherwise fall back to
+     * dirty logging mode
+     */
+    ret = kvm_dirty_ring_init(s);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /*
+     * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
+     * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
+     * page is wr-protected initially, which is against how kvm dirty ring is
+     * used - kvm dirty ring requires all pages to be wr-protected at the very
+     * beginning.  Enabling this feature for dirty ring causes data corruption.
+     *
+     * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
+     * we may expect a higher stall time when starting the migration.  In the
+     * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
+     * instead of clearing dirty bit, it can be a way to explicitly wr-protect
+     * guest pages.
+     */
+    if (!s->kvm_dirty_ring_size) {
+        dirty_log_manual_caps =
+            kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+        dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+                                  KVM_DIRTY_LOG_INITIALLY_SET);
+        s->manual_dirty_log_protect = dirty_log_manual_caps;
+        if (dirty_log_manual_caps) {
+            ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
+                                    dirty_log_manual_caps);
+            if (ret) {
+                warn_report("Trying to enable capability %"PRIu64" of "
+                            "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
+                            "Falling back to the legacy mode. ",
+                            dirty_log_manual_caps);
+                s->manual_dirty_log_protect = 0;
+            }
+        }
+    }
+
+    return 0;
+}
+
 static int kvm_init(MachineState *ms)
 {
     MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -2360,7 +2586,6 @@ static int kvm_init(MachineState *ms)
     const KVMCapabilityInfo *missing_cap;
     int ret;
     int type;
-    uint64_t dirty_log_manual_caps;
 
     qemu_mutex_init(&kml_slots_lock);
 
@@ -2383,7 +2608,7 @@ static int kvm_init(MachineState *ms)
     QLIST_INIT(&s->kvm_parked_vcpus);
     s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
     if (s->fd == -1) {
-        fprintf(stderr, "Could not access KVM kernel module: %m\n");
+        error_report("Could not access KVM kernel module: %m");
         ret = -errno;
         goto err;
     }
@@ -2393,84 +2618,43 @@ static int kvm_init(MachineState *ms)
         if (ret >= 0) {
             ret = -EINVAL;
         }
-        fprintf(stderr, "kvm version too old\n");
+        error_report("kvm version too old");
         goto err;
     }
 
     if (ret > KVM_API_VERSION) {
         ret = -EINVAL;
-        fprintf(stderr, "kvm version not supported\n");
+        error_report("kvm version not supported");
        goto err;
    }
 
-    kvm_supported_memory_attributes = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
-    kvm_guest_memfd_supported =
-        kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
-        kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
-        (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
-
     kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
-    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
+    s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
 
     /* If unspecified, use the default value */
-    if (!s->nr_slots) {
-        s->nr_slots = 32;
-    }
-
-    s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
-    if (s->nr_as <= 1) {
-        s->nr_as = 1;
-    }
-    s->as = g_new0(struct KVMAs, s->nr_as);
-
-    if (object_property_find(OBJECT(current_machine), "kvm-type")) {
-        g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
-                                                            "kvm-type",
-                                                            &error_abort);
-        type = mc->kvm_type(ms, kvm_type);
-    } else if (mc->kvm_type) {
-        type = mc->kvm_type(ms, NULL);
-    } else {
-        type = kvm_arch_get_default_type(ms);
+    if (!s->nr_slots_max) {
+        s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
     }
 
+    type = find_kvm_machine_type(ms);
     if (type < 0) {
         ret = -EINVAL;
         goto err;
     }
 
-    do {
-        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
-    } while (ret == -EINTR);
-
+    ret = do_kvm_create_vm(ms, type);
     if (ret < 0) {
-        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
-                strerror(-ret));
-
-#ifdef TARGET_S390X
-        if (ret == -EINVAL) {
-            fprintf(stderr,
-                    "Host kernel setup problem detected. Please verify:\n");
-            fprintf(stderr, "- for kernels supporting the switch_amode or"
-                    " user_mode parameters, whether\n");
-            fprintf(stderr,
-                    " user space is running in primary address space\n");
-            fprintf(stderr,
-                    "- for kernels supporting the vm.allocate_pgste sysctl, "
-                    "whether it is enabled\n");
-        }
-#elif defined(TARGET_PPC)
-        if (ret == -EINVAL) {
-            fprintf(stderr,
-                    "PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
-                    (type == 2) ? "pr" : "hv");
-        }
-#endif
         goto err;
     }
 
     s->vmfd = ret;
 
+    s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
+    if (s->nr_as <= 1) {
+        s->nr_as = 1;
+    }
+    s->as = g_new0(struct KVMAs, s->nr_as);
+
     /* check the vcpu limits */
     soft_vcpus_limit = kvm_recommended_vcpus(s);
     hard_vcpus_limit = kvm_max_vcpus(s);
@@ -2482,9 +2666,9 @@ static int kvm_init(MachineState *ms)
                         nc->name, nc->num, soft_vcpus_limit);
 
             if (nc->num > hard_vcpus_limit) {
-                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
-                        "the maximum cpus supported by KVM (%d)\n",
-                        nc->name, nc->num, hard_vcpus_limit);
+                error_report("Number of %s cpus requested (%d) exceeds "
+                             "the maximum cpus supported by KVM (%d)",
+                             nc->name, nc->num, hard_vcpus_limit);
                 exit(1);
             }
         }
@@ -2498,8 +2682,8 @@ static int kvm_init(MachineState *ms)
     }
     if (missing_cap) {
         ret = -EINVAL;
-        fprintf(stderr, "kvm does not support %s\n%s",
-                missing_cap->name, upgrade_note);
+        error_report("kvm does not support %s", missing_cap->name);
+        error_printf("%s", upgrade_note);
         goto err;
     }
 
@@ -2507,47 +2691,11 @@ static int kvm_init(MachineState *ms)
     s->coalesced_pio = s->coalesced_mmio &&
                        kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
 
-    /*
-     * Enable KVM dirty ring if supported, otherwise fall back to
-     * dirty logging mode
-     */
-    ret = kvm_dirty_ring_init(s);
+    ret = kvm_setup_dirty_ring(s);
     if (ret < 0) {
         goto err;
     }
 
-    /*
-     * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
-     * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
-     * page is wr-protected initially, which is against how kvm dirty ring is
-     * used - kvm dirty ring requires all pages to be wr-protected at the very
-     * beginning.  Enabling this feature for dirty ring causes data corruption.
-     *
-     * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
-     * we may expect a higher stall time when starting the migration.  In the
-     * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
-     * instead of clearing dirty bit, it can be a way to explicitly wr-protect
-     * guest pages.
-     */
-    if (!s->kvm_dirty_ring_size) {
-        dirty_log_manual_caps =
-            kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
-        dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
-                                  KVM_DIRTY_LOG_INITIALLY_SET);
-        s->manual_dirty_log_protect = dirty_log_manual_caps;
-        if (dirty_log_manual_caps) {
-            ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
-                                    dirty_log_manual_caps);
-            if (ret) {
-                warn_report("Trying to enable capability %"PRIu64" of "
-                            "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
-                            "Falling back to the legacy mode. ",
-                            dirty_log_manual_caps);
-                s->manual_dirty_log_protect = 0;
-            }
-        }
-    }
-
 #ifdef KVM_CAP_VCPU_EVENTS
     s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
 #endif
@@ -2559,7 +2707,7 @@ static int kvm_init(MachineState *ms)
     }
 
     kvm_readonly_mem_allowed =
-        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
+        (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
 
     kvm_resamplefds_allowed =
         (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
@@ -2593,6 +2741,13 @@
         goto err;
     }
 
+    kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
+    kvm_guest_memfd_supported =
+        kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
+        kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
+        (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
+    kvm_pre_fault_memory_supported = kvm_vm_check_extension(s, KVM_CAP_PRE_FAULT_MEMORY);
+
     if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
         s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
     }
@@ -2722,9 +2877,15 @@ void kvm_flush_coalesced_mmio_buffer(void)
 
 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
 {
     if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
-        int ret = kvm_arch_get_registers(cpu);
+        Error *err = NULL;
+        int ret = kvm_arch_get_registers(cpu, &err);
         if (ret) {
-            error_report("Failed to get registers: %s", strerror(-ret));
+            if (err) {
+                error_reportf_err(err, "Failed to synchronize CPU state: ");
+            } else {
+                error_report("Failed to get registers: %s", strerror(-ret));
+            }
+
             cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
             vm_stop(RUN_STATE_INTERNAL_ERROR);
         }
@@ -2742,9 +2903,15 @@ void kvm_cpu_synchronize_state(CPUState *cpu)
 
 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
 {
-    int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
+    Error *err = NULL;
+    int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
     if (ret) {
-        error_report("Failed to put registers after reset: %s", strerror(-ret));
+        if (err) {
+            error_reportf_err(err, "Restoring registers after reset: ");
+        } else {
+            error_report("Failed to put registers after reset: %s",
+                         strerror(-ret));
+        }
         cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
         vm_stop(RUN_STATE_INTERNAL_ERROR);
     }
@@ -2755,13 +2922,23 @@ static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg
 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
 {
     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
+
+    if (cpu == first_cpu) {
+        kvm_reset_parked_vcpus(kvm_state);
+    }
 }
 
 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
 {
-    int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
+    Error *err = NULL;
+    int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);
     if (ret) {
-        error_report("Failed to put registers after init: %s", strerror(-ret));
+        if (err) {
+            error_reportf_err(err, "Putting registers after init: ");
+        } else {
+            error_report("Failed to put registers after init: %s",
+                         strerror(-ret));
+        }
         exit(1);
     }
 
@@ -2851,17 +3028,17 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
     MemoryRegion *mr;
     RAMBlock *rb;
     void *addr;
-    int ret = -1;
+    int ret = -EINVAL;
 
     trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
 
     if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
         !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
-        return -1;
+        return ret;
     }
 
     if (!size) {
-        return -1;
+        return ret;
     }
 
     section = memory_region_find(get_system_memory(), start, size);
@@ -2879,7 +3056,7 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
         if (!to_private) {
             return 0;
         }
-        return -1;
+        return ret;
     }
 
     if (!memory_region_has_guest_memfd(mr)) {
@@ -2914,6 +3091,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
     addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
     rb = qemu_ram_block_from_host(addr, false, &offset);
 
+    ret = ram_block_attributes_state_change(RAM_BLOCK_ATTRIBUTES(mr->rdm),
+                                            offset, size, to_private);
+    if (ret) {
+        error_report("Failed to notify the listener the state change of "
+                     "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s",
+                     start, size, to_private ? "private" : "shared");
+        goto out_unref;
+    }
+
     if (to_private) {
         if (rb->page_size != qemu_real_host_page_size()) {
             /*
@@ -2951,10 +3137,15 @@ int kvm_cpu_exec(CPUState *cpu)
     MemTxAttrs attrs;
 
     if (cpu->vcpu_dirty) {
-        ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
+        Error *err = NULL;
+        ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
         if (ret) {
-            error_report("Failed to put registers after init: %s",
-                         strerror(-ret));
+            if (err) {
+                error_reportf_err(err, "Putting registers after init: ");
+            } else {
+                error_report("Failed to put registers after init: %s",
+                             strerror(-ret));
+            }
             ret = -1;
             break;
         }
@@ -3126,7 +3317,7 @@ int kvm_cpu_exec(CPUState *cpu)
     return ret;
 }
 
-int kvm_ioctl(KVMState *s, int type, ...)
+int kvm_ioctl(KVMState *s, unsigned long type, ...)
 {
     int ret;
     void *arg;
@@ -3144,7 +3335,7 @@ int kvm_ioctl(KVMState *s, int type, ...)
     return ret;
 }
 
-int kvm_vm_ioctl(KVMState *s, int type, ...)
+int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
 {
     int ret;
     void *arg;
@@ -3164,7 +3355,7 @@ int kvm_vm_ioctl(KVMState *s, int type, ...)
     return ret;
 }
 
-int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
+int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
 {
     int ret;
     void *arg;
@@ -3184,7 +3375,7 @@ int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
     return ret;
 }
 
-int kvm_device_ioctl(int fd, int type, ...)
+int kvm_device_ioctl(int fd, unsigned long type, ...)
 {
     int ret;
     void *arg;
@@ -3745,6 +3936,21 @@ static void kvm_set_device(Object *obj,
     s->device = g_strdup(value);
 }
 
+static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
+{
+    KVMState *s = KVM_STATE(obj);
+    s->msr_energy.enable = value;
+}
+
+static void kvm_set_kvm_rapl_socket_path(Object *obj,
+                                         const char *str,
+                                         Error **errp)
+{
+    KVMState *s = KVM_STATE(obj);
+    g_free(s->msr_energy.socket_path);
+    s->msr_energy.socket_path = g_strdup(str);
+}
+
 static void kvm_accel_instance_init(Object *obj)
 {
     KVMState *s = KVM_STATE(obj);
@@ -3764,6 +3970,7 @@ static void kvm_accel_instance_init(Object *obj)
     s->xen_gnttab_max_frames = 64;
     s->xen_evtchn_max_pirq = 256;
     s->device = NULL;
+    s->msr_energy.enable = false;
 }
 
 /**
@@ -3777,7 +3984,7 @@ static int kvm_gdbstub_sstep_flags(void)
     return kvm_sstep_flags;
 }
 
-static void kvm_accel_class_init(ObjectClass *oc, void *data)
+static void kvm_accel_class_init(ObjectClass *oc, const void *data)
 {
     AccelClass *ac = ACCEL_CLASS(oc);
     ac->name = "KVM";
@@ -3808,6 +4015,17 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
     object_class_property_set_description(oc, "device",
         "Path to the device node to use (default: /dev/kvm)");
 
+    object_class_property_add_bool(oc, "rapl",
+                                   NULL,
+                                   kvm_set_kvm_rapl);
+    object_class_property_set_description(oc, "rapl",
+        "Allow energy related MSRs for RAPL interface in Guest");
+
+    object_class_property_add_str(oc, "rapl-helper-socket", NULL,
+                                  kvm_set_kvm_rapl_socket_path);
+    object_class_property_set_description(oc, "rapl-helper-socket",
+        "Socket path for communicating with the Virtual MSR helper daemon");
+
     kvm_arch_accel_class_init(oc);
 }
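The memslot hunks above replace the fixed, maximally sized slots[] allocation
with on-demand growth: each listener starts at KVM_MEMSLOTS_NR_ALLOC_DEFAULT
(16) and doubles via kvm_slots_double(), capped at nr_slots_max reported by
KVM_CAP_NR_MEMSLOTS. The following standalone sketch illustrates the same
grow-and-zero-the-tail pattern; the Slot and Listener types here are
hypothetical stand-ins for illustration, not QEMU's
KVMSlot/KVMMemoryListener.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Reduced stand-ins carrying only the fields the growth logic touches. */
typedef struct { unsigned slot; size_t memory_size; } Slot;
typedef struct { Slot *slots; unsigned nr_allocated; } Listener;

enum { NR_ALLOC_DEFAULT = 16, NR_MAX = 32 };   /* mirrors the new defines */

/* Grow slots[] to nr_new entries, clamped at NR_MAX; true if it grew.
 * As in the patch, the tail added by realloc() must be zero-initialized,
 * because a slot with memory_size == 0 is treated as free. */
static bool slots_grow(Listener *l, unsigned nr_new)
{
    unsigned cur = l->nr_allocated;

    if (nr_new > NR_MAX) {
        nr_new = NR_MAX;
    }
    if (cur >= nr_new) {
        return false;                  /* big enough, or already at max */
    }

    Slot *grown = realloc(l->slots, nr_new * sizeof(Slot));
    assert(grown);
    memset(&grown[cur], 0, (nr_new - cur) * sizeof(Slot));
    for (unsigned i = cur; i < nr_new; i++) {
        grown[i].slot = i;             /* slot ids stay stable across growth */
    }
    l->slots = grown;
    l->nr_allocated = nr_new;
    return true;
}

int main(void)
{
    Listener l = { 0 };

    slots_grow(&l, NR_ALLOC_DEFAULT);        /* initial allocation: 16 */
    slots_grow(&l, l.nr_allocated * 2);      /* doubling: 16 -> 32 */
    printf("allocated %u, grew again: %d\n", l.nr_allocated,
           slots_grow(&l, l.nr_allocated * 2));   /* clamped: prints 0 */
    free(l.slots);
    return 0;
}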