Diffstat (limited to 'accel/kvm/kvm-all.c')
-rw-r--r-- accel/kvm/kvm-all.c | 622
1 file changed, 420 insertions(+), 202 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 64bf47a..d095d1b 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -28,13 +28,14 @@
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "gdbstub/enums.h"
-#include "sysemu/kvm_int.h"
-#include "sysemu/runstate.h"
-#include "sysemu/cpus.h"
-#include "sysemu/accel-blocker.h"
+#include "system/kvm_int.h"
+#include "system/runstate.h"
+#include "system/cpus.h"
+#include "system/accel-blocker.h"
#include "qemu/bswap.h"
-#include "exec/memory.h"
-#include "exec/ram_addr.h"
+#include "exec/tswap.h"
+#include "system/memory.h"
+#include "system/ram_addr.h"
#include "qemu/event_notifier.h"
#include "qemu/main-loop.h"
#include "trace.h"
@@ -42,21 +43,26 @@
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
-#include "sysemu/reset.h"
+#include "system/reset.h"
#include "qemu/guest-random.h"
-#include "sysemu/hw_accel.h"
+#include "system/hw_accel.h"
#include "kvm-cpus.h"
-#include "sysemu/dirtylimit.h"
+#include "system/dirtylimit.h"
#include "qemu/range.h"
#include "hw/boards.h"
-#include "sysemu/stats.h"
+#include "system/stats.h"
/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif
+#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
+# define KVM_HAVE_MCE_INJECTION 1
+#endif
+
+
/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
* need to use the real host PAGE_SIZE, as that's what KVM will use.
*/
@@ -69,6 +75,11 @@
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif
+/* Default number of memslots to allocate when the VM starts */
+#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16
+/* Default max allowed memslots if kernel reported nothing */
+#define KVM_MEMSLOTS_NR_MAX_DEFAULT 32
+
struct KVMParkedVcpu {
unsigned long vcpu_id;
int kvm_fd;
@@ -88,6 +99,7 @@ bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_msi_use_devid;
+bool kvm_pre_fault_memory_supported;
static bool kvm_has_guest_debug;
static int kvm_sstep_flags;
static bool kvm_immediate_exit;
@@ -165,11 +177,62 @@ void kvm_resample_fd_notify(int gsi)
}
}
+/**
+ * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
+ *
+ * @kml: The KVMMemoryListener* to grow the slots[] array
+ * @nr_slots_new: The new size of slots[] array
+ *
+ * Returns: True if the array grows larger, false otherwise.
+ */
+static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
+{
+ unsigned int i, cur = kml->nr_slots_allocated;
+ KVMSlot *slots;
+
+ if (nr_slots_new > kvm_state->nr_slots_max) {
+ nr_slots_new = kvm_state->nr_slots_max;
+ }
+
+ if (cur >= nr_slots_new) {
+ /* Big enough, no need to grow, or we reached max */
+ return false;
+ }
+
+ if (cur == 0) {
+ slots = g_new0(KVMSlot, nr_slots_new);
+ } else {
+ assert(kml->slots);
+ slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
+ /*
+ * g_renew() doesn't initialize the extended buffer; however, kvm
+ * memslots require their fields to be zero-initialized (e.g. pointers,
+ * the memory_size field, etc.).
+ */
+ memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
+ }
+
+ for (i = cur; i < nr_slots_new; i++) {
+ slots[i].slot = i;
+ }
+
+ kml->slots = slots;
+ kml->nr_slots_allocated = nr_slots_new;
+ trace_kvm_slots_grow(cur, nr_slots_new);
+
+ return true;
+}
+
+static bool kvm_slots_double(KVMMemoryListener *kml)
+{
+ return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
+}
+
unsigned int kvm_get_max_memslots(void)
{
KVMState *s = KVM_STATE(current_accel());
- return s->nr_slots;
+ return s->nr_slots_max;
}
unsigned int kvm_get_free_memslots(void)
@@ -183,25 +246,36 @@ unsigned int kvm_get_free_memslots(void)
if (!s->as[i].ml) {
continue;
}
- used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots);
+ used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
}
kvm_slots_unlock();
- return s->nr_slots - used_slots;
+ return s->nr_slots_max - used_slots;
}
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
- KVMState *s = kvm_state;
+ unsigned int n;
int i;
- for (i = 0; i < s->nr_slots; i++) {
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
if (kml->slots[i].memory_size == 0) {
return &kml->slots[i];
}
}
+ /*
+ * If no free slots, try to grow first by doubling. Cache the old size
+ * here to avoid another round of search: if the grow succeeded, it
+ * means slots[] now must have the existing "n" slots occupied,
+ * followed by one or more free slots starting from slots[n].
+ */
+ n = kml->nr_slots_allocated;
+ if (kvm_slots_double(kml)) {
+ return &kml->slots[n];
+ }
+
return NULL;
}
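
The allocator above is plain capped doubling: slots[] starts at
KVM_MEMSLOTS_NR_ALLOC_DEFAULT entries, doubles whenever kvm_get_free_slot()
runs out, and clamps at nr_slots_max, so per-slot growth cost stays amortized
constant. A minimal standalone sketch of the same policy (hypothetical names,
not the QEMU types):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical stand-in for KVMSlot; only the fields the policy needs. */
    typedef struct {
        unsigned int slot;
        size_t memory_size;
    } Slot;

    typedef struct {
        Slot *slots;
        unsigned int nr_allocated;
        unsigned int nr_max;    /* hard cap, cf. KVM_CAP_NR_MEMSLOTS */
    } SlotArray;

    /* Capped doubling: grow to nr_new entries, never past nr_max. */
    static bool slots_grow(SlotArray *a, unsigned int nr_new)
    {
        unsigned int cur = a->nr_allocated;

        if (nr_new > a->nr_max) {
            nr_new = a->nr_max;
        }
        if (cur >= nr_new) {
            return false;       /* big enough, or already at the cap */
        }
        a->slots = realloc(a->slots, nr_new * sizeof(Slot));
        assert(a->slots);
        /* realloc() leaves the tail uninitialized; new slots must be zeroed. */
        memset(&a->slots[cur], 0, (nr_new - cur) * sizeof(Slot));
        for (unsigned int i = cur; i < nr_new; i++) {
            a->slots[i].slot = i;
        }
        a->nr_allocated = nr_new;
        return true;
    }

    int main(void)
    {
        SlotArray a = { .nr_max = 32 };

        slots_grow(&a, 16);                 /* initial allocation */
        while (slots_grow(&a, a.nr_allocated * 2)) {
            printf("grew to %u\n", a.nr_allocated);  /* prints 32, then stops */
        }
        free(a.slots);
        return 0;
    }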
@@ -222,10 +296,9 @@ static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
hwaddr start_addr,
hwaddr size)
{
- KVMState *s = kvm_state;
int i;
- for (i = 0; i < s->nr_slots; i++) {
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
KVMSlot *mem = &kml->slots[i];
if (start_addr == mem->start_addr && size == mem->memory_size) {
@@ -267,7 +340,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
int i, ret = 0;
kvm_slots_lock();
- for (i = 0; i < s->nr_slots; i++) {
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
KVMSlot *mem = &kml->slots[i];
if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
@@ -340,14 +413,95 @@ err:
return ret;
}
+void kvm_park_vcpu(CPUState *cpu)
+{
+ struct KVMParkedVcpu *vcpu;
+
+ trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
+
+ vcpu = g_malloc0(sizeof(*vcpu));
+ vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
+ vcpu->kvm_fd = cpu->kvm_fd;
+ QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+}
+
+int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+ struct KVMParkedVcpu *cpu;
+ int kvm_fd = -ENOENT;
+
+ QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+ if (cpu->vcpu_id == vcpu_id) {
+ QLIST_REMOVE(cpu, node);
+ kvm_fd = cpu->kvm_fd;
+ g_free(cpu);
+ break;
+ }
+ }
+
+ trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");
+
+ return kvm_fd;
+}
+
+static void kvm_reset_parked_vcpus(KVMState *s)
+{
+ struct KVMParkedVcpu *cpu;
+
+ QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+ kvm_arch_reset_parked_vcpu(cpu->vcpu_id, cpu->kvm_fd);
+ }
+}
+
+int kvm_create_vcpu(CPUState *cpu)
+{
+ unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
+ KVMState *s = kvm_state;
+ int kvm_fd;
+
+ /* check if the KVM vCPU already exists but is parked */
+ kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
+ if (kvm_fd < 0) {
+ /* vCPU not parked: create a new KVM vCPU */
+ kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
+ if (kvm_fd < 0) {
+ error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
+ return kvm_fd;
+ }
+ }
+
+ cpu->kvm_fd = kvm_fd;
+ cpu->kvm_state = s;
+ if (!s->guest_state_protected) {
+ cpu->vcpu_dirty = true;
+ }
+ cpu->dirty_pages = 0;
+ cpu->throttle_us_per_full = 0;
+
+ trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);
+
+ return 0;
+}
+
+int kvm_create_and_park_vcpu(CPUState *cpu)
+{
+ int ret = 0;
+
+ ret = kvm_create_vcpu(cpu);
+ if (!ret) {
+ kvm_park_vcpu(cpu);
+ }
+
+ return ret;
+}
+
static int do_kvm_destroy_vcpu(CPUState *cpu)
{
KVMState *s = kvm_state;
- long mmap_size;
- struct KVMParkedVcpu *vcpu = NULL;
+ int mmap_size;
int ret = 0;
- trace_kvm_destroy_vcpu();
+ trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
ret = kvm_arch_destroy_vcpu(cpu);
if (ret < 0) {
@@ -373,10 +527,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
}
}
- vcpu = g_malloc0(sizeof(*vcpu));
- vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
- vcpu->kvm_fd = cpu->kvm_fd;
- QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+ kvm_park_vcpu(cpu);
err:
return ret;
}
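
The park/unpark pair exists because KVM has no ioctl to destroy a vCPU: on
unplug the fd is kept on a list keyed by vcpu_id, and a later hotplug with the
same id reuses that fd instead of issuing KVM_CREATE_VCPU again. A minimal
sketch of the lifecycle with a hypothetical singly-linked list (QEMU keeps
these on a QLIST):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical parked-vCPU entry, not the QEMU struct. */
    typedef struct Parked {
        unsigned long vcpu_id;
        int fd;
        struct Parked *next;
    } Parked;

    static Parked *parked_head;

    /* On unplug: remember the fd instead of closing it. */
    static void park(unsigned long vcpu_id, int fd)
    {
        Parked *p = calloc(1, sizeof(*p));

        p->vcpu_id = vcpu_id;
        p->fd = fd;
        p->next = parked_head;
        parked_head = p;
    }

    /* On replug: return the parked fd for vcpu_id, or -1 if none. */
    static int unpark(unsigned long vcpu_id)
    {
        for (Parked **pp = &parked_head; *pp; pp = &(*pp)->next) {
            if ((*pp)->vcpu_id == vcpu_id) {
                Parked *p = *pp;
                int fd = p->fd;

                *pp = p->next;
                free(p);
                return fd;
            }
        }
        return -1;      /* caller falls back to KVM_CREATE_VCPU */
    }

    int main(void)
    {
        park(3, 42);                    /* vCPU 3 unplugged, fd 42 kept    */
        printf("%d\n", unpark(3));      /* replugged: 42, the fd is reused */
        printf("%d\n", unpark(3));      /* nothing parked any more: -1     */
        return 0;
    }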
@@ -389,44 +540,26 @@ void kvm_destroy_vcpu(CPUState *cpu)
}
}
-static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
-{
- struct KVMParkedVcpu *cpu;
-
- QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
- if (cpu->vcpu_id == vcpu_id) {
- int kvm_fd;
-
- QLIST_REMOVE(cpu, node);
- kvm_fd = cpu->kvm_fd;
- g_free(cpu);
- return kvm_fd;
- }
- }
-
- return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
-}
-
int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
KVMState *s = kvm_state;
- long mmap_size;
+ int mmap_size;
int ret;
trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
- ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
+ ret = kvm_arch_pre_create_vcpu(cpu, errp);
if (ret < 0) {
- error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
- kvm_arch_vcpu_id(cpu));
goto err;
}
- cpu->kvm_fd = ret;
- cpu->kvm_state = s;
- cpu->vcpu_dirty = true;
- cpu->dirty_pages = 0;
- cpu->throttle_us_per_full = 0;
+ ret = kvm_create_vcpu(cpu);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret,
+ "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
+ kvm_arch_vcpu_id(cpu));
+ goto err;
+ }
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
@@ -1027,7 +1160,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
kvm_slots_lock();
- for (i = 0; i < s->nr_slots; i++) {
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
mem = &kml->slots[i];
/* Discard slots that are empty or do not overlap the section */
if (!mem->memory_size ||
@@ -1168,7 +1301,7 @@ static void kvm_unpoison_all(void *param)
QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
QLIST_REMOVE(page, list);
- qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
+ qemu_ram_remap(page->ram_addr);
g_free(page);
}
}
@@ -1194,21 +1327,22 @@ bool kvm_hwpoisoned_mem(void)
static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
-#if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
- /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
- * endianness, but the memory core hands them in target endianness.
- * For example, PPC is always treated as big-endian even if running
- * on KVM and on PPC64LE. Correct here.
- */
- switch (size) {
- case 2:
- val = bswap16(val);
- break;
- case 4:
- val = bswap32(val);
- break;
+ if (target_needs_bswap()) {
+ /*
+ * The kernel expects ioeventfd values in HOST_BIG_ENDIAN
+ * endianness, but the memory core hands them in target endianness.
+ * For example, PPC is always treated as big-endian even if running
+ * on KVM and on PPC64LE. Correct here, swapping back.
+ */
+ switch (size) {
+ case 2:
+ val = bswap16(val);
+ break;
+ case 4:
+ val = bswap32(val);
+ break;
+ }
}
-#endif
return val;
}
@@ -1406,7 +1540,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
}
start_addr += slot_size;
size -= slot_size;
- kml->nr_used_slots--;
+ kml->nr_slots_used--;
} while (size);
return;
}
@@ -1445,7 +1579,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
ram_start_offset += slot_size;
ram += slot_size;
size -= slot_size;
- kml->nr_used_slots++;
+ kml->nr_slots_used++;
} while (size);
}
@@ -1481,11 +1615,7 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
r->reaper_iteration++;
}
- trace_kvm_dirty_ring_reaper("exit");
-
- rcu_unregister_thread();
-
- return NULL;
+ g_assert_not_reached();
}
static void kvm_dirty_ring_reaper_init(KVMState *s)
@@ -1675,12 +1805,8 @@ static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
/* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
kvm_dirty_ring_flush();
- /*
- * TODO: make this faster when nr_slots is big while there are
- * only a few used slots (small VMs).
- */
kvm_slots_lock();
- for (i = 0; i < s->nr_slots; i++) {
+ for (i = 0; i < kml->nr_slots_allocated; i++) {
mem = &kml->slots[i];
if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
kvm_slot_sync_dirty_pages(mem);
@@ -1795,12 +1921,9 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
{
int i;
- kml->slots = g_new0(KVMSlot, s->nr_slots);
kml->as_id = as_id;
- for (i = 0; i < s->nr_slots; i++) {
- kml->slots[i].slot = i;
- }
+ kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
QSIMPLEQ_INIT(&kml->transaction_add);
QSIMPLEQ_INIT(&kml->transaction_del);
@@ -2311,7 +2434,7 @@ static int kvm_recommended_vcpus(KVMState *s)
static int kvm_max_vcpus(KVMState *s)
{
- int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
+ int ret = kvm_vm_check_extension(s, KVM_CAP_MAX_VCPUS);
return (ret) ? ret : kvm_recommended_vcpus(s);
}
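
The switch from kvm_check_extension() to kvm_vm_check_extension() matters
because some capabilities are reported per-VM: the answer can depend on the
created VM type. On kernels with KVM_CAP_CHECK_EXTENSION_VM, the same
KVM_CHECK_EXTENSION ioctl can be issued on the VM fd; QEMU's helper falls back
to the global fd when it cannot. A minimal probe, assuming a Linux host with
/dev/kvm accessible:

    #include <fcntl.h>
    #include <linux/kvm.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    int main(void)
    {
        int sys_fd = open("/dev/kvm", O_RDWR);
        int vm_fd;

        if (sys_fd < 0) {
            perror("/dev/kvm");
            return 1;
        }
        vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);  /* 0: default machine type */
        if (vm_fd < 0) {
            perror("KVM_CREATE_VM");
            return 1;
        }
        /* Same capability probed globally and per-VM; answers may differ. */
        printf("system-wide max vcpus: %d\n",
               ioctl(sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS));
        printf("this-VM max vcpus:     %d\n",
               ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS));
        return 0;
    }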
@@ -2341,6 +2464,109 @@ uint32_t kvm_dirty_ring_size(void)
return kvm_state->kvm_dirty_ring_size;
}
+static int do_kvm_create_vm(MachineState *ms, int type)
+{
+ KVMState *s;
+ int ret;
+
+ s = KVM_STATE(ms->accelerator);
+
+ do {
+ ret = kvm_ioctl(s, KVM_CREATE_VM, type);
+ } while (ret == -EINTR);
+
+ if (ret < 0) {
+ error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));
+
+#ifdef TARGET_S390X
+ if (ret == -EINVAL) {
+ error_printf("Host kernel setup problem detected."
+ " Please verify:\n");
+ error_printf("- for kernels supporting the"
+ " switch_amode or user_mode parameters, whether");
+ error_printf(" user space is running in primary address space\n");
+ error_printf("- for kernels supporting the vm.allocate_pgste"
+ " sysctl, whether it is enabled\n");
+ }
+#elif defined(TARGET_PPC)
+ if (ret == -EINVAL) {
+ error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
+ (type == 2) ? "pr" : "hv");
+ }
+#endif
+ }
+
+ return ret;
+}
+
+static int find_kvm_machine_type(MachineState *ms)
+{
+ MachineClass *mc = MACHINE_GET_CLASS(ms);
+ int type;
+
+ if (object_property_find(OBJECT(current_machine), "kvm-type")) {
+ g_autofree char *kvm_type;
+ kvm_type = object_property_get_str(OBJECT(current_machine),
+ "kvm-type",
+ &error_abort);
+ type = mc->kvm_type(ms, kvm_type);
+ } else if (mc->kvm_type) {
+ type = mc->kvm_type(ms, NULL);
+ } else {
+ type = kvm_arch_get_default_type(ms);
+ }
+ return type;
+}
+
+static int kvm_setup_dirty_ring(KVMState *s)
+{
+ uint64_t dirty_log_manual_caps;
+ int ret;
+
+ /*
+ * Enable KVM dirty ring if supported, otherwise fall back to
+ * dirty logging mode
+ */
+ ret = kvm_dirty_ring_init(s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
+ * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
+ * page is wr-protected initially, which is against how kvm dirty ring is
+ * usage - kvm dirty ring requires all pages are wr-protected at the very
+ * beginning. Enabling this feature for dirty ring causes data corruption.
+ *
+ * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
+ * we may expect a higher stall time when starting the migration. In the
+ * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
+ * instead of clearing dirty bit, it can be a way to explicitly wr-protect
+ * guest pages.
+ */
+ if (!s->kvm_dirty_ring_size) {
+ dirty_log_manual_caps =
+ kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+ dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+ KVM_DIRTY_LOG_INITIALLY_SET);
+ s->manual_dirty_log_protect = dirty_log_manual_caps;
+ if (dirty_log_manual_caps) {
+ ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
+ dirty_log_manual_caps);
+ if (ret) {
+ warn_report("Trying to enable capability %"PRIu64" of "
+ "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
+ "Falling back to the legacy mode. ",
+ dirty_log_manual_caps);
+ s->manual_dirty_log_protect = 0;
+ }
+ }
+ }
+
+ return 0;
+}
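+
+/*
+ * kvm_vm_enable_cap() above ultimately wraps the documented KVM_ENABLE_CAP
+ * ioctl on the VM fd (available where KVM_CAP_ENABLE_CAP_VM is supported).
+ * A standalone sketch of that pattern, error checks trimmed:
+ *
+ *     #include <errno.h>
+ *     #include <fcntl.h>
+ *     #include <linux/kvm.h>
+ *     #include <stdio.h>
+ *     #include <string.h>
+ *     #include <sys/ioctl.h>
+ *
+ *     static int vm_enable_cap(int vm_fd, unsigned int cap,
+ *                              unsigned long arg0)
+ *     {
+ *         struct kvm_enable_cap enable;
+ *
+ *         memset(&enable, 0, sizeof(enable));
+ *         enable.cap = cap;
+ *         enable.args[0] = arg0;
+ *         return ioctl(vm_fd, KVM_ENABLE_CAP, &enable) ? -errno : 0;
+ *     }
+ *
+ *     int main(void)
+ *     {
+ *         int sys_fd = open("/dev/kvm", O_RDWR);
+ *         int vm_fd = ioctl(sys_fd, KVM_CREATE_VM, 0);
+ *         // The same bits the fallback path above requests.
+ *         int r = vm_enable_cap(vm_fd, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
+ *                               KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+ *                               KVM_DIRTY_LOG_INITIALLY_SET);
+ *
+ *         printf("enable: %s\n", r ? strerror(-r) : "ok");
+ *         return 0;
+ *     }
+ */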
+
static int kvm_init(MachineState *ms)
{
MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -2360,7 +2586,6 @@ static int kvm_init(MachineState *ms)
const KVMCapabilityInfo *missing_cap;
int ret;
int type;
- uint64_t dirty_log_manual_caps;
qemu_mutex_init(&kml_slots_lock);
@@ -2383,7 +2608,7 @@ static int kvm_init(MachineState *ms)
QLIST_INIT(&s->kvm_parked_vcpus);
s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
if (s->fd == -1) {
- fprintf(stderr, "Could not access KVM kernel module: %m\n");
+ error_report("Could not access KVM kernel module: %m");
ret = -errno;
goto err;
}
@@ -2393,84 +2618,43 @@ static int kvm_init(MachineState *ms)
if (ret >= 0) {
ret = -EINVAL;
}
- fprintf(stderr, "kvm version too old\n");
+ error_report("kvm version too old");
goto err;
}
if (ret > KVM_API_VERSION) {
ret = -EINVAL;
- fprintf(stderr, "kvm version not supported\n");
+ error_report("kvm version not supported");
goto err;
}
- kvm_supported_memory_attributes = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
- kvm_guest_memfd_supported =
- kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
- kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
- (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
-
kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
- s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
+ s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
/* If unspecified, use the default value */
- if (!s->nr_slots) {
- s->nr_slots = 32;
- }
-
- s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
- if (s->nr_as <= 1) {
- s->nr_as = 1;
- }
- s->as = g_new0(struct KVMAs, s->nr_as);
-
- if (object_property_find(OBJECT(current_machine), "kvm-type")) {
- g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
- "kvm-type",
- &error_abort);
- type = mc->kvm_type(ms, kvm_type);
- } else if (mc->kvm_type) {
- type = mc->kvm_type(ms, NULL);
- } else {
- type = kvm_arch_get_default_type(ms);
+ if (!s->nr_slots_max) {
+ s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
}
+ type = find_kvm_machine_type(ms);
if (type < 0) {
ret = -EINVAL;
goto err;
}
- do {
- ret = kvm_ioctl(s, KVM_CREATE_VM, type);
- } while (ret == -EINTR);
-
+ ret = do_kvm_create_vm(ms, type);
if (ret < 0) {
- fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
- strerror(-ret));
-
-#ifdef TARGET_S390X
- if (ret == -EINVAL) {
- fprintf(stderr,
- "Host kernel setup problem detected. Please verify:\n");
- fprintf(stderr, "- for kernels supporting the switch_amode or"
- " user_mode parameters, whether\n");
- fprintf(stderr,
- " user space is running in primary address space\n");
- fprintf(stderr,
- "- for kernels supporting the vm.allocate_pgste sysctl, "
- "whether it is enabled\n");
- }
-#elif defined(TARGET_PPC)
- if (ret == -EINVAL) {
- fprintf(stderr,
- "PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
- (type == 2) ? "pr" : "hv");
- }
-#endif
goto err;
}
s->vmfd = ret;
+ s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
+ if (s->nr_as <= 1) {
+ s->nr_as = 1;
+ }
+ s->as = g_new0(struct KVMAs, s->nr_as);
+
/* check the vcpu limits */
soft_vcpus_limit = kvm_recommended_vcpus(s);
hard_vcpus_limit = kvm_max_vcpus(s);
@@ -2482,9 +2666,9 @@ static int kvm_init(MachineState *ms)
nc->name, nc->num, soft_vcpus_limit);
if (nc->num > hard_vcpus_limit) {
- fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
- "the maximum cpus supported by KVM (%d)\n",
- nc->name, nc->num, hard_vcpus_limit);
+ error_report("Number of %s cpus requested (%d) exceeds "
+ "the maximum cpus supported by KVM (%d)",
+ nc->name, nc->num, hard_vcpus_limit);
exit(1);
}
}
@@ -2498,8 +2682,8 @@ static int kvm_init(MachineState *ms)
}
if (missing_cap) {
ret = -EINVAL;
- fprintf(stderr, "kvm does not support %s\n%s",
- missing_cap->name, upgrade_note);
+ error_report("kvm does not support %s", missing_cap->name);
+ error_printf("%s", upgrade_note);
goto err;
}
@@ -2507,47 +2691,11 @@ static int kvm_init(MachineState *ms)
s->coalesced_pio = s->coalesced_mmio &&
kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
- /*
- * Enable KVM dirty ring if supported, otherwise fall back to
- * dirty logging mode
- */
- ret = kvm_dirty_ring_init(s);
+ ret = kvm_setup_dirty_ring(s);
if (ret < 0) {
goto err;
}
- /*
- * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
- * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
- * page is wr-protected initially, which is against how kvm dirty ring is
- * usage - kvm dirty ring requires all pages are wr-protected at the very
- * beginning. Enabling this feature for dirty ring causes data corruption.
- *
- * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
- * we may expect a higher stall time when starting the migration. In the
- * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
- * instead of clearing dirty bit, it can be a way to explicitly wr-protect
- * guest pages.
- */
- if (!s->kvm_dirty_ring_size) {
- dirty_log_manual_caps =
- kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
- dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
- KVM_DIRTY_LOG_INITIALLY_SET);
- s->manual_dirty_log_protect = dirty_log_manual_caps;
- if (dirty_log_manual_caps) {
- ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
- dirty_log_manual_caps);
- if (ret) {
- warn_report("Trying to enable capability %"PRIu64" of "
- "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
- "Falling back to the legacy mode. ",
- dirty_log_manual_caps);
- s->manual_dirty_log_protect = 0;
- }
- }
- }
-
#ifdef KVM_CAP_VCPU_EVENTS
s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif
@@ -2559,7 +2707,7 @@ static int kvm_init(MachineState *ms)
}
kvm_readonly_mem_allowed =
- (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
+ (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
kvm_resamplefds_allowed =
(kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
@@ -2593,6 +2741,13 @@ static int kvm_init(MachineState *ms)
goto err;
}
+ kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
+ kvm_guest_memfd_supported =
+ kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
+ kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
+ (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
+ kvm_pre_fault_memory_supported = kvm_vm_check_extension(s, KVM_CAP_PRE_FAULT_MEMORY);
+
if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
}
@@ -2722,9 +2877,15 @@ void kvm_flush_coalesced_mmio_buffer(void)
static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
- int ret = kvm_arch_get_registers(cpu);
+ Error *err = NULL;
+ int ret = kvm_arch_get_registers(cpu, &err);
if (ret) {
- error_report("Failed to get registers: %s", strerror(-ret));
+ if (err) {
+ error_reportf_err(err, "Failed to synchronize CPU state: ");
+ } else {
+ error_report("Failed to get registers: %s", strerror(-ret));
+ }
+
cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
vm_stop(RUN_STATE_INTERNAL_ERROR);
}
@@ -2742,9 +2903,15 @@ void kvm_cpu_synchronize_state(CPUState *cpu)
static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
- int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
+ Error *err = NULL;
+ int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
if (ret) {
- error_report("Failed to put registers after reset: %s", strerror(-ret));
+ if (err) {
+ error_reportf_err(err, "Restoring resisters after reset: ");
+ } else {
+ error_report("Failed to put registers after reset: %s",
+ strerror(-ret));
+ }
cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
vm_stop(RUN_STATE_INTERNAL_ERROR);
}
@@ -2755,13 +2922,23 @@ static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg
void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
+
+ if (cpu == first_cpu) {
+ kvm_reset_parked_vcpus(kvm_state);
+ }
}
static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
- int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
+ Error *err = NULL;
+ int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);
if (ret) {
- error_report("Failed to put registers after init: %s", strerror(-ret));
+ if (err) {
+ error_reportf_err(err, "Putting registers after init: ");
+ } else {
+ error_report("Failed to put registers after init: %s",
+ strerror(-ret));
+ }
exit(1);
}
@@ -2851,17 +3028,17 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
MemoryRegion *mr;
RAMBlock *rb;
void *addr;
- int ret = -1;
+ int ret = -EINVAL;
trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
!QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
- return -1;
+ return ret;
}
if (!size) {
- return -1;
+ return ret;
}
section = memory_region_find(get_system_memory(), start, size);
@@ -2879,7 +3056,7 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
if (!to_private) {
return 0;
}
- return -1;
+ return ret;
}
if (!memory_region_has_guest_memfd(mr)) {
@@ -2914,6 +3091,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
rb = qemu_ram_block_from_host(addr, false, &offset);
+ ret = ram_block_attributes_state_change(RAM_BLOCK_ATTRIBUTES(mr->rdm),
+ offset, size, to_private);
+ if (ret) {
+ error_report("Failed to notify the listener the state change of "
+ "(0x%"HWADDR_PRIx" + 0x%"HWADDR_PRIx") to %s",
+ start, size, to_private ? "private" : "shared");
+ goto out_unref;
+ }
+
if (to_private) {
if (rb->page_size != qemu_real_host_page_size()) {
/*
@@ -2951,10 +3137,15 @@ int kvm_cpu_exec(CPUState *cpu)
MemTxAttrs attrs;
if (cpu->vcpu_dirty) {
- ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
+ Error *err = NULL;
+ ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
if (ret) {
- error_report("Failed to put registers after init: %s",
- strerror(-ret));
+ if (err) {
+ error_reportf_err(err, "Putting registers after init: ");
+ } else {
+ error_report("Failed to put registers after init: %s",
+ strerror(-ret));
+ }
ret = -1;
break;
}
@@ -3126,7 +3317,7 @@ int kvm_cpu_exec(CPUState *cpu)
return ret;
}
-int kvm_ioctl(KVMState *s, int type, ...)
+int kvm_ioctl(KVMState *s, unsigned long type, ...)
{
int ret;
void *arg;
@@ -3144,7 +3335,7 @@ int kvm_ioctl(KVMState *s, int type, ...)
return ret;
}
-int kvm_vm_ioctl(KVMState *s, int type, ...)
+int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
{
int ret;
void *arg;
@@ -3164,7 +3355,7 @@ int kvm_vm_ioctl(KVMState *s, int type, ...)
return ret;
}
-int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
+int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
{
int ret;
void *arg;
@@ -3184,7 +3375,7 @@ int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
return ret;
}
-int kvm_device_ioctl(int fd, int type, ...)
+int kvm_device_ioctl(int fd, unsigned long type, ...)
{
int ret;
void *arg;
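
Widening the request parameter from int to unsigned long is not cosmetic:
Linux encodes an ioctl's direction in the top two bits of the request number,
so any request built with _IOR()/_IOWR() has bit 31 set and does not fit in a
signed int. A small demonstration, assuming a Linux host with kernel headers
installed:

    #include <linux/kvm.h>
    #include <stdio.h>

    int main(void)
    {
        /* _IO(): no direction bits; the value fits comfortably in an int. */
        printf("KVM_GET_API_VERSION = 0x%lx\n",
               (unsigned long)KVM_GET_API_VERSION);
        /* _IOWR(): _IOC_READ sets bit 31; as an int this would be negative. */
        printf("KVM_IRQ_LINE_STATUS = 0x%lx\n",
               (unsigned long)KVM_IRQ_LINE_STATUS);
        return 0;
    }

Passing such a request through an int parameter relies on
implementation-defined sign conversion; the wider type sidesteps that.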
@@ -3745,6 +3936,21 @@ static void kvm_set_device(Object *obj,
s->device = g_strdup(value);
}
+static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
+{
+ KVMState *s = KVM_STATE(obj);
+ s->msr_energy.enable = value;
+}
+
+static void kvm_set_kvm_rapl_socket_path(Object *obj,
+ const char *str,
+ Error **errp)
+{
+ KVMState *s = KVM_STATE(obj);
+ g_free(s->msr_energy.socket_path);
+ s->msr_energy.socket_path = g_strdup(str);
+}
+
static void kvm_accel_instance_init(Object *obj)
{
KVMState *s = KVM_STATE(obj);
@@ -3764,6 +3970,7 @@ static void kvm_accel_instance_init(Object *obj)
s->xen_gnttab_max_frames = 64;
s->xen_evtchn_max_pirq = 256;
s->device = NULL;
+ s->msr_energy.enable = false;
}
/**
@@ -3777,7 +3984,7 @@ static int kvm_gdbstub_sstep_flags(void)
return kvm_sstep_flags;
}
-static void kvm_accel_class_init(ObjectClass *oc, void *data)
+static void kvm_accel_class_init(ObjectClass *oc, const void *data)
{
AccelClass *ac = ACCEL_CLASS(oc);
ac->name = "KVM";
@@ -3808,6 +4015,17 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
object_class_property_set_description(oc, "device",
"Path to the device node to use (default: /dev/kvm)");
+ object_class_property_add_bool(oc, "rapl",
+ NULL,
+ kvm_set_kvm_rapl);
+ object_class_property_set_description(oc, "rapl",
+ "Allow energy related MSRs for RAPL interface in Guest");
+
+ object_class_property_add_str(oc, "rapl-helper-socket", NULL,
+ kvm_set_kvm_rapl_socket_path);
+ object_class_property_set_description(oc, "rapl-helper-socket",
+ "Socket Path for comminucating with the Virtual MSR helper daemon");
+
kvm_arch_accel_class_init(oc);
}