aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS20
-rw-r--r--accel/Kconfig3
-rw-r--r--accel/kvm/kvm-all.c372
-rw-r--r--accel/kvm/trace-events2
-rw-r--r--accel/meson.build1
-rw-r--r--accel/nitro/meson.build3
-rw-r--r--accel/nitro/nitro-accel.c284
-rw-r--r--accel/nitro/trace-events6
-rw-r--r--accel/nitro/trace.h2
-rw-r--r--accel/stubs/kvm-stub.c18
-rw-r--r--accel/stubs/meson.build1
-rw-r--r--accel/stubs/nitro-stub.c11
-rw-r--r--accel/whpx/whpx-accel-ops.c8
-rw-r--r--accel/whpx/whpx-common.c68
-rw-r--r--audio/audio-mixeng-be.c2
-rw-r--r--audio/paaudio.c28
-rw-r--r--audio/spiceaudio.c30
-rw-r--r--docs/system/confidential-guest-support.rst1
-rw-r--r--docs/system/index.rst1
-rw-r--r--docs/system/nitro.rst133
-rw-r--r--hw/Kconfig1
-rw-r--r--hw/core/eif.c38
-rw-r--r--hw/core/eif.h41
-rw-r--r--hw/core/machine.c22
-rw-r--r--hw/hyperv/trace-events1
-rw-r--r--hw/hyperv/vmbus.c37
-rw-r--r--hw/i386/kvm/clock.c59
-rw-r--r--hw/i386/kvm/i8254.c91
-rw-r--r--hw/i386/kvm/trace-events1
-rw-r--r--hw/i386/vapic.c24
-rw-r--r--hw/i386/vmmouse.c10
-rw-r--r--hw/i386/x86-common.c71
-rw-r--r--hw/intc/openpic_kvm.c112
-rw-r--r--hw/meson.build1
-rw-r--r--hw/nitro/Kconfig18
-rw-r--r--hw/nitro/heartbeat.c115
-rw-r--r--hw/nitro/machine.c277
-rw-r--r--hw/nitro/meson.build4
-rw-r--r--hw/nitro/nitro-vsock-bus.c98
-rw-r--r--hw/nitro/serial-vsock.c123
-rw-r--r--hw/nitro/trace-events8
-rw-r--r--hw/nitro/trace.h4
-rw-r--r--hw/vfio/helpers.c91
-rw-r--r--include/accel/accel-ops.h2
-rw-r--r--include/hw/core/boards.h6
-rw-r--r--include/hw/i386/x86.h1
-rw-r--r--include/hw/nitro/heartbeat.h24
-rw-r--r--include/hw/nitro/machine.h20
-rw-r--r--include/hw/nitro/nitro-vsock-bus.h71
-rw-r--r--include/hw/nitro/serial-vsock.h24
-rw-r--r--include/standard-headers/linux/nitro_enclaves.h359
-rw-r--r--include/system/confidential-guest-support.h20
-rw-r--r--include/system/hw_accel.h1
-rw-r--r--include/system/kvm.h43
-rw-r--r--include/system/kvm_int.h1
-rw-r--r--include/system/nitro-accel.h25
-rw-r--r--include/system/physmem.h1
-rw-r--r--include/system/whpx-accel-ops.h16
-rw-r--r--include/system/whpx-all.h11
-rw-r--r--include/system/whpx-common.h6
-rw-r--r--include/system/whpx-internal.h16
-rw-r--r--meson.build20
-rw-r--r--meson_options.txt2
-rwxr-xr-xpython/scripts/vendor.py2
-rw-r--r--python/wheels/meson-1.9.0-py3-none-any.whlbin1029634 -> 0 bytes
-rw-r--r--qapi/qom.json16
-rw-r--r--qemu-options.hx8
-rw-r--r--rust/Cargo.toml1
-rw-r--r--rust/hw/core/src/qdev.rs14
-rw-r--r--scripts/meson-buildoptions.sh3
-rwxr-xr-xscripts/update-linux-headers.sh1
-rw-r--r--stubs/kvm.c22
-rw-r--r--stubs/meson.build1
-rw-r--r--system/physmem.c28
-rw-r--r--system/runstate.c44
-rw-r--r--target/alpha/cpu.c1
-rw-r--r--target/arm/cpu64.c8
-rw-r--r--target/arm/whpx/whpx-all.c43
-rw-r--r--target/i386/cpu.c41
-rw-r--r--target/i386/cpu.h4
-rw-r--r--target/i386/emulate/meson.build9
-rw-r--r--target/i386/emulate/x86.h1
-rw-r--r--target/i386/emulate/x86_decode.c12
-rw-r--r--target/i386/emulate/x86_emu.c375
-rw-r--r--target/i386/emulate/x86_emu.h24
-rw-r--r--target/i386/emulate/x86_flags.c47
-rw-r--r--target/i386/emulate/x86_flags.h20
-rw-r--r--target/i386/emulate/x86_helpers.c (renamed from target/i386/mshv/x86.c)13
-rw-r--r--target/i386/emulate/x86_mmu.c354
-rw-r--r--target/i386/emulate/x86_mmu.h (renamed from target/i386/hvf/x86_mmu.h)31
-rw-r--r--target/i386/hvf/hvf.c40
-rw-r--r--target/i386/hvf/meson.build1
-rw-r--r--target/i386/hvf/x86.c13
-rw-r--r--target/i386/hvf/x86_mmu.c277
-rw-r--r--target/i386/hvf/x86_task.c10
-rw-r--r--target/i386/kvm/kvm.c188
-rw-r--r--target/i386/kvm/tdx.c141
-rw-r--r--target/i386/kvm/tdx.h1
-rw-r--r--target/i386/kvm/trace-events4
-rw-r--r--target/i386/kvm/xen-emu.c38
-rw-r--r--target/i386/mshv/meson.build2
-rw-r--r--target/i386/mshv/mshv-cpu.c71
-rw-r--r--target/i386/sev.c127
-rw-r--r--target/i386/trace-events1
-rw-r--r--target/i386/whpx/whpx-all.c727
-rw-r--r--target/i386/whpx/whpx-apic.c5
-rw-r--r--tests/functional/x86_64/meson.build1
-rwxr-xr-xtests/functional/x86_64/test_rebuild_vmfd.py136
-rw-r--r--tests/qtest/libqtest.c1
-rw-r--r--ui/vdagent.c18
-rw-r--r--util/rcu.c79
111 files changed, 4571 insertions, 1343 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index a07a3a7..b8317fa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -152,6 +152,13 @@ F: tools/i386/
F: tests/functional/i386/
F: tests/functional/x86_64/
+X86 VM file descriptor change on reset test
+M: Ani Sinha <anisinha@redhat.com>
+M: Paolo Bonzini <pbonzini@redhat.com>
+S: Maintained
+F: stubs/kvm.c
+F: tests/functional/x86_64/test_rebuild_vmfd.py
+
Guest CPU cores (TCG)
---------------------
Overall TCG CPUs
@@ -587,6 +594,12 @@ F: include/system/mshv.h
F: include/hw/hyperv/hvgdk*.h
F: include/hw/hyperv/hvhdk*.h
+Nitro Enclaves (native)
+M: Alexander Graf <graf@amazon.com>
+S: Maintained
+F: accel/nitro/
+F: include/system/nitro-accel.h
+
X86 MSHV CPUs
M: Magnus Kulke <magnus.kulke@linux.microsoft.com>
R: Wei Liu <wei.liu@kernel.org>
@@ -3021,6 +3034,13 @@ F: hw/vmapple/*
F: include/hw/vmapple/*
F: docs/system/arm/vmapple.rst
+Nitro Enclaves (native)
+M: Alexander Graf <graf@amazon.com>
+S: Maintained
+F: hw/nitro/
+F: include/hw/nitro/
+F: docs/system/nitro.rst
+
Subsystems
----------
Overall Audio backends
diff --git a/accel/Kconfig b/accel/Kconfig
index a60f114..6d05287 100644
--- a/accel/Kconfig
+++ b/accel/Kconfig
@@ -16,6 +16,9 @@ config KVM
config MSHV
bool
+config NITRO
+ bool
+
config XEN
bool
select FSDEV_9P if VIRTFS
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 0d8b0c4..ebd721c 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -90,6 +90,7 @@ struct KVMParkedVcpu {
};
KVMState *kvm_state;
+VmfdChangeNotifier vmfd_notifier;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
@@ -123,6 +124,16 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
static NotifierList kvm_irqchip_change_notifiers =
NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
+static NotifierWithReturnList register_vmfd_changed_notifiers =
+ NOTIFIER_WITH_RETURN_LIST_INITIALIZER(register_vmfd_changed_notifiers);
+
+static NotifierWithReturnList register_vcpufd_changed_notifiers =
+ NOTIFIER_WITH_RETURN_LIST_INITIALIZER(register_vcpufd_changed_notifiers);
+
+static int map_kvm_run(KVMState *s, CPUState *cpu, Error **errp);
+static int map_kvm_dirty_gfns(KVMState *s, CPUState *cpu, Error **errp);
+static int vcpu_unmap_regions(KVMState *s, CPUState *cpu);
+
struct KVMResampleFd {
int gsi;
EventNotifier *resample_event;
@@ -416,6 +427,90 @@ err:
return ret;
}
+static void kvm_create_vcpu_internal(CPUState *cpu, KVMState *s, int kvm_fd)
+{
+ cpu->kvm_fd = kvm_fd;
+ cpu->kvm_state = s;
+ if (!s->guest_state_protected) {
+ cpu->vcpu_dirty = true;
+ }
+ cpu->dirty_pages = 0;
+ cpu->throttle_us_per_full = 0;
+
+ return;
+}
+
+static int kvm_rebind_vcpus(Error **errp)
+{
+ CPUState *cpu;
+ unsigned long vcpu_id;
+ KVMState *s = kvm_state;
+ int kvm_fd, ret = 0;
+
+ CPU_FOREACH(cpu) {
+ vcpu_id = kvm_arch_vcpu_id(cpu);
+
+ if (cpu->kvm_fd) {
+ close(cpu->kvm_fd);
+ }
+
+ ret = kvm_arch_destroy_vcpu(cpu);
+ if (ret < 0) {
+ goto err;
+ }
+
+ if (s->coalesced_mmio_ring == (void *)cpu->kvm_run + PAGE_SIZE) {
+ s->coalesced_mmio_ring = NULL;
+ }
+
+ ret = vcpu_unmap_regions(s, cpu);
+ if (ret < 0) {
+ goto err;
+ }
+
+ ret = kvm_arch_pre_create_vcpu(cpu, errp);
+ if (ret < 0) {
+ goto err;
+ }
+
+ kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
+ if (kvm_fd < 0) {
+ error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu (%s)",
+ vcpu_id, strerror(kvm_fd));
+ return kvm_fd;
+ }
+
+ kvm_create_vcpu_internal(cpu, s, kvm_fd);
+
+ ret = map_kvm_run(s, cpu, errp);
+ if (ret < 0) {
+ goto err;
+ }
+
+ if (s->kvm_dirty_ring_size) {
+ ret = map_kvm_dirty_gfns(s, cpu, errp);
+ if (ret < 0) {
+ goto err;
+ }
+ }
+
+ ret = kvm_arch_init_vcpu(cpu);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret,
+ "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
+ vcpu_id);
+ }
+
+ close(cpu->kvm_vcpu_stats_fd);
+ cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
+ kvm_init_cpu_signals(cpu);
+ }
+ trace_kvm_rebind_vcpus();
+
+ err:
+ return ret;
+}
+
static void kvm_park_vcpu(CPUState *cpu)
{
struct KVMParkedVcpu *vcpu;
@@ -479,13 +574,7 @@ static int kvm_create_vcpu(CPUState *cpu)
}
}
- cpu->kvm_fd = kvm_fd;
- cpu->kvm_state = s;
- if (!s->guest_state_protected) {
- cpu->vcpu_dirty = true;
- }
- cpu->dirty_pages = 0;
- cpu->throttle_us_per_full = 0;
+ kvm_create_vcpu_internal(cpu, s, kvm_fd);
trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);
@@ -504,19 +593,11 @@ int kvm_create_and_park_vcpu(CPUState *cpu)
return ret;
}
-static int do_kvm_destroy_vcpu(CPUState *cpu)
+static int vcpu_unmap_regions(KVMState *s, CPUState *cpu)
{
- KVMState *s = kvm_state;
int mmap_size;
int ret = 0;
- trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
-
- ret = kvm_arch_destroy_vcpu(cpu);
- if (ret < 0) {
- goto err;
- }
-
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
ret = mmap_size;
@@ -544,39 +625,47 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
cpu->kvm_dirty_gfns = NULL;
}
- kvm_park_vcpu(cpu);
-err:
+ err:
return ret;
}
-void kvm_destroy_vcpu(CPUState *cpu)
-{
- if (do_kvm_destroy_vcpu(cpu) < 0) {
- error_report("kvm_destroy_vcpu failed");
- exit(EXIT_FAILURE);
- }
-}
-
-int kvm_init_vcpu(CPUState *cpu, Error **errp)
+static int do_kvm_destroy_vcpu(CPUState *cpu)
{
KVMState *s = kvm_state;
- int mmap_size;
- int ret;
+ int ret = 0;
- trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
+ trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
- ret = kvm_arch_pre_create_vcpu(cpu, errp);
+ ret = kvm_arch_destroy_vcpu(cpu);
if (ret < 0) {
goto err;
}
- ret = kvm_create_vcpu(cpu);
+ /* If I am the CPU that created coalesced_mmio_ring, then discard it */
+ if (s->coalesced_mmio_ring == (void *)cpu->kvm_run + PAGE_SIZE) {
+ s->coalesced_mmio_ring = NULL;
+ }
+
+ ret = vcpu_unmap_regions(s, cpu);
if (ret < 0) {
- error_setg_errno(errp, -ret,
- "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
- kvm_arch_vcpu_id(cpu));
goto err;
}
+ kvm_park_vcpu(cpu);
+err:
+ return ret;
+}
+
+void kvm_destroy_vcpu(CPUState *cpu)
+{
+ if (do_kvm_destroy_vcpu(cpu) < 0) {
+ error_report("kvm_destroy_vcpu failed");
+ exit(EXIT_FAILURE);
+ }
+}
+
+static int map_kvm_run(KVMState *s, CPUState *cpu, Error **errp)
+{
+ int mmap_size, ret = 0;
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
@@ -601,14 +690,53 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
(void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
}
+ err:
+ return ret;
+}
+
+static int map_kvm_dirty_gfns(KVMState *s, CPUState *cpu, Error **errp)
+{
+ int ret = 0;
+ /* Use MAP_SHARED to share pages with the kernel */
+ cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ cpu->kvm_fd,
+ PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
+ if (cpu->kvm_dirty_gfns == MAP_FAILED) {
+ ret = -errno;
+ }
+
+ return ret;
+}
+
+int kvm_init_vcpu(CPUState *cpu, Error **errp)
+{
+ KVMState *s = kvm_state;
+ int ret;
+
+ trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
+
+ ret = kvm_arch_pre_create_vcpu(cpu, errp);
+ if (ret < 0) {
+ goto err;
+ }
+
+ ret = kvm_create_vcpu(cpu);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret,
+ "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
+ kvm_arch_vcpu_id(cpu));
+ goto err;
+ }
+
+ ret = map_kvm_run(s, cpu, errp);
+ if (ret < 0) {
+ goto err;
+ }
+
if (s->kvm_dirty_ring_size) {
- /* Use MAP_SHARED to share pages with the kernel */
- cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
- PROT_READ | PROT_WRITE, MAP_SHARED,
- cpu->kvm_fd,
- PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
- if (cpu->kvm_dirty_gfns == MAP_FAILED) {
- ret = -errno;
+ ret = map_kvm_dirty_gfns(s, cpu, errp);
+ if (ret < 0) {
goto err;
}
}
@@ -2173,6 +2301,38 @@ void kvm_irqchip_change_notify(void)
notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
}
+void kvm_vmfd_add_change_notifier(NotifierWithReturn *n)
+{
+ notifier_with_return_list_add(&register_vmfd_changed_notifiers, n);
+}
+
+void kvm_vmfd_remove_change_notifier(NotifierWithReturn *n)
+{
+ notifier_with_return_remove(n);
+}
+
+static int kvm_vmfd_change_notify(Error **errp)
+{
+ return notifier_with_return_list_notify(&register_vmfd_changed_notifiers,
+ &vmfd_notifier, errp);
+}
+
+void kvm_vcpufd_add_change_notifier(NotifierWithReturn *n)
+{
+ notifier_with_return_list_add(&register_vcpufd_changed_notifiers, n);
+}
+
+void kvm_vcpufd_remove_change_notifier(NotifierWithReturn *n)
+{
+ notifier_with_return_remove(n);
+}
+
+static int kvm_vcpufd_change_notify(Error **errp)
+{
+ return notifier_with_return_list_notify(&register_vcpufd_changed_notifiers,
+ &vmfd_notifier, errp);
+}
+
int kvm_irqchip_get_virq(KVMState *s)
{
int next_virq;
@@ -2415,11 +2575,9 @@ void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}
-static void kvm_irqchip_create(KVMState *s)
+static void do_kvm_irqchip_create(KVMState *s)
{
int ret;
-
- assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
;
} else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
@@ -2452,7 +2610,13 @@ static void kvm_irqchip_create(KVMState *s)
fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
exit(1);
}
+}
+
+static void kvm_irqchip_create(KVMState *s)
+{
+ assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
+ do_kvm_irqchip_create(s);
kvm_kernel_irqchip = true;
/* If we have an in-kernel IRQ chip then we must have asynchronous
* interrupt delivery (though the reverse is not necessarily true)
@@ -2607,6 +2771,122 @@ static int kvm_setup_dirty_ring(KVMState *s)
return 0;
}
+static int kvm_reset_vmfd(MachineState *ms)
+{
+ KVMState *s;
+ KVMMemoryListener *kml;
+ int ret = 0, type;
+ Error *err = NULL;
+
+ /*
+ * bail if the current architecture does not support VM file
+ * descriptor change.
+ */
+ if (!kvm_arch_supports_vmfd_change()) {
+ error_report("This target architecture does not support KVM VM "
+ "file descriptor change.");
+ return -EOPNOTSUPP;
+ }
+
+ s = KVM_STATE(ms->accelerator);
+ kml = &s->memory_listener;
+
+ memory_listener_unregister(&kml->listener);
+ memory_listener_unregister(&kvm_io_listener);
+
+ vmfd_notifier.pre = true;
+ ret = kvm_vmfd_change_notify(&err);
+ if (ret < 0) {
+ return ret;
+ }
+ assert(!err);
+
+ if (s->vmfd >= 0) {
+ close(s->vmfd);
+ }
+
+ type = find_kvm_machine_type(ms);
+ if (type < 0) {
+ return -EINVAL;
+ }
+
+ ret = do_kvm_create_vm(s, type);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s->vmfd = ret;
+
+ /* guest state is now unprotected again */
+ kvm_state->guest_state_protected = false;
+
+ kvm_setup_dirty_ring(s);
+
+ /* rebind memory to new vm fd */
+ ret = ram_block_rebind(&err);
+ if (ret < 0) {
+ return ret;
+ }
+ assert(!err);
+
+ ret = kvm_arch_on_vmfd_change(ms, s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (s->kernel_irqchip_allowed) {
+ do_kvm_irqchip_create(s);
+ }
+
+ /*
+ * notify everyone that vmfd has changed.
+ */
+ vmfd_notifier.vmfd = s->vmfd;
+ vmfd_notifier.pre = false;
+
+ ret = kvm_vmfd_change_notify(&err);
+ if (ret < 0) {
+ return ret;
+ }
+ assert(!err);
+
+ /*
+ * rebind new vcpu fds with the new kvm fds
+ * These can only be called after kvm_arch_on_vmfd_change()
+ */
+ ret = kvm_rebind_vcpus(&err);
+ if (ret < 0) {
+ return ret;
+ }
+ assert(!err);
+
+ /* notify everyone that vcpu fd has changed. */
+ ret = kvm_vcpufd_change_notify(&err);
+ if (ret < 0) {
+ return ret;
+ }
+ assert(!err);
+
+ /* these can be only called after ram_block_rebind() */
+ memory_listener_register(&kml->listener, &address_space_memory);
+ memory_listener_register(&kvm_io_listener, &address_space_io);
+
+ /*
+ * kvm fd has changed. Commit the irq routes to KVM once more.
+ */
+ kvm_irqchip_commit_routes(s);
+ /*
+ * for confidential guest, this is the last possible place where we
+ * can call synchronize_all_post_init() to sync all vcpu states to
+ * kvm.
+ */
+ if (ms->cgs) {
+ cpu_synchronize_all_post_init();
+ }
+ trace_kvm_reset_vmfd();
+ return ret;
+}
+
static int kvm_init(AccelState *as, MachineState *ms)
{
MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -3997,6 +4277,7 @@ static void kvm_accel_instance_init(Object *obj)
s->xen_evtchn_max_pirq = 256;
s->device = NULL;
s->msr_energy.enable = false;
+ s->honor_guest_pat = ON_OFF_AUTO_OFF;
}
/**
@@ -4015,6 +4296,7 @@ static void kvm_accel_class_init(ObjectClass *oc, const void *data)
AccelClass *ac = ACCEL_CLASS(oc);
ac->name = "KVM";
ac->init_machine = kvm_init;
+ ac->rebuild_guest = kvm_reset_vmfd;
ac->has_memory = kvm_accel_has_memory;
ac->allowed = &kvm_allowed;
ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
index e43d18a..4a8921c 100644
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -14,6 +14,8 @@ kvm_destroy_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu"
kvm_park_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu"
kvm_unpark_vcpu(unsigned long arch_cpu_id, const char *msg) "id: %lu %s"
kvm_irqchip_commit_routes(void) ""
+kvm_reset_vmfd(void) ""
+kvm_rebind_vcpus(void) ""
kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d"
kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d"
kvm_irqchip_release_virq(int virq) "virq %d"
diff --git a/accel/meson.build b/accel/meson.build
index 289b742..7da12b9 100644
--- a/accel/meson.build
+++ b/accel/meson.build
@@ -12,6 +12,7 @@ if have_system
subdir('xen')
subdir('stubs')
subdir('mshv')
+ subdir('nitro')
endif
# qtest
diff --git a/accel/nitro/meson.build b/accel/nitro/meson.build
new file mode 100644
index 0000000..e01c1ba
--- /dev/null
+++ b/accel/nitro/meson.build
@@ -0,0 +1,3 @@
+nitro_ss = ss.source_set()
+nitro_ss.add(files('nitro-accel.c'))
+system_ss.add_all(when: 'CONFIG_NITRO', if_true: nitro_ss)
diff --git a/accel/nitro/nitro-accel.c b/accel/nitro/nitro-accel.c
new file mode 100644
index 0000000..a1e97a9
--- /dev/null
+++ b/accel/nitro/nitro-accel.c
@@ -0,0 +1,284 @@
+/*
+ * Nitro Enclaves accelerator
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors:
+ * Alexander Graf <graf@amazon.com>
+ *
+ * Nitro Enclaves are a confidential compute technology which
+ * allows a parent instance to carve out resources from itself
+ * and spawn a confidential sibling VM next to itself. Similar
+ * to other confidential compute solutions, this sibling is
+ * controlled by an underlying vmm, but still has a higher level
+ * vmm (QEMU) to implement some of its I/O functionality and
+ * lifecycle.
+ *
+ * This accelerator drives /dev/nitro_enclaves to spawn a Nitro
+ * Enclave. It works in tandem with the nitro_enclaves machine
+ * which ensures the correct backend devices are available and
+ * that the initial seed (an EIF file) is loaded at the correct
+ * offset in memory.
+ *
+ * The accel starts the enclave when the machine starts, after
+ * all device setup is finished.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "qemu/module.h"
+#include "qemu/rcu.h"
+#include "qemu/accel.h"
+#include "qemu/guest-random.h"
+#include "qemu/main-loop.h"
+#include "accel/accel-ops.h"
+#include "accel/accel-cpu-ops.h"
+#include "accel/dummy-cpus.h"
+#include "system/cpus.h"
+#include "hw/core/cpu.h"
+#include "hw/core/boards.h"
+#include "hw/nitro/nitro-vsock-bus.h"
+#include "system/ramblock.h"
+#include "system/nitro-accel.h"
+#include "trace.h"
+
+#include <sys/ioctl.h>
+#include "standard-headers/linux/nitro_enclaves.h"
+
+bool nitro_allowed;
+
+typedef struct NitroAccelState {
+ AccelState parent_obj;
+
+ int ne_fd;
+ int enclave_fd;
+ uint64_t slot_uid;
+ uint64_t enclave_cid;
+ bool debug_mode;
+} NitroAccelState;
+
+static int nitro_init_machine(AccelState *as, MachineState *ms)
+{
+ NitroAccelState *s = NITRO_ACCEL(as);
+ uint64_t slot_uid = 0;
+ int ret;
+
+ s->ne_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
+ if (s->ne_fd < 0) {
+ error_report("nitro: failed to open /dev/nitro_enclaves: %s",
+ strerror(errno));
+ return -errno;
+ }
+
+ ret = ioctl(s->ne_fd, NE_CREATE_VM, &slot_uid);
+ if (ret < 0) {
+ error_report("nitro: NE_CREATE_VM failed: %s", strerror(errno));
+ close(s->ne_fd);
+ return -errno;
+ }
+ s->enclave_fd = ret;
+ s->slot_uid = slot_uid;
+
+ return 0;
+}
+
+static int nitro_donate_ram_block(RAMBlock *rb, void *opaque)
+{
+ NitroAccelState *s = opaque;
+ struct ne_user_memory_region region = {
+ .flags = 0,
+ .memory_size = rb->used_length,
+ .userspace_addr = (uint64_t)(uintptr_t)rb->host,
+ };
+
+ if (!rb->used_length) {
+ return 0;
+ }
+
+ if (ioctl(s->enclave_fd, NE_SET_USER_MEMORY_REGION, &region) < 0) {
+ error_report("nitro: NE_SET_USER_MEMORY_REGION failed for %s "
+ "(%" PRIu64 " bytes): %s", rb->idstr, rb->used_length,
+ strerror(errno));
+ return -errno;
+ }
+ return 0;
+}
+
+/*
+ * Start the Enclave. At this point memory is set up and the EIF is loaded.
+ * This function donates memory, adds vCPUs, and starts the enclave.
+ */
+static void nitro_setup_post(AccelState *as)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ NitroAccelState *s = NITRO_ACCEL(as);
+ int nr_cpus = ms->smp.cpus;
+ int i, ret;
+ struct ne_enclave_start_info start_info = {
+ .flags = s->debug_mode ? NE_ENCLAVE_DEBUG_MODE : 0,
+ .enclave_cid = s->enclave_cid,
+ };
+
+ ret = qemu_ram_foreach_block(nitro_donate_ram_block, s);
+ if (ret < 0) {
+ error_report("nitro: failed to donate memory");
+ exit(1);
+ }
+
+ for (i = 0; i < nr_cpus; i++) {
+ uint32_t cpu_id = 0;
+ if (ioctl(s->enclave_fd, NE_ADD_VCPU, &cpu_id) < 0) {
+ error_report("nitro: NE_ADD_VCPU failed: %s", strerror(errno));
+ exit(1);
+ }
+ }
+
+ ret = ioctl(s->enclave_fd, NE_START_ENCLAVE, &start_info);
+ if (ret < 0) {
+ switch (errno) {
+ case NE_ERR_NO_MEM_REGIONS_ADDED:
+ error_report("nitro: no memory regions added");
+ break;
+ case NE_ERR_NO_VCPUS_ADDED:
+ error_report("nitro: no vCPUs added");
+ break;
+ case NE_ERR_ENCLAVE_MEM_MIN_SIZE:
+ error_report("nitro: memory is below the minimum "
+ "required size. Try increasing -m");
+ break;
+ case NE_ERR_FULL_CORES_NOT_USED:
+ error_report("nitro: requires full CPU cores. "
+ "Try increasing -smp to a multiple of threads "
+ "per core on this host (e.g. -smp 2)");
+ break;
+ case NE_ERR_NOT_IN_INIT_STATE:
+ error_report("nitro: not in init state");
+ break;
+ case NE_ERR_INVALID_FLAG_VALUE:
+ error_report("nitro: invalid flag value for NE_START_ENCLAVE");
+ break;
+ case NE_ERR_INVALID_ENCLAVE_CID:
+ error_report("nitro: invalid enclave CID");
+ break;
+ default:
+ error_report("nitro: NE_START_ENCLAVE failed: %s (errno %d)",
+ strerror(errno), errno);
+ break;
+ }
+ exit(1);
+ }
+
+ s->enclave_cid = start_info.enclave_cid;
+ trace_nitro_enclave_started(s->enclave_cid);
+
+ /*
+ * Notify all Nitro vsock bus devices that the enclave has started
+ * and provide them with the CID for vsock connections.
+ */
+ {
+ NitroVsockBridge *bridge = nitro_vsock_bridge_find();
+ Error *err = NULL;
+
+ if (bridge) {
+ nitro_vsock_bridge_start_enclave(bridge,
+ (uint32_t)s->enclave_cid, &err);
+ if (err) {
+ error_report_err(err);
+ exit(1);
+ }
+ }
+ }
+}
+
+/* QOM properties */
+
+static bool nitro_get_debug_mode(Object *obj, Error **errp)
+{
+ return NITRO_ACCEL(obj)->debug_mode;
+}
+
+static void nitro_set_debug_mode(Object *obj, bool value, Error **errp)
+{
+ NITRO_ACCEL(obj)->debug_mode = value;
+}
+
+static void nitro_get_enclave_cid(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ uint64_t val = NITRO_ACCEL(obj)->enclave_cid;
+ visit_type_uint64(v, name, &val, errp);
+}
+
+static void nitro_set_enclave_cid(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ uint64_t val;
+ if (visit_type_uint64(v, name, &val, errp)) {
+ NITRO_ACCEL(obj)->enclave_cid = val;
+ }
+}
+
+static void nitro_accel_class_init(ObjectClass *oc, const void *data)
+{
+ AccelClass *ac = ACCEL_CLASS(oc);
+ ac->name = "Nitro";
+ ac->init_machine = nitro_init_machine;
+ ac->setup_post = nitro_setup_post;
+ ac->allowed = &nitro_allowed;
+
+ object_class_property_add_bool(oc, "debug-mode",
+ nitro_get_debug_mode,
+ nitro_set_debug_mode);
+ object_class_property_set_description(oc, "debug-mode",
+ "Start enclave in debug mode (enables console output)");
+
+ object_class_property_add(oc, "enclave-cid", "uint64",
+ nitro_get_enclave_cid,
+ nitro_set_enclave_cid,
+ NULL, NULL);
+ object_class_property_set_description(oc, "enclave-cid",
+ "Enclave CID (0 = auto-assigned by Nitro)");
+}
+
+static const TypeInfo nitro_accel_type = {
+ .name = TYPE_NITRO_ACCEL,
+ .parent = TYPE_ACCEL,
+ .instance_size = sizeof(NitroAccelState),
+ .class_init = nitro_accel_class_init,
+};
+module_obj(TYPE_NITRO_ACCEL);
+
+static bool nitro_cpus_are_resettable(void)
+{
+ return false;
+}
+
+static void nitro_accel_ops_class_init(ObjectClass *oc, const void *data)
+{
+ AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
+ ops->create_vcpu_thread = dummy_start_vcpu_thread;
+ ops->handle_interrupt = generic_handle_interrupt;
+ ops->cpus_are_resettable = nitro_cpus_are_resettable;
+}
+
+static const TypeInfo nitro_accel_ops_type = {
+ .name = ACCEL_OPS_NAME("nitro"),
+ .parent = TYPE_ACCEL_OPS,
+ .class_init = nitro_accel_ops_class_init,
+ .abstract = true,
+};
+module_obj(ACCEL_OPS_NAME("nitro"));
+
+static void nitro_type_init(void)
+{
+ type_register_static(&nitro_accel_type);
+ type_register_static(&nitro_accel_ops_type);
+}
+
+type_init(nitro_type_init);
diff --git a/accel/nitro/trace-events b/accel/nitro/trace-events
new file mode 100644
index 0000000..9673eb5
--- /dev/null
+++ b/accel/nitro/trace-events
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# See docs/devel/tracing.rst for syntax documentation.
+
+# nitro-accel.c
+nitro_enclave_started(uint64_t cid) "nitro: enclave started, CID=%"PRIu64
diff --git a/accel/nitro/trace.h b/accel/nitro/trace.h
new file mode 100644
index 0000000..8c55647
--- /dev/null
+++ b/accel/nitro/trace.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#include "trace/trace-accel_nitro.h"
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 68cd33b..c4617ca 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -79,6 +79,24 @@ void kvm_irqchip_change_notify(void)
{
}
+void kvm_vmfd_add_change_notifier(NotifierWithReturn *n)
+{
+}
+
+void kvm_vmfd_remove_change_notifier(NotifierWithReturn *n)
+{
+}
+
+void kvm_vcpufd_add_change_notifier(NotifierWithReturn *n)
+{
+ return;
+}
+
+void kvm_vcpufd_remove_change_notifier(NotifierWithReturn *n)
+{
+ return;
+}
+
int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
EventNotifier *rn, int virq)
{
diff --git a/accel/stubs/meson.build b/accel/stubs/meson.build
index 48eccd1..5de4a27 100644
--- a/accel/stubs/meson.build
+++ b/accel/stubs/meson.build
@@ -3,6 +3,7 @@ system_stubs_ss.add(when: 'CONFIG_XEN', if_false: files('xen-stub.c'))
system_stubs_ss.add(when: 'CONFIG_KVM', if_false: files('kvm-stub.c'))
system_stubs_ss.add(when: 'CONFIG_TCG', if_false: files('tcg-stub.c'))
system_stubs_ss.add(when: 'CONFIG_HVF', if_false: files('hvf-stub.c'))
+system_stubs_ss.add(when: 'CONFIG_NITRO', if_false: files('nitro-stub.c'))
system_stubs_ss.add(when: 'CONFIG_NVMM', if_false: files('nvmm-stub.c'))
system_stubs_ss.add(when: 'CONFIG_WHPX', if_false: files('whpx-stub.c'))
system_stubs_ss.add(when: 'CONFIG_MSHV', if_false: files('mshv-stub.c'))
diff --git a/accel/stubs/nitro-stub.c b/accel/stubs/nitro-stub.c
new file mode 100644
index 0000000..186c844
--- /dev/null
+++ b/accel/stubs/nitro-stub.c
@@ -0,0 +1,11 @@
+/*
+ * Nitro accel stubs for QEMU
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+
+bool nitro_allowed;
diff --git a/accel/whpx/whpx-accel-ops.c b/accel/whpx/whpx-accel-ops.c
index 50fadea..b8f4154 100644
--- a/accel/whpx/whpx-accel-ops.c
+++ b/accel/whpx/whpx-accel-ops.c
@@ -17,6 +17,7 @@
#include "system/whpx.h"
#include "system/whpx-internal.h"
+#include "system/whpx-all.h"
#include "system/whpx-accel-ops.h"
static void *whpx_cpu_thread_fn(void *arg)
@@ -81,6 +82,12 @@ static bool whpx_vcpu_thread_is_idle(CPUState *cpu)
return !whpx_irqchip_in_kernel();
}
+static bool whpx_supports_guest_debug(void)
+{
+ return whpx_arch_supports_guest_debug();
+}
+
+
static void whpx_accel_ops_class_init(ObjectClass *oc, const void *data)
{
AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
@@ -89,6 +96,7 @@ static void whpx_accel_ops_class_init(ObjectClass *oc, const void *data)
ops->kick_vcpu_thread = whpx_kick_vcpu_thread;
ops->cpu_thread_is_idle = whpx_vcpu_thread_is_idle;
ops->handle_interrupt = generic_handle_interrupt;
+ ops->supports_guest_debug = whpx_supports_guest_debug;
ops->synchronize_post_reset = whpx_cpu_synchronize_post_reset;
ops->synchronize_post_init = whpx_cpu_synchronize_post_init;
diff --git a/accel/whpx/whpx-common.c b/accel/whpx/whpx-common.c
index f018a8f..4863fc8 100644
--- a/accel/whpx/whpx-common.c
+++ b/accel/whpx/whpx-common.c
@@ -39,13 +39,45 @@ bool whpx_allowed;
bool whpx_irqchip_in_kernel;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform;
-#ifdef HOST_X86_64
-static HMODULE hWinHvEmulation;
-#endif
struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;
+void whpx_flush_cpu_state(CPUState *cpu)
+{
+ if (cpu->vcpu_dirty) {
+ whpx_set_registers(cpu, WHPX_LEVEL_RUNTIME_STATE);
+ cpu->vcpu_dirty = false;
+ }
+}
+
+void whpx_get_reg(CPUState *cpu, WHV_REGISTER_NAME reg, WHV_REGISTER_VALUE* val)
+{
+ struct whpx_state *whpx = &whpx_global;
+ HRESULT hr;
+
+ whpx_flush_cpu_state(cpu);
+
+ hr = whp_dispatch.WHvGetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
+ &reg, 1, val);
+
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to get register %08x, hr=%08lx", reg, hr);
+ }
+}
+
+void whpx_set_reg(CPUState *cpu, WHV_REGISTER_NAME reg, WHV_REGISTER_VALUE val)
+{
+ struct whpx_state *whpx = &whpx_global;
+ HRESULT hr;
+ hr = whp_dispatch.WHvSetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
+ &reg, 1, &val);
+
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to set register %08x, hr=%08lx", reg, hr);
+ }
+}
+
/* Tries to find a breakpoint at the specified address. */
struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
@@ -148,7 +180,7 @@ int whpx_last_vcpu_stopping(CPUState *cpu)
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
if (!cpu->vcpu_dirty) {
- whpx_get_registers(cpu);
+ whpx_get_registers(cpu, WHPX_LEVEL_FULL_STATE);
cpu->vcpu_dirty = true;
}
}
@@ -156,14 +188,14 @@ static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
run_on_cpu_data arg)
{
- whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
+ whpx_set_registers(cpu, WHPX_LEVEL_RESET_STATE);
cpu->vcpu_dirty = false;
}
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
run_on_cpu_data arg)
{
- whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
+ whpx_set_registers(cpu, WHPX_LEVEL_FULL_STATE);
cpu->vcpu_dirty = false;
}
@@ -236,10 +268,7 @@ void whpx_destroy_vcpu(CPUState *cpu)
struct whpx_state *whpx = &whpx_global;
whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
-#ifdef HOST_X86_64
- AccelCPUState *vcpu = cpu->accel;
- whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
-#endif
+ whpx_arch_destroy_vcpu(cpu);
g_free(cpu->accel);
}
@@ -361,7 +390,6 @@ static bool load_whp_dispatch_fns(HMODULE *handle,
HMODULE hLib = *handle;
#define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
- #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
#define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
whp_dispatch.function_name = \
(function_name ## _t)GetProcAddress(hLib, #function_name); \
@@ -388,14 +416,6 @@ static bool load_whp_dispatch_fns(HMODULE *handle,
WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
break;
- case WINHV_EMULATION_FNS_DEFAULT:
-#ifdef HOST_X86_64
- WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
- LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
-#else
- g_assert_not_reached();
-#endif
- break;
case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
@@ -511,11 +531,6 @@ bool init_whp_dispatch(void)
if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
goto error;
}
-#ifdef HOST_X86_64
- if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
- goto error;
- }
-#endif
assert(load_whp_dispatch_fns(&hWinHvPlatform,
WINHV_PLATFORM_FNS_SUPPLEMENTAL));
whp_dispatch_initialized = true;
@@ -525,11 +540,6 @@ error:
if (hWinHvPlatform) {
FreeLibrary(hWinHvPlatform);
}
-#ifdef HOST_X86_64
- if (hWinHvEmulation) {
- FreeLibrary(hWinHvEmulation);
- }
-#endif
return false;
}
diff --git a/audio/audio-mixeng-be.c b/audio/audio-mixeng-be.c
index 3704045..5878b23 100644
--- a/audio/audio-mixeng-be.c
+++ b/audio/audio-mixeng-be.c
@@ -1649,7 +1649,7 @@ static void audio_mixeng_backend_set_volume_out(AudioBackend *be, SWVoiceOut *sw
sw->vol.mute = vol->mute;
sw->vol.l = nominal_volume.l * vol->vol[0] / 255;
- sw->vol.r = nominal_volume.l * vol->vol[vol->channels > 1 ? 1 : 0] /
+ sw->vol.r = nominal_volume.r * vol->vol[vol->channels > 1 ? 1 : 0] /
255;
if (k->volume_out) {
diff --git a/audio/paaudio.c b/audio/paaudio.c
index 23e8767..24327ec 100644
--- a/audio/paaudio.c
+++ b/audio/paaudio.c
@@ -62,26 +62,6 @@ static void G_GNUC_PRINTF(2, 3) qpa_logerr(int err, const char *fmt, ...)
error_printf(" Reason: %s\n", pa_strerror(err));
}
-#ifndef PA_CONTEXT_IS_GOOD
-static inline int PA_CONTEXT_IS_GOOD(pa_context_state_t x)
-{
- return
- x == PA_CONTEXT_CONNECTING ||
- x == PA_CONTEXT_AUTHORIZING ||
- x == PA_CONTEXT_SETTING_NAME ||
- x == PA_CONTEXT_READY;
-}
-#endif
-
-#ifndef PA_STREAM_IS_GOOD
-static inline int PA_STREAM_IS_GOOD(pa_stream_state_t x)
-{
- return
- x == PA_STREAM_CREATING ||
- x == PA_STREAM_READY;
-}
-#endif
-
#define CHECK_SUCCESS_GOTO(c, expression, label, msg) \
do { \
if (!(expression)) { \
@@ -682,9 +662,7 @@ static void qpa_volume_out(HWVoiceOut *hw, Volume *vol)
PAConnection *c = pa->g->conn;
int i;
-#ifdef PA_CHECK_VERSION /* macro is present in 0.9.16+ */
- pa_cvolume_init (&v); /* function is present in 0.9.13+ */
-#endif
+ pa_cvolume_init(&v);
v.channels = vol->channels;
for (i = 0; i < vol->channels; ++i) {
@@ -724,9 +702,7 @@ static void qpa_volume_in(HWVoiceIn *hw, Volume *vol)
PAConnection *c = pa->g->conn;
int i;
-#ifdef PA_CHECK_VERSION
- pa_cvolume_init (&v);
-#endif
+ pa_cvolume_init(&v);
v.channels = vol->channels;
for (i = 0; i < vol->channels; ++i) {
diff --git a/audio/spiceaudio.c b/audio/spiceaudio.c
index 70a0b60..5a97eb8 100644
--- a/audio/spiceaudio.c
+++ b/audio/spiceaudio.c
@@ -49,17 +49,8 @@ static bool spice_audio_realize(AudioBackend *abe, Audiodev *dev, Error **errp)
return audio_spice_parent_class->realize(abe, dev, errp);
}
-#if SPICE_INTERFACE_PLAYBACK_MAJOR > 1 || SPICE_INTERFACE_PLAYBACK_MINOR >= 3
#define LINE_OUT_SAMPLES (480 * 4)
-#else
-#define LINE_OUT_SAMPLES (256 * 4)
-#endif
-
-#if SPICE_INTERFACE_RECORD_MAJOR > 2 || SPICE_INTERFACE_RECORD_MINOR >= 3
#define LINE_IN_SAMPLES (480 * 4)
-#else
-#define LINE_IN_SAMPLES (256 * 4)
-#endif
typedef struct SpiceVoiceOut {
HWVoiceOut hw;
@@ -99,11 +90,7 @@ static int line_out_init(HWVoiceOut *hw, struct audsettings *as)
SpiceVoiceOut *out = container_of (hw, SpiceVoiceOut, hw);
struct audsettings settings;
-#if SPICE_INTERFACE_PLAYBACK_MAJOR > 1 || SPICE_INTERFACE_PLAYBACK_MINOR >= 3
settings.freq = spice_server_get_best_playback_rate(NULL);
-#else
- settings.freq = SPICE_INTERFACE_PLAYBACK_FREQ;
-#endif
settings.nchannels = SPICE_INTERFACE_PLAYBACK_CHAN;
settings.fmt = AUDIO_FORMAT_S16;
settings.big_endian = HOST_BIG_ENDIAN;
@@ -114,9 +101,7 @@ static int line_out_init(HWVoiceOut *hw, struct audsettings *as)
out->sin.base.sif = &playback_sif.base;
qemu_spice.add_interface(&out->sin.base);
-#if SPICE_INTERFACE_PLAYBACK_MAJOR > 1 || SPICE_INTERFACE_PLAYBACK_MINOR >= 3
spice_server_set_playback_rate(&out->sin, settings.freq);
-#endif
return 0;
}
@@ -194,7 +179,6 @@ static void line_out_enable(HWVoiceOut *hw, bool enable)
}
}
-#if ((SPICE_INTERFACE_PLAYBACK_MAJOR >= 1) && (SPICE_INTERFACE_PLAYBACK_MINOR >= 2))
static void line_out_volume(HWVoiceOut *hw, Volume *vol)
{
SpiceVoiceOut *out = container_of(hw, SpiceVoiceOut, hw);
@@ -206,7 +190,6 @@ static void line_out_volume(HWVoiceOut *hw, Volume *vol)
spice_server_playback_set_volume(&out->sin, 2, svol);
spice_server_playback_set_mute(&out->sin, vol->mute);
}
-#endif
/* record */
@@ -215,11 +198,7 @@ static int line_in_init(HWVoiceIn *hw, struct audsettings *as)
SpiceVoiceIn *in = container_of (hw, SpiceVoiceIn, hw);
struct audsettings settings;
-#if SPICE_INTERFACE_RECORD_MAJOR > 2 || SPICE_INTERFACE_RECORD_MINOR >= 3
settings.freq = spice_server_get_best_record_rate(NULL);
-#else
- settings.freq = SPICE_INTERFACE_RECORD_FREQ;
-#endif
settings.nchannels = SPICE_INTERFACE_RECORD_CHAN;
settings.fmt = AUDIO_FORMAT_S16;
settings.big_endian = HOST_BIG_ENDIAN;
@@ -230,9 +209,7 @@ static int line_in_init(HWVoiceIn *hw, struct audsettings *as)
in->sin.base.sif = &record_sif.base;
qemu_spice.add_interface(&in->sin.base);
-#if SPICE_INTERFACE_RECORD_MAJOR > 2 || SPICE_INTERFACE_RECORD_MINOR >= 3
spice_server_set_record_rate(&in->sin, settings.freq);
-#endif
return 0;
}
@@ -281,7 +258,6 @@ static void line_in_enable(HWVoiceIn *hw, bool enable)
}
}
-#if ((SPICE_INTERFACE_RECORD_MAJOR >= 2) && (SPICE_INTERFACE_RECORD_MINOR >= 2))
static void line_in_volume(HWVoiceIn *hw, Volume *vol)
{
SpiceVoiceIn *in = container_of(hw, SpiceVoiceIn, hw);
@@ -293,7 +269,6 @@ static void line_in_volume(HWVoiceIn *hw, Volume *vol)
spice_server_record_set_volume(&in->sin, 2, svol);
spice_server_record_set_mute(&in->sin, vol->mute);
}
-#endif
static void audio_spice_class_init(ObjectClass *klass, const void *data)
{
@@ -315,19 +290,14 @@ static void audio_spice_class_init(ObjectClass *klass, const void *data)
k->get_buffer_out = line_out_get_buffer;
k->put_buffer_out = line_out_put_buffer;
k->enable_out = line_out_enable;
-#if (SPICE_INTERFACE_PLAYBACK_MAJOR >= 1) && \
- (SPICE_INTERFACE_PLAYBACK_MINOR >= 2)
k->volume_out = line_out_volume;
-#endif
k->init_in = line_in_init;
k->fini_in = line_in_fini;
k->read = line_in_read;
k->run_buffer_in = audio_generic_run_buffer_in;
k->enable_in = line_in_enable;
-#if ((SPICE_INTERFACE_RECORD_MAJOR >= 2) && (SPICE_INTERFACE_RECORD_MINOR >= 2))
k->volume_in = line_in_volume;
-#endif
}
static const TypeInfo audio_types[] = {
diff --git a/docs/system/confidential-guest-support.rst b/docs/system/confidential-guest-support.rst
index 66129fb..562a7c3 100644
--- a/docs/system/confidential-guest-support.rst
+++ b/docs/system/confidential-guest-support.rst
@@ -41,5 +41,6 @@ Currently supported confidential guest mechanisms are:
* Intel Trust Domain Extension (TDX) (see :doc:`i386/tdx`)
* POWER Protected Execution Facility (PEF) (see :ref:`power-papr-protected-execution-facility-pef`)
* s390x Protected Virtualization (PV) (see :doc:`s390x/protvirt`)
+* AWS Nitro Enclaves (see :doc:`nitro`)
Other mechanisms may be supported in future.
diff --git a/docs/system/index.rst b/docs/system/index.rst
index 427b020..d297a95 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -39,5 +39,6 @@ or Hypervisor.Framework.
multi-process
confidential-guest-support
igvm
+ nitro
vm-templating
sriov
diff --git a/docs/system/nitro.rst b/docs/system/nitro.rst
new file mode 100644
index 0000000..5907d61
--- /dev/null
+++ b/docs/system/nitro.rst
@@ -0,0 +1,133 @@
+AWS Nitro Enclaves
+==================
+
+`AWS Nitro Enclaves <https://aws.amazon.com/ec2/nitro/nitro-enclaves/>`_
+are isolated compute environments that run alongside EC2 instances.
+They are created by partitioning CPU and memory resources from a parent
+instance and launching a signed Enclave Image Format (EIF) file inside
+a confidential VM managed by the Nitro Hypervisor.
+
+QEMU supports launching Nitro Enclaves on EC2 instances that have
+enclave support enabled, using the ``nitro`` accelerator and the
+``nitro`` machine type.
+
+Prerequisites
+-------------
+
+* An EC2 instance with Nitro Enclaves enabled
+* The ``nitro_enclaves`` kernel module loaded (provides ``/dev/nitro_enclaves``)
+* CPU cores allocated to the Nitro Enclaves pool via ``nitro-enclaves-allocator``
+* Huge pages allocated for Nitro Enclaves via ``nitro-enclaves-allocator``
+
+Quick Start
+-----------
+
+Launch a Nitro Enclave from a pre-built EIF file::
+
+ $ qemu-system-x86_64 -accel nitro,debug-mode=on -M nitro -nographic \
+ -smp 2 -m 512M -kernel enclave.eif
+
+Launch an enclave from individual kernel and initrd files::
+
+ $ qemu-system-x86_64 -accel nitro,debug-mode=on -M nitro -nographic \
+ -smp 2 -m 512M -kernel vmlinuz -initrd initrd.cpio \
+ -append "console=ttyS0"
+
+The same commands work with ``qemu-system-aarch64`` on Graviton based EC2
+instances.
+
+Accelerator
+-----------
+
+The ``nitro`` accelerator (``-accel nitro``) drives the
+``/dev/nitro_enclaves`` device to create and manage a Nitro Enclave.
+It handles:
+
+* Creating the enclave VM slot
+* Donating memory regions (must be huge page backed)
+* Adding vCPUs (must be full physical cores)
+* Starting the enclave
+* Notifying vsock bus devices of the enclave CID
+
+Accelerator options:
+
+``debug-mode=on|off``
+ Enable debug mode. When enabled, the Nitro Hypervisor exposes the
+ enclave's serial console output via a vsock port that the machine
+ model automatically connects to. In debug mode, PCR values are zero.
+ Default is ``off``.
+
+Machine
+-------
+
+The ``nitro`` machine (``-M nitro``) is a minimal, architecture-independent
+machine that provides only what a Nitro Enclave needs:
+
+* RAM (huge page backed via memfd)
+* vCPUs (defaults to ``host`` CPU type)
+* A Nitro vsock bus with:
+
+ - A heartbeat device (vsock server on port 9000)
+ - A serial console bridge (vsock client, debug mode only)
+
+Communication to the Nitro Enclave is limited to virtio-vsock. The Enclave
+is allocated a CID at launch, at which it is reachable. A specific CID can
+be requested with ``-accel nitro,enclave-cid=<N>`` (0 lets the hypervisor
+choose). The assigned CID is readable from the vsock bridge device::
+
+ (qemu) qom-get /machine/peripheral/nitro-vsock enclave-cid
+
+EIF Image Format
+^^^^^^^^^^^^^^^^
+
+Nitro Enclaves boot from EIF (Enclave Image Format) files. When
+``-kernel`` points to an EIF file (detected by the ``.eif`` magic
+bytes), it is loaded directly into guest memory.
+
+When ``-kernel`` points to a regular kernel image (e.g. a bzImage or
+Image), the machine automatically assembles a minimal EIF on the fly
+from ``-kernel``, ``-initrd``, and ``-append``. This allows standard
+direct kernel boot without external EIF tooling.
+
+CPU Requirements
+^^^^^^^^^^^^^^^^
+
+Nitro Enclaves require full physical CPU cores. On hyperthreaded
+systems, this means ``-smp`` must be a multiple of the threads per
+core (typically 2).
+
+Nitro Enclaves can only consume cores that are donated to the Nitro Enclave
+CPU pool. You can configure the CPU pool using the ``nitro-enclaves-allocator``
+tool or manually by writing to the nitro_enclaves cpu pool parameter. To
+allocate vCPUs 1, 2 and 3, you can call::
+
+ $ echo 1,2,3 | sudo tee /sys/module/nitro_enclaves/parameters/ne_cpus
+
+Beware that on x86-64 systems, hyperthread siblings are not consecutive
+and must be added in pairs to the pool. Consult tools like ``lstopo``
+or ``lscpu`` for details about your instance's CPU topology.
+
+Memory Requirements
+^^^^^^^^^^^^^^^^^^^
+
+Enclave memory must be huge page backed. The machine automatically
+creates a memfd memory backend with huge pages enabled. To make the
+huge page allocation work, ensure that huge pages are reserved in
+the system. To reserve 1 GiB of memory on a 4 KiB PAGE_SIZE system,
+you can call::
+
+ $ echo 512 | sudo tee /proc/sys/vm/nr_hugepages
+
+Emulated Nitro Enclaves
+-----------------------
+
+In addition to the native Nitro Enclaves invocation, you can also use
+the emulated ``nitro-enclave`` machine target (see :doc:`i386/nitro-enclave`)
+which implements the x86 Nitro Enclave device model. While ``-M nitro``
+delegates virtual machine device emulation to the Nitro Hypervisor,
+``-M nitro-enclave`` implements all devices itself, which means it also
+works on non-EC2 instances.
+
+If you require NSM-based attestation backed by valid AWS certificates,
+you must use ``-M nitro``. The ``-M nitro-enclave`` model does not
+provide you with an AWS-signed attestation document.
diff --git a/hw/Kconfig b/hw/Kconfig
index f8f92b5..b3ce152 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -22,6 +22,7 @@ source isa/Kconfig
source mem/Kconfig
source misc/Kconfig
source net/Kconfig
+source nitro/Kconfig
source nubus/Kconfig
source nvme/Kconfig
source nvram/Kconfig
diff --git a/hw/core/eif.c b/hw/core/eif.c
index 513caec..96f1d76 100644
--- a/hw/core/eif.c
+++ b/hw/core/eif.c
@@ -18,44 +18,6 @@
#include "hw/core/eif.h"
-#define MAX_SECTIONS 32
-
-/* members are ordered according to field order in .eif file */
-typedef struct EifHeader {
- uint8_t magic[4]; /* must be .eif in ascii i.e., [46, 101, 105, 102] */
- uint16_t version;
- uint16_t flags;
- uint64_t default_memory;
- uint64_t default_cpus;
- uint16_t reserved;
- uint16_t section_cnt;
- uint64_t section_offsets[MAX_SECTIONS];
- uint64_t section_sizes[MAX_SECTIONS];
- uint32_t unused;
- uint32_t eif_crc32;
-} QEMU_PACKED EifHeader;
-
-/* members are ordered according to field order in .eif file */
-typedef struct EifSectionHeader {
- /*
- * 0 = invalid, 1 = kernel, 2 = cmdline, 3 = ramdisk, 4 = signature,
- * 5 = metadata
- */
- uint16_t section_type;
- uint16_t flags;
- uint64_t section_size;
-} QEMU_PACKED EifSectionHeader;
-
-enum EifSectionTypes {
- EIF_SECTION_INVALID = 0,
- EIF_SECTION_KERNEL = 1,
- EIF_SECTION_CMDLINE = 2,
- EIF_SECTION_RAMDISK = 3,
- EIF_SECTION_SIGNATURE = 4,
- EIF_SECTION_METADATA = 5,
- EIF_SECTION_MAX = 6,
-};
-
static const char *section_type_to_string(uint16_t type)
{
const char *str;
diff --git a/hw/core/eif.h b/hw/core/eif.h
index fed3cb5..0c432db 100644
--- a/hw/core/eif.h
+++ b/hw/core/eif.h
@@ -11,6 +11,47 @@
#ifndef HW_CORE_EIF_H
#define HW_CORE_EIF_H
+#define MAX_SECTIONS 32
+#define EIF_HDR_ARCH_ARM64 0x1
+
+/* members are ordered according to field order in .eif file */
+typedef struct EifHeader {
+ uint8_t magic[4]; /* must be .eif in ascii i.e., [46, 101, 105, 102] */
+ uint16_t version;
+ uint16_t flags;
+ uint64_t default_memory;
+ uint64_t default_cpus;
+ uint16_t reserved;
+ uint16_t section_cnt;
+ uint64_t section_offsets[MAX_SECTIONS];
+ uint64_t section_sizes[MAX_SECTIONS];
+ uint32_t unused;
+ uint32_t eif_crc32;
+} QEMU_PACKED EifHeader;
+
+/* members are ordered according to field order in .eif file */
+typedef struct EifSectionHeader {
+ /*
+ * 0 = invalid, 1 = kernel, 2 = cmdline, 3 = ramdisk, 4 = signature,
+ * 5 = metadata
+ */
+ uint16_t section_type;
+ uint16_t flags;
+ uint64_t section_size;
+} QEMU_PACKED EifSectionHeader;
+
+enum EifSectionTypes {
+ EIF_SECTION_INVALID = 0,
+ EIF_SECTION_KERNEL = 1,
+ EIF_SECTION_CMDLINE = 2,
+ EIF_SECTION_RAMDISK = 3,
+ EIF_SECTION_SIGNATURE = 4,
+ EIF_SECTION_METADATA = 5,
+ EIF_SECTION_MAX = 6,
+};
+
+#define EIF_MAGIC { '.', 'e', 'i', 'f' }
+
bool read_eif_file(const char *eif_path, const char *machine_initrd,
char **kernel_path, char **initrd_path,
char **kernel_cmdline, uint8_t *image_sha384,
diff --git a/hw/core/machine.c b/hw/core/machine.c
index d4ef620..eae1f6b 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -435,6 +435,21 @@ static void machine_set_dump_guest_core(Object *obj, bool value, Error **errp)
ms->dump_guest_core = value;
}
+static bool machine_get_new_accel_vmfd_on_reset(Object *obj, Error **errp)
+{
+ MachineState *ms = MACHINE(obj);
+
+ return ms->new_accel_vmfd_on_reset;
+}
+
+static void machine_set_new_accel_vmfd_on_reset(Object *obj,
+ bool value, Error **errp)
+{
+ MachineState *ms = MACHINE(obj);
+
+ ms->new_accel_vmfd_on_reset = value;
+}
+
static bool machine_get_mem_merge(Object *obj, Error **errp)
{
MachineState *ms = MACHINE(obj);
@@ -1183,6 +1198,13 @@ static void machine_class_init(ObjectClass *oc, const void *data)
object_class_property_set_description(oc, "dump-guest-core",
"Include guest memory in a core dump");
+ object_class_property_add_bool(oc, "x-change-vmfd-on-reset",
+ machine_get_new_accel_vmfd_on_reset,
+ machine_set_new_accel_vmfd_on_reset);
+ object_class_property_set_description(oc, "x-change-vmfd-on-reset",
+ "Set on/off to enable/disable generating new accelerator guest handle "
+ "on guest reset. Default: off (used only for testing/debugging).");
+
object_class_property_add_bool(oc, "mem-merge",
machine_get_mem_merge, machine_set_mem_merge);
object_class_property_set_description(oc, "mem-merge",
diff --git a/hw/hyperv/trace-events b/hw/hyperv/trace-events
index 7963c21..d8c96f1 100644
--- a/hw/hyperv/trace-events
+++ b/hw/hyperv/trace-events
@@ -16,6 +16,7 @@ vmbus_gpadl_torndown(uint32_t gpadl_id) "gpadl #%d"
vmbus_open_channel(uint32_t chan_id, uint32_t gpadl_id, uint32_t target_vp) "channel #%d gpadl #%d target vp %d"
vmbus_channel_open(uint32_t chan_id, uint32_t status) "channel #%d status %d"
vmbus_close_channel(uint32_t chan_id) "channel #%d"
+vmbus_handle_vmfd_change(void) ""
# hv-balloon
hv_balloon_state_change(const char *tostr) "-> %s"
diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c
index c5bab5d..64abe4c 100644
--- a/hw/hyperv/vmbus.c
+++ b/hw/hyperv/vmbus.c
@@ -20,6 +20,7 @@
#include "hw/hyperv/vmbus-bridge.h"
#include "hw/core/sysbus.h"
#include "exec/cpu-common.h"
+#include "system/kvm.h"
#include "exec/target_page.h"
#include "trace.h"
@@ -248,6 +249,12 @@ struct VMBus {
* interrupt page
*/
EventNotifier notifier;
+
+ /*
+ * Notifier to inform when vmfd is changed as a part of confidential guest
+ * reset mechanism.
+ */
+ NotifierWithReturn vmbus_vmfd_change_notifier;
};
static bool gpadl_full(VMBusGpadl *gpadl)
@@ -2347,6 +2354,33 @@ static void vmbus_dev_unrealize(DeviceState *dev)
free_channels(vdev);
}
+/*
+ * If the KVM fd changes because of VM reset in confidential guests,
+ * reassociate event fd with the new KVM fd.
+ */
+static int vmbus_handle_vmfd_change(NotifierWithReturn *notifier,
+ void *data, Error** errp)
+{
+ VMBus *vmbus = container_of(notifier, VMBus,
+ vmbus_vmfd_change_notifier);
+ int ret = 0;
+
+ /* we are not interested in pre vmfd change notification */
+ if (((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+
+ ret = hyperv_set_event_flag_handler(VMBUS_EVENT_CONNECTION_ID,
+ &vmbus->notifier);
+ /* if we are only using userland event handler, it may already exist */
+ if (ret != 0 && ret != -EEXIST) {
+ error_setg(errp, "hyperv set event handler failed with %d", ret);
+ }
+
+ trace_vmbus_handle_vmfd_change();
+ return ret;
+}
+
static const Property vmbus_dev_props[] = {
DEFINE_PROP_UUID("instanceid", VMBusDevice, instanceid),
};
@@ -2429,6 +2463,9 @@ static void vmbus_realize(BusState *bus, Error **errp)
goto clear_event_notifier;
}
+ vmbus->vmbus_vmfd_change_notifier.notify = vmbus_handle_vmfd_change;
+ kvm_vmfd_add_change_notifier(&vmbus->vmbus_vmfd_change_notifier);
+
return;
clear_event_notifier:
diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index aba6842..10d3425 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -50,6 +50,9 @@ struct KVMClockState {
/* whether the 'clock' value was obtained in a host with
* reliable KVM_GET_CLOCK */
bool clock_is_reliable;
+
+ NotifierWithReturn kvmclock_vcpufd_change_notifier;
+ NotifierWithReturn kvmclock_vmfd_change_notifier;
};
struct pvclock_vcpu_time_info {
@@ -63,6 +66,9 @@ struct pvclock_vcpu_time_info {
uint8_t pad[2];
} __attribute__((__packed__)); /* 32 bytes */
+static int kvmclock_set_clock(NotifierWithReturn *notifier,
+ void *data, Error** errp);
+
static uint64_t kvmclock_current_nsec(KVMClockState *s)
{
CPUState *cpu = first_cpu;
@@ -219,6 +225,54 @@ static void kvmclock_vm_state_change(void *opaque, bool running,
}
}
+static int kvmclock_save_clock(NotifierWithReturn *notifier,
+ void *data, Error** errp)
+{
+ if (!((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+ KVMClockState *s = container_of(notifier, KVMClockState,
+ kvmclock_vmfd_change_notifier);
+ kvm_update_clock(s);
+ return 0;
+}
+
+static int kvmclock_set_clock(NotifierWithReturn *notifier,
+ void *data, Error** errp)
+{
+ struct kvm_clock_data clock_data = {};
+ CPUState *cpu;
+ int ret;
+ KVMClockState *s = container_of(notifier, KVMClockState,
+ kvmclock_vcpufd_change_notifier);
+ int cap_clock_ctrl = kvm_check_extension(kvm_state, KVM_CAP_KVMCLOCK_CTRL);
+
+ if (!s->clock_is_reliable) {
+ uint64_t pvclock_via_mem = kvmclock_current_nsec(s);
+ /* saved clock value before vmfd change is not reliable */
+ if (pvclock_via_mem) {
+ s->clock = pvclock_via_mem;
+ }
+ }
+
+ clock_data.clock = s->clock;
+ ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &clock_data);
+ if (ret < 0) {
+ fprintf(stderr, "KVM_SET_CLOCK failed: %s\n", strerror(-ret));
+ abort();
+ }
+
+ if (!cap_clock_ctrl) {
+ return 0;
+ }
+ CPU_FOREACH(cpu) {
+ run_on_cpu(cpu, do_kvmclock_ctrl, RUN_ON_CPU_NULL);
+ }
+
+ return 0;
+}
+
+
static void kvmclock_realize(DeviceState *dev, Error **errp)
{
KVMClockState *s = KVM_CLOCK(dev);
@@ -230,7 +284,12 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
kvm_update_clock(s);
+ s->kvmclock_vcpufd_change_notifier.notify = kvmclock_set_clock;
+ s->kvmclock_vmfd_change_notifier.notify = kvmclock_save_clock;
+
qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
+ kvm_vcpufd_add_change_notifier(&s->kvmclock_vcpufd_change_notifier);
+ kvm_vmfd_add_change_notifier(&s->kvmclock_vmfd_change_notifier);
}
static bool kvmclock_clock_is_reliable_needed(void *opaque)
diff --git a/hw/i386/kvm/i8254.c b/hw/i386/kvm/i8254.c
index 81e742f..70e8fd8 100644
--- a/hw/i386/kvm/i8254.c
+++ b/hw/i386/kvm/i8254.c
@@ -35,6 +35,7 @@
#include "hw/core/qdev-properties-system.h"
#include "system/kvm.h"
#include "target/i386/kvm/kvm_i386.h"
+#include "trace.h"
#include "qom/object.h"
#define KVM_PIT_REINJECT_BIT 0
@@ -52,6 +53,8 @@ struct KVMPITState {
LostTickPolicy lost_tick_policy;
bool vm_stopped;
int64_t kernel_clock_offset;
+
+ NotifierWithReturn kvmpit_vmfd_change_notifier;
};
struct KVMPITClass {
@@ -60,6 +63,43 @@ struct KVMPITClass {
DeviceRealize parent_realize;
};
+static void do_pit_initialize(KVMPITState *s, Error **errp)
+{
+ struct kvm_pit_config config = {
+ .flags = 0,
+ };
+ int ret;
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT2, &config);
+ if (ret < 0) {
+ error_setg(errp, "Create kernel PIC irqchip failed: %s",
+ strerror(-ret));
+ return;
+ }
+ switch (s->lost_tick_policy) {
+ case LOST_TICK_POLICY_DELAY:
+ break; /* enabled by default */
+ case LOST_TICK_POLICY_DISCARD:
+ if (kvm_check_extension(kvm_state, KVM_CAP_REINJECT_CONTROL)) {
+ struct kvm_reinject_control control = { .pit_reinject = 0 };
+
+ ret = kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
+ if (ret < 0) {
+ error_setg(errp,
+ "Can't disable in-kernel PIT reinjection: %s",
+ strerror(-ret));
+ return;
+ }
+ }
+ break;
+ default:
+ error_setg(errp, "Lost tick policy not supported.");
+ return;
+ }
+
+ return;
+}
+
static void kvm_pit_update_clock_offset(KVMPITState *s)
{
int64_t offset, clock_offset;
@@ -166,6 +206,23 @@ static void kvm_pit_put(PITCommonState *pit)
}
}
+static int kvmpit_post_vmfd_change(NotifierWithReturn *notifier,
+ void *data, Error** errp)
+{
+ KVMPITState *s = container_of(notifier, KVMPITState,
+ kvmpit_vmfd_change_notifier);
+
+ /* we are not interested in pre vmfd change notification */
+ if (((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+
+ do_pit_initialize(s, errp);
+
+ trace_kvmpit_post_vmfd_change();
+ return 0;
+}
+
static void kvm_pit_set_gate(PITCommonState *s, PITChannelState *sc, int val)
{
kvm_pit_get(s);
@@ -241,42 +298,13 @@ static void kvm_pit_realizefn(DeviceState *dev, Error **errp)
PITCommonState *pit = PIT_COMMON(dev);
KVMPITClass *kpc = KVM_PIT_GET_CLASS(dev);
KVMPITState *s = KVM_PIT(pit);
- struct kvm_pit_config config = {
- .flags = 0,
- };
- int ret;
if (!kvm_check_extension(kvm_state, KVM_CAP_PIT_STATE2) ||
!kvm_check_extension(kvm_state, KVM_CAP_PIT2)) {
error_setg(errp, "In-kernel PIT not available");
}
- ret = kvm_vm_ioctl(kvm_state, KVM_CREATE_PIT2, &config);
- if (ret < 0) {
- error_setg(errp, "Create kernel PIC irqchip failed: %s",
- strerror(-ret));
- return;
- }
- switch (s->lost_tick_policy) {
- case LOST_TICK_POLICY_DELAY:
- break; /* enabled by default */
- case LOST_TICK_POLICY_DISCARD:
- if (kvm_check_extension(kvm_state, KVM_CAP_REINJECT_CONTROL)) {
- struct kvm_reinject_control control = { .pit_reinject = 0 };
-
- ret = kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
- if (ret < 0) {
- error_setg(errp,
- "Can't disable in-kernel PIT reinjection: %s",
- strerror(-ret));
- return;
- }
- }
- break;
- default:
- error_setg(errp, "Lost tick policy not supported.");
- return;
- }
+ do_pit_initialize(s, errp);
memory_region_init_io(&pit->ioports, OBJECT(dev), NULL, NULL, "kvm-pit", 4);
@@ -284,6 +312,9 @@ static void kvm_pit_realizefn(DeviceState *dev, Error **errp)
qemu_add_vm_change_state_handler(kvm_pit_vm_state_change, s);
+ s->kvmpit_vmfd_change_notifier.notify = kvmpit_post_vmfd_change;
+ kvm_vmfd_add_change_notifier(&s->kvmpit_vmfd_change_notifier);
+
kpc->parent_realize(dev, errp);
}
diff --git a/hw/i386/kvm/trace-events b/hw/i386/kvm/trace-events
index 67bf7f1..33680ff 100644
--- a/hw/i386/kvm/trace-events
+++ b/hw/i386/kvm/trace-events
@@ -20,3 +20,4 @@ xenstore_reset_watches(void) ""
xenstore_watch_event(const char *path, const char *token) "path %s token %s"
xen_primary_console_create(void) ""
xen_primary_console_reset(int port) "port %u"
+kvmpit_post_vmfd_change(void) ""
diff --git a/hw/i386/vapic.c b/hw/i386/vapic.c
index 670a505..41e5ca2 100644
--- a/hw/i386/vapic.c
+++ b/hw/i386/vapic.c
@@ -16,6 +16,7 @@
#include "system/cpus.h"
#include "system/hw_accel.h"
#include "system/kvm.h"
+#include "system/whpx.h"
#include "system/runstate.h"
#include "system/address-spaces.h"
#include "hw/i386/apic_internal.h"
@@ -229,7 +230,8 @@ static int evaluate_tpr_instruction(VAPICROMState *s, X86CPU *cpu,
return -1;
}
- if (kvm_enabled() && !kvm_irqchip_in_kernel()) {
+ if ((kvm_enabled() && !kvm_irqchip_in_kernel())
+ || (whpx_enabled() && !whpx_irqchip_in_kernel())) {
/*
* KVM without kernel-based TPR access reporting will pass an IP that
* points after the accessing instruction. So we need to look backward
@@ -549,7 +551,7 @@ static int patch_hypercalls(VAPICROMState *s)
cpu_physical_memory_read(rom_paddr, rom, s->rom_size);
for (pos = 0; pos < s->rom_size - sizeof(vmcall_pattern); pos++) {
- if (kvm_irqchip_in_kernel()) {
+ if (kvm_enabled() && kvm_irqchip_in_kernel()) {
pattern = outl_pattern;
alternates[0] = outl_pattern[7];
alternates[1] = outl_pattern[7];
@@ -679,16 +681,25 @@ static void vapic_write(void *opaque, hwaddr addr, uint64_t data,
}
break;
case 1:
- if (kvm_enabled()) {
+ if (kvm_enabled() || (whpx_enabled() && !whpx_irqchip_in_kernel())) {
/*
* Disable triggering instruction in ROM by writing a NOP.
*
* We cannot do this in TCG mode as the reported IP is not
* accurate.
+ *
+ * Oddly enough, KVM increments EIP _before_ the execution
+ * of the instruction is finished.
*/
pause_all_vcpus();
- patch_byte(cpu, env->eip - 2, 0x66);
- patch_byte(cpu, env->eip - 1, 0x90);
+ if (!kvm_enabled()) {
+ patch_byte(cpu, env->eip, 0x66);
+ patch_byte(cpu, env->eip + 1, 0x90);
+ }
+ else {
+ patch_byte(cpu, env->eip - 2, 0x66);
+ patch_byte(cpu, env->eip - 1, 0x90);
+ }
resume_all_vcpus();
}
@@ -705,7 +716,8 @@ static void vapic_write(void *opaque, hwaddr addr, uint64_t data,
break;
default:
case 4:
- if (!kvm_irqchip_in_kernel()) {
+ if ((kvm_enabled() && !kvm_irqchip_in_kernel())
+ || (whpx_enabled() && !whpx_irqchip_in_kernel())) {
apic_poll_irq(cpu->apic_state);
}
break;
diff --git a/hw/i386/vmmouse.c b/hw/i386/vmmouse.c
index 2ae7f3a..c1aeeca 100644
--- a/hw/i386/vmmouse.c
+++ b/hw/i386/vmmouse.c
@@ -72,7 +72,7 @@ struct VMMouseState {
ISAKBDState *i8042;
};
-static void vmmouse_get_data(uint32_t *data)
+static void vmmouse_get_data(uint64_t *data)
{
X86CPU *cpu = X86_CPU(current_cpu);
CPUX86State *env = &cpu->env;
@@ -82,7 +82,7 @@ static void vmmouse_get_data(uint32_t *data)
data[4] = env->regs[R_ESI]; data[5] = env->regs[R_EDI];
}
-static void vmmouse_set_data(const uint32_t *data)
+static void vmmouse_set_data(const uint64_t *data)
{
X86CPU *cpu = X86_CPU(current_cpu);
CPUX86State *env = &cpu->env;
@@ -197,7 +197,7 @@ static void vmmouse_disable(VMMouseState *s)
vmmouse_remove_handler(s);
}
-static void vmmouse_data(VMMouseState *s, uint32_t *data, uint32_t size)
+static void vmmouse_data(VMMouseState *s, uint64_t *data, uint32_t size)
{
int i;
@@ -221,7 +221,7 @@ static void vmmouse_data(VMMouseState *s, uint32_t *data, uint32_t size)
static uint32_t vmmouse_ioport_read(void *opaque, uint32_t addr)
{
VMMouseState *s = opaque;
- uint32_t data[6];
+ uint64_t data[6];
uint16_t command;
vmmouse_get_data(data);
@@ -247,7 +247,7 @@ static uint32_t vmmouse_ioport_read(void *opaque, uint32_t addr)
vmmouse_request_absolute(s);
break;
default:
- printf("vmmouse: unknown command %x\n", data[1]);
+ printf("vmmouse: unknown command %" PRIx64 "\n", data[1]);
break;
}
break;
diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c
index de4cd76..a420112 100644
--- a/hw/i386/x86-common.c
+++ b/hw/i386/x86-common.c
@@ -1020,17 +1020,11 @@ void x86_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *isa_memory,
memory_region_set_readonly(isa_bios, read_only);
}
-void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
- MemoryRegion *rom_memory, bool isapc_ram_fw)
+static int get_bios_size(X86MachineState *x86ms,
+ const char *bios_name, char *filename)
{
- const char *bios_name;
- char *filename;
int bios_size;
- ssize_t ret;
- /* BIOS load */
- bios_name = MACHINE(x86ms)->firmware ?: default_firmware;
- filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
if (filename) {
bios_size = get_image_size(filename, NULL);
} else {
@@ -1040,6 +1034,21 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
(bios_size % 65536) != 0) {
goto bios_error;
}
+
+ return bios_size;
+
+ bios_error:
+ fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
+ exit(1);
+}
+
+static void load_bios_from_file(X86MachineState *x86ms, const char *bios_name,
+ char *filename, int bios_size,
+ bool isapc_ram_fw)
+{
+ ssize_t ret;
+
+ /* BIOS load */
if (machine_require_guest_memfd(MACHINE(x86ms))) {
memory_region_init_ram_guest_memfd(&x86ms->bios, NULL, "pc.bios",
bios_size, &error_fatal);
@@ -1068,7 +1077,47 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
goto bios_error;
}
}
- g_free(filename);
+
+ return;
+
+ bios_error:
+ fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
+ exit(1);
+}
+
+void x86_bios_rom_reload(X86MachineState *x86ms)
+{
+ int bios_size;
+ const char *bios_name;
+ char *filename;
+
+ if (memory_region_size(&x86ms->bios) == 0) {
+ /* if -bios is not used */
+ return;
+ }
+
+ bios_name = MACHINE(x86ms)->firmware ?: "bios.bin";
+ filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
+
+ bios_size = get_bios_size(x86ms, bios_name, filename);
+
+ void *ptr = memory_region_get_ram_ptr(&x86ms->bios);
+ load_image_size(filename, ptr, bios_size);
+ x86_firmware_configure(0x100000000ULL - bios_size, ptr, bios_size);
+}
+
+void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
+ MemoryRegion *rom_memory, bool isapc_ram_fw)
+{
+ int bios_size;
+ const char *bios_name;
+ g_autofree char *filename;
+
+ bios_name = MACHINE(x86ms)->firmware ?: default_firmware;
+ filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
+
+ bios_size = get_bios_size(x86ms, bios_name, filename);
+ load_bios_from_file(x86ms, bios_name, filename, bios_size, isapc_ram_fw);
if (!machine_require_guest_memfd(MACHINE(x86ms))) {
/* map the last 128KB of the BIOS in ISA space */
@@ -1081,8 +1130,4 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
(uint32_t)(-bios_size),
&x86ms->bios);
return;
-
-bios_error:
- fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
- exit(1);
}
diff --git a/hw/intc/openpic_kvm.c b/hw/intc/openpic_kvm.c
index fbf0bdb..b099da2 100644
--- a/hw/intc/openpic_kvm.c
+++ b/hw/intc/openpic_kvm.c
@@ -49,6 +49,7 @@ struct KVMOpenPICState {
uint32_t fd;
uint32_t model;
hwaddr mapped;
+ NotifierWithReturn vmfd_change_notifier;
};
static void kvm_openpic_set_irq(void *opaque, int n_IRQ, int level)
@@ -114,6 +115,88 @@ static const MemoryRegionOps kvm_openpic_mem_ops = {
},
};
+static int kvm_openpic_setup(KVMOpenPICState *opp, Error **errp)
+{
+ int kvm_openpic_model;
+ struct kvm_create_device cd = {0};
+ KVMState *s = kvm_state;
+ int ret;
+
+ switch (opp->model) {
+ case OPENPIC_MODEL_FSL_MPIC_20:
+ kvm_openpic_model = KVM_DEV_TYPE_FSL_MPIC_20;
+ break;
+
+ case OPENPIC_MODEL_FSL_MPIC_42:
+ kvm_openpic_model = KVM_DEV_TYPE_FSL_MPIC_42;
+ break;
+
+ default:
+ error_setg(errp, "Unsupported OpenPIC model %" PRIu32, opp->model);
+ return -1;
+ }
+
+ cd.type = kvm_openpic_model;
+ ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &cd);
+ if (ret < 0) {
+ error_setg(errp, "Can't create device %d: %s",
+ cd.type, strerror(errno));
+ return -1;
+ }
+ opp->fd = cd.fd;
+
+ return 0;
+}
+
+static int kvm_openpic_handle_vmfd_change(NotifierWithReturn *notifier,
+ void *data, Error **errp)
+{
+ KVMOpenPICState *opp = container_of(notifier, KVMOpenPICState,
+ vmfd_change_notifier);
+ uint64_t reg_base;
+ struct kvm_device_attr attr;
+ CPUState *cs;
+ int ret;
+
+ /* We are not interested in the pre-vmfd-change notification */
+ if (((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+
+ /* close the old descriptor */
+ close(opp->fd);
+
+ if (kvm_openpic_setup(opp, errp) < 0) {
+ return -1;
+ }
+
+ if (!opp->mapped) {
+ return 0;
+ }
+
+ reg_base = opp->mapped;
+ attr.group = KVM_DEV_MPIC_GRP_MISC;
+ attr.attr = KVM_DEV_MPIC_BASE_ADDR;
+ attr.addr = (uint64_t)(unsigned long)&reg_base;
+
+ ret = ioctl(opp->fd, KVM_SET_DEVICE_ATTR, &attr);
+ if (ret < 0) {
+ error_setg(errp, "%s: %s %" PRIx64, __func__,
+ strerror(errno), reg_base);
+ return -1;
+ }
+
+ CPU_FOREACH(cs) {
+ ret = kvm_vcpu_enable_cap(cs, KVM_CAP_IRQ_MPIC, 0, opp->fd,
+ kvm_arch_vcpu_id(cs));
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static void kvm_openpic_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
@@ -197,36 +280,14 @@ static void kvm_openpic_realize(DeviceState *dev, Error **errp)
SysBusDevice *d = SYS_BUS_DEVICE(dev);
KVMOpenPICState *opp = KVM_OPENPIC(dev);
KVMState *s = kvm_state;
- int kvm_openpic_model;
- struct kvm_create_device cd = {0};
- int ret, i;
+ int i;
if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
error_setg(errp, "Kernel is lacking Device Control API");
return;
}
- switch (opp->model) {
- case OPENPIC_MODEL_FSL_MPIC_20:
- kvm_openpic_model = KVM_DEV_TYPE_FSL_MPIC_20;
- break;
-
- case OPENPIC_MODEL_FSL_MPIC_42:
- kvm_openpic_model = KVM_DEV_TYPE_FSL_MPIC_42;
- break;
-
- default:
- error_setg(errp, "Unsupported OpenPIC model %" PRIu32, opp->model);
- return;
- }
-
- cd.type = kvm_openpic_model;
- ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &cd);
- if (ret < 0) {
- error_setg_errno(errp, errno, "Can't create device %d", cd.type);
- return;
- }
- opp->fd = cd.fd;
+ kvm_openpic_setup(opp, errp);
sysbus_init_mmio(d, &opp->mem);
qdev_init_gpio_in(dev, kvm_openpic_set_irq, OPENPIC_MAX_IRQ);
@@ -235,6 +296,9 @@ static void kvm_openpic_realize(DeviceState *dev, Error **errp)
opp->mem_listener.region_del = kvm_openpic_region_del;
opp->mem_listener.name = "openpic-kvm";
memory_listener_register(&opp->mem_listener, &address_space_memory);
+ opp->vmfd_change_notifier.notify =
+ kvm_openpic_handle_vmfd_change;
+ kvm_vmfd_add_change_notifier(&opp->vmfd_change_notifier);
/* indicate pic capabilities */
msi_nonbroken = true;
diff --git a/hw/meson.build b/hw/meson.build
index 66e46b8..36da532 100644
--- a/hw/meson.build
+++ b/hw/meson.build
@@ -44,6 +44,7 @@ subdir('isa')
subdir('mem')
subdir('misc')
subdir('net')
+subdir('nitro')
subdir('nubus')
subdir('nvme')
subdir('nvram')
diff --git a/hw/nitro/Kconfig b/hw/nitro/Kconfig
new file mode 100644
index 0000000..cfae859
--- /dev/null
+++ b/hw/nitro/Kconfig
@@ -0,0 +1,18 @@
+config NITRO_VSOCK_BUS
+ bool
+
+config NITRO_SERIAL_VSOCK
+ bool
+ depends on NITRO_VSOCK_BUS
+
+config NITRO_HEARTBEAT
+ bool
+ depends on NITRO_VSOCK_BUS
+
+config NITRO_MACHINE
+ bool
+ default y
+ depends on NITRO
+ select NITRO_VSOCK_BUS
+ select NITRO_HEARTBEAT
+ select NITRO_SERIAL_VSOCK
diff --git a/hw/nitro/heartbeat.c b/hw/nitro/heartbeat.c
new file mode 100644
index 0000000..dc41323
--- /dev/null
+++ b/hw/nitro/heartbeat.c
@@ -0,0 +1,115 @@
+/*
+ * Nitro Enclave Heartbeat device
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors:
+ * Alexander Graf <graf@amazon.com>
+ *
+ * The Nitro Enclave init process sends a heartbeat byte (0xB7) to
+ * CID 3 (parent) port 9000 on boot to signal it reached initramfs.
+ * The parent must accept the connection, read the byte, and echo it
+ * back. If the enclave init cannot reach the listener, it exits.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "chardev/char.h"
+#include "chardev/char-fe.h"
+#include "hw/nitro/heartbeat.h"
+#include "trace.h"
+
+#define HEARTBEAT_PORT 9000
+#define VMADDR_CID_ANY_STR "4294967295"
+
+static int nitro_heartbeat_can_read(void *opaque)
+{
+ NitroHeartbeatState *s = opaque;
+
+ /* One-shot protocol: stop reading after the first heartbeat */
+ return s->done ? 0 : 1;
+}
+
+static void nitro_heartbeat_read(void *opaque, const uint8_t *buf, int size)
+{
+ NitroHeartbeatState *s = opaque;
+
+ if (s->done || size < 1) {
+ return;
+ }
+
+ /* Echo the heartbeat byte back and disconnect */
+ qemu_chr_fe_write_all(&s->vsock, buf, 1);
+ s->done = true;
+ qemu_chr_fe_deinit(&s->vsock, true);
+
+ trace_nitro_heartbeat_done();
+}
+
+static void nitro_heartbeat_event(void *opaque, QEMUChrEvent event)
+{
+ trace_nitro_heartbeat_event(event);
+}
+
+static void nitro_heartbeat_realize(DeviceState *dev, Error **errp)
+{
+ NitroHeartbeatState *s = NITRO_HEARTBEAT(dev);
+ g_autofree char *chardev_id = NULL;
+ Chardev *chr;
+ ChardevBackend *backend;
+ ChardevSocket *sock;
+
+ chardev_id = g_strdup_printf("nitro-heartbeat");
+
+ backend = g_new0(ChardevBackend, 1);
+ backend->type = CHARDEV_BACKEND_KIND_SOCKET;
+ sock = backend->u.socket.data = g_new0(ChardevSocket, 1);
+ sock->addr = g_new0(SocketAddressLegacy, 1);
+ sock->addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ sock->addr->u.vsock.data = g_new0(VsockSocketAddress, 1);
+ sock->addr->u.vsock.data->cid = g_strdup(VMADDR_CID_ANY_STR);
+ sock->addr->u.vsock.data->port = g_strdup_printf("%u", HEARTBEAT_PORT);
+ sock->server = true;
+ sock->has_server = true;
+ sock->wait = false;
+ sock->has_wait = true;
+
+ chr = qemu_chardev_new(chardev_id, TYPE_CHARDEV_SOCKET,
+ backend, NULL, errp);
+ if (!chr) {
+ return;
+ }
+
+ if (!qemu_chr_fe_init(&s->vsock, chr, errp)) {
+ return;
+ }
+
+ qemu_chr_fe_set_handlers(&s->vsock,
+ nitro_heartbeat_can_read,
+ nitro_heartbeat_read,
+ nitro_heartbeat_event,
+ NULL, s, NULL, true);
+}
+
+static void nitro_heartbeat_class_init(ObjectClass *oc, const void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(oc);
+
+ dc->realize = nitro_heartbeat_realize;
+}
+
+static const TypeInfo nitro_heartbeat_info = {
+ .name = TYPE_NITRO_HEARTBEAT,
+ .parent = TYPE_NITRO_VSOCK_DEVICE,
+ .instance_size = sizeof(NitroHeartbeatState),
+ .class_init = nitro_heartbeat_class_init,
+};
+
+static void nitro_heartbeat_register(void)
+{
+ type_register_static(&nitro_heartbeat_info);
+}
+
+type_init(nitro_heartbeat_register);
diff --git a/hw/nitro/machine.c b/hw/nitro/machine.c
new file mode 100644
index 0000000..8849959
--- /dev/null
+++ b/hw/nitro/machine.c
@@ -0,0 +1,277 @@
+/*
+ * Nitro Enclaves (accel) machine
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors:
+ * Alexander Graf <graf@amazon.com>
+ *
+ * Nitro Enclaves machine model for -accel nitro. This machine behaves
+ * like the nitro-enclave machine, but uses the real Nitro Enclaves
+ * backend to launch the virtual machine. It requires use of the -accel
+ * nitro.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "chardev/char.h"
+#include "hw/core/boards.h"
+#include "hw/core/cpu.h"
+#include "hw/core/qdev-properties-system.h"
+#include "hw/nitro/heartbeat.h"
+#include "hw/nitro/machine.h"
+#include "hw/nitro/nitro-vsock-bus.h"
+#include "hw/nitro/serial-vsock.h"
+#include "system/address-spaces.h"
+#include "system/hostmem.h"
+#include "system/system.h"
+#include "system/nitro-accel.h"
+#include "qemu/accel.h"
+#include "hw/arm/machines-qom.h"
+#include "hw/core/eif.h"
+#include <zlib.h> /* for crc32 */
+
+#define EIF_LOAD_ADDR (8 * 1024 * 1024)
+
+static bool is_eif(char *eif, gsize len)
+{
+ const char eif_magic[] = EIF_MAGIC;
+
+ return len >= sizeof(eif_magic) &&
+ !memcmp(eif, eif_magic, sizeof(eif_magic));
+}
+
+static void build_eif_section(EifHeader *hdr, GByteArray *buf, uint16_t type,
+ const char *data, uint64_t size)
+{
+ uint16_t section = be16_to_cpu(hdr->section_cnt);
+ EifSectionHeader shdr = {
+ .section_type = cpu_to_be16(type),
+ .flags = 0,
+ .section_size = cpu_to_be64(size),
+ };
+
+ hdr->section_offsets[section] = cpu_to_be64(buf->len);
+ hdr->section_sizes[section] = cpu_to_be64(size);
+
+ g_byte_array_append(buf, (const uint8_t *)&shdr, sizeof(shdr));
+ if (size) {
+ g_byte_array_append(buf, (const uint8_t *)data, size);
+ }
+
+ hdr->section_cnt = cpu_to_be16(section + 1);
+}
+
+/*
+ * Nitro Enclaves only support loading EIF files. When the user provides
+ * a Linux kernel, initrd and cmdline, convert them into EIF format.
+ */
+static char *build_eif(const char *kernel_data, gsize kernel_size,
+ const char *initrd_path, const char *cmdline,
+ gsize *out_size, Error **errp)
+{
+ g_autofree char *initrd_data = NULL;
+ static const char metadata[] = "{}";
+ size_t metadata_len = sizeof(metadata) - 1;
+ gsize initrd_size = 0;
+ GByteArray *buf;
+ EifHeader hdr;
+ uint32_t crc = 0;
+ size_t cmdline_len;
+
+ if (initrd_path) {
+ if (!g_file_get_contents(initrd_path, &initrd_data,
+ &initrd_size, NULL)) {
+ error_setg(errp, "Failed to read initrd '%s'", initrd_path);
+ return NULL;
+ }
+ }
+
+ buf = g_byte_array_new();
+
+ cmdline_len = cmdline ? strlen(cmdline) : 0;
+
+ hdr = (EifHeader) {
+ .magic = EIF_MAGIC,
+ .version = cpu_to_be16(4),
+ .flags = cpu_to_be16(target_aarch64() ? EIF_HDR_ARCH_ARM64 : 0),
+ };
+
+ g_byte_array_append(buf, (const uint8_t *)&hdr, sizeof(hdr));
+
+ /* Kernel */
+ build_eif_section(&hdr, buf, EIF_SECTION_KERNEL, kernel_data, kernel_size);
+
+ /* Command line */
+ build_eif_section(&hdr, buf, EIF_SECTION_CMDLINE, cmdline, cmdline_len);
+
+ /* Initramfs */
+ build_eif_section(&hdr, buf, EIF_SECTION_RAMDISK, initrd_data, initrd_size);
+
+ /* Metadata */
+ build_eif_section(&hdr, buf, EIF_SECTION_METADATA, metadata, metadata_len);
+
+ /*
+ * Patch the header into the buffer first (with real section offsets
+ * and sizes), then compute CRC over everything except the CRC field.
+ */
+ memcpy(buf->data, &hdr, sizeof(hdr));
+ crc = crc32(crc, buf->data, offsetof(EifHeader, eif_crc32));
+ crc = crc32(crc, &buf->data[sizeof(hdr)], buf->len - sizeof(hdr));
+
+ /* Finally write the CRC into the in-buffer header */
+ ((EifHeader *)buf->data)->eif_crc32 = cpu_to_be32(crc);
+
+ *out_size = buf->len;
+ return (char *)g_byte_array_free(buf, false);
+}
+
+static void nitro_machine_init(MachineState *machine)
+{
+ const char *eif_path = machine->kernel_filename;
+ const char *cpu_type = machine->cpu_type;
+ g_autofree char *eif_data = NULL;
+ gsize eif_size;
+
+ if (!nitro_enabled()) {
+ error_report("The 'nitro' machine requires -accel nitro");
+ exit(1);
+ }
+
+ if (!cpu_type) {
+ ObjectClass *oc = cpu_class_by_name(target_cpu_type(), "host");
+
+ if (!oc) {
+ error_report("nitro: no 'host' CPU available");
+ exit(1);
+ }
+ cpu_type = object_class_get_name(oc);
+ }
+
+ if (!eif_path) {
+ error_report("nitro: -kernel <eif-file> is required");
+ exit(1);
+ }
+
+ /* Expose memory as normal QEMU RAM. Needs to be huge page backed. */
+ memory_region_add_subregion(get_system_memory(), 0, machine->ram);
+
+ /*
+ * Load EIF (-kernel) as raw blob at the EIF_LOAD_ADDR into guest RAM.
+ * The Nitro Hypervisor will extract its contents and bootstrap the
+ * Enclave from it.
+ */
+ if (!g_file_get_contents(eif_path, &eif_data, &eif_size, NULL)) {
+ error_report("nitro: failed to read EIF '%s'", eif_path);
+ exit(1);
+ }
+
+ if (!is_eif(eif_data, eif_size)) {
+ char *kernel_data = eif_data;
+ gsize kernel_size = eif_size;
+ Error *err = NULL;
+
+ /*
+ * The user gave us a non-EIF kernel, likely a Linux kernel image.
+ * Assemble an EIF file from it, the -initrd and the -append arguments,
+ * so that users can perform a natural direct kernel boot.
+ */
+ eif_data = build_eif(kernel_data, kernel_size, machine->initrd_filename,
+ machine->kernel_cmdline, &eif_size, &err);
+ if (!eif_data) {
+ error_report_err(err);
+ exit(1);
+ }
+
+ g_free(kernel_data);
+ }
+
+ address_space_write(&address_space_memory, EIF_LOAD_ADDR,
+ MEMTXATTRS_UNSPECIFIED, eif_data, eif_size);
+
+ if (defaults_enabled()) {
+ NitroVsockBridge *bridge = nitro_vsock_bridge_create();
+
+ /* Nitro Enclaves require a heartbeat device. Provide one. */
+ qdev_realize(qdev_new(TYPE_NITRO_HEARTBEAT),
+ BUS(&bridge->bus), &error_fatal);
+
+ /*
+ * In debug mode, Nitro Enclaves expose the guest's serial output via
+ * vsock. When the accel is in debug mode, wire the vsock serial to
+ * the machine's serial port so that -nographic automatically works
+ */
+ if (object_property_get_bool(OBJECT(current_accel()), "debug-mode", NULL)) {
+ Chardev *chr = serial_hd(0);
+
+ if (chr) {
+ DeviceState *dev = qdev_new(TYPE_NITRO_SERIAL_VSOCK);
+
+ qdev_prop_set_chr(dev, "chardev", chr);
+ qdev_realize(dev, BUS(&bridge->bus), &error_fatal);
+ }
+ }
+ }
+}
+
+static bool nitro_create_memfd_backend(MachineState *ms, const char *path,
+ Error **errp)
+{
+ MachineClass *mc = MACHINE_GET_CLASS(ms);
+ Object *root = object_get_objects_root();
+ Object *obj;
+ bool r = false;
+
+ obj = object_new(TYPE_MEMORY_BACKEND_MEMFD);
+
+ /* Nitro Enclaves require huge page backing */
+ if (!object_property_set_int(obj, "size", ms->ram_size, errp) ||
+ !object_property_set_bool(obj, "hugetlb", true, errp)) {
+ goto out;
+ }
+
+ object_property_add_child(root, mc->default_ram_id, obj);
+
+ if (!user_creatable_complete(USER_CREATABLE(obj), errp)) {
+ goto out;
+ }
+ r = object_property_set_link(OBJECT(ms), "memory-backend", obj, errp);
+
+out:
+ object_unref(obj);
+ return r;
+}
+
+static void nitro_machine_class_init(ObjectClass *oc, const void *data)
+{
+ MachineClass *mc = MACHINE_CLASS(oc);
+
+ mc->desc = "Nitro Enclave";
+ mc->init = nitro_machine_init;
+ mc->create_default_memdev = nitro_create_memfd_backend;
+ mc->default_ram_id = "ram";
+ mc->max_cpus = 4096;
+}
+
+static const TypeInfo nitro_machine_info = {
+ .name = TYPE_NITRO_MACHINE,
+ .parent = TYPE_MACHINE,
+ .instance_size = sizeof(NitroMachineState),
+ .class_init = nitro_machine_class_init,
+ .interfaces = (const InterfaceInfo[]) {
+ /* x86_64 and aarch64 only */
+ { TYPE_TARGET_AARCH64_MACHINE },
+ { }
+ },
+};
+
+static void nitro_machine_register(void)
+{
+ type_register_static(&nitro_machine_info);
+}
+
+type_init(nitro_machine_register);
diff --git a/hw/nitro/meson.build b/hw/nitro/meson.build
new file mode 100644
index 0000000..b9bd0d4
--- /dev/null
+++ b/hw/nitro/meson.build
@@ -0,0 +1,4 @@
+system_ss.add(when: 'CONFIG_NITRO_VSOCK_BUS', if_true: files('nitro-vsock-bus.c'))
+system_ss.add(when: 'CONFIG_NITRO_SERIAL_VSOCK', if_true: files('serial-vsock.c'))
+system_ss.add(when: 'CONFIG_NITRO_HEARTBEAT', if_true: files('heartbeat.c'))
+system_ss.add(when: 'CONFIG_NITRO_MACHINE', if_true: [files('machine.c'), zlib])
diff --git a/hw/nitro/nitro-vsock-bus.c b/hw/nitro/nitro-vsock-bus.c
new file mode 100644
index 0000000..eed29df
--- /dev/null
+++ b/hw/nitro/nitro-vsock-bus.c
@@ -0,0 +1,98 @@
+/*
+ * Nitro Enclave Vsock Bus
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors:
+ * Alexander Graf <graf@amazon.com>
+ *
+ * A bus for Nitro Enclave vsock devices. In Nitro Enclaves, communication
+ * between parent and enclave/hypervisor happens almost exclusively through
+ * vsock. The nitro-vsock-bus models this dependency in QEMU, which allows
+ * devices in this bus to implement individual services on top of vsock.
+ *
+ * The nitro accel advertises the Enclave's CID to the bus by calling
+ * nitro_vsock_bridge_start_enclave() on the bridge device as soon as it
+ * knows the CID.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "monitor/qdev.h"
+#include "hw/core/sysbus.h"
+#include "hw/nitro/nitro-vsock-bus.h"
+
+void nitro_vsock_bridge_start_enclave(NitroVsockBridge *bridge,
+ uint32_t enclave_cid, Error **errp)
+{
+ ERRP_GUARD();
+ BusState *qbus = BUS(&bridge->bus);
+ BusChild *kid;
+
+ bridge->enclave_cid = enclave_cid;
+
+ QTAILQ_FOREACH(kid, &qbus->children, sibling) {
+ NitroVsockDevice *ndev = NITRO_VSOCK_DEVICE(kid->child);
+ NitroVsockDeviceClass *ndc = NITRO_VSOCK_DEVICE_GET_CLASS(ndev);
+
+ if (ndc->enclave_started) {
+ ndc->enclave_started(ndev, enclave_cid, errp);
+ if (*errp) {
+ return;
+ }
+ }
+ }
+}
+
+NitroVsockBridge *nitro_vsock_bridge_create(void)
+{
+ DeviceState *dev = qdev_new(TYPE_NITRO_VSOCK_BRIDGE);
+
+ qdev_set_id(dev, g_strdup("nitro-vsock"), &error_fatal);
+ sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
+
+ return NITRO_VSOCK_BRIDGE(dev);
+}
+
+static void nitro_vsock_bridge_init(Object *obj)
+{
+ NitroVsockBridge *s = NITRO_VSOCK_BRIDGE(obj);
+
+ qbus_init(&s->bus, sizeof(s->bus), TYPE_NITRO_VSOCK_BUS,
+ DEVICE(s), "nitro-vsock");
+ object_property_add_uint32_ptr(obj, "enclave-cid",
+ &s->enclave_cid, OBJ_PROP_FLAG_READ);
+}
+
+static void nitro_vsock_device_class_init(ObjectClass *oc, const void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(oc);
+
+ dc->bus_type = TYPE_NITRO_VSOCK_BUS;
+}
+
+static const TypeInfo nitro_vsock_bus_types[] = {
+ {
+ .name = TYPE_NITRO_VSOCK_BUS,
+ .parent = TYPE_BUS,
+ .instance_size = sizeof(NitroVsockBus),
+ },
+ {
+ .name = TYPE_NITRO_VSOCK_BRIDGE,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(NitroVsockBridge),
+ .instance_init = nitro_vsock_bridge_init,
+ },
+ {
+ .name = TYPE_NITRO_VSOCK_DEVICE,
+ .parent = TYPE_DEVICE,
+ .instance_size = sizeof(NitroVsockDevice),
+ .class_size = sizeof(NitroVsockDeviceClass),
+ .class_init = nitro_vsock_device_class_init,
+ .abstract = true,
+ },
+};
+
+DEFINE_TYPES(nitro_vsock_bus_types);
diff --git a/hw/nitro/serial-vsock.c b/hw/nitro/serial-vsock.c
new file mode 100644
index 0000000..1d56c33
--- /dev/null
+++ b/hw/nitro/serial-vsock.c
@@ -0,0 +1,123 @@
+/*
+ * Nitro Enclave Vsock Serial
+ *
+ * Copyright © 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors:
+ * Alexander Graf <graf@amazon.com>
+ *
+ * With Nitro Enclaves in debug mode, the Nitro Hypervisor provides a vsock
+ * port that the parent can connect to in order to receive serial console
+ * the Enclave. This driver implements short-circuit logic to establish the
+ * vsock connection to that port and feed its data into a chardev, so that
+ * a machine model can use it as serial device.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "chardev/char.h"
+#include "chardev/char-fe.h"
+#include "hw/core/qdev-properties.h"
+#include "hw/core/qdev-properties-system.h"
+#include "hw/nitro/serial-vsock.h"
+#include "trace.h"
+
+#define CONSOLE_PORT_START 10000
+#define VMADDR_CID_HYPERVISOR_STR "0"
+
+static int nitro_serial_vsock_can_read(void *opaque)
+{
+ NitroSerialVsockState *s = opaque;
+
+ /* Refuse vsock input until the output backend is ready */
+ return qemu_chr_fe_backend_open(&s->output) ? 4096 : 0;
+}
+
+static void nitro_serial_vsock_read(void *opaque, const uint8_t *buf, int size)
+{
+ NitroSerialVsockState *s = opaque;
+
+ /* Forward all vsock data to the output chardev */
+ qemu_chr_fe_write_all(&s->output, buf, size);
+}
+
+static void nitro_serial_vsock_event(void *opaque, QEMUChrEvent event)
+{
+ /* No need to act on connect/disconnect events, but trace them for debugging */
+ trace_nitro_serial_vsock_event(event);
+}
+
+static void nitro_serial_vsock_enclave_started(NitroVsockDevice *dev,
+ uint32_t enclave_cid,
+ Error **errp)
+{
+ NitroSerialVsockState *s = NITRO_SERIAL_VSOCK(dev);
+ uint32_t port = enclave_cid + CONSOLE_PORT_START;
+ g_autofree char *chardev_id = NULL;
+ Chardev *chr;
+ ChardevBackend *backend;
+ ChardevSocket *sock;
+
+ /*
+ * We know the Enclave CID to connect to now. Create a vsock
+ * client chardev that connects to the Enclave's console.
+ */
+ chardev_id = g_strdup_printf("nitro-console-%u", enclave_cid);
+
+ backend = g_new0(ChardevBackend, 1);
+ backend->type = CHARDEV_BACKEND_KIND_SOCKET;
+ sock = backend->u.socket.data = g_new0(ChardevSocket, 1);
+ sock->addr = g_new0(SocketAddressLegacy, 1);
+ sock->addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ sock->addr->u.vsock.data = g_new0(VsockSocketAddress, 1);
+ sock->addr->u.vsock.data->cid = g_strdup(VMADDR_CID_HYPERVISOR_STR);
+ sock->addr->u.vsock.data->port = g_strdup_printf("%u", port);
+ sock->server = false;
+ sock->has_server = true;
+
+ chr = qemu_chardev_new(chardev_id, TYPE_CHARDEV_SOCKET,
+ backend, NULL, errp);
+ if (!chr) {
+ return;
+ }
+
+ if (!qemu_chr_fe_init(&s->vsock, chr, errp)) {
+ return;
+ }
+
+ qemu_chr_fe_set_handlers(&s->vsock,
+ nitro_serial_vsock_can_read,
+ nitro_serial_vsock_read,
+ nitro_serial_vsock_event,
+ NULL, s, NULL, true);
+}
+
+static const Property nitro_serial_vsock_props[] = {
+ DEFINE_PROP_CHR("chardev", NitroSerialVsockState, output),
+};
+
+static void nitro_serial_vsock_class_init(ObjectClass *oc, const void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(oc);
+ NitroVsockDeviceClass *ndc = NITRO_VSOCK_DEVICE_CLASS(oc);
+
+ device_class_set_props(dc, nitro_serial_vsock_props);
+ ndc->enclave_started = nitro_serial_vsock_enclave_started;
+}
+
+static const TypeInfo nitro_serial_vsock_info = {
+ .name = TYPE_NITRO_SERIAL_VSOCK,
+ .parent = TYPE_NITRO_VSOCK_DEVICE,
+ .instance_size = sizeof(NitroSerialVsockState),
+ .class_init = nitro_serial_vsock_class_init,
+};
+
+static void nitro_serial_vsock_register(void)
+{
+ type_register_static(&nitro_serial_vsock_info);
+}
+
+type_init(nitro_serial_vsock_register);
diff --git a/hw/nitro/trace-events b/hw/nitro/trace-events
new file mode 100644
index 0000000..311ab78
--- /dev/null
+++ b/hw/nitro/trace-events
@@ -0,0 +1,8 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# serial-vsock.c
+nitro_serial_vsock_event(int event) "event %d"
+
+# heartbeat.c
+nitro_heartbeat_event(int event) "event %d"
+nitro_heartbeat_done(void) "enclave heartbeat received"
diff --git a/hw/nitro/trace.h b/hw/nitro/trace.h
new file mode 100644
index 0000000..b455d6c
--- /dev/null
+++ b/hw/nitro/trace.h
@@ -0,0 +1,4 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include "trace/trace-hw_nitro.h"
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index f68f816..00d42d3 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -116,6 +116,88 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
* we'll re-use it should another vfio device be attached before then.
*/
int vfio_kvm_device_fd = -1;
+
+/*
+ * Confidential virtual machines:
+ * During reset of a confidential VM, the KVM VM file descriptor changes.
+ * In that case, the old VFIO-KVM device file descriptor is
+ * closed and a new one is created against the new KVM VM file
+ * descriptor.
+ */
+
+typedef struct VFIODeviceFd {
+ int fd;
+ QLIST_ENTRY(VFIODeviceFd) node;
+} VFIODeviceFd;
+
+static QLIST_HEAD(, VFIODeviceFd) vfio_device_fds =
+ QLIST_HEAD_INITIALIZER(vfio_device_fds);
+
+static void vfio_device_fd_list_add(int fd)
+{
+ VFIODeviceFd *file_fd;
+ file_fd = g_malloc0(sizeof(*file_fd));
+ file_fd->fd = fd;
+ QLIST_INSERT_HEAD(&vfio_device_fds, file_fd, node);
+}
+
+static void vfio_device_fd_list_remove(int fd)
+{
+ VFIODeviceFd *file_fd, *next;
+
+ QLIST_FOREACH_SAFE(file_fd, &vfio_device_fds, node, next) {
+ if (file_fd->fd == fd) {
+ QLIST_REMOVE(file_fd, node);
+ g_free(file_fd);
+ break;
+ }
+ }
+}
+
+static int vfio_device_fd_rebind(NotifierWithReturn *notifier, void *data,
+ Error **errp)
+{
+ VFIODeviceFd *file_fd;
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_FILE,
+ .attr = KVM_DEV_VFIO_FILE_ADD,
+ };
+ struct kvm_create_device cd = {
+ .type = KVM_DEV_TYPE_VFIO,
+ };
+
+ /* We are not interested in the pre-vmfd-change notification */
+ if (((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+
+ if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
+ error_setg_errno(errp, errno, "Failed to create KVM VFIO device");
+ return -errno;
+ }
+
+ if (vfio_kvm_device_fd != -1) {
+ close(vfio_kvm_device_fd);
+ }
+
+ vfio_kvm_device_fd = cd.fd;
+
+ QLIST_FOREACH(file_fd, &vfio_device_fds, node) {
+ attr.addr = (uint64_t)(unsigned long)&file_fd->fd;
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_setg_errno(errp, errno,
+ "Failed to add fd %d to KVM VFIO device",
+ file_fd->fd);
+ return -errno;
+ }
+ }
+ return 0;
+}
+
+static struct NotifierWithReturn vfio_vmfd_change_notifier = {
+ .notify = vfio_device_fd_rebind,
+};
+
#endif
void vfio_kvm_device_close(void)
@@ -153,6 +235,11 @@ int vfio_kvm_device_add_fd(int fd, Error **errp)
}
vfio_kvm_device_fd = cd.fd;
+ /*
+ * If the vm file descriptor changes, add a notifier so that we can
+ * re-create the vfio_kvm_device_fd.
+ */
+ kvm_vmfd_add_change_notifier(&vfio_vmfd_change_notifier);
}
if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
@@ -160,6 +247,8 @@ int vfio_kvm_device_add_fd(int fd, Error **errp)
fd);
return -errno;
}
+
+ vfio_device_fd_list_add(fd);
#endif
return 0;
}
@@ -183,6 +272,8 @@ int vfio_kvm_device_del_fd(int fd, Error **errp)
"Failed to remove fd %d from KVM VFIO device", fd);
return -errno;
}
+
+ vfio_device_fd_list_remove(fd);
#endif
return 0;
}
diff --git a/include/accel/accel-ops.h b/include/accel/accel-ops.h
index 23a8c24..f46492e 100644
--- a/include/accel/accel-ops.h
+++ b/include/accel/accel-ops.h
@@ -23,6 +23,8 @@ struct AccelClass {
AccelOpsClass *ops;
int (*init_machine)(AccelState *as, MachineState *ms);
+ /* used mainly by confidential guests to rebuild guest state upon reset */
+ int (*rebuild_guest)(MachineState *ms);
bool (*cpu_common_realize)(CPUState *cpu, Error **errp);
void (*cpu_common_unrealize)(CPUState *cpu);
/* get_stats: Append statistics to @buf */
diff --git a/include/hw/core/boards.h b/include/hw/core/boards.h
index edbe8d0..12b2149 100644
--- a/include/hw/core/boards.h
+++ b/include/hw/core/boards.h
@@ -448,6 +448,12 @@ struct MachineState {
struct NVDIMMState *nvdimms_state;
struct NumaState *numa_state;
bool acpi_spcr_enabled;
+ /*
+ * Whether to change the virtual machine accelerator handle upon
+ * reset or not. Used only for debugging and testing purposes.
+ * Set to false by default for all regular use.
+ */
+ bool new_accel_vmfd_on_reset;
};
/*
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 23be627..a85a560 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -125,6 +125,7 @@ void x86_isa_bios_init(MemoryRegion *isa_bios, MemoryRegion *isa_memory,
MemoryRegion *bios, bool read_only);
void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
MemoryRegion *rom_memory, bool isapc_ram_fw);
+void x86_bios_rom_reload(X86MachineState *x86ms);
void x86_load_linux(X86MachineState *x86ms,
FWCfgState *fw_cfg,
diff --git a/include/hw/nitro/heartbeat.h b/include/hw/nitro/heartbeat.h
new file mode 100644
index 0000000..6b9271a
--- /dev/null
+++ b/include/hw/nitro/heartbeat.h
@@ -0,0 +1,24 @@
+/*
+ * Nitro Heartbeat device
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_MISC_NITRO_HEARTBEAT_H
+#define HW_MISC_NITRO_HEARTBEAT_H
+
+#include "hw/nitro/nitro-vsock-bus.h"
+#include "chardev/char-fe.h"
+#include "qom/object.h"
+
+#define TYPE_NITRO_HEARTBEAT "nitro-heartbeat"
+OBJECT_DECLARE_SIMPLE_TYPE(NitroHeartbeatState, NITRO_HEARTBEAT)
+
+struct NitroHeartbeatState {
+ NitroVsockDevice parent_obj;
+
+ CharFrontend vsock; /* vsock server chardev for heartbeat */
+ bool done;
+};
+
+#endif /* HW_MISC_NITRO_HEARTBEAT_H */
diff --git a/include/hw/nitro/machine.h b/include/hw/nitro/machine.h
new file mode 100644
index 0000000..d78ba7d
--- /dev/null
+++ b/include/hw/nitro/machine.h
@@ -0,0 +1,20 @@
+/*
+ * Nitro Enclaves (accel) machine
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_NITRO_MACHINE_H
+#define HW_NITRO_MACHINE_H
+
+#include "hw/core/boards.h"
+#include "qom/object.h"
+
+#define TYPE_NITRO_MACHINE MACHINE_TYPE_NAME("nitro")
+OBJECT_DECLARE_SIMPLE_TYPE(NitroMachineState, NITRO_MACHINE)
+
+struct NitroMachineState {
+ MachineState parent;
+};
+
+#endif /* HW_NITRO_MACHINE_H */
diff --git a/include/hw/nitro/nitro-vsock-bus.h b/include/hw/nitro/nitro-vsock-bus.h
new file mode 100644
index 0000000..064260a
--- /dev/null
+++ b/include/hw/nitro/nitro-vsock-bus.h
@@ -0,0 +1,71 @@
+/*
+ * Nitro Enclave Vsock Bus
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_NITRO_VSOCK_BUS_H
+#define HW_NITRO_VSOCK_BUS_H
+
+#include "hw/core/qdev.h"
+#include "hw/core/sysbus.h"
+#include "qom/object.h"
+
+#define TYPE_NITRO_VSOCK_BUS "nitro-vsock-bus"
+OBJECT_DECLARE_SIMPLE_TYPE(NitroVsockBus, NITRO_VSOCK_BUS)
+
+#define TYPE_NITRO_VSOCK_BRIDGE "nitro-vsock-bridge"
+OBJECT_DECLARE_SIMPLE_TYPE(NitroVsockBridge, NITRO_VSOCK_BRIDGE)
+
+#define TYPE_NITRO_VSOCK_DEVICE "nitro-vsock-device"
+OBJECT_DECLARE_TYPE(NitroVsockDevice, NitroVsockDeviceClass,
+ NITRO_VSOCK_DEVICE)
+
+struct NitroVsockBus {
+ BusState parent_obj;
+};
+
+struct NitroVsockBridge {
+ SysBusDevice parent_obj;
+
+ NitroVsockBus bus;
+ uint32_t enclave_cid;
+};
+
+struct NitroVsockDevice {
+ DeviceState parent_obj;
+};
+
+struct NitroVsockDeviceClass {
+ DeviceClass parent_class;
+
+ /*
+ * Called after the enclave has been started and the CID is known.
+ * Devices use this to establish vsock connections to the enclave.
+ */
+ void (*enclave_started)(NitroVsockDevice *dev, uint32_t enclave_cid,
+ Error **errp);
+};
+
+/*
+ * Machine helper to create the Nitro vsock bridge sysbus device.
+ */
+NitroVsockBridge *nitro_vsock_bridge_create(void);
+
+/*
+ * Find the Nitro vsock bridge on the sysbus.
+ */
+static inline NitroVsockBridge *nitro_vsock_bridge_find(void)
+{
+ return NITRO_VSOCK_BRIDGE(
+ object_resolve_path_type("", TYPE_NITRO_VSOCK_BRIDGE, NULL));
+}
+
+/*
+ * Notify the bridge that the enclave has started. Dispatches
+ * enclave_started() to all devices on the bus.
+ */
+void nitro_vsock_bridge_start_enclave(NitroVsockBridge *bridge,
+ uint32_t enclave_cid, Error **errp);
+
+#endif /* HW_NITRO_VSOCK_BUS_H */
diff --git a/include/hw/nitro/serial-vsock.h b/include/hw/nitro/serial-vsock.h
new file mode 100644
index 0000000..c365880
--- /dev/null
+++ b/include/hw/nitro/serial-vsock.h
@@ -0,0 +1,24 @@
+/*
+ * Nitro Enclave Serial (vsock)
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_CHAR_NITRO_SERIAL_VSOCK_H
+#define HW_CHAR_NITRO_SERIAL_VSOCK_H
+
+#include "hw/nitro/nitro-vsock-bus.h"
+#include "chardev/char-fe.h"
+#include "qom/object.h"
+
+#define TYPE_NITRO_SERIAL_VSOCK "nitro-serial-vsock"
+OBJECT_DECLARE_SIMPLE_TYPE(NitroSerialVsockState, NITRO_SERIAL_VSOCK)
+
+struct NitroSerialVsockState {
+ NitroVsockDevice parent_obj;
+
+ CharFrontend output; /* chardev to write console output to */
+ CharFrontend vsock; /* vsock chardev to enclave console */
+};
+
+#endif /* HW_CHAR_NITRO_SERIAL_VSOCK_H */
diff --git a/include/standard-headers/linux/nitro_enclaves.h b/include/standard-headers/linux/nitro_enclaves.h
new file mode 100644
index 0000000..5545267
--- /dev/null
+++ b/include/standard-headers/linux/nitro_enclaves.h
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ */
+
+#ifndef _LINUX_NITRO_ENCLAVES_H_
+#define _LINUX_NITRO_ENCLAVES_H_
+
+#include "standard-headers/linux/types.h"
+
+/**
+ * DOC: Nitro Enclaves (NE) Kernel Driver Interface
+ */
+
+/**
+ * NE_CREATE_VM - The command is used to create a slot that is associated with
+ * an enclave VM.
+ * The generated unique slot id is an output parameter.
+ * The ioctl can be invoked on the /dev/nitro_enclaves fd, before
+ * setting any resources, such as memory and vCPUs, for an
+ * enclave. Memory and vCPUs are set for the slot mapped to an enclave.
+ * A NE CPU pool has to be set before calling this function. The
+ * pool can be set after the NE driver load, using
+ * /sys/module/nitro_enclaves/parameters/ne_cpus.
+ * Its format is the detailed in the cpu-lists section:
+ * https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html
+ * CPU 0 and its siblings have to remain available for the
+ * primary / parent VM, so they cannot be set for enclaves. Full
+ * CPU core(s), from the same NUMA node, need(s) to be included
+ * in the CPU pool.
+ *
+ * Context: Process context.
+ * Return:
+ * * Enclave file descriptor - Enclave file descriptor used with
+ * ioctl calls to set vCPUs and memory
+ * regions, then start the enclave.
+ * * -1 - There was a failure in the ioctl logic.
+ * On failure, errno is set to:
+ * * EFAULT - copy_to_user() failure.
+ * * ENOMEM - Memory allocation failure for internal
+ * bookkeeping variables.
+ * * NE_ERR_NO_CPUS_AVAIL_IN_POOL - No NE CPU pool set / no CPUs available
+ * in the pool.
+ * * Error codes from get_unused_fd_flags() and anon_inode_getfile().
+ * * Error codes from the NE PCI device request.
+ */
+#define NE_CREATE_VM _IOR(0xAE, 0x20, uint64_t)
+
+/**
+ * NE_ADD_VCPU - The command is used to set a vCPU for an enclave. The vCPU can
+ * be auto-chosen from the NE CPU pool or it can be set by the
+ * caller, with the note that it needs to be available in the NE
+ * CPU pool. Full CPU core(s), from the same NUMA node, need(s) to
+ * be associated with an enclave.
+ * The vCPU id is an input / output parameter. If its value is 0,
+ * then a CPU is chosen from the enclave CPU pool and returned via
+ * this parameter.
+ * The ioctl can be invoked on the enclave fd, before an enclave
+ * is started.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 - Logic successfully completed.
+ * * -1 - There was a failure in the ioctl logic.
+ * On failure, errno is set to:
+ * * EFAULT - copy_from_user() / copy_to_user() failure.
+ * * ENOMEM - Memory allocation failure for internal
+ * bookkeeping variables.
+ * * EIO - Current task mm is not the same as the one
+ * that created the enclave.
+ * * NE_ERR_NO_CPUS_AVAIL_IN_POOL - No CPUs available in the NE CPU pool.
+ * * NE_ERR_VCPU_ALREADY_USED - The provided vCPU is already used.
+ * * NE_ERR_VCPU_NOT_IN_CPU_POOL - The provided vCPU is not available in the
+ * NE CPU pool.
+ * * NE_ERR_VCPU_INVALID_CPU_CORE - The core id of the provided vCPU is invalid
+ * or out of range.
+ * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state
+ * (init = before being started).
+ * * NE_ERR_INVALID_VCPU - The provided vCPU is not in the available
+ * CPUs range.
+ * * Error codes from the NE PCI device request.
+ */
+#define NE_ADD_VCPU _IOWR(0xAE, 0x21, uint32_t)
+
+/**
+ * NE_GET_IMAGE_LOAD_INFO - The command is used to get information needed for
+ * in-memory enclave image loading e.g. offset in
+ * enclave memory to start placing the enclave image.
+ * The image load info is an input / output parameter.
+ * It includes info provided by the caller - flags -
+ * and returns the offset in enclave memory where to
+ * start placing the enclave image.
+ * The ioctl can be invoked on the enclave fd, before
+ * an enclave is started.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 - Logic successfully completed.
+ * * -1 - There was a failure in the ioctl logic.
+ * On failure, errno is set to:
+ * * EFAULT - copy_from_user() / copy_to_user() failure.
+ * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state (init =
+ * before being started).
+ * * NE_ERR_INVALID_FLAG_VALUE - The value of the provided flag is invalid.
+ */
+#define NE_GET_IMAGE_LOAD_INFO _IOWR(0xAE, 0x22, struct ne_image_load_info)
+
+/**
+ * NE_SET_USER_MEMORY_REGION - The command is used to set a memory region for an
+ * enclave, given the allocated memory from the
+ * userspace. Enclave memory needs to be from the
+ * same NUMA node as the enclave CPUs.
+ * The user memory region is an input parameter. It
+ * includes info provided by the caller - flags,
+ * memory size and userspace address.
+ * The ioctl can be invoked on the enclave fd,
+ * before an enclave is started.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 - Logic successfully completed.
+ * * -1 - There was a failure in the ioctl logic.
+ * On failure, errno is set to:
+ * * EFAULT - copy_from_user() failure.
+ * * EINVAL - Invalid physical memory region(s) e.g.
+ * unaligned address.
+ * * EIO - Current task mm is not the same as
+ * the one that created the enclave.
+ * * ENOMEM - Memory allocation failure for internal
+ * bookkeeping variables.
+ * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state
+ * (init = before being started).
+ * * NE_ERR_INVALID_MEM_REGION_SIZE - The memory size of the region is not
+ * multiple of 2 MiB.
+ * * NE_ERR_INVALID_MEM_REGION_ADDR - Invalid user space address given.
+ * * NE_ERR_UNALIGNED_MEM_REGION_ADDR - Unaligned user space address given.
+ * * NE_ERR_MEM_REGION_ALREADY_USED - The memory region is already used.
+ * * NE_ERR_MEM_NOT_HUGE_PAGE - The memory region is not backed by
+ * huge pages.
+ * * NE_ERR_MEM_DIFFERENT_NUMA_NODE - The memory region is not from the same
+ * NUMA node as the CPUs.
+ * * NE_ERR_MEM_MAX_REGIONS - The number of memory regions set for
+ * the enclave reached maximum.
+ * * NE_ERR_INVALID_PAGE_SIZE - The memory region is not backed by
+ * pages multiple of 2 MiB.
+ * * NE_ERR_INVALID_FLAG_VALUE - The value of the provided flag is invalid.
+ * * Error codes from get_user_pages().
+ * * Error codes from the NE PCI device request.
+ */
+#define NE_SET_USER_MEMORY_REGION _IOW(0xAE, 0x23, struct ne_user_memory_region)
+
+/**
+ * NE_START_ENCLAVE - The command is used to trigger enclave start after the
+ * enclave resources, such as memory and CPU, have been set.
+ * The enclave start info is an input / output parameter. It
+ * includes info provided by the caller - enclave cid and
+ * flags - and returns the cid (if input cid is 0).
+ * The ioctl can be invoked on the enclave fd, after an
+ * enclave slot is created and resources, such as memory and
+ * vCPUs are set for an enclave.
+ *
+ * Context: Process context.
+ * Return:
+ * * 0 - Logic successfully completed.
+ * * -1 - There was a failure in the ioctl logic.
+ * On failure, errno is set to:
+ * * EFAULT - copy_from_user() / copy_to_user() failure.
+ * * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state
+ * (init = before being started).
+ * * NE_ERR_NO_MEM_REGIONS_ADDED - No memory regions are set.
+ * * NE_ERR_NO_VCPUS_ADDED - No vCPUs are set.
+ * * NE_ERR_FULL_CORES_NOT_USED - Full core(s) not set for the enclave.
+ * * NE_ERR_ENCLAVE_MEM_MIN_SIZE - Enclave memory is less than minimum
+ * memory size (64 MiB).
+ * * NE_ERR_INVALID_FLAG_VALUE - The value of the provided flag is invalid.
+ * * NE_ERR_INVALID_ENCLAVE_CID - The provided enclave CID is invalid.
+ * * Error codes from the NE PCI device request.
+ */
+#define NE_START_ENCLAVE _IOWR(0xAE, 0x24, struct ne_enclave_start_info)
+
+/**
+ * DOC: NE specific error codes
+ */
+
+/**
+ * NE_ERR_VCPU_ALREADY_USED - The provided vCPU is already used.
+ */
+#define NE_ERR_VCPU_ALREADY_USED (256)
+/**
+ * NE_ERR_VCPU_NOT_IN_CPU_POOL - The provided vCPU is not available in the
+ * NE CPU pool.
+ */
+#define NE_ERR_VCPU_NOT_IN_CPU_POOL (257)
+/**
+ * NE_ERR_VCPU_INVALID_CPU_CORE - The core id of the provided vCPU is invalid
+ * or out of range of the NE CPU pool.
+ */
+#define NE_ERR_VCPU_INVALID_CPU_CORE (258)
+/**
+ * NE_ERR_INVALID_MEM_REGION_SIZE - The user space memory region size is not
+ * multiple of 2 MiB.
+ */
+#define NE_ERR_INVALID_MEM_REGION_SIZE (259)
+/**
+ * NE_ERR_INVALID_MEM_REGION_ADDR - The user space memory region address range
+ * is invalid.
+ */
+#define NE_ERR_INVALID_MEM_REGION_ADDR (260)
+/**
+ * NE_ERR_UNALIGNED_MEM_REGION_ADDR - The user space memory region address is
+ * not aligned.
+ */
+#define NE_ERR_UNALIGNED_MEM_REGION_ADDR (261)
+/**
+ * NE_ERR_MEM_REGION_ALREADY_USED - The user space memory region is already used.
+ */
+#define NE_ERR_MEM_REGION_ALREADY_USED (262)
+/**
+ * NE_ERR_MEM_NOT_HUGE_PAGE - The user space memory region is not backed by
+ * contiguous physical huge page(s).
+ */
+#define NE_ERR_MEM_NOT_HUGE_PAGE (263)
+/**
+ * NE_ERR_MEM_DIFFERENT_NUMA_NODE - The user space memory region is backed by
+ * pages from different NUMA nodes than the CPUs.
+ */
+#define NE_ERR_MEM_DIFFERENT_NUMA_NODE (264)
+/**
+ * NE_ERR_MEM_MAX_REGIONS - The supported max memory regions per enclaves has
+ * been reached.
+ */
+#define NE_ERR_MEM_MAX_REGIONS (265)
+/**
+ * NE_ERR_NO_MEM_REGIONS_ADDED - The command to start an enclave is triggered
+ * and no memory regions are added.
+ */
+#define NE_ERR_NO_MEM_REGIONS_ADDED (266)
+/**
+ * NE_ERR_NO_VCPUS_ADDED - The command to start an enclave is triggered and no
+ * vCPUs are added.
+ */
+#define NE_ERR_NO_VCPUS_ADDED (267)
+/**
+ * NE_ERR_ENCLAVE_MEM_MIN_SIZE - The enclave memory size is lower than the
+ * minimum supported.
+ */
+#define NE_ERR_ENCLAVE_MEM_MIN_SIZE (268)
+/**
+ * NE_ERR_FULL_CORES_NOT_USED - The command to start an enclave is triggered and
+ * full CPU cores are not set.
+ */
+#define NE_ERR_FULL_CORES_NOT_USED (269)
+/**
+ * NE_ERR_NOT_IN_INIT_STATE - The enclave is not in init state when setting
+ * resources or triggering start.
+ */
+#define NE_ERR_NOT_IN_INIT_STATE (270)
+/**
+ * NE_ERR_INVALID_VCPU - The provided vCPU is out of range of the available CPUs.
+ */
+#define NE_ERR_INVALID_VCPU (271)
+/**
+ * NE_ERR_NO_CPUS_AVAIL_IN_POOL - The command to create an enclave is triggered
+ * and no CPUs are available in the pool.
+ */
+#define NE_ERR_NO_CPUS_AVAIL_IN_POOL (272)
+/**
+ * NE_ERR_INVALID_PAGE_SIZE - The user space memory region is not backed by pages
+ * multiple of 2 MiB.
+ */
+#define NE_ERR_INVALID_PAGE_SIZE (273)
+/**
+ * NE_ERR_INVALID_FLAG_VALUE - The provided flag value is invalid.
+ */
+#define NE_ERR_INVALID_FLAG_VALUE (274)
+/**
+ * NE_ERR_INVALID_ENCLAVE_CID - The provided enclave CID is invalid, either
+ * being a well-known value or the CID of the
+ * parent / primary VM.
+ */
+#define NE_ERR_INVALID_ENCLAVE_CID (275)
+
+/**
+ * DOC: Image load info flags
+ */
+
+/**
+ * NE_EIF_IMAGE - Enclave Image Format (EIF)
+ */
+#define NE_EIF_IMAGE (0x01)
+
+#define NE_IMAGE_LOAD_MAX_FLAG_VAL (0x02)
+
+/**
+ * struct ne_image_load_info - Info necessary for in-memory enclave image
+ * loading (in / out).
+ * @flags: Flags to determine the enclave image type
+ * (e.g. Enclave Image Format - EIF) (in).
+ * @memory_offset: Offset in enclave memory where to start placing the
+ * enclave image (out).
+ */
+struct ne_image_load_info {
+ uint64_t flags;
+ uint64_t memory_offset;
+};
+
+/**
+ * DOC: User memory region flags
+ */
+
+/**
+ * NE_DEFAULT_MEMORY_REGION - Memory region for enclave general usage.
+ */
+#define NE_DEFAULT_MEMORY_REGION (0x00)
+
+#define NE_MEMORY_REGION_MAX_FLAG_VAL (0x01)
+
+/**
+ * struct ne_user_memory_region - Memory region to be set for an enclave (in).
+ * @flags: Flags to determine the usage for the memory region (in).
+ * @memory_size: The size, in bytes, of the memory region to be set for
+ * an enclave (in).
+ * @userspace_addr: The start address of the userspace allocated memory of
+ * the memory region to set for an enclave (in).
+ */
+struct ne_user_memory_region {
+ uint64_t flags;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+};
+
+/**
+ * DOC: Enclave start info flags
+ */
+
+/**
+ * NE_ENCLAVE_PRODUCTION_MODE - Start enclave in production mode.
+ */
+#define NE_ENCLAVE_PRODUCTION_MODE (0x00)
+/**
+ * NE_ENCLAVE_DEBUG_MODE - Start enclave in debug mode.
+ */
+#define NE_ENCLAVE_DEBUG_MODE (0x01)
+
+#define NE_ENCLAVE_START_MAX_FLAG_VAL (0x02)
+
+/**
+ * struct ne_enclave_start_info - Setup info necessary for enclave start (in / out).
+ * @flags: Flags for the enclave to start with (e.g. debug mode) (in).
+ * @enclave_cid: Context ID (CID) for the enclave vsock device. If 0 as
+ * input, the CID is autogenerated by the hypervisor and
+ * returned back as output by the driver (in / out).
+ */
+struct ne_enclave_start_info {
+ uint64_t flags;
+ uint64_t enclave_cid;
+};
+
+#endif /* _LINUX_NITRO_ENCLAVES_H_ */
diff --git a/include/system/confidential-guest-support.h b/include/system/confidential-guest-support.h
index 0cc8b26..5dca717 100644
--- a/include/system/confidential-guest-support.h
+++ b/include/system/confidential-guest-support.h
@@ -152,6 +152,11 @@ typedef struct ConfidentialGuestSupportClass {
*/
int (*get_mem_map_entry)(int index, ConfidentialGuestMemoryMapEntry *entry,
Error **errp);
+
+ /*
+ * Is it possible to rebuild the guest state after a reset?
+ */
+ bool can_rebuild_guest_state;
} ConfidentialGuestSupportClass;
static inline int confidential_guest_kvm_init(ConfidentialGuestSupport *cgs,
@@ -167,6 +172,21 @@ static inline int confidential_guest_kvm_init(ConfidentialGuestSupport *cgs,
return 0;
}
+static inline bool
+confidential_guest_can_rebuild_state(ConfidentialGuestSupport *cgs)
+{
+ ConfidentialGuestSupportClass *klass;
+
+ if (!cgs) {
+ /* non-confidential guests */
+ return true;
+ }
+
+ klass = CONFIDENTIAL_GUEST_SUPPORT_GET_CLASS(cgs);
+ return klass->can_rebuild_guest_state;
+
+}
+
static inline int confidential_guest_kvm_reset(ConfidentialGuestSupport *cgs,
Error **errp)
{
diff --git a/include/system/hw_accel.h b/include/system/hw_accel.h
index 628a50e..f0c10b6 100644
--- a/include/system/hw_accel.h
+++ b/include/system/hw_accel.h
@@ -17,6 +17,7 @@
#include "system/mshv.h"
#include "system/whpx.h"
#include "system/nvmm.h"
+#include "system/nitro-accel.h"
/**
* cpu_synchronize_state:
diff --git a/include/system/kvm.h b/include/system/kvm.h
index 8f9eecf..4b0e1b4 100644
--- a/include/system/kvm.h
+++ b/include/system/kvm.h
@@ -181,6 +181,7 @@ DECLARE_INSTANCE_CHECKER(KVMState, KVM_STATE,
extern KVMState *kvm_state;
typedef struct Notifier Notifier;
+typedef struct NotifierWithReturn NotifierWithReturn;
typedef struct KVMRouteChange {
KVMState *s;
@@ -456,6 +457,9 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram_addr,
#endif /* COMPILING_PER_TARGET */
+bool kvm_arch_supports_vmfd_change(void);
+int kvm_arch_on_vmfd_change(MachineState *ms, KVMState *s);
+
void kvm_cpu_synchronize_state(CPUState *cpu);
void kvm_init_cpu_signals(CPUState *cpu);
@@ -564,4 +568,43 @@ int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size);
int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private);
+/* argument to vmfd change notifier */
+typedef struct VmfdChangeNotifier {
+ int vmfd;
+ bool pre;
+} VmfdChangeNotifier;
+
+/**
+ * kvm_vmfd_add_change_notifier - register a notifier to get notified when
+ * a KVM vm file descriptor changes or is about to be changed as part of the
+ * confidential guest "reset" process.
+ * Various subsystems should use this mechanism to take actions such
+ * as creating new fds against this new vm file descriptor.
+ * @n: notifier with return value.
+ */
+void kvm_vmfd_add_change_notifier(NotifierWithReturn *n);
+/**
+ * kvm_vmfd_remove_change_notifier - de-register a notifier previously
+ * registered with kvm_vmfd_add_change_notifier call.
+ * @n: notifier that was previously registered.
+ */
+void kvm_vmfd_remove_change_notifier(NotifierWithReturn *n);
+
+/**
+ * kvm_vcpufd_add_change_notifier - register a notifier to get notified when
+ * a KVM vcpu file descriptor changes as part of the confidential guest
+ * "reset" process. Various subsystems should use this mechanism to take
+ * actions such as re-issuing vcpu ioctls as a part of setting up vcpu
+ * features.
+ * @n: notifier with return value.
+ */
+void kvm_vcpufd_add_change_notifier(NotifierWithReturn *n);
+
+/**
+ * kvm_vcpufd_remove_change_notifier - de-register a notifier previously
+ * registered with kvm_vcpufd_add_change_notifier call.
+ * @n: notifier that was previously registered.
+ */
+void kvm_vcpufd_remove_change_notifier(NotifierWithReturn *n);
+
#endif
diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h
index baeb166..0876aac 100644
--- a/include/system/kvm_int.h
+++ b/include/system/kvm_int.h
@@ -167,6 +167,7 @@ struct KVMState
uint16_t xen_gnttab_max_frames;
uint16_t xen_evtchn_max_pirq;
char *device;
+ OnOffAuto honor_guest_pat;
};
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
diff --git a/include/system/nitro-accel.h b/include/system/nitro-accel.h
new file mode 100644
index 0000000..a93aa6f
--- /dev/null
+++ b/include/system/nitro-accel.h
@@ -0,0 +1,25 @@
+/*
+ * Nitro Enclaves accelerator - public interface
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef SYSTEM_NITRO_ACCEL_H
+#define SYSTEM_NITRO_ACCEL_H
+
+#include "qemu/accel.h"
+
+extern bool nitro_allowed;
+
+static inline bool nitro_enabled(void)
+{
+ return nitro_allowed;
+}
+
+#define TYPE_NITRO_ACCEL ACCEL_CLASS_NAME("nitro")
+
+typedef struct NitroAccelState NitroAccelState;
+DECLARE_INSTANCE_CHECKER(NitroAccelState, NITRO_ACCEL,
+ TYPE_NITRO_ACCEL)
+
+#endif /* SYSTEM_NITRO_ACCEL_H */
diff --git a/include/system/physmem.h b/include/system/physmem.h
index 7bb7d3e..da91b77 100644
--- a/include/system/physmem.h
+++ b/include/system/physmem.h
@@ -51,5 +51,6 @@ physical_memory_snapshot_and_clear_dirty(MemoryRegion *mr, hwaddr offset,
bool physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
ram_addr_t start,
ram_addr_t length);
+int ram_block_rebind(Error **errp);
#endif
diff --git a/include/system/whpx-accel-ops.h b/include/system/whpx-accel-ops.h
index ed9d4c4..4b2a732 100644
--- a/include/system/whpx-accel-ops.h
+++ b/include/system/whpx-accel-ops.h
@@ -22,11 +22,15 @@ void whpx_cpu_synchronize_post_reset(CPUState *cpu);
void whpx_cpu_synchronize_post_init(CPUState *cpu);
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu);
-/* state subset only touched by the VCPU itself during runtime */
-#define WHPX_SET_RUNTIME_STATE 1
-/* state subset modified during VCPU reset */
-#define WHPX_SET_RESET_STATE 2
-/* full state set, modified during initialization or on vmload */
-#define WHPX_SET_FULL_STATE 3
+typedef enum WHPXStateLevel {
+ /* subset of runtime state for faster returns from vmexit */
+ WHPX_LEVEL_FAST_RUNTIME_STATE,
+ /* state subset only touched by the VCPU itself during runtime */
+ WHPX_LEVEL_RUNTIME_STATE,
+ /* state subset modified during VCPU reset */
+ WHPX_LEVEL_RESET_STATE,
+ /* full state set, modified during initialization or on vmload */
+ WHPX_LEVEL_FULL_STATE
+} WHPXStateLevel;
#endif /* TARGET_I386_WHPX_ACCEL_OPS_H */
diff --git a/include/system/whpx-all.h b/include/system/whpx-all.h
index f13cdf7..2cbea71 100644
--- a/include/system/whpx-all.h
+++ b/include/system/whpx-all.h
@@ -2,10 +2,12 @@
#ifndef SYSTEM_WHPX_ALL_H
#define SYSTEM_WHPX_ALL_H
+#include "system/whpx-accel-ops.h"
+
/* Called by whpx-common */
int whpx_vcpu_run(CPUState *cpu);
-void whpx_get_registers(CPUState *cpu);
-void whpx_set_registers(CPUState *cpu, int level);
+void whpx_get_registers(CPUState *cpu, WHPXStateLevel level);
+void whpx_set_registers(CPUState *cpu, WHPXStateLevel level);
int whpx_accel_init(AccelState *as, MachineState *ms);
void whpx_cpu_instance_init(CPUState *cs);
HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions);
@@ -17,4 +19,9 @@ void whpx_translate_cpu_breakpoints(
struct whpx_breakpoints *breakpoints,
CPUState *cpu,
int cpu_breakpoint_count);
+void whpx_arch_destroy_vcpu(CPUState *cpu);
+
+/* called by whpx-accel-ops */
+bool whpx_arch_supports_guest_debug(void);
+
#endif
diff --git a/include/system/whpx-common.h b/include/system/whpx-common.h
index b86fe9d..04289af 100644
--- a/include/system/whpx-common.h
+++ b/include/system/whpx-common.h
@@ -3,9 +3,6 @@
#define SYSTEM_WHPX_COMMON_H
struct AccelCPUState {
-#ifdef HOST_X86_64
- WHV_EMULATOR_HANDLE emulator;
-#endif
bool window_registered;
bool interruptable;
bool ready_for_pic_interrupt;
@@ -20,6 +17,9 @@ int whpx_first_vcpu_starting(CPUState *cpu);
int whpx_last_vcpu_stopping(CPUState *cpu);
void whpx_memory_init(void);
struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address);
+void whpx_flush_cpu_state(CPUState *cpu);
+void whpx_get_reg(CPUState *cpu, WHV_REGISTER_NAME reg, WHV_REGISTER_VALUE* val);
+void whpx_set_reg(CPUState *cpu, WHV_REGISTER_NAME reg, WHV_REGISTER_VALUE val);
/* On x64: same as WHvX64ExceptionTypeDebugTrapOrFault */
#define WHPX_INTERCEPT_DEBUG_TRAPS 1
diff --git a/include/system/whpx-internal.h b/include/system/whpx-internal.h
index ad6ade2..7a1c987 100644
--- a/include/system/whpx-internal.h
+++ b/include/system/whpx-internal.h
@@ -4,9 +4,6 @@
#include <windows.h>
#include <winhvplatform.h>
-#ifdef HOST_X86_64
-#include <winhvemulation.h>
-#endif
#include "hw/i386/apic.h"
#include "exec/vaddr.h"
@@ -89,12 +86,6 @@ void whpx_apic_get(APICCommonState *s);
X(HRESULT, WHvResetPartition, \
(WHV_PARTITION_HANDLE Partition)) \
-#define LIST_WINHVEMULATION_FUNCTIONS(X) \
- X(HRESULT, WHvEmulatorCreateEmulator, (const WHV_EMULATOR_CALLBACKS* Callbacks, WHV_EMULATOR_HANDLE* Emulator)) \
- X(HRESULT, WHvEmulatorDestroyEmulator, (WHV_EMULATOR_HANDLE Emulator)) \
- X(HRESULT, WHvEmulatorTryIoEmulation, (WHV_EMULATOR_HANDLE Emulator, VOID* Context, const WHV_VP_EXIT_CONTEXT* VpContext, const WHV_X64_IO_PORT_ACCESS_CONTEXT* IoInstructionContext, WHV_EMULATOR_STATUS* EmulatorReturnStatus)) \
- X(HRESULT, WHvEmulatorTryMmioEmulation, (WHV_EMULATOR_HANDLE Emulator, VOID* Context, const WHV_VP_EXIT_CONTEXT* VpContext, const WHV_MEMORY_ACCESS_CONTEXT* MmioInstructionContext, WHV_EMULATOR_STATUS* EmulatorReturnStatus)) \
-
#define WHP_DEFINE_TYPE(return_type, function_name, signature) \
typedef return_type (WINAPI *function_name ## _t) signature;
@@ -103,16 +94,10 @@ void whpx_apic_get(APICCommonState *s);
/* Define function typedef */
LIST_WINHVPLATFORM_FUNCTIONS(WHP_DEFINE_TYPE)
-#ifdef HOST_X86_64
-LIST_WINHVEMULATION_FUNCTIONS(WHP_DEFINE_TYPE)
-#endif
LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_DEFINE_TYPE)
struct WHPDispatch {
LIST_WINHVPLATFORM_FUNCTIONS(WHP_DECLARE_MEMBER)
-#ifdef HOST_X86_64
- LIST_WINHVEMULATION_FUNCTIONS(WHP_DECLARE_MEMBER)
-#endif
LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_DECLARE_MEMBER)
};
@@ -122,7 +107,6 @@ bool init_whp_dispatch(void);
typedef enum WHPFunctionList {
WINHV_PLATFORM_FNS_DEFAULT,
- WINHV_EMULATION_FNS_DEFAULT,
WINHV_PLATFORM_FNS_SUPPLEMENTAL
} WHPFunctionList;
diff --git a/meson.build b/meson.build
index 3cd1d8d..f67f866 100644
--- a/meson.build
+++ b/meson.build
@@ -302,11 +302,13 @@ accelerator_targets += { 'CONFIG_XEN': xen_targets }
if cpu == 'aarch64'
accelerator_targets += {
'CONFIG_HVF': ['aarch64-softmmu'],
+ 'CONFIG_NITRO': ['aarch64-softmmu'],
'CONFIG_WHPX': ['aarch64-softmmu']
}
elif cpu == 'x86_64'
accelerator_targets += {
'CONFIG_HVF': ['x86_64-softmmu'],
+ 'CONFIG_NITRO': ['x86_64-softmmu'],
'CONFIG_NVMM': ['i386-softmmu', 'x86_64-softmmu'],
'CONFIG_WHPX': ['i386-softmmu', 'x86_64-softmmu'],
'CONFIG_MSHV': ['x86_64-softmmu'],
@@ -865,8 +867,7 @@ if get_option('whpx').allowed() and host_os == 'windows'
endif
# Leave CONFIG_WHPX disabled
else
- if cc.has_header('winhvplatform.h', required: get_option('whpx')) and \
- cc.has_header('winhvemulation.h', required: get_option('whpx'))
+ if cc.has_header('winhvplatform.h', required: get_option('whpx'))
accelerators += 'CONFIG_WHPX'
endif
endif
@@ -881,6 +882,11 @@ if get_option('hvf').allowed()
endif
endif
+nitro = not_found
+if get_option('nitro').allowed() and host_os == 'linux'
+ accelerators += 'CONFIG_NITRO'
+endif
+
nvmm = not_found
if host_os == 'netbsd'
nvmm = cc.find_library('nvmm', required: get_option('nvmm'))
@@ -922,6 +928,9 @@ endif
if 'CONFIG_HVF' not in accelerators and get_option('hvf').enabled()
error('HVF not available on this platform')
endif
+if 'CONFIG_NITRO' not in accelerators and get_option('nitro').enabled()
+ error('NITRO not available on this platform')
+endif
if 'CONFIG_NVMM' not in accelerators and get_option('nvmm').enabled()
error('NVMM not available on this platform')
endif
@@ -1289,7 +1298,7 @@ endif
pulse = not_found
if not get_option('pa').auto() or (host_os == 'linux' and have_system)
- pulse = dependency('libpulse', required: get_option('pa'),
+ pulse = dependency('libpulse', version: '>=0.9.13', required: get_option('pa'),
method: 'pkg-config')
endif
alsa = not_found
@@ -1316,7 +1325,7 @@ endif
spice_protocol = not_found
if not get_option('spice_protocol').auto() or have_system
- spice_protocol = dependency('spice-protocol', version: '>=0.14.0',
+ spice_protocol = dependency('spice-protocol', version: '>=0.14.3',
required: get_option('spice_protocol'),
method: 'pkg-config')
endif
@@ -3591,6 +3600,7 @@ if have_system
'accel/hvf',
'accel/kvm',
'accel/mshv',
+ 'accel/nitro',
'audio',
'backends',
'backends/tpm',
@@ -3621,6 +3631,7 @@ if have_system
'hw/misc/macio',
'hw/net',
'hw/net/can',
+ 'hw/nitro',
'hw/nubus',
'hw/nvme',
'hw/nvram',
@@ -4789,6 +4800,7 @@ endif
summary_info = {}
if have_system
summary_info += {'KVM support': config_all_accel.has_key('CONFIG_KVM')}
+ summary_info += {'Nitro support': config_all_accel.has_key('CONFIG_NITRO')}
summary_info += {'HVF support': config_all_accel.has_key('CONFIG_HVF')}
summary_info += {'WHPX support': config_all_accel.has_key('CONFIG_WHPX')}
summary_info += {'NVMM support': config_all_accel.has_key('CONFIG_NVMM')}
diff --git a/meson_options.txt b/meson_options.txt
index 2836156..31d5916 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -79,6 +79,8 @@ option('whpx', type: 'feature', value: 'auto',
description: 'WHPX acceleration support')
option('hvf', type: 'feature', value: 'auto',
description: 'HVF acceleration support')
+option('nitro', type: 'feature', value: 'auto',
+ description: 'Nitro acceleration support')
option('nvmm', type: 'feature', value: 'auto',
description: 'NVMM acceleration support')
option('xen', type: 'feature', value: 'auto',
diff --git a/python/scripts/vendor.py b/python/scripts/vendor.py
index 46ce298..7805818 100755
--- a/python/scripts/vendor.py
+++ b/python/scripts/vendor.py
@@ -45,6 +45,8 @@ def main() -> int:
"4b27aafce281e652dcb437b28007457411245d975c48b5db3a797d3e93ae1585",
"qemu.qmp==0.0.5":
"e05782d6df5844b34e0d2f7c68693525da074deef7b641c1401dda6e4e3d6303",
+ "pycotap==1.3.1":
+ "1c3a25b3ff89e48f4e00f1f71dbbc1642b4f65c65d416524d07e73492fff25ea",
}
vendor_dir = Path(__file__, "..", "..", "wheels").resolve()
diff --git a/python/wheels/meson-1.9.0-py3-none-any.whl b/python/wheels/meson-1.9.0-py3-none-any.whl
deleted file mode 100644
index 57cc75c..0000000
--- a/python/wheels/meson-1.9.0-py3-none-any.whl
+++ /dev/null
Binary files differ
diff --git a/qapi/qom.json b/qapi/qom.json
index 6f5c9de..c653248 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -1009,13 +1009,19 @@
# designated guest firmware page for measured boot with -kernel
# (default: false) (since 6.2)
#
+# Features:
+#
+# @confidential-guest-reset: If present, the hypervisor supports
+# confidential guest resets (since 11.0).
+#
# Since: 9.1
##
{ 'struct': 'SevCommonProperties',
'data': { '*sev-device': 'str',
'*cbitpos': 'uint32',
'reduced-phys-bits': 'uint32',
- '*kernel-hashes': 'bool' } }
+ '*kernel-hashes': 'bool' },
+ 'features': ['confidential-guest-reset']}
##
# @SevGuestProperties:
@@ -1136,6 +1142,11 @@
# it, the guest will not be able to get a TD quote for
# attestation.
#
+# Features:
+#
+# @confidential-guest-reset: If present, the hypervisor supports
+# confidential guest resets (since 11.0).
+#
# Since: 10.1
##
{ 'struct': 'TdxGuestProperties',
@@ -1144,7 +1155,8 @@
'*mrconfigid': 'str',
'*mrowner': 'str',
'*mrownerconfig': 'str',
- '*quote-generation-socket': 'SocketAddress' } }
+ '*quote-generation-socket': 'SocketAddress' },
+ 'features': ['confidential-guest-reset']}
##
# @ThreadContextProperties:
diff --git a/qemu-options.hx b/qemu-options.hx
index 4043e8c..0da2b4d 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -28,7 +28,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \
"-machine [type=]name[,prop[=value][,...]]\n"
" selects emulated machine ('-machine help' for list)\n"
" property accel=accel1[:accel2[:...]] selects accelerator\n"
- " supported accelerators are kvm, xen, hvf, nvmm, whpx, mshv or tcg (default: tcg)\n"
+ " supported accelerators are kvm, xen, hvf, nitro, nvmm, whpx, mshv or tcg (default: tcg)\n"
" vmport=on|off|auto controls emulation of vmport (default: auto)\n"
" dump-guest-core=on|off include guest memory in a core dump (default=on)\n"
" mem-merge=on|off controls memory merge support (default: on)\n"
@@ -67,7 +67,7 @@ SRST
``accel=accels1[:accels2[:...]]``
This is used to enable an accelerator. Depending on the target
- architecture, kvm, xen, hvf, nvmm, whpx, mshv or tcg can be
+ architecture, kvm, xen, hvf, nitro, nvmm, whpx, mshv or tcg can be
available. By default, tcg is used. If there is more than one
accelerator specified, the next one is used if the previous one
fails to initialize.
@@ -228,7 +228,7 @@ ERST
DEF("accel", HAS_ARG, QEMU_OPTION_accel,
"-accel [accel=]accelerator[,prop[=value][,...]]\n"
- " select accelerator (kvm, xen, hvf, nvmm, whpx, mshv or tcg; use 'help' for a list)\n"
+ " select accelerator (kvm, xen, hvf, nitro, nvmm, whpx, mshv or tcg; use 'help' for a list)\n"
" igd-passthru=on|off (enable Xen integrated Intel graphics passthrough, default=off)\n"
" kernel-irqchip=on|off|split controls accelerated irqchip support (default=on)\n"
" kvm-shadow-mem=size of KVM shadow MMU in bytes\n"
@@ -243,7 +243,7 @@ DEF("accel", HAS_ARG, QEMU_OPTION_accel,
SRST
``-accel name[,prop=value[,...]]``
This is used to enable an accelerator. Depending on the target
- architecture, kvm, xen, hvf, nvmm, whpx, mshv or tcg can be available.
+ architecture, kvm, xen, hvf, nitro, nvmm, whpx, mshv or tcg can be available.
By default, tcg is used. If there is more than one accelerator
specified, the next one is used if the previous one fails to
initialize.
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index ace0baf..0d24eb8 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -46,6 +46,7 @@ redundant_explicit_links = "deny"
[workspace.lints.clippy]
# default-warn lints
result_unit_err = "allow"
+manual_checked_ops = "deny"
should_implement_trait = "deny"
# can be for a reason, e.g. in callbacks
unused_self = "allow"
diff --git a/rust/hw/core/src/qdev.rs b/rust/hw/core/src/qdev.rs
index 145e20a..b2e5441 100644
--- a/rust/hw/core/src/qdev.rs
+++ b/rust/hw/core/src/qdev.rs
@@ -425,18 +425,16 @@ impl Clock {
}
pub const fn period_from_hz(hz: u64) -> u64 {
- if hz == 0 {
- 0
- } else {
- Self::PERIOD_1SEC / hz
+ match Self::PERIOD_1SEC.checked_div(hz) {
+ Some(value) => value,
+ None => 0,
}
}
pub const fn period_to_hz(period: u64) -> u64 {
- if period == 0 {
- 0
- } else {
- Self::PERIOD_1SEC / period
+ match Self::PERIOD_1SEC.checked_div(period) {
+ Some(value) => value,
+ None => 0,
}
}
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index e8edc52..ca5b113 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -158,6 +158,7 @@ meson_options_help() {
printf "%s\n" ' multiprocess Out of process device emulation support'
printf "%s\n" ' netmap netmap network backend support'
printf "%s\n" ' nettle nettle cryptography support'
+ printf "%s\n" ' nitro Nitro acceleration support'
printf "%s\n" ' numa libnuma support'
printf "%s\n" ' nvmm NVMM acceleration support'
printf "%s\n" ' opengl OpenGL support'
@@ -418,6 +419,8 @@ _meson_option_parse() {
--disable-netmap) printf "%s" -Dnetmap=disabled ;;
--enable-nettle) printf "%s" -Dnettle=enabled ;;
--disable-nettle) printf "%s" -Dnettle=disabled ;;
+ --enable-nitro) printf "%s" -Dnitro=enabled ;;
+ --disable-nitro) printf "%s" -Dnitro=disabled ;;
--enable-numa) printf "%s" -Dnuma=enabled ;;
--disable-numa) printf "%s" -Dnuma=disabled ;;
--enable-nvmm) printf "%s" -Dnvmm=enabled ;;
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index d09d8cf..386d7a3 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -254,6 +254,7 @@ for i in "$hdrdir"/include/linux/*virtio*.h \
"$hdrdir/include/linux/kvm_para.h" \
"$hdrdir/include/linux/vhost_types.h" \
"$hdrdir/include/linux/vmclock-abi.h" \
+ "$hdrdir/include/linux/nitro_enclaves.h" \
"$hdrdir/include/linux/sysinfo.h"; do
cp_portable "$i" "$output/include/standard-headers/linux"
done
diff --git a/stubs/kvm.c b/stubs/kvm.c
new file mode 100644
index 0000000..2db61d8
--- /dev/null
+++ b/stubs/kvm.c
@@ -0,0 +1,22 @@
+/*
+ * kvm target arch specific stubs
+ *
+ * Copyright (c) 2026 Red Hat, Inc.
+ *
+ * Author:
+ * Ani Sinha <anisinha@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include "qemu/osdep.h"
+#include "system/kvm.h"
+
+int kvm_arch_on_vmfd_change(MachineState *ms, KVMState *s)
+{
+ abort();
+}
+
+bool kvm_arch_supports_vmfd_change(void)
+{
+ return false;
+}
diff --git a/stubs/meson.build b/stubs/meson.build
index 8a07059..6ae478b 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -74,6 +74,7 @@ if have_system
if igvm.found()
stub_ss.add(files('igvm.c'))
endif
+ stub_ss.add(files('kvm.c'))
stub_ss.add(files('target-get-monitor-def.c'))
stub_ss.add(files('target-monitor-defs.c'))
stub_ss.add(files('win32-kbd-hook.c'))
diff --git a/system/physmem.c b/system/physmem.c
index 2fb0c25..e5ff26a 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2827,6 +2827,34 @@ found:
}
/*
+ * Creates new guest memfd for the ramblocks and closes the
+ * existing memfd.
+ */
+int ram_block_rebind(Error **errp)
+{
+ RAMBlock *block;
+
+ qemu_mutex_lock_ramlist();
+
+ RAMBLOCK_FOREACH(block) {
+ if (block->flags & RAM_GUEST_MEMFD) {
+ if (block->guest_memfd >= 0) {
+ close(block->guest_memfd);
+ }
+ block->guest_memfd = kvm_create_guest_memfd(block->max_length,
+ 0, errp);
+ if (block->guest_memfd < 0) {
+ qemu_mutex_unlock_ramlist();
+ return -1;
+ }
+
+ }
+ }
+ qemu_mutex_unlock_ramlist();
+ return 0;
+}
+
+/*
* Finds the named RAMBlock
*
* name: The name of RAMBlock to find
diff --git a/system/runstate.c b/system/runstate.c
index d091a2b..eca722b 100644
--- a/system/runstate.c
+++ b/system/runstate.c
@@ -42,6 +42,7 @@
#include "qapi/qapi-commands-run-state.h"
#include "qapi/qapi-events-run-state.h"
#include "qemu/accel.h"
+#include "accel/accel-ops.h"
#include "qemu/error-report.h"
#include "qemu/job.h"
#include "qemu/log.h"
@@ -57,6 +58,7 @@
#include "system/reset.h"
#include "system/runstate.h"
#include "system/runstate-action.h"
+#include "system/confidential-guest-support.h"
#include "system/system.h"
#include "system/tpm.h"
#include "trace.h"
@@ -508,6 +510,9 @@ void qemu_system_reset(ShutdownCause reason)
{
MachineClass *mc;
ResetType type;
+ AccelClass *ac = ACCEL_GET_CLASS(current_accel());
+ bool guest_state_rebuilt = false;
+ int ret;
mc = current_machine ? MACHINE_GET_CLASS(current_machine) : NULL;
@@ -520,6 +525,29 @@ void qemu_system_reset(ShutdownCause reason)
default:
type = RESET_TYPE_COLD;
}
+
+ if ((reason == SHUTDOWN_CAUSE_GUEST_RESET ||
+ reason == SHUTDOWN_CAUSE_HOST_QMP_SYSTEM_RESET) &&
+ (current_machine->new_accel_vmfd_on_reset || !cpus_are_resettable())) {
+ if (ac->rebuild_guest) {
+ ret = ac->rebuild_guest(current_machine);
+ if (ret < 0) {
+ error_report("unable to rebuild guest: %s(%d)",
+ strerror(-ret), ret);
+ vm_stop(RUN_STATE_INTERNAL_ERROR);
+ } else {
+ info_report("virtual machine state has been rebuilt with new "
+ "guest file handle.");
+ guest_state_rebuilt = true;
+ }
+ } else if (!cpus_are_resettable()) {
+ error_report("accelerator does not support reset!");
+ } else {
+ error_report("accelerator does not support rebuilding guest state,"
+ " proceeding with normal reset!");
+ }
+ }
+
if (mc && mc->reset) {
mc->reset(current_machine, type);
} else {
@@ -542,9 +570,16 @@ void qemu_system_reset(ShutdownCause reason)
* it does _more_ than cpu_synchronize_all_post_reset().
*/
if (cpus_are_resettable()) {
- cpu_synchronize_all_post_reset();
- } else {
- assert(runstate_check(RUN_STATE_PRELAUNCH));
+ if (guest_state_rebuilt) {
+ /*
+ * If guest state has been rebuilt, then we
+ * need to sync full cpu state for non confidential guests post
+ * reset.
+ */
+ cpu_synchronize_all_post_init();
+ } else {
+ cpu_synchronize_all_post_reset();
+ }
}
vm_set_suspended(false);
@@ -697,7 +732,8 @@ void qemu_system_reset_request(ShutdownCause reason)
if (reboot_action == REBOOT_ACTION_SHUTDOWN &&
reason != SHUTDOWN_CAUSE_SUBSYSTEM_RESET) {
shutdown_requested = reason;
- } else if (!cpus_are_resettable()) {
+ } else if (!cpus_are_resettable() &&
+ !confidential_guest_can_rebuild_state(current_machine->cgs)) {
error_report("cpus are not resettable, terminating");
shutdown_requested = reason;
} else {
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index e0e13d3..ff05304 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -124,6 +124,7 @@ static void alpha_cpu_realizefn(DeviceState *dev, Error **errp)
}
qemu_init_vcpu(cs);
+ cpu_reset(cs);
acc->parent_realize(dev, errp);
}
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 5821521..d6feba2 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -813,6 +813,14 @@ static void aarch64_a53_initfn(Object *obj)
static void aarch64_host_initfn(Object *obj)
{
ARMCPU *cpu = ARM_CPU(obj);
+
+#if defined(CONFIG_NITRO)
+ if (nitro_enabled()) {
+ /* The nitro accel uses -cpu host, but does not actually consume it */
+ return;
+ }
+#endif
+
#if defined(CONFIG_KVM)
kvm_arm_set_cpu_features_from_host(cpu);
aarch64_add_sve_properties(obj);
diff --git a/target/arm/whpx/whpx-all.c b/target/arm/whpx/whpx-all.c
index 40ada2d..bb94eac 100644
--- a/target/arm/whpx/whpx-all.c
+++ b/target/arm/whpx/whpx-all.c
@@ -273,14 +273,6 @@ static struct whpx_sreg_match whpx_sreg_match[] = {
{ WHvArm64RegisterSpEl1, ENCODE_AA64_CP_REG(4, 1, 3, 4, 0) },
};
-static void flush_cpu_state(CPUState *cpu)
-{
- if (cpu->vcpu_dirty) {
- whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
- cpu->vcpu_dirty = false;
- }
-}
-
HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
if (exceptions != 0) {
@@ -303,31 +295,14 @@ void whpx_translate_cpu_breakpoints(
/* Breakpoints aren’t supported on this platform */
}
-static void whpx_get_reg(CPUState *cpu, WHV_REGISTER_NAME reg, WHV_REGISTER_VALUE* val)
+bool whpx_arch_supports_guest_debug(void)
{
- struct whpx_state *whpx = &whpx_global;
- HRESULT hr;
-
- flush_cpu_state(cpu);
-
- hr = whp_dispatch.WHvGetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
- &reg, 1, val);
-
- if (FAILED(hr)) {
- error_report("WHPX: Failed to get register %08x, hr=%08lx", reg, hr);
- }
+ return false;
}
-static void whpx_set_reg(CPUState *cpu, WHV_REGISTER_NAME reg, WHV_REGISTER_VALUE val)
+void whpx_arch_destroy_vcpu(CPUState *cpu)
{
- struct whpx_state *whpx = &whpx_global;
- HRESULT hr;
- hr = whp_dispatch.WHvSetVirtualProcessorRegisters(whpx->partition, cpu->cpu_index,
- &reg, 1, &val);
-
- if (FAILED(hr)) {
- error_report("WHPX: Failed to set register %08x, hr=%08lx", reg, hr);
- }
+ /* currently empty on Arm */
}
static void whpx_get_global_reg(WHV_REGISTER_NAME reg, WHV_REGISTER_VALUE *val)
@@ -442,7 +417,7 @@ int whpx_vcpu_run(CPUState *cpu)
do {
bool advance_pc = false;
if (cpu->vcpu_dirty) {
- whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
+ whpx_set_registers(cpu, WHPX_LEVEL_RUNTIME_STATE);
cpu->vcpu_dirty = false;
}
@@ -507,7 +482,7 @@ int whpx_vcpu_run(CPUState *cpu)
default:
error_report("WHPX: Unexpected VP exit code 0x%08x",
vcpu->exit_ctx.ExitReason);
- whpx_get_registers(cpu);
+ whpx_get_registers(cpu, WHPX_LEVEL_FULL_STATE);
bql_lock();
qemu_system_guest_panicked(cpu_get_crash_info(cpu));
bql_unlock();
@@ -516,7 +491,7 @@ int whpx_vcpu_run(CPUState *cpu)
if (advance_pc) {
WHV_REGISTER_VALUE pc;
- flush_cpu_state(cpu);
+ whpx_flush_cpu_state(cpu);
pc.Reg64 = vcpu->exit_ctx.MemoryAccess.Header.Pc + 4;
whpx_set_reg(cpu, WHvArm64RegisterPc, pc);
}
@@ -541,7 +516,7 @@ static void clean_whv_register_value(WHV_REGISTER_VALUE *val)
memset(val, 0, sizeof(WHV_REGISTER_VALUE));
}
-void whpx_get_registers(CPUState *cpu)
+void whpx_get_registers(CPUState *cpu, WHPXStateLevel level)
{
ARMCPU *arm_cpu = ARM_CPU(cpu);
CPUARMState *env = &arm_cpu->env;
@@ -588,7 +563,7 @@ void whpx_get_registers(CPUState *cpu)
aarch64_restore_sp(env, arm_current_el(env));
}
-void whpx_set_registers(CPUState *cpu, int level)
+void whpx_set_registers(CPUState *cpu, WHPXStateLevel level)
{
ARMCPU *arm_cpu = ARM_CPU(cpu);
CPUARMState *env = &arm_cpu->env;
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 9b9ed2d..01b6494 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1656,7 +1656,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
"vmx-apicv-register", "vmx-apicv-vid", "vmx-ple", "vmx-rdrand-exit",
"vmx-invpcid-exit", "vmx-vmfunc", "vmx-shadow-vmcs", "vmx-encls-exit",
"vmx-rdseed-exit", "vmx-pml", NULL, NULL,
- "vmx-xsaves", NULL, NULL, NULL,
+ "vmx-xsaves", NULL, "vmx-mbec", NULL,
NULL, "vmx-tsc-scaling", "vmx-enable-user-wait-pause", NULL,
NULL, NULL, NULL, NULL,
},
@@ -1972,6 +1972,10 @@ static FeatureDep feature_dependencies[] = {
.to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_UNRESTRICTED_GUEST },
},
{
+ .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_EPT },
+ .to = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_MODE_BASED_EPT_EXEC },
+ },
+ {
.from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_VPID },
.to = { FEAT_VMX_EPT_VPID_CAPS, 0xffffffffull << 32 },
},
@@ -5257,6 +5261,15 @@ static const X86CPUDefinition builtin_x86_defs[] = {
{ /* end of list */ },
}
},
+ {
+ .version = 6,
+ .note = "with cet-ss, cet-ibt, its-no",
+ .cache_info = &xeon_spr_cache_info,
+ .props = (PropValue[]) {
+ { "its-no", "on" },
+ { /* end of list */ },
+ }
+ },
{ /* end of list */ }
}
},
@@ -5430,6 +5443,15 @@ static const X86CPUDefinition builtin_x86_defs[] = {
{ /* end of list */ },
}
},
+ {
+ .version = 5,
+ .note = "with cet-ss, cet-ibt, its-no",
+ .cache_info = &xeon_gnr_cache_info,
+ .props = (PropValue[]) {
+ { "its-no", "on" },
+ { /* end of list */ },
+ }
+ },
{ /* end of list */ },
},
},
@@ -5787,6 +5809,15 @@ static const X86CPUDefinition builtin_x86_defs[] = {
{ /* end of list */ },
}
},
+ {
+ .version = 5,
+ .note = "with ITS_NO",
+ .cache_info = &xeon_srf_cache_info,
+ .props = (PropValue[]) {
+ { "its-no", "on" },
+ { /* end of list */ },
+ }
+ },
{ /* end of list */ },
},
},
@@ -5933,6 +5964,14 @@ static const X86CPUDefinition builtin_x86_defs[] = {
{ /* end of list */ },
}
},
+ {
+ .version = 3,
+ .note = "with cet-ss, cet-ibt, ITS_NO",
+ .props = (PropValue[]) {
+ { "its-no", "on" },
+ { /* end of list */ },
+ }
+ },
{ /* end of list */ },
},
},
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 9f222a0..f2679cc 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1329,6 +1329,7 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w);
#define MSR_ARCH_CAP_PBRSB_NO (1U << 24)
#define MSR_ARCH_CAP_GDS_NO (1U << 26)
#define MSR_ARCH_CAP_RFDS_NO (1U << 27)
+#define MSR_ARCH_CAP_ITS_NO (1U << 62)
#define MSR_CORE_CAP_SPLIT_LOCK_DETECT (1U << 5)
@@ -1414,6 +1415,7 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w);
#define VMX_SECONDARY_EXEC_RDSEED_EXITING 0x00010000
#define VMX_SECONDARY_EXEC_ENABLE_PML 0x00020000
#define VMX_SECONDARY_EXEC_XSAVES 0x00100000
+#define VMX_SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000
#define VMX_SECONDARY_EXEC_TSC_SCALING 0x02000000
#define VMX_SECONDARY_EXEC_ENABLE_USER_WAIT_PAUSE 0x04000000
@@ -2286,7 +2288,7 @@ typedef struct CPUArchState {
QEMUTimer *xen_periodic_timer;
QemuMutex xen_timers_lock;
#endif
-#if defined(CONFIG_HVF) || defined(CONFIG_MSHV)
+#if defined(CONFIG_HVF) || defined(CONFIG_MSHV) || defined(CONFIG_WHPX)
void *emu_mmio_buf;
#endif
diff --git a/target/i386/emulate/meson.build b/target/i386/emulate/meson.build
index b6dafb6..1fa1a8e 100644
--- a/target/i386/emulate/meson.build
+++ b/target/i386/emulate/meson.build
@@ -2,7 +2,16 @@ emulator_files = files(
'x86_decode.c',
'x86_emu.c',
'x86_flags.c',
+ 'x86_mmu.c'
+)
+
+emulator_helper_files = files(
+ 'x86_helpers.c'
)
i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: emulator_files)
i386_system_ss.add(when: 'CONFIG_MSHV', if_true: emulator_files)
+i386_system_ss.add(when: 'CONFIG_WHPX', if_true: emulator_files)
+
+i386_system_ss.add(when: 'CONFIG_MSHV', if_true: emulator_helper_files)
+i386_system_ss.add(when: 'CONFIG_WHPX', if_true: emulator_helper_files)
diff --git a/target/i386/emulate/x86.h b/target/i386/emulate/x86.h
index 73edccf..caf0e3b 100644
--- a/target/i386/emulate/x86.h
+++ b/target/i386/emulate/x86.h
@@ -263,6 +263,7 @@ bool x86_is_protected(CPUState *cpu);
bool x86_is_real(CPUState *cpu);
bool x86_is_v8086(CPUState *cpu);
bool x86_is_long_mode(CPUState *cpu);
+bool x86_is_la57(CPUState *cpu);
bool x86_is_long64_mode(CPUState *cpu);
bool x86_is_paging_mode(CPUState *cpu);
bool x86_is_pae_enabled(CPUState *cpu);
diff --git a/target/i386/emulate/x86_decode.c b/target/i386/emulate/x86_decode.c
index d037ed1..bae1dd4 100644
--- a/target/i386/emulate/x86_decode.c
+++ b/target/i386/emulate/x86_decode.c
@@ -77,11 +77,7 @@ static inline uint64_t decode_bytes(CPUX86State *env, struct x86_decode *decode,
memcpy(&val, decode->stream->bytes + decode->len, size);
} else {
target_ulong va = linear_rip(env_cpu(env), env->eip) + decode->len;
- if (emul_ops->fetch_instruction) {
- emul_ops->fetch_instruction(env_cpu(env), &val, va, size);
- } else {
- emul_ops->read_mem(env_cpu(env), &val, va, size);
- }
+ x86_read_mem(env_cpu(env), &val, va, size);
}
decode->len += size;
@@ -1699,7 +1695,7 @@ void *get_reg_ref(CPUX86State *env, int reg, int rex_present,
target_ulong get_reg_val(CPUX86State *env, int reg, int rex_present,
int is_extended, int size)
{
- target_ulong val = 0;
+ uint64_t val = 0;
memcpy(&val,
get_reg_ref(env, reg, rex_present, is_extended, size),
size);
@@ -2088,8 +2084,6 @@ static void decode_opcodes(CPUX86State *env, struct x86_decode *decode)
static uint32_t decode_opcode(CPUX86State *env, struct x86_decode *decode)
{
- memset(decode, 0, sizeof(*decode));
-
decode_prefix(env, decode);
set_addressing_size(env, decode);
set_operand_size(env, decode);
@@ -2101,6 +2095,8 @@ static uint32_t decode_opcode(CPUX86State *env, struct x86_decode *decode)
uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode)
{
+ memset(decode, 0, sizeof(*decode));
+
return decode_opcode(env, decode);
}
diff --git a/target/i386/emulate/x86_emu.c b/target/i386/emulate/x86_emu.c
index 4409f7b..55b1a68 100644
--- a/target/i386/emulate/x86_emu.c
+++ b/target/i386/emulate/x86_emu.c
@@ -36,15 +36,36 @@
/////////////////////////////////////////////////////////////////////////
#include "qemu/osdep.h"
+#include "qemu/error-report.h"
#include "panic.h"
#include "x86_decode.h"
#include "x86.h"
#include "x86_emu.h"
#include "x86_flags.h"
+#include "x86_mmu.h"
+
+#ifdef TARGET_X86_64
+#define EXEC_2OP_FLAGS_CMD_64(env, decode, cmd, FLAGS_FUNC, save_res) \
+ case 8: \
+ { \
+ uint64_t v1 = (uint64_t)decode->op[0].val; \
+ uint64_t v2 = (uint64_t)decode->op[1].val; \
+ uint64_t diff = v1 cmd v2; \
+ if (save_res) { \
+ if (write_val_ext(env, &decode->op[0], diff, 8)) { return 1; } \
+ } \
+ FLAGS_FUNC##64(env, v1, v2, diff); \
+ break; \
+ }
+#else
+#define EXEC_2OP_FLAGS_CMD_64(env, decode, cmd, FLAGS_FUNC, save_res)
+#endif
#define EXEC_2OP_FLAGS_CMD(env, decode, cmd, FLAGS_FUNC, save_res) \
{ \
- fetch_operands(env, decode, 2, true, true, false); \
+ if (fetch_operands(env, decode, 2, true, true, false)) {\
+ return 1; \
+ }\
switch (decode->operand_size) { \
case 1: \
{ \
@@ -52,7 +73,7 @@
uint8_t v2 = (uint8_t)decode->op[1].val; \
uint8_t diff = v1 cmd v2; \
if (save_res) { \
- write_val_ext(env, &decode->op[0], diff, 1); \
+ if (write_val_ext(env, &decode->op[0], diff, 1)) { return 1; } \
} \
FLAGS_FUNC##8(env, v1, v2, diff); \
break; \
@@ -63,7 +84,7 @@
uint16_t v2 = (uint16_t)decode->op[1].val; \
uint16_t diff = v1 cmd v2; \
if (save_res) { \
- write_val_ext(env, &decode->op[0], diff, 2); \
+ if (write_val_ext(env, &decode->op[0], diff, 2)) { return 1; } \
} \
FLAGS_FUNC##16(env, v1, v2, diff); \
break; \
@@ -74,11 +95,12 @@
uint32_t v2 = (uint32_t)decode->op[1].val; \
uint32_t diff = v1 cmd v2; \
if (save_res) { \
- write_val_ext(env, &decode->op[0], diff, 4); \
+ if (write_val_ext(env, &decode->op[0], diff, 4)) { return 1; } \
} \
FLAGS_FUNC##32(env, v1, v2, diff); \
break; \
} \
+ EXEC_2OP_FLAGS_CMD_64(env, decode, cmd, FLAGS_FUNC, save_res) \
default: \
VM_PANIC("bad size\n"); \
} \
@@ -164,63 +186,77 @@ void write_val_to_reg(void *reg_ptr, target_ulong val, int size)
}
}
-static void write_val_to_mem(CPUX86State *env, target_ulong ptr, target_ulong val, int size)
-{
- emul_ops->write_mem(env_cpu(env), &val, ptr, size);
-}
-
-void write_val_ext(CPUX86State *env, struct x86_decode_op *decode, target_ulong val, int size)
+bool write_val_ext(CPUX86State *env, struct x86_decode_op *decode, target_ulong val, int size)
{
if (decode->type == X86_VAR_REG) {
write_val_to_reg(decode->regptr, val, size);
} else {
- write_val_to_mem(env, decode->addr, val, size);
+ MMUTranslateResult res = x86_write_mem(env_cpu(env), &val, decode->addr, size);
+ if (res) {
+ if (res == MMU_TRANSLATE_GPA_UNMAPPED) {
+ return 0;
+ }
+ return 1;
+ }
}
+ return 0;
}
uint8_t *read_mmio(CPUX86State *env, target_ulong ptr, int bytes)
{
- emul_ops->read_mem(env_cpu(env), env->emu_mmio_buf, ptr, bytes);
+ MMUTranslateResult res = x86_read_mem(env_cpu(env), env->emu_mmio_buf, ptr, bytes);
+ if (res) {
+ if (res == MMU_TRANSLATE_GPA_UNMAPPED) {
+ memset(env->emu_mmio_buf, 0xFF, bytes);
+ return env->emu_mmio_buf;
+ }
+ return NULL;
+ }
return env->emu_mmio_buf;
}
-static target_ulong read_val_from_mem(CPUX86State *env, target_long ptr, int size)
+static bool read_val_from_mem(CPUX86State *env, target_long ptr, int size, target_ulong* val)
{
- target_ulong val;
uint8_t *mmio_ptr;
mmio_ptr = read_mmio(env, ptr, size);
+ if (mmio_ptr == NULL) {
+ return 1;
+ }
switch (size) {
case 1:
- val = *(uint8_t *)mmio_ptr;
+ *val = *(uint8_t *)mmio_ptr;
break;
case 2:
- val = *(uint16_t *)mmio_ptr;
+ *val = *(uint16_t *)mmio_ptr;
break;
case 4:
- val = *(uint32_t *)mmio_ptr;
+ *val = *(uint32_t *)mmio_ptr;
break;
case 8:
- val = *(uint64_t *)mmio_ptr;
+ *val = *(uint64_t *)mmio_ptr;
break;
default:
VM_PANIC("bad size\n");
break;
}
- return val;
+ return 0;
}
-target_ulong read_val_ext(CPUX86State *env, struct x86_decode_op *decode, int size)
+bool read_val_ext(CPUX86State *env, struct x86_decode_op *decode, int size, target_ulong* val)
{
if (decode->type == X86_VAR_REG) {
- return read_val_from_reg(decode->regptr, size);
+ *val = read_val_from_reg(decode->regptr, size);
} else {
- return read_val_from_mem(env, decode->addr, size);
+ if (read_val_from_mem(env, decode->addr, size, val)) {
+ return 1;
+ }
}
+ return 0;
}
-static void fetch_operands(CPUX86State *env, struct x86_decode *decode,
+static bool fetch_operands(CPUX86State *env, struct x86_decode *decode,
int n, bool val_op0, bool val_op1, bool val_op2)
{
int i;
@@ -240,8 +276,10 @@ static void fetch_operands(CPUX86State *env, struct x86_decode *decode,
case X86_VAR_RM:
calc_modrm_operand(env, decode, &decode->op[i]);
if (calc_val[i]) {
- decode->op[i].val = read_val_ext(env, &decode->op[i],
- decode->operand_size);
+ if (read_val_ext(env, &decode->op[i],decode->operand_size,
+ &decode->op[i].val)) {
+ return 1;
+ }
}
break;
case X86_VAR_OFFSET:
@@ -249,68 +287,81 @@ static void fetch_operands(CPUX86State *env, struct x86_decode *decode,
decode->op[i].addr,
R_DS);
if (calc_val[i]) {
- decode->op[i].val = read_val_ext(env, &decode->op[i],
- decode->operand_size);
+ if (read_val_ext(env, &decode->op[i], decode->operand_size,
+ &decode->op[i].val)) {
+ return 1;
+ }
}
break;
default:
break;
}
}
+ return 0;
}
-static void exec_mov(CPUX86State *env, struct x86_decode *decode)
+static bool exec_mov(CPUX86State *env, struct x86_decode *decode)
{
fetch_operands(env, decode, 2, false, true, false);
- write_val_ext(env, &decode->op[0], decode->op[1].val,
- decode->operand_size);
+ if (write_val_ext(env, &decode->op[0], decode->op[1].val,
+ decode->operand_size)) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-static void exec_add(CPUX86State *env, struct x86_decode *decode)
+static bool exec_add(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, +, SET_FLAGS_OSZAPC_ADD, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_or(CPUX86State *env, struct x86_decode *decode)
+static bool exec_or(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, |, SET_FLAGS_OSZAPC_LOGIC, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_adc(CPUX86State *env, struct x86_decode *decode)
+static bool exec_adc(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, +get_CF(env)+, SET_FLAGS_OSZAPC_ADD, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_sbb(CPUX86State *env, struct x86_decode *decode)
+static bool exec_sbb(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, -get_CF(env)-, SET_FLAGS_OSZAPC_SUB, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_and(CPUX86State *env, struct x86_decode *decode)
+static bool exec_and(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, &, SET_FLAGS_OSZAPC_LOGIC, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_sub(CPUX86State *env, struct x86_decode *decode)
+static bool exec_sub(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, -, SET_FLAGS_OSZAPC_SUB, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_xor(CPUX86State *env, struct x86_decode *decode)
+static bool exec_xor(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, ^, SET_FLAGS_OSZAPC_LOGIC, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_neg(CPUX86State *env, struct x86_decode *decode)
+static bool exec_neg(CPUX86State *env, struct x86_decode *decode)
{
/*EXEC_2OP_FLAGS_CMD(env, decode, -, SET_FLAGS_OSZAPC_SUB, false);*/
int32_t val;
@@ -331,15 +382,17 @@ static void exec_neg(CPUX86State *env, struct x86_decode *decode)
/*lflags_to_rflags(env);*/
env->eip += decode->len;
+ return 0;
}
-static void exec_cmp(CPUX86State *env, struct x86_decode *decode)
+static bool exec_cmp(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, -, SET_FLAGS_OSZAPC_SUB, false);
env->eip += decode->len;
+ return 0;
}
-static void exec_inc(CPUX86State *env, struct x86_decode *decode)
+static bool exec_inc(CPUX86State *env, struct x86_decode *decode)
{
decode->op[1].type = X86_VAR_IMMEDIATE;
decode->op[1].val = 0;
@@ -347,33 +400,37 @@ static void exec_inc(CPUX86State *env, struct x86_decode *decode)
EXEC_2OP_FLAGS_CMD(env, decode, +1+, SET_FLAGS_OSZAP_ADD, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_dec(CPUX86State *env, struct x86_decode *decode)
+static bool exec_dec(CPUX86State *env, struct x86_decode *decode)
{
decode->op[1].type = X86_VAR_IMMEDIATE;
decode->op[1].val = 0;
EXEC_2OP_FLAGS_CMD(env, decode, -1-, SET_FLAGS_OSZAP_SUB, true);
env->eip += decode->len;
+ return 0;
}
-static void exec_tst(CPUX86State *env, struct x86_decode *decode)
+static bool exec_tst(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, &, SET_FLAGS_OSZAPC_LOGIC, false);
env->eip += decode->len;
+ return 0;
}
-static void exec_not(CPUX86State *env, struct x86_decode *decode)
+static bool exec_not(CPUX86State *env, struct x86_decode *decode)
{
fetch_operands(env, decode, 1, true, false, false);
write_val_ext(env, &decode->op[0], ~decode->op[0].val,
decode->operand_size);
env->eip += decode->len;
+ return 0;
}
-void exec_movzx(CPUX86State *env, struct x86_decode *decode)
+bool exec_movzx(CPUX86State *env, struct x86_decode *decode)
{
int src_op_size;
int op_size = decode->operand_size;
@@ -387,13 +444,16 @@ void exec_movzx(CPUX86State *env, struct x86_decode *decode)
}
decode->operand_size = src_op_size;
calc_modrm_operand(env, decode, &decode->op[1]);
- decode->op[1].val = read_val_ext(env, &decode->op[1], src_op_size);
+ if (read_val_ext(env, &decode->op[1], src_op_size, &decode->op[1].val)) {
+ return 1;
+ }
write_val_ext(env, &decode->op[0], decode->op[1].val, op_size);
env->eip += decode->len;
+ return 0;
}
-static void exec_out(CPUX86State *env, struct x86_decode *decode)
+static bool exec_out(CPUX86State *env, struct x86_decode *decode)
{
switch (decode->opcode[0]) {
case 0xe6:
@@ -415,9 +475,10 @@ static void exec_out(CPUX86State *env, struct x86_decode *decode)
break;
}
env->eip += decode->len;
+ return 0;
}
-static void exec_in(CPUX86State *env, struct x86_decode *decode)
+static bool exec_in(CPUX86State *env, struct x86_decode *decode)
{
target_ulong val = 0;
switch (decode->opcode[0]) {
@@ -452,6 +513,7 @@ static void exec_in(CPUX86State *env, struct x86_decode *decode)
}
env->eip += decode->len;
+ return 0;
}
static inline void string_increment_reg(CPUX86State *env, int reg,
@@ -466,99 +528,138 @@ static inline void string_increment_reg(CPUX86State *env, int reg,
write_reg(env, reg, val, decode->addressing_size);
}
-static inline void string_rep(CPUX86State *env, struct x86_decode *decode,
- void (*func)(CPUX86State *env,
+static inline int get_ZF(CPUX86State *env) {
+ return env->cc_dst ? 0 : CC_Z;
+}
+
+static inline bool string_rep(CPUX86State *env, struct x86_decode *decode,
+ bool (*func)(CPUX86State *env,
struct x86_decode *ins), int rep)
{
target_ulong rcx = read_reg(env, R_ECX, decode->addressing_size);
- while (rcx--) {
- func(env, decode);
+
+ while (rcx != 0) {
+ bool is_cmps_or_scas = decode->cmd == X86_DECODE_CMD_CMPS || decode->cmd == X86_DECODE_CMD_SCAS;
+ if (func(env, decode)) {
+ return 1;
+ }
+ rcx--;
write_reg(env, R_ECX, rcx, decode->addressing_size);
- if ((PREFIX_REP == rep) && !env->cc_dst) {
+ if ((PREFIX_REP == rep) && !get_ZF(env) && is_cmps_or_scas) {
break;
}
- if ((PREFIX_REPN == rep) && env->cc_dst) {
+ if ((PREFIX_REPN == rep) && get_ZF(env)&& is_cmps_or_scas) {
break;
}
}
+ return 0;
}
-static void exec_ins_single(CPUX86State *env, struct x86_decode *decode)
+static bool exec_ins_single(CPUX86State *env, struct x86_decode *decode)
{
+ MMUTranslateResult res;
+
target_ulong addr = linear_addr_size(env_cpu(env), RDI(env),
decode->addressing_size, R_ES);
emul_ops->handle_io(env_cpu(env), DX(env), env->emu_mmio_buf, 0,
decode->operand_size, 1);
- emul_ops->write_mem(env_cpu(env), env->emu_mmio_buf, addr,
+ res = x86_write_mem(env_cpu(env), env->emu_mmio_buf, addr,
decode->operand_size);
+ if (res) {
+ return 1;
+ }
string_increment_reg(env, R_EDI, decode);
+ return 0;
}
-static void exec_ins(CPUX86State *env, struct x86_decode *decode)
+static bool exec_ins(CPUX86State *env, struct x86_decode *decode)
{
+ bool res;
if (decode->rep) {
- string_rep(env, decode, exec_ins_single, 0);
+ res = string_rep(env, decode, exec_ins_single, 0);
} else {
- exec_ins_single(env, decode);
+ res = exec_ins_single(env, decode);
}
+ if (res) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-static void exec_outs_single(CPUX86State *env, struct x86_decode *decode)
+static bool exec_outs_single(CPUX86State *env, struct x86_decode *decode)
{
target_ulong addr = decode_linear_addr(env, decode, RSI(env), R_DS);
- emul_ops->read_mem(env_cpu(env), env->emu_mmio_buf, addr,
+ x86_read_mem(env_cpu(env), env->emu_mmio_buf, addr,
decode->operand_size);
emul_ops->handle_io(env_cpu(env), DX(env), env->emu_mmio_buf, 1,
decode->operand_size, 1);
string_increment_reg(env, R_ESI, decode);
+ return 0;
}
-static void exec_outs(CPUX86State *env, struct x86_decode *decode)
+static bool exec_outs(CPUX86State *env, struct x86_decode *decode)
{
+ bool res;
if (decode->rep) {
- string_rep(env, decode, exec_outs_single, 0);
+ res = string_rep(env, decode, exec_outs_single, 0);
} else {
- exec_outs_single(env, decode);
+ res = exec_outs_single(env, decode);
}
+ if (res) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-static void exec_movs_single(CPUX86State *env, struct x86_decode *decode)
+static bool exec_movs_single(CPUX86State *env, struct x86_decode *decode)
{
target_ulong src_addr;
target_ulong dst_addr;
target_ulong val;
+ MMUTranslateResult res;
src_addr = decode_linear_addr(env, decode, RSI(env), R_DS);
dst_addr = linear_addr_size(env_cpu(env), RDI(env),
decode->addressing_size, R_ES);
- val = read_val_from_mem(env, src_addr, decode->operand_size);
- write_val_to_mem(env, dst_addr, val, decode->operand_size);
+ if (read_val_from_mem(env, src_addr, decode->operand_size, &val)) {
+ return 1;
+ }
+ res = x86_write_mem(env_cpu(env), &val, dst_addr, decode->operand_size);
+ if (res) {
+ return 1;
+ }
string_increment_reg(env, R_ESI, decode);
string_increment_reg(env, R_EDI, decode);
+ return 0;
}
-static void exec_movs(CPUX86State *env, struct x86_decode *decode)
+static bool exec_movs(CPUX86State *env, struct x86_decode *decode)
{
+ bool res;
if (decode->rep) {
- string_rep(env, decode, exec_movs_single, 0);
+ res = string_rep(env, decode, exec_movs_single, 0);
} else {
- exec_movs_single(env, decode);
+ res = exec_movs_single(env, decode);
}
+ if (res) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-static void exec_cmps_single(CPUX86State *env, struct x86_decode *decode)
+static bool exec_cmps_single(CPUX86State *env, struct x86_decode *decode)
{
target_ulong src_addr;
target_ulong dst_addr;
@@ -568,17 +669,22 @@ static void exec_cmps_single(CPUX86State *env, struct x86_decode *decode)
decode->addressing_size, R_ES);
decode->op[0].type = X86_VAR_IMMEDIATE;
- decode->op[0].val = read_val_from_mem(env, src_addr, decode->operand_size);
+ if (read_val_from_mem(env, src_addr, decode->operand_size, &decode->op[0].val)) {
+ return 1;
+ }
decode->op[1].type = X86_VAR_IMMEDIATE;
- decode->op[1].val = read_val_from_mem(env, dst_addr, decode->operand_size);
+ if (read_val_from_mem(env, dst_addr, decode->operand_size, &decode->op[1].val)) {
+ return 1;
+ }
EXEC_2OP_FLAGS_CMD(env, decode, -, SET_FLAGS_OSZAPC_SUB, false);
string_increment_reg(env, R_ESI, decode);
string_increment_reg(env, R_EDI, decode);
+ return 0;
}
-static void exec_cmps(CPUX86State *env, struct x86_decode *decode)
+static bool exec_cmps(CPUX86State *env, struct x86_decode *decode)
{
if (decode->rep) {
string_rep(env, decode, exec_cmps_single, decode->rep);
@@ -586,24 +692,30 @@ static void exec_cmps(CPUX86State *env, struct x86_decode *decode)
exec_cmps_single(env, decode);
}
env->eip += decode->len;
+ return 0;
}
-static void exec_stos_single(CPUX86State *env, struct x86_decode *decode)
+static bool exec_stos_single(CPUX86State *env, struct x86_decode *decode)
{
target_ulong addr;
target_ulong val;
+ MMUTranslateResult res;
addr = linear_addr_size(env_cpu(env), RDI(env),
decode->addressing_size, R_ES);
val = read_reg(env, R_EAX, decode->operand_size);
- emul_ops->write_mem(env_cpu(env), &val, addr, decode->operand_size);
+ res = x86_write_mem(env_cpu(env), &val, addr, decode->operand_size);
+ if (res) {
+ return 1;
+ }
string_increment_reg(env, R_EDI, decode);
+ return 0;
}
-static void exec_stos(CPUX86State *env, struct x86_decode *decode)
+static bool exec_stos(CPUX86State *env, struct x86_decode *decode)
{
if (decode->rep) {
string_rep(env, decode, exec_stos_single, 0);
@@ -612,25 +724,29 @@ static void exec_stos(CPUX86State *env, struct x86_decode *decode)
}
env->eip += decode->len;
+ return 0;
}
-static void exec_scas_single(CPUX86State *env, struct x86_decode *decode)
+static bool exec_scas_single(CPUX86State *env, struct x86_decode *decode)
{
target_ulong addr;
addr = linear_addr_size(env_cpu(env), RDI(env),
decode->addressing_size, R_ES);
decode->op[1].type = X86_VAR_IMMEDIATE;
- emul_ops->read_mem(env_cpu(env), &decode->op[1].val, addr, decode->operand_size);
+ x86_read_mem(env_cpu(env), &decode->op[1].val, addr, decode->operand_size);
EXEC_2OP_FLAGS_CMD(env, decode, -, SET_FLAGS_OSZAPC_SUB, false);
string_increment_reg(env, R_EDI, decode);
+ return 0;
}
-static void exec_scas(CPUX86State *env, struct x86_decode *decode)
+static bool exec_scas(CPUX86State *env, struct x86_decode *decode)
{
decode->op[0].type = X86_VAR_REG;
decode->op[0].reg = R_EAX;
+ decode->op[0].regptr = x86_reg(env, R_EAX);
+
if (decode->rep) {
string_rep(env, decode, exec_scas_single, decode->rep);
} else {
@@ -638,21 +754,23 @@ static void exec_scas(CPUX86State *env, struct x86_decode *decode)
}
env->eip += decode->len;
+ return 0;
}
-static void exec_lods_single(CPUX86State *env, struct x86_decode *decode)
+static bool exec_lods_single(CPUX86State *env, struct x86_decode *decode)
{
target_ulong addr;
target_ulong val = 0;
addr = decode_linear_addr(env, decode, RSI(env), R_DS);
- emul_ops->read_mem(env_cpu(env), &val, addr, decode->operand_size);
+ x86_read_mem(env_cpu(env), &val, addr, decode->operand_size);
write_reg(env, R_EAX, val, decode->operand_size);
string_increment_reg(env, R_ESI, decode);
+ return 0;
}
-static void exec_lods(CPUX86State *env, struct x86_decode *decode)
+static bool exec_lods(CPUX86State *env, struct x86_decode *decode)
{
if (decode->rep) {
string_rep(env, decode, exec_lods_single, 0);
@@ -661,6 +779,7 @@ static void exec_lods(CPUX86State *env, struct x86_decode *decode)
}
env->eip += decode->len;
+ return 0;
}
void x86_emul_raise_exception(CPUX86State *env, int exception_index, int error_code)
@@ -671,23 +790,25 @@ void x86_emul_raise_exception(CPUX86State *env, int exception_index, int error_c
env->exception_injected = 1;
}
-static void exec_rdmsr(CPUX86State *env, struct x86_decode *decode)
+static bool exec_rdmsr(CPUX86State *env, struct x86_decode *decode)
{
emul_ops->simulate_rdmsr(env_cpu(env));
env->eip += decode->len;
+ return 0;
}
-static void exec_wrmsr(CPUX86State *env, struct x86_decode *decode)
+static bool exec_wrmsr(CPUX86State *env, struct x86_decode *decode)
{
emul_ops->simulate_wrmsr(env_cpu(env));
env->eip += decode->len;
+ return 0;
}
/*
* flag:
* 0 - bt, 1 - btc, 2 - bts, 3 - btr
*/
-static void do_bt(CPUX86State *env, struct x86_decode *decode, int flag)
+static bool do_bt(CPUX86State *env, struct x86_decode *decode, int flag)
{
int32_t displacement;
uint8_t index;
@@ -696,7 +817,9 @@ static void do_bt(CPUX86State *env, struct x86_decode *decode, int flag)
VM_PANIC_ON(decode->rex.rex);
- fetch_operands(env, decode, 2, false, true, false);
+ if (fetch_operands(env, decode, 2, false, true, false)) {
+ return 1;
+ }
index = decode->op[1].val & mask;
if (decode->op[0].type != X86_VAR_REG) {
@@ -710,14 +833,16 @@ static void do_bt(CPUX86State *env, struct x86_decode *decode, int flag)
VM_PANIC("bt 64bit\n");
}
}
- decode->op[0].val = read_val_ext(env, &decode->op[0],
- decode->operand_size);
+ if (read_val_ext(env, &decode->op[0],
+ decode->operand_size, &decode->op[0].val)) {
+ return 1;
+ }
cf = (decode->op[0].val >> index) & 0x01;
switch (flag) {
case 0:
set_CF(env, cf);
- return;
+ return 0;
case 1:
decode->op[0].val ^= (1u << index);
break;
@@ -728,41 +853,58 @@ static void do_bt(CPUX86State *env, struct x86_decode *decode, int flag)
decode->op[0].val &= ~(1u << index);
break;
}
- write_val_ext(env, &decode->op[0], decode->op[0].val,
- decode->operand_size);
+ if (write_val_ext(env, &decode->op[0], decode->op[0].val,
+ decode->operand_size)) {
+ return 1;
+ }
set_CF(env, cf);
+ return 0;
}
-static void exec_bt(CPUX86State *env, struct x86_decode *decode)
+static bool exec_bt(CPUX86State *env, struct x86_decode *decode)
{
- do_bt(env, decode, 0);
+ if (do_bt(env, decode, 0)) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-static void exec_btc(CPUX86State *env, struct x86_decode *decode)
+static bool exec_btc(CPUX86State *env, struct x86_decode *decode)
{
- do_bt(env, decode, 1);
+ if (do_bt(env, decode, 1)) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-static void exec_btr(CPUX86State *env, struct x86_decode *decode)
+static bool exec_btr(CPUX86State *env, struct x86_decode *decode)
{
- do_bt(env, decode, 3);
+ if (do_bt(env, decode, 3)) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-static void exec_bts(CPUX86State *env, struct x86_decode *decode)
+static bool exec_bts(CPUX86State *env, struct x86_decode *decode)
{
- do_bt(env, decode, 2);
+ if (do_bt(env, decode, 2)) {
+ return 1;
+ }
env->eip += decode->len;
+ return 0;
}
-void exec_shl(CPUX86State *env, struct x86_decode *decode)
+bool exec_shl(CPUX86State *env, struct x86_decode *decode)
{
uint8_t count;
int of = 0, cf = 0;
- fetch_operands(env, decode, 2, true, true, false);
+ if (fetch_operands(env, decode, 2, true, true, false)) {
+ return 1;
+ }
count = decode->op[1].val;
count &= 0x1f; /* count is masked to 5 bits*/
@@ -819,12 +961,14 @@ void exec_shl(CPUX86State *env, struct x86_decode *decode)
exit:
/* lflags_to_rflags(env); */
env->eip += decode->len;
+ return 0;
}
-void exec_movsx(CPUX86State *env, struct x86_decode *decode)
+bool exec_movsx(CPUX86State *env, struct x86_decode *decode)
{
int src_op_size;
int op_size = decode->operand_size;
+ target_ulong val;
fetch_operands(env, decode, 2, false, false, false);
@@ -836,15 +980,18 @@ void exec_movsx(CPUX86State *env, struct x86_decode *decode)
decode->operand_size = src_op_size;
calc_modrm_operand(env, decode, &decode->op[1]);
- decode->op[1].val = sign(read_val_ext(env, &decode->op[1], src_op_size),
- src_op_size);
+ if (read_val_ext(env, &decode->op[1], src_op_size, &val)) {
+ return 1;
+ }
+ decode->op[1].val = sign(val, src_op_size);
write_val_ext(env, &decode->op[0], decode->op[1].val, op_size);
env->eip += decode->len;
+ return 0;
}
-void exec_ror(CPUX86State *env, struct x86_decode *decode)
+bool exec_ror(CPUX86State *env, struct x86_decode *decode)
{
uint8_t count;
@@ -920,9 +1067,10 @@ void exec_ror(CPUX86State *env, struct x86_decode *decode)
}
}
env->eip += decode->len;
+ return 0;
}
-void exec_rol(CPUX86State *env, struct x86_decode *decode)
+bool exec_rol(CPUX86State *env, struct x86_decode *decode)
{
uint8_t count;
@@ -1001,10 +1149,11 @@ void exec_rol(CPUX86State *env, struct x86_decode *decode)
}
}
env->eip += decode->len;
+ return 0;
}
-void exec_rcl(CPUX86State *env, struct x86_decode *decode)
+bool exec_rcl(CPUX86State *env, struct x86_decode *decode)
{
uint8_t count;
int of = 0, cf = 0;
@@ -1087,9 +1236,10 @@ void exec_rcl(CPUX86State *env, struct x86_decode *decode)
}
}
env->eip += decode->len;
+ return 0;
}
-void exec_rcr(CPUX86State *env, struct x86_decode *decode)
+bool exec_rcr(CPUX86State *env, struct x86_decode *decode)
{
uint8_t count;
int of = 0, cf = 0;
@@ -1162,9 +1312,10 @@ void exec_rcr(CPUX86State *env, struct x86_decode *decode)
}
}
env->eip += decode->len;
+ return 0;
}
-static void exec_xchg(CPUX86State *env, struct x86_decode *decode)
+static bool exec_xchg(CPUX86State *env, struct x86_decode *decode)
{
fetch_operands(env, decode, 2, true, true, false);
@@ -1174,20 +1325,22 @@ static void exec_xchg(CPUX86State *env, struct x86_decode *decode)
decode->operand_size);
env->eip += decode->len;
+ return 0;
}
-static void exec_xadd(CPUX86State *env, struct x86_decode *decode)
+static bool exec_xadd(CPUX86State *env, struct x86_decode *decode)
{
EXEC_2OP_FLAGS_CMD(env, decode, +, SET_FLAGS_OSZAPC_ADD, true);
write_val_ext(env, &decode->op[1], decode->op[0].val,
decode->operand_size);
env->eip += decode->len;
+ return 0;
}
static struct cmd_handler {
enum x86_decode_cmd cmd;
- void (*handler)(CPUX86State *env, struct x86_decode *ins);
+ bool (*handler)(CPUX86State *env, struct x86_decode *ins);
} handlers[] = {
{X86_DECODE_CMD_INVL, NULL,},
{X86_DECODE_CMD_MOV, exec_mov},
diff --git a/target/i386/emulate/x86_emu.h b/target/i386/emulate/x86_emu.h
index 05686b1..0f284b0 100644
--- a/target/i386/emulate/x86_emu.h
+++ b/target/i386/emulate/x86_emu.h
@@ -21,13 +21,11 @@
#include "x86.h"
#include "x86_decode.h"
+#include "x86_mmu.h"
#include "cpu.h"
struct x86_emul_ops {
- void (*fetch_instruction)(CPUState *cpu, void *data, target_ulong addr,
- int bytes);
- void (*read_mem)(CPUState *cpu, void *data, target_ulong addr, int bytes);
- void (*write_mem)(CPUState *cpu, void *data, target_ulong addr, int bytes);
+ MMUTranslateResult (*mmu_gva_to_gpa) (CPUState *cpu, target_ulong gva, uint64_t *gpa, MMUTranslateFlags flags);
void (*read_segment_descriptor)(CPUState *cpu, struct x86_segment_descriptor *desc,
enum X86Seg seg);
void (*handle_io)(CPUState *cpu, uint16_t port, void *data, int direction,
@@ -46,15 +44,15 @@ target_ulong read_reg(CPUX86State *env, int reg, int size);
void write_reg(CPUX86State *env, int reg, target_ulong val, int size);
target_ulong read_val_from_reg(void *reg_ptr, int size);
void write_val_to_reg(void *reg_ptr, target_ulong val, int size);
-void write_val_ext(CPUX86State *env, struct x86_decode_op *decode, target_ulong val, int size);
+bool write_val_ext(CPUX86State *env, struct x86_decode_op *decode, target_ulong val, int size);
uint8_t *read_mmio(CPUX86State *env, target_ulong ptr, int bytes);
-target_ulong read_val_ext(CPUX86State *env, struct x86_decode_op *decode, int size);
+bool read_val_ext(CPUX86State *env, struct x86_decode_op *decode, int size, target_ulong* val);
-void exec_movzx(CPUX86State *env, struct x86_decode *decode);
-void exec_shl(CPUX86State *env, struct x86_decode *decode);
-void exec_movsx(CPUX86State *env, struct x86_decode *decode);
-void exec_ror(CPUX86State *env, struct x86_decode *decode);
-void exec_rol(CPUX86State *env, struct x86_decode *decode);
-void exec_rcl(CPUX86State *env, struct x86_decode *decode);
-void exec_rcr(CPUX86State *env, struct x86_decode *decode);
+bool exec_movzx(CPUX86State *env, struct x86_decode *decode);
+bool exec_shl(CPUX86State *env, struct x86_decode *decode);
+bool exec_movsx(CPUX86State *env, struct x86_decode *decode);
+bool exec_ror(CPUX86State *env, struct x86_decode *decode);
+bool exec_rol(CPUX86State *env, struct x86_decode *decode);
+bool exec_rcl(CPUX86State *env, struct x86_decode *decode);
+bool exec_rcr(CPUX86State *env, struct x86_decode *decode);
#endif
diff --git a/target/i386/emulate/x86_flags.c b/target/i386/emulate/x86_flags.c
index 6592193..3c4270a 100644
--- a/target/i386/emulate/x86_flags.c
+++ b/target/i386/emulate/x86_flags.c
@@ -82,6 +82,10 @@
SET_FLAGS_OSZAPC_SIZE(16, carries, result)
#define SET_FLAGS_OSZAPC_32(carries, result) \
SET_FLAGS_OSZAPC_SIZE(32, carries, result)
+#ifdef TARGET_X86_64
+#define SET_FLAGS_OSZAPC_64(carries, result) \
+ SET_FLAGS_OSZAPC_SIZE(64, carries, result)
+#endif
/* ******************* */
/* OSZAP */
@@ -107,6 +111,10 @@
SET_FLAGS_OSZAP_SIZE(16, carries, result)
#define SET_FLAGS_OSZAP_32(carries, result) \
SET_FLAGS_OSZAP_SIZE(32, carries, result)
+#ifdef TARGET_X86_64
+#define SET_FLAGS_OSZAP_64(carries, result) \
+ SET_FLAGS_OSZAP_SIZE(64, carries, result)
+#endif
void SET_FLAGS_OxxxxC(CPUX86State *env, bool new_of, bool new_cf)
{
@@ -115,6 +123,14 @@ void SET_FLAGS_OxxxxC(CPUX86State *env, bool new_of, bool new_cf)
env->cc_src ^= ((target_ulong)new_of << LF_BIT_PO);
}
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAPC_SUB64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff)
+{
+ SET_FLAGS_OSZAPC_64(SUB_COUT_VEC(v1, v2, diff), diff);
+}
+#endif
+
void SET_FLAGS_OSZAPC_SUB32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff)
{
@@ -133,6 +149,14 @@ void SET_FLAGS_OSZAPC_SUB8(CPUX86State *env, uint8_t v1, uint8_t v2,
SET_FLAGS_OSZAPC_8(SUB_COUT_VEC(v1, v2, diff), diff);
}
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAPC_ADD64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff)
+{
+ SET_FLAGS_OSZAPC_64(ADD_COUT_VEC(v1, v2, diff), diff);
+}
+#endif
+
void SET_FLAGS_OSZAPC_ADD32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff)
{
@@ -151,6 +175,14 @@ void SET_FLAGS_OSZAPC_ADD8(CPUX86State *env, uint8_t v1, uint8_t v2,
SET_FLAGS_OSZAPC_8(ADD_COUT_VEC(v1, v2, diff), diff);
}
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAP_SUB64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff)
+{
+ SET_FLAGS_OSZAP_64(SUB_COUT_VEC(v1, v2, diff), diff);
+}
+#endif
+
void SET_FLAGS_OSZAP_SUB32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff)
{
@@ -169,6 +201,14 @@ void SET_FLAGS_OSZAP_SUB8(CPUX86State *env, uint8_t v1, uint8_t v2,
SET_FLAGS_OSZAP_8(SUB_COUT_VEC(v1, v2, diff), diff);
}
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAP_ADD64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff)
+{
+ SET_FLAGS_OSZAP_64(ADD_COUT_VEC(v1, v2, diff), diff);
+}
+#endif
+
void SET_FLAGS_OSZAP_ADD32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff)
{
@@ -187,6 +227,13 @@ void SET_FLAGS_OSZAP_ADD8(CPUX86State *env, uint8_t v1, uint8_t v2,
SET_FLAGS_OSZAP_8(ADD_COUT_VEC(v1, v2, diff), diff);
}
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAPC_LOGIC64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff)
+{
+ SET_FLAGS_OSZAPC_64(0, diff);
+}
+#endif
void SET_FLAGS_OSZAPC_LOGIC32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff)
diff --git a/target/i386/emulate/x86_flags.h b/target/i386/emulate/x86_flags.h
index a395c83..7ffbbe5 100644
--- a/target/i386/emulate/x86_flags.h
+++ b/target/i386/emulate/x86_flags.h
@@ -33,6 +33,10 @@ void set_CF(CPUX86State *env, bool val);
void SET_FLAGS_OxxxxC(CPUX86State *env, bool new_of, bool new_cf);
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAPC_SUB64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff);
+#endif
void SET_FLAGS_OSZAPC_SUB32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff);
void SET_FLAGS_OSZAPC_SUB16(CPUX86State *env, uint16_t v1, uint16_t v2,
@@ -40,6 +44,10 @@ void SET_FLAGS_OSZAPC_SUB16(CPUX86State *env, uint16_t v1, uint16_t v2,
void SET_FLAGS_OSZAPC_SUB8(CPUX86State *env, uint8_t v1, uint8_t v2,
uint8_t diff);
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAPC_ADD64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff);
+#endif
void SET_FLAGS_OSZAPC_ADD32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff);
void SET_FLAGS_OSZAPC_ADD16(CPUX86State *env, uint16_t v1, uint16_t v2,
@@ -47,6 +55,10 @@ void SET_FLAGS_OSZAPC_ADD16(CPUX86State *env, uint16_t v1, uint16_t v2,
void SET_FLAGS_OSZAPC_ADD8(CPUX86State *env, uint8_t v1, uint8_t v2,
uint8_t diff);
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAP_SUB64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff);
+#endif
void SET_FLAGS_OSZAP_SUB32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff);
void SET_FLAGS_OSZAP_SUB16(CPUX86State *env, uint16_t v1, uint16_t v2,
@@ -54,6 +66,10 @@ void SET_FLAGS_OSZAP_SUB16(CPUX86State *env, uint16_t v1, uint16_t v2,
void SET_FLAGS_OSZAP_SUB8(CPUX86State *env, uint8_t v1, uint8_t v2,
uint8_t diff);
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAP_ADD64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff);
+#endif
void SET_FLAGS_OSZAP_ADD32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff);
void SET_FLAGS_OSZAP_ADD16(CPUX86State *env, uint16_t v1, uint16_t v2,
@@ -61,6 +77,10 @@ void SET_FLAGS_OSZAP_ADD16(CPUX86State *env, uint16_t v1, uint16_t v2,
void SET_FLAGS_OSZAP_ADD8(CPUX86State *env, uint8_t v1, uint8_t v2,
uint8_t diff);
+#ifdef TARGET_X86_64
+void SET_FLAGS_OSZAPC_LOGIC64(CPUX86State *env, uint64_t v1, uint64_t v2,
+ uint64_t diff);
+#endif
void SET_FLAGS_OSZAPC_LOGIC32(CPUX86State *env, uint32_t v1, uint32_t v2,
uint32_t diff);
void SET_FLAGS_OSZAPC_LOGIC16(CPUX86State *env, uint16_t v1, uint16_t v2,
diff --git a/target/i386/mshv/x86.c b/target/i386/emulate/x86_helpers.c
index 0700cc0..024f9a2 100644
--- a/target/i386/mshv/x86.c
+++ b/target/i386/emulate/x86_helpers.c
@@ -13,6 +13,7 @@
#include "cpu.h"
#include "emulate/x86_decode.h"
#include "emulate/x86_emu.h"
+#include "emulate/x86_mmu.h"
#include "qemu/error-report.h"
#include "system/mshv.h"
@@ -176,7 +177,7 @@ bool x86_read_segment_descriptor(CPUState *cpu,
}
gva = base + sel.index * 8;
- emul_ops->read_mem(cpu, desc, gva, sizeof(*desc));
+ x86_read_mem_priv(cpu, desc, gva, sizeof(*desc));
return true;
}
@@ -200,7 +201,7 @@ bool x86_read_call_gate(CPUState *cpu, struct x86_call_gate *idt_desc,
}
gva = base + gate * 8;
- emul_ops->read_mem(cpu, idt_desc, gva, sizeof(*idt_desc));
+ x86_read_mem_priv(cpu, idt_desc, gva, sizeof(*idt_desc));
return true;
}
@@ -236,6 +237,14 @@ bool x86_is_long_mode(CPUState *cpu)
return ((efer & lme_lma) == lme_lma);
}
+bool x86_is_la57(CPUState *cpu)
+{
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+ uint64_t is_la57 = env->cr[4] & CR4_LA57_MASK;
+ return is_la57;
+}
+
bool x86_is_long64_mode(CPUState *cpu)
{
error_report("unimplemented: is_long64_mode()");
diff --git a/target/i386/emulate/x86_mmu.c b/target/i386/emulate/x86_mmu.c
new file mode 100644
index 0000000..8261ca1
--- /dev/null
+++ b/target/i386/emulate/x86_mmu.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2016 Veertu Inc,
+ * Copyright (C) 2017 Google Inc,
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "panic.h"
+#include "cpu.h"
+#include "system/address-spaces.h"
+#include "system/memory.h"
+#include "qemu/error-report.h"
+#include "emulate/x86.h"
+#include "emulate/x86_emu.h"
+#include "emulate/x86_mmu.h"
+
+#define pte_present(pte) (pte & PT_PRESENT)
+#define pte_write_access(pte) (pte & PT_WRITE)
+#define pte_user_access(pte) (pte & PT_USER)
+#define pte_exec_access(pte) (!(pte & PT_NX))
+
+#define pte_large_page(pte) (pte & PT_PS)
+#define pte_global_access(pte) (pte & PT_GLOBAL)
+
+#define mmu_validate_write(flags) (flags & MMU_TRANSLATE_VALIDATE_WRITE)
+#define mmu_validate_execute(flags) (flags & MMU_TRANSLATE_VALIDATE_EXECUTE)
+#define mmu_priv_checks_exempt(flags) (flags & MMU_TRANSLATE_PRIV_CHECKS_EXEMPT)
+
+
+#define PAE_CR3_MASK (~0x1fllu)
+#define LEGACY_CR3_MASK (0xffffffff)
+
+#define LEGACY_PTE_PAGE_MASK (0xffffffffllu << 12)
+#define PAE_PTE_PAGE_MASK ((-1llu << 12) & ((1llu << 52) - 1))
+#define PAE_PTE_LARGE_PAGE_MASK ((-1llu << (21)) & ((1llu << 52) - 1))
+#define PAE_PTE_SUPER_PAGE_MASK ((-1llu << (30)) & ((1llu << 52) - 1))
+
+static bool is_user(CPUState *cpu)
+{
+ return false;
+}
+
+
+struct gpt_translation {
+ target_ulong gva;
+ uint64_t gpa;
+ uint64_t pte[5];
+};
+
+static int gpt_top_level(CPUState *cpu, bool pae)
+{
+ if (!pae) {
+ return 2;
+ }
+ if (x86_is_long_mode(cpu)) {
+ if (x86_is_la57(cpu)) {
+ return 5;
+ }
+ return 4;
+ }
+
+ return 3;
+}
+
+static inline int gpt_entry(target_ulong addr, int level, bool pae)
+{
+ int level_shift = pae ? 9 : 10;
+ return (addr >> (level_shift * (level - 1) + 12)) & ((1 << level_shift) - 1);
+}
+
+static inline int pte_size(bool pae)
+{
+ return pae ? 8 : 4;
+}
+
+
+static bool get_pt_entry(CPUState *cpu, struct gpt_translation *pt,
+ int level, bool pae)
+{
+ int index;
+ uint64_t pte = 0;
+ uint64_t page_mask = pae ? PAE_PTE_PAGE_MASK : LEGACY_PTE_PAGE_MASK;
+ uint64_t gpa = pt->pte[level] & page_mask;
+
+ if (level == 3 && !x86_is_long_mode(cpu)) {
+ gpa = pt->pte[level];
+ }
+
+ index = gpt_entry(pt->gva, level, pae);
+ address_space_read(&address_space_memory, gpa + index * pte_size(pae),
+ MEMTXATTRS_UNSPECIFIED, &pte, pte_size(pae));
+
+ pt->pte[level - 1] = pte;
+
+ return true;
+}
+
+/* test page table entry */
+static MMUTranslateResult test_pt_entry(CPUState *cpu, struct gpt_translation *pt,
+ int level, int *largeness, bool pae, MMUTranslateFlags flags)
+{
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+ uint64_t pte = pt->pte[level];
+
+ if (!pte_present(pte)) {
+ return MMU_TRANSLATE_PAGE_NOT_MAPPED;
+ }
+
+ if (pae && !x86_is_long_mode(cpu) && 2 == level) {
+ goto exit;
+ }
+
+ if (level && pte_large_page(pte)) {
+ *largeness = level;
+ }
+
+ uint32_t cr0 = env->cr[0];
+ /* check protection */
+ if (cr0 & CR0_WP_MASK) {
+ if (mmu_validate_write(flags) && !pte_write_access(pte)) {
+ return MMU_TRANSLATE_PRIV_VIOLATION;
+ }
+ }
+
+ if (!mmu_priv_checks_exempt(flags)) {
+ if (is_user(cpu) && !pte_user_access(pte)) {
+ return MMU_TRANSLATE_PRIV_VIOLATION;
+ }
+ }
+
+ if (pae && mmu_validate_execute(flags) && !pte_exec_access(pte)) {
+ return MMU_TRANSLATE_PRIV_VIOLATION;
+ }
+
+exit:
+ /* TODO: check reserved bits */
+ return MMU_TRANSLATE_SUCCESS;
+}
+
+static inline uint64_t pse_pte_to_page(uint64_t pte)
+{
+ return ((pte & 0x1fe000) << 19) | (pte & 0xffc00000);
+}
+
+static inline uint64_t large_page_gpa(struct gpt_translation *pt, bool pae,
+ int largeness)
+{
+ VM_PANIC_ON(!pte_large_page(pt->pte[largeness]))
+
+ /* 1Gib large page */
+ if (pae && largeness == 2) {
+ return (pt->pte[2] & PAE_PTE_SUPER_PAGE_MASK) | (pt->gva & 0x3fffffff);
+ }
+
+ VM_PANIC_ON(largeness != 1)
+
+ /* 2Mb large page */
+ if (pae) {
+ return (pt->pte[1] & PAE_PTE_LARGE_PAGE_MASK) | (pt->gva & 0x1fffff);
+ }
+
+ /* 4Mb large page */
+ return pse_pte_to_page(pt->pte[1]) | (pt->gva & 0x3fffff);
+}
+
+
+
+static MMUTranslateResult walk_gpt(CPUState *cpu, target_ulong addr, MMUTranslateFlags flags,
+ struct gpt_translation *pt, bool pae)
+{
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+ int top_level, level;
+ int largeness = 0;
+ target_ulong cr3 = env->cr[3];
+ uint64_t page_mask = pae ? PAE_PTE_PAGE_MASK : LEGACY_PTE_PAGE_MASK;
+ MMUTranslateResult res;
+
+ memset(pt, 0, sizeof(*pt));
+ top_level = gpt_top_level(cpu, pae);
+
+ pt->pte[top_level] = pae ? (cr3 & PAE_CR3_MASK) : (cr3 & LEGACY_CR3_MASK);
+ pt->gva = addr;
+
+ for (level = top_level; level > 0; level--) {
+ get_pt_entry(cpu, pt, level, pae);
+ res = test_pt_entry(cpu, pt, level - 1, &largeness, pae, flags);
+
+ if (res) {
+ return res;
+ }
+
+ if (largeness) {
+ break;
+ }
+ }
+
+ if (!largeness) {
+ pt->gpa = (pt->pte[0] & page_mask) | (pt->gva & 0xfff);
+ } else {
+ pt->gpa = large_page_gpa(pt, pae, largeness);
+ }
+
+ return res;
+}
+
+
+MMUTranslateResult mmu_gva_to_gpa(CPUState *cpu, target_ulong gva, uint64_t *gpa, MMUTranslateFlags flags)
+{
+ if (emul_ops->mmu_gva_to_gpa) {
+ return emul_ops->mmu_gva_to_gpa(cpu, gva, gpa, flags);
+ }
+
+ MMUTranslateResult res;
+ struct gpt_translation pt;
+
+ if (!x86_is_paging_mode(cpu)) {
+ *gpa = gva;
+ return MMU_TRANSLATE_SUCCESS;
+ }
+
+ res = walk_gpt(cpu, gva, flags, &pt, x86_is_pae_enabled(cpu));
+ if (res == MMU_TRANSLATE_SUCCESS) {
+ *gpa = pt.gpa;
+ }
+
+ return res;
+}
+
+static int translate_res_to_error_code(MMUTranslateResult res, bool is_write, bool is_user)
+{
+ int error_code = 0;
+ if (is_user) {
+ error_code |= PG_ERROR_U_MASK;
+ }
+ if (res != MMU_TRANSLATE_PAGE_NOT_MAPPED) {
+ error_code |= PG_ERROR_P_MASK;
+ }
+ if (is_write && res == MMU_TRANSLATE_PRIV_VIOLATION) {
+ error_code |= PG_ERROR_W_MASK;
+ }
+ if (res == MMU_TRANSLATE_INVALID_PT_FLAGS) {
+ error_code |= PG_ERROR_RSVD_MASK;
+ }
+ return error_code;
+}
+
+static MMUTranslateResult x86_write_mem_ex(CPUState *cpu, void *data, target_ulong gva, int bytes, bool priv_check_exempt)
+{
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+
+ MMUTranslateResult translate_res = MMU_TRANSLATE_SUCCESS;
+ MemTxResult mem_tx_res;
+ uint64_t gpa;
+
+ while (bytes > 0) {
+ /* copy page */
+ int copy = MIN(bytes, 0x1000 - (gva & 0xfff));
+
+ translate_res = mmu_gva_to_gpa(cpu, gva, &gpa, MMU_TRANSLATE_VALIDATE_WRITE | (priv_check_exempt ? MMU_TRANSLATE_PRIV_CHECKS_EXEMPT : 0));
+ if (translate_res) {
+ int error_code = translate_res_to_error_code(translate_res, true, is_user(cpu));
+ env->cr[2] = gva;
+ x86_emul_raise_exception(env, EXCP0E_PAGE, error_code);
+ return translate_res;
+ }
+
+ mem_tx_res = address_space_write(&address_space_memory, gpa,
+ MEMTXATTRS_UNSPECIFIED, data, copy);
+
+ if (mem_tx_res == MEMTX_DECODE_ERROR) {
+ warn_report("write to unmapped mmio region gpa=0x%" PRIx64 " size=%i", gpa, bytes);
+ return MMU_TRANSLATE_GPA_UNMAPPED;
+ } else if (mem_tx_res == MEMTX_ACCESS_ERROR) {
+ return MMU_TRANSLATE_GPA_NO_WRITE_ACCESS;
+ }
+
+ bytes -= copy;
+ gva += copy;
+ data += copy;
+ }
+ return translate_res;
+}
+
+MMUTranslateResult x86_write_mem(CPUState *cpu, void *data, target_ulong gva, int bytes)
+{
+ return x86_write_mem_ex(cpu, data, gva, bytes, false);
+}
+
+MMUTranslateResult x86_write_mem_priv(CPUState *cpu, void *data, target_ulong gva, int bytes)
+{
+ return x86_write_mem_ex(cpu, data, gva, bytes, true);
+}
+
+static MMUTranslateResult x86_read_mem_ex(CPUState *cpu, void *data, target_ulong gva, int bytes, bool priv_check_exempt)
+{
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+
+ MMUTranslateResult translate_res = MMU_TRANSLATE_SUCCESS;
+ MemTxResult mem_tx_res;
+ uint64_t gpa;
+
+ while (bytes > 0) {
+ /* copy page */
+ int copy = MIN(bytes, 0x1000 - (gva & 0xfff));
+
+ translate_res = mmu_gva_to_gpa(cpu, gva, &gpa, priv_check_exempt ? MMU_TRANSLATE_PRIV_CHECKS_EXEMPT : 0);
+ if (translate_res) {
+ int error_code = translate_res_to_error_code(translate_res, false, is_user(cpu));
+ env->cr[2] = gva;
+ x86_emul_raise_exception(env, EXCP0E_PAGE, error_code);
+ return translate_res;
+ }
+ mem_tx_res = address_space_read(&address_space_memory, gpa, MEMTXATTRS_UNSPECIFIED,
+ data, copy);
+
+ if (mem_tx_res == MEMTX_DECODE_ERROR) {
+ warn_report("read from unmapped mmio region gpa=0x%" PRIx64 " size=%i", gpa, bytes);
+ return MMU_TRANSLATE_GPA_UNMAPPED;
+ } else if (mem_tx_res == MEMTX_ACCESS_ERROR) {
+ return MMU_TRANSLATE_GPA_NO_READ_ACCESS;
+ }
+
+ bytes -= copy;
+ gva += copy;
+ data += copy;
+ }
+ return translate_res;
+}
+
+MMUTranslateResult x86_read_mem(CPUState *cpu, void *data, target_ulong gva, int bytes)
+{
+ return x86_read_mem_ex(cpu, data, gva, bytes, false);
+}
+
+MMUTranslateResult x86_read_mem_priv(CPUState *cpu, void *data, target_ulong gva, int bytes)
+{
+ return x86_read_mem_ex(cpu, data, gva, bytes, true);
+}
diff --git a/target/i386/hvf/x86_mmu.h b/target/i386/emulate/x86_mmu.h
index 9447ae0..190bd27 100644
--- a/target/i386/hvf/x86_mmu.h
+++ b/target/i386/emulate/x86_mmu.h
@@ -30,15 +30,30 @@
#define PT_GLOBAL (1 << 8)
#define PT_NX (1llu << 63)
-/* error codes */
-#define MMU_PAGE_PT (1 << 0)
-#define MMU_PAGE_WT (1 << 1)
-#define MMU_PAGE_US (1 << 2)
-#define MMU_PAGE_NX (1 << 3)
+typedef enum MMUTranslateFlags {
+ MMU_TRANSLATE_VALIDATE_WRITE = BIT(1),
+ MMU_TRANSLATE_VALIDATE_EXECUTE = BIT(2),
+ MMU_TRANSLATE_PRIV_CHECKS_EXEMPT = BIT(3)
+} MMUTranslateFlags;
-bool mmu_gva_to_gpa(CPUState *cpu, target_ulong gva, uint64_t *gpa);
+typedef enum MMUTranslateResult {
+ MMU_TRANSLATE_SUCCESS = 0,
+ MMU_TRANSLATE_PAGE_NOT_MAPPED = 1,
+ MMU_TRANSLATE_PRIV_VIOLATION = 2,
+ MMU_TRANSLATE_INVALID_PT_FLAGS = 3,
+ MMU_TRANSLATE_GPA_UNMAPPED = 4,
+ MMU_TRANSLATE_GPA_NO_READ_ACCESS = 5,
+ MMU_TRANSLATE_GPA_NO_WRITE_ACCESS = 6
+} MMUTranslateResult;
+
+MMUTranslateResult mmu_gva_to_gpa(CPUState *cpu, target_ulong gva, uint64_t *gpa, MMUTranslateFlags flags);
+
+/* Thin wrappers x86_write_mem_ex/x86_read_mem_ex for code readability */
+MMUTranslateResult x86_write_mem(CPUState *cpu, void *data, target_ulong gva, int bytes);
+MMUTranslateResult x86_read_mem(CPUState *cpu, void *data, target_ulong gva, int bytes);
+
+MMUTranslateResult x86_write_mem_priv(CPUState *cpu, void *data, target_ulong gva, int bytes);
+MMUTranslateResult x86_read_mem_priv(CPUState *cpu, void *data, target_ulong gva, int bytes);
-void vmx_write_mem(CPUState *cpu, target_ulong gva, void *data, int bytes);
-void vmx_read_mem(CPUState *cpu, void *data, target_ulong gva, int bytes);
#endif /* X86_MMU_H */
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index ce54020..a70f846 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -62,7 +62,7 @@
#include "emulate/x86.h"
#include "x86_descr.h"
#include "emulate/x86_flags.h"
-#include "x86_mmu.h"
+#include "emulate/x86_mmu.h"
#include "emulate/x86_decode.h"
#include "emulate/x86_emu.h"
#include "x86_task.h"
@@ -252,19 +252,7 @@ static void hvf_read_segment_descriptor(CPUState *s, struct x86_segment_descript
vmx_segment_to_x86_descriptor(s, &vmx_segment, desc);
}
-static void hvf_read_mem(CPUState *cpu, void *data, target_ulong gva, int bytes)
-{
- vmx_read_mem(cpu, data, gva, bytes);
-}
-
-static void hvf_write_mem(CPUState *cpu, void *data, target_ulong gva, int bytes)
-{
- vmx_write_mem(cpu, gva, data, bytes);
-}
-
static const struct x86_emul_ops hvf_x86_emul_ops = {
- .read_mem = hvf_read_mem,
- .write_mem = hvf_write_mem,
.read_segment_descriptor = hvf_read_segment_descriptor,
.handle_io = hvf_handle_io,
.simulate_rdmsr = hvf_simulate_rdmsr,
@@ -482,6 +470,26 @@ static void hvf_cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
}
}
+static void hvf_load_crs(CPUState *cs)
+{
+ X86CPU *x86_cpu = X86_CPU(cs);
+ CPUX86State *env = &x86_cpu->env;
+
+ env->cr[0] = rvmcs(cs->accel->fd, VMCS_GUEST_CR0);
+ env->cr[3] = rvmcs(cs->accel->fd, VMCS_GUEST_CR3);
+ env->cr[2] = rreg(cs->accel->fd, HV_X86_CR2);
+}
+
+static void hvf_save_crs(CPUState *cs)
+{
+ X86CPU *x86_cpu = X86_CPU(cs);
+ CPUX86State *env = &x86_cpu->env;
+
+ wvmcs(cs->accel->fd, VMCS_GUEST_CR0, env->cr[0]);
+ wvmcs(cs->accel->fd, VMCS_GUEST_CR3, env->cr[3]);
+ wreg(cs->accel->fd, HV_X86_CR2, env->cr[2]);
+}
+
void hvf_load_regs(CPUState *cs)
{
X86CPU *cpu = X86_CPU(cs);
@@ -794,9 +802,11 @@ static int hvf_handle_vmexit(CPUState *cpu)
struct x86_decode decode;
hvf_load_regs(cpu);
+ hvf_load_crs(cpu);
decode_instruction(env, &decode);
exec_instruction(env, &decode);
hvf_store_regs(cpu);
+ hvf_save_crs(cpu);
break;
}
break;
@@ -835,10 +845,12 @@ static int hvf_handle_vmexit(CPUState *cpu)
}
hvf_load_regs(cpu);
+ hvf_load_crs(cpu);
decode_instruction(env, &decode);
assert(ins_len == decode.len);
exec_instruction(env, &decode);
hvf_store_regs(cpu);
+ hvf_save_crs(cpu);
break;
}
@@ -940,9 +952,11 @@ static int hvf_handle_vmexit(CPUState *cpu)
struct x86_decode decode;
hvf_load_regs(cpu);
+ hvf_load_crs(cpu);
decode_instruction(env, &decode);
exec_instruction(env, &decode);
hvf_store_regs(cpu);
+ hvf_save_crs(cpu);
break;
}
case EXIT_REASON_TPR: {
diff --git a/target/i386/hvf/meson.build b/target/i386/hvf/meson.build
index 519d190..22bf886 100644
--- a/target/i386/hvf/meson.build
+++ b/target/i386/hvf/meson.build
@@ -3,7 +3,6 @@ i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: files(
'x86.c',
'x86_cpuid.c',
'x86_descr.c',
- 'x86_mmu.c',
'x86_task.c',
'x86hvf.c',
'hvf-cpu.c',
diff --git a/target/i386/hvf/x86.c b/target/i386/hvf/x86.c
index 5c75ec9..7fe710a 100644
--- a/target/i386/hvf/x86.c
+++ b/target/i386/hvf/x86.c
@@ -23,7 +23,7 @@
#include "emulate/x86_emu.h"
#include "vmcs.h"
#include "vmx.h"
-#include "x86_mmu.h"
+#include "emulate/x86_mmu.h"
#include "x86_descr.h"
/* static uint32_t x86_segment_access_rights(struct x86_segment_descriptor *var)
@@ -72,7 +72,7 @@ bool x86_read_segment_descriptor(CPUState *cpu,
return false;
}
- vmx_read_mem(cpu, desc, base + sel.index * 8, sizeof(*desc));
+ x86_read_mem_priv(cpu, desc, base + sel.index * 8, sizeof(*desc));
return true;
}
@@ -95,7 +95,7 @@ bool x86_write_segment_descriptor(CPUState *cpu,
printf("%s: gdt limit\n", __func__);
return false;
}
- vmx_write_mem(cpu, base + sel.index * 8, desc, sizeof(*desc));
+ x86_write_mem_priv(cpu, desc, base + sel.index * 8, sizeof(*desc));
return true;
}
@@ -111,7 +111,7 @@ bool x86_read_call_gate(CPUState *cpu, struct x86_call_gate *idt_desc,
return false;
}
- vmx_read_mem(cpu, idt_desc, base + gate * 8, sizeof(*idt_desc));
+ x86_read_mem_priv(cpu, idt_desc, base + gate * 8, sizeof(*idt_desc));
return true;
}
@@ -138,6 +138,11 @@ bool x86_is_long_mode(CPUState *cpu)
return rvmcs(cpu->accel->fd, VMCS_GUEST_IA32_EFER) & MSR_EFER_LMA;
}
+bool x86_is_la57(CPUState *cpu)
+{
+ return false;
+}
+
bool x86_is_long64_mode(CPUState *cpu)
{
struct vmx_segment desc;
diff --git a/target/i386/hvf/x86_mmu.c b/target/i386/hvf/x86_mmu.c
deleted file mode 100644
index afc5c17..0000000
--- a/target/i386/hvf/x86_mmu.c
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Copyright (C) 2016 Veertu Inc,
- * Copyright (C) 2017 Google Inc,
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "panic.h"
-#include "cpu.h"
-#include "emulate/x86.h"
-#include "x86_mmu.h"
-#include "vmcs.h"
-#include "vmx.h"
-
-#define pte_present(pte) (pte & PT_PRESENT)
-#define pte_write_access(pte) (pte & PT_WRITE)
-#define pte_user_access(pte) (pte & PT_USER)
-#define pte_exec_access(pte) (!(pte & PT_NX))
-
-#define pte_large_page(pte) (pte & PT_PS)
-#define pte_global_access(pte) (pte & PT_GLOBAL)
-
-#define PAE_CR3_MASK (~0x1fllu)
-#define LEGACY_CR3_MASK (0xffffffff)
-
-#define LEGACY_PTE_PAGE_MASK (0xffffffffllu << 12)
-#define PAE_PTE_PAGE_MASK ((-1llu << 12) & ((1llu << 52) - 1))
-#define PAE_PTE_LARGE_PAGE_MASK ((-1llu << (21)) & ((1llu << 52) - 1))
-#define PAE_PTE_SUPER_PAGE_MASK ((-1llu << (30)) & ((1llu << 52) - 1))
-
-struct gpt_translation {
- target_ulong gva;
- uint64_t gpa;
- int err_code;
- uint64_t pte[5];
- bool write_access;
- bool user_access;
- bool exec_access;
-};
-
-static int gpt_top_level(CPUState *cpu, bool pae)
-{
- if (!pae) {
- return 2;
- }
- if (x86_is_long_mode(cpu)) {
- return 4;
- }
-
- return 3;
-}
-
-static inline int gpt_entry(target_ulong addr, int level, bool pae)
-{
- int level_shift = pae ? 9 : 10;
- return (addr >> (level_shift * (level - 1) + 12)) & ((1 << level_shift) - 1);
-}
-
-static inline int pte_size(bool pae)
-{
- return pae ? 8 : 4;
-}
-
-
-static bool get_pt_entry(CPUState *cpu, struct gpt_translation *pt,
- int level, bool pae)
-{
- int index;
- uint64_t pte = 0;
- uint64_t page_mask = pae ? PAE_PTE_PAGE_MASK : LEGACY_PTE_PAGE_MASK;
- uint64_t gpa = pt->pte[level] & page_mask;
-
- if (level == 3 && !x86_is_long_mode(cpu)) {
- gpa = pt->pte[level];
- }
-
- index = gpt_entry(pt->gva, level, pae);
- address_space_read(&address_space_memory, gpa + index * pte_size(pae),
- MEMTXATTRS_UNSPECIFIED, &pte, pte_size(pae));
-
- pt->pte[level - 1] = pte;
-
- return true;
-}
-
-/* test page table entry */
-static bool test_pt_entry(CPUState *cpu, struct gpt_translation *pt,
- int level, int *largeness, bool pae)
-{
- uint64_t pte = pt->pte[level];
-
- if (pt->write_access) {
- pt->err_code |= MMU_PAGE_WT;
- }
- if (pt->user_access) {
- pt->err_code |= MMU_PAGE_US;
- }
- if (pt->exec_access) {
- pt->err_code |= MMU_PAGE_NX;
- }
-
- if (!pte_present(pte)) {
- return false;
- }
-
- if (pae && !x86_is_long_mode(cpu) && 2 == level) {
- goto exit;
- }
-
- if (level && pte_large_page(pte)) {
- pt->err_code |= MMU_PAGE_PT;
- *largeness = level;
- }
- if (!level) {
- pt->err_code |= MMU_PAGE_PT;
- }
-
- uint32_t cr0 = rvmcs(cpu->accel->fd, VMCS_GUEST_CR0);
- /* check protection */
- if (cr0 & CR0_WP_MASK) {
- if (pt->write_access && !pte_write_access(pte)) {
- return false;
- }
- }
-
- if (pt->user_access && !pte_user_access(pte)) {
- return false;
- }
-
- if (pae && pt->exec_access && !pte_exec_access(pte)) {
- return false;
- }
-
-exit:
- /* TODO: check reserved bits */
- return true;
-}
-
-static inline uint64_t pse_pte_to_page(uint64_t pte)
-{
- return ((pte & 0x1fe000) << 19) | (pte & 0xffc00000);
-}
-
-static inline uint64_t large_page_gpa(struct gpt_translation *pt, bool pae,
- int largeness)
-{
- VM_PANIC_ON(!pte_large_page(pt->pte[largeness]))
-
- /* 1Gib large page */
- if (pae && largeness == 2) {
- return (pt->pte[2] & PAE_PTE_SUPER_PAGE_MASK) | (pt->gva & 0x3fffffff);
- }
-
- VM_PANIC_ON(largeness != 1)
-
- /* 2Mb large page */
- if (pae) {
- return (pt->pte[1] & PAE_PTE_LARGE_PAGE_MASK) | (pt->gva & 0x1fffff);
- }
-
- /* 4Mb large page */
- return pse_pte_to_page(pt->pte[1]) | (pt->gva & 0x3fffff);
-}
-
-
-
-static bool walk_gpt(CPUState *cpu, target_ulong addr, int err_code,
- struct gpt_translation *pt, bool pae)
-{
- int top_level, level;
- int largeness = 0;
- target_ulong cr3 = rvmcs(cpu->accel->fd, VMCS_GUEST_CR3);
- uint64_t page_mask = pae ? PAE_PTE_PAGE_MASK : LEGACY_PTE_PAGE_MASK;
-
- memset(pt, 0, sizeof(*pt));
- top_level = gpt_top_level(cpu, pae);
-
- pt->pte[top_level] = pae ? (cr3 & PAE_CR3_MASK) : (cr3 & LEGACY_CR3_MASK);
- pt->gva = addr;
- pt->user_access = (err_code & MMU_PAGE_US);
- pt->write_access = (err_code & MMU_PAGE_WT);
- pt->exec_access = (err_code & MMU_PAGE_NX);
-
- for (level = top_level; level > 0; level--) {
- get_pt_entry(cpu, pt, level, pae);
-
- if (!test_pt_entry(cpu, pt, level - 1, &largeness, pae)) {
- return false;
- }
-
- if (largeness) {
- break;
- }
- }
-
- if (!largeness) {
- pt->gpa = (pt->pte[0] & page_mask) | (pt->gva & 0xfff);
- } else {
- pt->gpa = large_page_gpa(pt, pae, largeness);
- }
-
- return true;
-}
-
-
-bool mmu_gva_to_gpa(CPUState *cpu, target_ulong gva, uint64_t *gpa)
-{
- bool res;
- struct gpt_translation pt;
- int err_code = 0;
-
- if (!x86_is_paging_mode(cpu)) {
- *gpa = gva;
- return true;
- }
-
- res = walk_gpt(cpu, gva, err_code, &pt, x86_is_pae_enabled(cpu));
- if (res) {
- *gpa = pt.gpa;
- return true;
- }
-
- return false;
-}
-
-void vmx_write_mem(CPUState *cpu, target_ulong gva, void *data, int bytes)
-{
- uint64_t gpa;
-
- while (bytes > 0) {
- /* copy page */
- int copy = MIN(bytes, 0x1000 - (gva & 0xfff));
-
- if (!mmu_gva_to_gpa(cpu, gva, &gpa)) {
- VM_PANIC_EX("%s: mmu_gva_to_gpa %llx failed\n", __func__, gva);
- } else {
- address_space_write(&address_space_memory, gpa,
- MEMTXATTRS_UNSPECIFIED, data, copy);
- }
-
- bytes -= copy;
- gva += copy;
- data += copy;
- }
-}
-
-void vmx_read_mem(CPUState *cpu, void *data, target_ulong gva, int bytes)
-{
- uint64_t gpa;
-
- while (bytes > 0) {
- /* copy page */
- int copy = MIN(bytes, 0x1000 - (gva & 0xfff));
-
- if (!mmu_gva_to_gpa(cpu, gva, &gpa)) {
- VM_PANIC_EX("%s: mmu_gva_to_gpa %llx failed\n", __func__, gva);
- }
- address_space_read(&address_space_memory, gpa, MEMTXATTRS_UNSPECIFIED,
- data, copy);
-
- bytes -= copy;
- gva += copy;
- data += copy;
- }
-}
diff --git a/target/i386/hvf/x86_task.c b/target/i386/hvf/x86_task.c
index bdf8b51..64e30e9 100644
--- a/target/i386/hvf/x86_task.c
+++ b/target/i386/hvf/x86_task.c
@@ -16,7 +16,7 @@
#include "vmx.h"
#include "emulate/x86.h"
#include "x86_descr.h"
-#include "x86_mmu.h"
+#include "emulate/x86_mmu.h"
#include "emulate/x86_decode.h"
#include "emulate/x86_emu.h"
#include "x86_task.h"
@@ -93,16 +93,16 @@ static int task_switch_32(CPUState *cpu, x86_segment_selector tss_sel, x86_segme
uint32_t eip_offset = offsetof(struct x86_tss_segment32, eip);
uint32_t ldt_sel_offset = offsetof(struct x86_tss_segment32, ldt);
- vmx_read_mem(cpu, &tss_seg, old_tss_base, sizeof(tss_seg));
+ x86_read_mem_priv(cpu, &tss_seg, old_tss_base, sizeof(tss_seg));
save_state_to_tss32(cpu, &tss_seg);
- vmx_write_mem(cpu, old_tss_base + eip_offset, &tss_seg.eip, ldt_sel_offset - eip_offset);
- vmx_read_mem(cpu, &tss_seg, new_tss_base, sizeof(tss_seg));
+ x86_write_mem_priv(cpu, &tss_seg.eip, old_tss_base + eip_offset, ldt_sel_offset - eip_offset);
+ x86_read_mem_priv(cpu, &tss_seg, new_tss_base, sizeof(tss_seg));
if (old_tss_sel.sel != 0xffff) {
tss_seg.prev_tss = old_tss_sel.sel;
- vmx_write_mem(cpu, new_tss_base, &tss_seg.prev_tss, sizeof(tss_seg.prev_tss));
+ x86_write_mem_priv(cpu, &tss_seg.prev_tss, new_tss_base, sizeof(tss_seg.prev_tss));
}
load_state_from_tss32(cpu, &tss_seg);
return 0;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 3b66ec8..27b1b84 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -112,6 +112,11 @@ typedef struct {
static void kvm_init_msrs(X86CPU *cpu);
static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
QEMUWRMSRHandler *wrmsr);
+static int unregister_smram_listener(NotifierWithReturn *notifier,
+ void *data, Error** errp);
+NotifierWithReturn kvm_vmfd_change_notifier = {
+ .notify = unregister_smram_listener,
+};
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
KVM_CAP_INFO(SET_TSS_ADDR),
@@ -1756,7 +1761,7 @@ static int hyperv_init_vcpu(X86CPU *cpu)
return ret;
}
- if (!cpu->hyperv_synic_kvm_only) {
+ if (!cpu->hyperv_synic_kvm_only && !hyperv_is_synic_enabled()) {
ret = hyperv_x86_synic_add(cpu);
if (ret < 0) {
error_report("failed to create HyperV SynIC: %s",
@@ -2885,6 +2890,17 @@ static void register_smram_listener(Notifier *n, void *unused)
}
}
+static int unregister_smram_listener(NotifierWithReturn *notifier,
+ void *data, Error** errp)
+{
+ if (!((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+
+ memory_listener_unregister(&smram_listener.listener);
+ return 0;
+}
+
/* It should only be called in cpu's hotplug callback */
void kvm_smm_cpu_address_space_init(X86CPU *cpu)
{
@@ -3389,11 +3405,65 @@ static int kvm_vm_enable_energy_msrs(KVMState *s)
return 0;
}
+int kvm_arch_on_vmfd_change(MachineState *ms, KVMState *s)
+{
+ int ret;
+
+ ret = kvm_arch_init(ms, s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
+ X86MachineState *x86ms = X86_MACHINE(ms);
+ /*
+ * For confidential guests, reload bios ROM if IGVM is not specified.
+ * If an IGVM file is specified then the firmware must be provided
+ * in the IGVM file.
+ */
+ if (ms->cgs && !x86ms->igvm) {
+ x86_bios_rom_reload(x86ms);
+ }
+ if (x86_machine_is_smm_enabled(x86ms)) {
+ memory_listener_register(&smram_listener.listener,
+ &smram_address_space);
+ }
+ kvm_set_max_apic_id(x86ms->apic_id_limit);
+ }
+
+ trace_kvm_arch_on_vmfd_change();
+ return 0;
+}
+
+bool kvm_arch_supports_vmfd_change(void)
+{
+ return true;
+}
+
+static int xen_init(MachineState *ms, KVMState *s)
+{
+#ifdef CONFIG_XEN_EMU
+ int ret = 0;
+ if (!object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)) {
+ error_report("kvm: Xen support only available in PC machine");
+ return -ENOTSUP;
+ }
+ /* hyperv_enabled() doesn't work yet. */
+ uint32_t msr = XEN_HYPERCALL_MSR;
+ ret = kvm_xen_init(s, msr);
+ return ret;
+#else
+ error_report("kvm: Xen support not enabled in qemu");
+ return -ENOTSUP;
+#endif
+}
+
int kvm_arch_init(MachineState *ms, KVMState *s)
{
int ret;
struct utsname utsname;
Error *local_err = NULL;
+ static bool first = true;
/*
* Initialize confidential guest (SEV/TDX) context, if required
@@ -3422,21 +3492,10 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
}
if (s->xen_version) {
-#ifdef CONFIG_XEN_EMU
- if (!object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)) {
- error_report("kvm: Xen support only available in PC machine");
- return -ENOTSUP;
- }
- /* hyperv_enabled() doesn't work yet. */
- uint32_t msr = XEN_HYPERCALL_MSR;
- ret = kvm_xen_init(s, msr);
+ ret = xen_init(ms, s);
if (ret < 0) {
return ret;
}
-#else
- error_report("kvm: Xen support not enabled in qemu");
- return -ENOTSUP;
-#endif
}
ret = kvm_get_supported_msrs(s);
@@ -3463,16 +3522,17 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
return ret;
}
- /* Tell fw_cfg to notify the BIOS to reserve the range. */
- e820_add_entry(KVM_IDENTITY_BASE, 0x4000, E820_RESERVED);
-
+ if (first) {
+ /* Tell fw_cfg to notify the BIOS to reserve the range. */
+ e820_add_entry(KVM_IDENTITY_BASE, 0x4000, E820_RESERVED);
+ }
ret = kvm_vm_set_nr_mmu_pages(s);
if (ret < 0) {
return ret;
}
if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
- x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
+ x86_machine_is_smm_enabled(X86_MACHINE(ms)) && first) {
smram_machine_done.notify = register_smram_listener;
qemu_add_machine_init_done_notifier(&smram_machine_done);
}
@@ -3519,16 +3579,46 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
return ret;
}
- ret = kvm_msr_energy_thread_init(s, ms);
- if (ret < 0) {
- error_report("kvm : error RAPL feature requirement not met");
- return ret;
+ if (first) {
+ ret = kvm_msr_energy_thread_init(s, ms);
+ if (ret < 0) {
+ error_report("kvm : "
+ "error RAPL feature requirement not met");
+ return ret;
+ }
}
}
}
pmu_cap = kvm_check_extension(s, KVM_CAP_PMU_CAPABILITY);
+ if (first) {
+ kvm_vmfd_add_change_notifier(&kvm_vmfd_change_notifier);
+ }
+
+ /*
+ * Most x86 CPUs in current use have self-snoop, so honoring guest PAT is
+ * preferable. As well, the bochs video driver bug which motivated making
+ * this a default-enabled quirk in KVM was fixed long ago.
+ */
+ if (s->honor_guest_pat != ON_OFF_AUTO_OFF) {
+ ret = kvm_check_extension(s, KVM_CAP_DISABLE_QUIRKS2);
+ if (ret & KVM_X86_QUIRK_IGNORE_GUEST_PAT) {
+ ret = kvm_vm_enable_cap(s, KVM_CAP_DISABLE_QUIRKS2, 0,
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT);
+ if (ret < 0) {
+ error_report("failed to disable KVM_X86_QUIRK_IGNORE_GUEST_PAT");
+ return ret;
+ }
+ } else {
+ if (s->honor_guest_pat == ON_OFF_AUTO_ON) {
+ error_report("KVM does not support disabling ignore-guest-PAT quirk");
+ return -EINVAL;
+ }
+ }
+ }
+
+ first = false;
return 0;
}
@@ -5501,8 +5591,6 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
}
if (events.smi.pending) {
cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
- } else {
- cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
}
if (events.smi.smm_inside_nmi) {
env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
@@ -5511,8 +5599,6 @@ static int kvm_get_vcpu_events(X86CPU *cpu)
}
if (events.smi.latched_init) {
cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
- } else {
- cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
}
}
@@ -6277,27 +6363,33 @@ static int kvm_install_msr_filters(KVMState *s)
static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
QEMUWRMSRHandler *wrmsr)
{
- int i, ret;
+ int i, ret = 0;
for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
- if (!msr_handlers[i].msr) {
+ if (msr_handlers[i].msr == msr) {
+ break;
+ } else if (!msr_handlers[i].msr) {
msr_handlers[i] = (KVMMSRHandlers) {
.msr = msr,
.rdmsr = rdmsr,
.wrmsr = wrmsr,
};
+ break;
+ }
+ }
- ret = kvm_install_msr_filters(s);
- if (ret) {
- msr_handlers[i] = (KVMMSRHandlers) { };
- return ret;
- }
+ if (i == ARRAY_SIZE(msr_handlers)) {
+ ret = -EINVAL;
+ goto end;
+ }
- return 0;
- }
+ ret = kvm_install_msr_filters(s);
+ if (ret) {
+ msr_handlers[i] = (KVMMSRHandlers) { };
}
- return -EINVAL;
+ end:
+ return ret;
}
static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run)
@@ -6983,6 +7075,24 @@ static void kvm_arch_set_xen_evtchn_max_pirq(Object *obj, Visitor *v,
s->xen_evtchn_max_pirq = value;
}
+static int kvm_arch_get_honor_guest_pat(Object *obj, Error **errp)
+{
+ KVMState *s = KVM_STATE(obj);
+ return s->honor_guest_pat;
+}
+
+static void kvm_arch_set_honor_guest_pat(Object *obj, int value, Error **errp)
+{
+ KVMState *s = KVM_STATE(obj);
+
+ if (s->fd != -1) {
+ error_setg(errp, "Cannot set properties after the accelerator has been initialized");
+ return;
+ }
+
+ s->honor_guest_pat = value;
+}
+
void kvm_arch_accel_class_init(ObjectClass *oc)
{
object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
@@ -7022,6 +7132,14 @@ void kvm_arch_accel_class_init(ObjectClass *oc)
NULL, NULL);
object_class_property_set_description(oc, "xen-evtchn-max-pirq",
"Maximum number of Xen PIRQs");
+
+ object_class_property_add_enum(oc, "honor-guest-pat", "OnOffAuto",
+ &OnOffAuto_lookup,
+ kvm_arch_get_honor_guest_pat,
+ kvm_arch_set_honor_guest_pat);
+ object_class_property_set_description(oc, "honor-guest-pat",
+ "Disable KVM quirk that ignores guest PAT "
+ "memory type settings (default: auto)");
}
void kvm_set_max_apic_id(uint32_t max_apic_id)
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 0161985..4cae99c 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -19,6 +19,7 @@
#include "crypto/hash.h"
#include "system/kvm_int.h"
#include "system/runstate.h"
+#include "system/reset.h"
#include "system/system.h"
#include "system/ramblock.h"
#include "system/address-spaces.h"
@@ -38,6 +39,7 @@
#include "kvm_i386.h"
#include "tdx.h"
#include "tdx-quote-generator.h"
+#include "trace.h"
#include "standard-headers/asm-x86/kvm_para.h"
@@ -295,14 +297,51 @@ static void tdx_post_init_vcpus(void)
}
}
-static void tdx_finalize_vm(Notifier *notifier, void *unused)
+static void tdx_init_fw_mem_region(void)
{
TdxFirmware *tdvf = &tdx_guest->tdvf;
TdxFirmwareEntry *entry;
- RAMBlock *ram_block;
Error *local_err = NULL;
int r;
+ for_each_tdx_fw_entry(tdvf, entry) {
+ struct kvm_tdx_init_mem_region region;
+ uint32_t flags;
+
+ region = (struct kvm_tdx_init_mem_region) {
+ .source_addr = (uintptr_t)entry->mem_ptr,
+ .gpa = entry->address,
+ .nr_pages = entry->size >> 12,
+ };
+
+ flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ?
+ KVM_TDX_MEASURE_MEMORY_REGION : 0;
+
+ do {
+ error_free(local_err);
+ local_err = NULL;
+ r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags,
+ &region, &local_err);
+ } while (r == -EAGAIN || r == -EINTR);
+ if (r < 0) {
+ error_report_err(local_err);
+ exit(1);
+ }
+
+ if (entry->type == TDVF_SECTION_TYPE_TD_HOB ||
+ entry->type == TDVF_SECTION_TYPE_TEMP_MEM) {
+ qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
+ entry->mem_ptr = NULL;
+ }
+ }
+}
+
+static void tdx_finalize_vm(Notifier *notifier, void *unused)
+{
+ TdxFirmware *tdvf = &tdx_guest->tdvf;
+ TdxFirmwareEntry *entry;
+ RAMBlock *ram_block;
+
tdx_init_ram_entries();
for_each_tdx_fw_entry(tdvf, entry) {
@@ -339,37 +378,7 @@ static void tdx_finalize_vm(Notifier *notifier, void *unused)
tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest));
tdx_post_init_vcpus();
-
- for_each_tdx_fw_entry(tdvf, entry) {
- struct kvm_tdx_init_mem_region region;
- uint32_t flags;
-
- region = (struct kvm_tdx_init_mem_region) {
- .source_addr = (uintptr_t)entry->mem_ptr,
- .gpa = entry->address,
- .nr_pages = entry->size >> 12,
- };
-
- flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ?
- KVM_TDX_MEASURE_MEMORY_REGION : 0;
-
- do {
- error_free(local_err);
- local_err = NULL;
- r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags,
- &region, &local_err);
- } while (r == -EAGAIN || r == -EINTR);
- if (r < 0) {
- error_report_err(local_err);
- exit(1);
- }
-
- if (entry->type == TDVF_SECTION_TYPE_TD_HOB ||
- entry->type == TDVF_SECTION_TYPE_TEMP_MEM) {
- qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
- entry->mem_ptr = NULL;
- }
- }
+ tdx_init_fw_mem_region();
/*
* TDVF image has been copied into private region above via
@@ -382,8 +391,48 @@ static void tdx_finalize_vm(Notifier *notifier, void *unused)
CONFIDENTIAL_GUEST_SUPPORT(tdx_guest)->ready = true;
}
-static Notifier tdx_machine_done_notify = {
- .notify = tdx_finalize_vm,
+static void tdx_handle_reset(Object *obj, ResetType type)
+{
+ if (!runstate_is_running() && !phase_check(PHASE_MACHINE_READY)) {
+ return;
+ }
+
+ if (!kvm_enable_hypercall(BIT_ULL(KVM_HC_MAP_GPA_RANGE))) {
+ error_setg(&error_fatal, "KVM_HC_MAP_GPA_RANGE not enabled for guest");
+ }
+
+ tdx_finalize_vm(NULL, NULL);
+ trace_tdx_handle_reset();
+}
+
+/* TDX guest reset will require us to reinitialize some of tdx guest state. */
+static int set_tdx_vm_uninitialized(NotifierWithReturn *notifier,
+ void *data, Error** errp)
+{
+ TdxFirmware *fw = &tdx_guest->tdvf;
+
+ if (!((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+
+ if (tdx_guest->initialized) {
+ tdx_guest->initialized = false;
+ }
+
+ g_free(tdx_guest->ram_entries);
+
+ /*
+ * the firmware entries will be parsed again, see
+ * x86_firmware_configure() -> tdx_parse_tdvf()
+ */
+ fw->entries = 0;
+ g_free(fw->entries);
+
+ return 0;
+}
+
+static NotifierWithReturn tdx_vmfd_change_notifier = {
+ .notify = set_tdx_vm_uninitialized,
};
/*
@@ -731,8 +780,6 @@ static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
*/
kvm_readonly_mem_allowed = false;
- qemu_add_machine_init_done_notifier(&tdx_machine_done_notify);
-
tdx_guest = tdx;
return 0;
}
@@ -1498,6 +1545,7 @@ OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest,
TDX_GUEST,
X86_CONFIDENTIAL_GUEST,
{ TYPE_USER_CREATABLE },
+ { TYPE_RESETTABLE_INTERFACE },
{ NULL })
static void tdx_guest_init(Object *obj)
@@ -1531,20 +1579,39 @@ static void tdx_guest_init(Object *obj)
tdx->event_notify_vector = -1;
tdx->event_notify_apicid = -1;
+ kvm_vmfd_add_change_notifier(&tdx_vmfd_change_notifier);
+ qemu_register_resettable(obj);
}
static void tdx_guest_finalize(Object *obj)
{
}
+static ResettableState *tdx_reset_state(Object *obj)
+{
+ TdxGuest *tdx = TDX_GUEST(obj);
+ return &tdx->reset_state;
+}
+
static void tdx_guest_class_init(ObjectClass *oc, const void *data)
{
ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
+ ResettableClass *rc = RESETTABLE_CLASS(oc);
klass->kvm_init = tdx_kvm_init;
+ klass->can_rebuild_guest_state = true;
x86_klass->kvm_type = tdx_kvm_type;
x86_klass->cpu_instance_init = tdx_cpu_instance_init;
x86_klass->adjust_cpuid_features = tdx_adjust_cpuid_features;
x86_klass->check_features = tdx_check_features;
+
+ /*
+ * the exit phase makes sure sev handles reset after all legacy resets
+ * have taken place (in the hold phase) and IGVM has also properly
+ * set up the boot state.
+ */
+ rc->phases.exit = tdx_handle_reset;
+ rc->get_state = tdx_reset_state;
+
}
diff --git a/target/i386/kvm/tdx.h b/target/i386/kvm/tdx.h
index 1c38faf..264fbe5 100644
--- a/target/i386/kvm/tdx.h
+++ b/target/i386/kvm/tdx.h
@@ -70,6 +70,7 @@ typedef struct TdxGuest {
uint32_t event_notify_vector;
uint32_t event_notify_apicid;
+ ResettableState reset_state;
} TdxGuest;
#ifdef CONFIG_TDX
diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events
index 74a6234..a386234 100644
--- a/target/i386/kvm/trace-events
+++ b/target/i386/kvm/trace-events
@@ -6,6 +6,7 @@ kvm_x86_add_msi_route(int virq) "Adding route entry for virq %d"
kvm_x86_remove_msi_route(int virq) "Removing route entry for virq %d"
kvm_x86_update_msi_routes(int num) "Updated %d MSI routes"
kvm_hc_map_gpa_range(uint64_t gpa, uint64_t size, uint64_t attributes, uint64_t flags) "gpa 0x%" PRIx64 " size 0x%" PRIx64 " attributes 0x%" PRIx64 " flags 0x%" PRIx64
+kvm_arch_on_vmfd_change(void) ""
# xen-emu.c
kvm_xen_hypercall(int cpu, uint8_t cpl, uint64_t input, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t ret) "xen_hypercall: cpu %d cpl %d input %" PRIu64 " a0 0x%" PRIx64 " a1 0x%" PRIx64 " a2 0x%" PRIx64" ret 0x%" PRIx64
@@ -13,3 +14,6 @@ kvm_xen_soft_reset(void) ""
kvm_xen_set_shared_info(uint64_t gfn) "shared info at gfn 0x%" PRIx64
kvm_xen_set_vcpu_attr(int cpu, int type, uint64_t gpa) "vcpu attr cpu %d type %d gpa 0x%" PRIx64
kvm_xen_set_vcpu_callback(int cpu, int vector) "callback vcpu %d vector %d"
+
+# tdx.c
+tdx_handle_reset(void) ""
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 52de019..29364a9 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -44,9 +44,12 @@
#include "xen-compat.h"
+NotifierWithReturn xen_vmfd_change_notifier;
+static uint32_t xen_msr;
static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);
static int vcpuop_stop_singleshot_timer(CPUState *cs);
+static int do_initialize_xen_caps(KVMState *s, uint32_t hypercall_msr);
#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
@@ -54,6 +57,23 @@ static int vcpuop_stop_singleshot_timer(CPUState *cs);
#define hypercall_compat32(longmode) (false)
#endif
+static int xen_handle_vmfd_change(NotifierWithReturn *n,
+ void *data, Error** errp)
+{
+ int ret;
+
+ /* we are not interested in pre vmfd change notification */
+ if (((VmfdChangeNotifier *)data)->pre) {
+ return 0;
+ }
+
+ ret = do_initialize_xen_caps(kvm_state, xen_msr);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
size_t *len, bool is_write)
{
@@ -111,7 +131,7 @@ static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
return kvm_gva_rw(cs, gva, buf, sz, true);
}
-int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
+static int do_initialize_xen_caps(KVMState *s, uint32_t hypercall_msr)
{
const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
@@ -143,6 +163,19 @@ int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
strerror(-ret));
return ret;
}
+ return xen_caps;
+}
+
+int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
+{
+ int xen_caps;
+
+ xen_caps = do_initialize_xen_caps(s, hypercall_msr);
+ if (xen_caps < 0) {
+ return xen_caps;
+ }
+
+ xen_msr = hypercall_msr;
/* If called a second time, don't repeat the rest of the setup. */
if (s->xen_caps) {
@@ -185,6 +218,9 @@ int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
xen_primary_console_reset();
xen_xenstore_reset();
+ xen_vmfd_change_notifier.notify = xen_handle_vmfd_change;
+ kvm_vmfd_add_change_notifier(&xen_vmfd_change_notifier);
+
return 0;
}
diff --git a/target/i386/mshv/meson.build b/target/i386/mshv/meson.build
index 647e5da..49f28d4 100644
--- a/target/i386/mshv/meson.build
+++ b/target/i386/mshv/meson.build
@@ -2,7 +2,7 @@ i386_mshv_ss = ss.source_set()
i386_mshv_ss.add(files(
'mshv-cpu.c',
- 'x86.c',
))
i386_system_ss.add_all(when: 'CONFIG_MSHV', if_true: i386_mshv_ss)
+
diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c
index f190e83..2bc978d 100644
--- a/target/i386/mshv/mshv-cpu.c
+++ b/target/i386/mshv/mshv-cpu.c
@@ -1548,74 +1548,6 @@ int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd)
return 0;
}
-static int guest_mem_read_with_gva(const CPUState *cpu, uint64_t gva,
- uint8_t *data, uintptr_t size,
- bool fetch_instruction)
-{
- int ret;
- uint64_t gpa, flags;
-
- flags = HV_TRANSLATE_GVA_VALIDATE_READ;
- ret = translate_gva(cpu, gva, &gpa, flags);
- if (ret < 0) {
- error_report("failed to translate gva to gpa");
- return -1;
- }
-
- ret = mshv_guest_mem_read(gpa, data, size, false, fetch_instruction);
- if (ret < 0) {
- error_report("failed to read from guest memory");
- return -1;
- }
-
- return 0;
-}
-
-static int guest_mem_write_with_gva(const CPUState *cpu, uint64_t gva,
- const uint8_t *data, uintptr_t size)
-{
- int ret;
- uint64_t gpa, flags;
-
- flags = HV_TRANSLATE_GVA_VALIDATE_WRITE;
- ret = translate_gva(cpu, gva, &gpa, flags);
- if (ret < 0) {
- error_report("failed to translate gva to gpa");
- return -1;
- }
- ret = mshv_guest_mem_write(gpa, data, size, false);
- if (ret < 0) {
- error_report("failed to write to guest memory");
- return -1;
- }
- return 0;
-}
-
-static void write_mem(CPUState *cpu, void *data, target_ulong addr, int bytes)
-{
- if (guest_mem_write_with_gva(cpu, addr, data, bytes) < 0) {
- error_report("failed to write memory");
- abort();
- }
-}
-
-static void fetch_instruction(CPUState *cpu, void *data,
- target_ulong addr, int bytes)
-{
- if (guest_mem_read_with_gva(cpu, addr, data, bytes, true) < 0) {
- error_report("failed to fetch instruction");
- abort();
- }
-}
-
-static void read_mem(CPUState *cpu, void *data, target_ulong addr, int bytes)
-{
- if (guest_mem_read_with_gva(cpu, addr, data, bytes, false) < 0) {
- error_report("failed to read memory");
- abort();
- }
-}
-
static void read_segment_descriptor(CPUState *cpu,
struct x86_segment_descriptor *desc,
enum X86Seg seg_idx)
@@ -1634,9 +1566,6 @@ static void read_segment_descriptor(CPUState *cpu,
}
static const struct x86_emul_ops mshv_x86_emul_ops = {
- .fetch_instruction = fetch_instruction,
- .read_mem = read_mem,
- .write_mem = write_mem,
.read_segment_descriptor = read_segment_descriptor,
};
diff --git a/target/i386/sev.c b/target/i386/sev.c
index acdcb9c..549e624 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -30,8 +30,10 @@
#include "system/kvm.h"
#include "kvm/kvm_i386.h"
#include "sev.h"
+#include "system/cpus.h"
#include "system/system.h"
#include "system/runstate.h"
+#include "system/reset.h"
#include "trace.h"
#include "migration/blocker.h"
#include "qom/object.h"
@@ -86,6 +88,10 @@ typedef struct QEMU_PACKED PaddedSevHashTable {
uint8_t padding[ROUND_UP(sizeof(SevHashTable), 16) - sizeof(SevHashTable)];
} PaddedSevHashTable;
+static void sev_handle_reset(Object *obj, ResetType type);
+
+SevKernelLoaderContext sev_load_ctx = {};
+
QEMU_BUILD_BUG_ON(sizeof(PaddedSevHashTable) % 16 != 0);
#define SEV_INFO_BLOCK_GUID "00f771de-1a7e-4fcb-890e-68c77e2fb44e"
@@ -129,6 +135,7 @@ struct SevCommonState {
uint8_t build_id;
int sev_fd;
SevState state;
+ ResettableState reset_state;
QTAILQ_HEAD(, SevLaunchVmsa) launch_vmsa;
};
@@ -1421,11 +1428,6 @@ sev_launch_finish(SevCommonState *sev_common)
}
sev_set_guest_state(sev_common, SEV_STATE_RUNNING);
-
- /* add migration blocker */
- error_setg(&sev_mig_blocker,
- "SEV: Migration is not implemented");
- migrate_add_blocker(&sev_mig_blocker, &error_fatal);
}
static int snp_launch_update_data(uint64_t gpa, void *hva, size_t len,
@@ -1608,7 +1610,6 @@ static void
sev_snp_launch_finish(SevCommonState *sev_common)
{
int ret, error;
- Error *local_err = NULL;
OvmfSevMetadata *metadata;
SevLaunchUpdateData *data;
SevSnpGuestState *sev_snp = SEV_SNP_GUEST(sev_common);
@@ -1655,15 +1656,6 @@ sev_snp_launch_finish(SevCommonState *sev_common)
kvm_mark_guest_state_protected();
sev_set_guest_state(sev_common, SEV_STATE_RUNNING);
-
- /* add migration blocker */
- error_setg(&sev_mig_blocker,
- "SEV-SNP: Migration is not implemented");
- ret = migrate_add_blocker(&sev_mig_blocker, &local_err);
- if (local_err) {
- error_report_err(local_err);
- exit(1);
- }
}
@@ -1676,6 +1668,16 @@ sev_vm_state_change(void *opaque, bool running, RunState state)
if (running) {
if (!sev_check_state(sev_common, SEV_STATE_RUNNING)) {
klass->launch_finish(sev_common);
+
+ /* add migration blocker */
+ error_setg(&sev_mig_blocker,
+ "SEV: Migration is not implemented");
+ migrate_add_blocker(&sev_mig_blocker, &error_fatal);
+ /*
+ * mark SEV guest as resettable so that we can reinitialize
+ * SEV upon reset.
+ */
+ qemu_register_resettable(OBJECT(sev_common));
}
}
}
@@ -1783,6 +1785,7 @@ static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
uint32_t ebx;
uint32_t host_cbitpos;
struct sev_user_data_status status = {};
+ SevLaunchUpdateData *data, *next_elm;
SevCommonState *sev_common = SEV_COMMON(cgs);
SevCommonStateClass *klass = SEV_COMMON_GET_CLASS(cgs);
X86ConfidentialGuestClass *x86_klass =
@@ -1790,6 +1793,11 @@ static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
sev_common->state = SEV_STATE_UNINIT;
+    /*
+     * Free existing launch update data, if any.
+     * NOTE(review): entries are g_free()d without QTAILQ_REMOVE, so the list
+     * head still references freed memory afterwards — remove each entry (or
+     * re-QTAILQ_INIT the list) to avoid a dangling list on re-init.
+     */
+ QTAILQ_FOREACH_SAFE(data, &launch_update, next, next_elm) {
+ g_free(data);
+ }
+
host_cpuid(0x8000001F, 0, NULL, &ebx, NULL, NULL);
host_cbitpos = ebx & 0x3f;
@@ -1930,8 +1938,9 @@ static int sev_common_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
return -1;
}
- qemu_add_vm_change_state_handler(sev_vm_state_change, sev_common);
-
+ if (!cgs->ready) {
+ qemu_add_vm_change_state_handler(sev_vm_state_change, sev_common);
+ }
cgs->ready = true;
return 0;
@@ -1953,22 +1962,23 @@ static int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
return -1;
}
- /*
- * SEV uses these notifiers to register/pin pages prior to guest use,
- * but SNP relies on guest_memfd for private pages, which has its
- * own internal mechanisms for registering/pinning private memory.
- */
- ram_block_notifier_add(&sev_ram_notifier);
-
- /*
- * The machine done notify event is used for SEV guests to get the
- * measurement of the encrypted images. When SEV-SNP is enabled, the
- * measurement is part of the guest attestation process where it can
- * be collected without any reliance on the VMM. So skip registering
- * the notifier for SNP in favor of using guest attestation instead.
- */
- qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
+ if (!cgs->ready) {
+ /*
+ * SEV uses these notifiers to register/pin pages prior to guest use,
+ * but SNP relies on guest_memfd for private pages, which has its
+ * own internal mechanisms for registering/pinning private memory.
+ */
+ ram_block_notifier_add(&sev_ram_notifier);
+ /*
+ * The machine done notify event is used for SEV guests to get the
+ * measurement of the encrypted images. When SEV-SNP is enabled, the
+ * measurement is part of the guest attestation process where it can
+ * be collected without any reliance on the VMM. So skip registering
+ * the notifier for SNP in favor of using guest attestation instead.
+ */
+ qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
+ }
return 0;
}
@@ -1976,6 +1986,8 @@ static int sev_snp_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
MachineState *ms = MACHINE(qdev_get_machine());
X86MachineState *x86ms = X86_MACHINE(ms);
+ SevCommonState *sev_common = SEV_COMMON(cgs);
+ SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(sev_common);
if (x86ms->smm == ON_OFF_AUTO_AUTO) {
x86ms->smm = ON_OFF_AUTO_OFF;
@@ -1984,9 +1996,48 @@ static int sev_snp_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
return -1;
}
+ /* free existing kernel hashes data if any */
+ g_free(sev_snp_guest->kernel_hashes_data);
+ sev_snp_guest->kernel_hashes_data = NULL;
+
return 0;
}
+/*
+ * Resettable exit-phase handler for SEV guests: rebuild the kernel-loader
+ * hash measurement saved in sev_load_ctx and re-run the launch_finish
+ * sequence so the guest can be reinitialized after a reset.
+ */
+static void sev_handle_reset(Object *obj, ResetType type)
+{
+    SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
+    SevCommonStateClass *klass;
+
+    if (!sev_common) {
+        return;
+    }
+
+    if (!runstate_is_running()) {
+        return;
+    }
+
+    /* Only look up the class once sev_common is known to be valid. */
+    klass = SEV_COMMON_GET_CLASS(sev_common);
+
+    /* Re-apply the hashes captured by sev_add_kernel_loader_hashes(). */
+    sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal);
+    if (sev_es_enabled() && !sev_snp_enabled()) {
+        sev_launch_get_measure(NULL, NULL);
+    }
+    if (!sev_check_state(sev_common, SEV_STATE_RUNNING)) {
+        /* this calls sev_launch_finish() / sev_snp_launch_finish() */
+        klass->launch_finish(sev_common);
+    }
+
+    trace_sev_handle_reset();
+}
+
+/* Resettable::get_state implementation: return the per-object reset state. */
+static ResettableState *sev_reset_state(Object *obj)
+{
+    return &SEV_COMMON(obj)->reset_state;
+}
+
int
sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp)
{
@@ -2465,6 +2516,8 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp)
return false;
}
+ /* save the context here so that it can be re-used when vm is reset */
+ memcpy(&sev_load_ctx, ctx, sizeof(*ctx));
return klass->build_kernel_loader_hashes(sev_common, area, ctx, errp);
}
@@ -2725,8 +2778,16 @@ static void
sev_common_class_init(ObjectClass *oc, const void *data)
{
ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
+ ResettableClass *rc = RESETTABLE_CLASS(oc);
klass->kvm_init = sev_common_kvm_init;
+ /*
+ * the exit phase makes sure sev handles reset after all legacy resets
+ * have taken place (in the hold phase) and IGVM has also properly
+ * set up the boot state.
+ */
+ rc->phases.exit = sev_handle_reset;
+ rc->get_state = sev_reset_state;
object_class_property_add_str(oc, "sev-device",
sev_common_get_sev_device,
@@ -2760,6 +2821,7 @@ sev_common_instance_init(Object *obj)
cgs->set_guest_state = cgs_set_guest_state;
cgs->get_mem_map_entry = cgs_get_mem_map_entry;
cgs->set_guest_policy = cgs_set_guest_policy;
+ cgs->can_rebuild_guest_state = true;
QTAILQ_INIT(&sev_common->launch_vmsa);
}
@@ -2775,6 +2837,7 @@ static const TypeInfo sev_common_info = {
.abstract = true,
.interfaces = (const InterfaceInfo[]) {
{ TYPE_USER_CREATABLE },
+ { TYPE_RESETTABLE_INTERFACE },
{ }
}
};
diff --git a/target/i386/trace-events b/target/i386/trace-events
index 5130167..b320f65 100644
--- a/target/i386/trace-events
+++ b/target/i386/trace-events
@@ -14,3 +14,4 @@ kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data
kvm_sev_snp_launch_start(uint64_t policy, char *gosvw) "policy 0x%" PRIx64 " gosvw %s"
kvm_sev_snp_launch_update(uint64_t src, uint64_t gpa, uint64_t len, const char *type) "src 0x%" PRIx64 " gpa 0x%" PRIx64 " len 0x%" PRIx64 " (%s page)"
kvm_sev_snp_launch_finish(char *id_block, char *id_auth, char *host_data) "id_block %s id_auth %s host_data %s"
+sev_handle_reset(void) ""
diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c
index 8210250..c172e86 100644
--- a/target/i386/whpx/whpx-all.c
+++ b/target/i386/whpx/whpx-all.c
@@ -15,6 +15,7 @@
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "accel/accel-ops.h"
+#include "system/memory.h"
#include "system/whpx.h"
#include "system/cpus.h"
#include "system/runstate.h"
@@ -36,10 +37,16 @@
#include "system/whpx-all.h"
#include "system/whpx-common.h"
+#include "emulate/x86_decode.h"
+#include "emulate/x86_emu.h"
+#include "emulate/x86_flags.h"
+#include "emulate/x86_mmu.h"
+
#include <winhvplatform.h>
-#include <winhvemulation.h>
#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)
+/* for kernel-irqchip=off */
+#define HV_X64_MSR_APIC_FREQUENCY 0x40000023
static const WHV_REGISTER_NAME whpx_register_names[] = {
@@ -362,7 +369,7 @@ static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
return cr8 << 4;
}
-void whpx_set_registers(CPUState *cpu, int level)
+void whpx_set_registers(CPUState *cpu, WHPXStateLevel level)
{
struct whpx_state *whpx = &whpx_global;
AccelCPUState *vcpu = cpu->accel;
@@ -381,7 +388,7 @@ void whpx_set_registers(CPUState *cpu, int level)
* Following MSRs have side effects on the guest or are too heavy for
* runtime. Limit them to full state update.
*/
- if (level >= WHPX_SET_RESET_STATE) {
+ if (level >= WHPX_LEVEL_RESET_STATE) {
whpx_set_tsc(cpu);
}
@@ -407,6 +414,7 @@ void whpx_set_registers(CPUState *cpu, int level)
vcxt.values[idx++].Reg64 = env->eip;
assert(whpx_register_names[idx] == WHvX64RegisterRflags);
+ lflags_to_rflags(env);
vcxt.values[idx++].Reg64 = env->eflags;
/* Translate 6+4 segment registers. HV and QEMU order matches */
@@ -416,118 +424,124 @@ void whpx_set_registers(CPUState *cpu, int level)
}
assert(idx == WHvX64RegisterLdtr);
- vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
-
- assert(idx == WHvX64RegisterTr);
- vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
-
- assert(idx == WHvX64RegisterIdtr);
- vcxt.values[idx].Table.Base = env->idt.base;
- vcxt.values[idx].Table.Limit = env->idt.limit;
- idx += 1;
-
- assert(idx == WHvX64RegisterGdtr);
- vcxt.values[idx].Table.Base = env->gdt.base;
- vcxt.values[idx].Table.Limit = env->gdt.limit;
- idx += 1;
-
- /* CR0, 2, 3, 4, 8 */
- assert(whpx_register_names[idx] == WHvX64RegisterCr0);
- vcxt.values[idx++].Reg64 = env->cr[0];
- assert(whpx_register_names[idx] == WHvX64RegisterCr2);
- vcxt.values[idx++].Reg64 = env->cr[2];
- assert(whpx_register_names[idx] == WHvX64RegisterCr3);
- vcxt.values[idx++].Reg64 = env->cr[3];
- assert(whpx_register_names[idx] == WHvX64RegisterCr4);
- vcxt.values[idx++].Reg64 = env->cr[4];
- assert(whpx_register_names[idx] == WHvX64RegisterCr8);
- vcxt.values[idx++].Reg64 = vcpu->tpr;
-
- /* 8 Debug Registers - Skipped */
-
/*
- * Extended control registers needs to be handled separately depending
- * on whether xsave is supported/enabled or not.
+ * Skip those registers for synchronisation after MMIO accesses
+ * as they're not going to be modified in that case.
*/
- whpx_set_xcrs(cpu);
-
- /* 16 XMM registers */
- assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
- idx_next = idx + 16;
- for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
- vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
- vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
- }
- idx = idx_next;
+ if (level > WHPX_LEVEL_FAST_RUNTIME_STATE) {
+ vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
+
+ assert(idx == WHvX64RegisterTr);
+ vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
+
+ assert(idx == WHvX64RegisterIdtr);
+ vcxt.values[idx].Table.Base = env->idt.base;
+ vcxt.values[idx].Table.Limit = env->idt.limit;
+ idx += 1;
+
+ assert(idx == WHvX64RegisterGdtr);
+ vcxt.values[idx].Table.Base = env->gdt.base;
+ vcxt.values[idx].Table.Limit = env->gdt.limit;
+ idx += 1;
+
+ /* CR0, 2, 3, 4, 8 */
+ assert(whpx_register_names[idx] == WHvX64RegisterCr0);
+ vcxt.values[idx++].Reg64 = env->cr[0];
+ assert(whpx_register_names[idx] == WHvX64RegisterCr2);
+ vcxt.values[idx++].Reg64 = env->cr[2];
+ assert(whpx_register_names[idx] == WHvX64RegisterCr3);
+ vcxt.values[idx++].Reg64 = env->cr[3];
+ assert(whpx_register_names[idx] == WHvX64RegisterCr4);
+ vcxt.values[idx++].Reg64 = env->cr[4];
+ assert(whpx_register_names[idx] == WHvX64RegisterCr8);
+ vcxt.values[idx++].Reg64 = vcpu->tpr;
+
+ /* 8 Debug Registers - Skipped */
- /* 8 FP registers */
- assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
- for (i = 0; i < 8; i += 1, idx += 1) {
- vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
- /* vcxt.values[idx].Fp.AsUINT128.High64 =
- env->fpregs[i].mmx.MMX_Q(1);
- */
- }
-
- /* FP control status register */
- assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
- vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
- vcxt.values[idx].FpControlStatus.FpStatus =
- (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
- vcxt.values[idx].FpControlStatus.FpTag = 0;
- for (i = 0; i < 8; ++i) {
- vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
- }
- vcxt.values[idx].FpControlStatus.Reserved = 0;
- vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
- vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
- idx += 1;
-
- /* XMM control status register */
- assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
- vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
- vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
- vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
- idx += 1;
+ /*
+ * Extended control registers needs to be handled separately depending
+ * on whether xsave is supported/enabled or not.
+ */
+ whpx_set_xcrs(cpu);
+
+ /* 16 XMM registers */
+ assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
+ idx_next = idx + 16;
+ for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
+ vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
+ vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
+ }
+ idx = idx_next;
+
+ /* 8 FP registers */
+ assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
+ for (i = 0; i < 8; i += 1, idx += 1) {
+ vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
+ /* vcxt.values[idx].Fp.AsUINT128.High64 =
+ env->fpregs[i].mmx.MMX_Q(1);
+ */
+ }
- /* MSRs */
- assert(whpx_register_names[idx] == WHvX64RegisterEfer);
- vcxt.values[idx++].Reg64 = env->efer;
+ /* FP control status register */
+ assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
+ vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
+ vcxt.values[idx].FpControlStatus.FpStatus =
+ (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
+ vcxt.values[idx].FpControlStatus.FpTag = 0;
+ for (i = 0; i < 8; ++i) {
+ vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
+ }
+ vcxt.values[idx].FpControlStatus.Reserved = 0;
+ vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
+ vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
+ idx += 1;
+
+ /* XMM control status register */
+ assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
+ vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
+ vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
+ vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
+ idx += 1;
+
+ /* MSRs */
+ assert(whpx_register_names[idx] == WHvX64RegisterEfer);
+ vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
- assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
- vcxt.values[idx++].Reg64 = env->kernelgsbase;
+ assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
+ vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif
- assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
- vcxt.values[idx++].Reg64 = vcpu->apic_base;
+ assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
+ vcxt.values[idx++].Reg64 = vcpu->apic_base;
- /* WHvX64RegisterPat - Skipped */
+ /* WHvX64RegisterPat - Skipped */
- assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
- vcxt.values[idx++].Reg64 = env->sysenter_cs;
- assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
- vcxt.values[idx++].Reg64 = env->sysenter_eip;
- assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
- vcxt.values[idx++].Reg64 = env->sysenter_esp;
- assert(whpx_register_names[idx] == WHvX64RegisterStar);
- vcxt.values[idx++].Reg64 = env->star;
+ assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
+ vcxt.values[idx++].Reg64 = env->sysenter_cs;
+ assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
+ vcxt.values[idx++].Reg64 = env->sysenter_eip;
+ assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
+ vcxt.values[idx++].Reg64 = env->sysenter_esp;
+ assert(whpx_register_names[idx] == WHvX64RegisterStar);
+ vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
- assert(whpx_register_names[idx] == WHvX64RegisterLstar);
- vcxt.values[idx++].Reg64 = env->lstar;
- assert(whpx_register_names[idx] == WHvX64RegisterCstar);
- vcxt.values[idx++].Reg64 = env->cstar;
- assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
- vcxt.values[idx++].Reg64 = env->fmask;
+ assert(whpx_register_names[idx] == WHvX64RegisterLstar);
+ vcxt.values[idx++].Reg64 = env->lstar;
+ assert(whpx_register_names[idx] == WHvX64RegisterCstar);
+ vcxt.values[idx++].Reg64 = env->cstar;
+ assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
+ vcxt.values[idx++].Reg64 = env->fmask;
#endif
- /* Interrupt / Event Registers - Skipped */
+ /* Interrupt / Event Registers - Skipped */
- assert(idx == RTL_NUMBER_OF(whpx_register_names));
+ assert(idx == RTL_NUMBER_OF(whpx_register_names));
+ }
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition, cpu->cpu_index,
whpx_register_names,
- RTL_NUMBER_OF(whpx_register_names),
+ idx,
&vcxt.values[0]);
if (FAILED(hr)) {
@@ -577,7 +591,7 @@ static void whpx_get_xcrs(CPUState *cpu)
cpu_env(cpu)->xcr0 = xcr0.Reg64;
}
-void whpx_get_registers(CPUState *cpu)
+void whpx_get_registers(CPUState *cpu, WHPXStateLevel level)
{
struct whpx_state *whpx = &whpx_global;
AccelCPUState *vcpu = cpu->accel;
@@ -607,7 +621,7 @@ void whpx_get_registers(CPUState *cpu)
hr);
}
- if (whpx_irqchip_in_kernel()) {
+ if (level > WHPX_LEVEL_FAST_RUNTIME_STATE && whpx_irqchip_in_kernel()) {
/*
* Fetch the TPR value from the emulated APIC. It may get overwritten
* below with the value from CR8 returned by
@@ -632,6 +646,7 @@ void whpx_get_registers(CPUState *cpu)
env->eip = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterRflags);
env->eflags = vcxt.values[idx++].Reg64;
+ rflags_to_lflags(env);
/* Translate 6+4 segment registers. HV and QEMU order matches */
assert(idx == WHvX64RegisterEs);
@@ -663,7 +678,7 @@ void whpx_get_registers(CPUState *cpu)
env->cr[4] = vcxt.values[idx++].Reg64;
assert(whpx_register_names[idx] == WHvX64RegisterCr8);
tpr = vcxt.values[idx++].Reg64;
- if (tpr != vcpu->tpr) {
+ if (level > WHPX_LEVEL_FAST_RUNTIME_STATE && tpr != vcpu->tpr) {
vcpu->tpr = tpr;
cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
}
@@ -749,165 +764,133 @@ void whpx_get_registers(CPUState *cpu)
assert(idx == RTL_NUMBER_OF(whpx_register_names));
- if (whpx_irqchip_in_kernel()) {
+ if (level > WHPX_LEVEL_FAST_RUNTIME_STATE && whpx_irqchip_in_kernel()) {
whpx_apic_get(x86_cpu->apic_state);
}
x86_update_hflags(env);
}
-static HRESULT CALLBACK whpx_emu_ioport_callback(
- void *ctx,
- WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
+static int emulate_instruction(CPUState *cpu, const uint8_t *insn_bytes, size_t insn_len)
{
- MemTxAttrs attrs = { 0 };
- address_space_rw(&address_space_io, IoAccess->Port, attrs,
- &IoAccess->Data, IoAccess->AccessSize,
- IoAccess->Direction);
- return S_OK;
-}
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+ struct x86_decode decode = { 0 };
+ x86_insn_stream stream = { .bytes = insn_bytes, .len = insn_len };
-static HRESULT CALLBACK whpx_emu_mmio_callback(
- void *ctx,
- WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
-{
- CPUState *cs = (CPUState *)ctx;
- AddressSpace *as = cpu_addressspace(cs, MEMTXATTRS_UNSPECIFIED);
+ whpx_get_registers(cpu, WHPX_LEVEL_FAST_RUNTIME_STATE);
+ decode_instruction_stream(env, &decode, &stream);
+ exec_instruction(env, &decode);
+ whpx_set_registers(cpu, WHPX_LEVEL_FAST_RUNTIME_STATE);
- address_space_rw(as, ma->GpaAddress, MEMTXATTRS_UNSPECIFIED,
- ma->Data, ma->AccessSize, ma->Direction);
- return S_OK;
+ return 0;
}
-static HRESULT CALLBACK whpx_emu_getreg_callback(
- void *ctx,
- const WHV_REGISTER_NAME *RegisterNames,
- UINT32 RegisterCount,
- WHV_REGISTER_VALUE *RegisterValues)
+static int whpx_handle_mmio(CPUState *cpu, WHV_RUN_VP_EXIT_CONTEXT *exit_ctx)
{
- HRESULT hr;
- struct whpx_state *whpx = &whpx_global;
- CPUState *cpu = (CPUState *)ctx;
+ WHV_MEMORY_ACCESS_CONTEXT *ctx = &exit_ctx->MemoryAccess;
+ int ret;
- hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
- whpx->partition, cpu->cpu_index,
- RegisterNames, RegisterCount,
- RegisterValues);
- if (FAILED(hr)) {
- error_report("WHPX: Failed to get virtual processor registers,"
- " hr=%08lx", hr);
+ ret = emulate_instruction(cpu, ctx->InstructionBytes, ctx->InstructionByteCount);
+ if (ret < 0) {
+ error_report("failed to emulate mmio");
+ return -1;
}
- return hr;
+ return 0;
}
-static HRESULT CALLBACK whpx_emu_setreg_callback(
- void *ctx,
- const WHV_REGISTER_NAME *RegisterNames,
- UINT32 RegisterCount,
- const WHV_REGISTER_VALUE *RegisterValues)
+static void handle_io(CPUState *env, uint16_t port, void *buffer,
+ int direction, int size, int count)
{
- HRESULT hr;
- struct whpx_state *whpx = &whpx_global;
- CPUState *cpu = (CPUState *)ctx;
+ int i;
+ uint8_t *ptr = buffer;
- hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
- whpx->partition, cpu->cpu_index,
- RegisterNames, RegisterCount,
- RegisterValues);
- if (FAILED(hr)) {
- error_report("WHPX: Failed to set virtual processor registers,"
- " hr=%08lx", hr);
+ for (i = 0; i < count; i++) {
+ address_space_rw(&address_space_io, port, MEMTXATTRS_UNSPECIFIED,
+ ptr, size,
+ direction);
+ ptr += size;
}
-
- /*
- * The emulator just successfully wrote the register state. We clear the
- * dirty state so we avoid the double write on resume of the VP.
- */
- cpu->vcpu_dirty = false;
-
- return hr;
}
-static HRESULT CALLBACK whpx_emu_translate_callback(
- void *ctx,
- WHV_GUEST_VIRTUAL_ADDRESS Gva,
- WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
- WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
- WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
+static void whpx_bump_rip(CPUState *cpu, WHV_RUN_VP_EXIT_CONTEXT *exit_ctx)
{
- HRESULT hr;
- struct whpx_state *whpx = &whpx_global;
- CPUState *cpu = (CPUState *)ctx;
- WHV_TRANSLATE_GVA_RESULT res;
-
- hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
- Gva, TranslateFlags, &res, Gpa);
- if (FAILED(hr)) {
- error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
- } else {
- *TranslationResult = res.ResultCode;
- }
-
- return hr;
+ WHV_REGISTER_VALUE reg;
+ whpx_get_reg(cpu, WHvX64RegisterRip, &reg);
+ reg.Reg64 = exit_ctx->VpContext.Rip + exit_ctx->VpContext.InstructionLength;
+ whpx_set_reg(cpu, WHvX64RegisterRip, reg);
}
-static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
- .Size = sizeof(WHV_EMULATOR_CALLBACKS),
- .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
- .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
- .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
- .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
- .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
-};
-
-static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
+static int whpx_handle_portio(CPUState *cpu,
+ WHV_RUN_VP_EXIT_CONTEXT *exit_ctx)
{
- HRESULT hr;
- AccelCPUState *vcpu = cpu->accel;
- WHV_EMULATOR_STATUS emu_status;
-
- hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
- vcpu->emulator, cpu,
- &vcpu->exit_ctx.VpContext, ctx,
- &emu_status);
- if (FAILED(hr)) {
- error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
- return -1;
- }
+ WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx = &exit_ctx->IoPortAccess;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+ int ret;
- if (!emu_status.EmulationSuccessful) {
- error_report("WHPX: Failed to emulate MMIO access with"
- " EmulatorReturnStatus: %u", emu_status.AsUINT32);
+ if (!ctx->AccessInfo.StringOp && !ctx->AccessInfo.IsWrite) {
+ uint64_t val = 0;
+ WHV_REGISTER_VALUE reg;
+
+ whpx_get_reg(cpu, WHvX64RegisterRax, &reg);
+ handle_io(cpu, ctx->PortNumber, &val, 0, ctx->AccessInfo.AccessSize, 1);
+ if (ctx->AccessInfo.AccessSize == 1) {
+ reg.Reg8 = val;
+ } else if (ctx->AccessInfo.AccessSize == 2) {
+ reg.Reg16 = val;
+ } else if (ctx->AccessInfo.AccessSize == 4) {
+ reg.Reg64 = (uint32_t)val;
+ } else {
+ reg.Reg64 = (uint64_t)val;
+ }
+ whpx_bump_rip(cpu, exit_ctx);
+ whpx_set_reg(cpu, WHvX64RegisterRax, reg);
+ return 0;
+ } else if (!ctx->AccessInfo.StringOp && ctx->AccessInfo.IsWrite) {
+ RAX(env) = ctx->Rax;
+ handle_io(cpu, ctx->PortNumber, &RAX(env), 1, ctx->AccessInfo.AccessSize, 1);
+ whpx_bump_rip(cpu, exit_ctx);
+ return 0;
+ }
+
+ ret = emulate_instruction(cpu, ctx->InstructionBytes, exit_ctx->VpContext.InstructionLength);
+ if (ret < 0) {
+ error_report("failed to emulate I/O port access");
return -1;
}
return 0;
}
-static int whpx_handle_portio(CPUState *cpu,
- WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
+static void read_segment_descriptor(CPUState *cpu,
+ struct x86_segment_descriptor *desc,
+ enum X86Seg seg_idx)
{
- HRESULT hr;
- AccelCPUState *vcpu = cpu->accel;
- WHV_EMULATOR_STATUS emu_status;
+ bool ret;
+ X86CPU *x86_cpu = X86_CPU(cpu);
+ CPUX86State *env = &x86_cpu->env;
+ SegmentCache *seg = &env->segs[seg_idx];
+ x86_segment_selector sel = { .sel = seg->selector & 0xFFFF };
- hr = whp_dispatch.WHvEmulatorTryIoEmulation(
- vcpu->emulator, cpu,
- &vcpu->exit_ctx.VpContext, ctx,
- &emu_status);
- if (FAILED(hr)) {
- error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
- return -1;
+ ret = x86_read_segment_descriptor(cpu, desc, sel);
+ if (ret == false) {
+ error_report("failed to read segment descriptor");
+ abort();
}
+}
- if (!emu_status.EmulationSuccessful) {
- error_report("WHPX: Failed to emulate PortIO access with"
- " EmulatorReturnStatus: %u", emu_status.AsUINT32);
- return -1;
- }
- return 0;
+static const struct x86_emul_ops whpx_x86_emul_ops = {
+ .read_segment_descriptor = read_segment_descriptor,
+ .handle_io = handle_io
+};
+
+static void whpx_init_emu(void)
+{
+ init_decoder();
+ init_emu(&whpx_x86_emul_ops);
}
/*
@@ -1272,6 +1255,18 @@ void whpx_apply_breakpoints(
}
}
+/* WHPX on x86 unconditionally reports guest-debug support. */
+bool whpx_arch_supports_guest_debug(void)
+{
+    return true;
+}
+
+/* Free the per-vCPU MMIO emulation buffer allocated in whpx_init_vcpu(). */
+void whpx_arch_destroy_vcpu(CPUState *cpu)
+{
+    CPUX86State *env = &X86_CPU(cpu)->env;
+
+    g_free(env->emu_mmio_buf);
+}
+
/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
@@ -1330,6 +1325,16 @@ static int whpx_handle_halt(CPUState *cpu)
return ret;
}
+/*
+ * Clear the vCPU's HaltSuspend activity state so the next
+ * WHvRunVirtualProcessor call resumes execution instead of remaining
+ * parked in HLT.
+ */
+static void whpx_vcpu_kick_out_of_hlt(CPUState *cpu)
+{
+    WHV_REGISTER_VALUE reg;
+    whpx_get_reg(cpu, WHvRegisterInternalActivityState, &reg);
+    if (reg.InternalActivity.HaltSuspend) {
+        reg.InternalActivity.HaltSuspend = 0;
+        whpx_set_reg(cpu, WHvRegisterInternalActivityState, reg);
+    }
+}
+
static void whpx_vcpu_pre_run(CPUState *cpu)
{
HRESULT hr;
@@ -1413,6 +1418,17 @@ static void whpx_vcpu_pre_run(CPUState *cpu)
.Vector = irq,
};
reg_count += 1;
+ /*
+ * When the Hyper-V APIC is enabled, to get out of HLT we
+ * either have to request an interrupt or manually get it away
+ * from HLT.
+ *
+             * We also inject some interrupts manually via
+             * WHvRegisterPendingEvent instead of WHvRequestInterrupt, which
+             * does not reset the HLT state.
+ */
+ if (whpx_irqchip_in_kernel()) {
+ whpx_vcpu_kick_out_of_hlt(cpu);
+ }
}
}
@@ -1475,6 +1491,7 @@ static void whpx_vcpu_post_run(CPUState *cpu)
!vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
+
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
X86CPU *x86_cpu = X86_CPU(cpu);
@@ -1513,6 +1530,26 @@ static void whpx_vcpu_process_async_events(CPUState *cpu)
}
}
+/*
+ * If the target code has queued an exception (env->exception_injected),
+ * convert it into a WHPX pending-event register write so the hypervisor
+ * delivers it on the next VP entry.
+ *
+ * NOTE(review): DeliverErrorCode is set unconditionally, but only some x86
+ * vectors (#DF, #TS, #NP, #SS, #GP, #PF, #AC) actually push an error code —
+ * confirm callers never queue an error-code-less vector here.
+ */
+static void whpx_inject_exceptions(CPUState* cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+
+    if (env->exception_injected) {
+        env->exception_injected = 0;    /* consume the pending request */
+        WHV_REGISTER_VALUE reg = {};
+        reg.ExceptionEvent.EventPending = 1;
+        reg.ExceptionEvent.EventType = WHvX64PendingEventException;
+        reg.ExceptionEvent.DeliverErrorCode = 1;
+        reg.ExceptionEvent.Vector = env->exception_nr;
+        reg.ExceptionEvent.ErrorCode = env->error_code;
+        if (env->exception_nr == EXCP0E_PAGE) {
+            /* #PF additionally reports the faulting address from CR2 */
+            reg.ExceptionEvent.ExceptionParameter = env->cr[2];
+        }
+        whpx_set_reg(cpu, WHvRegisterPendingEvent, reg);
+    }
+}
+
int whpx_vcpu_run(CPUState *cpu)
{
HRESULT hr;
@@ -1590,7 +1627,7 @@ int whpx_vcpu_run(CPUState *cpu)
do {
if (cpu->vcpu_dirty) {
- whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
+ whpx_set_registers(cpu, WHPX_LEVEL_RUNTIME_STATE);
cpu->vcpu_dirty = false;
}
@@ -1607,6 +1644,8 @@ int whpx_vcpu_run(CPUState *cpu)
whpx_vcpu_configure_single_stepping(cpu, true, NULL);
}
+ whpx_inject_exceptions(cpu);
+
hr = whp_dispatch.WHvRunVirtualProcessor(
whpx->partition, cpu->cpu_index,
&vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
@@ -1628,11 +1667,11 @@ int whpx_vcpu_run(CPUState *cpu)
switch (vcpu->exit_ctx.ExitReason) {
case WHvRunVpExitReasonMemoryAccess:
- ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
+ ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx);
break;
case WHvRunVpExitReasonX64IoPortAccess:
- ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
+ ret = whpx_handle_portio(cpu, &vcpu->exit_ctx);
break;
case WHvRunVpExitReasonX64InterruptWindow:
@@ -1648,8 +1687,7 @@ int whpx_vcpu_run(CPUState *cpu)
case WHvRunVpExitReasonX64Halt:
/*
- * WARNING: as of build 19043.1526 (21H1), this exit reason is no
- * longer used.
+ * Used for kernel-irqchip=off
*/
ret = whpx_handle_halt(cpu);
break;
@@ -1766,6 +1804,7 @@ int whpx_vcpu_run(CPUState *cpu)
WHV_REGISTER_VALUE reg_values[3] = {0};
WHV_REGISTER_NAME reg_names[3];
UINT32 reg_count;
+ bool is_known_msr = 0;
reg_names[0] = WHvX64RegisterRip;
reg_names[1] = WHvX64RegisterRax;
@@ -1775,6 +1814,12 @@ int whpx_vcpu_run(CPUState *cpu)
vcpu->exit_ctx.VpContext.Rip +
vcpu->exit_ctx.VpContext.InstructionLength;
+ if (vcpu->exit_ctx.MsrAccess.MsrNumber == HV_X64_MSR_APIC_FREQUENCY
+ && !vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite
+ && !whpx_irqchip_in_kernel()) {
+ is_known_msr = 1;
+ reg_values[1].Reg32 = (uint32_t)X86_CPU(cpu)->env.apic_bus_freq;
+ }
/*
* For all unsupported MSR access we:
* ignore writes
@@ -1783,6 +1828,11 @@ int whpx_vcpu_run(CPUState *cpu)
reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1 : 3;
+ if (!is_known_msr) {
+ warn_report("WHPX: Unsupported MSR access (0x%x), IsWrite=%i",
+ vcpu->exit_ctx.MsrAccess.MsrNumber, vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite);
+ }
+
hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
whpx->partition,
cpu->cpu_index,
@@ -1796,77 +1846,8 @@ int whpx_vcpu_run(CPUState *cpu)
ret = 0;
break;
}
- case WHvRunVpExitReasonX64Cpuid: {
- WHV_REGISTER_VALUE reg_values[5];
- WHV_REGISTER_NAME reg_names[5];
- UINT32 reg_count = 5;
- UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
- X86CPU *x86_cpu = X86_CPU(cpu);
- CPUX86State *env = &x86_cpu->env;
-
- memset(reg_values, 0, sizeof(reg_values));
-
- rip = vcpu->exit_ctx.VpContext.Rip +
- vcpu->exit_ctx.VpContext.InstructionLength;
- cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
-
- /*
- * Ideally, these should be supplied to the hypervisor during VCPU
- * initialization and it should be able to satisfy this request.
- * But, currently, WHPX doesn't support setting CPUID values in the
- * hypervisor once the partition has been setup, which is too late
- * since VCPUs are realized later. For now, use the values from
- * QEMU to satisfy these requests, until WHPX adds support for
- * being able to set these values in the hypervisor at runtime.
- */
- cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
- (UINT32 *)&rcx, (UINT32 *)&rdx);
- switch (cpuid_fn) {
- case 0x40000000:
- /* Expose the vmware cpu frequency cpuid leaf */
- rax = 0x40000010;
- rbx = rcx = rdx = 0;
- break;
-
- case 0x40000010:
- rax = env->tsc_khz;
- rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
- rcx = rdx = 0;
- break;
-
- case 0x80000001:
- /* Remove any support of OSVW */
- rcx &= ~CPUID_EXT3_OSVW;
- break;
- }
-
- reg_names[0] = WHvX64RegisterRip;
- reg_names[1] = WHvX64RegisterRax;
- reg_names[2] = WHvX64RegisterRcx;
- reg_names[3] = WHvX64RegisterRdx;
- reg_names[4] = WHvX64RegisterRbx;
-
- reg_values[0].Reg64 = rip;
- reg_values[1].Reg64 = rax;
- reg_values[2].Reg64 = rcx;
- reg_values[3].Reg64 = rdx;
- reg_values[4].Reg64 = rbx;
-
- hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
- whpx->partition, cpu->cpu_index,
- reg_names,
- reg_count,
- reg_values);
-
- if (FAILED(hr)) {
- error_report("WHPX: Failed to set CpuidAccess state registers,"
- " hr=%08lx", hr);
- }
- ret = 0;
- break;
- }
case WHvRunVpExitReasonException:
- whpx_get_registers(cpu);
+ whpx_get_registers(cpu, WHPX_LEVEL_FULL_STATE);
if ((vcpu->exit_ctx.VpException.ExceptionType ==
WHvX64ExceptionTypeDebugTrapOrFault) &&
@@ -1898,7 +1879,7 @@ int whpx_vcpu_run(CPUState *cpu)
default:
error_report("WHPX: Unexpected VP exit code %d",
vcpu->exit_ctx.ExitReason);
- whpx_get_registers(cpu);
+ whpx_get_registers(cpu, WHPX_LEVEL_FULL_STATE);
bql_lock();
qemu_system_guest_panicked(cpu_get_crash_info(cpu));
bql_unlock();
@@ -1979,22 +1960,11 @@ int whpx_init_vcpu(CPUState *cpu)
vcpu = g_new0(AccelCPUState, 1);
- hr = whp_dispatch.WHvEmulatorCreateEmulator(
- &whpx_emu_callbacks,
- &vcpu->emulator);
- if (FAILED(hr)) {
- error_report("WHPX: Failed to setup instruction completion support,"
- " hr=%08lx", hr);
- ret = -EINVAL;
- goto error;
- }
-
hr = whp_dispatch.WHvCreateVirtualProcessor(
whpx->partition, cpu->cpu_index, 0);
if (FAILED(hr)) {
error_report("WHPX: Failed to create a virtual processor,"
" hr=%08lx", hr);
- whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
ret = -EINVAL;
goto error;
}
@@ -2029,25 +1999,9 @@ int whpx_init_vcpu(CPUState *cpu)
}
}
- /*
- * If the vmware cpuid frequency leaf option is set, and we have a valid
- * tsc value, trap the corresponding cpuid's.
- */
- if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
- UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
-
- hr = whp_dispatch.WHvSetPartitionProperty(
- whpx->partition,
- WHvPartitionPropertyCodeCpuidExitList,
- cpuidExitList,
- RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
-
- if (FAILED(hr)) {
- error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
- hr);
- ret = -EINVAL;
- goto error;
- }
+ /* When not using the Hyper-V APIC, the frequency is 1 GHz */
+ if (!whpx_irqchip_in_kernel()) {
+ env->apic_bus_freq = 1000000000;
}
vcpu->interruptable = true;
@@ -2056,6 +2010,8 @@ int whpx_init_vcpu(CPUState *cpu)
max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);
+ env->emu_mmio_buf = g_new(char, 4096);
+
return 0;
error:
@@ -2083,8 +2039,9 @@ int whpx_accel_init(AccelState *as, MachineState *ms)
WHV_CAPABILITY whpx_cap;
UINT32 whpx_cap_size;
WHV_PARTITION_PROPERTY prop;
- UINT32 cpuidExitList[] = {1, 0x80000001};
WHV_CAPABILITY_FEATURES features = {0};
+ WHV_PROCESSOR_FEATURES_BANKS processor_features;
+ WHV_PROCESSOR_PERFMON_FEATURES perfmon_features;
whpx = &whpx_global;
@@ -2172,8 +2129,7 @@ int whpx_accel_init(AccelState *as, MachineState *ms)
if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
- WHvX64LocalApicEmulationModeXApic;
- printf("WHPX: setting APIC emulation mode in the hypervisor\n");
+ WHvX64LocalApicEmulationModeX2Apic;
hr = whp_dispatch.WHvSetPartitionProperty(
whpx->partition,
WHvPartitionPropertyCodeLocalApicEmulationMode,
@@ -2191,10 +2147,102 @@ int whpx_accel_init(AccelState *as, MachineState *ms)
}
}
+ /* Set all the supported features, to follow the MSHV example */
+ memset(&processor_features, 0, sizeof(WHV_PROCESSOR_FEATURES_BANKS));
+ processor_features.BanksCount = 2;
+
+ hr = whp_dispatch.WHvGetCapability(
+ WHvCapabilityCodeProcessorFeaturesBanks, &processor_features,
+ sizeof(WHV_PROCESSOR_FEATURES_BANKS), &whpx_cap_size);
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to get processor features, hr=%08lx", hr);
+ ret = -ENOSPC;
+ goto error;
+ }
+
+ if (processor_features.Bank1.NestedVirtSupport) {
+ memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
+ prop.NestedVirtualization = 1;
+ hr = whp_dispatch.WHvSetPartitionProperty(
+ whpx->partition,
+ WHvPartitionPropertyCodeNestedVirtualization,
+ &prop,
+ sizeof(WHV_PARTITION_PROPERTY));
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to enable nested virtualization, hr=%08lx", hr);
+ ret = -EINVAL;
+ goto error;
+ }
+ }
+
+ hr = whp_dispatch.WHvSetPartitionProperty(
+ whpx->partition,
+ WHvPartitionPropertyCodeProcessorFeaturesBanks,
+ &processor_features,
+ sizeof(WHV_PROCESSOR_FEATURES_BANKS));
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to set processor features, hr=%08lx", hr);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Enable supported performance monitoring capabilities */
+ hr = whp_dispatch.WHvGetCapability(
+ WHvCapabilityCodeProcessorPerfmonFeatures, &perfmon_features,
+ sizeof(WHV_PROCESSOR_PERFMON_FEATURES), &whpx_cap_size);
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to get performance monitoring features, hr=%08lx", hr);
+ ret = -ENOSPC;
+ goto error;
+ }
+
+ hr = whp_dispatch.WHvSetPartitionProperty(
+ whpx->partition,
+ WHvPartitionPropertyCodeProcessorPerfmonFeatures,
+ &perfmon_features,
+ sizeof(WHV_PROCESSOR_PERFMON_FEATURES));
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to set performance monitoring features, hr=%08lx", hr);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Enable synthetic processor features */
+ WHV_SYNTHETIC_PROCESSOR_FEATURES_BANKS synthetic_features;
+ memset(&synthetic_features, 0, sizeof(WHV_SYNTHETIC_PROCESSOR_FEATURES_BANKS));
+ synthetic_features.BanksCount = 1;
+
+ synthetic_features.Bank0.HypervisorPresent = 1;
+ synthetic_features.Bank0.Hv1 = 1;
+ synthetic_features.Bank0.AccessPartitionReferenceCounter = 1;
+ synthetic_features.Bank0.AccessPartitionReferenceTsc = 1;
+ synthetic_features.Bank0.AccessFrequencyRegs = 1;
+ synthetic_features.Bank0.AccessVpIndex = 1;
+ synthetic_features.Bank0.AccessHypercallRegs = 1;
+ synthetic_features.Bank0.TbFlushHypercalls = 1;
+
+ if (whpx_irqchip_in_kernel()) {
+ synthetic_features.Bank0.AccessSynicRegs = 1;
+ synthetic_features.Bank0.AccessSyntheticTimerRegs = 1;
+ synthetic_features.Bank0.AccessIntrCtrlRegs = 1;
+ synthetic_features.Bank0.SyntheticClusterIpi = 1;
+ synthetic_features.Bank0.DirectSyntheticTimers = 1;
+ }
+
+ hr = whp_dispatch.WHvSetPartitionProperty(
+ whpx->partition,
+ WHvPartitionPropertyCodeSyntheticProcessorFeaturesBanks,
+ &synthetic_features,
+ sizeof(WHV_SYNTHETIC_PROCESSOR_FEATURES_BANKS));
+ if (FAILED(hr)) {
+ error_report("WHPX: Failed to set synthetic features, hr=%08lx", hr);
+ ret = -EINVAL;
+ goto error;
+ }
+
/* Register for MSR and CPUID exits */
memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
prop.ExtendedVmExits.X64MsrExit = 1;
- prop.ExtendedVmExits.X64CpuidExit = 1;
prop.ExtendedVmExits.ExceptionExit = 1;
if (whpx_irqchip_in_kernel()) {
prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
@@ -2211,19 +2259,6 @@ int whpx_accel_init(AccelState *as, MachineState *ms)
goto error;
}
- hr = whp_dispatch.WHvSetPartitionProperty(
- whpx->partition,
- WHvPartitionPropertyCodeCpuidExitList,
- cpuidExitList,
- RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
-
- if (FAILED(hr)) {
- error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
- hr);
- ret = -EINVAL;
- goto error;
- }
-
/*
* We do not want to intercept any exceptions from the guest,
* until we actually start debugging with gdb.
@@ -2245,8 +2280,8 @@ int whpx_accel_init(AccelState *as, MachineState *ms)
}
whpx_memory_init();
+ whpx_init_emu();
- printf("Windows Hypervisor Platform accelerator is operational\n");
return 0;
error:
diff --git a/target/i386/whpx/whpx-apic.c b/target/i386/whpx/whpx-apic.c
index b934fdc..f26ecaf 100644
--- a/target/i386/whpx/whpx-apic.c
+++ b/target/i386/whpx/whpx-apic.c
@@ -192,6 +192,11 @@ static void whpx_send_msi(MSIMessage *msg)
uint8_t trigger_mode = (data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
uint8_t delivery = (data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x7;
+ if (vector == 0) {
+ warn_report("Ignoring request for interrupt vector 0");
+ return;
+ }
+
WHV_INTERRUPT_CONTROL interrupt = {
/* Values correspond to delivery modes */
.Type = delivery,
diff --git a/tests/functional/x86_64/meson.build b/tests/functional/x86_64/meson.build
index beab4f3..05e4914 100644
--- a/tests/functional/x86_64/meson.build
+++ b/tests/functional/x86_64/meson.build
@@ -37,4 +37,5 @@ tests_x86_64_system_thorough = [
'vhost_user_bridge',
'virtio_balloon',
'virtio_gpu',
+ 'rebuild_vmfd',
]
diff --git a/tests/functional/x86_64/test_rebuild_vmfd.py b/tests/functional/x86_64/test_rebuild_vmfd.py
new file mode 100755
index 0000000..5a8e5fd
--- /dev/null
+++ b/tests/functional/x86_64/test_rebuild_vmfd.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+#
+# Functional tests exercising guest KVM file descriptor change on reset.
+#
+# Copyright © 2026 Red Hat, Inc.
+#
+# Author:
+# Ani Sinha <anisinha@redhat.com>
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+import os
+from qemu.machine import machine
+
+from qemu_test import QemuSystemTest, Asset, exec_command_and_wait_for_pattern
+from qemu_test import wait_for_console_pattern
+
+class KVMGuest(QemuSystemTest):
+
+ # ASSET UKI was generated using
+ # https://gitlab.com/kraxel/edk2-tests/-/blob/unittest/tools/make-supermin.sh
+ ASSET_UKI = Asset('https://gitlab.com/anisinha/misc-artifacts/'
+ '-/raw/main/uki.x86-64.efi?ref_type=heads',
+ 'e0f806bd1fa24111312e1fe849d2ee69808d4343930a5'
+ 'dc8c1688da17c65f576')
+ # ASSET_OVMF comes from /usr/share/edk2/ovmf/OVMF.stateless.fd of a
+ # fedora core 43 distribution which in turn comes from the
+ # edk2-ovmf-20251119-3.fc43.noarch rpm of that distribution.
+ ASSET_OVMF = Asset('https://gitlab.com/anisinha/misc-artifacts/'
+ '-/raw/main/OVMF.stateless.fd?ref_type=heads',
+ '58a4275aafa8774bd6b1540adceae4ea434b8db75b476'
+ '11839ff47be88cfcf22')
+
+ def common_vm_setup(self, kvm_args=None, cpu_args=None):
+ self.set_machine('q35')
+ self.require_accelerator("kvm")
+
+ self.vm.set_console()
+ if kvm_args:
+ self.vm.add_args("-accel", "kvm,%s" %kvm_args)
+ else:
+ self.vm.add_args("-accel", "kvm")
+ self.vm.add_args("-smp", "2")
+ if cpu_args:
+ self.vm.add_args("-cpu", "host,%s" %cpu_args)
+ else:
+ self.vm.add_args("-cpu", "host")
+ self.vm.add_args("-m", "2G")
+ self.vm.add_args("-nographic", "-nodefaults")
+
+
+ self.uki_path = self.ASSET_UKI.fetch()
+ self.ovmf_path = self.ASSET_OVMF.fetch()
+
+ self.vm.add_args('-kernel', self.uki_path)
+ self.vm.add_args("-bios", self.ovmf_path)
+        # enable KVM VMFD change on reset for a non-confidential (CoCo) VM
+ self.vm.add_args("-machine", "q35,x-change-vmfd-on-reset=on")
+
+ # enable tracing of basic vmfd change function
+ self.vm.add_args("--trace", "kvm_reset_vmfd")
+
+ def launch_vm(self):
+ try:
+ self.vm.launch()
+ except machine.VMLaunchFailure as e:
+ if "Xen HVM guest support not present" in e.output:
+ self.skipTest("KVM Xen support is not present "
+ "(need v5.12+ kernel with CONFIG_KVM_XEN)")
+ elif "Property 'kvm-accel.xen-version' not found" in e.output:
+ self.skipTest("QEMU not built with CONFIG_XEN_EMU support")
+ else:
+ raise e
+
+ self.log.info('VM launched')
+ console_pattern = 'bash-5.1#'
+ wait_for_console_pattern(self, console_pattern)
+ self.log.info('VM ready with a bash prompt')
+
+ def vm_console_reset(self):
+ exec_command_and_wait_for_pattern(self, '/usr/sbin/reboot -f',
+ 'reboot: machine restart')
+ console_pattern = '# --- Hello world ---'
+ wait_for_console_pattern(self, console_pattern)
+ self.vm.shutdown()
+
+ def vm_qmp_reset(self):
+ self.vm.qmp('system_reset')
+ console_pattern = '# --- Hello world ---'
+ wait_for_console_pattern(self, console_pattern)
+ self.vm.shutdown()
+
+ def check_logs(self):
+ self.assertRegex(self.vm.get_log(),
+ r'kvm_reset_vmfd')
+ self.assertRegex(self.vm.get_log(),
+ r'virtual machine state has been rebuilt')
+
+ def test_reset_console(self):
+ self.common_vm_setup()
+ self.launch_vm()
+ self.vm_console_reset()
+ self.check_logs()
+
+ def test_reset_qmp(self):
+ self.common_vm_setup()
+ self.launch_vm()
+ self.vm_qmp_reset()
+ self.check_logs()
+
+ def test_reset_kvmpit(self):
+ self.common_vm_setup()
+ self.vm.add_args("--trace", "kvmpit_post_vmfd_change")
+ self.launch_vm()
+ self.vm_console_reset()
+ self.assertRegex(self.vm.get_log(),
+ r'kvmpit_post_vmfd_change')
+
+ def test_reset_xen_emulation(self):
+ self.common_vm_setup("xen-version=0x4000a,kernel-irqchip=split")
+ self.launch_vm()
+ self.vm_console_reset()
+ self.check_logs()
+
+ def test_reset_hyperv_vmbus(self):
+ self.common_vm_setup(None, "hv-syndbg,hv-relaxed,hv_time,hv-synic,"
+ "hv-vpindex,hv-runtime,hv-stimer")
+ self.vm.add_args("-device", "vmbus-bridge,irq=15")
+ self.vm.add_args("-trace", "vmbus_handle_vmfd_change")
+ self.launch_vm()
+ self.vm_console_reset()
+ self.assertRegex(self.vm.get_log(),
+ r'vmbus_handle_vmfd_change')
+
+if __name__ == '__main__':
+ QemuSystemTest.main()
diff --git a/tests/qtest/libqtest.c b/tests/qtest/libqtest.c
index 794d870..051faf3 100644
--- a/tests/qtest/libqtest.c
+++ b/tests/qtest/libqtest.c
@@ -1815,6 +1815,7 @@ void qtest_cb_for_every_machine(void (*cb)(const char *machine),
g_str_equal("xenpv", machines[i].name) ||
g_str_equal("xenpvh", machines[i].name) ||
g_str_equal("vmapple", machines[i].name) ||
+ g_str_equal("nitro", machines[i].name) ||
g_str_equal("nitro-enclave", machines[i].name)) {
continue;
}
diff --git a/ui/vdagent.c b/ui/vdagent.c
index 7ff0861..5a5e4bf 100644
--- a/ui/vdagent.c
+++ b/ui/vdagent.c
@@ -17,14 +17,6 @@
#include "spice/vd_agent.h"
-#define CHECK_SPICE_PROTOCOL_VERSION(major, minor, micro) \
- (CONFIG_SPICE_PROTOCOL_MAJOR > (major) || \
- (CONFIG_SPICE_PROTOCOL_MAJOR == (major) && \
- CONFIG_SPICE_PROTOCOL_MINOR > (minor)) || \
- (CONFIG_SPICE_PROTOCOL_MAJOR == (major) && \
- CONFIG_SPICE_PROTOCOL_MINOR == (minor) && \
- CONFIG_SPICE_PROTOCOL_MICRO >= (micro)))
-
#define VDAGENT_BUFFER_LIMIT (1 * MiB)
#define VDAGENT_MOUSE_DEFAULT true
#define VDAGENT_CLIPBOARD_DEFAULT false
@@ -87,10 +79,8 @@ static const char *cap_name[] = {
[VD_AGENT_CAP_FILE_XFER_DISABLED] = "file-xfer-disabled",
[VD_AGENT_CAP_FILE_XFER_DETAILED_ERRORS] = "file-xfer-detailed-errors",
[VD_AGENT_CAP_GRAPHICS_DEVICE_INFO] = "graphics-device-info",
-#if CHECK_SPICE_PROTOCOL_VERSION(0, 14, 1)
[VD_AGENT_CAP_CLIPBOARD_NO_RELEASE_ON_REGRAB] = "clipboard-no-release-on-regrab",
[VD_AGENT_CAP_CLIPBOARD_GRAB_SERIAL] = "clipboard-grab-serial",
-#endif
};
static const char *msg_name[] = {
@@ -125,9 +115,7 @@ static const char *type_name[] = {
[VD_AGENT_CLIPBOARD_IMAGE_BMP] = "bmp",
[VD_AGENT_CLIPBOARD_IMAGE_TIFF] = "tiff",
[VD_AGENT_CLIPBOARD_IMAGE_JPG] = "jpg",
-#if CHECK_SPICE_PROTOCOL_VERSION(0, 14, 3)
[VD_AGENT_CLIPBOARD_FILE_LIST] = "files",
-#endif
};
#define GET_NAME(_m, _v) \
@@ -197,9 +185,7 @@ static void vdagent_send_caps(VDAgentChardev *vd, bool request)
if (vd->clipboard) {
caps->caps[0] |= (1 << VD_AGENT_CAP_CLIPBOARD_BY_DEMAND);
caps->caps[0] |= (1 << VD_AGENT_CAP_CLIPBOARD_SELECTION);
-#if CHECK_SPICE_PROTOCOL_VERSION(0, 14, 1)
caps->caps[0] |= (1 << VD_AGENT_CAP_CLIPBOARD_GRAB_SERIAL);
-#endif
}
caps->request = request;
@@ -318,11 +304,7 @@ static bool have_selection(VDAgentChardev *vd)
static bool have_clipboard_serial(VDAgentChardev *vd)
{
-#if CHECK_SPICE_PROTOCOL_VERSION(0, 14, 1)
return vd->caps & (1 << VD_AGENT_CAP_CLIPBOARD_GRAB_SERIAL);
-#else
- return false;
-#endif
}
static uint32_t type_qemu_to_vdagent(enum QemuClipboardType type)
diff --git a/util/rcu.c b/util/rcu.c
index b703c86..acac944 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -43,10 +43,14 @@
#define RCU_GP_LOCKED (1UL << 0)
#define RCU_GP_CTR (1UL << 1)
+
+#define RCU_CALL_MIN_SIZE 30
+
unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
QemuEvent rcu_gp_event;
static int in_drain_call_rcu;
+static int rcu_call_count;
static QemuMutex rcu_registry_lock;
static QemuMutex rcu_sync_lock;
@@ -76,15 +80,29 @@ static void wait_for_readers(void)
{
ThreadList qsreaders = QLIST_HEAD_INITIALIZER(qsreaders);
struct rcu_reader_data *index, *tmp;
+ int sleeps = 0;
+ bool forced = false;
for (;;) {
- /* We want to be notified of changes made to rcu_gp_ongoing
- * while we walk the list.
+ /*
+ * Force the grace period to end and wait for it if any of the
+         * following heuristic conditions are satisfied:
+ * - A decent number of callbacks piled up.
+ * - It timed out.
+ * - It is in a drain_call_rcu() call.
+ *
+ * Otherwise, periodically poll the grace period, hoping it ends
+ * promptly.
*/
- qemu_event_reset(&rcu_gp_event);
+ if (!forced &&
+ (qatomic_read(&rcu_call_count) >= RCU_CALL_MIN_SIZE ||
+ sleeps >= 5 || qatomic_read(&in_drain_call_rcu))) {
+ forced = true;
- QLIST_FOREACH(index, &registry, node) {
- qatomic_set(&index->waiting, true);
+ QLIST_FOREACH(index, &registry, node) {
+ notifier_list_notify(&index->force_rcu, NULL);
+ qatomic_set(&index->waiting, true);
+ }
}
/* Here, order the stores to index->waiting before the loads of
@@ -106,8 +124,6 @@ static void wait_for_readers(void)
* get some extra futex wakeups.
*/
qatomic_set(&index->waiting, false);
- } else if (qatomic_read(&in_drain_call_rcu)) {
- notifier_list_notify(&index->force_rcu, NULL);
}
}
@@ -115,7 +131,8 @@ static void wait_for_readers(void)
break;
}
- /* Wait for one thread to report a quiescent state and try again.
+ /*
+ * Sleep for a while and try again.
* Release rcu_registry_lock, so rcu_(un)register_thread() doesn't
* wait too much time.
*
@@ -133,7 +150,20 @@ static void wait_for_readers(void)
* rcu_registry_lock is released.
*/
qemu_mutex_unlock(&rcu_registry_lock);
- qemu_event_wait(&rcu_gp_event);
+
+ if (forced) {
+ qemu_event_wait(&rcu_gp_event);
+
+ /*
+ * We want to be notified of changes made to rcu_gp_ongoing
+ * while we walk the list.
+ */
+ qemu_event_reset(&rcu_gp_event);
+ } else {
+ g_usleep(10000);
+ sleeps++;
+ }
+
qemu_mutex_lock(&rcu_registry_lock);
}
@@ -173,15 +203,11 @@ void synchronize_rcu(void)
}
}
-
-#define RCU_CALL_MIN_SIZE 30
-
/* Multi-producer, single-consumer queue based on urcu/static/wfqueue.h
* from liburcu. Note that head is only used by the consumer.
*/
static struct rcu_head dummy;
static struct rcu_head *head = &dummy, **tail = &dummy.next;
-static int rcu_call_count;
static QemuEvent rcu_call_ready_event;
static void enqueue(struct rcu_head *node)
@@ -259,30 +285,27 @@ static void *call_rcu_thread(void *opaque)
rcu_register_thread();
for (;;) {
- int tries = 0;
- int n = qatomic_read(&rcu_call_count);
+ int n;
- /* Heuristically wait for a decent number of callbacks to pile up.
+ /*
* Fetch rcu_call_count now, we only must process elements that were
* added before synchronize_rcu() starts.
*/
- while (n == 0 || (n < RCU_CALL_MIN_SIZE && ++tries <= 5)) {
- g_usleep(10000);
- if (n == 0) {
- qemu_event_reset(&rcu_call_ready_event);
- n = qatomic_read(&rcu_call_count);
- if (n == 0) {
+ for (;;) {
+ qemu_event_reset(&rcu_call_ready_event);
+ n = qatomic_read(&rcu_call_count);
+ if (n) {
+ break;
+ }
+
#if defined(CONFIG_MALLOC_TRIM)
- malloc_trim(4 * 1024 * 1024);
+ malloc_trim(4 * 1024 * 1024);
#endif
- qemu_event_wait(&rcu_call_ready_event);
- }
- }
- n = qatomic_read(&rcu_call_count);
+ qemu_event_wait(&rcu_call_ready_event);
}
- qatomic_sub(&rcu_call_count, n);
synchronize_rcu();
+ qatomic_sub(&rcu_call_count, n);
bql_lock();
while (n > 0) {
node = try_dequeue();