aboutsummaryrefslogtreecommitdiff
path: root/target/i386/kvm/kvm.c
diff options
context:
space:
mode:
Diffstat (limited to 'target/i386/kvm/kvm.c')
-rw-r--r--target/i386/kvm/kvm.c257
1 files changed, 172 insertions, 85 deletions
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 6c749d4..309f043 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -38,6 +38,7 @@
#include "kvm_i386.h"
#include "../confidential-guest.h"
#include "sev.h"
+#include "tdx.h"
#include "xen-emu.h"
#include "hyperv.h"
#include "hyperv-proto.h"
@@ -67,6 +68,7 @@
#include "hw/pci/msix.h"
#include "migration/blocker.h"
#include "exec/memattrs.h"
+#include "exec/target_page.h"
#include "trace.h"
#include CONFIG_DEVICES
@@ -191,6 +193,7 @@ static const char *vm_type_name[] = {
[KVM_X86_SEV_VM] = "SEV",
[KVM_X86_SEV_ES_VM] = "SEV-ES",
[KVM_X86_SNP_VM] = "SEV-SNP",
+ [KVM_X86_TDX_VM] = "TDX",
};
bool kvm_is_vm_type_supported(int type)
@@ -325,7 +328,7 @@ void kvm_synchronize_all_tsc(void)
{
CPUState *cpu;
- if (kvm_enabled()) {
+ if (kvm_enabled() && !is_tdx_vm()) {
CPU_FOREACH(cpu) {
run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
}
@@ -391,7 +394,7 @@ static bool host_tsx_broken(void)
/* Returns the value for a specific register on the cpuid entry
*/
-static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
+uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
uint32_t ret = 0;
switch (reg) {
@@ -413,9 +416,9 @@ static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
/* Find matching entry for function/index on kvm_cpuid2 struct
*/
-static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
- uint32_t function,
- uint32_t index)
+struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
+ uint32_t function,
+ uint32_t index)
{
int i;
for (i = 0; i < cpuid->nent; ++i) {
@@ -571,7 +574,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
}
if (current_machine->cgs) {
- ret = x86_confidential_guest_mask_cpuid_features(
+ ret = x86_confidential_guest_adjust_cpuid_features(
X86_CONFIDENTIAL_GUEST(current_machine->cgs),
function, index, reg, ret);
}
@@ -650,6 +653,23 @@ uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
must_be_one = (uint32_t)value;
can_be_one = (uint32_t)(value >> 32);
return can_be_one & ~must_be_one;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ /*
+ * Special handling for fb-clear bit in ARCH_CAPABILITIES MSR.
+ * KVM will only report the bit if it is enabled in the host,
+ * but, for live migration capability purposes, we want to
+ * expose the bit to the guest even if it is disabled in the
+ * host, as long as the host itself is not vulnerable to
+ * the issue that the fb-clear bit is meant to mitigate.
+ */
+ if ((value & MSR_ARCH_CAP_MDS_NO) &&
+ (value & MSR_ARCH_CAP_TAA_NO) &&
+ (value & MSR_ARCH_CAP_SBDR_SSDP_NO) &&
+ (value & MSR_ARCH_CAP_FBSDP_NO) &&
+ (value & MSR_ARCH_CAP_PSDP_NO)) {
+ value |= MSR_ARCH_CAP_FB_CLEAR;
+ }
+ return value;
default:
return value;
@@ -867,6 +887,15 @@ static int kvm_arch_set_tsc_khz(CPUState *cs)
int r, cur_freq;
bool set_ioctl = false;
+ /*
+ * TSC of TD vcpu is immutable, it cannot be set/changed via vcpu scope
+ * VM_SET_TSC_KHZ, but only be initialized via VM scope VM_SET_TSC_KHZ
+ * before ioctl KVM_TDX_INIT_VM in tdx_pre_create_vcpu()
+ */
+ if (is_tdx_vm()) {
+ return 0;
+ }
+
if (!env->tsc_khz) {
return 0;
}
@@ -1778,8 +1807,6 @@ static int hyperv_init_vcpu(X86CPU *cpu)
static Error *invtsc_mig_blocker;
-#define KVM_MAX_CPUID_ENTRIES 100
-
static void kvm_init_xsave(CPUX86State *env)
{
if (has_xsave2) {
@@ -1822,9 +1849,8 @@ static void kvm_init_nested_state(CPUX86State *env)
}
}
-static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
- struct kvm_cpuid_entry2 *entries,
- uint32_t cpuid_i)
+uint32_t kvm_x86_build_cpuid(CPUX86State *env, struct kvm_cpuid_entry2 *entries,
+ uint32_t cpuid_i)
{
uint32_t limit, i, j;
uint32_t unused;
@@ -1863,7 +1889,7 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
break;
}
case 0x1f:
- if (!x86_has_extended_topo(env->avail_cpu_topo)) {
+ if (!x86_has_cpuid_0x1f(env_archcpu(env))) {
cpuid_i--;
break;
}
@@ -2051,6 +2077,15 @@ full:
abort();
}
+int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp)
+{
+ if (is_tdx_vm()) {
+ return tdx_pre_create_vcpu(cpu, errp);
+ }
+
+ return 0;
+}
+
int kvm_arch_init_vcpu(CPUState *cs)
{
struct {
@@ -2075,6 +2110,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
int r;
Error *local_err = NULL;
+ if (current_machine->cgs) {
+ r = x86_confidential_guest_check_features(
+ X86_CONFIDENTIAL_GUEST(current_machine->cgs), cs);
+ if (r < 0) {
+ return r;
+ }
+ }
+
memset(&cpuid_data, 0, sizeof(cpuid_data));
cpuid_i = 0;
@@ -2233,7 +2276,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
cpuid_data.cpuid.nent = cpuid_i;
- if (((env->cpuid_version >> 8)&0xF) >= 6
+ if (x86_cpu_family(env->cpuid_version) >= 6
&& (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
(CPUID_MCE | CPUID_MCA)) {
uint64_t mcg_cap, unsupported_caps;
@@ -2674,6 +2717,7 @@ static MemoryRegion smram_as_mem;
static void register_smram_listener(Notifier *n, void *unused)
{
+ CPUState *cpu;
MemoryRegion *smram =
(MemoryRegion *) object_resolve_path("/machine/smram", NULL);
@@ -2697,7 +2741,11 @@ static void register_smram_listener(Notifier *n, void *unused)
address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
kvm_memory_listener_register(kvm_state, &smram_listener,
- &smram_address_space, 1, "kvm-smram");
+ &smram_address_space, X86ASIdx_SMM, "kvm-smram");
+
+ CPU_FOREACH(cpu) {
+ cpu_address_space_init(cpu, X86ASIdx_SMM, "cpu-smm", &smram_as_root);
+ }
}
static void *kvm_msr_energy_thread(void *data)
@@ -3205,16 +3253,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
Error *local_err = NULL;
/*
- * Initialize SEV context, if required
- *
- * If no memory encryption is requested (ms->cgs == NULL) this is
- * a no-op.
- *
- * It's also a no-op if a non-SEV confidential guest support
- * mechanism is selected. SEV is the only mechanism available to
- * select on x86 at present, so this doesn't arise, but if new
- * mechanisms are supported in future (e.g. TDX), they'll need
- * their own initialization either here or elsewhere.
+ * Initialize confidential guest (SEV/TDX) context, if required
*/
if (ms->cgs) {
ret = confidential_guest_kvm_init(ms->cgs, &local_err);
@@ -3289,8 +3328,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
return ret;
}
- if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
- object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
+ if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
smram_machine_done.notify = register_smram_listener;
qemu_add_machine_init_done_notifier(&smram_machine_done);
@@ -3855,36 +3893,38 @@ static void kvm_init_msrs(X86CPU *cpu)
CPUX86State *env = &cpu->env;
kvm_msr_buf_reset(cpu);
- if (has_msr_arch_capabs) {
- kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
- env->features[FEAT_ARCH_CAPABILITIES]);
- }
- if (has_msr_core_capabs) {
- kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
- env->features[FEAT_CORE_CAPABILITY]);
- }
+ if (!is_tdx_vm()) {
+ if (has_msr_arch_capabs) {
+ kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
+ env->features[FEAT_ARCH_CAPABILITIES]);
+ }
+
+ if (has_msr_core_capabs) {
+ kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
+ env->features[FEAT_CORE_CAPABILITY]);
+ }
+
+ if (has_msr_perf_capabs && cpu->enable_pmu) {
+ kvm_msr_entry_add_perf(cpu, env->features);
+ }
- if (has_msr_perf_capabs && cpu->enable_pmu) {
- kvm_msr_entry_add_perf(cpu, env->features);
+ /*
+ * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
+ * all kernels with MSR features should have them.
+ */
+ if (kvm_feature_msrs && cpu_has_vmx(env)) {
+ kvm_msr_entry_add_vmx(cpu, env->features);
+ }
}
if (has_msr_ucode_rev) {
kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
}
-
- /*
- * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
- * all kernels with MSR features should have them.
- */
- if (kvm_feature_msrs && cpu_has_vmx(env)) {
- kvm_msr_entry_add_vmx(cpu, env->features);
- }
-
assert(kvm_buf_set_msrs(cpu) == 0);
}
-static int kvm_put_msrs(X86CPU *cpu, int level)
+static int kvm_put_msrs(X86CPU *cpu, KvmPutState level)
{
CPUX86State *env = &cpu->env;
int i;
@@ -5004,7 +5044,7 @@ static int kvm_get_apic(X86CPU *cpu)
return 0;
}
-static int kvm_put_vcpu_events(X86CPU *cpu, int level)
+static int kvm_put_vcpu_events(X86CPU *cpu, KvmPutState level)
{
CPUState *cs = CPU(cpu);
CPUX86State *env = &cpu->env;
@@ -5043,7 +5083,7 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level)
*/
events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
- cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
+ cpu_reset_interrupt(cs, CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
} else {
/* Keep these in cs->interrupt_request. */
events.smi.pending = 0;
@@ -5247,7 +5287,7 @@ static int kvm_get_nested_state(X86CPU *cpu)
return ret;
}
-int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp)
+int kvm_arch_put_registers(CPUState *cpu, KvmPutState level, Error **errp)
{
X86CPU *x86_cpu = X86_CPU(cpu);
int ret;
@@ -5430,10 +5470,10 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
int ret;
/* Inject NMI */
- if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
- if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
+ if (cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
+ if (cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) {
bql_lock();
- cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
+ cpu_reset_interrupt(cpu, CPU_INTERRUPT_NMI);
bql_unlock();
DPRINTF("injected NMI\n");
ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
@@ -5442,9 +5482,9 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
strerror(-ret));
}
}
- if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
+ if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SMI)) {
bql_lock();
- cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
+ cpu_reset_interrupt(cpu, CPU_INTERRUPT_SMI);
bql_unlock();
DPRINTF("injected SMI\n");
ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
@@ -5455,32 +5495,31 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
}
}
- if (!kvm_pic_in_kernel()) {
- bql_lock();
- }
/* Force the VCPU out of its inner loop to process any INIT requests
* or (for userspace APIC, but it is cheap to combine the checks here)
* pending TPR access reports.
*/
- if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
- if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
+ if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
+ if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT) &&
!(env->hflags & HF_SMM_MASK)) {
- cpu->exit_request = 1;
+ qatomic_set(&cpu->exit_request, true);
}
- if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
- cpu->exit_request = 1;
+ if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) {
+ qatomic_set(&cpu->exit_request, true);
}
}
if (!kvm_pic_in_kernel()) {
/* Try to inject an interrupt if the guest can accept it */
if (run->ready_for_interrupt_injection &&
- (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
+ cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) &&
(env->eflags & IF_MASK)) {
int irq;
- cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
+ bql_lock();
+
+ cpu_reset_interrupt(cpu, CPU_INTERRUPT_HARD);
irq = cpu_get_pic_interrupt(env);
if (irq >= 0) {
struct kvm_interrupt intr;
@@ -5494,13 +5533,14 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
strerror(-ret));
}
}
+ bql_unlock();
}
/* If we have an interrupt but the guest is not ready to receive an
* interrupt, request an interrupt window exit. This will
* cause a return to userspace as soon as the guest is ready to
* receive interrupts. */
- if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
+ if (cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) {
run->request_interrupt_window = 1;
} else {
run->request_interrupt_window = 0;
@@ -5508,8 +5548,6 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
DPRINTF("setting tpr\n");
run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
-
- bql_unlock();
}
}
@@ -5572,18 +5610,18 @@ int kvm_arch_process_async_events(CPUState *cs)
X86CPU *cpu = X86_CPU(cs);
CPUX86State *env = &cpu->env;
- if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
+ if (cpu_test_interrupt(cs, CPU_INTERRUPT_MCE)) {
/* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
assert(env->mcg_cap);
- cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
+ cpu_reset_interrupt(cs, CPU_INTERRUPT_MCE);
kvm_cpu_synchronize_state(cs);
if (env->exception_nr == EXCP08_DBLE) {
/* this means triple fault */
qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
- cs->exit_request = 1;
+ qatomic_set(&cs->exit_request, true);
return 0;
}
kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
@@ -5595,7 +5633,7 @@ int kvm_arch_process_async_events(CPUState *cs)
}
}
- if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
+ if (cpu_test_interrupt(cs, CPU_INTERRUPT_INIT) &&
!(env->hflags & HF_SMM_MASK)) {
kvm_cpu_synchronize_state(cs);
do_cpu_init(cpu);
@@ -5605,21 +5643,21 @@ int kvm_arch_process_async_events(CPUState *cs)
return 0;
}
- if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
- cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
+ if (cpu_test_interrupt(cs, CPU_INTERRUPT_POLL)) {
+ cpu_reset_interrupt(cs, CPU_INTERRUPT_POLL);
apic_poll_irq(cpu->apic_state);
}
- if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
+ if ((cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) &&
(env->eflags & IF_MASK)) ||
- (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
+ cpu_test_interrupt(cs, CPU_INTERRUPT_NMI)) {
cs->halted = 0;
}
- if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
+ if (cpu_test_interrupt(cs, CPU_INTERRUPT_SIPI)) {
kvm_cpu_synchronize_state(cs);
do_cpu_sipi(cpu);
}
- if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
- cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
+ if (cpu_test_interrupt(cs, CPU_INTERRUPT_TPR)) {
+ cpu_reset_interrupt(cs, CPU_INTERRUPT_TPR);
kvm_cpu_synchronize_state(cs);
apic_handle_tpr_access_report(cpu->apic_state, env->eip,
env->tpr_access_type);
@@ -5633,9 +5671,9 @@ static int kvm_handle_halt(X86CPU *cpu)
CPUState *cs = CPU(cpu);
CPUX86State *env = &cpu->env;
- if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
+ if (!(cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) &&
(env->eflags & IF_MASK)) &&
- !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
+ !cpu_test_interrupt(cs, CPU_INTERRUPT_NMI)) {
cs->halted = 1;
return EXCP_HLT;
}
@@ -5999,9 +6037,11 @@ static bool host_supports_vmx(void)
* because private/shared page tracking is already provided through other
* means, these 2 use-cases should be treated as being mutually-exclusive.
*/
-static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
+static int kvm_handle_hc_map_gpa_range(X86CPU *cpu, struct kvm_run *run)
{
+ struct kvm_pre_fault_memory mem;
uint64_t gpa, size, attributes;
+ int ret;
if (!machine_require_guest_memfd(current_machine))
return -EINVAL;
@@ -6012,13 +6052,32 @@ static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);
- return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
+ ret = kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
+ if (ret || !kvm_pre_fault_memory_supported) {
+ return ret;
+ }
+
+ /*
+ * Opportunistically pre-fault memory in. Failures are ignored so that any
+ * errors in faulting in the memory will get captured in KVM page fault
+ * path when the guest first accesses the page.
+ */
+ memset(&mem, 0, sizeof(mem));
+ mem.gpa = gpa;
+ mem.size = size;
+ while (mem.size) {
+ if (kvm_vcpu_ioctl(CPU(cpu), KVM_PRE_FAULT_MEMORY, &mem)) {
+ break;
+ }
+ }
+
+ return 0;
}
-static int kvm_handle_hypercall(struct kvm_run *run)
+static int kvm_handle_hypercall(X86CPU *cpu, struct kvm_run *run)
{
if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
- return kvm_handle_hc_map_gpa_range(run);
+ return kvm_handle_hc_map_gpa_range(cpu, run);
return -EINVAL;
}
@@ -6118,7 +6177,35 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
break;
#endif
case KVM_EXIT_HYPERCALL:
- ret = kvm_handle_hypercall(run);
+ ret = kvm_handle_hypercall(cpu, run);
+ break;
+ case KVM_EXIT_SYSTEM_EVENT:
+ switch (run->system_event.type) {
+ case KVM_SYSTEM_EVENT_TDX_FATAL:
+ ret = tdx_handle_report_fatal_error(cpu, run);
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+ break;
+ case KVM_EXIT_TDX:
+ /*
+ * run->tdx is already set up for the case where userspace
+ * does not handle the TDVMCALL.
+ */
+ switch (run->tdx.nr) {
+ case TDVMCALL_GET_QUOTE:
+ tdx_handle_get_quote(cpu, run);
+ break;
+ case TDVMCALL_GET_TD_VM_CALL_INFO:
+ tdx_handle_get_tdvmcall_info(cpu, run);
+ break;
+ case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
+ tdx_handle_setup_event_notify_interrupt(cpu, run);
+ break;
+ }
+ ret = 0;
break;
default:
fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);