Diffstat (limited to 'target/i386/kvm/kvm.c')
-rw-r--r-- | target/i386/kvm/kvm.c | 1045 |
1 file changed, 845 insertions, 200 deletions
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index dd8b0f3..234878c 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -16,9 +16,12 @@ #include "qapi/qapi-events-run-state.h" #include "qapi/error.h" #include "qapi/visitor.h" +#include <math.h> #include <sys/ioctl.h> #include <sys/utsname.h> #include <sys/syscall.h> +#include <sys/resource.h> +#include <sys/time.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -27,13 +30,15 @@ #include "cpu.h" #include "host-cpu.h" -#include "sysemu/sysemu.h" -#include "sysemu/hw_accel.h" -#include "sysemu/kvm_int.h" -#include "sysemu/runstate.h" +#include "vmsr_energy.h" +#include "system/system.h" +#include "system/hw_accel.h" +#include "system/kvm_int.h" +#include "system/runstate.h" #include "kvm_i386.h" #include "../confidential-guest.h" #include "sev.h" +#include "tdx.h" #include "xen-emu.h" #include "hyperv.h" #include "hyperv-proto.h" @@ -63,6 +68,7 @@ #include "hw/pci/msix.h" #include "migration/blocker.h" #include "exec/memattrs.h" +#include "exec/target_page.h" #include "trace.h" #include CONFIG_DEVICES @@ -77,18 +83,35 @@ do { } while (0) #endif +/* + * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly. + * In order to use vm86 mode, an EPT identity map and a TSS are needed. + * Since these must be part of guest physical memory, we need to allocate + * them, both by setting their start addresses in the kernel and by + * creating a corresponding e820 entry. We need 4 pages before the BIOS, + * so this value allows up to 16M BIOSes. + */ +#define KVM_IDENTITY_BASE 0xfeffc000 + /* From arch/x86/kvm/lapic.h */ #define KVM_APIC_BUS_CYCLE_NS 1 #define KVM_APIC_BUS_FREQUENCY (1000000000ULL / KVM_APIC_BUS_CYCLE_NS) -#define MSR_KVM_WALL_CLOCK 0x11 -#define MSR_KVM_SYSTEM_TIME 0x12 - /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus * 255 kvm_msr_entry structs */ #define MSR_BUF_SIZE 4096 +typedef bool QEMURDMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t *val); +typedef bool QEMUWRMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t val); +typedef struct { + uint32_t msr; + QEMURDMSRHandler *rdmsr; + QEMUWRMSRHandler *wrmsr; +} KVMMSRHandlers; + static void kvm_init_msrs(X86CPU *cpu); +static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, + QEMUWRMSRHandler *wrmsr); const KVMCapabilityInfo kvm_arch_required_capabilities[] = { KVM_CAP_INFO(SET_TSS_ADDR), @@ -141,6 +164,7 @@ static bool has_msr_ucode_rev; static bool has_msr_vmx_procbased_ctls2; static bool has_msr_perf_capabs; static bool has_msr_pkrs; +static bool has_msr_hwcr; static uint32_t has_architectural_pmu_version; static uint32_t num_architectural_pmu_gp_counters; @@ -169,6 +193,7 @@ static const char *vm_type_name[] = { [KVM_X86_SEV_VM] = "SEV", [KVM_X86_SEV_ES_VM] = "SEV-ES", [KVM_X86_SNP_VM] = "SEV-SNP", + [KVM_X86_TDX_VM] = "TDX", }; bool kvm_is_vm_type_supported(int type) @@ -303,7 +328,7 @@ void kvm_synchronize_all_tsc(void) { CPUState *cpu; - if (kvm_enabled()) { + if (kvm_enabled() && !is_tdx_vm()) { CPU_FOREACH(cpu) { run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL); } @@ -369,7 +394,7 @@ static bool host_tsx_broken(void) /* Returns the value for a specific register on the cpuid entry */ -static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg) +uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg) { uint32_t ret = 0; switch (reg) { @@ -391,9 +416,9 @@ static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg) /* Find matching entry 
for function/index on kvm_cpuid2 struct */ -static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid, - uint32_t function, - uint32_t index) +struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid, + uint32_t function, + uint32_t index) { int i; for (i = 0; i < cpuid->nent; ++i) { @@ -539,15 +564,20 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, * be enabled without the in-kernel irqchip */ if (!kvm_irqchip_in_kernel()) { - ret &= ~(1U << KVM_FEATURE_PV_UNHALT); + ret &= ~CPUID_KVM_PV_UNHALT; } if (kvm_irqchip_is_split()) { - ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID; + ret |= CPUID_KVM_MSI_EXT_DEST_ID; } } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) { - ret |= 1U << KVM_HINTS_REALTIME; + ret |= CPUID_KVM_HINTS_REALTIME; } + if (current_machine->cgs) { + ret = x86_confidential_guest_adjust_cpuid_features( + X86_CONFIDENTIAL_GUEST(current_machine->cgs), + function, index, reg, ret); + } return ret; } @@ -840,6 +870,15 @@ static int kvm_arch_set_tsc_khz(CPUState *cs) int r, cur_freq; bool set_ioctl = false; + /* + * TSC of TD vcpu is immutable, it cannot be set/changed via vcpu scope + * VM_SET_TSC_KHZ, but only be initialized via VM scope VM_SET_TSC_KHZ + * before ioctl KVM_TDX_INIT_VM in tdx_pre_create_vcpu() + */ + if (is_tdx_vm()) { + return 0; + } + if (!env->tsc_khz) { return 0; } @@ -904,6 +943,7 @@ static struct { uint32_t bits; } flags[2]; uint64_t dependencies; + bool skip_passthrough; } kvm_hyperv_properties[] = { [HYPERV_FEAT_RELAXED] = { .desc = "relaxed timing (hv-relaxed)", @@ -1026,16 +1066,15 @@ static struct { .bits = HV_DEPRECATING_AEOI_RECOMMENDED} } }, -#ifdef CONFIG_SYNDBG [HYPERV_FEAT_SYNDBG] = { .desc = "Enable synthetic kernel debugger channel (hv-syndbg)", .flags = { {.func = HV_CPUID_FEATURES, .reg = R_EDX, .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE} }, - .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED) + .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED), + .skip_passthrough = true, }, -#endif [HYPERV_FEAT_MSR_BITMAP] = { .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)", .flags = { @@ -1287,6 +1326,13 @@ static bool hyperv_feature_supported(CPUState *cs, int feature) uint32_t func, bits; int i, reg; + /* + * kvm_hyperv_properties needs to define at least one CPUID flag which + * must be used to detect the feature, it's hard to say whether it is + * supported or not otherwise. + */ + assert(kvm_hyperv_properties[feature].flags[0].func); + for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) { func = kvm_hyperv_properties[feature].flags[i].func; @@ -1436,7 +1482,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp) * hv_build_cpuid_leaf() uses this info to build guest CPUIDs. 
*/ for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) { - if (hyperv_feature_supported(cs, feat)) { + if (hyperv_feature_supported(cs, feat) && + !kvm_hyperv_properties[feat].skip_passthrough) { cpu->hyperv_features |= BIT(feat); } } @@ -1743,8 +1790,6 @@ static int hyperv_init_vcpu(X86CPU *cpu) static Error *invtsc_mig_blocker; -#define KVM_MAX_CPUID_ENTRIES 100 - static void kvm_init_xsave(CPUX86State *env) { if (has_xsave2) { @@ -1787,9 +1832,8 @@ static void kvm_init_nested_state(CPUX86State *env) } } -static uint32_t kvm_x86_build_cpuid(CPUX86State *env, - struct kvm_cpuid_entry2 *entries, - uint32_t cpuid_i) +uint32_t kvm_x86_build_cpuid(CPUX86State *env, struct kvm_cpuid_entry2 *entries, + uint32_t cpuid_i) { uint32_t limit, i, j; uint32_t unused; @@ -1809,10 +1853,12 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, int times; c->function = i; - c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | - KVM_CPUID_FLAG_STATE_READ_NEXT; cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); times = c->eax & 0xff; + if (times > 1) { + c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | + KVM_CPUID_FLAG_STATE_READ_NEXT; + } for (j = 1; j < times; ++j) { if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { @@ -1826,7 +1872,7 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, break; } case 0x1f: - if (!x86_has_extended_topo(env->avail_cpu_topo)) { + if (!x86_has_cpuid_0x1f(env_archcpu(env))) { cpuid_i--; break; } @@ -1835,10 +1881,6 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, case 0xb: case 0xd: for (j = 0; ; j++) { - if (i == 0xd && j == 64) { - break; - } - c->function = i; c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; c->index = j; @@ -1854,7 +1896,12 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, break; } if (i == 0xd && c->eax == 0) { - continue; + if (j < 63) { + continue; + } else { + cpuid_i--; + break; + } } if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { goto full; @@ -1882,7 +1929,8 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env, case 0x7: case 0x14: case 0x1d: - case 0x1e: { + case 0x1e: + case 0x24: { uint32_t times; c->function = i; @@ -2012,6 +2060,15 @@ full: abort(); } +int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp) +{ + if (is_tdx_vm()) { + return tdx_pre_create_vcpu(cpu, errp); + } + + return 0; +} + int kvm_arch_init_vcpu(CPUState *cs) { struct { @@ -2036,6 +2093,14 @@ int kvm_arch_init_vcpu(CPUState *cs) int r; Error *local_err = NULL; + if (current_machine->cgs) { + r = x86_confidential_guest_check_features( + X86_CONFIDENTIAL_GUEST(current_machine->cgs), cs); + if (r < 0) { + return r; + } + } + memset(&cpuid_data, 0, sizeof(cpuid_data)); cpuid_i = 0; @@ -2373,6 +2438,21 @@ void kvm_arch_after_reset_vcpu(X86CPU *cpu) } } +void kvm_arch_reset_parked_vcpu(unsigned long vcpu_id, int kvm_fd) +{ + g_autofree struct kvm_msrs *msrs = NULL; + + msrs = g_malloc0(sizeof(*msrs) + sizeof(msrs->entries[0])); + msrs->entries[0].index = MSR_IA32_TSC; + msrs->entries[0].data = 1; /* match the value in x86_cpu_reset() */ + msrs->nmsrs++; + + if (ioctl(kvm_fd, KVM_SET_MSRS, msrs) != 1) { + warn_report("parked vCPU %lu TSC reset failed: %d", + vcpu_id, errno); + } +} + void kvm_arch_do_init_vcpu(X86CPU *cpu) { CPUX86State *env = &cpu->env; @@ -2545,6 +2625,8 @@ static int kvm_get_supported_msrs(KVMState *s) case MSR_IA32_PKRS: has_msr_pkrs = true; break; + case MSR_K7_HWCR: + has_msr_hwcr = true; } } } @@ -2554,13 +2636,58 @@ static int kvm_get_supported_msrs(KVMState *s) return ret; } -static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr, 
+static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, + uint32_t msr, uint64_t *val) { + *val = cpu_x86_get_msr_core_thread_count(cpu); + + return true; +} + +static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + CPUState *cs = CPU(cpu); - *val = cs->nr_threads * cs->nr_cores; /* thread count, bits 15..0 */ - *val |= ((uint32_t)cs->nr_cores << 16); /* core count, bits 31..16 */ + *val = cs->kvm_state->msr_energy.msr_unit; + + return true; +} + +static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + + CPUState *cs = CPU(cpu); + + *val = cs->kvm_state->msr_energy.msr_limit; + + return true; +} + +static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + + CPUState *cs = CPU(cpu); + + *val = cs->kvm_state->msr_energy.msr_info; + + return true; +} + +static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu, + uint32_t msr, + uint64_t *val) +{ + + CPUState *cs = CPU(cpu); + *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index]; return true; } @@ -2599,63 +2726,535 @@ static void register_smram_listener(Notifier *n, void *unused) &smram_address_space, 1, "kvm-smram"); } -int kvm_arch_get_default_type(MachineState *ms) +static void *kvm_msr_energy_thread(void *data) { - return 0; + KVMState *s = data; + struct KVMMsrEnergy *vmsr = &s->msr_energy; + + g_autofree vmsr_package_energy_stat *pkg_stat = NULL; + g_autofree vmsr_thread_stat *thd_stat = NULL; + g_autofree CPUState *cpu = NULL; + g_autofree unsigned int *vpkgs_energy_stat = NULL; + unsigned int num_threads = 0; + + X86CPUTopoIDs topo_ids; + + rcu_register_thread(); + + /* Allocate memory for each package energy status */ + pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs); + + /* Allocate memory for thread stats */ + thd_stat = g_new0(vmsr_thread_stat, 1); + + /* Allocate memory for holding virtual package energy counter */ + vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets); + + /* Populate the max tick of each packages */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + /* + * Max numbers of ticks per package + * Time in second * Number of ticks/second * Number of cores/package + * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max + */ + vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000) + * sysconf(_SC_CLK_TCK) + * vmsr->host_topo.pkg_cpu_count[i]; + } + + while (true) { + /* Get all qemu threads id */ + g_autofree pid_t *thread_ids + = vmsr_get_thread_ids(vmsr->pid, &num_threads); + + if (thread_ids == NULL) { + goto clean; + } + + thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads); + /* Unlike g_new0, g_renew0 function doesn't exist yet... 
*/ + memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat)); + + /* Populate all the thread stats */ + for (int i = 0; i < num_threads; i++) { + thd_stat[i].utime = g_new0(unsigned long long, 2); + thd_stat[i].stime = g_new0(unsigned long long, 2); + thd_stat[i].thread_id = thread_ids[i]; + vmsr_read_thread_stat(vmsr->pid, + thd_stat[i].thread_id, + &thd_stat[i].utime[0], + &thd_stat[i].stime[0], + &thd_stat[i].cpu_id); + thd_stat[i].pkg_id = + vmsr_get_physical_package_id(thd_stat[i].cpu_id); + } + + /* Retrieve all packages power plane energy counter */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + for (int j = 0; j < num_threads; j++) { + /* + * Use the first thread we found that ran on the CPU + * of the package to read the packages energy counter + */ + if (thd_stat[j].pkg_id == i) { + pkg_stat[i].e_start = + vmsr_read_msr(MSR_PKG_ENERGY_STATUS, + thd_stat[j].cpu_id, + thd_stat[j].thread_id, + s->msr_energy.sioc); + break; + } + } + } + + /* Sleep a short period while the other threads are working */ + usleep(MSR_ENERGY_THREAD_SLEEP_US); + + /* + * Retrieve all packages power plane energy counter + * Calculate the delta of all packages + */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + for (int j = 0; j < num_threads; j++) { + /* + * Use the first thread we found that ran on the CPU + * of the package to read the packages energy counter + */ + if (thd_stat[j].pkg_id == i) { + pkg_stat[i].e_end = + vmsr_read_msr(MSR_PKG_ENERGY_STATUS, + thd_stat[j].cpu_id, + thd_stat[j].thread_id, + s->msr_energy.sioc); + /* + * Prevent the case we have migrate the VM + * during the sleep period or any other cases + * were energy counter might be lower after + * the sleep period. + */ + if (pkg_stat[i].e_end > pkg_stat[i].e_start) { + pkg_stat[i].e_delta = + pkg_stat[i].e_end - pkg_stat[i].e_start; + } else { + pkg_stat[i].e_delta = 0; + } + break; + } + } + } + + /* Delta of ticks spend by each thread between the sample */ + for (int i = 0; i < num_threads; i++) { + vmsr_read_thread_stat(vmsr->pid, + thd_stat[i].thread_id, + &thd_stat[i].utime[1], + &thd_stat[i].stime[1], + &thd_stat[i].cpu_id); + + if (vmsr->pid < 0) { + /* + * We don't count the dead thread + * i.e threads that existed before the sleep + * and not anymore + */ + thd_stat[i].delta_ticks = 0; + } else { + vmsr_delta_ticks(thd_stat, i); + } + } + + /* + * Identify the vcpu threads + * Calculate the number of vcpu per package + */ + CPU_FOREACH(cpu) { + for (int i = 0; i < num_threads; i++) { + if (cpu->thread_id == thd_stat[i].thread_id) { + thd_stat[i].is_vcpu = true; + thd_stat[i].vcpu_id = cpu->cpu_index; + pkg_stat[thd_stat[i].pkg_id].nb_vcpu++; + thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu); + break; + } + } + } + + /* Retrieve the virtual package number of each vCPU */ + for (int i = 0; i < vmsr->guest_cpu_list->len; i++) { + for (int j = 0; j < num_threads; j++) { + if ((thd_stat[j].acpi_id == + vmsr->guest_cpu_list->cpus[i].arch_id) + && (thd_stat[j].is_vcpu == true)) { + x86_topo_ids_from_apicid(thd_stat[j].acpi_id, + &vmsr->guest_topo_info, &topo_ids); + thd_stat[j].vpkg_id = topo_ids.pkg_id; + } + } + } + + /* Calculate the total energy of all non-vCPU thread */ + for (int i = 0; i < num_threads; i++) { + if ((thd_stat[i].is_vcpu != true) && + (thd_stat[i].delta_ticks > 0)) { + double temp; + temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta, + thd_stat[i].delta_ticks, + vmsr->host_topo.maxticks[thd_stat[i].pkg_id]); + pkg_stat[thd_stat[i].pkg_id].e_ratio + += (uint64_t)lround(temp); + } + } 
+ + /* Calculate the ratio per non-vCPU thread of each package */ + for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { + if (pkg_stat[i].nb_vcpu > 0) { + pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu; + } + } + + /* + * Calculate the energy for each Package: + * Energy Package = sum of each vCPU energy that belongs to the package + */ + for (int i = 0; i < num_threads; i++) { + if ((thd_stat[i].is_vcpu == true) && \ + (thd_stat[i].delta_ticks > 0)) { + double temp; + temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta, + thd_stat[i].delta_ticks, + vmsr->host_topo.maxticks[thd_stat[i].pkg_id]); + vpkgs_energy_stat[thd_stat[i].vpkg_id] += + (uint64_t)lround(temp); + vpkgs_energy_stat[thd_stat[i].vpkg_id] += + pkg_stat[thd_stat[i].pkg_id].e_ratio; + } + } + + /* + * Finally populate the vmsr register of each vCPU with the total + * package value to emulate the real hardware where each CPU return the + * value of the package it belongs. + */ + for (int i = 0; i < num_threads; i++) { + if ((thd_stat[i].is_vcpu == true) && \ + (thd_stat[i].delta_ticks > 0)) { + vmsr->msr_value[thd_stat[i].vcpu_id] = \ + vpkgs_energy_stat[thd_stat[i].vpkg_id]; + } + } + + /* Freeing memory before zeroing the pointer */ + for (int i = 0; i < num_threads; i++) { + g_free(thd_stat[i].utime); + g_free(thd_stat[i].stime); + } + } + +clean: + rcu_unregister_thread(); + return NULL; } -int kvm_arch_init(MachineState *ms, KVMState *s) +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms) { - uint64_t identity_base = 0xfffbc000; - uint64_t shadow_mem; - int ret; - struct utsname utsname; - Error *local_err = NULL; + MachineClass *mc = MACHINE_GET_CLASS(ms); + struct KVMMsrEnergy *r = &s->msr_energy; /* - * Initialize SEV context, if required - * - * If no memory encryption is requested (ms->cgs == NULL) this is - * a no-op. - * - * It's also a no-op if a non-SEV confidential guest support - * mechanism is selected. SEV is the only mechanism available to - * select on x86 at present, so this doesn't arise, but if new - * mechanisms are supported in future (e.g. TDX), they'll need - * their own initialization either here or elsewhere. + * Sanity check + * 1. Host cpu must be Intel cpu + * 2. 
RAPL must be enabled on the Host */ - if (ms->cgs) { - ret = confidential_guest_kvm_init(ms->cgs, &local_err); - if (ret < 0) { - error_report_err(local_err); - return ret; + if (!is_host_cpu_intel()) { + error_report("The RAPL feature can only be enabled on hosts " + "with Intel CPU models"); + return -1; + } + + if (!is_rapl_enabled()) { + return -1; + } + + /* Retrieve the virtual topology */ + vmsr_init_topo_info(&r->guest_topo_info, ms); + + /* Retrieve the number of vcpu */ + r->guest_vcpus = ms->smp.cpus; + + /* Retrieve the number of virtual sockets */ + r->guest_vsockets = ms->smp.sockets; + + /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */ + r->msr_value = g_new0(uint64_t, r->guest_vcpus); + + /* Retrieve the CPUArchIDlist */ + r->guest_cpu_list = mc->possible_cpu_arch_ids(ms); + + /* Max number of cpus on the Host */ + r->host_topo.maxcpus = vmsr_get_maxcpus(); + if (r->host_topo.maxcpus == 0) { + error_report("host max cpus = 0"); + return -1; + } + + /* Max number of packages on the host */ + r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus); + if (r->host_topo.maxpkgs == 0) { + error_report("host max pkgs = 0"); + return -1; + } + + /* Allocate memory for each package on the host */ + r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs); + r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs); + + vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count, + r->host_topo.maxpkgs); + for (int i = 0; i < r->host_topo.maxpkgs; i++) { + if (r->host_topo.pkg_cpu_count[i] == 0) { + error_report("cpu per packages = 0 on package_%d", i); + return -1; } } - has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS); - has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0; + /* Get QEMU PID*/ + r->pid = getpid(); - hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX); + /* Compute the socket path if necessary */ + if (s->msr_energy.socket_path == NULL) { + s->msr_energy.socket_path = vmsr_compute_default_paths(); + } + + /* Open socket with vmsr helper */ + s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path); + + if (s->msr_energy.sioc == NULL) { + error_report("vmsr socket opening failed"); + return -1; + } + + /* Those MSR values should not change */ + r->msr_unit = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid, + s->msr_energy.sioc); + r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid, + s->msr_energy.sioc); + r->msr_info = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid, + s->msr_energy.sioc); + if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) { + error_report("can't read any virtual msr"); + return -1; + } + qemu_thread_create(&r->msr_thr, "kvm-msr", + kvm_msr_energy_thread, + s, QEMU_THREAD_JOINABLE); + return 0; +} + +int kvm_arch_get_default_type(MachineState *ms) +{ + return 0; +} + +static int kvm_vm_enable_exception_payload(KVMState *s) +{ + int ret = 0; has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD); if (has_exception_payload) { ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true); if (ret < 0) { error_report("kvm: Failed to enable exception payload cap: %s", strerror(-ret)); - return ret; } } - has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT); + return ret; +} + +static int kvm_vm_enable_triple_fault_event(KVMState *s) +{ + int ret = 0; + has_triple_fault_event = \ + kvm_check_extension(s, + KVM_CAP_X86_TRIPLE_FAULT_EVENT); if (has_triple_fault_event) { ret = kvm_vm_enable_cap(s, 
KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true); if (ret < 0) { error_report("kvm: Failed to enable triple fault event cap: %s", strerror(-ret)); + } + } + return ret; +} + +static int kvm_vm_set_identity_map_addr(KVMState *s, uint64_t identity_base) +{ + return kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base); +} + +static int kvm_vm_set_nr_mmu_pages(KVMState *s) +{ + uint64_t shadow_mem; + int ret = 0; + shadow_mem = object_property_get_int(OBJECT(s), + "kvm-shadow-mem", + &error_abort); + if (shadow_mem != -1) { + shadow_mem /= 4096; + ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem); + } + return ret; +} + +static int kvm_vm_set_tss_addr(KVMState *s, uint64_t tss_base) +{ + return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, tss_base); +} + +static int kvm_vm_enable_disable_exits(KVMState *s) +{ + int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS); + + if (disable_exits) { + disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT | + KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_CSTATE); + } + + return kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0, + disable_exits); +} + +static int kvm_vm_enable_bus_lock_exit(KVMState *s) +{ + int ret = 0; + ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT); + if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) { + error_report("kvm: bus lock detection unsupported"); + return -ENOTSUP; + } + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0, + KVM_BUS_LOCK_DETECTION_EXIT); + if (ret < 0) { + error_report("kvm: Failed to enable bus lock detection cap: %s", + strerror(-ret)); + } + + return ret; +} + +static int kvm_vm_enable_notify_vmexit(KVMState *s) +{ + int ret = 0; + if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE) { + uint64_t notify_window_flags = + ((uint64_t)s->notify_window << 32) | + KVM_X86_NOTIFY_VMEXIT_ENABLED | + KVM_X86_NOTIFY_VMEXIT_USER; + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0, + notify_window_flags); + if (ret < 0) { + error_report("kvm: Failed to enable notify vmexit cap: %s", + strerror(-ret)); + } + } + return ret; +} + +static int kvm_vm_enable_userspace_msr(KVMState *s) +{ + int ret; + + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0, + KVM_MSR_EXIT_REASON_FILTER); + if (ret < 0) { + error_report("Could not enable user space MSRs: %s", + strerror(-ret)); + exit(1); + } + + ret = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT, + kvm_rdmsr_core_thread_count, NULL); + if (ret < 0) { + error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s", + strerror(-ret)); + exit(1); + } + + return 0; +} + +static int kvm_vm_enable_energy_msrs(KVMState *s) +{ + int ret; + + if (s->msr_energy.enable == true) { + ret = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT, + kvm_rdmsr_rapl_power_unit, NULL); + if (ret < 0) { + error_report("Could not install MSR_RAPL_POWER_UNIT handler: %s", + strerror(-ret)); + return ret; + } + + ret = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT, + kvm_rdmsr_pkg_power_limit, NULL); + if (ret < 0) { + error_report("Could not install MSR_PKG_POWER_LIMIT handler: %s", + strerror(-ret)); + return ret; + } + + ret = kvm_filter_msr(s, MSR_PKG_POWER_INFO, + kvm_rdmsr_pkg_power_info, NULL); + if (ret < 0) { + error_report("Could not install MSR_PKG_POWER_INFO handler: %s", + strerror(-ret)); + return ret; + } + ret = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS, + kvm_rdmsr_pkg_energy_status, NULL); + if (ret < 0) { + error_report("Could not install MSR_PKG_ENERGY_STATUS handler: %s", + strerror(-ret)); return ret; } } + return 0; +} + +int 
kvm_arch_init(MachineState *ms, KVMState *s) +{ + int ret; + struct utsname utsname; + Error *local_err = NULL; + + /* + * Initialize confidential guest (SEV/TDX) context, if required + */ + if (ms->cgs) { + ret = confidential_guest_kvm_init(ms->cgs, &local_err); + if (ret < 0) { + error_report_err(local_err); + return ret; + } + } + + has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS); + has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0; + + hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX); + + ret = kvm_vm_enable_exception_payload(s); + if (ret < 0) { + return ret; + } + + ret = kvm_vm_enable_triple_fault_event(s); + if (ret < 0) { + return ret; + } if (s->xen_version) { #ifdef CONFIG_XEN_EMU @@ -2680,47 +3279,33 @@ int kvm_arch_init(MachineState *ms, KVMState *s) return ret; } - kvm_get_supported_feature_msrs(s); + ret = kvm_get_supported_feature_msrs(s); + if (ret < 0) { + return ret; + } uname(&utsname); lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0; - /* - * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly. - * In order to use vm86 mode, an EPT identity map and a TSS are needed. - * Since these must be part of guest physical memory, we need to allocate - * them, both by setting their start addresses in the kernel and by - * creating a corresponding e820 entry. We need 4 pages before the BIOS, - * so this value allows up to 16M BIOSes. - */ - identity_base = 0xfeffc000; - ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base); + ret = kvm_vm_set_identity_map_addr(s, KVM_IDENTITY_BASE); if (ret < 0) { return ret; } /* Set TSS base one page after EPT identity map. */ - ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000); + ret = kvm_vm_set_tss_addr(s, KVM_IDENTITY_BASE + 0x1000); if (ret < 0) { return ret; } /* Tell fw_cfg to notify the BIOS to reserve the range. */ - ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED); + e820_add_entry(KVM_IDENTITY_BASE, 0x4000, E820_RESERVED); + + ret = kvm_vm_set_nr_mmu_pages(s); if (ret < 0) { - fprintf(stderr, "e820_add_entry() table is full\n"); return ret; } - shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort); - if (shadow_mem != -1) { - shadow_mem /= 4096; - ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem); - if (ret < 0) { - return ret; - } - } - if (kvm_check_extension(s, KVM_CAP_X86_SMM) && object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) && x86_machine_is_smm_enabled(X86_MACHINE(ms))) { @@ -2729,23 +3314,11 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } if (enable_cpu_pm) { - int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS); -/* Work around for kernel header with a typo. TODO: fix header and drop. 
*/ -#if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT) -#define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL -#endif - if (disable_exits) { - disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT | - KVM_X86_DISABLE_EXITS_HLT | - KVM_X86_DISABLE_EXITS_PAUSE | - KVM_X86_DISABLE_EXITS_CSTATE); - } - - ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0, - disable_exits); + ret = kvm_vm_enable_disable_exits(s); if (ret < 0) { error_report("kvm: guest stopping CPU not supported: %s", strerror(-ret)); + return ret; } } @@ -2753,16 +3326,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s) X86MachineState *x86ms = X86_MACHINE(ms); if (x86ms->bus_lock_ratelimit > 0) { - ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT); - if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) { - error_report("kvm: bus lock detection unsupported"); - return -ENOTSUP; - } - ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0, - KVM_BUS_LOCK_DETECTION_EXIT); + ret = kvm_vm_enable_bus_lock_exit(s); if (ret < 0) { - error_report("kvm: Failed to enable bus lock detection cap: %s", - strerror(-ret)); return ret; } ratelimit_init(&bus_lock_ratelimit_ctrl); @@ -2771,37 +3336,30 @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } - if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE && - kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) { - uint64_t notify_window_flags = - ((uint64_t)s->notify_window << 32) | - KVM_X86_NOTIFY_VMEXIT_ENABLED | - KVM_X86_NOTIFY_VMEXIT_USER; - ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0, - notify_window_flags); - if (ret < 0) { - error_report("kvm: Failed to enable notify vmexit cap: %s", - strerror(-ret)); - return ret; - } + if (kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) { + ret = kvm_vm_enable_notify_vmexit(s); + if (ret < 0) { + return ret; + } } - if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) { - bool r; - ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0, - KVM_MSR_EXIT_REASON_FILTER); - if (ret) { - error_report("Could not enable user space MSRs: %s", - strerror(-ret)); - exit(1); + if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) { + ret = kvm_vm_enable_userspace_msr(s); + if (ret < 0) { + return ret; } - r = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT, - kvm_rdmsr_core_thread_count, NULL); - if (!r) { - error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s", - strerror(-ret)); - exit(1); + if (s->msr_energy.enable == true) { + ret = kvm_vm_enable_energy_msrs(s); + if (ret < 0) { + return ret; + } + + ret = kvm_msr_energy_thread_init(s, ms); + if (ret < 0) { + error_report("kvm : error RAPL feature requirement not met"); + return ret; + } } } @@ -3264,7 +3822,14 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, CR4_VMXE_MASK); - if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { + if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { + /* FRED injected-event data (0x2052). */ + kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52); + } else if (f[FEAT_VMX_EXIT_CTLS] & + VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) { + /* Secondary VM-exit controls (0x2044). */ + kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44); + } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { /* TSC multiplier (0x2032). 
*/ kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32); } else { @@ -3307,32 +3872,34 @@ static void kvm_init_msrs(X86CPU *cpu) CPUX86State *env = &cpu->env; kvm_msr_buf_reset(cpu); - if (has_msr_arch_capabs) { - kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, - env->features[FEAT_ARCH_CAPABILITIES]); - } - if (has_msr_core_capabs) { - kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, - env->features[FEAT_CORE_CAPABILITY]); - } + if (!is_tdx_vm()) { + if (has_msr_arch_capabs) { + kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, + env->features[FEAT_ARCH_CAPABILITIES]); + } + + if (has_msr_core_capabs) { + kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, + env->features[FEAT_CORE_CAPABILITY]); + } - if (has_msr_perf_capabs && cpu->enable_pmu) { - kvm_msr_entry_add_perf(cpu, env->features); + if (has_msr_perf_capabs && cpu->enable_pmu) { + kvm_msr_entry_add_perf(cpu, env->features); + } + + /* + * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but + * all kernels with MSR features should have them. + */ + if (kvm_feature_msrs && cpu_has_vmx(env)) { + kvm_msr_entry_add_vmx(cpu, env->features); + } } if (has_msr_ucode_rev) { kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); } - - /* - * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but - * all kernels with MSR features should have them. - */ - if (kvm_feature_msrs && cpu_has_vmx(env)) { - kvm_msr_entry_add_vmx(cpu, env->features); - } - assert(kvm_buf_set_msrs(cpu) == 0); } @@ -3394,6 +3961,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level) if (has_msr_virt_ssbd) { kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd); } + if (has_msr_hwcr) { + kvm_msr_entry_add(cpu, MSR_K7_HWCR, env->msr_hwcr); + } #ifdef TARGET_X86_64 if (lm_capable_kernel) { @@ -3421,22 +3991,24 @@ static int kvm_put_msrs(X86CPU *cpu, int level) */ if (level >= KVM_PUT_RESET_STATE) { kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc); - kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr); - kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr); - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) { + if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) { + kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr); + kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr); + } + if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) { kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) { + if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) { kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) { + if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) { kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) { + if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) { kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) { + if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) { kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr); } @@ -3494,13 +4066,11 @@ static int kvm_put_msrs(X86CPU *cpu, int level) kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, env->msr_hv_tsc_emulation_status); } -#ifdef CONFIG_SYNDBG if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) && has_msr_hv_syndbg_options) { kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 
hyperv_syndbg_query_options()); } -#endif } if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, @@ -3672,7 +4242,8 @@ static int kvm_get_xsave(X86CPU *cpu) { CPUX86State *env = &cpu->env; void *xsave = env->xsave_buf; - int type, ret; + unsigned long type; + int ret; type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE; ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave); @@ -3877,6 +4448,9 @@ static int kvm_get_msrs(X86CPU *cpu) kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0); env->tsc_valid = !runstate_is_running(); } + if (has_msr_hwcr) { + kvm_msr_entry_add(cpu, MSR_K7_HWCR, 0); + } #ifdef TARGET_X86_64 if (lm_capable_kernel) { @@ -3897,21 +4471,23 @@ static int kvm_get_msrs(X86CPU *cpu) } } #endif - kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0); - kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0); - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) { + if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) { + kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0); + kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0); + } + if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) { kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) { + if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) { kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) { + if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) { kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) { + if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) { kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0); } - if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) { + if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) { kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1); } if (has_architectural_pmu_version > 0) { @@ -4396,6 +4972,9 @@ static int kvm_get_msrs(X86CPU *cpu) case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31: env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data; break; + case MSR_K7_HWCR: + env->msr_hwcr = msrs[i].data; + break; } } @@ -4687,7 +5266,7 @@ static int kvm_get_nested_state(X86CPU *cpu) return ret; } -int kvm_arch_put_registers(CPUState *cpu, int level) +int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp) { X86CPU *x86_cpu = X86_CPU(cpu); int ret; @@ -4702,6 +5281,7 @@ int kvm_arch_put_registers(CPUState *cpu, int level) if (level >= KVM_PUT_RESET_STATE) { ret = kvm_put_msr_feature_control(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set feature control MSR"); return ret; } } @@ -4709,12 +5289,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level) /* must be before kvm_put_nested_state so that EFER.SVME is set */ ret = has_sregs2 ? 
kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set special registers"); return ret; } if (level >= KVM_PUT_RESET_STATE) { ret = kvm_put_nested_state(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set nested state"); return ret; } } @@ -4732,6 +5314,7 @@ int kvm_arch_put_registers(CPUState *cpu, int level) if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) { ret = kvm_put_xen_state(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set Xen state"); return ret; } } @@ -4739,43 +5322,51 @@ int kvm_arch_put_registers(CPUState *cpu, int level) ret = kvm_getput_regs(x86_cpu, 1); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set general purpose registers"); return ret; } ret = kvm_put_xsave(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set XSAVE"); return ret; } ret = kvm_put_xcrs(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set XCRs"); return ret; } ret = kvm_put_msrs(x86_cpu, level); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set MSRs"); return ret; } ret = kvm_put_vcpu_events(x86_cpu, level); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set vCPU events"); return ret; } if (level >= KVM_PUT_RESET_STATE) { ret = kvm_put_mp_state(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set MP state"); return ret; } } ret = kvm_put_tscdeadline_msr(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set TSC deadline MSR"); return ret; } ret = kvm_put_debugregs(x86_cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to set debug registers"); return ret; } return 0; } -int kvm_arch_get_registers(CPUState *cs) +int kvm_arch_get_registers(CPUState *cs, Error **errp) { X86CPU *cpu = X86_CPU(cs); int ret; @@ -4784,6 +5375,7 @@ int kvm_arch_get_registers(CPUState *cs) ret = kvm_get_vcpu_events(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get vCPU events"); goto out; } /* @@ -4792,44 +5384,54 @@ int kvm_arch_get_registers(CPUState *cs) */ ret = kvm_get_mp_state(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get MP state"); goto out; } ret = kvm_getput_regs(cpu, 0); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get general purpose registers"); goto out; } ret = kvm_get_xsave(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get XSAVE"); goto out; } ret = kvm_get_xcrs(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get XCRs"); goto out; } ret = has_sregs2 ? 
kvm_get_sregs2(cpu) : kvm_get_sregs(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get special registers"); goto out; } ret = kvm_get_msrs(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get MSRs"); goto out; } ret = kvm_get_apic(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get APIC"); goto out; } ret = kvm_get_debugregs(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get debug registers"); goto out; } ret = kvm_get_nested_state(cpu); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get nested state"); goto out; } #ifdef CONFIG_XEN_EMU if (xen_mode == XEN_EMULATE) { ret = kvm_get_xen_state(cs); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to get Xen state"); goto out; } } @@ -5260,15 +5862,16 @@ void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) } } -static bool kvm_install_msr_filters(KVMState *s) +static int kvm_install_msr_filters(KVMState *s) { uint64_t zero = 0; struct kvm_msr_filter filter = { .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, }; - int r, i, j = 0; + int i, j = 0; - for (i = 0; i < KVM_MSR_FILTER_MAX_RANGES; i++) { + QEMU_BUILD_BUG_ON(ARRAY_SIZE(msr_handlers) != ARRAY_SIZE(filter.ranges)); + for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) { KVMMSRHandlers *handler = &msr_handlers[i]; if (handler->msr) { struct kvm_msr_filter_range *range = &filter.ranges[j++]; @@ -5290,18 +5893,13 @@ static bool kvm_install_msr_filters(KVMState *s) } } - r = kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter); - if (r) { - return false; - } - - return true; + return kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter); } -bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, - QEMUWRMSRHandler *wrmsr) +static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, + QEMUWRMSRHandler *wrmsr) { - int i; + int i, ret; for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) { if (!msr_handlers[i].msr) { @@ -5311,16 +5909,17 @@ bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, .wrmsr = wrmsr, }; - if (!kvm_install_msr_filters(s)) { + ret = kvm_install_msr_filters(s); + if (ret) { msr_handlers[i] = (KVMMSRHandlers) { }; - return false; + return ret; } - return true; + return 0; } } - return false; + return -EINVAL; } static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run) @@ -5340,7 +5939,7 @@ static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run) } } - assert(false); + g_assert_not_reached(); } static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run) @@ -5359,7 +5958,7 @@ static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run) } } - assert(false); + g_assert_not_reached(); } static bool has_sgx_provisioning; @@ -5419,9 +6018,11 @@ static bool host_supports_vmx(void) * because private/shared page tracking is already provided through other * means, these 2 use-cases should be treated as being mutually-exclusive. 
*/ -static int kvm_handle_hc_map_gpa_range(struct kvm_run *run) +static int kvm_handle_hc_map_gpa_range(X86CPU *cpu, struct kvm_run *run) { + struct kvm_pre_fault_memory mem; uint64_t gpa, size, attributes; + int ret; if (!machine_require_guest_memfd(current_machine)) return -EINVAL; @@ -5432,13 +6033,32 @@ static int kvm_handle_hc_map_gpa_range(struct kvm_run *run) trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags); - return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED); + ret = kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED); + if (ret || !kvm_pre_fault_memory_supported) { + return ret; + } + + /* + * Opportunistically pre-fault memory in. Failures are ignored so that any + * errors in faulting in the memory will get captured in KVM page fault + * path when the guest first accesses the page. + */ + memset(&mem, 0, sizeof(mem)); + mem.gpa = gpa; + mem.size = size; + while (mem.size) { + if (kvm_vcpu_ioctl(CPU(cpu), KVM_PRE_FAULT_MEMORY, &mem)) { + break; + } + } + + return 0; } -static int kvm_handle_hypercall(struct kvm_run *run) +static int kvm_handle_hypercall(X86CPU *cpu, struct kvm_run *run) { if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) - return kvm_handle_hc_map_gpa_range(run); + return kvm_handle_hc_map_gpa_range(cpu, run); return -EINVAL; } @@ -5538,7 +6158,32 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) break; #endif case KVM_EXIT_HYPERCALL: - ret = kvm_handle_hypercall(run); + ret = kvm_handle_hypercall(cpu, run); + break; + case KVM_EXIT_SYSTEM_EVENT: + switch (run->system_event.type) { + case KVM_SYSTEM_EVENT_TDX_FATAL: + ret = tdx_handle_report_fatal_error(cpu, run); + break; + default: + ret = -1; + break; + } + break; + case KVM_EXIT_TDX: + /* + * run->tdx is already set up for the case where userspace + * does not handle the TDVMCALL. + */ + switch (run->tdx.nr) { + case TDVMCALL_GET_QUOTE: + tdx_handle_get_quote(cpu, run); + break; + case TDVMCALL_GET_TD_VM_CALL_INFO: + tdx_handle_get_tdvmcall_info(cpu, run); + break; + } + ret = 0; break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); @@ -5612,7 +6257,7 @@ uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address) return address; } env = &X86_CPU(first_cpu)->env; - if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) { + if (!(env->features[FEAT_KVM] & CPUID_KVM_MSI_EXT_DEST_ID)) { return address; } |
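
The patch lifts the hard-coded 0xfeffc000 into a named KVM_IDENTITY_BASE while keeping the long-standing layout: the EPT identity map at the base, the TSS one page above it, and a 4-page e820-reserved range in front of the BIOS. A standalone sketch (values copied from the comment in the diff, not a QEMU build) that just spells out the arithmetic behind the "up to 16M BIOSes" remark:

/* Guest-physical layout reserved by kvm_arch_init(); sketch only. */
#include <stdio.h>

#define KVM_IDENTITY_BASE 0xfeffc000u                   /* EPT identity map      */
#define KVM_TSS_BASE      (KVM_IDENTITY_BASE + 0x1000)  /* TSS, one page higher  */
#define KVM_RESERVED_SIZE 0x4000u                       /* 4 pages, e820 reserved */

int main(void)
{
    printf("identity map : 0x%08x\n", KVM_IDENTITY_BASE);
    printf("TSS          : 0x%08x\n", KVM_TSS_BASE);
    printf("e820 reserved: 0x%08x-0x%08x\n",
           KVM_IDENTITY_BASE, KVM_IDENTITY_BASE + KVM_RESERVED_SIZE - 1);
    /* The reserved range ends at 0xff000000, leaving 16 MiB below 4 GiB for the BIOS. */
    return 0;
}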
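
kvm_arch_reset_parked_vcpu() in the diff resets the TSC of a parked (fd-only) vCPU by issuing KVM_SET_MSRS directly on the saved fd. A minimal sketch of how that one-entry kvm_msrs payload is put together, assuming vcpu_fd is an already-open KVM vCPU file descriptor (the helper name here is made up):

/* Build and send a single-MSR KVM_SET_MSRS payload, as the parked-vCPU reset does. */
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int reset_parked_tsc(int vcpu_fd)
{
    struct kvm_msrs *msrs;
    int ret;

    /* Header plus one flexible-array entry, zero-initialized. */
    msrs = calloc(1, sizeof(*msrs) + sizeof(msrs->entries[0]));
    if (!msrs) {
        return -1;
    }
    msrs->nmsrs = 1;
    msrs->entries[0].index = 0x10;   /* MSR_IA32_TSC */
    msrs->entries[0].data = 1;       /* matches the value used in x86_cpu_reset() */

    ret = ioctl(vcpu_fd, KVM_SET_MSRS, msrs);  /* returns the number of MSRs written */
    free(msrs);
    return ret == 1 ? 0 : -1;
}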
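
The new kvm_msr_energy_thread() samples each package's MSR_PKG_ENERGY_STATUS over a sleep window and hands the delta out to threads in proportion to the CPU ticks they burned, with maxticks = window seconds * _SC_CLK_TCK * CPUs in the package. A self-contained sketch of that split with made-up numbers (the helper name and figures are illustrative; QEMU routes this through vmsr_get_ratio() and the vmsr_* helpers); build with -lm:

#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Share of a package's energy delta attributed to one thread. */
static uint64_t thread_energy_share(uint64_t pkg_delta,              /* RAPL counter delta        */
                                    unsigned long long thread_ticks, /* ticks burned by the thread */
                                    unsigned int max_ticks)          /* ticks available in window  */
{
    if (max_ticks == 0 || thread_ticks == 0) {
        return 0;
    }
    return (uint64_t)lround((double)pkg_delta * thread_ticks / max_ticks);
}

int main(void)
{
    /* 1 s window, 100 ticks/s, 12 CPUs in the package -> 1200 ticks max. */
    unsigned int max_ticks = 1 * 100 * 12;
    uint64_t pkg_delta = 240000;   /* delta of MSR_PKG_ENERGY_STATUS, raw units */

    /* A vCPU thread that ran for 300 of those ticks gets a quarter of the delta. */
    printf("vCPU share: %" PRIu64 "\n",
           thread_energy_share(pkg_delta, 300, max_ticks));
    return 0;
}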
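
The diff also turns kvm_filter_msr() into a static registration path: a handler table is filled per MSR, KVM_X86_SET_MSR_FILTER is reprogrammed from it, and KVM_EXIT_X86_RDMSR exits are answered by scanning the same table. A toy, ioctl-free model of that table lookup; the MSR number 0x35 and the topology figures are assumptions for illustration only:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_CORE_THREAD_COUNT 0x35   /* handled in the diff by kvm_rdmsr_core_thread_count() */

typedef bool rdmsr_handler(uint32_t msr, uint64_t *val);

typedef struct {
    uint32_t msr;
    rdmsr_handler *rdmsr;
} msr_handler_entry;

static msr_handler_entry handlers[16];

/* Example handler: report a fixed 4-core / 8-thread guest topology. */
static bool rdmsr_core_thread_count(uint32_t msr, uint64_t *val)
{
    unsigned int cores = 4, threads = 8;

    (void)msr;
    *val = threads;                  /* thread count, bits 15..0  */
    *val |= (uint64_t)cores << 16;   /* core count,   bits 31..16 */
    return true;
}

static int register_rdmsr(uint32_t msr, rdmsr_handler *fn)
{
    for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
        if (!handlers[i].msr) {
            handlers[i].msr = msr;
            handlers[i].rdmsr = fn;
            return 0;                /* QEMU would now reprogram KVM_X86_SET_MSR_FILTER */
        }
    }
    return -1;                       /* table full */
}

/* Rough equivalent of what kvm_handle_rdmsr() does on a KVM_EXIT_X86_RDMSR exit. */
static bool dispatch_rdmsr(uint32_t msr, uint64_t *val)
{
    for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
        if (handlers[i].msr == msr && handlers[i].rdmsr) {
            return handlers[i].rdmsr(msr, val);
        }
    }
    return false;                    /* not an MSR we filtered */
}

int main(void)
{
    uint64_t val = 0;

    register_rdmsr(MSR_CORE_THREAD_COUNT, rdmsr_core_thread_count);
    if (dispatch_rdmsr(MSR_CORE_THREAD_COUNT, &val)) {
        printf("MSR 0x35 -> 0x%08llx\n", (unsigned long long)val);
    }
    return 0;
}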