Diffstat (limited to 'target/i386/kvm/kvm.c')
-rw-r--r--  target/i386/kvm/kvm.c  1045
1 file changed, 845 insertions(+), 200 deletions(-)
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index dd8b0f3..234878c 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -16,9 +16,12 @@
#include "qapi/qapi-events-run-state.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
+#include <math.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/time.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -27,13 +30,15 @@
#include "cpu.h"
#include "host-cpu.h"
-#include "sysemu/sysemu.h"
-#include "sysemu/hw_accel.h"
-#include "sysemu/kvm_int.h"
-#include "sysemu/runstate.h"
+#include "vmsr_energy.h"
+#include "system/system.h"
+#include "system/hw_accel.h"
+#include "system/kvm_int.h"
+#include "system/runstate.h"
#include "kvm_i386.h"
#include "../confidential-guest.h"
#include "sev.h"
+#include "tdx.h"
#include "xen-emu.h"
#include "hyperv.h"
#include "hyperv-proto.h"
@@ -63,6 +68,7 @@
#include "hw/pci/msix.h"
#include "migration/blocker.h"
#include "exec/memattrs.h"
+#include "exec/target_page.h"
#include "trace.h"
#include CONFIG_DEVICES
@@ -77,18 +83,35 @@
do { } while (0)
#endif
+/*
+ * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
+ * In order to use vm86 mode, an EPT identity map and a TSS are needed.
+ * Since these must be part of guest physical memory, we need to allocate
+ * them, both by setting their start addresses in the kernel and by
+ * creating a corresponding e820 entry. We need 4 pages before the BIOS,
+ * so this value allows up to 16M BIOSes.
+ */
+#define KVM_IDENTITY_BASE 0xfeffc000
+
/* From arch/x86/kvm/lapic.h */
#define KVM_APIC_BUS_CYCLE_NS 1
#define KVM_APIC_BUS_FREQUENCY (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
-#define MSR_KVM_WALL_CLOCK 0x11
-#define MSR_KVM_SYSTEM_TIME 0x12
-
/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
* 255 kvm_msr_entry structs */
#define MSR_BUF_SIZE 4096
+typedef bool QEMURDMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t *val);
+typedef bool QEMUWRMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t val);
+typedef struct {
+ uint32_t msr;
+ QEMURDMSRHandler *rdmsr;
+ QEMUWRMSRHandler *wrmsr;
+} KVMMSRHandlers;
+
static void kvm_init_msrs(X86CPU *cpu);
+static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
+ QEMUWRMSRHandler *wrmsr);
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
KVM_CAP_INFO(SET_TSS_ADDR),
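
A quick sanity check on the two constants introduced above (a standalone sketch, assuming the usual 8-byte kvm_msrs header and 16-byte kvm_msr_entry layout from <linux/kvm.h>): KVM_IDENTITY_BASE sits exactly four pages below a 16M BIOS window at the top of 4G, and MSR_BUF_SIZE fits 255 entries.

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint64_t top_4g     = 0x100000000ULL;
    const uint64_t bios_max   = 16ULL << 20;    /* up to 16M of BIOS */
    const uint64_t four_pages = 4 * 4096;       /* EPT identity map + TSS */

    /* 0x100000000 - 0x1000000 - 0x4000 == 0xfeffc000 (KVM_IDENTITY_BASE) */
    assert(top_4g - bios_max - four_pages == 0xfeffc000ULL);

    /* 8-byte kvm_msrs header + 255 * 16-byte kvm_msr_entry <= 4096 */
    assert(8 + 255 * 16 <= 4096);               /* MSR_BUF_SIZE */
    return 0;
}
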
@@ -141,6 +164,7 @@ static bool has_msr_ucode_rev;
static bool has_msr_vmx_procbased_ctls2;
static bool has_msr_perf_capabs;
static bool has_msr_pkrs;
+static bool has_msr_hwcr;
static uint32_t has_architectural_pmu_version;
static uint32_t num_architectural_pmu_gp_counters;
@@ -169,6 +193,7 @@ static const char *vm_type_name[] = {
[KVM_X86_SEV_VM] = "SEV",
[KVM_X86_SEV_ES_VM] = "SEV-ES",
[KVM_X86_SNP_VM] = "SEV-SNP",
+ [KVM_X86_TDX_VM] = "TDX",
};
bool kvm_is_vm_type_supported(int type)
@@ -303,7 +328,7 @@ void kvm_synchronize_all_tsc(void)
{
CPUState *cpu;
- if (kvm_enabled()) {
+ if (kvm_enabled() && !is_tdx_vm()) {
CPU_FOREACH(cpu) {
run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
}
@@ -369,7 +394,7 @@ static bool host_tsx_broken(void)
/* Returns the value for a specific register on the cpuid entry
*/
-static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
+uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
uint32_t ret = 0;
switch (reg) {
@@ -391,9 +416,9 @@ static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
/* Find matching entry for function/index on kvm_cpuid2 struct
*/
-static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
- uint32_t function,
- uint32_t index)
+struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
+ uint32_t function,
+ uint32_t index)
{
int i;
for (i = 0; i < cpuid->nent; ++i) {
@@ -539,15 +564,20 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
* be enabled without the in-kernel irqchip
*/
if (!kvm_irqchip_in_kernel()) {
- ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
+ ret &= ~CPUID_KVM_PV_UNHALT;
}
if (kvm_irqchip_is_split()) {
- ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
+ ret |= CPUID_KVM_MSI_EXT_DEST_ID;
}
} else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
- ret |= 1U << KVM_HINTS_REALTIME;
+ ret |= CPUID_KVM_HINTS_REALTIME;
}
+ if (current_machine->cgs) {
+ ret = x86_confidential_guest_adjust_cpuid_features(
+ X86_CONFIDENTIAL_GUEST(current_machine->cgs),
+ function, index, reg, ret);
+ }
return ret;
}
@@ -840,6 +870,15 @@ static int kvm_arch_set_tsc_khz(CPUState *cs)
int r, cur_freq;
bool set_ioctl = false;
+    /*
+     * The TSC of a TD vCPU is immutable: it cannot be set or changed via the
+     * vCPU-scope KVM_SET_TSC_KHZ ioctl, and can only be initialized via the
+     * VM-scope KVM_SET_TSC_KHZ before KVM_TDX_INIT_VM in
+     * tdx_pre_create_vcpu().
+     */
+ if (is_tdx_vm()) {
+ return 0;
+ }
+
if (!env->tsc_khz) {
return 0;
}
@@ -904,6 +943,7 @@ static struct {
uint32_t bits;
} flags[2];
uint64_t dependencies;
+ bool skip_passthrough;
} kvm_hyperv_properties[] = {
[HYPERV_FEAT_RELAXED] = {
.desc = "relaxed timing (hv-relaxed)",
@@ -1026,16 +1066,15 @@ static struct {
.bits = HV_DEPRECATING_AEOI_RECOMMENDED}
}
},
-#ifdef CONFIG_SYNDBG
[HYPERV_FEAT_SYNDBG] = {
.desc = "Enable synthetic kernel debugger channel (hv-syndbg)",
.flags = {
{.func = HV_CPUID_FEATURES, .reg = R_EDX,
.bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE}
},
- .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED)
+ .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED),
+ .skip_passthrough = true,
},
-#endif
[HYPERV_FEAT_MSR_BITMAP] = {
.desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)",
.flags = {
@@ -1287,6 +1326,13 @@ static bool hyperv_feature_supported(CPUState *cs, int feature)
uint32_t func, bits;
int i, reg;
+    /*
+     * kvm_hyperv_properties must define at least one CPUID flag that is
+     * used to detect the feature; otherwise it is hard to tell whether
+     * the feature is supported.
+     */
+ assert(kvm_hyperv_properties[feature].flags[0].func);
+
for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
func = kvm_hyperv_properties[feature].flags[i].func;
@@ -1436,7 +1482,8 @@ bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
* hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
*/
for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
- if (hyperv_feature_supported(cs, feat)) {
+ if (hyperv_feature_supported(cs, feat) &&
+ !kvm_hyperv_properties[feat].skip_passthrough) {
cpu->hyperv_features |= BIT(feat);
}
}
@@ -1743,8 +1790,6 @@ static int hyperv_init_vcpu(X86CPU *cpu)
static Error *invtsc_mig_blocker;
-#define KVM_MAX_CPUID_ENTRIES 100
-
static void kvm_init_xsave(CPUX86State *env)
{
if (has_xsave2) {
@@ -1787,9 +1832,8 @@ static void kvm_init_nested_state(CPUX86State *env)
}
}
-static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
- struct kvm_cpuid_entry2 *entries,
- uint32_t cpuid_i)
+uint32_t kvm_x86_build_cpuid(CPUX86State *env, struct kvm_cpuid_entry2 *entries,
+ uint32_t cpuid_i)
{
uint32_t limit, i, j;
uint32_t unused;
@@ -1809,10 +1853,12 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
int times;
c->function = i;
- c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
- KVM_CPUID_FLAG_STATE_READ_NEXT;
cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
times = c->eax & 0xff;
+ if (times > 1) {
+ c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
+ KVM_CPUID_FLAG_STATE_READ_NEXT;
+ }
for (j = 1; j < times; ++j) {
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
@@ -1826,7 +1872,7 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
break;
}
case 0x1f:
- if (!x86_has_extended_topo(env->avail_cpu_topo)) {
+ if (!x86_has_cpuid_0x1f(env_archcpu(env))) {
cpuid_i--;
break;
}
@@ -1835,10 +1881,6 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
case 0xb:
case 0xd:
for (j = 0; ; j++) {
- if (i == 0xd && j == 64) {
- break;
- }
-
c->function = i;
c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
c->index = j;
@@ -1854,7 +1896,12 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
break;
}
if (i == 0xd && c->eax == 0) {
- continue;
+ if (j < 63) {
+ continue;
+ } else {
+ cpuid_i--;
+ break;
+ }
}
if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
goto full;
@@ -1882,7 +1929,8 @@ static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
case 0x7:
case 0x14:
case 0x1d:
- case 0x1e: {
+ case 0x1e:
+ case 0x24: {
uint32_t times;
c->function = i;
@@ -2012,6 +2060,15 @@ full:
abort();
}
+int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp)
+{
+ if (is_tdx_vm()) {
+ return tdx_pre_create_vcpu(cpu, errp);
+ }
+
+ return 0;
+}
+
int kvm_arch_init_vcpu(CPUState *cs)
{
struct {
@@ -2036,6 +2093,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
int r;
Error *local_err = NULL;
+ if (current_machine->cgs) {
+ r = x86_confidential_guest_check_features(
+ X86_CONFIDENTIAL_GUEST(current_machine->cgs), cs);
+ if (r < 0) {
+ return r;
+ }
+ }
+
memset(&cpuid_data, 0, sizeof(cpuid_data));
cpuid_i = 0;
@@ -2373,6 +2438,21 @@ void kvm_arch_after_reset_vcpu(X86CPU *cpu)
}
}
+void kvm_arch_reset_parked_vcpu(unsigned long vcpu_id, int kvm_fd)
+{
+ g_autofree struct kvm_msrs *msrs = NULL;
+
+ msrs = g_malloc0(sizeof(*msrs) + sizeof(msrs->entries[0]));
+ msrs->entries[0].index = MSR_IA32_TSC;
+ msrs->entries[0].data = 1; /* match the value in x86_cpu_reset() */
+ msrs->nmsrs++;
+
+ if (ioctl(kvm_fd, KVM_SET_MSRS, msrs) != 1) {
+ warn_report("parked vCPU %lu TSC reset failed: %d",
+ vcpu_id, errno);
+ }
+}
+
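+
The != 1 check above relies on KVM_SET_MSRS returning the number of entries actually processed, not 0 on success. A minimal sketch of the same pattern with two entries; the second index and its value are made up for illustration:

struct kvm_msrs *msrs = g_malloc0(sizeof(*msrs) + 2 * sizeof(msrs->entries[0]));

msrs->entries[0].index = MSR_IA32_TSC;
msrs->entries[0].data  = 1;
msrs->entries[1].index = MSR_IA32_UCODE_REV;    /* hypothetical second MSR */
msrs->entries[1].data  = 0x100000000ULL;        /* hypothetical revision */
msrs->nmsrs = 2;

/* KVM_SET_MSRS returns how many entries were set; < nmsrs means failure. */
if (ioctl(kvm_fd, KVM_SET_MSRS, msrs) != msrs->nmsrs) {
    warn_report("failed to set all MSRs");
}
g_free(msrs);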
void kvm_arch_do_init_vcpu(X86CPU *cpu)
{
CPUX86State *env = &cpu->env;
@@ -2545,6 +2625,8 @@ static int kvm_get_supported_msrs(KVMState *s)
case MSR_IA32_PKRS:
has_msr_pkrs = true;
break;
+ case MSR_K7_HWCR:
+ has_msr_hwcr = true;
}
}
}
@@ -2554,13 +2636,58 @@ static int kvm_get_supported_msrs(KVMState *s)
return ret;
}
-static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
+static bool kvm_rdmsr_core_thread_count(X86CPU *cpu,
+ uint32_t msr,
uint64_t *val)
{
+ *val = cpu_x86_get_msr_core_thread_count(cpu);
+
+ return true;
+}
+
+static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu,
+ uint32_t msr,
+ uint64_t *val)
+{
CPUState *cs = CPU(cpu);
- *val = cs->nr_threads * cs->nr_cores; /* thread count, bits 15..0 */
- *val |= ((uint32_t)cs->nr_cores << 16); /* core count, bits 31..16 */
+ *val = cs->kvm_state->msr_energy.msr_unit;
+
+ return true;
+}
+
+static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu,
+ uint32_t msr,
+ uint64_t *val)
+{
+ CPUState *cs = CPU(cpu);
+
+ *val = cs->kvm_state->msr_energy.msr_limit;
+
+ return true;
+}
+
+static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu,
+ uint32_t msr,
+ uint64_t *val)
+{
+ CPUState *cs = CPU(cpu);
+
+ *val = cs->kvm_state->msr_energy.msr_info;
+
+ return true;
+}
+
+static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu,
+ uint32_t msr,
+ uint64_t *val)
+{
+ CPUState *cs = CPU(cpu);
+ *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index];
return true;
}
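
The deleted lines in kvm_rdmsr_core_thread_count() document the MSR layout that the new cpu_x86_get_msr_core_thread_count() helper presumably preserves; a sketch of that packing, for reference only:

/*
 * MSR_CORE_THREAD_COUNT layout implied by the removed lines:
 * bits 15..0 hold the thread count, bits 31..16 the core count.
 */
static uint64_t pack_core_thread_count(uint32_t nr_cores, uint32_t nr_threads)
{
    uint64_t val = nr_threads * nr_cores;   /* thread count, bits 15..0 */
    val |= (uint64_t)nr_cores << 16;        /* core count, bits 31..16 */
    return val;
}
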
@@ -2599,63 +2726,535 @@ static void register_smram_listener(Notifier *n, void *unused)
&smram_address_space, 1, "kvm-smram");
}
-int kvm_arch_get_default_type(MachineState *ms)
+static void *kvm_msr_energy_thread(void *data)
{
- return 0;
+ KVMState *s = data;
+ struct KVMMsrEnergy *vmsr = &s->msr_energy;
+
+ g_autofree vmsr_package_energy_stat *pkg_stat = NULL;
+ g_autofree vmsr_thread_stat *thd_stat = NULL;
+    CPUState *cpu;
+ g_autofree unsigned int *vpkgs_energy_stat = NULL;
+ unsigned int num_threads = 0;
+
+ X86CPUTopoIDs topo_ids;
+
+ rcu_register_thread();
+
+ /* Allocate memory for each package energy status */
+ pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs);
+
+ /* Allocate memory for thread stats */
+ thd_stat = g_new0(vmsr_thread_stat, 1);
+
+ /* Allocate memory for holding virtual package energy counter */
+ vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets);
+
+    /* Populate the max ticks of each package */
+    for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+        /*
+         * Max number of ticks per package:
+         * time in seconds * ticks/second * CPUs per package
+         * e.g. a 1 s sample at 100 ticks/second with 12 CPUs per package
+         * gives 1200 ticks max (a worked example follows this function)
+         */
+ vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
+ * sysconf(_SC_CLK_TCK)
+ * vmsr->host_topo.pkg_cpu_count[i];
+ }
+
+ while (true) {
+        /* Get the ids of all QEMU threads */
+ g_autofree pid_t *thread_ids
+ = vmsr_get_thread_ids(vmsr->pid, &num_threads);
+
+ if (thread_ids == NULL) {
+ goto clean;
+ }
+
+ thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads);
+        /* There is no g_renew0() counterpart to g_new0() yet, so zero manually */
+ memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat));
+
+ /* Populate all the thread stats */
+ for (int i = 0; i < num_threads; i++) {
+ thd_stat[i].utime = g_new0(unsigned long long, 2);
+ thd_stat[i].stime = g_new0(unsigned long long, 2);
+ thd_stat[i].thread_id = thread_ids[i];
+ vmsr_read_thread_stat(vmsr->pid,
+ thd_stat[i].thread_id,
+ &thd_stat[i].utime[0],
+ &thd_stat[i].stime[0],
+ &thd_stat[i].cpu_id);
+ thd_stat[i].pkg_id =
+ vmsr_get_physical_package_id(thd_stat[i].cpu_id);
+ }
+
+        /* Read each package's power plane energy counter */
+        for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on a CPU
+                 * of the package to read the package's energy counter
+                 */
+ if (thd_stat[j].pkg_id == i) {
+ pkg_stat[i].e_start =
+ vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
+ thd_stat[j].cpu_id,
+ thd_stat[j].thread_id,
+ s->msr_energy.sioc);
+ break;
+ }
+ }
+ }
+
+ /* Sleep a short period while the other threads are working */
+ usleep(MSR_ENERGY_THREAD_SLEEP_US);
+
+        /*
+         * Read each package's power plane energy counter again
+         * and calculate the delta for each package
+         */
+ for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+ for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on a CPU
+                 * of the package to read the package's energy counter
+                 */
+ if (thd_stat[j].pkg_id == i) {
+ pkg_stat[i].e_end =
+ vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
+ thd_stat[j].cpu_id,
+ thd_stat[j].thread_id,
+ s->msr_energy.sioc);
+                    /*
+                     * Guard against the case where the VM was migrated
+                     * during the sleep period, or any other case where
+                     * the energy counter might be lower after the sleep.
+                     */
+ if (pkg_stat[i].e_end > pkg_stat[i].e_start) {
+ pkg_stat[i].e_delta =
+ pkg_stat[i].e_end - pkg_stat[i].e_start;
+ } else {
+ pkg_stat[i].e_delta = 0;
+ }
+ break;
+ }
+ }
+ }
+
+        /* Delta of ticks spent by each thread between the samples */
+ for (int i = 0; i < num_threads; i++) {
+ vmsr_read_thread_stat(vmsr->pid,
+ thd_stat[i].thread_id,
+ &thd_stat[i].utime[1],
+ &thd_stat[i].stime[1],
+ &thd_stat[i].cpu_id);
+
+ if (vmsr->pid < 0) {
+                /*
+                 * Don't count dead threads, i.e. threads that existed
+                 * before the sleep but no longer do.
+                 */
+ thd_stat[i].delta_ticks = 0;
+ } else {
+ vmsr_delta_ticks(thd_stat, i);
+ }
+ }
+
+        /*
+         * Identify the vCPU threads and calculate
+         * the number of vCPUs per package
+         */
+ CPU_FOREACH(cpu) {
+ for (int i = 0; i < num_threads; i++) {
+ if (cpu->thread_id == thd_stat[i].thread_id) {
+ thd_stat[i].is_vcpu = true;
+ thd_stat[i].vcpu_id = cpu->cpu_index;
+ pkg_stat[thd_stat[i].pkg_id].nb_vcpu++;
+ thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu);
+ break;
+ }
+ }
+ }
+
+ /* Retrieve the virtual package number of each vCPU */
+ for (int i = 0; i < vmsr->guest_cpu_list->len; i++) {
+ for (int j = 0; j < num_threads; j++) {
+ if ((thd_stat[j].acpi_id ==
+ vmsr->guest_cpu_list->cpus[i].arch_id)
+ && (thd_stat[j].is_vcpu == true)) {
+ x86_topo_ids_from_apicid(thd_stat[j].acpi_id,
+ &vmsr->guest_topo_info, &topo_ids);
+ thd_stat[j].vpkg_id = topo_ids.pkg_id;
+ }
+ }
+ }
+
+        /* Calculate the total energy of all non-vCPU threads */
+ for (int i = 0; i < num_threads; i++) {
+ if ((thd_stat[i].is_vcpu != true) &&
+ (thd_stat[i].delta_ticks > 0)) {
+ double temp;
+ temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
+ thd_stat[i].delta_ticks,
+ vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
+ pkg_stat[thd_stat[i].pkg_id].e_ratio
+ += (uint64_t)lround(temp);
+ }
+ }
+
+        /* Spread each package's non-vCPU energy evenly across its vCPUs */
+ for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
+ if (pkg_stat[i].nb_vcpu > 0) {
+ pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
+ }
+ }
+
+        /*
+         * Calculate the energy for each virtual package:
+         * package energy = sum of the energy of each vCPU belonging to it
+         */
+ for (int i = 0; i < num_threads; i++) {
+ if ((thd_stat[i].is_vcpu == true) && \
+ (thd_stat[i].delta_ticks > 0)) {
+ double temp;
+ temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
+ thd_stat[i].delta_ticks,
+ vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
+ vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
+ (uint64_t)lround(temp);
+ vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
+ pkg_stat[thd_stat[i].pkg_id].e_ratio;
+ }
+ }
+
+        /*
+         * Finally populate the vmsr register of each vCPU with the total
+         * package value, to emulate real hardware where each CPU returns
+         * the value of the package it belongs to.
+         */
+ for (int i = 0; i < num_threads; i++) {
+ if ((thd_stat[i].is_vcpu == true) && \
+ (thd_stat[i].delta_ticks > 0)) {
+ vmsr->msr_value[thd_stat[i].vcpu_id] = \
+ vpkgs_energy_stat[thd_stat[i].vpkg_id];
+ }
+ }
+
+        /* Free the per-thread buffers before the next iteration zeroes thd_stat */
+ for (int i = 0; i < num_threads; i++) {
+ g_free(thd_stat[i].utime);
+ g_free(thd_stat[i].stime);
+ }
+ }
+
+clean:
+ rcu_unregister_thread();
+ return NULL;
}
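
The worked example promised in kvm_msr_energy_thread(), assuming vmsr_get_ratio(e_delta, delta_ticks, maxticks) computes e_delta * delta_ticks / maxticks, as its call sites here suggest:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    /* 1 package, 12 CPUs, 100 ticks/s, a 1 s sample -> maxticks = 1200 */
    unsigned int maxticks = 1 * 100 * 12;
    uint64_t e_delta = 24000;   /* energy units consumed by the package */

    /* A vCPU thread that ran 300 of those ticks gets its share: */
    uint64_t vcpu_share = e_delta * 300 / maxticks;        /* 6000 */

    /* A non-vCPU thread (say an I/O thread) that ran 60 ticks: */
    uint64_t iothread_share = e_delta * 60 / maxticks;     /* 1200 */

    /* Its energy is spread evenly across the package's vCPUs (say 4): */
    uint64_t per_vcpu_overhead = iothread_share / 4;       /* 300 each */

    /* Each vCPU's emulated MSR_PKG_ENERGY_STATUS accumulates both. */
    printf("vCPU reads %" PRIu64 " units\n", vcpu_share + per_vcpu_overhead);
    return 0;
}
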
-int kvm_arch_init(MachineState *ms, KVMState *s)
+static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
{
- uint64_t identity_base = 0xfffbc000;
- uint64_t shadow_mem;
- int ret;
- struct utsname utsname;
- Error *local_err = NULL;
+ MachineClass *mc = MACHINE_GET_CLASS(ms);
+ struct KVMMsrEnergy *r = &s->msr_energy;
/*
- * Initialize SEV context, if required
- *
- * If no memory encryption is requested (ms->cgs == NULL) this is
- * a no-op.
- *
- * It's also a no-op if a non-SEV confidential guest support
- * mechanism is selected. SEV is the only mechanism available to
- * select on x86 at present, so this doesn't arise, but if new
- * mechanisms are supported in future (e.g. TDX), they'll need
- * their own initialization either here or elsewhere.
+     * Sanity checks:
+     * 1. The host CPU must be an Intel CPU
+     * 2. RAPL must be enabled on the host
*/
- if (ms->cgs) {
- ret = confidential_guest_kvm_init(ms->cgs, &local_err);
- if (ret < 0) {
- error_report_err(local_err);
- return ret;
+ if (!is_host_cpu_intel()) {
+ error_report("The RAPL feature can only be enabled on hosts "
+ "with Intel CPU models");
+ return -1;
+ }
+
+ if (!is_rapl_enabled()) {
+ return -1;
+ }
+
+ /* Retrieve the virtual topology */
+ vmsr_init_topo_info(&r->guest_topo_info, ms);
+
+    /* Retrieve the number of vCPUs */
+ r->guest_vcpus = ms->smp.cpus;
+
+ /* Retrieve the number of virtual sockets */
+ r->guest_vsockets = ms->smp.sockets;
+
+    /* Allocate register memory (MSR_PKG_ENERGY_STATUS) for each vCPU */
+ r->msr_value = g_new0(uint64_t, r->guest_vcpus);
+
+    /* Retrieve the CPUArchIdList */
+ r->guest_cpu_list = mc->possible_cpu_arch_ids(ms);
+
+    /* Max number of CPUs on the host */
+ r->host_topo.maxcpus = vmsr_get_maxcpus();
+ if (r->host_topo.maxcpus == 0) {
+ error_report("host max cpus = 0");
+ return -1;
+ }
+
+ /* Max number of packages on the host */
+ r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus);
+ if (r->host_topo.maxpkgs == 0) {
+ error_report("host max pkgs = 0");
+ return -1;
+ }
+
+ /* Allocate memory for each package on the host */
+ r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs);
+ r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs);
+
+ vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count,
+ r->host_topo.maxpkgs);
+ for (int i = 0; i < r->host_topo.maxpkgs; i++) {
+ if (r->host_topo.pkg_cpu_count[i] == 0) {
+ error_report("cpu per packages = 0 on package_%d", i);
+ return -1;
}
}
- has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
- has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
+    /* Get QEMU PID */
+ r->pid = getpid();
- hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
+ /* Compute the socket path if necessary */
+ if (s->msr_energy.socket_path == NULL) {
+ s->msr_energy.socket_path = vmsr_compute_default_paths();
+ }
+
+ /* Open socket with vmsr helper */
+ s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path);
+
+ if (s->msr_energy.sioc == NULL) {
+ error_report("vmsr socket opening failed");
+ return -1;
+ }
+
+    /* These MSR values should not change */
+ r->msr_unit = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid,
+ s->msr_energy.sioc);
+ r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid,
+ s->msr_energy.sioc);
+ r->msr_info = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid,
+ s->msr_energy.sioc);
+ if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) {
+ error_report("can't read any virtual msr");
+ return -1;
+ }
+ qemu_thread_create(&r->msr_thr, "kvm-msr",
+ kvm_msr_energy_thread,
+ s, QEMU_THREAD_JOINABLE);
+ return 0;
+}
+
+int kvm_arch_get_default_type(MachineState *ms)
+{
+ return 0;
+}
+
+static int kvm_vm_enable_exception_payload(KVMState *s)
+{
+ int ret = 0;
has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
if (has_exception_payload) {
ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
if (ret < 0) {
error_report("kvm: Failed to enable exception payload cap: %s",
strerror(-ret));
- return ret;
}
}
- has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
+ return ret;
+}
+
+static int kvm_vm_enable_triple_fault_event(KVMState *s)
+{
+ int ret = 0;
+    has_triple_fault_event =
+        kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
if (has_triple_fault_event) {
ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true);
if (ret < 0) {
error_report("kvm: Failed to enable triple fault event cap: %s",
strerror(-ret));
+ }
+ }
+ return ret;
+}
+
+static int kvm_vm_set_identity_map_addr(KVMState *s, uint64_t identity_base)
+{
+ return kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
+}
+
+static int kvm_vm_set_nr_mmu_pages(KVMState *s)
+{
+ uint64_t shadow_mem;
+ int ret = 0;
+ shadow_mem = object_property_get_int(OBJECT(s),
+ "kvm-shadow-mem",
+ &error_abort);
+ if (shadow_mem != -1) {
+ shadow_mem /= 4096;
+ ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
+ }
+ return ret;
+}
+
+static int kvm_vm_set_tss_addr(KVMState *s, uint64_t tss_base)
+{
+ return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, tss_base);
+}
+
+static int kvm_vm_enable_disable_exits(KVMState *s)
+{
+ int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
+
+ if (disable_exits) {
+ disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
+ KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_CSTATE);
+ }
+
+ return kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
+ disable_exits);
+}
+
+static int kvm_vm_enable_bus_lock_exit(KVMState *s)
+{
+ int ret = 0;
+ ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
+ if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
+ error_report("kvm: bus lock detection unsupported");
+ return -ENOTSUP;
+ }
+ ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
+ KVM_BUS_LOCK_DETECTION_EXIT);
+ if (ret < 0) {
+ error_report("kvm: Failed to enable bus lock detection cap: %s",
+ strerror(-ret));
+ }
+
+ return ret;
+}
+
+static int kvm_vm_enable_notify_vmexit(KVMState *s)
+{
+ int ret = 0;
+ if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE) {
+ uint64_t notify_window_flags =
+ ((uint64_t)s->notify_window << 32) |
+ KVM_X86_NOTIFY_VMEXIT_ENABLED |
+ KVM_X86_NOTIFY_VMEXIT_USER;
+ ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0,
+ notify_window_flags);
+ if (ret < 0) {
+ error_report("kvm: Failed to enable notify vmexit cap: %s",
+ strerror(-ret));
+ }
+ }
+ return ret;
+}
+
+static int kvm_vm_enable_userspace_msr(KVMState *s)
+{
+ int ret;
+
+ ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0,
+ KVM_MSR_EXIT_REASON_FILTER);
+ if (ret < 0) {
+ error_report("Could not enable user space MSRs: %s",
+ strerror(-ret));
+ exit(1);
+ }
+
+ ret = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT,
+ kvm_rdmsr_core_thread_count, NULL);
+ if (ret < 0) {
+ error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s",
+ strerror(-ret));
+ exit(1);
+ }
+
+ return 0;
+}
+
+static int kvm_vm_enable_energy_msrs(KVMState *s)
+{
+ int ret;
+
+ if (s->msr_energy.enable == true) {
+ ret = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
+ kvm_rdmsr_rapl_power_unit, NULL);
+ if (ret < 0) {
+ error_report("Could not install MSR_RAPL_POWER_UNIT handler: %s",
+ strerror(-ret));
+ return ret;
+ }
+
+ ret = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
+ kvm_rdmsr_pkg_power_limit, NULL);
+ if (ret < 0) {
+ error_report("Could not install MSR_PKG_POWER_LIMIT handler: %s",
+ strerror(-ret));
+ return ret;
+ }
+
+ ret = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
+ kvm_rdmsr_pkg_power_info, NULL);
+ if (ret < 0) {
+ error_report("Could not install MSR_PKG_POWER_INFO handler: %s",
+ strerror(-ret));
+ return ret;
+ }
+ ret = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
+ kvm_rdmsr_pkg_energy_status, NULL);
+ if (ret < 0) {
+ error_report("Could not install MSR_PKG_ENERGY_STATUS handler: %s",
+ strerror(-ret));
return ret;
}
}
+ return 0;
+}
+
+int kvm_arch_init(MachineState *ms, KVMState *s)
+{
+ int ret;
+ struct utsname utsname;
+ Error *local_err = NULL;
+
+ /*
+ * Initialize confidential guest (SEV/TDX) context, if required
+ */
+ if (ms->cgs) {
+ ret = confidential_guest_kvm_init(ms->cgs, &local_err);
+ if (ret < 0) {
+ error_report_err(local_err);
+ return ret;
+ }
+ }
+
+ has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
+ has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
+
+ hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
+
+ ret = kvm_vm_enable_exception_payload(s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = kvm_vm_enable_triple_fault_event(s);
+ if (ret < 0) {
+ return ret;
+ }
if (s->xen_version) {
#ifdef CONFIG_XEN_EMU
@@ -2680,47 +3279,33 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
return ret;
}
- kvm_get_supported_feature_msrs(s);
+ ret = kvm_get_supported_feature_msrs(s);
+ if (ret < 0) {
+ return ret;
+ }
uname(&utsname);
lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
- /*
- * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
- * In order to use vm86 mode, an EPT identity map and a TSS are needed.
- * Since these must be part of guest physical memory, we need to allocate
- * them, both by setting their start addresses in the kernel and by
- * creating a corresponding e820 entry. We need 4 pages before the BIOS,
- * so this value allows up to 16M BIOSes.
- */
- identity_base = 0xfeffc000;
- ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
+ ret = kvm_vm_set_identity_map_addr(s, KVM_IDENTITY_BASE);
if (ret < 0) {
return ret;
}
/* Set TSS base one page after EPT identity map. */
- ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
+ ret = kvm_vm_set_tss_addr(s, KVM_IDENTITY_BASE + 0x1000);
if (ret < 0) {
return ret;
}
/* Tell fw_cfg to notify the BIOS to reserve the range. */
- ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
+ e820_add_entry(KVM_IDENTITY_BASE, 0x4000, E820_RESERVED);
+
+ ret = kvm_vm_set_nr_mmu_pages(s);
if (ret < 0) {
- fprintf(stderr, "e820_add_entry() table is full\n");
return ret;
}
- shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
- if (shadow_mem != -1) {
- shadow_mem /= 4096;
- ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
- if (ret < 0) {
- return ret;
- }
- }
-
if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
@@ -2729,23 +3314,11 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
}
if (enable_cpu_pm) {
- int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
-/* Work around for kernel header with a typo. TODO: fix header and drop. */
-#if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
-#define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
-#endif
- if (disable_exits) {
- disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
- KVM_X86_DISABLE_EXITS_HLT |
- KVM_X86_DISABLE_EXITS_PAUSE |
- KVM_X86_DISABLE_EXITS_CSTATE);
- }
-
- ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
- disable_exits);
+ ret = kvm_vm_enable_disable_exits(s);
if (ret < 0) {
error_report("kvm: guest stopping CPU not supported: %s",
strerror(-ret));
+ return ret;
}
}
@@ -2753,16 +3326,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
X86MachineState *x86ms = X86_MACHINE(ms);
if (x86ms->bus_lock_ratelimit > 0) {
- ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
- if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
- error_report("kvm: bus lock detection unsupported");
- return -ENOTSUP;
- }
- ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
- KVM_BUS_LOCK_DETECTION_EXIT);
+ ret = kvm_vm_enable_bus_lock_exit(s);
if (ret < 0) {
- error_report("kvm: Failed to enable bus lock detection cap: %s",
- strerror(-ret));
return ret;
}
ratelimit_init(&bus_lock_ratelimit_ctrl);
@@ -2771,37 +3336,30 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
}
}
- if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE &&
- kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) {
- uint64_t notify_window_flags =
- ((uint64_t)s->notify_window << 32) |
- KVM_X86_NOTIFY_VMEXIT_ENABLED |
- KVM_X86_NOTIFY_VMEXIT_USER;
- ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0,
- notify_window_flags);
- if (ret < 0) {
- error_report("kvm: Failed to enable notify vmexit cap: %s",
- strerror(-ret));
- return ret;
- }
+ if (kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) {
+ ret = kvm_vm_enable_notify_vmexit(s);
+ if (ret < 0) {
+ return ret;
+ }
}
- if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) {
- bool r;
- ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0,
- KVM_MSR_EXIT_REASON_FILTER);
- if (ret) {
- error_report("Could not enable user space MSRs: %s",
- strerror(-ret));
- exit(1);
+ if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) {
+ ret = kvm_vm_enable_userspace_msr(s);
+ if (ret < 0) {
+ return ret;
}
- r = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT,
- kvm_rdmsr_core_thread_count, NULL);
- if (!r) {
- error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s",
- strerror(-ret));
- exit(1);
+ if (s->msr_energy.enable == true) {
+ ret = kvm_vm_enable_energy_msrs(s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = kvm_msr_energy_thread_init(s, ms);
+ if (ret < 0) {
+            error_report("kvm: RAPL feature requirements not met");
+ return ret;
+ }
}
}
@@ -3264,7 +3822,14 @@ static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
CR4_VMXE_MASK);
- if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
+ if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
+ /* FRED injected-event data (0x2052). */
+ kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52);
+ } else if (f[FEAT_VMX_EXIT_CTLS] &
+ VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) {
+ /* Secondary VM-exit controls (0x2044). */
+ kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44);
+ } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
/* TSC multiplier (0x2032). */
kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
} else {
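
The VMCS_ENUM values above follow from how IA32_VMX_VMCS_ENUM reports the highest VMCS field index: bits 9:1 of the MSR mirror bits 9:1 of the highest supported field encoding (per the SDM's VMCS enumeration appendix). A small derivation:

/* Keep bits 9:1 of a VMCS field encoding, as reported by VMCS_ENUM. */
static uint32_t vmcs_enum_for_field(uint32_t encoding)
{
    return encoding & 0x3fe;
}

/* vmcs_enum_for_field(0x2052) == 0x52  (FRED injected-event data)   */
/* vmcs_enum_for_field(0x2044) == 0x44  (secondary VM-exit controls) */
/* vmcs_enum_for_field(0x2032) == 0x32  (TSC multiplier)             */
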
@@ -3307,32 +3872,34 @@ static void kvm_init_msrs(X86CPU *cpu)
CPUX86State *env = &cpu->env;
kvm_msr_buf_reset(cpu);
- if (has_msr_arch_capabs) {
- kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
- env->features[FEAT_ARCH_CAPABILITIES]);
- }
- if (has_msr_core_capabs) {
- kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
- env->features[FEAT_CORE_CAPABILITY]);
- }
+ if (!is_tdx_vm()) {
+ if (has_msr_arch_capabs) {
+ kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
+ env->features[FEAT_ARCH_CAPABILITIES]);
+ }
+
+ if (has_msr_core_capabs) {
+ kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
+ env->features[FEAT_CORE_CAPABILITY]);
+ }
- if (has_msr_perf_capabs && cpu->enable_pmu) {
- kvm_msr_entry_add_perf(cpu, env->features);
+ if (has_msr_perf_capabs && cpu->enable_pmu) {
+ kvm_msr_entry_add_perf(cpu, env->features);
+ }
+
+ /*
+ * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
+ * all kernels with MSR features should have them.
+ */
+ if (kvm_feature_msrs && cpu_has_vmx(env)) {
+ kvm_msr_entry_add_vmx(cpu, env->features);
+ }
}
if (has_msr_ucode_rev) {
kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
}
-
- /*
- * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
- * all kernels with MSR features should have them.
- */
- if (kvm_feature_msrs && cpu_has_vmx(env)) {
- kvm_msr_entry_add_vmx(cpu, env->features);
- }
-
assert(kvm_buf_set_msrs(cpu) == 0);
}
@@ -3394,6 +3961,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
if (has_msr_virt_ssbd) {
kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
}
+ if (has_msr_hwcr) {
+ kvm_msr_entry_add(cpu, MSR_K7_HWCR, env->msr_hwcr);
+ }
#ifdef TARGET_X86_64
if (lm_capable_kernel) {
@@ -3421,22 +3991,24 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
*/
if (level >= KVM_PUT_RESET_STATE) {
kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
- kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
- kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
+ if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) {
+ kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
+ kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
+ }
+ if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) {
kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) {
kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) {
kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) {
kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) {
kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
}
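
For reference, the CPUID_KVM_CLOCK / CPUID_KVM_CLOCK2 gate above distinguishes the legacy kvmclock MSR pair (the defines removed at the top of this patch) from the newer pair in the KVM paravirt range, with values as defined in <asm/kvm_para.h>:

#define MSR_KVM_WALL_CLOCK       0x11          /* legacy, CPUID_KVM_CLOCK */
#define MSR_KVM_SYSTEM_TIME      0x12
#define MSR_KVM_WALL_CLOCK_NEW   0x4b564d00    /* CPUID_KVM_CLOCK2 */
#define MSR_KVM_SYSTEM_TIME_NEW  0x4b564d01
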
@@ -3494,13 +4066,11 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
env->msr_hv_tsc_emulation_status);
}
-#ifdef CONFIG_SYNDBG
if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) &&
has_msr_hv_syndbg_options) {
kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS,
hyperv_syndbg_query_options());
}
-#endif
}
if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
@@ -3672,7 +4242,8 @@ static int kvm_get_xsave(X86CPU *cpu)
{
CPUX86State *env = &cpu->env;
void *xsave = env->xsave_buf;
- int type, ret;
+ unsigned long type;
+ int ret;
type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
@@ -3877,6 +4448,9 @@ static int kvm_get_msrs(X86CPU *cpu)
kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
env->tsc_valid = !runstate_is_running();
}
+ if (has_msr_hwcr) {
+ kvm_msr_entry_add(cpu, MSR_K7_HWCR, 0);
+ }
#ifdef TARGET_X86_64
if (lm_capable_kernel) {
@@ -3897,21 +4471,23 @@ static int kvm_get_msrs(X86CPU *cpu)
}
}
#endif
- kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
- kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
+ if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) {
+ kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
+ kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
+ }
+ if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) {
kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) {
kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) {
kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) {
kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
}
- if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
+ if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) {
kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
}
if (has_architectural_pmu_version > 0) {
@@ -4396,6 +4972,9 @@ static int kvm_get_msrs(X86CPU *cpu)
case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
break;
+ case MSR_K7_HWCR:
+ env->msr_hwcr = msrs[i].data;
+ break;
}
}
@@ -4687,7 +5266,7 @@ static int kvm_get_nested_state(X86CPU *cpu)
return ret;
}
-int kvm_arch_put_registers(CPUState *cpu, int level)
+int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp)
{
X86CPU *x86_cpu = X86_CPU(cpu);
int ret;
@@ -4702,6 +5281,7 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
if (level >= KVM_PUT_RESET_STATE) {
ret = kvm_put_msr_feature_control(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set feature control MSR");
return ret;
}
}
@@ -4709,12 +5289,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
/* must be before kvm_put_nested_state so that EFER.SVME is set */
ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set special registers");
return ret;
}
if (level >= KVM_PUT_RESET_STATE) {
ret = kvm_put_nested_state(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set nested state");
return ret;
}
}
@@ -4732,6 +5314,7 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) {
ret = kvm_put_xen_state(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set Xen state");
return ret;
}
}
@@ -4739,43 +5322,51 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
ret = kvm_getput_regs(x86_cpu, 1);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set general purpose registers");
return ret;
}
ret = kvm_put_xsave(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set XSAVE");
return ret;
}
ret = kvm_put_xcrs(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set XCRs");
return ret;
}
ret = kvm_put_msrs(x86_cpu, level);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set MSRs");
return ret;
}
ret = kvm_put_vcpu_events(x86_cpu, level);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set vCPU events");
return ret;
}
if (level >= KVM_PUT_RESET_STATE) {
ret = kvm_put_mp_state(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set MP state");
return ret;
}
}
ret = kvm_put_tscdeadline_msr(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set TSC deadline MSR");
return ret;
}
ret = kvm_put_debugregs(x86_cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to set debug registers");
return ret;
}
return 0;
}
-int kvm_arch_get_registers(CPUState *cs)
+int kvm_arch_get_registers(CPUState *cs, Error **errp)
{
X86CPU *cpu = X86_CPU(cs);
int ret;
@@ -4784,6 +5375,7 @@ int kvm_arch_get_registers(CPUState *cs)
ret = kvm_get_vcpu_events(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get vCPU events");
goto out;
}
/*
@@ -4792,44 +5384,54 @@ int kvm_arch_get_registers(CPUState *cs)
*/
ret = kvm_get_mp_state(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get MP state");
goto out;
}
ret = kvm_getput_regs(cpu, 0);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get general purpose registers");
goto out;
}
ret = kvm_get_xsave(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get XSAVE");
goto out;
}
ret = kvm_get_xcrs(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get XCRs");
goto out;
}
ret = has_sregs2 ? kvm_get_sregs2(cpu) : kvm_get_sregs(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get special registers");
goto out;
}
ret = kvm_get_msrs(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get MSRs");
goto out;
}
ret = kvm_get_apic(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get APIC");
goto out;
}
ret = kvm_get_debugregs(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get debug registers");
goto out;
}
ret = kvm_get_nested_state(cpu);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get nested state");
goto out;
}
#ifdef CONFIG_XEN_EMU
if (xen_mode == XEN_EMULATE) {
ret = kvm_get_xen_state(cs);
if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to get Xen state");
goto out;
}
}
@@ -5260,15 +5862,16 @@ void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
}
}
-static bool kvm_install_msr_filters(KVMState *s)
+static int kvm_install_msr_filters(KVMState *s)
{
uint64_t zero = 0;
struct kvm_msr_filter filter = {
.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
};
- int r, i, j = 0;
+ int i, j = 0;
- for (i = 0; i < KVM_MSR_FILTER_MAX_RANGES; i++) {
+ QEMU_BUILD_BUG_ON(ARRAY_SIZE(msr_handlers) != ARRAY_SIZE(filter.ranges));
+ for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
KVMMSRHandlers *handler = &msr_handlers[i];
if (handler->msr) {
struct kvm_msr_filter_range *range = &filter.ranges[j++];
@@ -5290,18 +5893,13 @@ static bool kvm_install_msr_filters(KVMState *s)
}
}
- r = kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter);
- if (r) {
- return false;
- }
-
- return true;
+ return kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter);
}
-bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
- QEMUWRMSRHandler *wrmsr)
+static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
+ QEMUWRMSRHandler *wrmsr)
{
- int i;
+ int i, ret;
for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
if (!msr_handlers[i].msr) {
@@ -5311,16 +5909,17 @@ bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
.wrmsr = wrmsr,
};
- if (!kvm_install_msr_filters(s)) {
+ ret = kvm_install_msr_filters(s);
+ if (ret) {
msr_handlers[i] = (KVMMSRHandlers) { };
- return false;
+ return ret;
}
- return true;
+ return 0;
}
}
- return false;
+ return -EINVAL;
}
static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run)
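
To illustrate the refactored handler machinery, a hypothetical registration; my_rdmsr_stub and the MSR index 0x123 are invented for the example:

static bool my_rdmsr_stub(X86CPU *cpu, uint32_t msr, uint64_t *val)
{
    *val = 0;        /* read-as-zero */
    return true;     /* handled; returning false reports an error to KVM */
}

static void example_register(KVMState *s)
{
    /* A NULL wrmsr leaves writes unfiltered (default KVM handling). */
    if (kvm_filter_msr(s, 0x123, my_rdmsr_stub, NULL) < 0) {
        warn_report("could not install example MSR filter");
    }
}
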
@@ -5340,7 +5939,7 @@ static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run)
}
}
- assert(false);
+ g_assert_not_reached();
}
static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run)
@@ -5359,7 +5958,7 @@ static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run)
}
}
- assert(false);
+ g_assert_not_reached();
}
static bool has_sgx_provisioning;
@@ -5419,9 +6018,11 @@ static bool host_supports_vmx(void)
* because private/shared page tracking is already provided through other
* means, these 2 use-cases should be treated as being mutually-exclusive.
*/
-static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
+static int kvm_handle_hc_map_gpa_range(X86CPU *cpu, struct kvm_run *run)
{
+ struct kvm_pre_fault_memory mem;
uint64_t gpa, size, attributes;
+ int ret;
if (!machine_require_guest_memfd(current_machine))
return -EINVAL;
@@ -5432,13 +6033,32 @@ static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);
- return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
+ ret = kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
+ if (ret || !kvm_pre_fault_memory_supported) {
+ return ret;
+ }
+
+    /*
+     * Opportunistically pre-fault memory in. Failures are ignored so that
+     * any errors in faulting in the memory are captured in the KVM page
+     * fault path when the guest first accesses the page.
+     */
+ memset(&mem, 0, sizeof(mem));
+ mem.gpa = gpa;
+ mem.size = size;
+ while (mem.size) {
+ if (kvm_vcpu_ioctl(CPU(cpu), KVM_PRE_FAULT_MEMORY, &mem)) {
+ break;
+ }
+ }
+
+ return 0;
}
-static int kvm_handle_hypercall(struct kvm_run *run)
+static int kvm_handle_hypercall(X86CPU *cpu, struct kvm_run *run)
{
if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
- return kvm_handle_hc_map_gpa_range(run);
+ return kvm_handle_hc_map_gpa_range(cpu, run);
return -EINVAL;
}
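
The pre-fault loop in kvm_handle_hc_map_gpa_range() terminates because KVM_PRE_FAULT_MEMORY is documented to update the structure in place on partial progress; a sketch of the contract as I read it from the kernel uapi:

/*
 * KVM_PRE_FAULT_MEMORY advances mem.gpa and decrements mem.size by the
 * amount actually pre-faulted, so repeated calls make forward progress
 * until size reaches 0 or the ioctl fails.
 */
struct kvm_pre_fault_memory mem = { .gpa = gpa, .size = size };

while (mem.size) {
    if (kvm_vcpu_ioctl(CPU(cpu), KVM_PRE_FAULT_MEMORY, &mem)) {
        break;   /* best effort: remaining pages fault in lazily */
    }
}
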
@@ -5538,7 +6158,32 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
break;
#endif
case KVM_EXIT_HYPERCALL:
- ret = kvm_handle_hypercall(run);
+ ret = kvm_handle_hypercall(cpu, run);
+ break;
+ case KVM_EXIT_SYSTEM_EVENT:
+ switch (run->system_event.type) {
+ case KVM_SYSTEM_EVENT_TDX_FATAL:
+ ret = tdx_handle_report_fatal_error(cpu, run);
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+ break;
+ case KVM_EXIT_TDX:
+ /*
+ * run->tdx is already set up for the case where userspace
+ * does not handle the TDVMCALL.
+ */
+ switch (run->tdx.nr) {
+ case TDVMCALL_GET_QUOTE:
+ tdx_handle_get_quote(cpu, run);
+ break;
+ case TDVMCALL_GET_TD_VM_CALL_INFO:
+ tdx_handle_get_tdvmcall_info(cpu, run);
+ break;
+ }
+ ret = 0;
break;
default:
fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
@@ -5612,7 +6257,7 @@ uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
return address;
}
env = &X86_CPU(first_cpu)->env;
- if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
+ if (!(env->features[FEAT_KVM] & CPUID_KVM_MSI_EXT_DEST_ID)) {
return address;
}