diff options
Diffstat (limited to 'target/i386/mshv/mshv-cpu.c')
-rw-r--r-- | target/i386/mshv/mshv-cpu.c | 1763 |
1 files changed, 1763 insertions, 0 deletions
diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c new file mode 100644 index 0000000..1f7b9cb --- /dev/null +++ b/target/i386/mshv/mshv-cpu.c @@ -0,0 +1,1763 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Jinank Jain <jinankjain@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/memalign.h" +#include "qemu/typedefs.h" + +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "system/address-spaces.h" +#include "linux/mshv.h" +#include "hw/hyperv/hvgdk.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "hw/hyperv/hvhdk_mini.h" +#include "hw/i386/apic_internal.h" + +#include "cpu.h" +#include "emulate/x86_decode.h" +#include "emulate/x86_emu.h" +#include "emulate/x86_flags.h" + +#include "trace-accel_mshv.h" +#include "trace.h" + +#include <sys/ioctl.h> + +#define MAX_REGISTER_COUNT (MAX_CONST(ARRAY_SIZE(STANDARD_REGISTER_NAMES), \ + MAX_CONST(ARRAY_SIZE(SPECIAL_REGISTER_NAMES), \ + ARRAY_SIZE(FPU_REGISTER_NAMES)))) + +static enum hv_register_name STANDARD_REGISTER_NAMES[18] = { + HV_X64_REGISTER_RAX, + HV_X64_REGISTER_RBX, + HV_X64_REGISTER_RCX, + HV_X64_REGISTER_RDX, + HV_X64_REGISTER_RSI, + HV_X64_REGISTER_RDI, + HV_X64_REGISTER_RSP, + HV_X64_REGISTER_RBP, + HV_X64_REGISTER_R8, + HV_X64_REGISTER_R9, + HV_X64_REGISTER_R10, + HV_X64_REGISTER_R11, + HV_X64_REGISTER_R12, + HV_X64_REGISTER_R13, + HV_X64_REGISTER_R14, + HV_X64_REGISTER_R15, + HV_X64_REGISTER_RIP, + HV_X64_REGISTER_RFLAGS, +}; + +static enum hv_register_name SPECIAL_REGISTER_NAMES[17] = { + HV_X64_REGISTER_CS, + HV_X64_REGISTER_DS, + HV_X64_REGISTER_ES, + HV_X64_REGISTER_FS, + HV_X64_REGISTER_GS, + HV_X64_REGISTER_SS, + HV_X64_REGISTER_TR, + HV_X64_REGISTER_LDTR, + HV_X64_REGISTER_GDTR, + HV_X64_REGISTER_IDTR, + HV_X64_REGISTER_CR0, + HV_X64_REGISTER_CR2, + HV_X64_REGISTER_CR3, + HV_X64_REGISTER_CR4, + HV_X64_REGISTER_CR8, + HV_X64_REGISTER_EFER, + HV_X64_REGISTER_APIC_BASE, +}; + +static enum hv_register_name FPU_REGISTER_NAMES[26] = { + HV_X64_REGISTER_XMM0, + HV_X64_REGISTER_XMM1, + HV_X64_REGISTER_XMM2, + HV_X64_REGISTER_XMM3, + HV_X64_REGISTER_XMM4, + HV_X64_REGISTER_XMM5, + HV_X64_REGISTER_XMM6, + HV_X64_REGISTER_XMM7, + HV_X64_REGISTER_XMM8, + HV_X64_REGISTER_XMM9, + HV_X64_REGISTER_XMM10, + HV_X64_REGISTER_XMM11, + HV_X64_REGISTER_XMM12, + HV_X64_REGISTER_XMM13, + HV_X64_REGISTER_XMM14, + HV_X64_REGISTER_XMM15, + HV_X64_REGISTER_FP_MMX0, + HV_X64_REGISTER_FP_MMX1, + HV_X64_REGISTER_FP_MMX2, + HV_X64_REGISTER_FP_MMX3, + HV_X64_REGISTER_FP_MMX4, + HV_X64_REGISTER_FP_MMX5, + HV_X64_REGISTER_FP_MMX6, + HV_X64_REGISTER_FP_MMX7, + HV_X64_REGISTER_FP_CONTROL_STATUS, + HV_X64_REGISTER_XMM_CONTROL_STATUS, +}; + +static int translate_gva(const CPUState *cpu, uint64_t gva, uint64_t *gpa, + uint64_t flags) +{ + int ret; + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + + hv_input_translate_virtual_address in = { 0 }; + hv_output_translate_virtual_address out = { 0 }; + struct mshv_root_hvcall args = {0}; + uint64_t gva_page = gva >> HV_HYP_PAGE_SHIFT; + + in.vp_index = vp_index; + in.control_flags = flags; + in.gva_page = gva_page; + + /* create the hvcall envelope */ + args.code = HVCALL_TRANSLATE_VIRTUAL_ADDRESS; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t) ∈ + args.out_sz = sizeof(out); + args.out_ptr = (uint64_t) &out; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("Failed to invoke gva->gpa translation"); + return -errno; + } + + if (out.translation_result.result_code != HV_TRANSLATE_GVA_SUCCESS) { + error_report("Failed to translate gva (" TARGET_FMT_lx ") to gpa", gva); + return -1; + } + + *gpa = ((out.gpa_page << HV_HYP_PAGE_SHIFT) + | (gva & ~(uint64_t)HV_HYP_PAGE_MASK)); + + return 0; +} + +int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, + size_t n_regs) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + size_t in_sz, assocs_sz; + hv_input_set_vp_registers *in = cpu->accel->hvcall_args.input_page; + struct mshv_root_hvcall args = {0}; + int ret; + + /* find out the size of the struct w/ a flexible array at the tail */ + assocs_sz = n_regs * sizeof(hv_register_assoc); + in_sz = sizeof(hv_input_set_vp_registers) + assocs_sz; + + /* fill the input struct */ + memset(in, 0, sizeof(hv_input_set_vp_registers)); + in->vp_index = vp_index; + memcpy(in->elements, assocs, assocs_sz); + + /* create the hvcall envelope */ + args.code = HVCALL_SET_VP_REGISTERS; + args.in_sz = in_sz; + args.in_ptr = (uint64_t) in; + args.reps = (uint16_t) n_regs; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("Failed to set registers"); + return -1; + } + + /* assert we set all registers */ + if (args.reps != n_regs) { + error_report("Failed to set registers: expected %zu elements" + ", got %u", n_regs, args.reps); + return -1; + } + + return 0; +} + +static int get_generic_regs(CPUState *cpu, hv_register_assoc *assocs, + size_t n_regs) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + hv_input_get_vp_registers *in = cpu->accel->hvcall_args.input_page; + hv_register_value *values = cpu->accel->hvcall_args.output_page; + size_t in_sz, names_sz, values_sz; + int i, ret; + struct mshv_root_hvcall args = {0}; + + /* find out the size of the struct w/ a flexible array at the tail */ + names_sz = n_regs * sizeof(hv_register_name); + in_sz = sizeof(hv_input_get_vp_registers) + names_sz; + + /* fill the input struct */ + memset(in, 0, sizeof(hv_input_get_vp_registers)); + in->vp_index = vp_index; + for (i = 0; i < n_regs; i++) { + in->names[i] = assocs[i].name; + } + + /* determine size of value output buffer */ + values_sz = n_regs * sizeof(union hv_register_value); + + /* create the hvcall envelope */ + args.code = HVCALL_GET_VP_REGISTERS; + args.in_sz = in_sz; + args.in_ptr = (uint64_t) in; + args.out_sz = values_sz; + args.out_ptr = (uint64_t) values; + args.reps = (uint16_t) n_regs; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("Failed to retrieve registers"); + return -1; + } + + /* assert we got all registers */ + if (args.reps != n_regs) { + error_report("Failed to retrieve registers: expected %zu elements" + ", got %u", n_regs, args.reps); + return -1; + } + + /* copy values into assoc */ + for (i = 0; i < n_regs; i++) { + assocs[i].value = values[i]; + } + + return 0; +} + +static int set_standard_regs(const CPUState *cpu) +{ + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + hv_register_assoc assocs[ARRAY_SIZE(STANDARD_REGISTER_NAMES)]; + int ret; + size_t n_regs = ARRAY_SIZE(STANDARD_REGISTER_NAMES); + + /* set names */ + for (size_t i = 0; i < ARRAY_SIZE(STANDARD_REGISTER_NAMES); i++) { + assocs[i].name = STANDARD_REGISTER_NAMES[i]; + } + assocs[0].value.reg64 = env->regs[R_EAX]; + assocs[1].value.reg64 = env->regs[R_EBX]; + assocs[2].value.reg64 = env->regs[R_ECX]; + assocs[3].value.reg64 = env->regs[R_EDX]; + assocs[4].value.reg64 = env->regs[R_ESI]; + assocs[5].value.reg64 = env->regs[R_EDI]; + assocs[6].value.reg64 = env->regs[R_ESP]; + assocs[7].value.reg64 = env->regs[R_EBP]; + assocs[8].value.reg64 = env->regs[R_R8]; + assocs[9].value.reg64 = env->regs[R_R9]; + assocs[10].value.reg64 = env->regs[R_R10]; + assocs[11].value.reg64 = env->regs[R_R11]; + assocs[12].value.reg64 = env->regs[R_R12]; + assocs[13].value.reg64 = env->regs[R_R13]; + assocs[14].value.reg64 = env->regs[R_R14]; + assocs[15].value.reg64 = env->regs[R_R15]; + assocs[16].value.reg64 = env->eip; + lflags_to_rflags(env); + assocs[17].value.reg64 = env->eflags; + + ret = mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set standard registers"); + return -errno; + } + return 0; +} + +int mshv_store_regs(CPUState *cpu) +{ + int ret; + + ret = set_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to store standard registers"); + return -1; + } + + return 0; +} + +static void populate_standard_regs(const hv_register_assoc *assocs, + CPUX86State *env) +{ + env->regs[R_EAX] = assocs[0].value.reg64; + env->regs[R_EBX] = assocs[1].value.reg64; + env->regs[R_ECX] = assocs[2].value.reg64; + env->regs[R_EDX] = assocs[3].value.reg64; + env->regs[R_ESI] = assocs[4].value.reg64; + env->regs[R_EDI] = assocs[5].value.reg64; + env->regs[R_ESP] = assocs[6].value.reg64; + env->regs[R_EBP] = assocs[7].value.reg64; + env->regs[R_R8] = assocs[8].value.reg64; + env->regs[R_R9] = assocs[9].value.reg64; + env->regs[R_R10] = assocs[10].value.reg64; + env->regs[R_R11] = assocs[11].value.reg64; + env->regs[R_R12] = assocs[12].value.reg64; + env->regs[R_R13] = assocs[13].value.reg64; + env->regs[R_R14] = assocs[14].value.reg64; + env->regs[R_R15] = assocs[15].value.reg64; + + env->eip = assocs[16].value.reg64; + env->eflags = assocs[17].value.reg64; + rflags_to_lflags(env); +} + +int mshv_get_standard_regs(CPUState *cpu) +{ + struct hv_register_assoc assocs[ARRAY_SIZE(STANDARD_REGISTER_NAMES)]; + int ret; + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + size_t n_regs = ARRAY_SIZE(STANDARD_REGISTER_NAMES); + + for (size_t i = 0; i < n_regs; i++) { + assocs[i].name = STANDARD_REGISTER_NAMES[i]; + } + ret = get_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to get standard registers"); + return -1; + } + + populate_standard_regs(assocs, env); + return 0; +} + +static inline void populate_segment_reg(const hv_x64_segment_register *hv_seg, + SegmentCache *seg) +{ + memset(seg, 0, sizeof(SegmentCache)); + + seg->base = hv_seg->base; + seg->limit = hv_seg->limit; + seg->selector = hv_seg->selector; + + seg->flags = (hv_seg->segment_type << DESC_TYPE_SHIFT) + | (hv_seg->present * DESC_P_MASK) + | (hv_seg->descriptor_privilege_level << DESC_DPL_SHIFT) + | (hv_seg->_default << DESC_B_SHIFT) + | (hv_seg->non_system_segment * DESC_S_MASK) + | (hv_seg->_long << DESC_L_SHIFT) + | (hv_seg->granularity * DESC_G_MASK) + | (hv_seg->available * DESC_AVL_MASK); + +} + +static inline void populate_table_reg(const hv_x64_table_register *hv_seg, + SegmentCache *tbl) +{ + memset(tbl, 0, sizeof(SegmentCache)); + + tbl->base = hv_seg->base; + tbl->limit = hv_seg->limit; +} + +static void populate_special_regs(const hv_register_assoc *assocs, + X86CPU *x86cpu) +{ + CPUX86State *env = &x86cpu->env; + + populate_segment_reg(&assocs[0].value.segment, &env->segs[R_CS]); + populate_segment_reg(&assocs[1].value.segment, &env->segs[R_DS]); + populate_segment_reg(&assocs[2].value.segment, &env->segs[R_ES]); + populate_segment_reg(&assocs[3].value.segment, &env->segs[R_FS]); + populate_segment_reg(&assocs[4].value.segment, &env->segs[R_GS]); + populate_segment_reg(&assocs[5].value.segment, &env->segs[R_SS]); + + populate_segment_reg(&assocs[6].value.segment, &env->tr); + populate_segment_reg(&assocs[7].value.segment, &env->ldt); + + populate_table_reg(&assocs[8].value.table, &env->gdt); + populate_table_reg(&assocs[9].value.table, &env->idt); + + env->cr[0] = assocs[10].value.reg64; + env->cr[2] = assocs[11].value.reg64; + env->cr[3] = assocs[12].value.reg64; + env->cr[4] = assocs[13].value.reg64; + + cpu_set_apic_tpr(x86cpu->apic_state, assocs[14].value.reg64); + env->efer = assocs[15].value.reg64; + cpu_set_apic_base(x86cpu->apic_state, assocs[16].value.reg64); +} + + +int mshv_get_special_regs(CPUState *cpu) +{ + struct hv_register_assoc assocs[ARRAY_SIZE(SPECIAL_REGISTER_NAMES)]; + int ret; + X86CPU *x86cpu = X86_CPU(cpu); + size_t n_regs = ARRAY_SIZE(SPECIAL_REGISTER_NAMES); + + for (size_t i = 0; i < n_regs; i++) { + assocs[i].name = SPECIAL_REGISTER_NAMES[i]; + } + ret = get_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to get special registers"); + return -errno; + } + + populate_special_regs(assocs, x86cpu); + return 0; +} + +int mshv_load_regs(CPUState *cpu) +{ + int ret; + + ret = mshv_get_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to load standard registers"); + return -1; + } + + ret = mshv_get_special_regs(cpu); + if (ret < 0) { + error_report("Failed to load special registers"); + return -1; + } + + return 0; +} + +static void add_cpuid_entry(GList *cpuid_entries, + uint32_t function, uint32_t index, + uint32_t eax, uint32_t ebx, + uint32_t ecx, uint32_t edx) +{ + struct hv_cpuid_entry *entry; + + entry = g_malloc0(sizeof(struct hv_cpuid_entry)); + entry->function = function; + entry->index = index; + entry->eax = eax; + entry->ebx = ebx; + entry->ecx = ecx; + entry->edx = edx; + + cpuid_entries = g_list_append(cpuid_entries, entry); +} + +static void collect_cpuid_entries(const CPUState *cpu, GList *cpuid_entries) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint32_t eax, ebx, ecx, edx; + uint32_t leaf, subleaf; + size_t max_leaf = 0x1F; + size_t max_subleaf = 0x20; + + uint32_t leaves_with_subleaves[] = {0x4, 0x7, 0xD, 0xF, 0x10}; + int n_subleaf_leaves = ARRAY_SIZE(leaves_with_subleaves); + + /* Regular leaves without subleaves */ + for (leaf = 0; leaf <= max_leaf; leaf++) { + bool has_subleaves = false; + for (int i = 0; i < n_subleaf_leaves; i++) { + if (leaf == leaves_with_subleaves[i]) { + has_subleaves = true; + break; + } + } + + if (!has_subleaves) { + cpu_x86_cpuid(env, leaf, 0, &eax, &ebx, &ecx, &edx); + if (eax == 0 && ebx == 0 && ecx == 0 && edx == 0) { + /* all zeroes indicates no more leaves */ + continue; + } + + add_cpuid_entry(cpuid_entries, leaf, 0, eax, ebx, ecx, edx); + continue; + } + + subleaf = 0; + while (subleaf < max_subleaf) { + cpu_x86_cpuid(env, leaf, subleaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0 && ebx == 0 && ecx == 0 && edx == 0) { + /* all zeroes indicates no more leaves */ + break; + } + add_cpuid_entry(cpuid_entries, leaf, 0, eax, ebx, ecx, edx); + subleaf++; + } + } +} + +static int register_intercept_result_cpuid_entry(const CPUState *cpu, + uint8_t subleaf_specific, + uint8_t always_override, + struct hv_cpuid_entry *entry) +{ + int ret; + int vp_index = cpu->cpu_index; + int cpu_fd = mshv_vcpufd(cpu); + + struct hv_register_x64_cpuid_result_parameters cpuid_params = { + .input.eax = entry->function, + .input.ecx = entry->index, + .input.subleaf_specific = subleaf_specific, + .input.always_override = always_override, + .input.padding = 0, + /* + * With regard to masks - these are to specify bits to be overwritten + * The current CpuidEntry structure wouldn't allow to carry the masks + * in addition to the actual register values. For this reason, the + * masks are set to the exact values of the corresponding register bits + * to be registered for an overwrite. To view resulting values the + * hypervisor would return, HvCallGetVpCpuidValues hypercall can be + * used. + */ + .result.eax = entry->eax, + .result.eax_mask = entry->eax, + .result.ebx = entry->ebx, + .result.ebx_mask = entry->ebx, + .result.ecx = entry->ecx, + .result.ecx_mask = entry->ecx, + .result.edx = entry->edx, + .result.edx_mask = entry->edx, + }; + union hv_register_intercept_result_parameters parameters = { + .cpuid = cpuid_params, + }; + + hv_input_register_intercept_result in = {0}; + in.vp_index = vp_index; + in.intercept_type = HV_INTERCEPT_TYPE_X64_CPUID; + in.parameters = parameters; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_REGISTER_INTERCEPT_RESULT; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)∈ + + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("failed to register intercept result for cpuid"); + return -1; + } + + return 0; +} + +static int register_intercept_result_cpuid(const CPUState *cpu, + struct hv_cpuid *cpuid) +{ + int ret = 0, entry_ret; + struct hv_cpuid_entry *entry; + uint8_t subleaf_specific, always_override; + + for (size_t i = 0; i < cpuid->nent; i++) { + entry = &cpuid->entries[i]; + + /* set defaults */ + subleaf_specific = 0; + always_override = 1; + + /* Intel */ + /* 0xb - Extended Topology Enumeration Leaf */ + /* 0x1f - V2 Extended Topology Enumeration Leaf */ + /* AMD */ + /* 0x8000_001e - Processor Topology Information */ + /* 0x8000_0026 - Extended CPU Topology */ + if (entry->function == 0xb + || entry->function == 0x1f + || entry->function == 0x8000001e + || entry->function == 0x80000026) { + subleaf_specific = 1; + always_override = 1; + } else if (entry->function == 0x00000001 + || entry->function == 0x80000000 + || entry->function == 0x80000001 + || entry->function == 0x80000008) { + subleaf_specific = 0; + always_override = 1; + } + + entry_ret = register_intercept_result_cpuid_entry(cpu, subleaf_specific, + always_override, + entry); + if ((entry_ret < 0) && (ret == 0)) { + ret = entry_ret; + } + } + + return ret; +} + +static int set_cpuid2(const CPUState *cpu) +{ + int ret; + size_t n_entries, cpuid_size; + struct hv_cpuid *cpuid; + struct hv_cpuid_entry *entry; + GList *entries = NULL; + + collect_cpuid_entries(cpu, entries); + n_entries = g_list_length(entries); + + cpuid_size = sizeof(struct hv_cpuid) + + n_entries * sizeof(struct hv_cpuid_entry); + + cpuid = g_malloc0(cpuid_size); + cpuid->nent = n_entries; + cpuid->padding = 0; + + for (size_t i = 0; i < n_entries; i++) { + entry = g_list_nth_data(entries, i); + cpuid->entries[i] = *entry; + g_free(entry); + } + g_list_free(entries); + + ret = register_intercept_result_cpuid(cpu, cpuid); + g_free(cpuid); + if (ret < 0) { + return ret; + } + + return 0; +} + +static inline void populate_hv_segment_reg(SegmentCache *seg, + hv_x64_segment_register *hv_reg) +{ + uint32_t flags = seg->flags; + + hv_reg->base = seg->base; + hv_reg->limit = seg->limit; + hv_reg->selector = seg->selector; + hv_reg->segment_type = (flags >> DESC_TYPE_SHIFT) & 0xF; + hv_reg->non_system_segment = (flags & DESC_S_MASK) != 0; + hv_reg->descriptor_privilege_level = (flags >> DESC_DPL_SHIFT) & 0x3; + hv_reg->present = (flags & DESC_P_MASK) != 0; + hv_reg->reserved = 0; + hv_reg->available = (flags & DESC_AVL_MASK) != 0; + hv_reg->_long = (flags >> DESC_L_SHIFT) & 0x1; + hv_reg->_default = (flags >> DESC_B_SHIFT) & 0x1; + hv_reg->granularity = (flags & DESC_G_MASK) != 0; +} + +static inline void populate_hv_table_reg(const struct SegmentCache *seg, + hv_x64_table_register *hv_reg) +{ + memset(hv_reg, 0, sizeof(*hv_reg)); + + hv_reg->base = seg->base; + hv_reg->limit = seg->limit; +} + +static int set_special_regs(const CPUState *cpu) +{ + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + struct hv_register_assoc assocs[ARRAY_SIZE(SPECIAL_REGISTER_NAMES)]; + size_t n_regs = ARRAY_SIZE(SPECIAL_REGISTER_NAMES); + int ret; + + /* set names */ + for (size_t i = 0; i < n_regs; i++) { + assocs[i].name = SPECIAL_REGISTER_NAMES[i]; + } + populate_hv_segment_reg(&env->segs[R_CS], &assocs[0].value.segment); + populate_hv_segment_reg(&env->segs[R_DS], &assocs[1].value.segment); + populate_hv_segment_reg(&env->segs[R_ES], &assocs[2].value.segment); + populate_hv_segment_reg(&env->segs[R_FS], &assocs[3].value.segment); + populate_hv_segment_reg(&env->segs[R_GS], &assocs[4].value.segment); + populate_hv_segment_reg(&env->segs[R_SS], &assocs[5].value.segment); + populate_hv_segment_reg(&env->tr, &assocs[6].value.segment); + populate_hv_segment_reg(&env->ldt, &assocs[7].value.segment); + + populate_hv_table_reg(&env->gdt, &assocs[8].value.table); + populate_hv_table_reg(&env->idt, &assocs[9].value.table); + + assocs[10].value.reg64 = env->cr[0]; + assocs[11].value.reg64 = env->cr[2]; + assocs[12].value.reg64 = env->cr[3]; + assocs[13].value.reg64 = env->cr[4]; + assocs[14].value.reg64 = cpu_get_apic_tpr(x86cpu->apic_state); + assocs[15].value.reg64 = env->efer; + assocs[16].value.reg64 = cpu_get_apic_base(x86cpu->apic_state); + + ret = mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set special registers"); + return -1; + } + + return 0; +} + +static int set_fpu(const CPUState *cpu, const struct MshvFPU *regs) +{ + struct hv_register_assoc assocs[ARRAY_SIZE(FPU_REGISTER_NAMES)]; + union hv_register_value *value; + size_t fp_i; + union hv_x64_fp_control_status_register *ctrl_status; + union hv_x64_xmm_control_status_register *xmm_ctrl_status; + int ret; + size_t n_regs = ARRAY_SIZE(FPU_REGISTER_NAMES); + + /* first 16 registers are xmm0-xmm15 */ + for (size_t i = 0; i < 16; i++) { + assocs[i].name = FPU_REGISTER_NAMES[i]; + value = &assocs[i].value; + memcpy(&value->reg128, ®s->xmm[i], 16); + } + + /* next 8 registers are fp_mmx0-fp_mmx7 */ + for (size_t i = 16; i < 24; i++) { + assocs[i].name = FPU_REGISTER_NAMES[i]; + fp_i = (i - 16); + value = &assocs[i].value; + memcpy(&value->reg128, ®s->fpr[fp_i], 16); + } + + /* last two registers are fp_control_status and xmm_control_status */ + assocs[24].name = FPU_REGISTER_NAMES[24]; + value = &assocs[24].value; + ctrl_status = &value->fp_control_status; + ctrl_status->fp_control = regs->fcw; + ctrl_status->fp_status = regs->fsw; + ctrl_status->fp_tag = regs->ftwx; + ctrl_status->reserved = 0; + ctrl_status->last_fp_op = regs->last_opcode; + ctrl_status->last_fp_rip = regs->last_ip; + + assocs[25].name = FPU_REGISTER_NAMES[25]; + value = &assocs[25].value; + xmm_ctrl_status = &value->xmm_control_status; + xmm_ctrl_status->xmm_status_control = regs->mxcsr; + xmm_ctrl_status->xmm_status_control_mask = 0; + xmm_ctrl_status->last_fp_rdp = regs->last_dp; + + ret = mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set fpu registers"); + return -1; + } + + return 0; +} + +static int set_xc_reg(const CPUState *cpu, uint64_t xcr0) +{ + int ret; + struct hv_register_assoc assoc = { + .name = HV_X64_REGISTER_XFEM, + .value.reg64 = xcr0, + }; + + ret = mshv_set_generic_regs(cpu, &assoc, 1); + if (ret < 0) { + error_report("failed to set xcr0"); + return -errno; + } + return 0; +} + +static int set_cpu_state(const CPUState *cpu, const MshvFPU *fpu_regs, + uint64_t xcr0) +{ + int ret; + + ret = set_standard_regs(cpu); + if (ret < 0) { + return ret; + } + ret = set_special_regs(cpu); + if (ret < 0) { + return ret; + } + ret = set_fpu(cpu, fpu_regs); + if (ret < 0) { + return ret; + } + ret = set_xc_reg(cpu, xcr0); + if (ret < 0) { + return ret; + } + return 0; +} + +static int get_vp_state(int cpu_fd, struct mshv_get_set_vp_state *state) +{ + int ret; + + ret = ioctl(cpu_fd, MSHV_GET_VP_STATE, state); + if (ret < 0) { + error_report("failed to get partition state: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int get_lapic(int cpu_fd, + struct hv_local_interrupt_controller_state *state) +{ + int ret; + size_t size = 4096; + /* buffer aligned to 4k, as *state requires that */ + void *buffer = qemu_memalign(size, size); + struct mshv_get_set_vp_state mshv_state = { 0 }; + + mshv_state.buf_ptr = (uint64_t) buffer; + mshv_state.buf_sz = size; + mshv_state.type = MSHV_VP_STATE_LAPIC; + + ret = get_vp_state(cpu_fd, &mshv_state); + if (ret == 0) { + memcpy(state, buffer, sizeof(*state)); + } + qemu_vfree(buffer); + if (ret < 0) { + error_report("failed to get lapic"); + return -1; + } + + return 0; +} + +static uint32_t set_apic_delivery_mode(uint32_t reg, uint32_t mode) +{ + return ((reg) & ~0x700) | ((mode) << 8); +} + +static int set_vp_state(int cpu_fd, const struct mshv_get_set_vp_state *state) +{ + int ret; + + ret = ioctl(cpu_fd, MSHV_SET_VP_STATE, state); + if (ret < 0) { + error_report("failed to set partition state: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int set_lapic(int cpu_fd, + const struct hv_local_interrupt_controller_state *state) +{ + int ret; + size_t size = 4096; + /* buffer aligned to 4k, as *state requires that */ + void *buffer = qemu_memalign(size, size); + struct mshv_get_set_vp_state mshv_state = { 0 }; + + if (!state) { + error_report("lapic state is NULL"); + return -1; + } + memcpy(buffer, state, sizeof(*state)); + + mshv_state.buf_ptr = (uint64_t) buffer; + mshv_state.buf_sz = size; + mshv_state.type = MSHV_VP_STATE_LAPIC; + + ret = set_vp_state(cpu_fd, &mshv_state); + qemu_vfree(buffer); + if (ret < 0) { + error_report("failed to set lapic: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int set_lint(int cpu_fd) +{ + int ret; + uint32_t *lvt_lint0, *lvt_lint1; + + struct hv_local_interrupt_controller_state lapic_state = { 0 }; + ret = get_lapic(cpu_fd, &lapic_state); + if (ret < 0) { + return ret; + } + + lvt_lint0 = &lapic_state.apic_lvt_lint0; + *lvt_lint0 = set_apic_delivery_mode(*lvt_lint0, APIC_DM_EXTINT); + + lvt_lint1 = &lapic_state.apic_lvt_lint1; + *lvt_lint1 = set_apic_delivery_mode(*lvt_lint1, APIC_DM_NMI); + + /* TODO: should we skip setting lapic if the values are the same? */ + + return set_lapic(cpu_fd, &lapic_state); +} + +static int setup_msrs(const CPUState *cpu) +{ + int ret; + uint64_t default_type = MSR_MTRR_ENABLE | MSR_MTRR_MEM_TYPE_WB; + + /* boot msr entries */ + MshvMsrEntry msrs[9] = { + { .index = IA32_MSR_SYSENTER_CS, .data = 0x0, }, + { .index = IA32_MSR_SYSENTER_ESP, .data = 0x0, }, + { .index = IA32_MSR_SYSENTER_EIP, .data = 0x0, }, + { .index = IA32_MSR_STAR, .data = 0x0, }, + { .index = IA32_MSR_CSTAR, .data = 0x0, }, + { .index = IA32_MSR_LSTAR, .data = 0x0, }, + { .index = IA32_MSR_KERNEL_GS_BASE, .data = 0x0, }, + { .index = IA32_MSR_SFMASK, .data = 0x0, }, + { .index = IA32_MSR_MTRR_DEF_TYPE, .data = default_type, }, + }; + + ret = mshv_configure_msr(cpu, msrs, 9); + if (ret < 0) { + error_report("failed to setup msrs"); + return -1; + } + + return 0; +} + +/* + * TODO: populate topology info: + * + * X86CPU *x86cpu = X86_CPU(cpu); + * CPUX86State *env = &x86cpu->env; + * X86CPUTopoInfo *topo_info = &env->topo_info; + */ +int mshv_configure_vcpu(const CPUState *cpu, const struct MshvFPU *fpu, + uint64_t xcr0) +{ + int ret; + int cpu_fd = mshv_vcpufd(cpu); + + ret = set_cpuid2(cpu); + if (ret < 0) { + error_report("failed to set cpuid"); + return -1; + } + + ret = setup_msrs(cpu); + if (ret < 0) { + error_report("failed to setup msrs"); + return -1; + } + + ret = set_cpu_state(cpu, fpu, xcr0); + if (ret < 0) { + error_report("failed to set cpu state"); + return -1; + } + + ret = set_lint(cpu_fd); + if (ret < 0) { + error_report("failed to set lpic int"); + return -1; + } + + return 0; +} + +static int put_regs(const CPUState *cpu) +{ + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + MshvFPU fpu = {0}; + int ret; + + memset(&fpu, 0, sizeof(fpu)); + + ret = mshv_configure_vcpu(cpu, &fpu, env->xcr0); + if (ret < 0) { + error_report("failed to configure vcpu"); + return ret; + } + + return 0; +} + +struct MsrPair { + uint32_t index; + uint64_t value; +}; + +static int put_msrs(const CPUState *cpu) +{ + int ret = 0; + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + MshvMsrEntries *msrs = g_malloc0(sizeof(MshvMsrEntries)); + + struct MsrPair pairs[] = { + { MSR_IA32_SYSENTER_CS, env->sysenter_cs }, + { MSR_IA32_SYSENTER_ESP, env->sysenter_esp }, + { MSR_IA32_SYSENTER_EIP, env->sysenter_eip }, + { MSR_EFER, env->efer }, + { MSR_PAT, env->pat }, + { MSR_STAR, env->star }, + { MSR_CSTAR, env->cstar }, + { MSR_LSTAR, env->lstar }, + { MSR_KERNELGSBASE, env->kernelgsbase }, + { MSR_FMASK, env->fmask }, + { MSR_MTRRdefType, env->mtrr_deftype }, + { MSR_VM_HSAVE_PA, env->vm_hsave }, + { MSR_SMI_COUNT, env->msr_smi_count }, + { MSR_IA32_PKRS, env->pkrs }, + { MSR_IA32_BNDCFGS, env->msr_bndcfgs }, + { MSR_IA32_XSS, env->xss }, + { MSR_IA32_UMWAIT_CONTROL, env->umwait }, + { MSR_IA32_TSX_CTRL, env->tsx_ctrl }, + { MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr }, + { MSR_TSC_AUX, env->tsc_aux }, + { MSR_TSC_ADJUST, env->tsc_adjust }, + { MSR_IA32_SMBASE, env->smbase }, + { MSR_IA32_SPEC_CTRL, env->spec_ctrl }, + { MSR_VIRT_SSBD, env->virt_ssbd }, + }; + + if (ARRAY_SIZE(pairs) > MSHV_MSR_ENTRIES_COUNT) { + error_report("MSR entries exceed maximum size"); + g_free(msrs); + return -1; + } + + for (size_t i = 0; i < ARRAY_SIZE(pairs); i++) { + MshvMsrEntry *entry = &msrs->entries[i]; + entry->index = pairs[i].index; + entry->reserved = 0; + entry->data = pairs[i].value; + msrs->nmsrs++; + } + + ret = mshv_configure_msr(cpu, &msrs->entries[0], msrs->nmsrs); + g_free(msrs); + return ret; +} + + +int mshv_arch_put_registers(const CPUState *cpu) +{ + int ret; + + ret = put_regs(cpu); + if (ret < 0) { + error_report("Failed to put registers"); + return -1; + } + + ret = put_msrs(cpu); + if (ret < 0) { + error_report("Failed to put msrs"); + return -1; + } + + return 0; +} + +void mshv_arch_amend_proc_features( + union hv_partition_synthetic_processor_features *features) +{ + features->access_guest_idle_reg = 1; +} + +static int set_memory_info(const struct hyperv_message *msg, + struct hv_x64_memory_intercept_message *info) +{ + if (msg->header.message_type != HVMSG_GPA_INTERCEPT + && msg->header.message_type != HVMSG_UNMAPPED_GPA + && msg->header.message_type != HVMSG_UNACCEPTED_GPA) { + error_report("invalid message type"); + return -1; + } + memcpy(info, msg->payload, sizeof(*info)); + + return 0; +} + +static int emulate_instruction(CPUState *cpu, + const uint8_t *insn_bytes, size_t insn_len, + uint64_t gva, uint64_t gpa) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + struct x86_decode decode = { 0 }; + int ret; + x86_insn_stream stream = { .bytes = insn_bytes, .len = insn_len }; + + ret = mshv_load_regs(cpu); + if (ret < 0) { + error_report("failed to load registers"); + return -1; + } + + decode_instruction_stream(env, &decode, &stream); + exec_instruction(env, &decode); + + ret = mshv_store_regs(cpu); + if (ret < 0) { + error_report("failed to store registers"); + return -1; + } + + return 0; +} + +static int handle_mmio(CPUState *cpu, const struct hyperv_message *msg, + MshvVmExit *exit_reason) +{ + struct hv_x64_memory_intercept_message info = { 0 }; + size_t insn_len; + uint8_t access_type; + uint8_t *instruction_bytes; + int ret; + + ret = set_memory_info(msg, &info); + if (ret < 0) { + error_report("failed to convert message to memory info"); + return -1; + } + insn_len = info.instruction_byte_count; + access_type = info.header.intercept_access_type; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_EXECUTE) { + error_report("invalid intercept access type: execute"); + return -1; + } + + if (insn_len > 16) { + error_report("invalid mmio instruction length: %zu", insn_len); + return -1; + } + + trace_mshv_handle_mmio(info.guest_virtual_address, + info.guest_physical_address, + info.instruction_byte_count, access_type); + + instruction_bytes = info.instruction_bytes; + + ret = emulate_instruction(cpu, instruction_bytes, insn_len, + info.guest_virtual_address, + info.guest_physical_address); + if (ret < 0) { + error_report("failed to emulate mmio"); + return -1; + } + + *exit_reason = MshvVmExitIgnore; + + return 0; +} + +static int handle_unmapped_mem(int vm_fd, CPUState *cpu, + const struct hyperv_message *msg, + MshvVmExit *exit_reason) +{ + struct hv_x64_memory_intercept_message info = { 0 }; + uint64_t gpa; + int ret; + enum MshvRemapResult remap_result; + + ret = set_memory_info(msg, &info); + if (ret < 0) { + error_report("failed to convert message to memory info"); + return -1; + } + + gpa = info.guest_physical_address; + + /* attempt to remap the region, in case of overlapping userspace mappings */ + remap_result = mshv_remap_overlap_region(vm_fd, gpa); + *exit_reason = MshvVmExitIgnore; + + switch (remap_result) { + case MshvRemapNoMapping: + /* if we didn't find a mapping, it is probably mmio */ + return handle_mmio(cpu, msg, exit_reason); + case MshvRemapOk: + break; + case MshvRemapNoOverlap: + /* This should not happen, but we are forgiving it */ + warn_report("found no overlap for unmapped region"); + *exit_reason = MshvVmExitSpecial; + break; + } + + return 0; +} + +static int set_ioport_info(const struct hyperv_message *msg, + hv_x64_io_port_intercept_message *info) +{ + if (msg->header.message_type != HVMSG_X64_IO_PORT_INTERCEPT) { + error_report("Invalid message type"); + return -1; + } + memcpy(info, msg->payload, sizeof(*info)); + + return 0; +} + +static int set_x64_registers(const CPUState *cpu, const uint32_t *names, + const uint64_t *values) +{ + + hv_register_assoc assocs[2]; + int ret; + + for (size_t i = 0; i < ARRAY_SIZE(assocs); i++) { + assocs[i].name = names[i]; + assocs[i].value.reg64 = values[i]; + } + + ret = mshv_set_generic_regs(cpu, assocs, ARRAY_SIZE(assocs)); + if (ret < 0) { + error_report("failed to set x64 registers"); + return -1; + } + + return 0; +} + +static inline MemTxAttrs get_mem_attrs(bool is_secure_mode) +{ + MemTxAttrs memattr = {0}; + memattr.secure = is_secure_mode; + return memattr; +} + +static void pio_read(uint64_t port, uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret = 0; + MemTxAttrs memattr = get_mem_attrs(is_secure_mode); + ret = address_space_rw(&address_space_io, port, memattr, (void *)data, size, + false); + if (ret != MEMTX_OK) { + error_report("Failed to read from port %lx: %d", port, ret); + abort(); + } +} + +static int pio_write(uint64_t port, const uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret = 0; + MemTxAttrs memattr = get_mem_attrs(is_secure_mode); + ret = address_space_rw(&address_space_io, port, memattr, (void *)data, size, + true); + return ret; +} + +static int handle_pio_non_str(const CPUState *cpu, + hv_x64_io_port_intercept_message *info) +{ + size_t len = info->access_info.access_size; + uint8_t access_type = info->header.intercept_access_type; + int ret; + uint32_t val, eax; + const uint32_t eax_mask = 0xffffffffu >> (32 - len * 8); + size_t insn_len; + uint64_t rip, rax; + uint32_t reg_names[2]; + uint64_t reg_values[2]; + uint16_t port = info->port_number; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_WRITE) { + union { + uint32_t u32; + uint8_t bytes[4]; + } conv; + + /* convert the first 4 bytes of rax to bytes */ + conv.u32 = (uint32_t)info->rax; + /* secure mode is set to false */ + ret = pio_write(port, conv.bytes, len, false); + if (ret < 0) { + error_report("Failed to write to io port"); + return -1; + } + } else { + uint8_t data[4] = { 0 }; + /* secure mode is set to false */ + pio_read(info->port_number, data, len, false); + + /* Preserve high bits in EAX, but clear out high bits in RAX */ + val = *(uint32_t *)data; + eax = (((uint32_t)info->rax) & ~eax_mask) | (val & eax_mask); + info->rax = (uint64_t)eax; + } + + insn_len = info->header.instruction_length; + + /* Advance RIP and update RAX */ + rip = info->header.rip + insn_len; + rax = info->rax; + + reg_names[0] = HV_X64_REGISTER_RIP; + reg_values[0] = rip; + reg_names[1] = HV_X64_REGISTER_RAX; + reg_values[1] = rax; + + ret = set_x64_registers(cpu, reg_names, reg_values); + if (ret < 0) { + error_report("Failed to set x64 registers"); + return -1; + } + + cpu->accel->dirty = false; + + return 0; +} + +static int fetch_guest_state(CPUState *cpu) +{ + int ret; + + ret = mshv_get_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to get standard registers"); + return -1; + } + + ret = mshv_get_special_regs(cpu); + if (ret < 0) { + error_report("Failed to get special registers"); + return -1; + } + + return 0; +} + +static int read_memory(const CPUState *cpu, uint64_t initial_gva, + uint64_t initial_gpa, uint64_t gva, uint8_t *data, + size_t len) +{ + int ret; + uint64_t gpa, flags; + + if (gva == initial_gva) { + gpa = initial_gpa; + } else { + flags = HV_TRANSLATE_GVA_VALIDATE_READ; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + return -1; + } + + ret = mshv_guest_mem_read(gpa, data, len, false, false); + if (ret < 0) { + error_report("failed to read guest mem"); + return -1; + } + } + + return 0; +} + +static int write_memory(const CPUState *cpu, uint64_t initial_gva, + uint64_t initial_gpa, uint64_t gva, const uint8_t *data, + size_t len) +{ + int ret; + uint64_t gpa, flags; + + if (gva == initial_gva) { + gpa = initial_gpa; + } else { + flags = HV_TRANSLATE_GVA_VALIDATE_WRITE; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + error_report("failed to translate gva to gpa"); + return -1; + } + } + ret = mshv_guest_mem_write(gpa, data, len, false); + if (ret != MEMTX_OK) { + error_report("failed to write to mmio"); + return -1; + } + + return 0; +} + +static int handle_pio_str_write(CPUState *cpu, + hv_x64_io_port_intercept_message *info, + size_t repeat, uint16_t port, + bool direction_flag) +{ + int ret; + uint64_t src; + uint8_t data[4] = { 0 }; + size_t len = info->access_info.access_size; + + src = linear_addr(cpu, info->rsi, R_DS); + + for (size_t i = 0; i < repeat; i++) { + ret = read_memory(cpu, 0, 0, src, data, len); + if (ret < 0) { + error_report("Failed to read memory"); + return -1; + } + ret = pio_write(port, data, len, false); + if (ret < 0) { + error_report("Failed to write to io port"); + return -1; + } + src += direction_flag ? -len : len; + info->rsi += direction_flag ? -len : len; + } + + return 0; +} + +static int handle_pio_str_read(CPUState *cpu, + hv_x64_io_port_intercept_message *info, + size_t repeat, uint16_t port, + bool direction_flag) +{ + int ret; + uint64_t dst; + size_t len = info->access_info.access_size; + uint8_t data[4] = { 0 }; + + dst = linear_addr(cpu, info->rdi, R_ES); + + for (size_t i = 0; i < repeat; i++) { + pio_read(port, data, len, false); + + ret = write_memory(cpu, 0, 0, dst, data, len); + if (ret < 0) { + error_report("Failed to write memory"); + return -1; + } + dst += direction_flag ? -len : len; + info->rdi += direction_flag ? -len : len; + } + + return 0; +} + +static int handle_pio_str(CPUState *cpu, hv_x64_io_port_intercept_message *info) +{ + uint8_t access_type = info->header.intercept_access_type; + uint16_t port = info->port_number; + bool repop = info->access_info.rep_prefix == 1; + size_t repeat = repop ? info->rcx : 1; + size_t insn_len = info->header.instruction_length; + bool direction_flag; + uint32_t reg_names[3]; + uint64_t reg_values[3]; + int ret; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + + ret = fetch_guest_state(cpu); + if (ret < 0) { + error_report("Failed to fetch guest state"); + return -1; + } + + direction_flag = (env->eflags & DESC_E_MASK) != 0; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_WRITE) { + ret = handle_pio_str_write(cpu, info, repeat, port, direction_flag); + if (ret < 0) { + error_report("Failed to handle pio str write"); + return -1; + } + reg_names[0] = HV_X64_REGISTER_RSI; + reg_values[0] = info->rsi; + } else { + ret = handle_pio_str_read(cpu, info, repeat, port, direction_flag); + reg_names[0] = HV_X64_REGISTER_RDI; + reg_values[0] = info->rdi; + } + + reg_names[1] = HV_X64_REGISTER_RIP; + reg_values[1] = info->header.rip + insn_len; + reg_names[2] = HV_X64_REGISTER_RAX; + reg_values[2] = info->rax; + + ret = set_x64_registers(cpu, reg_names, reg_values); + if (ret < 0) { + error_report("Failed to set x64 registers"); + return -1; + } + + cpu->accel->dirty = false; + + return 0; +} + +static int handle_pio(CPUState *cpu, const struct hyperv_message *msg) +{ + struct hv_x64_io_port_intercept_message info = { 0 }; + int ret; + + ret = set_ioport_info(msg, &info); + if (ret < 0) { + error_report("Failed to convert message to ioport info"); + return -1; + } + + if (info.access_info.string_op) { + return handle_pio_str(cpu, &info); + } + + return handle_pio_non_str(cpu, &info); +} + +int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit) +{ + int ret; + enum MshvVmExit exit_reason; + int cpu_fd = mshv_vcpufd(cpu); + + ret = ioctl(cpu_fd, MSHV_RUN_VP, msg); + if (ret < 0) { + return MshvVmExitShutdown; + } + + switch (msg->header.message_type) { + case HVMSG_UNRECOVERABLE_EXCEPTION: + return MshvVmExitShutdown; + case HVMSG_UNMAPPED_GPA: + ret = handle_unmapped_mem(vm_fd, cpu, msg, &exit_reason); + if (ret < 0) { + error_report("failed to handle unmapped memory"); + return -1; + } + return exit_reason; + case HVMSG_GPA_INTERCEPT: + ret = handle_mmio(cpu, msg, &exit_reason); + if (ret < 0) { + error_report("failed to handle mmio"); + return -1; + } + return exit_reason; + case HVMSG_X64_IO_PORT_INTERCEPT: + ret = handle_pio(cpu, msg); + if (ret < 0) { + return MshvVmExitSpecial; + } + return MshvVmExitIgnore; + default: + break; + } + + *exit = MshvVmExitIgnore; + return 0; +} + +void mshv_remove_vcpu(int vm_fd, int cpu_fd) +{ + close(cpu_fd); +} + + +int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd) +{ + int ret; + struct mshv_create_vp vp_arg = { + .vp_index = vp_index, + }; + ret = ioctl(vm_fd, MSHV_CREATE_VP, &vp_arg); + if (ret < 0) { + error_report("failed to create mshv vcpu: %s", strerror(errno)); + return -1; + } + + *cpu_fd = ret; + + return 0; +} + +static int guest_mem_read_with_gva(const CPUState *cpu, uint64_t gva, + uint8_t *data, uintptr_t size, + bool fetch_instruction) +{ + int ret; + uint64_t gpa, flags; + + flags = HV_TRANSLATE_GVA_VALIDATE_READ; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + error_report("failed to translate gva to gpa"); + return -1; + } + + ret = mshv_guest_mem_read(gpa, data, size, false, fetch_instruction); + if (ret < 0) { + error_report("failed to read from guest memory"); + return -1; + } + + return 0; +} + +static int guest_mem_write_with_gva(const CPUState *cpu, uint64_t gva, + const uint8_t *data, uintptr_t size) +{ + int ret; + uint64_t gpa, flags; + + flags = HV_TRANSLATE_GVA_VALIDATE_WRITE; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + error_report("failed to translate gva to gpa"); + return -1; + } + ret = mshv_guest_mem_write(gpa, data, size, false); + if (ret < 0) { + error_report("failed to write to guest memory"); + return -1; + } + return 0; +} + +static void write_mem(CPUState *cpu, void *data, target_ulong addr, int bytes) +{ + if (guest_mem_write_with_gva(cpu, addr, data, bytes) < 0) { + error_report("failed to write memory"); + abort(); + } +} + +static void fetch_instruction(CPUState *cpu, void *data, + target_ulong addr, int bytes) +{ + if (guest_mem_read_with_gva(cpu, addr, data, bytes, true) < 0) { + error_report("failed to fetch instruction"); + abort(); + } +} + +static void read_mem(CPUState *cpu, void *data, target_ulong addr, int bytes) +{ + if (guest_mem_read_with_gva(cpu, addr, data, bytes, false) < 0) { + error_report("failed to read memory"); + abort(); + } +} + +static void read_segment_descriptor(CPUState *cpu, + struct x86_segment_descriptor *desc, + enum X86Seg seg_idx) +{ + bool ret; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + SegmentCache *seg = &env->segs[seg_idx]; + x86_segment_selector sel = { .sel = seg->selector & 0xFFFF }; + + ret = x86_read_segment_descriptor(cpu, desc, sel); + if (ret == false) { + error_report("failed to read segment descriptor"); + abort(); + } +} + +static const struct x86_emul_ops mshv_x86_emul_ops = { + .fetch_instruction = fetch_instruction, + .read_mem = read_mem, + .write_mem = write_mem, + .read_segment_descriptor = read_segment_descriptor, +}; + +void mshv_init_mmio_emu(void) +{ + init_decoder(); + init_emu(&mshv_x86_emul_ops); +} + +void mshv_arch_init_vcpu(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + AccelCPUState *state = cpu->accel; + size_t page = HV_HYP_PAGE_SIZE; + void *mem = qemu_memalign(page, 2 * page); + + /* sanity check, to make sure we don't overflow the page */ + QEMU_BUILD_BUG_ON((MAX_REGISTER_COUNT + * sizeof(hv_register_assoc) + + sizeof(hv_input_get_vp_registers) + > HV_HYP_PAGE_SIZE)); + + state->hvcall_args.base = mem; + state->hvcall_args.input_page = mem; + state->hvcall_args.output_page = (uint8_t *)mem + page; + + env->emu_mmio_buf = g_new(char, 4096); +} + +void mshv_arch_destroy_vcpu(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + AccelCPUState *state = cpu->accel; + + g_free(state->hvcall_args.base); + state->hvcall_args = (MshvHvCallArgs){0}; + g_clear_pointer(&env->emu_mmio_buf, g_free); +} + +/* + * Default Microsoft Hypervisor behavior for unimplemented MSR is to send a + * fault to the guest if it tries to access it. It is possible to override + * this behavior with a more suitable option i.e., ignore writes from the guest + * and return zero in attempt to read unimplemented. + */ +static int set_unimplemented_msr_action(int vm_fd) +{ + struct hv_input_set_partition_property in = {0}; + struct mshv_root_hvcall args = {0}; + + in.property_code = HV_PARTITION_PROPERTY_UNIMPLEMENTED_MSR_ACTION; + in.property_value = HV_UNIMPLEMENTED_MSR_ACTION_IGNORE_WRITE_READ_ZERO; + + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)∈ + + trace_mshv_hvcall_args("unimplemented_msr_action", args.code, args.in_sz); + + int ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set unimplemented MSR action"); + return -1; + } + return 0; +} + +int mshv_arch_post_init_vm(int vm_fd) +{ + int ret; + + ret = set_unimplemented_msr_action(vm_fd); + if (ret < 0) { + error_report("Failed to set unimplemented MSR action"); + } + + return ret; +} |