Diffstat (limited to 'accel/mshv')
-rw-r--r-- | accel/mshv/irq.c | 399
-rw-r--r-- | accel/mshv/mem.c | 563
-rw-r--r-- | accel/mshv/meson.build | 9
-rw-r--r-- | accel/mshv/mshv-all.c | 727
-rw-r--r-- | accel/mshv/msr.c | 375
-rw-r--r-- | accel/mshv/trace-events | 33
-rw-r--r-- | accel/mshv/trace.h | 14
7 files changed, 2120 insertions, 0 deletions
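For orientation before the full diff: the partition bring-up added in mshv-all.c reduces to a short ioctl sequence against /dev/mshv. A minimal standalone sketch, not part of the patch, assuming only the linux/mshv.h UAPI definitions the diff itself uses; error handling and the partition-property hypercalls are elided:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mshv.h>

int main(void)
{
    /* root-partition control device, opened by init_mshv() in the diff */
    int mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC);

    /* create_partition(): request LAPIC/X2APIC emulation and large pages */
    struct mshv_create_partition args = {
        .pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) |
                    (1ULL << MSHV_PT_BIT_X2APIC) |
                    (1ULL << MSHV_PT_BIT_GPA_SUPER_PAGES),
        .pt_isolation = MSHV_PT_ISOLATION_NONE,
    };
    int vm_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);

    /* set_synthetic_proc_features() would issue MSHV_ROOT_HVCALL here */

    /* initialize_vm(): finalize the partition before vCPUs are created */
    ioctl(vm_fd, MSHV_INITIALIZE_PARTITION);

    close(vm_fd);
    close(mshv_fd);
    return 0;
}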
diff --git a/accel/mshv/irq.c b/accel/mshv/irq.c new file mode 100644 index 0000000..adf8f33 --- /dev/null +++ b/accel/mshv/irq.c @@ -0,0 +1,399 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Stanislav Kinsburskii <skinsburskii@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "linux/mshv.h" +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "hw/hyperv/hvhdk_mini.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "hw/intc/ioapic.h" +#include "hw/pci/msi.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "trace.h" +#include <stdint.h> +#include <sys/ioctl.h> + +#define MSHV_IRQFD_RESAMPLE_FLAG (1 << MSHV_IRQFD_BIT_RESAMPLE) +#define MSHV_IRQFD_BIT_DEASSIGN_FLAG (1 << MSHV_IRQFD_BIT_DEASSIGN) + +static MshvMsiControl *msi_control; +static QemuMutex msi_control_mutex; + +void mshv_init_msicontrol(void) +{ + qemu_mutex_init(&msi_control_mutex); + msi_control = g_new0(MshvMsiControl, 1); + msi_control->gsi_routes = g_hash_table_new(g_direct_hash, g_direct_equal); + msi_control->updated = false; +} + +static int set_msi_routing(uint32_t gsi, uint64_t addr, uint32_t data) +{ + struct mshv_user_irq_entry *entry; + uint32_t high_addr = addr >> 32; + uint32_t low_addr = addr & 0xFFFFFFFF; + GHashTable *gsi_routes; + + trace_mshv_set_msi_routing(gsi, addr, data); + + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("gsi >= MSHV_MAX_MSI_ROUTES"); + return -1; + } + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + gsi_routes = msi_control->gsi_routes; + entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + + if (entry + && entry->address_hi == high_addr + && entry->address_lo == low_addr + && entry->data == data) + { + /* nothing to update */ + return 0; + } + + /* free old entry */ + g_free(entry); + + /* create new entry */ + entry = g_new0(struct mshv_user_irq_entry, 1); + entry->gsi = gsi; + entry->address_hi = high_addr; + entry->address_lo = low_addr; + entry->data = data; + + g_hash_table_insert(gsi_routes, GINT_TO_POINTER(gsi), entry); + msi_control->updated = true; + } + + return 0; +} + +static int add_msi_routing(uint64_t addr, uint32_t data) +{ + struct mshv_user_irq_entry *route_entry; + uint32_t high_addr = addr >> 32; + uint32_t low_addr = addr & 0xFFFFFFFF; + int gsi; + GHashTable *gsi_routes; + + trace_mshv_add_msi_routing(addr, data); + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + /* find an empty slot */ + gsi = 0; + gsi_routes = msi_control->gsi_routes; + while (gsi < MSHV_MAX_MSI_ROUTES) { + route_entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + if (!route_entry) { + break; + } + gsi++; + } + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("No empty gsi slot available"); + return -1; + } + + /* create new entry */ + route_entry = g_new0(struct mshv_user_irq_entry, 1); + route_entry->gsi = gsi; + route_entry->address_hi = high_addr; + route_entry->address_lo = low_addr; + route_entry->data = data; + + g_hash_table_insert(gsi_routes, GINT_TO_POINTER(gsi), route_entry); + msi_control->updated = true; + } + + return gsi; +} + +static int commit_msi_routing_table(int vm_fd) +{ + guint len; + int i, ret; + size_t table_size; + struct mshv_user_irq_table *table; + GHashTableIter iter; + gpointer key, value; + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + if (!msi_control->updated) { + /* nothing to 
update */ + return 0; + } + + /* Calculate the size of the table */ + len = g_hash_table_size(msi_control->gsi_routes); + table_size = sizeof(struct mshv_user_irq_table) + + len * sizeof(struct mshv_user_irq_entry); + table = g_malloc0(table_size); + + g_hash_table_iter_init(&iter, msi_control->gsi_routes); + i = 0; + while (g_hash_table_iter_next(&iter, &key, &value)) { + struct mshv_user_irq_entry *entry = value; + table->entries[i] = *entry; + i++; + } + table->nr = i; + + trace_mshv_commit_msi_routing_table(vm_fd, len); + + ret = ioctl(vm_fd, MSHV_SET_MSI_ROUTING, table); + g_free(table); + if (ret < 0) { + error_report("Failed to commit msi routing table"); + return -1; + } + msi_control->updated = false; + } + return 0; +} + +static int remove_msi_routing(uint32_t gsi) +{ + struct mshv_user_irq_entry *route_entry; + GHashTable *gsi_routes; + + trace_mshv_remove_msi_routing(gsi); + + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("Invalid GSI: %u", gsi); + return -1; + } + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + gsi_routes = msi_control->gsi_routes; + route_entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + if (route_entry) { + g_hash_table_remove(gsi_routes, GINT_TO_POINTER(gsi)); + g_free(route_entry); + msi_control->updated = true; + } + } + + return 0; +} + +/* Pass an eventfd which is to be used for injecting interrupts from userland */ +static int irqfd(int vm_fd, int fd, int resample_fd, uint32_t gsi, + uint32_t flags) +{ + int ret; + struct mshv_user_irqfd arg = { + .fd = fd, + .resamplefd = resample_fd, + .gsi = gsi, + .flags = flags, + }; + + ret = ioctl(vm_fd, MSHV_IRQFD, &arg); + if (ret < 0) { + error_report("Failed to set irqfd: gsi=%u, fd=%d", gsi, fd); + return -1; + } + return ret; +} + +static int register_irqfd(int vm_fd, int event_fd, uint32_t gsi) +{ + int ret; + + trace_mshv_register_irqfd(vm_fd, event_fd, gsi); + + ret = irqfd(vm_fd, event_fd, 0, gsi, 0); + if (ret < 0) { + error_report("Failed to register irqfd: gsi=%u", gsi); + return -1; + } + return 0; +} + +static int register_irqfd_with_resample(int vm_fd, int event_fd, + int resample_fd, uint32_t gsi) +{ + int ret; + uint32_t flags = MSHV_IRQFD_RESAMPLE_FLAG; + + ret = irqfd(vm_fd, event_fd, resample_fd, gsi, flags); + if (ret < 0) { + error_report("Failed to register irqfd with resample: gsi=%u", gsi); + return -errno; + } + return 0; +} + +static int unregister_irqfd(int vm_fd, int event_fd, uint32_t gsi) +{ + int ret; + uint32_t flags = MSHV_IRQFD_BIT_DEASSIGN_FLAG; + + ret = irqfd(vm_fd, event_fd, 0, gsi, flags); + if (ret < 0) { + error_report("Failed to unregister irqfd: gsi=%u", gsi); + return -errno; + } + return 0; +} + +static int irqchip_update_irqfd_notifier_gsi(const EventNotifier *event, + const EventNotifier *resample, + int virq, bool add) +{ + int fd = event_notifier_get_fd(event); + int rfd = resample ? 
event_notifier_get_fd(resample) : -1; + int vm_fd = mshv_state->vm; + + trace_mshv_irqchip_update_irqfd_notifier_gsi(fd, rfd, virq, add); + + if (!add) { + return unregister_irqfd(vm_fd, fd, virq); + } + + if (rfd > 0) { + return register_irqfd_with_resample(vm_fd, fd, rfd, virq); + } + + return register_irqfd(vm_fd, fd, virq); +} + + +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev) +{ + MSIMessage msg = { 0, 0 }; + int virq = 0; + + if (pci_available && dev) { + msg = pci_get_msi_message(dev, vector); + virq = add_msi_routing(msg.address, le32_to_cpu(msg.data)); + } + + return virq; +} + +void mshv_irqchip_release_virq(int virq) +{ + remove_msi_routing(virq); +} + +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev) +{ + int ret; + + ret = set_msi_routing(virq, msg.address, le32_to_cpu(msg.data)); + if (ret < 0) { + error_report("Failed to set msi routing"); + return -1; + } + + return 0; +} + +int mshv_request_interrupt(MshvState *mshv_state, uint32_t interrupt_type, uint32_t vector, + uint32_t vp_index, bool logical_dest_mode, + bool level_triggered) +{ + int ret; + int vm_fd = mshv_state->vm; + + if (vector == 0) { + warn_report("Ignoring request for interrupt vector 0"); + return 0; + } + + union hv_interrupt_control control = { + .interrupt_type = interrupt_type, + .level_triggered = level_triggered, + .logical_dest_mode = logical_dest_mode, + .rsvd = 0, + }; + + struct hv_input_assert_virtual_interrupt arg = {0}; + arg.control = control; + arg.dest_addr = (uint64_t)vp_index; + arg.vector = vector; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_ASSERT_VIRTUAL_INTERRUPT; + args.in_sz = sizeof(arg); + args.in_ptr = (uint64_t)&arg; + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to request interrupt"); + return -errno; + } + return 0; +} + +void mshv_irqchip_commit_routes(void) +{ + int ret; + int vm_fd = mshv_state->vm; + + ret = commit_msi_routing_table(vm_fd); + if (ret < 0) { + error_report("Failed to commit msi routing table"); + abort(); + } +} + +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *event, + const EventNotifier *resample, + int virq) +{ + return irqchip_update_irqfd_notifier_gsi(event, resample, virq, true); +} + +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *event, + int virq) +{ + return irqchip_update_irqfd_notifier_gsi(event, NULL, virq, false); +} + +int mshv_reserve_ioapic_msi_routes(int vm_fd) +{ + int ret, gsi; + + /* + * Reserve GSI 0-23 for IOAPIC pins, to avoid conflicts of legacy + * peripherals with MSI-X devices + */ + for (gsi = 0; gsi < IOAPIC_NUM_PINS; gsi++) { + ret = add_msi_routing(0, 0); + if (ret < 0) { + error_report("Failed to reserve GSI %d", gsi); + return -1; + } + } + + ret = commit_msi_routing_table(vm_fd); + if (ret < 0) { + error_report("Failed to commit reserved IOAPIC MSI routes"); + return -1; + } + + return 0; +} diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c new file mode 100644 index 0000000..0e2164a --- /dev/null +++ b/accel/mshv/mem.c @@ -0,0 +1,563 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: + * Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "qemu/osdep.h" +#include "qemu/lockable.h" +#include "qemu/error-report.h" +#include "qemu/rcu.h" +#include "linux/mshv.h" +#include "system/address-spaces.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "exec/memattrs.h" +#include <sys/ioctl.h> +#include "trace.h" + +typedef struct SlotsRCUReclaim { + struct rcu_head rcu; + GList *old_head; + MshvMemorySlot *removed_slot; +} SlotsRCUReclaim; + +static void rcu_reclaim_slotlist(struct rcu_head *rcu) +{ + SlotsRCUReclaim *r = container_of(rcu, SlotsRCUReclaim, rcu); + g_list_free(r->old_head); + g_free(r->removed_slot); + g_free(r); +} + +static void publish_slots(GList *new_head, GList *old_head, + MshvMemorySlot *removed_slot) +{ + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + qatomic_store_release(&manager->slots, new_head); + + SlotsRCUReclaim *r = g_new(SlotsRCUReclaim, 1); + r->old_head = old_head; + r->removed_slot = removed_slot; + + call_rcu1(&r->rcu, rcu_reclaim_slotlist); +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static int remove_slot(MshvMemorySlot *slot) +{ + GList *old_head, *new_head; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + old_head = qatomic_load_acquire(&manager->slots); + + if (!g_list_find(old_head, slot)) { + error_report("slot requested for removal not found"); + return -1; + } + + new_head = g_list_copy(old_head); + new_head = g_list_remove(new_head, slot); + manager->n_slots--; + + publish_slots(new_head, old_head, slot); + + return 0; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static MshvMemorySlot *append_slot(uint64_t gpa, uint64_t userspace_addr, + uint64_t size, bool readonly) +{ + GList *old_head, *new_head; + MshvMemorySlot *slot; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + old_head = qatomic_load_acquire(&manager->slots); + + if (manager->n_slots >= MSHV_MAX_MEM_SLOTS) { + error_report("no free memory slots available"); + return NULL; + } + + slot = g_new0(MshvMemorySlot, 1); + slot->guest_phys_addr = gpa; + slot->userspace_addr = userspace_addr; + slot->memory_size = size; + slot->readonly = readonly; + + new_head = g_list_copy(old_head); + new_head = g_list_append(new_head, slot); + manager->n_slots++; + + publish_slots(new_head, old_head, NULL); + + return slot; +} + +static int slot_overlaps(const MshvMemorySlot *slot1, + const MshvMemorySlot *slot2) +{ + uint64_t start_1 = slot1->userspace_addr, + start_2 = slot2->userspace_addr; + size_t len_1 = slot1->memory_size, + len_2 = slot2->memory_size; + + if (slot1 == slot2) { + return -1; + } + + return ranges_overlap(start_1, len_1, start_2, len_2) ? 0 : -1; +} + +static bool is_mapped(MshvMemorySlot *slot) +{ + /* Subsequent reads of mapped field see a fully-initialized slot */ + return qatomic_load_acquire(&slot->mapped); +} + +/* + * Find slot that is: + * - overlapping in userspace + * - currently mapped in the guest + * + * Needs to be called with mshv_state->msm.mutex or RCU read lock held. 
+ */ +static MshvMemorySlot *find_overlap_mem_slot(GList *head, MshvMemorySlot *slot) +{ + GList *found; + MshvMemorySlot *overlap_slot; + + found = g_list_find_custom(head, slot, (GCompareFunc) slot_overlaps); + + if (!found) { + return NULL; + } + + overlap_slot = found->data; + if (!overlap_slot || !is_mapped(overlap_slot)) { + return NULL; + } + + return overlap_slot; +} + +static int set_guest_memory(int vm_fd, + const struct mshv_user_mem_region *region) +{ + int ret; + + ret = ioctl(vm_fd, MSHV_SET_GUEST_MEMORY, region); + if (ret < 0) { + error_report("failed to set guest memory: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int map_or_unmap(int vm_fd, const MshvMemorySlot *slot, bool map) +{ + struct mshv_user_mem_region region = {0}; + + region.guest_pfn = slot->guest_phys_addr >> MSHV_PAGE_SHIFT; + region.size = slot->memory_size; + region.userspace_addr = slot->userspace_addr; + + if (!map) { + region.flags |= (1 << MSHV_SET_MEM_BIT_UNMAP); + trace_mshv_unmap_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return set_guest_memory(vm_fd, &region); + } + + region.flags = BIT(MSHV_SET_MEM_BIT_EXECUTABLE); + if (!slot->readonly) { + region.flags |= BIT(MSHV_SET_MEM_BIT_WRITABLE); + } + + trace_mshv_map_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return set_guest_memory(vm_fd, &region); +} + +static int slot_matches_region(const MshvMemorySlot *slot1, + const MshvMemorySlot *slot2) +{ + return (slot1->guest_phys_addr == slot2->guest_phys_addr && + slot1->userspace_addr == slot2->userspace_addr && + slot1->memory_size == slot2->memory_size) ? 0 : -1; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static MshvMemorySlot *find_mem_slot_by_region(uint64_t gpa, uint64_t size, + uint64_t userspace_addr) +{ + MshvMemorySlot ref_slot = { + .guest_phys_addr = gpa, + .userspace_addr = userspace_addr, + .memory_size = size, + }; + GList *found; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + found = g_list_find_custom(manager->slots, &ref_slot, + (GCompareFunc) slot_matches_region); + + return found ? found->data : NULL; +} + +static int slot_covers_gpa(const MshvMemorySlot *slot, uint64_t *gpa_p) +{ + uint64_t gpa_offset, gpa = *gpa_p; + + gpa_offset = gpa - slot->guest_phys_addr; + return (slot->guest_phys_addr <= gpa && gpa_offset < slot->memory_size) + ? 
0 : -1; +} + +/* Needs to be called with mshv_state->msm.mutex or RCU read lock held */ +static MshvMemorySlot *find_mem_slot_by_gpa(GList *head, uint64_t gpa) +{ + GList *found; + MshvMemorySlot *slot; + + trace_mshv_find_slot_by_gpa(gpa); + + found = g_list_find_custom(head, &gpa, (GCompareFunc) slot_covers_gpa); + if (found) { + slot = found->data; + trace_mshv_found_slot(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return slot; + } + + return NULL; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static void set_mapped(MshvMemorySlot *slot, bool mapped) +{ + /* prior writes to the mapped field become visible before readers see the slot */ + qatomic_store_release(&slot->mapped, mapped); +} + +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa) +{ + MshvMemorySlot *gpa_slot, *overlap_slot; + GList *head; + int ret; + MshvMemorySlotManager *manager = &mshv_state->msm; + + /* fast path, called often by unmapped_gpa vm exit */ + WITH_RCU_READ_LOCK_GUARD() { + assert(manager); + head = qatomic_load_acquire(&manager->slots); + /* return early if no slot is found */ + gpa_slot = find_mem_slot_by_gpa(head, gpa); + if (gpa_slot == NULL) { + return MshvRemapNoMapping; + } + + /* return early if no overlapping slot is found */ + overlap_slot = find_overlap_mem_slot(head, gpa_slot); + if (overlap_slot == NULL) { + return MshvRemapNoOverlap; + } + } + + /* + * We'll modify the mapping list, so we need to upgrade to mutex and + * recheck. + */ + assert(manager); + QEMU_LOCK_GUARD(&manager->mutex); + + /* return early if no slot is found */ + gpa_slot = find_mem_slot_by_gpa(manager->slots, gpa); + if (gpa_slot == NULL) { + return MshvRemapNoMapping; + } + + /* return early if no overlapping slot is found */ + overlap_slot = find_overlap_mem_slot(manager->slots, gpa_slot); + if (overlap_slot == NULL) { + return MshvRemapNoOverlap; + } + + /* unmap overlapping slot */ + ret = map_or_unmap(vm_fd, overlap_slot, false); + if (ret < 0) { + error_report("failed to unmap overlap region"); + abort(); + } + set_mapped(overlap_slot, false); + warn_report("mapped out userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + overlap_slot->userspace_addr, + overlap_slot->guest_phys_addr, + overlap_slot->memory_size); + + /* map region for gpa */ + ret = map_or_unmap(vm_fd, gpa_slot, true); + if (ret < 0) { + error_report("failed to map new region"); + abort(); + } + set_mapped(gpa_slot, true); + warn_report("mapped in userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + gpa_slot->userspace_addr, gpa_slot->guest_phys_addr, + gpa_slot->memory_size); + + return MshvRemapOk; +} + +static int handle_unmapped_mmio_region_read(uint64_t gpa, uint64_t size, + uint8_t *data) +{ + warn_report("read from unmapped mmio region gpa=0x%lx size=%lu", gpa, size); + + if (size == 0 || size > 8) { + error_report("invalid size %lu for reading from unmapped mmio region", + size); + return -1; + } + + memset(data, 0xFF, size); + + return 0; +} + +int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, + bool is_secure_mode, bool instruction_fetch) +{ + int ret; + MemTxAttrs memattr = { .secure = is_secure_mode }; + + if (instruction_fetch) { + trace_mshv_insn_fetch(gpa, size); + } else { + trace_mshv_mem_read(gpa, size); + } + + ret = address_space_rw(&address_space_memory, gpa, memattr, (void *)data, + size, false); + if (ret == MEMTX_OK) { + return 0; + } + + if (ret == MEMTX_DECODE_ERROR) { + return handle_unmapped_mmio_region_read(gpa, size, data); + } + + error_report("failed to 
read guest memory at 0x%lx", gpa); + return -1; +} + +int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret; + MemTxAttrs memattr = { .secure = is_secure_mode }; + + trace_mshv_mem_write(gpa, size); + ret = address_space_rw(&address_space_memory, gpa, memattr, (void *)data, + size, true); + if (ret == MEMTX_OK) { + return 0; + } + + if (ret == MEMTX_DECODE_ERROR) { + warn_report("write to unmapped mmio region gpa=0x%lx size=%lu", gpa, + size); + return 0; + } + + error_report("Failed to write guest memory"); + return -1; +} + +static int tracked_unmap(int vm_fd, uint64_t gpa, uint64_t size, + uint64_t userspace_addr) +{ + int ret; + MshvMemorySlot *slot; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + QEMU_LOCK_GUARD(&manager->mutex); + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (!slot) { + trace_mshv_skip_unset_mem(userspace_addr, gpa, size); + /* no work to do */ + return 0; + } + + if (!is_mapped(slot)) { + /* remove slot, no need to unmap */ + return remove_slot(slot); + } + + ret = map_or_unmap(vm_fd, slot, false); + if (ret < 0) { + error_report("failed to unmap memory region"); + return ret; + } + return remove_slot(slot); +} + +static int tracked_map(int vm_fd, uint64_t gpa, uint64_t size, bool readonly, + uint64_t userspace_addr) +{ + MshvMemorySlot *slot, *overlap_slot; + int ret; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + QEMU_LOCK_GUARD(&manager->mutex); + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (slot) { + error_report("memory region already mapped at gpa=0x%lx, " + "userspace_addr=0x%lx, size=0x%lx", + slot->guest_phys_addr, slot->userspace_addr, + slot->memory_size); + return -1; + } + + slot = append_slot(gpa, userspace_addr, size, readonly); + + overlap_slot = find_overlap_mem_slot(manager->slots, slot); + if (overlap_slot) { + trace_mshv_remap_attempt(slot->userspace_addr, + slot->guest_phys_addr, + slot->memory_size); + warn_report("attempt to map region [0x%lx-0x%lx], while " + "[0x%lx-0x%lx] is already mapped in the guest", + userspace_addr, userspace_addr + size - 1, + overlap_slot->userspace_addr, + overlap_slot->userspace_addr + + overlap_slot->memory_size - 1); + + /* do not register mem slot in hv, but record for later swap-in */ + set_mapped(slot, false); + + return 0; + } + + ret = map_or_unmap(vm_fd, slot, true); + if (ret < 0) { + error_report("failed to map memory region"); + return -1; + } + set_mapped(slot, true); + + return 0; +} + +static int set_memory(uint64_t gpa, uint64_t size, bool readonly, + uint64_t userspace_addr, bool add) +{ + int vm_fd = mshv_state->vm; + + if (add) { + return tracked_map(vm_fd, gpa, size, readonly, userspace_addr); + } + + return tracked_unmap(vm_fd, gpa, size, userspace_addr); +} + +/* + * Calculate and align the start address and the size of the section. + * Return the size. If the size is 0, the aligned section is empty. + */ +static hwaddr align_section(MemoryRegionSection *section, hwaddr *start) +{ + hwaddr size = int128_get64(section->size); + hwaddr delta, aligned; + + /* + * works in page size chunks, but the function may be called + * with sub-page size and unaligned start address. Pad the start + * address to next and truncate size to previous page boundary. 
+ */ + aligned = ROUND_UP(section->offset_within_address_space, + qemu_real_host_page_size()); + delta = aligned - section->offset_within_address_space; + *start = aligned; + if (delta > size) { + return 0; + } + + return (size - delta) & qemu_real_host_page_mask(); +} + +void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, + bool add) +{ + int ret = 0; + MemoryRegion *area = section->mr; + bool writable = !area->readonly && !area->rom_device; + hwaddr start_addr, mr_offset, size; + void *ram; + + size = align_section(section, &start_addr); + trace_mshv_set_phys_mem(add, section->mr->name, start_addr); + + /* + * If the memory device is a writable non-ram area, we do not + * want to map it into the guest memory. If it is not a ROM device, + * we want to remove mshv memory mapping, so accesses will trap. + */ + if (!memory_region_is_ram(area)) { + if (writable) { + return; + } else if (!area->romd_mode) { + add = false; + } + } + + if (!size) { + return; + } + + mr_offset = section->offset_within_region + start_addr - + section->offset_within_address_space; + + ram = memory_region_get_ram_ptr(area) + mr_offset; + + ret = set_memory(start_addr, size, !writable, (uint64_t)ram, add); + if (ret < 0) { + error_report("failed to set memory region"); + abort(); + } +} + +void mshv_init_memory_slot_manager(MshvState *mshv_state) +{ + MshvMemorySlotManager *manager; + + assert(mshv_state); + manager = &mshv_state->msm; + + manager->n_slots = 0; + manager->slots = NULL; + qemu_mutex_init(&manager->mutex); +} diff --git a/accel/mshv/meson.build b/accel/mshv/meson.build new file mode 100644 index 0000000..d3a2b32 --- /dev/null +++ b/accel/mshv/meson.build @@ -0,0 +1,9 @@ +mshv_ss = ss.source_set() +mshv_ss.add(if_true: files( + 'irq.c', + 'mem.c', + 'msr.c', + 'mshv-all.c' +)) + +specific_ss.add_all(when: 'CONFIG_MSHV', if_true: mshv_ss) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c new file mode 100644 index 0000000..45174f7 --- /dev/null +++ b/accel/mshv/mshv-all.c @@ -0,0 +1,727 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: + * Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * Jinank Jain <jinankjain@microsoft.com> + * Wei Liu <liuwe@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/event_notifier.h" +#include "qemu/module.h" +#include "qemu/main-loop.h" +#include "hw/boards.h" + +#include "hw/hyperv/hvhdk.h" +#include "hw/hyperv/hvhdk_mini.h" +#include "hw/hyperv/hvgdk.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "linux/mshv.h" + +#include "qemu/accel.h" +#include "qemu/guest-random.h" +#include "accel/accel-ops.h" +#include "accel/accel-cpu-ops.h" +#include "system/cpus.h" +#include "system/runstate.h" +#include "system/accel-blocker.h" +#include "system/address-spaces.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "system/reset.h" +#include "trace.h" +#include <err.h> +#include <stdint.h> +#include <sys/ioctl.h> + +#define TYPE_MSHV_ACCEL ACCEL_CLASS_NAME("mshv") + +DECLARE_INSTANCE_CHECKER(MshvState, MSHV_STATE, TYPE_MSHV_ACCEL) + +bool mshv_allowed; + +MshvState *mshv_state; + +static int init_mshv(int *mshv_fd) +{ + int fd = open("/dev/mshv", O_RDWR | O_CLOEXEC); + if (fd < 0) { + error_report("Failed to open /dev/mshv: %s", strerror(errno)); + return -1; + } + *mshv_fd = fd; + return 0; +} + +/* freeze 1 to pause, 0 to resume */ +static int set_time_freeze(int vm_fd, int freeze) +{ + int ret; + struct hv_input_set_partition_property in = {0}; + in.property_code = HV_PARTITION_PROPERTY_TIME_FREEZE; + in.property_value = freeze; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)&in; + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set time freeze"); + return -1; + } + + return 0; +} + +static int pause_vm(int vm_fd) +{ + int ret; + + ret = set_time_freeze(vm_fd, 1); + if (ret < 0) { + error_report("Failed to pause partition: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int resume_vm(int vm_fd) +{ + int ret; + + ret = set_time_freeze(vm_fd, 0); + if (ret < 0) { + error_report("Failed to resume partition: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int create_partition(int mshv_fd, int *vm_fd) +{ + int ret; + struct mshv_create_partition args = {0}; + + /* Initialize pt_flags with the desired features */ + uint64_t pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) | + (1ULL << MSHV_PT_BIT_X2APIC) | + (1ULL << MSHV_PT_BIT_GPA_SUPER_PAGES); + + /* Set default isolation type */ + uint64_t pt_isolation = MSHV_PT_ISOLATION_NONE; + + args.pt_flags = pt_flags; + args.pt_isolation = pt_isolation; + + ret = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args); + if (ret < 0) { + error_report("Failed to create partition: %s", strerror(errno)); + return -1; + } + + *vm_fd = ret; + return 0; +} + +static int set_synthetic_proc_features(int vm_fd) +{ + int ret; + struct hv_input_set_partition_property in = {0}; + union hv_partition_synthetic_processor_features features = {0}; + + /* Access the bitfield and set the desired features */ + features.hypervisor_present = 1; + features.hv1 = 1; + features.access_partition_reference_counter = 1; + features.access_synic_regs = 1; + features.access_synthetic_timer_regs = 1; + features.access_partition_reference_tsc = 1; + features.access_frequency_regs = 1; + features.access_intr_ctrl_regs = 1; + features.access_vp_index = 1; + 
features.access_hypercall_regs = 1; + features.tb_flush_hypercalls = 1; + features.synthetic_cluster_ipi = 1; + features.direct_synthetic_timers = 1; + + mshv_arch_amend_proc_features(&features); + + in.property_code = HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES; + in.property_value = features.as_uint64[0]; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)&in; + + trace_mshv_hvcall_args("synthetic_proc_features", args.code, args.in_sz); + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set synthetic proc features"); + return -errno; + } + return 0; +} + +static int initialize_vm(int vm_fd) +{ + int ret = ioctl(vm_fd, MSHV_INITIALIZE_PARTITION); + if (ret < 0) { + error_report("Failed to initialize partition: %s", strerror(errno)); + return -1; + } + return 0; +} + +static int create_vm(int mshv_fd, int *vm_fd) +{ + int ret = create_partition(mshv_fd, vm_fd); + if (ret < 0) { + return -1; + } + + ret = set_synthetic_proc_features(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = initialize_vm(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = mshv_reserve_ioapic_msi_routes(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = mshv_arch_post_init_vm(*vm_fd); + if (ret < 0) { + return -1; + } + + /* Always create a frozen partition */ + pause_vm(*vm_fd); + + return 0; +} + +static void mem_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + MshvMemoryListener *mml; + mml = container_of(listener, MshvMemoryListener, listener); + memory_region_ref(section->mr); + mshv_set_phys_mem(mml, section, true); +} + +static void mem_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + MshvMemoryListener *mml; + mml = container_of(listener, MshvMemoryListener, listener); + mshv_set_phys_mem(mml, section, false); + memory_region_unref(section->mr); +} + +typedef enum { + DATAMATCH_NONE, + DATAMATCH_U32, + DATAMATCH_U64, +} DatamatchTag; + +typedef struct { + DatamatchTag tag; + union { + uint32_t u32; + uint64_t u64; + } value; +} Datamatch; + +/* flags: determine whether to de/assign */ +static int ioeventfd(int vm_fd, int event_fd, uint64_t addr, Datamatch dm, + uint32_t flags) +{ + struct mshv_user_ioeventfd args = {0}; + args.fd = event_fd; + args.addr = addr; + args.flags = flags; + + if (dm.tag == DATAMATCH_NONE) { + args.datamatch = 0; + } else { + flags |= BIT(MSHV_IOEVENTFD_BIT_DATAMATCH); + args.flags = flags; + if (dm.tag == DATAMATCH_U64) { + args.len = sizeof(uint64_t); + args.datamatch = dm.value.u64; + } else { + args.len = sizeof(uint32_t); + args.datamatch = dm.value.u32; + } + } + + return ioctl(vm_fd, MSHV_IOEVENTFD, &args); +} + +static int unregister_ioevent(int vm_fd, int event_fd, uint64_t mmio_addr) +{ + uint32_t flags = 0; + Datamatch dm = {0}; + + flags |= BIT(MSHV_IOEVENTFD_BIT_DEASSIGN); + dm.tag = DATAMATCH_NONE; + + return ioeventfd(vm_fd, event_fd, mmio_addr, dm, flags); +} + +static int register_ioevent(int vm_fd, int event_fd, uint64_t mmio_addr, + uint64_t val, bool is_64bit, bool is_datamatch) +{ + uint32_t flags = 0; + Datamatch dm = {0}; + + if (!is_datamatch) { + dm.tag = DATAMATCH_NONE; + } else if (is_64bit) { + dm.tag = DATAMATCH_U64; + dm.value.u64 = val; + } else { + dm.tag = DATAMATCH_U32; + dm.value.u32 = val; + } + + return ioeventfd(vm_fd, event_fd, mmio_addr, dm, flags); +} + +static void mem_ioeventfd_add(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, 
+ EventNotifier *e) +{ + int fd = event_notifier_get_fd(e); + int ret; + bool is_64 = int128_get64(section->size) == 8; + uint64_t addr = section->offset_within_address_space; + + trace_mshv_mem_ioeventfd_add(addr, int128_get64(section->size), data); + + ret = register_ioevent(mshv_state->vm, fd, addr, data, is_64, match_data); + + if (ret < 0) { + error_report("Failed to register ioeventfd: %s (%d)", strerror(-ret), + -ret); + abort(); + } +} + +static void mem_ioeventfd_del(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, + EventNotifier *e) +{ + int fd = event_notifier_get_fd(e); + int ret; + uint64_t addr = section->offset_within_address_space; + + trace_mshv_mem_ioeventfd_del(section->offset_within_address_space, + int128_get64(section->size), data); + + ret = unregister_ioevent(mshv_state->vm, fd, addr); + if (ret < 0) { + error_report("Failed to unregister ioeventfd: %s (%d)", strerror(-ret), + -ret); + abort(); + } +} + +static MemoryListener mshv_memory_listener = { + .name = "mshv", + .priority = MEMORY_LISTENER_PRIORITY_ACCEL, + .region_add = mem_region_add, + .region_del = mem_region_del, + .eventfd_add = mem_ioeventfd_add, + .eventfd_del = mem_ioeventfd_del, +}; + +static MemoryListener mshv_io_listener = { + .name = "mshv", .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND, + /* MSHV does not support PIO eventfd */ +}; + +static void register_mshv_memory_listener(MshvState *s, MshvMemoryListener *mml, + AddressSpace *as, int as_id, + const char *name) +{ + int i; + + mml->listener = mshv_memory_listener; + mml->listener.name = name; + memory_listener_register(&mml->listener, as); + for (i = 0; i < s->nr_as; ++i) { + if (!s->as[i].as) { + s->as[i].as = as; + s->as[i].ml = mml; + break; + } + } +} + +int mshv_hvcall(int fd, const struct mshv_root_hvcall *args) +{ + int ret = 0; + + ret = ioctl(fd, MSHV_ROOT_HVCALL, args); + if (ret < 0) { + error_report("Failed to perform hvcall: %s", strerror(errno)); + return -1; + } + return ret; +} + +static int mshv_init_vcpu(CPUState *cpu) +{ + int vm_fd = mshv_state->vm; + uint8_t vp_index = cpu->cpu_index; + int ret; + + cpu->accel = g_new0(AccelCPUState, 1); + mshv_arch_init_vcpu(cpu); + + ret = mshv_create_vcpu(vm_fd, vp_index, &cpu->accel->cpufd); + if (ret < 0) { + return -1; + } + + cpu->accel->dirty = true; + + return 0; +} + +static int mshv_init(AccelState *as, MachineState *ms) +{ + MshvState *s; + int mshv_fd, vm_fd, ret; + + if (mshv_state) { + warn_report("MSHV accelerator already initialized"); + return 0; + } + + s = MSHV_STATE(as); + + accel_blocker_init(); + + s->vm = 0; + + ret = init_mshv(&mshv_fd); + if (ret < 0) { + return -1; + } + + mshv_init_mmio_emu(); + + mshv_init_msicontrol(); + + mshv_init_memory_slot_manager(s); + + ret = create_vm(mshv_fd, &vm_fd); + if (ret < 0) { + close(mshv_fd); + return -1; + } + + ret = resume_vm(vm_fd); + if (ret < 0) { + close(mshv_fd); + close(vm_fd); + return -1; + } + + s->vm = vm_fd; + s->fd = mshv_fd; + s->nr_as = 1; + s->as = g_new0(MshvAddressSpace, s->nr_as); + + mshv_state = s; + + register_mshv_memory_listener(s, &s->memory_listener, &address_space_memory, + 0, "mshv-memory"); + memory_listener_register(&mshv_io_listener, &address_space_io); + + return 0; +} + +static int mshv_destroy_vcpu(CPUState *cpu) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vm_fd = mshv_state->vm; + + mshv_remove_vcpu(vm_fd, cpu_fd); + mshv_vcpufd(cpu) = 0; + + mshv_arch_destroy_vcpu(cpu); + g_clear_pointer(&cpu->accel, g_free); + return 0; +} + +static int 
mshv_cpu_exec(CPUState *cpu) +{ + hv_message mshv_msg; + enum MshvVmExit exit_reason; + int ret = 0; + + bql_unlock(); + cpu_exec_start(cpu); + + do { + if (cpu->accel->dirty) { + ret = mshv_arch_put_registers(cpu); + if (ret) { + error_report("Failed to put registers after init: %s", + strerror(-ret)); + ret = -1; + break; + } + cpu->accel->dirty = false; + } + + ret = mshv_run_vcpu(mshv_state->vm, cpu, &mshv_msg, &exit_reason); + if (ret < 0) { + error_report("Failed to run on vcpu %d", cpu->cpu_index); + abort(); + } + + switch (exit_reason) { + case MshvVmExitIgnore: + break; + default: + ret = EXCP_INTERRUPT; + break; + } + } while (ret == 0); + + cpu_exec_end(cpu); + bql_lock(); + + if (ret < 0) { + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + + return ret; +} + +/* + * The signal handler is triggered when QEMU's main thread receives a SIG_IPI + * (SIGUSR1). This signal causes the current CPU thread to be kicked, forcing a + * VM exit on the CPU. The VM exit generates an exit reason that breaks the loop + * (see mshv_cpu_exec). If the exit is due to a Ctrl+A+x command, the system + * will shut down. For other cases, the system will continue running. + */ +static void sa_ipi_handler(int sig) +{ + /* TODO: call IOCTL to set_immediate_exit, once implemented. */ + + qemu_cpu_kick_self(); +} + +static void init_signal(CPUState *cpu) +{ + /* init cpu signals */ + struct sigaction sigact; + sigset_t set; + + memset(&sigact, 0, sizeof(sigact)); + sigact.sa_handler = sa_ipi_handler; + sigaction(SIG_IPI, &sigact, NULL); + + pthread_sigmask(SIG_BLOCK, NULL, &set); + sigdelset(&set, SIG_IPI); + pthread_sigmask(SIG_SETMASK, &set, NULL); +} + +static void *mshv_vcpu_thread(void *arg) +{ + CPUState *cpu = arg; + int ret; + + rcu_register_thread(); + + bql_lock(); + qemu_thread_get_self(cpu->thread); + cpu->thread_id = qemu_get_thread_id(); + current_cpu = cpu; + ret = mshv_init_vcpu(cpu); + if (ret < 0) { + error_report("Failed to init vcpu %d", cpu->cpu_index); + goto cleanup; + } + init_signal(cpu); + + /* signal CPU creation */ + cpu_thread_signal_created(cpu); + qemu_guest_random_seed_thread_part2(cpu->random_seed); + + do { + qemu_process_cpu_events(cpu); + if (cpu_can_run(cpu)) { + mshv_cpu_exec(cpu); + } + } while (!cpu->unplug || cpu_can_run(cpu)); + + mshv_destroy_vcpu(cpu); +cleanup: + cpu_thread_signal_destroyed(cpu); + bql_unlock(); + rcu_unregister_thread(); + return NULL; +} + +static void mshv_start_vcpu_thread(CPUState *cpu) +{ + char thread_name[VCPU_THREAD_NAME_SIZE]; + + cpu->thread = g_malloc0(sizeof(QemuThread)); + cpu->halt_cond = g_malloc0(sizeof(QemuCond)); + + qemu_cond_init(cpu->halt_cond); + + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/MSHV", + cpu->cpu_index); + + trace_mshv_start_vcpu_thread(thread_name, cpu->cpu_index); + qemu_thread_create(cpu->thread, thread_name, mshv_vcpu_thread, cpu, + QEMU_THREAD_JOINABLE); +} + +static void do_mshv_cpu_synchronize_post_init(CPUState *cpu, + run_on_cpu_data arg) +{ + int ret = mshv_arch_put_registers(cpu); + if (ret < 0) { + error_report("Failed to put registers after init: %s", strerror(-ret)); + abort(); + } + + cpu->accel->dirty = false; +} + +static void mshv_cpu_synchronize_post_init(CPUState *cpu) +{ + run_on_cpu(cpu, do_mshv_cpu_synchronize_post_init, RUN_ON_CPU_NULL); +} + +static void mshv_cpu_synchronize_post_reset(CPUState *cpu) +{ + int ret = mshv_arch_put_registers(cpu); + if (ret) { + error_report("Failed to put registers after reset: %s", + strerror(-ret)); + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + 
cpu->accel->dirty = false; +} + +static void do_mshv_cpu_synchronize_pre_loadvm(CPUState *cpu, + run_on_cpu_data arg) +{ + cpu->accel->dirty = true; +} + +static void mshv_cpu_synchronize_pre_loadvm(CPUState *cpu) +{ + run_on_cpu(cpu, do_mshv_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); +} + +static void do_mshv_cpu_synchronize(CPUState *cpu, run_on_cpu_data arg) +{ + if (!cpu->accel->dirty) { + int ret = mshv_load_regs(cpu); + if (ret < 0) { + error_report("Failed to load registers for vcpu %d", + cpu->cpu_index); + + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + + cpu->accel->dirty = true; + } +} + +static void mshv_cpu_synchronize(CPUState *cpu) +{ + if (!cpu->accel->dirty) { + run_on_cpu(cpu, do_mshv_cpu_synchronize, RUN_ON_CPU_NULL); + } +} + +static bool mshv_cpus_are_resettable(void) +{ + return false; +} + +static void mshv_accel_class_init(ObjectClass *oc, const void *data) +{ + AccelClass *ac = ACCEL_CLASS(oc); + + ac->name = "MSHV"; + ac->init_machine = mshv_init; + ac->allowed = &mshv_allowed; +} + +static void mshv_accel_instance_init(Object *obj) +{ + MshvState *s = MSHV_STATE(obj); + + s->vm = 0; +} + +static const TypeInfo mshv_accel_type = { + .name = TYPE_MSHV_ACCEL, + .parent = TYPE_ACCEL, + .instance_init = mshv_accel_instance_init, + .class_init = mshv_accel_class_init, + .instance_size = sizeof(MshvState), +}; + +static void mshv_accel_ops_class_init(ObjectClass *oc, const void *data) +{ + AccelOpsClass *ops = ACCEL_OPS_CLASS(oc); + + ops->create_vcpu_thread = mshv_start_vcpu_thread; + ops->synchronize_post_init = mshv_cpu_synchronize_post_init; + ops->synchronize_post_reset = mshv_cpu_synchronize_post_reset; + ops->synchronize_state = mshv_cpu_synchronize; + ops->synchronize_pre_loadvm = mshv_cpu_synchronize_pre_loadvm; + ops->cpus_are_resettable = mshv_cpus_are_resettable; + ops->handle_interrupt = generic_handle_interrupt; +} + +static const TypeInfo mshv_accel_ops_type = { + .name = ACCEL_OPS_NAME("mshv"), + .parent = TYPE_ACCEL_OPS, + .class_init = mshv_accel_ops_class_init, + .abstract = true, +}; + +static void mshv_type_init(void) +{ + type_register_static(&mshv_accel_type); + type_register_static(&mshv_accel_ops_type); +} + +type_init(mshv_type_init); diff --git a/accel/mshv/msr.c b/accel/mshv/msr.c new file mode 100644 index 0000000..e6e5bae --- /dev/null +++ b/accel/mshv/msr.c @@ -0,0 +1,375 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 
2025 + * + * Authors: Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "linux/mshv.h" +#include "qemu/error-report.h" + +static uint32_t supported_msrs[64] = { + IA32_MSR_TSC, + IA32_MSR_EFER, + IA32_MSR_KERNEL_GS_BASE, + IA32_MSR_APIC_BASE, + IA32_MSR_PAT, + IA32_MSR_SYSENTER_CS, + IA32_MSR_SYSENTER_ESP, + IA32_MSR_SYSENTER_EIP, + IA32_MSR_STAR, + IA32_MSR_LSTAR, + IA32_MSR_CSTAR, + IA32_MSR_SFMASK, + IA32_MSR_MTRR_DEF_TYPE, + IA32_MSR_MTRR_PHYSBASE0, + IA32_MSR_MTRR_PHYSMASK0, + IA32_MSR_MTRR_PHYSBASE1, + IA32_MSR_MTRR_PHYSMASK1, + IA32_MSR_MTRR_PHYSBASE2, + IA32_MSR_MTRR_PHYSMASK2, + IA32_MSR_MTRR_PHYSBASE3, + IA32_MSR_MTRR_PHYSMASK3, + IA32_MSR_MTRR_PHYSBASE4, + IA32_MSR_MTRR_PHYSMASK4, + IA32_MSR_MTRR_PHYSBASE5, + IA32_MSR_MTRR_PHYSMASK5, + IA32_MSR_MTRR_PHYSBASE6, + IA32_MSR_MTRR_PHYSMASK6, + IA32_MSR_MTRR_PHYSBASE7, + IA32_MSR_MTRR_PHYSMASK7, + IA32_MSR_MTRR_FIX64K_00000, + IA32_MSR_MTRR_FIX16K_80000, + IA32_MSR_MTRR_FIX16K_A0000, + IA32_MSR_MTRR_FIX4K_C0000, + IA32_MSR_MTRR_FIX4K_C8000, + IA32_MSR_MTRR_FIX4K_D0000, + IA32_MSR_MTRR_FIX4K_D8000, + IA32_MSR_MTRR_FIX4K_E0000, + IA32_MSR_MTRR_FIX4K_E8000, + IA32_MSR_MTRR_FIX4K_F0000, + IA32_MSR_MTRR_FIX4K_F8000, + IA32_MSR_TSC_AUX, + IA32_MSR_DEBUG_CTL, + HV_X64_MSR_GUEST_OS_ID, + HV_X64_MSR_SINT0, + HV_X64_MSR_SINT1, + HV_X64_MSR_SINT2, + HV_X64_MSR_SINT3, + HV_X64_MSR_SINT4, + HV_X64_MSR_SINT5, + HV_X64_MSR_SINT6, + HV_X64_MSR_SINT7, + HV_X64_MSR_SINT8, + HV_X64_MSR_SINT9, + HV_X64_MSR_SINT10, + HV_X64_MSR_SINT11, + HV_X64_MSR_SINT12, + HV_X64_MSR_SINT13, + HV_X64_MSR_SINT14, + HV_X64_MSR_SINT15, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_SIEFP, + HV_X64_MSR_SIMP, + HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_EOM, +}; +static const size_t msr_count = ARRAY_SIZE(supported_msrs); + +static int compare_msr_index(const void *a, const void *b) +{ + return *(uint32_t *)a - *(uint32_t *)b; +} + +__attribute__((constructor)) +static void init_sorted_msr_map(void) +{ + qsort(supported_msrs, msr_count, sizeof(uint32_t), compare_msr_index); +} + +static int mshv_is_supported_msr(uint32_t msr) +{ + return bsearch(&msr, supported_msrs, msr_count, sizeof(uint32_t), + compare_msr_index) != NULL; +} + +static int mshv_msr_to_hv_reg_name(uint32_t msr, uint32_t *hv_reg) +{ + switch (msr) { + case IA32_MSR_TSC: + *hv_reg = HV_X64_REGISTER_TSC; + return 0; + case IA32_MSR_EFER: + *hv_reg = HV_X64_REGISTER_EFER; + return 0; + case IA32_MSR_KERNEL_GS_BASE: + *hv_reg = HV_X64_REGISTER_KERNEL_GS_BASE; + return 0; + case IA32_MSR_APIC_BASE: + *hv_reg = HV_X64_REGISTER_APIC_BASE; + return 0; + case IA32_MSR_PAT: + *hv_reg = HV_X64_REGISTER_PAT; + return 0; + case IA32_MSR_SYSENTER_CS: + *hv_reg = HV_X64_REGISTER_SYSENTER_CS; + return 0; + case IA32_MSR_SYSENTER_ESP: + *hv_reg = HV_X64_REGISTER_SYSENTER_ESP; + return 0; + case IA32_MSR_SYSENTER_EIP: + *hv_reg = HV_X64_REGISTER_SYSENTER_EIP; + return 0; + case IA32_MSR_STAR: + *hv_reg = HV_X64_REGISTER_STAR; + return 0; + case IA32_MSR_LSTAR: + *hv_reg = HV_X64_REGISTER_LSTAR; + return 0; + case IA32_MSR_CSTAR: + *hv_reg = HV_X64_REGISTER_CSTAR; + return 0; + case IA32_MSR_SFMASK: + *hv_reg = HV_X64_REGISTER_SFMASK; + return 0; + case IA32_MSR_MTRR_CAP: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_CAP; + return 0; + case IA32_MSR_MTRR_DEF_TYPE: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_DEF_TYPE; + return 0; + case IA32_MSR_MTRR_PHYSBASE0: + *hv_reg = 
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0; + return 0; + case IA32_MSR_MTRR_PHYSMASK0: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0; + return 0; + case IA32_MSR_MTRR_PHYSBASE1: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1; + return 0; + case IA32_MSR_MTRR_PHYSMASK1: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1; + return 0; + case IA32_MSR_MTRR_PHYSBASE2: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2; + return 0; + case IA32_MSR_MTRR_PHYSMASK2: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2; + return 0; + case IA32_MSR_MTRR_PHYSBASE3: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3; + return 0; + case IA32_MSR_MTRR_PHYSMASK3: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3; + return 0; + case IA32_MSR_MTRR_PHYSBASE4: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4; + return 0; + case IA32_MSR_MTRR_PHYSMASK4: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4; + return 0; + case IA32_MSR_MTRR_PHYSBASE5: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5; + return 0; + case IA32_MSR_MTRR_PHYSMASK5: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5; + return 0; + case IA32_MSR_MTRR_PHYSBASE6: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6; + return 0; + case IA32_MSR_MTRR_PHYSMASK6: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6; + return 0; + case IA32_MSR_MTRR_PHYSBASE7: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7; + return 0; + case IA32_MSR_MTRR_PHYSMASK7: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7; + return 0; + case IA32_MSR_MTRR_FIX64K_00000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX64K00000; + return 0; + case IA32_MSR_MTRR_FIX16K_80000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX16K80000; + return 0; + case IA32_MSR_MTRR_FIX16K_A0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX16KA0000; + return 0; + case IA32_MSR_MTRR_FIX4K_C0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KC0000; + return 0; + case IA32_MSR_MTRR_FIX4K_C8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KC8000; + return 0; + case IA32_MSR_MTRR_FIX4K_D0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KD0000; + return 0; + case IA32_MSR_MTRR_FIX4K_D8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KD8000; + return 0; + case IA32_MSR_MTRR_FIX4K_E0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KE0000; + return 0; + case IA32_MSR_MTRR_FIX4K_E8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KE8000; + return 0; + case IA32_MSR_MTRR_FIX4K_F0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KF0000; + return 0; + case IA32_MSR_MTRR_FIX4K_F8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KF8000; + return 0; + case IA32_MSR_TSC_AUX: + *hv_reg = HV_X64_REGISTER_TSC_AUX; + return 0; + case IA32_MSR_BNDCFGS: + *hv_reg = HV_X64_REGISTER_BNDCFGS; + return 0; + case IA32_MSR_DEBUG_CTL: + *hv_reg = HV_X64_REGISTER_DEBUG_CTL; + return 0; + case IA32_MSR_TSC_ADJUST: + *hv_reg = HV_X64_REGISTER_TSC_ADJUST; + return 0; + case IA32_MSR_SPEC_CTRL: + *hv_reg = HV_X64_REGISTER_SPEC_CTRL; + return 0; + case HV_X64_MSR_GUEST_OS_ID: + *hv_reg = HV_REGISTER_GUEST_OS_ID; + return 0; + case HV_X64_MSR_SINT0: + *hv_reg = HV_REGISTER_SINT0; + return 0; + case HV_X64_MSR_SINT1: + *hv_reg = HV_REGISTER_SINT1; + return 0; + case HV_X64_MSR_SINT2: + *hv_reg = HV_REGISTER_SINT2; + return 0; + case HV_X64_MSR_SINT3: + *hv_reg = HV_REGISTER_SINT3; + return 0; + case HV_X64_MSR_SINT4: + *hv_reg = HV_REGISTER_SINT4; + return 0; + case HV_X64_MSR_SINT5: + *hv_reg = HV_REGISTER_SINT5; + return 0; + case HV_X64_MSR_SINT6: + *hv_reg = HV_REGISTER_SINT6; + return 0; + case HV_X64_MSR_SINT7: + *hv_reg = HV_REGISTER_SINT7; + return 0; + case HV_X64_MSR_SINT8: + *hv_reg = 
HV_REGISTER_SINT8; + return 0; + case HV_X64_MSR_SINT9: + *hv_reg = HV_REGISTER_SINT9; + return 0; + case HV_X64_MSR_SINT10: + *hv_reg = HV_REGISTER_SINT10; + return 0; + case HV_X64_MSR_SINT11: + *hv_reg = HV_REGISTER_SINT11; + return 0; + case HV_X64_MSR_SINT12: + *hv_reg = HV_REGISTER_SINT12; + return 0; + case HV_X64_MSR_SINT13: + *hv_reg = HV_REGISTER_SINT13; + return 0; + case HV_X64_MSR_SINT14: + *hv_reg = HV_REGISTER_SINT14; + return 0; + case HV_X64_MSR_SINT15: + *hv_reg = HV_REGISTER_SINT15; + return 0; + case IA32_MSR_MISC_ENABLE: + *hv_reg = HV_X64_REGISTER_MSR_IA32_MISC_ENABLE; + return 0; + case HV_X64_MSR_SCONTROL: + *hv_reg = HV_REGISTER_SCONTROL; + return 0; + case HV_X64_MSR_SIEFP: + *hv_reg = HV_REGISTER_SIEFP; + return 0; + case HV_X64_MSR_SIMP: + *hv_reg = HV_REGISTER_SIMP; + return 0; + case HV_X64_MSR_REFERENCE_TSC: + *hv_reg = HV_REGISTER_REFERENCE_TSC; + return 0; + case HV_X64_MSR_EOM: + *hv_reg = HV_REGISTER_EOM; + return 0; + default: + error_report("failed to map MSR %u to HV register name", msr); + return -1; + } +} + +static int set_msrs(const CPUState *cpu, GList *msrs) +{ + size_t n_msrs; + GList *entries; + MshvMsrEntry *entry; + uint32_t name; + struct hv_register_assoc *assoc; + int ret; + size_t i = 0; + + n_msrs = g_list_length(msrs); + struct hv_register_assoc *assocs = g_new0(struct hv_register_assoc, n_msrs); + + entries = msrs; + for (const GList *elem = entries; elem != NULL; elem = elem->next) { + entry = elem->data; + ret = mshv_msr_to_hv_reg_name(entry->index, &name); + if (ret < 0) { + g_free(assocs); + return ret; + } + assoc = &assocs[i]; + assoc->name = name; + /* the union has been initialized to 0 */ + assoc->value.reg64 = entry->data; + i++; + } + ret = mshv_set_generic_regs(cpu, assocs, n_msrs); + g_free(assocs); + if (ret < 0) { + error_report("failed to set msrs"); + return -1; + } + return 0; +} + + +int mshv_configure_msr(const CPUState *cpu, const MshvMsrEntry *msrs, + size_t n_msrs) +{ + GList *valid_msrs = NULL; + uint32_t msr_index; + int ret; + + for (size_t i = 0; i < n_msrs; i++) { + msr_index = msrs[i].index; + /* check whether index of msrs is in SUPPORTED_MSRS */ + if (mshv_is_supported_msr(msr_index)) { + valid_msrs = g_list_append(valid_msrs, (void *) &msrs[i]); + } + } + + ret = set_msrs(cpu, valid_msrs); + g_list_free(valid_msrs); + + return ret; +} diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events new file mode 100644 index 0000000..36f0d59 --- /dev/null +++ b/accel/mshv/trace-events @@ -0,0 +1,33 @@ +# Authors: Ziqiao Zhou <ziqiaozhou@microsoft.com> +# Magnus Kulke <magnuskulke@microsoft.com> +# +# SPDX-License-Identifier: GPL-2.0-or-later + +mshv_start_vcpu_thread(const char* thread, uint32_t cpu) "thread=%s cpu_index=%d" + +mshv_set_memory(bool add, uint64_t gpa, uint64_t size, uint64_t user_addr, bool readonly, int ret) "add=%d gpa=0x%" PRIx64 " size=0x%" PRIx64 " user=0x%" PRIx64 " readonly=%d result=%d" +mshv_mem_ioeventfd_add(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x" +mshv_mem_ioeventfd_del(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x" + +mshv_hvcall_args(const char* hvcall, uint16_t code, uint16_t in_sz) "built args for '%s' code: %d in_sz: %d" + +mshv_handle_interrupt(uint32_t cpu, int mask) "cpu_index=%d mask=0x%x" +mshv_set_msi_routing(uint32_t gsi, uint64_t addr, uint32_t data) "gsi=%d addr=0x%" PRIx64 " data=0x%x" +mshv_remove_msi_routing(uint32_t gsi) "gsi=%d" +mshv_add_msi_routing(uint64_t addr, uint32_t data) 
"addr=0x%" PRIx64 " data=0x%x" +mshv_commit_msi_routing_table(int vm_fd, int len) "vm_fd=%d table_size=%d" +mshv_register_irqfd(int vm_fd, int event_fd, uint32_t gsi) "vm_fd=%d event_fd=%d gsi=%d" +mshv_irqchip_update_irqfd_notifier_gsi(int event_fd, int resample_fd, int virq, bool add) "event_fd=%d resample_fd=%d virq=%d add=%d" + +mshv_insn_fetch(uint64_t addr, size_t size) "gpa=0x%" PRIx64 " size=%zu" +mshv_mem_write(uint64_t addr, size_t size) "\tgpa=0x%" PRIx64 " size=%zu" +mshv_mem_read(uint64_t addr, size_t size) "\tgpa=0x%" PRIx64 " size=%zu" +mshv_map_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_unmap_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_set_phys_mem(bool add, const char *name, uint64_t gpa) "\tadd=%d name=%s gpa=0x%010" PRIx64 +mshv_handle_mmio(uint64_t gva, uint64_t gpa, uint64_t size, uint8_t access_type) "\tgva=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%" PRIx64 " access_type=%d" + +mshv_found_slot(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_skip_unset_mem(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_remap_attempt(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_find_slot_by_gpa(uint64_t gpa) "\tgpa=0x%010" PRIx64 diff --git a/accel/mshv/trace.h b/accel/mshv/trace.h new file mode 100644 index 0000000..0dca48f --- /dev/null +++ b/accel/mshv/trace.h @@ -0,0 +1,14 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: + * Ziqiao Zhou <ziqiaozhou@microsoft.com> + * Magnus Kulke <magnuskulke@microsoft.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "trace/trace-accel_mshv.h" |