diff options
Diffstat (limited to 'hw/i386')
54 files changed, 2497 insertions, 895 deletions
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index f4a33b6..eb65bda 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -10,6 +10,11 @@ config SGX bool depends on KVM +config TDX + bool + select X86_FW_OVMF + depends on KVM + config PC bool imply APPLESMC @@ -26,6 +31,7 @@ config PC imply QXL imply SEV imply SGX + imply TDX imply TEST_DEVICES imply TPM_CRB imply TPM_TIS_ISA @@ -43,6 +49,7 @@ config PC select SERIAL_ISA select ACPI_PCI select ACPI_VMGENID + select ACPI_VMCLOCK select VIRTIO_PMEM_SUPPORTED select VIRTIO_MEM_SUPPORTED select HV_BALLOON_SUPPORTED @@ -129,6 +136,16 @@ config MICROVM select USB_XHCI_SYSBUS select I8254 +config NITRO_ENCLAVE + default y + depends on I386 && FDT # for MICROVM + depends on LIBCBOR && GNUTLS # for EIF and VIRTIO_NSM + depends on VHOST_USER # for VHOST_USER_VSOCK + select EIF + select MICROVM + select VHOST_USER_VSOCK + select VIRTIO_NSM + config X86_IOMMU bool depends on PC diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index f4e366f..61851cc 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -22,7 +22,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" -#include "qapi/qmp/qnum.h" +#include "qobject/qnum.h" #include "acpi-build.h" #include "acpi-common.h" #include "qemu/bitmap.h" @@ -40,18 +40,19 @@ #include "hw/acpi/acpi_aml_interface.h" #include "hw/input/i8042.h" #include "hw/acpi/memory_hotplug.h" -#include "sysemu/tpm.h" +#include "system/tpm.h" #include "hw/acpi/tpm.h" #include "hw/acpi/vmgenid.h" +#include "hw/acpi/vmclock.h" #include "hw/acpi/erst.h" #include "hw/acpi/piix4.h" -#include "sysemu/tpm_backend.h" +#include "system/tpm_backend.h" #include "hw/rtc/mc146818rtc_regs.h" #include "migration/vmstate.h" #include "hw/mem/memory-device.h" #include "hw/mem/nvdimm.h" -#include "sysemu/numa.h" -#include "sysemu/reset.h" +#include "system/numa.h" +#include "system/reset.h" #include "hw/hyperv/vmbus-bridge.h" /* Supported chipsets: */ @@ -68,7 +69,6 @@ #include "hw/acpi/utils.h" #include "hw/acpi/pci.h" #include "hw/acpi/cxl.h" -#include "hw/acpi/acpi_generic_initiator.h" #include "qom/qom-qobject.h" #include "hw/i386/amd_iommu.h" @@ -139,7 +139,7 @@ static void init_common_fadt_data(MachineState *ms, Object *o, /* * "ICH9-LPC" or "PIIX4_PM" has "smm-compat" property to keep the old * behavior for compatibility irrelevant to smm_enabled, which doesn't - * comforms to ACPI spec. + * conform to the ACPI spec. */ bool smm_enabled = object_property_get_bool(o, "smm-compat", NULL) ? true : x86_machine_is_smm_enabled(x86ms); @@ -589,8 +589,8 @@ void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus) } } -static bool build_append_notfication_callback(Aml *parent_scope, - const PCIBus *bus) +static bool build_append_notification_callback(Aml *parent_scope, + const PCIBus *bus) { Aml *method; PCIBus *sec; @@ -604,7 +604,7 @@ static bool build_append_notfication_callback(Aml *parent_scope, continue; } nr_notifiers = nr_notifiers + - build_append_notfication_callback(br_scope, sec); + build_append_notification_callback(br_scope, sec); /* * add new child scope to parent * and keep track of bus that have PCNT, @@ -655,6 +655,7 @@ static Aml *aml_pci_pdsm(void) Aml *acpi_index = aml_local(2); Aml *zero = aml_int(0); Aml *one = aml_int(1); + Aml *not_supp = aml_int(0xFFFFFFFF); Aml *func = aml_arg(2); Aml *params = aml_arg(4); Aml *bnum = aml_derefof(aml_index(params, aml_int(0))); @@ -679,7 +680,7 @@ static Aml *aml_pci_pdsm(void) */ ifctx1 = aml_if(aml_lnot( aml_or(aml_equal(acpi_index, zero), - aml_equal(acpi_index, aml_int(0xFFFFFFFF)), NULL) + aml_equal(acpi_index, not_supp), NULL) )); { /* have supported functions */ @@ -705,18 +706,30 @@ static Aml *aml_pci_pdsm(void) { Aml *pkg = aml_package(2); - aml_append(pkg, zero); - /* - * optional, if not impl. should return null string - */ - aml_append(pkg, aml_string("%s", "")); - aml_append(ifctx, aml_store(pkg, ret)); - aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sunum), acpi_index)); + aml_append(ifctx, aml_store(pkg, ret)); /* - * update acpi-index to actual value + * Windows calls func=7 without checking if it's available, + * as workaround Microsoft has suggested to return invalid for func7 + * Package, so return 2 elements package but only initialize elements + * when acpi_index is supported and leave them uninitialized, which + * leads elements to being Uninitialized ObjectType and should trip + * Windows into discarding result as an unexpected and prevent setting + * bogus 'PCI Label' on the device. */ - aml_append(ifctx, aml_store(acpi_index, aml_index(ret, zero))); + ifctx1 = aml_if(aml_lnot(aml_lor( + aml_equal(acpi_index, zero), aml_equal(acpi_index, not_supp) + ))); + { + aml_append(ifctx1, aml_store(acpi_index, aml_index(ret, zero))); + /* + * optional, if not impl. should return null string + */ + aml_append(ifctx1, aml_store(aml_string("%s", ""), + aml_index(ret, one))); + } + aml_append(ifctx, ifctx1); + aml_append(ifctx, aml_return(ret)); } @@ -724,120 +737,45 @@ static Aml *aml_pci_pdsm(void) return method; } -/** - * build_prt_entry: - * @link_name: link name for PCI route entry - * - * build AML package containing a PCI route entry for @link_name - */ -static Aml *build_prt_entry(const char *link_name) -{ - Aml *a_zero = aml_int(0); - Aml *pkg = aml_package(4); - aml_append(pkg, a_zero); - aml_append(pkg, a_zero); - aml_append(pkg, aml_name("%s", link_name)); - aml_append(pkg, a_zero); - return pkg; -} - /* - * initialize_route - Initialize the interrupt routing rule - * through a specific LINK: - * if (lnk_idx == idx) - * route using link 'link_name' - */ -static Aml *initialize_route(Aml *route, const char *link_name, - Aml *lnk_idx, int idx) -{ - Aml *if_ctx = aml_if(aml_equal(lnk_idx, aml_int(idx))); - Aml *pkg = build_prt_entry(link_name); - - aml_append(if_ctx, aml_store(pkg, route)); - - return if_ctx; -} - -/* - * build_prt - Define interrupt rounting rules + * build_prt - Define interrupt routing rules * * Returns an array of 128 routes, one for each device, * based on device location. * The main goal is to equally distribute the interrupts * over the 4 existing ACPI links (works only for i440fx). - * The hash function is (slot + pin) & 3 -> "LNK[D|A|B|C]". + * The hash function is: (slot + pin) & 3 -> "LNK[D|A|B|C]". * */ static Aml *build_prt(bool is_pci0_prt) { - Aml *method, *while_ctx, *pin, *res; + const int nroutes = 128; + Aml *rt_pkg, *method; + int pin; method = aml_method("_PRT", 0, AML_NOTSERIALIZED); - res = aml_local(0); - pin = aml_local(1); - aml_append(method, aml_store(aml_package(128), res)); - aml_append(method, aml_store(aml_int(0), pin)); - - /* while (pin < 128) */ - while_ctx = aml_while(aml_lless(pin, aml_int(128))); - { - Aml *slot = aml_local(2); - Aml *lnk_idx = aml_local(3); - Aml *route = aml_local(4); - - /* slot = pin >> 2 */ - aml_append(while_ctx, - aml_store(aml_shiftright(pin, aml_int(2), NULL), slot)); - /* lnk_idx = (slot + pin) & 3 */ - aml_append(while_ctx, - aml_store(aml_and(aml_add(pin, slot, NULL), aml_int(3), NULL), - lnk_idx)); - - /* route[2] = "LNK[D|A|B|C]", selection based on pin % 3 */ - aml_append(while_ctx, initialize_route(route, "LNKD", lnk_idx, 0)); - if (is_pci0_prt) { - Aml *if_device_1, *if_pin_4, *else_pin_4; - - /* device 1 is the power-management device, needs SCI */ - if_device_1 = aml_if(aml_equal(lnk_idx, aml_int(1))); - { - if_pin_4 = aml_if(aml_equal(pin, aml_int(4))); - { - aml_append(if_pin_4, - aml_store(build_prt_entry("LNKS"), route)); - } - aml_append(if_device_1, if_pin_4); - else_pin_4 = aml_else(); - { - aml_append(else_pin_4, - aml_store(build_prt_entry("LNKA"), route)); - } - aml_append(if_device_1, else_pin_4); - } - aml_append(while_ctx, if_device_1); + assert(nroutes < 256); + rt_pkg = aml_package(nroutes); + + for (pin = 0; pin < nroutes; pin++) { + Aml *pkg = aml_package(4); + int slot = pin >> 2; + + aml_append(pkg, aml_int((slot << 16) | 0xFFFF)); + aml_append(pkg, aml_int(pin & 3)); + /* device 1 is the power-management device, needs SCI */ + if (is_pci0_prt && pin == 4) { + aml_append(pkg, aml_name("%s", "LNKS")); } else { - aml_append(while_ctx, initialize_route(route, "LNKA", lnk_idx, 1)); + static const char link_name[][5] = {"LNKD", "LNKA", "LNKB", "LNKC"}; + int hash = (slot + pin) & 3; + aml_append(pkg, aml_name("%s", link_name[hash])); } - aml_append(while_ctx, initialize_route(route, "LNKB", lnk_idx, 2)); - aml_append(while_ctx, initialize_route(route, "LNKC", lnk_idx, 3)); - - /* route[0] = 0x[slot]FFFF */ - aml_append(while_ctx, - aml_store(aml_or(aml_shiftleft(slot, aml_int(16)), aml_int(0xFFFF), - NULL), - aml_index(route, aml_int(0)))); - /* route[1] = pin & 3 */ - aml_append(while_ctx, - aml_store(aml_and(pin, aml_int(3), NULL), - aml_index(route, aml_int(1)))); - /* res[pin] = route */ - aml_append(while_ctx, aml_store(route, aml_index(res, pin))); - /* pin++ */ - aml_append(while_ctx, aml_increment(pin)); + aml_append(pkg, aml_int(0)); + aml_append(rt_pkg, pkg); } - aml_append(method, while_ctx); - /* return res*/ - aml_append(method, aml_return(res)); + + aml_append(method, aml_return(rt_pkg)); return method; } @@ -1536,7 +1474,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, .fw_unplugs_cpu = pm->smi_on_cpu_unplug, }; build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, - pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02"); + pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02", + AML_SYSTEM_IO); } if (pcms->memhp_io_base && nr_mem) { @@ -1551,6 +1490,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, QLIST_FOREACH(bus, &bus->child, sibling) { uint8_t bus_num = pci_bus_num(bus); uint8_t numa_node = pci_bus_numa_node(bus); + uint32_t uid; /* look only for expander root buses */ if (!pci_bus_is_root(bus)) { @@ -1561,6 +1501,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, root_bus_limit = bus_num - 1; } + uid = object_property_get_uint(OBJECT(bus), "acpi_uid", + &error_fatal); scope = aml_scope("\\_SB"); if (pci_bus_is_cxl(bus)) { @@ -1568,7 +1510,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, } else { dev = aml_device("PC%.02X", bus_num); } - aml_append(dev, aml_name_decl("_UID", aml_int(bus_num))); + aml_append(dev, aml_name_decl("_UID", aml_int(uid))); aml_append(dev, aml_name_decl("_BBN", aml_int(bus_num))); if (pci_bus_is_cxl(bus)) { struct Aml *aml_pkg = aml_package(2); @@ -1831,7 +1773,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, PCIBus *b = PCI_HOST_BRIDGE(pci_host)->bus; scope = aml_scope("\\_SB.PCI0"); - has_pcnt = build_append_notfication_callback(scope, b); + has_pcnt = build_append_notification_callback(scope, b); if (has_pcnt) { aml_append(dsdt, scope); } @@ -2046,7 +1988,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, 0, 0, 0, MEM_AFFINITY_NOFLAGS); } - build_srat_generic_pci_initiator(table_data); + build_srat_generic_affinity_structures(table_data); /* * Entry is required for Windows to enable memory hotplug in OS @@ -2391,12 +2333,12 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, build_append_int_noprefix(table_data, ivhd_blob->len + 24, 2); /* DeviceID */ build_append_int_noprefix(table_data, - object_property_get_int(OBJECT(&s->pci), "addr", + object_property_get_int(OBJECT(s->pci), "addr", &error_abort), 2); /* Capability offset */ - build_append_int_noprefix(table_data, s->pci.capab_offset, 2); + build_append_int_noprefix(table_data, s->pci->capab_offset, 2); /* IOMMU base address */ - build_append_int_noprefix(table_data, s->mmio.addr, 8); + build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); /* PCI Segment Group */ build_append_int_noprefix(table_data, 0, 2); /* IOMMU info */ @@ -2426,12 +2368,12 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, build_append_int_noprefix(table_data, ivhd_blob->len + 40, 2); /* DeviceID */ build_append_int_noprefix(table_data, - object_property_get_int(OBJECT(&s->pci), "addr", + object_property_get_int(OBJECT(s->pci), "addr", &error_abort), 2); /* Capability offset */ - build_append_int_noprefix(table_data, s->pci.capab_offset, 2); + build_append_int_noprefix(table_data, s->pci->capab_offset, 2); /* IOMMU base address */ - build_append_int_noprefix(table_data, s->mmio.addr, 8); + build_append_int_noprefix(table_data, s->mr_mmio.addr, 8); /* PCI Segment Group */ build_append_int_noprefix(table_data, 0, 2); /* IOMMU info */ @@ -2504,7 +2446,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) uint8_t *u; GArray *tables_blob = tables->table_data; AcpiSlicOem slic_oem = { .id = NULL, .table_id = NULL }; - Object *vmgenid_dev; + Object *vmgenid_dev, *vmclock_dev; char *oem_id; char *oem_table_id; @@ -2577,6 +2519,13 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) tables->vmgenid, tables->linker, x86ms->oem_id); } + vmclock_dev = find_vmclock_dev(); + if (vmclock_dev) { + acpi_add_table(table_offsets, tables_blob); + vmclock_build_acpi(VMCLOCK(vmclock_dev), tables_blob, tables->linker, + x86ms->oem_id); + } + if (misc.has_hpet) { acpi_add_table(table_offsets, tables_blob); build_hpet(tables_blob, tables->linker, x86ms->oem_id, diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h index 0dce155..275ec05 100644 --- a/hw/i386/acpi-build.h +++ b/hw/i386/acpi-build.h @@ -5,7 +5,7 @@ extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio; -/* PCI Hot-plug registers bases. See docs/spec/acpi_pci_hotplug.txt */ +/* PCI Hot-plug registers' base. See docs/specs/acpi_pci_hotplug.rst */ #define ACPI_PCIHP_SEJ_BASE 0x8 #define ACPI_PCIHP_BNMR_BASE 0x10 diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c index 0cc2919..7bd0806 100644 --- a/hw/i386/acpi-common.c +++ b/hw/i386/acpi-common.c @@ -23,7 +23,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" -#include "exec/memory.h" +#include "system/memory.h" #include "hw/acpi/acpi.h" #include "hw/acpi/aml-build.h" #include "hw/acpi/utils.h" diff --git a/hw/i386/acpi-microvm.c b/hw/i386/acpi-microvm.c index 279da6b..bc65717 100644 --- a/hw/i386/acpi-microvm.c +++ b/hw/i386/acpi-microvm.c @@ -24,7 +24,7 @@ #include "qemu/cutils.h" #include "qapi/error.h" -#include "exec/memory.h" +#include "system/memory.h" #include "hw/acpi/acpi.h" #include "hw/acpi/acpi_aml_interface.h" #include "hw/acpi/aml-build.h" diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 6d4fde7..963aa24 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -32,6 +32,7 @@ #include "trace.h" #include "hw/i386/apic-msidef.h" #include "hw/qdev-properties.h" +#include "kvm/kvm_i386.h" /* used AMD-Vi MMIO registers */ const char *amdvi_mmio_low[] = { @@ -60,8 +61,9 @@ struct AMDVIAddressSpace { uint8_t bus_num; /* bus number */ uint8_t devfn; /* device function */ AMDVIState *iommu_state; /* AMDVI - one per machine */ - MemoryRegion root; /* AMDVI Root memory map region */ + MemoryRegion root; /* AMDVI Root memory map region */ IOMMUMemoryRegion iommu; /* Device's address translation region */ + MemoryRegion iommu_nodma; /* Alias of shared nodma memory region */ MemoryRegion iommu_ir; /* Device's interrupt remapping region */ AddressSpace as; /* device's corresponding address space */ }; @@ -165,11 +167,11 @@ static void amdvi_generate_msi_interrupt(AMDVIState *s) { MSIMessage msg = {}; MemTxAttrs attrs = { - .requester_id = pci_requester_id(&s->pci.dev) + .requester_id = pci_requester_id(&s->pci->dev) }; - if (msi_enabled(&s->pci.dev)) { - msg = msi_get_message(&s->pci.dev, 0); + if (msi_enabled(&s->pci->dev)) { + msg = msi_get_message(&s->pci->dev, 0); address_space_stl_le(&address_space_memory, msg.address, msg.data, attrs, NULL); } @@ -237,7 +239,7 @@ static void amdvi_page_fault(AMDVIState *s, uint16_t devid, info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF; amdvi_encode_event(evt, devid, addr, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } /* @@ -254,7 +256,7 @@ static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid, amdvi_encode_event(evt, devid, devtab, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } /* log an event trying to access command buffer @@ -267,7 +269,7 @@ static void amdvi_log_command_error(AMDVIState *s, hwaddr addr) amdvi_encode_event(evt, 0, addr, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } /* log an illegal command event @@ -308,7 +310,7 @@ static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR; amdvi_encode_event(evt, devid, addr, info); amdvi_log_event(s, evt); - pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS, + pci_word_test_and_set_mask(s->pci->dev.config + PCI_STATUS, PCI_STATUS_SIG_TARGET_ABORT); } @@ -357,12 +359,12 @@ static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, uint64_t gpa, IOMMUTLBEntry to_cache, uint16_t domid) { - AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); - uint64_t *key = g_new(uint64_t, 1); - uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; - /* don't cache erroneous translations */ if (to_cache.perm != IOMMU_NONE) { + AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); + uint64_t *key = g_new(uint64_t, 1); + uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), gpa, to_cache.translated_addr); @@ -430,6 +432,12 @@ static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd) trace_amdvi_ppr_exec(); } +static void amdvi_intremap_inval_notify_all(AMDVIState *s, bool global, + uint32_t index, uint32_t mask) +{ + x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask); +} + static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) { if (extract64(cmd[0], 0, 60) || cmd[1]) { @@ -437,6 +445,9 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) s->cmdbuf + s->cmdbuf_head); } + /* Notify global invalidation */ + amdvi_intremap_inval_notify_all(s, true, 0, 0); + amdvi_iotlb_reset(s); trace_amdvi_all_inval(); } @@ -485,6 +496,9 @@ static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd) return; } + /* Notify global invalidation */ + amdvi_intremap_inval_notify_all(s, true, 0, 0); + trace_amdvi_intr_inval(); } @@ -1295,15 +1309,15 @@ static int amdvi_int_remap_msi(AMDVIState *iommu, ret = -AMDVI_IR_ERR; break; case AMDVI_IOAPIC_INT_TYPE_NMI: - pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK; + pass = dte[2] & AMDVI_DEV_NMI_PASS_MASK; trace_amdvi_ir_delivery_mode("nmi"); break; case AMDVI_IOAPIC_INT_TYPE_INIT: - pass = dte[3] & AMDVI_DEV_INT_PASS_MASK; + pass = dte[2] & AMDVI_DEV_INT_PASS_MASK; trace_amdvi_ir_delivery_mode("init"); break; case AMDVI_IOAPIC_INT_TYPE_EINT: - pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK; + pass = dte[2] & AMDVI_DEV_EINT_PASS_MASK; trace_amdvi_ir_delivery_mode("eint"); break; default: @@ -1436,13 +1450,13 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) * Memory region relationships looks like (Address range shows * only lower 32 bits to make it short in length...): * - * |-----------------+-------------------+----------| - * | Name | Address range | Priority | - * |-----------------+-------------------+----------+ - * | amdvi_root | 00000000-ffffffff | 0 | - * | amdvi_iommu | 00000000-ffffffff | 1 | - * | amdvi_iommu_ir | fee00000-feefffff | 64 | - * |-----------------+-------------------+----------| + * |--------------------+-------------------+----------| + * | Name | Address range | Priority | + * |--------------------+-------------------+----------+ + * | amdvi-root | 00000000-ffffffff | 0 | + * | amdvi-iommu_nodma | 00000000-ffffffff | 0 | + * | amdvi-iommu_ir | fee00000-feefffff | 1 | + * |--------------------+-------------------+----------| */ memory_region_init_iommu(&amdvi_dev_as->iommu, sizeof(amdvi_dev_as->iommu), @@ -1452,16 +1466,27 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) memory_region_init(&amdvi_dev_as->root, OBJECT(s), "amdvi_root", UINT64_MAX); address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name); - memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s), - &amdvi_ir_ops, s, "amd_iommu_ir", - AMDVI_INT_ADDR_SIZE); - memory_region_add_subregion_overlap(&amdvi_dev_as->root, - AMDVI_INT_ADDR_FIRST, - &amdvi_dev_as->iommu_ir, - 64); memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, MEMORY_REGION(&amdvi_dev_as->iommu), - 1); + 0); + + /* Build the DMA Disabled alias to shared memory */ + memory_region_init_alias(&amdvi_dev_as->iommu_nodma, OBJECT(s), + "amdvi-sys", &s->mr_sys, 0, + memory_region_size(&s->mr_sys)); + memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0, + &amdvi_dev_as->iommu_nodma, + 0); + /* Build the Interrupt Remapping alias to shared memory */ + memory_region_init_alias(&amdvi_dev_as->iommu_ir, OBJECT(s), + "amdvi-ir", &s->mr_ir, 0, + memory_region_size(&s->mr_ir)); + memory_region_add_subregion_overlap(MEMORY_REGION(&amdvi_dev_as->iommu), + AMDVI_INT_ADDR_FIRST, + &amdvi_dev_as->iommu_ir, 1); + + memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); + memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), true); } return &iommu_as[devfn]->as; } @@ -1560,9 +1585,9 @@ static void amdvi_pci_realize(PCIDevice *pdev, Error **errp) /* reset AMDVI specific capabilities, all r/o */ pci_set_long(pdev->config + s->capab_offset, AMDVI_CAPAB_FEATURES); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_LOW, - AMDVI_BASE_ADDR & ~(0xffff0000)); + AMDVI_BASE_ADDR & MAKE_64BIT_MASK(14, 18)); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH, - (AMDVI_BASE_ADDR & ~(0xffff)) >> 16); + AMDVI_BASE_ADDR >> 32); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_RANGE, 0xff000000); pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_MISC, 0); @@ -1574,41 +1599,137 @@ static void amdvi_sysbus_reset(DeviceState *dev) { AMDVIState *s = AMD_IOMMU_DEVICE(dev); - msi_reset(&s->pci.dev); + msi_reset(&s->pci->dev); amdvi_init(s); } +static const VMStateDescription vmstate_amdvi_sysbus_migratable = { + .name = "amd-iommu", + .version_id = 1, + .minimum_version_id = 1, + .priority = MIG_PRI_IOMMU, + .fields = (VMStateField[]) { + /* Updated in amdvi_handle_control_write() */ + VMSTATE_BOOL(enabled, AMDVIState), + VMSTATE_BOOL(ga_enabled, AMDVIState), + VMSTATE_BOOL(ats_enabled, AMDVIState), + VMSTATE_BOOL(cmdbuf_enabled, AMDVIState), + VMSTATE_BOOL(completion_wait_intr, AMDVIState), + VMSTATE_BOOL(evtlog_enabled, AMDVIState), + VMSTATE_BOOL(evtlog_intr, AMDVIState), + /* Updated in amdvi_handle_devtab_write() */ + VMSTATE_UINT64(devtab, AMDVIState), + VMSTATE_UINT64(devtab_len, AMDVIState), + /* Updated in amdvi_handle_cmdbase_write() */ + VMSTATE_UINT64(cmdbuf, AMDVIState), + VMSTATE_UINT64(cmdbuf_len, AMDVIState), + /* Updated in amdvi_handle_cmdhead_write() */ + VMSTATE_UINT32(cmdbuf_head, AMDVIState), + /* Updated in amdvi_handle_cmdtail_write() */ + VMSTATE_UINT32(cmdbuf_tail, AMDVIState), + /* Updated in amdvi_handle_evtbase_write() */ + VMSTATE_UINT64(evtlog, AMDVIState), + VMSTATE_UINT32(evtlog_len, AMDVIState), + /* Updated in amdvi_handle_evthead_write() */ + VMSTATE_UINT32(evtlog_head, AMDVIState), + /* Updated in amdvi_handle_evttail_write() */ + VMSTATE_UINT32(evtlog_tail, AMDVIState), + /* Updated in amdvi_handle_pprbase_write() */ + VMSTATE_UINT64(ppr_log, AMDVIState), + VMSTATE_UINT32(pprlog_len, AMDVIState), + /* Updated in amdvi_handle_pprhead_write() */ + VMSTATE_UINT32(pprlog_head, AMDVIState), + /* Updated in amdvi_handle_tailhead_write() */ + VMSTATE_UINT32(pprlog_tail, AMDVIState), + /* MMIO registers */ + VMSTATE_UINT8_ARRAY(mmior, AMDVIState, AMDVI_MMIO_SIZE), + VMSTATE_UINT8_ARRAY(romask, AMDVIState, AMDVI_MMIO_SIZE), + VMSTATE_UINT8_ARRAY(w1cmask, AMDVIState, AMDVI_MMIO_SIZE), + VMSTATE_END_OF_LIST() + } +}; + static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) { + DeviceClass *dc = (DeviceClass *) object_get_class(OBJECT(dev)); AMDVIState *s = AMD_IOMMU_DEVICE(dev); MachineState *ms = MACHINE(qdev_get_machine()); PCMachineState *pcms = PC_MACHINE(ms); X86MachineState *x86ms = X86_MACHINE(ms); PCIBus *bus = pcms->pcibus; - s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, - amdvi_uint64_equal, g_free, g_free); + if (s->pci_id) { + PCIDevice *pdev = NULL; + int ret = pci_qdev_find_device(s->pci_id, &pdev); - /* This device should take care of IOMMU PCI properties */ - if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) { - return; + if (ret) { + error_report("Cannot find PCI device '%s'", s->pci_id); + return; + } + + if (!object_dynamic_cast(OBJECT(pdev), TYPE_AMD_IOMMU_PCI)) { + error_report("Device '%s' must be an AMDVI-PCI device type", s->pci_id); + return; + } + + s->pci = AMD_IOMMU_PCI(pdev); + dc->vmsd = &vmstate_amdvi_sysbus_migratable; + } else { + s->pci = AMD_IOMMU_PCI(object_new(TYPE_AMD_IOMMU_PCI)); + /* This device should take care of IOMMU PCI properties */ + if (!qdev_realize(DEVICE(s->pci), &bus->qbus, errp)) { + return; + } } + s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, + amdvi_uint64_equal, g_free, g_free); + /* Pseudo address space under root PCI bus. */ x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); /* set up MMIO */ - memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio", - AMDVI_MMIO_SIZE); + memory_region_init_io(&s->mr_mmio, OBJECT(s), &mmio_mem_ops, s, + "amdvi-mmio", AMDVI_MMIO_SIZE); memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR, - &s->mmio); + &s->mr_mmio); + + /* Create the share memory regions by all devices */ + memory_region_init(&s->mr_sys, OBJECT(s), "amdvi-sys", UINT64_MAX); + + /* set up the DMA disabled memory region */ + memory_region_init_alias(&s->mr_nodma, OBJECT(s), + "amdvi-nodma", get_system_memory(), 0, + memory_region_size(get_system_memory())); + memory_region_add_subregion_overlap(&s->mr_sys, 0, + &s->mr_nodma, 0); + + /* set up the Interrupt Remapping memory region */ + memory_region_init_io(&s->mr_ir, OBJECT(s), &amdvi_ir_ops, + s, "amdvi-ir", AMDVI_INT_ADDR_SIZE); + memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST, + &s->mr_ir, 1); + + if (kvm_enabled() && x86ms->apic_id_limit > 255 && !s->xtsup) { + error_report("AMD IOMMU with x2APIC configuration requires xtsup=on"); + exit(EXIT_FAILURE); + } + + if (s->xtsup) { + if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) { + error_report("AMD IOMMU xtsup=on requires x2APIC support on " + "the KVM side"); + exit(EXIT_FAILURE); + } + } + pci_setup_iommu(bus, &amdvi_iommu_ops, s); amdvi_init(s); } -static Property amdvi_properties[] = { +static const Property amdvi_properties[] = { DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false), - DEFINE_PROP_END_OF_LIST(), + DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id), }; static const VMStateDescription vmstate_amdvi_sysbus = { @@ -1616,25 +1737,16 @@ static const VMStateDescription vmstate_amdvi_sysbus = { .unmigratable = 1 }; -static void amdvi_sysbus_instance_init(Object *klass) -{ - AMDVIState *s = AMD_IOMMU_DEVICE(klass); - - object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI); -} - -static void amdvi_sysbus_class_init(ObjectClass *klass, void *data) +static void amdvi_sysbus_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); X86IOMMUClass *dc_class = X86_IOMMU_DEVICE_CLASS(klass); - dc->reset = amdvi_sysbus_reset; + device_class_set_legacy_reset(dc, amdvi_sysbus_reset); dc->vmsd = &vmstate_amdvi_sysbus; dc->hotpluggable = false; dc_class->realize = amdvi_sysbus_realize; dc_class->int_remap = amdvi_int_remap; - /* Supported by the pc-q35-* machine types */ - dc->user_creatable = true; set_bit(DEVICE_CATEGORY_MISC, dc->categories); dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device"; device_class_set_props(dc, amdvi_properties); @@ -1644,16 +1756,16 @@ static const TypeInfo amdvi_sysbus = { .name = TYPE_AMD_IOMMU_DEVICE, .parent = TYPE_X86_IOMMU_DEVICE, .instance_size = sizeof(AMDVIState), - .instance_init = amdvi_sysbus_instance_init, .class_init = amdvi_sysbus_class_init }; -static void amdvi_pci_class_init(ObjectClass *klass, void *data) +static void amdvi_pci_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); k->vendor_id = PCI_VENDOR_ID_AMD; + k->device_id = 0x1419; k->class_id = 0x0806; k->realize = amdvi_pci_realize; @@ -1666,13 +1778,14 @@ static const TypeInfo amdvi_pci = { .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(AMDVIPCIState), .class_init = amdvi_pci_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, }, }; -static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data) +static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, + const void *data) { IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 73619fe..5672bde 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -187,7 +187,7 @@ AMDVI_CAPAB_FLAG_HTTUNNEL | AMDVI_CAPAB_EFR_SUP) /* AMDVI default address */ -#define AMDVI_BASE_ADDR 0xfed80000 +#define AMDVI_BASE_ADDR 0xfed80000ULL /* page management constants */ #define AMDVI_PAGE_SHIFT 12 @@ -315,7 +315,8 @@ struct AMDVIPCIState { struct AMDVIState { X86IOMMUState iommu; /* IOMMU bus device */ - AMDVIPCIState pci; /* IOMMU PCI device */ + AMDVIPCIState *pci; /* IOMMU PCI device */ + char *pci_id; /* ID of AMDVI-PCI device, if user created */ uint32_t version; @@ -328,7 +329,7 @@ struct AMDVIState { bool excl_enabled; hwaddr devtab; /* base address device table */ - size_t devtab_len; /* device table length */ + uint64_t devtab_len; /* device table length */ hwaddr cmdbuf; /* command buffer base address */ uint64_t cmdbuf_len; /* command buffer length */ @@ -353,7 +354,10 @@ struct AMDVIState { uint32_t pprlog_head; /* ppr log head */ uint32_t pprlog_tail; /* ppr log tail */ - MemoryRegion mmio; /* MMIO region */ + MemoryRegion mr_mmio; /* MMIO region */ + MemoryRegion mr_sys; + MemoryRegion mr_nodma; + MemoryRegion mr_ir; uint8_t mmior[AMDVI_MMIO_SIZE]; /* read/write MMIO */ uint8_t w1cmask[AMDVI_MMIO_SIZE]; /* read/write 1 clear mask */ uint8_t romask[AMDVI_MMIO_SIZE]; /* MMIO read/only mask */ diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c index 0e44946..5c0bcd5 100644 --- a/hw/i386/fw_cfg.c +++ b/hw/i386/fw_cfg.c @@ -13,7 +13,7 @@ */ #include "qemu/osdep.h" -#include "sysemu/numa.h" +#include "system/numa.h" #include "hw/acpi/acpi.h" #include "hw/acpi/aml-build.h" #include "hw/firmware/smbios.h" @@ -26,7 +26,9 @@ #include CONFIG_DEVICES #include "target/i386/cpu.h" -struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX}; +#if !defined(CONFIG_HPET) +struct hpet_fw_config hpet_fw_cfg = {.count = UINT8_MAX}; +#endif const char *fw_cfg_arch_key_name(uint16_t key) { @@ -143,13 +145,13 @@ FWCfgState *fw_cfg_arch_create(MachineState *ms, */ fw_cfg_add_i16(fw_cfg, FW_CFG_MAX_CPUS, apic_id_limit); fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, ms->ram_size); -#ifdef CONFIG_ACPI - fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, - acpi_tables, acpi_tables_len); -#endif + if (acpi_builtin()) { + fw_cfg_add_bytes(fw_cfg, FW_CFG_ACPI_TABLES, + acpi_tables, acpi_tables_len); + } fw_cfg_add_i32(fw_cfg, FW_CFG_IRQ0_OVERRIDE, 1); - fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_cfg, sizeof(hpet_cfg)); + fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, &hpet_fw_cfg, sizeof(hpet_fw_cfg)); /* allocate memory for the NUMA channel: one (64bit) word for the number * of nodes, one word for each VCPU->node and one word for each node to * hold the amount of memory. diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 37c21a0a..69d72ad 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -32,9 +32,9 @@ #include "hw/i386/apic-msidef.h" #include "hw/i386/x86-iommu.h" #include "hw/pci-host/q35.h" -#include "sysemu/kvm.h" -#include "sysemu/dma.h" -#include "sysemu/sysemu.h" +#include "system/kvm.h" +#include "system/dma.h" +#include "system/system.h" #include "hw/i386/apic_internal.h" #include "kvm/kvm_i386.h" #include "migration/vmstate.h" @@ -48,7 +48,10 @@ /* pe operations */ #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) -#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) +#define VTD_PE_GET_FL_LEVEL(pe) \ + (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM)) +#define VTD_PE_GET_SL_LEVEL(pe) \ + (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) /* * PCI bus number (or SID) is not reliable since the device is usaully @@ -67,6 +70,11 @@ struct vtd_hiod_key { uint8_t devfn; }; +struct vtd_as_raw_key { + uint16_t sid; + uint32_t pasid; +}; + struct vtd_iotlb_key { uint64_t gfn; uint32_t pasid; @@ -284,15 +292,15 @@ static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, } /* The shift of an addr for a certain level of paging structure */ -static inline uint32_t vtd_slpt_level_shift(uint32_t level) +static inline uint32_t vtd_pt_level_shift(uint32_t level) { assert(level != 0); - return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS; + return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_LEVEL_BITS; } -static inline uint64_t vtd_slpt_level_page_mask(uint32_t level) +static inline uint64_t vtd_pt_level_page_mask(uint32_t level) { - return ~((1ULL << vtd_slpt_level_shift(level)) - 1); + return ~((1ULL << vtd_pt_level_shift(level)) - 1); } static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, @@ -302,9 +310,43 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; - return (entry->domain_id == info->domain_id) && - (((entry->gfn & info->mask) == gfn) || - (entry->gfn == gfn_tlb)); + + if (entry->domain_id != info->domain_id) { + return false; + } + + /* + * According to spec, IOTLB entries caching first-stage (PGTT=001b) or + * nested (PGTT=011b) mapping associated with specified domain-id are + * invalidated. Nested isn't supported yet, so only need to check 001b. + */ + if (entry->pgtt == VTD_SM_PASID_ENTRY_FLT) { + return true; + } + + return (entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb; +} + +static gboolean vtd_hash_remove_by_page_piotlb(gpointer key, gpointer value, + gpointer user_data) +{ + VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; + VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; + uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask; + uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K; + + /* + * According to spec, PASID-based-IOTLB Invalidation in page granularity + * doesn't invalidate IOTLB entries caching second-stage (PGTT=010b) + * or pass-through (PGTT=100b) mappings. Nested isn't supported yet, + * so only need to check first-stage (PGTT=001b) mappings. + */ + if (entry->pgtt != VTD_SM_PASID_ENTRY_FLT) { + return false; + } + + return entry->domain_id == info->domain_id && entry->pasid == info->pasid && + ((entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb); } /* Reset all the gen of VTDAddressSpace to zero and set the gen of @@ -349,7 +391,7 @@ static void vtd_reset_caches(IntelIOMMUState *s) static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) { - return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; + return (addr & vtd_pt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } /* Must be called with IOMMU lock held */ @@ -358,9 +400,9 @@ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, { struct vtd_iotlb_key key; VTDIOTLBEntry *entry; - int level; + unsigned level; - for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { + for (level = VTD_PT_LEVEL; level < VTD_PML4_LEVEL; level++) { key.gfn = vtd_get_iotlb_gfn(addr, level); key.level = level; key.sid = source_id; @@ -377,15 +419,15 @@ out: /* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, - uint16_t domain_id, hwaddr addr, uint64_t slpte, + uint16_t domain_id, hwaddr addr, uint64_t pte, uint8_t access_flags, uint32_t level, - uint32_t pasid) + uint32_t pasid, uint8_t pgtt) { VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); struct vtd_iotlb_key *key = g_malloc(sizeof(*key)); uint64_t gfn = vtd_get_iotlb_gfn(addr, level); - trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); + trace_vtd_iotlb_page_update(source_id, addr, pte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); vtd_reset_iotlb_locked(s); @@ -393,10 +435,11 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, entry->gfn = gfn; entry->domain_id = domain_id; - entry->slpte = slpte; + entry->pte = pte; entry->access_flags = access_flags; - entry->mask = vtd_slpt_level_page_mask(level); + entry->mask = vtd_pt_level_page_mask(level); entry->pasid = pasid; + entry->pgtt = pgtt; key->gfn = gfn; key->sid = source_id; @@ -710,32 +753,32 @@ static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce) return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR; } -static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw) +static inline uint64_t vtd_get_pte_addr(uint64_t pte, uint8_t aw) { - return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw); + return pte & VTD_PT_BASE_ADDR_MASK(aw); } /* Whether the pte indicates the address of the page frame */ -static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level) +static inline bool vtd_is_last_pte(uint64_t pte, uint32_t level) { - return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK); + return level == VTD_PT_LEVEL || (pte & VTD_PT_PAGE_SIZE_MASK); } -/* Get the content of a spte located in @base_addr[@index] */ -static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) +/* Get the content of a pte located in @base_addr[@index] */ +static uint64_t vtd_get_pte(dma_addr_t base_addr, uint32_t index) { - uint64_t slpte; + uint64_t pte; - assert(index < VTD_SL_PT_ENTRY_NR); + assert(index < VTD_PT_ENTRY_NR); if (dma_memory_read(&address_space_memory, - base_addr + index * sizeof(slpte), - &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) { - slpte = (uint64_t)-1; - return slpte; + base_addr + index * sizeof(pte), + &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) { + pte = (uint64_t)-1; + return pte; } - slpte = le64_to_cpu(slpte); - return slpte; + pte = le64_to_cpu(pte); + return pte; } /* Given an iova and the level of paging structure, return the offset @@ -743,36 +786,39 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) */ static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level) { - return (iova >> vtd_slpt_level_shift(level)) & - ((1ULL << VTD_SL_LEVEL_BITS) - 1); + return (iova >> vtd_pt_level_shift(level)) & + ((1ULL << VTD_LEVEL_BITS) - 1); } /* Check Capability Register to see if the @level of page-table is supported */ -static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level) +static inline bool vtd_is_sl_level_supported(IntelIOMMUState *s, uint32_t level) { return VTD_CAP_SAGAW_MASK & s->cap & (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT)); } +static inline bool vtd_is_fl_level_supported(IntelIOMMUState *s, uint32_t level) +{ + return level == VTD_PML4_LEVEL; +} + /* Return true if check passed, otherwise false */ -static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu, - VTDPASIDEntry *pe) +static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe) { switch (VTD_PE_GET_TYPE(pe)) { case VTD_SM_PASID_ENTRY_FLT: + return !!(s->ecap & VTD_ECAP_FLTS); case VTD_SM_PASID_ENTRY_SLT: + return !!(s->ecap & VTD_ECAP_SLTS); case VTD_SM_PASID_ENTRY_NESTED: - break; + /* Not support NESTED page table type yet */ + return false; case VTD_SM_PASID_ENTRY_PT: - if (!x86_iommu->pt_supported) { - return false; - } - break; + return !!(s->ecap & VTD_ECAP_PT); default: /* Unknown type */ return false; } - return true; } static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire) @@ -796,7 +842,7 @@ static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, addr = pasid_dir_base + index * entry_size; if (dma_memory_read(&address_space_memory, addr, pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) { - return -VTD_FR_PASID_TABLE_INV; + return -VTD_FR_PASID_DIR_ACCESS_ERR; } pdire->val = le64_to_cpu(pdire->val); @@ -814,28 +860,35 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, dma_addr_t addr, VTDPASIDEntry *pe) { + uint8_t pgtt; uint32_t index; dma_addr_t entry_size; - X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); index = VTD_PASID_TABLE_INDEX(pasid); entry_size = VTD_PASID_ENTRY_SIZE; addr = addr + index * entry_size; if (dma_memory_read(&address_space_memory, addr, pe, entry_size, MEMTXATTRS_UNSPECIFIED)) { - return -VTD_FR_PASID_TABLE_INV; + return -VTD_FR_PASID_TABLE_ACCESS_ERR; } for (size_t i = 0; i < ARRAY_SIZE(pe->val); i++) { pe->val[i] = le64_to_cpu(pe->val[i]); } /* Do translation type check */ - if (!vtd_pe_type_check(x86_iommu, pe)) { - return -VTD_FR_PASID_TABLE_INV; + if (!vtd_pe_type_check(s, pe)) { + return -VTD_FR_PASID_TABLE_ENTRY_INV; } - if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) { - return -VTD_FR_PASID_TABLE_INV; + pgtt = VTD_PE_GET_TYPE(pe); + if (pgtt == VTD_SM_PASID_ENTRY_SLT && + !vtd_is_sl_level_supported(s, VTD_PE_GET_SL_LEVEL(pe))) { + return -VTD_FR_PASID_TABLE_ENTRY_INV; + } + + if (pgtt == VTD_SM_PASID_ENTRY_FLT && + !vtd_is_fl_level_supported(s, VTD_PE_GET_FL_LEVEL(pe))) { + return -VTD_FR_PASID_TABLE_ENTRY_INV; } return 0; @@ -876,7 +929,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, } if (!vtd_pdire_present(&pdire)) { - return -VTD_FR_PASID_TABLE_INV; + return -VTD_FR_PASID_DIR_ENTRY_P; } ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe); @@ -885,7 +938,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, } if (!vtd_pe_present(pe)) { - return -VTD_FR_PASID_TABLE_INV; + return -VTD_FR_PASID_ENTRY_P; } return 0; @@ -938,7 +991,7 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, } if (!vtd_pdire_present(&pdire)) { - return -VTD_FR_PASID_TABLE_INV; + return -VTD_FR_PASID_DIR_ENTRY_P; } /* @@ -973,7 +1026,11 @@ static uint32_t vtd_get_iova_level(IntelIOMMUState *s, if (s->root_scalable) { vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid); - return VTD_PE_GET_LEVEL(&pe); + if (s->flts) { + return VTD_PE_GET_FL_LEVEL(&pe); + } else { + return VTD_PE_GET_SL_LEVEL(&pe); + } } return vtd_ce_get_level(ce); @@ -1041,9 +1098,9 @@ static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, } /* Return true if IOVA passes range check, otherwise false. */ -static inline bool vtd_iova_range_check(IntelIOMMUState *s, - uint64_t iova, VTDContextEntry *ce, - uint8_t aw, uint32_t pasid) +static inline bool vtd_iova_sl_range_check(IntelIOMMUState *s, + uint64_t iova, VTDContextEntry *ce, + uint8_t aw, uint32_t pasid) { /* * Check if @iova is above 2^X-1, where X is the minimum of MGAW @@ -1060,7 +1117,11 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, if (s->root_scalable) { vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid); - return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; + if (s->flts) { + return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR; + } else { + return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; + } } return vtd_ce_get_slpt_base(ce); @@ -1084,17 +1145,17 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) /* * We should have caught a guest-mis-programmed level earlier, - * via vtd_is_level_supported. + * via vtd_is_sl_level_supported. */ assert(level < VTD_SPTE_RSVD_LEN); /* - * Zero level doesn't exist. The smallest level is VTD_SL_PT_LEVEL=1 and - * checked by vtd_is_last_slpte(). + * Zero level doesn't exist. The smallest level is VTD_PT_LEVEL=1 and + * checked by vtd_is_last_pte(). */ assert(level); - if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && - (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { + if ((level == VTD_PD_LEVEL || level == VTD_PDP_LEVEL) && + (slpte & VTD_PT_PAGE_SIZE_MASK)) { /* large page */ rsvd_mask = vtd_spte_rsvd_large[level]; } else { @@ -1118,9 +1179,8 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, uint32_t offset; uint64_t slpte; uint64_t access_right_check; - uint64_t xlat, size; - if (!vtd_iova_range_check(s, iova, ce, aw_bits, pasid)) { + if (!vtd_iova_sl_range_check(s, iova, ce, aw_bits, pasid)) { error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 "," "pasid=0x%" PRIx32 ")", __func__, iova, pasid); return -VTD_FR_ADDR_BEYOND_MGAW; @@ -1131,7 +1191,7 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, while (true) { offset = vtd_iova_level_offset(iova, level); - slpte = vtd_get_slpte(addr, offset); + slpte = vtd_get_pte(addr, offset); if (slpte == (uint64_t)-1) { error_report_once("%s: detected read error on DMAR slpte " @@ -1162,37 +1222,16 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, return -VTD_FR_PAGING_ENTRY_RSVD; } - if (vtd_is_last_slpte(slpte, level)) { + if (vtd_is_last_pte(slpte, level)) { *slptep = slpte; *slpte_level = level; break; } - addr = vtd_get_slpte_addr(slpte, aw_bits); + addr = vtd_get_pte_addr(slpte, aw_bits); level--; } - xlat = vtd_get_slpte_addr(*slptep, aw_bits); - size = ~vtd_slpt_level_page_mask(level) + 1; - - /* - * From VT-d spec 3.14: Untranslated requests and translation - * requests that result in an address in the interrupt range will be - * blocked with condition code LGN.4 or SGN.8. - */ - if ((xlat > VTD_INTERRUPT_ADDR_LAST || - xlat + size - 1 < VTD_INTERRUPT_ADDR_FIRST)) { - return 0; - } else { - error_report_once("%s: xlat address is in interrupt range " - "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " - "slpte=0x%" PRIx64 ", write=%d, " - "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ", " - "pasid=0x%" PRIx32 ")", - __func__, iova, level, slpte, is_write, - xlat, size, pasid); - return s->scalable_mode ? -VTD_FR_SM_INTERRUPT_ADDR : - -VTD_FR_INTERRUPT_ADDR; - } + return 0; } typedef int (*vtd_page_walk_hook)(const IOMMUTLBEvent *event, void *private); @@ -1323,14 +1362,14 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, trace_vtd_page_walk_level(addr, level, start, end); - subpage_size = 1ULL << vtd_slpt_level_shift(level); - subpage_mask = vtd_slpt_level_page_mask(level); + subpage_size = 1ULL << vtd_pt_level_shift(level); + subpage_mask = vtd_pt_level_page_mask(level); while (iova < end) { iova_next = (iova & subpage_mask) + subpage_size; offset = vtd_iova_level_offset(iova, level); - slpte = vtd_get_slpte(addr, offset); + slpte = vtd_get_pte(addr, offset); if (slpte == (uint64_t)-1) { trace_vtd_page_walk_skip_read(iova, iova_next); @@ -1353,12 +1392,12 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + if (!vtd_is_last_pte(slpte, level) && entry_valid) { /* * This is a valid PDE (or even bigger than PDE). We need * to walk one further level. */ - ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + ret = vtd_page_walk_level(vtd_get_pte_addr(slpte, info->aw), iova, MIN(iova_next, end), level - 1, read_cur, write_cur, info); } else { @@ -1375,7 +1414,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); event.entry.addr_mask = ~subpage_mask; /* NOTE: this is only meaningful if entry_valid == true */ - event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + event.entry.translated_addr = vtd_get_pte_addr(slpte, info->aw); event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP : IOMMU_NOTIFIER_UNMAP; ret = vtd_page_walk_one(&event, info); @@ -1409,11 +1448,11 @@ static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid); uint32_t level = vtd_get_iova_level(s, ce, pasid); - if (!vtd_iova_range_check(s, start, ce, info->aw, pasid)) { + if (!vtd_iova_sl_range_check(s, start, ce, info->aw, pasid)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(s, end, ce, info->aw, pasid)) { + if (!vtd_iova_sl_range_check(s, end, ce, info->aw, pasid)) { /* Fix end so that it reaches the maximum */ end = vtd_iova_limit(s, ce, info->aw, pasid); } @@ -1528,7 +1567,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, /* Check if the programming of context-entry is valid */ if (!s->root_scalable && - !vtd_is_level_supported(s, vtd_ce_get_level(ce))) { + !vtd_is_sl_level_supported(s, vtd_ce_get_level(ce))) { error_report_once("%s: invalid context entry: hi=%"PRIx64 ", lo=%"PRIx64" (level %d not supported)", __func__, ce->hi, ce->lo, @@ -1689,8 +1728,6 @@ static bool vtd_as_pt_enabled(VTDAddressSpace *as) static bool vtd_switch_address_space(VTDAddressSpace *as) { bool use_iommu, pt; - /* Whether we need to take the BQL on our own */ - bool take_bql = !bql_locked(); assert(as); @@ -1707,9 +1744,7 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) * from vtd_pt_enable_fast_path(). However the memory APIs need * it. We'd better make sure we have had it already, or, take it. */ - if (take_bql) { - bql_lock(); - } + BQL_LOCK_GUARD(); /* Turn off first then on the other */ if (use_iommu) { @@ -1762,10 +1797,6 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) memory_region_set_enabled(&as->iommu_ir_fault, false); } - if (take_bql) { - bql_unlock(); - } - return use_iommu; } @@ -1795,8 +1826,20 @@ static const bool vtd_qualified_faults[] = { [VTD_FR_ROOT_ENTRY_RSVD] = false, [VTD_FR_PAGING_ENTRY_RSVD] = true, [VTD_FR_CONTEXT_ENTRY_TT] = true, - [VTD_FR_PASID_TABLE_INV] = false, + [VTD_FR_PASID_DIR_ACCESS_ERR] = false, + [VTD_FR_PASID_DIR_ENTRY_P] = true, + [VTD_FR_PASID_TABLE_ACCESS_ERR] = false, + [VTD_FR_PASID_ENTRY_P] = true, + [VTD_FR_PASID_TABLE_ENTRY_INV] = true, + [VTD_FR_FS_PAGING_ENTRY_INV] = true, + [VTD_FR_FS_PAGING_ENTRY_P] = true, + [VTD_FR_FS_PAGING_ENTRY_RSVD] = true, + [VTD_FR_PASID_ENTRY_FSPTPTR_INV] = true, + [VTD_FR_FS_NON_CANONICAL] = true, + [VTD_FR_FS_PAGING_ENTRY_US] = true, + [VTD_FR_SM_WRITE] = true, [VTD_FR_SM_INTERRUPT_ADDR] = true, + [VTD_FR_FS_BIT_UPDATE_FAILED] = true, [VTD_FR_MAX] = false, }; @@ -1814,29 +1857,32 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; } -static gboolean vtd_find_as_by_sid(gpointer key, gpointer value, - gpointer user_data) +static gboolean vtd_find_as_by_sid_and_pasid(gpointer key, gpointer value, + gpointer user_data) { struct vtd_as_key *as_key = (struct vtd_as_key *)key; - uint16_t target_sid = *(uint16_t *)user_data; + struct vtd_as_raw_key *target = (struct vtd_as_raw_key *)user_data; uint16_t sid = PCI_BUILD_BDF(pci_bus_num(as_key->bus), as_key->devfn); - return sid == target_sid; + + return (as_key->pasid == target->pasid) && (sid == target->sid); } -static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid) +static VTDAddressSpace *vtd_get_as_by_sid_and_pasid(IntelIOMMUState *s, + uint16_t sid, + uint32_t pasid) { - uint8_t bus_num = PCI_BUS_NUM(sid); - VTDAddressSpace *vtd_as = s->vtd_as_cache[bus_num]; - - if (vtd_as && - (sid == PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn))) { - return vtd_as; - } + struct vtd_as_raw_key key = { + .sid = sid, + .pasid = pasid + }; - vtd_as = g_hash_table_find(s->vtd_address_spaces, vtd_find_as_by_sid, &sid); - s->vtd_as_cache[bus_num] = vtd_as; + return g_hash_table_find(s->vtd_address_spaces, + vtd_find_as_by_sid_and_pasid, &key); +} - return vtd_as; +static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid) +{ + return vtd_get_as_by_sid_and_pasid(s, sid, PCI_NO_PASID); } static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) @@ -1858,6 +1904,157 @@ out: trace_vtd_pt_enable_fast_path(source_id, success); } +/* + * Rsvd field masks for fpte: + * vtd_fpte_rsvd 4k pages + * vtd_fpte_rsvd_large large pages + * + * We support only 4-level page tables. + */ +#define VTD_FPTE_RSVD_LEN 5 +static uint64_t vtd_fpte_rsvd[VTD_FPTE_RSVD_LEN]; +static uint64_t vtd_fpte_rsvd_large[VTD_FPTE_RSVD_LEN]; + +static bool vtd_flpte_nonzero_rsvd(uint64_t flpte, uint32_t level) +{ + uint64_t rsvd_mask; + + /* + * We should have caught a guest-mis-programmed level earlier, + * via vtd_is_fl_level_supported. + */ + assert(level < VTD_FPTE_RSVD_LEN); + /* + * Zero level doesn't exist. The smallest level is VTD_PT_LEVEL=1 and + * checked by vtd_is_last_pte(). + */ + assert(level); + + if ((level == VTD_PD_LEVEL || level == VTD_PDP_LEVEL) && + (flpte & VTD_PT_PAGE_SIZE_MASK)) { + /* large page */ + rsvd_mask = vtd_fpte_rsvd_large[level]; + } else { + rsvd_mask = vtd_fpte_rsvd[level]; + } + + return flpte & rsvd_mask; +} + +static inline bool vtd_flpte_present(uint64_t flpte) +{ + return !!(flpte & VTD_FL_P); +} + +/* Return true if IOVA is canonical, otherwise false. */ +static bool vtd_iova_fl_check_canonical(IntelIOMMUState *s, uint64_t iova, + VTDContextEntry *ce, uint32_t pasid) +{ + uint64_t iova_limit = vtd_iova_limit(s, ce, s->aw_bits, pasid); + uint64_t upper_bits_mask = ~(iova_limit - 1); + uint64_t upper_bits = iova & upper_bits_mask; + bool msb = ((iova & (iova_limit >> 1)) != 0); + + if (msb) { + return upper_bits == upper_bits_mask; + } else { + return !upper_bits; + } +} + +static MemTxResult vtd_set_flag_in_pte(dma_addr_t base_addr, uint32_t index, + uint64_t pte, uint64_t flag) +{ + if (pte & flag) { + return MEMTX_OK; + } + pte |= flag; + pte = cpu_to_le64(pte); + return dma_memory_write(&address_space_memory, + base_addr + index * sizeof(pte), + &pte, sizeof(pte), + MEMTXATTRS_UNSPECIFIED); +} + +/* + * Given the @iova, get relevant @flptep. @flpte_level will be the last level + * of the translation, can be used for deciding the size of large page. + */ +static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce, + uint64_t iova, bool is_write, + uint64_t *flptep, uint32_t *flpte_level, + bool *reads, bool *writes, uint8_t aw_bits, + uint32_t pasid) +{ + dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid); + uint32_t level = vtd_get_iova_level(s, ce, pasid); + uint32_t offset; + uint64_t flpte, flag_ad = VTD_FL_A; + + if (!vtd_iova_fl_check_canonical(s, iova, ce, pasid)) { + error_report_once("%s: detected non canonical IOVA (iova=0x%" PRIx64 "," + "pasid=0x%" PRIx32 ")", __func__, iova, pasid); + return -VTD_FR_FS_NON_CANONICAL; + } + + while (true) { + offset = vtd_iova_level_offset(iova, level); + flpte = vtd_get_pte(addr, offset); + + if (flpte == (uint64_t)-1) { + if (level == vtd_get_iova_level(s, ce, pasid)) { + /* Invalid programming of pasid-entry */ + return -VTD_FR_PASID_ENTRY_FSPTPTR_INV; + } else { + return -VTD_FR_FS_PAGING_ENTRY_INV; + } + } + + if (!vtd_flpte_present(flpte)) { + *reads = false; + *writes = false; + return -VTD_FR_FS_PAGING_ENTRY_P; + } + + /* No emulated device supports supervisor privilege request yet */ + if (!(flpte & VTD_FL_US)) { + *reads = false; + *writes = false; + return -VTD_FR_FS_PAGING_ENTRY_US; + } + + *reads = true; + *writes = (*writes) && (flpte & VTD_FL_RW); + if (is_write && !(flpte & VTD_FL_RW)) { + return -VTD_FR_SM_WRITE; + } + if (vtd_flpte_nonzero_rsvd(flpte, level)) { + error_report_once("%s: detected flpte reserved non-zero " + "iova=0x%" PRIx64 ", level=0x%" PRIx32 + "flpte=0x%" PRIx64 ", pasid=0x%" PRIX32 ")", + __func__, iova, level, flpte, pasid); + return -VTD_FR_FS_PAGING_ENTRY_RSVD; + } + + if (vtd_is_last_pte(flpte, level) && is_write) { + flag_ad |= VTD_FL_D; + } + + if (vtd_set_flag_in_pte(addr, offset, flpte, flag_ad) != MEMTX_OK) { + return -VTD_FR_FS_BIT_UPDATE_FAILED; + } + + if (vtd_is_last_pte(flpte, level)) { + *flptep = flpte; + *flpte_level = level; + return 0; + } + + addr = vtd_get_pte_addr(flpte, aw_bits); + level--; + } +} + static void vtd_report_fault(IntelIOMMUState *s, int err, bool is_fpd_set, uint16_t source_id, @@ -1894,16 +2091,17 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); VTDContextCacheEntry *cc_entry; - uint64_t slpte, page_mask; + uint64_t pte, page_mask; uint32_t level, pasid = vtd_as->pasid; uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn); int ret_fr; bool is_fpd_set = false; bool reads = true; bool writes = true; - uint8_t access_flags; + uint8_t access_flags, pgtt; bool rid2pasid = (pasid == PCI_NO_PASID) && s->root_scalable; VTDIOTLBEntry *iotlb_entry; + uint64_t xlat, size; /* * We have standalone memory region for interrupt addresses, we @@ -1915,13 +2113,13 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, cc_entry = &vtd_as->context_cache_entry; - /* Try to fetch slpte form IOTLB, we don't need RID2PASID logic */ + /* Try to fetch pte from IOTLB, we don't need RID2PASID logic */ if (!rid2pasid) { iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr); if (iotlb_entry) { - trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->pte, iotlb_entry->domain_id); - slpte = iotlb_entry->slpte; + pte = iotlb_entry->pte; access_flags = iotlb_entry->access_flags; page_mask = iotlb_entry->mask; goto out; @@ -1993,35 +2191,65 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, return true; } - /* Try to fetch slpte form IOTLB for RID2PASID slow path */ + /* Try to fetch pte from IOTLB for RID2PASID slow path */ if (rid2pasid) { iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr); if (iotlb_entry) { - trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->pte, iotlb_entry->domain_id); - slpte = iotlb_entry->slpte; + pte = iotlb_entry->pte; access_flags = iotlb_entry->access_flags; page_mask = iotlb_entry->mask; goto out; } } - ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, - &reads, &writes, s->aw_bits, pasid); + if (s->flts && s->root_scalable) { + ret_fr = vtd_iova_to_flpte(s, &ce, addr, is_write, &pte, &level, + &reads, &writes, s->aw_bits, pasid); + pgtt = VTD_SM_PASID_ENTRY_FLT; + } else { + ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level, + &reads, &writes, s->aw_bits, pasid); + pgtt = VTD_SM_PASID_ENTRY_SLT; + } + if (!ret_fr) { + xlat = vtd_get_pte_addr(pte, s->aw_bits); + size = ~vtd_pt_level_page_mask(level) + 1; + + /* + * Per VT-d spec 4.1 section 3.15: Untranslated requests and translation + * requests that result in an address in the interrupt range will be + * blocked with condition code LGN.4 or SGN.8. + */ + if ((xlat <= VTD_INTERRUPT_ADDR_LAST && + xlat + size - 1 >= VTD_INTERRUPT_ADDR_FIRST)) { + error_report_once("%s: xlat address is in interrupt range " + "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " + "pte=0x%" PRIx64 ", write=%d, " + "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ", " + "pasid=0x%" PRIx32 ")", + __func__, addr, level, pte, is_write, + xlat, size, pasid); + ret_fr = s->scalable_mode ? -VTD_FR_SM_INTERRUPT_ADDR : + -VTD_FR_INTERRUPT_ADDR; + } + } + if (ret_fr) { vtd_report_fault(s, -ret_fr, is_fpd_set, source_id, addr, is_write, pasid != PCI_NO_PASID, pasid); goto error; } - page_mask = vtd_slpt_level_page_mask(level); + page_mask = vtd_pt_level_page_mask(level); access_flags = IOMMU_ACCESS_FLAG(reads, writes); vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce, pasid), - addr, slpte, access_flags, level, pasid); + addr, pte, access_flags, level, pasid, pgtt); out: vtd_iommu_unlock(s); entry->iova = addr & page_mask; - entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; + entry->translated_addr = vtd_get_pte_addr(pte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; entry->perm = access_flags; return true; @@ -2215,8 +2443,13 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) } } +/* + * There is no pasid field in iotlb invalidation descriptor, so PCI_NO_PASID + * is passed as parameter. Piotlb invalidation supports pasid, pasid in its + * descriptor is passed which should not be PCI_NO_PASID. + */ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, - uint16_t domain_id, hwaddr addr, + uint16_t domain_id, hwaddr addr, uint8_t am, uint32_t pasid) { VTDAddressSpace *vtd_as; @@ -2225,19 +2458,37 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, hwaddr size = (1 << am) * VTD_PAGE_SIZE; QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { - if (pasid != PCI_NO_PASID && pasid != vtd_as->pasid) { - continue; - } ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) { + uint32_t rid2pasid = PCI_NO_PASID; + + if (s->root_scalable) { + rid2pasid = VTD_CE_GET_RID2PASID(&ce); + } + + /* + * In legacy mode, vtd_as->pasid == pasid is always true. + * In scalable mode, for vtd address space backing a PCI + * device without pasid, needs to compare pasid with + * rid2pasid of this device. + */ + if (!(vtd_as->pasid == pasid || + (vtd_as->pasid == PCI_NO_PASID && pasid == rid2pasid))) { + continue; + } + if (vtd_as_has_map_notifier(vtd_as)) { /* - * As long as we have MAP notifications registered in - * any of our IOMMU notifiers, we need to sync the - * shadow page table. + * When stage-1 translation is off, as long as we have MAP + * notifications registered in any of our IOMMU notifiers, + * we need to sync the shadow page table. Otherwise VFIO + * device attaches to nested page table instead of shadow + * page table, so no need to sync. */ - vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); + if (!s->flts || !s->root_scalable) { + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); + } } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2532,15 +2783,51 @@ static bool vtd_get_inv_desc(IntelIOMMUState *s, return true; } +static bool vtd_inv_desc_reserved_check(IntelIOMMUState *s, + VTDInvDesc *inv_desc, + uint64_t mask[4], bool dw, + const char *func_name, + const char *desc_type) +{ + if (s->iq_dw) { + if (inv_desc->val[0] & mask[0] || inv_desc->val[1] & mask[1] || + inv_desc->val[2] & mask[2] || inv_desc->val[3] & mask[3]) { + error_report("%s: invalid %s desc val[3]: 0x%"PRIx64 + " val[2]: 0x%"PRIx64" val[1]=0x%"PRIx64 + " val[0]=0x%"PRIx64" (reserved nonzero)", + func_name, desc_type, inv_desc->val[3], + inv_desc->val[2], inv_desc->val[1], + inv_desc->val[0]); + return false; + } + } else { + if (dw) { + error_report("%s: 256-bit %s desc in 128-bit invalidation queue", + func_name, desc_type); + return false; + } + + if (inv_desc->lo & mask[0] || inv_desc->hi & mask[1]) { + error_report("%s: invalid %s desc: hi=%"PRIx64", lo=%"PRIx64 + " (reserved nonzero)", func_name, desc_type, + inv_desc->hi, inv_desc->lo); + return false; + } + } + + return true; +} + static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { - if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) || - (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) { - error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 - " (reserved nonzero)", __func__, inv_desc->hi, - inv_desc->lo); + uint64_t mask[4] = {VTD_INV_DESC_WAIT_RSVD_LO, VTD_INV_DESC_WAIT_RSVD_HI, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; + + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, false, + __func__, "wait")) { return false; } + if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) { /* Status Write */ uint32_t status_data = (uint32_t)(inv_desc->lo >> @@ -2574,13 +2861,14 @@ static bool vtd_process_context_cache_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { uint16_t sid, fmask; + uint64_t mask[4] = {VTD_INV_DESC_CC_RSVD, VTD_INV_DESC_ALL_ONE, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; - if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) { - error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64 - " (reserved nonzero)", __func__, inv_desc->hi, - inv_desc->lo); + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, false, + __func__, "cc inv")) { return false; } + switch (inv_desc->lo & VTD_INV_DESC_CC_G) { case VTD_INV_DESC_CC_DOMAIN: trace_vtd_inv_desc_cc_domain( @@ -2610,12 +2898,11 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) uint16_t domain_id; uint8_t am; hwaddr addr; + uint64_t mask[4] = {VTD_INV_DESC_IOTLB_RSVD_LO, VTD_INV_DESC_IOTLB_RSVD_HI, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; - if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) || - (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) { - error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64 - ", lo=0x%"PRIx64" (reserved bits unzero)", - __func__, inv_desc->hi, inv_desc->lo); + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, false, + __func__, "iotlb inv")) { return false; } @@ -2653,9 +2940,117 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) return true; } +static gboolean vtd_hash_remove_by_pasid(gpointer key, gpointer value, + gpointer user_data) +{ + VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value; + VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data; + + return ((entry->domain_id == info->domain_id) && + (entry->pasid == info->pasid)); +} + +static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s, + uint16_t domain_id, uint32_t pasid) +{ + VTDIOTLBPageInvInfo info; + VTDAddressSpace *vtd_as; + VTDContextEntry ce; + + info.domain_id = domain_id; + info.pasid = pasid; + + vtd_iommu_lock(s); + g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pasid, + &info); + vtd_iommu_unlock(s); + + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce) && + domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) { + uint32_t rid2pasid = VTD_CE_GET_RID2PASID(&ce); + + if ((vtd_as->pasid != PCI_NO_PASID || pasid != rid2pasid) && + vtd_as->pasid != pasid) { + continue; + } + + if (!s->flts || !vtd_as_has_map_notifier(vtd_as)) { + vtd_address_space_sync(vtd_as); + } + } + } +} + +static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, + uint32_t pasid, hwaddr addr, uint8_t am) +{ + VTDIOTLBPageInvInfo info; + + info.domain_id = domain_id; + info.pasid = pasid; + info.addr = addr; + info.mask = ~((1 << am) - 1); + + vtd_iommu_lock(s); + g_hash_table_foreach_remove(s->iotlb, + vtd_hash_remove_by_page_piotlb, &info); + vtd_iommu_unlock(s); + + vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am, pasid); +} + +static bool vtd_process_piotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + uint16_t domain_id; + uint32_t pasid; + hwaddr addr; + uint8_t am; + uint64_t mask[4] = {VTD_INV_DESC_PIOTLB_RSVD_VAL0, + VTD_INV_DESC_PIOTLB_RSVD_VAL1, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; + + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, true, + __func__, "piotlb inv")) { + return false; + } + + domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]); + pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]); + switch (inv_desc->val[0] & VTD_INV_DESC_PIOTLB_G) { + case VTD_INV_DESC_PIOTLB_ALL_IN_PASID: + vtd_piotlb_pasid_invalidate(s, domain_id, pasid); + break; + + case VTD_INV_DESC_PIOTLB_PSI_IN_PASID: + am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]); + addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]); + vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am); + break; + + default: + error_report_once("%s: invalid piotlb inv desc: hi=0x%"PRIx64 + ", lo=0x%"PRIx64" (type mismatch: 0x%llx)", + __func__, inv_desc->val[1], inv_desc->val[0], + inv_desc->val[0] & VTD_INV_DESC_IOTLB_G); + return false; + } + return true; +} + static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { + uint64_t mask[4] = {VTD_INV_DESC_IEC_RSVD, VTD_INV_DESC_ALL_ONE, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; + + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, false, + __func__, "iec inv")) { + return false; + } + trace_vtd_inv_desc_iec(inv_desc->iec.granularity, inv_desc->iec.index, inv_desc->iec.index_mask); @@ -2666,38 +3061,11 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, return true; } -static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, - VTDInvDesc *inv_desc) +static void do_invalidate_device_tlb(VTDAddressSpace *vtd_dev_as, + bool size, hwaddr addr) { - VTDAddressSpace *vtd_dev_as; - IOMMUTLBEvent event; - hwaddr addr; - uint64_t sz; - uint16_t sid; - bool size; - - addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); - sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); - size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); - - if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || - (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) { - error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64 - ", lo=%"PRIx64" (reserved nonzero)", __func__, - inv_desc->hi, inv_desc->lo); - return false; - } - /* - * Using sid is OK since the guest should have finished the - * initialization of both the bus and device. - */ - vtd_dev_as = vtd_get_as_by_sid(s, sid); - if (!vtd_dev_as) { - goto done; - } - - /* According to ATS spec table 2.4: + * According to ATS spec table 2.4: * S = 0, bits 15:12 = xxxx range size: 4K * S = 1, bits 15:12 = xxx0 range size: 8K * S = 1, bits 15:12 = xx01 range size: 16K @@ -2705,6 +3073,10 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, * S = 1, bits 15:12 = 0111 range size: 64K * ... */ + + IOMMUTLBEvent event; + uint64_t sz; + if (size) { sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT); addr &= ~(sz - 1); @@ -2719,6 +3091,81 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, event.entry.perm = IOMMU_NONE; event.entry.translated_addr = 0; memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); +} + +static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + uint16_t sid; + VTDAddressSpace *vtd_dev_as; + bool size; + bool global; + hwaddr addr; + uint32_t pasid; + uint64_t mask[4] = {VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0, + VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; + + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, true, + __func__, "device piotlb inv")) { + return false; + } + + global = VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(inv_desc->hi); + size = VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(inv_desc->hi); + addr = VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(inv_desc->hi); + sid = VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(inv_desc->lo); + if (global) { + QLIST_FOREACH(vtd_dev_as, &s->vtd_as_with_notifiers, next) { + if ((vtd_dev_as->pasid != PCI_NO_PASID) && + (PCI_BUILD_BDF(pci_bus_num(vtd_dev_as->bus), + vtd_dev_as->devfn) == sid)) { + do_invalidate_device_tlb(vtd_dev_as, size, addr); + } + } + } else { + pasid = VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(inv_desc->lo); + vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, sid, pasid); + if (!vtd_dev_as) { + return true; + } + + do_invalidate_device_tlb(vtd_dev_as, size, addr); + } + + return true; +} + +static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + VTDAddressSpace *vtd_dev_as; + hwaddr addr; + uint16_t sid; + bool size; + uint64_t mask[4] = {VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO, + VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; + + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, false, + __func__, "dev-iotlb inv")) { + return false; + } + + addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); + sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); + size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); + + /* + * Using sid is OK since the guest should have finished the + * initialization of both the bus and device. + */ + vtd_dev_as = vtd_get_as_by_sid(s, sid); + if (!vtd_dev_as) { + goto done; + } + + do_invalidate_device_tlb(vtd_dev_as, size, addr); done: return true; @@ -2735,7 +3182,7 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) return false; } - desc_type = inv_desc.lo & VTD_INV_DESC_TYPE; + desc_type = VTD_INV_DESC_TYPE(inv_desc.lo); /* FIXME: should update at first or at last? */ s->iq_last_desc_type = desc_type; @@ -2754,15 +3201,11 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; - /* - * TODO: the entity of below two cases will be implemented in future series. - * To make guest (which integrates scalable mode support patch set in - * iommu driver) work, just return true is enough so far. - */ - case VTD_INV_DESC_PC: - break; - case VTD_INV_DESC_PIOTLB: + trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]); + if (!vtd_process_piotlb_desc(s, &inv_desc)) { + return false; + } break; case VTD_INV_DESC_WAIT: @@ -2779,6 +3222,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; + case VTD_INV_DESC_DEV_PIOTLB: + trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo); + if (!vtd_process_device_piotlb_desc(s, &inv_desc)) { + return false; + } + break; + case VTD_INV_DESC_DEVICE: trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo); if (!vtd_process_device_iotlb_desc(s, &inv_desc)) { @@ -2786,6 +3236,16 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; + /* + * TODO: the entity of below two cases will be implemented in future series. + * To make guest (which integrates scalable mode support patch set in + * iommu driver) work, just return true is enough so far. + */ + case VTD_INV_DESC_PC: + if (s->scalable_mode) { + break; + } + /* fallthrough */ default: error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 " (unknown type)", __func__, inv_desc.hi, @@ -2838,6 +3298,7 @@ static void vtd_handle_iqt_write(IntelIOMMUState *s) if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) { error_report_once("%s: RSV bit is set: val=0x%"PRIx64, __func__, val); + vtd_handle_inv_queue_error(s); return; } s->iq_tail = VTD_IQT_QT(s->iq_dw, val); @@ -2938,7 +3399,9 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) /* Invalidation Queue Address Register, 64-bit */ case DMAR_IQA_REG: - val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS); + val = s->iq | + (vtd_get_quad(s, DMAR_IQA_REG) & + (VTD_IQA_QS | VTD_IQA_DW_MASK)); if (size == 4) { val = val & ((1ULL << 32) - 1); } @@ -3348,7 +3811,7 @@ static const MemoryRegionOps vtd_mem_ops = { }, }; -static Property vtd_properties[] = { +static const Property vtd_properties[] = { DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0), DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim, ON_OFF_AUTO_AUTO), @@ -3357,11 +3820,13 @@ static Property vtd_properties[] = { VTD_HOST_ADDRESS_WIDTH), DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), + DEFINE_PROP_BOOL("x-flts", IntelIOMMUState, flts, FALSE), DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true), - DEFINE_PROP_END_OF_LIST(), + DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false), + DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true), }; /* Read IRTE entry with specific index */ @@ -3740,9 +4205,30 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, VTDAddressSpace *vtd_dev_as; char name[128]; + vtd_iommu_lock(s); vtd_dev_as = g_hash_table_lookup(s->vtd_address_spaces, &key); + vtd_iommu_unlock(s); + if (!vtd_dev_as) { - struct vtd_as_key *new_key = g_malloc(sizeof(*new_key)); + struct vtd_as_key *new_key; + /* Slow path */ + + /* + * memory_region_add_subregion_overlap requires the bql, + * make sure we own it. + */ + BQL_LOCK_GUARD(); + vtd_iommu_lock(s); + + /* Check again as we released the lock for a moment */ + vtd_dev_as = g_hash_table_lookup(s->vtd_address_spaces, &key); + if (vtd_dev_as) { + vtd_iommu_unlock(s); + return vtd_dev_as; + } + + /* Still nothing, allocate a new address space */ + new_key = g_malloc(sizeof(*new_key)); new_key->bus = bus; new_key->devfn = devfn; @@ -3833,6 +4319,8 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, vtd_switch_address_space(vtd_dev_as); g_hash_table_insert(s->vtd_address_spaces, new_key, vtd_dev_as); + + vtd_iommu_unlock(s); } return vtd_dev_as; } @@ -3858,7 +4346,13 @@ static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod, return false; } - return true; + if (!s->flts) { + /* All checks requested by VTD stage-2 translation pass */ + return true; + } + + error_setg(errp, "host device is uncompatible with stage-1 translation"); + return false; } static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, @@ -4036,8 +4530,6 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); } - - return; } static void vtd_cap_init(IntelIOMMUState *s) @@ -4081,7 +4573,12 @@ static void vtd_cap_init(IntelIOMMUState *s) } /* TODO: read cap/ecap from host to decide which cap to be exposed. */ - if (s->scalable_mode) { + if (s->flts) { + s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_FLTS; + if (s->fs1gp) { + s->cap |= VTD_CAP_FS1GP; + } + } else if (s->scalable_mode) { s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; } @@ -4127,15 +4624,27 @@ static void vtd_init(IntelIOMMUState *s) */ vtd_spte_rsvd[0] = ~0ULL; vtd_spte_rsvd[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); + x86_iommu->dt_supported && s->stale_tm); vtd_spte_rsvd[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits); vtd_spte_rsvd[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits); vtd_spte_rsvd[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits); vtd_spte_rsvd_large[2] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); + x86_iommu->dt_supported && s->stale_tm); vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, - x86_iommu->dt_supported); + x86_iommu->dt_supported && s->stale_tm); + + /* + * Rsvd field masks for fpte + */ + vtd_fpte_rsvd[0] = ~0ULL; + vtd_fpte_rsvd[1] = VTD_FPTE_PAGE_L1_RSVD_MASK(s->aw_bits); + vtd_fpte_rsvd[2] = VTD_FPTE_PAGE_L2_RSVD_MASK(s->aw_bits); + vtd_fpte_rsvd[3] = VTD_FPTE_PAGE_L3_RSVD_MASK(s->aw_bits); + vtd_fpte_rsvd[4] = VTD_FPTE_PAGE_L4_RSVD_MASK(s->aw_bits); + + vtd_fpte_rsvd_large[2] = VTD_FPTE_LPAGE_L2_RSVD_MASK(s->aw_bits); + vtd_fpte_rsvd_large[3] = VTD_FPTE_LPAGE_L3_RSVD_MASK(s->aw_bits); if (s->scalable_mode || s->snoop_control) { vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; @@ -4201,10 +4710,11 @@ static void vtd_init(IntelIOMMUState *s) /* Should not reset address_spaces when reset because devices will still use * the address space they got at first (won't ask the bus again). */ -static void vtd_reset(DeviceState *dev) +static void vtd_reset_exit(Object *obj, ResetType type) { - IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); + IntelIOMMUState *s = INTEL_IOMMU_DEVICE(obj); + trace_vtd_reset_exit(); vtd_init(s); vtd_address_space_refresh_all(s); } @@ -4248,14 +4758,26 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) } } - /* Currently only address widths supported are 39 and 48 bits */ - if ((s->aw_bits != VTD_HOST_AW_39BIT) && - (s->aw_bits != VTD_HOST_AW_48BIT)) { - error_setg(errp, "Supported values for aw-bits are: %d, %d", + if (!s->scalable_mode && s->flts) { + error_setg(errp, "x-flts is only available in scalable mode"); + return false; + } + + if (!s->flts && s->aw_bits != VTD_HOST_AW_39BIT && + s->aw_bits != VTD_HOST_AW_48BIT) { + error_setg(errp, "%s: supported values for aw-bits are: %d, %d", + s->scalable_mode ? "Scalable mode(flts=off)" : "Legacy mode", VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT); return false; } + if (s->flts && s->aw_bits != VTD_HOST_AW_48BIT) { + error_setg(errp, + "Scalable mode(flts=on): supported value for aw-bits is: %d", + VTD_HOST_AW_48BIT); + return false; + } + if (s->scalable_mode && !s->dma_drain) { error_setg(errp, "Need to set dma_drain for scalable mode"); return false; @@ -4352,19 +4874,22 @@ static void vtd_realize(DeviceState *dev, Error **errp) qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); } -static void vtd_class_init(ObjectClass *klass, void *data) +static void vtd_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); X86IOMMUClass *x86_class = X86_IOMMU_DEVICE_CLASS(klass); + ResettableClass *rc = RESETTABLE_CLASS(klass); - dc->reset = vtd_reset; + /* + * Use 'exit' reset phase to make sure all DMA requests + * have been quiesced during 'enter' or 'hold' phase + */ + rc->phases.exit = vtd_reset_exit; dc->vmsd = &vtd_vmstate; device_class_set_props(dc, vtd_properties); dc->hotpluggable = false; x86_class->realize = vtd_realize; x86_class->int_remap = vtd_int_remap; - /* Supported by the pc-q35-* machine types */ - dc->user_creatable = true; set_bit(DEVICE_CATEGORY_MISC, dc->categories); dc->desc = "Intel IOMMU (VT-d) DMA Remapping device"; } @@ -4377,7 +4902,7 @@ static const TypeInfo vtd_info = { }; static void vtd_iommu_memory_region_class_init(ObjectClass *klass, - void *data) + const void *data) { IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index f8cf99b..e8b211e 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -195,6 +195,7 @@ #define VTD_ECAP_PASID (1ULL << 40) #define VTD_ECAP_SMTS (1ULL << 43) #define VTD_ECAP_SLTS (1ULL << 46) +#define VTD_ECAP_FLTS (1ULL << 47) /* CAP_REG */ /* (offset >> 4) << 24 */ @@ -211,6 +212,7 @@ #define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35)) #define VTD_CAP_DRAIN_WRITE (1ULL << 54) #define VTD_CAP_DRAIN_READ (1ULL << 55) +#define VTD_CAP_FS1GP (1ULL << 56) #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) #define VTD_CAP_CM (1ULL << 7) #define VTD_PASID_ID_SHIFT 20 @@ -264,10 +266,10 @@ #define VTD_FRCD_FR(val) (((val) & 0xffULL) << 32) #define VTD_FRCD_SID_MASK 0xffffULL #define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK) +#define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40) +#define VTD_FRCD_PP(val) (((val) & 0x1ULL) << 31) /* For the low 64-bit of 128-bit */ #define VTD_FRCD_FI(val) ((val) & ~0xfffULL) -#define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40) -#define VTD_FRCD_PP(val) (((val) & 0x1) << 31) #define VTD_FRCD_IR_IDX(val) (((val) & 0xffffULL) << 48) /* DMA Remapping Fault Conditions */ @@ -311,10 +313,28 @@ typedef enum VTDFaultReason { * request while disabled */ VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ - VTD_FR_PASID_TABLE_INV = 0x58, /*Invalid PASID table entry */ + /* PASID directory entry access failure */ + VTD_FR_PASID_DIR_ACCESS_ERR = 0x50, + /* The Present(P) field of pasid directory entry is 0 */ + VTD_FR_PASID_DIR_ENTRY_P = 0x51, + VTD_FR_PASID_TABLE_ACCESS_ERR = 0x58, /* PASID table entry access failure */ + /* The Present(P) field of pasid table entry is 0 */ + VTD_FR_PASID_ENTRY_P = 0x59, + VTD_FR_PASID_TABLE_ENTRY_INV = 0x5b, /*Invalid PASID table entry */ + + /* Fail to access a first-level paging entry (not FS_PML4E) */ + VTD_FR_FS_PAGING_ENTRY_INV = 0x70, + VTD_FR_FS_PAGING_ENTRY_P = 0x71, + /* Non-zero reserved field in present first-stage paging entry */ + VTD_FR_FS_PAGING_ENTRY_RSVD = 0x72, + VTD_FR_PASID_ENTRY_FSPTPTR_INV = 0x73, /* Invalid FSPTPTR in PASID entry */ + VTD_FR_FS_NON_CANONICAL = 0x80, /* SNG.1 : Address for FS not canonical.*/ + VTD_FR_FS_PAGING_ENTRY_US = 0x81, /* Privilege violation */ + VTD_FR_SM_WRITE = 0x85, /* No write permission */ /* Output address in the interrupt address range for scalable mode */ VTD_FR_SM_INTERRUPT_ADDR = 0x87, + VTD_FR_FS_BIT_UPDATE_FAILED = 0x91, /* SFS.10 */ VTD_FR_MAX, /* Guard */ } VTDFaultReason; @@ -356,7 +376,9 @@ union VTDInvDesc { typedef union VTDInvDesc VTDInvDesc; /* Masks for struct VTDInvDesc */ -#define VTD_INV_DESC_TYPE 0xf +#define VTD_INV_DESC_ALL_ONE -1ULL +#define VTD_INV_DESC_TYPE(val) ((((val) >> 5) & 0x70ULL) | \ + ((val) & 0xfULL)) #define VTD_INV_DESC_CC 0x1 /* Context-cache Invalidate Desc */ #define VTD_INV_DESC_IOTLB 0x2 #define VTD_INV_DESC_DEVICE 0x3 @@ -365,6 +387,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_WAIT 0x5 /* Invalidation Wait Descriptor */ #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ +#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/ #define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ /* Masks for Invalidation Wait Descriptor*/ @@ -372,7 +395,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_WAIT_IF (1ULL << 4) #define VTD_INV_DESC_WAIT_FN (1ULL << 6) #define VTD_INV_DESC_WAIT_DATA_SHIFT 32 -#define VTD_INV_DESC_WAIT_RSVD_LO 0Xffffff80ULL +#define VTD_INV_DESC_WAIT_RSVD_LO 0Xfffff180ULL #define VTD_INV_DESC_WAIT_RSVD_HI 3ULL /* Masks for Context-cache Invalidation Descriptor */ @@ -383,7 +406,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_CC_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) #define VTD_INV_DESC_CC_SID(val) (((val) >> 32) & 0xffffUL) #define VTD_INV_DESC_CC_FM(val) (((val) >> 48) & 3UL) -#define VTD_INV_DESC_CC_RSVD 0xfffc00000000ffc0ULL +#define VTD_INV_DESC_CC_RSVD 0xfffc00000000f1c0ULL /* Masks for IOTLB Invalidate Descriptor */ #define VTD_INV_DESC_IOTLB_G (3ULL << 4) @@ -393,26 +416,34 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_IOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) #define VTD_INV_DESC_IOTLB_ADDR(val) ((val) & ~0xfffULL) #define VTD_INV_DESC_IOTLB_AM(val) ((val) & 0x3fULL) -#define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000ff00ULL +#define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000f100ULL #define VTD_INV_DESC_IOTLB_RSVD_HI 0xf80ULL -#define VTD_INV_DESC_IOTLB_PASID_PASID (2ULL << 4) -#define VTD_INV_DESC_IOTLB_PASID_PAGE (3ULL << 4) -#define VTD_INV_DESC_IOTLB_PASID(val) (((val) >> 32) & VTD_PASID_ID_MASK) -#define VTD_INV_DESC_IOTLB_PASID_RSVD_LO 0xfff00000000001c0ULL -#define VTD_INV_DESC_IOTLB_PASID_RSVD_HI 0xf80ULL /* Mask for Device IOTLB Invalidate Descriptor */ #define VTD_INV_DESC_DEVICE_IOTLB_ADDR(val) ((val) & 0xfffffffffffff000ULL) #define VTD_INV_DESC_DEVICE_IOTLB_SIZE(val) ((val) & 0x1) #define VTD_INV_DESC_DEVICE_IOTLB_SID(val) (((val) >> 32) & 0xFFFFULL) #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL -#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8 +#define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0f1f0 + +/* Masks for Interrupt Entry Invalidate Descriptor */ +#define VTD_INV_DESC_IEC_RSVD 0xffff000007fff1e0ULL + +/* Masks for PASID based Device IOTLB Invalidate Descriptor */ +#define VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(val) ((val) & \ + 0xfffffffffffff000ULL) +#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(val) ((val >> 11) & 0x1) +#define VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(val) ((val) & 0x1) +#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(val) (((val) >> 16) & 0xffffULL) +#define VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(val) ((val >> 32) & 0xfffffULL) +#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0 0xfff000000000f000ULL +#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1 0x7feULL /* Rsvd field masks for spte */ #define VTD_SPTE_SNP 0x800ULL -#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw, dt_supported) \ - dt_supported ? \ +#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw, stale_tm) \ + stale_tm ? \ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) #define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \ @@ -422,21 +453,49 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \ (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) -#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw, dt_supported) \ - dt_supported ? \ +#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw, stale_tm) \ + stale_tm ? \ (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) -#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw, dt_supported) \ - dt_supported ? \ +#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw, stale_tm) \ + stale_tm ? \ (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \ (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM)) +/* Rsvd field masks for fpte */ +#define VTD_FS_UPPER_IGNORED 0xfff0000000000000ULL +#define VTD_FPTE_PAGE_L1_RSVD_MASK(aw) \ + (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED)) +#define VTD_FPTE_PAGE_L2_RSVD_MASK(aw) \ + (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED)) +#define VTD_FPTE_PAGE_L3_RSVD_MASK(aw) \ + (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED)) +#define VTD_FPTE_PAGE_L4_RSVD_MASK(aw) \ + (0x80ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED)) + +#define VTD_FPTE_LPAGE_L2_RSVD_MASK(aw) \ + (0x1fe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED)) +#define VTD_FPTE_LPAGE_L3_RSVD_MASK(aw) \ + (0x3fffe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED)) + +/* Masks for PIOTLB Invalidate Descriptor */ +#define VTD_INV_DESC_PIOTLB_G (3ULL << 4) +#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID (2ULL << 4) +#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID (3ULL << 4) +#define VTD_INV_DESC_PIOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK) +#define VTD_INV_DESC_PIOTLB_PASID(val) (((val) >> 32) & 0xfffffULL) +#define VTD_INV_DESC_PIOTLB_AM(val) ((val) & 0x3fULL) +#define VTD_INV_DESC_PIOTLB_IH(val) (((val) >> 6) & 0x1) +#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL) +#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL +#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL + /* Information about page-selective IOTLB invalidate */ struct VTDIOTLBPageInvInfo { uint16_t domain_id; uint32_t pasid; uint64_t addr; - uint8_t mask; + uint64_t mask; }; typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; @@ -514,27 +573,38 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_SM_PASID_ENTRY_AW 7ULL /* Adjusted guest-address-width */ #define VTD_SM_PASID_ENTRY_DID(val) ((val) & VTD_DOMAIN_ID_MASK) +#define VTD_SM_PASID_ENTRY_FLPM 3ULL +#define VTD_SM_PASID_ENTRY_FLPTPTR (~0xfffULL) + +/* First Level Paging Structure */ +/* Masks for First Level Paging Entry */ +#define VTD_FL_P 1ULL +#define VTD_FL_RW (1ULL << 1) +#define VTD_FL_US (1ULL << 2) +#define VTD_FL_A (1ULL << 5) +#define VTD_FL_D (1ULL << 6) + /* Second Level Page Translation Pointer*/ #define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL) -/* Paging Structure common */ -#define VTD_SL_PT_PAGE_SIZE_MASK (1ULL << 7) -/* Bits to decide the offset for each level */ -#define VTD_SL_LEVEL_BITS 9 - /* Second Level Paging Structure */ -#define VTD_SL_PML4_LEVEL 4 -#define VTD_SL_PDP_LEVEL 3 -#define VTD_SL_PD_LEVEL 2 -#define VTD_SL_PT_LEVEL 1 -#define VTD_SL_PT_ENTRY_NR 512 - /* Masks for Second Level Paging Entry */ #define VTD_SL_RW_MASK 3ULL #define VTD_SL_R 1ULL #define VTD_SL_W (1ULL << 1) -#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw)) #define VTD_SL_IGN_COM 0xbff0000000000000ULL #define VTD_SL_TM (1ULL << 62) +/* Common for both First Level and Second Level */ +#define VTD_PML4_LEVEL 4 +#define VTD_PDP_LEVEL 3 +#define VTD_PD_LEVEL 2 +#define VTD_PT_LEVEL 1 +#define VTD_PT_ENTRY_NR 512 +#define VTD_PT_PAGE_SIZE_MASK (1ULL << 7) +#define VTD_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw)) + +/* Bits to decide the offset for each level */ +#define VTD_LEVEL_BITS 9 + #endif diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index a72c28e..1be9bfe 100644 --- a/hw/i386/kvm/apic.c +++ b/hw/i386/kvm/apic.c @@ -14,9 +14,10 @@ #include "qemu/module.h" #include "hw/i386/apic_internal.h" #include "hw/pci/msi.h" -#include "sysemu/hw_accel.h" -#include "sysemu/kvm.h" +#include "system/hw_accel.h" +#include "system/kvm.h" #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic, int reg_id, uint32_t val) @@ -141,6 +142,10 @@ static void kvm_apic_put(CPUState *cs, run_on_cpu_data data) struct kvm_lapic_state kapic; int ret; + if (is_tdx_vm()) { + return; + } + kvm_put_apicbase(s->cpu, s->apicbase); kvm_put_apic_state(s, &kapic); @@ -214,7 +219,7 @@ static void kvm_apic_mem_write(void *opaque, hwaddr addr, static const MemoryRegionOps kvm_apic_io_ops = { .read = kvm_apic_mem_read, .write = kvm_apic_mem_write, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_LITTLE_ENDIAN, }; static void kvm_apic_reset(APICCommonState *s) @@ -240,7 +245,7 @@ static void kvm_apic_unrealize(DeviceState *dev) { } -static void kvm_apic_class_init(ObjectClass *klass, void *data) +static void kvm_apic_class_init(ObjectClass *klass, const void *data) { APICCommonClass *k = APIC_COMMON_CLASS(klass); diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 40aa9a3..f563827 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -16,9 +16,9 @@ #include "qemu/osdep.h" #include "qemu/host-utils.h" #include "qemu/module.h" -#include "sysemu/kvm.h" -#include "sysemu/runstate.h" -#include "sysemu/hw_accel.h" +#include "system/kvm.h" +#include "system/runstate.h" +#include "system/hw_accel.h" #include "kvm/kvm_i386.h" #include "migration/vmstate.h" #include "hw/sysbus.h" @@ -27,7 +27,6 @@ #include "qapi/error.h" #include <linux/kvm.h> -#include "standard-headers/asm-x86/kvm_para.h" #include "qom/object.h" #define TYPE_KVM_CLOCK "kvmclock" @@ -305,13 +304,12 @@ static const VMStateDescription kvmclock_vmsd = { } }; -static Property kvmclock_properties[] = { +static const Property kvmclock_properties[] = { DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState, mach_use_reliable_get_clock, true), - DEFINE_PROP_END_OF_LIST(), }; -static void kvmclock_class_init(ObjectClass *klass, void *data) +static void kvmclock_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -334,8 +332,8 @@ void kvmclock_create(bool create_always) assert(kvm_enabled()); if (create_always || - cpu->env.features[FEAT_KVM] & ((1ULL << KVM_FEATURE_CLOCKSOURCE) | - (1ULL << KVM_FEATURE_CLOCKSOURCE2))) { + cpu->env.features[FEAT_KVM] & (CPUID_KVM_CLOCK | + CPUID_KVM_CLOCK2)) { sysbus_create_simple(TYPE_KVM_CLOCK, -1, NULL); } } diff --git a/hw/i386/kvm/i8254.c b/hw/i386/kvm/i8254.c index e49b9c4..14b78f3 100644 --- a/hw/i386/kvm/i8254.c +++ b/hw/i386/kvm/i8254.c @@ -29,11 +29,11 @@ #include "qapi/error.h" #include "qemu/module.h" #include "qemu/timer.h" -#include "sysemu/runstate.h" +#include "system/runstate.h" #include "hw/timer/i8254.h" #include "hw/timer/i8254_internal.h" #include "hw/qdev-properties-system.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" #include "target/i386/kvm/kvm_i386.h" #include "qom/object.h" @@ -287,13 +287,12 @@ static void kvm_pit_realizefn(DeviceState *dev, Error **errp) kpc->parent_realize(dev, errp); } -static Property kvm_pit_properties[] = { +static const Property kvm_pit_properties[] = { DEFINE_PROP_LOSTTICKPOLICY("lost_tick_policy", KVMPITState, lost_tick_policy, LOST_TICK_POLICY_DELAY), - DEFINE_PROP_END_OF_LIST(), }; -static void kvm_pit_class_init(ObjectClass *klass, void *data) +static void kvm_pit_class_init(ObjectClass *klass, const void *data) { KVMPITClass *kpc = KVM_PIT_CLASS(klass); PITCommonClass *k = PIT_COMMON_CLASS(klass); @@ -303,7 +302,7 @@ static void kvm_pit_class_init(ObjectClass *klass, void *data) &kpc->parent_realize); k->set_channel_gate = kvm_pit_set_gate; k->get_channel_info = kvm_pit_get_channel_info; - dc->reset = kvm_pit_reset; + device_class_set_legacy_reset(dc, kvm_pit_reset); device_class_set_props(dc, kvm_pit_properties); } diff --git a/hw/i386/kvm/i8259.c b/hw/i386/kvm/i8259.c index 3ca0e1f..8a72d6e 100644 --- a/hw/i386/kvm/i8259.c +++ b/hw/i386/kvm/i8259.c @@ -16,7 +16,7 @@ #include "qemu/module.h" #include "hw/intc/kvm_irqcount.h" #include "hw/irq.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" #include "qom/object.h" #define TYPE_KVM_I8259 "kvm-i8259" @@ -139,13 +139,13 @@ qemu_irq *kvm_i8259_init(ISABus *bus) return qemu_allocate_irqs(kvm_pic_set_irq, NULL, ISA_NUM_IRQS); } -static void kvm_i8259_class_init(ObjectClass *klass, void *data) +static void kvm_i8259_class_init(ObjectClass *klass, const void *data) { KVMPICClass *kpc = KVM_PIC_CLASS(klass); PICCommonClass *k = PIC_COMMON_CLASS(klass); DeviceClass *dc = DEVICE_CLASS(klass); - dc->reset = kvm_pic_reset; + device_class_set_legacy_reset(dc, kvm_pic_reset); device_class_set_parent_realize(dc, kvm_pic_realize, &kpc->parent_realize); k->pre_save = kvm_pic_get; k->post_load = kvm_pic_put; diff --git a/hw/i386/kvm/ioapic.c b/hw/i386/kvm/ioapic.c index b96fe84..693ee97 100644 --- a/hw/i386/kvm/ioapic.c +++ b/hw/i386/kvm/ioapic.c @@ -15,7 +15,7 @@ #include "hw/qdev-properties.h" #include "hw/intc/ioapic_internal.h" #include "hw/intc/kvm_irqcount.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" #include "kvm/kvm_i386.h" /* PC Utility function */ @@ -133,12 +133,11 @@ static void kvm_ioapic_realize(DeviceState *dev, Error **errp) qdev_init_gpio_in(dev, kvm_ioapic_set_irq, IOAPIC_NUM_PINS); } -static Property kvm_ioapic_properties[] = { +static const Property kvm_ioapic_properties[] = { DEFINE_PROP_UINT32("gsi_base", KVMIOAPICState, kvm_gsi_base, 0), - DEFINE_PROP_END_OF_LIST() }; -static void kvm_ioapic_class_init(ObjectClass *klass, void *data) +static void kvm_ioapic_class_init(ObjectClass *klass, const void *data) { IOAPICCommonClass *k = IOAPIC_COMMON_CLASS(klass); DeviceClass *dc = DEVICE_CLASS(klass); @@ -146,7 +145,7 @@ static void kvm_ioapic_class_init(ObjectClass *klass, void *data) k->realize = kvm_ioapic_realize; k->pre_save = kvm_ioapic_get; k->post_load = kvm_ioapic_put; - dc->reset = kvm_ioapic_reset; + device_class_set_legacy_reset(dc, kvm_ioapic_reset); device_class_set_props(dc, kvm_ioapic_properties); } diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c index d03131e..ce73119 100644 --- a/hw/i386/kvm/xen-stubs.c +++ b/hw/i386/kvm/xen-stubs.c @@ -12,7 +12,6 @@ #include "qemu/osdep.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" #include "xen_evtchn.h" #include "xen_primary_console.h" @@ -38,15 +37,3 @@ void xen_primary_console_create(void) void xen_primary_console_set_be_port(uint16_t port) { } -#ifdef TARGET_I386 -EvtchnInfoList *qmp_xen_event_list(Error **errp) -{ - error_setg(errp, "Xen event channel emulation not enabled"); - return NULL; -} - -void qmp_xen_event_inject(uint32_t port, Error **errp) -{ - error_setg(errp, "Xen event channel emulation not enabled"); -} -#endif diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index 07bd0c9..dd566c4 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -19,11 +19,11 @@ #include "monitor/monitor.h" #include "monitor/hmp.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" -#include "qapi/qmp/qdict.h" +#include "qapi/qapi-commands-misc-i386.h" +#include "qobject/qdict.h" #include "qom/object.h" #include "exec/target_page.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" #include "migration/vmstate.h" #include "trace.h" @@ -41,8 +41,8 @@ #include "xen_overlay.h" #include "xen_xenstore.h" -#include "sysemu/kvm.h" -#include "sysemu/kvm_xen.h" +#include "system/kvm.h" +#include "system/kvm_xen.h" #include <linux/kvm.h> #include <sys/eventfd.h> @@ -140,6 +140,8 @@ struct XenEvtchnState { uint64_t callback_param; bool evtchn_in_kernel; + bool setting_callback_gsi; + int extern_gsi_level; uint32_t callback_gsi; QEMUBH *gsi_bh; @@ -269,7 +271,7 @@ static const VMStateDescription xen_evtchn_vmstate = { } }; -static void xen_evtchn_class_init(ObjectClass *klass, void *data) +static void xen_evtchn_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -431,9 +433,22 @@ void xen_evtchn_set_callback_level(int level) } if (s->callback_gsi && s->callback_gsi < s->nr_callback_gsis) { - qemu_set_irq(s->callback_gsis[s->callback_gsi], level); - if (level) { - /* Ensure the vCPU polls for deassertion */ + /* + * Ugly, but since we hold the BQL we can set this flag so that + * xen_evtchn_set_gsi() can tell the difference between this code + * setting the GSI, and an external device (PCI INTx) doing so. + */ + s->setting_callback_gsi = true; + /* Do not deassert the line if an external device is asserting it. */ + qemu_set_irq(s->callback_gsis[s->callback_gsi], + level || s->extern_gsi_level); + s->setting_callback_gsi = false; + + /* + * If the callback GSI is the only one asserted, ensure the status + * is polled for deassertion in kvm_arch_post_run(). + */ + if (level && !s->extern_gsi_level) { kvm_xen_set_callback_asserted(); } } @@ -1596,7 +1611,7 @@ static int allocate_pirq(XenEvtchnState *s, int type, int gsi) return pirq; } -bool xen_evtchn_set_gsi(int gsi, int level) +bool xen_evtchn_set_gsi(int gsi, int *level) { XenEvtchnState *s = xen_evtchn_singleton; int pirq; @@ -1608,16 +1623,35 @@ bool xen_evtchn_set_gsi(int gsi, int level) } /* - * Check that that it *isn't* the event channel GSI, and thus - * that we are not recursing and it's safe to take s->port_lock. - * - * Locking aside, it's perfectly sane to bail out early for that - * special case, as it would make no sense for the event channel - * GSI to be routed back to event channels, when the delivery - * method is to raise the GSI... that recursion wouldn't *just* - * be a locking issue. + * For the callback_gsi we need to implement a logical OR of the event + * channel GSI and the external input (e.g. from PCI INTx), because + * QEMU itself doesn't support shared level interrupts via demux or + * resamplers. */ if (gsi && gsi == s->callback_gsi) { + /* Remember the external state of the GSI pin (e.g. from PCI INTx) */ + if (!s->setting_callback_gsi) { + s->extern_gsi_level = *level; + + /* + * Don't allow the external device to deassert the line if the + * eveht channel GSI should still be asserted. + */ + if (!s->extern_gsi_level) { + struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0); + if (vi && vi->evtchn_upcall_pending) { + /* Need to poll for deassertion */ + kvm_xen_set_callback_asserted(); + *level = 1; + } + } + } + + /* + * The event channel GSI cannot be routed to PIRQ, as that would make + * no sense. It could also deadlock on s->port_lock, if we proceed. + * So bail out now. + */ return false; } @@ -1628,7 +1662,7 @@ bool xen_evtchn_set_gsi(int gsi, int level) return false; } - if (level) { + if (*level) { int port = s->pirq[pirq].port; s->pirq_gsi_set |= (1U << gsi); diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h index b740acf..0521ebc 100644 --- a/hw/i386/kvm/xen_evtchn.h +++ b/hw/i386/kvm/xen_evtchn.h @@ -23,7 +23,7 @@ void xen_evtchn_set_callback_level(int level); int xen_evtchn_set_port(uint16_t port); -bool xen_evtchn_set_gsi(int gsi, int level); +bool xen_evtchn_set_gsi(int gsi, int *level); void xen_evtchn_snoop_msi(PCIDevice *dev, bool is_msix, unsigned int vector, uint64_t addr, uint32_t data, bool is_masked); void xen_evtchn_remove_pci_device(PCIDevice *dev); diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c index 245e4b1..4b9e272 100644 --- a/hw/i386/kvm/xen_gnttab.c +++ b/hw/i386/kvm/xen_gnttab.c @@ -17,7 +17,7 @@ #include "qapi/error.h" #include "qom/object.h" #include "exec/target_page.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" #include "migration/vmstate.h" #include "hw/sysbus.h" @@ -27,8 +27,8 @@ #include "xen_gnttab.h" #include "xen_primary_console.h" -#include "sysemu/kvm.h" -#include "sysemu/kvm_xen.h" +#include "system/kvm.h" +#include "system/kvm_xen.h" #include "hw/xen/interface/memory.h" #include "hw/xen/interface/grant_table.h" @@ -135,7 +135,7 @@ static const VMStateDescription xen_gnttab_vmstate = { } }; -static void xen_gnttab_class_init(ObjectClass *klass, void *data) +static void xen_gnttab_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c index c68e78a..3cb7361 100644 --- a/hw/i386/kvm/xen_overlay.c +++ b/hw/i386/kvm/xen_overlay.c @@ -16,15 +16,15 @@ #include "qapi/error.h" #include "qom/object.h" #include "exec/target_page.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" #include "migration/vmstate.h" #include "hw/sysbus.h" #include "hw/xen/xen.h" #include "xen_overlay.h" -#include "sysemu/kvm.h" -#include "sysemu/kvm_xen.h" +#include "system/kvm.h" +#include "system/kvm_xen.h" #include <linux/kvm.h> #include "hw/xen/interface/memory.h" @@ -151,11 +151,11 @@ static void xen_overlay_reset(DeviceState *dev) kvm_xen_soft_reset(); } -static void xen_overlay_class_init(ObjectClass *klass, void *data) +static void xen_overlay_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - dc->reset = xen_overlay_reset; + device_class_set_legacy_reset(dc, xen_overlay_reset); dc->realize = xen_overlay_realize; dc->vmsd = &xen_overlay_vmstate; } diff --git a/hw/i386/kvm/xen_primary_console.c b/hw/i386/kvm/xen_primary_console.c index abe79f5..6e9d641 100644 --- a/hw/i386/kvm/xen_primary_console.c +++ b/hw/i386/kvm/xen_primary_console.c @@ -20,8 +20,8 @@ #include "xen_overlay.h" #include "xen_primary_console.h" -#include "sysemu/kvm.h" -#include "sysemu/kvm_xen.h" +#include "system/kvm.h" +#include "system/kvm_xen.h" #include "trace.h" @@ -67,7 +67,7 @@ static void xen_primary_console_realize(DeviceState *dev, Error **errp) xen_primary_console_singleton = s; } -static void xen_primary_console_class_init(ObjectClass *klass, void *data) +static void xen_primary_console_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c index 1a9bc34..42955cc 100644 --- a/hw/i386/kvm/xen_xenstore.c +++ b/hw/i386/kvm/xen_xenstore.c @@ -28,8 +28,8 @@ #include "xen_primary_console.h" #include "xen_xenstore.h" -#include "sysemu/kvm.h" -#include "sysemu/kvm_xen.h" +#include "system/kvm.h" +#include "system/kvm_xen.h" #include "trace.h" @@ -209,7 +209,6 @@ static int xen_xenstore_post_load(void *opaque, int ver) { XenXenstoreState *s = opaque; GByteArray *save; - int ret; /* * As qemu/dom0, rebind to the guest's port. The Windows drivers may @@ -231,8 +230,7 @@ static int xen_xenstore_post_load(void *opaque, int ver) s->impl_state = NULL; s->impl_state_size = 0; - ret = xs_impl_deserialize(s->impl, save, xen_domid, fire_watch_cb, s); - return ret; + return xs_impl_deserialize(s->impl, save, xen_domid, fire_watch_cb, s); } static const VMStateDescription xen_xenstore_vmstate = { @@ -261,7 +259,7 @@ static const VMStateDescription xen_xenstore_vmstate = { } }; -static void xen_xenstore_class_init(ObjectClass *klass, void *data) +static void xen_xenstore_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -532,6 +530,10 @@ static void xs_read(XenXenstoreState *s, unsigned int req_id, return; } + if (!len) { + return; + } + memcpy(&rsp_data[rsp->len], data->data, len); rsp->len += len; } diff --git a/hw/i386/meson.build b/hw/i386/meson.build index 03aad10..7896f34 100644 --- a/hw/i386/meson.build +++ b/hw/i386/meson.build @@ -15,6 +15,7 @@ i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c'), if_false: files('amd_iommu-stub.c')) i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c')) i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('x86-common.c', 'microvm.c', 'acpi-microvm.c', 'microvm-dt.c')) +i386_ss.add(when: 'CONFIG_NITRO_ENCLAVE', if_true: files('nitro_enclave.c')) i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c')) i386_ss.add(when: 'CONFIG_VMMOUSE', if_true: files('vmmouse.c')) i386_ss.add(when: 'CONFIG_VMPORT', if_true: files('vmport.c')) @@ -31,6 +32,7 @@ i386_ss.add(when: 'CONFIG_PC', if_true: files( 'port92.c')) i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'), if_false: files('pc_sysfw_ovmf-stubs.c')) +i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c', 'tdvf-hob.c')) subdir('kvm') subdir('xen') diff --git a/hw/i386/microvm-dt.c b/hw/i386/microvm-dt.c index b3049e4..cb27dfd 100644 --- a/hw/i386/microvm-dt.c +++ b/hw/i386/microvm-dt.c @@ -33,8 +33,8 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" #include "qapi/error.h" -#include "sysemu/device_tree.h" -#include "hw/char/serial.h" +#include "system/device_tree.h" +#include "hw/char/serial-isa.h" #include "hw/i386/fw_cfg.h" #include "hw/rtc/mc146818rtc.h" #include "hw/sysbus.h" diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c index 40edcee..e0daf0d 100644 --- a/hw/i386/microvm.c +++ b/hw/i386/microvm.c @@ -22,11 +22,11 @@ #include "qapi/error.h" #include "qapi/visitor.h" #include "qapi/qapi-visit-common.h" -#include "sysemu/sysemu.h" -#include "sysemu/cpus.h" -#include "sysemu/numa.h" -#include "sysemu/reset.h" -#include "sysemu/runstate.h" +#include "system/system.h" +#include "system/cpus.h" +#include "system/numa.h" +#include "system/reset.h" +#include "system/runstate.h" #include "acpi-microvm.h" #include "microvm-dt.h" @@ -39,7 +39,7 @@ #include "hw/intc/i8259.h" #include "hw/timer/i8254.h" #include "hw/rtc/mc146818rtc.h" -#include "hw/char/serial.h" +#include "hw/char/serial-isa.h" #include "hw/display/ramfb.h" #include "hw/i386/topology.h" #include "hw/i386/e820_memory_layout.h" @@ -139,7 +139,7 @@ static void create_gpex(MicrovmMachineState *mms) mms->gpex.mmio64.base, mmio64_alias); } - for (i = 0; i < GPEX_NUM_IRQS; i++) { + for (i = 0; i < PCI_NUM_PINS; i++) { sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, x86ms->gsi[mms->gpex.irq + i]); } @@ -283,6 +283,7 @@ static void microvm_devices_init(MicrovmMachineState *mms) static void microvm_memory_init(MicrovmMachineState *mms) { + MicrovmMachineClass *mmc = MICROVM_MACHINE_GET_CLASS(mms); MachineState *machine = MACHINE(mms); X86MachineState *x86ms = X86_MACHINE(mms); MemoryRegion *ram_below_4g, *ram_above_4g; @@ -328,7 +329,7 @@ static void microvm_memory_init(MicrovmMachineState *mms) rom_set_fw(fw_cfg); if (machine->kernel_filename != NULL) { - x86_load_linux(x86ms, fw_cfg, 0, true); + mmc->x86_load_linux(x86ms, fw_cfg, 0, true); } if (mms->option_roms) { @@ -450,11 +451,44 @@ static HotplugHandler *microvm_get_hotplug_handler(MachineState *machine, return NULL; } +static void microvm_machine_done(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + machine_done); + X86MachineState *x86ms = X86_MACHINE(mms); + + acpi_setup_microvm(mms); + dt_setup_microvm(mms); + fw_cfg_add_e820(x86ms->fw_cfg); +} + +static void microvm_powerdown_req(Notifier *notifier, void *data) +{ + MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, + powerdown_req); + X86MachineState *x86ms = X86_MACHINE(mms); + + if (x86ms->acpi_dev) { + Object *obj = OBJECT(x86ms->acpi_dev); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj); + adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev), + ACPI_POWER_DOWN_STATUS); + } +} + static void microvm_machine_state_init(MachineState *machine) { MicrovmMachineState *mms = MICROVM_MACHINE(machine); X86MachineState *x86ms = X86_MACHINE(machine); + /* State */ + mms->kernel_cmdline_fixed = false; + + mms->machine_done.notify = microvm_machine_done; + qemu_add_machine_init_done_notifier(&mms->machine_done); + mms->powerdown_req.notify = microvm_powerdown_req; + qemu_register_powerdown_notifier(&mms->powerdown_req); + microvm_memory_init(mms); x86_cpus_init(x86ms, CPU_VERSION_LATEST); @@ -462,7 +496,7 @@ static void microvm_machine_state_init(MachineState *machine) microvm_devices_init(mms); } -static void microvm_machine_reset(MachineState *machine, ShutdownCause reason) +static void microvm_machine_reset(MachineState *machine, ResetType type) { MicrovmMachineState *mms = MICROVM_MACHINE(machine); CPUState *cs; @@ -475,7 +509,7 @@ static void microvm_machine_reset(MachineState *machine, ShutdownCause reason) mms->kernel_cmdline_fixed = true; } - qemu_devices_reset(reason); + qemu_devices_reset(type); CPU_FOREACH(cs) { cpu = X86_CPU(cs); @@ -580,31 +614,6 @@ static void microvm_machine_set_auto_kernel_cmdline(Object *obj, bool value, mms->auto_kernel_cmdline = value; } -static void microvm_machine_done(Notifier *notifier, void *data) -{ - MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, - machine_done); - X86MachineState *x86ms = X86_MACHINE(mms); - - acpi_setup_microvm(mms); - dt_setup_microvm(mms); - fw_cfg_add_e820(x86ms->fw_cfg); -} - -static void microvm_powerdown_req(Notifier *notifier, void *data) -{ - MicrovmMachineState *mms = container_of(notifier, MicrovmMachineState, - powerdown_req); - X86MachineState *x86ms = X86_MACHINE(mms); - - if (x86ms->acpi_dev) { - Object *obj = OBJECT(x86ms->acpi_dev); - AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj); - adevc->send_event(ACPI_DEVICE_IF(x86ms->acpi_dev), - ACPI_POWER_DOWN_STATUS); - } -} - static void microvm_machine_initfn(Object *obj) { MicrovmMachineState *mms = MICROVM_MACHINE(obj); @@ -616,14 +625,6 @@ static void microvm_machine_initfn(Object *obj) mms->isa_serial = true; mms->option_roms = true; mms->auto_kernel_cmdline = true; - - /* State */ - mms->kernel_cmdline_fixed = false; - - mms->machine_done.notify = microvm_machine_done; - qemu_add_machine_init_done_notifier(&mms->machine_done); - mms->powerdown_req.notify = microvm_powerdown_req; - qemu_register_powerdown_notifier(&mms->powerdown_req); } GlobalProperty microvm_properties[] = { @@ -634,12 +635,15 @@ GlobalProperty microvm_properties[] = { { "pcie-root-port", "io-reserve", "0" }, }; -static void microvm_class_init(ObjectClass *oc, void *data) +static void microvm_class_init(ObjectClass *oc, const void *data) { X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); + MicrovmMachineClass *mmc = MICROVM_MACHINE_CLASS(oc); MachineClass *mc = MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + mmc->x86_load_linux = x86_load_linux; + mc->init = microvm_machine_state_init; mc->family = "microvm_i386"; @@ -722,7 +726,7 @@ static const TypeInfo microvm_machine_info = { .instance_init = microvm_machine_initfn, .class_size = sizeof(MicrovmMachineClass), .class_init = microvm_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, { } }, diff --git a/hw/i386/monitor.c b/hw/i386/monitor.c index 1ebd356..79df965 100644 --- a/hw/i386/monitor.c +++ b/hw/i386/monitor.c @@ -24,9 +24,9 @@ #include "qemu/osdep.h" #include "monitor/monitor.h" -#include "qapi/qmp/qdict.h" +#include "qobject/qdict.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qapi-commands-misc-i386.h" #include "hw/i386/x86.h" #include "hw/rtc/mc146818rtc.h" diff --git a/hw/i386/multiboot.c b/hw/i386/multiboot.c index 3332712..6e6b96b 100644 --- a/hw/i386/multiboot.c +++ b/hw/i386/multiboot.c @@ -29,7 +29,8 @@ #include "multiboot.h" #include "hw/loader.h" #include "elf.h" -#include "sysemu/sysemu.h" +#include "exec/target_page.h" +#include "system/system.h" #include "qemu/error-report.h" /* Show multiboot debug output */ @@ -133,9 +134,9 @@ static void mb_add_mod(MultibootState *s, p = (char *)s->mb_buf + s->offset_mbinfo + MB_MOD_SIZE * s->mb_mods_count; - stl_p(p + MB_MOD_START, start); - stl_p(p + MB_MOD_END, end); - stl_p(p + MB_MOD_CMDLINE, cmdline_phys); + stl_le_p(p + MB_MOD_START, start); + stl_le_p(p + MB_MOD_END, end); + stl_le_p(p + MB_MOD_CMDLINE, cmdline_phys); mb_debug("mod%02d: "HWADDR_FMT_plx" - "HWADDR_FMT_plx, s->mb_mods_count, start, end); @@ -168,9 +169,9 @@ int load_multiboot(X86MachineState *x86ms, /* Ok, let's see if it is a multiboot image. The header is 12x32bit long, so the latest entry may be 8192 - 48. */ for (i = 0; i < (8192 - 48); i += 4) { - if (ldl_p(header+i) == 0x1BADB002) { - uint32_t checksum = ldl_p(header+i+8); - flags = ldl_p(header+i+4); + if (ldl_le_p(header + i) == 0x1BADB002) { + uint32_t checksum = ldl_le_p(header + i + 8); + flags = ldl_le_p(header + i + 4); checksum += flags; checksum += (uint32_t)0x1BADB002; if (!checksum) { @@ -202,8 +203,8 @@ int load_multiboot(X86MachineState *x86ms, } kernel_size = load_elf(kernel_filename, NULL, NULL, NULL, &elf_entry, - &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, - 0, 0); + &elf_low, &elf_high, NULL, + ELFDATA2LSB, I386_ELF_MACHINE, 0, 0); if (kernel_size < 0) { error_report("Error while loading elf kernel"); exit(1); @@ -223,11 +224,11 @@ int load_multiboot(X86MachineState *x86ms, mb_kernel_size, (size_t)mh_entry_addr); } else { /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_ADDR. */ - uint32_t mh_header_addr = ldl_p(header+i+12); - uint32_t mh_load_end_addr = ldl_p(header+i+20); - uint32_t mh_bss_end_addr = ldl_p(header+i+24); + uint32_t mh_header_addr = ldl_le_p(header + i + 12); + uint32_t mh_load_end_addr = ldl_le_p(header + i + 20); + uint32_t mh_bss_end_addr = ldl_le_p(header + i + 24); - mh_load_addr = ldl_p(header+i+16); + mh_load_addr = ldl_le_p(header + i + 16); if (mh_header_addr < mh_load_addr) { error_report("invalid load_addr address"); exit(1); @@ -239,7 +240,7 @@ int load_multiboot(X86MachineState *x86ms, uint32_t mb_kernel_text_offset = i - (mh_header_addr - mh_load_addr); uint32_t mb_load_size = 0; - mh_entry_addr = ldl_p(header+i+28); + mh_entry_addr = ldl_le_p(header + i + 28); if (mh_load_end_addr) { if (mh_load_end_addr < mh_load_addr) { @@ -364,22 +365,21 @@ int load_multiboot(X86MachineState *x86ms, /* Commandline support */ kcmdline = g_strdup_printf("%s %s", kernel_filename, kernel_cmdline); - stl_p(bootinfo + MBI_CMDLINE, mb_add_cmdline(&mbs, kcmdline)); - - stl_p(bootinfo + MBI_BOOTLOADER, mb_add_bootloader(&mbs, bootloader_name)); - - stl_p(bootinfo + MBI_MODS_ADDR, mbs.mb_buf_phys + mbs.offset_mbinfo); - stl_p(bootinfo + MBI_MODS_COUNT, mbs.mb_mods_count); /* mods_count */ + stl_le_p(bootinfo + MBI_CMDLINE, mb_add_cmdline(&mbs, kcmdline)); + stl_le_p(bootinfo + MBI_BOOTLOADER, mb_add_bootloader(&mbs, + bootloader_name)); + stl_le_p(bootinfo + MBI_MODS_ADDR, mbs.mb_buf_phys + mbs.offset_mbinfo); + stl_le_p(bootinfo + MBI_MODS_COUNT, mbs.mb_mods_count); /* mods_count */ /* the kernel is where we want it to be now */ - stl_p(bootinfo + MBI_FLAGS, MULTIBOOT_FLAGS_MEMORY + stl_le_p(bootinfo + MBI_FLAGS, MULTIBOOT_FLAGS_MEMORY | MULTIBOOT_FLAGS_BOOT_DEVICE | MULTIBOOT_FLAGS_CMDLINE | MULTIBOOT_FLAGS_MODULES | MULTIBOOT_FLAGS_MMAP | MULTIBOOT_FLAGS_BOOTLOADER); - stl_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? */ - stl_p(bootinfo + MBI_MMAP_ADDR, ADDR_E820_MAP); + stl_le_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? */ + stl_le_p(bootinfo + MBI_MMAP_ADDR, ADDR_E820_MAP); mb_debug("multiboot: entry_addr = %#x", mh_entry_addr); mb_debug(" mb_buf_phys = "HWADDR_FMT_plx, mbs.mb_buf_phys); diff --git a/hw/i386/nitro_enclave.c b/hw/i386/nitro_enclave.c new file mode 100644 index 0000000..5ee50f3 --- /dev/null +++ b/hw/i386/nitro_enclave.c @@ -0,0 +1,353 @@ +/* + * AWS nitro-enclave machine + * + * Copyright (c) 2024 Dorjoy Chowdhury <dorjoychy111@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "qom/object_interfaces.h" + +#include "chardev/char.h" +#include "hw/sysbus.h" +#include "hw/core/eif.h" +#include "hw/i386/x86.h" +#include "hw/i386/microvm.h" +#include "hw/i386/nitro_enclave.h" +#include "hw/virtio/virtio-mmio.h" +#include "hw/virtio/virtio-nsm.h" +#include "hw/virtio/vhost-user-vsock.h" +#include "system/hostmem.h" + +static BusState *find_free_virtio_mmio_bus(void) +{ + BusChild *kid; + BusState *bus = sysbus_get_default(); + + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MMIO)) { + VirtIOMMIOProxy *mmio = VIRTIO_MMIO(OBJECT(dev)); + VirtioBusState *mmio_virtio_bus = &mmio->bus; + BusState *mmio_bus = &mmio_virtio_bus->parent_obj; + if (QTAILQ_EMPTY(&mmio_bus->children)) { + return mmio_bus; + } + } + } + + return NULL; +} + +static void vhost_user_vsock_init(NitroEnclaveMachineState *nems) +{ + DeviceState *dev = qdev_new(TYPE_VHOST_USER_VSOCK); + VHostUserVSock *vsock = VHOST_USER_VSOCK(dev); + BusState *bus; + + if (!nems->vsock) { + error_report("A valid chardev id for vhost-user-vsock device must be " + "provided using the 'vsock' machine option"); + exit(1); + } + + bus = find_free_virtio_mmio_bus(); + if (!bus) { + error_report("Failed to find bus for vhost-user-vsock device"); + exit(1); + } + + Chardev *chardev = qemu_chr_find(nems->vsock); + if (!chardev) { + error_report("Failed to find chardev with id %s", nems->vsock); + exit(1); + } + + vsock->conf.chardev.chr = chardev; + + qdev_realize_and_unref(dev, bus, &error_fatal); +} + +static void virtio_nsm_init(NitroEnclaveMachineState *nems) +{ + DeviceState *dev = qdev_new(TYPE_VIRTIO_NSM); + VirtIONSM *vnsm = VIRTIO_NSM(dev); + BusState *bus = find_free_virtio_mmio_bus(); + + if (!bus) { + error_report("Failed to find bus for virtio-nsm device."); + exit(1); + } + + qdev_prop_set_string(dev, "module-id", nems->id); + + qdev_realize_and_unref(dev, bus, &error_fatal); + nems->vnsm = vnsm; +} + +static void nitro_enclave_devices_init(NitroEnclaveMachineState *nems) +{ + vhost_user_vsock_init(nems); + virtio_nsm_init(nems); +} + +static void nitro_enclave_machine_state_init(MachineState *machine) +{ + NitroEnclaveMachineClass *ne_class = + NITRO_ENCLAVE_MACHINE_GET_CLASS(machine); + NitroEnclaveMachineState *ne_state = NITRO_ENCLAVE_MACHINE(machine); + + ne_class->parent_init(machine); + nitro_enclave_devices_init(ne_state); +} + +static void nitro_enclave_machine_reset(MachineState *machine, ResetType type) +{ + NitroEnclaveMachineClass *ne_class = + NITRO_ENCLAVE_MACHINE_GET_CLASS(machine); + NitroEnclaveMachineState *ne_state = NITRO_ENCLAVE_MACHINE(machine); + + ne_class->parent_reset(machine, type); + + memset(ne_state->vnsm->pcrs, 0, sizeof(ne_state->vnsm->pcrs)); + + /* PCR0 */ + ne_state->vnsm->extend_pcr(ne_state->vnsm, 0, ne_state->image_hash, + QCRYPTO_HASH_DIGEST_LEN_SHA384); + /* PCR1 */ + ne_state->vnsm->extend_pcr(ne_state->vnsm, 1, ne_state->bootstrap_hash, + QCRYPTO_HASH_DIGEST_LEN_SHA384); + /* PCR2 */ + ne_state->vnsm->extend_pcr(ne_state->vnsm, 2, ne_state->app_hash, + QCRYPTO_HASH_DIGEST_LEN_SHA384); + /* PCR3 */ + if (ne_state->parent_role) { + ne_state->vnsm->extend_pcr(ne_state->vnsm, 3, + (uint8_t *) ne_state->parent_role, + strlen(ne_state->parent_role)); + } + /* PCR4 */ + if (ne_state->parent_id) { + ne_state->vnsm->extend_pcr(ne_state->vnsm, 4, + (uint8_t *) ne_state->parent_id, + strlen(ne_state->parent_id)); + } + /* PCR8 */ + if (ne_state->signature_found) { + ne_state->vnsm->extend_pcr(ne_state->vnsm, 8, + ne_state->fingerprint_hash, + QCRYPTO_HASH_DIGEST_LEN_SHA384); + } + + /* First 16 PCRs are locked from boot and reserved for nitro enclave */ + for (int i = 0; i < 16; ++i) { + ne_state->vnsm->lock_pcr(ne_state->vnsm, i); + } +} + +static void nitro_enclave_machine_initfn(Object *obj) +{ + MicrovmMachineState *mms = MICROVM_MACHINE(obj); + X86MachineState *x86ms = X86_MACHINE(obj); + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + nems->id = g_strdup("i-234-enc5678"); + + /* AWS nitro enclaves have PCIE and ACPI disabled */ + mms->pcie = ON_OFF_AUTO_OFF; + x86ms->acpi = ON_OFF_AUTO_OFF; +} + +static void x86_load_eif(X86MachineState *x86ms, FWCfgState *fw_cfg, + int acpi_data_size, bool pvh_enabled) +{ + Error *err = NULL; + char *eif_kernel, *eif_initrd, *eif_cmdline; + MachineState *machine = MACHINE(x86ms); + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(x86ms); + + if (!read_eif_file(machine->kernel_filename, machine->initrd_filename, + &eif_kernel, &eif_initrd, &eif_cmdline, + nems->image_hash, nems->bootstrap_hash, + nems->app_hash, nems->fingerprint_hash, + &(nems->signature_found), &err)) { + error_report_err(err); + exit(1); + } + + g_free(machine->kernel_filename); + machine->kernel_filename = eif_kernel; + g_free(machine->initrd_filename); + machine->initrd_filename = eif_initrd; + + /* + * If kernel cmdline argument was provided, let's concatenate it to the + * extracted EIF kernel cmdline. + */ + if (machine->kernel_cmdline != NULL) { + char *cmd = g_strdup_printf("%s %s", eif_cmdline, + machine->kernel_cmdline); + g_free(eif_cmdline); + g_free(machine->kernel_cmdline); + machine->kernel_cmdline = cmd; + } else { + machine->kernel_cmdline = eif_cmdline; + } + + x86_load_linux(x86ms, fw_cfg, 0, true); + + unlink(machine->kernel_filename); + unlink(machine->initrd_filename); +} + +static bool create_memfd_backend(MachineState *ms, const char *path, + Error **errp) +{ + Object *obj; + MachineClass *mc = MACHINE_GET_CLASS(ms); + bool r = false; + + obj = object_new(TYPE_MEMORY_BACKEND_MEMFD); + if (!object_property_set_int(obj, "size", ms->ram_size, errp)) { + goto out; + } + object_property_add_child(object_get_objects_root(), mc->default_ram_id, + obj); + + if (!user_creatable_complete(USER_CREATABLE(obj), errp)) { + goto out; + } + r = object_property_set_link(OBJECT(ms), "memory-backend", obj, errp); + +out: + object_unref(obj); + return r; +} + +static char *nitro_enclave_get_vsock_chardev_id(Object *obj, Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + return g_strdup(nems->vsock); +} + +static void nitro_enclave_set_vsock_chardev_id(Object *obj, const char *value, + Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + g_free(nems->vsock); + nems->vsock = g_strdup(value); +} + +static char *nitro_enclave_get_id(Object *obj, Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + return g_strdup(nems->id); +} + +static void nitro_enclave_set_id(Object *obj, const char *value, + Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + g_free(nems->id); + nems->id = g_strdup(value); +} + +static char *nitro_enclave_get_parent_role(Object *obj, Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + return g_strdup(nems->parent_role); +} + +static void nitro_enclave_set_parent_role(Object *obj, const char *value, + Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + g_free(nems->parent_role); + nems->parent_role = g_strdup(value); +} + +static char *nitro_enclave_get_parent_id(Object *obj, Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + return g_strdup(nems->parent_id); +} + +static void nitro_enclave_set_parent_id(Object *obj, const char *value, + Error **errp) +{ + NitroEnclaveMachineState *nems = NITRO_ENCLAVE_MACHINE(obj); + + g_free(nems->parent_id); + nems->parent_id = g_strdup(value); +} + +static void nitro_enclave_class_init(ObjectClass *oc, const void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + MicrovmMachineClass *mmc = MICROVM_MACHINE_CLASS(oc); + NitroEnclaveMachineClass *nemc = NITRO_ENCLAVE_MACHINE_CLASS(oc); + + mmc->x86_load_linux = x86_load_eif; + + mc->family = "nitro_enclave_i386"; + mc->desc = "AWS Nitro Enclave"; + + nemc->parent_init = mc->init; + mc->init = nitro_enclave_machine_state_init; + + nemc->parent_reset = mc->reset; + mc->reset = nitro_enclave_machine_reset; + + mc->create_default_memdev = create_memfd_backend; + + object_class_property_add_str(oc, NITRO_ENCLAVE_VSOCK_CHARDEV_ID, + nitro_enclave_get_vsock_chardev_id, + nitro_enclave_set_vsock_chardev_id); + object_class_property_set_description(oc, NITRO_ENCLAVE_VSOCK_CHARDEV_ID, + "Set chardev id for vhost-user-vsock " + "device"); + + object_class_property_add_str(oc, NITRO_ENCLAVE_ID, nitro_enclave_get_id, + nitro_enclave_set_id); + object_class_property_set_description(oc, NITRO_ENCLAVE_ID, + "Set enclave identifier"); + + object_class_property_add_str(oc, NITRO_ENCLAVE_PARENT_ROLE, + nitro_enclave_get_parent_role, + nitro_enclave_set_parent_role); + object_class_property_set_description(oc, NITRO_ENCLAVE_PARENT_ROLE, + "Set parent instance IAM role ARN"); + + object_class_property_add_str(oc, NITRO_ENCLAVE_PARENT_ID, + nitro_enclave_get_parent_id, + nitro_enclave_set_parent_id); + object_class_property_set_description(oc, NITRO_ENCLAVE_PARENT_ID, + "Set parent instance identifier"); +} + +static const TypeInfo nitro_enclave_machine_info = { + .name = TYPE_NITRO_ENCLAVE_MACHINE, + .parent = TYPE_MICROVM_MACHINE, + .instance_size = sizeof(NitroEnclaveMachineState), + .instance_init = nitro_enclave_machine_initfn, + .class_size = sizeof(NitroEnclaveMachineClass), + .class_init = nitro_enclave_class_init, +}; + +static void nitro_enclave_machine_init(void) +{ + type_register_static(&nitro_enclave_machine_info); +} +type_init(nitro_enclave_machine_init); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index c74931d..b211633 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -24,13 +24,14 @@ #include "qemu/osdep.h" #include "qemu/units.h" +#include "exec/target_page.h" #include "hw/i386/pc.h" -#include "hw/char/serial.h" +#include "hw/char/serial-isa.h" #include "hw/char/parallel.h" #include "hw/hyperv/hv-balloon.h" #include "hw/i386/fw_cfg.h" #include "hw/i386/vmport.h" -#include "sysemu/cpus.h" +#include "system/cpus.h" #include "hw/ide/ide-bus.h" #include "hw/timer/hpet.h" #include "hw/loader.h" @@ -39,12 +40,13 @@ #include "hw/timer/i8254.h" #include "hw/input/i8042.h" #include "hw/audio/pcspk.h" -#include "sysemu/sysemu.h" -#include "sysemu/xen.h" -#include "sysemu/reset.h" +#include "system/system.h" +#include "system/xen.h" +#include "system/reset.h" #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" #include "hw/xen/xen.h" -#include "qapi/qmp/qlist.h" +#include "qobject/qlist.h" #include "qemu/error-report.h" #include "hw/acpi/cpu_hotplug.h" #include "acpi-build.h" @@ -79,6 +81,20 @@ { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\ { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, }, +GlobalProperty pc_compat_10_0[] = {}; +const size_t pc_compat_10_0_len = G_N_ELEMENTS(pc_compat_10_0); + +GlobalProperty pc_compat_9_2[] = {}; +const size_t pc_compat_9_2_len = G_N_ELEMENTS(pc_compat_9_2); + +GlobalProperty pc_compat_9_1[] = { + { "ICH9-LPC", "x-smi-swsmi-timer", "off" }, + { "ICH9-LPC", "x-smi-periodic-timer", "off" }, + { TYPE_INTEL_IOMMU_DEVICE, "stale-tm", "on" }, + { TYPE_INTEL_IOMMU_DEVICE, "aw-bits", "39" }, +}; +const size_t pc_compat_9_1_len = G_N_ELEMENTS(pc_compat_9_1); + GlobalProperty pc_compat_9_0[] = { { TYPE_X86_CPU, "x-amd-topoext-features-only", "false" }, { TYPE_X86_CPU, "x-l1-cache-per-thread", "false" }, @@ -244,28 +260,6 @@ GlobalProperty pc_compat_2_6[] = { }; const size_t pc_compat_2_6_len = G_N_ELEMENTS(pc_compat_2_6); -GlobalProperty pc_compat_2_5[] = {}; -const size_t pc_compat_2_5_len = G_N_ELEMENTS(pc_compat_2_5); - -GlobalProperty pc_compat_2_4[] = { - PC_CPU_MODEL_IDS("2.4.0") - { "Haswell-" TYPE_X86_CPU, "abm", "off" }, - { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" }, - { "Broadwell-" TYPE_X86_CPU, "abm", "off" }, - { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" }, - { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" }, - { TYPE_X86_CPU, "check", "off" }, - { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" }, - { "qemu64" "-" TYPE_X86_CPU, "abm", "on" }, - { "qemu64" "-" TYPE_X86_CPU, "popcnt", "on" }, - { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" }, - { "Opteron_G2" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "on", } -}; -const size_t pc_compat_2_4_len = G_N_ELEMENTS(pc_compat_2_4); - /* * @PC_FW_DATA: * Size of the chunk of memory at the top of RAM for the BIOS ACPI tables @@ -453,7 +447,7 @@ static int check_fdc(Object *obj, void *opaque) } static const char * const fdc_container_path[] = { - "/unattached", "/peripheral", "/peripheral-anon" + "unattached", "peripheral", "peripheral-anon" }; /* @@ -467,7 +461,7 @@ static ISADevice *pc_find_fdc0(void) CheckFdcState state = { 0 }; for (i = 0; i < ARRAY_SIZE(fdc_container_path); i++) { - container = container_get(qdev_get_machine(), fdc_container_path[i]); + container = machine_get_container(fdc_container_path[i]); object_child_foreach(container, check_fdc, &state); } @@ -621,7 +615,8 @@ void pc_machine_done(Notifier *notifier, void *data) /* set the number of CPUs */ x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus); - fw_cfg_add_extra_pci_roots(pcms->pcibus, x86ms->fw_cfg); + pci_bus_add_fw_cfg_extra_pci_roots(x86ms->fw_cfg, pcms->pcibus, + &error_abort); acpi_setup(); if (x86ms->fw_cfg) { @@ -960,21 +955,23 @@ void pc_memory_init(PCMachineState *pcms, /* Initialize PC system firmware */ pc_system_firmware_init(pcms, rom_memory); - option_rom_mr = g_malloc(sizeof(*option_rom_mr)); - if (machine_require_guest_memfd(machine)) { - memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom", - PC_ROM_SIZE, &error_fatal); - } else { - memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, - &error_fatal); - if (pcmc->pci_enabled) { - memory_region_set_readonly(option_rom_mr, true); + if (!is_tdx_vm()) { + option_rom_mr = g_malloc(sizeof(*option_rom_mr)); + if (machine_require_guest_memfd(machine)) { + memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom", + PC_ROM_SIZE, &error_fatal); + } else { + memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); + if (pcmc->pci_enabled) { + memory_region_set_readonly(option_rom_mr, true); + } } + memory_region_add_subregion_overlap(rom_memory, + PC_ROM_MIN_VGA, + option_rom_mr, + 1); } - memory_region_add_subregion_overlap(rom_memory, - PC_ROM_MIN_VGA, - option_rom_mr, - 1); fw_cfg = fw_cfg_arch_create(machine, x86ms->boot_cpus, x86ms->apic_id_limit); @@ -983,14 +980,13 @@ void pc_memory_init(PCMachineState *pcms, if (machine->device_memory) { uint64_t *val = g_malloc(sizeof(*val)); - uint64_t res_mem_end = machine->device_memory->base; - - if (!pcmc->broken_reserved_end) { - res_mem_end += memory_region_size(&machine->device_memory->mr); - } + uint64_t res_mem_end; if (pcms->cxl_devices_state.is_enabled) { res_mem_end = cxl_resv_end; + } else { + res_mem_end = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); } *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); @@ -1028,9 +1024,7 @@ uint64_t pc_pci_hole64_start(void) hole64_start = pc_get_cxl_range_end(pcms); } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) { pc_get_device_memory_range(pcms, &hole64_start, &size); - if (!pcmc->broken_reserved_end) { - hole64_start += size; - } + hole64_start += size; } else { hole64_start = pc_above_4g_end(pcms); } @@ -1042,7 +1036,6 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) { DeviceState *dev = NULL; - rom_set_order_override(FW_CFG_ORDER_OVERRIDE_VGA); if (pci_bus) { PCIDevice *pcidev = pci_vga_init(pci_bus); dev = pcidev ? &pcidev->qdev : NULL; @@ -1050,14 +1043,14 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) ISADevice *isadev = isa_vga_init(isa_bus); dev = isadev ? DEVICE(isadev) : NULL; } - rom_reset_order_override(); + return dev; } static const MemoryRegionOps ioport80_io_ops = { .write = ioport80_write, .read = ioport80_read, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_LITTLE_ENDIAN, .impl = { .min_access_size = 1, .max_access_size = 1, @@ -1067,7 +1060,7 @@ static const MemoryRegionOps ioport80_io_ops = { static const MemoryRegionOps ioportF0_io_ops = { .write = ioportF0_write, .read = ioportF0_read, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_LITTLE_ENDIAN, .impl = { .min_access_size = 1, .max_access_size = 1, @@ -1075,7 +1068,7 @@ static const MemoryRegionOps ioportF0_io_ops = { }; static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, - bool create_i8042, bool no_vmport) + bool create_i8042, bool no_vmport, Error **errp) { int i; DriveInfo *fd[MAX_FD]; @@ -1100,6 +1093,10 @@ static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, } if (!create_i8042) { + if (!no_vmport) { + error_setg(errp, + "vmport requires the i8042 controller to be enabled"); + } return; } @@ -1217,9 +1214,17 @@ void pc_basic_device_init(struct PCMachineState *pcms, isa_realize_and_unref(pcms->pcspk, isa_bus, &error_fatal); } + if (pcms->vmport == ON_OFF_AUTO_AUTO) { + pcms->vmport = (xen_enabled() || !pcms->i8042_enabled) + ? ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON; + } + /* Super I/O */ pc_superio_init(isa_bus, create_fdctrl, pcms->i8042_enabled, - pcms->vmport != ON_OFF_AUTO_ON); + pcms->vmport != ON_OFF_AUTO_ON, &error_fatal); + + pcms->machine_done.notify = pc_machine_done; + qemu_add_machine_init_done_notifier(&pcms->machine_done); } void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) @@ -1228,16 +1233,14 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) bool default_is_ne2k = g_str_equal(mc->default_nic, TYPE_ISA_NE2000); NICInfo *nd; - rom_set_order_override(FW_CFG_ORDER_OVERRIDE_NIC); - while ((nd = qemu_find_nic_info(TYPE_ISA_NE2000, default_is_ne2k, NULL))) { pc_init_ne2k_isa(isa_bus, nd, &error_fatal); } /* Anything remaining should be a PCI NIC */ - pci_init_nic_devices(pci_bus, mc->default_nic); - - rom_reset_order_override(); + if (pci_bus) { + pci_init_nic_devices(pci_bus, mc->default_nic); + } } void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs) @@ -1678,7 +1681,7 @@ static void pc_machine_initfn(Object *obj) pcms->sata_enabled = true; pcms->i8042_enabled = true; pcms->max_fw_size = 8 * MiB; -#ifdef CONFIG_HPET +#if defined(CONFIG_HPET) pcms->hpet_enabled = true; #endif pcms->fd_bootchk = true; @@ -1691,17 +1694,14 @@ static void pc_machine_initfn(Object *obj) if (pcmc->pci_enabled) { cxl_machine_init(obj, &pcms->cxl_devices_state); } - - pcms->machine_done.notify = pc_machine_done; - qemu_add_machine_init_done_notifier(&pcms->machine_done); } -static void pc_machine_reset(MachineState *machine, ShutdownCause reason) +static void pc_machine_reset(MachineState *machine, ResetType type) { CPUState *cs; X86CPU *cpu; - qemu_devices_reset(reason); + qemu_devices_reset(type); /* Reset APIC after devices have been reset to cancel * any changes that qemu_devices_reset() might have done. @@ -1716,7 +1716,7 @@ static void pc_machine_reset(MachineState *machine, ShutdownCause reason) static void pc_machine_wakeup(MachineState *machine) { cpu_synchronize_all_states(); - pc_machine_reset(machine, SHUTDOWN_CAUSE_NONE); + pc_machine_reset(machine, RESET_TYPE_WAKEUP); cpu_synchronize_all_post_reset(); } @@ -1739,7 +1739,7 @@ static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp) return true; } -static void pc_machine_class_init(ObjectClass *oc, void *data) +static void pc_machine_class_init(ObjectClass *oc, const void *data) { MachineClass *mc = MACHINE_CLASS(oc); X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); @@ -1775,6 +1775,10 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) mc->nvdimm_supported = true; mc->smp_props.dies_supported = true; mc->smp_props.modules_supported = true; + mc->smp_props.cache_supported[CACHE_LEVEL_AND_TYPE_L1D] = true; + mc->smp_props.cache_supported[CACHE_LEVEL_AND_TYPE_L1I] = true; + mc->smp_props.cache_supported[CACHE_LEVEL_AND_TYPE_L2] = true; + mc->smp_props.cache_supported[CACHE_LEVEL_AND_TYPE_L3] = true; mc->default_ram_id = "pc.ram"; pcmc->default_smbios_ep_type = SMBIOS_ENTRY_POINT_TYPE_AUTO; @@ -1807,6 +1811,8 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) object_class_property_add_bool(oc, PC_MACHINE_I8042, pc_machine_get_i8042, pc_machine_set_i8042); + object_class_property_set_description(oc, PC_MACHINE_I8042, + "Enable/disable Intel 8042 PS/2 controller emulation"); object_class_property_add_bool(oc, "default-bus-bypass-iommu", pc_machine_get_default_bus_bypass_iommu, @@ -1837,7 +1843,7 @@ static const TypeInfo pc_machine_info = { .instance_init = pc_machine_initfn, .class_size = sizeof(PCMachineClass), .class_init = pc_machine_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, { } }, diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9445b07..ea7572e 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -43,15 +43,15 @@ #include "hw/ide/isa.h" #include "hw/ide/pci.h" #include "hw/irq.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" #include "hw/i386/kvm/clock.h" #include "hw/sysbus.h" #include "hw/i2c/smbus_eeprom.h" -#include "exec/memory.h" +#include "system/memory.h" #include "hw/acpi/acpi.h" #include "qapi/error.h" #include "qemu/error-report.h" -#include "sysemu/xen.h" +#include "system/xen.h" #ifdef CONFIG_XEN #include <xen/hvm/hvm_info_table.h> #include "hw/xen/xen_pt.h" @@ -61,10 +61,11 @@ #include "hw/xen/xen.h" #include "migration/global_state.h" #include "migration/misc.h" -#include "sysemu/runstate.h" -#include "sysemu/numa.h" +#include "system/runstate.h" +#include "system/numa.h" #include "hw/hyperv/vmbus-bridge.h" #include "hw/mem/nvdimm.h" +#include "hw/uefi/var-service-api.h" #include "hw/i386/acpi-build.h" #include "target/i386/cpu.h" @@ -284,6 +285,8 @@ static void pc_init1(MachineState *machine, const char *pci_type) pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); } else { + uint32_t irq; + isa_bus = isa_bus_new(NULL, system_memory, system_io, &error_abort); isa_bus_register_input_irqs(isa_bus, x86ms->gsi); @@ -291,6 +294,9 @@ static void pc_init1(MachineState *machine, const char *pci_type) x86ms->rtc = isa_new(TYPE_MC146818_RTC); qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); + irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", + &error_fatal); + isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); i8257_dma_init(OBJECT(machine), isa_bus, 0); pcms->hpet_enabled = false; @@ -310,11 +316,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) pc_vga_init(isa_bus, pcmc->pci_enabled ? pcms->pcibus : NULL); - assert(pcms->vmport != ON_OFF_AUTO__MAX); - if (pcms->vmport == ON_OFF_AUTO_AUTO) { - pcms->vmport = xen_enabled() ? ON_OFF_AUTO_OFF : ON_OFF_AUTO_ON; - } - /* init basic PC hardware */ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, !MACHINE_CLASS(pcmc)->no_floppy, 0x4); @@ -451,7 +452,10 @@ static void pc_i440fx_init(MachineState *machine) } #define DEFINE_I440FX_MACHINE(major, minor) \ - DEFINE_PC_VER_MACHINE(pc_i440fx, "pc-i440fx", pc_i440fx_init, major, minor); + DEFINE_PC_VER_MACHINE(pc_i440fx, "pc-i440fx", pc_i440fx_init, false, NULL, major, minor); + +#define DEFINE_I440FX_MACHINE_AS_LATEST(major, minor) \ + DEFINE_PC_VER_MACHINE(pc_i440fx, "pc-i440fx", pc_i440fx_init, true, "pc", major, minor); static void pc_i440fx_machine_options(MachineClass *m) { @@ -470,6 +474,7 @@ static void pc_i440fx_machine_options(MachineClass *m) m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_UEFI_VARS_X64); object_class_property_add_enum(oc, "x-south-bridge", "PCSouthBridgeOption", &PCSouthBridgeOption_lookup, @@ -479,11 +484,36 @@ static void pc_i440fx_machine_options(MachineClass *m) "Use a different south bridge than PIIX3"); } -static void pc_i440fx_machine_9_1_options(MachineClass *m) +static void pc_i440fx_machine_10_1_options(MachineClass *m) { pc_i440fx_machine_options(m); - m->alias = "pc"; - m->is_default = true; +} + +DEFINE_I440FX_MACHINE_AS_LATEST(10, 1); + +static void pc_i440fx_machine_10_0_options(MachineClass *m) +{ + pc_i440fx_machine_10_1_options(m); + compat_props_add(m->compat_props, hw_compat_10_0, hw_compat_10_0_len); + compat_props_add(m->compat_props, pc_compat_10_0, pc_compat_10_0_len); +} + +DEFINE_I440FX_MACHINE(10, 0); + +static void pc_i440fx_machine_9_2_options(MachineClass *m) +{ + pc_i440fx_machine_10_0_options(m); + compat_props_add(m->compat_props, hw_compat_9_2, hw_compat_9_2_len); + compat_props_add(m->compat_props, pc_compat_9_2, pc_compat_9_2_len); +} + +DEFINE_I440FX_MACHINE(9, 2); + +static void pc_i440fx_machine_9_1_options(MachineClass *m) +{ + pc_i440fx_machine_9_2_options(m); + compat_props_add(m->compat_props, hw_compat_9_1, hw_compat_9_1_len); + compat_props_add(m->compat_props, pc_compat_9_1, pc_compat_9_1_len); } DEFINE_I440FX_MACHINE(9, 1); @@ -493,8 +523,7 @@ static void pc_i440fx_machine_9_0_options(MachineClass *m) PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_i440fx_machine_9_1_options(m); - m->alias = NULL; - m->is_default = false; + m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len); @@ -754,32 +783,6 @@ static void pc_i440fx_machine_2_6_options(MachineClass *m) DEFINE_I440FX_MACHINE(2, 6); -static void pc_i440fx_machine_2_5_options(MachineClass *m) -{ - X86MachineClass *x86mc = X86_MACHINE_CLASS(m); - - pc_i440fx_machine_2_6_options(m); - x86mc->save_tsc_khz = false; - m->legacy_fw_cfg_order = 1; - compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); - compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); -} - -DEFINE_I440FX_MACHINE(2, 5); - -static void pc_i440fx_machine_2_4_options(MachineClass *m) -{ - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - - pc_i440fx_machine_2_5_options(m); - m->hw_version = "2.4.0"; - pcmc->broken_reserved_end = true; - compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); - compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); -} - -DEFINE_I440FX_MACHINE(2, 4); - #ifdef CONFIG_ISAPC static void isapc_machine_options(MachineClass *m) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 71d3c6d..33211b1 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -35,8 +35,8 @@ #include "hw/loader.h" #include "hw/i2c/smbus_eeprom.h" #include "hw/rtc/mc146818rtc.h" -#include "sysemu/tcg.h" -#include "sysemu/kvm.h" +#include "system/tcg.h" +#include "system/kvm.h" #include "hw/i386/kvm/clock.h" #include "hw/pci-host/q35.h" #include "hw/pci/pcie_port.h" @@ -55,9 +55,10 @@ #include "hw/usb/hcd-uhci.h" #include "qapi/error.h" #include "qemu/error-report.h" -#include "sysemu/numa.h" +#include "system/numa.h" #include "hw/hyperv/vmbus-bridge.h" #include "hw/mem/nvdimm.h" +#include "hw/uefi/var-service-api.h" #include "hw/i386/acpi-build.h" #include "target/i386/cpu.h" @@ -276,11 +277,6 @@ static void pc_q35_init(MachineState *machine) x86_register_ferr_irq(x86ms->gsi[13]); } - assert(pcms->vmport != ON_OFF_AUTO__MAX); - if (pcms->vmport == ON_OFF_AUTO_AUTO) { - pcms->vmport = ON_OFF_AUTO_ON; - } - /* init basic PC hardware */ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, !mc->no_floppy, 0xff0104); @@ -332,10 +328,13 @@ static void pc_q35_init(MachineState *machine) } #define DEFINE_Q35_MACHINE(major, minor) \ - DEFINE_PC_VER_MACHINE(pc_q35, "pc-q35", pc_q35_init, major, minor); + DEFINE_PC_VER_MACHINE(pc_q35, "pc-q35", pc_q35_init, false, NULL, major, minor); + +#define DEFINE_Q35_MACHINE_AS_LATEST(major, minor) \ + DEFINE_PC_VER_MACHINE(pc_q35, "pc-q35", pc_q35_init, false, "q35", major, minor); #define DEFINE_Q35_MACHINE_BUGFIX(major, minor, micro) \ - DEFINE_PC_VER_MACHINE(pc_q35, "pc-q35", pc_q35_init, major, minor, micro); + DEFINE_PC_VER_MACHINE(pc_q35, "pc-q35", pc_q35_init, false, NULL, major, minor, micro); static void pc_q35_machine_options(MachineClass *m) { @@ -357,14 +356,41 @@ static void pc_q35_machine_options(MachineClass *m) machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_RAMFB_DEVICE); machine_class_allow_dynamic_sysbus_dev(m, TYPE_VMBUS_BRIDGE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_UEFI_VARS_X64); compat_props_add(m->compat_props, pc_q35_compat_defaults, pc_q35_compat_defaults_len); } -static void pc_q35_machine_9_1_options(MachineClass *m) +static void pc_q35_machine_10_1_options(MachineClass *m) { pc_q35_machine_options(m); - m->alias = "q35"; +} + +DEFINE_Q35_MACHINE_AS_LATEST(10, 1); + +static void pc_q35_machine_10_0_options(MachineClass *m) +{ + pc_q35_machine_10_1_options(m); + compat_props_add(m->compat_props, hw_compat_10_0, hw_compat_10_0_len); + compat_props_add(m->compat_props, pc_compat_10_0, pc_compat_10_0_len); +} + +DEFINE_Q35_MACHINE(10, 0); + +static void pc_q35_machine_9_2_options(MachineClass *m) +{ + pc_q35_machine_10_0_options(m); + compat_props_add(m->compat_props, hw_compat_9_2, hw_compat_9_2_len); + compat_props_add(m->compat_props, pc_compat_9_2, pc_compat_9_2_len); +} + +DEFINE_Q35_MACHINE(9, 2); + +static void pc_q35_machine_9_1_options(MachineClass *m) +{ + pc_q35_machine_9_2_options(m); + compat_props_add(m->compat_props, hw_compat_9_1, hw_compat_9_1_len); + compat_props_add(m->compat_props, pc_compat_9_1, pc_compat_9_1_len); } DEFINE_Q35_MACHINE(9, 1); @@ -373,7 +399,7 @@ static void pc_q35_machine_9_0_options(MachineClass *m) { PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_q35_machine_9_1_options(m); - m->alias = NULL; + m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len); pcmc->isa_bios_alias = false; @@ -646,29 +672,3 @@ static void pc_q35_machine_2_6_options(MachineClass *m) } DEFINE_Q35_MACHINE(2, 6); - -static void pc_q35_machine_2_5_options(MachineClass *m) -{ - X86MachineClass *x86mc = X86_MACHINE_CLASS(m); - - pc_q35_machine_2_6_options(m); - x86mc->save_tsc_khz = false; - m->legacy_fw_cfg_order = 1; - compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); - compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); -} - -DEFINE_Q35_MACHINE(2, 5); - -static void pc_q35_machine_2_4_options(MachineClass *m) -{ - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - - pc_q35_machine_2_5_options(m); - m->hw_version = "2.4.0"; - pcmc->broken_reserved_end = true; - compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); - compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); -} - -DEFINE_Q35_MACHINE(2, 4); diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index ef80281..821396c 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -25,7 +25,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" -#include "sysemu/block-backend.h" +#include "system/block-backend.h" #include "qemu/error-report.h" #include "qemu/option.h" #include "qemu/units.h" @@ -35,8 +35,9 @@ #include "hw/loader.h" #include "hw/qdev-properties.h" #include "hw/block/flash.h" -#include "sysemu/kvm.h" -#include "sev.h" +#include "system/kvm.h" +#include "target/i386/sev.h" +#include "kvm/tdx.h" #define FLASH_SECTOR_SIZE 4096 @@ -280,5 +281,11 @@ void x86_firmware_configure(hwaddr gpa, void *ptr, int size) } sev_encrypt_flash(gpa, ptr, size, &error_fatal); + } else if (is_tdx_vm()) { + ret = tdx_parse_tdvf(ptr, size); + if (ret) { + error_report("failed to parse TDVF for TDX VM"); + exit(1); + } } } diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c index 07a4c26..da947c3 100644 --- a/hw/i386/pc_sysfw_ovmf.c +++ b/hw/i386/pc_sysfw_ovmf.c @@ -26,6 +26,7 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "hw/i386/pc.h" +#include "exec/target_page.h" #include "cpu.h" #define OVMF_TABLE_FOOTER_GUID "96b582de-1fb2-45f7-baea-a366c55a082d" diff --git a/hw/i386/port92.c b/hw/i386/port92.c index b25157f..39b6f31 100644 --- a/hw/i386/port92.c +++ b/hw/i386/port92.c @@ -7,7 +7,7 @@ */ #include "qemu/osdep.h" -#include "sysemu/runstate.h" +#include "system/runstate.h" #include "migration/vmstate.h" #include "hw/irq.h" #include "hw/isa/isa.h" @@ -97,12 +97,12 @@ static void port92_realizefn(DeviceState *dev, Error **errp) isa_register_ioport(isadev, &s->io, 0x92); } -static void port92_class_initfn(ObjectClass *klass, void *data) +static void port92_class_initfn(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = port92_realizefn; - dc->reset = port92_reset; + device_class_set_legacy_reset(dc, port92_reset); dc->vmsd = &vmstate_port92_isa; /* * Reason: unlike ordinary ISA devices, this one needs additional diff --git a/hw/i386/sgx-epc.c b/hw/i386/sgx-epc.c index d664829..2b3b282 100644 --- a/hw/i386/sgx-epc.c +++ b/hw/i386/sgx-epc.c @@ -17,14 +17,13 @@ #include "qapi/error.h" #include "qapi/visitor.h" #include "target/i386/cpu.h" -#include "exec/address-spaces.h" +#include "system/address-spaces.h" -static Property sgx_epc_properties[] = { +static const Property sgx_epc_properties[] = { DEFINE_PROP_UINT64(SGX_EPC_ADDR_PROP, SGXEPCDevice, addr, 0), DEFINE_PROP_UINT32(SGX_EPC_NUMA_NODE_PROP, SGXEPCDevice, node, 0), DEFINE_PROP_LINK(SGX_EPC_MEMDEV_PROP, SGXEPCDevice, hostmem, TYPE_MEMORY_BACKEND_EPC, HostMemoryBackendEpc *), - DEFINE_PROP_END_OF_LIST(), }; static void sgx_epc_get_size(Object *obj, Visitor *v, const char *name, @@ -148,7 +147,7 @@ static void sgx_epc_md_fill_device_info(const MemoryDeviceState *md, info->type = MEMORY_DEVICE_INFO_KIND_SGX_EPC; } -static void sgx_epc_class_init(ObjectClass *oc, void *data) +static void sgx_epc_class_init(ObjectClass *oc, const void *data) { DeviceClass *dc = DEVICE_CLASS(oc); MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(oc); @@ -174,7 +173,7 @@ static const TypeInfo sgx_epc_info = { .instance_init = sgx_epc_init, .class_init = sgx_epc_class_init, .class_size = sizeof(DeviceClass), - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { TYPE_MEMORY_DEVICE }, { } }, diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c index 16b1dfd..d295e54 100644 --- a/hw/i386/sgx-stub.c +++ b/hw/i386/sgx-stub.c @@ -3,20 +3,20 @@ #include "monitor/hmp-target.h" #include "hw/i386/pc.h" #include "hw/i386/sgx-epc.h" +#include "qapi/qapi-commands-misc-i386.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" void sgx_epc_build_srat(GArray *table_data) { } -SGXInfo *qmp_query_sgx(Error **errp) +SgxInfo *qmp_query_sgx(Error **errp) { error_setg(errp, "SGX support is not compiled in"); return NULL; } -SGXInfo *qmp_query_sgx_capabilities(Error **errp) +SgxInfo *qmp_query_sgx_capabilities(Error **errp) { error_setg(errp, "SGX support is not compiled in"); return NULL; @@ -32,6 +32,11 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms) memset(&pcms->sgx_epc, 0, sizeof(SGXEPCState)); } +bool check_sgx_support(void) +{ + return false; +} + bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) { return true; diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index a14a84b..e280154 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -19,10 +19,10 @@ #include "monitor/hmp-target.h" #include "qapi/error.h" #include "qemu/error-report.h" -#include "qapi/qapi-commands-misc-target.h" -#include "exec/address-spaces.h" -#include "sysemu/hw_accel.h" -#include "sysemu/reset.h" +#include "qapi/qapi-commands-misc-i386.h" +#include "system/address-spaces.h" +#include "system/hw_accel.h" +#include "system/reset.h" #include <sys/ioctl.h> #include "hw/acpi/aml-build.h" @@ -84,10 +84,10 @@ static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high) ((high & MAKE_64BIT_MASK(0, 20)) << 32); } -static SGXEPCSectionList *sgx_calc_host_epc_sections(void) +static SgxEpcSectionList *sgx_calc_host_epc_sections(void) { - SGXEPCSectionList *head = NULL, **tail = &head; - SGXEPCSection *section; + SgxEpcSectionList *head = NULL, **tail = &head; + SgxEpcSection *section; uint32_t i, type; uint32_t eax, ebx, ecx, edx; uint32_t j = 0; @@ -104,7 +104,7 @@ static SGXEPCSectionList *sgx_calc_host_epc_sections(void) break; } - section = g_new0(SGXEPCSection, 1); + section = g_new0(SgxEpcSection, 1); section->node = j++; section->size = sgx_calc_section_metric(ecx, edx); QAPI_LIST_APPEND(tail, section); @@ -153,9 +153,9 @@ static void sgx_epc_reset(void *opaque) } } -SGXInfo *qmp_query_sgx_capabilities(Error **errp) +SgxInfo *qmp_query_sgx_capabilities(Error **errp) { - SGXInfo *info = NULL; + SgxInfo *info = NULL; uint32_t eax, ebx, ecx, edx; Error *local_err = NULL; @@ -166,7 +166,7 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp) return NULL; } - info = g_new0(SGXInfo, 1); + info = g_new0(SgxInfo, 1); host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); info->sgx = ebx & (1U << 2) ? true : false; @@ -183,17 +183,17 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp) return info; } -static SGXEPCSectionList *sgx_get_epc_sections_list(void) +static SgxEpcSectionList *sgx_get_epc_sections_list(void) { GSList *device_list = sgx_epc_get_device_list(); - SGXEPCSectionList *head = NULL, **tail = &head; - SGXEPCSection *section; + SgxEpcSectionList *head = NULL, **tail = &head; + SgxEpcSection *section; for (; device_list; device_list = device_list->next) { DeviceState *dev = device_list->data; Object *obj = OBJECT(dev); - section = g_new0(SGXEPCSection, 1); + section = g_new0(SgxEpcSection, 1); section->node = object_property_get_uint(obj, SGX_EPC_NUMA_NODE_PROP, &error_abort); section->size = object_property_get_uint(obj, SGX_EPC_SIZE_PROP, @@ -205,9 +205,9 @@ static SGXEPCSectionList *sgx_get_epc_sections_list(void) return head; } -SGXInfo *qmp_query_sgx(Error **errp) +SgxInfo *qmp_query_sgx(Error **errp) { - SGXInfo *info = NULL; + SgxInfo *info = NULL; X86MachineState *x86ms; PCMachineState *pcms = (PCMachineState *)object_dynamic_cast(qdev_get_machine(), @@ -223,7 +223,7 @@ SGXInfo *qmp_query_sgx(Error **errp) return NULL; } - info = g_new0(SGXInfo, 1); + info = g_new0(SgxInfo, 1); info->sgx = true; info->sgx1 = true; @@ -237,8 +237,8 @@ SGXInfo *qmp_query_sgx(Error **errp) void hmp_info_sgx(Monitor *mon, const QDict *qdict) { Error *err = NULL; - SGXEPCSectionList *section_list, *section; - g_autoptr(SGXInfo) info = qmp_query_sgx(&err); + SgxEpcSectionList *section_list, *section; + g_autoptr(SgxInfo) info = qmp_query_sgx(&err); uint64_t size = 0; if (err) { @@ -266,12 +266,22 @@ void hmp_info_sgx(Monitor *mon, const QDict *qdict) size); } +bool check_sgx_support(void) +{ + if (!object_dynamic_cast(qdev_get_machine(), TYPE_PC_MACHINE)) { + return false; + } + return true; +} + bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size) { - PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + PCMachineState *pcms = + (PCMachineState *)object_dynamic_cast(qdev_get_machine(), + TYPE_PC_MACHINE); SGXEPCDevice *epc; - if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) { + if (!pcms || pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) { return true; } diff --git a/hw/i386/tdvf-hob.c b/hw/i386/tdvf-hob.c new file mode 100644 index 0000000..782b3d1 --- /dev/null +++ b/hw/i386/tdvf-hob.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2025 Intel Corporation + * Author: Isaku Yamahata <isaku.yamahata at gmail.com> + * <isaku.yamahata at intel.com> + * Xiaoyao Li <xiaoyao.li@intel.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "standard-headers/uefi/uefi.h" +#include "hw/pci/pcie_host.h" +#include "tdvf-hob.h" + +typedef struct TdvfHob { + hwaddr hob_addr; + void *ptr; + int size; + + /* working area */ + void *current; + void *end; +} TdvfHob; + +static uint64_t tdvf_current_guest_addr(const TdvfHob *hob) +{ + return hob->hob_addr + (hob->current - hob->ptr); +} + +static void tdvf_align(TdvfHob *hob, size_t align) +{ + hob->current = QEMU_ALIGN_PTR_UP(hob->current, align); +} + +static void *tdvf_get_area(TdvfHob *hob, uint64_t size) +{ + void *ret; + + if (hob->current + size > hob->end) { + error_report("TD_HOB overrun, size = 0x%" PRIx64, size); + exit(1); + } + + ret = hob->current; + hob->current += size; + tdvf_align(hob, 8); + return ret; +} + +static void tdvf_hob_add_memory_resources(TdxGuest *tdx, TdvfHob *hob) +{ + EFI_HOB_RESOURCE_DESCRIPTOR *region; + EFI_RESOURCE_ATTRIBUTE_TYPE attr; + EFI_RESOURCE_TYPE resource_type; + + TdxRamEntry *e; + int i; + + for (i = 0; i < tdx->nr_ram_entries; i++) { + e = &tdx->ram_entries[i]; + + if (e->type == TDX_RAM_UNACCEPTED) { + resource_type = EFI_RESOURCE_MEMORY_UNACCEPTED; + attr = EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED; + } else if (e->type == TDX_RAM_ADDED) { + resource_type = EFI_RESOURCE_SYSTEM_MEMORY; + attr = EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE; + } else { + error_report("unknown TDX_RAM_ENTRY type %d", e->type); + exit(1); + } + + region = tdvf_get_area(hob, sizeof(*region)); + *region = (EFI_HOB_RESOURCE_DESCRIPTOR) { + .Header = { + .HobType = EFI_HOB_TYPE_RESOURCE_DESCRIPTOR, + .HobLength = cpu_to_le16(sizeof(*region)), + .Reserved = cpu_to_le32(0), + }, + .Owner = EFI_HOB_OWNER_ZERO, + .ResourceType = cpu_to_le32(resource_type), + .ResourceAttribute = cpu_to_le32(attr), + .PhysicalStart = cpu_to_le64(e->address), + .ResourceLength = cpu_to_le64(e->length), + }; + } +} + +void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob) +{ + TdvfHob hob = { + .hob_addr = td_hob->address, + .size = td_hob->size, + .ptr = td_hob->mem_ptr, + + .current = td_hob->mem_ptr, + .end = td_hob->mem_ptr + td_hob->size, + }; + + EFI_HOB_GENERIC_HEADER *last_hob; + EFI_HOB_HANDOFF_INFO_TABLE *hit; + + /* Note, Efi{Free}Memory{Bottom,Top} are ignored, leave 'em zeroed. */ + hit = tdvf_get_area(&hob, sizeof(*hit)); + *hit = (EFI_HOB_HANDOFF_INFO_TABLE) { + .Header = { + .HobType = EFI_HOB_TYPE_HANDOFF, + .HobLength = cpu_to_le16(sizeof(*hit)), + .Reserved = cpu_to_le32(0), + }, + .Version = cpu_to_le32(EFI_HOB_HANDOFF_TABLE_VERSION), + .BootMode = cpu_to_le32(0), + .EfiMemoryTop = cpu_to_le64(0), + .EfiMemoryBottom = cpu_to_le64(0), + .EfiFreeMemoryTop = cpu_to_le64(0), + .EfiFreeMemoryBottom = cpu_to_le64(0), + .EfiEndOfHobList = cpu_to_le64(0), /* initialized later */ + }; + + tdvf_hob_add_memory_resources(tdx, &hob); + + last_hob = tdvf_get_area(&hob, sizeof(*last_hob)); + *last_hob = (EFI_HOB_GENERIC_HEADER) { + .HobType = EFI_HOB_TYPE_END_OF_HOB_LIST, + .HobLength = cpu_to_le16(sizeof(*last_hob)), + .Reserved = cpu_to_le32(0), + }; + hit->EfiEndOfHobList = tdvf_current_guest_addr(&hob); +} diff --git a/hw/i386/tdvf-hob.h b/hw/i386/tdvf-hob.h new file mode 100644 index 0000000..4fc6a37 --- /dev/null +++ b/hw/i386/tdvf-hob.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef HW_I386_TD_HOB_H +#define HW_I386_TD_HOB_H + +#include "hw/i386/tdvf.h" +#include "target/i386/kvm/tdx.h" + +void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob); + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_TESTED) + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_TESTED) + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_MMIO \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_UNCACHEABLE) + +#endif diff --git a/hw/i386/tdvf.c b/hw/i386/tdvf.c new file mode 100644 index 0000000..645d9d1 --- /dev/null +++ b/hw/i386/tdvf.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2025 Intel Corporation + * Author: Isaku Yamahata <isaku.yamahata at gmail.com> + * <isaku.yamahata at intel.com> + * Xiaoyao Li <xiaoyao.li@intel.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" + +#include "hw/i386/pc.h" +#include "hw/i386/tdvf.h" +#include "system/kvm.h" + +#define TDX_METADATA_OFFSET_GUID "e47a6535-984a-4798-865e-4685a7bf8ec2" +#define TDX_METADATA_VERSION 1 +#define TDVF_SIGNATURE 0x46564454 /* TDVF as little endian */ +#define TDVF_ALIGNMENT 4096 + +/* + * the raw structs read from TDVF keeps the name convention in + * TDVF Design Guide spec. + */ +typedef struct { + uint32_t DataOffset; + uint32_t RawDataSize; + uint64_t MemoryAddress; + uint64_t MemoryDataSize; + uint32_t Type; + uint32_t Attributes; +} TdvfSectionEntry; + +typedef struct { + uint32_t Signature; + uint32_t Length; + uint32_t Version; + uint32_t NumberOfSectionEntries; + TdvfSectionEntry SectionEntries[]; +} TdvfMetadata; + +struct tdx_metadata_offset { + uint32_t offset; +}; + +static TdvfMetadata *tdvf_get_metadata(void *flash_ptr, int size) +{ + TdvfMetadata *metadata; + uint32_t offset = 0; + uint8_t *data; + + if ((uint32_t) size != size) { + return NULL; + } + + if (pc_system_ovmf_table_find(TDX_METADATA_OFFSET_GUID, &data, NULL)) { + offset = size - le32_to_cpu(((struct tdx_metadata_offset *)data)->offset); + + if (offset + sizeof(*metadata) > size) { + return NULL; + } + } else { + error_report("Cannot find TDX_METADATA_OFFSET_GUID"); + return NULL; + } + + metadata = flash_ptr + offset; + + /* Finally, verify the signature to determine if this is a TDVF image. */ + metadata->Signature = le32_to_cpu(metadata->Signature); + if (metadata->Signature != TDVF_SIGNATURE) { + error_report("Invalid TDVF signature in metadata!"); + return NULL; + } + + /* Sanity check that the TDVF doesn't overlap its own metadata. */ + metadata->Length = le32_to_cpu(metadata->Length); + if (offset + metadata->Length > size) { + return NULL; + } + + /* Only version 1 is supported/defined. */ + metadata->Version = le32_to_cpu(metadata->Version); + if (metadata->Version != TDX_METADATA_VERSION) { + return NULL; + } + + return metadata; +} + +static int tdvf_parse_and_check_section_entry(const TdvfSectionEntry *src, + TdxFirmwareEntry *entry) +{ + entry->data_offset = le32_to_cpu(src->DataOffset); + entry->data_len = le32_to_cpu(src->RawDataSize); + entry->address = le64_to_cpu(src->MemoryAddress); + entry->size = le64_to_cpu(src->MemoryDataSize); + entry->type = le32_to_cpu(src->Type); + entry->attributes = le32_to_cpu(src->Attributes); + + /* sanity check */ + if (entry->size < entry->data_len) { + error_report("Broken metadata RawDataSize 0x%x MemoryDataSize 0x%"PRIx64, + entry->data_len, entry->size); + return -1; + } + if (!QEMU_IS_ALIGNED(entry->address, TDVF_ALIGNMENT)) { + error_report("MemoryAddress 0x%"PRIx64" not page aligned", entry->address); + return -1; + } + if (!QEMU_IS_ALIGNED(entry->size, TDVF_ALIGNMENT)) { + error_report("MemoryDataSize 0x%"PRIx64" not page aligned", entry->size); + return -1; + } + + switch (entry->type) { + case TDVF_SECTION_TYPE_BFV: + case TDVF_SECTION_TYPE_CFV: + /* The sections that must be copied from firmware image to TD memory */ + if (entry->data_len == 0) { + error_report("%d section with RawDataSize == 0", entry->type); + return -1; + } + break; + case TDVF_SECTION_TYPE_TD_HOB: + case TDVF_SECTION_TYPE_TEMP_MEM: + /* The sections that no need to be copied from firmware image */ + if (entry->data_len != 0) { + error_report("%d section with RawDataSize 0x%x != 0", + entry->type, entry->data_len); + return -1; + } + break; + default: + error_report("TDVF contains unsupported section type %d", entry->type); + return -1; + } + + return 0; +} + +int tdvf_parse_metadata(TdxFirmware *fw, void *flash_ptr, int size) +{ + g_autofree TdvfSectionEntry *sections = NULL; + TdvfMetadata *metadata; + ssize_t entries_size; + int i; + + metadata = tdvf_get_metadata(flash_ptr, size); + if (!metadata) { + return -EINVAL; + } + + /* load and parse metadata entries */ + fw->nr_entries = le32_to_cpu(metadata->NumberOfSectionEntries); + if (fw->nr_entries < 2) { + error_report("Invalid number of fw entries (%u) in TDVF Metadata", + fw->nr_entries); + return -EINVAL; + } + + entries_size = fw->nr_entries * sizeof(TdvfSectionEntry); + if (metadata->Length != sizeof(*metadata) + entries_size) { + error_report("TDVF metadata len (0x%x) mismatch, expected (0x%x)", + metadata->Length, + (uint32_t)(sizeof(*metadata) + entries_size)); + return -EINVAL; + } + + fw->entries = g_new(TdxFirmwareEntry, fw->nr_entries); + sections = g_new(TdvfSectionEntry, fw->nr_entries); + + memcpy(sections, (void *)metadata + sizeof(*metadata), entries_size); + + for (i = 0; i < fw->nr_entries; i++) { + if (tdvf_parse_and_check_section_entry(§ions[i], &fw->entries[i])) { + goto err; + } + } + + fw->mem_ptr = flash_ptr; + return 0; + +err: + fw->entries = 0; + g_free(fw->entries); + return -EINVAL; +} diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 53c02d7..ac9e1a1 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -68,6 +68,7 @@ vtd_frr_new(int index, uint64_t hi, uint64_t lo) "index %d high 0x%"PRIx64" low vtd_warn_invalid_qi_tail(uint16_t tail) "tail 0x%"PRIx16 vtd_warn_ir_vector(uint16_t sid, int index, int vec, int target) "sid 0x%"PRIx16" index %d vec %d (should be: %d)" vtd_warn_ir_trigger(uint16_t sid, int index, int trig, int target) "sid 0x%"PRIx16" index %d trigger %d (should be: %d)" +vtd_reset_exit(void) "" # amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/hw/i386/vapic.c b/hw/i386/vapic.c index f5b1db7..0c1c92c 100644 --- a/hw/i386/vapic.c +++ b/hw/i386/vapic.c @@ -11,12 +11,13 @@ #include "qemu/osdep.h" #include "qemu/module.h" -#include "sysemu/sysemu.h" -#include "sysemu/cpus.h" -#include "sysemu/hw_accel.h" -#include "sysemu/kvm.h" -#include "sysemu/runstate.h" -#include "exec/address-spaces.h" +#include "exec/target_page.h" +#include "system/system.h" +#include "system/cpus.h" +#include "system/hw_accel.h" +#include "system/kvm.h" +#include "system/runstate.h" +#include "system/address-spaces.h" #include "hw/i386/apic_internal.h" #include "hw/sysbus.h" #include "hw/boards.h" @@ -718,7 +719,7 @@ static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size) static const MemoryRegionOps vapic_ops = { .write = vapic_write, .read = vapic_read, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_LITTLE_ENDIAN, }; static void vapic_realize(DeviceState *dev, Error **errp) @@ -846,11 +847,11 @@ static const VMStateDescription vmstate_vapic = { } }; -static void vapic_class_init(ObjectClass *klass, void *data) +static void vapic_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - dc->reset = vapic_reset; + device_class_set_legacy_reset(dc, vapic_reset); dc->vmsd = &vmstate_vapic; dc->realize = vapic_realize; } diff --git a/hw/i386/vmmouse.c b/hw/i386/vmmouse.c index a8d014d..3896159 100644 --- a/hw/i386/vmmouse.c +++ b/hw/i386/vmmouse.c @@ -317,17 +317,16 @@ static void vmmouse_realizefn(DeviceState *dev, Error **errp) vmport_register(VMPORT_CMD_VMMOUSE_DATA, vmmouse_ioport_read, s); } -static Property vmmouse_properties[] = { +static const Property vmmouse_properties[] = { DEFINE_PROP_LINK("i8042", VMMouseState, i8042, TYPE_I8042, ISAKBDState *), - DEFINE_PROP_END_OF_LIST(), }; -static void vmmouse_class_initfn(ObjectClass *klass, void *data) +static void vmmouse_class_initfn(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = vmmouse_realizefn; - dc->reset = vmmouse_reset; + device_class_set_legacy_reset(dc, vmmouse_reset); dc->vmsd = &vmstate_vmmouse; device_class_set_props(dc, vmmouse_properties); set_bit(DEVICE_CATEGORY_INPUT, dc->categories); diff --git a/hw/i386/vmport.c b/hw/i386/vmport.c index 7cc75db..6d93457 100644 --- a/hw/i386/vmport.c +++ b/hw/i386/vmport.c @@ -33,9 +33,9 @@ #include "hw/i386/vmport.h" #include "hw/qdev-properties.h" #include "hw/boards.h" -#include "sysemu/sysemu.h" -#include "sysemu/hw_accel.h" -#include "sysemu/qtest.h" +#include "system/system.h" +#include "system/hw_accel.h" +#include "system/qtest.h" #include "qemu/log.h" #include "trace.h" #include "qom/object.h" @@ -252,7 +252,7 @@ static void vmport_realizefn(DeviceState *dev, Error **errp) } } -static Property vmport_properties[] = { +static const Property vmport_properties[] = { /* Used to enforce compatibility for migration */ DEFINE_PROP_BIT("x-read-set-eax", VMPortState, compat_flags, VMPORT_COMPAT_READ_SET_EAX_BIT, true), @@ -284,11 +284,9 @@ static Property vmport_properties[] = { * 5 - ACE 1.x (Deprecated) */ DEFINE_PROP_UINT8("vmware-vmx-type", VMPortState, vmware_vmx_type, 2), - - DEFINE_PROP_END_OF_LIST(), }; -static void vmport_class_initfn(ObjectClass *klass, void *data) +static void vmport_class_initfn(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c index c0c66a0..b1b5f11 100644 --- a/hw/i386/x86-common.c +++ b/hw/i386/x86-common.c @@ -26,9 +26,9 @@ #include "qemu/units.h" #include "qemu/datadir.h" #include "qapi/error.h" -#include "sysemu/numa.h" -#include "sysemu/sysemu.h" -#include "sysemu/xen.h" +#include "system/numa.h" +#include "system/system.h" +#include "system/xen.h" #include "trace.h" #include "hw/i386/x86.h" @@ -44,6 +44,7 @@ #include "standard-headers/asm-x86/bootparam.h" #include CONFIG_DEVICES #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" #ifdef CONFIG_XEN_EMU #include "hw/xen/xen.h" @@ -248,9 +249,7 @@ void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, CPUX86State *env = &cpu->env; MachineState *ms = MACHINE(hotplug_dev); X86MachineState *x86ms = X86_MACHINE(hotplug_dev); - unsigned int smp_cores = ms->smp.cores; - unsigned int smp_threads = ms->smp.threads; - X86CPUTopoInfo topo_info; + X86CPUTopoInfo *topo_info = &env->topo_info; if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) { error_setg(errp, "Invalid CPU type, expected cpu type: '%s'", @@ -269,16 +268,14 @@ void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, } } - init_topo_info(&topo_info, x86ms); + init_topo_info(topo_info, x86ms); if (ms->smp.modules > 1) { - env->nr_modules = ms->smp.modules; - set_bit(CPU_TOPO_LEVEL_MODULE, env->avail_cpu_topo); + set_bit(CPU_TOPOLOGY_LEVEL_MODULE, env->avail_cpu_topo); } if (ms->smp.dies > 1) { - env->nr_dies = ms->smp.dies; - set_bit(CPU_TOPO_LEVEL_DIE, env->avail_cpu_topo); + set_bit(CPU_TOPOLOGY_LEVEL_DIE, env->avail_cpu_topo); } /* @@ -329,17 +326,17 @@ void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, if (cpu->core_id < 0) { error_setg(errp, "CPU core-id is not set"); return; - } else if (cpu->core_id > (smp_cores - 1)) { + } else if (cpu->core_id > (ms->smp.cores - 1)) { error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u", - cpu->core_id, smp_cores - 1); + cpu->core_id, ms->smp.cores - 1); return; } if (cpu->thread_id < 0) { error_setg(errp, "CPU thread-id is not set"); return; - } else if (cpu->thread_id > (smp_threads - 1)) { + } else if (cpu->thread_id > (ms->smp.threads - 1)) { error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u", - cpu->thread_id, smp_threads - 1); + cpu->thread_id, ms->smp.threads - 1); return; } @@ -348,12 +345,12 @@ void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, topo_ids.module_id = cpu->module_id; topo_ids.core_id = cpu->core_id; topo_ids.smt_id = cpu->thread_id; - cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids); + cpu->apic_id = x86_apicid_from_topo_ids(topo_info, &topo_ids); } cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx); if (!cpu_slot) { - x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); + x86_topo_ids_from_apicid(cpu->apic_id, topo_info, &topo_ids); error_setg(errp, "Invalid CPU [socket: %u, die: %u, module: %u, core: %u, thread: %u]" @@ -376,7 +373,7 @@ void x86_cpu_pre_plug(HotplugHandler *hotplug_dev, /* TODO: move socket_id/core_id/thread_id checks into x86_cpu_realizefn() * once -smp refactoring is complete and there will be CPU private * CPUState::nr_cores and CPUState::nr_threads fields instead of globals */ - x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids); + x86_topo_ids_from_apicid(cpu->apic_id, topo_info, &topo_ids); if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) { error_setg(errp, "property socket-id: %u doesn't match set apic-id:" " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id, @@ -450,8 +447,27 @@ static long get_file_size(FILE *f) void gsi_handler(void *opaque, int n, int level) { GSIState *s = opaque; + bool bypass_ioapic = false; trace_x86_gsi_interrupt(n, level); + +#ifdef CONFIG_XEN_EMU + /* + * Xen delivers the GSI to the Legacy PIC (not that Legacy PIC + * routing actually works properly under Xen). And then to + * *either* the PIRQ handling or the I/OAPIC depending on whether + * the former wants it. + * + * Additionally, this hook allows the Xen event channel GSI to + * work around QEMU's lack of support for shared level interrupts, + * by keeping track of the externally driven state of the pin and + * implementing a logical OR with the state of the evtchn GSI. + */ + if (xen_mode == XEN_EMULATE) { + bypass_ioapic = xen_evtchn_set_gsi(n, &level); + } +#endif + switch (n) { case 0 ... ISA_NUM_IRQS - 1: if (s->i8259_irq[n]) { @@ -460,18 +476,9 @@ void gsi_handler(void *opaque, int n, int level) } /* fall through */ case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1: -#ifdef CONFIG_XEN_EMU - /* - * Xen delivers the GSI to the Legacy PIC (not that Legacy PIC - * routing actually works properly under Xen). And then to - * *either* the PIRQ handling or the I/OAPIC depending on - * whether the former wants it. - */ - if (xen_mode == XEN_EMULATE && xen_evtchn_set_gsi(n, level)) { - break; + if (!bypass_ioapic) { + qemu_set_irq(s->ioapic_irq[n], level); } -#endif - qemu_set_irq(s->ioapic_irq[n], level); break; case IO_APIC_SECONDARY_IRQBASE ... IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1: @@ -586,7 +593,7 @@ static bool load_elfboot(const char *kernel_filename, uint64_t elf_low, elf_high; int kernel_size; - if (ldl_p(header) != 0x464c457f) { + if (ldl_le_p(header) != 0x464c457f) { return false; /* no elfboot */ } @@ -602,8 +609,8 @@ static bool load_elfboot(const char *kernel_filename, uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY; kernel_size = load_elf(kernel_filename, read_pvh_start_addr, NULL, &elf_note_type, &elf_entry, - &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE, - 0, 0); + &elf_low, &elf_high, NULL, + ELFDATA2LSB, I386_ELF_MACHINE, 0, 0); if (kernel_size < 0) { error_report("Error while loading elf kernel"); @@ -665,9 +672,12 @@ void x86_load_linux(X86MachineState *x86ms, exit(1); } - /* kernel protocol version */ - if (ldl_p(header + 0x202) == 0x53726448) { - protocol = lduw_p(header + 0x206); + /* + * kernel protocol version. + * Please see https://www.kernel.org/doc/Documentation/x86/boot.txt + */ + if (ldl_le_p(header + 0x202) == 0x53726448) /* Magic signature "HdrS" */ { + protocol = lduw_le_p(header + 0x206); } else { /* * This could be a multiboot kernel. If it is, let's stop treating it @@ -694,9 +704,11 @@ void x86_load_linux(X86MachineState *x86ms, strlen(kernel_cmdline) + 1); fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline); + setup = g_memdup2(header, sizeof(header)); + fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header)); fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, - header, sizeof(header)); + setup, sizeof(header)); /* load initrd */ if (initrd_filename) { @@ -759,7 +771,7 @@ void x86_load_linux(X86MachineState *x86ms, /* highest address for loading the initrd */ if (protocol >= 0x20c && - lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) { + lduw_le_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) { /* * Linux has supported initrd up to 4 GB for a very long time (2007, * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013), @@ -778,7 +790,7 @@ void x86_load_linux(X86MachineState *x86ms, */ initrd_max = UINT32_MAX; } else if (protocol >= 0x203) { - initrd_max = ldl_p(header + 0x22c); + initrd_max = ldl_le_p(header + 0x22c); } else { initrd_max = 0x37ffffff; } @@ -794,10 +806,10 @@ void x86_load_linux(X86MachineState *x86ms, sev_load_ctx.cmdline_size = strlen(kernel_cmdline) + 1; if (protocol >= 0x202) { - stl_p(header + 0x228, cmdline_addr); + stl_le_p(header + 0x228, cmdline_addr); } else { - stw_p(header + 0x20, 0xA33F); - stw_p(header + 0x22, cmdline_addr - real_addr); + stw_le_p(header + 0x20, 0xA33F); + stw_le_p(header + 0x22, cmdline_addr - real_addr); } /* handle vga= parameter */ @@ -821,7 +833,7 @@ void x86_load_linux(X86MachineState *x86ms, exit(1); } } - stw_p(header + 0x1fa, video_mode); + stw_le_p(header + 0x1fa, video_mode); } /* loader type */ @@ -836,7 +848,7 @@ void x86_load_linux(X86MachineState *x86ms, /* heap */ if (protocol >= 0x201) { header[0x211] |= 0x80; /* CAN_USE_HEAP */ - stw_p(header + 0x224, cmdline_addr - real_addr - 0x200); + stw_le_p(header + 0x224, cmdline_addr - real_addr - 0x200); } /* load initrd */ @@ -876,8 +888,8 @@ void x86_load_linux(X86MachineState *x86ms, sev_load_ctx.initrd_data = initrd_data; sev_load_ctx.initrd_size = initrd_size; - stl_p(header + 0x218, initrd_addr); - stl_p(header + 0x21c, initrd_size); + stl_le_p(header + 0x218, initrd_addr); + stl_le_p(header + 0x21c, initrd_size); } /* load kernel and setup */ @@ -890,7 +902,6 @@ void x86_load_linux(X86MachineState *x86ms, fprintf(stderr, "qemu: invalid kernel header\n"); exit(1); } - kernel_size -= setup_size; setup = g_malloc(setup_size); kernel = g_malloc(kernel_size); @@ -899,6 +910,7 @@ void x86_load_linux(X86MachineState *x86ms, fprintf(stderr, "fread() failed\n"); exit(1); } + fseek(f, 0, SEEK_SET); if (fread(kernel, 1, kernel_size, f) != kernel_size) { fprintf(stderr, "fread() failed\n"); exit(1); @@ -923,7 +935,7 @@ void x86_load_linux(X86MachineState *x86ms, kernel_size = setup_data_offset + sizeof(struct setup_data) + dtb_size; kernel = g_realloc(kernel, kernel_size); - stq_p(header + 0x250, prot_addr + setup_data_offset); + stq_le_p(header + 0x250, prot_addr + setup_data_offset); setup_data = (struct setup_data *)(kernel + setup_data_offset); setup_data->next = 0; @@ -940,15 +952,16 @@ void x86_load_linux(X86MachineState *x86ms, * kernel on the other side of the fw_cfg interface matches the hash of the * file the user passed in. */ - if (!sev_enabled()) { + if (!sev_enabled() && protocol > 0) { memcpy(setup, header, MIN(sizeof(header), setup_size)); } fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr); - fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size); - fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size); - sev_load_ctx.kernel_data = (char *)kernel; - sev_load_ctx.kernel_size = kernel_size; + fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size - setup_size); + fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, + kernel + setup_size, kernel_size - setup_size); + sev_load_ctx.kernel_data = (char *)kernel + setup_size; + sev_load_ctx.kernel_size = kernel_size - setup_size; fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr); fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size); @@ -956,6 +969,25 @@ void x86_load_linux(X86MachineState *x86ms, sev_load_ctx.setup_data = (char *)setup; sev_load_ctx.setup_size = setup_size; + /* kernel without setup header patches */ + fw_cfg_add_file(fw_cfg, "etc/boot/kernel", kernel, kernel_size); + + if (machine->shim_filename) { + GMappedFile *mapped_file; + GError *gerr = NULL; + + mapped_file = g_mapped_file_new(machine->shim_filename, false, &gerr); + if (!mapped_file) { + fprintf(stderr, "qemu: error reading shim %s: %s\n", + machine->shim_filename, gerr->message); + exit(1); + } + + fw_cfg_add_file(fw_cfg, "etc/boot/shim", + g_mapped_file_get_contents(mapped_file), + g_mapped_file_get_length(mapped_file)); + } + if (sev_enabled()) { sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal); } @@ -1004,11 +1036,14 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware, if (machine_require_guest_memfd(MACHINE(x86ms))) { memory_region_init_ram_guest_memfd(&x86ms->bios, NULL, "pc.bios", bios_size, &error_fatal); + if (is_tdx_vm()) { + tdx_set_tdvf_region(&x86ms->bios); + } } else { memory_region_init_ram(&x86ms->bios, NULL, "pc.bios", bios_size, &error_fatal); } - if (sev_enabled()) { + if (sev_enabled() || is_tdx_vm()) { /* * The concept of a "reset" simply doesn't exist for * confidential computing guests, we have to destroy and diff --git a/hw/i386/x86-cpu.c b/hw/i386/x86-cpu.c index ab29205..c876e67 100644 --- a/hw/i386/x86-cpu.c +++ b/hw/i386/x86-cpu.c @@ -21,15 +21,15 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" -#include "sysemu/whpx.h" -#include "sysemu/cpu-timers.h" +#include "system/whpx.h" +#include "system/cpu-timers.h" #include "trace.h" #include "hw/i386/x86.h" #include "target/i386/cpu.h" #include "hw/intc/i8259.h" #include "hw/irq.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" /* TSC handling */ uint64_t cpu_get_tsc(CPUX86State *env) diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index 60af896..d34a684 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -25,7 +25,7 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" -#include "sysemu/kvm.h" +#include "system/kvm.h" void x86_iommu_iec_register_notifier(X86IOMMUState *iommu, iec_notify_fn fn, void *data) @@ -125,15 +125,14 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp) } } -static Property x86_iommu_properties[] = { +static const Property x86_iommu_properties[] = { DEFINE_PROP_ON_OFF_AUTO("intremap", X86IOMMUState, intr_supported, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), - DEFINE_PROP_END_OF_LIST(), }; -static void x86_iommu_class_init(ObjectClass *klass, void *data) +static void x86_iommu_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = x86_iommu_realize; @@ -147,7 +146,7 @@ bool x86_iommu_ir_supported(X86IOMMUState *s) static const TypeInfo x86_iommu_info = { .name = TYPE_X86_IOMMU_DEVICE, - .parent = TYPE_SYS_BUS_DEVICE, + .parent = TYPE_DYNAMIC_SYS_BUS_DEVICE, .instance_size = sizeof(X86IOMMUState), .class_init = x86_iommu_class_init, .class_size = sizeof(X86IOMMUClass), diff --git a/hw/i386/x86.c b/hw/i386/x86.c index 01fc5e6..f80533d 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -27,8 +27,8 @@ #include "qapi/qapi-visit-common.h" #include "qapi/qapi-visit-machine.h" #include "qapi/visitor.h" -#include "sysemu/qtest.h" -#include "sysemu/numa.h" +#include "system/qtest.h" +#include "system/numa.h" #include "trace.h" #include "hw/acpi/aml-build.h" @@ -372,7 +372,7 @@ static void x86_machine_initfn(Object *obj) x86ms->above_4g_mem_start = 4 * GiB; } -static void x86_machine_class_init(ObjectClass *oc, void *data) +static void x86_machine_class_init(ObjectClass *oc, const void *data) { MachineClass *mc = MACHINE_CLASS(oc); X86MachineClass *x86mc = X86_MACHINE_CLASS(oc); @@ -382,7 +382,6 @@ static void x86_machine_class_init(ObjectClass *oc, void *data) mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; mc->kvm_type = x86_kvm_type; - x86mc->save_tsc_khz = true; x86mc->fwcfg_dma_enabled = true; nc->nmi_monitor_handler = x86_nmi; @@ -450,7 +449,7 @@ static const TypeInfo x86_machine_info = { .instance_init = x86_machine_initfn, .class_size = sizeof(X86MachineClass), .class_init = x86_machine_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { TYPE_NMI }, { } }, diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build index 3f0df8b..c73c62b 100644 --- a/hw/i386/xen/meson.build +++ b/hw/i386/xen/meson.build @@ -4,6 +4,7 @@ i386_ss.add(when: 'CONFIG_XEN', if_true: files( )) i386_ss.add(when: ['CONFIG_XEN', xen], if_true: files( 'xen-hvm.c', + 'xen-pvh.c', )) i386_ss.add(when: 'CONFIG_XEN_BUS', if_true: files( diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c index 4f64466..ceb2242 100644 --- a/hw/i386/xen/xen-hvm.c +++ b/hw/i386/xen/xen-hvm.c @@ -10,10 +10,12 @@ #include "qemu/osdep.h" #include "qemu/units.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "qapi/qapi-commands-migration.h" #include "trace.h" +#include "hw/hw.h" #include "hw/i386/pc.h" #include "hw/irq.h" #include "hw/i386/apic-msidef.h" @@ -24,6 +26,10 @@ #include "hw/xen/arch_hvm.h" #include <xen/hvm/e820.h> #include "exec/target_page.h" +#include "target/i386/cpu.h" +#include "system/runstate.h" +#include "system/xen-mapcache.h" +#include "system/xen.h" static MemoryRegion ram_640k, ram_lo, ram_hi; static MemoryRegion *framebuffer; @@ -614,7 +620,9 @@ void xen_hvm_init_pc(PCMachineState *pcms, MemoryRegion **ram_memory) state = g_new0(XenIOState, 1); - xen_register_ioreq(state, max_cpus, &xen_memory_listener); + xen_register_ioreq(state, max_cpus, + HVM_IOREQSRV_BUFIOREQ_ATOMIC, + &xen_memory_listener); xen_is_stubdomain = xen_check_stubdomain(state->xenstore); @@ -750,6 +758,4 @@ void arch_handle_ioreq(XenIOState *state, ioreq_t *req) default: hw_error("Invalid ioreq type 0x%x\n", req->type); } - - return; } diff --git a/hw/i386/xen/xen-pvh.c b/hw/i386/xen/xen-pvh.c new file mode 100644 index 0000000..067f73e --- /dev/null +++ b/hw/i386/xen/xen-pvh.c @@ -0,0 +1,125 @@ +/* + * QEMU Xen PVH x86 Machine + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * Written by Edgar E. Iglesias <edgar.iglesias@amd.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "hw/boards.h" +#include "system/system.h" +#include "hw/xen/arch_hvm.h" +#include <xen/hvm/hvm_info_table.h> +#include "hw/xen/xen-pvh-common.h" +#include "target/i386/cpu.h" + +#define TYPE_XEN_PVH_X86 MACHINE_TYPE_NAME("xenpvh") +OBJECT_DECLARE_SIMPLE_TYPE(XenPVHx86State, XEN_PVH_X86) + +struct XenPVHx86State { + /*< private >*/ + XenPVHMachineState parent; + + DeviceState **cpu; +}; + +static DeviceState *xen_pvh_cpu_new(MachineState *ms, + int64_t apic_id) +{ + Object *cpu = object_new(ms->cpu_type); + + object_property_add_child(OBJECT(ms), "cpu[*]", cpu); + object_property_set_uint(cpu, "apic-id", apic_id, &error_fatal); + qdev_realize(DEVICE(cpu), NULL, &error_fatal); + object_unref(cpu); + + return DEVICE(cpu); +} + +static void xen_pvh_init(MachineState *ms) +{ + XenPVHx86State *xp = XEN_PVH_X86(ms); + int i; + + /* Create dummy cores. This will indirectly create the APIC MSI window. */ + xp->cpu = g_malloc(sizeof xp->cpu[0] * ms->smp.max_cpus); + for (i = 0; i < ms->smp.max_cpus; i++) { + xp->cpu[i] = xen_pvh_cpu_new(ms, i); + } +} + +static void xen_pvh_instance_init(Object *obj) +{ + XenPVHMachineState *s = XEN_PVH_MACHINE(obj); + + /* Default values. */ + s->cfg.ram_low = (MemMapEntry) { 0x0, 0x80000000U }; + s->cfg.ram_high = (MemMapEntry) { 0xC000000000ULL, 0x4000000000ULL }; + s->cfg.pci_intx_irq_base = 16; +} + +/* + * Deliver INTX interrupts to Xen guest. + */ +static void xen_pvh_set_pci_intx_irq(void *opaque, int irq, int level) +{ + /* + * Since QEMU emulates all of the swizziling + * We don't want Xen to do any additional swizzling in + * xen_set_pci_intx_level() so we always set device to 0. + */ + if (xen_set_pci_intx_level(xen_domid, 0, 0, 0, irq, level)) { + error_report("xendevicemodel_set_pci_intx_level failed"); + } +} + +static void xen_pvh_machine_class_init(ObjectClass *oc, const void *data) +{ + XenPVHMachineClass *xpc = XEN_PVH_MACHINE_CLASS(oc); + MachineClass *mc = MACHINE_CLASS(oc); + + mc->desc = "Xen PVH x86 machine"; + mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE; + + /* mc->max_cpus holds the MAX value allowed in the -smp cmd-line opts. */ + mc->max_cpus = HVM_MAX_VCPUS; + + /* We have an implementation specific init to create CPU objects. */ + xpc->init = xen_pvh_init; + + /* Enable buffered IOREQs. */ + xpc->handle_bufioreq = HVM_IOREQSRV_BUFIOREQ_ATOMIC; + + /* + * PCI INTX routing. + * + * We describe the mapping between the 4 INTX interrupt and GSIs + * using xen_set_pci_link_route(). xen_pvh_set_pci_intx_irq is + * used to deliver the interrupt. + */ + xpc->set_pci_intx_irq = xen_pvh_set_pci_intx_irq; + xpc->set_pci_link_route = xen_set_pci_link_route; + + /* List of supported features known to work on PVH x86. */ + xpc->has_pci = true; + + xen_pvh_class_setup_common_props(xpc); +} + +static const TypeInfo xen_pvh_x86_machine_type = { + .name = TYPE_XEN_PVH_X86, + .parent = TYPE_XEN_PVH_MACHINE, + .class_init = xen_pvh_machine_class_init, + .instance_init = xen_pvh_instance_init, + .instance_size = sizeof(XenPVHx86State), +}; + +static void xen_pvh_machine_register_types(void) +{ + type_register_static(&xen_pvh_x86_machine_type); +} + +type_init(xen_pvh_machine_register_types) diff --git a/hw/i386/xen/xen_apic.c b/hw/i386/xen/xen_apic.c index 101e16a..f30398f 100644 --- a/hw/i386/xen/xen_apic.c +++ b/hw/i386/xen/xen_apic.c @@ -36,7 +36,7 @@ static void xen_apic_mem_write(void *opaque, hwaddr addr, static const MemoryRegionOps xen_apic_io_ops = { .read = xen_apic_mem_read, .write = xen_apic_mem_write, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_LITTLE_ENDIAN, }; static void xen_apic_realize(DeviceState *dev, Error **errp) @@ -76,7 +76,7 @@ static void xen_send_msi(MSIMessage *msi) xen_hvm_inject_msi(msi->address, msi->data); } -static void xen_apic_class_init(ObjectClass *klass, void *data) +static void xen_apic_class_init(ObjectClass *klass, const void *data) { APICCommonClass *k = APIC_COMMON_CLASS(klass); diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c index 708488a..c8b852b 100644 --- a/hw/i386/xen/xen_platform.c +++ b/hw/i386/xen/xen_platform.c @@ -30,8 +30,8 @@ #include "migration/vmstate.h" #include "net/net.h" #include "trace.h" -#include "sysemu/xen.h" -#include "sysemu/block-backend.h" +#include "system/xen.h" +#include "system/block-backend.h" #include "qemu/error-report.h" #include "qemu/module.h" #include "qom/object.h" @@ -514,7 +514,7 @@ static void platform_mmio_write(void *opaque, hwaddr addr, static const MemoryRegionOps platform_mmio_handler = { .read = &platform_mmio_read, .write = &platform_mmio_write, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_LITTLE_ENDIAN, }; static void platform_mmio_setup(PCIXenPlatformState *d) @@ -581,7 +581,7 @@ static void platform_reset(DeviceState *dev) platform_fixed_ioport_reset(s); } -static void xen_platform_class_init(ObjectClass *klass, void *data) +static void xen_platform_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); @@ -595,7 +595,7 @@ static void xen_platform_class_init(ObjectClass *klass, void *data) k->revision = 1; set_bit(DEVICE_CATEGORY_MISC, dc->categories); dc->desc = "XEN platform pci device"; - dc->reset = platform_reset; + device_class_set_legacy_reset(dc, platform_reset); dc->vmsd = &vmstate_xen_platform; } @@ -604,7 +604,7 @@ static const TypeInfo xen_platform_info = { .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(PCIXenPlatformState), .class_init = xen_platform_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, }, diff --git a/hw/i386/xen/xen_pvdevice.c b/hw/i386/xen/xen_pvdevice.c index ed62153..87a974a 100644 --- a/hw/i386/xen/xen_pvdevice.c +++ b/hw/i386/xen/xen_pvdevice.c @@ -115,15 +115,14 @@ static void xen_pv_realize(PCIDevice *pci_dev, Error **errp) &d->mmio); } -static Property xen_pv_props[] = { +static const Property xen_pv_props[] = { DEFINE_PROP_UINT16("vendor-id", XenPVDevice, vendor_id, PCI_VENDOR_ID_XEN), DEFINE_PROP_UINT16("device-id", XenPVDevice, device_id, 0xffff), DEFINE_PROP_UINT8("revision", XenPVDevice, revision, 0x01), DEFINE_PROP_UINT32("size", XenPVDevice, size, 0x400000), - DEFINE_PROP_END_OF_LIST() }; -static void xen_pv_class_init(ObjectClass *klass, void *data) +static void xen_pv_class_init(ObjectClass *klass, const void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); @@ -140,7 +139,7 @@ static const TypeInfo xen_pv_type_info = { .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(XenPVDevice), .class_init = xen_pv_class_init, - .interfaces = (InterfaceInfo[]) { + .interfaces = (const InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, { }, }, |