diff options
Diffstat (limited to 'hw/i386')
-rw-r--r-- | hw/i386/Kconfig | 12 | ||||
-rw-r--r-- | hw/i386/acpi-build.c | 538 | ||||
-rw-r--r-- | hw/i386/acpi-build.h | 4 | ||||
-rw-r--r-- | hw/i386/amd_iommu.c | 1183 | ||||
-rw-r--r-- | hw/i386/amd_iommu.h | 112 | ||||
-rw-r--r-- | hw/i386/intel_iommu.c | 539 | ||||
-rw-r--r-- | hw/i386/intel_iommu_internal.h | 53 | ||||
-rw-r--r-- | hw/i386/isapc.c | 189 | ||||
-rw-r--r-- | hw/i386/kvm/apic.c | 5 | ||||
-rw-r--r-- | hw/i386/kvm/xen-stubs.c | 13 | ||||
-rw-r--r-- | hw/i386/kvm/xen_evtchn.c | 2 | ||||
-rw-r--r-- | hw/i386/meson.build | 2 | ||||
-rw-r--r-- | hw/i386/microvm.c | 3 | ||||
-rw-r--r-- | hw/i386/monitor.c | 2 | ||||
-rw-r--r-- | hw/i386/pc.c | 175 | ||||
-rw-r--r-- | hw/i386/pc_piix.c | 311 | ||||
-rw-r--r-- | hw/i386/pc_q35.c | 53 | ||||
-rw-r--r-- | hw/i386/pc_sysfw.c | 38 | ||||
-rw-r--r-- | hw/i386/sgx-stub.c | 6 | ||||
-rw-r--r-- | hw/i386/sgx.c | 34 | ||||
-rw-r--r-- | hw/i386/tdvf-hob.c | 130 | ||||
-rw-r--r-- | hw/i386/tdvf-hob.h | 26 | ||||
-rw-r--r-- | hw/i386/tdvf.c | 189 | ||||
-rw-r--r-- | hw/i386/x86-common.c | 8 | ||||
-rw-r--r-- | hw/i386/x86-iommu.c | 1 | ||||
-rw-r--r-- | hw/i386/x86.c | 1 |
26 files changed, 2462 insertions, 1167 deletions
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index d34ce07..6a0ab54 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -4,12 +4,17 @@ config X86_FW_OVMF config SEV bool select X86_FW_OVMF - depends on KVM + depends on KVM && X86_64 config SGX bool depends on KVM +config TDX + bool + select X86_FW_OVMF + depends on KVM && X86_64 + config PC bool imply APPLESMC @@ -26,6 +31,7 @@ config PC imply QXL imply SEV imply SGX + imply TDX imply TEST_DEVICES imply TPM_CRB imply TPM_TIS_ISA @@ -90,9 +96,6 @@ config ISAPC select ISA_BUS select PC select IDE_ISA - # FIXME: it is in the same file as i440fx, and does not compile - # if separated - depends on I440FX config Q35 bool @@ -125,6 +128,7 @@ config MICROVM select I8259 select MC146818RTC select VIRTIO_MMIO + select ACPI_PCI select ACPI_HW_REDUCED select PCI_EXPRESS_GENERIC_BRIDGE select USB_XHCI_SYSBUS diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 61851cc..9446a9f 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -338,405 +338,6 @@ build_facs(GArray *table_data) g_array_append_vals(table_data, reserved, 40); /* Reserved */ } -Aml *aml_pci_device_dsm(void) -{ - Aml *method; - - method = aml_method("_DSM", 4, AML_SERIALIZED); - { - Aml *params = aml_local(0); - Aml *pkg = aml_package(2); - aml_append(pkg, aml_int(0)); - aml_append(pkg, aml_int(0)); - aml_append(method, aml_store(pkg, params)); - aml_append(method, - aml_store(aml_name("BSEL"), aml_index(params, aml_int(0)))); - aml_append(method, - aml_store(aml_name("ASUN"), aml_index(params, aml_int(1)))); - aml_append(method, - aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1), - aml_arg(2), aml_arg(3), params)) - ); - } - return method; -} - -static void build_append_pci_dsm_func0_common(Aml *ctx, Aml *retvar) -{ - Aml *UUID, *ifctx1; - uint8_t byte_list[1] = { 0 }; /* nothing supported yet */ - - aml_append(ctx, aml_store(aml_buffer(1, byte_list), retvar)); - /* - * PCI Firmware Specification 3.1 - * 4.6. _DSM Definitions for PCI - */ - UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); - ifctx1 = aml_if(aml_lnot(aml_equal(aml_arg(0), UUID))); - { - /* call is for unsupported UUID, bail out */ - aml_append(ifctx1, aml_return(retvar)); - } - aml_append(ctx, ifctx1); - - ifctx1 = aml_if(aml_lless(aml_arg(1), aml_int(2))); - { - /* call is for unsupported REV, bail out */ - aml_append(ifctx1, aml_return(retvar)); - } - aml_append(ctx, ifctx1); -} - -static Aml *aml_pci_edsm(void) -{ - Aml *method, *ifctx; - Aml *zero = aml_int(0); - Aml *func = aml_arg(2); - Aml *ret = aml_local(0); - Aml *aidx = aml_local(1); - Aml *params = aml_arg(4); - - method = aml_method("EDSM", 5, AML_SERIALIZED); - - /* get supported functions */ - ifctx = aml_if(aml_equal(func, zero)); - { - /* 1: have supported functions */ - /* 7: support for function 7 */ - const uint8_t caps = 1 | BIT(7); - build_append_pci_dsm_func0_common(ifctx, ret); - aml_append(ifctx, aml_store(aml_int(caps), aml_index(ret, zero))); - aml_append(ifctx, aml_return(ret)); - } - aml_append(method, ifctx); - - /* handle specific functions requests */ - /* - * PCI Firmware Specification 3.1 - * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under - * Operating Systems - */ - ifctx = aml_if(aml_equal(func, aml_int(7))); - { - Aml *pkg = aml_package(2); - aml_append(pkg, zero); - /* optional, if not impl. should return null string */ - aml_append(pkg, aml_string("%s", "")); - aml_append(ifctx, aml_store(pkg, ret)); - - /* - * IASL is fine when initializing Package with computational data, - * however it makes guest unhappy /it fails to process such AML/. - * So use runtime assignment to set acpi-index after initializer - * to make OSPM happy. - */ - aml_append(ifctx, - aml_store(aml_derefof(aml_index(params, aml_int(0))), aidx)); - aml_append(ifctx, aml_store(aidx, aml_index(ret, zero))); - aml_append(ifctx, aml_return(ret)); - } - aml_append(method, ifctx); - - return method; -} - -static Aml *aml_pci_static_endpoint_dsm(PCIDevice *pdev) -{ - Aml *method; - - g_assert(pdev->acpi_index != 0); - method = aml_method("_DSM", 4, AML_SERIALIZED); - { - Aml *params = aml_local(0); - Aml *pkg = aml_package(1); - aml_append(pkg, aml_int(pdev->acpi_index)); - aml_append(method, aml_store(pkg, params)); - aml_append(method, - aml_return(aml_call5("EDSM", aml_arg(0), aml_arg(1), - aml_arg(2), aml_arg(3), params)) - ); - } - return method; -} - -static void build_append_pcihp_notify_entry(Aml *method, int slot) -{ - Aml *if_ctx; - int32_t devfn = PCI_DEVFN(slot, 0); - - if_ctx = aml_if(aml_and(aml_arg(0), aml_int(0x1U << slot), NULL)); - aml_append(if_ctx, aml_notify(aml_name("S%.02X", devfn), aml_arg(1))); - aml_append(method, if_ctx); -} - -static bool is_devfn_ignored_generic(const int devfn, const PCIBus *bus) -{ - const PCIDevice *pdev = bus->devices[devfn]; - - if (PCI_FUNC(devfn)) { - if (IS_PCI_BRIDGE(pdev)) { - /* - * Ignore only hotplugged PCI bridges on !0 functions, but - * allow describing cold plugged bridges on all functions - */ - if (DEVICE(pdev)->hotplugged) { - return true; - } - } - } - return false; -} - -static bool is_devfn_ignored_hotplug(const int devfn, const PCIBus *bus) -{ - PCIDevice *pdev = bus->devices[devfn]; - if (pdev) { - return is_devfn_ignored_generic(devfn, bus) || - !DEVICE_GET_CLASS(pdev)->hotpluggable || - /* Cold plugged bridges aren't themselves hot-pluggable */ - (IS_PCI_BRIDGE(pdev) && !DEVICE(pdev)->hotplugged); - } else { /* non populated slots */ - /* - * hotplug is supported only for non-multifunction device - * so generate device description only for function 0 - */ - if (PCI_FUNC(devfn) || - (pci_bus_is_express(bus) && PCI_SLOT(devfn) > 0)) { - return true; - } - } - return false; -} - -void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus) -{ - int devfn; - Aml *dev, *notify_method = NULL, *method; - QObject *bsel = object_property_get_qobject(OBJECT(bus), - ACPI_PCIHP_PROP_BSEL, NULL); - uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel)); - qobject_unref(bsel); - - aml_append(parent_scope, aml_name_decl("BSEL", aml_int(bsel_val))); - notify_method = aml_method("DVNT", 2, AML_NOTSERIALIZED); - - for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { - int slot = PCI_SLOT(devfn); - int adr = slot << 16 | PCI_FUNC(devfn); - - if (is_devfn_ignored_hotplug(devfn, bus)) { - continue; - } - - if (bus->devices[devfn]) { - dev = aml_scope("S%.02X", devfn); - } else { - dev = aml_device("S%.02X", devfn); - aml_append(dev, aml_name_decl("_ADR", aml_int(adr))); - } - - /* - * Can't declare _SUN here for every device as it changes 'slot' - * enumeration order in linux kernel, so use another variable for it - */ - aml_append(dev, aml_name_decl("ASUN", aml_int(slot))); - aml_append(dev, aml_pci_device_dsm()); - - aml_append(dev, aml_name_decl("_SUN", aml_int(slot))); - /* add _EJ0 to make slot hotpluggable */ - method = aml_method("_EJ0", 1, AML_NOTSERIALIZED); - aml_append(method, - aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN")) - ); - aml_append(dev, method); - - build_append_pcihp_notify_entry(notify_method, slot); - - /* device descriptor has been composed, add it into parent context */ - aml_append(parent_scope, dev); - } - aml_append(parent_scope, notify_method); -} - -void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus) -{ - int devfn; - Aml *dev; - - for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) { - /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */ - int adr = PCI_SLOT(devfn) << 16 | PCI_FUNC(devfn); - PCIDevice *pdev = bus->devices[devfn]; - - if (!pdev || is_devfn_ignored_generic(devfn, bus)) { - continue; - } - - /* start to compose PCI device descriptor */ - dev = aml_device("S%.02X", devfn); - aml_append(dev, aml_name_decl("_ADR", aml_int(adr))); - - call_dev_aml_func(DEVICE(bus->devices[devfn]), dev); - /* add _DSM if device has acpi-index set */ - if (pdev->acpi_index && - !object_property_get_bool(OBJECT(pdev), "hotpluggable", - &error_abort)) { - aml_append(dev, aml_pci_static_endpoint_dsm(pdev)); - } - - /* device descriptor has been composed, add it into parent context */ - aml_append(parent_scope, dev); - } -} - -static bool build_append_notification_callback(Aml *parent_scope, - const PCIBus *bus) -{ - Aml *method; - PCIBus *sec; - QObject *bsel; - int nr_notifiers = 0; - GQueue *pcnt_bus_list = g_queue_new(); - - QLIST_FOREACH(sec, &bus->child, sibling) { - Aml *br_scope = aml_scope("S%.02X", sec->parent_dev->devfn); - if (pci_bus_is_root(sec)) { - continue; - } - nr_notifiers = nr_notifiers + - build_append_notification_callback(br_scope, sec); - /* - * add new child scope to parent - * and keep track of bus that have PCNT, - * bus list is used later to call children PCNTs from this level PCNT - */ - if (nr_notifiers) { - g_queue_push_tail(pcnt_bus_list, sec); - aml_append(parent_scope, br_scope); - } - } - - /* - * Append PCNT method to notify about events on local and child buses. - * ps: hostbridge might not have hotplug (bsel) enabled but might have - * child bridges that do have bsel. - */ - method = aml_method("PCNT", 0, AML_NOTSERIALIZED); - - /* If bus supports hotplug select it and notify about local events */ - bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL); - if (bsel) { - uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel)); - - aml_append(method, aml_store(aml_int(bsel_val), aml_name("BNUM"))); - aml_append(method, aml_call2("DVNT", aml_name("PCIU"), - aml_int(1))); /* Device Check */ - aml_append(method, aml_call2("DVNT", aml_name("PCID"), - aml_int(3))); /* Eject Request */ - nr_notifiers++; - } - - /* Notify about child bus events in any case */ - while ((sec = g_queue_pop_head(pcnt_bus_list))) { - aml_append(method, aml_name("^S%.02X.PCNT", sec->parent_dev->devfn)); - } - - aml_append(parent_scope, method); - qobject_unref(bsel); - g_queue_free(pcnt_bus_list); - return !!nr_notifiers; -} - -static Aml *aml_pci_pdsm(void) -{ - Aml *method, *ifctx, *ifctx1; - Aml *ret = aml_local(0); - Aml *caps = aml_local(1); - Aml *acpi_index = aml_local(2); - Aml *zero = aml_int(0); - Aml *one = aml_int(1); - Aml *not_supp = aml_int(0xFFFFFFFF); - Aml *func = aml_arg(2); - Aml *params = aml_arg(4); - Aml *bnum = aml_derefof(aml_index(params, aml_int(0))); - Aml *sunum = aml_derefof(aml_index(params, aml_int(1))); - - method = aml_method("PDSM", 5, AML_SERIALIZED); - - /* get supported functions */ - ifctx = aml_if(aml_equal(func, zero)); - { - build_append_pci_dsm_func0_common(ifctx, ret); - - aml_append(ifctx, aml_store(zero, caps)); - aml_append(ifctx, - aml_store(aml_call2("AIDX", bnum, sunum), acpi_index)); - /* - * advertise function 7 if device has acpi-index - * acpi_index values: - * 0: not present (default value) - * FFFFFFFF: not supported (old QEMU without PIDX reg) - * other: device's acpi-index - */ - ifctx1 = aml_if(aml_lnot( - aml_or(aml_equal(acpi_index, zero), - aml_equal(acpi_index, not_supp), NULL) - )); - { - /* have supported functions */ - aml_append(ifctx1, aml_or(caps, one, caps)); - /* support for function 7 */ - aml_append(ifctx1, - aml_or(caps, aml_shiftleft(one, aml_int(7)), caps)); - } - aml_append(ifctx, ifctx1); - - aml_append(ifctx, aml_store(caps, aml_index(ret, zero))); - aml_append(ifctx, aml_return(ret)); - } - aml_append(method, ifctx); - - /* handle specific functions requests */ - /* - * PCI Firmware Specification 3.1 - * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under - * Operating Systems - */ - ifctx = aml_if(aml_equal(func, aml_int(7))); - { - Aml *pkg = aml_package(2); - - aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sunum), acpi_index)); - aml_append(ifctx, aml_store(pkg, ret)); - /* - * Windows calls func=7 without checking if it's available, - * as workaround Microsoft has suggested to return invalid for func7 - * Package, so return 2 elements package but only initialize elements - * when acpi_index is supported and leave them uninitialized, which - * leads elements to being Uninitialized ObjectType and should trip - * Windows into discarding result as an unexpected and prevent setting - * bogus 'PCI Label' on the device. - */ - ifctx1 = aml_if(aml_lnot(aml_lor( - aml_equal(acpi_index, zero), aml_equal(acpi_index, not_supp) - ))); - { - aml_append(ifctx1, aml_store(acpi_index, aml_index(ret, zero))); - /* - * optional, if not impl. should return null string - */ - aml_append(ifctx1, aml_store(aml_string("%s", ""), - aml_index(ret, one))); - } - aml_append(ifctx, ifctx1); - - aml_append(ifctx, aml_return(ret)); - } - - aml_append(method, ifctx); - return method; -} - /* * build_prt - Define interrupt routing rules * @@ -1227,112 +828,6 @@ static Aml *build_q35_dram_controller(const AcpiMcfgInfo *mcfg) return dev; } -static void build_x86_acpi_pci_hotplug(Aml *table, uint64_t pcihp_addr) -{ - Aml *scope; - Aml *field; - Aml *method; - - scope = aml_scope("_SB.PCI0"); - - aml_append(scope, - aml_operation_region("PCST", AML_SYSTEM_IO, aml_int(pcihp_addr), 0x08)); - field = aml_field("PCST", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); - aml_append(field, aml_named_field("PCIU", 32)); - aml_append(field, aml_named_field("PCID", 32)); - aml_append(scope, field); - - aml_append(scope, - aml_operation_region("SEJ", AML_SYSTEM_IO, - aml_int(pcihp_addr + ACPI_PCIHP_SEJ_BASE), 0x04)); - field = aml_field("SEJ", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); - aml_append(field, aml_named_field("B0EJ", 32)); - aml_append(scope, field); - - aml_append(scope, - aml_operation_region("BNMR", AML_SYSTEM_IO, - aml_int(pcihp_addr + ACPI_PCIHP_BNMR_BASE), 0x08)); - field = aml_field("BNMR", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS); - aml_append(field, aml_named_field("BNUM", 32)); - aml_append(field, aml_named_field("PIDX", 32)); - aml_append(scope, field); - - aml_append(scope, aml_mutex("BLCK", 0)); - - method = aml_method("PCEJ", 2, AML_NOTSERIALIZED); - aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF)); - aml_append(method, aml_store(aml_arg(0), aml_name("BNUM"))); - aml_append(method, - aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("B0EJ"))); - aml_append(method, aml_release(aml_name("BLCK"))); - aml_append(method, aml_return(aml_int(0))); - aml_append(scope, method); - - method = aml_method("AIDX", 2, AML_NOTSERIALIZED); - aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF)); - aml_append(method, aml_store(aml_arg(0), aml_name("BNUM"))); - aml_append(method, - aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("PIDX"))); - aml_append(method, aml_store(aml_name("PIDX"), aml_local(0))); - aml_append(method, aml_release(aml_name("BLCK"))); - aml_append(method, aml_return(aml_local(0))); - aml_append(scope, method); - - aml_append(scope, aml_pci_pdsm()); - - aml_append(table, scope); -} - -static Aml *build_q35_osc_method(bool enable_native_pcie_hotplug) -{ - Aml *if_ctx; - Aml *if_ctx2; - Aml *else_ctx; - Aml *method; - Aml *a_cwd1 = aml_name("CDW1"); - Aml *a_ctrl = aml_local(0); - - method = aml_method("_OSC", 4, AML_NOTSERIALIZED); - aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1")); - - if_ctx = aml_if(aml_equal( - aml_arg(0), aml_touuid("33DB4D5B-1FF7-401C-9657-7441C03DD766"))); - aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2")); - aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(8), "CDW3")); - - aml_append(if_ctx, aml_store(aml_name("CDW3"), a_ctrl)); - - /* - * Always allow native PME, AER (no dependencies) - * Allow SHPC (PCI bridges can have SHPC controller) - * Disable PCIe Native Hot-plug if ACPI PCI Hot-plug is enabled. - */ - aml_append(if_ctx, aml_and(a_ctrl, - aml_int(0x1E | (enable_native_pcie_hotplug ? 0x1 : 0x0)), a_ctrl)); - - if_ctx2 = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1)))); - /* Unknown revision */ - aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x08), a_cwd1)); - aml_append(if_ctx, if_ctx2); - - if_ctx2 = aml_if(aml_lnot(aml_equal(aml_name("CDW3"), a_ctrl))); - /* Capabilities bits were masked */ - aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x10), a_cwd1)); - aml_append(if_ctx, if_ctx2); - - /* Update DWORD3 in the buffer */ - aml_append(if_ctx, aml_store(a_ctrl, aml_name("CDW3"))); - aml_append(method, if_ctx); - - else_ctx = aml_else(); - /* Unrecognized UUID */ - aml_append(else_ctx, aml_or(a_cwd1, aml_int(4), a_cwd1)); - aml_append(method, else_ctx); - - aml_append(method, aml_return(aml_arg(3))); - return method; -} - static void build_acpi0017(Aml *table) { Aml *dev, *scope, *method; @@ -1389,12 +884,12 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, dev = aml_device("PCI0"); aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03"))); aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); - aml_append(dev, aml_pci_edsm()); + aml_append(dev, build_pci_bridge_edsm()); aml_append(sb_scope, dev); aml_append(dsdt, sb_scope); if (pm->pcihp_bridge_en || pm->pcihp_root_en) { - build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); + build_acpi_pci_hotplug(dsdt, AML_SYSTEM_IO, pm->pcihp_io_base); } build_piix4_pci0_int(dsdt); } else if (q35) { @@ -1403,8 +898,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08"))); aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03"))); aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid))); - aml_append(dev, build_q35_osc_method(!pm->pcihp_bridge_en)); - aml_append(dev, aml_pci_edsm()); + aml_append(dev, build_pci_host_bridge_osc_method(!pm->pcihp_bridge_en)); + aml_append(dev, build_pci_bridge_edsm()); aml_append(sb_scope, dev); if (mcfg_valid) { aml_append(sb_scope, build_q35_dram_controller(&mcfg)); @@ -1438,7 +933,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, aml_append(dsdt, sb_scope); if (pm->pcihp_bridge_en) { - build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base); + build_acpi_pci_hotplug(dsdt, AML_SYSTEM_IO, pm->pcihp_io_base); } build_q35_pci0_int(dsdt); } @@ -1525,7 +1020,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03"))); /* Expander bridges do not have ACPI PCI Hot-plug enabled */ - aml_append(dev, build_q35_osc_method(true)); + aml_append(dev, build_pci_host_bridge_osc_method(true)); } else { aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03"))); } @@ -1654,19 +1149,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, /* reserve PCIHP resources */ if (pm->pcihp_io_len && (pm->pcihp_bridge_en || pm->pcihp_root_en)) { - dev = aml_device("PHPR"); - aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06"))); - aml_append(dev, - aml_name_decl("_UID", aml_string("PCI Hotplug resources"))); - /* device present, functioning, decoding, not shown in UI */ - aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); - crs = aml_resource_template(); - aml_append(crs, - aml_io(AML_DECODE16, pm->pcihp_io_base, pm->pcihp_io_base, 1, - pm->pcihp_io_len) - ); - aml_append(dev, aml_name_decl("_CRS", crs)); - aml_append(scope, dev); + build_append_pcihp_resources(scope, + pm->pcihp_io_base, pm->pcihp_io_len); } aml_append(dsdt, scope); @@ -2379,7 +1863,11 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id, /* IOMMU info */ build_append_int_noprefix(table_data, 0, 2); /* IOMMU Attributes */ - build_append_int_noprefix(table_data, 0, 4); + if (!s->iommu.dma_translation) { + build_append_int_noprefix(table_data, (1UL << 0) /* HATDis */, 4); + } else { + build_append_int_noprefix(table_data, 0, 4); + } /* EFR Register Image */ build_append_int_noprefix(table_data, amdvi_extended_feature_register(s), diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h index 275ec05..8ba3c33 100644 --- a/hw/i386/acpi-build.h +++ b/hw/i386/acpi-build.h @@ -5,10 +5,6 @@ extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio; -/* PCI Hot-plug registers' base. See docs/specs/acpi_pci_hotplug.rst */ -#define ACPI_PCIHP_SEJ_BASE 0x8 -#define ACPI_PCIHP_BNMR_BASE 0x10 - void acpi_setup(void); Object *acpi_get_i386_pci_host(void); diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 0775c8f..378e0cb 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -33,6 +33,7 @@ #include "hw/i386/apic-msidef.h" #include "hw/qdev-properties.h" #include "kvm/kvm_i386.h" +#include "qemu/iova-tree.h" /* used AMD-Vi MMIO registers */ const char *amdvi_mmio_low[] = { @@ -66,6 +67,15 @@ struct AMDVIAddressSpace { MemoryRegion iommu_nodma; /* Alias of shared nodma memory region */ MemoryRegion iommu_ir; /* Device's interrupt remapping region */ AddressSpace as; /* device's corresponding address space */ + + /* DMA address translation support */ + IOMMUNotifierFlag notifier_flags; + /* entry in list of Address spaces with registered notifiers */ + QLIST_ENTRY(AMDVIAddressSpace) next; + /* Record DMA translation ranges */ + IOVATree *iova_tree; + /* DMA address translation active */ + bool addr_translation; }; /* AMDVI cache entry */ @@ -77,12 +87,29 @@ typedef struct AMDVIIOTLBEntry { uint64_t page_mask; /* physical page size */ } AMDVIIOTLBEntry; +/* + * These 'fault' reasons have an overloaded meaning since they are not only + * intended for describing reasons that generate an IO_PAGE_FAULT as per the AMD + * IOMMU specification, but are also used to signal internal errors in the + * emulation code. + */ +typedef enum AMDVIFaultReason { + AMDVI_FR_DTE_RTR_ERR = 1, /* Failure to retrieve DTE */ + AMDVI_FR_DTE_V, /* DTE[V] = 0 */ + AMDVI_FR_DTE_TV, /* DTE[TV] = 0 */ + AMDVI_FR_PT_ROOT_INV, /* Page Table Root ptr invalid */ + AMDVI_FR_PT_ENTRY_INV, /* Failure to read PTE from guest memory */ +} AMDVIFaultReason; + uint64_t amdvi_extended_feature_register(AMDVIState *s) { uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES; if (s->xtsup) { feature |= AMDVI_FEATURE_XT; } + if (!s->iommu.dma_translation) { + feature |= AMDVI_HATS_MODE_RESERVED; + } return feature; } @@ -123,8 +150,13 @@ static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val) uint16_t romask = lduw_le_p(&s->romask[addr]); uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]); uint16_t oldval = lduw_le_p(&s->mmior[addr]); + + uint16_t oldval_preserved = oldval & (romask | w1cmask); + uint16_t newval_write = val & ~romask; + uint16_t newval_w1c_set = val & w1cmask; + stw_le_p(&s->mmior[addr], - ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); + (oldval_preserved | newval_write) & ~newval_w1c_set); } static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val) @@ -132,23 +164,33 @@ static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val) uint32_t romask = ldl_le_p(&s->romask[addr]); uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]); uint32_t oldval = ldl_le_p(&s->mmior[addr]); + + uint32_t oldval_preserved = oldval & (romask | w1cmask); + uint32_t newval_write = val & ~romask; + uint32_t newval_w1c_set = val & w1cmask; + stl_le_p(&s->mmior[addr], - ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); + (oldval_preserved | newval_write) & ~newval_w1c_set); } static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val) { uint64_t romask = ldq_le_p(&s->romask[addr]); uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]); - uint32_t oldval = ldq_le_p(&s->mmior[addr]); + uint64_t oldval = ldq_le_p(&s->mmior[addr]); + + uint64_t oldval_preserved = oldval & (romask | w1cmask); + uint64_t newval_write = val & ~romask; + uint64_t newval_w1c_set = val & w1cmask; + stq_le_p(&s->mmior[addr], - ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask)); + (oldval_preserved | newval_write) & ~newval_w1c_set); } -/* OR a 64-bit register with a 64-bit value */ +/* AND a 64-bit register with a 64-bit value */ static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val) { - return amdvi_readq(s, addr) | val; + return amdvi_readq(s, addr) & val; } /* OR a 64-bit register with a 64-bit value storing result in the register */ @@ -177,19 +219,31 @@ static void amdvi_generate_msi_interrupt(AMDVIState *s) } } +static uint32_t get_next_eventlog_entry(AMDVIState *s) +{ + uint32_t evtlog_size = s->evtlog_len * AMDVI_EVENT_LEN; + return (s->evtlog_tail + AMDVI_EVENT_LEN) % evtlog_size; +} + static void amdvi_log_event(AMDVIState *s, uint64_t *evt) { + uint32_t evtlog_tail_next; + /* event logging not enabled */ if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF)) { return; } + evtlog_tail_next = get_next_eventlog_entry(s); + /* event log buffer full */ - if (s->evtlog_tail >= s->evtlog_len) { - amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF); - /* generate interrupt */ - amdvi_generate_msi_interrupt(s); + if (evtlog_tail_next == s->evtlog_head) { + /* generate overflow interrupt */ + if (s->evtlog_intr) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF); + amdvi_generate_msi_interrupt(s); + } return; } @@ -198,9 +252,13 @@ static void amdvi_log_event(AMDVIState *s, uint64_t *evt) trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail); } - s->evtlog_tail += AMDVI_EVENT_LEN; - amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT); - amdvi_generate_msi_interrupt(s); + s->evtlog_tail = evtlog_tail_next; + amdvi_writeq_raw(s, AMDVI_MMIO_EVENT_TAIL, s->evtlog_tail); + + if (s->evtlog_intr) { + amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVENT_INT); + amdvi_generate_msi_interrupt(s); + } } static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start, @@ -407,18 +465,704 @@ static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd) trace_amdvi_completion_wait(addr, data); } +static inline uint64_t amdvi_get_perms(uint64_t entry) +{ + return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> + AMDVI_DEV_PERM_SHIFT; +} + +/* validate that reserved bits are honoured */ +static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, + uint64_t *dte) +{ + + uint64_t root; + + if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) || + (dte[1] & AMDVI_DTE_QUAD1_RESERVED) || + (dte[2] & AMDVI_DTE_QUAD2_RESERVED) || + (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + /* + * 1 = Host Address Translation is not supported. Value in MMIO Offset + * 0030h[HATS] is not meaningful. A non-zero host page table root pointer + * in the DTE would result in an ILLEGAL_DEV_TABLE_ENTRY event. + */ + root = (dte[0] & AMDVI_DEV_PT_ROOT_MASK) >> 12; + if (root && !s->iommu.dma_translation) { + amdvi_log_illegaldevtab_error(s, devid, + s->devtab + + devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); + return false; + } + + return true; +} + +/* get a device table entry given the devid */ +static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +{ + uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + + if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, + AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) { + trace_amdvi_dte_get_fail(s->devtab, offset); + /* log error accessing dte */ + amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); + return false; + } + + *entry = le64_to_cpu(*entry); + if (!amdvi_validate_dte(s, devid, entry)) { + trace_amdvi_invalid_dte(entry[0]); + return false; + } + + return true; +} + +/* get pte translation mode */ +static inline uint8_t get_pte_translation_mode(uint64_t pte) +{ + return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; +} + +static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, + uint16_t devid) +{ + uint64_t pte; + + if (dma_memory_read(&address_space_memory, pte_addr, + &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) { + trace_amdvi_get_pte_hwerror(pte_addr); + amdvi_log_pagetab_error(s, devid, pte_addr, 0); + pte = (uint64_t)-1; + return pte; + } + + pte = le64_to_cpu(pte); + return pte; +} + +static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte) +{ + uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + AMDVIState *s = as->iommu_state; + + if (!amdvi_get_dte(s, devid, dte)) { + /* Unable to retrieve DTE for devid */ + return -AMDVI_FR_DTE_RTR_ERR; + } + + if (!(dte[0] & AMDVI_DEV_VALID)) { + /* DTE[V] not set, address is passed untranslated for devid */ + return -AMDVI_FR_DTE_V; + } + + if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) { + /* DTE[TV] not set, host page table not valid for devid */ + return -AMDVI_FR_DTE_TV; + } + return 0; +} + +/* + * For a PTE encoding a large page, return the page size it encodes as described + * by the AMD IOMMU Specification Table 14: Example Page Size Encodings. + * No need to adjust the value of the PTE to point to the first PTE in the large + * page since the encoding guarantees all "base" PTEs in the large page are the + * same. + */ +static uint64_t large_pte_page_size(uint64_t pte) +{ + assert(PTE_NEXT_LEVEL(pte) == 7); + + /* Determine size of the large/contiguous page encoded in the PTE */ + return PTE_LARGE_PAGE_SIZE(pte); +} + +/* + * Helper function to fetch a PTE using AMD v1 pgtable format. + * On successful page walk, returns 0 and pte parameter points to a valid PTE. + * On failure, returns: + * -AMDVI_FR_PT_ROOT_INV: A page walk is not possible due to conditions like DTE + * with invalid permissions, Page Table Root can not be read from DTE, or a + * larger IOVA than supported by page table level encoded in DTE[Mode]. + * -AMDVI_FR_PT_ENTRY_INV: A PTE could not be read from guest memory during a + * page table walk. This means that the DTE has valid data, but one of the + * lower level entries in the Page Table could not be read. + */ +static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte, + uint64_t *pte, hwaddr *page_size) +{ + IOMMUAccessFlags perms = amdvi_get_perms(dte); + + uint8_t level, mode; + uint64_t pte_addr; + + *pte = dte; + *page_size = 0; + + if (perms == IOMMU_NONE) { + return -AMDVI_FR_PT_ROOT_INV; + } + + /* + * The Linux kernel driver initializes the default mode to 3, corresponding + * to a 39-bit GPA space, where each entry in the pagetable translates to a + * 1GB (2^30) page size. + */ + level = mode = get_pte_translation_mode(dte); + assert(mode > 0 && mode < 7); + + /* + * If IOVA is larger than the max supported by the current pgtable level, + * there is nothing to do. + */ + if (address > PT_LEVEL_MAX_ADDR(mode - 1)) { + /* IOVA too large for the current DTE */ + return -AMDVI_FR_PT_ROOT_INV; + } + + do { + level -= 1; + + /* Update the page_size */ + *page_size = PTE_LEVEL_PAGE_SIZE(level); + + /* Permission bits are ANDed at every level, including the DTE */ + perms &= amdvi_get_perms(*pte); + if (perms == IOMMU_NONE) { + return 0; + } + + /* Not Present */ + if (!IOMMU_PTE_PRESENT(*pte)) { + return 0; + } + + /* Large or Leaf PTE found */ + if (PTE_NEXT_LEVEL(*pte) == 7 || PTE_NEXT_LEVEL(*pte) == 0) { + /* Leaf PTE found */ + break; + } + + /* + * Index the pgtable using the IOVA bits corresponding to current level + * and walk down to the lower level. + */ + pte_addr = NEXT_PTE_ADDR(*pte, level, address); + *pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); + + if (*pte == (uint64_t)-1) { + /* + * A returned PTE of -1 indicates a failure to read the page table + * entry from guest memory. + */ + if (level == mode - 1) { + /* Failure to retrieve the Page Table from Root Pointer */ + *page_size = 0; + return -AMDVI_FR_PT_ROOT_INV; + } else { + /* Failure to read PTE. Page walk skips a page_size chunk */ + return -AMDVI_FR_PT_ENTRY_INV; + } + } + } while (level > 0); + + assert(PTE_NEXT_LEVEL(*pte) == 0 || PTE_NEXT_LEVEL(*pte) == 7 || + level == 0); + /* + * Page walk ends when Next Level field on PTE shows that either a leaf PTE + * or a series of large PTEs have been reached. In the latter case, even if + * the range starts in the middle of a contiguous page, the returned PTE + * must be the first PTE of the series. + */ + if (PTE_NEXT_LEVEL(*pte) == 7) { + /* Update page_size with the large PTE page size */ + *page_size = large_pte_page_size(*pte); + } + + return 0; +} + +/* + * Invoke notifiers registered for the address space. Update record of mapped + * ranges in IOVA Tree. + */ +static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event) +{ + IOMMUTLBEntry *entry = &event->entry; + + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + + /* + * Search the IOVA Tree for an existing translation for the target, and skip + * the notification if the mapping is already recorded. + * When the guest uses large pages, comparing against the record makes it + * possible to determine the size of the original MAP and adjust the UNMAP + * request to match it. This avoids failed checks against the mappings kept + * by the VFIO kernel driver. + */ + const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (event->type == IOMMU_NOTIFIER_UNMAP) { + if (!mapped) { + /* No record exists of this mapping, nothing to do */ + return; + } + /* + * Adjust the size based on the original record. This is essential to + * determine when large/contiguous pages are used, since the guest has + * already cleared the PTE (erasing the pagesize encoded on it) before + * issuing the invalidation command. + */ + if (mapped->size != target.size) { + assert(mapped->size > target.size); + target.size = mapped->size; + /* Adjust event to invoke notifier with correct range */ + entry->addr_mask = mapped->size; + } + iova_tree_remove(as->iova_tree, target); + } else { /* IOMMU_NOTIFIER_MAP */ + if (mapped) { + /* + * If a mapping is present and matches the request, skip the + * notification. + */ + if (!memcmp(mapped, &target, sizeof(DMAMap))) { + return; + } else { + /* + * This should never happen unless a buggy guest OS omits or + * sends incorrect invalidation(s). Report an error in the event + * it does happen. + */ + error_report("Found conflicting translation. This could be due " + "to an incorrect or missing invalidation command"); + } + } + /* Record the new mapping */ + iova_tree_insert(as->iova_tree, &target); + } + + /* Invoke the notifiers registered for this address space */ + memory_region_notify_iommu(&as->iommu, 0, *event); +} + +/* + * Walk the guest page table for an IOVA and range and signal the registered + * notifiers to sync the shadow page tables in the host. + * Must be called with a valid DTE for DMA remapping i.e. V=1,TV=1 + */ +static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as, + uint64_t *dte, hwaddr addr, + uint64_t size, bool send_unmap) +{ + IOMMUTLBEvent event; + + hwaddr page_mask, pagesize; + hwaddr iova = addr; + hwaddr end = iova + size - 1; + + uint64_t pte; + int ret; + + while (iova < end) { + + ret = fetch_pte(as, iova, dte[0], &pte, &pagesize); + + if (ret == -AMDVI_FR_PT_ROOT_INV) { + /* + * Invalid conditions such as the IOVA being larger than supported + * by current page table mode as configured in the DTE, or a failure + * to fetch the Page Table from the Page Table Root Pointer in DTE. + */ + assert(pagesize == 0); + return; + } + /* PTE has been validated for major errors and pagesize is set */ + assert(pagesize); + page_mask = ~(pagesize - 1); + + if (ret == -AMDVI_FR_PT_ENTRY_INV) { + /* + * Failure to read PTE from memory, the pagesize matches the current + * level. Unable to determine the region type, so a safe strategy is + * to skip the range and continue the page walk. + */ + goto next; + } + + event.entry.target_as = &address_space_memory; + event.entry.iova = iova & page_mask; + /* translated_addr is irrelevant for the unmap case */ + event.entry.translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & + page_mask; + event.entry.addr_mask = ~page_mask; + event.entry.perm = amdvi_get_perms(pte); + + /* + * In cases where the leaf PTE is not found, or it has invalid + * permissions, an UNMAP type notification is sent, but only if the + * caller requested it. + */ + if (!IOMMU_PTE_PRESENT(pte) || (event.entry.perm == IOMMU_NONE)) { + if (!send_unmap) { + goto next; + } + event.type = IOMMU_NOTIFIER_UNMAP; + } else { + event.type = IOMMU_NOTIFIER_MAP; + } + + /* + * The following call might need to adjust event.entry.size in cases + * where the guest unmapped a series of large pages. + */ + amdvi_notify_iommu(as, &event); + /* + * In the special scenario where the guest is unmapping a large page, + * addr_mask has been adjusted before sending the notification. Update + * pagesize accordingly in order to correctly compute the next IOVA. + */ + pagesize = event.entry.addr_mask + 1; + +next: + iova &= ~(pagesize - 1); + + /* Check for 64-bit overflow and terminate walk in such cases */ + if ((iova + pagesize) < iova) { + break; + } else { + iova += pagesize; + } + } +} + +/* + * Unmap entire range that the notifier registered for i.e. the full AS. + * + * This is seemingly technically equivalent to directly calling + * memory_region_unmap_iommu_notifier_range(), but it allows to check for + * notifier boundaries and issue notifications with ranges within those bounds. + */ +static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n) +{ + + hwaddr start = n->start; + hwaddr end = n->end; + hwaddr remain; + DMAMap map; + + assert(start <= end); + remain = end - start + 1; + + /* + * Divide the notifier range into chunks that are aligned and do not exceed + * the notifier boundaries. + */ + while (remain >= AMDVI_PAGE_SIZE) { + + IOMMUTLBEvent event; + + uint64_t mask = dma_aligned_pow2_mask(start, end, 64); + + event.type = IOMMU_NOTIFIER_UNMAP; + + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = start, + .translated_addr = 0, /* irrelevant for unmap case */ + .addr_mask = mask, + .perm = IOMMU_NONE, + }; + event.entry = entry; + + /* Call notifier registered for updates on this address space */ + memory_region_notify_iommu_one(n, &event); + + start += mask + 1; + remain -= mask + 1; + } + + assert(!remain); + + map.iova = n->start; + map.size = n->end - n->start; + + iova_tree_remove(as->iova_tree, map); +} + +/* + * For all the address spaces with notifiers registered, unmap the entire range + * the notifier registered for i.e. clear all the address spaces managed by the + * IOMMU. + */ +static void amdvi_address_space_unmap_all(AMDVIState *s) +{ + AMDVIAddressSpace *as; + IOMMUNotifier *n; + + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + amdvi_address_space_unmap(as, n); + } + } +} + +/* + * For every translation present in the IOMMU, construct IOMMUTLBEntry data + * and pass it as parameter to notifier callback. + */ +static void amdvi_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) +{ + AMDVIAddressSpace *as = container_of(iommu_mr, AMDVIAddressSpace, iommu); + uint64_t dte[4] = { 0 }; + + if (!(n->notifier_flags & IOMMU_NOTIFIER_MAP)) { + return; + } + + if (amdvi_as_to_dte(as, dte)) { + return; + } + + /* Dropping all mappings for the address space. Also clears the IOVA tree */ + amdvi_address_space_unmap(as, n); + + amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, false); +} + +static void amdvi_address_space_sync(AMDVIAddressSpace *as) +{ + IOMMUNotifier *n; + uint64_t dte[4] = { 0 }; + + /* If only UNMAP notifiers are registered, drop all existing mappings */ + if (!(as->notifier_flags & IOMMU_NOTIFIER_MAP)) { + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + /* + * Directly calling memory_region_unmap_iommu_notifier_range() does + * not guarantee that the addr_mask eventually passed as parameter + * to the notifier is valid. Use amdvi_address_space_unmap() which + * ensures the notifier range is divided into properly aligned + * regions, and issues notifications for each one. + */ + amdvi_address_space_unmap(as, n); + } + return; + } + + if (amdvi_as_to_dte(as, dte)) { + return; + } + + amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, true); +} + +/* + * This differs from the replay() method in that it issues both MAP and UNMAP + * notifications since it is called after global invalidation events in order to + * re-sync all address spaces. + */ +static void amdvi_iommu_address_space_sync_all(AMDVIState *s) +{ + AMDVIAddressSpace *as; + + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + amdvi_address_space_sync(as); + } +} + +/* + * Toggle between address translation and passthrough modes by enabling the + * corresponding memory regions. + */ +static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as) +{ + AMDVIState *s = amdvi_as->iommu_state; + + if (s->dma_remap && amdvi_as->addr_translation) { + /* Enabling DMA region */ + memory_region_set_enabled(&amdvi_as->iommu_nodma, false); + memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), true); + } else { + /* Disabling DMA region, using passthrough */ + memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), false); + memory_region_set_enabled(&amdvi_as->iommu_nodma, true); + } +} + +/* + * For all existing address spaces managed by the IOMMU, enable/disable the + * corresponding memory regions to reset the address translation mode and + * use passthrough by default. + */ +static void amdvi_reset_address_translation_all(AMDVIState *s) +{ + AMDVIAddressSpace **iommu_as; + + for (int bus_num = 0; bus_num < PCI_BUS_MAX; bus_num++) { + + /* Nothing to do if there are no devices on the current bus */ + if (!s->address_spaces[bus_num]) { + continue; + } + iommu_as = s->address_spaces[bus_num]; + + for (int devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) { + + if (!iommu_as[devfn]) { + continue; + } + /* Use passthrough as default mode after reset */ + iommu_as[devfn]->addr_translation = false; + amdvi_switch_address_space(iommu_as[devfn]); + } + } +} + +static void enable_dma_mode(AMDVIAddressSpace *as, bool inval_current) +{ + /* + * When enabling DMA mode for the purpose of isolating guest devices on + * a failure to retrieve or invalid DTE, all existing mappings must be + * dropped. + */ + if (inval_current) { + IOMMUNotifier *n; + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + amdvi_address_space_unmap(as, n); + } + } + + if (as->addr_translation) { + return; + } + + /* Installing DTE enabling translation, activate region */ + as->addr_translation = true; + amdvi_switch_address_space(as); + /* Sync shadow page tables */ + amdvi_address_space_sync(as); +} + +/* + * If paging was previously in use in the address space + * - invalidate all existing mappings + * - switch to no_dma memory region + */ +static void enable_nodma_mode(AMDVIAddressSpace *as) +{ + IOMMUNotifier *n; + + if (!as->addr_translation) { + /* passthrough is already active, nothing to do */ + return; + } + + as->addr_translation = false; + IOMMU_NOTIFIER_FOREACH(n, &as->iommu) { + /* Drop all mappings for the address space */ + amdvi_address_space_unmap(as, n); + } + amdvi_switch_address_space(as); +} + +/* + * A guest driver must issue the INVALIDATE_DEVTAB_ENTRY command to the IOMMU + * after changing a Device Table entry. We can use this fact to detect when a + * Device Table entry is created for a device attached to a paging domain and + * enable the corresponding IOMMU memory region to allow for DMA translation if + * appropriate. + */ +static void amdvi_update_addr_translation_mode(AMDVIState *s, uint16_t devid) +{ + uint8_t bus_num, devfn, dte_mode; + AMDVIAddressSpace *as; + uint64_t dte[4] = { 0 }; + int ret; + + /* + * Convert the devid encoded in the command to a bus and devfn in + * order to retrieve the corresponding address space. + */ + bus_num = PCI_BUS_NUM(devid); + devfn = devid & 0xff; + + /* + * The main buffer of size (AMDVIAddressSpace *) * (PCI_BUS_MAX) has already + * been allocated within AMDVIState, but must be careful to not access + * unallocated devfn. + */ + if (!s->address_spaces[bus_num] || !s->address_spaces[bus_num][devfn]) { + return; + } + as = s->address_spaces[bus_num][devfn]; + + ret = amdvi_as_to_dte(as, dte); + + if (!ret) { + dte_mode = (dte[0] >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; + } + + switch (ret) { + case 0: + /* DTE was successfully retrieved */ + if (!dte_mode) { + enable_nodma_mode(as); /* DTE[V]=1 && DTE[Mode]=0 => passthrough */ + } else { + enable_dma_mode(as, false); /* Enable DMA translation */ + } + break; + case -AMDVI_FR_DTE_V: + /* DTE[V]=0, address is passed untranslated */ + enable_nodma_mode(as); + break; + case -AMDVI_FR_DTE_RTR_ERR: + case -AMDVI_FR_DTE_TV: + /* + * Enforce isolation by using DMA in rare scenarios where the DTE cannot + * be retrieved or DTE[TV]=0. Existing mappings are dropped. + */ + enable_dma_mode(as, true); + break; + } +} + /* log error without aborting since linux seems to be using reserved bits */ static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd) { uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16)); + trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), + PCI_FUNC(devid)); + /* This command should invalidate internal caches of which there isn't */ if (extract64(cmd[0], 16, 44) || cmd[1]) { amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), s->cmdbuf + s->cmdbuf_head); + return; + } + + /* + * When DMA remapping capability is enabled, check if updated DTE is setup + * for paging or not, and configure the corresponding memory regions. + */ + if (s->dma_remap) { + amdvi_update_addr_translation_mode(s, devid); } - trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid), - PCI_FUNC(devid)); } static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd) @@ -449,6 +1193,13 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd) amdvi_intremap_inval_notify_all(s, true, 0, 0); amdvi_iotlb_reset(s); + + /* + * Fully replay the address space i.e. send both UNMAP and MAP events in + * order to synchronize guest and host IO page tables tables. + */ + amdvi_iommu_address_space_sync_all(s); + trace_amdvi_all_inval(); } @@ -460,10 +1211,109 @@ static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value, return entry->domid == domid; } +/* + * Helper to decode the size of the range to invalidate encoded in the + * INVALIDATE_IOMMU_PAGES Command format. + * The size of the region to invalidate depends on the S bit and address. + * S bit value: + * 0 : Invalidation size is 4 Kbytes. + * 1 : Invalidation size is determined by first zero bit in the address + * starting from Address[12]. + * + * In the AMD IOMMU Linux driver, an invalidation command with address + * ((1 << 63) - 1) is sent when intending to clear the entire cache. + * However, Table 14: Example Page Size Encodings shows that an address of + * ((1ULL << 51) - 1) encodes the entire cache, so effectively any address with + * first zero at bit 51 or larger is a request to invalidate the entire address + * space. + */ +static uint64_t amdvi_decode_invalidation_size(hwaddr addr, uint16_t flags) +{ + uint64_t size = AMDVI_PAGE_SIZE; + uint8_t fzbit = 0; + + if (flags & AMDVI_CMD_INVAL_IOMMU_PAGES_S) { + fzbit = cto64(addr | 0xFFF); + + if (fzbit >= 51) { + size = AMDVI_INV_ALL_PAGES; + } else { + size = 1ULL << (fzbit + 1); + } + } + return size; +} + +/* + * Synchronize the guest page tables with the shadow page tables kept in the + * host for the specified range. + * The invalidation command issued by the guest and intercepted by the VMM + * does not specify a device, but a domain, since all devices in the same domain + * share the same page tables. However, vIOMMU emulation creates separate + * address spaces per device, so it is necessary to traverse the list of all of + * address spaces (i.e. devices) that have notifiers registered in order to + * propagate the changes to the host page tables. + * We cannot return early from this function once a matching domain has been + * identified and its page tables synced (based on the fact that all devices in + * the same domain share the page tables). The reason is that different devices + * (i.e. address spaces) could have different notifiers registered, and by + * skipping address spaces that appear later on the amdvi_as_with_notifiers list + * their notifiers (which could differ from the ones registered for the first + * device/address space) would not be invoked. + */ +static void amdvi_sync_domain(AMDVIState *s, uint16_t domid, uint64_t addr, + uint16_t flags) +{ + AMDVIAddressSpace *as; + + uint64_t size = amdvi_decode_invalidation_size(addr, flags); + + if (size == AMDVI_INV_ALL_PAGES) { + addr = 0; /* Set start address to 0 and invalidate entire AS */ + } else { + addr &= ~(size - 1); + } + + /* + * Call notifiers that have registered for each address space matching the + * domain ID, in order to sync the guest pagetable state with the host. + */ + QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) { + + uint64_t dte[4] = { 0 }; + + /* + * Retrieve the Device Table entry for the devid corresponding to the + * current address space, and verify the DomainID matches i.e. the page + * tables to be synced belong to devices in the domain. + */ + if (amdvi_as_to_dte(as, dte)) { + continue; + } + + /* Only need to sync the Page Tables for a matching domain */ + if (domid != (dte[1] & AMDVI_DEV_DOMID_ID_MASK)) { + continue; + } + + /* + * We have determined that there is a valid Device Table Entry for a + * device matching the DomainID in the INV_IOMMU_PAGES command issued by + * the guest. Walk the guest page table to sync shadow page table. + */ + if (as->notifier_flags & IOMMU_NOTIFIER_MAP) { + /* Sync guest IOMMU mappings with host */ + amdvi_sync_shadow_page_table_range(as, &dte[0], addr, size, true); + } + } +} + /* we don't have devid - we can't remove pages by address */ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) { uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16)); + uint64_t addr = cpu_to_le64(extract64(cmd[1], 12, 52)) << 12; + uint16_t flags = cpu_to_le16((uint16_t)extract64(cmd[1], 0, 3)); if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) || extract64(cmd[1], 3, 9)) { @@ -473,6 +1323,8 @@ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd) g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid, &domid); + + amdvi_sync_domain(s, domid, addr, flags); trace_amdvi_pages_inval(domid); } @@ -508,7 +1360,7 @@ static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd) static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd) { - uint16_t devid = extract64(cmd[0], 0, 16); + uint16_t devid = cpu_to_le16(extract64(cmd[0], 0, 16)); if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) || extract64(cmd[1], 6, 6)) { amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4), @@ -521,7 +1373,7 @@ static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd) &devid); } else { amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12, - cpu_to_le16(extract64(cmd[1], 0, 16))); + devid); } trace_amdvi_iotlb_inval(); } @@ -592,18 +1444,31 @@ static void amdvi_cmdbuf_run(AMDVIState *s) } } -static void amdvi_mmio_trace(hwaddr addr, unsigned size) +static inline uint8_t amdvi_mmio_get_index(hwaddr addr) { uint8_t index = (addr & ~0x2000) / 8; if ((addr & 0x2000)) { /* high table */ index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index; - trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); } else { index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index; - trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07); } + + return index; +} + +static void amdvi_mmio_trace_read(hwaddr addr, unsigned size) +{ + uint8_t index = amdvi_mmio_get_index(addr); + trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07); +} + +static void amdvi_mmio_trace_write(hwaddr addr, unsigned size, uint64_t val) +{ + uint8_t index = amdvi_mmio_get_index(addr); + trace_amdvi_mmio_write(amdvi_mmio_low[index], addr, size, val, + addr & ~0x07); } static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size) @@ -623,7 +1488,7 @@ static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size) } else if (size == 8) { val = amdvi_readq(s, addr); } - amdvi_mmio_trace(addr, size); + amdvi_mmio_trace_read(addr, size); return val; } @@ -633,7 +1498,6 @@ static void amdvi_handle_control_write(AMDVIState *s) unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL); s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN); - s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN); s->evtlog_enabled = s->enabled && !!(control & AMDVI_MMIO_CONTROL_EVENTLOGEN); @@ -665,8 +1529,8 @@ static inline void amdvi_handle_devtab_write(AMDVIState *s) uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE); s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK); - /* set device table length */ - s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1 * + /* set device table length (i.e. number of entries table can hold) */ + s->devtab_len = (((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1) * (AMDVI_MMIO_DEVTAB_SIZE_UNIT / AMDVI_MMIO_DEVTAB_ENTRY_SIZE)); } @@ -704,9 +1568,19 @@ static inline void amdvi_handle_excllim_write(AMDVIState *s) static inline void amdvi_handle_evtbase_write(AMDVIState *s) { uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE); + + if (amdvi_readq(s, AMDVI_MMIO_STATUS) & AMDVI_MMIO_STATUS_EVENT_INT) + /* Do not reset if eventlog interrupt bit is set*/ + return; + s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK; s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE) & AMDVI_MMIO_EVTLOG_SIZE_MASK); + + /* clear tail and head pointer to 0 when event base is updated */ + s->evtlog_tail = s->evtlog_head = 0; + amdvi_writeq_raw(s, AMDVI_MMIO_EVENT_HEAD, s->evtlog_head); + amdvi_writeq_raw(s, AMDVI_MMIO_EVENT_TAIL, s->evtlog_tail); } static inline void amdvi_handle_evttail_write(AMDVIState *s) @@ -770,7 +1644,7 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val, return; } - amdvi_mmio_trace(addr, size); + amdvi_mmio_trace_write(addr, size, val); switch (addr & ~0x07) { case AMDVI_MMIO_CONTROL: amdvi_mmio_reg_write(s, size, val, addr); @@ -835,152 +1709,74 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val, amdvi_mmio_reg_write(s, size, val, addr); amdvi_handle_pprtail_write(s); break; + case AMDVI_MMIO_STATUS: + amdvi_mmio_reg_write(s, size, val, addr); + break; } } -static inline uint64_t amdvi_get_perms(uint64_t entry) -{ - return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >> - AMDVI_DEV_PERM_SHIFT; -} - -/* validate that reserved bits are honoured */ -static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid, - uint64_t *dte) -{ - if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED) - || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED) - || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) { - amdvi_log_illegaldevtab_error(s, devid, - s->devtab + - devid * AMDVI_DEVTAB_ENTRY_SIZE, 0); - return false; - } - - return true; -} - -/* get a device table entry given the devid */ -static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry) +static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, + IOMMUTLBEntry *ret, unsigned perms, + hwaddr addr) { - uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE; + hwaddr page_mask, pagesize = 0; + uint8_t mode; + uint64_t pte; + int fetch_ret; - if (dma_memory_read(&address_space_memory, s->devtab + offset, entry, - AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) { - trace_amdvi_dte_get_fail(s->devtab, offset); - /* log error accessing dte */ - amdvi_log_devtab_error(s, devid, s->devtab + offset, 0); - return false; + /* make sure the DTE has TV = 1 */ + if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) { + /* + * A DTE with V=1, TV=0 does not have a valid Page Table Root Pointer. + * An IOMMU processing a request that requires a table walk terminates + * the walk when it encounters this condition. Do the same and return + * instead of assuming that the address is forwarded without translation + * i.e. the passthrough case, as it is done for the case where DTE[V]=0. + */ + return; } - *entry = le64_to_cpu(*entry); - if (!amdvi_validate_dte(s, devid, entry)) { - trace_amdvi_invalid_dte(entry[0]); - return false; + mode = get_pte_translation_mode(dte[0]); + if (mode >= 7) { + trace_amdvi_mode_invalid(mode, addr); + return; } - - return true; -} - -/* get pte translation mode */ -static inline uint8_t get_pte_translation_mode(uint64_t pte) -{ - return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK; -} - -static inline uint64_t pte_override_page_mask(uint64_t pte) -{ - uint8_t page_mask = 13; - uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12; - /* find the first zero bit */ - while (addr & 1) { - page_mask++; - addr = addr >> 1; + if (mode == 0) { + goto no_remap; } - return ~((1ULL << page_mask) - 1); -} - -static inline uint64_t pte_get_page_mask(uint64_t oldlevel) -{ - return ~((1UL << ((oldlevel * 9) + 3)) - 1); -} + /* Attempt to fetch the PTE to determine if a valid mapping exists */ + fetch_ret = fetch_pte(as, addr, dte[0], &pte, &pagesize); -static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, - uint16_t devid) -{ - uint64_t pte; + /* + * If walking the page table results in an error of any type, returns an + * empty PTE i.e. no mapping, or the permissions do not match, return since + * there is no translation available. + */ + if (fetch_ret < 0 || !IOMMU_PTE_PRESENT(pte) || + perms != (perms & amdvi_get_perms(pte))) { - if (dma_memory_read(&address_space_memory, pte_addr, - &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) { - trace_amdvi_get_pte_hwerror(pte_addr); - amdvi_log_pagetab_error(s, devid, pte_addr, 0); - pte = 0; - return pte; + amdvi_page_fault(as->iommu_state, as->devfn, addr, perms); + trace_amdvi_page_fault(addr); + return; } - pte = le64_to_cpu(pte); - return pte; -} - -static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte, - IOMMUTLBEntry *ret, unsigned perms, - hwaddr addr) -{ - unsigned level, present, pte_perms, oldlevel; - uint64_t pte = dte[0], pte_addr, page_mask; - - /* make sure the DTE has TV = 1 */ - if (pte & AMDVI_DEV_TRANSLATION_VALID) { - level = get_pte_translation_mode(pte); - if (level >= 7) { - trace_amdvi_mode_invalid(level, addr); - return; - } - if (level == 0) { - goto no_remap; - } - - /* we are at the leaf page table or page table encodes a huge page */ - do { - pte_perms = amdvi_get_perms(pte); - present = pte & 1; - if (!present || perms != (perms & pte_perms)) { - amdvi_page_fault(as->iommu_state, as->devfn, addr, perms); - trace_amdvi_page_fault(addr); - return; - } - - /* go to the next lower level */ - pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK; - /* add offset and load pte */ - pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3; - pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn); - if (!pte) { - return; - } - oldlevel = level; - level = get_pte_translation_mode(pte); - } while (level > 0 && level < 7); + /* A valid PTE and page size has been retrieved */ + assert(pagesize); + page_mask = ~(pagesize - 1); - if (level == 0x7) { - page_mask = pte_override_page_mask(pte); - } else { - page_mask = pte_get_page_mask(oldlevel); - } + /* get access permissions from pte */ + ret->iova = addr & page_mask; + ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask; + ret->addr_mask = ~page_mask; + ret->perm = amdvi_get_perms(pte); + return; - /* get access permissions from pte */ - ret->iova = addr & page_mask; - ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask; - ret->addr_mask = ~page_mask; - ret->perm = amdvi_get_perms(pte); - return; - } no_remap: ret->iova = addr & AMDVI_PAGE_MASK_4K; ret->translated_addr = addr & AMDVI_PAGE_MASK_4K; ret->addr_mask = ~AMDVI_PAGE_MASK_4K; - ret->perm = amdvi_get_perms(pte); + ret->perm = amdvi_get_perms(dte[0]); } static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, @@ -990,6 +1786,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); uint64_t entry[4]; + int dte_ret; if (iotlb_entry) { trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid), @@ -1001,13 +1798,14 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, return; } - if (!amdvi_get_dte(s, devid, entry)) { - return; - } + dte_ret = amdvi_as_to_dte(as, entry); - /* devices with V = 0 are not translated */ - if (!(entry[0] & AMDVI_DEV_VALID)) { - goto out; + if (dte_ret < 0) { + if (dte_ret == -AMDVI_FR_DTE_V) { + /* DTE[V]=0, address is passed untranslated */ + goto out; + } + return; } amdvi_page_walk(as, entry, ret, @@ -1426,7 +2224,6 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) AMDVIState *s = opaque; AMDVIAddressSpace **iommu_as, *amdvi_dev_as; int bus_num = pci_bus_num(bus); - X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); iommu_as = s->address_spaces[bus_num]; @@ -1444,6 +2241,9 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) iommu_as[devfn]->bus_num = (uint8_t)bus_num; iommu_as[devfn]->devfn = (uint8_t)devfn; iommu_as[devfn]->iommu_state = s; + iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE; + iommu_as[devfn]->iova_tree = iova_tree_new(); + iommu_as[devfn]->addr_translation = false; amdvi_dev_as = iommu_as[devfn]; @@ -1486,15 +2286,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) AMDVI_INT_ADDR_FIRST, &amdvi_dev_as->iommu_ir, 1); - if (!x86_iommu->pt_supported) { - memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false); - memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), - true); - } else { - memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu), - false); - memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, true); - } + amdvi_switch_address_space(amdvi_dev_as); } return &iommu_as[devfn]->as; } @@ -1524,14 +2316,35 @@ static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, Error **errp) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); + AMDVIState *s = as->iommu_state; + + /* + * Accurate synchronization of the vIOMMU page tables required to support + * MAP notifiers is provided by the dma-remap feature. In addition, this + * also requires that the vIOMMU presents the NpCache capability, so a guest + * driver issues invalidations for both map() and unmap() operations. The + * capability is already set by default as part of AMDVI_CAPAB_FEATURES and + * written to the configuration in amdvi_pci_realize(). + */ + if (!s->dma_remap && (new & IOMMU_NOTIFIER_MAP)) { + error_setg_errno(errp, ENOTSUP, + "device %02x.%02x.%x requires dma-remap=1", + as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn)); + return -ENOTSUP; + } + + /* + * Update notifier flags for address space and the list of address spaces + * with registered notifiers. + */ + as->notifier_flags = new; - if (new & IOMMU_NOTIFIER_MAP) { - error_setg(errp, - "device %02x.%02x.%x requires iommu notifier which is not " - "currently supported", as->bus_num, PCI_SLOT(as->devfn), - PCI_FUNC(as->devfn)); - return -EINVAL; + if (old == IOMMU_NOTIFIER_NONE) { + QLIST_INSERT_HEAD(&s->amdvi_as_with_notifiers, as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(as, next); } + return 0; } @@ -1549,7 +2362,6 @@ static void amdvi_init(AMDVIState *s) s->excl_allow = false; s->mmio_enabled = false; s->enabled = false; - s->ats_enabled = false; s->cmdbuf_enabled = false; /* reset MMIO */ @@ -1609,6 +2421,10 @@ static void amdvi_sysbus_reset(DeviceState *dev) msi_reset(&s->pci->dev); amdvi_init(s); + + /* Discard all mappings on device reset */ + amdvi_address_space_unmap_all(s); + amdvi_reset_address_translation_all(s); } static const VMStateDescription vmstate_amdvi_sysbus_migratable = { @@ -1620,7 +2436,8 @@ static const VMStateDescription vmstate_amdvi_sysbus_migratable = { /* Updated in amdvi_handle_control_write() */ VMSTATE_BOOL(enabled, AMDVIState), VMSTATE_BOOL(ga_enabled, AMDVIState), - VMSTATE_BOOL(ats_enabled, AMDVIState), + /* bool ats_enabled is obsolete */ + VMSTATE_UNUSED(1), /* was ats_enabled */ VMSTATE_BOOL(cmdbuf_enabled, AMDVIState), VMSTATE_BOOL(completion_wait_intr, AMDVIState), VMSTATE_BOOL(evtlog_enabled, AMDVIState), @@ -1693,9 +2510,6 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, amdvi_uint64_equal, g_free, g_free); - /* Pseudo address space under root PCI bus. */ - x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); - /* set up MMIO */ memory_region_init_io(&s->mr_mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio", AMDVI_MMIO_SIZE); @@ -1718,11 +2532,22 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST, &s->mr_ir, 1); + /* Pseudo address space under root PCI bus. */ + x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); + if (kvm_enabled() && x86ms->apic_id_limit > 255 && !s->xtsup) { error_report("AMD IOMMU with x2APIC configuration requires xtsup=on"); exit(EXIT_FAILURE); } + if (s->xtsup) { + if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) { + error_report("AMD IOMMU xtsup=on requires x2APIC support on " + "the KVM side"); + exit(EXIT_FAILURE); + } + } + pci_setup_iommu(bus, &amdvi_iommu_ops, s); amdvi_init(s); } @@ -1730,6 +2555,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) static const Property amdvi_properties[] = { DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false), DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id), + DEFINE_PROP_BOOL("dma-remap", AMDVIState, dma_remap, false), }; static const VMStateDescription vmstate_amdvi_sysbus = { @@ -1791,6 +2617,7 @@ static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, imrc->translate = amdvi_translate; imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed; + imrc->replay = amdvi_iommu_replay; } static const TypeInfo amdvi_iommu_memory_region_info = { diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 5672bde..daf82fc 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -25,6 +25,8 @@ #include "hw/i386/x86-iommu.h" #include "qom/object.h" +#define GENMASK64(h, l) (((~0ULL) >> (63 - (h) + (l))) << (l)) + /* Capability registers */ #define AMDVI_CAPAB_BAR_LOW 0x04 #define AMDVI_CAPAB_BAR_HIGH 0x08 @@ -66,34 +68,34 @@ #define AMDVI_MMIO_SIZE 0x4000 -#define AMDVI_MMIO_DEVTAB_SIZE_MASK ((1ULL << 12) - 1) -#define AMDVI_MMIO_DEVTAB_BASE_MASK (((1ULL << 52) - 1) & ~ \ - AMDVI_MMIO_DEVTAB_SIZE_MASK) +#define AMDVI_MMIO_DEVTAB_SIZE_MASK GENMASK64(8, 0) +#define AMDVI_MMIO_DEVTAB_BASE_MASK GENMASK64(51, 12) + #define AMDVI_MMIO_DEVTAB_ENTRY_SIZE 32 #define AMDVI_MMIO_DEVTAB_SIZE_UNIT 4096 /* some of this are similar but just for readability */ #define AMDVI_MMIO_CMDBUF_SIZE_BYTE (AMDVI_MMIO_COMMAND_BASE + 7) #define AMDVI_MMIO_CMDBUF_SIZE_MASK 0x0f -#define AMDVI_MMIO_CMDBUF_BASE_MASK AMDVI_MMIO_DEVTAB_BASE_MASK -#define AMDVI_MMIO_CMDBUF_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) -#define AMDVI_MMIO_CMDBUF_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_CMDBUF_BASE_MASK GENMASK64(51, 12) +#define AMDVI_MMIO_CMDBUF_HEAD_MASK GENMASK64(18, 4) +#define AMDVI_MMIO_CMDBUF_TAIL_MASK GENMASK64(18, 4) #define AMDVI_MMIO_EVTLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) -#define AMDVI_MMIO_EVTLOG_SIZE_MASK AMDVI_MMIO_CMDBUF_SIZE_MASK -#define AMDVI_MMIO_EVTLOG_BASE_MASK AMDVI_MMIO_CMDBUF_BASE_MASK -#define AMDVI_MMIO_EVTLOG_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f) -#define AMDVI_MMIO_EVTLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK +#define AMDVI_MMIO_EVTLOG_SIZE_MASK 0x0f +#define AMDVI_MMIO_EVTLOG_BASE_MASK GENMASK64(51, 12) +#define AMDVI_MMIO_EVTLOG_HEAD_MASK GENMASK64(18, 4) +#define AMDVI_MMIO_EVTLOG_TAIL_MASK GENMASK64(18, 4) -#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7) -#define AMDVI_MMIO_PPRLOG_HEAD_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK -#define AMDVI_MMIO_PPRLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK -#define AMDVI_MMIO_PPRLOG_BASE_MASK AMDVI_MMIO_EVTLOG_BASE_MASK -#define AMDVI_MMIO_PPRLOG_SIZE_MASK AMDVI_MMIO_EVTLOG_SIZE_MASK +#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_PPR_BASE + 7) +#define AMDVI_MMIO_PPRLOG_SIZE_MASK 0x0f +#define AMDVI_MMIO_PPRLOG_BASE_MASK GENMASK64(51, 12) +#define AMDVI_MMIO_PPRLOG_HEAD_MASK GENMASK64(18, 4) +#define AMDVI_MMIO_PPRLOG_TAIL_MASK GENMASK64(18, 4) #define AMDVI_MMIO_EXCL_ENABLED_MASK (1ULL << 0) #define AMDVI_MMIO_EXCL_ALLOW_MASK (1ULL << 1) -#define AMDVI_MMIO_EXCL_LIMIT_MASK AMDVI_MMIO_DEVTAB_BASE_MASK +#define AMDVI_MMIO_EXCL_LIMIT_MASK GENMASK64(51, 12) #define AMDVI_MMIO_EXCL_LIMIT_LOW 0xfff /* mmio control register flags */ @@ -109,6 +111,7 @@ #define AMDVI_MMIO_STATUS_CMDBUF_RUN (1 << 4) #define AMDVI_MMIO_STATUS_EVT_RUN (1 << 3) #define AMDVI_MMIO_STATUS_COMP_INT (1 << 2) +#define AMDVI_MMIO_STATUS_EVENT_INT (1 << 1) #define AMDVI_MMIO_STATUS_EVT_OVF (1 << 0) #define AMDVI_CMDBUF_ID_BYTE 0x07 @@ -123,6 +126,10 @@ #define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07 #define AMDVI_CMD_INVAL_AMDVI_ALL 0x08 + +#define AMDVI_CMD_INVAL_IOMMU_PAGES_S (1ULL << 0) +#define AMDVI_INV_ALL_PAGES (1ULL << 52) + #define AMDVI_DEVTAB_ENTRY_SIZE 32 /* Device table entry bits 0:63 */ @@ -130,14 +137,14 @@ #define AMDVI_DEV_TRANSLATION_VALID (1ULL << 1) #define AMDVI_DEV_MODE_MASK 0x7 #define AMDVI_DEV_MODE_RSHIFT 9 -#define AMDVI_DEV_PT_ROOT_MASK 0xffffffffff000 +#define AMDVI_DEV_PT_ROOT_MASK GENMASK64(51, 12) #define AMDVI_DEV_PT_ROOT_RSHIFT 12 #define AMDVI_DEV_PERM_SHIFT 61 #define AMDVI_DEV_PERM_READ (1ULL << 61) #define AMDVI_DEV_PERM_WRITE (1ULL << 62) /* Device table entry bits 64:127 */ -#define AMDVI_DEV_DOMID_ID_MASK ((1ULL << 16) - 1) +#define AMDVI_DEV_DOMID_ID_MASK GENMASK64(15, 0) /* Event codes and flags, as stored in the info field */ #define AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY (0x1U << 12) @@ -162,13 +169,55 @@ #define AMDVI_FEATURE_PC (1ULL << 9) /* Perf counters */ /* reserved DTE bits */ -#define AMDVI_DTE_LOWER_QUAD_RESERVED 0x80300000000000fc -#define AMDVI_DTE_MIDDLE_QUAD_RESERVED 0x0000000000000100 -#define AMDVI_DTE_UPPER_QUAD_RESERVED 0x08f0000000000000 +#define AMDVI_DTE_QUAD0_RESERVED (GENMASK64(6, 2) | GENMASK64(63, 63)) +#define AMDVI_DTE_QUAD1_RESERVED 0 +#define AMDVI_DTE_QUAD2_RESERVED GENMASK64(53, 52) +#define AMDVI_DTE_QUAD3_RESERVED (GENMASK64(14, 0) | GENMASK64(53, 48)) /* AMDVI paging mode */ #define AMDVI_GATS_MODE (2ULL << 12) #define AMDVI_HATS_MODE (2ULL << 10) +#define AMDVI_HATS_MODE_RESERVED (3ULL << 10) + +/* Page Table format */ + +#define AMDVI_PTE_PR (1ULL << 0) +#define AMDVI_PTE_NEXT_LEVEL_MASK GENMASK64(11, 9) + +#define IOMMU_PTE_PRESENT(pte) ((pte) & AMDVI_PTE_PR) + +/* Using level=0 for leaf PTE at 4K page size */ +#define PT_LEVEL_SHIFT(level) (12 + ((level) * 9)) + +/* Return IOVA bit group used to index the Page Table at specific level */ +#define PT_LEVEL_INDEX(level, iova) (((iova) >> PT_LEVEL_SHIFT(level)) & \ + GENMASK64(8, 0)) + +/* Return the max address for a specified level i.e. max_oaddr */ +#define PT_LEVEL_MAX_ADDR(x) (((x) < 5) ? \ + ((1ULL << PT_LEVEL_SHIFT((x + 1))) - 1) : \ + (~(0ULL))) + +/* Extract the NextLevel field from PTE/PDE */ +#define PTE_NEXT_LEVEL(pte) (((pte) & AMDVI_PTE_NEXT_LEVEL_MASK) >> 9) + +/* Take page table level and return default pagetable size for level */ +#define PTE_LEVEL_PAGE_SIZE(level) (1ULL << (PT_LEVEL_SHIFT(level))) + +/* + * Return address of lower level page table encoded in PTE and specified by + * current level and corresponding IOVA bit group at such level. + */ +#define NEXT_PTE_ADDR(pte, level, iova) (((pte) & AMDVI_DEV_PT_ROOT_MASK) + \ + (PT_LEVEL_INDEX(level, iova) * 8)) + +/* + * Take a PTE value with mode=0x07 and return the page size it encodes. + */ +#define PTE_LARGE_PAGE_SIZE(pte) (1ULL << (1 + cto64(((pte) | 0xfffULL)))) + +/* Return number of PTEs to use for a given page size (expected power of 2) */ +#define PAGE_SIZE_PTE_COUNT(pgsz) (1ULL << ((ctz64(pgsz) - 12) % 9)) /* IOTLB */ #define AMDVI_IOTLB_MAX_SIZE 1024 @@ -194,20 +243,16 @@ #define AMDVI_PAGE_SIZE (1ULL << AMDVI_PAGE_SHIFT) #define AMDVI_PAGE_SHIFT_4K 12 -#define AMDVI_PAGE_MASK_4K (~((1ULL << AMDVI_PAGE_SHIFT_4K) - 1)) +#define AMDVI_PAGE_MASK_4K GENMASK64(63, 12) -#define AMDVI_MAX_VA_ADDR (48UL << 5) -#define AMDVI_MAX_PH_ADDR (40UL << 8) -#define AMDVI_MAX_GVA_ADDR (48UL << 15) +#define AMDVI_MAX_GVA_ADDR (2UL << 5) +#define AMDVI_MAX_PH_ADDR (40UL << 8) +#define AMDVI_MAX_VA_ADDR (48UL << 15) /* Completion Wait data size */ #define AMDVI_COMPLETION_DATA_SIZE 8 #define AMDVI_COMMAND_SIZE 16 -/* Completion Wait data size */ -#define AMDVI_COMPLETION_DATA_SIZE 8 - -#define AMDVI_COMMAND_SIZE 16 #define AMDVI_INT_ADDR_FIRST 0xfee00000 #define AMDVI_INT_ADDR_LAST 0xfeefffff @@ -228,7 +273,7 @@ #define AMDVI_IR_INTCTL_PASS 1 #define AMDVI_IR_INTCTL_REMAP 2 -#define AMDVI_IR_PHYS_ADDR_MASK (((1ULL << 45) - 1) << 6) +#define AMDVI_IR_PHYS_ADDR_MASK GENMASK64(51, 6) /* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */ #define AMDVI_IRTE_OFFSET 0x7ff @@ -323,7 +368,6 @@ struct AMDVIState { uint64_t mmio_addr; bool enabled; /* IOMMU enabled */ - bool ats_enabled; /* address translation enabled */ bool cmdbuf_enabled; /* command buffer enabled */ bool evtlog_enabled; /* event log enabled */ bool excl_enabled; @@ -366,12 +410,18 @@ struct AMDVIState { /* for each served device */ AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + /* list of address spaces with registered notifiers */ + QLIST_HEAD(, AMDVIAddressSpace) amdvi_as_with_notifiers; + /* IOTLB */ GHashTable *iotlb; /* Interrupt remapping */ bool ga_enabled; bool xtsup; + + /* DMA address translation */ + bool dma_remap; }; uint64_t amdvi_extended_feature_register(AMDVIState *s); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 69d72ad..6a168d5 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -45,6 +45,8 @@ ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK) #define VTD_CE_GET_PASID_DIR_TABLE(ce) \ ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK) +#define VTD_CE_GET_PRE(ce) \ + ((ce)->val[0] & VTD_SM_CONTEXT_ENTRY_PRE) /* pe operations */ #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) @@ -85,13 +87,6 @@ struct vtd_iotlb_key { static void vtd_address_space_refresh_all(IntelIOMMUState *s); static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); -static void vtd_panic_require_caching_mode(void) -{ - error_report("We need to set caching-mode=on for intel-iommu to enable " - "device assignment with IOMMU protection."); - exit(1); -} - static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, uint64_t wmask, uint64_t w1cmask) { @@ -1838,6 +1833,7 @@ static const bool vtd_qualified_faults[] = { [VTD_FR_FS_NON_CANONICAL] = true, [VTD_FR_FS_PAGING_ENTRY_US] = true, [VTD_FR_SM_WRITE] = true, + [VTD_FR_SM_PRE_ABS] = true, [VTD_FR_SM_INTERRUPT_ADDR] = true, [VTD_FR_FS_BIT_UPDATE_FAILED] = true, [VTD_FR_MAX] = false, @@ -1987,9 +1983,9 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce, uint32_t pasid) { dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid); - uint32_t level = vtd_get_iova_level(s, ce, pasid); uint32_t offset; uint64_t flpte, flag_ad = VTD_FL_A; + *flpte_level = vtd_get_iova_level(s, ce, pasid); if (!vtd_iova_fl_check_canonical(s, iova, ce, pasid)) { error_report_once("%s: detected non canonical IOVA (iova=0x%" PRIx64 "," @@ -1998,11 +1994,11 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce, } while (true) { - offset = vtd_iova_level_offset(iova, level); + offset = vtd_iova_level_offset(iova, *flpte_level); flpte = vtd_get_pte(addr, offset); if (flpte == (uint64_t)-1) { - if (level == vtd_get_iova_level(s, ce, pasid)) { + if (*flpte_level == vtd_get_iova_level(s, ce, pasid)) { /* Invalid programming of pasid-entry */ return -VTD_FR_PASID_ENTRY_FSPTPTR_INV; } else { @@ -2028,15 +2024,15 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce, if (is_write && !(flpte & VTD_FL_RW)) { return -VTD_FR_SM_WRITE; } - if (vtd_flpte_nonzero_rsvd(flpte, level)) { + if (vtd_flpte_nonzero_rsvd(flpte, *flpte_level)) { error_report_once("%s: detected flpte reserved non-zero " "iova=0x%" PRIx64 ", level=0x%" PRIx32 "flpte=0x%" PRIx64 ", pasid=0x%" PRIX32 ")", - __func__, iova, level, flpte, pasid); + __func__, iova, *flpte_level, flpte, pasid); return -VTD_FR_FS_PAGING_ENTRY_RSVD; } - if (vtd_is_last_pte(flpte, level) && is_write) { + if (vtd_is_last_pte(flpte, *flpte_level) && is_write) { flag_ad |= VTD_FL_D; } @@ -2044,14 +2040,13 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce, return -VTD_FR_FS_BIT_UPDATE_FAILED; } - if (vtd_is_last_pte(flpte, level)) { + if (vtd_is_last_pte(flpte, *flpte_level)) { *flptep = flpte; - *flpte_level = level; return 0; } addr = vtd_get_pte_addr(flpte, aw_bits); - level--; + (*flpte_level)--; } } @@ -2092,7 +2087,8 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, uint8_t bus_num = pci_bus_num(bus); VTDContextCacheEntry *cc_entry; uint64_t pte, page_mask; - uint32_t level, pasid = vtd_as->pasid; + uint32_t level = UINT32_MAX; + uint32_t pasid = vtd_as->pasid; uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn); int ret_fr; bool is_fpd_set = false; @@ -2251,14 +2247,19 @@ out: entry->iova = addr & page_mask; entry->translated_addr = vtd_get_pte_addr(pte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; - entry->perm = access_flags; + entry->perm = (is_write ? access_flags : (access_flags & (~IOMMU_WO))); return true; error: vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; - entry->addr_mask = 0; + /* + * Set the mask for ATS (the range must be present even when the + * translation fails : PCIe rev 5 10.2.3.5) + */ + entry->addr_mask = (level != UINT32_MAX) ? + (~vtd_pt_level_page_mask(level)) : (~VTD_PAGE_MASK_4K); entry->perm = IOMMU_NONE; return false; } @@ -2503,6 +2504,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .translated_addr = 0, .addr_mask = size - 1, .perm = IOMMU_NONE, + .pasid = vtd_as->pasid, }, }; memory_region_notify_iommu(&vtd_as->iommu, 0, event); @@ -2695,7 +2697,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s) uint32_t changed = status ^ val; trace_vtd_reg_write_gcmd(status, val); - if ((changed & VTD_GCMD_TE) && s->dma_translation) { + if ((changed & VTD_GCMD_TE) && x86_iommu->dma_translation) { /* Translation enable/disable */ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); } @@ -2822,6 +2824,7 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { uint64_t mask[4] = {VTD_INV_DESC_WAIT_RSVD_LO, VTD_INV_DESC_WAIT_RSVD_HI, VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; + bool ret = true; if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, false, __func__, "wait")) { @@ -2833,8 +2836,6 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) uint32_t status_data = (uint32_t)(inv_desc->lo >> VTD_INV_DESC_WAIT_DATA_SHIFT); - assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF)); - /* FIXME: need to be masked with HAW? */ dma_addr_t status_addr = inv_desc->hi; trace_vtd_inv_desc_wait_sw(status_addr, status_data); @@ -2843,18 +2844,28 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) &status_data, sizeof(status_data), MEMTXATTRS_UNSPECIFIED)) { trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); - return false; + ret = false; } - } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { + } + + if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) { /* Interrupt flag */ vtd_generate_completion_event(s); - } else { + } + + /* + * SW=0, IF=0, FN=1 is also a valid descriptor (VT-d 7.10) + * Nothing to do as we process the descriptors in order + */ + + if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW | + VTD_INV_DESC_WAIT_FN))) { error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64 " (unknown type)", __func__, inv_desc->hi, inv_desc->lo); return false; } - return true; + return ret; } static bool vtd_process_context_cache_desc(IntelIOMMUState *s, @@ -3090,6 +3101,7 @@ static void do_invalidate_device_tlb(VTDAddressSpace *vtd_dev_as, event.entry.iova = addr; event.entry.perm = IOMMU_NONE; event.entry.translated_addr = 0; + event.entry.pasid = vtd_dev_as->pasid; memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event); } @@ -3136,6 +3148,59 @@ static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s, return true; } +static bool vtd_process_page_group_response_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + VTDAddressSpace *vtd_dev_as; + bool pasid_present; + uint8_t response_code; + uint16_t rid; + uint32_t pasid; + uint16_t prgi; + IOMMUPRIResponse response; + + if ((inv_desc->lo & VTD_INV_DESC_PGRESP_RSVD_LO) || + (inv_desc->hi & VTD_INV_DESC_PGRESP_RSVD_HI)) { + error_report_once("%s: invalid page group response desc: hi=%"PRIx64 + ", lo=%"PRIx64" (reserved nonzero)", __func__, + inv_desc->hi, inv_desc->lo); + return false; + } + + pasid_present = VTD_INV_DESC_PGRESP_PP(inv_desc->lo); + response_code = VTD_INV_DESC_PGRESP_RC(inv_desc->lo); + rid = VTD_INV_DESC_PGRESP_RID(inv_desc->lo); + pasid = VTD_INV_DESC_PGRESP_PASID(inv_desc->lo); + prgi = VTD_INV_DESC_PGRESP_PRGI(inv_desc->hi); + + if (!pasid_present) { + error_report_once("Page group response without PASID is" + "not supported yet"); + return false; + } + + vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, rid, pasid); + if (!vtd_dev_as) { + return true; + } + + response.prgi = prgi; + + if (response_code == 0x0u) { + response.response_code = IOMMU_PRI_RESP_SUCCESS; + } else if (response_code == 0x1u) { + response.response_code = IOMMU_PRI_RESP_INVALID_REQUEST; + } else { + response.response_code = IOMMU_PRI_RESP_FAILURE; + } + + if (vtd_dev_as->pri_notifier) { + vtd_dev_as->pri_notifier->notify(vtd_dev_as->pri_notifier, &response); + } + + return true; +} + static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { @@ -3236,6 +3301,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; + case VTD_INV_DESC_PGRESP: + trace_vtd_inv_desc("page group response", inv_desc.hi, inv_desc.lo); + if (!vtd_process_page_group_response_desc(s, &inv_desc)) { + return false; + } + break; + /* * TODO: the entity of below two cases will be implemented in future series. * To make guest (which integrates scalable mode support patch set in @@ -3370,6 +3442,27 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s) } } +static void vtd_handle_prs_write(IntelIOMMUState *s) +{ + uint32_t prs = vtd_get_long_raw(s, DMAR_PRS_REG); + if (!(prs & VTD_PR_STATUS_PPR) && !(prs & VTD_PR_STATUS_PRO)) { + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + } +} + +static void vtd_handle_pectl_write(IntelIOMMUState *s) +{ + uint32_t pectl = vtd_get_long_raw(s, DMAR_PECTL_REG); + if ((pectl & VTD_PR_PECTL_IP) && !(pectl & VTD_PR_PECTL_IM)) { + /* + * If IP field was 1 when software clears the IM field, + * the interrupt is generated along with clearing the IP field. + */ + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG); + } +} + static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) { IntelIOMMUState *s = opaque; @@ -3412,6 +3505,11 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size) val = s->iq >> 32; break; + case DMAR_PEUADDR_REG: + assert(size == 4); + val = vtd_get_long_raw(s, DMAR_PEUADDR_REG); + break; + default: if (size == 4) { val = vtd_get_long(s, addr); @@ -3475,6 +3573,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_handle_iotlb_write(s); break; + case DMAR_PEUADDR_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + break; + /* Invalidate Address Register, 64-bit */ case DMAR_IVA_REG: if (size == 4) { @@ -3655,6 +3758,18 @@ static void vtd_mem_write(void *opaque, hwaddr addr, vtd_set_long(s, addr, val); break; + case DMAR_PRS_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_prs_write(s); + break; + + case DMAR_PECTL_REG: + assert(size == 4); + vtd_set_long(s, addr, val); + vtd_handle_pectl_write(s); + break; + default: if (size == 4) { vtd_set_long(s, addr, val); @@ -3672,6 +3787,7 @@ static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr, IOMMUTLBEntry iotlb = { /* We'll fill in the rest later. */ .target_as = &address_space_memory, + .pasid = vtd_as->pasid, }; bool success; @@ -3824,7 +3940,6 @@ static const Property vtd_properties[] = { DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), - DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true), DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false), DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true), }; @@ -4367,6 +4482,12 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn, assert(hiod); + if (!s->caching_mode) { + error_setg(errp, "Device assignment is not allowed without enabling " + "caching-mode=on for Intel IOMMU."); + return false; + } + vtd_iommu_lock(s); if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) { @@ -4538,11 +4659,11 @@ static void vtd_cap_init(IntelIOMMUState *s) s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | - VTD_CAP_MGAW(s->aw_bits); + VTD_CAP_ESRTPS | VTD_CAP_MGAW(s->aw_bits); if (s->dma_drain) { s->cap |= VTD_CAP_DRAIN; } - if (s->dma_translation) { + if (x86_iommu->dma_translation) { if (s->aw_bits >= VTD_HOST_AW_39BIT) { s->cap |= VTD_CAP_SAGAW_39bit; } @@ -4587,7 +4708,7 @@ static void vtd_cap_init(IntelIOMMUState *s) } if (s->pasid) { - s->ecap |= VTD_ECAP_PASID; + s->ecap |= VTD_ECAP_PASID | VTD_ECAP_PSS; } } @@ -4705,6 +4826,18 @@ static void vtd_init(IntelIOMMUState *s) * Interrupt remapping registers. */ vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0); + + /* Page request registers */ + if (s->ecap & VTD_ECAP_PRS) { + vtd_define_quad(s, DMAR_PQH_REG, 0, 0x7ffe0ULL, 0); + vtd_define_quad(s, DMAR_PQT_REG, 0, 0x7ffe0ULL, 0); + vtd_define_quad(s, DMAR_PQA_REG, 0, 0xfffffffffffff007ULL, 0); + vtd_define_long(s, DMAR_PRS_REG, 0, 0, 0x3UL); + vtd_define_long(s, DMAR_PECTL_REG, 0, 0x80000000UL, 0); + vtd_define_long(s, DMAR_PEDATA_REG, 0, 0xffffUL, 0); + vtd_define_long(s, DMAR_PEADDR_REG, 0, 0xfffffffcUL, 0); + vtd_define_long(s, DMAR_PEUADDR_REG, 0, 0xffffffffUL, 0); + } } /* Should not reset address_spaces when reset because devices will still use @@ -4730,10 +4863,329 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &vtd_as->as; } +static IOMMUTLBEntry vtd_iommu_ats_do_translate(IOMMUMemoryRegion *iommu, + hwaddr addr, + IOMMUAccessFlags flags) +{ + IOMMUTLBEntry entry; + VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + + if (vtd_is_interrupt_addr(addr)) { + vtd_report_ir_illegal_access(vtd_as, addr, flags & IOMMU_WO); + entry.target_as = &address_space_memory; + entry.iova = 0; + entry.translated_addr = 0; + entry.addr_mask = ~VTD_PAGE_MASK_4K; + entry.perm = IOMMU_NONE; + entry.pasid = PCI_NO_PASID; + } else { + entry = vtd_iommu_translate(iommu, addr, flags, 0); + } + + return entry; +} + +static ssize_t vtd_ats_request_translation(PCIBus *bus, void *opaque, + int devfn, uint32_t pasid, + bool priv_req, bool exec_req, + hwaddr addr, size_t length, + bool no_write, IOMMUTLBEntry *result, + size_t result_length, + uint32_t *err_count) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + IOMMUAccessFlags flags = IOMMU_ACCESS_FLAG_FULL(true, !no_write, exec_req, + priv_req, false, false); + ssize_t res_index = 0; + hwaddr target_address = addr + length; + IOMMUTLBEntry entry; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + *err_count = 0; + + while ((addr < target_address) && (res_index < result_length)) { + entry = vtd_iommu_ats_do_translate(&vtd_as->iommu, addr, flags); + entry.perm &= ~IOMMU_GLOBAL; /* Spec 4.1.2: Global Mapping never set */ + + if ((entry.perm & flags) != flags) { + *err_count += 1; /* Less than expected */ + } + + result[res_index] = entry; + res_index += 1; + addr = (addr & (~entry.addr_mask)) + (entry.addr_mask + 1); + } + + /* Buffer too small */ + if (addr < target_address) { + return -ENOMEM; + } + + return res_index; +} + +/* 11.4.11.3 : The number of entries in the page request queue is 2^(PQS + 7) */ +static inline uint64_t vtd_prq_size(IntelIOMMUState *s) +{ + return 1ULL << ((vtd_get_quad(s, DMAR_PQA_REG) & VTD_PQA_SIZE) + 7); +} + +/** + * Return true if the bit is accessible and correctly set, false otherwise + */ +static bool vtd_check_pre_bit(VTDAddressSpace *vtd_as, hwaddr addr, + uint16_t sid, bool is_write) +{ + int ret; + IntelIOMMUState *s = vtd_as->iommu_state; + uint8_t bus_n = pci_bus_num(vtd_as->bus); + VTDContextEntry ce; + bool is_fpd_set = false; + + ret = vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce); + + if (ret) { + goto error_report; + } + + if (!VTD_CE_GET_PRE(&ce)) { + ret = -VTD_FR_SM_PRE_ABS; + goto error_get_fpd_and_report; + } + + return true; + +error_get_fpd_and_report: + /* Try to get fpd (may not work but we are already on an error path) */ + is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; + vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid); +error_report: + vtd_report_fault(s, -ret, is_fpd_set, sid, addr, is_write, + vtd_as->pasid != PCI_NO_PASID, vtd_as->pasid); + return false; +} + +/* Logic described in section 7.5 */ +static void vtd_generate_page_request_event(IntelIOMMUState *s, + uint32_t old_pr_status) +{ + uint32_t current_pectl = vtd_get_long(s, DMAR_PECTL_REG); + /* + * Hardware evaluates PPR and PRO fields in the Page Request Status Register + * and if any of them is set, Page Request Event is not generated + */ + if (old_pr_status & (VTD_PR_STATUS_PRO | VTD_PR_STATUS_PPR)) { + return; + } + + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, 0, VTD_PR_PECTL_IP); + if (!(current_pectl & VTD_PR_PECTL_IM)) { + vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0); + vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG); + } +} + +/* When calling this function, we known that we are in scalable mode */ +static int vtd_pri_perform_implicit_invalidation(VTDAddressSpace *vtd_as, + hwaddr addr) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + VTDContextEntry ce; + VTDPASIDEntry pe; + uint16_t pgtt; + uint16_t domain_id; + int ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce); + if (ret) { + return -EINVAL; + } + ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, vtd_as->pasid); + if (ret) { + return -EINVAL; + } + pgtt = VTD_PE_GET_TYPE(&pe); + domain_id = VTD_SM_PASID_ENTRY_DID(pe.val[1]); + ret = 0; + switch (pgtt) { + case VTD_SM_PASID_ENTRY_FLT: + vtd_piotlb_page_invalidate(s, domain_id, vtd_as->pasid, addr, 0); + break; + /* Room for other pgtt values */ + default: + error_report_once("Translation type not supported yet : %d", pgtt); + ret = -EINVAL; + break; + } + + return ret; +} + +/* Page Request Descriptor : 7.4.1.1 */ +static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn, + uint32_t pasid, bool priv_req, bool exec_req, + hwaddr addr, bool lpig, uint16_t prgi, + bool is_read, bool is_write) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + + uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG); + uint64_t queue_tail_offset_reg = vtd_get_quad(s, DMAR_PQT_REG); + uint64_t new_queue_tail_offset = ( + (queue_tail_offset_reg + VTD_PQA_ENTRY_SIZE) % + (vtd_prq_size(s) * VTD_PQA_ENTRY_SIZE)); + uint64_t queue_head_offset_reg = vtd_get_quad(s, DMAR_PQH_REG); + hwaddr queue_tail = (queue_addr_reg & VTD_PQA_ADDR) + queue_tail_offset_reg; + uint32_t old_pr_status = vtd_get_long(s, DMAR_PRS_REG); + uint16_t sid = PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn); + VTDPRDesc desc; + + if (!(s->ecap & VTD_ECAP_PRS)) { + return -EPERM; + } + + /* + * No need to check if scalable mode is enabled as we already known that + * VTD_ECAP_PRS is set (see vtd_decide_config) + */ + + /* We do not support PRI without PASID */ + if (vtd_as->pasid == PCI_NO_PASID) { + return -EPERM; + } + if (exec_req && !is_read) { + return -EINVAL; + } + + /* Check PRE bit in the scalable mode context entry */ + if (!vtd_check_pre_bit(vtd_as, addr, sid, is_write)) { + return -EPERM; + } + + if (old_pr_status & VTD_PR_STATUS_PRO) { + /* + * No action is taken by hardware to report a fault + * or generate an event + */ + return -ENOSPC; + } + + /* Check for overflow */ + if (new_queue_tail_offset == queue_head_offset_reg) { + vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PRO); + vtd_generate_page_request_event(s, old_pr_status); + return -ENOSPC; + } + + if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) { + return -EINVAL; + } + + desc.lo = VTD_PRD_TYPE | VTD_PRD_PP(true) | VTD_PRD_RID(sid) | + VTD_PRD_PASID(vtd_as->pasid) | VTD_PRD_PMR(priv_req); + desc.hi = VTD_PRD_RDR(is_read) | VTD_PRD_WRR(is_write) | + VTD_PRD_LPIG(lpig) | VTD_PRD_PRGI(prgi) | VTD_PRD_ADDR(addr); + + desc.lo = cpu_to_le64(desc.lo); + desc.hi = cpu_to_le64(desc.hi); + if (dma_memory_write(&address_space_memory, queue_tail, &desc, sizeof(desc), + MEMTXATTRS_UNSPECIFIED)) { + error_report_once("IO error, the PQ tail cannot be updated"); + return -EIO; + } + + /* increment the tail register and set the pending request bit */ + vtd_set_quad(s, DMAR_PQT_REG, new_queue_tail_offset); + /* + * read status again so that the kernel does not miss a request. + * in some cases, we can trigger an unecessary interrupt but this strategy + * drastically improves performance as we don't need to take a lock. + */ + old_pr_status = vtd_get_long(s, DMAR_PRS_REG); + if (!(old_pr_status & VTD_PR_STATUS_PPR)) { + vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PPR); + vtd_generate_page_request_event(s, old_pr_status); + } + + return 0; +} + +static void vtd_init_iotlb_notifier(PCIBus *bus, void *opaque, int devfn, + IOMMUNotifier *n, IOMMUNotify fn, + void *user_opaque) +{ + n->opaque = user_opaque; + iommu_notifier_init(n, fn, IOMMU_NOTIFIER_DEVIOTLB_EVENTS, 0, + HWADDR_MAX, 0); +} + +static void vtd_get_iotlb_info(void *opaque, uint8_t *addr_width, + uint32_t *min_page_size) +{ + IntelIOMMUState *s = opaque; + + *addr_width = s->aw_bits; + *min_page_size = VTD_PAGE_SIZE; +} + +static void vtd_register_iotlb_notifier(PCIBus *bus, void *opaque, + int devfn, uint32_t pasid, + IOMMUNotifier *n) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + memory_region_register_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n, + &error_fatal); +} + +static void vtd_unregister_iotlb_notifier(PCIBus *bus, void *opaque, + int devfn, uint32_t pasid, + IOMMUNotifier *n) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + memory_region_unregister_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n); +} + +static void vtd_pri_register_notifier(PCIBus *bus, void *opaque, int devfn, + uint32_t pasid, IOMMUPRINotifier *notifier) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = notifier; +} + +static void vtd_pri_unregister_notifier(PCIBus *bus, void *opaque, + int devfn, uint32_t pasid) +{ + IntelIOMMUState *s = opaque; + VTDAddressSpace *vtd_as; + + vtd_as = vtd_find_add_as(s, bus, devfn, pasid); + vtd_as->pri_notifier = NULL; +} + static PCIIOMMUOps vtd_iommu_ops = { .get_address_space = vtd_host_dma_iommu, .set_iommu_device = vtd_dev_set_iommu_device, .unset_iommu_device = vtd_dev_unset_iommu_device, + .get_iotlb_info = vtd_get_iotlb_info, + .init_iotlb_notifier = vtd_init_iotlb_notifier, + .register_iotlb_notifier = vtd_register_iotlb_notifier, + .unregister_iotlb_notifier = vtd_unregister_iotlb_notifier, + .ats_request_translation = vtd_ats_request_translation, + .pri_register_notifier = vtd_pri_register_notifier, + .pri_unregister_notifier = vtd_pri_unregister_notifier, + .pri_request_page = vtd_pri_request_page, }; static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) @@ -4791,32 +5243,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) return true; } -static int vtd_machine_done_notify_one(Object *child, void *unused) -{ - IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); - - /* - * We hard-coded here because vfio-pci is the only special case - * here. Let's be more elegant in the future when we can, but so - * far there seems to be no better way. - */ - if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) { - vtd_panic_require_caching_mode(); - } - - return 0; -} - -static void vtd_machine_done_hook(Notifier *notifier, void *unused) -{ - object_child_foreach_recursive(object_get_root(), - vtd_machine_done_notify_one, NULL); -} - -static Notifier vtd_machine_done_notify = { - .notify = vtd_machine_done_hook, -}; - static void vtd_realize(DeviceState *dev, Error **errp) { MachineState *ms = MACHINE(qdev_get_machine()); @@ -4871,7 +5297,6 @@ static void vtd_realize(DeviceState *dev, Error **errp) pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. */ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); - qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); } static void vtd_class_init(ObjectClass *klass, const void *data) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index e8b211e..0f6a123 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -190,8 +190,10 @@ #define VTD_ECAP_EIM (1ULL << 4) #define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_SC (1ULL << 7) +#define VTD_ECAP_PRS (1ULL << 29) #define VTD_ECAP_MHMV (15ULL << 20) #define VTD_ECAP_SRS (1ULL << 31) +#define VTD_ECAP_PSS (7ULL << 35) /* limit: MemTxAttrs::pid */ #define VTD_ECAP_PASID (1ULL << 40) #define VTD_ECAP_SMTS (1ULL << 43) #define VTD_ECAP_SLTS (1ULL << 46) @@ -213,6 +215,7 @@ #define VTD_CAP_DRAIN_WRITE (1ULL << 54) #define VTD_CAP_DRAIN_READ (1ULL << 55) #define VTD_CAP_FS1GP (1ULL << 56) +#define VTD_CAP_ESRTPS (1ULL << 63) #define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE) #define VTD_CAP_CM (1ULL << 7) #define VTD_PASID_ID_SHIFT 20 @@ -313,6 +316,8 @@ typedef enum VTDFaultReason { * request while disabled */ VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ + VTD_FR_SM_PRE_ABS = 0x47, /* SCT.8 : PRE bit in a present SM CE is 0 */ + /* PASID directory entry access failure */ VTD_FR_PASID_DIR_ACCESS_ERR = 0x50, /* The Present(P) field of pasid directory entry is 0 */ @@ -375,6 +380,18 @@ union VTDInvDesc { }; typedef union VTDInvDesc VTDInvDesc; +/* Page Request Descriptor */ +union VTDPRDesc { + struct { + uint64_t lo; + uint64_t hi; + }; + struct { + uint64_t val[4]; + }; +}; +typedef union VTDPRDesc VTDPRDesc; + /* Masks for struct VTDInvDesc */ #define VTD_INV_DESC_ALL_ONE -1ULL #define VTD_INV_DESC_TYPE(val) ((((val) >> 5) & 0x70ULL) | \ @@ -388,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */ #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */ #define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/ +#define VTD_INV_DESC_PGRESP 0x9 /* Page Group Response Desc */ #define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */ /* Masks for Invalidation Wait Descriptor*/ @@ -439,6 +457,15 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0 0xfff000000000f000ULL #define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1 0x7feULL +/* Mask for Page Group Response Descriptor */ +#define VTD_INV_DESC_PGRESP_RSVD_HI 0xfffffffffffff003ULL +#define VTD_INV_DESC_PGRESP_RSVD_LO 0xfff00000000001e0ULL +#define VTD_INV_DESC_PGRESP_PP(val) (((val) >> 4) & 0x1ULL) +#define VTD_INV_DESC_PGRESP_RC(val) (((val) >> 12) & 0xfULL) +#define VTD_INV_DESC_PGRESP_RID(val) (((val) >> 16) & 0xffffULL) +#define VTD_INV_DESC_PGRESP_PASID(val) (((val) >> 32) & 0xfffffULL) +#define VTD_INV_DESC_PGRESP_PRGI(val) (((val) >> 3) & 0x1ffULL) + /* Rsvd field masks for spte */ #define VTD_SPTE_SNP 0x800ULL @@ -490,6 +517,31 @@ typedef union VTDInvDesc VTDInvDesc; #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL +/* Page Request Descriptor */ +/* For the low 64-bit of 128-bit */ +#define VTD_PRD_TYPE (1ULL) +#define VTD_PRD_PP(val) (((val) & 1ULL) << 8) +#define VTD_PRD_RID(val) (((val) & 0xffffULL) << 16) +#define VTD_PRD_PASID(val) (((val) & 0xfffffULL) << 32) +#define VTD_PRD_EXR(val) (((val) & 1ULL) << 52) +#define VTD_PRD_PMR(val) (((val) & 1ULL) << 53) +/* For the high 64-bit of 128-bit */ +#define VTD_PRD_RDR(val) ((val) & 1ULL) +#define VTD_PRD_WRR(val) (((val) & 1ULL) << 1) +#define VTD_PRD_LPIG(val) (((val) & 1ULL) << 2) +#define VTD_PRD_PRGI(val) (((val) & 0x1ffULL) << 3) +#define VTD_PRD_ADDR(val) ((val) & 0xfffffffffffff000ULL) + +/* Page Request Queue constants */ +#define VTD_PQA_ENTRY_SIZE 32 /* Size of an entry in bytes */ +/* Page Request Queue masks */ +#define VTD_PQA_ADDR 0xfffffffffffff000ULL /* PR queue address */ +#define VTD_PQA_SIZE 0x7ULL /* PR queue size */ +#define VTD_PR_STATUS_PPR 1UL /* Pending page request */ +#define VTD_PR_STATUS_PRO 2UL /* Page request overflow */ +#define VTD_PR_PECTL_IP 0x40000000UL /* PR control interrup pending */ +#define VTD_PR_PECTL_IM 0x80000000UL /* PR control interrup mask */ + /* Information about page-selective IOTLB invalidate */ struct VTDIOTLBPageInvInfo { uint16_t domain_id; @@ -549,6 +601,7 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw)) #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL +#define VTD_SM_CONTEXT_ENTRY_PRE 0x10ULL /* PASID Table Related Definitions */ #define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL) diff --git a/hw/i386/isapc.c b/hw/i386/isapc.c new file mode 100644 index 0000000..44f4a44 --- /dev/null +++ b/hw/i386/isapc.c @@ -0,0 +1,189 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#include "qemu/osdep.h" + +#include "qemu/units.h" +#include "qemu/error-report.h" +#include "hw/char/parallel-isa.h" +#include "hw/dma/i8257.h" +#include "hw/i386/pc.h" +#include "hw/ide/isa.h" +#include "hw/ide/ide-bus.h" +#include "system/kvm.h" +#include "hw/i386/kvm/clock.h" +#include "hw/xen/xen-x86.h" +#include "system/xen.h" +#include "hw/rtc/mc146818rtc.h" +#include "target/i386/cpu.h" + +static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 }; +static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 }; +static const int ide_irq[MAX_IDE_BUS] = { 14, 15 }; + + +static void pc_init_isa(MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + MemoryRegion *system_memory = get_system_memory(); + MemoryRegion *system_io = get_system_io(); + ISABus *isa_bus; + uint32_t irq; + GSIState *gsi_state; + MemoryRegion *ram_memory; + DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; + int i; + + /* + * There is a small chance that someone unintentionally passes "-cpu max" + * for the isapc machine, which will provide a much more modern 32-bit + * CPU than would be expected for an ISA-era PC. If the "max" cpu type has + * been specified, choose the "best" 32-bit cpu possible which we consider + * be the pentium3 (deliberately choosing an Intel CPU given that the + * default 486 CPU for the isapc machine is also an Intel CPU). + */ + if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("max"))) { + machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); + warn_report("-cpu max is invalid for isapc machine, using pentium3"); + } + + /* + * Similarly if someone unintentionally passes "-cpu host" for the isapc + * machine then display a warning and also switch to the "best" 32-bit + * cpu possible which we consider to be the pentium3. This is because any + * host CPU will already be modern than this, but it also ensures any + * newer CPU flags/features are filtered out for older guests. + */ + if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("host"))) { + machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); + warn_report("-cpu host is invalid for isapc machine, using pentium3"); + } + + if (machine->ram_size > 3.5 * GiB) { + error_report("Too much memory for this machine: %" PRId64 " MiB, " + "maximum 3584 MiB", machine->ram_size / MiB); + exit(1); + } + + /* + * There is no RAM split for the isapc machine + */ + if (xen_enabled()) { + xen_hvm_init_pc(pcms, &ram_memory); + } else { + ram_memory = machine->ram; + + pcms->max_ram_below_4g = 3.5 * GiB; + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + + x86_cpus_init(x86ms, pcmc->default_cpu_version); + + if (kvm_enabled()) { + kvmclock_create(pcmc->kvmclock_create_always); + } + + /* allocate ram and load rom/bios */ + if (!xen_enabled()) { + pc_memory_init(pcms, system_memory, system_memory, 0); + } else { + assert(machine->ram_size == x86ms->below_4g_mem_size + + x86ms->above_4g_mem_size); + + if (machine->kernel_filename != NULL) { + /* For xen HVM direct kernel boot, load linux here */ + xen_load_linux(pcms); + } + } + + gsi_state = pc_gsi_create(&x86ms->gsi, false); + + isa_bus = isa_bus_new(NULL, system_memory, system_io, + &error_abort); + isa_bus_register_input_irqs(isa_bus, x86ms->gsi); + + x86ms->rtc = isa_new(TYPE_MC146818_RTC); + qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); + isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); + irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", + &error_fatal); + isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); + + i8257_dma_init(OBJECT(machine), isa_bus, 0); + pcms->hpet_enabled = false; + + if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + } + + if (tcg_enabled()) { + x86_register_ferr_irq(x86ms->gsi[13]); + } + + pc_vga_init(isa_bus, NULL); + + /* init basic PC hardware */ + pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, + !MACHINE_CLASS(pcmc)->no_floppy, 0x4); + + pc_nic_init(pcmc, isa_bus, NULL); + + ide_drive_get(hd, ARRAY_SIZE(hd)); + for (i = 0; i < MAX_IDE_BUS; i++) { + ISADevice *dev; + char busname[] = "ide.0"; + dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], + ide_irq[i], + hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); + /* + * The ide bus name is ide.0 for the first bus and ide.1 for the + * second one. + */ + busname[4] = '0' + i; + pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); + } +} + +static void isapc_machine_options(MachineClass *m) +{ + static const char * const valid_cpu_types[] = { + X86_CPU_TYPE_NAME("486"), + X86_CPU_TYPE_NAME("athlon"), + X86_CPU_TYPE_NAME("kvm32"), + X86_CPU_TYPE_NAME("pentium"), + X86_CPU_TYPE_NAME("pentium2"), + X86_CPU_TYPE_NAME("pentium3"), + X86_CPU_TYPE_NAME("qemu32"), + X86_CPU_TYPE_NAME("max"), + X86_CPU_TYPE_NAME("host"), + NULL + }; + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + m->desc = "ISA-only PC"; + m->max_cpus = 1; + m->option_rom_has_mr = true; + m->rom_file_has_mr = false; + pcmc->pci_enabled = false; + pcmc->has_acpi_build = false; + pcmc->smbios_defaults = false; + pcmc->gigabyte_align = false; + pcmc->smbios_legacy_mode = true; + pcmc->has_reserved_memory = false; + m->default_nic = "ne2k_isa"; + m->default_cpu_type = X86_CPU_TYPE_NAME("486"); + m->valid_cpu_types = valid_cpu_types; + m->no_floppy = !module_object_class_by_name(TYPE_ISA_FDC); + m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); +} + +DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa, + isapc_machine_options); diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index 39035db..1be9bfe 100644 --- a/hw/i386/kvm/apic.c +++ b/hw/i386/kvm/apic.c @@ -17,6 +17,7 @@ #include "system/hw_accel.h" #include "system/kvm.h" #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic, int reg_id, uint32_t val) @@ -141,6 +142,10 @@ static void kvm_apic_put(CPUState *cs, run_on_cpu_data data) struct kvm_lapic_state kapic; int ret; + if (is_tdx_vm()) { + return; + } + kvm_put_apicbase(s->cpu, s->apicbase); kvm_put_apic_state(s, &kapic); diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c index d03131e..ce73119 100644 --- a/hw/i386/kvm/xen-stubs.c +++ b/hw/i386/kvm/xen-stubs.c @@ -12,7 +12,6 @@ #include "qemu/osdep.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" #include "xen_evtchn.h" #include "xen_primary_console.h" @@ -38,15 +37,3 @@ void xen_primary_console_create(void) void xen_primary_console_set_be_port(uint16_t port) { } -#ifdef TARGET_I386 -EvtchnInfoList *qmp_xen_event_list(Error **errp) -{ - error_setg(errp, "Xen event channel emulation not enabled"); - return NULL; -} - -void qmp_xen_event_inject(uint32_t port, Error **errp) -{ - error_setg(errp, "Xen event channel emulation not enabled"); -} -#endif diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c index b519054..dd566c4 100644 --- a/hw/i386/kvm/xen_evtchn.c +++ b/hw/i386/kvm/xen_evtchn.c @@ -19,7 +19,7 @@ #include "monitor/monitor.h" #include "monitor/hmp.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qapi-commands-misc-i386.h" #include "qobject/qdict.h" #include "qom/object.h" #include "exec/target_page.h" diff --git a/hw/i386/meson.build b/hw/i386/meson.build index 10bdfde..436b3ce 100644 --- a/hw/i386/meson.build +++ b/hw/i386/meson.build @@ -14,6 +14,7 @@ i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'), i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c'), if_false: files('amd_iommu-stub.c')) i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c')) +i386_ss.add(when: 'CONFIG_ISAPC', if_true: files('isapc.c')) i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('x86-common.c', 'microvm.c', 'acpi-microvm.c', 'microvm-dt.c')) i386_ss.add(when: 'CONFIG_NITRO_ENCLAVE', if_true: files('nitro_enclave.c')) i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c')) @@ -32,6 +33,7 @@ i386_ss.add(when: 'CONFIG_PC', if_true: files( 'port92.c')) i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'), if_false: files('pc_sysfw_ovmf-stubs.c')) +i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c', 'tdvf-hob.c')) subdir('kvm') subdir('xen') diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c index e0daf0d..94d22a2 100644 --- a/hw/i386/microvm.c +++ b/hw/i386/microvm.c @@ -49,6 +49,7 @@ #include "hw/acpi/generic_event_device.h" #include "hw/pci-host/gpex.h" #include "hw/usb/xhci.h" +#include "hw/vfio/types.h" #include "elf.h" #include "kvm/kvm_i386.h" @@ -633,6 +634,8 @@ GlobalProperty microvm_properties[] = { * so reserving io space is not going to work. Turn it off. */ { "pcie-root-port", "io-reserve", "0" }, + { TYPE_RAMFB_DEVICE, "use-legacy-x86-rom", "true" }, + { TYPE_VFIO_PCI_NOHOTPLUG, "use-legacy-x86-rom", "true" }, }; static void microvm_class_init(ObjectClass *oc, const void *data) diff --git a/hw/i386/monitor.c b/hw/i386/monitor.c index 1921e4d..79df965 100644 --- a/hw/i386/monitor.c +++ b/hw/i386/monitor.c @@ -26,7 +26,7 @@ #include "monitor/monitor.h" #include "qobject/qdict.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qapi-commands-misc-i386.h" #include "hw/i386/x86.h" #include "hw/rtc/mc146818rtc.h" diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 7065615..4d6bcbb 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -44,6 +44,7 @@ #include "system/xen.h" #include "system/reset.h" #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" #include "hw/xen/xen.h" #include "qobject/qlist.h" #include "qemu/error-report.h" @@ -80,7 +81,15 @@ { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\ { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, }, -GlobalProperty pc_compat_10_0[] = {}; +GlobalProperty pc_compat_10_1[] = {}; +const size_t pc_compat_10_1_len = G_N_ELEMENTS(pc_compat_10_1); + +GlobalProperty pc_compat_10_0[] = { + { TYPE_X86_CPU, "x-consistent-cache", "false" }, + { TYPE_X86_CPU, "x-vendor-cpuid-only-v2", "false" }, + { TYPE_X86_CPU, "x-arch-cap-always-on", "true" }, + { TYPE_X86_CPU, "x-pdcm-on-even-without-pmu", "true" }, +}; const size_t pc_compat_10_0_len = G_N_ELEMENTS(pc_compat_10_0); GlobalProperty pc_compat_9_2[] = {}; @@ -259,28 +268,6 @@ GlobalProperty pc_compat_2_6[] = { }; const size_t pc_compat_2_6_len = G_N_ELEMENTS(pc_compat_2_6); -GlobalProperty pc_compat_2_5[] = {}; -const size_t pc_compat_2_5_len = G_N_ELEMENTS(pc_compat_2_5); - -GlobalProperty pc_compat_2_4[] = { - PC_CPU_MODEL_IDS("2.4.0") - { "Haswell-" TYPE_X86_CPU, "abm", "off" }, - { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" }, - { "Broadwell-" TYPE_X86_CPU, "abm", "off" }, - { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" }, - { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" }, - { TYPE_X86_CPU, "check", "off" }, - { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" }, - { "qemu64" "-" TYPE_X86_CPU, "abm", "on" }, - { "qemu64" "-" TYPE_X86_CPU, "popcnt", "on" }, - { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" }, - { "Opteron_G2" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "on" }, - { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "on", } -}; -const size_t pc_compat_2_4_len = G_N_ELEMENTS(pc_compat_2_4); - /* * @PC_FW_DATA: * Size of the chunk of memory at the top of RAM for the BIOS ACPI tables @@ -630,7 +617,7 @@ void pc_machine_done(Notifier *notifier, void *data) &error_fatal); if (pcms->cxl_devices_state.is_enabled) { - cxl_fmws_link_targets(&pcms->cxl_devices_state, &error_fatal); + cxl_fmws_link_targets(&error_fatal); } /* set the number of CPUs */ @@ -739,20 +726,28 @@ static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) return cxl_base; } -static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) +static int cxl_get_fmw_end(Object *obj, void *opaque) { - uint64_t start = pc_get_cxl_range_start(pcms) + MiB; + struct CXLFixedWindow *fw; + uint64_t *start = opaque; - if (pcms->cxl_devices_state.fixed_windows) { - GList *it; - - start = ROUND_UP(start, 256 * MiB); - for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { - CXLFixedWindow *fw = it->data; - start += fw->size; - } + if (!object_dynamic_cast(obj, TYPE_CXL_FMW)) { + return 0; } + fw = CXL_FMW(obj); + *start += fw->size; + + return 0; +} + +static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) +{ + uint64_t start = pc_get_cxl_range_start(pcms) + MiB; + + /* Ordering doesn't matter so no need to build a sorted list */ + object_child_foreach_recursive(object_get_root(), cxl_get_fmw_end, + &start); return start; } @@ -844,6 +839,7 @@ void pc_memory_init(PCMachineState *pcms, hwaddr maxphysaddr, maxusedaddr; hwaddr cxl_base, cxl_resv_end = 0; X86CPU *cpu = X86_CPU(first_cpu); + uint64_t res_mem_end; assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); @@ -954,60 +950,48 @@ void pc_memory_init(PCMachineState *pcms, cxl_base = pc_get_cxl_range_start(pcms); memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); memory_region_add_subregion(system_memory, cxl_base, mr); - cxl_resv_end = cxl_base + cxl_size; - if (pcms->cxl_devices_state.fixed_windows) { - hwaddr cxl_fmw_base; - GList *it; - - cxl_fmw_base = ROUND_UP(cxl_base + cxl_size, 256 * MiB); - for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { - CXLFixedWindow *fw = it->data; - - fw->base = cxl_fmw_base; - memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw, - "cxl-fixed-memory-region", fw->size); - memory_region_add_subregion(system_memory, fw->base, &fw->mr); - cxl_fmw_base += fw->size; - cxl_resv_end = cxl_fmw_base; - } - } + cxl_base = ROUND_UP(cxl_base + cxl_size, 256 * MiB); + cxl_resv_end = cxl_fmws_set_memmap(cxl_base, maxphysaddr); + cxl_fmws_update_mmio(); } /* Initialize PC system firmware */ pc_system_firmware_init(pcms, rom_memory); - option_rom_mr = g_malloc(sizeof(*option_rom_mr)); - if (machine_require_guest_memfd(machine)) { - memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom", - PC_ROM_SIZE, &error_fatal); - } else { - memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, - &error_fatal); - if (pcmc->pci_enabled) { - memory_region_set_readonly(option_rom_mr, true); + if (!is_tdx_vm()) { + option_rom_mr = g_malloc(sizeof(*option_rom_mr)); + if (machine_require_guest_memfd(machine)) { + memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom", + PC_ROM_SIZE, &error_fatal); + } else { + memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); + if (pcmc->pci_enabled) { + memory_region_set_readonly(option_rom_mr, true); + } } + memory_region_add_subregion_overlap(rom_memory, + PC_ROM_MIN_VGA, + option_rom_mr, + 1); } - memory_region_add_subregion_overlap(rom_memory, - PC_ROM_MIN_VGA, - option_rom_mr, - 1); fw_cfg = fw_cfg_arch_create(machine, x86ms->boot_cpus, x86ms->apic_id_limit); rom_set_fw(fw_cfg); - if (machine->device_memory) { - uint64_t *val = g_malloc(sizeof(*val)); - uint64_t res_mem_end = machine->device_memory->base; - - if (!pcmc->broken_reserved_end) { - res_mem_end += memory_region_size(&machine->device_memory->mr); - } + if (pcms->cxl_devices_state.is_enabled) { + res_mem_end = cxl_resv_end; + } else if (machine->device_memory) { + res_mem_end = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); + } else { + res_mem_end = 0; + } - if (pcms->cxl_devices_state.is_enabled) { - res_mem_end = cxl_resv_end; - } + if (res_mem_end) { + uint64_t *val = g_malloc(sizeof(*val)); *val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB)); fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); } @@ -1044,9 +1028,7 @@ uint64_t pc_pci_hole64_start(void) hole64_start = pc_get_cxl_range_end(pcms); } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) { pc_get_device_memory_range(pcms, &hole64_start, &size); - if (!pcmc->broken_reserved_end) { - hole64_start += size; - } + hole64_start += size; } else { hole64_start = pc_above_4g_end(pcms); } @@ -1058,7 +1040,6 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) { DeviceState *dev = NULL; - rom_set_order_override(FW_CFG_ORDER_OVERRIDE_VGA); if (pci_bus) { PCIDevice *pcidev = pci_vga_init(pci_bus); dev = pcidev ? &pcidev->qdev : NULL; @@ -1066,7 +1047,7 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus) ISADevice *isadev = isa_vga_init(isa_bus); dev = isadev ? DEVICE(isadev) : NULL; } - rom_reset_order_override(); + return dev; } @@ -1256,8 +1237,6 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) bool default_is_ne2k = g_str_equal(mc->default_nic, TYPE_ISA_NE2000); NICInfo *nd; - rom_set_order_override(FW_CFG_ORDER_OVERRIDE_NIC); - while ((nd = qemu_find_nic_info(TYPE_ISA_NE2000, default_is_ne2k, NULL))) { pc_init_ne2k_isa(isa_bus, nd, &error_fatal); } @@ -1266,8 +1245,6 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) if (pci_bus) { pci_init_nic_devices(pci_bus, mc->default_nic); } - - rom_reset_order_override(); } void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs) @@ -1747,25 +1724,6 @@ static void pc_machine_wakeup(MachineState *machine) cpu_synchronize_all_post_reset(); } -static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp) -{ - X86IOMMUState *iommu = x86_iommu_get_default(); - IntelIOMMUState *intel_iommu; - - if (iommu && - object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) && - object_dynamic_cast((Object *)dev, "vfio-pci")) { - intel_iommu = INTEL_IOMMU_DEVICE(iommu); - if (!intel_iommu->caching_mode) { - error_setg(errp, "Device assignment is not allowed without " - "enabling caching-mode=on for Intel IOMMU."); - return false; - } - } - - return true; -} - static void pc_machine_class_init(ObjectClass *oc, const void *data) { MachineClass *mc = MACHINE_CLASS(oc); @@ -1785,7 +1743,6 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data) x86mc->apic_xrupt_override = true; assert(!mc->get_hotplug_handler); mc->get_hotplug_handler = pc_get_hotplug_handler; - mc->hotplug_allowed = pc_hotplug_allowed; mc->auto_enable_numa_with_memhp = true; mc->auto_enable_numa_with_memdev = true; mc->has_hotpluggable_cpus = true; @@ -1860,6 +1817,18 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data) object_class_property_add_bool(oc, "fd-bootchk", pc_machine_get_fd_bootchk, pc_machine_set_fd_bootchk); + +#if defined(CONFIG_IGVM) + object_class_property_add_link(oc, "igvm-cfg", + TYPE_IGVM_CFG, + offsetof(X86MachineState, igvm), + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_class_property_set_description(oc, "igvm-cfg", + "Set IGVM configuration"); +#endif + + } static const TypeInfo pc_machine_info = { diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 0dce512..7b3611e 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -27,20 +27,16 @@ #include "qemu/units.h" #include "hw/char/parallel-isa.h" -#include "hw/dma/i8257.h" -#include "hw/loader.h" #include "hw/i386/x86.h" #include "hw/i386/pc.h" #include "hw/i386/apic.h" #include "hw/pci-host/i440fx.h" -#include "hw/rtc/mc146818rtc.h" #include "hw/southbridge/piix.h" #include "hw/display/ramfb.h" #include "hw/pci/pci.h" #include "hw/pci/pci_ids.h" #include "hw/usb.h" #include "net/net.h" -#include "hw/ide/isa.h" #include "hw/ide/pci.h" #include "hw/irq.h" #include "system/kvm.h" @@ -49,6 +45,7 @@ #include "hw/i2c/smbus_eeprom.h" #include "system/memory.h" #include "hw/acpi/acpi.h" +#include "hw/vfio/types.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "system/xen.h" @@ -71,11 +68,12 @@ #define XEN_IOAPIC_NUM_PIRQS 128ULL -#ifdef CONFIG_IDE_ISA -static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 }; -static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 }; -static const int ide_irq[MAX_IDE_BUS] = { 14, 15 }; -#endif +static GlobalProperty pc_piix_compat_defaults[] = { + { TYPE_RAMFB_DEVICE, "use-legacy-x86-rom", "true" }, + { TYPE_VFIO_PCI_NOHOTPLUG, "use-legacy-x86-rom", "true" }, +}; +static const size_t pc_piix_compat_defaults_len = + G_N_ELEMENTS(pc_piix_compat_defaults); /* * Return the global irq number corresponding to a given device irq @@ -108,16 +106,20 @@ static void pc_init1(MachineState *machine, const char *pci_type) X86MachineState *x86ms = X86_MACHINE(machine); MemoryRegion *system_memory = get_system_memory(); MemoryRegion *system_io = get_system_io(); - Object *phb = NULL; + Object *phb; ISABus *isa_bus; Object *piix4_pm = NULL; qemu_irq smi_irq; GSIState *gsi_state; MemoryRegion *ram_memory; MemoryRegion *pci_memory = NULL; - MemoryRegion *rom_memory = system_memory; ram_addr_t lowmem; uint64_t hole64_size = 0; + PCIDevice *pci_dev; + DeviceState *dev; + size_t i; + + assert(pcmc->pci_enabled); /* * Calculate ram split, for memory below and above 4G. It's a bit @@ -188,42 +190,39 @@ static void pc_init1(MachineState *machine, const char *pci_type) kvmclock_create(pcmc->kvmclock_create_always); } - if (pcmc->pci_enabled) { - pci_memory = g_new(MemoryRegion, 1); - memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); - rom_memory = pci_memory; - - phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE)); - object_property_add_child(OBJECT(machine), "i440fx", phb); - object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM, - OBJECT(ram_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM, - OBJECT(pci_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM, - OBJECT(system_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_IO_MEM, - OBJECT(system_io), &error_fatal); - object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE, - x86ms->below_4g_mem_size, &error_fatal); - object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE, - x86ms->above_4g_mem_size, &error_fatal); - object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type, - &error_fatal); - sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal); - - pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0")); - pci_bus_map_irqs(pcms->pcibus, - xen_enabled() ? xen_pci_slot_get_pirq - : pc_pci_slot_get_pirq); - - hole64_size = object_property_get_uint(phb, - PCI_HOST_PROP_PCI_HOLE64_SIZE, - &error_abort); - } + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + + phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE)); + object_property_add_child(OBJECT(machine), "i440fx", phb); + object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM, + OBJECT(ram_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM, + OBJECT(pci_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM, + OBJECT(system_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_IO_MEM, + OBJECT(system_io), &error_fatal); + object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE, + x86ms->below_4g_mem_size, &error_fatal); + object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE, + x86ms->above_4g_mem_size, &error_fatal); + object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type, + &error_fatal); + sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal); + + pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0")); + pci_bus_map_irqs(pcms->pcibus, + xen_enabled() ? xen_pci_slot_get_pirq + : pc_pci_slot_get_pirq); + + hole64_size = object_property_get_uint(phb, + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); /* allocate ram and load rom/bios */ if (!xen_enabled()) { - pc_memory_init(pcms, system_memory, rom_memory, hole64_size); + pc_memory_init(pcms, system_memory, pci_memory, hole64_size); } else { assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); @@ -235,81 +234,63 @@ static void pc_init1(MachineState *machine, const char *pci_type) } } - gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); - - if (pcmc->pci_enabled) { - PCIDevice *pci_dev; - DeviceState *dev; - size_t i; - - pci_dev = pci_new_multifunction(-1, pcms->south_bridge); - object_property_set_bool(OBJECT(pci_dev), "has-usb", - machine_usb(machine), &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-acpi", - x86_machine_is_acpi_enabled(x86ms), - &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-pic", false, - &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-pit", false, - &error_abort); - qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100); - object_property_set_bool(OBJECT(pci_dev), "smm-enabled", - x86_machine_is_smm_enabled(x86ms), - &error_abort); - dev = DEVICE(pci_dev); - for (i = 0; i < ISA_NUM_IRQS; i++) { - qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]); - } - pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal); - - if (xen_enabled()) { - pci_device_set_intx_routing_notifier( - pci_dev, piix_intx_routing_notifier_xen); - - /* - * Xen supports additional interrupt routes from the PCI devices to - * the IOAPIC: the four pins of each PCI device on the bus are also - * connected to the IOAPIC directly. - * These additional routes can be discovered through ACPI. - */ - pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev, - XEN_IOAPIC_NUM_PIRQS); - } + gsi_state = pc_gsi_create(&x86ms->gsi, true); + + pci_dev = pci_new_multifunction(-1, pcms->south_bridge); + object_property_set_bool(OBJECT(pci_dev), "has-usb", + machine_usb(machine), &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-acpi", + x86_machine_is_acpi_enabled(x86ms), + &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-pic", false, + &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-pit", false, + &error_abort); + qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100); + object_property_set_bool(OBJECT(pci_dev), "smm-enabled", + x86_machine_is_smm_enabled(x86ms), + &error_abort); + dev = DEVICE(pci_dev); + for (i = 0; i < ISA_NUM_IRQS; i++) { + qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]); + } + pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal); - isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0")); - x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev), - "rtc")); - piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm"); - dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide")); - pci_ide_create_devs(PCI_DEVICE(dev)); - pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); - pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); - } else { - isa_bus = isa_bus_new(NULL, system_memory, system_io, - &error_abort); - isa_bus_register_input_irqs(isa_bus, x86ms->gsi); + if (xen_enabled()) { + pci_device_set_intx_routing_notifier( + pci_dev, piix_intx_routing_notifier_xen); + + /* + * Xen supports additional interrupt routes from the PCI devices to + * the IOAPIC: the four pins of each PCI device on the bus are also + * connected to the IOAPIC directly. + * These additional routes can be discovered through ACPI. + */ + pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev, + XEN_IOAPIC_NUM_PIRQS); + } - x86ms->rtc = isa_new(TYPE_MC146818_RTC); - qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); - isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); + isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0")); + x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev), + "rtc")); + piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm"); + dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide")); + pci_ide_create_devs(PCI_DEVICE(dev)); + pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); + pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); - i8257_dma_init(OBJECT(machine), isa_bus, 0); - pcms->hpet_enabled = false; - } if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { pc_i8259_create(isa_bus, gsi_state->i8259_irq); } - if (phb) { - ioapic_init_gsi(gsi_state, phb); - } + ioapic_init_gsi(gsi_state, phb); if (tcg_enabled()) { x86_register_ferr_irq(x86ms->gsi[13]); } - pc_vga_init(isa_bus, pcmc->pci_enabled ? pcms->pcibus : NULL); + pc_vga_init(isa_bus, pcms->pcibus); /* init basic PC hardware */ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, @@ -317,28 +298,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) pc_nic_init(pcmc, isa_bus, pcms->pcibus); -#ifdef CONFIG_IDE_ISA - if (!pcmc->pci_enabled) { - DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; - int i; - - ide_drive_get(hd, ARRAY_SIZE(hd)); - for (i = 0; i < MAX_IDE_BUS; i++) { - ISADevice *dev; - char busname[] = "ide.0"; - dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], - ide_irq[i], - hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); - /* - * The ide bus name is ide.0 for the first bus and ide.1 for the - * second one. - */ - busname[4] = '0' + i; - pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); - } - } -#endif - if (piix4_pm) { smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); @@ -361,6 +320,16 @@ static void pc_init1(MachineState *machine, const char *pci_type) x86_nvdimm_acpi_dsmio, x86ms->fw_cfg, OBJECT(pcms)); } + +#if defined(CONFIG_IGVM) + /* Apply guest state from IGVM if supplied */ + if (x86ms->igvm) { + if (IGVM_CFG_GET_CLASS(x86ms->igvm) + ->process(x86ms->igvm, machine->cgs, false, &error_fatal) < 0) { + g_assert_not_reached(); + } + } +#endif } typedef enum PCSouthBridgeOption { @@ -410,22 +379,7 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp) pcms->south_bridge = PCSouthBridgeOption_lookup.array[value]; } -#ifdef CONFIG_ISAPC -static void pc_init_isa(MachineState *machine) -{ - pc_init1(machine, NULL); -} -#endif - #ifdef CONFIG_XEN -static void pc_xen_hvm_init_pci(MachineState *machine) -{ - const char *pci_type = xen_igd_gfx_pt_enabled() ? - TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE : TYPE_I440FX_PCI_DEVICE; - - pc_init1(machine, pci_type); -} - static void pc_xen_hvm_init(MachineState *machine) { PCMachineState *pcms = PC_MACHINE(machine); @@ -435,7 +389,10 @@ static void pc_xen_hvm_init(MachineState *machine) exit(1); } - pc_xen_hvm_init_pci(machine); + pc_init1(machine, xen_igd_gfx_pt_enabled() + ? TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE + : TYPE_I440FX_PCI_DEVICE); + xen_igd_reserve_slot(pcms->pcibus); pci_create_simple(pcms->pcibus, -1, "xen-platform"); } @@ -477,14 +434,26 @@ static void pc_i440fx_machine_options(MachineClass *m) pc_set_south_bridge); object_class_property_set_description(oc, "x-south-bridge", "Use a different south bridge than PIIX3"); + compat_props_add(m->compat_props, + pc_piix_compat_defaults, pc_piix_compat_defaults_len); } -static void pc_i440fx_machine_10_1_options(MachineClass *m) +static void pc_i440fx_machine_10_2_options(MachineClass *m) { pc_i440fx_machine_options(m); } -DEFINE_I440FX_MACHINE_AS_LATEST(10, 1); +DEFINE_I440FX_MACHINE_AS_LATEST(10, 2); + +static void pc_i440fx_machine_10_1_options(MachineClass *m) +{ + pc_i440fx_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; + compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); + compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); +} + +DEFINE_I440FX_MACHINE(10, 1); static void pc_i440fx_machine_10_0_options(MachineClass *m) { @@ -778,56 +747,6 @@ static void pc_i440fx_machine_2_6_options(MachineClass *m) DEFINE_I440FX_MACHINE(2, 6); -static void pc_i440fx_machine_2_5_options(MachineClass *m) -{ - X86MachineClass *x86mc = X86_MACHINE_CLASS(m); - - pc_i440fx_machine_2_6_options(m); - x86mc->save_tsc_khz = false; - m->legacy_fw_cfg_order = 1; - compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); - compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); -} - -DEFINE_I440FX_MACHINE(2, 5); - -static void pc_i440fx_machine_2_4_options(MachineClass *m) -{ - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - - pc_i440fx_machine_2_5_options(m); - m->hw_version = "2.4.0"; - pcmc->broken_reserved_end = true; - compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); - compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); -} - -DEFINE_I440FX_MACHINE(2, 4); - -#ifdef CONFIG_ISAPC -static void isapc_machine_options(MachineClass *m) -{ - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - m->desc = "ISA-only PC"; - m->max_cpus = 1; - m->option_rom_has_mr = true; - m->rom_file_has_mr = false; - pcmc->pci_enabled = false; - pcmc->has_acpi_build = false; - pcmc->smbios_defaults = false; - pcmc->gigabyte_align = false; - pcmc->smbios_legacy_mode = true; - pcmc->has_reserved_memory = false; - m->default_nic = "ne2k_isa"; - m->default_cpu_type = X86_CPU_TYPE_NAME("486"); - m->no_floppy = !module_object_class_by_name(TYPE_ISA_FDC); - m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); -} - -DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa, - isapc_machine_options); -#endif - #ifdef CONFIG_XEN static void xenfv_machine_4_2_options(MachineClass *m) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index c538b3d..6015e63 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -45,6 +45,7 @@ #include "hw/i386/pc.h" #include "hw/i386/amd_iommu.h" #include "hw/i386/intel_iommu.h" +#include "hw/vfio/types.h" #include "hw/virtio/virtio-iommu.h" #include "hw/display/ramfb.h" #include "hw/ide/pci.h" @@ -67,6 +68,8 @@ static GlobalProperty pc_q35_compat_defaults[] = { { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "39" }, + { TYPE_RAMFB_DEVICE, "use-legacy-x86-rom", "true" }, + { TYPE_VFIO_PCI_NOHOTPLUG, "use-legacy-x86-rom", "true" }, }; static const size_t pc_q35_compat_defaults_len = G_N_ELEMENTS(pc_q35_compat_defaults); @@ -325,6 +328,16 @@ static void pc_q35_init(MachineState *machine) x86_nvdimm_acpi_dsmio, x86ms->fw_cfg, OBJECT(pcms)); } + +#if defined(CONFIG_IGVM) + /* Apply guest state from IGVM if supplied */ + if (x86ms->igvm) { + if (IGVM_CFG_GET_CLASS(x86ms->igvm) + ->process(x86ms->igvm, machine->cgs, false, &error_fatal) < 0) { + g_assert_not_reached(); + } + } +#endif } #define DEFINE_Q35_MACHINE(major, minor) \ @@ -361,12 +374,22 @@ static void pc_q35_machine_options(MachineClass *m) pc_q35_compat_defaults, pc_q35_compat_defaults_len); } -static void pc_q35_machine_10_1_options(MachineClass *m) +static void pc_q35_machine_10_2_options(MachineClass *m) { pc_q35_machine_options(m); } -DEFINE_Q35_MACHINE_AS_LATEST(10, 1); +DEFINE_Q35_MACHINE_AS_LATEST(10, 2); + +static void pc_q35_machine_10_1_options(MachineClass *m) +{ + pc_q35_machine_10_2_options(m); + m->smbios_memory_device_size = 2047 * TiB; + compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len); + compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len); +} + +DEFINE_Q35_MACHINE(10, 1); static void pc_q35_machine_10_0_options(MachineClass *m) { @@ -672,29 +695,3 @@ static void pc_q35_machine_2_6_options(MachineClass *m) } DEFINE_Q35_MACHINE(2, 6); - -static void pc_q35_machine_2_5_options(MachineClass *m) -{ - X86MachineClass *x86mc = X86_MACHINE_CLASS(m); - - pc_q35_machine_2_6_options(m); - x86mc->save_tsc_khz = false; - m->legacy_fw_cfg_order = 1; - compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len); - compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len); -} - -DEFINE_Q35_MACHINE(2, 5); - -static void pc_q35_machine_2_4_options(MachineClass *m) -{ - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - - pc_q35_machine_2_5_options(m); - m->hw_version = "2.4.0"; - pcmc->broken_reserved_end = true; - compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len); - compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len); -} - -DEFINE_Q35_MACHINE(2, 4); diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index 1eeb58a..1a12b63 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -37,6 +37,7 @@ #include "hw/block/flash.h" #include "system/kvm.h" #include "target/i386/sev.h" +#include "kvm/tdx.h" #define FLASH_SECTOR_SIZE 4096 @@ -219,7 +220,13 @@ void pc_system_firmware_init(PCMachineState *pcms, BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)]; if (!pcmc->pci_enabled) { - x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, true); + /* + * If an IGVM file is specified then the firmware must be provided + * in the IGVM file. + */ + if (!X86_MACHINE(pcms)->igvm) { + x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, true); + } return; } @@ -239,8 +246,13 @@ void pc_system_firmware_init(PCMachineState *pcms, } if (!pflash_blk[0]) { - /* Machine property pflash0 not set, use ROM mode */ - x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, false); + /* + * Machine property pflash0 not set, use ROM mode unless using IGVM, + * in which case the firmware must be provided by the IGVM file. + */ + if (!X86_MACHINE(pcms)->igvm) { + x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, false); + } } else { if (kvm_enabled() && !kvm_readonly_mem_enabled()) { /* @@ -256,6 +268,20 @@ void pc_system_firmware_init(PCMachineState *pcms, } pc_system_flash_cleanup_unused(pcms); + + /* + * The user should not have specified any pflash devices when using IGVM + * to configure the guest. + */ + if (X86_MACHINE(pcms)->igvm) { + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + if (pcms->flash[i]) { + error_report("pflash devices cannot be configured when " + "using IGVM"); + exit(1); + } + } + } } void x86_firmware_configure(hwaddr gpa, void *ptr, int size) @@ -280,5 +306,11 @@ void x86_firmware_configure(hwaddr gpa, void *ptr, int size) } sev_encrypt_flash(gpa, ptr, size, &error_fatal); + } else if (is_tdx_vm()) { + ret = tdx_parse_tdvf(ptr, size); + if (ret) { + error_report("failed to parse TDVF for TDX VM"); + exit(1); + } } } diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c index 38ff75e..d295e54 100644 --- a/hw/i386/sgx-stub.c +++ b/hw/i386/sgx-stub.c @@ -3,20 +3,20 @@ #include "monitor/hmp-target.h" #include "hw/i386/pc.h" #include "hw/i386/sgx-epc.h" +#include "qapi/qapi-commands-misc-i386.h" #include "qapi/error.h" -#include "qapi/qapi-commands-misc-target.h" void sgx_epc_build_srat(GArray *table_data) { } -SGXInfo *qmp_query_sgx(Error **errp) +SgxInfo *qmp_query_sgx(Error **errp) { error_setg(errp, "SGX support is not compiled in"); return NULL; } -SGXInfo *qmp_query_sgx_capabilities(Error **errp) +SgxInfo *qmp_query_sgx_capabilities(Error **errp) { error_setg(errp, "SGX support is not compiled in"); return NULL; diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index 5685c4f..e280154 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -19,7 +19,7 @@ #include "monitor/hmp-target.h" #include "qapi/error.h" #include "qemu/error-report.h" -#include "qapi/qapi-commands-misc-target.h" +#include "qapi/qapi-commands-misc-i386.h" #include "system/address-spaces.h" #include "system/hw_accel.h" #include "system/reset.h" @@ -84,10 +84,10 @@ static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high) ((high & MAKE_64BIT_MASK(0, 20)) << 32); } -static SGXEPCSectionList *sgx_calc_host_epc_sections(void) +static SgxEpcSectionList *sgx_calc_host_epc_sections(void) { - SGXEPCSectionList *head = NULL, **tail = &head; - SGXEPCSection *section; + SgxEpcSectionList *head = NULL, **tail = &head; + SgxEpcSection *section; uint32_t i, type; uint32_t eax, ebx, ecx, edx; uint32_t j = 0; @@ -104,7 +104,7 @@ static SGXEPCSectionList *sgx_calc_host_epc_sections(void) break; } - section = g_new0(SGXEPCSection, 1); + section = g_new0(SgxEpcSection, 1); section->node = j++; section->size = sgx_calc_section_metric(ecx, edx); QAPI_LIST_APPEND(tail, section); @@ -153,9 +153,9 @@ static void sgx_epc_reset(void *opaque) } } -SGXInfo *qmp_query_sgx_capabilities(Error **errp) +SgxInfo *qmp_query_sgx_capabilities(Error **errp) { - SGXInfo *info = NULL; + SgxInfo *info = NULL; uint32_t eax, ebx, ecx, edx; Error *local_err = NULL; @@ -166,7 +166,7 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp) return NULL; } - info = g_new0(SGXInfo, 1); + info = g_new0(SgxInfo, 1); host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx); info->sgx = ebx & (1U << 2) ? true : false; @@ -183,17 +183,17 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp) return info; } -static SGXEPCSectionList *sgx_get_epc_sections_list(void) +static SgxEpcSectionList *sgx_get_epc_sections_list(void) { GSList *device_list = sgx_epc_get_device_list(); - SGXEPCSectionList *head = NULL, **tail = &head; - SGXEPCSection *section; + SgxEpcSectionList *head = NULL, **tail = &head; + SgxEpcSection *section; for (; device_list; device_list = device_list->next) { DeviceState *dev = device_list->data; Object *obj = OBJECT(dev); - section = g_new0(SGXEPCSection, 1); + section = g_new0(SgxEpcSection, 1); section->node = object_property_get_uint(obj, SGX_EPC_NUMA_NODE_PROP, &error_abort); section->size = object_property_get_uint(obj, SGX_EPC_SIZE_PROP, @@ -205,9 +205,9 @@ static SGXEPCSectionList *sgx_get_epc_sections_list(void) return head; } -SGXInfo *qmp_query_sgx(Error **errp) +SgxInfo *qmp_query_sgx(Error **errp) { - SGXInfo *info = NULL; + SgxInfo *info = NULL; X86MachineState *x86ms; PCMachineState *pcms = (PCMachineState *)object_dynamic_cast(qdev_get_machine(), @@ -223,7 +223,7 @@ SGXInfo *qmp_query_sgx(Error **errp) return NULL; } - info = g_new0(SGXInfo, 1); + info = g_new0(SgxInfo, 1); info->sgx = true; info->sgx1 = true; @@ -237,8 +237,8 @@ SGXInfo *qmp_query_sgx(Error **errp) void hmp_info_sgx(Monitor *mon, const QDict *qdict) { Error *err = NULL; - SGXEPCSectionList *section_list, *section; - g_autoptr(SGXInfo) info = qmp_query_sgx(&err); + SgxEpcSectionList *section_list, *section; + g_autoptr(SgxInfo) info = qmp_query_sgx(&err); uint64_t size = 0; if (err) { diff --git a/hw/i386/tdvf-hob.c b/hw/i386/tdvf-hob.c new file mode 100644 index 0000000..782b3d1 --- /dev/null +++ b/hw/i386/tdvf-hob.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2025 Intel Corporation + * Author: Isaku Yamahata <isaku.yamahata at gmail.com> + * <isaku.yamahata at intel.com> + * Xiaoyao Li <xiaoyao.li@intel.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "standard-headers/uefi/uefi.h" +#include "hw/pci/pcie_host.h" +#include "tdvf-hob.h" + +typedef struct TdvfHob { + hwaddr hob_addr; + void *ptr; + int size; + + /* working area */ + void *current; + void *end; +} TdvfHob; + +static uint64_t tdvf_current_guest_addr(const TdvfHob *hob) +{ + return hob->hob_addr + (hob->current - hob->ptr); +} + +static void tdvf_align(TdvfHob *hob, size_t align) +{ + hob->current = QEMU_ALIGN_PTR_UP(hob->current, align); +} + +static void *tdvf_get_area(TdvfHob *hob, uint64_t size) +{ + void *ret; + + if (hob->current + size > hob->end) { + error_report("TD_HOB overrun, size = 0x%" PRIx64, size); + exit(1); + } + + ret = hob->current; + hob->current += size; + tdvf_align(hob, 8); + return ret; +} + +static void tdvf_hob_add_memory_resources(TdxGuest *tdx, TdvfHob *hob) +{ + EFI_HOB_RESOURCE_DESCRIPTOR *region; + EFI_RESOURCE_ATTRIBUTE_TYPE attr; + EFI_RESOURCE_TYPE resource_type; + + TdxRamEntry *e; + int i; + + for (i = 0; i < tdx->nr_ram_entries; i++) { + e = &tdx->ram_entries[i]; + + if (e->type == TDX_RAM_UNACCEPTED) { + resource_type = EFI_RESOURCE_MEMORY_UNACCEPTED; + attr = EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED; + } else if (e->type == TDX_RAM_ADDED) { + resource_type = EFI_RESOURCE_SYSTEM_MEMORY; + attr = EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE; + } else { + error_report("unknown TDX_RAM_ENTRY type %d", e->type); + exit(1); + } + + region = tdvf_get_area(hob, sizeof(*region)); + *region = (EFI_HOB_RESOURCE_DESCRIPTOR) { + .Header = { + .HobType = EFI_HOB_TYPE_RESOURCE_DESCRIPTOR, + .HobLength = cpu_to_le16(sizeof(*region)), + .Reserved = cpu_to_le32(0), + }, + .Owner = EFI_HOB_OWNER_ZERO, + .ResourceType = cpu_to_le32(resource_type), + .ResourceAttribute = cpu_to_le32(attr), + .PhysicalStart = cpu_to_le64(e->address), + .ResourceLength = cpu_to_le64(e->length), + }; + } +} + +void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob) +{ + TdvfHob hob = { + .hob_addr = td_hob->address, + .size = td_hob->size, + .ptr = td_hob->mem_ptr, + + .current = td_hob->mem_ptr, + .end = td_hob->mem_ptr + td_hob->size, + }; + + EFI_HOB_GENERIC_HEADER *last_hob; + EFI_HOB_HANDOFF_INFO_TABLE *hit; + + /* Note, Efi{Free}Memory{Bottom,Top} are ignored, leave 'em zeroed. */ + hit = tdvf_get_area(&hob, sizeof(*hit)); + *hit = (EFI_HOB_HANDOFF_INFO_TABLE) { + .Header = { + .HobType = EFI_HOB_TYPE_HANDOFF, + .HobLength = cpu_to_le16(sizeof(*hit)), + .Reserved = cpu_to_le32(0), + }, + .Version = cpu_to_le32(EFI_HOB_HANDOFF_TABLE_VERSION), + .BootMode = cpu_to_le32(0), + .EfiMemoryTop = cpu_to_le64(0), + .EfiMemoryBottom = cpu_to_le64(0), + .EfiFreeMemoryTop = cpu_to_le64(0), + .EfiFreeMemoryBottom = cpu_to_le64(0), + .EfiEndOfHobList = cpu_to_le64(0), /* initialized later */ + }; + + tdvf_hob_add_memory_resources(tdx, &hob); + + last_hob = tdvf_get_area(&hob, sizeof(*last_hob)); + *last_hob = (EFI_HOB_GENERIC_HEADER) { + .HobType = EFI_HOB_TYPE_END_OF_HOB_LIST, + .HobLength = cpu_to_le16(sizeof(*last_hob)), + .Reserved = cpu_to_le32(0), + }; + hit->EfiEndOfHobList = tdvf_current_guest_addr(&hob); +} diff --git a/hw/i386/tdvf-hob.h b/hw/i386/tdvf-hob.h new file mode 100644 index 0000000..4fc6a37 --- /dev/null +++ b/hw/i386/tdvf-hob.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef HW_I386_TD_HOB_H +#define HW_I386_TD_HOB_H + +#include "hw/i386/tdvf.h" +#include "target/i386/kvm/tdx.h" + +void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob); + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_TESTED) + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_TESTED) + +#define EFI_RESOURCE_ATTRIBUTE_TDVF_MMIO \ + (EFI_RESOURCE_ATTRIBUTE_PRESENT | \ + EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \ + EFI_RESOURCE_ATTRIBUTE_UNCACHEABLE) + +#endif diff --git a/hw/i386/tdvf.c b/hw/i386/tdvf.c new file mode 100644 index 0000000..645d9d1 --- /dev/null +++ b/hw/i386/tdvf.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2025 Intel Corporation + * Author: Isaku Yamahata <isaku.yamahata at gmail.com> + * <isaku.yamahata at intel.com> + * Xiaoyao Li <xiaoyao.li@intel.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" + +#include "hw/i386/pc.h" +#include "hw/i386/tdvf.h" +#include "system/kvm.h" + +#define TDX_METADATA_OFFSET_GUID "e47a6535-984a-4798-865e-4685a7bf8ec2" +#define TDX_METADATA_VERSION 1 +#define TDVF_SIGNATURE 0x46564454 /* TDVF as little endian */ +#define TDVF_ALIGNMENT 4096 + +/* + * the raw structs read from TDVF keeps the name convention in + * TDVF Design Guide spec. + */ +typedef struct { + uint32_t DataOffset; + uint32_t RawDataSize; + uint64_t MemoryAddress; + uint64_t MemoryDataSize; + uint32_t Type; + uint32_t Attributes; +} TdvfSectionEntry; + +typedef struct { + uint32_t Signature; + uint32_t Length; + uint32_t Version; + uint32_t NumberOfSectionEntries; + TdvfSectionEntry SectionEntries[]; +} TdvfMetadata; + +struct tdx_metadata_offset { + uint32_t offset; +}; + +static TdvfMetadata *tdvf_get_metadata(void *flash_ptr, int size) +{ + TdvfMetadata *metadata; + uint32_t offset = 0; + uint8_t *data; + + if ((uint32_t) size != size) { + return NULL; + } + + if (pc_system_ovmf_table_find(TDX_METADATA_OFFSET_GUID, &data, NULL)) { + offset = size - le32_to_cpu(((struct tdx_metadata_offset *)data)->offset); + + if (offset + sizeof(*metadata) > size) { + return NULL; + } + } else { + error_report("Cannot find TDX_METADATA_OFFSET_GUID"); + return NULL; + } + + metadata = flash_ptr + offset; + + /* Finally, verify the signature to determine if this is a TDVF image. */ + metadata->Signature = le32_to_cpu(metadata->Signature); + if (metadata->Signature != TDVF_SIGNATURE) { + error_report("Invalid TDVF signature in metadata!"); + return NULL; + } + + /* Sanity check that the TDVF doesn't overlap its own metadata. */ + metadata->Length = le32_to_cpu(metadata->Length); + if (offset + metadata->Length > size) { + return NULL; + } + + /* Only version 1 is supported/defined. */ + metadata->Version = le32_to_cpu(metadata->Version); + if (metadata->Version != TDX_METADATA_VERSION) { + return NULL; + } + + return metadata; +} + +static int tdvf_parse_and_check_section_entry(const TdvfSectionEntry *src, + TdxFirmwareEntry *entry) +{ + entry->data_offset = le32_to_cpu(src->DataOffset); + entry->data_len = le32_to_cpu(src->RawDataSize); + entry->address = le64_to_cpu(src->MemoryAddress); + entry->size = le64_to_cpu(src->MemoryDataSize); + entry->type = le32_to_cpu(src->Type); + entry->attributes = le32_to_cpu(src->Attributes); + + /* sanity check */ + if (entry->size < entry->data_len) { + error_report("Broken metadata RawDataSize 0x%x MemoryDataSize 0x%"PRIx64, + entry->data_len, entry->size); + return -1; + } + if (!QEMU_IS_ALIGNED(entry->address, TDVF_ALIGNMENT)) { + error_report("MemoryAddress 0x%"PRIx64" not page aligned", entry->address); + return -1; + } + if (!QEMU_IS_ALIGNED(entry->size, TDVF_ALIGNMENT)) { + error_report("MemoryDataSize 0x%"PRIx64" not page aligned", entry->size); + return -1; + } + + switch (entry->type) { + case TDVF_SECTION_TYPE_BFV: + case TDVF_SECTION_TYPE_CFV: + /* The sections that must be copied from firmware image to TD memory */ + if (entry->data_len == 0) { + error_report("%d section with RawDataSize == 0", entry->type); + return -1; + } + break; + case TDVF_SECTION_TYPE_TD_HOB: + case TDVF_SECTION_TYPE_TEMP_MEM: + /* The sections that no need to be copied from firmware image */ + if (entry->data_len != 0) { + error_report("%d section with RawDataSize 0x%x != 0", + entry->type, entry->data_len); + return -1; + } + break; + default: + error_report("TDVF contains unsupported section type %d", entry->type); + return -1; + } + + return 0; +} + +int tdvf_parse_metadata(TdxFirmware *fw, void *flash_ptr, int size) +{ + g_autofree TdvfSectionEntry *sections = NULL; + TdvfMetadata *metadata; + ssize_t entries_size; + int i; + + metadata = tdvf_get_metadata(flash_ptr, size); + if (!metadata) { + return -EINVAL; + } + + /* load and parse metadata entries */ + fw->nr_entries = le32_to_cpu(metadata->NumberOfSectionEntries); + if (fw->nr_entries < 2) { + error_report("Invalid number of fw entries (%u) in TDVF Metadata", + fw->nr_entries); + return -EINVAL; + } + + entries_size = fw->nr_entries * sizeof(TdvfSectionEntry); + if (metadata->Length != sizeof(*metadata) + entries_size) { + error_report("TDVF metadata len (0x%x) mismatch, expected (0x%x)", + metadata->Length, + (uint32_t)(sizeof(*metadata) + entries_size)); + return -EINVAL; + } + + fw->entries = g_new(TdxFirmwareEntry, fw->nr_entries); + sections = g_new(TdvfSectionEntry, fw->nr_entries); + + memcpy(sections, (void *)metadata + sizeof(*metadata), entries_size); + + for (i = 0; i < fw->nr_entries; i++) { + if (tdvf_parse_and_check_section_entry(§ions[i], &fw->entries[i])) { + goto err; + } + } + + fw->mem_ptr = flash_ptr; + return 0; + +err: + fw->entries = 0; + g_free(fw->entries); + return -EINVAL; +} diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c index 1b0671c..7512be6 100644 --- a/hw/i386/x86-common.c +++ b/hw/i386/x86-common.c @@ -44,6 +44,7 @@ #include "standard-headers/asm-x86/bootparam.h" #include CONFIG_DEVICES #include "kvm/kvm_i386.h" +#include "kvm/tdx.h" #ifdef CONFIG_XEN_EMU #include "hw/xen/xen.h" @@ -951,7 +952,7 @@ void x86_load_linux(X86MachineState *x86ms, * kernel on the other side of the fw_cfg interface matches the hash of the * file the user passed in. */ - if (!sev_enabled() && protocol > 0) { + if (!MACHINE(x86ms)->cgs && protocol > 0) { memcpy(setup, header, MIN(sizeof(header), setup_size)); } @@ -1035,11 +1036,14 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware, if (machine_require_guest_memfd(MACHINE(x86ms))) { memory_region_init_ram_guest_memfd(&x86ms->bios, NULL, "pc.bios", bios_size, &error_fatal); + if (is_tdx_vm()) { + tdx_set_tdvf_region(&x86ms->bios); + } } else { memory_region_init_ram(&x86ms->bios, NULL, "pc.bios", bios_size, &error_fatal); } - if (sev_enabled()) { + if (sev_enabled() || is_tdx_vm()) { /* * The concept of a "reset" simply doesn't exist for * confidential computing guests, we have to destroy and diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index d34a684..c127a44 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -130,6 +130,7 @@ static const Property x86_iommu_properties[] = { intr_supported, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), + DEFINE_PROP_BOOL("dma-translation", X86IOMMUState, dma_translation, true), }; static void x86_iommu_class_init(ObjectClass *klass, const void *data) diff --git a/hw/i386/x86.c b/hw/i386/x86.c index e2d0409..f80533d 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -382,7 +382,6 @@ static void x86_machine_class_init(ObjectClass *oc, const void *data) mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; mc->kvm_type = x86_kvm_type; - x86mc->save_tsc_khz = true; x86mc->fwcfg_dma_enabled = true; nc->nmi_monitor_handler = x86_nmi; |