Diffstat (limited to 'hw/i386')
-rw-r--r--  hw/i386/Kconfig                  |   12
-rw-r--r--  hw/i386/acpi-build.c             |  538
-rw-r--r--  hw/i386/acpi-build.h             |    4
-rw-r--r--  hw/i386/amd_iommu.c              | 1183
-rw-r--r--  hw/i386/amd_iommu.h              |  112
-rw-r--r--  hw/i386/intel_iommu.c            |  539
-rw-r--r--  hw/i386/intel_iommu_internal.h   |   53
-rw-r--r--  hw/i386/isapc.c                  |  189
-rw-r--r--  hw/i386/kvm/apic.c               |    5
-rw-r--r--  hw/i386/kvm/xen-stubs.c          |   13
-rw-r--r--  hw/i386/kvm/xen_evtchn.c         |    2
-rw-r--r--  hw/i386/meson.build              |    2
-rw-r--r--  hw/i386/microvm.c                |    3
-rw-r--r--  hw/i386/monitor.c                |    2
-rw-r--r--  hw/i386/pc.c                     |  175
-rw-r--r--  hw/i386/pc_piix.c                |  311
-rw-r--r--  hw/i386/pc_q35.c                 |   53
-rw-r--r--  hw/i386/pc_sysfw.c               |   38
-rw-r--r--  hw/i386/sgx-stub.c               |    6
-rw-r--r--  hw/i386/sgx.c                    |   34
-rw-r--r--  hw/i386/tdvf-hob.c               |  130
-rw-r--r--  hw/i386/tdvf-hob.h               |   26
-rw-r--r--  hw/i386/tdvf.c                   |  189
-rw-r--r--  hw/i386/x86-common.c             |    8
-rw-r--r--  hw/i386/x86-iommu.c              |    1
-rw-r--r--  hw/i386/x86.c                    |    1
26 files changed, 2462 insertions, 1167 deletions
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index d34ce07..6a0ab54 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -4,12 +4,17 @@ config X86_FW_OVMF
config SEV
bool
select X86_FW_OVMF
- depends on KVM
+ depends on KVM && X86_64
config SGX
bool
depends on KVM
+config TDX
+ bool
+ select X86_FW_OVMF
+ depends on KVM && X86_64
+
config PC
bool
imply APPLESMC
@@ -26,6 +31,7 @@ config PC
imply QXL
imply SEV
imply SGX
+ imply TDX
imply TEST_DEVICES
imply TPM_CRB
imply TPM_TIS_ISA
@@ -90,9 +96,6 @@ config ISAPC
select ISA_BUS
select PC
select IDE_ISA
- # FIXME: it is in the same file as i440fx, and does not compile
- # if separated
- depends on I440FX
config Q35
bool
@@ -125,6 +128,7 @@ config MICROVM
select I8259
select MC146818RTC
select VIRTIO_MMIO
+ select ACPI_PCI
select ACPI_HW_REDUCED
select PCI_EXPRESS_GENERIC_BRIDGE
select USB_XHCI_SYSBUS
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 61851cc..9446a9f 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -338,405 +338,6 @@ build_facs(GArray *table_data)
g_array_append_vals(table_data, reserved, 40); /* Reserved */
}
-Aml *aml_pci_device_dsm(void)
-{
- Aml *method;
-
- method = aml_method("_DSM", 4, AML_SERIALIZED);
- {
- Aml *params = aml_local(0);
- Aml *pkg = aml_package(2);
- aml_append(pkg, aml_int(0));
- aml_append(pkg, aml_int(0));
- aml_append(method, aml_store(pkg, params));
- aml_append(method,
- aml_store(aml_name("BSEL"), aml_index(params, aml_int(0))));
- aml_append(method,
- aml_store(aml_name("ASUN"), aml_index(params, aml_int(1))));
- aml_append(method,
- aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1),
- aml_arg(2), aml_arg(3), params))
- );
- }
- return method;
-}
-
-static void build_append_pci_dsm_func0_common(Aml *ctx, Aml *retvar)
-{
- Aml *UUID, *ifctx1;
- uint8_t byte_list[1] = { 0 }; /* nothing supported yet */
-
- aml_append(ctx, aml_store(aml_buffer(1, byte_list), retvar));
- /*
- * PCI Firmware Specification 3.1
- * 4.6. _DSM Definitions for PCI
- */
- UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D");
- ifctx1 = aml_if(aml_lnot(aml_equal(aml_arg(0), UUID)));
- {
- /* call is for unsupported UUID, bail out */
- aml_append(ifctx1, aml_return(retvar));
- }
- aml_append(ctx, ifctx1);
-
- ifctx1 = aml_if(aml_lless(aml_arg(1), aml_int(2)));
- {
- /* call is for unsupported REV, bail out */
- aml_append(ifctx1, aml_return(retvar));
- }
- aml_append(ctx, ifctx1);
-}
-
-static Aml *aml_pci_edsm(void)
-{
- Aml *method, *ifctx;
- Aml *zero = aml_int(0);
- Aml *func = aml_arg(2);
- Aml *ret = aml_local(0);
- Aml *aidx = aml_local(1);
- Aml *params = aml_arg(4);
-
- method = aml_method("EDSM", 5, AML_SERIALIZED);
-
- /* get supported functions */
- ifctx = aml_if(aml_equal(func, zero));
- {
- /* 1: have supported functions */
- /* 7: support for function 7 */
- const uint8_t caps = 1 | BIT(7);
- build_append_pci_dsm_func0_common(ifctx, ret);
- aml_append(ifctx, aml_store(aml_int(caps), aml_index(ret, zero)));
- aml_append(ifctx, aml_return(ret));
- }
- aml_append(method, ifctx);
-
- /* handle specific functions requests */
- /*
- * PCI Firmware Specification 3.1
- * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under
- * Operating Systems
- */
- ifctx = aml_if(aml_equal(func, aml_int(7)));
- {
- Aml *pkg = aml_package(2);
- aml_append(pkg, zero);
- /* optional, if not impl. should return null string */
- aml_append(pkg, aml_string("%s", ""));
- aml_append(ifctx, aml_store(pkg, ret));
-
- /*
- * IASL is fine when initializing Package with computational data,
- * however it makes guest unhappy /it fails to process such AML/.
- * So use runtime assignment to set acpi-index after initializer
- * to make OSPM happy.
- */
- aml_append(ifctx,
- aml_store(aml_derefof(aml_index(params, aml_int(0))), aidx));
- aml_append(ifctx, aml_store(aidx, aml_index(ret, zero)));
- aml_append(ifctx, aml_return(ret));
- }
- aml_append(method, ifctx);
-
- return method;
-}
-
-static Aml *aml_pci_static_endpoint_dsm(PCIDevice *pdev)
-{
- Aml *method;
-
- g_assert(pdev->acpi_index != 0);
- method = aml_method("_DSM", 4, AML_SERIALIZED);
- {
- Aml *params = aml_local(0);
- Aml *pkg = aml_package(1);
- aml_append(pkg, aml_int(pdev->acpi_index));
- aml_append(method, aml_store(pkg, params));
- aml_append(method,
- aml_return(aml_call5("EDSM", aml_arg(0), aml_arg(1),
- aml_arg(2), aml_arg(3), params))
- );
- }
- return method;
-}
-
-static void build_append_pcihp_notify_entry(Aml *method, int slot)
-{
- Aml *if_ctx;
- int32_t devfn = PCI_DEVFN(slot, 0);
-
- if_ctx = aml_if(aml_and(aml_arg(0), aml_int(0x1U << slot), NULL));
- aml_append(if_ctx, aml_notify(aml_name("S%.02X", devfn), aml_arg(1)));
- aml_append(method, if_ctx);
-}
-
-static bool is_devfn_ignored_generic(const int devfn, const PCIBus *bus)
-{
- const PCIDevice *pdev = bus->devices[devfn];
-
- if (PCI_FUNC(devfn)) {
- if (IS_PCI_BRIDGE(pdev)) {
- /*
- * Ignore only hotplugged PCI bridges on !0 functions, but
- * allow describing cold plugged bridges on all functions
- */
- if (DEVICE(pdev)->hotplugged) {
- return true;
- }
- }
- }
- return false;
-}
-
-static bool is_devfn_ignored_hotplug(const int devfn, const PCIBus *bus)
-{
- PCIDevice *pdev = bus->devices[devfn];
- if (pdev) {
- return is_devfn_ignored_generic(devfn, bus) ||
- !DEVICE_GET_CLASS(pdev)->hotpluggable ||
- /* Cold plugged bridges aren't themselves hot-pluggable */
- (IS_PCI_BRIDGE(pdev) && !DEVICE(pdev)->hotplugged);
- } else { /* non populated slots */
- /*
- * hotplug is supported only for non-multifunction device
- * so generate device description only for function 0
- */
- if (PCI_FUNC(devfn) ||
- (pci_bus_is_express(bus) && PCI_SLOT(devfn) > 0)) {
- return true;
- }
- }
- return false;
-}
-
-void build_append_pcihp_slots(Aml *parent_scope, PCIBus *bus)
-{
- int devfn;
- Aml *dev, *notify_method = NULL, *method;
- QObject *bsel = object_property_get_qobject(OBJECT(bus),
- ACPI_PCIHP_PROP_BSEL, NULL);
- uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel));
- qobject_unref(bsel);
-
- aml_append(parent_scope, aml_name_decl("BSEL", aml_int(bsel_val)));
- notify_method = aml_method("DVNT", 2, AML_NOTSERIALIZED);
-
- for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
- int slot = PCI_SLOT(devfn);
- int adr = slot << 16 | PCI_FUNC(devfn);
-
- if (is_devfn_ignored_hotplug(devfn, bus)) {
- continue;
- }
-
- if (bus->devices[devfn]) {
- dev = aml_scope("S%.02X", devfn);
- } else {
- dev = aml_device("S%.02X", devfn);
- aml_append(dev, aml_name_decl("_ADR", aml_int(adr)));
- }
-
- /*
- * Can't declare _SUN here for every device as it changes 'slot'
- * enumeration order in linux kernel, so use another variable for it
- */
- aml_append(dev, aml_name_decl("ASUN", aml_int(slot)));
- aml_append(dev, aml_pci_device_dsm());
-
- aml_append(dev, aml_name_decl("_SUN", aml_int(slot)));
- /* add _EJ0 to make slot hotpluggable */
- method = aml_method("_EJ0", 1, AML_NOTSERIALIZED);
- aml_append(method,
- aml_call2("PCEJ", aml_name("BSEL"), aml_name("_SUN"))
- );
- aml_append(dev, method);
-
- build_append_pcihp_notify_entry(notify_method, slot);
-
- /* device descriptor has been composed, add it into parent context */
- aml_append(parent_scope, dev);
- }
- aml_append(parent_scope, notify_method);
-}
-
-void build_append_pci_bus_devices(Aml *parent_scope, PCIBus *bus)
-{
- int devfn;
- Aml *dev;
-
- for (devfn = 0; devfn < ARRAY_SIZE(bus->devices); devfn++) {
- /* ACPI spec: 1.0b: Table 6-2 _ADR Object Bus Types, PCI type */
- int adr = PCI_SLOT(devfn) << 16 | PCI_FUNC(devfn);
- PCIDevice *pdev = bus->devices[devfn];
-
- if (!pdev || is_devfn_ignored_generic(devfn, bus)) {
- continue;
- }
-
- /* start to compose PCI device descriptor */
- dev = aml_device("S%.02X", devfn);
- aml_append(dev, aml_name_decl("_ADR", aml_int(adr)));
-
- call_dev_aml_func(DEVICE(bus->devices[devfn]), dev);
- /* add _DSM if device has acpi-index set */
- if (pdev->acpi_index &&
- !object_property_get_bool(OBJECT(pdev), "hotpluggable",
- &error_abort)) {
- aml_append(dev, aml_pci_static_endpoint_dsm(pdev));
- }
-
- /* device descriptor has been composed, add it into parent context */
- aml_append(parent_scope, dev);
- }
-}
-
-static bool build_append_notification_callback(Aml *parent_scope,
- const PCIBus *bus)
-{
- Aml *method;
- PCIBus *sec;
- QObject *bsel;
- int nr_notifiers = 0;
- GQueue *pcnt_bus_list = g_queue_new();
-
- QLIST_FOREACH(sec, &bus->child, sibling) {
- Aml *br_scope = aml_scope("S%.02X", sec->parent_dev->devfn);
- if (pci_bus_is_root(sec)) {
- continue;
- }
- nr_notifiers = nr_notifiers +
- build_append_notification_callback(br_scope, sec);
- /*
- * add new child scope to parent
- * and keep track of bus that have PCNT,
- * bus list is used later to call children PCNTs from this level PCNT
- */
- if (nr_notifiers) {
- g_queue_push_tail(pcnt_bus_list, sec);
- aml_append(parent_scope, br_scope);
- }
- }
-
- /*
- * Append PCNT method to notify about events on local and child buses.
- * ps: hostbridge might not have hotplug (bsel) enabled but might have
- * child bridges that do have bsel.
- */
- method = aml_method("PCNT", 0, AML_NOTSERIALIZED);
-
- /* If bus supports hotplug select it and notify about local events */
- bsel = object_property_get_qobject(OBJECT(bus), ACPI_PCIHP_PROP_BSEL, NULL);
- if (bsel) {
- uint64_t bsel_val = qnum_get_uint(qobject_to(QNum, bsel));
-
- aml_append(method, aml_store(aml_int(bsel_val), aml_name("BNUM")));
- aml_append(method, aml_call2("DVNT", aml_name("PCIU"),
- aml_int(1))); /* Device Check */
- aml_append(method, aml_call2("DVNT", aml_name("PCID"),
- aml_int(3))); /* Eject Request */
- nr_notifiers++;
- }
-
- /* Notify about child bus events in any case */
- while ((sec = g_queue_pop_head(pcnt_bus_list))) {
- aml_append(method, aml_name("^S%.02X.PCNT", sec->parent_dev->devfn));
- }
-
- aml_append(parent_scope, method);
- qobject_unref(bsel);
- g_queue_free(pcnt_bus_list);
- return !!nr_notifiers;
-}
-
-static Aml *aml_pci_pdsm(void)
-{
- Aml *method, *ifctx, *ifctx1;
- Aml *ret = aml_local(0);
- Aml *caps = aml_local(1);
- Aml *acpi_index = aml_local(2);
- Aml *zero = aml_int(0);
- Aml *one = aml_int(1);
- Aml *not_supp = aml_int(0xFFFFFFFF);
- Aml *func = aml_arg(2);
- Aml *params = aml_arg(4);
- Aml *bnum = aml_derefof(aml_index(params, aml_int(0)));
- Aml *sunum = aml_derefof(aml_index(params, aml_int(1)));
-
- method = aml_method("PDSM", 5, AML_SERIALIZED);
-
- /* get supported functions */
- ifctx = aml_if(aml_equal(func, zero));
- {
- build_append_pci_dsm_func0_common(ifctx, ret);
-
- aml_append(ifctx, aml_store(zero, caps));
- aml_append(ifctx,
- aml_store(aml_call2("AIDX", bnum, sunum), acpi_index));
- /*
- * advertise function 7 if device has acpi-index
- * acpi_index values:
- * 0: not present (default value)
- * FFFFFFFF: not supported (old QEMU without PIDX reg)
- * other: device's acpi-index
- */
- ifctx1 = aml_if(aml_lnot(
- aml_or(aml_equal(acpi_index, zero),
- aml_equal(acpi_index, not_supp), NULL)
- ));
- {
- /* have supported functions */
- aml_append(ifctx1, aml_or(caps, one, caps));
- /* support for function 7 */
- aml_append(ifctx1,
- aml_or(caps, aml_shiftleft(one, aml_int(7)), caps));
- }
- aml_append(ifctx, ifctx1);
-
- aml_append(ifctx, aml_store(caps, aml_index(ret, zero)));
- aml_append(ifctx, aml_return(ret));
- }
- aml_append(method, ifctx);
-
- /* handle specific functions requests */
- /*
- * PCI Firmware Specification 3.1
- * 4.6.7. _DSM for Naming a PCI or PCI Express Device Under
- * Operating Systems
- */
- ifctx = aml_if(aml_equal(func, aml_int(7)));
- {
- Aml *pkg = aml_package(2);
-
- aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sunum), acpi_index));
- aml_append(ifctx, aml_store(pkg, ret));
- /*
- * Windows calls func=7 without checking if it's available,
- * as workaround Microsoft has suggested to return invalid for func7
- * Package, so return 2 elements package but only initialize elements
- * when acpi_index is supported and leave them uninitialized, which
- * leads elements to being Uninitialized ObjectType and should trip
- * Windows into discarding result as an unexpected and prevent setting
- * bogus 'PCI Label' on the device.
- */
- ifctx1 = aml_if(aml_lnot(aml_lor(
- aml_equal(acpi_index, zero), aml_equal(acpi_index, not_supp)
- )));
- {
- aml_append(ifctx1, aml_store(acpi_index, aml_index(ret, zero)));
- /*
- * optional, if not impl. should return null string
- */
- aml_append(ifctx1, aml_store(aml_string("%s", ""),
- aml_index(ret, one)));
- }
- aml_append(ifctx, ifctx1);
-
- aml_append(ifctx, aml_return(ret));
- }
-
- aml_append(method, ifctx);
- return method;
-}
-
/*
* build_prt - Define interrupt routing rules
*
@@ -1227,112 +828,6 @@ static Aml *build_q35_dram_controller(const AcpiMcfgInfo *mcfg)
return dev;
}
-static void build_x86_acpi_pci_hotplug(Aml *table, uint64_t pcihp_addr)
-{
- Aml *scope;
- Aml *field;
- Aml *method;
-
- scope = aml_scope("_SB.PCI0");
-
- aml_append(scope,
- aml_operation_region("PCST", AML_SYSTEM_IO, aml_int(pcihp_addr), 0x08));
- field = aml_field("PCST", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS);
- aml_append(field, aml_named_field("PCIU", 32));
- aml_append(field, aml_named_field("PCID", 32));
- aml_append(scope, field);
-
- aml_append(scope,
- aml_operation_region("SEJ", AML_SYSTEM_IO,
- aml_int(pcihp_addr + ACPI_PCIHP_SEJ_BASE), 0x04));
- field = aml_field("SEJ", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS);
- aml_append(field, aml_named_field("B0EJ", 32));
- aml_append(scope, field);
-
- aml_append(scope,
- aml_operation_region("BNMR", AML_SYSTEM_IO,
- aml_int(pcihp_addr + ACPI_PCIHP_BNMR_BASE), 0x08));
- field = aml_field("BNMR", AML_DWORD_ACC, AML_NOLOCK, AML_WRITE_AS_ZEROS);
- aml_append(field, aml_named_field("BNUM", 32));
- aml_append(field, aml_named_field("PIDX", 32));
- aml_append(scope, field);
-
- aml_append(scope, aml_mutex("BLCK", 0));
-
- method = aml_method("PCEJ", 2, AML_NOTSERIALIZED);
- aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF));
- aml_append(method, aml_store(aml_arg(0), aml_name("BNUM")));
- aml_append(method,
- aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("B0EJ")));
- aml_append(method, aml_release(aml_name("BLCK")));
- aml_append(method, aml_return(aml_int(0)));
- aml_append(scope, method);
-
- method = aml_method("AIDX", 2, AML_NOTSERIALIZED);
- aml_append(method, aml_acquire(aml_name("BLCK"), 0xFFFF));
- aml_append(method, aml_store(aml_arg(0), aml_name("BNUM")));
- aml_append(method,
- aml_store(aml_shiftleft(aml_int(1), aml_arg(1)), aml_name("PIDX")));
- aml_append(method, aml_store(aml_name("PIDX"), aml_local(0)));
- aml_append(method, aml_release(aml_name("BLCK")));
- aml_append(method, aml_return(aml_local(0)));
- aml_append(scope, method);
-
- aml_append(scope, aml_pci_pdsm());
-
- aml_append(table, scope);
-}
-
-static Aml *build_q35_osc_method(bool enable_native_pcie_hotplug)
-{
- Aml *if_ctx;
- Aml *if_ctx2;
- Aml *else_ctx;
- Aml *method;
- Aml *a_cwd1 = aml_name("CDW1");
- Aml *a_ctrl = aml_local(0);
-
- method = aml_method("_OSC", 4, AML_NOTSERIALIZED);
- aml_append(method, aml_create_dword_field(aml_arg(3), aml_int(0), "CDW1"));
-
- if_ctx = aml_if(aml_equal(
- aml_arg(0), aml_touuid("33DB4D5B-1FF7-401C-9657-7441C03DD766")));
- aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(4), "CDW2"));
- aml_append(if_ctx, aml_create_dword_field(aml_arg(3), aml_int(8), "CDW3"));
-
- aml_append(if_ctx, aml_store(aml_name("CDW3"), a_ctrl));
-
- /*
- * Always allow native PME, AER (no dependencies)
- * Allow SHPC (PCI bridges can have SHPC controller)
- * Disable PCIe Native Hot-plug if ACPI PCI Hot-plug is enabled.
- */
- aml_append(if_ctx, aml_and(a_ctrl,
- aml_int(0x1E | (enable_native_pcie_hotplug ? 0x1 : 0x0)), a_ctrl));
-
- if_ctx2 = aml_if(aml_lnot(aml_equal(aml_arg(1), aml_int(1))));
- /* Unknown revision */
- aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x08), a_cwd1));
- aml_append(if_ctx, if_ctx2);
-
- if_ctx2 = aml_if(aml_lnot(aml_equal(aml_name("CDW3"), a_ctrl)));
- /* Capabilities bits were masked */
- aml_append(if_ctx2, aml_or(a_cwd1, aml_int(0x10), a_cwd1));
- aml_append(if_ctx, if_ctx2);
-
- /* Update DWORD3 in the buffer */
- aml_append(if_ctx, aml_store(a_ctrl, aml_name("CDW3")));
- aml_append(method, if_ctx);
-
- else_ctx = aml_else();
- /* Unrecognized UUID */
- aml_append(else_ctx, aml_or(a_cwd1, aml_int(4), a_cwd1));
- aml_append(method, else_ctx);
-
- aml_append(method, aml_return(aml_arg(3)));
- return method;
-}
-
static void build_acpi0017(Aml *table)
{
Aml *dev, *scope, *method;
@@ -1389,12 +884,12 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
dev = aml_device("PCI0");
aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03")));
aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid)));
- aml_append(dev, aml_pci_edsm());
+ aml_append(dev, build_pci_bridge_edsm());
aml_append(sb_scope, dev);
aml_append(dsdt, sb_scope);
if (pm->pcihp_bridge_en || pm->pcihp_root_en) {
- build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base);
+ build_acpi_pci_hotplug(dsdt, AML_SYSTEM_IO, pm->pcihp_io_base);
}
build_piix4_pci0_int(dsdt);
} else if (q35) {
@@ -1403,8 +898,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A08")));
aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03")));
aml_append(dev, aml_name_decl("_UID", aml_int(pcmc->pci_root_uid)));
- aml_append(dev, build_q35_osc_method(!pm->pcihp_bridge_en));
- aml_append(dev, aml_pci_edsm());
+ aml_append(dev, build_pci_host_bridge_osc_method(!pm->pcihp_bridge_en));
+ aml_append(dev, build_pci_bridge_edsm());
aml_append(sb_scope, dev);
if (mcfg_valid) {
aml_append(sb_scope, build_q35_dram_controller(&mcfg));
@@ -1438,7 +933,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
aml_append(dsdt, sb_scope);
if (pm->pcihp_bridge_en) {
- build_x86_acpi_pci_hotplug(dsdt, pm->pcihp_io_base);
+ build_acpi_pci_hotplug(dsdt, AML_SYSTEM_IO, pm->pcihp_io_base);
}
build_q35_pci0_int(dsdt);
}
@@ -1525,7 +1020,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
aml_append(dev, aml_name_decl("_CID", aml_eisaid("PNP0A03")));
/* Expander bridges do not have ACPI PCI Hot-plug enabled */
- aml_append(dev, build_q35_osc_method(true));
+ aml_append(dev, build_pci_host_bridge_osc_method(true));
} else {
aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0A03")));
}
@@ -1654,19 +1149,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
/* reserve PCIHP resources */
if (pm->pcihp_io_len && (pm->pcihp_bridge_en || pm->pcihp_root_en)) {
- dev = aml_device("PHPR");
- aml_append(dev, aml_name_decl("_HID", aml_string("PNP0A06")));
- aml_append(dev,
- aml_name_decl("_UID", aml_string("PCI Hotplug resources")));
- /* device present, functioning, decoding, not shown in UI */
- aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
- crs = aml_resource_template();
- aml_append(crs,
- aml_io(AML_DECODE16, pm->pcihp_io_base, pm->pcihp_io_base, 1,
- pm->pcihp_io_len)
- );
- aml_append(dev, aml_name_decl("_CRS", crs));
- aml_append(scope, dev);
+ build_append_pcihp_resources(scope,
+ pm->pcihp_io_base, pm->pcihp_io_len);
}
aml_append(dsdt, scope);
@@ -2379,7 +1863,11 @@ build_amd_iommu(GArray *table_data, BIOSLinker *linker, const char *oem_id,
/* IOMMU info */
build_append_int_noprefix(table_data, 0, 2);
/* IOMMU Attributes */
- build_append_int_noprefix(table_data, 0, 4);
+ if (!s->iommu.dma_translation) {
+ build_append_int_noprefix(table_data, (1UL << 0) /* HATDis */, 4);
+ } else {
+ build_append_int_noprefix(table_data, 0, 4);
+ }
/* EFR Register Image */
build_append_int_noprefix(table_data,
amdvi_extended_feature_register(s),
diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h
index 275ec05..8ba3c33 100644
--- a/hw/i386/acpi-build.h
+++ b/hw/i386/acpi-build.h
@@ -5,10 +5,6 @@
extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio;
-/* PCI Hot-plug registers' base. See docs/specs/acpi_pci_hotplug.rst */
-#define ACPI_PCIHP_SEJ_BASE 0x8
-#define ACPI_PCIHP_BNMR_BASE 0x10
-
void acpi_setup(void);
Object *acpi_get_i386_pci_host(void);
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 0775c8f..378e0cb 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -33,6 +33,7 @@
#include "hw/i386/apic-msidef.h"
#include "hw/qdev-properties.h"
#include "kvm/kvm_i386.h"
+#include "qemu/iova-tree.h"
/* used AMD-Vi MMIO registers */
const char *amdvi_mmio_low[] = {
@@ -66,6 +67,15 @@ struct AMDVIAddressSpace {
MemoryRegion iommu_nodma; /* Alias of shared nodma memory region */
MemoryRegion iommu_ir; /* Device's interrupt remapping region */
AddressSpace as; /* device's corresponding address space */
+
+ /* DMA address translation support */
+ IOMMUNotifierFlag notifier_flags;
+ /* entry in list of Address spaces with registered notifiers */
+ QLIST_ENTRY(AMDVIAddressSpace) next;
+ /* Record DMA translation ranges */
+ IOVATree *iova_tree;
+ /* DMA address translation active */
+ bool addr_translation;
};
/* AMDVI cache entry */
@@ -77,12 +87,29 @@ typedef struct AMDVIIOTLBEntry {
uint64_t page_mask; /* physical page size */
} AMDVIIOTLBEntry;
+/*
+ * These 'fault' reasons have an overloaded meaning since they are not only
+ * intended for describing reasons that generate an IO_PAGE_FAULT as per the AMD
+ * IOMMU specification, but are also used to signal internal errors in the
+ * emulation code.
+ */
+typedef enum AMDVIFaultReason {
+ AMDVI_FR_DTE_RTR_ERR = 1, /* Failure to retrieve DTE */
+ AMDVI_FR_DTE_V, /* DTE[V] = 0 */
+ AMDVI_FR_DTE_TV, /* DTE[TV] = 0 */
+ AMDVI_FR_PT_ROOT_INV, /* Page Table Root ptr invalid */
+ AMDVI_FR_PT_ENTRY_INV, /* Failure to read PTE from guest memory */
+} AMDVIFaultReason;
+
uint64_t amdvi_extended_feature_register(AMDVIState *s)
{
uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES;
if (s->xtsup) {
feature |= AMDVI_FEATURE_XT;
}
+ if (!s->iommu.dma_translation) {
+ feature |= AMDVI_HATS_MODE_RESERVED;
+ }
return feature;
}
@@ -123,8 +150,13 @@ static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val)
uint16_t romask = lduw_le_p(&s->romask[addr]);
uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]);
uint16_t oldval = lduw_le_p(&s->mmior[addr]);
+
+ uint16_t oldval_preserved = oldval & (romask | w1cmask);
+ uint16_t newval_write = val & ~romask;
+ uint16_t newval_w1c_set = val & w1cmask;
+
stw_le_p(&s->mmior[addr],
- ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
+ (oldval_preserved | newval_write) & ~newval_w1c_set);
}
static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val)
@@ -132,23 +164,33 @@ static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val)
uint32_t romask = ldl_le_p(&s->romask[addr]);
uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
uint32_t oldval = ldl_le_p(&s->mmior[addr]);
+
+ uint32_t oldval_preserved = oldval & (romask | w1cmask);
+ uint32_t newval_write = val & ~romask;
+ uint32_t newval_w1c_set = val & w1cmask;
+
stl_le_p(&s->mmior[addr],
- ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
+ (oldval_preserved | newval_write) & ~newval_w1c_set);
}
static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val)
{
uint64_t romask = ldq_le_p(&s->romask[addr]);
uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
- uint32_t oldval = ldq_le_p(&s->mmior[addr]);
+ uint64_t oldval = ldq_le_p(&s->mmior[addr]);
+
+ uint64_t oldval_preserved = oldval & (romask | w1cmask);
+ uint64_t newval_write = val & ~romask;
+ uint64_t newval_w1c_set = val & w1cmask;
+
stq_le_p(&s->mmior[addr],
- ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
+ (oldval_preserved | newval_write) & ~newval_w1c_set);
}
-/* OR a 64-bit register with a 64-bit value */
+/* Test whether any of the given mask bits are set in a 64-bit register */
static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val)
{
- return amdvi_readq(s, addr) | val;
+ return amdvi_readq(s, addr) & val;
}
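
The refactored helpers above spell out the masked-write semantics: read-only bits keep their old value, write-1-to-clear bits survive unless the guest writes 1 to them, and everything else takes the written value. A minimal standalone model of that logic (illustrative names, not QEMU API):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t masked_write(uint64_t oldval, uint64_t val,
                                 uint64_t romask, uint64_t w1cmask)
    {
        uint64_t oldval_preserved = oldval & (romask | w1cmask);
        uint64_t newval_write     = val & ~romask;
        uint64_t newval_w1c_set   = val & w1cmask;

        return (oldval_preserved | newval_write) & ~newval_w1c_set;
    }

    int main(void)
    {
        /* bit 0 read-only, bit 1 write-1-to-clear, bit 2 plain read/write;
         * the W1C bit is also in romask so it cannot be set by a write */
        uint64_t romask = 0x3, w1cmask = 0x2;

        /* writing 0 leaves RO/W1C state alone but clears the RW bit */
        assert(masked_write(0x7, 0x0, romask, w1cmask) == 0x3);

        /* writing 1 to the W1C bit clears it; the RO bit survives */
        assert(masked_write(0x7, 0x2, romask, w1cmask) == 0x1);
        return 0;
    }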
/* OR a 64-bit register with a 64-bit value storing result in the register */
@@ -177,19 +219,31 @@ static void amdvi_generate_msi_interrupt(AMDVIState *s)
}
}
+static uint32_t get_next_eventlog_entry(AMDVIState *s)
+{
+ uint32_t evtlog_size = s->evtlog_len * AMDVI_EVENT_LEN;
+ return (s->evtlog_tail + AMDVI_EVENT_LEN) % evtlog_size;
+}
+
static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
{
+ uint32_t evtlog_tail_next;
+
/* event logging not enabled */
if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS,
AMDVI_MMIO_STATUS_EVT_OVF)) {
return;
}
+ evtlog_tail_next = get_next_eventlog_entry(s);
+
/* event log buffer full */
- if (s->evtlog_tail >= s->evtlog_len) {
- amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
- /* generate interrupt */
- amdvi_generate_msi_interrupt(s);
+ if (evtlog_tail_next == s->evtlog_head) {
+ /* generate overflow interrupt */
+ if (s->evtlog_intr) {
+ amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
+ amdvi_generate_msi_interrupt(s);
+ }
return;
}
@@ -198,9 +252,13 @@ static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail);
}
- s->evtlog_tail += AMDVI_EVENT_LEN;
- amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
- amdvi_generate_msi_interrupt(s);
+ s->evtlog_tail = evtlog_tail_next;
+ amdvi_writeq_raw(s, AMDVI_MMIO_EVENT_TAIL, s->evtlog_tail);
+
+ if (s->evtlog_intr) {
+ amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVENT_INT);
+ amdvi_generate_msi_interrupt(s);
+ }
}
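
With this change the event log is handled as a true ring buffer: the tail advances modulo the log size, and the log counts as full when the next tail slot would collide with the head. A standalone sketch of the arithmetic, assuming the 16-byte event entries behind AMDVI_EVENT_LEN:

    #include <stdint.h>
    #include <stdio.h>

    enum { EVENT_LEN = 16 };                 /* bytes per event entry */

    static uint32_t next_entry(uint32_t tail, uint32_t log_bytes)
    {
        return (tail + EVENT_LEN) % log_bytes;
    }

    int main(void)
    {
        uint32_t log_bytes = 256 * EVENT_LEN;    /* hypothetical log size */
        uint32_t head = 0, tail = 255 * EVENT_LEN;

        if (next_entry(tail, log_bytes) == head) {
            puts("log full: set EVT_OVF, raise the interrupt, drop entry");
        }
        return 0;
    }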
static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start,
@@ -407,18 +465,704 @@ static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd)
trace_amdvi_completion_wait(addr, data);
}
+static inline uint64_t amdvi_get_perms(uint64_t entry)
+{
+ return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
+ AMDVI_DEV_PERM_SHIFT;
+}
+
+/* validate that reserved bits are honoured */
+static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
+ uint64_t *dte)
+{
+
+ uint64_t root;
+
+ if ((dte[0] & AMDVI_DTE_QUAD0_RESERVED) ||
+ (dte[1] & AMDVI_DTE_QUAD1_RESERVED) ||
+ (dte[2] & AMDVI_DTE_QUAD2_RESERVED) ||
+ (dte[3] & AMDVI_DTE_QUAD3_RESERVED)) {
+ amdvi_log_illegaldevtab_error(s, devid,
+ s->devtab +
+ devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
+ return false;
+ }
+
+ /*
+ * 1 = Host Address Translation is not supported. Value in MMIO Offset
+ * 0030h[HATS] is not meaningful. A non-zero host page table root pointer
+ * in the DTE would result in an ILLEGAL_DEV_TABLE_ENTRY event.
+ */
+ root = (dte[0] & AMDVI_DEV_PT_ROOT_MASK) >> 12;
+ if (root && !s->iommu.dma_translation) {
+ amdvi_log_illegaldevtab_error(s, devid,
+ s->devtab +
+ devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
+ return false;
+ }
+
+ return true;
+}
+
+/* get a device table entry given the devid */
+static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
+{
+ uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;
+
+ if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
+ AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) {
+ trace_amdvi_dte_get_fail(s->devtab, offset);
+ /* log error accessing dte */
+ amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
+ return false;
+ }
+
+ *entry = le64_to_cpu(*entry);
+ if (!amdvi_validate_dte(s, devid, entry)) {
+ trace_amdvi_invalid_dte(entry[0]);
+ return false;
+ }
+
+ return true;
+}
+
+/* get pte translation mode */
+static inline uint8_t get_pte_translation_mode(uint64_t pte)
+{
+ return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
+}
+
+static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
+ uint16_t devid)
+{
+ uint64_t pte;
+
+ if (dma_memory_read(&address_space_memory, pte_addr,
+ &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) {
+ trace_amdvi_get_pte_hwerror(pte_addr);
+ amdvi_log_pagetab_error(s, devid, pte_addr, 0);
+ pte = (uint64_t)-1;
+ return pte;
+ }
+
+ pte = le64_to_cpu(pte);
+ return pte;
+}
+
+static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte)
+{
+ uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
+ AMDVIState *s = as->iommu_state;
+
+ if (!amdvi_get_dte(s, devid, dte)) {
+ /* Unable to retrieve DTE for devid */
+ return -AMDVI_FR_DTE_RTR_ERR;
+ }
+
+ if (!(dte[0] & AMDVI_DEV_VALID)) {
+ /* DTE[V] not set, address is passed untranslated for devid */
+ return -AMDVI_FR_DTE_V;
+ }
+
+ if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) {
+ /* DTE[TV] not set, host page table not valid for devid */
+ return -AMDVI_FR_DTE_TV;
+ }
+ return 0;
+}
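
For orientation, the checks in amdvi_as_to_dte() operate on the first quadword of the 256-bit DTE; the bit positions below follow the AMD IOMMU specification, and the snippet is only an illustrative usage sketch:

    uint64_t dte[4] = { 0 };    /* four little-endian quadwords */

    /*
     * dte[0] bit 0      V    - entry valid
     * dte[0] bit 1      TV   - translation information valid
     * dte[0] bits 11:9  Mode - page-table depth, 0 means passthrough
     * dte[0] bits 51:12 Page Table Root Pointer
     */
    if (amdvi_as_to_dte(as, dte) == 0) {
        /* V=1 and TV=1: dte[0] carries a usable page-table root */
    }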
+
+/*
+ * For a PTE encoding a large page, return the page size it encodes as described
+ * by the AMD IOMMU Specification Table 14: Example Page Size Encodings.
+ * No need to adjust the value of the PTE to point to the first PTE in the large
+ * page since the encoding guarantees all "base" PTEs in the large page are the
+ * same.
+ */
+static uint64_t large_pte_page_size(uint64_t pte)
+{
+ assert(PTE_NEXT_LEVEL(pte) == 7);
+
+ /* Determine size of the large/contiguous page encoded in the PTE */
+ return PTE_LARGE_PAGE_SIZE(pte);
+}
+
+/*
+ * Helper function to fetch a PTE using AMD v1 pgtable format.
+ * On successful page walk, returns 0 and pte parameter points to a valid PTE.
+ * On failure, returns:
+ * -AMDVI_FR_PT_ROOT_INV: A page walk is not possible due to conditions like DTE
+ * with invalid permissions, Page Table Root can not be read from DTE, or a
+ * larger IOVA than supported by page table level encoded in DTE[Mode].
+ * -AMDVI_FR_PT_ENTRY_INV: A PTE could not be read from guest memory during a
+ * page table walk. This means that the DTE has valid data, but one of the
+ * lower level entries in the Page Table could not be read.
+ */
+static uint64_t fetch_pte(AMDVIAddressSpace *as, hwaddr address, uint64_t dte,
+ uint64_t *pte, hwaddr *page_size)
+{
+ IOMMUAccessFlags perms = amdvi_get_perms(dte);
+
+ uint8_t level, mode;
+ uint64_t pte_addr;
+
+ *pte = dte;
+ *page_size = 0;
+
+ if (perms == IOMMU_NONE) {
+ return -AMDVI_FR_PT_ROOT_INV;
+ }
+
+ /*
+ * The Linux kernel driver initializes the default mode to 3, corresponding
+     * to a 39-bit GPA space, where each top-level pagetable entry covers a
+     * 1GB (2^30) range.
+ */
+ level = mode = get_pte_translation_mode(dte);
+ assert(mode > 0 && mode < 7);
+
+ /*
+ * If IOVA is larger than the max supported by the current pgtable level,
+ * there is nothing to do.
+ */
+ if (address > PT_LEVEL_MAX_ADDR(mode - 1)) {
+ /* IOVA too large for the current DTE */
+ return -AMDVI_FR_PT_ROOT_INV;
+ }
+
+ do {
+ level -= 1;
+
+ /* Update the page_size */
+ *page_size = PTE_LEVEL_PAGE_SIZE(level);
+
+ /* Permission bits are ANDed at every level, including the DTE */
+ perms &= amdvi_get_perms(*pte);
+ if (perms == IOMMU_NONE) {
+ return 0;
+ }
+
+ /* Not Present */
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ return 0;
+ }
+
+ /* Large or Leaf PTE found */
+ if (PTE_NEXT_LEVEL(*pte) == 7 || PTE_NEXT_LEVEL(*pte) == 0) {
+ /* Leaf PTE found */
+ break;
+ }
+
+ /*
+ * Index the pgtable using the IOVA bits corresponding to current level
+ * and walk down to the lower level.
+ */
+ pte_addr = NEXT_PTE_ADDR(*pte, level, address);
+ *pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
+
+ if (*pte == (uint64_t)-1) {
+ /*
+ * A returned PTE of -1 indicates a failure to read the page table
+ * entry from guest memory.
+ */
+ if (level == mode - 1) {
+ /* Failure to retrieve the Page Table from Root Pointer */
+ *page_size = 0;
+ return -AMDVI_FR_PT_ROOT_INV;
+ } else {
+ /* Failure to read PTE. Page walk skips a page_size chunk */
+ return -AMDVI_FR_PT_ENTRY_INV;
+ }
+ }
+ } while (level > 0);
+
+ assert(PTE_NEXT_LEVEL(*pte) == 0 || PTE_NEXT_LEVEL(*pte) == 7 ||
+ level == 0);
+ /*
+ * Page walk ends when Next Level field on PTE shows that either a leaf PTE
+ * or a series of large PTEs have been reached. In the latter case, even if
+ * the range starts in the middle of a contiguous page, the returned PTE
+ * must be the first PTE of the series.
+ */
+ if (PTE_NEXT_LEVEL(*pte) == 7) {
+ /* Update page_size with the large PTE page size */
+ *page_size = large_pte_page_size(*pte);
+ }
+
+ return 0;
+}
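
To make the level arithmetic concrete: each level consumes 9 IOVA bits on top of the 4K page offset, so assuming PTE_LEVEL_PAGE_SIZE(level) expands to 1ULL << (12 + 9 * level), a standalone check of the walk's indexing looks like this:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t level_page_size(int level)
    {
        return 1ULL << (12 + 9 * level);
    }

    /* 9 IOVA bits select a PTE at each level, as in NEXT_PTE_ADDR() */
    static unsigned pte_index(uint64_t iova, int level)
    {
        return (iova >> (12 + 9 * level)) & 0x1FF;
    }

    int main(void)
    {
        assert(level_page_size(0) == 4ULL << 10);   /* 4K leaf */
        assert(level_page_size(1) == 2ULL << 20);   /* 2M */
        assert(level_page_size(2) == 1ULL << 30);   /* 1G, mode-3 root */

        /* IOVA 0x40201000: index 1 at every level of a mode-3 table */
        assert(pte_index(0x40201000ULL, 2) == 1);
        assert(pte_index(0x40201000ULL, 1) == 1);
        assert(pte_index(0x40201000ULL, 0) == 1);
        return 0;
    }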
+
+/*
+ * Invoke notifiers registered for the address space. Update record of mapped
+ * ranges in IOVA Tree.
+ */
+static void amdvi_notify_iommu(AMDVIAddressSpace *as, IOMMUTLBEvent *event)
+{
+ IOMMUTLBEntry *entry = &event->entry;
+
+ DMAMap target = {
+ .iova = entry->iova,
+ .size = entry->addr_mask,
+ .translated_addr = entry->translated_addr,
+ .perm = entry->perm,
+ };
+
+ /*
+ * Search the IOVA Tree for an existing translation for the target, and skip
+ * the notification if the mapping is already recorded.
+ * When the guest uses large pages, comparing against the record makes it
+ * possible to determine the size of the original MAP and adjust the UNMAP
+ * request to match it. This avoids failed checks against the mappings kept
+ * by the VFIO kernel driver.
+ */
+ const DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+ if (event->type == IOMMU_NOTIFIER_UNMAP) {
+ if (!mapped) {
+ /* No record exists of this mapping, nothing to do */
+ return;
+ }
+ /*
+ * Adjust the size based on the original record. This is essential to
+ * determine when large/contiguous pages are used, since the guest has
+ * already cleared the PTE (erasing the pagesize encoded on it) before
+ * issuing the invalidation command.
+ */
+ if (mapped->size != target.size) {
+ assert(mapped->size > target.size);
+ target.size = mapped->size;
+ /* Adjust event to invoke notifier with correct range */
+ entry->addr_mask = mapped->size;
+ }
+ iova_tree_remove(as->iova_tree, target);
+ } else { /* IOMMU_NOTIFIER_MAP */
+ if (mapped) {
+ /*
+ * If a mapping is present and matches the request, skip the
+ * notification.
+ */
+ if (!memcmp(mapped, &target, sizeof(DMAMap))) {
+ return;
+ } else {
+ /*
+ * This should never happen unless a buggy guest OS omits or
+ * sends incorrect invalidation(s). Report an error in the event
+ * it does happen.
+ */
+ error_report("Found conflicting translation. This could be due "
+ "to an incorrect or missing invalidation command");
+ }
+ }
+ /* Record the new mapping */
+ iova_tree_insert(as->iova_tree, &target);
+ }
+
+ /* Invoke the notifiers registered for this address space */
+ memory_region_notify_iommu(&as->iommu, 0, *event);
+}
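
The IOVA-tree bookkeeping is what makes large-page unmaps recoverable. A sketch of the flow against QEMU's qemu/iova-tree.h API (addresses are made up; note that DMAMap.size is the length minus one, matching the addr_mask convention):

    IOVATree *tree = iova_tree_new();

    DMAMap map = {
        .iova = 0x200000,
        .size = 0x1FFFFF,                  /* a 2M mapping */
        .translated_addr = 0x80000000,
        .perm = IOMMU_RW,
    };
    iova_tree_insert(tree, &map);          /* record the MAP event */

    /* the guest later invalidates one 4K page inside the large page... */
    DMAMap target = { .iova = 0x200000, .size = 0xFFF };
    const DMAMap *mapped = iova_tree_find(tree, &target);

    /* ...so the recorded size restores the full range before notifying */
    if (mapped && mapped->size != target.size) {
        target.size = mapped->size;        /* widen the UNMAP back to 2M */
    }
    iova_tree_remove(tree, target);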
+
+/*
+ * Walk the guest page table for an IOVA and range and signal the registered
+ * notifiers to sync the shadow page tables in the host.
+ * Must be called with a valid DTE for DMA remapping i.e. V=1,TV=1
+ */
+static void amdvi_sync_shadow_page_table_range(AMDVIAddressSpace *as,
+ uint64_t *dte, hwaddr addr,
+ uint64_t size, bool send_unmap)
+{
+ IOMMUTLBEvent event;
+
+ hwaddr page_mask, pagesize;
+ hwaddr iova = addr;
+ hwaddr end = iova + size - 1;
+
+ uint64_t pte;
+ int ret;
+
+ while (iova < end) {
+
+ ret = fetch_pte(as, iova, dte[0], &pte, &pagesize);
+
+ if (ret == -AMDVI_FR_PT_ROOT_INV) {
+ /*
+ * Invalid conditions such as the IOVA being larger than supported
+ * by current page table mode as configured in the DTE, or a failure
+ * to fetch the Page Table from the Page Table Root Pointer in DTE.
+ */
+ assert(pagesize == 0);
+ return;
+ }
+ /* PTE has been validated for major errors and pagesize is set */
+ assert(pagesize);
+ page_mask = ~(pagesize - 1);
+
+ if (ret == -AMDVI_FR_PT_ENTRY_INV) {
+ /*
+             * Failure to read PTE from memory; the pagesize matches the current
+ * level. Unable to determine the region type, so a safe strategy is
+ * to skip the range and continue the page walk.
+ */
+ goto next;
+ }
+
+ event.entry.target_as = &address_space_memory;
+ event.entry.iova = iova & page_mask;
+ /* translated_addr is irrelevant for the unmap case */
+ event.entry.translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) &
+ page_mask;
+ event.entry.addr_mask = ~page_mask;
+ event.entry.perm = amdvi_get_perms(pte);
+
+ /*
+ * In cases where the leaf PTE is not found, or it has invalid
+ * permissions, an UNMAP type notification is sent, but only if the
+ * caller requested it.
+ */
+ if (!IOMMU_PTE_PRESENT(pte) || (event.entry.perm == IOMMU_NONE)) {
+ if (!send_unmap) {
+ goto next;
+ }
+ event.type = IOMMU_NOTIFIER_UNMAP;
+ } else {
+ event.type = IOMMU_NOTIFIER_MAP;
+ }
+
+ /*
+ * The following call might need to adjust event.entry.size in cases
+ * where the guest unmapped a series of large pages.
+ */
+ amdvi_notify_iommu(as, &event);
+ /*
+ * In the special scenario where the guest is unmapping a large page,
+ * addr_mask has been adjusted before sending the notification. Update
+ * pagesize accordingly in order to correctly compute the next IOVA.
+ */
+ pagesize = event.entry.addr_mask + 1;
+
+next:
+ iova &= ~(pagesize - 1);
+
+ /* Check for 64-bit overflow and terminate walk in such cases */
+ if ((iova + pagesize) < iova) {
+ break;
+ } else {
+ iova += pagesize;
+ }
+ }
+}
+
+/*
+ * Unmap entire range that the notifier registered for i.e. the full AS.
+ *
+ * This is technically equivalent to directly calling
+ * memory_region_unmap_iommu_notifier_range(), but it checks the notifier
+ * boundaries and issues notifications with ranges kept within those bounds.
+ */
+static void amdvi_address_space_unmap(AMDVIAddressSpace *as, IOMMUNotifier *n)
+{
+
+ hwaddr start = n->start;
+ hwaddr end = n->end;
+ hwaddr remain;
+ DMAMap map;
+
+ assert(start <= end);
+ remain = end - start + 1;
+
+ /*
+ * Divide the notifier range into chunks that are aligned and do not exceed
+ * the notifier boundaries.
+ */
+ while (remain >= AMDVI_PAGE_SIZE) {
+
+ IOMMUTLBEvent event;
+
+ uint64_t mask = dma_aligned_pow2_mask(start, end, 64);
+
+ event.type = IOMMU_NOTIFIER_UNMAP;
+
+ IOMMUTLBEntry entry = {
+ .target_as = &address_space_memory,
+ .iova = start,
+ .translated_addr = 0, /* irrelevant for unmap case */
+ .addr_mask = mask,
+ .perm = IOMMU_NONE,
+ };
+ event.entry = entry;
+
+ /* Call notifier registered for updates on this address space */
+ memory_region_notify_iommu_one(n, &event);
+
+ start += mask + 1;
+ remain -= mask + 1;
+ }
+
+ assert(!remain);
+
+ map.iova = n->start;
+ map.size = n->end - n->start;
+
+ iova_tree_remove(as->iova_tree, map);
+}
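
The loop relies on dma_aligned_pow2_mask() to emit chunks that are both power-of-two sized and size-aligned. A simplified local reimplementation (not the QEMU helper) shows how a range gets split:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t aligned_pow2_mask(uint64_t start, uint64_t end)
    {
        /* largest power of two that aligns 'start' and fits the range */
        uint64_t align = start ? (start & -start) : 1ULL << 63;
        uint64_t len = end - start + 1;

        while (len & (len - 1)) {
            len &= len - 1;                /* round len down to pow2 */
        }
        return (align < len ? align : len) - 1;
    }

    int main(void)
    {
        uint64_t start = 0x1000, end = 0x4FFF;

        while (start < end) {
            uint64_t mask = aligned_pow2_mask(start, end);
            printf("UNMAP iova=0x%" PRIx64 " size=0x%" PRIx64 "\n",
                   start, mask + 1);
            start += mask + 1;
        }
        /* emits three chunks: 4K @ 0x1000, 8K @ 0x2000, 4K @ 0x4000 */
        return 0;
    }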
+
+/*
+ * For all the address spaces with notifiers registered, unmap the entire range
+ * the notifier registered for i.e. clear all the address spaces managed by the
+ * IOMMU.
+ */
+static void amdvi_address_space_unmap_all(AMDVIState *s)
+{
+ AMDVIAddressSpace *as;
+ IOMMUNotifier *n;
+
+ QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) {
+ IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+ amdvi_address_space_unmap(as, n);
+ }
+ }
+}
+
+/*
+ * For every translation present in the IOMMU, construct IOMMUTLBEntry data
+ * and pass it as parameter to notifier callback.
+ */
+static void amdvi_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
+{
+ AMDVIAddressSpace *as = container_of(iommu_mr, AMDVIAddressSpace, iommu);
+ uint64_t dte[4] = { 0 };
+
+ if (!(n->notifier_flags & IOMMU_NOTIFIER_MAP)) {
+ return;
+ }
+
+ if (amdvi_as_to_dte(as, dte)) {
+ return;
+ }
+
+ /* Dropping all mappings for the address space. Also clears the IOVA tree */
+ amdvi_address_space_unmap(as, n);
+
+ amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, false);
+}
+
+static void amdvi_address_space_sync(AMDVIAddressSpace *as)
+{
+ IOMMUNotifier *n;
+ uint64_t dte[4] = { 0 };
+
+ /* If only UNMAP notifiers are registered, drop all existing mappings */
+ if (!(as->notifier_flags & IOMMU_NOTIFIER_MAP)) {
+ IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+ /*
+ * Directly calling memory_region_unmap_iommu_notifier_range() does
+ * not guarantee that the addr_mask eventually passed as parameter
+ * to the notifier is valid. Use amdvi_address_space_unmap() which
+ * ensures the notifier range is divided into properly aligned
+ * regions, and issues notifications for each one.
+ */
+ amdvi_address_space_unmap(as, n);
+ }
+ return;
+ }
+
+ if (amdvi_as_to_dte(as, dte)) {
+ return;
+ }
+
+ amdvi_sync_shadow_page_table_range(as, &dte[0], 0, UINT64_MAX, true);
+}
+
+/*
+ * This differs from the replay() method in that it issues both MAP and UNMAP
+ * notifications since it is called after global invalidation events in order to
+ * re-sync all address spaces.
+ */
+static void amdvi_iommu_address_space_sync_all(AMDVIState *s)
+{
+ AMDVIAddressSpace *as;
+
+ QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) {
+ amdvi_address_space_sync(as);
+ }
+}
+
+/*
+ * Toggle between address translation and passthrough modes by enabling the
+ * corresponding memory regions.
+ */
+static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as)
+{
+ AMDVIState *s = amdvi_as->iommu_state;
+
+ if (s->dma_remap && amdvi_as->addr_translation) {
+ /* Enabling DMA region */
+ memory_region_set_enabled(&amdvi_as->iommu_nodma, false);
+ memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), true);
+ } else {
+ /* Disabling DMA region, using passthrough */
+ memory_region_set_enabled(MEMORY_REGION(&amdvi_as->iommu), false);
+ memory_region_set_enabled(&amdvi_as->iommu_nodma, true);
+ }
+}
+
+/*
+ * For all existing address spaces managed by the IOMMU, enable/disable the
+ * corresponding memory regions to reset the address translation mode and
+ * use passthrough by default.
+ */
+static void amdvi_reset_address_translation_all(AMDVIState *s)
+{
+ AMDVIAddressSpace **iommu_as;
+
+ for (int bus_num = 0; bus_num < PCI_BUS_MAX; bus_num++) {
+
+ /* Nothing to do if there are no devices on the current bus */
+ if (!s->address_spaces[bus_num]) {
+ continue;
+ }
+ iommu_as = s->address_spaces[bus_num];
+
+ for (int devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
+
+ if (!iommu_as[devfn]) {
+ continue;
+ }
+ /* Use passthrough as default mode after reset */
+ iommu_as[devfn]->addr_translation = false;
+ amdvi_switch_address_space(iommu_as[devfn]);
+ }
+ }
+}
+
+static void enable_dma_mode(AMDVIAddressSpace *as, bool inval_current)
+{
+ /*
+ * When enabling DMA mode for the purpose of isolating guest devices on
+ * a failure to retrieve or invalid DTE, all existing mappings must be
+ * dropped.
+ */
+ if (inval_current) {
+ IOMMUNotifier *n;
+ IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+ amdvi_address_space_unmap(as, n);
+ }
+ }
+
+ if (as->addr_translation) {
+ return;
+ }
+
+ /* Installing DTE enabling translation, activate region */
+ as->addr_translation = true;
+ amdvi_switch_address_space(as);
+ /* Sync shadow page tables */
+ amdvi_address_space_sync(as);
+}
+
+/*
+ * If paging was previously in use in the address space
+ * - invalidate all existing mappings
+ * - switch to no_dma memory region
+ */
+static void enable_nodma_mode(AMDVIAddressSpace *as)
+{
+ IOMMUNotifier *n;
+
+ if (!as->addr_translation) {
+ /* passthrough is already active, nothing to do */
+ return;
+ }
+
+ as->addr_translation = false;
+ IOMMU_NOTIFIER_FOREACH(n, &as->iommu) {
+ /* Drop all mappings for the address space */
+ amdvi_address_space_unmap(as, n);
+ }
+ amdvi_switch_address_space(as);
+}
+
+/*
+ * A guest driver must issue the INVALIDATE_DEVTAB_ENTRY command to the IOMMU
+ * after changing a Device Table entry. We can use this fact to detect when a
+ * Device Table entry is created for a device attached to a paging domain and
+ * enable the corresponding IOMMU memory region to allow for DMA translation if
+ * appropriate.
+ */
+static void amdvi_update_addr_translation_mode(AMDVIState *s, uint16_t devid)
+{
+ uint8_t bus_num, devfn, dte_mode;
+ AMDVIAddressSpace *as;
+ uint64_t dte[4] = { 0 };
+ int ret;
+
+ /*
+ * Convert the devid encoded in the command to a bus and devfn in
+ * order to retrieve the corresponding address space.
+ */
+ bus_num = PCI_BUS_NUM(devid);
+ devfn = devid & 0xff;
+
+ /*
+     * The main buffer of size (AMDVIAddressSpace *) * (PCI_BUS_MAX) has
+     * already been allocated within AMDVIState, but we must be careful not
+     * to access an unallocated devfn.
+ */
+ if (!s->address_spaces[bus_num] || !s->address_spaces[bus_num][devfn]) {
+ return;
+ }
+ as = s->address_spaces[bus_num][devfn];
+
+ ret = amdvi_as_to_dte(as, dte);
+
+ if (!ret) {
+ dte_mode = (dte[0] >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
+ }
+
+ switch (ret) {
+ case 0:
+ /* DTE was successfully retrieved */
+ if (!dte_mode) {
+ enable_nodma_mode(as); /* DTE[V]=1 && DTE[Mode]=0 => passthrough */
+ } else {
+ enable_dma_mode(as, false); /* Enable DMA translation */
+ }
+ break;
+ case -AMDVI_FR_DTE_V:
+ /* DTE[V]=0, address is passed untranslated */
+ enable_nodma_mode(as);
+ break;
+ case -AMDVI_FR_DTE_RTR_ERR:
+ case -AMDVI_FR_DTE_TV:
+ /*
+ * Enforce isolation by using DMA in rare scenarios where the DTE cannot
+ * be retrieved or DTE[TV]=0. Existing mappings are dropped.
+ */
+ enable_dma_mode(as, true);
+ break;
+ }
+}
+
/* log error without aborting since linux seems to be using reserved bits */
static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd)
{
uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16));
+ trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
+ PCI_FUNC(devid));
+
/* This command should invalidate internal caches, of which there are none */
if (extract64(cmd[0], 16, 44) || cmd[1]) {
amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
s->cmdbuf + s->cmdbuf_head);
+ return;
+ }
+
+ /*
+ * When DMA remapping capability is enabled, check if updated DTE is setup
+ * for paging or not, and configure the corresponding memory regions.
+ */
+ if (s->dma_remap) {
+ amdvi_update_addr_translation_mode(s, devid);
}
- trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
- PCI_FUNC(devid));
}
static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
@@ -449,6 +1193,13 @@ static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
amdvi_intremap_inval_notify_all(s, true, 0, 0);
amdvi_iotlb_reset(s);
+
+ /*
+ * Fully replay the address space i.e. send both UNMAP and MAP events in
+     * order to synchronize guest and host IO page tables.
+ */
+ amdvi_iommu_address_space_sync_all(s);
+
trace_amdvi_all_inval();
}
@@ -460,10 +1211,109 @@ static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
return entry->domid == domid;
}
+/*
+ * Helper to decode the size of the range to invalidate encoded in the
+ * INVALIDATE_IOMMU_PAGES Command format.
+ * The size of the region to invalidate depends on the S bit and address.
+ * S bit value:
+ * 0 : Invalidation size is 4 Kbytes.
+ * 1 : Invalidation size is determined by first zero bit in the address
+ * starting from Address[12].
+ *
+ * In the AMD IOMMU Linux driver, an invalidation command with address
+ * ((1 << 63) - 1) is sent when intending to clear the entire cache.
+ * However, Table 14: Example Page Size Encodings shows that an address of
+ * ((1ULL << 51) - 1) encodes the entire cache, so effectively any address with
+ * first zero at bit 51 or larger is a request to invalidate the entire address
+ * space.
+ */
+static uint64_t amdvi_decode_invalidation_size(hwaddr addr, uint16_t flags)
+{
+ uint64_t size = AMDVI_PAGE_SIZE;
+ uint8_t fzbit = 0;
+
+ if (flags & AMDVI_CMD_INVAL_IOMMU_PAGES_S) {
+ fzbit = cto64(addr | 0xFFF);
+
+ if (fzbit >= 51) {
+ size = AMDVI_INV_ALL_PAGES;
+ } else {
+ size = 1ULL << (fzbit + 1);
+ }
+ }
+ return size;
+}
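
Two worked examples of this decoding, using a local stand-in for cto64() (the qemu/host-utils.h helper that counts trailing one bits):

    #include <assert.h>
    #include <stdint.h>

    static int cto64(uint64_t val)    /* stand-in: count trailing ones */
    {
        return __builtin_ctzll(~val); /* first zero == number of ones */
    }

    int main(void)
    {
        /* S=1, addr 0x2000: first zero above the 4K offset is bit 12,
         * so size = 1ULL << 13, i.e. two 4K pages */
        assert(cto64(0x2000 | 0xFFF) == 12);

        /* S=1, addr 0x7FFFF000: first zero at bit 31 -> 4G region */
        assert(cto64(0x7FFFF000 | 0xFFF) == 31);
        return 0;
    }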
+
+/*
+ * Synchronize the guest page tables with the shadow page tables kept in the
+ * host for the specified range.
+ * The invalidation command issued by the guest and intercepted by the VMM
+ * does not specify a device, but a domain, since all devices in the same domain
+ * share the same page tables. However, vIOMMU emulation creates separate
+ * address spaces per device, so it is necessary to traverse the list of all
+ * address spaces (i.e. devices) that have notifiers registered in order to
+ * propagate the changes to the host page tables.
+ * We cannot return early from this function once a matching domain has been
+ * identified and its page tables synced (based on the fact that all devices in
+ * the same domain share the page tables). The reason is that different devices
+ * (i.e. address spaces) could have different notifiers registered, and by
+ * skipping address spaces that appear later on the amdvi_as_with_notifiers list
+ * their notifiers (which could differ from the ones registered for the first
+ * device/address space) would not be invoked.
+ */
+static void amdvi_sync_domain(AMDVIState *s, uint16_t domid, uint64_t addr,
+ uint16_t flags)
+{
+ AMDVIAddressSpace *as;
+
+ uint64_t size = amdvi_decode_invalidation_size(addr, flags);
+
+ if (size == AMDVI_INV_ALL_PAGES) {
+ addr = 0; /* Set start address to 0 and invalidate entire AS */
+ } else {
+ addr &= ~(size - 1);
+ }
+
+ /*
+ * Call notifiers that have registered for each address space matching the
+ * domain ID, in order to sync the guest pagetable state with the host.
+ */
+ QLIST_FOREACH(as, &s->amdvi_as_with_notifiers, next) {
+
+ uint64_t dte[4] = { 0 };
+
+ /*
+ * Retrieve the Device Table entry for the devid corresponding to the
+ * current address space, and verify the DomainID matches i.e. the page
+ * tables to be synced belong to devices in the domain.
+ */
+ if (amdvi_as_to_dte(as, dte)) {
+ continue;
+ }
+
+ /* Only need to sync the Page Tables for a matching domain */
+ if (domid != (dte[1] & AMDVI_DEV_DOMID_ID_MASK)) {
+ continue;
+ }
+
+ /*
+ * We have determined that there is a valid Device Table Entry for a
+ * device matching the DomainID in the INV_IOMMU_PAGES command issued by
+ * the guest. Walk the guest page table to sync shadow page table.
+ */
+ if (as->notifier_flags & IOMMU_NOTIFIER_MAP) {
+ /* Sync guest IOMMU mappings with host */
+ amdvi_sync_shadow_page_table_range(as, &dte[0], addr, size, true);
+ }
+ }
+}
+
/* we don't have devid - we can't remove pages by address */
static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
{
uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));
+ uint64_t addr = cpu_to_le64(extract64(cmd[1], 12, 52)) << 12;
+ uint16_t flags = cpu_to_le16((uint16_t)extract64(cmd[1], 0, 3));
if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) ||
extract64(cmd[1], 3, 9)) {
@@ -473,6 +1323,8 @@ static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
&domid);
+
+ amdvi_sync_domain(s, domid, addr, flags);
trace_amdvi_pages_inval(domid);
}
@@ -508,7 +1360,7 @@ static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd)
{
- uint16_t devid = extract64(cmd[0], 0, 16);
+ uint16_t devid = cpu_to_le16(extract64(cmd[0], 0, 16));
if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
extract64(cmd[1], 6, 6)) {
amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
@@ -521,7 +1373,7 @@ static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd)
&devid);
} else {
amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12,
- cpu_to_le16(extract64(cmd[1], 0, 16)));
+ devid);
}
trace_amdvi_iotlb_inval();
}
@@ -592,18 +1444,31 @@ static void amdvi_cmdbuf_run(AMDVIState *s)
}
}
-static void amdvi_mmio_trace(hwaddr addr, unsigned size)
+static inline uint8_t amdvi_mmio_get_index(hwaddr addr)
{
uint8_t index = (addr & ~0x2000) / 8;
if ((addr & 0x2000)) {
/* high table */
index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index;
- trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
} else {
index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index;
- trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07);
}
+
+ return index;
+}
+
+static void amdvi_mmio_trace_read(hwaddr addr, unsigned size)
+{
+    uint8_t index = amdvi_mmio_get_index(addr);
+    const char *name = (addr & 0x2000) ? amdvi_mmio_high[index] :
+                                         amdvi_mmio_low[index];
+
+    trace_amdvi_mmio_read(name, addr, size, addr & ~0x07);
+}
+
+static void amdvi_mmio_trace_write(hwaddr addr, unsigned size, uint64_t val)
+{
+    uint8_t index = amdvi_mmio_get_index(addr);
+    const char *name = (addr & 0x2000) ? amdvi_mmio_high[index] :
+                                         amdvi_mmio_low[index];
+
+    trace_amdvi_mmio_write(name, addr, size, val, addr & ~0x07);
+}
static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size)
@@ -623,7 +1488,7 @@ static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size)
} else if (size == 8) {
val = amdvi_readq(s, addr);
}
- amdvi_mmio_trace(addr, size);
+ amdvi_mmio_trace_read(addr, size);
return val;
}
@@ -633,7 +1498,6 @@ static void amdvi_handle_control_write(AMDVIState *s)
unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL);
s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN);
- s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN);
s->evtlog_enabled = s->enabled && !!(control &
AMDVI_MMIO_CONTROL_EVENTLOGEN);
@@ -665,8 +1529,8 @@ static inline void amdvi_handle_devtab_write(AMDVIState *s)
uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE);
s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK);
- /* set device table length */
- s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1 *
+    /* set device table length (i.e. number of entries the table can hold) */
+ s->devtab_len = (((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1) *
(AMDVI_MMIO_DEVTAB_SIZE_UNIT /
AMDVI_MMIO_DEVTAB_ENTRY_SIZE));
}
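
The added parentheses matter. Assuming the spec's 4KiB size unit and 32-byte DTEs (so 128 entries per unit), a size field of 1 decodes as:

    old: devtab_len = size + 1 * (4096 / 32)   = 1 + 128 = 129 entries
    new: devtab_len = (size + 1) * (4096 / 32) = 2 * 128 = 256 entries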
@@ -704,9 +1568,19 @@ static inline void amdvi_handle_excllim_write(AMDVIState *s)
static inline void amdvi_handle_evtbase_write(AMDVIState *s)
{
uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE);
+
+    if (amdvi_readq(s, AMDVI_MMIO_STATUS) & AMDVI_MMIO_STATUS_EVENT_INT) {
+        /* Do not reset if the event log interrupt bit is set */
+        return;
+    }
+
s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK;
s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE)
& AMDVI_MMIO_EVTLOG_SIZE_MASK);
+
+ /* clear tail and head pointer to 0 when event base is updated */
+ s->evtlog_tail = s->evtlog_head = 0;
+ amdvi_writeq_raw(s, AMDVI_MMIO_EVENT_HEAD, s->evtlog_head);
+ amdvi_writeq_raw(s, AMDVI_MMIO_EVENT_TAIL, s->evtlog_tail);
}
static inline void amdvi_handle_evttail_write(AMDVIState *s)
@@ -770,7 +1644,7 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
return;
}
- amdvi_mmio_trace(addr, size);
+ amdvi_mmio_trace_write(addr, size, val);
switch (addr & ~0x07) {
case AMDVI_MMIO_CONTROL:
amdvi_mmio_reg_write(s, size, val, addr);
@@ -835,152 +1709,74 @@ static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
amdvi_mmio_reg_write(s, size, val, addr);
amdvi_handle_pprtail_write(s);
break;
+ case AMDVI_MMIO_STATUS:
+ amdvi_mmio_reg_write(s, size, val, addr);
+ break;
}
}
-static inline uint64_t amdvi_get_perms(uint64_t entry)
-{
- return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
- AMDVI_DEV_PERM_SHIFT;
-}
-
-/* validate that reserved bits are honoured */
-static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
- uint64_t *dte)
-{
- if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED)
- || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED)
- || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) {
- amdvi_log_illegaldevtab_error(s, devid,
- s->devtab +
- devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
- return false;
- }
-
- return true;
-}
-
-/* get a device table entry given the devid */
-static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
+static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
+ IOMMUTLBEntry *ret, unsigned perms,
+ hwaddr addr)
{
- uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;
+ hwaddr page_mask, pagesize = 0;
+ uint8_t mode;
+ uint64_t pte;
+ int fetch_ret;
- if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
- AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) {
- trace_amdvi_dte_get_fail(s->devtab, offset);
- /* log error accessing dte */
- amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
- return false;
+ /* make sure the DTE has TV = 1 */
+ if (!(dte[0] & AMDVI_DEV_TRANSLATION_VALID)) {
+ /*
+ * A DTE with V=1, TV=0 does not have a valid Page Table Root Pointer.
+ * An IOMMU processing a request that requires a table walk terminates
+ * the walk when it encounters this condition. Do the same and return
+ * instead of assuming that the address is forwarded without translation
+ * i.e. the passthrough case, as it is done for the case where DTE[V]=0.
+ */
+ return;
}
- *entry = le64_to_cpu(*entry);
- if (!amdvi_validate_dte(s, devid, entry)) {
- trace_amdvi_invalid_dte(entry[0]);
- return false;
+ mode = get_pte_translation_mode(dte[0]);
+ if (mode >= 7) {
+ trace_amdvi_mode_invalid(mode, addr);
+ return;
}
-
- return true;
-}
-
-/* get pte translation mode */
-static inline uint8_t get_pte_translation_mode(uint64_t pte)
-{
- return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
-}
-
-static inline uint64_t pte_override_page_mask(uint64_t pte)
-{
- uint8_t page_mask = 13;
- uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12;
- /* find the first zero bit */
- while (addr & 1) {
- page_mask++;
- addr = addr >> 1;
+ if (mode == 0) {
+ goto no_remap;
}
- return ~((1ULL << page_mask) - 1);
-}
-
-static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
-{
- return ~((1UL << ((oldlevel * 9) + 3)) - 1);
-}
+ /* Attempt to fetch the PTE to determine if a valid mapping exists */
+ fetch_ret = fetch_pte(as, addr, dte[0], &pte, &pagesize);
-static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
- uint16_t devid)
-{
- uint64_t pte;
+    /*
+     * If walking the page table results in an error of any type, yields an
+     * empty PTE (i.e. no mapping), or the permissions do not match, return
+     * since there is no translation available.
+     */
+ if (fetch_ret < 0 || !IOMMU_PTE_PRESENT(pte) ||
+ perms != (perms & amdvi_get_perms(pte))) {
- if (dma_memory_read(&address_space_memory, pte_addr,
- &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) {
- trace_amdvi_get_pte_hwerror(pte_addr);
- amdvi_log_pagetab_error(s, devid, pte_addr, 0);
- pte = 0;
- return pte;
+ amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
+ trace_amdvi_page_fault(addr);
+ return;
}
- pte = le64_to_cpu(pte);
- return pte;
-}
-
-static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
- IOMMUTLBEntry *ret, unsigned perms,
- hwaddr addr)
-{
- unsigned level, present, pte_perms, oldlevel;
- uint64_t pte = dte[0], pte_addr, page_mask;
-
- /* make sure the DTE has TV = 1 */
- if (pte & AMDVI_DEV_TRANSLATION_VALID) {
- level = get_pte_translation_mode(pte);
- if (level >= 7) {
- trace_amdvi_mode_invalid(level, addr);
- return;
- }
- if (level == 0) {
- goto no_remap;
- }
-
- /* we are at the leaf page table or page table encodes a huge page */
- do {
- pte_perms = amdvi_get_perms(pte);
- present = pte & 1;
- if (!present || perms != (perms & pte_perms)) {
- amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
- trace_amdvi_page_fault(addr);
- return;
- }
-
- /* go to the next lower level */
- pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
- /* add offset and load pte */
- pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
- pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
- if (!pte) {
- return;
- }
- oldlevel = level;
- level = get_pte_translation_mode(pte);
- } while (level > 0 && level < 7);
+ /* A valid PTE and page size has been retrieved */
+ assert(pagesize);
+ page_mask = ~(pagesize - 1);
- if (level == 0x7) {
- page_mask = pte_override_page_mask(pte);
- } else {
- page_mask = pte_get_page_mask(oldlevel);
- }
+ /* get access permissions from pte */
+ ret->iova = addr & page_mask;
+ ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
+ ret->addr_mask = ~page_mask;
+ ret->perm = amdvi_get_perms(pte);
+ return;
- /* get access permissions from pte */
- ret->iova = addr & page_mask;
- ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
- ret->addr_mask = ~page_mask;
- ret->perm = amdvi_get_perms(pte);
- return;
- }
no_remap:
ret->iova = addr & AMDVI_PAGE_MASK_4K;
ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
- ret->perm = amdvi_get_perms(pte);
+ ret->perm = amdvi_get_perms(dte[0]);
}
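For illustration, here is how the returned entry composes for a 2 MiB leaf (a hand-worked sketch; the IOVA value is made up):

/* Suppose fetch_pte() reports pagesize == 2 MiB for iova 0x40012345:
 *   page_mask            = ~(0x200000 - 1) = ~0x1fffff
 *   ret->iova            = 0x40012345 & page_mask = 0x40000000
 *   ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask
 *   ret->addr_mask       = 0x1fffff
 * The caller then forms the physical address as
 * translated_addr | (iova & addr_mask).
 */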
static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
@@ -990,6 +1786,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
uint64_t entry[4];
+ int dte_ret;
if (iotlb_entry) {
trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid),
@@ -1001,13 +1798,14 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
return;
}
- if (!amdvi_get_dte(s, devid, entry)) {
- return;
- }
+ dte_ret = amdvi_as_to_dte(as, entry);
- /* devices with V = 0 are not translated */
- if (!(entry[0] & AMDVI_DEV_VALID)) {
- goto out;
+ if (dte_ret < 0) {
+ if (dte_ret == -AMDVI_FR_DTE_V) {
+ /* DTE[V]=0, address is passed untranslated */
+ goto out;
+ }
+ return;
}
amdvi_page_walk(as, entry, ret,
@@ -1426,7 +2224,6 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
AMDVIState *s = opaque;
AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
int bus_num = pci_bus_num(bus);
- X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
iommu_as = s->address_spaces[bus_num];
@@ -1444,6 +2241,9 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
iommu_as[devfn]->bus_num = (uint8_t)bus_num;
iommu_as[devfn]->devfn = (uint8_t)devfn;
iommu_as[devfn]->iommu_state = s;
+ iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE;
+ iommu_as[devfn]->iova_tree = iova_tree_new();
+ iommu_as[devfn]->addr_translation = false;
amdvi_dev_as = iommu_as[devfn];
@@ -1486,15 +2286,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
AMDVI_INT_ADDR_FIRST,
&amdvi_dev_as->iommu_ir, 1);
- if (!x86_iommu->pt_supported) {
- memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, false);
- memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu),
- true);
- } else {
- memory_region_set_enabled(MEMORY_REGION(&amdvi_dev_as->iommu),
- false);
- memory_region_set_enabled(&amdvi_dev_as->iommu_nodma, true);
- }
+ amdvi_switch_address_space(amdvi_dev_as);
}
return &iommu_as[devfn]->as;
}
@@ -1524,14 +2316,35 @@ static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
Error **errp)
{
AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
+ AMDVIState *s = as->iommu_state;
+
+ /*
+ * Accurate synchronization of the vIOMMU page tables required to support
+ * MAP notifiers is provided by the dma-remap feature. In addition, this
+ * also requires that the vIOMMU presents the NpCache capability, so a guest
+ * driver issues invalidations for both map() and unmap() operations. The
+ * capability is already set by default as part of AMDVI_CAPAB_FEATURES and
+ * written to the configuration in amdvi_pci_realize().
+ */
+ if (!s->dma_remap && (new & IOMMU_NOTIFIER_MAP)) {
+ error_setg_errno(errp, ENOTSUP,
+ "device %02x.%02x.%x requires dma-remap=1",
+ as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn));
+ return -ENOTSUP;
+ }
+
+ /*
+ * Update notifier flags for address space and the list of address spaces
+ * with registered notifiers.
+ */
+ as->notifier_flags = new;
- if (new & IOMMU_NOTIFIER_MAP) {
- error_setg(errp,
- "device %02x.%02x.%x requires iommu notifier which is not "
- "currently supported", as->bus_num, PCI_SLOT(as->devfn),
- PCI_FUNC(as->devfn));
- return -EINVAL;
+ if (old == IOMMU_NOTIFIER_NONE) {
+ QLIST_INSERT_HEAD(&s->amdvi_as_with_notifiers, as, next);
+ } else if (new == IOMMU_NOTIFIER_NONE) {
+ QLIST_REMOVE(as, next);
}
+
return 0;
}
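For context, a minimal sketch of how a consumer trips this path (hypothetical caller; my_notify is made up):

/* IOMMUNotifier n;
 * iommu_notifier_init(&n, my_notify,
 *                     IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP,
 *                     0, HWADDR_MAX, 0);
 * memory_region_register_iommu_notifier(MEMORY_REGION(&as->iommu), &n, errp);
 * With dma-remap=0 this now fails with ENOTSUP instead of the old EINVAL.
 */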
@@ -1549,7 +2362,6 @@ static void amdvi_init(AMDVIState *s)
s->excl_allow = false;
s->mmio_enabled = false;
s->enabled = false;
- s->ats_enabled = false;
s->cmdbuf_enabled = false;
/* reset MMIO */
@@ -1609,6 +2421,10 @@ static void amdvi_sysbus_reset(DeviceState *dev)
msi_reset(&s->pci->dev);
amdvi_init(s);
+
+ /* Discard all mappings on device reset */
+ amdvi_address_space_unmap_all(s);
+ amdvi_reset_address_translation_all(s);
}
static const VMStateDescription vmstate_amdvi_sysbus_migratable = {
@@ -1620,7 +2436,8 @@ static const VMStateDescription vmstate_amdvi_sysbus_migratable = {
/* Updated in amdvi_handle_control_write() */
VMSTATE_BOOL(enabled, AMDVIState),
VMSTATE_BOOL(ga_enabled, AMDVIState),
- VMSTATE_BOOL(ats_enabled, AMDVIState),
+ /* bool ats_enabled is obsolete */
+ VMSTATE_UNUSED(1), /* was ats_enabled */
VMSTATE_BOOL(cmdbuf_enabled, AMDVIState),
VMSTATE_BOOL(completion_wait_intr, AMDVIState),
VMSTATE_BOOL(evtlog_enabled, AMDVIState),
@@ -1693,9 +2510,6 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
amdvi_uint64_equal, g_free, g_free);
- /* Pseudo address space under root PCI bus. */
- x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
-
/* set up MMIO */
memory_region_init_io(&s->mr_mmio, OBJECT(s), &mmio_mem_ops, s,
"amdvi-mmio", AMDVI_MMIO_SIZE);
@@ -1718,11 +2532,22 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
memory_region_add_subregion_overlap(&s->mr_sys, AMDVI_INT_ADDR_FIRST,
&s->mr_ir, 1);
+ /* Pseudo address space under root PCI bus. */
+ x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
+
if (kvm_enabled() && x86ms->apic_id_limit > 255 && !s->xtsup) {
error_report("AMD IOMMU with x2APIC configuration requires xtsup=on");
exit(EXIT_FAILURE);
}
+ if (s->xtsup) {
+ if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) {
+ error_report("AMD IOMMU xtsup=on requires x2APIC support on "
+ "the KVM side");
+ exit(EXIT_FAILURE);
+ }
+ }
+
pci_setup_iommu(bus, &amdvi_iommu_ops, s);
amdvi_init(s);
}
@@ -1730,6 +2555,7 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
static const Property amdvi_properties[] = {
DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false),
DEFINE_PROP_STRING("pci-id", AMDVIState, pci_id),
+ DEFINE_PROP_BOOL("dma-remap", AMDVIState, dma_remap, false),
};
static const VMStateDescription vmstate_amdvi_sysbus = {
@@ -1791,6 +2617,7 @@ static void amdvi_iommu_memory_region_class_init(ObjectClass *klass,
imrc->translate = amdvi_translate;
imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed;
+ imrc->replay = amdvi_iommu_replay;
}
static const TypeInfo amdvi_iommu_memory_region_info = {
diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
index 5672bde..daf82fc 100644
--- a/hw/i386/amd_iommu.h
+++ b/hw/i386/amd_iommu.h
@@ -25,6 +25,8 @@
#include "hw/i386/x86-iommu.h"
#include "qom/object.h"
+#define GENMASK64(h, l) (((~0ULL) >> (63 - (h) + (l))) << (l))
+
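A quick sanity check of the new helper (hand-computed; illustrative only):

/* GENMASK64(h, l) sets bits h..l inclusive, e.g.:
 *   GENMASK64(51, 12) == 0x000ffffffffff000ULL  (base-address fields below)
 *   GENMASK64(18, 4)  == 0x000000000007fff0ULL  (log head/tail pointers)
 */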
/* Capability registers */
#define AMDVI_CAPAB_BAR_LOW 0x04
#define AMDVI_CAPAB_BAR_HIGH 0x08
@@ -66,34 +68,34 @@
#define AMDVI_MMIO_SIZE 0x4000
-#define AMDVI_MMIO_DEVTAB_SIZE_MASK ((1ULL << 12) - 1)
-#define AMDVI_MMIO_DEVTAB_BASE_MASK (((1ULL << 52) - 1) & ~ \
- AMDVI_MMIO_DEVTAB_SIZE_MASK)
+#define AMDVI_MMIO_DEVTAB_SIZE_MASK GENMASK64(8, 0)
+#define AMDVI_MMIO_DEVTAB_BASE_MASK GENMASK64(51, 12)
+
#define AMDVI_MMIO_DEVTAB_ENTRY_SIZE 32
#define AMDVI_MMIO_DEVTAB_SIZE_UNIT 4096
 /* some of these are similar but kept separate for readability */
#define AMDVI_MMIO_CMDBUF_SIZE_BYTE (AMDVI_MMIO_COMMAND_BASE + 7)
#define AMDVI_MMIO_CMDBUF_SIZE_MASK 0x0f
-#define AMDVI_MMIO_CMDBUF_BASE_MASK AMDVI_MMIO_DEVTAB_BASE_MASK
-#define AMDVI_MMIO_CMDBUF_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f)
-#define AMDVI_MMIO_CMDBUF_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
+#define AMDVI_MMIO_CMDBUF_BASE_MASK GENMASK64(51, 12)
+#define AMDVI_MMIO_CMDBUF_HEAD_MASK GENMASK64(18, 4)
+#define AMDVI_MMIO_CMDBUF_TAIL_MASK GENMASK64(18, 4)
#define AMDVI_MMIO_EVTLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7)
-#define AMDVI_MMIO_EVTLOG_SIZE_MASK AMDVI_MMIO_CMDBUF_SIZE_MASK
-#define AMDVI_MMIO_EVTLOG_BASE_MASK AMDVI_MMIO_CMDBUF_BASE_MASK
-#define AMDVI_MMIO_EVTLOG_HEAD_MASK (((1ULL << 19) - 1) & ~0x0f)
-#define AMDVI_MMIO_EVTLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
+#define AMDVI_MMIO_EVTLOG_SIZE_MASK 0x0f
+#define AMDVI_MMIO_EVTLOG_BASE_MASK GENMASK64(51, 12)
+#define AMDVI_MMIO_EVTLOG_HEAD_MASK GENMASK64(18, 4)
+#define AMDVI_MMIO_EVTLOG_TAIL_MASK GENMASK64(18, 4)
-#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_EVENT_BASE + 7)
-#define AMDVI_MMIO_PPRLOG_HEAD_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
-#define AMDVI_MMIO_PPRLOG_TAIL_MASK AMDVI_MMIO_EVTLOG_HEAD_MASK
-#define AMDVI_MMIO_PPRLOG_BASE_MASK AMDVI_MMIO_EVTLOG_BASE_MASK
-#define AMDVI_MMIO_PPRLOG_SIZE_MASK AMDVI_MMIO_EVTLOG_SIZE_MASK
+#define AMDVI_MMIO_PPRLOG_SIZE_BYTE (AMDVI_MMIO_PPR_BASE + 7)
+#define AMDVI_MMIO_PPRLOG_SIZE_MASK 0x0f
+#define AMDVI_MMIO_PPRLOG_BASE_MASK GENMASK64(51, 12)
+#define AMDVI_MMIO_PPRLOG_HEAD_MASK GENMASK64(18, 4)
+#define AMDVI_MMIO_PPRLOG_TAIL_MASK GENMASK64(18, 4)
#define AMDVI_MMIO_EXCL_ENABLED_MASK (1ULL << 0)
#define AMDVI_MMIO_EXCL_ALLOW_MASK (1ULL << 1)
-#define AMDVI_MMIO_EXCL_LIMIT_MASK AMDVI_MMIO_DEVTAB_BASE_MASK
+#define AMDVI_MMIO_EXCL_LIMIT_MASK GENMASK64(51, 12)
#define AMDVI_MMIO_EXCL_LIMIT_LOW 0xfff
/* mmio control register flags */
@@ -109,6 +111,7 @@
#define AMDVI_MMIO_STATUS_CMDBUF_RUN (1 << 4)
#define AMDVI_MMIO_STATUS_EVT_RUN (1 << 3)
#define AMDVI_MMIO_STATUS_COMP_INT (1 << 2)
+#define AMDVI_MMIO_STATUS_EVENT_INT (1 << 1)
#define AMDVI_MMIO_STATUS_EVT_OVF (1 << 0)
#define AMDVI_CMDBUF_ID_BYTE 0x07
@@ -123,6 +126,10 @@
#define AMDVI_CMD_COMPLETE_PPR_REQUEST 0x07
#define AMDVI_CMD_INVAL_AMDVI_ALL 0x08
+
+#define AMDVI_CMD_INVAL_IOMMU_PAGES_S (1ULL << 0)
+#define AMDVI_INV_ALL_PAGES (1ULL << 52)
+
#define AMDVI_DEVTAB_ENTRY_SIZE 32
/* Device table entry bits 0:63 */
@@ -130,14 +137,14 @@
#define AMDVI_DEV_TRANSLATION_VALID (1ULL << 1)
#define AMDVI_DEV_MODE_MASK 0x7
#define AMDVI_DEV_MODE_RSHIFT 9
-#define AMDVI_DEV_PT_ROOT_MASK 0xffffffffff000
+#define AMDVI_DEV_PT_ROOT_MASK GENMASK64(51, 12)
#define AMDVI_DEV_PT_ROOT_RSHIFT 12
#define AMDVI_DEV_PERM_SHIFT 61
#define AMDVI_DEV_PERM_READ (1ULL << 61)
#define AMDVI_DEV_PERM_WRITE (1ULL << 62)
/* Device table entry bits 64:127 */
-#define AMDVI_DEV_DOMID_ID_MASK ((1ULL << 16) - 1)
+#define AMDVI_DEV_DOMID_ID_MASK GENMASK64(15, 0)
/* Event codes and flags, as stored in the info field */
#define AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY (0x1U << 12)
@@ -162,13 +169,55 @@
#define AMDVI_FEATURE_PC (1ULL << 9) /* Perf counters */
/* reserved DTE bits */
-#define AMDVI_DTE_LOWER_QUAD_RESERVED 0x80300000000000fc
-#define AMDVI_DTE_MIDDLE_QUAD_RESERVED 0x0000000000000100
-#define AMDVI_DTE_UPPER_QUAD_RESERVED 0x08f0000000000000
+#define AMDVI_DTE_QUAD0_RESERVED (GENMASK64(6, 2) | GENMASK64(63, 63))
+#define AMDVI_DTE_QUAD1_RESERVED 0
+#define AMDVI_DTE_QUAD2_RESERVED GENMASK64(53, 52)
+#define AMDVI_DTE_QUAD3_RESERVED (GENMASK64(14, 0) | GENMASK64(53, 48))
/* AMDVI paging mode */
#define AMDVI_GATS_MODE (2ULL << 12)
#define AMDVI_HATS_MODE (2ULL << 10)
+#define AMDVI_HATS_MODE_RESERVED (3ULL << 10)
+
+/* Page Table format */
+
+#define AMDVI_PTE_PR (1ULL << 0)
+#define AMDVI_PTE_NEXT_LEVEL_MASK GENMASK64(11, 9)
+
+#define IOMMU_PTE_PRESENT(pte) ((pte) & AMDVI_PTE_PR)
+
+/* Using level=0 for leaf PTE at 4K page size */
+#define PT_LEVEL_SHIFT(level) (12 + ((level) * 9))
+
+/* Return the IOVA bit group used to index the page table at a given level */
+#define PT_LEVEL_INDEX(level, iova) (((iova) >> PT_LEVEL_SHIFT(level)) & \
+ GENMASK64(8, 0))
+
+/* Return the max address for a specified level, i.e. max_oaddr */
+#define PT_LEVEL_MAX_ADDR(x) (((x) < 5) ? \
+ ((1ULL << PT_LEVEL_SHIFT((x + 1))) - 1) : \
+ (~(0ULL)))
+
+/* Extract the NextLevel field from PTE/PDE */
+#define PTE_NEXT_LEVEL(pte) (((pte) & AMDVI_PTE_NEXT_LEVEL_MASK) >> 9)
+
+/* Take a page table level and return the default page size for that level */
+#define PTE_LEVEL_PAGE_SIZE(level) (1ULL << (PT_LEVEL_SHIFT(level)))
+
+/*
+ * Return the address of the lower-level page table encoded in the PTE,
+ * offset by the IOVA bit group that indexes the table at the current level.
+ */
+#define NEXT_PTE_ADDR(pte, level, iova) (((pte) & AMDVI_DEV_PT_ROOT_MASK) + \
+ (PT_LEVEL_INDEX(level, iova) * 8))
+
+/*
+ * Take a PTE value with mode=0x07 and return the page size it encodes.
+ */
+#define PTE_LARGE_PAGE_SIZE(pte) (1ULL << (1 + cto64(((pte) | 0xfffULL))))
+
+/* Return number of PTEs to use for a given page size (expected power of 2) */
+#define PAGE_SIZE_PTE_COUNT(pgsz) (1ULL << ((ctz64(pgsz) - 12) % 9))
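Some hand-worked values for the page-table macros above (illustrative, not part of the patch):

/* For a 2 MiB large page:
 *   PT_LEVEL_SHIFT(1) == 21 and PTE_LEVEL_PAGE_SIZE(1) == 2 MiB;
 *   a mode-7 PTE with address bits [19:12] set and bit 20 clear gives
 *   cto64(pte | 0xfff) == 20, so PTE_LARGE_PAGE_SIZE(pte) == 1ULL << 21.
 * For a non-default 32 KiB region:
 *   PAGE_SIZE_PTE_COUNT(32 KiB) == 1ULL << ((15 - 12) % 9) == 8,
 *   i.e. eight replicated 4K-level PTEs cover it.
 */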
/* IOTLB */
#define AMDVI_IOTLB_MAX_SIZE 1024
@@ -194,20 +243,16 @@
#define AMDVI_PAGE_SIZE (1ULL << AMDVI_PAGE_SHIFT)
#define AMDVI_PAGE_SHIFT_4K 12
-#define AMDVI_PAGE_MASK_4K (~((1ULL << AMDVI_PAGE_SHIFT_4K) - 1))
+#define AMDVI_PAGE_MASK_4K GENMASK64(63, 12)
-#define AMDVI_MAX_VA_ADDR (48UL << 5)
-#define AMDVI_MAX_PH_ADDR (40UL << 8)
-#define AMDVI_MAX_GVA_ADDR (48UL << 15)
+#define AMDVI_MAX_GVA_ADDR (2UL << 5)
+#define AMDVI_MAX_PH_ADDR (40UL << 8)
+#define AMDVI_MAX_VA_ADDR (48UL << 15)
/* Completion Wait data size */
#define AMDVI_COMPLETION_DATA_SIZE 8
#define AMDVI_COMMAND_SIZE 16
-/* Completion Wait data size */
-#define AMDVI_COMPLETION_DATA_SIZE 8
-
-#define AMDVI_COMMAND_SIZE 16
#define AMDVI_INT_ADDR_FIRST 0xfee00000
#define AMDVI_INT_ADDR_LAST 0xfeefffff
@@ -228,7 +273,7 @@
#define AMDVI_IR_INTCTL_PASS 1
#define AMDVI_IR_INTCTL_REMAP 2
-#define AMDVI_IR_PHYS_ADDR_MASK (((1ULL << 45) - 1) << 6)
+#define AMDVI_IR_PHYS_ADDR_MASK GENMASK64(51, 6)
/* MSI data 10:0 bits (section 2.2.5.1 Fig 14) */
#define AMDVI_IRTE_OFFSET 0x7ff
@@ -323,7 +368,6 @@ struct AMDVIState {
uint64_t mmio_addr;
bool enabled; /* IOMMU enabled */
- bool ats_enabled; /* address translation enabled */
bool cmdbuf_enabled; /* command buffer enabled */
bool evtlog_enabled; /* event log enabled */
bool excl_enabled;
@@ -366,12 +410,18 @@ struct AMDVIState {
/* for each served device */
AMDVIAddressSpace **address_spaces[PCI_BUS_MAX];
+ /* list of address spaces with registered notifiers */
+ QLIST_HEAD(, AMDVIAddressSpace) amdvi_as_with_notifiers;
+
/* IOTLB */
GHashTable *iotlb;
/* Interrupt remapping */
bool ga_enabled;
bool xtsup;
+
+ /* DMA address translation */
+ bool dma_remap;
};
uint64_t amdvi_extended_feature_register(AMDVIState *s);
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 69d72ad..6a168d5 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -45,6 +45,8 @@
((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
#define VTD_CE_GET_PASID_DIR_TABLE(ce) \
((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)
+#define VTD_CE_GET_PRE(ce) \
+ ((ce)->val[0] & VTD_SM_CONTEXT_ENTRY_PRE)
/* pe operations */
#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
@@ -85,13 +87,6 @@ struct vtd_iotlb_key {
static void vtd_address_space_refresh_all(IntelIOMMUState *s);
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
-static void vtd_panic_require_caching_mode(void)
-{
- error_report("We need to set caching-mode=on for intel-iommu to enable "
- "device assignment with IOMMU protection.");
- exit(1);
-}
-
static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
uint64_t wmask, uint64_t w1cmask)
{
@@ -1838,6 +1833,7 @@ static const bool vtd_qualified_faults[] = {
[VTD_FR_FS_NON_CANONICAL] = true,
[VTD_FR_FS_PAGING_ENTRY_US] = true,
[VTD_FR_SM_WRITE] = true,
+ [VTD_FR_SM_PRE_ABS] = true,
[VTD_FR_SM_INTERRUPT_ADDR] = true,
[VTD_FR_FS_BIT_UPDATE_FAILED] = true,
[VTD_FR_MAX] = false,
@@ -1987,9 +1983,9 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
uint32_t pasid)
{
dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
- uint32_t level = vtd_get_iova_level(s, ce, pasid);
uint32_t offset;
uint64_t flpte, flag_ad = VTD_FL_A;
+ *flpte_level = vtd_get_iova_level(s, ce, pasid);
if (!vtd_iova_fl_check_canonical(s, iova, ce, pasid)) {
error_report_once("%s: detected non canonical IOVA (iova=0x%" PRIx64 ","
@@ -1998,11 +1994,11 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
}
while (true) {
- offset = vtd_iova_level_offset(iova, level);
+ offset = vtd_iova_level_offset(iova, *flpte_level);
flpte = vtd_get_pte(addr, offset);
if (flpte == (uint64_t)-1) {
- if (level == vtd_get_iova_level(s, ce, pasid)) {
+ if (*flpte_level == vtd_get_iova_level(s, ce, pasid)) {
/* Invalid programming of pasid-entry */
return -VTD_FR_PASID_ENTRY_FSPTPTR_INV;
} else {
@@ -2028,15 +2024,15 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
if (is_write && !(flpte & VTD_FL_RW)) {
return -VTD_FR_SM_WRITE;
}
- if (vtd_flpte_nonzero_rsvd(flpte, level)) {
+ if (vtd_flpte_nonzero_rsvd(flpte, *flpte_level)) {
error_report_once("%s: detected flpte reserved non-zero "
"iova=0x%" PRIx64 ", level=0x%" PRIx32
"flpte=0x%" PRIx64 ", pasid=0x%" PRIX32 ")",
- __func__, iova, level, flpte, pasid);
+ __func__, iova, *flpte_level, flpte, pasid);
return -VTD_FR_FS_PAGING_ENTRY_RSVD;
}
- if (vtd_is_last_pte(flpte, level) && is_write) {
+ if (vtd_is_last_pte(flpte, *flpte_level) && is_write) {
flag_ad |= VTD_FL_D;
}
@@ -2044,14 +2040,13 @@ static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
return -VTD_FR_FS_BIT_UPDATE_FAILED;
}
- if (vtd_is_last_pte(flpte, level)) {
+ if (vtd_is_last_pte(flpte, *flpte_level)) {
*flptep = flpte;
- *flpte_level = level;
return 0;
}
addr = vtd_get_pte_addr(flpte, aw_bits);
- level--;
+ (*flpte_level)--;
}
}
@@ -2092,7 +2087,8 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
uint8_t bus_num = pci_bus_num(bus);
VTDContextCacheEntry *cc_entry;
uint64_t pte, page_mask;
- uint32_t level, pasid = vtd_as->pasid;
+ uint32_t level = UINT32_MAX;
+ uint32_t pasid = vtd_as->pasid;
uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn);
int ret_fr;
bool is_fpd_set = false;
@@ -2251,14 +2247,19 @@ out:
entry->iova = addr & page_mask;
entry->translated_addr = vtd_get_pte_addr(pte, s->aw_bits) & page_mask;
entry->addr_mask = ~page_mask;
- entry->perm = access_flags;
+ entry->perm = (is_write ? access_flags : (access_flags & (~IOMMU_WO)));
return true;
error:
vtd_iommu_unlock(s);
entry->iova = 0;
entry->translated_addr = 0;
- entry->addr_mask = 0;
+    /*
+     * Set the mask for ATS (the range must be present even when the
+     * translation fails: PCIe rev 5, 10.2.3.5)
+     */
+ entry->addr_mask = (level != UINT32_MAX) ?
+ (~vtd_pt_level_page_mask(level)) : (~VTD_PAGE_MASK_4K);
entry->perm = IOMMU_NONE;
return false;
}
@@ -2503,6 +2504,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
.translated_addr = 0,
.addr_mask = size - 1,
.perm = IOMMU_NONE,
+ .pasid = vtd_as->pasid,
},
};
memory_region_notify_iommu(&vtd_as->iommu, 0, event);
@@ -2695,7 +2697,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s)
uint32_t changed = status ^ val;
trace_vtd_reg_write_gcmd(status, val);
- if ((changed & VTD_GCMD_TE) && s->dma_translation) {
+ if ((changed & VTD_GCMD_TE) && x86_iommu->dma_translation) {
/* Translation enable/disable */
vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
}
@@ -2822,6 +2824,7 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
{
uint64_t mask[4] = {VTD_INV_DESC_WAIT_RSVD_LO, VTD_INV_DESC_WAIT_RSVD_HI,
VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE};
+ bool ret = true;
if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, false,
__func__, "wait")) {
@@ -2833,8 +2836,6 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
uint32_t status_data = (uint32_t)(inv_desc->lo >>
VTD_INV_DESC_WAIT_DATA_SHIFT);
- assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
-
/* FIXME: need to be masked with HAW? */
dma_addr_t status_addr = inv_desc->hi;
trace_vtd_inv_desc_wait_sw(status_addr, status_data);
@@ -2843,18 +2844,28 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
&status_data, sizeof(status_data),
MEMTXATTRS_UNSPECIFIED)) {
trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
- return false;
+ ret = false;
}
- } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
+ }
+
+ if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
/* Interrupt flag */
vtd_generate_completion_event(s);
- } else {
+ }
+
+ /*
+ * SW=0, IF=0, FN=1 is also a valid descriptor (VT-d 7.10)
+ * Nothing to do as we process the descriptors in order
+ */
+
+ if (!(inv_desc->lo & (VTD_INV_DESC_WAIT_IF | VTD_INV_DESC_WAIT_SW |
+ VTD_INV_DESC_WAIT_FN))) {
error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
" (unknown type)", __func__, inv_desc->hi,
inv_desc->lo);
return false;
}
- return true;
+ return ret;
}
static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
@@ -3090,6 +3101,7 @@ static void do_invalidate_device_tlb(VTDAddressSpace *vtd_dev_as,
event.entry.iova = addr;
event.entry.perm = IOMMU_NONE;
event.entry.translated_addr = 0;
+ event.entry.pasid = vtd_dev_as->pasid;
memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event);
}
@@ -3136,6 +3148,59 @@ static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
return true;
}
+static bool vtd_process_page_group_response_desc(IntelIOMMUState *s,
+ VTDInvDesc *inv_desc)
+{
+ VTDAddressSpace *vtd_dev_as;
+ bool pasid_present;
+ uint8_t response_code;
+ uint16_t rid;
+ uint32_t pasid;
+ uint16_t prgi;
+ IOMMUPRIResponse response;
+
+ if ((inv_desc->lo & VTD_INV_DESC_PGRESP_RSVD_LO) ||
+ (inv_desc->hi & VTD_INV_DESC_PGRESP_RSVD_HI)) {
+ error_report_once("%s: invalid page group response desc: hi=%"PRIx64
+ ", lo=%"PRIx64" (reserved nonzero)", __func__,
+ inv_desc->hi, inv_desc->lo);
+ return false;
+ }
+
+ pasid_present = VTD_INV_DESC_PGRESP_PP(inv_desc->lo);
+ response_code = VTD_INV_DESC_PGRESP_RC(inv_desc->lo);
+ rid = VTD_INV_DESC_PGRESP_RID(inv_desc->lo);
+ pasid = VTD_INV_DESC_PGRESP_PASID(inv_desc->lo);
+ prgi = VTD_INV_DESC_PGRESP_PRGI(inv_desc->hi);
+
+ if (!pasid_present) {
+ error_report_once("Page group response without PASID is"
+ "not supported yet");
+ return false;
+ }
+
+ vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, rid, pasid);
+ if (!vtd_dev_as) {
+ return true;
+ }
+
+ response.prgi = prgi;
+
+ if (response_code == 0x0u) {
+ response.response_code = IOMMU_PRI_RESP_SUCCESS;
+ } else if (response_code == 0x1u) {
+ response.response_code = IOMMU_PRI_RESP_INVALID_REQUEST;
+ } else {
+ response.response_code = IOMMU_PRI_RESP_FAILURE;
+ }
+
+ if (vtd_dev_as->pri_notifier) {
+ vtd_dev_as->pri_notifier->notify(vtd_dev_as->pri_notifier, &response);
+ }
+
+ return true;
+}
+
static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
VTDInvDesc *inv_desc)
{
@@ -3236,6 +3301,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
}
break;
+ case VTD_INV_DESC_PGRESP:
+ trace_vtd_inv_desc("page group response", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_page_group_response_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
/*
* TODO: the entity of below two cases will be implemented in future series.
* To make guest (which integrates scalable mode support patch set in
@@ -3370,6 +3442,27 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
}
}
+static void vtd_handle_prs_write(IntelIOMMUState *s)
+{
+ uint32_t prs = vtd_get_long_raw(s, DMAR_PRS_REG);
+ if (!(prs & VTD_PR_STATUS_PPR) && !(prs & VTD_PR_STATUS_PRO)) {
+ vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+ }
+}
+
+static void vtd_handle_pectl_write(IntelIOMMUState *s)
+{
+ uint32_t pectl = vtd_get_long_raw(s, DMAR_PECTL_REG);
+ if ((pectl & VTD_PR_PECTL_IP) && !(pectl & VTD_PR_PECTL_IM)) {
+ /*
+ * If IP field was 1 when software clears the IM field,
+ * the interrupt is generated along with clearing the IP field.
+ */
+ vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+ vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG);
+ }
+}
+
static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
{
IntelIOMMUState *s = opaque;
@@ -3412,6 +3505,11 @@ static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
val = s->iq >> 32;
break;
+ case DMAR_PEUADDR_REG:
+ assert(size == 4);
+ val = vtd_get_long_raw(s, DMAR_PEUADDR_REG);
+ break;
+
default:
if (size == 4) {
val = vtd_get_long(s, addr);
@@ -3475,6 +3573,11 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
vtd_handle_iotlb_write(s);
break;
+ case DMAR_PEUADDR_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ break;
+
/* Invalidate Address Register, 64-bit */
case DMAR_IVA_REG:
if (size == 4) {
@@ -3655,6 +3758,18 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
vtd_set_long(s, addr, val);
break;
+ case DMAR_PRS_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_prs_write(s);
+ break;
+
+ case DMAR_PECTL_REG:
+ assert(size == 4);
+ vtd_set_long(s, addr, val);
+ vtd_handle_pectl_write(s);
+ break;
+
default:
if (size == 4) {
vtd_set_long(s, addr, val);
@@ -3672,6 +3787,7 @@ static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
IOMMUTLBEntry iotlb = {
/* We'll fill in the rest later. */
.target_as = &address_space_memory,
+ .pasid = vtd_as->pasid,
};
bool success;
@@ -3824,7 +3940,6 @@ static const Property vtd_properties[] = {
DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
- DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true),
DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false),
DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true),
};
@@ -4367,6 +4482,12 @@ static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
assert(hiod);
+ if (!s->caching_mode) {
+ error_setg(errp, "Device assignment is not allowed without enabling "
+ "caching-mode=on for Intel IOMMU.");
+ return false;
+ }
+
vtd_iommu_lock(s);
if (g_hash_table_lookup(s->vtd_host_iommu_dev, &key)) {
@@ -4538,11 +4659,11 @@ static void vtd_cap_init(IntelIOMMUState *s)
s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
- VTD_CAP_MGAW(s->aw_bits);
+ VTD_CAP_ESRTPS | VTD_CAP_MGAW(s->aw_bits);
if (s->dma_drain) {
s->cap |= VTD_CAP_DRAIN;
}
- if (s->dma_translation) {
+ if (x86_iommu->dma_translation) {
if (s->aw_bits >= VTD_HOST_AW_39BIT) {
s->cap |= VTD_CAP_SAGAW_39bit;
}
@@ -4587,7 +4708,7 @@ static void vtd_cap_init(IntelIOMMUState *s)
}
if (s->pasid) {
- s->ecap |= VTD_ECAP_PASID;
+ s->ecap |= VTD_ECAP_PASID | VTD_ECAP_PSS;
}
}
@@ -4705,6 +4826,18 @@ static void vtd_init(IntelIOMMUState *s)
* Interrupt remapping registers.
*/
vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
+
+ /* Page request registers */
+ if (s->ecap & VTD_ECAP_PRS) {
+ vtd_define_quad(s, DMAR_PQH_REG, 0, 0x7ffe0ULL, 0);
+ vtd_define_quad(s, DMAR_PQT_REG, 0, 0x7ffe0ULL, 0);
+ vtd_define_quad(s, DMAR_PQA_REG, 0, 0xfffffffffffff007ULL, 0);
+ vtd_define_long(s, DMAR_PRS_REG, 0, 0, 0x3UL);
+ vtd_define_long(s, DMAR_PECTL_REG, 0, 0x80000000UL, 0);
+ vtd_define_long(s, DMAR_PEDATA_REG, 0, 0xffffUL, 0);
+ vtd_define_long(s, DMAR_PEADDR_REG, 0, 0xfffffffcUL, 0);
+ vtd_define_long(s, DMAR_PEUADDR_REG, 0, 0xffffffffUL, 0);
+ }
}
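As a reading aid (assuming vtd_define_long() takes value, writable mask, and write-1-to-clear mask, matching the existing vtd_define_quad() helper):

/* DMAR_PRS_REG:   wmask 0, w1cmask 0x3 -> PPR/PRO are software-RW1C.
 * DMAR_PECTL_REG: wmask 0x80000000     -> only IM (bit 31) is writable;
 *                 IP (bit 30) is driven by the emulation itself.
 */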
/* Should not reset address_spaces when reset because devices will still use
@@ -4730,10 +4863,329 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &vtd_as->as;
}
+static IOMMUTLBEntry vtd_iommu_ats_do_translate(IOMMUMemoryRegion *iommu,
+ hwaddr addr,
+ IOMMUAccessFlags flags)
+{
+ IOMMUTLBEntry entry;
+ VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+
+ if (vtd_is_interrupt_addr(addr)) {
+ vtd_report_ir_illegal_access(vtd_as, addr, flags & IOMMU_WO);
+ entry.target_as = &address_space_memory;
+ entry.iova = 0;
+ entry.translated_addr = 0;
+ entry.addr_mask = ~VTD_PAGE_MASK_4K;
+ entry.perm = IOMMU_NONE;
+ entry.pasid = PCI_NO_PASID;
+ } else {
+ entry = vtd_iommu_translate(iommu, addr, flags, 0);
+ }
+
+ return entry;
+}
+
+static ssize_t vtd_ats_request_translation(PCIBus *bus, void *opaque,
+ int devfn, uint32_t pasid,
+ bool priv_req, bool exec_req,
+ hwaddr addr, size_t length,
+ bool no_write, IOMMUTLBEntry *result,
+ size_t result_length,
+ uint32_t *err_count)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+ IOMMUAccessFlags flags = IOMMU_ACCESS_FLAG_FULL(true, !no_write, exec_req,
+ priv_req, false, false);
+ ssize_t res_index = 0;
+ hwaddr target_address = addr + length;
+ IOMMUTLBEntry entry;
+
+ vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+ *err_count = 0;
+
+ while ((addr < target_address) && (res_index < result_length)) {
+ entry = vtd_iommu_ats_do_translate(&vtd_as->iommu, addr, flags);
+ entry.perm &= ~IOMMU_GLOBAL; /* Spec 4.1.2: Global Mapping never set */
+
+ if ((entry.perm & flags) != flags) {
+ *err_count += 1; /* Less than expected */
+ }
+
+ result[res_index] = entry;
+ res_index += 1;
+ addr = (addr & (~entry.addr_mask)) + (entry.addr_mask + 1);
+ }
+
+ /* Buffer too small */
+ if (addr < target_address) {
+ return -ENOMEM;
+ }
+
+ return res_index;
+}
+
+/* 11.4.11.3: The number of entries in the page request queue is 2^(PQS + 7) */
+static inline uint64_t vtd_prq_size(IntelIOMMUState *s)
+{
+ return 1ULL << ((vtd_get_quad(s, DMAR_PQA_REG) & VTD_PQA_SIZE) + 7);
+}
+
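A worked value for the queue size (hand-computed):

/* With PQS == 0 the queue holds 1ULL << 7 == 128 descriptors; at
 * VTD_PQA_ENTRY_SIZE == 32 bytes each, that is exactly one 4 KiB page.
 */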
+/**
+ * Return true if the context entry's PRE bit is accessible and set, false otherwise
+ */
+static bool vtd_check_pre_bit(VTDAddressSpace *vtd_as, hwaddr addr,
+ uint16_t sid, bool is_write)
+{
+ int ret;
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ uint8_t bus_n = pci_bus_num(vtd_as->bus);
+ VTDContextEntry ce;
+ bool is_fpd_set = false;
+
+ ret = vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce);
+
+ if (ret) {
+ goto error_report;
+ }
+
+ if (!VTD_CE_GET_PRE(&ce)) {
+ ret = -VTD_FR_SM_PRE_ABS;
+ goto error_get_fpd_and_report;
+ }
+
+ return true;
+
+error_get_fpd_and_report:
+ /* Try to get fpd (may not work but we are already on an error path) */
+ is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
+ vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid);
+error_report:
+ vtd_report_fault(s, -ret, is_fpd_set, sid, addr, is_write,
+ vtd_as->pasid != PCI_NO_PASID, vtd_as->pasid);
+ return false;
+}
+
+/* Logic described in VT-d section 7.5 */
+static void vtd_generate_page_request_event(IntelIOMMUState *s,
+ uint32_t old_pr_status)
+{
+ uint32_t current_pectl = vtd_get_long(s, DMAR_PECTL_REG);
+ /*
+ * Hardware evaluates PPR and PRO fields in the Page Request Status Register
+     * and if either of them is set, a Page Request Event is not generated
+ */
+ if (old_pr_status & (VTD_PR_STATUS_PRO | VTD_PR_STATUS_PPR)) {
+ return;
+ }
+
+ vtd_set_clear_mask_long(s, DMAR_PECTL_REG, 0, VTD_PR_PECTL_IP);
+ if (!(current_pectl & VTD_PR_PECTL_IM)) {
+ vtd_set_clear_mask_long(s, DMAR_PECTL_REG, VTD_PR_PECTL_IP, 0);
+ vtd_generate_interrupt(s, DMAR_PEADDR_REG, DMAR_PEDATA_REG);
+ }
+}
+
+/* When calling this function, we know that we are in scalable mode */
+static int vtd_pri_perform_implicit_invalidation(VTDAddressSpace *vtd_as,
+ hwaddr addr)
+{
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ VTDContextEntry ce;
+ VTDPASIDEntry pe;
+ uint16_t pgtt;
+ uint16_t domain_id;
+ int ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce);
+ if (ret) {
+ return -EINVAL;
+ }
+ ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe, vtd_as->pasid);
+ if (ret) {
+ return -EINVAL;
+ }
+ pgtt = VTD_PE_GET_TYPE(&pe);
+ domain_id = VTD_SM_PASID_ENTRY_DID(pe.val[1]);
+ ret = 0;
+ switch (pgtt) {
+ case VTD_SM_PASID_ENTRY_FLT:
+ vtd_piotlb_page_invalidate(s, domain_id, vtd_as->pasid, addr, 0);
+ break;
+ /* Room for other pgtt values */
+ default:
+ error_report_once("Translation type not supported yet : %d", pgtt);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+/* Page Request Descriptor: 7.4.1.1 */
+static int vtd_pri_request_page(PCIBus *bus, void *opaque, int devfn,
+ uint32_t pasid, bool priv_req, bool exec_req,
+ hwaddr addr, bool lpig, uint16_t prgi,
+ bool is_read, bool is_write)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+
+ vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+
+ uint64_t queue_addr_reg = vtd_get_quad(s, DMAR_PQA_REG);
+ uint64_t queue_tail_offset_reg = vtd_get_quad(s, DMAR_PQT_REG);
+ uint64_t new_queue_tail_offset = (
+ (queue_tail_offset_reg + VTD_PQA_ENTRY_SIZE) %
+ (vtd_prq_size(s) * VTD_PQA_ENTRY_SIZE));
+ uint64_t queue_head_offset_reg = vtd_get_quad(s, DMAR_PQH_REG);
+ hwaddr queue_tail = (queue_addr_reg & VTD_PQA_ADDR) + queue_tail_offset_reg;
+ uint32_t old_pr_status = vtd_get_long(s, DMAR_PRS_REG);
+ uint16_t sid = PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn);
+ VTDPRDesc desc;
+
+ if (!(s->ecap & VTD_ECAP_PRS)) {
+ return -EPERM;
+ }
+
+ /*
+     * No need to check if scalable mode is enabled as we already know that
+ * VTD_ECAP_PRS is set (see vtd_decide_config)
+ */
+
+ /* We do not support PRI without PASID */
+ if (vtd_as->pasid == PCI_NO_PASID) {
+ return -EPERM;
+ }
+ if (exec_req && !is_read) {
+ return -EINVAL;
+ }
+
+ /* Check PRE bit in the scalable mode context entry */
+ if (!vtd_check_pre_bit(vtd_as, addr, sid, is_write)) {
+ return -EPERM;
+ }
+
+ if (old_pr_status & VTD_PR_STATUS_PRO) {
+ /*
+ * No action is taken by hardware to report a fault
+ * or generate an event
+ */
+ return -ENOSPC;
+ }
+
+ /* Check for overflow */
+ if (new_queue_tail_offset == queue_head_offset_reg) {
+ vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PRO);
+ vtd_generate_page_request_event(s, old_pr_status);
+ return -ENOSPC;
+ }
+
+ if (vtd_pri_perform_implicit_invalidation(vtd_as, addr)) {
+ return -EINVAL;
+ }
+
+ desc.lo = VTD_PRD_TYPE | VTD_PRD_PP(true) | VTD_PRD_RID(sid) |
+ VTD_PRD_PASID(vtd_as->pasid) | VTD_PRD_PMR(priv_req);
+ desc.hi = VTD_PRD_RDR(is_read) | VTD_PRD_WRR(is_write) |
+ VTD_PRD_LPIG(lpig) | VTD_PRD_PRGI(prgi) | VTD_PRD_ADDR(addr);
+
+ desc.lo = cpu_to_le64(desc.lo);
+ desc.hi = cpu_to_le64(desc.hi);
+ if (dma_memory_write(&address_space_memory, queue_tail, &desc, sizeof(desc),
+ MEMTXATTRS_UNSPECIFIED)) {
+ error_report_once("IO error, the PQ tail cannot be updated");
+ return -EIO;
+ }
+
+ /* increment the tail register and set the pending request bit */
+ vtd_set_quad(s, DMAR_PQT_REG, new_queue_tail_offset);
+    /*
+     * Read the status again so that the kernel does not miss a request.
+     * In some cases this can trigger an unnecessary interrupt, but the
+     * strategy drastically improves performance as we don't need to take
+     * a lock.
+     */
+ old_pr_status = vtd_get_long(s, DMAR_PRS_REG);
+ if (!(old_pr_status & VTD_PR_STATUS_PPR)) {
+ vtd_set_clear_mask_long(s, DMAR_PRS_REG, 0, VTD_PR_STATUS_PPR);
+ vtd_generate_page_request_event(s, old_pr_status);
+ }
+
+ return 0;
+}
+
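The tail update above implements a circular queue; a quick hand-worked wrap case (illustrative):

/* With a 128-entry queue (4 KiB) and the tail in the last slot:
 *   queue_tail_offset_reg = 127 * 32 = 4064
 *   new_queue_tail_offset = (4064 + 32) % 4096 = 0   (wraps to the start)
 * If the head is also at offset 0, the queue is full and PRO is raised.
 */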
+static void vtd_init_iotlb_notifier(PCIBus *bus, void *opaque, int devfn,
+ IOMMUNotifier *n, IOMMUNotify fn,
+ void *user_opaque)
+{
+ n->opaque = user_opaque;
+ iommu_notifier_init(n, fn, IOMMU_NOTIFIER_DEVIOTLB_EVENTS, 0,
+ HWADDR_MAX, 0);
+}
+
+static void vtd_get_iotlb_info(void *opaque, uint8_t *addr_width,
+ uint32_t *min_page_size)
+{
+ IntelIOMMUState *s = opaque;
+
+ *addr_width = s->aw_bits;
+ *min_page_size = VTD_PAGE_SIZE;
+}
+
+static void vtd_register_iotlb_notifier(PCIBus *bus, void *opaque,
+ int devfn, uint32_t pasid,
+ IOMMUNotifier *n)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+
+ vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+ memory_region_register_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n,
+ &error_fatal);
+}
+
+static void vtd_unregister_iotlb_notifier(PCIBus *bus, void *opaque,
+ int devfn, uint32_t pasid,
+ IOMMUNotifier *n)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+
+ vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+ memory_region_unregister_iommu_notifier(MEMORY_REGION(&vtd_as->iommu), n);
+}
+
+static void vtd_pri_register_notifier(PCIBus *bus, void *opaque, int devfn,
+ uint32_t pasid, IOMMUPRINotifier *notifier)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+
+ vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+ vtd_as->pri_notifier = notifier;
+}
+
+static void vtd_pri_unregister_notifier(PCIBus *bus, void *opaque,
+ int devfn, uint32_t pasid)
+{
+ IntelIOMMUState *s = opaque;
+ VTDAddressSpace *vtd_as;
+
+ vtd_as = vtd_find_add_as(s, bus, devfn, pasid);
+ vtd_as->pri_notifier = NULL;
+}
+
static PCIIOMMUOps vtd_iommu_ops = {
.get_address_space = vtd_host_dma_iommu,
.set_iommu_device = vtd_dev_set_iommu_device,
.unset_iommu_device = vtd_dev_unset_iommu_device,
+ .get_iotlb_info = vtd_get_iotlb_info,
+ .init_iotlb_notifier = vtd_init_iotlb_notifier,
+ .register_iotlb_notifier = vtd_register_iotlb_notifier,
+ .unregister_iotlb_notifier = vtd_unregister_iotlb_notifier,
+ .ats_request_translation = vtd_ats_request_translation,
+ .pri_register_notifier = vtd_pri_register_notifier,
+ .pri_unregister_notifier = vtd_pri_unregister_notifier,
+ .pri_request_page = vtd_pri_request_page,
};
static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
@@ -4791,32 +5243,6 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
return true;
}
-static int vtd_machine_done_notify_one(Object *child, void *unused)
-{
- IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
-
- /*
- * We hard-coded here because vfio-pci is the only special case
- * here. Let's be more elegant in the future when we can, but so
- * far there seems to be no better way.
- */
- if (object_dynamic_cast(child, "vfio-pci") && !iommu->caching_mode) {
- vtd_panic_require_caching_mode();
- }
-
- return 0;
-}
-
-static void vtd_machine_done_hook(Notifier *notifier, void *unused)
-{
- object_child_foreach_recursive(object_get_root(),
- vtd_machine_done_notify_one, NULL);
-}
-
-static Notifier vtd_machine_done_notify = {
- .notify = vtd_machine_done_hook,
-};
-
static void vtd_realize(DeviceState *dev, Error **errp)
{
MachineState *ms = MACHINE(qdev_get_machine());
@@ -4871,7 +5297,6 @@ static void vtd_realize(DeviceState *dev, Error **errp)
pci_setup_iommu(bus, &vtd_iommu_ops, dev);
/* Pseudo address space under root PCI bus. */
x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
- qemu_add_machine_init_done_notifier(&vtd_machine_done_notify);
}
static void vtd_class_init(ObjectClass *klass, const void *data)
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index e8b211e..0f6a123 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -190,8 +190,10 @@
#define VTD_ECAP_EIM (1ULL << 4)
#define VTD_ECAP_PT (1ULL << 6)
#define VTD_ECAP_SC (1ULL << 7)
+#define VTD_ECAP_PRS (1ULL << 29)
#define VTD_ECAP_MHMV (15ULL << 20)
#define VTD_ECAP_SRS (1ULL << 31)
+#define VTD_ECAP_PSS (7ULL << 35) /* limit: MemTxAttrs::pid */
#define VTD_ECAP_PASID (1ULL << 40)
#define VTD_ECAP_SMTS (1ULL << 43)
#define VTD_ECAP_SLTS (1ULL << 46)
@@ -213,6 +215,7 @@
#define VTD_CAP_DRAIN_WRITE (1ULL << 54)
#define VTD_CAP_DRAIN_READ (1ULL << 55)
#define VTD_CAP_FS1GP (1ULL << 56)
+#define VTD_CAP_ESRTPS (1ULL << 63)
#define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
#define VTD_CAP_CM (1ULL << 7)
#define VTD_PASID_ID_SHIFT 20
@@ -313,6 +316,8 @@ typedef enum VTDFaultReason {
* request while disabled */
VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */
+ VTD_FR_SM_PRE_ABS = 0x47, /* SCT.8 : PRE bit in a present SM CE is 0 */
+
/* PASID directory entry access failure */
VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
/* The Present(P) field of pasid directory entry is 0 */
@@ -375,6 +380,18 @@ union VTDInvDesc {
};
typedef union VTDInvDesc VTDInvDesc;
+/* Page Request Descriptor */
+union VTDPRDesc {
+ struct {
+ uint64_t lo;
+ uint64_t hi;
+ };
+ struct {
+ uint64_t val[4];
+ };
+};
+typedef union VTDPRDesc VTDPRDesc;
+
/* Masks for struct VTDInvDesc */
#define VTD_INV_DESC_ALL_ONE -1ULL
#define VTD_INV_DESC_TYPE(val) ((((val) >> 5) & 0x70ULL) | \
@@ -388,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc;
#define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
#define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
+#define VTD_INV_DESC_PGRESP 0x9 /* Page Group Response Desc */
#define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */
/* Masks for Invalidation Wait Descriptor*/
@@ -439,6 +457,15 @@ typedef union VTDInvDesc VTDInvDesc;
#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0 0xfff000000000f000ULL
#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1 0x7feULL
+/* Mask for Page Group Response Descriptor */
+#define VTD_INV_DESC_PGRESP_RSVD_HI 0xfffffffffffff003ULL
+#define VTD_INV_DESC_PGRESP_RSVD_LO 0xfff00000000001e0ULL
+#define VTD_INV_DESC_PGRESP_PP(val) (((val) >> 4) & 0x1ULL)
+#define VTD_INV_DESC_PGRESP_RC(val) (((val) >> 12) & 0xfULL)
+#define VTD_INV_DESC_PGRESP_RID(val) (((val) >> 16) & 0xffffULL)
+#define VTD_INV_DESC_PGRESP_PASID(val) (((val) >> 32) & 0xfffffULL)
+#define VTD_INV_DESC_PGRESP_PRGI(val) (((val) >> 3) & 0x1ffULL)
+
/* Rsvd field masks for spte */
#define VTD_SPTE_SNP 0x800ULL
@@ -490,6 +517,31 @@ typedef union VTDInvDesc VTDInvDesc;
#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL
#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+/* Page Request Descriptor */
+/* For the low 64 bits of the 128-bit descriptor */
+#define VTD_PRD_TYPE (1ULL)
+#define VTD_PRD_PP(val) (((val) & 1ULL) << 8)
+#define VTD_PRD_RID(val) (((val) & 0xffffULL) << 16)
+#define VTD_PRD_PASID(val) (((val) & 0xfffffULL) << 32)
+#define VTD_PRD_EXR(val) (((val) & 1ULL) << 52)
+#define VTD_PRD_PMR(val) (((val) & 1ULL) << 53)
+/* For the high 64 bits of the 128-bit descriptor */
+#define VTD_PRD_RDR(val) ((val) & 1ULL)
+#define VTD_PRD_WRR(val) (((val) & 1ULL) << 1)
+#define VTD_PRD_LPIG(val) (((val) & 1ULL) << 2)
+#define VTD_PRD_PRGI(val) (((val) & 0x1ffULL) << 3)
+#define VTD_PRD_ADDR(val) ((val) & 0xfffffffffffff000ULL)
+
+/* Page Request Queue constants */
+#define VTD_PQA_ENTRY_SIZE 32 /* Size of an entry in bytes */
+/* Page Request Queue masks */
+#define VTD_PQA_ADDR 0xfffffffffffff000ULL /* PR queue address */
+#define VTD_PQA_SIZE 0x7ULL /* PR queue size */
+#define VTD_PR_STATUS_PPR 1UL /* Pending page request */
+#define VTD_PR_STATUS_PRO 2UL /* Page request overflow */
+#define VTD_PR_PECTL_IP 0x40000000UL /* PR control interrupt pending */
+#define VTD_PR_PECTL_IM 0x80000000UL /* PR control interrupt mask */
+
/* Information about page-selective IOTLB invalidate */
struct VTDIOTLBPageInvInfo {
uint16_t domain_id;
@@ -549,6 +601,7 @@ typedef struct VTDRootEntry VTDRootEntry;
#define VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK 0xfffff
#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(aw) (0x1e0ULL | ~VTD_HAW_MASK(aw))
#define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL
+#define VTD_SM_CONTEXT_ENTRY_PRE 0x10ULL
/* PASID Table Related Definitions */
#define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL)
diff --git a/hw/i386/isapc.c b/hw/i386/isapc.c
new file mode 100644
index 0000000..44f4a44
--- /dev/null
+++ b/hw/i386/isapc.c
@@ -0,0 +1,189 @@
+/*
+ * QEMU PC System Emulator
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/units.h"
+#include "qemu/error-report.h"
+#include "hw/char/parallel-isa.h"
+#include "hw/dma/i8257.h"
+#include "hw/i386/pc.h"
+#include "hw/ide/isa.h"
+#include "hw/ide/ide-bus.h"
+#include "system/kvm.h"
+#include "hw/i386/kvm/clock.h"
+#include "hw/xen/xen-x86.h"
+#include "system/xen.h"
+#include "hw/rtc/mc146818rtc.h"
+#include "target/i386/cpu.h"
+
+static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 };
+static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 };
+static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
+
+
+static void pc_init_isa(MachineState *machine)
+{
+ PCMachineState *pcms = PC_MACHINE(machine);
+ PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
+ X86MachineState *x86ms = X86_MACHINE(machine);
+ MemoryRegion *system_memory = get_system_memory();
+ MemoryRegion *system_io = get_system_io();
+ ISABus *isa_bus;
+ uint32_t irq;
+ GSIState *gsi_state;
+ MemoryRegion *ram_memory;
+ DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
+ int i;
+
+ /*
+ * There is a small chance that someone unintentionally passes "-cpu max"
+ * for the isapc machine, which will provide a much more modern 32-bit
+ * CPU than would be expected for an ISA-era PC. If the "max" cpu type has
+ * been specified, choose the "best" 32-bit cpu possible which we consider
+ * be the pentium3 (deliberately choosing an Intel CPU given that the
+ * default 486 CPU for the isapc machine is also an Intel CPU).
+ */
+ if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("max"))) {
+ machine->cpu_type = X86_CPU_TYPE_NAME("pentium3");
+ warn_report("-cpu max is invalid for isapc machine, using pentium3");
+ }
+
+ /*
+ * Similarly if someone unintentionally passes "-cpu host" for the isapc
+ * machine then display a warning and also switch to the "best" 32-bit
+ * cpu possible which we consider to be the pentium3. This is because any
+     * host CPU will already be more modern than this, but it also ensures any
+ * newer CPU flags/features are filtered out for older guests.
+ */
+ if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("host"))) {
+ machine->cpu_type = X86_CPU_TYPE_NAME("pentium3");
+ warn_report("-cpu host is invalid for isapc machine, using pentium3");
+ }
+
+ if (machine->ram_size > 3.5 * GiB) {
+ error_report("Too much memory for this machine: %" PRId64 " MiB, "
+ "maximum 3584 MiB", machine->ram_size / MiB);
+ exit(1);
+ }
+
+ /*
+ * There is no RAM split for the isapc machine
+ */
+ if (xen_enabled()) {
+ xen_hvm_init_pc(pcms, &ram_memory);
+ } else {
+ ram_memory = machine->ram;
+
+ pcms->max_ram_below_4g = 3.5 * GiB;
+ x86ms->above_4g_mem_size = 0;
+ x86ms->below_4g_mem_size = machine->ram_size;
+ }
+
+ x86_cpus_init(x86ms, pcmc->default_cpu_version);
+
+ if (kvm_enabled()) {
+ kvmclock_create(pcmc->kvmclock_create_always);
+ }
+
+ /* allocate ram and load rom/bios */
+ if (!xen_enabled()) {
+ pc_memory_init(pcms, system_memory, system_memory, 0);
+ } else {
+ assert(machine->ram_size == x86ms->below_4g_mem_size +
+ x86ms->above_4g_mem_size);
+
+ if (machine->kernel_filename != NULL) {
+ /* For xen HVM direct kernel boot, load linux here */
+ xen_load_linux(pcms);
+ }
+ }
+
+ gsi_state = pc_gsi_create(&x86ms->gsi, false);
+
+ isa_bus = isa_bus_new(NULL, system_memory, system_io,
+ &error_abort);
+ isa_bus_register_input_irqs(isa_bus, x86ms->gsi);
+
+ x86ms->rtc = isa_new(TYPE_MC146818_RTC);
+ qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000);
+ isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal);
+ irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq",
+ &error_fatal);
+ isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq);
+
+ i8257_dma_init(OBJECT(machine), isa_bus, 0);
+ pcms->hpet_enabled = false;
+
+ if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
+ pc_i8259_create(isa_bus, gsi_state->i8259_irq);
+ }
+
+ if (tcg_enabled()) {
+ x86_register_ferr_irq(x86ms->gsi[13]);
+ }
+
+ pc_vga_init(isa_bus, NULL);
+
+ /* init basic PC hardware */
+ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc,
+ !MACHINE_CLASS(pcmc)->no_floppy, 0x4);
+
+ pc_nic_init(pcmc, isa_bus, NULL);
+
+ ide_drive_get(hd, ARRAY_SIZE(hd));
+ for (i = 0; i < MAX_IDE_BUS; i++) {
+ ISADevice *dev;
+ char busname[] = "ide.0";
+ dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i],
+ ide_irq[i],
+ hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]);
+ /*
+ * The ide bus name is ide.0 for the first bus and ide.1 for the
+ * second one.
+ */
+ busname[4] = '0' + i;
+ pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname);
+ }
+}
+
+static void isapc_machine_options(MachineClass *m)
+{
+ static const char * const valid_cpu_types[] = {
+ X86_CPU_TYPE_NAME("486"),
+ X86_CPU_TYPE_NAME("athlon"),
+ X86_CPU_TYPE_NAME("kvm32"),
+ X86_CPU_TYPE_NAME("pentium"),
+ X86_CPU_TYPE_NAME("pentium2"),
+ X86_CPU_TYPE_NAME("pentium3"),
+ X86_CPU_TYPE_NAME("qemu32"),
+ X86_CPU_TYPE_NAME("max"),
+ X86_CPU_TYPE_NAME("host"),
+ NULL
+ };
+ PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
+
+ m->desc = "ISA-only PC";
+ m->max_cpus = 1;
+ m->option_rom_has_mr = true;
+ m->rom_file_has_mr = false;
+ pcmc->pci_enabled = false;
+ pcmc->has_acpi_build = false;
+ pcmc->smbios_defaults = false;
+ pcmc->gigabyte_align = false;
+ pcmc->smbios_legacy_mode = true;
+ pcmc->has_reserved_memory = false;
+ m->default_nic = "ne2k_isa";
+ m->default_cpu_type = X86_CPU_TYPE_NAME("486");
+ m->valid_cpu_types = valid_cpu_types;
+ m->no_floppy = !module_object_class_by_name(TYPE_ISA_FDC);
+ m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
+}
+
+DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa,
+ isapc_machine_options);
diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c
index 39035db..1be9bfe 100644
--- a/hw/i386/kvm/apic.c
+++ b/hw/i386/kvm/apic.c
@@ -17,6 +17,7 @@
#include "system/hw_accel.h"
#include "system/kvm.h"
#include "kvm/kvm_i386.h"
+#include "kvm/tdx.h"
static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic,
int reg_id, uint32_t val)
@@ -141,6 +142,10 @@ static void kvm_apic_put(CPUState *cs, run_on_cpu_data data)
struct kvm_lapic_state kapic;
int ret;
+ if (is_tdx_vm()) {
+ return;
+ }
+
kvm_put_apicbase(s->cpu, s->apicbase);
kvm_put_apic_state(s, &kapic);
diff --git a/hw/i386/kvm/xen-stubs.c b/hw/i386/kvm/xen-stubs.c
index d03131e..ce73119 100644
--- a/hw/i386/kvm/xen-stubs.c
+++ b/hw/i386/kvm/xen-stubs.c
@@ -12,7 +12,6 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
-#include "qapi/qapi-commands-misc-target.h"
#include "xen_evtchn.h"
#include "xen_primary_console.h"
@@ -38,15 +37,3 @@ void xen_primary_console_create(void)
void xen_primary_console_set_be_port(uint16_t port)
{
}
-#ifdef TARGET_I386
-EvtchnInfoList *qmp_xen_event_list(Error **errp)
-{
- error_setg(errp, "Xen event channel emulation not enabled");
- return NULL;
-}
-
-void qmp_xen_event_inject(uint32_t port, Error **errp)
-{
- error_setg(errp, "Xen event channel emulation not enabled");
-}
-#endif
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index b519054..dd566c4 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -19,7 +19,7 @@
#include "monitor/monitor.h"
#include "monitor/hmp.h"
#include "qapi/error.h"
-#include "qapi/qapi-commands-misc-target.h"
+#include "qapi/qapi-commands-misc-i386.h"
#include "qobject/qdict.h"
#include "qom/object.h"
#include "exec/target_page.h"
diff --git a/hw/i386/meson.build b/hw/i386/meson.build
index 10bdfde..436b3ce 100644
--- a/hw/i386/meson.build
+++ b/hw/i386/meson.build
@@ -14,6 +14,7 @@ i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'),
i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c'),
if_false: files('amd_iommu-stub.c'))
i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c'))
+i386_ss.add(when: 'CONFIG_ISAPC', if_true: files('isapc.c'))
i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('x86-common.c', 'microvm.c', 'acpi-microvm.c', 'microvm-dt.c'))
i386_ss.add(when: 'CONFIG_NITRO_ENCLAVE', if_true: files('nitro_enclave.c'))
i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c'))
@@ -32,6 +33,7 @@ i386_ss.add(when: 'CONFIG_PC', if_true: files(
'port92.c'))
i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'),
if_false: files('pc_sysfw_ovmf-stubs.c'))
+i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c', 'tdvf-hob.c'))
subdir('kvm')
subdir('xen')
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index e0daf0d..94d22a2 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -49,6 +49,7 @@
#include "hw/acpi/generic_event_device.h"
#include "hw/pci-host/gpex.h"
#include "hw/usb/xhci.h"
+#include "hw/vfio/types.h"
#include "elf.h"
#include "kvm/kvm_i386.h"
@@ -633,6 +634,8 @@ GlobalProperty microvm_properties[] = {
* so reserving io space is not going to work. Turn it off.
*/
{ "pcie-root-port", "io-reserve", "0" },
+ { TYPE_RAMFB_DEVICE, "use-legacy-x86-rom", "true" },
+ { TYPE_VFIO_PCI_NOHOTPLUG, "use-legacy-x86-rom", "true" },
};
static void microvm_class_init(ObjectClass *oc, const void *data)
diff --git a/hw/i386/monitor.c b/hw/i386/monitor.c
index 1921e4d..79df965 100644
--- a/hw/i386/monitor.c
+++ b/hw/i386/monitor.c
@@ -26,7 +26,7 @@
#include "monitor/monitor.h"
#include "qobject/qdict.h"
#include "qapi/error.h"
-#include "qapi/qapi-commands-misc-target.h"
+#include "qapi/qapi-commands-misc-i386.h"
#include "hw/i386/x86.h"
#include "hw/rtc/mc146818rtc.h"
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 7065615..4d6bcbb 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -44,6 +44,7 @@
#include "system/xen.h"
#include "system/reset.h"
#include "kvm/kvm_i386.h"
+#include "kvm/tdx.h"
#include "hw/xen/xen.h"
#include "qobject/qlist.h"
#include "qemu/error-report.h"
@@ -80,7 +81,15 @@
{ "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\
{ "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },
-GlobalProperty pc_compat_10_0[] = {};
+GlobalProperty pc_compat_10_1[] = {};
+const size_t pc_compat_10_1_len = G_N_ELEMENTS(pc_compat_10_1);
+
+GlobalProperty pc_compat_10_0[] = {
+ { TYPE_X86_CPU, "x-consistent-cache", "false" },
+ { TYPE_X86_CPU, "x-vendor-cpuid-only-v2", "false" },
+ { TYPE_X86_CPU, "x-arch-cap-always-on", "true" },
+ { TYPE_X86_CPU, "x-pdcm-on-even-without-pmu", "true" },
+};
const size_t pc_compat_10_0_len = G_N_ELEMENTS(pc_compat_10_0);
GlobalProperty pc_compat_9_2[] = {};
@@ -259,28 +268,6 @@ GlobalProperty pc_compat_2_6[] = {
};
const size_t pc_compat_2_6_len = G_N_ELEMENTS(pc_compat_2_6);
-GlobalProperty pc_compat_2_5[] = {};
-const size_t pc_compat_2_5_len = G_N_ELEMENTS(pc_compat_2_5);
-
-GlobalProperty pc_compat_2_4[] = {
- PC_CPU_MODEL_IDS("2.4.0")
- { "Haswell-" TYPE_X86_CPU, "abm", "off" },
- { "Haswell-noTSX-" TYPE_X86_CPU, "abm", "off" },
- { "Broadwell-" TYPE_X86_CPU, "abm", "off" },
- { "Broadwell-noTSX-" TYPE_X86_CPU, "abm", "off" },
- { "host" "-" TYPE_X86_CPU, "host-cache-info", "on" },
- { TYPE_X86_CPU, "check", "off" },
- { "qemu64" "-" TYPE_X86_CPU, "sse4a", "on" },
- { "qemu64" "-" TYPE_X86_CPU, "abm", "on" },
- { "qemu64" "-" TYPE_X86_CPU, "popcnt", "on" },
- { "qemu32" "-" TYPE_X86_CPU, "popcnt", "on" },
- { "Opteron_G2" "-" TYPE_X86_CPU, "rdtscp", "on" },
- { "Opteron_G3" "-" TYPE_X86_CPU, "rdtscp", "on" },
- { "Opteron_G4" "-" TYPE_X86_CPU, "rdtscp", "on" },
- { "Opteron_G5" "-" TYPE_X86_CPU, "rdtscp", "on", }
-};
-const size_t pc_compat_2_4_len = G_N_ELEMENTS(pc_compat_2_4);
-
/*
* @PC_FW_DATA:
* Size of the chunk of memory at the top of RAM for the BIOS ACPI tables
@@ -630,7 +617,7 @@ void pc_machine_done(Notifier *notifier, void *data)
&error_fatal);
if (pcms->cxl_devices_state.is_enabled) {
- cxl_fmws_link_targets(&pcms->cxl_devices_state, &error_fatal);
+ cxl_fmws_link_targets(&error_fatal);
}
/* set the number of CPUs */
@@ -739,20 +726,28 @@ static uint64_t pc_get_cxl_range_start(PCMachineState *pcms)
return cxl_base;
}
-static uint64_t pc_get_cxl_range_end(PCMachineState *pcms)
+static int cxl_get_fmw_end(Object *obj, void *opaque)
{
- uint64_t start = pc_get_cxl_range_start(pcms) + MiB;
+ struct CXLFixedWindow *fw;
+ uint64_t *start = opaque;
- if (pcms->cxl_devices_state.fixed_windows) {
- GList *it;
-
- start = ROUND_UP(start, 256 * MiB);
- for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) {
- CXLFixedWindow *fw = it->data;
- start += fw->size;
- }
+ if (!object_dynamic_cast(obj, TYPE_CXL_FMW)) {
+ return 0;
}
+ fw = CXL_FMW(obj);
+ *start += fw->size;
+
+ return 0;
+}
+
+static uint64_t pc_get_cxl_range_end(PCMachineState *pcms)
+{
+ uint64_t start = pc_get_cxl_range_start(pcms) + MiB;
+
+ /* Ordering doesn't matter so no need to build a sorted list */
+ object_child_foreach_recursive(object_get_root(), cxl_get_fmw_end,
+ &start);
return start;
}
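The rewritten pc_get_cxl_range_end() swaps the machine-local fixed-window list for a recursive QOM walk. A minimal sketch of that callback contract — assuming only the documented object_child_foreach_recursive() semantics, where returning 0 continues the traversal — looks like:

    /* Visit every descendant of the QOM root, accumulating into the opaque
     * pointer; a non-zero return would stop the walk early. */
    static int sum_fmw_sizes(Object *child, void *opaque)
    {
        uint64_t *total = opaque;

        if (object_dynamic_cast(child, TYPE_CXL_FMW)) {
            *total += CXL_FMW(child)->size;
        }
        return 0;
    }

    /* usage:
     *   uint64_t total = 0;
     *   object_child_foreach_recursive(object_get_root(), sum_fmw_sizes,
     *                                  &total);
     */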
@@ -844,6 +839,7 @@ void pc_memory_init(PCMachineState *pcms,
hwaddr maxphysaddr, maxusedaddr;
hwaddr cxl_base, cxl_resv_end = 0;
X86CPU *cpu = X86_CPU(first_cpu);
+ uint64_t res_mem_end;
assert(machine->ram_size == x86ms->below_4g_mem_size +
x86ms->above_4g_mem_size);
@@ -954,60 +950,48 @@ void pc_memory_init(PCMachineState *pcms,
cxl_base = pc_get_cxl_range_start(pcms);
memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size);
memory_region_add_subregion(system_memory, cxl_base, mr);
- cxl_resv_end = cxl_base + cxl_size;
- if (pcms->cxl_devices_state.fixed_windows) {
- hwaddr cxl_fmw_base;
- GList *it;
-
- cxl_fmw_base = ROUND_UP(cxl_base + cxl_size, 256 * MiB);
- for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) {
- CXLFixedWindow *fw = it->data;
-
- fw->base = cxl_fmw_base;
- memory_region_init_io(&fw->mr, OBJECT(machine), &cfmws_ops, fw,
- "cxl-fixed-memory-region", fw->size);
- memory_region_add_subregion(system_memory, fw->base, &fw->mr);
- cxl_fmw_base += fw->size;
- cxl_resv_end = cxl_fmw_base;
- }
- }
+ cxl_base = ROUND_UP(cxl_base + cxl_size, 256 * MiB);
+ cxl_resv_end = cxl_fmws_set_memmap(cxl_base, maxphysaddr);
+ cxl_fmws_update_mmio();
}
/* Initialize PC system firmware */
pc_system_firmware_init(pcms, rom_memory);
- option_rom_mr = g_malloc(sizeof(*option_rom_mr));
- if (machine_require_guest_memfd(machine)) {
- memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom",
- PC_ROM_SIZE, &error_fatal);
- } else {
- memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
- &error_fatal);
- if (pcmc->pci_enabled) {
- memory_region_set_readonly(option_rom_mr, true);
+ if (!is_tdx_vm()) {
+ option_rom_mr = g_malloc(sizeof(*option_rom_mr));
+ if (machine_require_guest_memfd(machine)) {
+ memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom",
+ PC_ROM_SIZE, &error_fatal);
+ } else {
+ memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
+ &error_fatal);
+ if (pcmc->pci_enabled) {
+ memory_region_set_readonly(option_rom_mr, true);
+ }
}
+ memory_region_add_subregion_overlap(rom_memory,
+ PC_ROM_MIN_VGA,
+ option_rom_mr,
+ 1);
}
- memory_region_add_subregion_overlap(rom_memory,
- PC_ROM_MIN_VGA,
- option_rom_mr,
- 1);
fw_cfg = fw_cfg_arch_create(machine,
x86ms->boot_cpus, x86ms->apic_id_limit);
rom_set_fw(fw_cfg);
- if (machine->device_memory) {
- uint64_t *val = g_malloc(sizeof(*val));
- uint64_t res_mem_end = machine->device_memory->base;
-
- if (!pcmc->broken_reserved_end) {
- res_mem_end += memory_region_size(&machine->device_memory->mr);
- }
+ if (pcms->cxl_devices_state.is_enabled) {
+ res_mem_end = cxl_resv_end;
+ } else if (machine->device_memory) {
+ res_mem_end = machine->device_memory->base
+ + memory_region_size(&machine->device_memory->mr);
+ } else {
+ res_mem_end = 0;
+ }
- if (pcms->cxl_devices_state.is_enabled) {
- res_mem_end = cxl_resv_end;
- }
+ if (res_mem_end) {
+ uint64_t *val = g_malloc(sizeof(*val));
*val = cpu_to_le64(ROUND_UP(res_mem_end, 1 * GiB));
fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
}
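A note on the fw_cfg usage above: file payloads are exposed to guest firmware as raw bytes, so multi-byte integers must be stored in a fixed byte order before insertion. A minimal sketch of the same pattern (the file name here is hypothetical):

    /* Publish a little-endian 64-bit value as a fw_cfg file; the guest
     * reads the bytes verbatim, hence the explicit cpu_to_le64(). */
    static void example_add_u64_file(FWCfgState *fw_cfg, uint64_t value)
    {
        uint64_t *val = g_malloc(sizeof(*val));

        *val = cpu_to_le64(value);
        fw_cfg_add_file(fw_cfg, "etc/example-u64", val, sizeof(*val));
    }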
@@ -1044,9 +1028,7 @@ uint64_t pc_pci_hole64_start(void)
hole64_start = pc_get_cxl_range_end(pcms);
} else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) {
pc_get_device_memory_range(pcms, &hole64_start, &size);
- if (!pcmc->broken_reserved_end) {
- hole64_start += size;
- }
+ hole64_start += size;
} else {
hole64_start = pc_above_4g_end(pcms);
}
@@ -1058,7 +1040,6 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
{
DeviceState *dev = NULL;
- rom_set_order_override(FW_CFG_ORDER_OVERRIDE_VGA);
if (pci_bus) {
PCIDevice *pcidev = pci_vga_init(pci_bus);
dev = pcidev ? &pcidev->qdev : NULL;
@@ -1066,7 +1047,7 @@ DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus)
ISADevice *isadev = isa_vga_init(isa_bus);
dev = isadev ? DEVICE(isadev) : NULL;
}
- rom_reset_order_override();
+
return dev;
}
@@ -1256,8 +1237,6 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus)
bool default_is_ne2k = g_str_equal(mc->default_nic, TYPE_ISA_NE2000);
NICInfo *nd;
- rom_set_order_override(FW_CFG_ORDER_OVERRIDE_NIC);
-
while ((nd = qemu_find_nic_info(TYPE_ISA_NE2000, default_is_ne2k, NULL))) {
pc_init_ne2k_isa(isa_bus, nd, &error_fatal);
}
@@ -1266,8 +1245,6 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus)
if (pci_bus) {
pci_init_nic_devices(pci_bus, mc->default_nic);
}
-
- rom_reset_order_override();
}
void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs)
@@ -1747,25 +1724,6 @@ static void pc_machine_wakeup(MachineState *machine)
cpu_synchronize_all_post_reset();
}
-static bool pc_hotplug_allowed(MachineState *ms, DeviceState *dev, Error **errp)
-{
- X86IOMMUState *iommu = x86_iommu_get_default();
- IntelIOMMUState *intel_iommu;
-
- if (iommu &&
- object_dynamic_cast((Object *)iommu, TYPE_INTEL_IOMMU_DEVICE) &&
- object_dynamic_cast((Object *)dev, "vfio-pci")) {
- intel_iommu = INTEL_IOMMU_DEVICE(iommu);
- if (!intel_iommu->caching_mode) {
- error_setg(errp, "Device assignment is not allowed without "
- "enabling caching-mode=on for Intel IOMMU.");
- return false;
- }
- }
-
- return true;
-}
-
static void pc_machine_class_init(ObjectClass *oc, const void *data)
{
MachineClass *mc = MACHINE_CLASS(oc);
@@ -1785,7 +1743,6 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data)
x86mc->apic_xrupt_override = true;
assert(!mc->get_hotplug_handler);
mc->get_hotplug_handler = pc_get_hotplug_handler;
- mc->hotplug_allowed = pc_hotplug_allowed;
mc->auto_enable_numa_with_memhp = true;
mc->auto_enable_numa_with_memdev = true;
mc->has_hotpluggable_cpus = true;
@@ -1860,6 +1817,18 @@ static void pc_machine_class_init(ObjectClass *oc, const void *data)
object_class_property_add_bool(oc, "fd-bootchk",
pc_machine_get_fd_bootchk,
pc_machine_set_fd_bootchk);
+
+#if defined(CONFIG_IGVM)
+ object_class_property_add_link(oc, "igvm-cfg",
+ TYPE_IGVM_CFG,
+ offsetof(X86MachineState, igvm),
+ object_property_allow_set_link,
+ OBJ_PROP_LINK_STRONG);
+ object_class_property_set_description(oc, "igvm-cfg",
+ "Set IGVM configuration");
+#endif
+
}
static const TypeInfo pc_machine_info = {
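For orientation, "igvm-cfg" is a QOM link property: the class reserves a pointer slot at a field offset, and machine code later dereferences whatever object the user linked in (plausibly along the lines of -object igvm-cfg,id=igvm0,file=guest.igvm -machine q35,igvm-cfg=igvm0; the exact syntax depends on the IGVM object's definition). A generic sketch of the pattern, with the state type name illustrative:

    /* Link-property sketch: expose a writable object pointer stored
     * directly in the instance struct; OBJ_PROP_LINK_STRONG makes the
     * property hold a reference on the linked object. */
    object_class_property_add_link(oc, "example-cfg", TYPE_OBJECT,
                                   offsetof(ExampleState, cfg),
                                   object_property_allow_set_link,
                                   OBJ_PROP_LINK_STRONG);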
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 0dce512..7b3611e 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -27,20 +27,16 @@
#include "qemu/units.h"
#include "hw/char/parallel-isa.h"
-#include "hw/dma/i8257.h"
-#include "hw/loader.h"
#include "hw/i386/x86.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic.h"
#include "hw/pci-host/i440fx.h"
-#include "hw/rtc/mc146818rtc.h"
#include "hw/southbridge/piix.h"
#include "hw/display/ramfb.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_ids.h"
#include "hw/usb.h"
#include "net/net.h"
-#include "hw/ide/isa.h"
#include "hw/ide/pci.h"
#include "hw/irq.h"
#include "system/kvm.h"
@@ -49,6 +45,7 @@
#include "hw/i2c/smbus_eeprom.h"
#include "system/memory.h"
#include "hw/acpi/acpi.h"
+#include "hw/vfio/types.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "system/xen.h"
@@ -71,11 +68,12 @@
#define XEN_IOAPIC_NUM_PIRQS 128ULL
-#ifdef CONFIG_IDE_ISA
-static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 };
-static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 };
-static const int ide_irq[MAX_IDE_BUS] = { 14, 15 };
-#endif
+static GlobalProperty pc_piix_compat_defaults[] = {
+ { TYPE_RAMFB_DEVICE, "use-legacy-x86-rom", "true" },
+ { TYPE_VFIO_PCI_NOHOTPLUG, "use-legacy-x86-rom", "true" },
+};
+static const size_t pc_piix_compat_defaults_len =
+ G_N_ELEMENTS(pc_piix_compat_defaults);
/*
* Return the global irq number corresponding to a given device irq
@@ -108,16 +106,20 @@ static void pc_init1(MachineState *machine, const char *pci_type)
X86MachineState *x86ms = X86_MACHINE(machine);
MemoryRegion *system_memory = get_system_memory();
MemoryRegion *system_io = get_system_io();
- Object *phb = NULL;
+ Object *phb;
ISABus *isa_bus;
Object *piix4_pm = NULL;
qemu_irq smi_irq;
GSIState *gsi_state;
MemoryRegion *ram_memory;
MemoryRegion *pci_memory = NULL;
- MemoryRegion *rom_memory = system_memory;
ram_addr_t lowmem;
uint64_t hole64_size = 0;
+ PCIDevice *pci_dev;
+ DeviceState *dev;
+ size_t i;
+
+ assert(pcmc->pci_enabled);
/*
* Calculate ram split, for memory below and above 4G. It's a bit
@@ -188,42 +190,39 @@ static void pc_init1(MachineState *machine, const char *pci_type)
kvmclock_create(pcmc->kvmclock_create_always);
}
- if (pcmc->pci_enabled) {
- pci_memory = g_new(MemoryRegion, 1);
- memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
- rom_memory = pci_memory;
-
- phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE));
- object_property_add_child(OBJECT(machine), "i440fx", phb);
- object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
- OBJECT(ram_memory), &error_fatal);
- object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
- OBJECT(pci_memory), &error_fatal);
- object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
- OBJECT(system_memory), &error_fatal);
- object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
- OBJECT(system_io), &error_fatal);
- object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
- x86ms->below_4g_mem_size, &error_fatal);
- object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
- x86ms->above_4g_mem_size, &error_fatal);
- object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type,
- &error_fatal);
- sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
-
- pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0"));
- pci_bus_map_irqs(pcms->pcibus,
- xen_enabled() ? xen_pci_slot_get_pirq
- : pc_pci_slot_get_pirq);
-
- hole64_size = object_property_get_uint(phb,
- PCI_HOST_PROP_PCI_HOLE64_SIZE,
- &error_abort);
- }
+ pci_memory = g_new(MemoryRegion, 1);
+ memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
+
+ phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE));
+ object_property_add_child(OBJECT(machine), "i440fx", phb);
+ object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
+ OBJECT(ram_memory), &error_fatal);
+ object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
+ OBJECT(pci_memory), &error_fatal);
+ object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
+ OBJECT(system_memory), &error_fatal);
+ object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
+ OBJECT(system_io), &error_fatal);
+ object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
+ x86ms->below_4g_mem_size, &error_fatal);
+ object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
+ x86ms->above_4g_mem_size, &error_fatal);
+ object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type,
+ &error_fatal);
+ sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
+
+ pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0"));
+ pci_bus_map_irqs(pcms->pcibus,
+ xen_enabled() ? xen_pci_slot_get_pirq
+ : pc_pci_slot_get_pirq);
+
+ hole64_size = object_property_get_uint(phb,
+ PCI_HOST_PROP_PCI_HOLE64_SIZE,
+ &error_abort);
/* allocate ram and load rom/bios */
if (!xen_enabled()) {
- pc_memory_init(pcms, system_memory, rom_memory, hole64_size);
+ pc_memory_init(pcms, system_memory, pci_memory, hole64_size);
} else {
assert(machine->ram_size == x86ms->below_4g_mem_size +
x86ms->above_4g_mem_size);
@@ -235,81 +234,63 @@ static void pc_init1(MachineState *machine, const char *pci_type)
}
}
- gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled);
-
- if (pcmc->pci_enabled) {
- PCIDevice *pci_dev;
- DeviceState *dev;
- size_t i;
-
- pci_dev = pci_new_multifunction(-1, pcms->south_bridge);
- object_property_set_bool(OBJECT(pci_dev), "has-usb",
- machine_usb(machine), &error_abort);
- object_property_set_bool(OBJECT(pci_dev), "has-acpi",
- x86_machine_is_acpi_enabled(x86ms),
- &error_abort);
- object_property_set_bool(OBJECT(pci_dev), "has-pic", false,
- &error_abort);
- object_property_set_bool(OBJECT(pci_dev), "has-pit", false,
- &error_abort);
- qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100);
- object_property_set_bool(OBJECT(pci_dev), "smm-enabled",
- x86_machine_is_smm_enabled(x86ms),
- &error_abort);
- dev = DEVICE(pci_dev);
- for (i = 0; i < ISA_NUM_IRQS; i++) {
- qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]);
- }
- pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal);
-
- if (xen_enabled()) {
- pci_device_set_intx_routing_notifier(
- pci_dev, piix_intx_routing_notifier_xen);
-
- /*
- * Xen supports additional interrupt routes from the PCI devices to
- * the IOAPIC: the four pins of each PCI device on the bus are also
- * connected to the IOAPIC directly.
- * These additional routes can be discovered through ACPI.
- */
- pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev,
- XEN_IOAPIC_NUM_PIRQS);
- }
+ gsi_state = pc_gsi_create(&x86ms->gsi, true);
+
+ pci_dev = pci_new_multifunction(-1, pcms->south_bridge);
+ object_property_set_bool(OBJECT(pci_dev), "has-usb",
+ machine_usb(machine), &error_abort);
+ object_property_set_bool(OBJECT(pci_dev), "has-acpi",
+ x86_machine_is_acpi_enabled(x86ms),
+ &error_abort);
+ object_property_set_bool(OBJECT(pci_dev), "has-pic", false,
+ &error_abort);
+ object_property_set_bool(OBJECT(pci_dev), "has-pit", false,
+ &error_abort);
+ qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100);
+ object_property_set_bool(OBJECT(pci_dev), "smm-enabled",
+ x86_machine_is_smm_enabled(x86ms),
+ &error_abort);
+ dev = DEVICE(pci_dev);
+ for (i = 0; i < ISA_NUM_IRQS; i++) {
+ qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]);
+ }
+ pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal);
- isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0"));
- x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev),
- "rtc"));
- piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm");
- dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide"));
- pci_ide_create_devs(PCI_DEVICE(dev));
- pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0");
- pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1");
- } else {
- isa_bus = isa_bus_new(NULL, system_memory, system_io,
- &error_abort);
- isa_bus_register_input_irqs(isa_bus, x86ms->gsi);
+ if (xen_enabled()) {
+ pci_device_set_intx_routing_notifier(
+ pci_dev, piix_intx_routing_notifier_xen);
+
+ /*
+ * Xen supports additional interrupt routes from the PCI devices to
+ * the IOAPIC: the four pins of each PCI device on the bus are also
+ * connected to the IOAPIC directly.
+ * These additional routes can be discovered through ACPI.
+ */
+ pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev,
+ XEN_IOAPIC_NUM_PIRQS);
+ }
- x86ms->rtc = isa_new(TYPE_MC146818_RTC);
- qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000);
- isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal);
+ isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0"));
+ x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev),
+ "rtc"));
+ piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm");
+ dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide"));
+ pci_ide_create_devs(PCI_DEVICE(dev));
+ pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0");
+ pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1");
- i8257_dma_init(OBJECT(machine), isa_bus, 0);
- pcms->hpet_enabled = false;
- }
if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
pc_i8259_create(isa_bus, gsi_state->i8259_irq);
}
- if (phb) {
- ioapic_init_gsi(gsi_state, phb);
- }
+ ioapic_init_gsi(gsi_state, phb);
if (tcg_enabled()) {
x86_register_ferr_irq(x86ms->gsi[13]);
}
- pc_vga_init(isa_bus, pcmc->pci_enabled ? pcms->pcibus : NULL);
+ pc_vga_init(isa_bus, pcms->pcibus);
/* init basic PC hardware */
pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc,
@@ -317,28 +298,6 @@ static void pc_init1(MachineState *machine, const char *pci_type)
pc_nic_init(pcmc, isa_bus, pcms->pcibus);
-#ifdef CONFIG_IDE_ISA
- if (!pcmc->pci_enabled) {
- DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS];
- int i;
-
- ide_drive_get(hd, ARRAY_SIZE(hd));
- for (i = 0; i < MAX_IDE_BUS; i++) {
- ISADevice *dev;
- char busname[] = "ide.0";
- dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i],
- ide_irq[i],
- hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]);
- /*
- * The ide bus name is ide.0 for the first bus and ide.1 for the
- * second one.
- */
- busname[4] = '0' + i;
- pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname);
- }
- }
-#endif
-
if (piix4_pm) {
smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0);
@@ -361,6 +320,16 @@ static void pc_init1(MachineState *machine, const char *pci_type)
x86_nvdimm_acpi_dsmio,
x86ms->fw_cfg, OBJECT(pcms));
}
+
+#if defined(CONFIG_IGVM)
+ /* Apply guest state from IGVM if supplied */
+ if (x86ms->igvm) {
+ if (IGVM_CFG_GET_CLASS(x86ms->igvm)
+ ->process(x86ms->igvm, machine->cgs, false, &error_fatal) < 0) {
+ g_assert_not_reached();
+ }
+ }
+#endif
}
typedef enum PCSouthBridgeOption {
@@ -410,22 +379,7 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp)
pcms->south_bridge = PCSouthBridgeOption_lookup.array[value];
}
-#ifdef CONFIG_ISAPC
-static void pc_init_isa(MachineState *machine)
-{
- pc_init1(machine, NULL);
-}
-#endif
-
#ifdef CONFIG_XEN
-static void pc_xen_hvm_init_pci(MachineState *machine)
-{
- const char *pci_type = xen_igd_gfx_pt_enabled() ?
- TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE : TYPE_I440FX_PCI_DEVICE;
-
- pc_init1(machine, pci_type);
-}
-
static void pc_xen_hvm_init(MachineState *machine)
{
PCMachineState *pcms = PC_MACHINE(machine);
@@ -435,7 +389,10 @@ static void pc_xen_hvm_init(MachineState *machine)
exit(1);
}
- pc_xen_hvm_init_pci(machine);
+ pc_init1(machine, xen_igd_gfx_pt_enabled()
+ ? TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE
+ : TYPE_I440FX_PCI_DEVICE);
+
xen_igd_reserve_slot(pcms->pcibus);
pci_create_simple(pcms->pcibus, -1, "xen-platform");
}
@@ -477,14 +434,26 @@ static void pc_i440fx_machine_options(MachineClass *m)
pc_set_south_bridge);
object_class_property_set_description(oc, "x-south-bridge",
"Use a different south bridge than PIIX3");
+ compat_props_add(m->compat_props,
+ pc_piix_compat_defaults, pc_piix_compat_defaults_len);
}
-static void pc_i440fx_machine_10_1_options(MachineClass *m)
+static void pc_i440fx_machine_10_2_options(MachineClass *m)
{
pc_i440fx_machine_options(m);
}
-DEFINE_I440FX_MACHINE_AS_LATEST(10, 1);
+DEFINE_I440FX_MACHINE_AS_LATEST(10, 2);
+
+static void pc_i440fx_machine_10_1_options(MachineClass *m)
+{
+ pc_i440fx_machine_10_2_options(m);
+ m->smbios_memory_device_size = 2047 * TiB;
+ compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len);
+ compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len);
+}
+
+DEFINE_I440FX_MACHINE(10, 1);
static void pc_i440fx_machine_10_0_options(MachineClass *m)
{
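The version bump above follows QEMU's usual chaining scheme: pc_i440fx_machine_10_2_options() becomes the base, and each older version calls the next-newer options function before stacking its own compat properties, so defaults accumulate as machine types age. Schematically, with a hypothetical version number and compat arrays purely for illustration:

    /* Illustrative only: an older machine chains upward first, inheriting
     * all newer defaults, then freezes its own era's behaviour on top. */
    static void pc_i440fx_machine_9_9_options(MachineClass *m)
    {
        pc_i440fx_machine_10_0_options(m);   /* inherit newer defaults */
        compat_props_add(m->compat_props, hw_compat_9_9, hw_compat_9_9_len);
        compat_props_add(m->compat_props, pc_compat_9_9, pc_compat_9_9_len);
    }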
@@ -778,56 +747,6 @@ static void pc_i440fx_machine_2_6_options(MachineClass *m)
DEFINE_I440FX_MACHINE(2, 6);
-static void pc_i440fx_machine_2_5_options(MachineClass *m)
-{
- X86MachineClass *x86mc = X86_MACHINE_CLASS(m);
-
- pc_i440fx_machine_2_6_options(m);
- x86mc->save_tsc_khz = false;
- m->legacy_fw_cfg_order = 1;
- compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len);
- compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len);
-}
-
-DEFINE_I440FX_MACHINE(2, 5);
-
-static void pc_i440fx_machine_2_4_options(MachineClass *m)
-{
- PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
-
- pc_i440fx_machine_2_5_options(m);
- m->hw_version = "2.4.0";
- pcmc->broken_reserved_end = true;
- compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len);
- compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len);
-}
-
-DEFINE_I440FX_MACHINE(2, 4);
-
-#ifdef CONFIG_ISAPC
-static void isapc_machine_options(MachineClass *m)
-{
- PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
- m->desc = "ISA-only PC";
- m->max_cpus = 1;
- m->option_rom_has_mr = true;
- m->rom_file_has_mr = false;
- pcmc->pci_enabled = false;
- pcmc->has_acpi_build = false;
- pcmc->smbios_defaults = false;
- pcmc->gigabyte_align = false;
- pcmc->smbios_legacy_mode = true;
- pcmc->has_reserved_memory = false;
- m->default_nic = "ne2k_isa";
- m->default_cpu_type = X86_CPU_TYPE_NAME("486");
- m->no_floppy = !module_object_class_by_name(TYPE_ISA_FDC);
- m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
-}
-
-DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa,
- isapc_machine_options);
-#endif
-
#ifdef CONFIG_XEN
static void xenfv_machine_4_2_options(MachineClass *m)
{
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index c538b3d..6015e63 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -45,6 +45,7 @@
#include "hw/i386/pc.h"
#include "hw/i386/amd_iommu.h"
#include "hw/i386/intel_iommu.h"
+#include "hw/vfio/types.h"
#include "hw/virtio/virtio-iommu.h"
#include "hw/display/ramfb.h"
#include "hw/ide/pci.h"
@@ -67,6 +68,8 @@
static GlobalProperty pc_q35_compat_defaults[] = {
{ TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "39" },
+ { TYPE_RAMFB_DEVICE, "use-legacy-x86-rom", "true" },
+ { TYPE_VFIO_PCI_NOHOTPLUG, "use-legacy-x86-rom", "true" },
};
static const size_t pc_q35_compat_defaults_len =
G_N_ELEMENTS(pc_q35_compat_defaults);
@@ -325,6 +328,16 @@ static void pc_q35_init(MachineState *machine)
x86_nvdimm_acpi_dsmio,
x86ms->fw_cfg, OBJECT(pcms));
}
+
+#if defined(CONFIG_IGVM)
+ /* Apply guest state from IGVM if supplied */
+ if (x86ms->igvm) {
+ if (IGVM_CFG_GET_CLASS(x86ms->igvm)
+ ->process(x86ms->igvm, machine->cgs, false, &error_fatal) < 0) {
+ g_assert_not_reached();
+ }
+ }
+#endif
}
#define DEFINE_Q35_MACHINE(major, minor) \
@@ -361,12 +374,22 @@ static void pc_q35_machine_options(MachineClass *m)
pc_q35_compat_defaults, pc_q35_compat_defaults_len);
}
-static void pc_q35_machine_10_1_options(MachineClass *m)
+static void pc_q35_machine_10_2_options(MachineClass *m)
{
pc_q35_machine_options(m);
}
-DEFINE_Q35_MACHINE_AS_LATEST(10, 1);
+DEFINE_Q35_MACHINE_AS_LATEST(10, 2);
+
+static void pc_q35_machine_10_1_options(MachineClass *m)
+{
+ pc_q35_machine_10_2_options(m);
+ m->smbios_memory_device_size = 2047 * TiB;
+ compat_props_add(m->compat_props, hw_compat_10_1, hw_compat_10_1_len);
+ compat_props_add(m->compat_props, pc_compat_10_1, pc_compat_10_1_len);
+}
+
+DEFINE_Q35_MACHINE(10, 1);
static void pc_q35_machine_10_0_options(MachineClass *m)
{
@@ -672,29 +695,3 @@ static void pc_q35_machine_2_6_options(MachineClass *m)
}
DEFINE_Q35_MACHINE(2, 6);
-
-static void pc_q35_machine_2_5_options(MachineClass *m)
-{
- X86MachineClass *x86mc = X86_MACHINE_CLASS(m);
-
- pc_q35_machine_2_6_options(m);
- x86mc->save_tsc_khz = false;
- m->legacy_fw_cfg_order = 1;
- compat_props_add(m->compat_props, hw_compat_2_5, hw_compat_2_5_len);
- compat_props_add(m->compat_props, pc_compat_2_5, pc_compat_2_5_len);
-}
-
-DEFINE_Q35_MACHINE(2, 5);
-
-static void pc_q35_machine_2_4_options(MachineClass *m)
-{
- PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
-
- pc_q35_machine_2_5_options(m);
- m->hw_version = "2.4.0";
- pcmc->broken_reserved_end = true;
- compat_props_add(m->compat_props, hw_compat_2_4, hw_compat_2_4_len);
- compat_props_add(m->compat_props, pc_compat_2_4, pc_compat_2_4_len);
-}
-
-DEFINE_Q35_MACHINE(2, 4);
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 1eeb58a..1a12b63 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -37,6 +37,7 @@
#include "hw/block/flash.h"
#include "system/kvm.h"
#include "target/i386/sev.h"
+#include "kvm/tdx.h"
#define FLASH_SECTOR_SIZE 4096
@@ -219,7 +220,13 @@ void pc_system_firmware_init(PCMachineState *pcms,
BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)];
if (!pcmc->pci_enabled) {
- x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, true);
+ /*
+ * If an IGVM file is specified then the firmware must be provided
+ * in the IGVM file.
+ */
+ if (!X86_MACHINE(pcms)->igvm) {
+ x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, true);
+ }
return;
}
@@ -239,8 +246,13 @@ void pc_system_firmware_init(PCMachineState *pcms,
}
if (!pflash_blk[0]) {
- /* Machine property pflash0 not set, use ROM mode */
- x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, false);
+ /*
+ * Machine property pflash0 not set, use ROM mode unless using IGVM,
+ * in which case the firmware must be provided by the IGVM file.
+ */
+ if (!X86_MACHINE(pcms)->igvm) {
+ x86_bios_rom_init(X86_MACHINE(pcms), "bios.bin", rom_memory, false);
+ }
} else {
if (kvm_enabled() && !kvm_readonly_mem_enabled()) {
/*
@@ -256,6 +268,20 @@ void pc_system_firmware_init(PCMachineState *pcms,
}
pc_system_flash_cleanup_unused(pcms);
+
+ /*
+ * The user should not have specified any pflash devices when using IGVM
+ * to configure the guest.
+ */
+ if (X86_MACHINE(pcms)->igvm) {
+ for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
+ if (pcms->flash[i]) {
+ error_report("pflash devices cannot be configured when "
+ "using IGVM");
+ exit(1);
+ }
+ }
+ }
}
void x86_firmware_configure(hwaddr gpa, void *ptr, int size)
@@ -280,5 +306,11 @@ void x86_firmware_configure(hwaddr gpa, void *ptr, int size)
}
sev_encrypt_flash(gpa, ptr, size, &error_fatal);
+ } else if (is_tdx_vm()) {
+ ret = tdx_parse_tdvf(ptr, size);
+ if (ret) {
+ error_report("failed to parse TDVF for TDX VM");
+ exit(1);
+ }
}
}
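Relatedly, tdx_parse_tdvf() (implemented under target/i386/kvm/ in this series) presumably forwards to the TDVF metadata parser added below in hw/i386/tdvf.c. A hedged sketch of that glue, with the guest-state accessor name assumed:

    /* Assumed glue: hand the flash image to the TDVF metadata parser and
     * stash the parsed section table in the TDX guest state. */
    int tdx_parse_tdvf(void *flash_ptr, int size)
    {
        return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size);
    }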
diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c
index 38ff75e..d295e54 100644
--- a/hw/i386/sgx-stub.c
+++ b/hw/i386/sgx-stub.c
@@ -3,20 +3,20 @@
#include "monitor/hmp-target.h"
#include "hw/i386/pc.h"
#include "hw/i386/sgx-epc.h"
+#include "qapi/qapi-commands-misc-i386.h"
#include "qapi/error.h"
-#include "qapi/qapi-commands-misc-target.h"
void sgx_epc_build_srat(GArray *table_data)
{
}
-SGXInfo *qmp_query_sgx(Error **errp)
+SgxInfo *qmp_query_sgx(Error **errp)
{
error_setg(errp, "SGX support is not compiled in");
return NULL;
}
-SGXInfo *qmp_query_sgx_capabilities(Error **errp)
+SgxInfo *qmp_query_sgx_capabilities(Error **errp)
{
error_setg(errp, "SGX support is not compiled in");
return NULL;
diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c
index 5685c4f..e280154 100644
--- a/hw/i386/sgx.c
+++ b/hw/i386/sgx.c
@@ -19,7 +19,7 @@
#include "monitor/hmp-target.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
-#include "qapi/qapi-commands-misc-target.h"
+#include "qapi/qapi-commands-misc-i386.h"
#include "system/address-spaces.h"
#include "system/hw_accel.h"
#include "system/reset.h"
@@ -84,10 +84,10 @@ static uint64_t sgx_calc_section_metric(uint64_t low, uint64_t high)
((high & MAKE_64BIT_MASK(0, 20)) << 32);
}
-static SGXEPCSectionList *sgx_calc_host_epc_sections(void)
+static SgxEpcSectionList *sgx_calc_host_epc_sections(void)
{
- SGXEPCSectionList *head = NULL, **tail = &head;
- SGXEPCSection *section;
+ SgxEpcSectionList *head = NULL, **tail = &head;
+ SgxEpcSection *section;
uint32_t i, type;
uint32_t eax, ebx, ecx, edx;
uint32_t j = 0;
@@ -104,7 +104,7 @@ static SGXEPCSectionList *sgx_calc_host_epc_sections(void)
break;
}
- section = g_new0(SGXEPCSection, 1);
+ section = g_new0(SgxEpcSection, 1);
section->node = j++;
section->size = sgx_calc_section_metric(ecx, edx);
QAPI_LIST_APPEND(tail, section);
@@ -153,9 +153,9 @@ static void sgx_epc_reset(void *opaque)
}
}
-SGXInfo *qmp_query_sgx_capabilities(Error **errp)
+SgxInfo *qmp_query_sgx_capabilities(Error **errp)
{
- SGXInfo *info = NULL;
+ SgxInfo *info = NULL;
uint32_t eax, ebx, ecx, edx;
Error *local_err = NULL;
@@ -166,7 +166,7 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp)
return NULL;
}
- info = g_new0(SGXInfo, 1);
+ info = g_new0(SgxInfo, 1);
host_cpuid(0x7, 0, &eax, &ebx, &ecx, &edx);
info->sgx = ebx & (1U << 2) ? true : false;
@@ -183,17 +183,17 @@ SGXInfo *qmp_query_sgx_capabilities(Error **errp)
return info;
}
-static SGXEPCSectionList *sgx_get_epc_sections_list(void)
+static SgxEpcSectionList *sgx_get_epc_sections_list(void)
{
GSList *device_list = sgx_epc_get_device_list();
- SGXEPCSectionList *head = NULL, **tail = &head;
- SGXEPCSection *section;
+ SgxEpcSectionList *head = NULL, **tail = &head;
+ SgxEpcSection *section;
for (; device_list; device_list = device_list->next) {
DeviceState *dev = device_list->data;
Object *obj = OBJECT(dev);
- section = g_new0(SGXEPCSection, 1);
+ section = g_new0(SgxEpcSection, 1);
section->node = object_property_get_uint(obj, SGX_EPC_NUMA_NODE_PROP,
&error_abort);
section->size = object_property_get_uint(obj, SGX_EPC_SIZE_PROP,
@@ -205,9 +205,9 @@ static SGXEPCSectionList *sgx_get_epc_sections_list(void)
return head;
}
-SGXInfo *qmp_query_sgx(Error **errp)
+SgxInfo *qmp_query_sgx(Error **errp)
{
- SGXInfo *info = NULL;
+ SgxInfo *info = NULL;
X86MachineState *x86ms;
PCMachineState *pcms =
(PCMachineState *)object_dynamic_cast(qdev_get_machine(),
@@ -223,7 +223,7 @@ SGXInfo *qmp_query_sgx(Error **errp)
return NULL;
}
- info = g_new0(SGXInfo, 1);
+ info = g_new0(SgxInfo, 1);
info->sgx = true;
info->sgx1 = true;
@@ -237,8 +237,8 @@ SGXInfo *qmp_query_sgx(Error **errp)
void hmp_info_sgx(Monitor *mon, const QDict *qdict)
{
Error *err = NULL;
- SGXEPCSectionList *section_list, *section;
- g_autoptr(SGXInfo) info = qmp_query_sgx(&err);
+ SgxEpcSectionList *section_list, *section;
+ g_autoptr(SgxInfo) info = qmp_query_sgx(&err);
uint64_t size = 0;
if (err) {
diff --git a/hw/i386/tdvf-hob.c b/hw/i386/tdvf-hob.c
new file mode 100644
index 0000000..782b3d1
--- /dev/null
+++ b/hw/i386/tdvf-hob.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2025 Intel Corporation
+ * Author: Isaku Yamahata <isaku.yamahata at gmail.com>
+ * <isaku.yamahata at intel.com>
+ * Xiaoyao Li <xiaoyao.li@intel.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "standard-headers/uefi/uefi.h"
+#include "hw/pci/pcie_host.h"
+#include "tdvf-hob.h"
+
+typedef struct TdvfHob {
+ hwaddr hob_addr;
+ void *ptr;
+ int size;
+
+ /* working area */
+ void *current;
+ void *end;
+} TdvfHob;
+
+static uint64_t tdvf_current_guest_addr(const TdvfHob *hob)
+{
+ return hob->hob_addr + (hob->current - hob->ptr);
+}
+
+static void tdvf_align(TdvfHob *hob, size_t align)
+{
+ hob->current = QEMU_ALIGN_PTR_UP(hob->current, align);
+}
+
+static void *tdvf_get_area(TdvfHob *hob, uint64_t size)
+{
+ void *ret;
+
+ if (hob->current + size > hob->end) {
+ error_report("TD_HOB overrun, size = 0x%" PRIx64, size);
+ exit(1);
+ }
+
+ ret = hob->current;
+ hob->current += size;
+ tdvf_align(hob, 8);
+ return ret;
+}
+
+static void tdvf_hob_add_memory_resources(TdxGuest *tdx, TdvfHob *hob)
+{
+ EFI_HOB_RESOURCE_DESCRIPTOR *region;
+ EFI_RESOURCE_ATTRIBUTE_TYPE attr;
+ EFI_RESOURCE_TYPE resource_type;
+
+ TdxRamEntry *e;
+ int i;
+
+ for (i = 0; i < tdx->nr_ram_entries; i++) {
+ e = &tdx->ram_entries[i];
+
+ if (e->type == TDX_RAM_UNACCEPTED) {
+ resource_type = EFI_RESOURCE_MEMORY_UNACCEPTED;
+ attr = EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED;
+ } else if (e->type == TDX_RAM_ADDED) {
+ resource_type = EFI_RESOURCE_SYSTEM_MEMORY;
+ attr = EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE;
+ } else {
+ error_report("unknown TDX_RAM_ENTRY type %d", e->type);
+ exit(1);
+ }
+
+ region = tdvf_get_area(hob, sizeof(*region));
+ *region = (EFI_HOB_RESOURCE_DESCRIPTOR) {
+ .Header = {
+ .HobType = EFI_HOB_TYPE_RESOURCE_DESCRIPTOR,
+ .HobLength = cpu_to_le16(sizeof(*region)),
+ .Reserved = cpu_to_le32(0),
+ },
+ .Owner = EFI_HOB_OWNER_ZERO,
+ .ResourceType = cpu_to_le32(resource_type),
+ .ResourceAttribute = cpu_to_le32(attr),
+ .PhysicalStart = cpu_to_le64(e->address),
+ .ResourceLength = cpu_to_le64(e->length),
+ };
+ }
+}
+
+void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob)
+{
+ TdvfHob hob = {
+ .hob_addr = td_hob->address,
+ .size = td_hob->size,
+ .ptr = td_hob->mem_ptr,
+
+ .current = td_hob->mem_ptr,
+ .end = td_hob->mem_ptr + td_hob->size,
+ };
+
+ EFI_HOB_GENERIC_HEADER *last_hob;
+ EFI_HOB_HANDOFF_INFO_TABLE *hit;
+
+ /* Note, Efi{Free}Memory{Bottom,Top} are ignored, leave 'em zeroed. */
+ hit = tdvf_get_area(&hob, sizeof(*hit));
+ *hit = (EFI_HOB_HANDOFF_INFO_TABLE) {
+ .Header = {
+ .HobType = EFI_HOB_TYPE_HANDOFF,
+ .HobLength = cpu_to_le16(sizeof(*hit)),
+ .Reserved = cpu_to_le32(0),
+ },
+ .Version = cpu_to_le32(EFI_HOB_HANDOFF_TABLE_VERSION),
+ .BootMode = cpu_to_le32(0),
+ .EfiMemoryTop = cpu_to_le64(0),
+ .EfiMemoryBottom = cpu_to_le64(0),
+ .EfiFreeMemoryTop = cpu_to_le64(0),
+ .EfiFreeMemoryBottom = cpu_to_le64(0),
+ .EfiEndOfHobList = cpu_to_le64(0), /* initialized later */
+ };
+
+ tdvf_hob_add_memory_resources(tdx, &hob);
+
+ last_hob = tdvf_get_area(&hob, sizeof(*last_hob));
+ *last_hob = (EFI_HOB_GENERIC_HEADER) {
+ .HobType = EFI_HOB_TYPE_END_OF_HOB_LIST,
+ .HobLength = cpu_to_le16(sizeof(*last_hob)),
+ .Reserved = cpu_to_le32(0),
+ };
+ hit->EfiEndOfHobList = tdvf_current_guest_addr(&hob);
+}
diff --git a/hw/i386/tdvf-hob.h b/hw/i386/tdvf-hob.h
new file mode 100644
index 0000000..4fc6a37
--- /dev/null
+++ b/hw/i386/tdvf-hob.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef HW_I386_TD_HOB_H
+#define HW_I386_TD_HOB_H
+
+#include "hw/i386/tdvf.h"
+#include "target/i386/kvm/tdx.h"
+
+void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry *td_hob);
+
+#define EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE \
+ (EFI_RESOURCE_ATTRIBUTE_PRESENT | \
+ EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \
+ EFI_RESOURCE_ATTRIBUTE_TESTED)
+
+#define EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED \
+ (EFI_RESOURCE_ATTRIBUTE_PRESENT | \
+ EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \
+ EFI_RESOURCE_ATTRIBUTE_TESTED)
+
+#define EFI_RESOURCE_ATTRIBUTE_TDVF_MMIO \
+ (EFI_RESOURCE_ATTRIBUTE_PRESENT | \
+ EFI_RESOURCE_ATTRIBUTE_INITIALIZED | \
+ EFI_RESOURCE_ATTRIBUTE_UNCACHEABLE)
+
+#endif
diff --git a/hw/i386/tdvf.c b/hw/i386/tdvf.c
new file mode 100644
index 0000000..645d9d1
--- /dev/null
+++ b/hw/i386/tdvf.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2025 Intel Corporation
+ * Author: Isaku Yamahata <isaku.yamahata at gmail.com>
+ * <isaku.yamahata at intel.com>
+ * Xiaoyao Li <xiaoyao.li@intel.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+
+#include "hw/i386/pc.h"
+#include "hw/i386/tdvf.h"
+#include "system/kvm.h"
+
+#define TDX_METADATA_OFFSET_GUID "e47a6535-984a-4798-865e-4685a7bf8ec2"
+#define TDX_METADATA_VERSION 1
+#define TDVF_SIGNATURE 0x46564454 /* TDVF as little endian */
+#define TDVF_ALIGNMENT 4096
+
+/*
+ * The raw structs read from TDVF keep the naming convention of the
+ * TDVF Design Guide spec.
+ */
+typedef struct {
+ uint32_t DataOffset;
+ uint32_t RawDataSize;
+ uint64_t MemoryAddress;
+ uint64_t MemoryDataSize;
+ uint32_t Type;
+ uint32_t Attributes;
+} TdvfSectionEntry;
+
+typedef struct {
+ uint32_t Signature;
+ uint32_t Length;
+ uint32_t Version;
+ uint32_t NumberOfSectionEntries;
+ TdvfSectionEntry SectionEntries[];
+} TdvfMetadata;
+
+struct tdx_metadata_offset {
+ uint32_t offset;
+};
+
+static TdvfMetadata *tdvf_get_metadata(void *flash_ptr, int size)
+{
+ TdvfMetadata *metadata;
+ uint32_t offset = 0;
+ uint8_t *data;
+
+ if ((uint32_t) size != size) {
+ return NULL;
+ }
+
+ if (pc_system_ovmf_table_find(TDX_METADATA_OFFSET_GUID, &data, NULL)) {
+ offset = size - le32_to_cpu(((struct tdx_metadata_offset *)data)->offset);
+
+ if (offset + sizeof(*metadata) > size) {
+ return NULL;
+ }
+ } else {
+ error_report("Cannot find TDX_METADATA_OFFSET_GUID");
+ return NULL;
+ }
+
+ metadata = flash_ptr + offset;
+
+ /* Finally, verify the signature to determine if this is a TDVF image. */
+ metadata->Signature = le32_to_cpu(metadata->Signature);
+ if (metadata->Signature != TDVF_SIGNATURE) {
+ error_report("Invalid TDVF signature in metadata!");
+ return NULL;
+ }
+
+ /* Sanity check that the TDVF doesn't overlap its own metadata. */
+ metadata->Length = le32_to_cpu(metadata->Length);
+ if (offset + metadata->Length > size) {
+ return NULL;
+ }
+
+ /* Only version 1 is supported/defined. */
+ metadata->Version = le32_to_cpu(metadata->Version);
+ if (metadata->Version != TDX_METADATA_VERSION) {
+ return NULL;
+ }
+
+ return metadata;
+}
+
+static int tdvf_parse_and_check_section_entry(const TdvfSectionEntry *src,
+ TdxFirmwareEntry *entry)
+{
+ entry->data_offset = le32_to_cpu(src->DataOffset);
+ entry->data_len = le32_to_cpu(src->RawDataSize);
+ entry->address = le64_to_cpu(src->MemoryAddress);
+ entry->size = le64_to_cpu(src->MemoryDataSize);
+ entry->type = le32_to_cpu(src->Type);
+ entry->attributes = le32_to_cpu(src->Attributes);
+
+ /* sanity check */
+ if (entry->size < entry->data_len) {
+ error_report("Broken metadata RawDataSize 0x%x MemoryDataSize 0x%"PRIx64,
+ entry->data_len, entry->size);
+ return -1;
+ }
+ if (!QEMU_IS_ALIGNED(entry->address, TDVF_ALIGNMENT)) {
+ error_report("MemoryAddress 0x%"PRIx64" not page aligned", entry->address);
+ return -1;
+ }
+ if (!QEMU_IS_ALIGNED(entry->size, TDVF_ALIGNMENT)) {
+ error_report("MemoryDataSize 0x%"PRIx64" not page aligned", entry->size);
+ return -1;
+ }
+
+ switch (entry->type) {
+ case TDVF_SECTION_TYPE_BFV:
+ case TDVF_SECTION_TYPE_CFV:
+ /* Sections that must be copied from the firmware image to TD memory */
+ if (entry->data_len == 0) {
+ error_report("%d section with RawDataSize == 0", entry->type);
+ return -1;
+ }
+ break;
+ case TDVF_SECTION_TYPE_TD_HOB:
+ case TDVF_SECTION_TYPE_TEMP_MEM:
+ /* Sections that need not be copied from the firmware image */
+ if (entry->data_len != 0) {
+ error_report("%d section with RawDataSize 0x%x != 0",
+ entry->type, entry->data_len);
+ return -1;
+ }
+ break;
+ default:
+ error_report("TDVF contains unsupported section type %d", entry->type);
+ return -1;
+ }
+
+ return 0;
+}
+
+int tdvf_parse_metadata(TdxFirmware *fw, void *flash_ptr, int size)
+{
+ g_autofree TdvfSectionEntry *sections = NULL;
+ TdvfMetadata *metadata;
+ ssize_t entries_size;
+ int i;
+
+ metadata = tdvf_get_metadata(flash_ptr, size);
+ if (!metadata) {
+ return -EINVAL;
+ }
+
+ /* load and parse metadata entries */
+ fw->nr_entries = le32_to_cpu(metadata->NumberOfSectionEntries);
+ if (fw->nr_entries < 2) {
+ error_report("Invalid number of fw entries (%u) in TDVF Metadata",
+ fw->nr_entries);
+ return -EINVAL;
+ }
+
+ entries_size = fw->nr_entries * sizeof(TdvfSectionEntry);
+ if (metadata->Length != sizeof(*metadata) + entries_size) {
+ error_report("TDVF metadata len (0x%x) mismatch, expected (0x%x)",
+ metadata->Length,
+ (uint32_t)(sizeof(*metadata) + entries_size));
+ return -EINVAL;
+ }
+
+ fw->entries = g_new(TdxFirmwareEntry, fw->nr_entries);
+ sections = g_new(TdvfSectionEntry, fw->nr_entries);
+
+ memcpy(sections, (void *)metadata + sizeof(*metadata), entries_size);
+
+ for (i = 0; i < fw->nr_entries; i++) {
+ if (tdvf_parse_and_check_section_entry(&sections[i], &fw->entries[i])) {
+ goto err;
+ }
+ }
+
+ fw->mem_ptr = flash_ptr;
+ return 0;
+
+err:
+ g_free(fw->entries);
+ fw->entries = NULL;
+ return -EINVAL;
+}
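A hedged sketch of a consumer of the parsed table (the copy step is elided, and it assumes the TD_HOB entry's mem_ptr was mapped beforehand): BFV/CFV payloads are copied out of the flash image, while HOB and TEMP_MEM sections only reserve guest ranges:

    static void example_consume_tdvf(TdxGuest *tdx, TdxFirmware *fw)
    {
        for (int i = 0; i < fw->nr_entries; i++) {
            TdxFirmwareEntry *e = &fw->entries[i];

            switch (e->type) {
            case TDVF_SECTION_TYPE_BFV:
            case TDVF_SECTION_TYPE_CFV:
                /* copy e->data_len bytes from fw->mem_ptr + e->data_offset
                 * to guest address e->address before adding to the TD */
                break;
            case TDVF_SECTION_TYPE_TD_HOB:
                tdvf_hob_create(tdx, e);   /* fills e->mem_ptr, see above */
                break;
            default:
                break;
            }
        }
    }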
diff --git a/hw/i386/x86-common.c b/hw/i386/x86-common.c
index 1b0671c..7512be6 100644
--- a/hw/i386/x86-common.c
+++ b/hw/i386/x86-common.c
@@ -44,6 +44,7 @@
#include "standard-headers/asm-x86/bootparam.h"
#include CONFIG_DEVICES
#include "kvm/kvm_i386.h"
+#include "kvm/tdx.h"
#ifdef CONFIG_XEN_EMU
#include "hw/xen/xen.h"
@@ -951,7 +952,7 @@ void x86_load_linux(X86MachineState *x86ms,
* kernel on the other side of the fw_cfg interface matches the hash of the
* file the user passed in.
*/
- if (!sev_enabled() && protocol > 0) {
+ if (!MACHINE(x86ms)->cgs && protocol > 0) {
memcpy(setup, header, MIN(sizeof(header), setup_size));
}
@@ -1035,11 +1036,14 @@ void x86_bios_rom_init(X86MachineState *x86ms, const char *default_firmware,
if (machine_require_guest_memfd(MACHINE(x86ms))) {
memory_region_init_ram_guest_memfd(&x86ms->bios, NULL, "pc.bios",
bios_size, &error_fatal);
+ if (is_tdx_vm()) {
+ tdx_set_tdvf_region(&x86ms->bios);
+ }
} else {
memory_region_init_ram(&x86ms->bios, NULL, "pc.bios",
bios_size, &error_fatal);
}
- if (sev_enabled()) {
+ if (sev_enabled() || is_tdx_vm()) {
/*
* The concept of a "reset" simply doesn't exist for
* confidential computing guests, we have to destroy and
diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
index d34a684..c127a44 100644
--- a/hw/i386/x86-iommu.c
+++ b/hw/i386/x86-iommu.c
@@ -130,6 +130,7 @@ static const Property x86_iommu_properties[] = {
intr_supported, ON_OFF_AUTO_AUTO),
DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false),
DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true),
+ DEFINE_PROP_BOOL("dma-translation", X86IOMMUState, dma_translation, true),
};
static void x86_iommu_class_init(ObjectClass *klass, const void *data)
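The new "dma-translation" knob defaults to on; the presumed intent is to let an IOMMU model advertise interrupt remapping while opting out of DMA address translation (dma-translation=off on the device command line). A consumer-side sketch under that assumption:

    /* Assumed consumer behaviour: gate the address-translation setup on
     * the flag, leaving interrupt remapping unaffected. */
    static void example_iommu_realize(X86IOMMUState *s)
    {
        if (!s->dma_translation) {
            return;   /* interrupt remapping only */
        }
        /* ...normal DMA remapping initialisation... */
    }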
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index e2d0409..f80533d 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -382,7 +382,6 @@ static void x86_machine_class_init(ObjectClass *oc, const void *data)
mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
mc->kvm_type = x86_kvm_type;
- x86mc->save_tsc_khz = true;
x86mc->fwcfg_dma_enabled = true;
nc->nmi_monitor_handler = x86_nmi;