aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStefan Hajnoczi <stefanha@redhat.com>2025-01-16 09:02:17 -0500
committerStefan Hajnoczi <stefanha@redhat.com>2025-01-16 09:02:18 -0500
commit0e3327b690b76b7c3966b028110ee053cc16a385 (patch)
tree1bf1c254a5589f5e9f595a55eaa2a6e93bb4c2be
parent7433709a147706ad7d1956b15669279933d0f82b (diff)
parent3634039b93cc51816263e0cb5ba32e1b61142d5d (diff)
downloadqemu-0e3327b690b76b7c3966b028110ee053cc16a385.zip
qemu-0e3327b690b76b7c3966b028110ee053cc16a385.tar.gz
qemu-0e3327b690b76b7c3966b028110ee053cc16a385.tar.bz2
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging
virtio,pc,pci: features, fixes, cleanups The big thing here are: stage-1 translation in vtd internal migration in vhost-user ghes driver preparation for error injection new resource uuid feature in virtio gpu new vmclock device And as usual, fixes and cleanups. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> # -----BEGIN PGP SIGNATURE----- # # iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmeIOiIPHG1zdEByZWRo # YXQuY29tAAoJECgfDbjSjVRpORgIAL0clwZxQL7PIPJ91FwXc1bo6Do/HYquAzvH # eA+ryCG5S5ewh/e2R8SdIUG7nYesEMWJGVL1gb3BFu7wgGh1aLaaTxQ1LIo5HpRF # P0Ak3QO7TKIsSEcZIz9h3eMEpg6X9d8i2h7llp7H3qqXBbduO+cGfeNH/fZD5IEl # 7DFvXuJUgUtZb38I+qtcO+9EQFKGHjgdQAN5P/I4vawWJdxN9sBfT4YVEgpVhiq/ # ALxdSeaEiXA4EXexdHVZhXiQzEBsCQ78RZIIDiRE8I34cVY7rolTodKRfr4bip3P # 6Llu11yvzNi1gppOzkny3QFsRza3hV0RisWYjAMTwLhNCdi/mHQ= # =GjDq # -----END PGP SIGNATURE----- # gpg: Signature made Wed 15 Jan 2025 17:43:46 EST # gpg: using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469 # gpg: issuer "mst@redhat.com" # gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full] # gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full] # Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67 # Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469 * tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (49 commits) hw/acpi: Add vmclock device virtio-net: vhost-user: Implement internal migration vhost: Add stubs for the migration state transfer interface hw/cxl: Fix msix_notify: Assertion `vector < dev->msix_entries_nr` tests: acpi: update expected blobs pci: acpi: Windows 'PCI Label Id' bug workaround tests: acpi: whitelist expected blobs docs: acpi_hest_ghes: fix documentation for CPER size acpi/ghes: Change ghes fill logic to work with only one source acpi/ghes: move offset calculus to a separate function acpi/ghes: better name the offset of the hardware error firmware acpi/ghes: rename etc/hardware_error file macros acpi/ghes: don't crash QEMU if ghes GED is not found acpi/ghes: better name GHES memory error function acpi/ghes: make the GHES record generation more generic acpi/ghes: don't check if physical_address is not zero acpi/ghes: Change the type for source_id acpi/ghes: Remove a duplicated out of bounds check acpi/ghes: Fix acpi_ghes_record_errors() argument acpi/ghes: better handle source_id and notification ... Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-rw-r--r--MAINTAINERS1
-rw-r--r--docs/specs/acpi_hest_ghes.rst6
-rw-r--r--hw/acpi/Kconfig5
-rw-r--r--hw/acpi/cpu.c43
-rw-r--r--hw/acpi/generic_event_device.c4
-rw-r--r--hw/acpi/ghes-stub.c2
-rw-r--r--hw/acpi/ghes.c242
-rw-r--r--hw/acpi/meson.build1
-rw-r--r--hw/acpi/vmclock.c179
-rw-r--r--hw/arm/virt-acpi-build.c5
-rw-r--r--hw/display/vhost-user-gpu.c8
-rw-r--r--hw/display/virtio-gpu-base.c3
-rw-r--r--hw/i386/Kconfig1
-rw-r--r--hw/i386/acpi-build.c43
-rw-r--r--hw/i386/intel_iommu.c732
-rw-r--r--hw/i386/intel_iommu_internal.h101
-rw-r--r--hw/i386/pc.c1
-rw-r--r--hw/mem/cxl_type3.c2
-rw-r--r--hw/net/virtio-net.c135
-rw-r--r--hw/pci/msix.c2
-rw-r--r--hw/pci/pcie.c12
-rw-r--r--include/hw/acpi/ghes.h16
-rw-r--r--include/hw/acpi/vmclock.h34
-rw-r--r--include/hw/i386/intel_iommu.h8
-rw-r--r--include/hw/virtio/vhost.h23
-rw-r--r--include/hw/virtio/virtio-gpu.h3
-rw-r--r--include/standard-headers/linux/vmclock-abi.h182
-rwxr-xr-xscripts/update-linux-headers.sh1
-rw-r--r--target/arm/kvm.c2
-rw-r--r--tests/data/acpi/x86/pc/DSDTbin8526 -> 8611 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.acpierstbin8437 -> 8522 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.acpihmatbin9851 -> 9936 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.bridgebin15397 -> 15482 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.cphpbin8990 -> 9075 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.dimmpxmbin10180 -> 10265 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.hpbridgebin8477 -> 8562 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.hpbrrootbin5033 -> 5100 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.ipmikcsbin8598 -> 8683 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.memhpbin9885 -> 9970 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.nohpetbin8384 -> 8469 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.numamembin8532 -> 8617 bytes
-rw-r--r--tests/data/acpi/x86/pc/DSDT.roothpbin12319 -> 12404 bytes
-rw-r--r--tests/data/acpi/x86/q35/DMAR.dmarbin120 -> 120 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDTbin8355 -> 8440 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.acpierstbin8372 -> 8457 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.acpihmatbin9680 -> 9765 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.acpihmat-generic-xbin12565 -> 12650 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.acpihmat-noinitiatorbin8634 -> 8719 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.applesmcbin8401 -> 8486 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.bridgebin11968 -> 12053 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.core-countbin12913 -> 12998 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.core-count2bin33770 -> 33855 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.cphpbin8819 -> 8904 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.cxlbin13146 -> 13231 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.dimmpxmbin10009 -> 10094 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.ipmibtbin8430 -> 8515 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.ipmismbusbin8443 -> 8528 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.ivrsbin8372 -> 8457 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.memhpbin9714 -> 9799 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.mmio64bin9485 -> 9570 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.multi-bridgebin13208 -> 13293 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.noacpihpbin8235 -> 8302 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.nohpetbin8213 -> 8298 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.numamembin8361 -> 8446 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.pvpanic-isabin8456 -> 8541 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.thread-countbin12913 -> 12998 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.thread-count2bin33770 -> 33855 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.tis.tpm12bin8961 -> 9046 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.tis.tpm2bin8987 -> 9072 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.type4-countbin18589 -> 18674 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.viotbin14612 -> 14697 bytes
-rw-r--r--tests/data/acpi/x86/q35/DSDT.xapicbin35718 -> 35803 bytes
-rw-r--r--tests/qtest/intel-iommu-test.c64
-rw-r--r--tests/qtest/meson.build1
74 files changed, 1536 insertions, 326 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 8b9d9a7..a928ce3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3711,6 +3711,7 @@ F: hw/i386/intel_iommu.c
F: hw/i386/intel_iommu_internal.h
F: include/hw/i386/intel_iommu.h
F: tests/functional/test_intel_iommu.py
+F: tests/qtest/intel-iommu-test.c
AMD-Vi Emulation
S: Orphan
diff --git a/docs/specs/acpi_hest_ghes.rst b/docs/specs/acpi_hest_ghes.rst
index 68f1fbe..c3e9f8d 100644
--- a/docs/specs/acpi_hest_ghes.rst
+++ b/docs/specs/acpi_hest_ghes.rst
@@ -67,8 +67,10 @@ Design Details
(3) The address registers table contains N Error Block Address entries
and N Read Ack Register entries. The size for each entry is 8-byte.
The Error Status Data Block table contains N Error Status Data Block
- entries. The size for each entry is 4096(0x1000) bytes. The total size
- for the "etc/hardware_errors" fw_cfg blob is (N * 8 * 2 + N * 4096) bytes.
+ entries. The size for each entry is defined at the source code as
+ ACPI_GHES_MAX_RAW_DATA_LENGTH (currently 1024 bytes). The total size
+ for the "etc/hardware_errors" fw_cfg blob is
+ (N * 8 * 2 + N * ACPI_GHES_MAX_RAW_DATA_LENGTH) bytes.
N is the number of the kinds of hardware error sources.
(4) QEMU generates the ACPI linker/loader script for the firmware. The
diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
index e07d320..1d4e9f0 100644
--- a/hw/acpi/Kconfig
+++ b/hw/acpi/Kconfig
@@ -60,6 +60,11 @@ config ACPI_VMGENID
default y
depends on PC
+config ACPI_VMCLOCK
+ bool
+ default y
+ depends on PC
+
config ACPI_VIOT
bool
depends on ACPI
diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
index 9d530a2..f70a2c0 100644
--- a/hw/acpi/cpu.c
+++ b/hw/acpi/cpu.c
@@ -327,6 +327,7 @@ const VMStateDescription vmstate_cpu_hotplug = {
#define CPU_EJECT_METHOD "CEJ0"
#define CPU_OST_METHOD "COST"
#define CPU_ADDED_LIST "CNEW"
+#define CPU_EJ_LIST "CEJL"
#define CPU_ENABLED "CPEN"
#define CPU_SELECTOR "CSEL"
@@ -488,7 +489,6 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
method = aml_method(CPU_SCAN_METHOD, 0, AML_SERIALIZED);
{
const uint8_t max_cpus_per_pass = 255;
- Aml *else_ctx;
Aml *while_ctx, *while_ctx2;
Aml *has_event = aml_local(0);
Aml *dev_chk = aml_int(1);
@@ -499,6 +499,8 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
Aml *uid = aml_local(3);
Aml *has_job = aml_local(4);
Aml *new_cpus = aml_name(CPU_ADDED_LIST);
+ Aml *ej_cpus = aml_name(CPU_EJ_LIST);
+ Aml *num_ej_cpus = aml_local(5);
aml_append(method, aml_acquire(ctrl_lock, 0xFFFF));
@@ -513,6 +515,8 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
*/
aml_append(method, aml_name_decl(CPU_ADDED_LIST,
aml_package(max_cpus_per_pass)));
+ aml_append(method, aml_name_decl(CPU_EJ_LIST,
+ aml_package(max_cpus_per_pass)));
aml_append(method, aml_store(zero, uid));
aml_append(method, aml_store(one, has_job));
@@ -527,6 +531,7 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
aml_append(while_ctx2, aml_store(one, has_event));
aml_append(while_ctx2, aml_store(zero, num_added_cpus));
+ aml_append(while_ctx2, aml_store(zero, num_ej_cpus));
/*
* Scan CPUs, till there are CPUs with events or
@@ -559,8 +564,10 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
* if CPU_ADDED_LIST is full, exit inner loop and process
* collected CPUs
*/
- ifctx = aml_if(
- aml_equal(num_added_cpus, aml_int(max_cpus_per_pass)));
+ ifctx = aml_if(aml_lor(
+ aml_equal(num_added_cpus, aml_int(max_cpus_per_pass)),
+ aml_equal(num_ej_cpus, aml_int(max_cpus_per_pass))
+ ));
{
aml_append(ifctx, aml_store(one, has_job));
aml_append(ifctx, aml_break());
@@ -577,16 +584,16 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
aml_append(ifctx, aml_store(one, has_event));
}
aml_append(while_ctx, ifctx);
- else_ctx = aml_else();
+
ifctx = aml_if(aml_equal(rm_evt, one));
{
- aml_append(ifctx,
- aml_call2(CPU_NOTIFY_METHOD, uid, eject_req));
- aml_append(ifctx, aml_store(one, rm_evt));
+ /* cache to be removed CPUs to Notify later */
+ aml_append(ifctx, aml_store(uid,
+ aml_index(ej_cpus, num_ej_cpus)));
+ aml_append(ifctx, aml_increment(num_ej_cpus));
aml_append(ifctx, aml_store(one, has_event));
}
- aml_append(else_ctx, ifctx);
- aml_append(while_ctx, else_ctx);
+ aml_append(while_ctx, ifctx);
aml_append(while_ctx, aml_increment(uid));
}
aml_append(while_ctx2, while_ctx);
@@ -620,6 +627,24 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts,
aml_append(while_ctx, aml_increment(cpu_idx));
}
aml_append(while_ctx2, while_ctx);
+
+ /*
+ * Notify OSPM about to be removed CPUs and clear remove flag
+ */
+ aml_append(while_ctx2, aml_store(zero, cpu_idx));
+ while_ctx = aml_while(aml_lless(cpu_idx, num_ej_cpus));
+ {
+ aml_append(while_ctx,
+ aml_store(aml_derefof(aml_index(ej_cpus, cpu_idx)),
+ uid));
+ aml_append(while_ctx,
+ aml_call2(CPU_NOTIFY_METHOD, uid, eject_req));
+ aml_append(while_ctx, aml_store(uid, cpu_selector));
+ aml_append(while_ctx, aml_store(one, rm_evt));
+ aml_append(while_ctx, aml_increment(cpu_idx));
+ }
+ aml_append(while_ctx2, while_ctx);
+
/*
* If another batch is needed, then it will resume scanning
* exactly at -- and not after -- the last CPU that's currently
diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
index 58540c0..c85d97c 100644
--- a/hw/acpi/generic_event_device.c
+++ b/hw/acpi/generic_event_device.c
@@ -363,7 +363,7 @@ static const VMStateDescription vmstate_ghes = {
.version_id = 1,
.minimum_version_id = 1,
.fields = (const VMStateField[]) {
- VMSTATE_UINT64(ghes_addr_le, AcpiGhesState),
+ VMSTATE_UINT64(hw_error_le, AcpiGhesState),
VMSTATE_END_OF_LIST()
},
};
@@ -371,7 +371,7 @@ static const VMStateDescription vmstate_ghes = {
static bool ghes_needed(void *opaque)
{
AcpiGedState *s = opaque;
- return s->ghes_state.ghes_addr_le;
+ return s->ghes_state.hw_error_le;
}
static const VMStateDescription vmstate_ghes_state = {
diff --git a/hw/acpi/ghes-stub.c b/hw/acpi/ghes-stub.c
index c315de1..7cec181 100644
--- a/hw/acpi/ghes-stub.c
+++ b/hw/acpi/ghes-stub.c
@@ -11,7 +11,7 @@
#include "qemu/osdep.h"
#include "hw/acpi/ghes.h"
-int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
{
return -1;
}
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index e9511d9..b709c17 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -28,15 +28,12 @@
#include "hw/nvram/fw_cfg.h"
#include "qemu/uuid.h"
-#define ACPI_GHES_ERRORS_FW_CFG_FILE "etc/hardware_errors"
-#define ACPI_GHES_DATA_ADDR_FW_CFG_FILE "etc/hardware_errors_addr"
+#define ACPI_HW_ERROR_FW_CFG_FILE "etc/hardware_errors"
+#define ACPI_HW_ERROR_ADDR_FW_CFG_FILE "etc/hardware_errors_addr"
/* The max size in bytes for one error block */
#define ACPI_GHES_MAX_RAW_DATA_LENGTH (1 * KiB)
-/* Now only support ARMv8 SEA notification type error source */
-#define ACPI_GHES_ERROR_SOURCE_COUNT 1
-
/* Generic Hardware Error Source version 2 */
#define ACPI_GHES_SOURCE_GENERIC_ERROR_V2 10
@@ -184,51 +181,24 @@ static void acpi_ghes_build_append_mem_cper(GArray *table,
build_append_int_noprefix(table, 0, 7);
}
-static int acpi_ghes_record_mem_error(uint64_t error_block_address,
- uint64_t error_physical_addr)
+static void
+ghes_gen_err_data_uncorrectable_recoverable(GArray *block,
+ const uint8_t *section_type,
+ int data_length)
{
- GArray *block;
-
- /* Memory Error Section Type */
- const uint8_t uefi_cper_mem_sec[] =
- UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \
- 0xED, 0x7C, 0x83, 0xB1);
-
/* invalid fru id: ACPI 4.0: 17.3.2.6.1 Generic Error Data,
* Table 17-13 Generic Error Data Entry
*/
QemuUUID fru_id = {};
- uint32_t data_length;
-
- block = g_array_new(false, true /* clear */, 1);
-
- /* This is the length if adding a new generic error data entry*/
- data_length = ACPI_GHES_DATA_LENGTH + ACPI_GHES_MEM_CPER_LENGTH;
- /*
- * It should not run out of the preallocated memory if adding a new generic
- * error data entry
- */
- assert((data_length + ACPI_GHES_GESB_SIZE) <=
- ACPI_GHES_MAX_RAW_DATA_LENGTH);
/* Build the new generic error status block header */
acpi_ghes_generic_error_status(block, ACPI_GEBS_UNCORRECTABLE,
0, 0, data_length, ACPI_CPER_SEV_RECOVERABLE);
/* Build this new generic error data entry header */
- acpi_ghes_generic_error_data(block, uefi_cper_mem_sec,
+ acpi_ghes_generic_error_data(block, section_type,
ACPI_CPER_SEV_RECOVERABLE, 0, 0,
ACPI_GHES_MEM_CPER_LENGTH, fru_id, 0);
-
- /* Build the memory section CPER for above new generic error data entry */
- acpi_ghes_build_append_mem_cper(block, error_physical_addr);
-
- /* Write the generic error data entry into guest memory */
- cpu_physical_memory_write(error_block_address, block->data, block->len);
-
- g_array_free(block, true);
-
- return 0;
}
/*
@@ -236,7 +206,7 @@ static int acpi_ghes_record_mem_error(uint64_t error_block_address,
* Initialize "etc/hardware_errors" and "etc/hardware_errors_addr" fw_cfg blobs.
* See docs/specs/acpi_hest_ghes.rst for blobs format.
*/
-void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
+static void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
{
int i, error_status_block_offset;
@@ -264,7 +234,7 @@ void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
ACPI_GHES_MAX_RAW_DATA_LENGTH * ACPI_GHES_ERROR_SOURCE_COUNT);
/* Tell guest firmware to place hardware_errors blob into RAM */
- bios_linker_loader_alloc(linker, ACPI_GHES_ERRORS_FW_CFG_FILE,
+ bios_linker_loader_alloc(linker, ACPI_HW_ERROR_FW_CFG_FILE,
hardware_errors, sizeof(uint64_t), false);
for (i = 0; i < ACPI_GHES_ERROR_SOURCE_COUNT; i++) {
@@ -273,23 +243,31 @@ void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker)
* corresponding "Generic Error Status Block"
*/
bios_linker_loader_add_pointer(linker,
- ACPI_GHES_ERRORS_FW_CFG_FILE, sizeof(uint64_t) * i,
- sizeof(uint64_t), ACPI_GHES_ERRORS_FW_CFG_FILE,
- error_status_block_offset + i * ACPI_GHES_MAX_RAW_DATA_LENGTH);
+ ACPI_HW_ERROR_FW_CFG_FILE,
+ sizeof(uint64_t) * i,
+ sizeof(uint64_t),
+ ACPI_HW_ERROR_FW_CFG_FILE,
+ error_status_block_offset +
+ i * ACPI_GHES_MAX_RAW_DATA_LENGTH);
}
/*
* tell firmware to write hardware_errors GPA into
* hardware_errors_addr fw_cfg, once the former has been initialized.
*/
- bios_linker_loader_write_pointer(linker, ACPI_GHES_DATA_ADDR_FW_CFG_FILE,
- 0, sizeof(uint64_t), ACPI_GHES_ERRORS_FW_CFG_FILE, 0);
+ bios_linker_loader_write_pointer(linker, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, 0,
+ sizeof(uint64_t),
+ ACPI_HW_ERROR_FW_CFG_FILE, 0);
}
/* Build Generic Hardware Error Source version 2 (GHESv2) */
-static void build_ghes_v2(GArray *table_data, int source_id, BIOSLinker *linker)
+static void build_ghes_v2(GArray *table_data,
+ BIOSLinker *linker,
+ enum AcpiGhesNotifyType notify,
+ uint16_t source_id)
{
uint64_t address_offset;
+
/*
* Type:
* Generic Hardware Error Source version 2(GHESv2 - Type 10)
@@ -316,21 +294,13 @@ static void build_ghes_v2(GArray *table_data, int source_id, BIOSLinker *linker)
build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0x40, 0,
4 /* QWord access */, 0);
bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
- address_offset + GAS_ADDR_OFFSET, sizeof(uint64_t),
- ACPI_GHES_ERRORS_FW_CFG_FILE, source_id * sizeof(uint64_t));
+ address_offset + GAS_ADDR_OFFSET,
+ sizeof(uint64_t),
+ ACPI_HW_ERROR_FW_CFG_FILE,
+ source_id * sizeof(uint64_t));
- switch (source_id) {
- case ACPI_HEST_SRC_ID_SEA:
- /*
- * Notification Structure
- * Now only enable ARMv8 SEA notification type
- */
- build_ghes_hw_error_notification(table_data, ACPI_GHES_NOTIFY_SEA);
- break;
- default:
- error_report("Not support this error source");
- abort();
- }
+ /* Notification Structure */
+ build_ghes_hw_error_notification(table_data, notify);
/* Error Status Block Length */
build_append_int_noprefix(table_data, ACPI_GHES_MAX_RAW_DATA_LENGTH, 4);
@@ -344,9 +314,11 @@ static void build_ghes_v2(GArray *table_data, int source_id, BIOSLinker *linker)
build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0x40, 0,
4 /* QWord access */, 0);
bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
- address_offset + GAS_ADDR_OFFSET,
- sizeof(uint64_t), ACPI_GHES_ERRORS_FW_CFG_FILE,
- (ACPI_GHES_ERROR_SOURCE_COUNT + source_id) * sizeof(uint64_t));
+ address_offset + GAS_ADDR_OFFSET,
+ sizeof(uint64_t),
+ ACPI_HW_ERROR_FW_CFG_FILE,
+ (ACPI_GHES_ERROR_SOURCE_COUNT + source_id)
+ * sizeof(uint64_t));
/*
* Read Ack Preserve field
@@ -359,17 +331,21 @@ static void build_ghes_v2(GArray *table_data, int source_id, BIOSLinker *linker)
}
/* Build Hardware Error Source Table */
-void acpi_build_hest(GArray *table_data, BIOSLinker *linker,
+void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
+ BIOSLinker *linker,
const char *oem_id, const char *oem_table_id)
{
AcpiTable table = { .sig = "HEST", .rev = 1,
.oem_id = oem_id, .oem_table_id = oem_table_id };
+ build_ghes_error_table(hardware_errors, linker);
+
acpi_table_begin(&table, table_data);
/* Error Source Count */
build_append_int_noprefix(table_data, ACPI_GHES_ERROR_SOURCE_COUNT, 4);
- build_ghes_v2(table_data, ACPI_HEST_SRC_ID_SEA, linker);
+ build_ghes_v2(table_data, linker,
+ ACPI_GHES_NOTIFY_SEA, ACPI_HEST_SRC_ID_SEA);
acpi_table_end(linker, &table);
}
@@ -378,70 +354,132 @@ void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
GArray *hardware_error)
{
/* Create a read-only fw_cfg file for GHES */
- fw_cfg_add_file(s, ACPI_GHES_ERRORS_FW_CFG_FILE, hardware_error->data,
+ fw_cfg_add_file(s, ACPI_HW_ERROR_FW_CFG_FILE, hardware_error->data,
hardware_error->len);
/* Create a read-write fw_cfg file for Address */
- fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL,
- NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false);
+ fw_cfg_add_file_callback(s, ACPI_HW_ERROR_ADDR_FW_CFG_FILE, NULL, NULL,
+ NULL, &(ags->hw_error_le), sizeof(ags->hw_error_le), false);
ags->present = true;
}
-int acpi_ghes_record_errors(uint8_t source_id, uint64_t physical_address)
+static void get_hw_error_offsets(uint64_t ghes_addr,
+ uint64_t *cper_addr,
+ uint64_t *read_ack_register_addr)
{
- uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
- uint64_t start_addr;
- bool ret = -1;
+ if (!ghes_addr) {
+ return;
+ }
+
+ /*
+ * non-HEST version supports only one source, so no need to change
+ * the start offset based on the source ID. Also, we can't validate
+ * the source ID, as it is stored inside the HEST table.
+ */
+
+ cpu_physical_memory_read(ghes_addr, cper_addr,
+ sizeof(*cper_addr));
+
+ *cper_addr = le64_to_cpu(*cper_addr);
+
+ /*
+ * As the current version supports only one source, the ack offset is
+ * just sizeof(uint64_t).
+ */
+ *read_ack_register_addr = ghes_addr + sizeof(uint64_t);
+}
+
+void ghes_record_cper_errors(const void *cper, size_t len,
+ uint16_t source_id, Error **errp)
+{
+ uint64_t cper_addr = 0, read_ack_register_addr = 0, read_ack_register;
AcpiGedState *acpi_ged_state;
AcpiGhesState *ags;
- assert(source_id < ACPI_HEST_SRC_ID_RESERVED);
+ if (len > ACPI_GHES_MAX_RAW_DATA_LENGTH) {
+ error_setg(errp, "GHES CPER record is too big: %zd", len);
+ return;
+ }
acpi_ged_state = ACPI_GED(object_resolve_path_type("", TYPE_ACPI_GED,
NULL));
- g_assert(acpi_ged_state);
+ if (!acpi_ged_state) {
+ error_setg(errp, "Can't find ACPI_GED object");
+ return;
+ }
ags = &acpi_ged_state->ghes_state;
- start_addr = le64_to_cpu(ags->ghes_addr_le);
+ assert(ACPI_GHES_ERROR_SOURCE_COUNT == 1);
+ get_hw_error_offsets(le64_to_cpu(ags->hw_error_le),
+ &cper_addr, &read_ack_register_addr);
+
+ if (!cper_addr) {
+ error_setg(errp, "can not find Generic Error Status Block");
+ return;
+ }
+
+ cpu_physical_memory_read(read_ack_register_addr,
+ &read_ack_register, sizeof(read_ack_register));
+
+ /* zero means OSPM does not acknowledge the error */
+ if (!read_ack_register) {
+ error_setg(errp,
+ "OSPM does not acknowledge previous error,"
+ " so can not record CPER for current error anymore");
+ return;
+ }
+
+ read_ack_register = cpu_to_le64(0);
+ /*
+ * Clear the Read Ack Register, OSPM will write 1 to this register when
+ * it acknowledges the error.
+ */
+ cpu_physical_memory_write(read_ack_register_addr,
+ &read_ack_register, sizeof(uint64_t));
+
+ /* Write the generic error data entry into guest memory */
+ cpu_physical_memory_write(cper_addr, cper, len);
+
+ return;
+}
- if (physical_address) {
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t physical_address)
+{
+ /* Memory Error Section Type */
+ const uint8_t guid[] =
+ UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \
+ 0xED, 0x7C, 0x83, 0xB1);
+ Error *errp = NULL;
+ int data_length;
+ GArray *block;
- if (source_id < ACPI_HEST_SRC_ID_RESERVED) {
- start_addr += source_id * sizeof(uint64_t);
- }
+ block = g_array_new(false, true /* clear */, 1);
- cpu_physical_memory_read(start_addr, &error_block_addr,
- sizeof(error_block_addr));
+ data_length = ACPI_GHES_DATA_LENGTH + ACPI_GHES_MEM_CPER_LENGTH;
+ /*
+ * It should not run out of the preallocated memory if adding a new generic
+ * error data entry
+ */
+ assert((data_length + ACPI_GHES_GESB_SIZE) <=
+ ACPI_GHES_MAX_RAW_DATA_LENGTH);
- error_block_addr = le64_to_cpu(error_block_addr);
+ ghes_gen_err_data_uncorrectable_recoverable(block, guid, data_length);
- read_ack_register_addr = start_addr +
- ACPI_GHES_ERROR_SOURCE_COUNT * sizeof(uint64_t);
+ /* Build the memory section CPER for above new generic error data entry */
+ acpi_ghes_build_append_mem_cper(block, physical_address);
- cpu_physical_memory_read(read_ack_register_addr,
- &read_ack_register, sizeof(read_ack_register));
+ /* Report the error */
+ ghes_record_cper_errors(block->data, block->len, source_id, &errp);
- /* zero means OSPM does not acknowledge the error */
- if (!read_ack_register) {
- error_report("OSPM does not acknowledge previous error,"
- " so can not record CPER for current error anymore");
- } else if (error_block_addr) {
- read_ack_register = cpu_to_le64(0);
- /*
- * Clear the Read Ack Register, OSPM will write it to 1 when
- * it acknowledges this error.
- */
- cpu_physical_memory_write(read_ack_register_addr,
- &read_ack_register, sizeof(uint64_t));
+ g_array_free(block, true);
- ret = acpi_ghes_record_mem_error(error_block_addr,
- physical_address);
- } else
- error_report("can not find Generic Error Status Block");
+ if (errp) {
+ error_report_err(errp);
+ return -1;
}
- return ret;
+ return 0;
}
bool acpi_ghes_present(void)
diff --git a/hw/acpi/meson.build b/hw/acpi/meson.build
index c8854f4..73f02b9 100644
--- a/hw/acpi/meson.build
+++ b/hw/acpi/meson.build
@@ -15,6 +15,7 @@ acpi_ss.add(when: 'CONFIG_ACPI_NVDIMM', if_false: files('acpi-nvdimm-stub.c'))
acpi_ss.add(when: 'CONFIG_ACPI_PCI', if_true: files('pci.c'))
acpi_ss.add(when: 'CONFIG_ACPI_CXL', if_true: files('cxl.c'), if_false: files('cxl-stub.c'))
acpi_ss.add(when: 'CONFIG_ACPI_VMGENID', if_true: files('vmgenid.c'))
+acpi_ss.add(when: 'CONFIG_ACPI_VMCLOCK', if_true: files('vmclock.c'))
acpi_ss.add(when: 'CONFIG_ACPI_HW_REDUCED', if_true: files('generic_event_device.c'))
acpi_ss.add(when: 'CONFIG_ACPI_HMAT', if_true: files('hmat.c'))
acpi_ss.add(when: 'CONFIG_ACPI_APEI', if_true: files('ghes.c'), if_false: files('ghes-stub.c'))
diff --git a/hw/acpi/vmclock.c b/hw/acpi/vmclock.c
new file mode 100644
index 0000000..7387e5c
--- /dev/null
+++ b/hw/acpi/vmclock.c
@@ -0,0 +1,179 @@
+/*
+ * Virtual Machine Clock Device
+ *
+ * Copyright © 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse <dwmw2@infradead.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "hw/i386/e820_memory_layout.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/vmclock.h"
+#include "hw/nvram/fw_cfg.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "migration/vmstate.h"
+#include "system/reset.h"
+
+#include "standard-headers/linux/vmclock-abi.h"
+
+void vmclock_build_acpi(VmclockState *vms, GArray *table_data,
+ BIOSLinker *linker, const char *oem_id)
+{
+ Aml *ssdt, *dev, *scope, *crs;
+ AcpiTable table = { .sig = "SSDT", .rev = 1,
+ .oem_id = oem_id, .oem_table_id = "VMCLOCK" };
+
+ /* Put VMCLOCK into a separate SSDT table */
+ acpi_table_begin(&table, table_data);
+ ssdt = init_aml_allocator();
+
+ scope = aml_scope("\\_SB");
+ dev = aml_device("VCLK");
+ aml_append(dev, aml_name_decl("_HID", aml_string("AMZNC10C")));
+ aml_append(dev, aml_name_decl("_CID", aml_string("VMCLOCK")));
+ aml_append(dev, aml_name_decl("_DDN", aml_string("VMCLOCK")));
+
+ /* Simple status method */
+ aml_append(dev, aml_name_decl("_STA", aml_int(0xf)));
+
+ crs = aml_resource_template();
+ aml_append(crs, aml_qword_memory(AML_POS_DECODE,
+ AML_MIN_FIXED, AML_MAX_FIXED,
+ AML_CACHEABLE, AML_READ_ONLY,
+ 0xffffffffffffffffULL,
+ vms->physaddr,
+ vms->physaddr + VMCLOCK_SIZE - 1,
+ 0, VMCLOCK_SIZE));
+ aml_append(dev, aml_name_decl("_CRS", crs));
+ aml_append(scope, dev);
+ aml_append(ssdt, scope);
+
+ g_array_append_vals(table_data, ssdt->buf->data, ssdt->buf->len);
+ acpi_table_end(linker, &table);
+ free_aml_allocator();
+}
+
+static void vmclock_update_guest(VmclockState *vms)
+{
+ uint64_t disruption_marker;
+ uint32_t seq_count;
+
+ if (!vms->clk) {
+ return;
+ }
+
+ seq_count = le32_to_cpu(vms->clk->seq_count) | 1;
+ vms->clk->seq_count = cpu_to_le32(seq_count);
+ /* These barriers pair with read barriers in the guest */
+ smp_wmb();
+
+ disruption_marker = le64_to_cpu(vms->clk->disruption_marker);
+ disruption_marker++;
+ vms->clk->disruption_marker = cpu_to_le64(disruption_marker);
+
+ /* These barriers pair with read barriers in the guest */
+ smp_wmb();
+ vms->clk->seq_count = cpu_to_le32(seq_count + 1);
+}
+
+/*
+ * After restoring an image, we need to update the guest memory to notify
+ * it of clock disruption.
+ */
+static int vmclock_post_load(void *opaque, int version_id)
+{
+ VmclockState *vms = opaque;
+
+ vmclock_update_guest(vms);
+ return 0;
+}
+
+static const VMStateDescription vmstate_vmclock = {
+ .name = "vmclock",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .post_load = vmclock_post_load,
+ .fields = (const VMStateField[]) {
+ VMSTATE_UINT64(physaddr, VmclockState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static void vmclock_handle_reset(void *opaque)
+{
+ VmclockState *vms = VMCLOCK(opaque);
+
+ if (!memory_region_is_mapped(&vms->clk_page)) {
+ memory_region_add_subregion_overlap(get_system_memory(),
+ vms->physaddr,
+ &vms->clk_page, 0);
+ }
+}
+
+static void vmclock_realize(DeviceState *dev, Error **errp)
+{
+ VmclockState *vms = VMCLOCK(dev);
+
+ /*
+ * Given that this function is executing, there is at least one VMCLOCK
+ * device. Check if there are several.
+ */
+ if (!find_vmclock_dev()) {
+ error_setg(errp, "at most one %s device is permitted", TYPE_VMCLOCK);
+ return;
+ }
+
+ vms->physaddr = VMCLOCK_ADDR;
+
+ e820_add_entry(vms->physaddr, VMCLOCK_SIZE, E820_RESERVED);
+
+ memory_region_init_ram(&vms->clk_page, OBJECT(dev), "vmclock_page",
+ VMCLOCK_SIZE, &error_abort);
+ memory_region_set_enabled(&vms->clk_page, true);
+ vms->clk = memory_region_get_ram_ptr(&vms->clk_page);
+ memset(vms->clk, 0, VMCLOCK_SIZE);
+
+ vms->clk->magic = cpu_to_le32(VMCLOCK_MAGIC);
+ vms->clk->size = cpu_to_le16(VMCLOCK_SIZE);
+ vms->clk->version = cpu_to_le16(1);
+
+ /* These are all zero and thus default, but be explicit */
+ vms->clk->clock_status = VMCLOCK_STATUS_UNKNOWN;
+ vms->clk->counter_id = VMCLOCK_COUNTER_INVALID;
+
+ qemu_register_reset(vmclock_handle_reset, vms);
+
+ vmclock_update_guest(vms);
+}
+
+static void vmclock_device_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->vmsd = &vmstate_vmclock;
+ dc->realize = vmclock_realize;
+ dc->hotpluggable = false;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+}
+
+static const TypeInfo vmclock_device_info = {
+ .name = TYPE_VMCLOCK,
+ .parent = TYPE_DEVICE,
+ .instance_size = sizeof(VmclockState),
+ .class_init = vmclock_device_class_init,
+};
+
+static void vmclock_register_types(void)
+{
+ type_register_static(&vmclock_device_info);
+}
+
+type_init(vmclock_register_types)
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index c9b1305..3ac8f8e 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -946,10 +946,9 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
build_dbg2(tables_blob, tables->linker, vms);
if (vms->ras) {
- build_ghes_error_table(tables->hardware_errors, tables->linker);
acpi_add_table(table_offsets, tables_blob);
- acpi_build_hest(tables_blob, tables->linker, vms->oem_id,
- vms->oem_table_id);
+ acpi_build_hest(tables_blob, tables->hardware_errors, tables->linker,
+ vms->oem_id, vms->oem_table_id);
}
if (ms->numa_state->num_nodes > 0) {
diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 12d5c37..2aed624 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -631,6 +631,14 @@ vhost_user_gpu_device_realize(DeviceState *qdev, Error **errp)
error_report("EDID requested but the backend doesn't support it.");
g->parent_obj.conf.flags &= ~(1 << VIRTIO_GPU_FLAG_EDID_ENABLED);
}
+ if (virtio_has_feature(g->vhost->dev.features,
+ VIRTIO_GPU_F_RESOURCE_UUID)) {
+ g->parent_obj.conf.flags |= 1 << VIRTIO_GPU_FLAG_RESOURCE_UUID_ENABLED;
+ }
+ if (virtio_has_feature(g->vhost->dev.features,
+ VIRTIO_GPU_F_RESOURCE_UUID)) {
+ g->parent_obj.conf.flags |= 1 << VIRTIO_GPU_FLAG_RESOURCE_UUID_ENABLED;
+ }
if (!virtio_gpu_base_device_realize(qdev, NULL, NULL, errp)) {
return;
diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c
index 4fc7ef8..7827536 100644
--- a/hw/display/virtio-gpu-base.c
+++ b/hw/display/virtio-gpu-base.c
@@ -235,6 +235,9 @@ virtio_gpu_base_get_features(VirtIODevice *vdev, uint64_t features,
if (virtio_gpu_context_init_enabled(g->conf)) {
features |= (1 << VIRTIO_GPU_F_CONTEXT_INIT);
}
+ if (virtio_gpu_resource_uuid_enabled(g->conf)) {
+ features |= (1 << VIRTIO_GPU_F_RESOURCE_UUID);
+ }
return features;
}
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 3281848..d34ce07 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -43,6 +43,7 @@ config PC
select SERIAL_ISA
select ACPI_PCI
select ACPI_VMGENID
+ select ACPI_VMCLOCK
select VIRTIO_PMEM_SUPPORTED
select VIRTIO_MEM_SUPPORTED
select HV_BALLOON_SUPPORTED
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 733b8f0..53b7306 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -43,6 +43,7 @@
#include "system/tpm.h"
#include "hw/acpi/tpm.h"
#include "hw/acpi/vmgenid.h"
+#include "hw/acpi/vmclock.h"
#include "hw/acpi/erst.h"
#include "hw/acpi/piix4.h"
#include "system/tpm_backend.h"
@@ -654,6 +655,7 @@ static Aml *aml_pci_pdsm(void)
Aml *acpi_index = aml_local(2);
Aml *zero = aml_int(0);
Aml *one = aml_int(1);
+ Aml *not_supp = aml_int(0xFFFFFFFF);
Aml *func = aml_arg(2);
Aml *params = aml_arg(4);
Aml *bnum = aml_derefof(aml_index(params, aml_int(0)));
@@ -678,7 +680,7 @@ static Aml *aml_pci_pdsm(void)
*/
ifctx1 = aml_if(aml_lnot(
aml_or(aml_equal(acpi_index, zero),
- aml_equal(acpi_index, aml_int(0xFFFFFFFF)), NULL)
+ aml_equal(acpi_index, not_supp), NULL)
));
{
/* have supported functions */
@@ -704,18 +706,30 @@ static Aml *aml_pci_pdsm(void)
{
Aml *pkg = aml_package(2);
- aml_append(pkg, zero);
- /*
- * optional, if not impl. should return null string
- */
- aml_append(pkg, aml_string("%s", ""));
- aml_append(ifctx, aml_store(pkg, ret));
-
aml_append(ifctx, aml_store(aml_call2("AIDX", bnum, sunum), acpi_index));
+ aml_append(ifctx, aml_store(pkg, ret));
/*
- * update acpi-index to actual value
+ * Windows calls func=7 without checking if it's available,
+ * as workaround Microsoft has suggested to return invalid for func7
+ * Package, so return 2 elements package but only initialize elements
+ * when acpi_index is supported and leave them uninitialized, which
+ * leads elements to being Uninitialized ObjectType and should trip
+ * Windows into discarding result as an unexpected and prevent setting
+ * bogus 'PCI Label' on the device.
*/
- aml_append(ifctx, aml_store(acpi_index, aml_index(ret, zero)));
+ ifctx1 = aml_if(aml_lnot(aml_lor(
+ aml_equal(acpi_index, zero), aml_equal(acpi_index, not_supp)
+ )));
+ {
+ aml_append(ifctx1, aml_store(acpi_index, aml_index(ret, zero)));
+ /*
+ * optional, if not impl. should return null string
+ */
+ aml_append(ifctx1, aml_store(aml_string("%s", ""),
+ aml_index(ret, one)));
+ }
+ aml_append(ifctx, ifctx1);
+
aml_append(ifctx, aml_return(ret));
}
@@ -2432,7 +2446,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
uint8_t *u;
GArray *tables_blob = tables->table_data;
AcpiSlicOem slic_oem = { .id = NULL, .table_id = NULL };
- Object *vmgenid_dev;
+ Object *vmgenid_dev, *vmclock_dev;
char *oem_id;
char *oem_table_id;
@@ -2505,6 +2519,13 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
tables->vmgenid, tables->linker, x86ms->oem_id);
}
+ vmclock_dev = find_vmclock_dev();
+ if (vmclock_dev) {
+ acpi_add_table(table_offsets, tables_blob);
+ vmclock_build_acpi(VMCLOCK(vmclock_dev), tables_blob, tables->linker,
+ x86ms->oem_id);
+ }
+
if (misc.has_hpet) {
acpi_add_table(table_offsets, tables_blob);
build_hpet(tables_blob, tables->linker, x86ms->oem_id,
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a8c275f..f366c22 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -48,7 +48,10 @@
/* pe operations */
#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
-#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
+#define VTD_PE_GET_FL_LEVEL(pe) \
+ (4 + (((pe)->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM))
+#define VTD_PE_GET_SL_LEVEL(pe) \
+ (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
/*
* PCI bus number (or SID) is not reliable since the device is usaully
@@ -67,6 +70,11 @@ struct vtd_hiod_key {
uint8_t devfn;
};
+struct vtd_as_raw_key {
+ uint16_t sid;
+ uint32_t pasid;
+};
+
struct vtd_iotlb_key {
uint64_t gfn;
uint32_t pasid;
@@ -284,15 +292,15 @@ static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
}
/* The shift of an addr for a certain level of paging structure */
-static inline uint32_t vtd_slpt_level_shift(uint32_t level)
+static inline uint32_t vtd_pt_level_shift(uint32_t level)
{
assert(level != 0);
- return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
+ return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_LEVEL_BITS;
}
-static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
+static inline uint64_t vtd_pt_level_page_mask(uint32_t level)
{
- return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
+ return ~((1ULL << vtd_pt_level_shift(level)) - 1);
}
static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
@@ -302,9 +310,43 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
- return (entry->domain_id == info->domain_id) &&
- (((entry->gfn & info->mask) == gfn) ||
- (entry->gfn == gfn_tlb));
+
+ if (entry->domain_id != info->domain_id) {
+ return false;
+ }
+
+ /*
+ * According to spec, IOTLB entries caching first-stage (PGTT=001b) or
+ * nested (PGTT=011b) mapping associated with specified domain-id are
+ * invalidated. Nested isn't supported yet, so only need to check 001b.
+ */
+ if (entry->pgtt == VTD_SM_PASID_ENTRY_FLT) {
+ return true;
+ }
+
+ return (entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb;
+}
+
+static gboolean vtd_hash_remove_by_page_piotlb(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+ VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
+ uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
+ uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
+
+ /*
+ * According to spec, PASID-based-IOTLB Invalidation in page granularity
+ * doesn't invalidate IOTLB entries caching second-stage (PGTT=010b)
+ * or pass-through (PGTT=100b) mappings. Nested isn't supported yet,
+ * so only need to check first-stage (PGTT=001b) mappings.
+ */
+ if (entry->pgtt != VTD_SM_PASID_ENTRY_FLT) {
+ return false;
+ }
+
+ return entry->domain_id == info->domain_id && entry->pasid == info->pasid &&
+ ((entry->gfn & info->mask) == gfn || entry->gfn == gfn_tlb);
}
/* Reset all the gen of VTDAddressSpace to zero and set the gen of
@@ -349,7 +391,7 @@ static void vtd_reset_caches(IntelIOMMUState *s)
static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
{
- return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
+ return (addr & vtd_pt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
}
/* Must be called with IOMMU lock held */
@@ -360,7 +402,7 @@ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
VTDIOTLBEntry *entry;
unsigned level;
- for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
+ for (level = VTD_PT_LEVEL; level < VTD_PML4_LEVEL; level++) {
key.gfn = vtd_get_iotlb_gfn(addr, level);
key.level = level;
key.sid = source_id;
@@ -377,15 +419,15 @@ out:
/* Must be with IOMMU lock held */
static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
- uint16_t domain_id, hwaddr addr, uint64_t slpte,
+ uint16_t domain_id, hwaddr addr, uint64_t pte,
uint8_t access_flags, uint32_t level,
- uint32_t pasid)
+ uint32_t pasid, uint8_t pgtt)
{
VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
struct vtd_iotlb_key *key = g_malloc(sizeof(*key));
uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
- trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
+ trace_vtd_iotlb_page_update(source_id, addr, pte, domain_id);
if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
trace_vtd_iotlb_reset("iotlb exceeds size limit");
vtd_reset_iotlb_locked(s);
@@ -393,10 +435,11 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
entry->gfn = gfn;
entry->domain_id = domain_id;
- entry->slpte = slpte;
+ entry->pte = pte;
entry->access_flags = access_flags;
- entry->mask = vtd_slpt_level_page_mask(level);
+ entry->mask = vtd_pt_level_page_mask(level);
entry->pasid = pasid;
+ entry->pgtt = pgtt;
key->gfn = gfn;
key->sid = source_id;
@@ -710,32 +753,32 @@ static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
}
-static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
+static inline uint64_t vtd_get_pte_addr(uint64_t pte, uint8_t aw)
{
- return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
+ return pte & VTD_PT_BASE_ADDR_MASK(aw);
}
/* Whether the pte indicates the address of the page frame */
-static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
+static inline bool vtd_is_last_pte(uint64_t pte, uint32_t level)
{
- return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
+ return level == VTD_PT_LEVEL || (pte & VTD_PT_PAGE_SIZE_MASK);
}
-/* Get the content of a spte located in @base_addr[@index] */
-static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
+/* Get the content of a pte located in @base_addr[@index] */
+static uint64_t vtd_get_pte(dma_addr_t base_addr, uint32_t index)
{
- uint64_t slpte;
+ uint64_t pte;
- assert(index < VTD_SL_PT_ENTRY_NR);
+ assert(index < VTD_PT_ENTRY_NR);
if (dma_memory_read(&address_space_memory,
- base_addr + index * sizeof(slpte),
- &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) {
- slpte = (uint64_t)-1;
- return slpte;
+ base_addr + index * sizeof(pte),
+ &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) {
+ pte = (uint64_t)-1;
+ return pte;
}
- slpte = le64_to_cpu(slpte);
- return slpte;
+ pte = le64_to_cpu(pte);
+ return pte;
}
/* Given an iova and the level of paging structure, return the offset
@@ -743,36 +786,39 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
*/
static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
{
- return (iova >> vtd_slpt_level_shift(level)) &
- ((1ULL << VTD_SL_LEVEL_BITS) - 1);
+ return (iova >> vtd_pt_level_shift(level)) &
+ ((1ULL << VTD_LEVEL_BITS) - 1);
}
/* Check Capability Register to see if the @level of page-table is supported */
-static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
+static inline bool vtd_is_sl_level_supported(IntelIOMMUState *s, uint32_t level)
{
return VTD_CAP_SAGAW_MASK & s->cap &
(1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
}
+static inline bool vtd_is_fl_level_supported(IntelIOMMUState *s, uint32_t level)
+{
+ return level == VTD_PML4_LEVEL;
+}
+
/* Return true if check passed, otherwise false */
-static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
- VTDPASIDEntry *pe)
+static inline bool vtd_pe_type_check(IntelIOMMUState *s, VTDPASIDEntry *pe)
{
switch (VTD_PE_GET_TYPE(pe)) {
case VTD_SM_PASID_ENTRY_FLT:
+ return !!(s->ecap & VTD_ECAP_FLTS);
case VTD_SM_PASID_ENTRY_SLT:
+ return !!(s->ecap & VTD_ECAP_SLTS);
case VTD_SM_PASID_ENTRY_NESTED:
- break;
+ /* Not support NESTED page table type yet */
+ return false;
case VTD_SM_PASID_ENTRY_PT:
- if (!x86_iommu->pt_supported) {
- return false;
- }
- break;
+ return !!(s->ecap & VTD_ECAP_PT);
default:
/* Unknown type */
return false;
}
- return true;
}
static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
@@ -796,7 +842,7 @@ static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base,
addr = pasid_dir_base + index * entry_size;
if (dma_memory_read(&address_space_memory, addr,
pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) {
- return -VTD_FR_PASID_TABLE_INV;
+ return -VTD_FR_PASID_DIR_ACCESS_ERR;
}
pdire->val = le64_to_cpu(pdire->val);
@@ -814,28 +860,35 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s,
dma_addr_t addr,
VTDPASIDEntry *pe)
{
+ uint8_t pgtt;
uint32_t index;
dma_addr_t entry_size;
- X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
index = VTD_PASID_TABLE_INDEX(pasid);
entry_size = VTD_PASID_ENTRY_SIZE;
addr = addr + index * entry_size;
if (dma_memory_read(&address_space_memory, addr,
pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
- return -VTD_FR_PASID_TABLE_INV;
+ return -VTD_FR_PASID_TABLE_ACCESS_ERR;
}
for (size_t i = 0; i < ARRAY_SIZE(pe->val); i++) {
pe->val[i] = le64_to_cpu(pe->val[i]);
}
/* Do translation type check */
- if (!vtd_pe_type_check(x86_iommu, pe)) {
- return -VTD_FR_PASID_TABLE_INV;
+ if (!vtd_pe_type_check(s, pe)) {
+ return -VTD_FR_PASID_TABLE_ENTRY_INV;
}
- if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
- return -VTD_FR_PASID_TABLE_INV;
+ pgtt = VTD_PE_GET_TYPE(pe);
+ if (pgtt == VTD_SM_PASID_ENTRY_SLT &&
+ !vtd_is_sl_level_supported(s, VTD_PE_GET_SL_LEVEL(pe))) {
+ return -VTD_FR_PASID_TABLE_ENTRY_INV;
+ }
+
+ if (pgtt == VTD_SM_PASID_ENTRY_FLT &&
+ !vtd_is_fl_level_supported(s, VTD_PE_GET_FL_LEVEL(pe))) {
+ return -VTD_FR_PASID_TABLE_ENTRY_INV;
}
return 0;
@@ -876,7 +929,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
}
if (!vtd_pdire_present(&pdire)) {
- return -VTD_FR_PASID_TABLE_INV;
+ return -VTD_FR_PASID_DIR_ENTRY_P;
}
ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe);
@@ -885,7 +938,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s,
}
if (!vtd_pe_present(pe)) {
- return -VTD_FR_PASID_TABLE_INV;
+ return -VTD_FR_PASID_ENTRY_P;
}
return 0;
@@ -938,7 +991,7 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
}
if (!vtd_pdire_present(&pdire)) {
- return -VTD_FR_PASID_TABLE_INV;
+ return -VTD_FR_PASID_DIR_ENTRY_P;
}
/*
@@ -973,7 +1026,11 @@ static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
if (s->root_scalable) {
vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
- return VTD_PE_GET_LEVEL(&pe);
+ if (s->flts) {
+ return VTD_PE_GET_FL_LEVEL(&pe);
+ } else {
+ return VTD_PE_GET_SL_LEVEL(&pe);
+ }
}
return vtd_ce_get_level(ce);
@@ -1041,9 +1098,9 @@ static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
}
/* Return true if IOVA passes range check, otherwise false. */
-static inline bool vtd_iova_range_check(IntelIOMMUState *s,
- uint64_t iova, VTDContextEntry *ce,
- uint8_t aw, uint32_t pasid)
+static inline bool vtd_iova_sl_range_check(IntelIOMMUState *s,
+ uint64_t iova, VTDContextEntry *ce,
+ uint8_t aw, uint32_t pasid)
{
/*
* Check if @iova is above 2^X-1, where X is the minimum of MGAW
@@ -1060,7 +1117,11 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
if (s->root_scalable) {
vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid);
- return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+ if (s->flts) {
+ return pe.val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+ } else {
+ return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
+ }
}
return vtd_ce_get_slpt_base(ce);
@@ -1084,17 +1145,17 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
/*
* We should have caught a guest-mis-programmed level earlier,
- * via vtd_is_level_supported.
+ * via vtd_is_sl_level_supported.
*/
assert(level < VTD_SPTE_RSVD_LEN);
/*
- * Zero level doesn't exist. The smallest level is VTD_SL_PT_LEVEL=1 and
- * checked by vtd_is_last_slpte().
+ * Zero level doesn't exist. The smallest level is VTD_PT_LEVEL=1 and
+ * checked by vtd_is_last_pte().
*/
assert(level);
- if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) &&
- (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) {
+ if ((level == VTD_PD_LEVEL || level == VTD_PDP_LEVEL) &&
+ (slpte & VTD_PT_PAGE_SIZE_MASK)) {
/* large page */
rsvd_mask = vtd_spte_rsvd_large[level];
} else {
@@ -1118,9 +1179,8 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
uint32_t offset;
uint64_t slpte;
uint64_t access_right_check;
- uint64_t xlat, size;
- if (!vtd_iova_range_check(s, iova, ce, aw_bits, pasid)) {
+ if (!vtd_iova_sl_range_check(s, iova, ce, aw_bits, pasid)) {
error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ","
"pasid=0x%" PRIx32 ")", __func__, iova, pasid);
return -VTD_FR_ADDR_BEYOND_MGAW;
@@ -1131,7 +1191,7 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
while (true) {
offset = vtd_iova_level_offset(iova, level);
- slpte = vtd_get_slpte(addr, offset);
+ slpte = vtd_get_pte(addr, offset);
if (slpte == (uint64_t)-1) {
error_report_once("%s: detected read error on DMAR slpte "
@@ -1162,37 +1222,16 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
return -VTD_FR_PAGING_ENTRY_RSVD;
}
- if (vtd_is_last_slpte(slpte, level)) {
+ if (vtd_is_last_pte(slpte, level)) {
*slptep = slpte;
*slpte_level = level;
break;
}
- addr = vtd_get_slpte_addr(slpte, aw_bits);
+ addr = vtd_get_pte_addr(slpte, aw_bits);
level--;
}
- xlat = vtd_get_slpte_addr(*slptep, aw_bits);
- size = ~vtd_slpt_level_page_mask(level) + 1;
-
- /*
- * From VT-d spec 3.14: Untranslated requests and translation
- * requests that result in an address in the interrupt range will be
- * blocked with condition code LGN.4 or SGN.8.
- */
- if ((xlat > VTD_INTERRUPT_ADDR_LAST ||
- xlat + size - 1 < VTD_INTERRUPT_ADDR_FIRST)) {
- return 0;
- } else {
- error_report_once("%s: xlat address is in interrupt range "
- "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
- "slpte=0x%" PRIx64 ", write=%d, "
- "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ", "
- "pasid=0x%" PRIx32 ")",
- __func__, iova, level, slpte, is_write,
- xlat, size, pasid);
- return s->scalable_mode ? -VTD_FR_SM_INTERRUPT_ADDR :
- -VTD_FR_INTERRUPT_ADDR;
- }
+ return 0;
}
typedef int (*vtd_page_walk_hook)(const IOMMUTLBEvent *event, void *private);
@@ -1323,14 +1362,14 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
trace_vtd_page_walk_level(addr, level, start, end);
- subpage_size = 1ULL << vtd_slpt_level_shift(level);
- subpage_mask = vtd_slpt_level_page_mask(level);
+ subpage_size = 1ULL << vtd_pt_level_shift(level);
+ subpage_mask = vtd_pt_level_page_mask(level);
while (iova < end) {
iova_next = (iova & subpage_mask) + subpage_size;
offset = vtd_iova_level_offset(iova, level);
- slpte = vtd_get_slpte(addr, offset);
+ slpte = vtd_get_pte(addr, offset);
if (slpte == (uint64_t)-1) {
trace_vtd_page_walk_skip_read(iova, iova_next);
@@ -1353,12 +1392,12 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
*/
entry_valid = read_cur | write_cur;
- if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
+ if (!vtd_is_last_pte(slpte, level) && entry_valid) {
/*
* This is a valid PDE (or even bigger than PDE). We need
* to walk one further level.
*/
- ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
+ ret = vtd_page_walk_level(vtd_get_pte_addr(slpte, info->aw),
iova, MIN(iova_next, end), level - 1,
read_cur, write_cur, info);
} else {
@@ -1375,7 +1414,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
event.entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
event.entry.addr_mask = ~subpage_mask;
/* NOTE: this is only meaningful if entry_valid == true */
- event.entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
+ event.entry.translated_addr = vtd_get_pte_addr(slpte, info->aw);
event.type = event.entry.perm ? IOMMU_NOTIFIER_MAP :
IOMMU_NOTIFIER_UNMAP;
ret = vtd_page_walk_one(&event, info);
@@ -1409,11 +1448,11 @@ static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
uint32_t level = vtd_get_iova_level(s, ce, pasid);
- if (!vtd_iova_range_check(s, start, ce, info->aw, pasid)) {
+ if (!vtd_iova_sl_range_check(s, start, ce, info->aw, pasid)) {
return -VTD_FR_ADDR_BEYOND_MGAW;
}
- if (!vtd_iova_range_check(s, end, ce, info->aw, pasid)) {
+ if (!vtd_iova_sl_range_check(s, end, ce, info->aw, pasid)) {
/* Fix end so that it reaches the maximum */
end = vtd_iova_limit(s, ce, info->aw, pasid);
}
@@ -1528,7 +1567,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
/* Check if the programming of context-entry is valid */
if (!s->root_scalable &&
- !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
+ !vtd_is_sl_level_supported(s, vtd_ce_get_level(ce))) {
error_report_once("%s: invalid context entry: hi=%"PRIx64
", lo=%"PRIx64" (level %d not supported)",
__func__, ce->hi, ce->lo,
@@ -1795,8 +1834,20 @@ static const bool vtd_qualified_faults[] = {
[VTD_FR_ROOT_ENTRY_RSVD] = false,
[VTD_FR_PAGING_ENTRY_RSVD] = true,
[VTD_FR_CONTEXT_ENTRY_TT] = true,
- [VTD_FR_PASID_TABLE_INV] = false,
+ [VTD_FR_PASID_DIR_ACCESS_ERR] = false,
+ [VTD_FR_PASID_DIR_ENTRY_P] = true,
+ [VTD_FR_PASID_TABLE_ACCESS_ERR] = false,
+ [VTD_FR_PASID_ENTRY_P] = true,
+ [VTD_FR_PASID_TABLE_ENTRY_INV] = true,
+ [VTD_FR_FS_PAGING_ENTRY_INV] = true,
+ [VTD_FR_FS_PAGING_ENTRY_P] = true,
+ [VTD_FR_FS_PAGING_ENTRY_RSVD] = true,
+ [VTD_FR_PASID_ENTRY_FSPTPTR_INV] = true,
+ [VTD_FR_FS_NON_CANONICAL] = true,
+ [VTD_FR_FS_PAGING_ENTRY_US] = true,
+ [VTD_FR_SM_WRITE] = true,
[VTD_FR_SM_INTERRUPT_ADDR] = true,
+ [VTD_FR_FS_BIT_UPDATE_FAILED] = true,
[VTD_FR_MAX] = false,
};
@@ -1814,29 +1865,32 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
}
-static gboolean vtd_find_as_by_sid(gpointer key, gpointer value,
- gpointer user_data)
+static gboolean vtd_find_as_by_sid_and_pasid(gpointer key, gpointer value,
+ gpointer user_data)
{
struct vtd_as_key *as_key = (struct vtd_as_key *)key;
- uint16_t target_sid = *(uint16_t *)user_data;
+ struct vtd_as_raw_key *target = (struct vtd_as_raw_key *)user_data;
uint16_t sid = PCI_BUILD_BDF(pci_bus_num(as_key->bus), as_key->devfn);
- return sid == target_sid;
+
+ return (as_key->pasid == target->pasid) && (sid == target->sid);
}
-static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid)
+static VTDAddressSpace *vtd_get_as_by_sid_and_pasid(IntelIOMMUState *s,
+ uint16_t sid,
+ uint32_t pasid)
{
- uint8_t bus_num = PCI_BUS_NUM(sid);
- VTDAddressSpace *vtd_as = s->vtd_as_cache[bus_num];
-
- if (vtd_as &&
- (sid == PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn))) {
- return vtd_as;
- }
+ struct vtd_as_raw_key key = {
+ .sid = sid,
+ .pasid = pasid
+ };
- vtd_as = g_hash_table_find(s->vtd_address_spaces, vtd_find_as_by_sid, &sid);
- s->vtd_as_cache[bus_num] = vtd_as;
+ return g_hash_table_find(s->vtd_address_spaces,
+ vtd_find_as_by_sid_and_pasid, &key);
+}
- return vtd_as;
+static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid)
+{
+ return vtd_get_as_by_sid_and_pasid(s, sid, PCI_NO_PASID);
}
static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
@@ -1858,6 +1912,157 @@ out:
trace_vtd_pt_enable_fast_path(source_id, success);
}
+/*
+ * Rsvd field masks for fpte:
+ * vtd_fpte_rsvd 4k pages
+ * vtd_fpte_rsvd_large large pages
+ *
+ * We support only 4-level page tables.
+ */
+#define VTD_FPTE_RSVD_LEN 5
+static uint64_t vtd_fpte_rsvd[VTD_FPTE_RSVD_LEN];
+static uint64_t vtd_fpte_rsvd_large[VTD_FPTE_RSVD_LEN];
+
+static bool vtd_flpte_nonzero_rsvd(uint64_t flpte, uint32_t level)
+{
+ uint64_t rsvd_mask;
+
+ /*
+ * We should have caught a guest-mis-programmed level earlier,
+ * via vtd_is_fl_level_supported.
+ */
+ assert(level < VTD_FPTE_RSVD_LEN);
+ /*
+ * Zero level doesn't exist. The smallest level is VTD_PT_LEVEL=1 and
+ * checked by vtd_is_last_pte().
+ */
+ assert(level);
+
+ if ((level == VTD_PD_LEVEL || level == VTD_PDP_LEVEL) &&
+ (flpte & VTD_PT_PAGE_SIZE_MASK)) {
+ /* large page */
+ rsvd_mask = vtd_fpte_rsvd_large[level];
+ } else {
+ rsvd_mask = vtd_fpte_rsvd[level];
+ }
+
+ return flpte & rsvd_mask;
+}
+
+static inline bool vtd_flpte_present(uint64_t flpte)
+{
+ return !!(flpte & VTD_FL_P);
+}
+
+/* Return true if IOVA is canonical, otherwise false. */
+static bool vtd_iova_fl_check_canonical(IntelIOMMUState *s, uint64_t iova,
+ VTDContextEntry *ce, uint32_t pasid)
+{
+ uint64_t iova_limit = vtd_iova_limit(s, ce, s->aw_bits, pasid);
+ uint64_t upper_bits_mask = ~(iova_limit - 1);
+ uint64_t upper_bits = iova & upper_bits_mask;
+ bool msb = ((iova & (iova_limit >> 1)) != 0);
+
+ if (msb) {
+ return upper_bits == upper_bits_mask;
+ } else {
+ return !upper_bits;
+ }
+}
+
+static MemTxResult vtd_set_flag_in_pte(dma_addr_t base_addr, uint32_t index,
+ uint64_t pte, uint64_t flag)
+{
+ if (pte & flag) {
+ return MEMTX_OK;
+ }
+ pte |= flag;
+ pte = cpu_to_le64(pte);
+ return dma_memory_write(&address_space_memory,
+ base_addr + index * sizeof(pte),
+ &pte, sizeof(pte),
+ MEMTXATTRS_UNSPECIFIED);
+}
+
+/*
+ * Given the @iova, get relevant @flptep. @flpte_level will be the last level
+ * of the translation, can be used for deciding the size of large page.
+ */
+static int vtd_iova_to_flpte(IntelIOMMUState *s, VTDContextEntry *ce,
+ uint64_t iova, bool is_write,
+ uint64_t *flptep, uint32_t *flpte_level,
+ bool *reads, bool *writes, uint8_t aw_bits,
+ uint32_t pasid)
+{
+ dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid);
+ uint32_t level = vtd_get_iova_level(s, ce, pasid);
+ uint32_t offset;
+ uint64_t flpte, flag_ad = VTD_FL_A;
+
+ if (!vtd_iova_fl_check_canonical(s, iova, ce, pasid)) {
+ error_report_once("%s: detected non canonical IOVA (iova=0x%" PRIx64 ","
+ "pasid=0x%" PRIx32 ")", __func__, iova, pasid);
+ return -VTD_FR_FS_NON_CANONICAL;
+ }
+
+ while (true) {
+ offset = vtd_iova_level_offset(iova, level);
+ flpte = vtd_get_pte(addr, offset);
+
+ if (flpte == (uint64_t)-1) {
+ if (level == vtd_get_iova_level(s, ce, pasid)) {
+ /* Invalid programming of pasid-entry */
+ return -VTD_FR_PASID_ENTRY_FSPTPTR_INV;
+ } else {
+ return -VTD_FR_FS_PAGING_ENTRY_INV;
+ }
+ }
+
+ if (!vtd_flpte_present(flpte)) {
+ *reads = false;
+ *writes = false;
+ return -VTD_FR_FS_PAGING_ENTRY_P;
+ }
+
+ /* No emulated device supports supervisor privilege request yet */
+ if (!(flpte & VTD_FL_US)) {
+ *reads = false;
+ *writes = false;
+ return -VTD_FR_FS_PAGING_ENTRY_US;
+ }
+
+ *reads = true;
+ *writes = (*writes) && (flpte & VTD_FL_RW);
+ if (is_write && !(flpte & VTD_FL_RW)) {
+ return -VTD_FR_SM_WRITE;
+ }
+ if (vtd_flpte_nonzero_rsvd(flpte, level)) {
+ error_report_once("%s: detected flpte reserved non-zero "
+ "iova=0x%" PRIx64 ", level=0x%" PRIx32
+ "flpte=0x%" PRIx64 ", pasid=0x%" PRIX32 ")",
+ __func__, iova, level, flpte, pasid);
+ return -VTD_FR_FS_PAGING_ENTRY_RSVD;
+ }
+
+ if (vtd_is_last_pte(flpte, level) && is_write) {
+ flag_ad |= VTD_FL_D;
+ }
+
+ if (vtd_set_flag_in_pte(addr, offset, flpte, flag_ad) != MEMTX_OK) {
+ return -VTD_FR_FS_BIT_UPDATE_FAILED;
+ }
+
+ if (vtd_is_last_pte(flpte, level)) {
+ *flptep = flpte;
+ *flpte_level = level;
+ return 0;
+ }
+
+ addr = vtd_get_pte_addr(flpte, aw_bits);
+ level--;
+ }
+}
+
static void vtd_report_fault(IntelIOMMUState *s,
int err, bool is_fpd_set,
uint16_t source_id,
@@ -1894,16 +2099,17 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
VTDContextEntry ce;
uint8_t bus_num = pci_bus_num(bus);
VTDContextCacheEntry *cc_entry;
- uint64_t slpte, page_mask;
+ uint64_t pte, page_mask;
uint32_t level, pasid = vtd_as->pasid;
uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn);
int ret_fr;
bool is_fpd_set = false;
bool reads = true;
bool writes = true;
- uint8_t access_flags;
+ uint8_t access_flags, pgtt;
bool rid2pasid = (pasid == PCI_NO_PASID) && s->root_scalable;
VTDIOTLBEntry *iotlb_entry;
+ uint64_t xlat, size;
/*
* We have standalone memory region for interrupt addresses, we
@@ -1915,13 +2121,13 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
cc_entry = &vtd_as->context_cache_entry;
- /* Try to fetch slpte form IOTLB, we don't need RID2PASID logic */
+ /* Try to fetch pte from IOTLB, we don't need RID2PASID logic */
if (!rid2pasid) {
iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr);
if (iotlb_entry) {
- trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
+ trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->pte,
iotlb_entry->domain_id);
- slpte = iotlb_entry->slpte;
+ pte = iotlb_entry->pte;
access_flags = iotlb_entry->access_flags;
page_mask = iotlb_entry->mask;
goto out;
@@ -1993,35 +2199,65 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
return true;
}
- /* Try to fetch slpte form IOTLB for RID2PASID slow path */
+ /* Try to fetch pte from IOTLB for RID2PASID slow path */
if (rid2pasid) {
iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr);
if (iotlb_entry) {
- trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
+ trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->pte,
iotlb_entry->domain_id);
- slpte = iotlb_entry->slpte;
+ pte = iotlb_entry->pte;
access_flags = iotlb_entry->access_flags;
page_mask = iotlb_entry->mask;
goto out;
}
}
- ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
- &reads, &writes, s->aw_bits, pasid);
+ if (s->flts && s->root_scalable) {
+ ret_fr = vtd_iova_to_flpte(s, &ce, addr, is_write, &pte, &level,
+ &reads, &writes, s->aw_bits, pasid);
+ pgtt = VTD_SM_PASID_ENTRY_FLT;
+ } else {
+ ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &pte, &level,
+ &reads, &writes, s->aw_bits, pasid);
+ pgtt = VTD_SM_PASID_ENTRY_SLT;
+ }
+ if (!ret_fr) {
+ xlat = vtd_get_pte_addr(pte, s->aw_bits);
+ size = ~vtd_pt_level_page_mask(level) + 1;
+
+ /*
+ * Per VT-d spec 4.1 section 3.15: Untranslated requests and translation
+ * requests that result in an address in the interrupt range will be
+ * blocked with condition code LGN.4 or SGN.8.
+ */
+ if ((xlat <= VTD_INTERRUPT_ADDR_LAST &&
+ xlat + size - 1 >= VTD_INTERRUPT_ADDR_FIRST)) {
+ error_report_once("%s: xlat address is in interrupt range "
+ "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
+ "pte=0x%" PRIx64 ", write=%d, "
+ "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ", "
+ "pasid=0x%" PRIx32 ")",
+ __func__, addr, level, pte, is_write,
+ xlat, size, pasid);
+ ret_fr = s->scalable_mode ? -VTD_FR_SM_INTERRUPT_ADDR :
+ -VTD_FR_INTERRUPT_ADDR;
+ }
+ }
+
if (ret_fr) {
vtd_report_fault(s, -ret_fr, is_fpd_set, source_id,
addr, is_write, pasid != PCI_NO_PASID, pasid);
goto error;
}
- page_mask = vtd_slpt_level_page_mask(level);
+ page_mask = vtd_pt_level_page_mask(level);
access_flags = IOMMU_ACCESS_FLAG(reads, writes);
vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce, pasid),
- addr, slpte, access_flags, level, pasid);
+ addr, pte, access_flags, level, pasid, pgtt);
out:
vtd_iommu_unlock(s);
entry->iova = addr & page_mask;
- entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
+ entry->translated_addr = vtd_get_pte_addr(pte, s->aw_bits) & page_mask;
entry->addr_mask = ~page_mask;
entry->perm = access_flags;
return true;
@@ -2215,8 +2451,13 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
}
}
+/*
+ * There is no pasid field in iotlb invalidation descriptor, so PCI_NO_PASID
+ * is passed as parameter. Piotlb invalidation supports pasid, pasid in its
+ * descriptor is passed which should not be PCI_NO_PASID.
+ */
static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
- uint16_t domain_id, hwaddr addr,
+ uint16_t domain_id, hwaddr addr,
uint8_t am, uint32_t pasid)
{
VTDAddressSpace *vtd_as;
@@ -2225,19 +2466,37 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
hwaddr size = (1 << am) * VTD_PAGE_SIZE;
QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
- if (pasid != PCI_NO_PASID && pasid != vtd_as->pasid) {
- continue;
- }
ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
vtd_as->devfn, &ce);
if (!ret && domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) {
+ uint32_t rid2pasid = PCI_NO_PASID;
+
+ if (s->root_scalable) {
+ rid2pasid = VTD_CE_GET_RID2PASID(&ce);
+ }
+
+ /*
+ * In legacy mode, vtd_as->pasid == pasid is always true.
+ * In scalable mode, for vtd address space backing a PCI
+ * device without pasid, needs to compare pasid with
+ * rid2pasid of this device.
+ */
+ if (!(vtd_as->pasid == pasid ||
+ (vtd_as->pasid == PCI_NO_PASID && pasid == rid2pasid))) {
+ continue;
+ }
+
if (vtd_as_has_map_notifier(vtd_as)) {
/*
- * As long as we have MAP notifications registered in
- * any of our IOMMU notifiers, we need to sync the
- * shadow page table.
+ * When stage-1 translation is off, as long as we have MAP
+ * notifications registered in any of our IOMMU notifiers,
+ * we need to sync the shadow page table. Otherwise VFIO
+ * device attaches to nested page table instead of shadow
+ * page table, so no need to sync.
*/
- vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
+ if (!s->flts || !s->root_scalable) {
+ vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
+ }
} else {
/*
* For UNMAP-only notifiers, we don't need to walk the
@@ -2689,6 +2948,106 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
return true;
}
+static gboolean vtd_hash_remove_by_pasid(gpointer key, gpointer value,
+ gpointer user_data)
+{
+ VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
+ VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
+
+ return ((entry->domain_id == info->domain_id) &&
+ (entry->pasid == info->pasid));
+}
+
+static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
+ uint16_t domain_id, uint32_t pasid)
+{
+ VTDIOTLBPageInvInfo info;
+ VTDAddressSpace *vtd_as;
+ VTDContextEntry ce;
+
+ info.domain_id = domain_id;
+ info.pasid = pasid;
+
+ vtd_iommu_lock(s);
+ g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_pasid,
+ &info);
+ vtd_iommu_unlock(s);
+
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+ if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce) &&
+ domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) {
+ uint32_t rid2pasid = VTD_CE_GET_RID2PASID(&ce);
+
+ if ((vtd_as->pasid != PCI_NO_PASID || pasid != rid2pasid) &&
+ vtd_as->pasid != pasid) {
+ continue;
+ }
+
+ if (!s->flts || !vtd_as_has_map_notifier(vtd_as)) {
+ vtd_address_space_sync(vtd_as);
+ }
+ }
+ }
+}
+
+static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+ uint32_t pasid, hwaddr addr, uint8_t am)
+{
+ VTDIOTLBPageInvInfo info;
+
+ info.domain_id = domain_id;
+ info.pasid = pasid;
+ info.addr = addr;
+ info.mask = ~((1 << am) - 1);
+
+ vtd_iommu_lock(s);
+ g_hash_table_foreach_remove(s->iotlb,
+ vtd_hash_remove_by_page_piotlb, &info);
+ vtd_iommu_unlock(s);
+
+ vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am, pasid);
+}
+
+static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
+ VTDInvDesc *inv_desc)
+{
+ uint16_t domain_id;
+ uint32_t pasid;
+ hwaddr addr;
+ uint8_t am;
+ uint64_t mask[4] = {VTD_INV_DESC_PIOTLB_RSVD_VAL0,
+ VTD_INV_DESC_PIOTLB_RSVD_VAL1,
+ VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE};
+
+ if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, true,
+ __func__, "piotlb inv")) {
+ return false;
+ }
+
+ domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]);
+ pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]);
+ switch (inv_desc->val[0] & VTD_INV_DESC_PIOTLB_G) {
+ case VTD_INV_DESC_PIOTLB_ALL_IN_PASID:
+ vtd_piotlb_pasid_invalidate(s, domain_id, pasid);
+ break;
+
+ case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+ am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]);
+ addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]);
+ vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am);
+ break;
+
+ default:
+ error_report_once("%s: invalid piotlb inv desc: hi=0x%"PRIx64
+ ", lo=0x%"PRIx64" (type mismatch: 0x%llx)",
+ __func__, inv_desc->val[1], inv_desc->val[0],
+ inv_desc->val[0] & VTD_INV_DESC_IOTLB_G);
+ return false;
+ }
+ return true;
+}
+
static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
VTDInvDesc *inv_desc)
{
@@ -2742,6 +3101,49 @@ static void do_invalidate_device_tlb(VTDAddressSpace *vtd_dev_as,
memory_region_notify_iommu(&vtd_dev_as->iommu, 0, event);
}
+static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
+ VTDInvDesc *inv_desc)
+{
+ uint16_t sid;
+ VTDAddressSpace *vtd_dev_as;
+ bool size;
+ bool global;
+ hwaddr addr;
+ uint32_t pasid;
+ uint64_t mask[4] = {VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0,
+ VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1,
+ VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE};
+
+ if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, true,
+ __func__, "device piotlb inv")) {
+ return false;
+ }
+
+ global = VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(inv_desc->hi);
+ size = VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(inv_desc->hi);
+ addr = VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(inv_desc->hi);
+ sid = VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(inv_desc->lo);
+ if (global) {
+ QLIST_FOREACH(vtd_dev_as, &s->vtd_as_with_notifiers, next) {
+ if ((vtd_dev_as->pasid != PCI_NO_PASID) &&
+ (PCI_BUILD_BDF(pci_bus_num(vtd_dev_as->bus),
+ vtd_dev_as->devfn) == sid)) {
+ do_invalidate_device_tlb(vtd_dev_as, size, addr);
+ }
+ }
+ } else {
+ pasid = VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(inv_desc->lo);
+ vtd_dev_as = vtd_get_as_by_sid_and_pasid(s, sid, pasid);
+ if (!vtd_dev_as) {
+ return true;
+ }
+
+ do_invalidate_device_tlb(vtd_dev_as, size, addr);
+ }
+
+ return true;
+}
+
static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
VTDInvDesc *inv_desc)
{
@@ -2807,6 +3209,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
}
break;
+ case VTD_INV_DESC_PIOTLB:
+ trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
+ if (!vtd_process_piotlb_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
case VTD_INV_DESC_WAIT:
trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
if (!vtd_process_wait_desc(s, &inv_desc)) {
@@ -2821,6 +3230,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
}
break;
+ case VTD_INV_DESC_DEV_PIOTLB:
+ trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo);
+ if (!vtd_process_device_piotlb_desc(s, &inv_desc)) {
+ return false;
+ }
+ break;
+
case VTD_INV_DESC_DEVICE:
trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
@@ -2834,7 +3250,6 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
* iommu driver) work, just return true is enough so far.
*/
case VTD_INV_DESC_PC:
- case VTD_INV_DESC_PIOTLB:
if (s->scalable_mode) {
break;
}
@@ -3413,11 +3828,13 @@ static const Property vtd_properties[] = {
VTD_HOST_ADDRESS_WIDTH),
DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+ DEFINE_PROP_BOOL("x-flts", IntelIOMMUState, flts, FALSE),
DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true),
DEFINE_PROP_BOOL("stale-tm", IntelIOMMUState, stale_tm, false),
+ DEFINE_PROP_BOOL("fs1gp", IntelIOMMUState, fs1gp, true),
};
/* Read IRTE entry with specific index */
@@ -3914,7 +4331,13 @@ static bool vtd_check_hiod(IntelIOMMUState *s, HostIOMMUDevice *hiod,
return false;
}
- return true;
+ if (!s->flts) {
+ /* All checks requested by VTD stage-2 translation pass */
+ return true;
+ }
+
+ error_setg(errp, "host device is uncompatible with stage-1 translation");
+ return false;
}
static bool vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
@@ -4137,7 +4560,12 @@ static void vtd_cap_init(IntelIOMMUState *s)
}
/* TODO: read cap/ecap from host to decide which cap to be exposed. */
- if (s->scalable_mode) {
+ if (s->flts) {
+ s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_FLTS;
+ if (s->fs1gp) {
+ s->cap |= VTD_CAP_FS1GP;
+ }
+ } else if (s->scalable_mode) {
s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
}
@@ -4193,6 +4621,18 @@ static void vtd_init(IntelIOMMUState *s)
vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits,
x86_iommu->dt_supported && s->stale_tm);
+ /*
+ * Rsvd field masks for fpte
+ */
+ vtd_fpte_rsvd[0] = ~0ULL;
+ vtd_fpte_rsvd[1] = VTD_FPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
+ vtd_fpte_rsvd[2] = VTD_FPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+ vtd_fpte_rsvd[3] = VTD_FPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+ vtd_fpte_rsvd[4] = VTD_FPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+
+ vtd_fpte_rsvd_large[2] = VTD_FPTE_LPAGE_L2_RSVD_MASK(s->aw_bits);
+ vtd_fpte_rsvd_large[3] = VTD_FPTE_LPAGE_L3_RSVD_MASK(s->aw_bits);
+
if (s->scalable_mode || s->snoop_control) {
vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;
vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP;
@@ -4304,14 +4744,26 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
}
}
- /* Currently only address widths supported are 39 and 48 bits */
- if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
- (s->aw_bits != VTD_HOST_AW_48BIT)) {
- error_setg(errp, "Supported values for aw-bits are: %d, %d",
+ if (!s->scalable_mode && s->flts) {
+ error_setg(errp, "x-flts is only available in scalable mode");
+ return false;
+ }
+
+ if (!s->flts && s->aw_bits != VTD_HOST_AW_39BIT &&
+ s->aw_bits != VTD_HOST_AW_48BIT) {
+ error_setg(errp, "%s: supported values for aw-bits are: %d, %d",
+ s->scalable_mode ? "Scalable mode(flts=off)" : "Legacy mode",
VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
return false;
}
+ if (s->flts && s->aw_bits != VTD_HOST_AW_48BIT) {
+ error_setg(errp,
+ "Scalable mode(flts=on): supported value for aw-bits is: %d",
+ VTD_HOST_AW_48BIT);
+ return false;
+ }
+
if (s->scalable_mode && !s->dma_drain) {
error_setg(errp, "Need to set dma_drain for scalable mode");
return false;
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 4323fc5..e8b211e 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -195,6 +195,7 @@
#define VTD_ECAP_PASID (1ULL << 40)
#define VTD_ECAP_SMTS (1ULL << 43)
#define VTD_ECAP_SLTS (1ULL << 46)
+#define VTD_ECAP_FLTS (1ULL << 47)
/* CAP_REG */
/* (offset >> 4) << 24 */
@@ -211,6 +212,7 @@
#define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35))
#define VTD_CAP_DRAIN_WRITE (1ULL << 54)
#define VTD_CAP_DRAIN_READ (1ULL << 55)
+#define VTD_CAP_FS1GP (1ULL << 56)
#define VTD_CAP_DRAIN (VTD_CAP_DRAIN_READ | VTD_CAP_DRAIN_WRITE)
#define VTD_CAP_CM (1ULL << 7)
#define VTD_PASID_ID_SHIFT 20
@@ -311,10 +313,28 @@ typedef enum VTDFaultReason {
* request while disabled */
VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */
- VTD_FR_PASID_TABLE_INV = 0x58, /*Invalid PASID table entry */
+ /* PASID directory entry access failure */
+ VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
+ /* The Present(P) field of pasid directory entry is 0 */
+ VTD_FR_PASID_DIR_ENTRY_P = 0x51,
+ VTD_FR_PASID_TABLE_ACCESS_ERR = 0x58, /* PASID table entry access failure */
+ /* The Present(P) field of pasid table entry is 0 */
+ VTD_FR_PASID_ENTRY_P = 0x59,
+ VTD_FR_PASID_TABLE_ENTRY_INV = 0x5b, /*Invalid PASID table entry */
+
+ /* Fail to access a first-level paging entry (not FS_PML4E) */
+ VTD_FR_FS_PAGING_ENTRY_INV = 0x70,
+ VTD_FR_FS_PAGING_ENTRY_P = 0x71,
+ /* Non-zero reserved field in present first-stage paging entry */
+ VTD_FR_FS_PAGING_ENTRY_RSVD = 0x72,
+ VTD_FR_PASID_ENTRY_FSPTPTR_INV = 0x73, /* Invalid FSPTPTR in PASID entry */
+ VTD_FR_FS_NON_CANONICAL = 0x80, /* SNG.1 : Address for FS not canonical.*/
+ VTD_FR_FS_PAGING_ENTRY_US = 0x81, /* Privilege violation */
+ VTD_FR_SM_WRITE = 0x85, /* No write permission */
/* Output address in the interrupt address range for scalable mode */
VTD_FR_SM_INTERRUPT_ADDR = 0x87,
+ VTD_FR_FS_BIT_UPDATE_FAILED = 0x91, /* SFS.10 */
VTD_FR_MAX, /* Guard */
} VTDFaultReason;
@@ -367,6 +387,7 @@ typedef union VTDInvDesc VTDInvDesc;
#define VTD_INV_DESC_WAIT 0x5 /* Invalidation Wait Descriptor */
#define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
#define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
#define VTD_INV_DESC_NONE 0 /* Not an Invalidate Descriptor */
/* Masks for Invalidation Wait Descriptor*/
@@ -397,11 +418,6 @@ typedef union VTDInvDesc VTDInvDesc;
#define VTD_INV_DESC_IOTLB_AM(val) ((val) & 0x3fULL)
#define VTD_INV_DESC_IOTLB_RSVD_LO 0xffffffff0000f100ULL
#define VTD_INV_DESC_IOTLB_RSVD_HI 0xf80ULL
-#define VTD_INV_DESC_IOTLB_PASID_PASID (2ULL << 4)
-#define VTD_INV_DESC_IOTLB_PASID_PAGE (3ULL << 4)
-#define VTD_INV_DESC_IOTLB_PASID(val) (((val) >> 32) & VTD_PASID_ID_MASK)
-#define VTD_INV_DESC_IOTLB_PASID_RSVD_LO 0xfff00000000001c0ULL
-#define VTD_INV_DESC_IOTLB_PASID_RSVD_HI 0xf80ULL
/* Mask for Device IOTLB Invalidate Descriptor */
#define VTD_INV_DESC_DEVICE_IOTLB_ADDR(val) ((val) & 0xfffffffffffff000ULL)
@@ -413,6 +429,16 @@ typedef union VTDInvDesc VTDInvDesc;
/* Masks for Interrupt Entry Invalidate Descriptor */
#define VTD_INV_DESC_IEC_RSVD 0xffff000007fff1e0ULL
+/* Masks for PASID based Device IOTLB Invalidate Descriptor */
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_ADDR(val) ((val) & \
+ 0xfffffffffffff000ULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SIZE(val) ((val >> 11) & 0x1)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_GLOBAL(val) ((val) & 0x1)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_SID(val) (((val) >> 16) & 0xffffULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_PASID(val) ((val >> 32) & 0xfffffULL)
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL0 0xfff000000000f000ULL
+#define VTD_INV_DESC_PASID_DEVICE_IOTLB_RSVD_VAL1 0x7feULL
+
/* Rsvd field masks for spte */
#define VTD_SPTE_SNP 0x800ULL
@@ -436,6 +462,34 @@ typedef union VTDInvDesc VTDInvDesc;
(0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
(0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+/* Rsvd field masks for fpte */
+#define VTD_FS_UPPER_IGNORED 0xfff0000000000000ULL
+#define VTD_FPTE_PAGE_L1_RSVD_MASK(aw) \
+ (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L2_RSVD_MASK(aw) \
+ (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L3_RSVD_MASK(aw) \
+ (~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_PAGE_L4_RSVD_MASK(aw) \
+ (0x80ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+
+#define VTD_FPTE_LPAGE_L2_RSVD_MASK(aw) \
+ (0x1fe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+#define VTD_FPTE_LPAGE_L3_RSVD_MASK(aw) \
+ (0x3fffe000ULL | ~(VTD_HAW_MASK(aw) | VTD_FS_UPPER_IGNORED))
+
+/* Masks for PIOTLB Invalidate Descriptor */
+#define VTD_INV_DESC_PIOTLB_G (3ULL << 4)
+#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID (2ULL << 4)
+#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID (3ULL << 4)
+#define VTD_INV_DESC_PIOTLB_DID(val) (((val) >> 16) & VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PIOTLB_PASID(val) (((val) >> 32) & 0xfffffULL)
+#define VTD_INV_DESC_PIOTLB_AM(val) ((val) & 0x3fULL)
+#define VTD_INV_DESC_PIOTLB_IH(val) (((val) >> 6) & 0x1)
+#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+
/* Information about page-selective IOTLB invalidate */
struct VTDIOTLBPageInvInfo {
uint16_t domain_id;
@@ -519,27 +573,38 @@ typedef struct VTDRootEntry VTDRootEntry;
#define VTD_SM_PASID_ENTRY_AW 7ULL /* Adjusted guest-address-width */
#define VTD_SM_PASID_ENTRY_DID(val) ((val) & VTD_DOMAIN_ID_MASK)
+#define VTD_SM_PASID_ENTRY_FLPM 3ULL
+#define VTD_SM_PASID_ENTRY_FLPTPTR (~0xfffULL)
+
+/* First Level Paging Structure */
+/* Masks for First Level Paging Entry */
+#define VTD_FL_P 1ULL
+#define VTD_FL_RW (1ULL << 1)
+#define VTD_FL_US (1ULL << 2)
+#define VTD_FL_A (1ULL << 5)
+#define VTD_FL_D (1ULL << 6)
+
/* Second Level Page Translation Pointer*/
#define VTD_SM_PASID_ENTRY_SLPTPTR (~0xfffULL)
-/* Paging Structure common */
-#define VTD_SL_PT_PAGE_SIZE_MASK (1ULL << 7)
-/* Bits to decide the offset for each level */
-#define VTD_SL_LEVEL_BITS 9
-
/* Second Level Paging Structure */
-#define VTD_SL_PML4_LEVEL 4
-#define VTD_SL_PDP_LEVEL 3
-#define VTD_SL_PD_LEVEL 2
-#define VTD_SL_PT_LEVEL 1
-#define VTD_SL_PT_ENTRY_NR 512
-
/* Masks for Second Level Paging Entry */
#define VTD_SL_RW_MASK 3ULL
#define VTD_SL_R 1ULL
#define VTD_SL_W (1ULL << 1)
-#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
#define VTD_SL_IGN_COM 0xbff0000000000000ULL
#define VTD_SL_TM (1ULL << 62)
+/* Common for both First Level and Second Level */
+#define VTD_PML4_LEVEL 4
+#define VTD_PDP_LEVEL 3
+#define VTD_PD_LEVEL 2
+#define VTD_PT_LEVEL 1
+#define VTD_PT_ENTRY_NR 512
+#define VTD_PT_PAGE_SIZE_MASK (1ULL << 7)
+#define VTD_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
+
+/* Bits to decide the offset for each level */
+#define VTD_LEVEL_BITS 9
+
#endif
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 9334b03..b46975c 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -86,6 +86,7 @@ GlobalProperty pc_compat_9_1[] = {
{ "ICH9-LPC", "x-smi-swsmi-timer", "off" },
{ "ICH9-LPC", "x-smi-periodic-timer", "off" },
{ TYPE_INTEL_IOMMU_DEVICE, "stale-tm", "on" },
+ { TYPE_INTEL_IOMMU_DEVICE, "aw-bits", "39" },
};
const size_t pc_compat_9_1_len = G_N_ELEMENTS(pc_compat_9_1);
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index bd76527..0ae1704 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -843,7 +843,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp)
ComponentRegisters *regs = &cxl_cstate->crb;
MemoryRegion *mr = &regs->component_registers;
uint8_t *pci_conf = pci_dev->config;
- unsigned short msix_num = 6;
+ unsigned short msix_num = 10;
int i, rc;
uint16_t count;
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 06f096a..85e14b7 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3337,6 +3337,117 @@ static const VMStateDescription vmstate_virtio_net_rss = {
},
};
+static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev)
+{
+ VirtIONet *n = VIRTIO_NET(vdev);
+ NetClientState *nc;
+ struct vhost_net *net;
+
+ if (!n->nic) {
+ return NULL;
+ }
+
+ nc = qemu_get_queue(n->nic);
+ if (!nc) {
+ return NULL;
+ }
+
+ net = get_vhost_net(nc->peer);
+ if (!net) {
+ return NULL;
+ }
+
+ return &net->dev;
+}
+
+static int vhost_user_net_save_state(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field,
+ JSONWriter *vmdesc)
+{
+ VirtIONet *n = pv;
+ VirtIODevice *vdev = VIRTIO_DEVICE(n);
+ struct vhost_dev *vhdev;
+ Error *local_error = NULL;
+ int ret;
+
+ vhdev = virtio_net_get_vhost(vdev);
+ if (vhdev == NULL) {
+ error_reportf_err(local_error,
+ "Error getting vhost back-end of %s device %s: ",
+ vdev->name, vdev->parent_obj.canonical_path);
+ return -1;
+ }
+
+ ret = vhost_save_backend_state(vhdev, f, &local_error);
+ if (ret < 0) {
+ error_reportf_err(local_error,
+ "Error saving back-end state of %s device %s: ",
+ vdev->name, vdev->parent_obj.canonical_path);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vhost_user_net_load_state(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field)
+{
+ VirtIONet *n = pv;
+ VirtIODevice *vdev = VIRTIO_DEVICE(n);
+ struct vhost_dev *vhdev;
+ Error *local_error = NULL;
+ int ret;
+
+ vhdev = virtio_net_get_vhost(vdev);
+ if (vhdev == NULL) {
+ error_reportf_err(local_error,
+ "Error getting vhost back-end of %s device %s: ",
+ vdev->name, vdev->parent_obj.canonical_path);
+ return -1;
+ }
+
+ ret = vhost_load_backend_state(vhdev, f, &local_error);
+ if (ret < 0) {
+ error_reportf_err(local_error,
+ "Error loading back-end state of %s device %s: ",
+ vdev->name, vdev->parent_obj.canonical_path);
+ return ret;
+ }
+
+ return 0;
+}
+
+static bool vhost_user_net_is_internal_migration(void *opaque)
+{
+ VirtIONet *n = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(n);
+ struct vhost_dev *vhdev;
+
+ vhdev = virtio_net_get_vhost(vdev);
+ if (vhdev == NULL) {
+ return false;
+ }
+
+ return vhost_supports_device_state(vhdev);
+}
+
+static const VMStateDescription vhost_user_net_backend_state = {
+ .name = "virtio-net-device/backend",
+ .version_id = 0,
+ .needed = vhost_user_net_is_internal_migration,
+ .fields = (const VMStateField[]) {
+ {
+ .name = "backend",
+ .info = &(const VMStateInfo) {
+ .name = "virtio-net vhost-user backend state",
+ .get = vhost_user_net_load_state,
+ .put = vhost_user_net_save_state,
+ },
+ },
+ VMSTATE_END_OF_LIST()
+ }
+};
+
static const VMStateDescription vmstate_virtio_net_device = {
.name = "virtio-net-device",
.version_id = VIRTIO_NET_VM_VERSION,
@@ -3389,6 +3500,7 @@ static const VMStateDescription vmstate_virtio_net_device = {
},
.subsections = (const VMStateDescription * const []) {
&vmstate_virtio_net_rss,
+ &vhost_user_net_backend_state,
NULL
}
};
@@ -3950,29 +4062,6 @@ static bool dev_unplug_pending(void *opaque)
return vdc->primary_unplug_pending(dev);
}
-static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev)
-{
- VirtIONet *n = VIRTIO_NET(vdev);
- NetClientState *nc;
- struct vhost_net *net;
-
- if (!n->nic) {
- return NULL;
- }
-
- nc = qemu_get_queue(n->nic);
- if (!nc) {
- return NULL;
- }
-
- net = get_vhost_net(nc->peer);
- if (!net) {
- return NULL;
- }
-
- return &net->dev;
-}
-
static const VMStateDescription vmstate_virtio_net = {
.name = "virtio-net",
.minimum_version_id = VIRTIO_NET_VM_VERSION,
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index d8a55a6..57ec708 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -250,7 +250,7 @@ static uint64_t msix_pba_mmio_read(void *opaque, hwaddr addr,
PCIDevice *dev = opaque;
if (dev->msix_vector_poll_notifier) {
unsigned vector_start = addr * 8;
- unsigned vector_end = MIN(addr + size * 8, dev->msix_entries_nr);
+ unsigned vector_end = MIN((addr + size) * 8, dev->msix_entries_nr);
dev->msix_vector_poll_notifier(dev, vector_start, vector_end);
}
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 0b455c8..1b12db6 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -1113,18 +1113,22 @@ void pcie_sync_bridge_lnk(PCIDevice *bridge_dev)
if ((lnksta & PCI_EXP_LNKSTA_NLW) > (lnkcap & PCI_EXP_LNKCAP_MLW)) {
lnksta &= ~PCI_EXP_LNKSTA_NLW;
lnksta |= lnkcap & PCI_EXP_LNKCAP_MLW;
- } else if (!(lnksta & PCI_EXP_LNKSTA_NLW)) {
- lnksta |= QEMU_PCI_EXP_LNKSTA_NLW(QEMU_PCI_EXP_LNK_X1);
}
if ((lnksta & PCI_EXP_LNKSTA_CLS) > (lnkcap & PCI_EXP_LNKCAP_SLS)) {
lnksta &= ~PCI_EXP_LNKSTA_CLS;
lnksta |= lnkcap & PCI_EXP_LNKCAP_SLS;
- } else if (!(lnksta & PCI_EXP_LNKSTA_CLS)) {
- lnksta |= QEMU_PCI_EXP_LNKSTA_CLS(QEMU_PCI_EXP_LNK_2_5GT);
}
}
+ if (!(lnksta & PCI_EXP_LNKSTA_NLW)) {
+ lnksta |= QEMU_PCI_EXP_LNKSTA_NLW(QEMU_PCI_EXP_LNK_X1);
+ }
+
+ if (!(lnksta & PCI_EXP_LNKSTA_CLS)) {
+ lnksta |= QEMU_PCI_EXP_LNKSTA_CLS(QEMU_PCI_EXP_LNK_2_5GT);
+ }
+
pci_word_test_and_clear_mask(exp_cap + PCI_EXP_LNKSTA,
PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
pci_word_test_and_set_mask(exp_cap + PCI_EXP_LNKSTA, lnksta &
diff --git a/include/hw/acpi/ghes.h b/include/hw/acpi/ghes.h
index 674f695..39619a2 100644
--- a/include/hw/acpi/ghes.h
+++ b/include/hw/acpi/ghes.h
@@ -23,6 +23,7 @@
#define ACPI_GHES_H
#include "hw/acpi/bios-linker-loader.h"
+#include "qapi/error.h"
/*
* Values for Hardware Error Notification Type field
@@ -59,26 +60,29 @@ enum AcpiGhesNotifyType {
enum {
ACPI_HEST_SRC_ID_SEA = 0,
/* future ids go here */
- ACPI_HEST_SRC_ID_RESERVED,
+
+ ACPI_GHES_ERROR_SOURCE_COUNT
};
typedef struct AcpiGhesState {
- uint64_t ghes_addr_le;
+ uint64_t hw_error_le;
bool present; /* True if GHES is present at all on this board */
} AcpiGhesState;
-void build_ghes_error_table(GArray *hardware_errors, BIOSLinker *linker);
-void acpi_build_hest(GArray *table_data, BIOSLinker *linker,
+void acpi_build_hest(GArray *table_data, GArray *hardware_errors,
+ BIOSLinker *linker,
const char *oem_id, const char *oem_table_id);
void acpi_ghes_add_fw_cfg(AcpiGhesState *vms, FWCfgState *s,
GArray *hardware_errors);
-int acpi_ghes_record_errors(uint8_t notify, uint64_t error_physical_addr);
+int acpi_ghes_memory_errors(uint16_t source_id, uint64_t error_physical_addr);
+void ghes_record_cper_errors(const void *cper, size_t len,
+ uint16_t source_id, Error **errp);
/**
* acpi_ghes_present: Report whether ACPI GHES table is present
*
* Returns: true if the system has an ACPI GHES table and it is
- * safe to call acpi_ghes_record_errors() to record a memory error.
+ * safe to call acpi_ghes_memory_errors() to record a memory error.
*/
bool acpi_ghes_present(void);
#endif
diff --git a/include/hw/acpi/vmclock.h b/include/hw/acpi/vmclock.h
new file mode 100644
index 0000000..5605605
--- /dev/null
+++ b/include/hw/acpi/vmclock.h
@@ -0,0 +1,34 @@
+#ifndef ACPI_VMCLOCK_H
+#define ACPI_VMCLOCK_H
+
+#include "hw/acpi/bios-linker-loader.h"
+#include "hw/qdev-core.h"
+#include "qemu/uuid.h"
+#include "qom/object.h"
+
+#define TYPE_VMCLOCK "vmclock"
+
+#define VMCLOCK_ADDR 0xfeffb000
+#define VMCLOCK_SIZE 0x1000
+
+OBJECT_DECLARE_SIMPLE_TYPE(VmclockState, VMCLOCK)
+
+struct vmclock_abi;
+
+struct VmclockState {
+ DeviceState parent_obj;
+ MemoryRegion clk_page;
+ uint64_t physaddr;
+ struct vmclock_abi *clk;
+};
+
+/* returns NULL unless there is exactly one device */
+static inline Object *find_vmclock_dev(void)
+{
+ return object_resolve_path_type("", TYPE_VMCLOCK, NULL);
+}
+
+void vmclock_build_acpi(VmclockState *vms, GArray *table_data,
+ BIOSLinker *linker, const char *oem_id);
+
+#endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index d372cd3..e95477e 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -45,8 +45,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(IntelIOMMUState, INTEL_IOMMU_DEVICE)
#define DMAR_REG_SIZE 0x230
#define VTD_HOST_AW_39BIT 39
#define VTD_HOST_AW_48BIT 48
-#define VTD_HOST_ADDRESS_WIDTH VTD_HOST_AW_39BIT
+#define VTD_HOST_ADDRESS_WIDTH VTD_HOST_AW_48BIT
#define VTD_HAW_MASK(aw) ((1ULL << (aw)) - 1)
+#define VTD_MGAW_FROM_CAP(cap) ((cap >> 16) & 0x3fULL)
#define DMAR_REPORT_F_INTR (1)
@@ -152,9 +153,10 @@ struct VTDIOTLBEntry {
uint64_t gfn;
uint16_t domain_id;
uint32_t pasid;
- uint64_t slpte;
+ uint64_t pte;
uint64_t mask;
uint8_t access_flags;
+ uint8_t pgtt;
};
/* VT-d Source-ID Qualifier types */
@@ -262,6 +264,7 @@ struct IntelIOMMUState {
bool caching_mode; /* RO - is cap CM enabled? */
bool scalable_mode; /* RO - is Scalable Mode supported? */
+ bool flts; /* RO - is stage-1 translation supported? */
bool snoop_control; /* RO - is SNP filed supported? */
dma_addr_t root; /* Current root table pointer */
@@ -305,6 +308,7 @@ struct IntelIOMMUState {
bool dma_drain; /* Whether DMA r/w draining enabled */
bool dma_translation; /* Whether DMA translation supported */
bool pasid; /* Whether to support PASID */
+ bool fs1gp; /* First Stage 1-GByte Page Support */
/* Transient Mapping, Reserved(0) since VTD spec revision 3.2 */
bool stale_tm;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 461c168..a9469d5 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -365,7 +365,14 @@ static inline int vhost_reset_device(struct vhost_dev *hdev)
* Returns true if the device supports these commands, and false if it
* does not.
*/
+#ifdef CONFIG_VHOST
bool vhost_supports_device_state(struct vhost_dev *dev);
+#else
+static inline bool vhost_supports_device_state(struct vhost_dev *dev)
+{
+ return false;
+}
+#endif
/**
* vhost_set_device_state_fd(): Begin transfer of internal state from/to
@@ -448,7 +455,15 @@ int vhost_check_device_state(struct vhost_dev *dev, Error **errp);
*
* Returns 0 on success, and -errno otherwise.
*/
+#ifdef CONFIG_VHOST
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp);
+#else
+static inline int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f,
+ Error **errp)
+{
+ return -ENOSYS;
+}
+#endif
/**
* vhost_load_backend_state(): High-level function to load a vhost
@@ -465,6 +480,14 @@ int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp);
*
* Returns 0 on success, and -errno otherwise.
*/
+#ifdef CONFIG_VHOST
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp);
+#else
+static inline int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f,
+ Error **errp)
+{
+ return -ENOSYS;
+}
+#endif
#endif
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index bd93672..a42957c 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -98,6 +98,7 @@ enum virtio_gpu_base_conf_flags {
VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED,
VIRTIO_GPU_FLAG_RUTABAGA_ENABLED,
VIRTIO_GPU_FLAG_VENUS_ENABLED,
+ VIRTIO_GPU_FLAG_RESOURCE_UUID_ENABLED,
};
#define virtio_gpu_virgl_enabled(_cfg) \
@@ -114,6 +115,8 @@ enum virtio_gpu_base_conf_flags {
(_cfg.flags & (1 << VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED))
#define virtio_gpu_rutabaga_enabled(_cfg) \
(_cfg.flags & (1 << VIRTIO_GPU_FLAG_RUTABAGA_ENABLED))
+#define virtio_gpu_resource_uuid_enabled(_cfg) \
+ (_cfg.flags & (1 << VIRTIO_GPU_FLAG_RESOURCE_UUID_ENABLED))
#define virtio_gpu_hostmem_enabled(_cfg) \
(_cfg.hostmem > 0)
#define virtio_gpu_venus_enabled(_cfg) \
diff --git a/include/standard-headers/linux/vmclock-abi.h b/include/standard-headers/linux/vmclock-abi.h
new file mode 100644
index 0000000..15b0316
--- /dev/null
+++ b/include/standard-headers/linux/vmclock-abi.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+
+/*
+ * This structure provides a vDSO-style clock to VM guests, exposing the
+ * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch
+ * counter, etc.) and real time. It is designed to address the problem of
+ * live migration, which other clock enlightenments do not.
+ *
+ * When a guest is live migrated, this affects the clock in two ways.
+ *
+ * First, even between identical hosts the actual frequency of the underlying
+ * counter will change within the tolerances of its specification (typically
+ * ±50PPM, or 4 seconds a day). This frequency also varies over time on the
+ * same host, but can be tracked by NTP as it generally varies slowly. With
+ * live migration there is a step change in the frequency, with no warning.
+ *
+ * Second, there may be a step change in the value of the counter itself, as
+ * its accuracy is limited by the precision of the NTP synchronization on the
+ * source and destination hosts.
+ *
+ * So any calibration (NTP, PTP, etc.) which the guest has done on the source
+ * host before migration is invalid, and needs to be redone on the new host.
+ *
+ * In its most basic mode, this structure provides only an indication to the
+ * guest that live migration has occurred. This allows the guest to know that
+ * its clock is invalid and take remedial action. For applications that need
+ * reliable accurate timestamps (e.g. distributed databases), the structure
+ * can be mapped all the way to userspace. This allows the application to see
+ * directly for itself that the clock is disrupted and take appropriate
+ * action, even when using a vDSO-style method to get the time instead of a
+ * system call.
+ *
+ * In its more advanced mode. this structure can also be used to expose the
+ * precise relationship of the CPU counter to real time, as calibrated by the
+ * host. This means that userspace applications can have accurate time
+ * immediately after live migration, rather than having to pause operations
+ * and wait for NTP to recover. This mode does, of course, rely on the
+ * counter being reliable and consistent across CPUs.
+ *
+ * Note that this must be true UTC, never with smeared leap seconds. If a
+ * guest wishes to construct a smeared clock, it can do so. Presenting a
+ * smeared clock through this interface would be problematic because it
+ * actually messes with the apparent counter *period*. A linear smearing
+ * of 1 ms per second would effectively tweak the counter period by 1000PPM
+ * at the start/end of the smearing period, while a sinusoidal smear would
+ * basically be impossible to represent.
+ *
+ * This structure is offered with the intent that it be adopted into the
+ * nascent virtio-rtc standard, as a virtio-rtc that does not address the live
+ * migration problem seems a little less than fit for purpose. For that
+ * reason, certain fields use precisely the same numeric definitions as in
+ * the virtio-rtc proposal. The structure can also be exposed through an ACPI
+ * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for
+ * the fact that it uses a real _CRS to convey the address of the structure
+ * (which should be a full page, to allow for mapping directly to userspace).
+ */
+
+#ifndef __VMCLOCK_ABI_H__
+#define __VMCLOCK_ABI_H__
+
+#include "standard-headers/linux/types.h"
+
+struct vmclock_abi {
+ /* CONSTANT FIELDS */
+ uint32_t magic;
+#define VMCLOCK_MAGIC 0x4b4c4356 /* "VCLK" */
+ uint32_t size; /* Size of region containing this structure */
+ uint16_t version; /* 1 */
+ uint8_t counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */
+#define VMCLOCK_COUNTER_ARM_VCNT 0
+#define VMCLOCK_COUNTER_X86_TSC 1
+#define VMCLOCK_COUNTER_INVALID 0xff
+ uint8_t time_type; /* Matches VIRTIO_RTC_TYPE_xxx */
+#define VMCLOCK_TIME_UTC 0 /* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_TAI 1 /* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_MONOTONIC 2 /* Since undefined epoch */
+#define VMCLOCK_TIME_INVALID_SMEARED 3 /* Not supported */
+#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED 4 /* Not supported */
+
+ /* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */
+ uint32_t seq_count; /* Low bit means an update is in progress */
+ /*
+ * This field changes to another non-repeating value when the CPU
+ * counter is disrupted, for example on live migration. This lets
+ * the guest know that it should discard any calibration it has
+ * performed of the counter against external sources (NTP/PTP/etc.).
+ */
+ uint64_t disruption_marker;
+ uint64_t flags;
+ /* Indicates that the tai_offset_sec field is valid */
+#define VMCLOCK_FLAG_TAI_OFFSET_VALID (1 << 0)
+ /*
+ * Optionally used to notify guests of pending maintenance events.
+ * A guest which provides latency-sensitive services may wish to
+ * remove itself from service if an event is coming up. Two flags
+ * indicate the approximate imminence of the event.
+ */
+#define VMCLOCK_FLAG_DISRUPTION_SOON (1 << 1) /* About a day */
+#define VMCLOCK_FLAG_DISRUPTION_IMMINENT (1 << 2) /* About an hour */
+#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID (1 << 3)
+#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID (1 << 4)
+#define VMCLOCK_FLAG_TIME_ESTERROR_VALID (1 << 5)
+#define VMCLOCK_FLAG_TIME_MAXERROR_VALID (1 << 6)
+ /*
+ * If the MONOTONIC flag is set then (other than leap seconds) it is
+ * guaranteed that the time calculated according this structure at
+ * any given moment shall never appear to be later than the time
+ * calculated via the structure at any *later* moment.
+ *
+ * In particular, a timestamp based on a counter reading taken
+ * immediately after setting the low bit of seq_count (and the
+ * associated memory barrier), using the previously-valid time and
+ * period fields, shall never be later than a timestamp based on
+ * a counter reading taken immediately before *clearing* the low
+ * bit again after the update, using the about-to-be-valid fields.
+ */
+#define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7)
+
+ uint8_t pad[2];
+ uint8_t clock_status;
+#define VMCLOCK_STATUS_UNKNOWN 0
+#define VMCLOCK_STATUS_INITIALIZING 1
+#define VMCLOCK_STATUS_SYNCHRONIZED 2
+#define VMCLOCK_STATUS_FREERUNNING 3
+#define VMCLOCK_STATUS_UNRELIABLE 4
+
+ /*
+ * The time exposed through this device is never smeared. This field
+ * corresponds to the 'subtype' field in virtio-rtc, which indicates
+ * the smearing method. However in this case it provides a *hint* to
+ * the guest operating system, such that *if* the guest OS wants to
+ * provide its users with an alternative clock which does not follow
+ * UTC, it may do so in a fashion consistent with the other systems
+ * in the nearby environment.
+ */
+ uint8_t leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */
+#define VMCLOCK_SMEARING_STRICT 0
+#define VMCLOCK_SMEARING_NOON_LINEAR 1
+#define VMCLOCK_SMEARING_UTC_SLS 2
+ uint16_t tai_offset_sec; /* Actually two's complement signed */
+ uint8_t leap_indicator;
+ /*
+ * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined
+ * in the current draft of virtio-rtc, but since smearing cannot be
+ * used with the shared memory device, some values are not used.
+ *
+ * The _POST_POS and _POST_NEG values allow the guest to perform
+ * its own smearing during the day or so after a leap second when
+ * such smearing may need to continue being applied for a leap
+ * second which is now theoretically "historical".
+ */
+#define VMCLOCK_LEAP_NONE 0x00 /* No known nearby leap second */
+#define VMCLOCK_LEAP_PRE_POS 0x01 /* Positive leap second at EOM */
+#define VMCLOCK_LEAP_PRE_NEG 0x02 /* Negative leap second at EOM */
+#define VMCLOCK_LEAP_POS 0x03 /* Set during 23:59:60 second */
+#define VMCLOCK_LEAP_POST_POS 0x04
+#define VMCLOCK_LEAP_POST_NEG 0x05
+
+ /* Bit shift for counter_period_frac_sec and its error rate */
+ uint8_t counter_period_shift;
+ /*
+ * Paired values of counter and UTC at a given point in time.
+ */
+ uint64_t counter_value;
+ /*
+ * Counter period, and error margin of same. The unit of these
+ * fields is 1/2^(64 + counter_period_shift) of a second.
+ */
+ uint64_t counter_period_frac_sec;
+ uint64_t counter_period_esterror_rate_frac_sec;
+ uint64_t counter_period_maxerror_rate_frac_sec;
+
+ /*
+ * Time according to time_type field above.
+ */
+ uint64_t time_sec; /* Seconds since time_type epoch */
+ uint64_t time_frac_sec; /* Units of 1/2^64 of a second */
+ uint64_t time_esterror_nanosec;
+ uint64_t time_maxerror_nanosec;
+};
+
+#endif /* __VMCLOCK_ABI_H__ */
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 99a8d9f..8913e4f 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -258,6 +258,7 @@ for i in "$hdrdir"/include/linux/*virtio*.h \
"$hdrdir/include/linux/kernel.h" \
"$hdrdir/include/linux/kvm_para.h" \
"$hdrdir/include/linux/vhost_types.h" \
+ "$hdrdir/include/linux/vmclock-abi.h" \
"$hdrdir/include/linux/sysinfo.h"; do
cp_portable "$i" "$output/include/standard-headers/linux"
done
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index a9444a2..da30bdb 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2387,7 +2387,7 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
*/
if (code == BUS_MCEERR_AR) {
kvm_cpu_synchronize_state(c);
- if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
+ if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
kvm_inject_arm_sea(c);
} else {
error_report("failed to record the error");
diff --git a/tests/data/acpi/x86/pc/DSDT b/tests/data/acpi/x86/pc/DSDT
index 8b8235f..4beb519 100644
--- a/tests/data/acpi/x86/pc/DSDT
+++ b/tests/data/acpi/x86/pc/DSDT
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.acpierst b/tests/data/acpi/x86/pc/DSDT.acpierst
index 06829b9..abda686 100644
--- a/tests/data/acpi/x86/pc/DSDT.acpierst
+++ b/tests/data/acpi/x86/pc/DSDT.acpierst
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.acpihmat b/tests/data/acpi/x86/pc/DSDT.acpihmat
index 2fe355e..d081db2 100644
--- a/tests/data/acpi/x86/pc/DSDT.acpihmat
+++ b/tests/data/acpi/x86/pc/DSDT.acpihmat
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.bridge b/tests/data/acpi/x86/pc/DSDT.bridge
index 4d4067c..e16897d 100644
--- a/tests/data/acpi/x86/pc/DSDT.bridge
+++ b/tests/data/acpi/x86/pc/DSDT.bridge
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.cphp b/tests/data/acpi/x86/pc/DSDT.cphp
index 045a52e..e95711c 100644
--- a/tests/data/acpi/x86/pc/DSDT.cphp
+++ b/tests/data/acpi/x86/pc/DSDT.cphp
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.dimmpxm b/tests/data/acpi/x86/pc/DSDT.dimmpxm
index 205219b..90ba66b 100644
--- a/tests/data/acpi/x86/pc/DSDT.dimmpxm
+++ b/tests/data/acpi/x86/pc/DSDT.dimmpxm
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.hpbridge b/tests/data/acpi/x86/pc/DSDT.hpbridge
index 8fa8b51..0eafe5f 100644
--- a/tests/data/acpi/x86/pc/DSDT.hpbridge
+++ b/tests/data/acpi/x86/pc/DSDT.hpbridge
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.hpbrroot b/tests/data/acpi/x86/pc/DSDT.hpbrroot
index 0171946..077a4cc 100644
--- a/tests/data/acpi/x86/pc/DSDT.hpbrroot
+++ b/tests/data/acpi/x86/pc/DSDT.hpbrroot
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.ipmikcs b/tests/data/acpi/x86/pc/DSDT.ipmikcs
index 0ca6646..8d465f0 100644
--- a/tests/data/acpi/x86/pc/DSDT.ipmikcs
+++ b/tests/data/acpi/x86/pc/DSDT.ipmikcs
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.memhp b/tests/data/acpi/x86/pc/DSDT.memhp
index 03ff464..e3b4975 100644
--- a/tests/data/acpi/x86/pc/DSDT.memhp
+++ b/tests/data/acpi/x86/pc/DSDT.memhp
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.nohpet b/tests/data/acpi/x86/pc/DSDT.nohpet
index b081030..9e772c1 100644
--- a/tests/data/acpi/x86/pc/DSDT.nohpet
+++ b/tests/data/acpi/x86/pc/DSDT.nohpet
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.numamem b/tests/data/acpi/x86/pc/DSDT.numamem
index 2c98caf..9bfbfc2 100644
--- a/tests/data/acpi/x86/pc/DSDT.numamem
+++ b/tests/data/acpi/x86/pc/DSDT.numamem
Binary files differ
diff --git a/tests/data/acpi/x86/pc/DSDT.roothp b/tests/data/acpi/x86/pc/DSDT.roothp
index da018dc..efbee6d 100644
--- a/tests/data/acpi/x86/pc/DSDT.roothp
+++ b/tests/data/acpi/x86/pc/DSDT.roothp
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DMAR.dmar b/tests/data/acpi/x86/q35/DMAR.dmar
index 0dca6e6..0c05976 100644
--- a/tests/data/acpi/x86/q35/DMAR.dmar
+++ b/tests/data/acpi/x86/q35/DMAR.dmar
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT b/tests/data/acpi/x86/q35/DSDT
index fb89ae0..e5e8d1e 100644
--- a/tests/data/acpi/x86/q35/DSDT
+++ b/tests/data/acpi/x86/q35/DSDT
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.acpierst b/tests/data/acpi/x86/q35/DSDT.acpierst
index 46fd254..072a3fe 100644
--- a/tests/data/acpi/x86/q35/DSDT.acpierst
+++ b/tests/data/acpi/x86/q35/DSDT.acpierst
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.acpihmat b/tests/data/acpi/x86/q35/DSDT.acpihmat
index 61c5bd5..2a4f2fc 100644
--- a/tests/data/acpi/x86/q35/DSDT.acpihmat
+++ b/tests/data/acpi/x86/q35/DSDT.acpihmat
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.acpihmat-generic-x b/tests/data/acpi/x86/q35/DSDT.acpihmat-generic-x
index 497706c..7911c05 100644
--- a/tests/data/acpi/x86/q35/DSDT.acpihmat-generic-x
+++ b/tests/data/acpi/x86/q35/DSDT.acpihmat-generic-x
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.acpihmat-noinitiator b/tests/data/acpi/x86/q35/DSDT.acpihmat-noinitiator
index 3aaa2bb..580b4a4 100644
--- a/tests/data/acpi/x86/q35/DSDT.acpihmat-noinitiator
+++ b/tests/data/acpi/x86/q35/DSDT.acpihmat-noinitiator
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.applesmc b/tests/data/acpi/x86/q35/DSDT.applesmc
index 944209a..5e8220e 100644
--- a/tests/data/acpi/x86/q35/DSDT.applesmc
+++ b/tests/data/acpi/x86/q35/DSDT.applesmc
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.bridge b/tests/data/acpi/x86/q35/DSDT.bridge
index d9938db..ee03945 100644
--- a/tests/data/acpi/x86/q35/DSDT.bridge
+++ b/tests/data/acpi/x86/q35/DSDT.bridge
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.core-count b/tests/data/acpi/x86/q35/DSDT.core-count
index a24b04c..7ebfcee 100644
--- a/tests/data/acpi/x86/q35/DSDT.core-count
+++ b/tests/data/acpi/x86/q35/DSDT.core-count
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.core-count2 b/tests/data/acpi/x86/q35/DSDT.core-count2
index 3a0cb8c..d039455 100644
--- a/tests/data/acpi/x86/q35/DSDT.core-count2
+++ b/tests/data/acpi/x86/q35/DSDT.core-count2
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.cphp b/tests/data/acpi/x86/q35/DSDT.cphp
index 20955d0..a055c2e 100644
--- a/tests/data/acpi/x86/q35/DSDT.cphp
+++ b/tests/data/acpi/x86/q35/DSDT.cphp
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.cxl b/tests/data/acpi/x86/q35/DSDT.cxl
index 3c34d4d..2084354 100644
--- a/tests/data/acpi/x86/q35/DSDT.cxl
+++ b/tests/data/acpi/x86/q35/DSDT.cxl
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.dimmpxm b/tests/data/acpi/x86/q35/DSDT.dimmpxm
index 228374b..664e926 100644
--- a/tests/data/acpi/x86/q35/DSDT.dimmpxm
+++ b/tests/data/acpi/x86/q35/DSDT.dimmpxm
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.ipmibt b/tests/data/acpi/x86/q35/DSDT.ipmibt
index 45f911a..4066a76 100644
--- a/tests/data/acpi/x86/q35/DSDT.ipmibt
+++ b/tests/data/acpi/x86/q35/DSDT.ipmibt
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.ipmismbus b/tests/data/acpi/x86/q35/DSDT.ipmismbus
index e5d6811..6d0b6b9 100644
--- a/tests/data/acpi/x86/q35/DSDT.ipmismbus
+++ b/tests/data/acpi/x86/q35/DSDT.ipmismbus
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.ivrs b/tests/data/acpi/x86/q35/DSDT.ivrs
index 46fd254..072a3fe 100644
--- a/tests/data/acpi/x86/q35/DSDT.ivrs
+++ b/tests/data/acpi/x86/q35/DSDT.ivrs
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.memhp b/tests/data/acpi/x86/q35/DSDT.memhp
index 5ce0811..4f2f9bc 100644
--- a/tests/data/acpi/x86/q35/DSDT.memhp
+++ b/tests/data/acpi/x86/q35/DSDT.memhp
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.mmio64 b/tests/data/acpi/x86/q35/DSDT.mmio64
index bdf36c4..0fb6aab 100644
--- a/tests/data/acpi/x86/q35/DSDT.mmio64
+++ b/tests/data/acpi/x86/q35/DSDT.mmio64
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.multi-bridge b/tests/data/acpi/x86/q35/DSDT.multi-bridge
index 1db43a6..f6afa6d 100644
--- a/tests/data/acpi/x86/q35/DSDT.multi-bridge
+++ b/tests/data/acpi/x86/q35/DSDT.multi-bridge
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.noacpihp b/tests/data/acpi/x86/q35/DSDT.noacpihp
index 8bc1688..9f7261d 100644
--- a/tests/data/acpi/x86/q35/DSDT.noacpihp
+++ b/tests/data/acpi/x86/q35/DSDT.noacpihp
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.nohpet b/tests/data/acpi/x86/q35/DSDT.nohpet
index c13e45e..99ad629 100644
--- a/tests/data/acpi/x86/q35/DSDT.nohpet
+++ b/tests/data/acpi/x86/q35/DSDT.nohpet
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.numamem b/tests/data/acpi/x86/q35/DSDT.numamem
index ba66694..fd1d8a7 100644
--- a/tests/data/acpi/x86/q35/DSDT.numamem
+++ b/tests/data/acpi/x86/q35/DSDT.numamem
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.pvpanic-isa b/tests/data/acpi/x86/q35/DSDT.pvpanic-isa
index 6ad4287..89032fa 100644
--- a/tests/data/acpi/x86/q35/DSDT.pvpanic-isa
+++ b/tests/data/acpi/x86/q35/DSDT.pvpanic-isa
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.thread-count b/tests/data/acpi/x86/q35/DSDT.thread-count
index a24b04c..7ebfcee 100644
--- a/tests/data/acpi/x86/q35/DSDT.thread-count
+++ b/tests/data/acpi/x86/q35/DSDT.thread-count
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.thread-count2 b/tests/data/acpi/x86/q35/DSDT.thread-count2
index 3a0cb8c..d039455 100644
--- a/tests/data/acpi/x86/q35/DSDT.thread-count2
+++ b/tests/data/acpi/x86/q35/DSDT.thread-count2
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.tis.tpm12 b/tests/data/acpi/x86/q35/DSDT.tis.tpm12
index e381ce4..f2ed40c 100644
--- a/tests/data/acpi/x86/q35/DSDT.tis.tpm12
+++ b/tests/data/acpi/x86/q35/DSDT.tis.tpm12
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.tis.tpm2 b/tests/data/acpi/x86/q35/DSDT.tis.tpm2
index a092530..5c975d2 100644
--- a/tests/data/acpi/x86/q35/DSDT.tis.tpm2
+++ b/tests/data/acpi/x86/q35/DSDT.tis.tpm2
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.type4-count b/tests/data/acpi/x86/q35/DSDT.type4-count
index edc2319..3194a82 100644
--- a/tests/data/acpi/x86/q35/DSDT.type4-count
+++ b/tests/data/acpi/x86/q35/DSDT.type4-count
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.viot b/tests/data/acpi/x86/q35/DSDT.viot
index 4c93dfd..129d43e 100644
--- a/tests/data/acpi/x86/q35/DSDT.viot
+++ b/tests/data/acpi/x86/q35/DSDT.viot
Binary files differ
diff --git a/tests/data/acpi/x86/q35/DSDT.xapic b/tests/data/acpi/x86/q35/DSDT.xapic
index d4acd85..b37ab59 100644
--- a/tests/data/acpi/x86/q35/DSDT.xapic
+++ b/tests/data/acpi/x86/q35/DSDT.xapic
Binary files differ
diff --git a/tests/qtest/intel-iommu-test.c b/tests/qtest/intel-iommu-test.c
new file mode 100644
index 0000000..c521b37
--- /dev/null
+++ b/tests/qtest/intel-iommu-test.c
@@ -0,0 +1,64 @@
+/*
+ * QTest testcase for intel-iommu
+ *
+ * Copyright (c) 2024 Intel, Inc.
+ *
+ * Author: Zhenzhong Duan <zhenzhong.duan@intel.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "libqtest.h"
+#include "hw/i386/intel_iommu_internal.h"
+
+#define CAP_STAGE_1_FIXED1 (VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | \
+ VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS)
+#define ECAP_STAGE_1_FIXED1 (VTD_ECAP_QI | VTD_ECAP_IR | VTD_ECAP_IRO | \
+ VTD_ECAP_MHMV | VTD_ECAP_SMTS | VTD_ECAP_FLTS)
+
+static inline uint64_t vtd_reg_readq(QTestState *s, uint64_t offset)
+{
+ return qtest_readq(s, Q35_HOST_BRIDGE_IOMMU_ADDR + offset);
+}
+
+static void test_intel_iommu_stage_1(void)
+{
+ uint8_t init_csr[DMAR_REG_SIZE]; /* register values */
+ uint8_t post_reset_csr[DMAR_REG_SIZE]; /* register values */
+ uint64_t cap, ecap, tmp;
+ QTestState *s;
+
+ s = qtest_init("-M q35 -device intel-iommu,x-scalable-mode=on,x-flts=on");
+
+ cap = vtd_reg_readq(s, DMAR_CAP_REG);
+ g_assert((cap & CAP_STAGE_1_FIXED1) == CAP_STAGE_1_FIXED1);
+
+ tmp = cap & VTD_CAP_SAGAW_MASK;
+ g_assert(tmp == (VTD_CAP_SAGAW_39bit | VTD_CAP_SAGAW_48bit));
+
+ tmp = VTD_MGAW_FROM_CAP(cap);
+ g_assert(tmp == VTD_HOST_AW_48BIT - 1);
+
+ ecap = vtd_reg_readq(s, DMAR_ECAP_REG);
+ g_assert((ecap & ECAP_STAGE_1_FIXED1) == ECAP_STAGE_1_FIXED1);
+
+ qtest_memread(s, Q35_HOST_BRIDGE_IOMMU_ADDR, init_csr, DMAR_REG_SIZE);
+
+ qobject_unref(qtest_qmp(s, "{ 'execute': 'system_reset' }"));
+ qtest_qmp_eventwait(s, "RESET");
+
+ qtest_memread(s, Q35_HOST_BRIDGE_IOMMU_ADDR, post_reset_csr, DMAR_REG_SIZE);
+ /* Ensure registers are consistent after hard reset */
+ g_assert(!memcmp(init_csr, post_reset_csr, DMAR_REG_SIZE));
+
+ qtest_quit(s);
+}
+
+int main(int argc, char **argv)
+{
+ g_test_init(&argc, &argv, NULL);
+ qtest_add_func("/q35/intel-iommu/stage-1", test_intel_iommu_stage_1);
+
+ return g_test_run();
+}
diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index bf4d5c2..edd53ec 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -93,6 +93,7 @@ qtests_i386 = \
(config_all_devices.has_key('CONFIG_SB16') ? ['fuzz-sb16-test'] : []) + \
(config_all_devices.has_key('CONFIG_SDHCI_PCI') ? ['fuzz-sdcard-test'] : []) + \
(config_all_devices.has_key('CONFIG_ESP_PCI') ? ['am53c974-test'] : []) + \
+ (config_all_devices.has_key('CONFIG_VTD') ? ['intel-iommu-test'] : []) + \
(host_os != 'windows' and \
config_all_devices.has_key('CONFIG_ACPI_ERST') ? ['erst-test'] : []) + \
(config_all_devices.has_key('CONFIG_PCIE_PORT') and \