From e4bcec0c3c8527a3f2be2791fa6a89387b9116c1 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Mon, 4 Jul 2022 16:58:52 +0800 Subject: acpi/nvdimm: Define trace events for NVDIMM and substitute nvdimm_debug() Signed-off-by: Robert Hoo Reviewed-by: Jingqi Liu Message-Id: <20220704085852.330005-1-robert.hu@linux.intel.com> Reviewed-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/nvdimm.c | 35 ++++++++++++++++------------------- hw/acpi/trace-events | 13 +++++++++++++ 2 files changed, 29 insertions(+), 19 deletions(-) (limited to 'hw') diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 5f85b16..31e46df 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -35,6 +35,7 @@ #include "hw/nvram/fw_cfg.h" #include "hw/mem/nvdimm.h" #include "qemu/nvdimm-utils.h" +#include "trace.h" /* * define Byte Addressable Persistent Memory (PM) Region according to @@ -550,8 +551,8 @@ static void nvdimm_dsm_func_read_fit(NVDIMMState *state, NvdimmDsmIn *in, fit = fit_buf->fit; - nvdimm_debug("Read FIT: offset 0x%x FIT size 0x%x Dirty %s.\n", - read_fit->offset, fit->len, fit_buf->dirty ? "Yes" : "No"); + trace_acpi_nvdimm_read_fit(read_fit->offset, fit->len, + fit_buf->dirty ? "Yes" : "No"); if (read_fit->offset > fit->len) { func_ret_status = NVDIMM_DSM_RET_STATUS_INVALID; @@ -658,7 +659,7 @@ static void nvdimm_dsm_label_size(NVDIMMDevice *nvdimm, hwaddr dsm_mem_addr) label_size = nvdimm->label_size; mxfer = nvdimm_get_max_xfer_label_size(); - nvdimm_debug("label_size 0x%x, max_xfer 0x%x.\n", label_size, mxfer); + trace_acpi_nvdimm_label_info(label_size, mxfer); label_size_out.func_ret_status = cpu_to_le32(NVDIMM_DSM_RET_STATUS_SUCCESS); label_size_out.label_size = cpu_to_le32(label_size); @@ -674,20 +675,18 @@ static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm, uint32_t ret = NVDIMM_DSM_RET_STATUS_INVALID; if (offset + length < offset) { - nvdimm_debug("offset 0x%x + length 0x%x is overflow.\n", offset, - length); + trace_acpi_nvdimm_label_overflow(offset, length); return ret; } if (nvdimm->label_size < offset + length) { - nvdimm_debug("position 0x%x is beyond label data (len = %" PRIx64 ").\n", - offset + length, nvdimm->label_size); + trace_acpi_nvdimm_label_oversize(offset + length, nvdimm->label_size); return ret; } if (length > nvdimm_get_max_xfer_label_size()) { - nvdimm_debug("length (0x%x) is larger than max_xfer (0x%x).\n", - length, nvdimm_get_max_xfer_label_size()); + trace_acpi_nvdimm_label_xfer_exceed(length, + nvdimm_get_max_xfer_label_size()); return ret; } @@ -710,8 +709,8 @@ static void nvdimm_dsm_get_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, get_label_data->offset = le32_to_cpu(get_label_data->offset); get_label_data->length = le32_to_cpu(get_label_data->length); - nvdimm_debug("Read Label Data: offset 0x%x length 0x%x.\n", - get_label_data->offset, get_label_data->length); + trace_acpi_nvdimm_read_label(get_label_data->offset, + get_label_data->length); status = nvdimm_rw_label_data_check(nvdimm, get_label_data->offset, get_label_data->length); @@ -749,8 +748,8 @@ static void nvdimm_dsm_set_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, set_label_data->offset = le32_to_cpu(set_label_data->offset); set_label_data->length = le32_to_cpu(set_label_data->length); - nvdimm_debug("Write Label Data: offset 0x%x length 0x%x.\n", - set_label_data->offset, set_label_data->length); + trace_acpi_nvdimm_write_label(set_label_data->offset, + set_label_data->length); status = nvdimm_rw_label_data_check(nvdimm, 
set_label_data->offset, set_label_data->length); @@ -821,7 +820,7 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, hwaddr dsm_mem_addr) static uint64_t nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size) { - nvdimm_debug("BUG: we never read _DSM IO Port.\n"); + trace_acpi_nvdimm_read_io_port(); return 0; } @@ -832,7 +831,7 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) NvdimmDsmIn *in; hwaddr dsm_mem_addr = val; - nvdimm_debug("dsm memory address 0x%" HWADDR_PRIx ".\n", dsm_mem_addr); + trace_acpi_nvdimm_dsm_mem_addr(dsm_mem_addr); /* * The DSM memory is mapped to guest address space so an evil guest @@ -846,12 +845,10 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) in->function = le32_to_cpu(in->function); in->handle = le32_to_cpu(in->handle); - nvdimm_debug("Revision 0x%x Handler 0x%x Function 0x%x.\n", in->revision, - in->handle, in->function); + trace_acpi_nvdimm_dsm_info(in->revision, in->handle, in->function); if (in->revision != 0x1 /* Currently we only support DSM Spec Rev1. */) { - nvdimm_debug("Revision 0x%x is not supported, expect 0x%x.\n", - in->revision, 0x1); + trace_acpi_nvdimm_invalid_revision(in->revision); nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_UNSUPPORT, dsm_mem_addr); goto exit; } diff --git a/hw/acpi/trace-events b/hw/acpi/trace-events index 2250126..eb60b04 100644 --- a/hw/acpi/trace-events +++ b/hw/acpi/trace-events @@ -70,3 +70,16 @@ acpi_erst_reset_out(unsigned record_count) "record_count %u" acpi_erst_post_load(void *header, unsigned slot_size) "header: 0x%p slot_size %u" acpi_erst_class_init_in(void) acpi_erst_class_init_out(void) + +# nvdimm.c +acpi_nvdimm_read_fit(uint32_t offset, uint32_t len, const char *dirty) "Read FIT: offset 0x%" PRIx32 " FIT size 0x%" PRIx32 " Dirty %s" +acpi_nvdimm_label_info(uint32_t label_size, uint32_t mxfer) "label_size 0x%" PRIx32 ", max_xfer 0x%" PRIx32 +acpi_nvdimm_label_overflow(uint32_t offset, uint32_t length) "offset 0x%" PRIx32 " + length 0x%" PRIx32 " is overflow" +acpi_nvdimm_label_oversize(uint32_t pos, uint64_t size) "position 0x%" PRIx32 " is beyond label data (len = %" PRIu64 ")" +acpi_nvdimm_label_xfer_exceed(uint32_t length, uint32_t max_xfer) "length (0x%" PRIx32 ") is larger than max_xfer (0x%" PRIx32 ")" +acpi_nvdimm_read_label(uint32_t offset, uint32_t length) "Read Label Data: offset 0x%" PRIx32 " length 0x%" PRIx32 +acpi_nvdimm_write_label(uint32_t offset, uint32_t length) "Write Label Data: offset 0x%" PRIx32 " length 0x%" PRIx32 +acpi_nvdimm_read_io_port(void) "Alert: we never read _DSM IO Port" +acpi_nvdimm_dsm_mem_addr(uint64_t dsm_mem_addr) "dsm memory address 0x%" PRIx64 +acpi_nvdimm_dsm_info(uint32_t revision, uint32_t handle, uint32_t function) "Revision 0x%" PRIx32 " Handle 0x%" PRIx32 " Function 0x%" PRIx32 +acpi_nvdimm_invalid_revision(uint32_t revision) "Revision 0x%" PRIx32 " is not supported, expect 0x1" -- cgit v1.1 From 4a447a710c523392b09391232d5a3dfb156a9d75 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 1 Jul 2022 14:22:59 +0100 Subject: hw/i386/pc: Always place CXL Memory Regions after device_memory Previously broken_reserved_end was taken into account, but Igor Mammedov identified that this could lead to a clash between potential RAM being mapped in the region and CXL usage. Hence always add the size of the device_memory memory region. This only affects the case where the broken_reserved_end flag was set. 
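As a rough standalone illustration of the resulting placement (hypothetical sizes; the variable names and values below are invented for the example and are not the QEMU code itself):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical layout: device_memory based at 4G, sized 2G. */
    uint64_t device_memory_base = 0x100000000ULL; /* 4G */
    uint64_t device_memory_size = 0x80000000ULL;  /* 2G */

    /* After this patch the CXL window always starts past the whole
     * device_memory region, regardless of broken_reserved_end. */
    uint64_t cxl_base = device_memory_base + device_memory_size;

    printf("cxl_base = 0x%llx\n", (unsigned long long)cxl_base); /* 0x180000000 */
    return 0;
}

With broken_reserved_end set, the old code would have left cxl_base at 0x100000000, overlapping any RAM mapped into the device_memory region.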
Fixes: 6e4e3ae936e6 ("hw/cxl/component: Implement host bridge MMIO (8.2.5, table 142)") Reported-by: Igor Mammedov Signed-off-by: Jonathan Cameron Message-Id: <20220701132300.2264-3-Jonathan.Cameron@huawei.com> Acked-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index d2b5823..46ab1dc 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -922,10 +922,8 @@ void pc_memory_init(PCMachineState *pcms, hwaddr cxl_size = MiB; if (pcmc->has_reserved_memory && machine->device_memory->base) { - cxl_base = machine->device_memory->base; - if (!pcmc->broken_reserved_end) { - cxl_base += memory_region_size(&machine->device_memory->mr); - } + cxl_base = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); } else if (pcms->sgx_epc.size != 0) { cxl_base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { -- cgit v1.1 From 4ab4c33014b4876bc6d7888efecd6bfcca0d045a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:04 +0100 Subject: hw/i386: add 4g boundary start to X86MachineState Rather than hardcoding the 4G boundary everywhere, introduce a X86MachineState field @above_4g_mem_start and use it accordingly. This is in preparation for relocating ram-above-4g to dynamically start at 1T on AMD platforms. Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-2-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 2 +- hw/i386/pc.c | 11 ++++++----- hw/i386/sgx.c | 2 +- hw/i386/x86.c | 1 + 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'hw') diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index cad6f5a..0355bd3 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2024,7 +2024,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, mem_base, mem_len, i - 1, MEM_AFFINITY_ENABLED); } - mem_base = 1ULL << 32; + mem_base = x86ms->above_4g_mem_start; mem_len = next_base - x86ms->below_4g_mem_size; next_base = mem_base + mem_len; } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 46ab1dc..13b6830 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -850,9 +850,10 @@ void pc_memory_init(PCMachineState *pcms, machine->ram, x86ms->below_4g_mem_size, x86ms->above_4g_mem_size); - memory_region_add_subregion(system_memory, 0x100000000ULL, + memory_region_add_subregion(system_memory, x86ms->above_4g_mem_start, ram_above_4g); - e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + e820_add_entry(x86ms->above_4g_mem_start, x86ms->above_4g_mem_size, + E820_RAM); } if (pcms->sgx_epc.size != 0) { @@ -893,7 +894,7 @@ void pc_memory_init(PCMachineState *pcms, machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { machine->device_memory->base = - 0x100000000ULL + x86ms->above_4g_mem_size; + x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } machine->device_memory->base = @@ -927,7 +928,7 @@ void pc_memory_init(PCMachineState *pcms, } else if (pcms->sgx_epc.size != 0) { cxl_base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - cxl_base = 0x100000000ULL + x86ms->above_4g_mem_size; + cxl_base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } e820_add_entry(cxl_base, cxl_size, E820_RESERVED); @@ -1035,7 +1036,7 @@ uint64_t pc_pci_hole64_start(void) } else if (pcms->sgx_epc.size != 0) {
hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size; + hole64_start = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } return ROUND_UP(hole64_start, 1 * GiB); diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index a44d66b..09d9c7c 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -295,7 +295,7 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms) return; } - sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size; + sgx_epc->base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX); memory_region_add_subregion(get_system_memory(), sgx_epc->base, diff --git a/hw/i386/x86.c b/hw/i386/x86.c index ecea25d..050eedc 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -1391,6 +1391,7 @@ static void x86_machine_initfn(Object *obj) x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); x86ms->bus_lock_ratelimit = 0; + x86ms->above_4g_mem_start = 4 * GiB; } static void x86_machine_class_init(ObjectClass *oc, void *data) -- cgit v1.1 From 4876778749021a59a03dd3da3518dc861bc95ff8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:05 +0100 Subject: i386/pc: create pci-host qdev prior to pc_memory_init() At the start of pc_memory_init() we usually pass a range of 0..UINT64_MAX as pci_memory, when really it's 2G (i440fx) or 32G (q35). To get the real user value, we need to fetch the pci-host property that holds the default pci_hole64_size. To do that, create the qdev prior to memory init, so we can better estimate the max used/physical address. This is in preparation for determining that host-phys-bits are enough and also for considering pci-hole64-size when relocating ram-above-4g to 1T (on AMD platforms). Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-3-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc_piix.c | 7 +++++-- hw/i386/pc_q35.c | 6 +++--- hw/pci-host/i440fx.c | 5 ++--- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index fbf9465..b8b3ce3 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -91,6 +91,7 @@ static void pc_init1(MachineState *machine, MemoryRegion *pci_memory; MemoryRegion *rom_memory; ram_addr_t lowmem; + DeviceState *i440fx_host; /* * Calculate ram split, for memory below and above 4G. It's a bit @@ -164,9 +165,11 @@ static void pc_init1(MachineState *machine, pci_memory = g_new(MemoryRegion, 1); memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); rom_memory = pci_memory; + i440fx_host = qdev_new(host_type); } else { pci_memory = NULL; rom_memory = system_memory; + i440fx_host = NULL; } pc_guest_info_init(pcms); @@ -200,8 +203,8 @@ static void pc_init1(MachineState *machine, const char *type = xen_enabled() ?
TYPE_PIIX3_XEN_DEVICE : TYPE_PIIX3_DEVICE; - pci_bus = i440fx_init(host_type, - pci_type, + pci_bus = i440fx_init(pci_type, + i440fx_host, system_memory, system_io, machine->ram_size, x86ms->below_4g_mem_size, x86ms->above_4g_mem_size, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 12cc76a..f4d23b1 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -203,12 +203,12 @@ static void pc_q35_init(MachineState *machine) pcms->smbios_entry_point_type); } - /* allocate ram and load rom/bios */ - pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); - /* create pci host bus */ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + /* allocate ram and load rom/bios */ + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, OBJECT(ram_memory), NULL); diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c index 1c5ad5f..d5426ef 100644 --- a/hw/pci-host/i440fx.c +++ b/hw/pci-host/i440fx.c @@ -237,7 +237,8 @@ static void i440fx_realize(PCIDevice *dev, Error **errp) } } -PCIBus *i440fx_init(const char *host_type, const char *pci_type, +PCIBus *i440fx_init(const char *pci_type, + DeviceState *dev, MemoryRegion *address_space_mem, MemoryRegion *address_space_io, ram_addr_t ram_size, @@ -246,7 +247,6 @@ PCIBus *i440fx_init(const char *pci_type, MemoryRegion *pci_address_space, MemoryRegion *ram_memory) { - DeviceState *dev; PCIBus *b; PCIDevice *d; PCIHostState *s; @@ -254,7 +254,6 @@ PCIBus *i440fx_init(const char *pci_type, unsigned i; I440FXState *i440fx; - dev = qdev_new(host_type); s = PCI_HOST_BRIDGE(dev); b = pci_root_bus_new(dev, NULL, pci_address_space, address_space_io, 0, TYPE_PCI_BUS); -- cgit v1.1 From c48eb7a4e869f338e5b3e233fed7c9dbb0520247 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:06 +0100 Subject: i386/pc: pass pci_hole64_size to pc_memory_init() Use the pre-initialized pci-host qdev and fetch the pci-hole64-size into pc_memory_init()'s newly added argument. Use the PCI_HOST_PROP_PCI_HOLE64_SIZE pci-host property for fetching pci-hole64-size. This is in preparation for determining that host-phys-bits are enough and for considering pci-hole64-size when relocating ram-above-4g to 1T (on AMD platforms). Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-4-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/pc.c | 3 ++- hw/i386/pc_piix.c | 7 ++++++- hw/i386/pc_q35.c | 10 +++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 13b6830..f4d5b25 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -817,7 +817,8 @@ void xen_load_linux(PCMachineState *pcms) void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, - MemoryRegion **ram_memory) + MemoryRegion **ram_memory, + uint64_t pci_hole64_size) { int linux_boot, i; MemoryRegion *option_rom_mr; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index b8b3ce3..aa191d4 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -91,6 +91,7 @@ static void pc_init1(MachineState *machine, MemoryRegion *pci_memory; MemoryRegion *rom_memory; ram_addr_t lowmem; + uint64_t hole64_size; DeviceState *i440fx_host; /* @@ -166,10 +167,14 @@ static void pc_init1(MachineState *machine, memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); rom_memory = pci_memory; i440fx_host = qdev_new(host_type); + hole64_size = object_property_get_uint(OBJECT(i440fx_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); } else { pci_memory = NULL; rom_memory = system_memory; i440fx_host = NULL; + hole64_size = 0; } pc_guest_info_init(pcms); @@ -186,7 +191,7 @@ static void pc_init1(MachineState *machine, /* allocate ram and load rom/bios */ if (!xen_enabled()) { pc_memory_init(pcms, system_memory, - rom_memory, &ram_memory); + rom_memory, &ram_memory, hole64_size); } else { pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index f4d23b1..307910b 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -138,6 +138,7 @@ static void pc_q35_init(MachineState *machine) MachineClass *mc = MACHINE_GET_CLASS(machine); bool acpi_pcihp; bool keep_pci_slot_hpc; + uint64_t pci_hole64_size = 0; /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping @@ -206,8 +207,15 @@ static void pc_q35_init(MachineState *machine) /* create pci host bus */ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + if (pcmc->pci_enabled) { + pci_hole64_size = object_property_get_uint(OBJECT(q35_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); + } + /* allocate ram and load rom/bios */ - pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory, + pci_hole64_size); object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, -- cgit v1.1 From 5ff62e2afedf5c984048f90a881794622a9eb3e8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:07 +0100 Subject: i386/pc: factor out above-4g end to a helper There are a couple of places that duplicate this calculation of the RAM size above the 4G boundary. Move all those to a helper function. Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-5-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/pc.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index f4d5b25..d1e20cc 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -814,6 +814,17 @@ void xen_load_linux(PCMachineState *pcms) #define PC_ROM_ALIGN 0x800 #define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA) +static hwaddr pc_above_4g_end(PCMachineState *pcms) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + + if (pcms->sgx_epc.size != 0) { + return sgx_epc_above_4g_end(&pcms->sgx_epc); + } + + return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -891,15 +902,8 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } - if (pcms->sgx_epc.size != 0) { - machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); - } else { - machine->device_memory->base = - x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; - } - machine->device_memory->base = - ROUND_UP(machine->device_memory->base, 1 * GiB); + machine->device_memory->base = + ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); if (pcmc->enforce_aligned_dimm) { /* size device region assuming 1G page max alignment per slot */ @@ -926,10 +930,8 @@ void pc_memory_init(PCMachineState *pcms, if (pcmc->has_reserved_memory && machine->device_memory->base) { cxl_base = machine->device_memory->base + memory_region_size(&machine->device_memory->mr); - } else if (pcms->sgx_epc.size != 0) { - cxl_base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - cxl_base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; + cxl_base = pc_above_4g_end(pcms); } e820_add_entry(cxl_base, cxl_size, E820_RESERVED); @@ -1016,7 +1018,6 @@ uint64_t pc_pci_hole64_start(void) PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *ms = MACHINE(pcms); - X86MachineState *x86ms = X86_MACHINE(pcms); uint64_t hole64_start = 0; if (pcms->cxl_devices_state.host_mr.addr) { @@ -1034,10 +1035,8 @@ uint64_t pc_pci_hole64_start(void) if (!pcmc->broken_reserved_end) { hole64_start += memory_region_size(&ms->device_memory->mr); } - } else if (pcms->sgx_epc.size != 0) { - hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - hole64_start = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; + hole64_start = pc_above_4g_end(pcms); } return ROUND_UP(hole64_start, 1 * GiB); -- cgit v1.1 From 55668e409bcaf04e6af7acb44037d5f18c6b786e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:08 +0100 Subject: i386/pc: factor out cxl range end to helper Move the calculation of the CXL memory region end to a separate helper. This is in preparation for a future change that removes the CXL range dependency on the CXL memory region, with the goal of allowing pc_pci_hole64_start() to be called before any memory regions are initialized. Cc: Jonathan Cameron Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-6-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/pc.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index d1e20cc..cb27309 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -825,6 +825,25 @@ static hwaddr pc_above_4g_end(PCMachineState *pcms) return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } +static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) +{ + uint64_t start = 0; + + if (pcms->cxl_devices_state.host_mr.addr) { + start = pcms->cxl_devices_state.host_mr.addr + + memory_region_size(&pcms->cxl_devices_state.host_mr); + if (pcms->cxl_devices_state.fixed_windows) { + GList *it; + for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { + CXLFixedWindow *fw = it->data; + start = fw->mr.addr + memory_region_size(&fw->mr); + } + } + } + + return start; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -1020,16 +1039,8 @@ uint64_t pc_pci_hole64_start(void) MachineState *ms = MACHINE(pcms); uint64_t hole64_start = 0; - if (pcms->cxl_devices_state.host_mr.addr) { - hole64_start = pcms->cxl_devices_state.host_mr.addr + - memory_region_size(&pcms->cxl_devices_state.host_mr); - if (pcms->cxl_devices_state.fixed_windows) { - GList *it; - for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { - CXLFixedWindow *fw = it->data; - hole64_start = fw->mr.addr + memory_region_size(&fw->mr); - } - } + if (pcms->cxl_devices_state.is_enabled) { + hole64_start = pc_get_cxl_range_end(pcms); } else if (pcmc->has_reserved_memory && ms->device_memory->base) { hole64_start = ms->device_memory->base; if (!pcmc->broken_reserved_end) { -- cgit v1.1 From 42bed0712725ce338fc1cad725c3d9cf5cf443dc Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:09 +0100 Subject: i386/pc: factor out cxl range start to helper Factor out the calculation of the base address of the memory region. It will be used later on for the cxl range end counterpart calculation, as well as in the pc_memory_init() CXL memory region initialization, thus avoiding duplication. Cc: Jonathan Cameron Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-7-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/pc.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index cb27309..9e1a067 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -825,6 +825,22 @@ static hwaddr pc_above_4g_end(PCMachineState *pcms) return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } +static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + MachineState *machine = MACHINE(pcms); + hwaddr cxl_base; + + if (pcmc->has_reserved_memory && machine->device_memory->base) { + cxl_base = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); + } else { + cxl_base = pc_above_4g_end(pcms); + } + + return cxl_base; +} + static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) { uint64_t start = 0; @@ -946,13 +962,7 @@ void pc_memory_init(PCMachineState *pcms, MemoryRegion *mr = &pcms->cxl_devices_state.host_mr; hwaddr cxl_size = MiB; - if (pcmc->has_reserved_memory && machine->device_memory->base) { - cxl_base = machine->device_memory->base - + memory_region_size(&machine->device_memory->mr); - } else { - cxl_base = pc_above_4g_end(pcms); - } - + cxl_base = pc_get_cxl_range_start(pcms); e820_add_entry(cxl_base, cxl_size, E820_RESERVED); memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); memory_region_add_subregion(system_memory, cxl_base, mr); -- cgit v1.1 From 1065b21993628a00066a2ab1439bfff5d186c12d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:10 +0100 Subject: i386/pc: handle uninitialized mr in pc_get_cxl_range_end() Remove pc_get_cxl_range_end()'s dependency on the CXL memory region, and replace it with one that does not require the CXL host_mr to determine the start of the CXL range. This is in preparation for allowing pc_pci_hole64_start() to be called early in pc_memory_init(), handling the CXL memory region end when its underlying memory region isn't yet initialized. Cc: Jonathan Cameron Signed-off-by: Joao Martins Message-Id: <20220719170014.27028-8-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin Acked-by: Igor Mammedov --- hw/i386/pc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 9e1a067..611eb19 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -843,17 +843,15 @@ static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) { - uint64_t start = 0; + uint64_t start = pc_get_cxl_range_start(pcms) + MiB; - if (pcms->cxl_devices_state.host_mr.addr) { - start = pcms->cxl_devices_state.host_mr.addr + - memory_region_size(&pcms->cxl_devices_state.host_mr); - if (pcms->cxl_devices_state.fixed_windows) { - GList *it; - for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { - CXLFixedWindow *fw = it->data; - start = fw->mr.addr + memory_region_size(&fw->mr); - } + if (pcms->cxl_devices_state.fixed_windows) { + GList *it; + + start = ROUND_UP(start, 256 * MiB); + for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { + CXLFixedWindow *fw = it->data; + start += fw->size; } } -- cgit v1.1 From 8288a8286d00e074b765f1d47ee61159ed5291fd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:11 +0100 Subject: i386/pc: factor out device_memory base/size to helper Move obtaining hole64_start from the device_memory memory region base/size into a helper, alongside corresponding getters in pc_memory_init() for when the hotplug range is uninitialized. While doing that, remove the memory region based logic from this newly added helper. This is the final step that allows pc_pci_hole64_start() to be callable at the beginning of pc_memory_init() before any memory regions are initialized. Cc: Jonathan Cameron Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-9-joao.m.martins@oracle.com> Reviewed-by: Michael S.
Tsirkin --- hw/i386/pc.c | 46 +++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 611eb19..ebc27e4 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -825,15 +825,36 @@ static hwaddr pc_above_4g_end(PCMachineState *pcms) return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } -static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) +static void pc_get_device_memory_range(PCMachineState *pcms, + hwaddr *base, + ram_addr_t *device_mem_size) { PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *machine = MACHINE(pcms); + ram_addr_t size; + hwaddr addr; + + size = machine->maxram_size - machine->ram_size; + addr = ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); + + if (pcmc->enforce_aligned_dimm) { + /* size device region assuming 1G page max alignment per slot */ + size += (1 * GiB) * machine->ram_slots; + } + + *base = addr; + *device_mem_size = size; +} + +static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); hwaddr cxl_base; + ram_addr_t size; - if (pcmc->has_reserved_memory && machine->device_memory->base) { - cxl_base = machine->device_memory->base - + memory_region_size(&machine->device_memory->mr); + if (pcmc->has_reserved_memory) { + pc_get_device_memory_range(pcms, &cxl_base, &size); + cxl_base += size; } else { cxl_base = pc_above_4g_end(pcms); } @@ -920,7 +941,7 @@ void pc_memory_init(PCMachineState *pcms, /* initialize device memory address space */ if (pcmc->has_reserved_memory && (machine->ram_size < machine->maxram_size)) { - ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; + ram_addr_t device_mem_size; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -935,13 +956,7 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } - machine->device_memory->base = - ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); - - if (pcmc->enforce_aligned_dimm) { - /* size device region assuming 1G page max alignment per slot */ - device_mem_size += (1 * GiB) * machine->ram_slots; - } + pc_get_device_memory_range(pcms, &machine->device_memory->base, &device_mem_size); if ((machine->device_memory->base + device_mem_size) < device_mem_size) { @@ -1046,13 +1061,14 @@ uint64_t pc_pci_hole64_start(void) PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *ms = MACHINE(pcms); uint64_t hole64_start = 0; + ram_addr_t size = 0; if (pcms->cxl_devices_state.is_enabled) { hole64_start = pc_get_cxl_range_end(pcms); - } else if (pcmc->has_reserved_memory && ms->device_memory->base) { - hole64_start = ms->device_memory->base; + } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) { + pc_get_device_memory_range(pcms, &hole64_start, &size); if (!pcmc->broken_reserved_end) { - hole64_start += memory_region_size(&ms->device_memory->mr); + hole64_start += size; } } else { hole64_start = pc_above_4g_end(pcms); -- cgit v1.1 From 1caab5cf86bdf02fa71079c4ca4a3c0748f39eb5 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:12 +0100 Subject: i386/pc: bounds check phys-bits against max used GPA Calculate the max *used* GPA against the CPU's maximum possible address and error out if the former surpasses the latter. This ensures the max used GPA is reachable by the configured phys-bits.
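As a worked example of the arithmetic (hypothetical numbers mirroring the check in the diff below; an illustrative sketch, not the QEMU code itself):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical configuration for illustration only. */
    unsigned phys_bits = 40;                        /* TCG default */
    uint64_t maxphysaddr = (1ULL << phys_bits) - 1; /* 0xffffffffff */
    uint64_t hole64_start = 0x10000000000ULL;       /* hole64 at 1T */
    uint64_t pci_hole64_size = 32ULL << 30;         /* 32G hole64 */
    uint64_t maxusedaddr = hole64_start + pci_hole64_size - 1;

    if (maxphysaddr < maxusedaddr) {
        /* QEMU errors out in this case: phys-bits too low. */
        printf("0x%llx < 0x%llx: phys-bits too low (%u)\n",
               (unsigned long long)maxphysaddr,
               (unsigned long long)maxusedaddr, phys_bits);
    }
    return 0;
}

With these numbers maxusedaddr is 0x107ffffffff, which exceeds the 40-bit limit of 0xffffffffff, so the configuration is rejected.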
Default phys-bits on QEMU is TCG_PHYS_ADDR_BITS (40), which is enough for the CPU to address 1Tb (0xff ffff ffff) or 1010G (0xfc ffff ffff) in AMD hosts with IOMMU. This is preparation for AMD guests with >1010G, where we will want to relocate ram-above-4g to after 1Tb instead of 4G. Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-10-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index ebc27e4..56d8c17 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -879,6 +879,18 @@ static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) return start; } +static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) +{ + X86CPU *cpu = X86_CPU(first_cpu); + + /* 32-bit systems don't have hole64 thus return max CPU address */ + if (cpu->phys_bits <= 32) { + return ((hwaddr)1 << cpu->phys_bits) - 1; + } + + return pc_pci_hole64_start() + pci_hole64_size - 1; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -893,7 +905,9 @@ void pc_memory_init(PCMachineState *pcms, MachineClass *mc = MACHINE_GET_CLASS(machine); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); X86MachineState *x86ms = X86_MACHINE(pcms); + hwaddr maxphysaddr, maxusedaddr; hwaddr cxl_base, cxl_resv_end = 0; + X86CPU *cpu = X86_CPU(first_cpu); assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); @@ -901,6 +915,19 @@ void pc_memory_init(PCMachineState *pcms, linux_boot = (machine->kernel_filename != NULL); /* + * phys-bits is required to be appropriately configured + * to make sure max used GPA is reachable. + */ + maxusedaddr = pc_max_used_gpa(pcms, pci_hole64_size); + maxphysaddr = ((hwaddr)1 << cpu->phys_bits) - 1; + if (maxphysaddr < maxusedaddr) { + error_report("Address space limit 0x%"PRIx64" < 0x%"PRIx64 + " phys-bits too low (%u)", + maxphysaddr, maxusedaddr, cpu->phys_bits); + exit(EXIT_FAILURE); + } + + /* * Split single memory region and use aliases to address portions of it, * done for backwards compatibility with older qemus. */ -- cgit v1.1 From 8504f129450b909c88e199ca44facd35d38ba4de Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:13 +0100 Subject: i386/pc: relocate 4g start to 1T where applicable It is assumed that the whole GPA space is available to be DMA addressable, within a given address space limit, except for a tiny region before the 4G. Since Linux v5.4, VFIO validates whether the selected GPA is indeed valid, i.e. not reserved by the IOMMU on behalf of some specific devices or platform-defined restrictions, and thus fails the ioctl(VFIO_DMA_MAP) with -EINVAL otherwise. AMD systems with an IOMMU are an example of such a platform, and may allow only these ranges: 0000000000000000 - 00000000fedfffff (0 .. 3.982G) 00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G) 0000010000000000 - ffffffffffffffff (1Tb .. 16Pb[*]) We already account for the 4G hole, but if the guest is big enough we will fail to allocate a guest with >1010G due to the ~12G hole at the 1Tb boundary, reserved for HyperTransport (HT).
[*] there is another reserved region unrelated to HT at the 256T boundary in Fam 17h, according to Errata #1286, documented also in "Open-Source Register Reference for AMD Family 17h Processors (PUB)" When creating the region above 4G, take into account that on AMD platforms the HyperTransport range is reserved and hence cannot be used as GPAs either. In those cases, rather than establishing the start of ram-above-4g at 4G, relocate it instead to 1Tb. See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", for more information on the underlying restriction of IOVAs. After accounting for the 1Tb hole on AMD hosts, mtree should look like: 0000000000000000-000000007fffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-000000007fffffff 0000010000000000-000001ff7fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 0000000080000000-000000ffffffffff If the relocation is done or the address space covers it, we also add the reserved HT e820 range as reserved. Default phys-bits on QEMU is TCG_PHYS_ADDR_BITS (40), which is enough to address 1Tb (0xff ffff ffff). On AMD platforms, if a ram-above-4g relocation is attempted and the CPU wasn't configured with a big enough phys-bits, an error message will be printed due to the maxphysaddr vs maxusedaddr check previously added. Suggested-by: Igor Mammedov Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-11-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 56d8c17..1c5c9e1 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -891,6 +891,40 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) return pc_pci_hole64_start() + pci_hole64_size - 1; } +/* + * AMD systems with an IOMMU have an additional hole close to the + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending + * on kernel version, VFIO may or may not let you DMA map those ranges. + * Starting Linux v5.4 we validate it, and can't create guests on AMD machines + * with certain memory sizes. It's also wrong to use those IOVA ranges + * in detriment of leading to IOMMU INVALID_DEVICE_REQUEST or worse. + * The ranges reserved for Hyper-Transport are: + * + * FD_0000_0000h - FF_FFFF_FFFFh + * + * The ranges represent the following: + * + * Base Address Top Address Use + * + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK + * FD_F910_0000h FD_F91F_FFFFh System Management + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space + * FD_FE00_0000h FD_FFFF_FFFFh Configuration + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages + * FE_2000_0000h FF_FFFF_FFFFh Reserved + * + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", + * Table 3: Special Address Controls (GPA) for more information.
+ */ +#define AMD_HT_START 0xfd00000000UL +#define AMD_HT_END 0xffffffffffUL +#define AMD_ABOVE_1TB_START (AMD_HT_END + 1) +#define AMD_HT_SIZE (AMD_ABOVE_1TB_START - AMD_HT_START) + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -915,6 +949,26 @@ void pc_memory_init(PCMachineState *pcms, linux_boot = (machine->kernel_filename != NULL); /* + * The HyperTransport range close to the 1T boundary is unique to AMD + * hosts with IOMMUs enabled. Restrict the ram-above-4g relocation + * to above 1T to AMD vCPUs only. + */ + if (IS_AMD_CPU(&cpu->env)) { + /* Bail out if max possible address does not cross HT range */ + if (pc_max_used_gpa(pcms, pci_hole64_size) >= AMD_HT_START) { + x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START; + } + + /* + * Advertise the HT region if address space covers the reserved + * region or if we relocate. + */ + if (cpu->phys_bits >= 40) { + e820_add_entry(AMD_HT_START, AMD_HT_SIZE, E820_RESERVED); + } + } + + /* * phys-bits is required to be appropriately configured * to make sure max used GPA is reachable. */ -- cgit v1.1 From b3e6982b4154c1c0ab8b25f2e1ac7838a1809824 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:14 +0100 Subject: i386/pc: restrict AMD only enforcing of 1Tb hole to new machine type The added enforcing is only relevant in the case of AMD, where the range right before 1TB is restricted and cannot be DMA mapped by the kernel, consequently leading to IOMMU INVALID_DEVICE_REQUEST or possibly other kinds of IOMMU events in the AMD IOMMU. There is, however, a case where it may make sense to disable the IOVA relocation/validation: when migrating from a non-amd-1tb-aware QEMU to one that supports it. Relocating RAM regions to after the 1Tb hole has consequences for guest ABI because we are changing the memory mapping, so make sure that only new machine types enforce it, but not older ones. Signed-off-by: Joao Martins Acked-by: Dr. David Alan Gilbert Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-12-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 6 ++++-- hw/i386/pc_piix.c | 1 + hw/i386/pc_q35.c | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'hw') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 1c5c9e1..7280c02 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -951,9 +951,10 @@ void pc_memory_init(PCMachineState *pcms, /* * The HyperTransport range close to the 1T boundary is unique to AMD * hosts with IOMMUs enabled. Restrict the ram-above-4g relocation - * to above 1T to AMD vCPUs only. + * to above 1T to AMD vCPUs only. @enforce_amd_1tb_hole is only false in + * older machine types (<= 7.0) for compatibility purposes. */ - if (IS_AMD_CPU(&cpu->env)) { + if (IS_AMD_CPU(&cpu->env) && pcmc->enforce_amd_1tb_hole) { /* Bail out if max possible address does not cross HT range */ if (pc_max_used_gpa(pcms, pci_hole64_size) >= AMD_HT_START) { x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START; @@ -1902,6 +1903,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) pcmc->has_reserved_memory = true; pcmc->kvmclock_enabled = true; pcmc->enforce_aligned_dimm = true; + pcmc->enforce_amd_1tb_hole = true; /* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported * to be used at the moment, 32K should be enough for a while.
*/ pcmc->acpi_data_size = 0x20000 + 0x8000; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index aa191d4..a5c65c1 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -451,6 +451,7 @@ static void pc_i440fx_7_0_machine_options(MachineClass *m) m->alias = NULL; m->is_default = false; pcmc->legacy_no_rng_seed = true; + pcmc->enforce_amd_1tb_hole = false; compat_props_add(m->compat_props, hw_compat_7_0, hw_compat_7_0_len); compat_props_add(m->compat_props, pc_compat_7_0, pc_compat_7_0_len); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 307910b..3a35193 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -387,6 +387,7 @@ static void pc_q35_7_0_machine_options(MachineClass *m) pc_q35_7_1_machine_options(m); m->alias = NULL; pcmc->legacy_no_rng_seed = true; + pcmc->enforce_amd_1tb_hole = false; compat_props_add(m->compat_props, hw_compat_7_0, hw_compat_7_0_len); compat_props_add(m->compat_props, pc_compat_7_0, pc_compat_7_0_len); } -- cgit v1.1 From 0522be9a0c0094088ccef7aab352c57f483ca250 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 18 Jul 2022 14:56:37 +0100 Subject: hw/virtio/virtio-iommu: Enforce power-of-two notify for both MAP and UNMAP Currently we only enforce power-of-two mappings (required by the QEMU notifier) for UNMAP requests. A MAP request not aligned on a power-of-two may be successfully handled by VFIO, and then the corresponding UNMAP notify will fail because it will attempt to split that mapping. Ensure MAP and UNMAP notifications are consistent. Fixes: dde3f08b5cab ("virtio-iommu: Handle non power of 2 range invalidations") Reported-by: Tina Zhang Signed-off-by: Jean-Philippe Brucker Message-Id: <20220718135636.338264-1-jean-philippe@linaro.org> Tested-by: Tina Zhang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/virtio/virtio-iommu.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'hw') diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 281152d..62e07ec 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -197,6 +197,32 @@ static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data) } } +static void virtio_iommu_notify_map_unmap(IOMMUMemoryRegion *mr, + IOMMUTLBEvent *event, + hwaddr virt_start, hwaddr virt_end) +{ + uint64_t delta = virt_end - virt_start; + + event->entry.iova = virt_start; + event->entry.addr_mask = delta; + + if (delta == UINT64_MAX) { + memory_region_notify_iommu(mr, 0, *event); + } + + while (virt_start != virt_end + 1) { + uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64); + + event->entry.addr_mask = mask; + event->entry.iova = virt_start; + memory_region_notify_iommu(mr, 0, *event); + virt_start += mask + 1; + if (event->entry.perm != IOMMU_NONE) { + event->entry.translated_addr += mask + 1; + } + } +} + static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, hwaddr virt_end, hwaddr paddr, uint32_t flags) @@ -215,19 +241,16 @@ static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, event.type = IOMMU_NOTIFIER_MAP; event.entry.target_as = &address_space_memory; - event.entry.addr_mask = virt_end - virt_start; - event.entry.iova = virt_start; event.entry.perm = perm; event.entry.translated_addr = paddr; - memory_region_notify_iommu(mr, 0, event); + virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end); } static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, hwaddr virt_end) { IOMMUTLBEvent event; - uint64_t delta = virt_end - virt_start; if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) { return; @@ -239,22 +262,8 @@ static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, event.entry.target_as = &address_space_memory; event.entry.perm = IOMMU_NONE; event.entry.translated_addr = 0; - event.entry.addr_mask = delta; - event.entry.iova = virt_start; - - if (delta == UINT64_MAX) { - memory_region_notify_iommu(mr, 0, event); - } - - while (virt_start != virt_end + 1) { - uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64); - - event.entry.addr_mask = mask; - event.entry.iova = virt_start; - memory_region_notify_iommu(mr, 0, event); - virt_start += mask + 1; - } + virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end); } static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value, -- cgit v1.1
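For reference, the splitting loop in virtio_iommu_notify_map_unmap() relies on dma_aligned_pow2_mask() to pick, at each step, the largest power-of-two chunk that is aligned at the current start and fits in the remaining range. A simplified, self-contained sketch of that logic follows (an illustrative re-implementation, not QEMU's util function):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for QEMU's dma_aligned_pow2_mask(): mask of the
 * largest power-of-two chunk starting at 'start' that stays within
 * the inclusive range [start, end]. */
static uint64_t aligned_pow2_mask(uint64_t start, uint64_t end)
{
    uint64_t size_mask = end - start; /* range length minus one */
    uint64_t align_mask = start ? (start & -start) - 1 : UINT64_MAX;

    if (align_mask <= size_mask) {
        return align_mask; /* limited by the alignment of start */
    }
    /* Limited by size: keep only the highest set bit of the length. */
    uint64_t chunk = size_mask + 1;
    while (chunk & (chunk - 1)) {
        chunk &= chunk - 1; /* clear the lowest set bit */
    }
    return chunk - 1;
}

int main(void)
{
    /* Hypothetical non-power-of-two invalidation: [0x1000, 0x4fff]. */
    uint64_t virt_start = 0x1000, virt_end = 0x4fff;

    while (virt_start != virt_end + 1) {
        uint64_t mask = aligned_pow2_mask(virt_start, virt_end);
        printf("notify iova=0x%llx addr_mask=0x%llx\n",
               (unsigned long long)virt_start, (unsigned long long)mask);
        virt_start += mask + 1;
    }
    return 0;
}

The 16K range is emitted as three power-of-two notifications (4K at 0x1000, 8K at 0x2000, 4K at 0x4000), which is what both MAP and UNMAP notifiers now receive after this patch.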