diff options
36 files changed, 1146 insertions, 419 deletions
@@ -50,6 +50,7 @@ #include "qapi-event.h" #include "hw/nmi.h" #include "sysemu/replay.h" +#include "hw/boards.h" #ifdef CONFIG_LINUX @@ -1865,6 +1866,8 @@ void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) CpuInfoList *qmp_query_cpus(Error **errp) { + MachineState *ms = MACHINE(qdev_get_machine()); + MachineClass *mc = MACHINE_GET_CLASS(ms); CpuInfoList *head = NULL, *cur_item = NULL; CPUState *cpu; @@ -1915,6 +1918,13 @@ CpuInfoList *qmp_query_cpus(Error **errp) #else info->value->arch = CPU_INFO_ARCH_OTHER; #endif + info->value->has_props = !!mc->cpu_index_to_instance_props; + if (info->value->has_props) { + CpuInstanceProperties *props; + props = g_malloc0(sizeof(*props)); + *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index); + info->value->props = props; + } /* XXX: waiting for the qapi to support GSList */ if (!cur_item) { diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index c6f2032..be496c8 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -24,6 +24,7 @@ #include "hw/acpi/aml-build.h" #include "qemu/bswap.h" #include "qemu/bitops.h" +#include "sysemu/numa.h" static GArray *build_alloc_array(void) { @@ -1609,3 +1610,28 @@ void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base, numamem->base_addr = cpu_to_le64(base); numamem->range_length = cpu_to_le64(len); } + +/* + * ACPI spec 5.2.17 System Locality Distance Information Table + * (Revision 2.0 or later) + */ +void build_slit(GArray *table_data, BIOSLinker *linker) +{ + int slit_start, i, j; + slit_start = table_data->len; + + acpi_data_push(table_data, sizeof(AcpiTableHeader)); + + build_append_int_noprefix(table_data, nb_numa_nodes, 8); + for (i = 0; i < nb_numa_nodes; i++) { + for (j = 0; j < nb_numa_nodes; j++) { + assert(numa_info[i].distance[j]); + build_append_int_noprefix(table_data, numa_info[i].distance[j], 1); + } + } + + build_header(linker, table_data, + (void *)(table_data->data + slit_start), + "SLIT", + table_data->len - slit_start, 1, NULL, NULL); +} diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 8c719d3..a233fe1 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -503,7 +503,6 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, /* build Processor object for each processor */ for (i = 0; i < arch_ids->len; i++) { - int j; Aml *dev; Aml *uid = aml_int(i); GArray *madt_buf = g_array_new(0, 1, 1); @@ -557,9 +556,9 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, * as a result _PXM is required for all CPUs which might * be hot-plugged. For simplicity, add it for all CPUs. */ - j = numa_get_node_for_cpu(i); - if (j < nb_numa_nodes) { - aml_append(dev, aml_name_decl("_PXM", aml_int(j))); + if (arch_ids->cpus[i].props.has_node_id) { + aml_append(dev, aml_name_decl("_PXM", + aml_int(arch_ids->cpus[i].props.node_id))); } aml_append(cpus_dev, dev); diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 0835e59..ce7499c 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -486,30 +486,25 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) AcpiSystemResourceAffinityTable *srat; AcpiSratProcessorGiccAffinity *core; AcpiSratMemoryAffinity *numamem; - int i, j, srat_start; + int i, srat_start; uint64_t mem_base; - uint32_t *cpu_node = g_malloc0(vms->smp_cpus * sizeof(uint32_t)); - - for (i = 0; i < vms->smp_cpus; i++) { - j = numa_get_node_for_cpu(i); - if (j < nb_numa_nodes) { - cpu_node[i] = j; - } - } + MachineClass *mc = MACHINE_GET_CLASS(vms); + const CPUArchIdList *cpu_list = mc->possible_cpu_arch_ids(MACHINE(vms)); srat_start = table_data->len; srat = acpi_data_push(table_data, sizeof(*srat)); srat->reserved1 = cpu_to_le32(1); - for (i = 0; i < vms->smp_cpus; ++i) { + for (i = 0; i < cpu_list->len; ++i) { + int node_id = cpu_list->cpus[i].props.has_node_id ? + cpu_list->cpus[i].props.node_id : 0; core = acpi_data_push(table_data, sizeof(*core)); core->type = ACPI_SRAT_PROCESSOR_GICC; core->length = sizeof(*core); - core->proximity = cpu_to_le32(cpu_node[i]); + core->proximity = cpu_to_le32(node_id); core->acpi_processor_uid = cpu_to_le32(i); core->flags = cpu_to_le32(1); } - g_free(cpu_node); mem_base = vms->memmap[VIRT_MEM].base; for (i = 0; i < nb_numa_nodes; ++i) { diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 5f62a03..c7c8159 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -338,7 +338,7 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) { int cpu; int addr_cells = 1; - unsigned int i; + const MachineState *ms = MACHINE(vms); /* * From Documentation/devicetree/bindings/arm/cpus.txt @@ -369,6 +369,7 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) for (cpu = vms->smp_cpus - 1; cpu >= 0; cpu--) { char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu); ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); + CPUState *cs = CPU(armcpu); qemu_fdt_add_subnode(vms->fdt, nodename); qemu_fdt_setprop_string(vms->fdt, nodename, "device_type", "cpu"); @@ -389,9 +390,9 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) armcpu->mp_affinity); } - i = numa_get_node_for_cpu(cpu); - if (i < nb_numa_nodes) { - qemu_fdt_setprop_cell(vms->fdt, nodename, "numa-node-id", i); + if (ms->possible_cpus->cpus[cs->cpu_index].props.has_node_id) { + qemu_fdt_setprop_cell(vms->fdt, nodename, "numa-node-id", + ms->possible_cpus->cpus[cs->cpu_index].props.node_id); } g_free(nodename); @@ -1194,10 +1195,35 @@ void virt_machine_done(Notifier *notifier, void *data) virt_build_smbios(vms); } +static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx) +{ + uint8_t clustersz = ARM_DEFAULT_CPUS_PER_CLUSTER; + VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); + + if (!vmc->disallow_affinity_adjustment) { + /* Adjust MPIDR like 64-bit KVM hosts, which incorporate the + * GIC's target-list limitations. 32-bit KVM hosts currently + * always create clusters of 4 CPUs, but that is expected to + * change when they gain support for gicv3. When KVM is enabled + * it will override the changes we make here, therefore our + * purposes are to make TCG consistent (with 64-bit KVM hosts) + * and to improve SGI efficiency. + */ + if (vms->gic_version == 3) { + clustersz = GICV3_TARGETLIST_BITS; + } else { + clustersz = GIC_TARGETLIST_BITS; + } + } + return arm_cpu_mp_affinity(idx, clustersz); +} + static void machvirt_init(MachineState *machine) { VirtMachineState *vms = VIRT_MACHINE(machine); VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(machine); + MachineClass *mc = MACHINE_GET_CLASS(machine); + const CPUArchIdList *possible_cpus; qemu_irq pic[NUM_IRQS]; MemoryRegion *sysmem = get_system_memory(); MemoryRegion *secure_sysmem = NULL; @@ -1210,7 +1236,6 @@ static void machvirt_init(MachineState *machine) CPUClass *cc; Error *err = NULL; bool firmware_loaded = bios_name || drive_get(IF_PFLASH, 0, 0); - uint8_t clustersz; if (!cpu_model) { cpu_model = "cortex-a15"; @@ -1263,10 +1288,8 @@ static void machvirt_init(MachineState *machine) */ if (vms->gic_version == 3) { virt_max_cpus = vms->memmap[VIRT_GIC_REDIST].size / 0x20000; - clustersz = GICV3_TARGETLIST_BITS; } else { virt_max_cpus = GIC_NCPU; - clustersz = GIC_TARGETLIST_BITS; } if (max_cpus > virt_max_cpus) { @@ -1324,21 +1347,35 @@ static void machvirt_init(MachineState *machine) exit(1); } - for (n = 0; n < smp_cpus; n++) { - Object *cpuobj = object_new(typename); - if (!vmc->disallow_affinity_adjustment) { - /* Adjust MPIDR like 64-bit KVM hosts, which incorporate the - * GIC's target-list limitations. 32-bit KVM hosts currently - * always create clusters of 4 CPUs, but that is expected to - * change when they gain support for gicv3. When KVM is enabled - * it will override the changes we make here, therefore our - * purposes are to make TCG consistent (with 64-bit KVM hosts) - * and to improve SGI efficiency. - */ - uint8_t aff1 = n / clustersz; - uint8_t aff0 = n % clustersz; - object_property_set_int(cpuobj, (aff1 << ARM_AFF1_SHIFT) | aff0, - "mp-affinity", NULL); + possible_cpus = mc->possible_cpu_arch_ids(machine); + for (n = 0; n < possible_cpus->len; n++) { + Object *cpuobj; + CPUState *cs; + int node_id; + + if (n >= smp_cpus) { + break; + } + + cpuobj = object_new(typename); + object_property_set_int(cpuobj, possible_cpus->cpus[n].arch_id, + "mp-affinity", NULL); + + cs = CPU(cpuobj); + cs->cpu_index = n; + + node_id = possible_cpus->cpus[cs->cpu_index].props.node_id; + if (!possible_cpus->cpus[cs->cpu_index].props.has_node_id) { + /* by default CPUState::numa_node was 0 if it's not set via CLI + * keep it this way for now but in future we probably should + * refuse to start up with incomplete numa mapping */ + node_id = 0; + } + if (cs->numa_node == CPU_UNSET_NUMA_NODE_ID) { + cs->numa_node = node_id; + } else { + /* CPU isn't device_add compatible yet, this shouldn't happen */ + error_setg(&error_abort, "user set node-id not implemented"); } if (!vms->secure) { @@ -1518,6 +1555,46 @@ static void virt_set_gic_version(Object *obj, const char *value, Error **errp) } } +static CpuInstanceProperties +virt_cpu_index_to_props(MachineState *ms, unsigned cpu_index) +{ + MachineClass *mc = MACHINE_GET_CLASS(ms); + const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); + + assert(cpu_index < possible_cpus->len); + return possible_cpus->cpus[cpu_index].props; +} + +static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) +{ + int n; + VirtMachineState *vms = VIRT_MACHINE(ms); + + if (ms->possible_cpus) { + assert(ms->possible_cpus->len == max_cpus); + return ms->possible_cpus; + } + + ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + + sizeof(CPUArchId) * max_cpus); + ms->possible_cpus->len = max_cpus; + for (n = 0; n < ms->possible_cpus->len; n++) { + ms->possible_cpus->cpus[n].arch_id = + virt_cpu_mp_affinity(vms, n); + ms->possible_cpus->cpus[n].props.has_thread_id = true; + ms->possible_cpus->cpus[n].props.thread_id = n; + + /* default distribution of CPUs over NUMA nodes */ + if (nb_numa_nodes) { + /* preset values but do not enable them i.e. 'has_node_id = false', + * numa init code will enable them later if manual mapping wasn't + * present on CLI */ + ms->possible_cpus->cpus[n].props.node_id = n % nb_numa_nodes; + } + } + return ms->possible_cpus; +} + static void virt_machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); @@ -1534,6 +1611,8 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) mc->pci_allow_0_address = true; /* We know we will never create a pre-ARMv7 CPU which needs 1K pages */ mc->minimum_page_bits = 12; + mc->possible_cpu_arch_ids = virt_possible_cpu_arch_ids; + mc->cpu_index_to_instance_props = virt_cpu_index_to_props; } static const TypeInfo virt_machine_info = { diff --git a/hw/core/machine.c b/hw/core/machine.c index ada9eea..fd6a436 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -17,8 +17,10 @@ #include "qapi/visitor.h" #include "hw/sysbus.h" #include "sysemu/sysemu.h" +#include "sysemu/numa.h" #include "qemu/error-report.h" #include "qemu/cutils.h" +#include "sysemu/numa.h" static char *machine_get_accel(Object *obj, Error **errp) { @@ -388,6 +390,102 @@ HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine) return head; } +/** + * machine_set_cpu_numa_node: + * @machine: machine object to modify + * @props: specifies which cpu objects to assign to + * numa node specified by @props.node_id + * @errp: if an error occurs, a pointer to an area to store the error + * + * Associate NUMA node specified by @props.node_id with cpu slots that + * match socket/core/thread-ids specified by @props. It's recommended to use + * query-hotpluggable-cpus.props values to specify affected cpu slots, + * which would lead to exact 1:1 mapping of cpu slots to NUMA node. + * + * However for CLI convenience it's possible to pass in subset of properties, + * which would affect all cpu slots that match it. + * Ex for pc machine: + * -smp 4,cores=2,sockets=2 -numa node,nodeid=0 -numa node,nodeid=1 \ + * -numa cpu,node-id=0,socket_id=0 \ + * -numa cpu,node-id=1,socket_id=1 + * will assign all child cores of socket 0 to node 0 and + * of socket 1 to node 1. + * + * On attempt of reassigning (already assigned) cpu slot to another NUMA node, + * return error. + * Empty subset is disallowed and function will return with error in this case. + */ +void machine_set_cpu_numa_node(MachineState *machine, + const CpuInstanceProperties *props, Error **errp) +{ + MachineClass *mc = MACHINE_GET_CLASS(machine); + bool match = false; + int i; + + if (!mc->possible_cpu_arch_ids) { + error_setg(errp, "mapping of CPUs to NUMA node is not supported"); + return; + } + + /* disabling node mapping is not supported, forbid it */ + assert(props->has_node_id); + + /* force board to initialize possible_cpus if it hasn't been done yet */ + mc->possible_cpu_arch_ids(machine); + + for (i = 0; i < machine->possible_cpus->len; i++) { + CPUArchId *slot = &machine->possible_cpus->cpus[i]; + + /* reject unsupported by board properties */ + if (props->has_thread_id && !slot->props.has_thread_id) { + error_setg(errp, "thread-id is not supported"); + return; + } + + if (props->has_core_id && !slot->props.has_core_id) { + error_setg(errp, "core-id is not supported"); + return; + } + + if (props->has_socket_id && !slot->props.has_socket_id) { + error_setg(errp, "socket-id is not supported"); + return; + } + + /* skip slots with explicit mismatch */ + if (props->has_thread_id && props->thread_id != slot->props.thread_id) { + continue; + } + + if (props->has_core_id && props->core_id != slot->props.core_id) { + continue; + } + + if (props->has_socket_id && props->socket_id != slot->props.socket_id) { + continue; + } + + /* reject assignment if slot is already assigned, for compatibility + * of legacy cpu_index mapping with SPAPR core based mapping do not + * error out if cpu thread and matched core have the same node-id */ + if (slot->props.has_node_id && + slot->props.node_id != props->node_id) { + error_setg(errp, "CPU is already assigned to node-id: %" PRId64, + slot->props.node_id); + return; + } + + /* assign slot to node as it's matched '-numa cpu' key */ + match = true; + slot->props.node_id = props->node_id; + slot->props.has_node_id = props->has_node_id; + } + + if (!match) { + error_setg(errp, "no match found"); + } +} + static void machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); @@ -400,6 +498,7 @@ static void machine_class_init(ObjectClass *oc, void *data) * On Linux, each node's border has to be 8MB aligned */ mc->numa_mem_align_shift = 23; + mc->numa_auto_assign_ram = numa_default_auto_assign_ram; object_class_property_add_str(oc, "accel", machine_get_accel, machine_set_accel, &error_abort); @@ -580,6 +679,69 @@ bool machine_mem_merge(MachineState *machine) return machine->mem_merge; } +static char *cpu_slot_to_string(const CPUArchId *cpu) +{ + GString *s = g_string_new(NULL); + if (cpu->props.has_socket_id) { + g_string_append_printf(s, "socket-id: %"PRId64, cpu->props.socket_id); + } + if (cpu->props.has_core_id) { + if (s->len) { + g_string_append_printf(s, ", "); + } + g_string_append_printf(s, "core-id: %"PRId64, cpu->props.core_id); + } + if (cpu->props.has_thread_id) { + if (s->len) { + g_string_append_printf(s, ", "); + } + g_string_append_printf(s, "thread-id: %"PRId64, cpu->props.thread_id); + } + return g_string_free(s, false); +} + +static void machine_numa_validate(MachineState *machine) +{ + int i; + GString *s = g_string_new(NULL); + MachineClass *mc = MACHINE_GET_CLASS(machine); + const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(machine); + + assert(nb_numa_nodes); + for (i = 0; i < possible_cpus->len; i++) { + const CPUArchId *cpu_slot = &possible_cpus->cpus[i]; + + /* at this point numa mappings are initilized by CLI options + * or with default mappings so it's sufficient to list + * all not yet mapped CPUs here */ + /* TODO: make it hard error in future */ + if (!cpu_slot->props.has_node_id) { + char *cpu_str = cpu_slot_to_string(cpu_slot); + g_string_append_printf(s, "%sCPU %d [%s]", s->len ? ", " : "", i, + cpu_str); + g_free(cpu_str); + } + } + if (s->len) { + error_report("warning: CPU(s) not present in any NUMA nodes: %s", + s->str); + error_report("warning: All CPU(s) up to maxcpus should be described " + "in NUMA config, ability to start up with partial NUMA " + "mappings is obsoleted and will be removed in future"); + } + g_string_free(s, true); +} + +void machine_run_board_init(MachineState *machine) +{ + MachineClass *machine_class = MACHINE_GET_CLASS(machine); + + if (nb_numa_nodes) { + machine_numa_validate(machine); + } + machine_class->init(machine); +} + static void machine_class_finalize(ObjectClass *klass, void *data) { MachineClass *mc = MACHINE_CLASS(klass); diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 1d8c645..cc0418f 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2335,7 +2335,8 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) srat->reserved1 = cpu_to_le32(1); for (i = 0; i < apic_ids->len; i++) { - int j = numa_get_node_for_cpu(i); + int node_id = apic_ids->cpus[i].props.has_node_id ? + apic_ids->cpus[i].props.node_id : 0; uint32_t apic_id = apic_ids->cpus[i].arch_id; if (apic_id < 255) { @@ -2345,9 +2346,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) core->type = ACPI_SRAT_PROCESSOR_APIC; core->length = sizeof(*core); core->local_apic_id = apic_id; - if (j < nb_numa_nodes) { - core->proximity_lo = j; - } + core->proximity_lo = node_id; memset(core->proximity_hi, 0, 3); core->local_sapic_eid = 0; core->flags = cpu_to_le32(1); @@ -2358,9 +2357,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) core->type = ACPI_SRAT_PROCESSOR_x2APIC; core->length = sizeof(*core); core->x2apic_id = cpu_to_le32(apic_id); - if (j < nb_numa_nodes) { - core->proximity_domain = cpu_to_le32(j); - } + core->proximity_domain = cpu_to_le32(node_id); core->flags = cpu_to_le32(1); } } @@ -2707,6 +2704,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) if (pcms->numa_nodes) { acpi_add_table(table_offsets, tables_blob); build_srat(tables_blob, tables->linker, machine); + if (have_numa_distance) { + acpi_add_table(table_offsets, tables_blob); + build_slit(tables_blob, tables->linker); + } } if (acpi_get_mcfg(&mcfg)) { acpi_add_table(table_offsets, tables_blob); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index f3b372a..e36a375 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -747,7 +747,9 @@ static FWCfgState *bochs_bios_init(AddressSpace *as, PCMachineState *pcms) { FWCfgState *fw_cfg; uint64_t *numa_fw_cfg; - int i, j; + int i; + const CPUArchIdList *cpus; + MachineClass *mc = MACHINE_GET_CLASS(pcms); fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4, as); fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, pcms->boot_cpus); @@ -782,12 +784,12 @@ static FWCfgState *bochs_bios_init(AddressSpace *as, PCMachineState *pcms) */ numa_fw_cfg = g_new0(uint64_t, 1 + pcms->apic_id_limit + nb_numa_nodes); numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); - for (i = 0; i < max_cpus; i++) { - unsigned int apic_id = x86_cpu_apic_id_from_index(i); + cpus = mc->possible_cpu_arch_ids(MACHINE(pcms)); + for (i = 0; i < cpus->len; i++) { + unsigned int apic_id = cpus->cpus[i].arch_id; assert(apic_id < pcms->apic_id_limit); - j = numa_get_node_for_cpu(i); - if (j < nb_numa_nodes) { - numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); + if (cpus->cpus[i].props.has_node_id) { + numa_fw_cfg[apic_id + 1] = cpu_to_le64(cpus->cpus[i].props.node_id); } } for (i = 0; i < nb_numa_nodes; i++) { @@ -1893,6 +1895,7 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { int idx; + int node_id; CPUState *cs; CPUArchId *cpu_slot; X86CPUTopoInfo topo; @@ -1982,6 +1985,22 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev, cs = CPU(cpu); cs->cpu_index = idx; + + node_id = cpu_slot->props.node_id; + if (!cpu_slot->props.has_node_id) { + /* by default CPUState::numa_node was 0 if it's not set via CLI + * keep it this way for now but in future we probably should + * refuse to start up with incomplete numa mapping */ + node_id = 0; + } + if (cs->numa_node == CPU_UNSET_NUMA_NODE_ID) { + cs->numa_node = node_id; + } else if (cs->numa_node != node_id) { + error_setg(errp, "node-id %d must match numa node specified" + "with -numa option for cpu-index %d", + cs->numa_node, cs->cpu_index); + return; + } } static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev, @@ -2243,12 +2262,14 @@ static void pc_machine_reset(void) } } -static unsigned pc_cpu_index_to_socket_id(unsigned cpu_index) +static CpuInstanceProperties +pc_cpu_index_to_props(MachineState *ms, unsigned cpu_index) { - X86CPUTopoInfo topo; - x86_topo_ids_from_idx(smp_cores, smp_threads, cpu_index, - &topo); - return topo.pkg_id; + MachineClass *mc = MACHINE_GET_CLASS(ms); + const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms); + + assert(cpu_index < possible_cpus->len); + return possible_cpus->cpus[cpu_index].props; } static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms) @@ -2280,6 +2301,15 @@ static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms) ms->possible_cpus->cpus[i].props.core_id = topo.core_id; ms->possible_cpus->cpus[i].props.has_thread_id = true; ms->possible_cpus->cpus[i].props.thread_id = topo.smt_id; + + /* default distribution of CPUs over NUMA nodes */ + if (nb_numa_nodes) { + /* preset values but do not enable them i.e. 'has_node_id = false', + * numa init code will enable them later if manual mapping wasn't + * present on CLI */ + ms->possible_cpus->cpus[i].props.node_id = + topo.pkg_id % nb_numa_nodes; + } } return ms->possible_cpus; } @@ -2322,7 +2352,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) pcmc->acpi_data_size = 0x20000 + 0x8000; pcmc->save_tsc_khz = true; mc->get_hotplug_handler = pc_get_hotpug_handler; - mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id; + mc->cpu_index_to_instance_props = pc_cpu_index_to_props; mc->possible_cpu_arch_ids = pc_possible_cpu_arch_ids; mc->has_hotpluggable_cpus = true; mc->default_boot_order = "cad"; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9f102aa..d468b96 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -54,6 +54,7 @@ #endif #include "migration/migration.h" #include "kvm_i386.h" +#include "sysemu/numa.h" #define MAX_IDE_BUS 2 @@ -442,6 +443,7 @@ static void pc_i440fx_2_9_machine_options(MachineClass *m) pc_i440fx_machine_options(m); m->alias = "pc"; m->is_default = 1; + m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; } DEFINE_I440FX_MACHINE(v2_9, "pc-i440fx-2.9", NULL, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index dd792a8..66303a7 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -47,6 +47,7 @@ #include "hw/usb.h" #include "qemu/error-report.h" #include "migration/migration.h" +#include "sysemu/numa.h" /* ICH9 AHCI has 6 ports */ #define MAX_SATA_PORTS 6 @@ -305,6 +306,7 @@ static void pc_q35_2_9_machine_options(MachineClass *m) { pc_q35_machine_options(m); m->alias = "q35"; + m->numa_auto_assign_ram = numa_legacy_auto_assign_ram; } DEFINE_Q35_MACHINE(v2_9, "pc-q35-2.9", NULL, diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 1b7cada..0980d73 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2831,9 +2831,11 @@ static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); Error *local_err = NULL; CPUCore *cc = CPU_CORE(dev); + sPAPRCPUCore *sc = SPAPR_CPU_CORE(dev); char *base_core_type = spapr_get_cpu_core_type(machine->cpu_model); const char *type = object_get_typename(OBJECT(dev)); CPUArchId *core_slot; + int node_id; int index; if (dev->hotplugged && !mc->has_hotpluggable_cpus) { @@ -2868,6 +2870,21 @@ static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, goto out; } + node_id = core_slot->props.node_id; + if (!core_slot->props.has_node_id) { + /* by default CPUState::numa_node was 0 if it's not set via CLI + * keep it this way for now but in future we probably should + * refuse to start up with incomplete numa mapping */ + node_id = 0; + } + if (sc->node_id == CPU_UNSET_NUMA_NODE_ID) { + sc->node_id = node_id; + } else if (sc->node_id != node_id) { + error_setg(&local_err, "node-id %d must match numa node specified" + "with -numa option for cpu-index %d", sc->node_id, cc->core_id); + goto out; + } + out: g_free(base_core_type); error_propagate(errp, local_err); @@ -2988,11 +3005,18 @@ static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine, return NULL; } -static unsigned spapr_cpu_index_to_socket_id(unsigned cpu_index) +static CpuInstanceProperties +spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index) { - /* Allocate to NUMA nodes on a "socket" basis (not that concept of - * socket means much for the paravirtualized PAPR platform) */ - return cpu_index / smp_threads / smp_cores; + CPUArchId *core_slot; + MachineClass *mc = MACHINE_GET_CLASS(machine); + + /* make sure possible_cpu are intialized */ + mc->possible_cpu_arch_ids(machine); + /* get CPU core slot containing thread that matches cpu_index */ + core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL); + assert(core_slot); + return core_slot->props; } static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine) @@ -3019,8 +3043,15 @@ static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine) machine->possible_cpus->cpus[i].arch_id = core_id; machine->possible_cpus->cpus[i].props.has_core_id = true; machine->possible_cpus->cpus[i].props.core_id = core_id; - /* TODO: add 'has_node/node' here to describe - to which node core belongs */ + + /* default distribution of CPUs over NUMA nodes */ + if (nb_numa_nodes) { + /* preset values but do not enable them i.e. 'has_node_id = false', + * numa init code will enable them later if manual mapping wasn't + * present on CLI */ + machine->possible_cpus->cpus[i].props.node_id = + core_id / smp_threads / smp_cores % nb_numa_nodes; + } } return machine->possible_cpus; } @@ -3145,7 +3176,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) hc->pre_plug = spapr_machine_device_pre_plug; hc->plug = spapr_machine_device_plug; hc->unplug = spapr_machine_device_unplug; - mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id; + mc->cpu_index_to_instance_props = spapr_cpu_index_to_props; mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids; hc->unplug_request = spapr_machine_device_unplug_request; @@ -3249,6 +3280,7 @@ static void spapr_machine_2_9_class_options(MachineClass *mc) { spapr_machine_2_10_class_options(mc); SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_9); + mc->numa_auto_assign_ram = numa_legacy_auto_assign_ram; } DEFINE_SPAPR_MACHINE(2_9, "2.9", false); diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 4389ef4..a17ea07 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -176,13 +176,11 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) const char *typename = object_class_get_name(scc->cpu_class); size_t size = object_type_get_instance_size(typename); Error *local_err = NULL; - int core_node_id = numa_get_node_for_cpu(cc->core_id);; void *obj; int i, j; sc->threads = g_malloc0(size * cc->nr_threads); for (i = 0; i < cc->nr_threads; i++) { - int node_id; char id[32]; CPUState *cs; @@ -192,17 +190,8 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) cs = CPU(obj); cs->cpu_index = cc->core_id + i; - /* Set NUMA node for the added CPUs */ - node_id = numa_get_node_for_cpu(cs->cpu_index); - if (node_id != core_node_id) { - error_setg(&local_err, "Invalid node-id=%d of thread[cpu-index: %d]" - " on CPU[core-id: %d, node-id: %d], node-id must be the same", - node_id, cs->cpu_index, cc->core_id, core_node_id); - goto err; - } - if (node_id < nb_numa_nodes) { - cs->numa_node = node_id; - } + /* Set NUMA node for the threads belonged to core */ + cs->numa_node = sc->node_id; snprintf(id, sizeof(id), "thread[%d]", i); object_property_add_child(OBJECT(sc), id, obj, &local_err); @@ -263,6 +252,11 @@ static const char *spapr_core_models[] = { "POWER9_v1.0", }; +static Property spapr_cpu_core_properties[] = { + DEFINE_PROP_INT32("node-id", sPAPRCPUCore, node_id, CPU_UNSET_NUMA_NODE_ID), + DEFINE_PROP_END_OF_LIST() +}; + void spapr_cpu_core_class_init(ObjectClass *oc, void *data) { DeviceClass *dc = DEVICE_CLASS(oc); @@ -270,6 +264,7 @@ void spapr_cpu_core_class_init(ObjectClass *oc, void *data) dc->realize = spapr_cpu_core_realize; dc->unrealize = spapr_cpu_core_unrealizefn; + dc->props = spapr_cpu_core_properties; scc->cpu_class = cpu_class_by_name(TYPE_POWERPC_CPU, data); g_assert(scc->cpu_class); } diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index 00c21f1..329a0d0 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -389,4 +389,5 @@ GCC_FMT_ATTR(2, 3); void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base, uint64_t len, int node, MemoryAffinityFlags flags); +void build_slit(GArray *table_data, BIOSLinker *linker); #endif diff --git a/include/hw/boards.h b/include/hw/boards.h index 31d9c72..76ce021 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -32,6 +32,7 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, MachineClass *find_default_machine(void); extern MachineState *current_machine; +void machine_run_board_init(MachineState *machine); bool machine_usb(MachineState *machine); bool machine_kernel_irqchip_allowed(MachineState *machine); bool machine_kernel_irqchip_required(MachineState *machine); @@ -42,6 +43,9 @@ bool machine_dump_guest_core(MachineState *machine); bool machine_mem_merge(MachineState *machine); void machine_register_compat_props(MachineState *machine); HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine); +void machine_set_cpu_numa_node(MachineState *machine, + const CpuInstanceProperties *props, + Error **errp); /** * CPUArchId: @@ -74,7 +78,10 @@ typedef struct { * of HotplugHandler object, which handles hotplug operation * for a given @dev. It may return NULL if @dev doesn't require * any actions to be performed by hotplug handler. - * @cpu_index_to_socket_id: + * @cpu_index_to_instance_props: + * used to provide @cpu_index to socket/core/thread number mapping, allowing + * legacy code to perform maping from cpu_index to topology properties + * Returns: tuple of socket/core/thread ids given cpu_index belongs to. * used to provide @cpu_index to socket number mapping, allowing * a machine to group CPU threads belonging to the same socket/package * Returns: socket number given cpu_index belongs to. @@ -136,10 +143,13 @@ struct MachineClass { int minimum_page_bits; bool has_hotpluggable_cpus; int numa_mem_align_shift; + void (*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size); HotplugHandler *(*get_hotplug_handler)(MachineState *machine, DeviceState *dev); - unsigned (*cpu_index_to_socket_id)(unsigned cpu_index); + CpuInstanceProperties (*cpu_index_to_instance_props)(MachineState *machine, + unsigned cpu_index); const CPUArchIdList *(*possible_cpu_arch_ids)(MachineState *machine); }; diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h index 3c35665..93051e9 100644 --- a/include/hw/ppc/spapr_cpu_core.h +++ b/include/hw/ppc/spapr_cpu_core.h @@ -27,6 +27,7 @@ typedef struct sPAPRCPUCore { /*< public >*/ void *threads; + int node_id; } sPAPRCPUCore; typedef struct sPAPRCPUCoreClass { diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index f08d327..7d85057 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -97,5 +97,6 @@ typedef struct SSIBus SSIBus; typedef struct uWireSlave uWireSlave; typedef struct VirtIODevice VirtIODevice; typedef struct Visitor Visitor; +typedef struct node_info NodeInfo; #endif /* QEMU_TYPEDEFS_H */ diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 5d10359..55214ce 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -258,6 +258,8 @@ typedef void (*run_on_cpu_func)(CPUState *cpu, run_on_cpu_data data); struct qemu_work_item; +#define CPU_UNSET_NUMA_NODE_ID -1 + /** * CPUState: * @cpu_index: CPU index (informative). diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h index 8f09dcf..7ffde5b 100644 --- a/include/sysemu/numa.h +++ b/include/sysemu/numa.h @@ -8,6 +8,7 @@ #include "hw/boards.h" extern int nb_numa_nodes; /* Number of NUMA nodes */ +extern bool have_numa_distance; struct numa_addr_range { ram_addr_t mem_start; @@ -15,24 +16,23 @@ struct numa_addr_range { QLIST_ENTRY(numa_addr_range) entry; }; -typedef struct node_info { +struct node_info { uint64_t node_mem; - unsigned long *node_cpu; struct HostMemoryBackend *node_memdev; bool present; QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */ -} NodeInfo; + uint8_t distance[MAX_NODES]; +}; extern NodeInfo numa_info[MAX_NODES]; -void parse_numa_opts(MachineClass *mc); -void numa_post_machine_init(void); +void parse_numa_opts(MachineState *ms); void query_numa_node_mem(uint64_t node_mem[]); extern QemuOptsList qemu_numa_opts; void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node); void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node); uint32_t numa_get_node(ram_addr_t addr, Error **errp); - -/* on success returns node index in numa_info, - * on failure returns nb_numa_nodes */ -int numa_get_node_for_cpu(int idx); +void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size); +void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 15656b7..be9e22c 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -166,6 +166,10 @@ extern int mem_prealloc; #define MAX_NODES 128 #define NUMA_NODE_UNASSIGNED MAX_NODES +#define NUMA_DISTANCE_MIN 10 +#define NUMA_DISTANCE_DEFAULT 20 +#define NUMA_DISTANCE_MAX 254 +#define NUMA_DISTANCE_UNREACHABLE 255 #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { @@ -51,6 +51,7 @@ static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one. * For all nodes, nodeid < max_numa_nodeid */ int nb_numa_nodes; +bool have_numa_distance; NodeInfo numa_info[MAX_NODES]; void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node) @@ -140,10 +141,12 @@ uint32_t numa_get_node(ram_addr_t addr, Error **errp) return -1; } -static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) +static void parse_numa_node(MachineState *ms, NumaNodeOptions *node, + QemuOpts *opts, Error **errp) { uint16_t nodenr; uint16List *cpus = NULL; + MachineClass *mc = MACHINE_GET_CLASS(ms); if (node->has_nodeid) { nodenr = node->nodeid; @@ -162,7 +165,12 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) return; } + if (!mc->cpu_index_to_instance_props) { + error_report("NUMA is not supported by this machine-type"); + exit(1); + } for (cpus = node->cpus; cpus; cpus = cpus->next) { + CpuInstanceProperties props; if (cpus->value >= max_cpus) { error_setg(errp, "CPU index (%" PRIu16 ")" @@ -170,7 +178,10 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) cpus->value, max_cpus); return; } - bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); + props = mc->cpu_index_to_instance_props(ms, cpus->value); + props.node_id = nodenr; + props.has_node_id = true; + machine_set_cpu_numa_node(ms, &props, &error_fatal); } if (node->has_mem && node->has_memdev) { @@ -212,9 +223,47 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1); } +static void parse_numa_distance(NumaDistOptions *dist, Error **errp) +{ + uint16_t src = dist->src; + uint16_t dst = dist->dst; + uint8_t val = dist->val; + + if (src >= MAX_NODES || dst >= MAX_NODES) { + error_setg(errp, + "Invalid node %" PRIu16 + ", max possible could be %" PRIu16, + MAX(src, dst), MAX_NODES); + return; + } + + if (!numa_info[src].present || !numa_info[dst].present) { + error_setg(errp, "Source/Destination NUMA node is missing. " + "Please use '-numa node' option to declare it first."); + return; + } + + if (val < NUMA_DISTANCE_MIN) { + error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, " + "it shouldn't be less than %d.", + val, NUMA_DISTANCE_MIN); + return; + } + + if (src == dst && val != NUMA_DISTANCE_MIN) { + error_setg(errp, "Local distance of node %d should be %d.", + src, NUMA_DISTANCE_MIN); + return; + } + + numa_info[src].distance[dst] = val; + have_numa_distance = true; +} + static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) { NumaOptions *object = NULL; + MachineState *ms = opaque; Error *err = NULL; { @@ -229,12 +278,33 @@ static int parse_numa(void *opaque, QemuOpts *opts, Error **errp) switch (object->type) { case NUMA_OPTIONS_TYPE_NODE: - numa_node_parse(&object->u.node, opts, &err); + parse_numa_node(ms, &object->u.node, opts, &err); if (err) { goto end; } nb_numa_nodes++; break; + case NUMA_OPTIONS_TYPE_DIST: + parse_numa_distance(&object->u.dist, &err); + if (err) { + goto end; + } + break; + case NUMA_OPTIONS_TYPE_CPU: + if (!object->u.cpu.has_node_id) { + error_setg(&err, "Missing mandatory node-id property"); + goto end; + } + if (!numa_info[object->u.cpu.node_id].present) { + error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be " + "defined with -numa node,nodeid=ID before it's used with " + "-numa cpu,node-id=ID", object->u.cpu.node_id); + goto end; + } + + machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu), + &err); + break; default: abort(); } @@ -249,60 +319,118 @@ end: return 0; } -static char *enumerate_cpus(unsigned long *cpus, int max_cpus) +/* If all node pair distances are symmetric, then only distances + * in one direction are enough. If there is even one asymmetric + * pair, though, then all distances must be provided. The + * distance from a node to itself is always NUMA_DISTANCE_MIN, + * so providing it is never necessary. + */ +static void validate_numa_distance(void) { - int cpu; - bool first = true; - GString *s = g_string_new(NULL); + int src, dst; + bool is_asymmetrical = false; + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = src; dst < nb_numa_nodes; dst++) { + if (numa_info[src].distance[dst] == 0 && + numa_info[dst].distance[src] == 0) { + if (src != dst) { + error_report("The distance between node %d and %d is " + "missing, at least one distance value " + "between each nodes should be provided.", + src, dst); + exit(EXIT_FAILURE); + } + } - for (cpu = find_first_bit(cpus, max_cpus); - cpu < max_cpus; - cpu = find_next_bit(cpus, max_cpus, cpu + 1)) { - g_string_append_printf(s, "%s%d", first ? "" : " ", cpu); - first = false; + if (numa_info[src].distance[dst] != 0 && + numa_info[dst].distance[src] != 0 && + numa_info[src].distance[dst] != + numa_info[dst].distance[src]) { + is_asymmetrical = true; + } + } + } + + if (is_asymmetrical) { + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = 0; dst < nb_numa_nodes; dst++) { + if (src != dst && numa_info[src].distance[dst] == 0) { + error_report("At least one asymmetrical pair of " + "distances is given, please provide distances " + "for both directions of all node pairs."); + exit(EXIT_FAILURE); + } + } + } } - return g_string_free(s, FALSE); } -static void validate_numa_cpus(void) +static void complete_init_numa_distance(void) { - int i; - unsigned long *seen_cpus = bitmap_new(max_cpus); + int src, dst; - for (i = 0; i < nb_numa_nodes; i++) { - if (bitmap_intersects(seen_cpus, numa_info[i].node_cpu, max_cpus)) { - bitmap_and(seen_cpus, seen_cpus, - numa_info[i].node_cpu, max_cpus); - error_report("CPU(s) present in multiple NUMA nodes: %s", - enumerate_cpus(seen_cpus, max_cpus)); - g_free(seen_cpus); - exit(EXIT_FAILURE); + /* Fixup NUMA distance by symmetric policy because if it is an + * asymmetric distance table, it should be a complete table and + * there would not be any missing distance except local node, which + * is verified by validate_numa_distance above. + */ + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = 0; dst < nb_numa_nodes; dst++) { + if (numa_info[src].distance[dst] == 0) { + if (src == dst) { + numa_info[src].distance[dst] = NUMA_DISTANCE_MIN; + } else { + numa_info[src].distance[dst] = numa_info[dst].distance[src]; + } + } } - bitmap_or(seen_cpus, seen_cpus, - numa_info[i].node_cpu, max_cpus); } +} - if (!bitmap_full(seen_cpus, max_cpus)) { - char *msg; - bitmap_complement(seen_cpus, seen_cpus, max_cpus); - msg = enumerate_cpus(seen_cpus, max_cpus); - error_report("warning: CPU(s) not present in any NUMA nodes: %s", msg); - error_report("warning: All CPU(s) up to maxcpus should be described " - "in NUMA config"); - g_free(msg); +void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size) +{ + int i; + uint64_t usedmem = 0; + + /* Align each node according to the alignment + * requirements of the machine class + */ + + for (i = 0; i < nb_nodes - 1; i++) { + nodes[i].node_mem = (size / nb_nodes) & + ~((1 << mc->numa_mem_align_shift) - 1); + usedmem += nodes[i].node_mem; } - g_free(seen_cpus); + nodes[i].node_mem = size - usedmem; } -void parse_numa_opts(MachineClass *mc) +void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes, + int nb_nodes, ram_addr_t size) { int i; + uint64_t usedmem = 0, node_mem; + uint64_t granularity = size / nb_nodes; + uint64_t propagate = 0; + + for (i = 0; i < nb_nodes - 1; i++) { + node_mem = (granularity + propagate) & + ~((1 << mc->numa_mem_align_shift) - 1); + propagate = granularity + propagate - node_mem; + nodes[i].node_mem = node_mem; + usedmem += node_mem; + } + nodes[i].node_mem = size - usedmem; +} - for (i = 0; i < MAX_NODES; i++) { - numa_info[i].node_cpu = bitmap_new(max_cpus); - } +void parse_numa_opts(MachineState *ms) +{ + int i; + const CPUArchIdList *possible_cpus; + MachineClass *mc = MACHINE_GET_CLASS(ms); - if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, NULL, NULL)) { + if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, NULL)) { exit(1); } @@ -336,17 +464,8 @@ void parse_numa_opts(MachineClass *mc) } } if (i == nb_numa_nodes) { - uint64_t usedmem = 0; - - /* Align each node according to the alignment - * requirements of the machine class - */ - for (i = 0; i < nb_numa_nodes - 1; i++) { - numa_info[i].node_mem = (ram_size / nb_numa_nodes) & - ~((1 << mc->numa_mem_align_shift) - 1); - usedmem += numa_info[i].node_mem; - } - numa_info[i].node_mem = ram_size - usedmem; + assert(mc->numa_auto_assign_ram); + mc->numa_auto_assign_ram(mc, numa_info, nb_numa_nodes, ram_size); } numa_total = 0; @@ -366,47 +485,52 @@ void parse_numa_opts(MachineClass *mc) numa_set_mem_ranges(); - for (i = 0; i < nb_numa_nodes; i++) { - if (!bitmap_empty(numa_info[i].node_cpu, max_cpus)) { + /* assign CPUs to nodes using board provided default mapping */ + if (!mc->cpu_index_to_instance_props || !mc->possible_cpu_arch_ids) { + error_report("default CPUs to NUMA node mapping isn't supported"); + exit(1); + } + + possible_cpus = mc->possible_cpu_arch_ids(ms); + for (i = 0; i < possible_cpus->len; i++) { + if (possible_cpus->cpus[i].props.has_node_id) { break; } } - /* Historically VCPUs were assigned in round-robin order to NUMA - * nodes. However it causes issues with guest not handling it nice - * in case where cores/threads from a multicore CPU appear on - * different nodes. So allow boards to override default distribution - * rule grouping VCPUs by socket so that VCPUs from the same socket - * would be on the same node. - */ - if (i == nb_numa_nodes) { + + /* no CPUs are assigned to NUMA nodes */ + if (i == possible_cpus->len) { for (i = 0; i < max_cpus; i++) { - unsigned node_id = i % nb_numa_nodes; - if (mc->cpu_index_to_socket_id) { - node_id = mc->cpu_index_to_socket_id(i) % nb_numa_nodes; - } + CpuInstanceProperties props; + /* fetch default mapping from board and enable it */ + props = mc->cpu_index_to_instance_props(ms, i); + props.has_node_id = true; - set_bit(i, numa_info[node_id].node_cpu); + machine_set_cpu_numa_node(ms, &props, &error_fatal); } } - validate_numa_cpus(); - } else { - numa_set_mem_node_id(0, ram_size, 0); - } -} - -void numa_post_machine_init(void) -{ - CPUState *cpu; - int i; + /* QEMU needs at least all unique node pair distances to build + * the whole NUMA distance table. QEMU treats the distance table + * as symmetric by default, i.e. distance A->B == distance B->A. + * Thus, QEMU is able to complete the distance table + * initialization even though only distance A->B is provided and + * distance B->A is not. QEMU knows the distance of a node to + * itself is always 10, so A->A distances may be omitted. When + * the distances of two nodes of a pair differ, i.e. distance + * A->B != distance B->A, then that means the distance table is + * asymmetric. In this case, the distances for both directions + * of all node pairs are required. + */ + if (have_numa_distance) { + /* Validate enough NUMA distance information was provided. */ + validate_numa_distance(); - CPU_FOREACH(cpu) { - for (i = 0; i < nb_numa_nodes; i++) { - assert(cpu->cpu_index < max_cpus); - if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { - cpu->numa_node = i; - } + /* Validation succeeded, now fill in any missing distances. */ + complete_init_numa_distance(); } + } else { + numa_set_mem_node_id(0, ram_size, 0); } } @@ -560,20 +684,6 @@ MemdevList *qmp_query_memdev(Error **errp) return list; } -int numa_get_node_for_cpu(int idx) -{ - int i; - - assert(idx < max_cpus); - - for (i = 0; i < nb_numa_nodes; i++) { - if (test_bit(idx, numa_info[i].node_cpu)) { - break; - } - } - return i; -} - void ram_block_notifier_add(RAMBlockNotifier *n) { QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next); diff --git a/qapi-schema.json b/qapi-schema.json index 5728b7f..80603cf 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -1325,6 +1325,9 @@ # # @thread_id: ID of the underlying host thread # +# @props: properties describing to which node/socket/core/thread +# virtual CPU belongs to, provided if supported by board (since 2.10) +# # @arch: architecture of the cpu, which determines which additional fields # will be listed (since 2.6) # @@ -1335,7 +1338,8 @@ ## { 'union': 'CpuInfo', 'base': {'CPU': 'int', 'current': 'bool', 'halted': 'bool', - 'qom_path': 'str', 'thread_id': 'int', 'arch': 'CpuInfoArch' }, + 'qom_path': 'str', 'thread_id': 'int', + '*props': 'CpuInstanceProperties', 'arch': 'CpuInfoArch' }, 'discriminator': 'arch', 'data': { 'x86': 'CpuInfoX86', 'sparc': 'CpuInfoSPARC', @@ -5682,10 +5686,16 @@ ## # @NumaOptionsType: # +# @node: NUMA nodes configuration +# +# @dist: NUMA distance configuration (since 2.10) +# +# @cpu: property based CPU(s) to node mapping (Since: 2.10) +# # Since: 2.1 ## { 'enum': 'NumaOptionsType', - 'data': [ 'node' ] } + 'data': [ 'node', 'dist', 'cpu' ] } ## # @NumaOptions: @@ -5698,7 +5708,9 @@ 'base': { 'type': 'NumaOptionsType' }, 'discriminator': 'type', 'data': { - 'node': 'NumaNodeOptions' }} + 'node': 'NumaNodeOptions', + 'dist': 'NumaDistOptions', + 'cpu': 'NumaCpuOptions' }} ## # @NumaNodeOptions: @@ -5727,6 +5739,41 @@ '*memdev': 'str' }} ## +# @NumaDistOptions: +# +# Set the distance between 2 NUMA nodes. +# +# @src: source NUMA node. +# +# @dst: destination NUMA node. +# +# @val: NUMA distance from source node to destination node. +# When a node is unreachable from another node, set the distance +# between them to 255. +# +# Since: 2.10 +## +{ 'struct': 'NumaDistOptions', + 'data': { + 'src': 'uint16', + 'dst': 'uint16', + 'val': 'uint8' }} + +## +# @NumaCpuOptions: +# +# Option "-numa cpu" overrides default cpu to node mapping. +# It accepts the same set of cpu properties as returned by +# query-hotpluggable-cpus[].props, where node-id could be used to +# override default node mapping. +# +# Since: 2.10 +## +{ 'struct': 'NumaCpuOptions', + 'base': 'CpuInstanceProperties', + 'data' : {} } + +## # @HostMemPolicy: # # Host memory policy types diff --git a/qemu-options.hx b/qemu-options.hx index f806af9..f07a310 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -139,13 +139,18 @@ ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" - "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n", QEMU_ARCH_ALL) + "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" + "-numa dist,src=source,dst=destination,val=distance\n", QEMU_ARCH_ALL) STEXI @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] +@itemx -numa dist,src=@var{source},dst=@var{destination},val=@var{distance} +@itemx -numa cpu,node-id=@var{node}[,socket-id=@var{x}][,core-id=@var{y}][,thread-id=@var{z}] @findex -numa Define a NUMA node and assign RAM and VCPUs to it. +Set the NUMA distance from a source node to a destination node. +Legacy VCPU assignment uses @samp{cpus} option where @var{firstcpu} and @var{lastcpu} are CPU indexes. Each @samp{cpus} option represent a contiguous range of CPU indexes (or a single VCPU if @var{lastcpu} is omitted). A non-contiguous @@ -159,6 +164,24 @@ a NUMA node: -numa node,cpus=0-2,cpus=5 @end example +@samp{cpu} option is a new alternative to @samp{cpus} option +which uses @samp{socket-id|core-id|thread-id} properties to assign +CPU objects to a @var{node} using topology layout properties of CPU. +The set of properties is machine specific, and depends on used +machine type/@samp{smp} options. It could be queried with +@samp{hotpluggable-cpus} monitor command. +@samp{node-id} property specifies @var{node} to which CPU object +will be assigned, it's required for @var{node} to be declared +with @samp{node} option before it's used with @samp{cpu} option. + +For example: +@example +-M pc \ +-smp 1,sockets=2,maxcpus=2 \ +-numa node,nodeid=0 -numa node,nodeid=1 \ +-numa cpu,node-id=0,socket-id=0 -numa cpu,node-id=1,socket-id=1 +@end example + @samp{mem} assigns a given RAM amount to a node. @samp{memdev} assigns RAM from a given memory backend device to a node. If @samp{mem} and @samp{memdev} are omitted in all nodes, RAM is @@ -167,6 +190,17 @@ split equally between them. @samp{mem} and @samp{memdev} are mutually exclusive. Furthermore, if one node uses @samp{memdev}, all of them have to use it. +@var{source} and @var{destination} are NUMA node IDs. +@var{distance} is the NUMA distance from @var{source} to @var{destination}. +The distance from a node to itself is always 10. If any pair of nodes is +given a distance, then all pairs must be given distances. Although, when +distances are only given in one direction for each pair of nodes, then +the distances in the opposite directions are assumed to be the same. If, +however, an asymmetrical pair of distances is given for even one node +pair, then all node pairs must be provided distance values for both +directions, even when they are symmetrical. When a node is unreachable +from another node, set the pair's distance to 255. + Note that the -@option{numa} option doesn't allocate any of the specified resources, it just assigns existing resources to NUMA nodes. This means that one still has to use the @option{-m}, diff --git a/target/arm/cpu.c b/target/arm/cpu.c index b357aee..c185eb1 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -458,6 +458,13 @@ static void arm_disas_set_info(CPUState *cpu, disassemble_info *info) } } +uint64_t arm_cpu_mp_affinity(int idx, uint8_t clustersz) +{ + uint32_t Aff1 = idx / clustersz; + uint32_t Aff0 = idx % clustersz; + return (Aff1 << ARM_AFF1_SHIFT) | Aff0; +} + static void arm_cpu_initfn(Object *obj) { CPUState *cs = CPU(obj); @@ -709,9 +716,8 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) * so these bits always RAZ. */ if (cpu->mp_affinity == ARM64_AFFINITY_INVALID) { - uint32_t Aff1 = cs->cpu_index / ARM_DEFAULT_CPUS_PER_CLUSTER; - uint32_t Aff0 = cs->cpu_index % ARM_DEFAULT_CPUS_PER_CLUSTER; - cpu->mp_affinity = (Aff1 << ARM_AFF1_SHIFT) | Aff0; + cpu->mp_affinity = arm_cpu_mp_affinity(cs->cpu_index, + ARM_DEFAULT_CPUS_PER_CLUSTER); } if (cpu->reset_hivecs) { @@ -1567,6 +1573,7 @@ static Property arm_cpu_properties[] = { DEFINE_PROP_UINT32("midr", ARMCPU, midr, 0), DEFINE_PROP_UINT64("mp-affinity", ARMCPU, mp_affinity, ARM64_AFFINITY_INVALID), + DEFINE_PROP_INT32("node-id", CPUState, numa_node, CPU_UNSET_NUMA_NODE_ID), DEFINE_PROP_END_OF_LIST() }; diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 1055bfe..048faed 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -710,6 +710,8 @@ static inline ARMCPU *arm_env_get_cpu(CPUARMState *env) return container_of(env, ARMCPU, env); } +uint64_t arm_cpu_mp_affinity(int idx, uint8_t clustersz); + #define ENV_GET_CPU(e) CPU(arm_env_get_cpu(e)) #define ENV_OFFSET offsetof(ARMCPU, env) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 7e87031..5e76840 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2635,28 +2635,23 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, X86CPU *cpu = x86_env_get_cpu(env); CPUState *cs = CPU(cpu); uint32_t pkg_offset; + uint32_t limit; - /* test if maximum index reached */ - if (index & 0x80000000) { - if (index > env->cpuid_xlevel) { - if (env->cpuid_xlevel2 > 0) { - /* Handle the Centaur's CPUID instruction. */ - if (index > env->cpuid_xlevel2) { - index = env->cpuid_xlevel2; - } else if (index < 0xC0000000) { - index = env->cpuid_xlevel; - } - } else { - /* Intel documentation states that invalid EAX input will - * return the same information as EAX=cpuid_level - * (Intel SDM Vol. 2A - Instruction Set Reference - CPUID) - */ - index = env->cpuid_level; - } - } + /* Calculate & apply limits for different index ranges */ + if (index >= 0xC0000000) { + limit = env->cpuid_xlevel2; + } else if (index >= 0x80000000) { + limit = env->cpuid_xlevel; } else { - if (index > env->cpuid_level) - index = env->cpuid_level; + limit = env->cpuid_level; + } + + if (index > limit) { + /* Intel documentation states that invalid EAX input will + * return the same information as EAX=cpuid_level + * (Intel SDM Vol. 2A - Instruction Set Reference - CPUID) + */ + index = env->cpuid_level; } switch(index) { @@ -3991,6 +3986,7 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_INT32("core-id", X86CPU, core_id, -1), DEFINE_PROP_INT32("socket-id", X86CPU, socket_id, -1), #endif + DEFINE_PROP_INT32("node-id", CPUState, numa_node, CPU_UNSET_NUMA_NODE_ID), DEFINE_PROP_BOOL("pmu", X86CPU, enable_pmu, false), { .name = "hv-spinlocks", .info = &qdev_prop_spinlocks }, DEFINE_PROP_BOOL("hv-relaxed", X86CPU, hyperv_relaxed_timing, false), diff --git a/target/i386/machine.c b/target/i386/machine.c index 78ae2f9..3cb2729 100644 --- a/target/i386/machine.c +++ b/target/i386/machine.c @@ -136,178 +136,48 @@ static const VMStateDescription vmstate_mtrr_var = { #define VMSTATE_MTRR_VARS(_field, _state, _n, _v) \ VMSTATE_STRUCT_ARRAY(_field, _state, _n, _v, vmstate_mtrr_var, MTRRVar) -static int put_fpreg_error(QEMUFile *f, void *opaque, size_t size, - VMStateField *field, QJSON *vmdesc) -{ - fprintf(stderr, "call put_fpreg() with invalid arguments\n"); - exit(0); - return 0; -} - -/* XXX: add that in a FPU generic layer */ -union x86_longdouble { - uint64_t mant; - uint16_t exp; -}; +typedef struct x86_FPReg_tmp { + FPReg *parent; + uint64_t tmp_mant; + uint16_t tmp_exp; +} x86_FPReg_tmp; -#define MANTD1(fp) (fp & ((1LL << 52) - 1)) -#define EXPBIAS1 1023 -#define EXPD1(fp) ((fp >> 52) & 0x7FF) -#define SIGND1(fp) ((fp >> 32) & 0x80000000) - -static void fp64_to_fp80(union x86_longdouble *p, uint64_t temp) +static void fpreg_pre_save(void *opaque) { - int e; - /* mantissa */ - p->mant = (MANTD1(temp) << 11) | (1LL << 63); - /* exponent + sign */ - e = EXPD1(temp) - EXPBIAS1 + 16383; - e |= SIGND1(temp) >> 16; - p->exp = e; -} + x86_FPReg_tmp *tmp = opaque; -static int get_fpreg(QEMUFile *f, void *opaque, size_t size, - VMStateField *field) -{ - FPReg *fp_reg = opaque; - uint64_t mant; - uint16_t exp; - - qemu_get_be64s(f, &mant); - qemu_get_be16s(f, &exp); - fp_reg->d = cpu_set_fp80(mant, exp); - return 0; -} - -static int put_fpreg(QEMUFile *f, void *opaque, size_t size, - VMStateField *field, QJSON *vmdesc) -{ - FPReg *fp_reg = opaque; - uint64_t mant; - uint16_t exp; /* we save the real CPU data (in case of MMX usage only 'mant' contains the MMX register */ - cpu_get_fp80(&mant, &exp, fp_reg->d); - qemu_put_be64s(f, &mant); - qemu_put_be16s(f, &exp); - - return 0; + cpu_get_fp80(&tmp->tmp_mant, &tmp->tmp_exp, tmp->parent->d); } -static const VMStateInfo vmstate_fpreg = { - .name = "fpreg", - .get = get_fpreg, - .put = put_fpreg, -}; - -static int get_fpreg_1_mmx(QEMUFile *f, void *opaque, size_t size, - VMStateField *field) +static int fpreg_post_load(void *opaque, int version) { - union x86_longdouble *p = opaque; - uint64_t mant; + x86_FPReg_tmp *tmp = opaque; - qemu_get_be64s(f, &mant); - p->mant = mant; - p->exp = 0xffff; + tmp->parent->d = cpu_set_fp80(tmp->tmp_mant, tmp->tmp_exp); return 0; } -static const VMStateInfo vmstate_fpreg_1_mmx = { - .name = "fpreg_1_mmx", - .get = get_fpreg_1_mmx, - .put = put_fpreg_error, -}; - -static int get_fpreg_1_no_mmx(QEMUFile *f, void *opaque, size_t size, - VMStateField *field) -{ - union x86_longdouble *p = opaque; - uint64_t mant; - - qemu_get_be64s(f, &mant); - fp64_to_fp80(p, mant); - return 0; -} - -static const VMStateInfo vmstate_fpreg_1_no_mmx = { - .name = "fpreg_1_no_mmx", - .get = get_fpreg_1_no_mmx, - .put = put_fpreg_error, +static const VMStateDescription vmstate_fpreg_tmp = { + .name = "fpreg_tmp", + .post_load = fpreg_post_load, + .pre_save = fpreg_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT64(tmp_mant, x86_FPReg_tmp), + VMSTATE_UINT16(tmp_exp, x86_FPReg_tmp), + VMSTATE_END_OF_LIST() + } }; -static bool fpregs_is_0(void *opaque, int version_id) -{ - X86CPU *cpu = opaque; - CPUX86State *env = &cpu->env; - - return (env->fpregs_format_vmstate == 0); -} - -static bool fpregs_is_1_mmx(void *opaque, int version_id) -{ - X86CPU *cpu = opaque; - CPUX86State *env = &cpu->env; - int guess_mmx; - - guess_mmx = ((env->fptag_vmstate == 0xff) && - (env->fpus_vmstate & 0x3800) == 0); - return (guess_mmx && (env->fpregs_format_vmstate == 1)); -} - -static bool fpregs_is_1_no_mmx(void *opaque, int version_id) -{ - X86CPU *cpu = opaque; - CPUX86State *env = &cpu->env; - int guess_mmx; - - guess_mmx = ((env->fptag_vmstate == 0xff) && - (env->fpus_vmstate & 0x3800) == 0); - return (!guess_mmx && (env->fpregs_format_vmstate == 1)); -} - -#define VMSTATE_FP_REGS(_field, _state, _n) \ - VMSTATE_ARRAY_TEST(_field, _state, _n, fpregs_is_0, vmstate_fpreg, FPReg), \ - VMSTATE_ARRAY_TEST(_field, _state, _n, fpregs_is_1_mmx, vmstate_fpreg_1_mmx, FPReg), \ - VMSTATE_ARRAY_TEST(_field, _state, _n, fpregs_is_1_no_mmx, vmstate_fpreg_1_no_mmx, FPReg) - -static bool version_is_5(void *opaque, int version_id) -{ - return version_id == 5; -} - -#ifdef TARGET_X86_64 -static bool less_than_7(void *opaque, int version_id) -{ - return version_id < 7; -} - -static int get_uint64_as_uint32(QEMUFile *f, void *pv, size_t size, - VMStateField *field) -{ - uint64_t *v = pv; - *v = qemu_get_be32(f); - return 0; -} - -static int put_uint64_as_uint32(QEMUFile *f, void *pv, size_t size, - VMStateField *field, QJSON *vmdesc) -{ - uint64_t *v = pv; - qemu_put_be32(f, *v); - - return 0; -} - -static const VMStateInfo vmstate_hack_uint64_as_uint32 = { - .name = "uint64_as_uint32", - .get = get_uint64_as_uint32, - .put = put_uint64_as_uint32, +static const VMStateDescription vmstate_fpreg = { + .name = "fpreg", + .fields = (VMStateField[]) { + VMSTATE_WITH_TMP(FPReg, x86_FPReg_tmp, vmstate_fpreg_tmp), + VMSTATE_END_OF_LIST() + } }; -#define VMSTATE_HACK_UINT32(_f, _s, _t) \ - VMSTATE_SINGLE_TEST(_f, _s, _t, 0, vmstate_hack_uint64_as_uint32, uint64_t) -#endif - static void cpu_pre_save(void *opaque) { X86CPU *cpu = opaque; @@ -356,6 +226,10 @@ static int cpu_post_load(void *opaque, int version_id) return -EINVAL; } + if (env->fpregs_format_vmstate) { + error_report("Unsupported old non-softfloat CPU state"); + return -EINVAL; + } /* * Real mode guest segments register DPL should be zero. * Older KVM version were setting it wrongly. @@ -930,7 +804,7 @@ static const VMStateDescription vmstate_mcg_ext_ctl = { VMStateDescription vmstate_x86_cpu = { .name = "cpu", .version_id = 12, - .minimum_version_id = 3, + .minimum_version_id = 11, .pre_save = cpu_pre_save, .post_load = cpu_post_load, .fields = (VMStateField[]) { @@ -943,7 +817,8 @@ VMStateDescription vmstate_x86_cpu = { VMSTATE_UINT16(env.fpus_vmstate, X86CPU), VMSTATE_UINT16(env.fptag_vmstate, X86CPU), VMSTATE_UINT16(env.fpregs_format_vmstate, X86CPU), - VMSTATE_FP_REGS(env.fpregs, X86CPU, 8), + + VMSTATE_STRUCT_ARRAY(env.fpregs, X86CPU, 8, 0, vmstate_fpreg, FPReg), VMSTATE_SEGMENT_ARRAY(env.segs, X86CPU, 6), VMSTATE_SEGMENT(env.ldt, X86CPU), @@ -952,16 +827,8 @@ VMStateDescription vmstate_x86_cpu = { VMSTATE_SEGMENT(env.idt, X86CPU), VMSTATE_UINT32(env.sysenter_cs, X86CPU), -#ifdef TARGET_X86_64 - /* Hack: In v7 size changed from 32 to 64 bits on x86_64 */ - VMSTATE_HACK_UINT32(env.sysenter_esp, X86CPU, less_than_7), - VMSTATE_HACK_UINT32(env.sysenter_eip, X86CPU, less_than_7), - VMSTATE_UINTTL_V(env.sysenter_esp, X86CPU, 7), - VMSTATE_UINTTL_V(env.sysenter_eip, X86CPU, 7), -#else VMSTATE_UINTTL(env.sysenter_esp, X86CPU), VMSTATE_UINTTL(env.sysenter_eip, X86CPU), -#endif VMSTATE_UINTTL(env.cr[0], X86CPU), VMSTATE_UINTTL(env.cr[2], X86CPU), @@ -982,46 +849,45 @@ VMStateDescription vmstate_x86_cpu = { VMSTATE_UINT64(env.fmask, X86CPU), VMSTATE_UINT64(env.kernelgsbase, X86CPU), #endif - VMSTATE_UINT32_V(env.smbase, X86CPU, 4), - - VMSTATE_UINT64_V(env.pat, X86CPU, 5), - VMSTATE_UINT32_V(env.hflags2, X86CPU, 5), - - VMSTATE_UINT32_TEST(parent_obj.halted, X86CPU, version_is_5), - VMSTATE_UINT64_V(env.vm_hsave, X86CPU, 5), - VMSTATE_UINT64_V(env.vm_vmcb, X86CPU, 5), - VMSTATE_UINT64_V(env.tsc_offset, X86CPU, 5), - VMSTATE_UINT64_V(env.intercept, X86CPU, 5), - VMSTATE_UINT16_V(env.intercept_cr_read, X86CPU, 5), - VMSTATE_UINT16_V(env.intercept_cr_write, X86CPU, 5), - VMSTATE_UINT16_V(env.intercept_dr_read, X86CPU, 5), - VMSTATE_UINT16_V(env.intercept_dr_write, X86CPU, 5), - VMSTATE_UINT32_V(env.intercept_exceptions, X86CPU, 5), - VMSTATE_UINT8_V(env.v_tpr, X86CPU, 5), + VMSTATE_UINT32(env.smbase, X86CPU), + + VMSTATE_UINT64(env.pat, X86CPU), + VMSTATE_UINT32(env.hflags2, X86CPU), + + VMSTATE_UINT64(env.vm_hsave, X86CPU), + VMSTATE_UINT64(env.vm_vmcb, X86CPU), + VMSTATE_UINT64(env.tsc_offset, X86CPU), + VMSTATE_UINT64(env.intercept, X86CPU), + VMSTATE_UINT16(env.intercept_cr_read, X86CPU), + VMSTATE_UINT16(env.intercept_cr_write, X86CPU), + VMSTATE_UINT16(env.intercept_dr_read, X86CPU), + VMSTATE_UINT16(env.intercept_dr_write, X86CPU), + VMSTATE_UINT32(env.intercept_exceptions, X86CPU), + VMSTATE_UINT8(env.v_tpr, X86CPU), /* MTRRs */ - VMSTATE_UINT64_ARRAY_V(env.mtrr_fixed, X86CPU, 11, 8), - VMSTATE_UINT64_V(env.mtrr_deftype, X86CPU, 8), + VMSTATE_UINT64_ARRAY(env.mtrr_fixed, X86CPU, 11), + VMSTATE_UINT64(env.mtrr_deftype, X86CPU), VMSTATE_MTRR_VARS(env.mtrr_var, X86CPU, MSR_MTRRcap_VCNT, 8), /* KVM-related states */ - VMSTATE_INT32_V(env.interrupt_injected, X86CPU, 9), - VMSTATE_UINT32_V(env.mp_state, X86CPU, 9), - VMSTATE_UINT64_V(env.tsc, X86CPU, 9), - VMSTATE_INT32_V(env.exception_injected, X86CPU, 11), - VMSTATE_UINT8_V(env.soft_interrupt, X86CPU, 11), - VMSTATE_UINT8_V(env.nmi_injected, X86CPU, 11), - VMSTATE_UINT8_V(env.nmi_pending, X86CPU, 11), - VMSTATE_UINT8_V(env.has_error_code, X86CPU, 11), - VMSTATE_UINT32_V(env.sipi_vector, X86CPU, 11), + VMSTATE_INT32(env.interrupt_injected, X86CPU), + VMSTATE_UINT32(env.mp_state, X86CPU), + VMSTATE_UINT64(env.tsc, X86CPU), + VMSTATE_INT32(env.exception_injected, X86CPU), + VMSTATE_UINT8(env.soft_interrupt, X86CPU), + VMSTATE_UINT8(env.nmi_injected, X86CPU), + VMSTATE_UINT8(env.nmi_pending, X86CPU), + VMSTATE_UINT8(env.has_error_code, X86CPU), + VMSTATE_UINT32(env.sipi_vector, X86CPU), /* MCE */ - VMSTATE_UINT64_V(env.mcg_cap, X86CPU, 10), - VMSTATE_UINT64_V(env.mcg_status, X86CPU, 10), - VMSTATE_UINT64_V(env.mcg_ctl, X86CPU, 10), - VMSTATE_UINT64_ARRAY_V(env.mce_banks, X86CPU, MCE_BANKS_DEF * 4, 10), + VMSTATE_UINT64(env.mcg_cap, X86CPU), + VMSTATE_UINT64(env.mcg_status, X86CPU), + VMSTATE_UINT64(env.mcg_ctl, X86CPU), + VMSTATE_UINT64_ARRAY(env.mce_banks, X86CPU, MCE_BANKS_DEF * 4), /* rdtscp */ - VMSTATE_UINT64_V(env.tsc_aux, X86CPU, 11), + VMSTATE_UINT64(env.tsc_aux, X86CPU), /* KVM pvclock msr */ - VMSTATE_UINT64_V(env.system_time_msr, X86CPU, 11), - VMSTATE_UINT64_V(env.wall_clock_msr, X86CPU, 11), + VMSTATE_UINT64(env.system_time_msr, X86CPU), + VMSTATE_UINT64(env.wall_clock_msr, X86CPU), /* XSAVE related fields */ VMSTATE_UINT64_V(env.xcr0, X86CPU, 12), VMSTATE_UINT64_V(env.xstate_bv, X86CPU, 12), diff --git a/tests/Makefile.include b/tests/Makefile.include index 31931c0..16ff8f3 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -260,6 +260,7 @@ check-qtest-i386-y += tests/test-filter-mirror$(EXESUF) check-qtest-i386-y += tests/test-filter-redirector$(EXESUF) check-qtest-i386-y += tests/postcopy-test$(EXESUF) check-qtest-i386-y += tests/test-x86-cpuid-compat$(EXESUF) +check-qtest-i386-y += tests/numa-test$(EXESUF) check-qtest-x86_64-y += $(check-qtest-i386-y) gcov-files-i386-y += i386-softmmu/hw/timer/mc146818rtc.c gcov-files-x86_64-y = $(subst i386-softmmu/,x86_64-softmmu/,$(gcov-files-i386-y)) @@ -300,6 +301,7 @@ check-qtest-ppc64-y += tests/test-netfilter$(EXESUF) check-qtest-ppc64-y += tests/test-filter-mirror$(EXESUF) check-qtest-ppc64-y += tests/test-filter-redirector$(EXESUF) check-qtest-ppc64-y += tests/display-vga-test$(EXESUF) +check-qtest-ppc64-y += tests/numa-test$(EXESUF) check-qtest-ppc64-$(CONFIG_EVENTFD) += tests/ivshmem-test$(EXESUF) check-qtest-sh4-y = tests/endianness-test$(EXESUF) @@ -324,6 +326,8 @@ gcov-files-arm-y += arm-softmmu/hw/block/virtio-blk.c check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF) gcov-files-arm-y += hw/timer/arm_mptimer.c +check-qtest-aarch64-y = tests/numa-test$(EXESUF) + check-qtest-microblazeel-y = $(check-qtest-microblaze-y) check-qtest-xtensaeb-y = $(check-qtest-xtensa-y) @@ -753,6 +757,7 @@ tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o contrib/libvhost-use tests/test-uuid$(EXESUF): tests/test-uuid.o $(test-util-obj-y) tests/test-arm-mptimer$(EXESUF): tests/test-arm-mptimer.o tests/test-qapi-util$(EXESUF): tests/test-qapi-util.o $(test-util-obj-y) +tests/numa-test$(EXESUF): tests/numa-test.o tests/migration/stress$(EXESUF): tests/migration/stress.o $(call quiet-command, $(LINKPROG) -static -O3 $(PTHREAD_LIB) -o $@ $< ,"LINK","$(TARGET_DIR)$@") diff --git a/tests/acpi-test-data/pc/SLIT.cphp b/tests/acpi-test-data/pc/SLIT.cphp Binary files differnew file mode 100644 index 0000000..74ec3b4 --- /dev/null +++ b/tests/acpi-test-data/pc/SLIT.cphp diff --git a/tests/acpi-test-data/pc/SLIT.memhp b/tests/acpi-test-data/pc/SLIT.memhp Binary files differnew file mode 100644 index 0000000..74ec3b4 --- /dev/null +++ b/tests/acpi-test-data/pc/SLIT.memhp diff --git a/tests/acpi-test-data/pc/SRAT.memhp b/tests/acpi-test-data/pc/SRAT.memhp Binary files differindex 66ce9a8..a7dddf7 100644 --- a/tests/acpi-test-data/pc/SRAT.memhp +++ b/tests/acpi-test-data/pc/SRAT.memhp diff --git a/tests/acpi-test-data/q35/SLIT.cphp b/tests/acpi-test-data/q35/SLIT.cphp Binary files differnew file mode 100644 index 0000000..74ec3b4 --- /dev/null +++ b/tests/acpi-test-data/q35/SLIT.cphp diff --git a/tests/acpi-test-data/q35/SLIT.memhp b/tests/acpi-test-data/q35/SLIT.memhp Binary files differnew file mode 100644 index 0000000..74ec3b4 --- /dev/null +++ b/tests/acpi-test-data/q35/SLIT.memhp diff --git a/tests/acpi-test-data/q35/SRAT.memhp b/tests/acpi-test-data/q35/SRAT.memhp Binary files differindex 66ce9a8..a7dddf7 100644 --- a/tests/acpi-test-data/q35/SRAT.memhp +++ b/tests/acpi-test-data/q35/SRAT.memhp diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index 9c96a67..4e5c65a 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -723,7 +723,8 @@ static void test_acpi_piix4_tcg_cphp(void) data.machine = MACHINE_PC; data.variant = ".cphp"; test_acpi_one("-smp 2,cores=3,sockets=2,maxcpus=6" - " -numa node -numa node", + " -numa node -numa node" + " -numa dist,src=0,dst=1,val=21", &data); free_test_data(&data); } @@ -736,7 +737,8 @@ static void test_acpi_q35_tcg_cphp(void) data.machine = MACHINE_Q35; data.variant = ".cphp"; test_acpi_one(" -smp 2,cores=3,sockets=2,maxcpus=6" - " -numa node -numa node", + " -numa node -numa node" + " -numa dist,src=0,dst=1,val=21", &data); free_test_data(&data); } @@ -785,7 +787,10 @@ static void test_acpi_q35_tcg_memhp(void) memset(&data, 0, sizeof(data)); data.machine = MACHINE_Q35; data.variant = ".memhp"; - test_acpi_one(" -m 128,slots=3,maxmem=1G -numa node", &data); + test_acpi_one(" -m 128,slots=3,maxmem=1G" + " -numa node -numa node" + " -numa dist,src=0,dst=1,val=21", + &data); free_test_data(&data); } @@ -796,7 +801,10 @@ static void test_acpi_piix4_tcg_memhp(void) memset(&data, 0, sizeof(data)); data.machine = MACHINE_PC; data.variant = ".memhp"; - test_acpi_one(" -m 128,slots=3,maxmem=1G -numa node", &data); + test_acpi_one(" -m 128,slots=3,maxmem=1G" + " -numa node -numa node" + " -numa dist,src=0,dst=1,val=21", + &data); free_test_data(&data); } diff --git a/tests/numa-test.c b/tests/numa-test.c new file mode 100644 index 0000000..c3475d6 --- /dev/null +++ b/tests/numa-test.c @@ -0,0 +1,302 @@ +/* + * NUMA configuration test cases + * + * Copyright (c) 2017 Red Hat Inc. + * Authors: + * Igor Mammedov <imammedo@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "libqtest.h" + +static char *make_cli(const char *generic_cli, const char *test_cli) +{ + return g_strdup_printf("%s %s", generic_cli ? generic_cli : "", test_cli); +} + +static char *hmp_info_numa(void) +{ + QDict *resp; + char *s; + + resp = qmp("{ 'execute': 'human-monitor-command', 'arguments': " + "{ 'command-line': 'info numa '} }"); + g_assert(resp); + g_assert(qdict_haskey(resp, "return")); + s = g_strdup(qdict_get_str(resp, "return")); + g_assert(s); + QDECREF(resp); + return s; +} + +static void test_mon_explicit(const void *data) +{ + char *s; + char *cli; + + cli = make_cli(data, "-smp 8 " + "-numa node,nodeid=0,cpus=0-3 " + "-numa node,nodeid=1,cpus=4-7 "); + qtest_start(cli); + + s = hmp_info_numa(); + g_assert(strstr(s, "node 0 cpus: 0 1 2 3")); + g_assert(strstr(s, "node 1 cpus: 4 5 6 7")); + g_free(s); + + qtest_end(); + g_free(cli); +} + +static void test_mon_default(const void *data) +{ + char *s; + char *cli; + + cli = make_cli(data, "-smp 8 -numa node -numa node"); + qtest_start(cli); + + s = hmp_info_numa(); + g_assert(strstr(s, "node 0 cpus: 0 2 4 6")); + g_assert(strstr(s, "node 1 cpus: 1 3 5 7")); + g_free(s); + + qtest_end(); + g_free(cli); +} + +static void test_mon_partial(const void *data) +{ + char *s; + char *cli; + + cli = make_cli(data, "-smp 8 " + "-numa node,nodeid=0,cpus=0-1 " + "-numa node,nodeid=1,cpus=4-5 "); + qtest_start(cli); + + s = hmp_info_numa(); + g_assert(strstr(s, "node 0 cpus: 0 1 2 3 6 7")); + g_assert(strstr(s, "node 1 cpus: 4 5")); + g_free(s); + + qtest_end(); + g_free(cli); +} + +static QList *get_cpus(QDict **resp) +{ + *resp = qmp("{ 'execute': 'query-cpus' }"); + g_assert(*resp); + g_assert(qdict_haskey(*resp, "return")); + return qdict_get_qlist(*resp, "return"); +} + +static void test_query_cpus(const void *data) +{ + char *cli; + QDict *resp; + QList *cpus; + const QObject *e; + + cli = make_cli(data, "-smp 8 -numa node,cpus=0-3 -numa node,cpus=4-7"); + qtest_start(cli); + cpus = get_cpus(&resp); + g_assert(cpus); + + while ((e = qlist_pop(cpus))) { + QDict *cpu, *props; + int64_t cpu_idx, node; + + cpu = qobject_to_qdict(e); + g_assert(qdict_haskey(cpu, "CPU")); + g_assert(qdict_haskey(cpu, "props")); + + cpu_idx = qdict_get_int(cpu, "CPU"); + props = qdict_get_qdict(cpu, "props"); + g_assert(qdict_haskey(props, "node-id")); + node = qdict_get_int(props, "node-id"); + if (cpu_idx >= 0 && cpu_idx < 4) { + g_assert_cmpint(node, ==, 0); + } else { + g_assert_cmpint(node, ==, 1); + } + } + + QDECREF(resp); + qtest_end(); + g_free(cli); +} + +static void pc_numa_cpu(const void *data) +{ + char *cli; + QDict *resp; + QList *cpus; + const QObject *e; + + cli = make_cli(data, "-cpu pentium -smp 8,sockets=2,cores=2,threads=2 " + "-numa node,nodeid=0 -numa node,nodeid=1 " + "-numa cpu,node-id=1,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1,core-id=0 " + "-numa cpu,node-id=0,socket-id=1,core-id=1,thread-id=0 " + "-numa cpu,node-id=1,socket-id=1,core-id=1,thread-id=1"); + qtest_start(cli); + cpus = get_cpus(&resp); + g_assert(cpus); + + while ((e = qlist_pop(cpus))) { + QDict *cpu, *props; + int64_t socket, core, thread, node; + + cpu = qobject_to_qdict(e); + g_assert(qdict_haskey(cpu, "props")); + props = qdict_get_qdict(cpu, "props"); + + g_assert(qdict_haskey(props, "node-id")); + node = qdict_get_int(props, "node-id"); + g_assert(qdict_haskey(props, "socket-id")); + socket = qdict_get_int(props, "socket-id"); + g_assert(qdict_haskey(props, "core-id")); + core = qdict_get_int(props, "core-id"); + g_assert(qdict_haskey(props, "thread-id")); + thread = qdict_get_int(props, "thread-id"); + + if (socket == 0) { + g_assert_cmpint(node, ==, 1); + } else if (socket == 1 && core == 0) { + g_assert_cmpint(node, ==, 0); + } else if (socket == 1 && core == 1 && thread == 0) { + g_assert_cmpint(node, ==, 0); + } else if (socket == 1 && core == 1 && thread == 1) { + g_assert_cmpint(node, ==, 1); + } else { + g_assert(false); + } + } + + QDECREF(resp); + qtest_end(); + g_free(cli); +} + +static void spapr_numa_cpu(const void *data) +{ + char *cli; + QDict *resp; + QList *cpus; + const QObject *e; + + cli = make_cli(data, "-smp 4,cores=4 " + "-numa node,nodeid=0 -numa node,nodeid=1 " + "-numa cpu,node-id=0,core-id=0 " + "-numa cpu,node-id=0,core-id=1 " + "-numa cpu,node-id=0,core-id=2 " + "-numa cpu,node-id=1,core-id=3"); + qtest_start(cli); + cpus = get_cpus(&resp); + g_assert(cpus); + + while ((e = qlist_pop(cpus))) { + QDict *cpu, *props; + int64_t core, node; + + cpu = qobject_to_qdict(e); + g_assert(qdict_haskey(cpu, "props")); + props = qdict_get_qdict(cpu, "props"); + + g_assert(qdict_haskey(props, "node-id")); + node = qdict_get_int(props, "node-id"); + g_assert(qdict_haskey(props, "core-id")); + core = qdict_get_int(props, "core-id"); + + if (core >= 0 && core < 3) { + g_assert_cmpint(node, ==, 0); + } else if (core == 3) { + g_assert_cmpint(node, ==, 1); + } else { + g_assert(false); + } + } + + QDECREF(resp); + qtest_end(); + g_free(cli); +} + +static void aarch64_numa_cpu(const void *data) +{ + char *cli; + QDict *resp; + QList *cpus; + const QObject *e; + + cli = make_cli(data, "-smp 2 " + "-numa node,nodeid=0 -numa node,nodeid=1 " + "-numa cpu,node-id=1,thread-id=0 " + "-numa cpu,node-id=0,thread-id=1"); + qtest_start(cli); + cpus = get_cpus(&resp); + g_assert(cpus); + + while ((e = qlist_pop(cpus))) { + QDict *cpu, *props; + int64_t thread, node; + + cpu = qobject_to_qdict(e); + g_assert(qdict_haskey(cpu, "props")); + props = qdict_get_qdict(cpu, "props"); + + g_assert(qdict_haskey(props, "node-id")); + node = qdict_get_int(props, "node-id"); + g_assert(qdict_haskey(props, "thread-id")); + thread = qdict_get_int(props, "thread-id"); + + if (thread == 0) { + g_assert_cmpint(node, ==, 1); + } else if (thread == 1) { + g_assert_cmpint(node, ==, 0); + } else { + g_assert(false); + } + } + + QDECREF(resp); + qtest_end(); + g_free(cli); +} + +int main(int argc, char **argv) +{ + const char *args = NULL; + const char *arch = qtest_get_arch(); + + if (strcmp(arch, "aarch64") == 0) { + args = "-machine virt"; + } + + g_test_init(&argc, &argv, NULL); + + qtest_add_data_func("/numa/mon/default", args, test_mon_default); + qtest_add_data_func("/numa/mon/cpus/explicit", args, test_mon_explicit); + qtest_add_data_func("/numa/mon/cpus/partial", args, test_mon_partial); + qtest_add_data_func("/numa/qmp/cpus/query-cpus", args, test_query_cpus); + + if (!strcmp(arch, "i386") || !strcmp(arch, "x86_64")) { + qtest_add_data_func("/numa/pc/cpu/explicit", args, pc_numa_cpu); + } + + if (!strcmp(arch, "ppc64")) { + qtest_add_data_func("/numa/spapr/cpu/explicit", args, spapr_numa_cpu); + } + + if (!strcmp(arch, "aarch64")) { + qtest_add_data_func("/numa/aarch64/cpu/explicit", args, + aarch64_numa_cpu); + } + + return g_test_run(); +} @@ -4504,7 +4504,7 @@ int main(int argc, char **argv, char **envp) default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS); default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS); - parse_numa_opts(machine_class); + parse_numa_opts(current_machine); if (qemu_opts_foreach(qemu_find_opts("mon"), mon_init_func, NULL, NULL)) { @@ -4560,7 +4560,7 @@ int main(int argc, char **argv, char **envp) current_machine->boot_order = boot_order; current_machine->cpu_model = cpu_model; - machine_class->init(current_machine); + machine_run_board_init(current_machine); realtime_init(); @@ -4593,8 +4593,6 @@ int main(int argc, char **argv, char **envp) cpu_synchronize_all_post_init(); - numa_post_machine_init(); - rom_reset_order_override(); /* |