From f0638a0b6bba455e8eaf518f23487d6ff1f59b5a Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 11 Sep 2020 01:31:23 -0300 Subject: spapr: Handle HPT allocation failure in nested guest The nested KVM code does not yet support HPT guests. Calling the KVM_CAP_PPC_ALLOC_HTAB ioctl currently leads to KVM setting the guest as HPT and erroneously executing code in L1 that should only run in hypervisor mode, leading to an exception in the L1 vcpu thread when it enters the nested guest. This can be reproduced with -machine max-cpu-compat=power8 in the L2 guest command line. The KVM code has since been modified to fail the ioctl when running in a nested environment so QEMU needs to be able to handle that. This patch provides an error message informing the user about the lack of support for HPT in nested guests. Reported-by: Satheesh Rajendran Signed-off-by: Fabiano Rosas Message-Id: <20200911043123.204162-1-farosas@linux.ibm.com> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 2db810f..544a194 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1483,6 +1483,12 @@ void spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, spapr_free_hpt(spapr); rc = kvmppc_reset_htab(shift); + + if (rc == -EOPNOTSUPP) { + error_setg(errp, "HPT not supported in nested guests"); + return; + } + if (rc < 0) { /* kernel-side HPT needed, but couldn't allocate one */ error_setg_errno(errp, errno, -- cgit v1.1 From 9c4d1497e8d44a0045d04533bb822d453639c944 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:34:51 +0200 Subject: spapr: Fix error leak in spapr_realize_vcpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If spapr_irq_cpu_intc_create() fails, local_err isn't propagated and thus leaked. 
Fixes: 992861fb1e4c ("error: Eliminate error_propagate() manually") Cc: armbru@redhat.com Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-2-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr_cpu_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 2125fda..3e4f402 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -232,7 +232,6 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, { CPUPPCState *env = &cpu->env; CPUState *cs = CPU(cpu); - Error *local_err = NULL; if (!qdev_realize(DEVICE(cpu), NULL, errp)) { return; @@ -244,7 +243,7 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr)); kvmppc_set_papr(cpu); - if (spapr_irq_cpu_intc_create(spapr, cpu, &local_err) < 0) { + if (spapr_irq_cpu_intc_create(spapr, cpu, errp) < 0) { cpu_remove_sync(CPU(cpu)); return; } -- cgit v1.1 From a3114923d4c5f0a2ccc8b4d74b3eebad3fb6ce5d Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:34:54 +0200 Subject: spapr: Simplify error handling in callers of ppc_set_compat() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that ppc_set_compat() indicates success/failure with a return value, use it and reduce error propagation overhead. 
Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-5-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 544a194..0f82e65 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3817,10 +3817,9 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, */ if (hotplugged) { for (i = 0; i < cc->nr_threads; i++) { - ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr, - &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (ppc_set_compat(core->threads[i], + POWERPC_CPU(first_cpu)->compat_pvr, + errp) < 0) { return; } } -- cgit v1.1 From 121afbe487b7b10b9fc683be16068368e1ad0f11 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:34:55 +0200 Subject: spapr: Get rid of cas_check_pvr() error reporting The cas_check_pvr() function has two purposes: - finding the "best" logical PVR, i.e. the most recent one supported by the guest for this CPU type - checking if the guest supports the real PVR of this CPU type, which is just optional extra information to work around the lack of support for "compat" mode in PR KVM This logic doesn't need error reporting, really. If we don't find a suitable logical PVR, we return the special value 0 which is definitely not a valid PVR. Let the caller decide on whether it should error out or not. This doesn't change the behavior. 
Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-6-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index c2776b6..885ea60 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1590,12 +1590,11 @@ static target_ulong h_signal_sys_reset(PowerPCCPU *cpu, } } -static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, - target_ulong *addr, bool *raw_mode_supported, - Error **errp) +/* Returns either a logical PVR or zero if none was found */ +static uint32_t cas_check_pvr(PowerPCCPU *cpu, uint32_t max_compat, + target_ulong *addr, bool *raw_mode_supported) { bool explicit_match = false; /* Matched the CPU's real PVR */ - uint32_t max_compat = spapr->max_compat_pvr; uint32_t best_compat = 0; int i; @@ -1624,14 +1623,6 @@ static uint32_t cas_check_pvr(SpaprMachineState *spapr, PowerPCCPU *cpu, } } - if ((best_compat == 0) && (!explicit_match || max_compat)) { - /* We couldn't find a suitable compatibility mode, and either - * the guest doesn't support "raw" mode for this CPU, or raw - * mode is disabled because a maximum compat mode is set */ - error_setg(errp, "Couldn't negotiate a suitable PVR during CAS"); - return 0; - } - *raw_mode_supported = explicit_match; /* Parsing finished */ @@ -1680,6 +1671,7 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, bool guest_xive; CPUState *cs; void *fdt; + uint32_t max_compat = spapr->max_compat_pvr; /* CAS is supposed to be called early when only the boot vCPU is active. 
*/ CPU_FOREACH(cs) { @@ -1692,9 +1684,14 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, } } - cas_pvr = cas_check_pvr(spapr, cpu, &vec, &raw_mode_supported, &local_err); - if (local_err) { - error_report_err(local_err); + cas_pvr = cas_check_pvr(cpu, max_compat, &vec, &raw_mode_supported); + if (!cas_pvr && (!raw_mode_supported || max_compat)) { + /* + * We couldn't find a suitable compatibility mode, and either + * the guest doesn't support "raw" mode for this CPU, or "raw" + * mode is disabled because a maximum compat mode is set. + */ + error_report("Couldn't negotiate a suitable PVR during CAS"); return H_HARDWARE; } -- cgit v1.1 From 7e92da81be377a604f4ace7551ce61dd51afbbaa Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:34:56 +0200 Subject: spapr: Simplify error handling in do_client_architecture_support() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the return value of ppc_set_compat_all() to check failures, which is preferred over hijacking local_err. 
Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-7-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 885ea60..6077401 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1666,7 +1666,6 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, uint32_t cas_pvr; SpaprOptionVector *ov1_guest, *ov5_guest; bool guest_radix; - Error *local_err = NULL; bool raw_mode_supported = false; bool guest_xive; CPUState *cs; @@ -1697,8 +1696,9 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, /* Update CPUs */ if (cpu->compat_pvr != cas_pvr) { - ppc_set_compat_all(cas_pvr, &local_err); - if (local_err) { + Error *local_err = NULL; + + if (ppc_set_compat_all(cas_pvr, &local_err) < 0) { /* We fail to set compat mode (likely because running with KVM PR), * but maybe we can fallback to raw mode if the guest supports it. */ @@ -1707,7 +1707,6 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, return H_HARDWARE; } error_free(local_err); - local_err = NULL; } } -- cgit v1.1 From a9c2cdace0a9f42d4a2b1b230baab96819b79641 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:34:57 +0200 Subject: spapr: Simplify error handling in spapr_vio_busdev_realize() Use the return value of spapr_irq_findone() and spapr_irq_claim() to detect failures. This allows to reduce the error propagation overhead. 
Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-8-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: David Gibson --- hw/ppc/spapr_vio.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c index 731080d..44fdd64 100644 --- a/hw/ppc/spapr_vio.c +++ b/hw/ppc/spapr_vio.c @@ -474,7 +474,6 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp) SpaprVioDevice *dev = (SpaprVioDevice *)qdev; SpaprVioDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev); char *id; - Error *local_err = NULL; if (dev->reg != -1) { /* @@ -510,16 +509,15 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp) dev->irq = spapr_vio_reg_to_irq(dev->reg); if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) { - dev->irq = spapr_irq_findone(spapr, &local_err); - if (local_err) { - error_propagate(errp, local_err); + int irq = spapr_irq_findone(spapr, errp); + + if (irq < 0) { return; } + dev->irq = irq; } - spapr_irq_claim(spapr, dev->irq, false, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (spapr_irq_claim(spapr, dev->irq, false, errp) < 0) { return; } -- cgit v1.1 From 17548fe64a68e93deb25ac1d82f1585f916b59b1 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:34:58 +0200 Subject: spapr: Add a return value to spapr_drc_attach() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As recommended in "qapi/error.h", return true on success and false on failure. This allows to reduce error propagation overhead in the callers. 
Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-9-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr.c | 15 +++------------ hw/ppc/spapr_drc.c | 5 +++-- hw/ppc/spapr_nvdimm.c | 5 +---- hw/ppc/spapr_pci.c | 5 +---- 4 files changed, 8 insertions(+), 22 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 0f82e65..26b3432 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3371,22 +3371,19 @@ static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size, int i; uint64_t addr = addr_start; bool hotplugged = spapr_drc_hotplugged(dev); - Error *local_err = NULL; for (i = 0; i < nr_lmbs; i++) { drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / SPAPR_MEMORY_BLOCK_SIZE); g_assert(drc); - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { + if (!spapr_drc_attach(drc, dev, errp)) { while (addr > addr_start) { addr -= SPAPR_MEMORY_BLOCK_SIZE; drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / SPAPR_MEMORY_BLOCK_SIZE); spapr_drc_detach(drc); } - error_propagate(errp, local_err); return; } if (!hotplugged) { @@ -3767,7 +3764,6 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, CPUCore *cc = CPU_CORE(dev); CPUState *cs; SpaprDrc *drc; - Error *local_err = NULL; CPUArchId *core_slot; int index; bool hotplugged = spapr_drc_hotplugged(dev); @@ -3785,9 +3781,7 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev, g_assert(drc || !mc->has_hotpluggable_cpus); if (drc) { - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, dev, errp)) { return; } @@ -3939,7 +3933,6 @@ static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev, SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev); SpaprDrc *drc; bool hotplugged = spapr_drc_hotplugged(dev); - Error *local_err = NULL; if (!smc->dr_phb_enabled) { return; @@ 
-3949,9 +3942,7 @@ static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev, /* hotplug hooks should check it's enabled before getting this far */ assert(drc); - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, dev, errp)) { return; } diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index fe998d8..04a6bf1 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -371,13 +371,13 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, } while (fdt_depth != 0); } -void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp) +bool spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp) { trace_spapr_drc_attach(spapr_drc_index(drc)); if (drc->dev) { error_setg(errp, "an attached device is still awaiting release"); - return; + return false; } g_assert((drc->state == SPAPR_DRC_STATE_LOGICAL_UNUSABLE) || (drc->state == SPAPR_DRC_STATE_PHYSICAL_POWERON)); @@ -388,6 +388,7 @@ void spapr_drc_attach(SpaprDrc *drc, DeviceState *d, Error **errp) object_get_typename(OBJECT(drc->dev)), (Object **)(&drc->dev), NULL, 0); + return true; } static void spapr_drc_release(SpaprDrc *drc) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 6387205..c06f903 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -91,14 +91,11 @@ void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp) { SpaprDrc *drc; bool hotplugged = spapr_drc_hotplugged(dev); - Error *local_err = NULL; drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot); g_assert(drc); - spapr_drc_attach(drc, dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, dev, errp)) { return; } diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 5db912b..3999392 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1539,7 +1539,6 @@ static void spapr_pci_plug(HotplugHandler *plug_handler, PCIDevice *pdev = PCI_DEVICE(plugged_dev); 
PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(plugged_dev); SpaprDrc *drc = drc_from_dev(phb, pdev); - Error *local_err = NULL; PCIBus *bus = PCI_BUS(qdev_get_parent_bus(DEVICE(pdev))); uint32_t slotnr = PCI_SLOT(pdev->devfn); @@ -1578,9 +1577,7 @@ static void spapr_pci_plug(HotplugHandler *plug_handler, return; } - spapr_drc_attach(drc, DEVICE(pdev), &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_drc_attach(drc, DEVICE(pdev), errp)) { return; } -- cgit v1.1 From ebd226d221c4ef8192fc5b148e0aece07bd302d1 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:34:59 +0200 Subject: spapr: Simplify error handling in prop_get_fdt() Use the return value of visit_check_struct() and visit_check_list() for error checking instead of local_err. This allows to get rid of the error propagation overhead. Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-10-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: David Gibson --- hw/ppc/spapr_drc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 04a6bf1..697b28c 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -302,7 +302,6 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, { SpaprDrc *drc = SPAPR_DR_CONNECTOR(obj); QNull *null = NULL; - Error *err = NULL; int fdt_offset_next, fdt_offset, fdt_depth; void *fdt; @@ -321,6 +320,7 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, const struct fdt_property *prop = NULL; int prop_len = 0, name_len = 0; uint32_t tag; + bool ok; tag = fdt_next_tag(fdt, fdt_offset, &fdt_offset_next); switch (tag) { @@ -334,10 +334,9 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, case FDT_END_NODE: /* shouldn't ever see an FDT_END_NODE before FDT_BEGIN_NODE */ g_assert(fdt_depth > 0); - visit_check_struct(v, &err); + ok = visit_check_struct(v, errp); visit_end_struct(v, 
NULL); - if (err) { - error_propagate(errp, err); + if (!ok) { return; } fdt_depth--; @@ -355,10 +354,9 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, return; } } - visit_check_list(v, &err); + ok = visit_check_list(v, errp); visit_end_list(v, NULL); - if (err) { - error_propagate(errp, err); + if (!ok) { return; } break; -- cgit v1.1 From cfdc52747390af88086094e51ddc7f8fbeea330e Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:35:00 +0200 Subject: spapr: Add a return value to spapr_set_vcpu_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As recommended in "qapi/error.h", return true on success and false on failure. This allows to reduce error propagation overhead in the callers. Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-11-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr.c | 5 +++-- hw/ppc/spapr_cpu_core.c | 5 +---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 26b3432..c6af456 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -4286,7 +4286,7 @@ int spapr_get_vcpu_id(PowerPCCPU *cpu) return cpu->vcpu_id; } -void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp) +bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp) { SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); MachineState *ms = MACHINE(spapr); @@ -4299,10 +4299,11 @@ void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp) error_append_hint(errp, "Adjust the number of cpus to %d " "or try to raise the number of threads per core\n", vcpu_id * ms->smp.threads / spapr->vsmt); - return; + return false; } cpu->vcpu_id = vcpu_id; + return true; } PowerPCCPU *spapr_find_cpu(int vcpu_id) diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 3e4f402..0c879d4 100644 --- 
a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -262,7 +262,6 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) char *id; CPUState *cs; PowerPCCPU *cpu; - Error *local_err = NULL; obj = object_new(scc->cpu_type); @@ -274,8 +273,7 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) */ cs->start_powered_off = true; cs->cpu_index = cc->core_id + i; - spapr_set_vcpu_id(cpu, cs->cpu_index, &local_err); - if (local_err) { + if (!spapr_set_vcpu_id(cpu, cs->cpu_index, errp)) { goto err; } @@ -292,7 +290,6 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) err: object_unref(obj); - error_propagate(errp, local_err); return NULL; } -- cgit v1.1 From a5af92e2e9377f753d3df6b2e050b3db6f64fb7d Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:35:01 +0200 Subject: spapr: Simplify error handling in spapr_cpu_core_realize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As recommended in "qapi/error.h", add a bool return value to spapr_realize_vcpu() and use it in spapr_cpu_core_realize() in order to get rid of the error propagation overhead. 
Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-12-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr_cpu_core.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 0c879d4..b036208 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -227,14 +227,14 @@ static void spapr_cpu_core_unrealize(DeviceState *dev) g_free(sc->threads); } -static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, +static bool spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, SpaprCpuCore *sc, Error **errp) { CPUPPCState *env = &cpu->env; CPUState *cs = CPU(cpu); if (!qdev_realize(DEVICE(cpu), NULL, errp)) { - return; + return false; } /* Set time-base frequency to 512 MHz */ @@ -245,13 +245,14 @@ static void spapr_realize_vcpu(PowerPCCPU *cpu, SpaprMachineState *spapr, if (spapr_irq_cpu_intc_create(spapr, cpu, errp) < 0) { cpu_remove_sync(CPU(cpu)); - return; + return false; } if (!sc->pre_3_0_migration) { vmstate_register(NULL, cs->cpu_index, &vmstate_spapr_cpu_state, cpu->machine_data); } + return true; } static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) @@ -312,7 +313,6 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) TYPE_SPAPR_MACHINE); SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev)); CPUCore *cc = CPU_CORE(OBJECT(dev)); - Error *local_err = NULL; int i, j; if (!spapr) { @@ -322,15 +322,14 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp) sc->threads = g_new(PowerPCCPU *, cc->nr_threads); for (i = 0; i < cc->nr_threads; i++) { - sc->threads[i] = spapr_create_vcpu(sc, i, &local_err); - if (local_err) { + sc->threads[i] = spapr_create_vcpu(sc, i, errp); + if (!sc->threads[i]) { goto err; } } for (j = 0; j < cc->nr_threads; j++) { - spapr_realize_vcpu(sc->threads[j], spapr, sc, 
&local_err); - if (local_err) { + if (!spapr_realize_vcpu(sc->threads[j], spapr, sc, errp)) { goto err_unrealize; } } @@ -347,7 +346,6 @@ err: spapr_delete_vcpu(sc->threads[i], sc); } g_free(sc->threads); - error_propagate(errp, local_err); } static Property spapr_cpu_core_properties[] = { -- cgit v1.1 From 451c6905899da0cdcd23bffef93504f93fd48d5e Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:35:02 +0200 Subject: spapr: Add a return value to spapr_nvdimm_validate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As recommended in "qapi/error.h", return true on success and false on failure. This allows to reduce error propagation overhead in the callers. Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-13-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr.c | 4 +--- hw/ppc/spapr_nvdimm.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index c6af456..7f3a620 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3478,9 +3478,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, } if (is_nvdimm) { - spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) { return; } } else if (size % SPAPR_MEMORY_BLOCK_SIZE) { diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index c06f903..b3a489e 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -33,7 +33,7 @@ #include "sysemu/sysemu.h" #include "hw/ppc/spapr_numa.h" -void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, +bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { const MachineClass *mc = 
MACHINE_GET_CLASS(hotplug_dev); @@ -45,7 +45,7 @@ void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, if (!mc->nvdimm_supported) { error_setg(errp, "NVDIMM hotplug not supported for this machine"); - return; + return false; } /* @@ -59,20 +59,20 @@ void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, */ if (!ms->nvdimms_state->is_enabled && nvdimm_opt) { error_setg(errp, "nvdimm device found but 'nvdimm=off' was set"); - return; + return false; } if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP, &error_abort) == 0) { error_setg(errp, "PAPR requires NVDIMM devices to have label-size set"); - return; + return false; } if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) { error_setg(errp, "PAPR requires NVDIMM memory size (excluding label)" " to be a multiple of %" PRIu64 "MB", SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB); - return; + return false; } uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP, @@ -82,8 +82,10 @@ void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, if (qemu_uuid_is_null(&uuid)) { error_setg(errp, "NVDIMM device requires the uuid to be set"); - return; + return false; } + + return true; } -- cgit v1.1 From 35dce34fbc1cfa6a26f95b83f3a8949a4150412f Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 14 Sep 2020 14:35:03 +0200 Subject: spapr: Add a return value to spapr_check_pagesize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As recommended in "qapi/error.h", return true on success and false on failure. This allows to reduce error propagation overhead in the callers. 
Signed-off-by: Greg Kurz Message-Id: <20200914123505.612812-14-groug@kaod.org> Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: David Gibson --- hw/ppc/spapr.c | 4 +--- hw/ppc/spapr_caps.c | 7 +++++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 7f3a620..4256794 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3490,9 +3490,7 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP, &error_abort); pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev)); - spapr_check_pagesize(spapr, pagesize, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (!spapr_check_pagesize(spapr, pagesize, errp)) { return; } diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index 10a80a8..9341e97 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -310,13 +310,13 @@ static void cap_safe_indirect_branch_apply(SpaprMachineState *spapr, #define VALUE_DESC_TRISTATE " (broken, workaround, fixed)" -void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, +bool spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, Error **errp) { hwaddr maxpagesize = (1ULL << spapr->eff.caps[SPAPR_CAP_HPT_MAXPAGESIZE]); if (!kvmppc_hpt_needs_host_contiguous_pages()) { - return; + return true; } if (maxpagesize > pagesize) { @@ -324,7 +324,10 @@ void spapr_check_pagesize(SpaprMachineState *spapr, hwaddr pagesize, "Can't support %"HWADDR_PRIu" kiB guest pages with %" HWADDR_PRIu" kiB host pages with this KVM implementation", maxpagesize >> 10, pagesize >> 10); + return false; } + + return true; } static void cap_hpt_maxpagesize_apply(SpaprMachineState *spapr, -- cgit v1.1 From 83fa6e2a9fb4359208ed1a2ac74589ef0271627a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Fri, 2 Oct 2020 11:14:40 +0200 Subject: ppc/pnv: 
Increase max firmware size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds enabling GCOV can be bigger than 4MB and the limit on FSP systems is 16MB. Signed-off-by: Cédric Le Goater Message-Id: <20201002091440.1349326-1-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/pnv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 6670967..d9e5287 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -61,7 +61,7 @@ #define FW_FILE_NAME "skiboot.lid" #define FW_LOAD_ADDR 0x0 -#define FW_MAX_SIZE (4 * MiB) +#define FW_MAX_SIZE (16 * MiB) #define KERNEL_LOAD_ADDR 0x20000000 #define KERNEL_MAX_SIZE (256 * MiB) -- cgit v1.1 From 29bfe52a5229bd457d85e1033dbfd91fe441dcf3 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Wed, 7 Oct 2020 14:28:45 -0300 Subject: spapr: add spapr_machine_using_legacy_numa() helper The changes to come to NUMA support are all guest visible. In theory we could just create a new 5_1 class option flag to avoid the changes to cascade to 5.1 and under. The reality is that these changes are only relevant if the machine has more than one NUMA node. There is no need to change guest behavior that has been around for years needlessly. This new helper will be used by the next patches to determine whether we should retain the (soon to be) legacy NUMA behavior in the pSeries machine. The new behavior will only be exposed if: - machine is pseries-5.2 and newer; - more than one NUMA node is declared in NUMA state. 
Reviewed-by: Greg Kurz Reviewed-by: David Gibson Signed-off-by: Daniel Henrique Barboza Message-Id: <20201007172849.302240-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 4256794..63315f2 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -294,6 +294,15 @@ static hwaddr spapr_node0_size(MachineState *machine) return machine->ram_size; } +bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr) +{ + MachineState *machine = MACHINE(spapr); + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); + + return smc->pre_5_2_numa_associativity || + machine->numa_state->num_nodes <= 1; +} + static void add_str(GString *s, const gchar *s1) { g_string_append_len(s, s1, strlen(s1) + 1); @@ -4519,8 +4528,11 @@ DEFINE_SPAPR_MACHINE(5_2, "5.2", true); */ static void spapr_machine_5_1_class_options(MachineClass *mc) { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + spapr_machine_5_2_class_options(mc); compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len); + smc->pre_5_2_numa_associativity = true; } DEFINE_SPAPR_MACHINE(5_1, "5.1", false); -- cgit v1.1 From ee6635b227491e7d487ecd868e0dbfbb0c444217 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Wed, 7 Oct 2020 14:28:46 -0300 Subject: spapr_numa: forbid asymmetrical NUMA setups The pSeries machine does not support asymmetrical NUMA configurations. This doesn't make much of a difference since we're not using user input for pSeries NUMA setup, but this will change in the next patches. To avoid breaking existing setups, gate this change by checking for legacy NUMA support. 
Reviewed-by: Greg Kurz Reviewed-by: David Gibson Signed-off-by: Daniel Henrique Barboza Message-Id: <20201007172849.302240-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 64fe567..fe395e8 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -19,6 +19,24 @@ /* Moved from hw/ppc/spapr_pci_nvlink2.c */ #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) +static bool spapr_numa_is_symmetrical(MachineState *ms) +{ + int src, dst; + int nb_numa_nodes = ms->numa_state->num_nodes; + NodeInfo *numa_info = ms->numa_state->nodes; + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = src; dst < nb_numa_nodes; dst++) { + if (numa_info[src].distance[dst] != + numa_info[dst].distance[src]) { + return false; + } + } + } + + return true; +} + void spapr_numa_associativity_init(SpaprMachineState *spapr, MachineState *machine) { @@ -61,6 +79,22 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); } + + /* + * Legacy NUMA guests (pseries-5.1 and older, or guests with only + * 1 NUMA node) will not benefit from anything we're going to do + * after this point. + */ + if (spapr_machine_using_legacy_numa(spapr)) { + return; + } + + if (!spapr_numa_is_symmetrical(machine)) { + error_report("Asymmetrical NUMA topologies aren't supported " + "in the pSeries machine"); + exit(EXIT_FAILURE); + } + } void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, -- cgit v1.1 From 491e884e3666e0af6a1eef06df496611097a060e Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Wed, 7 Oct 2020 14:28:47 -0300 Subject: spapr_numa: change reference-points and maxdomain settings This is the first guest visible change introduced in spapr_numa.c. 
The previous settings of both reference-points and maxdomains were too restrictive, but enough for the existing associativity we're setting in the resources. We'll change that in the following patches, populating the associativity arrays based on user input. For those changes to be effective, reference-points and maxdomains must be more flexible. After this patch, we'll have 4 distinct levels of NUMA (0x4, 0x3, 0x2, 0x1) and maxdomains will allow for any type of configuration the user intends to do - under the scope and limitations of PAPR itself, of course. Reviewed-by: Greg Kurz Reviewed-by: David Gibson Signed-off-by: Daniel Henrique Barboza Message-Id: <20201007172849.302240-4-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index fe395e8..16badb1 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -178,24 +178,51 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, */ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) { + MachineState *ms = MACHINE(spapr); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); uint32_t refpoints[] = { cpu_to_be32(0x4), - cpu_to_be32(0x4), + cpu_to_be32(0x3), cpu_to_be32(0x2), + cpu_to_be32(0x1), }; uint32_t nr_refpoints = ARRAY_SIZE(refpoints); - uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 
1 : 0); + uint32_t maxdomain = ms->numa_state->num_nodes + spapr->gpu_numa_id; uint32_t maxdomains[] = { cpu_to_be32(4), - maxdomain, - maxdomain, - maxdomain, - cpu_to_be32(spapr->gpu_numa_id), + cpu_to_be32(maxdomain), + cpu_to_be32(maxdomain), + cpu_to_be32(maxdomain), + cpu_to_be32(maxdomain) }; - if (smc->pre_5_1_assoc_refpoints) { - nr_refpoints = 2; + if (spapr_machine_using_legacy_numa(spapr)) { + uint32_t legacy_refpoints[] = { + cpu_to_be32(0x4), + cpu_to_be32(0x4), + cpu_to_be32(0x2), + }; + uint32_t legacy_maxdomain = spapr->gpu_numa_id > 1 ? 1 : 0; + uint32_t legacy_maxdomains[] = { + cpu_to_be32(4), + cpu_to_be32(legacy_maxdomain), + cpu_to_be32(legacy_maxdomain), + cpu_to_be32(legacy_maxdomain), + cpu_to_be32(spapr->gpu_numa_id), + }; + + G_STATIC_ASSERT(sizeof(legacy_refpoints) <= sizeof(refpoints)); + G_STATIC_ASSERT(sizeof(legacy_maxdomains) <= sizeof(maxdomains)); + + nr_refpoints = 3; + + memcpy(refpoints, legacy_refpoints, sizeof(legacy_refpoints)); + memcpy(maxdomains, legacy_maxdomains, sizeof(legacy_maxdomains)); + + /* pseries-5.0 and older reference-points array is {0x4, 0x4} */ + if (smc->pre_5_1_assoc_refpoints) { + nr_refpoints = 2; + } } _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", -- cgit v1.1 From 690fbe4295d5f1eec7c0862797abd7626a965e59 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Wed, 7 Oct 2020 14:28:48 -0300 Subject: spapr_numa: consider user input when defining associativity A new function called spapr_numa_define_associativity_domains() is created to calculate the associativity domains and change the associativity arrays considering user input. This is how the associativity domain between two NUMA nodes A and B is calculated: - get the distance D between them - get the correspondent NUMA level 'n_level' for D. 
This is done via a helper called spapr_numa_get_numa_level() - all associativity arrays were initialized with their own numa_ids, and we're calculating the distance in node_id ascending order, starting from node id 0 (the first node retrieved by numa_state). This will have a cascade effect in the algorithm because the associativity domains that node 0 defines will be carried over to other nodes, and node 1 associativities will be carried over after taking node 0 associativities into account, and so on. This happens because we'll assign assoc_src as the associativity domain of dst as well, for all NUMA levels beyond and including n_level. The PPC kernel expects the associativity domains of the first node (node id 0) to be always 0 [1], and this algorithm will grant that by default. Ultimately, all of this results in a best effort approximation for the actual NUMA distances the user input in the command line. Given the nature of how PAPR itself interprets NUMA distances versus the expectations raised by how ACPI SLIT works, there might be better algorithms but, in the end, it'll also result in another way to approximate what the user really wanted. To keep this commit message no longer than it already is, the next patch will update the existing documentation in ppc-spapr-numa.rst with more in depth details and design considerations/drawbacks.
[1] https://lore.kernel.org/linuxppc-dev/5e8fbea3-8faf-0951-172a-b41a2138fbcf@gmail.com/ Signed-off-by: Daniel Henrique Barboza Message-Id: <20201007172849.302240-5-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 16badb1..b50796b 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -37,12 +37,108 @@ static bool spapr_numa_is_symmetrical(MachineState *ms) return true; } +/* + * This function will translate the user distances into + * what the kernel understand as possible values: 10 + * (local distance), 20, 40, 80 and 160, and return the equivalent + * NUMA level for each. Current heuristic is: + * - local distance (10) returns numa_level = 0x4, meaning there is + * no rounding for local distance + * - distances between 11 and 30 inclusive -> rounded to 20, + * numa_level = 0x3 + * - distances between 31 and 60 inclusive -> rounded to 40, + * numa_level = 0x2 + * - distances between 61 and 120 inclusive -> rounded to 80, + * numa_level = 0x1 + * - everything above 120 returns numa_level = 0 to indicate that + * there is no match. 
This will be calculated as distance = 160 + * by the kernel (as of v5.9) + */ +static uint8_t spapr_numa_get_numa_level(uint8_t distance) +{ + if (distance == 10) { + return 0x4; + } else if (distance > 11 && distance <= 30) { + return 0x3; + } else if (distance > 31 && distance <= 60) { + return 0x2; + } else if (distance > 61 && distance <= 120) { + return 0x1; + } + + return 0; +} + +static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr) +{ + MachineState *ms = MACHINE(spapr); + NodeInfo *numa_info = ms->numa_state->nodes; + int nb_numa_nodes = ms->numa_state->num_nodes; + int src, dst, i; + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = src; dst < nb_numa_nodes; dst++) { + /* + * This is how the associativity domain between A and B + * is calculated: + * + * - get the distance D between them + * - get the correspondent NUMA level 'n_level' for D + * - all associativity arrays were initialized with their own + * numa_ids, and we're calculating the distance in node_id + * ascending order, starting from node id 0 (the first node + * retrieved by numa_state). This will have a cascade effect in + * the algorithm because the associativity domains that node 0 + * defines will be carried over to other nodes, and node 1 + * associativities will be carried over after taking node 0 + * associativities into account, and so on. This happens because + * we'll assign assoc_src as the associativity domain of dst + * as well, for all NUMA levels beyond and including n_level. + * + * The PPC kernel expects the associativity domains of node 0 to + * be always 0, and this algorithm will grant that by default. + */ + uint8_t distance = numa_info[src].distance[dst]; + uint8_t n_level = spapr_numa_get_numa_level(distance); + uint32_t assoc_src; + + /* + * n_level = 0 means that the distance is greater than our last + * rounded value (120). In this case there is no NUMA level match + * between src and dst and we can skip the remainder of the loop.
+ * + * The Linux kernel will assume that the distance between src and + * dst, in this case of no match, is 10 (local distance) doubled + * for each NUMA it didn't match. We have MAX_DISTANCE_REF_POINTS + * levels (4), so this gives us 10*2*2*2*2 = 160. + * + * This logic can be seen in the Linux kernel source code, as of + * v5.9, in arch/powerpc/mm/numa.c, function __node_distance(). + */ + if (n_level == 0) { + continue; + } + + /* + * We must assign all assoc_src to dst, starting from n_level + * and going up to 0x1. + */ + for (i = n_level; i > 0; i--) { + assoc_src = spapr->numa_assoc_array[src][i]; + spapr->numa_assoc_array[dst][i] = assoc_src; + } + } + } + +} + void spapr_numa_associativity_init(SpaprMachineState *spapr, MachineState *machine) { SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; int i, j, max_nodes_with_gpus; + bool using_legacy_numa = spapr_machine_using_legacy_numa(spapr); /* * For all associativity arrays: first position is the size, @@ -56,6 +152,17 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, for (i = 0; i < nb_numa_nodes; i++) { spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + + /* + * Fill all associativity domains of non-zero NUMA nodes with + * node_id. This is required because the default value (0) is + * considered a match with associativity domains of node 0. + */ + if (!using_legacy_numa && i != 0) { + for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + spapr->numa_assoc_array[i][j] = cpu_to_be32(i); + } + } } /* @@ -85,7 +192,7 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, * 1 NUMA node) will not benefit from anything we're going to do * after this point. 
*/ - if (spapr_machine_using_legacy_numa(spapr)) { + if (using_legacy_numa) { return; } @@ -95,6 +202,7 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, exit(EXIT_FAILURE); } + spapr_numa_define_associativity_domains(spapr); } void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, -- cgit v1.1