From bb89646c75e46db0074e263d74770dac6f0a29fc Mon Sep 17 00:00:00 2001 From: Luis Pires Date: Fri, 10 Sep 2021 08:26:06 -0300 Subject: i386/kvm: Replace abs64() with uabs64() from host-utils Drop abs64() and use uabs64() from host-utils, which avoids an undefined behavior when taking abs of the most negative value. Signed-off-by: Luis Pires Reviewed-by: Richard Henderson Reviewed-by: Eduardo Habkost Message-Id: <20210910112624.72748-5-luis.pires@eldorado.org.br> Signed-off-by: David Gibson --- hw/i386/kvm/i8254.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'hw') diff --git a/hw/i386/kvm/i8254.c b/hw/i386/kvm/i8254.c index fa68669..191a26f 100644 --- a/hw/i386/kvm/i8254.c +++ b/hw/i386/kvm/i8254.c @@ -59,11 +59,6 @@ struct KVMPITClass { DeviceRealize parent_realize; }; -static int64_t abs64(int64_t v) -{ - return v < 0 ? -v : v; -} - static void kvm_pit_update_clock_offset(KVMPITState *s) { int64_t offset, clock_offset; @@ -81,7 +76,7 @@ static void kvm_pit_update_clock_offset(KVMPITState *s) clock_gettime(CLOCK_MONOTONIC, &ts); offset -= ts.tv_nsec; offset -= (int64_t)ts.tv_sec * 1000000000; - if (abs64(offset) < abs64(clock_offset)) { + if (uabs64(offset) < uabs64(clock_offset)) { clock_offset = offset; } } -- cgit v1.1 From 3ad2111175dddb4e396c25ce242a5c7763d42631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 1 Sep 2021 11:41:47 +0200 Subject: ppc/spapr: Add a POWER10 DD2 CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Message-Id: <20210901094153.227671-3-clg@kaod.org> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr_cpu_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'hw') diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index 4f316a6..58e7341 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -382,6 +382,7 @@ static const TypeInfo spapr_cpu_core_type_infos[] = { DEFINE_SPAPR_CPU_CORE_TYPE("power9_v1.0"), DEFINE_SPAPR_CPU_CORE_TYPE("power9_v2.0"), DEFINE_SPAPR_CPU_CORE_TYPE("power10_v1.0"), + DEFINE_SPAPR_CPU_CORE_TYPE("power10_v2.0"), #ifdef CONFIG_KVM DEFINE_SPAPR_CPU_CORE_TYPE("host"), #endif -- cgit v1.1 From 40ef88ba77ac84baab3a78171a49829a6cd1d3e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 1 Sep 2021 11:41:48 +0200 Subject: ppc/pnv: Add a comment on the "primary-topology-index" property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On P10, the chip id is calculated from the "Primary topology table index". See skiboot commits for more information [1]. This information is extracted from the hdata on real systems which QEMU needs to emulate. Add this property for all machines even if it is only used on POWER10. [1] https://github.com/open-power/skiboot/commit/2ce3f083f399 https://github.com/open-power/skiboot/commit/a2d4d7f9e14a Signed-off-by: Cédric Le Goater Message-Id: <20210901094153.227671-4-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/pnv_xscom.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'hw') diff --git a/hw/ppc/pnv_xscom.c b/hw/ppc/pnv_xscom.c index faa488e..9ce018d 100644 --- a/hw/ppc/pnv_xscom.c +++ b/hw/ppc/pnv_xscom.c @@ -284,6 +284,10 @@ int pnv_dt_xscom(PnvChip *chip, void *fdt, int root_offset, _FDT(xscom_offset); g_free(name); _FDT((fdt_setprop_cell(fdt, xscom_offset, "ibm,chip-id", chip->chip_id))); + /* + * On P10, the xscom bus id has been deprecated and the chip id is + * calculated from the "Primary topology table index". See skiboot. + */ _FDT((fdt_setprop_cell(fdt, xscom_offset, "ibm,primary-topology-index", chip->chip_id))); _FDT((fdt_setprop_cell(fdt, xscom_offset, "#address-cells", 1))); -- cgit v1.1 From 0e5e9ff455dc529ee9e3c485d4543cdcaa04ab8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 1 Sep 2021 11:41:49 +0200 Subject: ppc/pnv: Remove useless variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Message-Id: <20210901094153.227671-5-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/pnv.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'hw') diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 2f5358b..a62e90b 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -838,8 +838,7 @@ static void pnv_init(MachineState *machine) for (i = 0; i < pnv->num_chips; i++) { char chip_name[32]; Object *chip = OBJECT(qdev_new(chip_typename)); - int chip_id = i; - uint64_t chip_ram_size = pnv_chip_get_ram_size(pnv, chip_id); + uint64_t chip_ram_size = pnv_chip_get_ram_size(pnv, i); pnv->chips[i] = PNV_CHIP(chip); @@ -850,9 +849,9 @@ static void pnv_init(MachineState *machine) &error_fatal); chip_ram_start += chip_ram_size; - snprintf(chip_name, sizeof(chip_name), "chip[%d]", chip_id); + snprintf(chip_name, sizeof(chip_name), "chip[%d]", i); object_property_add_child(OBJECT(pnv), chip_name, chip); - object_property_set_int(chip, "chip-id", chip_id, &error_fatal); + object_property_set_int(chip, "chip-id", i, &error_fatal); object_property_set_int(chip, "nr-cores", machine->smp.cores, &error_fatal); object_property_set_int(chip, "nr-threads", machine->smp.threads, -- cgit v1.1 From 89d2468d964e635eb80d4d00c29074a28d0e6d19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 1 Sep 2021 11:41:51 +0200 Subject: ppc/xive: Export priority_to_ipb() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Message-Id: <20210901094153.227671-7-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/xive.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'hw') diff --git a/hw/intc/xive.c b/hw/intc/xive.c index b817ee8..b0c4f76 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -28,17 +28,6 @@ */ /* - * Convert a priority number to an Interrupt Pending Buffer (IPB) - * register, which indicates a pending interrupt at the priority - * corresponding to the bit number - */ -static uint8_t priority_to_ipb(uint8_t priority) -{ - return priority > XIVE_PRIORITY_MAX ? - 0 : 1 << (XIVE_PRIORITY_MAX - priority); -} - -/* * Convert an Interrupt Pending Buffer (IPB) register to a Pending * Interrupt Priority Register (PIPR), which contains the priority of * the most favored pending notification. @@ -89,7 +78,7 @@ static uint64_t xive_tctx_accept(XiveTCTX *tctx, uint8_t ring) regs[TM_CPPR] = cppr; /* Reset the pending buffer bit */ - regs[TM_IPB] &= ~priority_to_ipb(cppr); + regs[TM_IPB] &= ~xive_priority_to_ipb(cppr); regs[TM_PIPR] = ipb_to_pipr(regs[TM_IPB]); /* Drop Exception bit */ @@ -353,7 +342,7 @@ static void xive_tm_set_os_cppr(XivePresenter *xptr, XiveTCTX *tctx, static void xive_tm_set_os_pending(XivePresenter *xptr, XiveTCTX *tctx, hwaddr offset, uint64_t value, unsigned size) { - xive_tctx_ipb_update(tctx, TM_QW1_OS, priority_to_ipb(value & 0xff)); + xive_tctx_ipb_update(tctx, TM_QW1_OS, xive_priority_to_ipb(value & 0xff)); } static void xive_os_cam_decode(uint32_t cam, uint8_t *nvt_blk, @@ -1535,7 +1524,8 @@ bool xive_presenter_notify(XiveFabric *xfb, uint8_t format, /* handle CPU exception delivery */ if (count) { trace_xive_presenter_notify(nvt_blk, nvt_idx, match.ring); - xive_tctx_ipb_update(match.tctx, match.ring, priority_to_ipb(priority)); + xive_tctx_ipb_update(match.tctx, match.ring, + xive_priority_to_ipb(priority)); } return !!count; @@ -1682,7 +1672,8 @@ static void xive_router_end_notify(XiveRouter *xrtr, uint8_t end_blk, * use. The presenter will resend the interrupt when the vCPU * is dispatched again on a HW thread. */ - ipb = xive_get_field32(NVT_W4_IPB, nvt.w4) | priority_to_ipb(priority); + ipb = xive_get_field32(NVT_W4_IPB, nvt.w4) | + xive_priority_to_ipb(priority); nvt.w4 = xive_set_field32(NVT_W4_IPB, nvt.w4, ipb); xive_router_write_nvt(xrtr, nvt_blk, nvt_idx, &nvt, 4); -- cgit v1.1 From daf115cf9a42bb42eabaf6177e59084c22cc7b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 1 Sep 2021 11:41:52 +0200 Subject: ppc/xive: Export xive_tctx_word2() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Message-Id: <20210901094153.227671-8-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/xive.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'hw') diff --git a/hw/intc/xive.c b/hw/intc/xive.c index b0c4f76..6c82326 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -141,11 +141,6 @@ void xive_tctx_ipb_update(XiveTCTX *tctx, uint8_t ring, uint8_t ipb) xive_tctx_notify(tctx, ring); } -static inline uint32_t xive_tctx_word2(uint8_t *ring) -{ - return *((uint32_t *) &ring[TM_WORD2]); -} - /* * XIVE Thread Interrupt Management Area (TIMA) */ -- cgit v1.1 From 92612f1550fb11c2bda09821bad11c6349c283cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 1 Sep 2021 11:41:53 +0200 Subject: ppc/pnv: Rename "id" to "quad-id" in PnvQuad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This to avoid possible conflicts with the "id" property of QOM objects. Signed-off-by: Cédric Le Goater Message-Id: <20210901094153.227671-9-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/pnv.c | 4 ++-- hw/ppc/pnv_core.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'hw') diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index a62e90b..03c8650 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -1368,10 +1368,10 @@ static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error **errp) sizeof(*eq), TYPE_PNV_QUAD, &error_fatal, NULL); - object_property_set_int(OBJECT(eq), "id", core_id, &error_fatal); + object_property_set_int(OBJECT(eq), "quad-id", core_id, &error_fatal); qdev_realize(DEVICE(eq), NULL, &error_fatal); - pnv_xscom_add_subregion(chip, PNV9_XSCOM_EQ_BASE(eq->id), + pnv_xscom_add_subregion(chip, PNV9_XSCOM_EQ_BASE(eq->quad_id), &eq->xscom_regs); } } diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c index 4de8414..19e8eb8 100644 --- a/hw/ppc/pnv_core.c +++ b/hw/ppc/pnv_core.c @@ -407,13 +407,13 @@ static void pnv_quad_realize(DeviceState *dev, Error **errp) PnvQuad *eq = PNV_QUAD(dev); char name[32]; - snprintf(name, sizeof(name), "xscom-quad.%d", eq->id); + snprintf(name, sizeof(name), "xscom-quad.%d", eq->quad_id); pnv_xscom_region_init(&eq->xscom_regs, OBJECT(dev), &pnv_quad_xscom_ops, eq, name, PNV9_XSCOM_EQ_SIZE); } static Property pnv_quad_properties[] = { - DEFINE_PROP_UINT32("id", PnvQuad, id, 0), + DEFINE_PROP_UINT32("quad-id", PnvQuad, quad_id, 0), DEFINE_PROP_END_OF_LIST(), }; -- cgit v1.1 From f640afec1a14e99d561f7da93e3fc8a5e1206384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 2 Sep 2021 15:09:10 +0200 Subject: ppc/pnv: Add an assert when calculating the RAM distribution on chips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Message-Id: <20210902130928.528803-3-clg@kaod.org> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/pnv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'hw') diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 03c8650..71e4551 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -723,6 +723,8 @@ static uint64_t pnv_chip_get_ram_size(PnvMachineState *pnv, int chip_id) return QEMU_ALIGN_DOWN(ram_per_chip, 1 * MiB); } + assert(pnv->num_chips > 1); + ram_per_chip = (machine->ram_size - 1 * GiB) / (pnv->num_chips - 1); return chip_id == 0 ? 1 * GiB : QEMU_ALIGN_DOWN(ram_per_chip, 1 * MiB); } -- cgit v1.1 From 99b2c0622513fc98e8ed9dac56cafb6d29240e87 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 6 Sep 2021 21:47:49 -0300 Subject: memory_hotplug.c: handle dev->id = NULL in acpi_memory_hotplug_write() qapi_event_send_mem_unplug_error() deals with @device being NULL by replacing it with an empty string ("") when emitting the event. Aside from the fact that this behavior (qapi visitor mapping NULL pointer to "") can be patched/changed someday, there's also the lack of utility that the event brings to listeners, e.g. "a memory unplug error happened somewhere". In theory we should just avoit emitting this event at all if dev->id is NULL, but this would be an incompatible change to existing guests. Instead, let's make the forementioned behavior explicit: if dev->id is NULL, pass an empty string to qapi_event_send_mem_unplug_error(). Suggested-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Greg Kurz Reviewed-by: David Gibson Reviewed-by: Markus Armbruster Signed-off-by: Daniel Henrique Barboza Message-Id: <20210907004755.424931-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/acpi/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index af37889..6a71de4 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -178,7 +178,7 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, hotplug_handler_unplug(hotplug_ctrl, dev, &local_err); if (local_err) { trace_mhp_acpi_pc_dimm_delete_failed(mem_st->selector); - qapi_event_send_mem_unplug_error(dev->id, + qapi_event_send_mem_unplug_error(dev->id ? : "", error_get_pretty(local_err)); error_free(local_err); break; -- cgit v1.1 From 44d886abab268043157b08b77147cc29bff22090 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 6 Sep 2021 21:47:50 -0300 Subject: spapr.c: handle dev->id in spapr_memory_unplug_rollback() As done in hw/acpi/memory_hotplug.c, pass an empty string if dev->id is NULL to qapi_event_send_mem_unplug_error() to avoid relying on a behavior that can be changed in the future. Suggested-by: Markus Armbruster Reviewed-by: Greg Kurz Reviewed-by: David Gibson Reviewed-by: Markus Armbruster Signed-off-by: Daniel Henrique Barboza Message-Id: <20210907004755.424931-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index d39fd4e..ac11c8a 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3690,7 +3690,7 @@ void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev) */ qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest " "for device %s", dev->id); - qapi_event_send_mem_unplug_error(dev->id, qapi_error); + qapi_event_send_mem_unplug_error(dev->id ? : "", qapi_error); } /* Callback to be called during DRC release. */ -- cgit v1.1 From 91bd95ce162cd34ba29e9f6470ecdcaaaf1d768d Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 6 Sep 2021 21:47:51 -0300 Subject: spapr_drc.c: do not error_report() when drc->dev->id == NULL The error_report() call in drc_unisolate_logical() is not considering that drc->dev->id can be NULL, and the underlying functions error_report() calls to do its job (vprintf(), g_strdup_printf() ...) has undefined behavior when trying to handle "%s" with NULL arguments. Besides, there is no utility into reporting that an unknown device was rejected by the guest. Acked-by: David Gibson Reviewed-by: Greg Kurz Reviewed-by: Markus Armbruster Signed-off-by: Daniel Henrique Barboza Message-Id: <20210907004755.424931-4-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_drc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index a2f2634..a4d9496 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -167,8 +167,11 @@ static uint32_t drc_unisolate_logical(SpaprDrc *drc) } drc->unplug_requested = false; - error_report("Device hotunplug rejected by the guest " - "for device %s", drc->dev->id); + + if (drc->dev->id) { + error_report("Device hotunplug rejected by the guest " + "for device %s", drc->dev->id); + } /* * TODO: send a QAPI DEVICE_UNPLUG_ERROR event when -- cgit v1.1 From 4b08cd567b54e26a0608463311418197dda83e37 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 6 Sep 2021 21:47:54 -0300 Subject: spapr: use DEVICE_UNPLUG_GUEST_ERROR to report unplug errors Linux Kernel 5.12 is now unisolating CPU DRCs in the device_removal error path, signalling that the hotunplug process wasn't successful. This allow us to send a DEVICE_UNPLUG_GUEST_ERROR in drc_unisolate_logical() to signal this error to the management layer. We also have another error path in spapr_memory_unplug_rollback() for configured LMB DRCs. Kernels older than 5.13 will not unisolate the LMBs in the hotunplug error path, but it will reconfigure them. Let's send the DEVICE_UNPLUG_GUEST_ERROR event in that code path as well to cover the case of older kernels. Acked-by: David Gibson Reviewed-by: Greg Kurz Reviewed-by: Markus Armbruster Signed-off-by: Daniel Henrique Barboza Message-Id: <20210907004755.424931-7-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 10 +++++++++- hw/ppc/spapr_drc.c | 9 +++++---- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index ac11c8a..2701069 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -29,6 +29,7 @@ #include "qemu/datadir.h" #include "qapi/error.h" #include "qapi/qapi-events-machine.h" +#include "qapi/qapi-events-qdev.h" #include "qapi/visitor.h" #include "sysemu/sysemu.h" #include "sysemu/hostmem.h" @@ -3686,11 +3687,18 @@ void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev) /* * Tell QAPI that something happened and the memory - * hotunplug wasn't successful. + * hotunplug wasn't successful. Keep sending + * MEM_UNPLUG_ERROR even while sending + * DEVICE_UNPLUG_GUEST_ERROR until the deprecation of + * MEM_UNPLUG_ERROR is due. */ qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest " "for device %s", dev->id); + qapi_event_send_mem_unplug_error(dev->id ? : "", qapi_error); + + qapi_event_send_device_unplug_guest_error(!!dev->id, dev->id, + dev->canonical_path); } /* Callback to be called during DRC release. */ diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index a4d9496..f8ac0a1 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -17,6 +17,8 @@ #include "hw/ppc/spapr_drc.h" #include "qom/object.h" #include "migration/vmstate.h" +#include "qapi/error.h" +#include "qapi/qapi-events-qdev.h" #include "qapi/visitor.h" #include "qemu/error-report.h" #include "hw/ppc/spapr.h" /* for RTAS return codes */ @@ -173,10 +175,9 @@ static uint32_t drc_unisolate_logical(SpaprDrc *drc) "for device %s", drc->dev->id); } - /* - * TODO: send a QAPI DEVICE_UNPLUG_ERROR event when - * it is implemented. - */ + qapi_event_send_device_unplug_guest_error(!!drc->dev->id, + drc->dev->id, + drc->dev->canonical_path); } return RTAS_OUT_SUCCESS; /* Nothing to do */ -- cgit v1.1 From 46f2c282c32ed0e0c9366e018b98fd58a859682e Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 6 Sep 2021 21:47:55 -0300 Subject: memory_hotplug.c: send DEVICE_UNPLUG_GUEST_ERROR in acpi_memory_hotplug_write() MEM_UNPLUG_ERROR is deprecated since the introduction of DEVICE_UNPLUG_GUEST_ERROR. Keep emitting both while the deprecation of MEM_UNPLUG_ERROR is pending. CC: Michael S. Tsirkin CC: Igor Mammedov Acked-by: Michael S. Tsirkin Reviewed-by: Greg Kurz Reviewed-by: David Gibson Reviewed-by: Igor Mammedov Reviewed-by: Markus Armbruster Signed-off-by: Daniel Henrique Barboza Message-Id: <20210907004755.424931-8-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/acpi/memory_hotplug.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'hw') diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index 6a71de4..d0fffcf 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -8,6 +8,7 @@ #include "qapi/error.h" #include "qapi/qapi-events-acpi.h" #include "qapi/qapi-events-machine.h" +#include "qapi/qapi-events-qdev.h" #define MEMORY_SLOTS_NUMBER "MDNR" #define MEMORY_HOTPLUG_IO_REGION "HPMR" @@ -178,8 +179,16 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, hotplug_handler_unplug(hotplug_ctrl, dev, &local_err); if (local_err) { trace_mhp_acpi_pc_dimm_delete_failed(mem_st->selector); + + /* + * Send both MEM_UNPLUG_ERROR and DEVICE_UNPLUG_GUEST_ERROR + * while the deprecation of MEM_UNPLUG_ERROR is + * pending. + */ qapi_event_send_mem_unplug_error(dev->id ? : "", error_get_pretty(local_err)); + qapi_event_send_device_unplug_guest_error(!!dev->id, dev->id, + dev->canonical_path); error_free(local_err); break; } -- cgit v1.1 From 7279810b67b6ea16f9b6060693a286dc6deb3036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 20 Sep 2021 08:12:01 +0200 Subject: target/ppc: Replace debug messages by asserts for unknown IRQ pins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an unknown pin of the IRQ controller is raised, something is very wrong in the QEMU model. It is better to abort. Signed-off-by: Cédric Le Goater Message-Id: <20210920061203.989563-3-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/ppc.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'hw') diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c index 7375bf4..a327206 100644 --- a/hw/ppc/ppc.c +++ b/hw/ppc/ppc.c @@ -165,9 +165,7 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) ppc_set_irq(cpu, PPC_INTERRUPT_RESET, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; @@ -252,9 +250,7 @@ static void ppc970_set_irq(void *opaque, int pin, int level) /* XXX: TODO */ break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; @@ -287,9 +283,7 @@ static void power7_set_irq(void *opaque, int pin, int level) ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } } @@ -323,9 +317,7 @@ static void power9_set_irq(void *opaque, int pin, int level) ppc_set_irq(cpu, PPC_INTERRUPT_HVIRT, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } } @@ -459,9 +451,7 @@ static void ppc40x_set_irq(void *opaque, int pin, int level) ppc_set_irq(cpu, PPC_INTERRUPT_DEBUG, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; @@ -523,9 +513,7 @@ static void ppce500_set_irq(void *opaque, int pin, int level) ppc_set_irq(cpu, PPC_INTERRUPT_DEBUG, level); break; default: - /* Unknown pin - do nothing */ - LOG_IRQ("%s: unknown IRQ pin %d\n", __func__, pin); - return; + g_assert_not_reached(); } if (level) env->irq_input_state |= 1 << pin; -- cgit v1.1 From d98dbe2a2bec249847e789d0cb42eb5e7e93343d Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 20 Sep 2021 14:49:41 -0300 Subject: spapr_numa.c: split FORM1 code into helpers The upcoming FORM2 NUMA affinity will support asymmetric NUMA topologies and doesn't need be concerned with all the legacy support for older pseries FORM1 guests. We're also not going to calculate associativity domains based on numa distance (via spapr_numa_define_associativity_domains) since the distances will be written directly into new DT properties. Let's split FORM1 code into its own functions to allow for easier insertion of FORM2 logic later on. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20210920174947.556324-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 779f18b..786def7 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -92,7 +92,7 @@ static uint8_t spapr_numa_get_numa_level(uint8_t distance) return 0; } -static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr) +static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) { MachineState *ms = MACHINE(spapr); NodeInfo *numa_info = ms->numa_state->nodes; @@ -155,8 +155,11 @@ static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr) } -void spapr_numa_associativity_init(SpaprMachineState *spapr, - MachineState *machine) +/* + * Set NUMA machine state data based on FORM1 affinity semantics. + */ +static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, + MachineState *machine) { SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; @@ -225,7 +228,13 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, exit(EXIT_FAILURE); } - spapr_numa_define_associativity_domains(spapr); + spapr_numa_define_FORM1_domains(spapr); +} + +void spapr_numa_associativity_init(SpaprMachineState *spapr, + MachineState *machine) +{ + spapr_numa_FORM1_affinity_init(spapr, machine); } void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, @@ -302,12 +311,8 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, return ret; } -/* - * Helper that writes ibm,associativity-reference-points and - * max-associativity-domains in the RTAS pointed by @rtas - * in the DT @fdt. - */ -void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) +static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr, + void *fdt, int rtas) { MachineState *ms = MACHINE(spapr); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); @@ -365,6 +370,16 @@ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) maxdomains, sizeof(maxdomains))); } +/* + * Helper that writes ibm,associativity-reference-points and + * max-associativity-domains in the RTAS pointed by @rtas + * in the DT @fdt. + */ +void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) +{ + spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas); +} + static target_ulong h_home_node_associativity(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, -- cgit v1.1 From afa3b3c9ee8ae2c7c25c93f2d6eebe09e962cd3a Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 20 Sep 2021 14:49:42 -0300 Subject: spapr_numa.c: scrap 'legacy_numa' concept When first introduced, 'legacy_numa' was a way to refer to guests that either wouldn't be affected by associativity domain calculations, namely the ones with only 1 NUMA node, and pre 5.2 guests that shouldn't be affected by it because it would be an userspace change. Calling these cases 'legacy_numa' was a convenient way to label these cases. We're about to introduce a new NUMA affinity, FORM2, and this concept of 'legacy_numa' is now a bit misleading because, although it is called 'legacy' it is in fact a FORM1 exclusive contraint. This patch removes spapr_machine_using_legacy_numa() and open code the conditions in each caller. While we're at it, move the chunk inside spapr_numa_FORM1_affinity_init() that sets all numa_assoc_array domains with 'node_id' to spapr_numa_define_FORM1_domains(). This chunk was being executed if !pre_5_2_numa_associativity and num_nodes => 1, the same conditions in which spapr_numa_define_FORM1_domains() is called shortly after. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20210920174947.556324-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 786def7..bf520d4 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -19,15 +19,6 @@ /* Moved from hw/ppc/spapr_pci_nvlink2.c */ #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) -static bool spapr_machine_using_legacy_numa(SpaprMachineState *spapr) -{ - MachineState *machine = MACHINE(spapr); - SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); - - return smc->pre_5_2_numa_associativity || - machine->numa_state->num_nodes <= 1; -} - static bool spapr_numa_is_symmetrical(MachineState *ms) { int src, dst; @@ -97,7 +88,18 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) MachineState *ms = MACHINE(spapr); NodeInfo *numa_info = ms->numa_state->nodes; int nb_numa_nodes = ms->numa_state->num_nodes; - int src, dst, i; + int src, dst, i, j; + + /* + * Fill all associativity domains of non-zero NUMA nodes with + * node_id. This is required because the default value (0) is + * considered a match with associativity domains of node 0. + */ + for (i = 1; i < nb_numa_nodes; i++) { + for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + spapr->numa_assoc_array[i][j] = cpu_to_be32(i); + } + } for (src = 0; src < nb_numa_nodes; src++) { for (dst = src; dst < nb_numa_nodes; dst++) { @@ -164,7 +166,6 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; int i, j, max_nodes_with_gpus; - bool using_legacy_numa = spapr_machine_using_legacy_numa(spapr); /* * For all associativity arrays: first position is the size, @@ -178,17 +179,6 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, for (i = 0; i < nb_numa_nodes; i++) { spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); - - /* - * Fill all associativity domains of non-zero NUMA nodes with - * node_id. This is required because the default value (0) is - * considered a match with associativity domains of node 0. - */ - if (!using_legacy_numa && i != 0) { - for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { - spapr->numa_assoc_array[i][j] = cpu_to_be32(i); - } - } } /* @@ -214,11 +204,13 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, } /* - * Legacy NUMA guests (pseries-5.1 and older, or guests with only - * 1 NUMA node) will not benefit from anything we're going to do - * after this point. + * Guests pseries-5.1 and older uses zeroed associativity domains, + * i.e. no domain definition based on NUMA distance input. + * + * Same thing with guests that have only one NUMA node. */ - if (using_legacy_numa) { + if (smc->pre_5_2_numa_associativity || + machine->numa_state->num_nodes <= 1) { return; } @@ -334,7 +326,8 @@ static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr, cpu_to_be32(maxdomain) }; - if (spapr_machine_using_legacy_numa(spapr)) { + if (smc->pre_5_2_numa_associativity || + ms->numa_state->num_nodes <= 1) { uint32_t legacy_refpoints[] = { cpu_to_be32(0x4), cpu_to_be32(0x4), -- cgit v1.1 From 3a6e4ce684e48988f9736aecc6b365609e83f8d1 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 20 Sep 2021 14:49:43 -0300 Subject: spapr_numa.c: parametrize FORM1 macros The next preliminary step to introduce NUMA FORM2 affinity is to make the existing code independent of FORM1 macros and values, i.e. MAX_DISTANCE_REF_POINTS, NUMA_ASSOC_SIZE and VCPU_ASSOC_SIZE. This patch accomplishes that by doing the following: - move the NUMA related macros from spapr.h to spapr_numa.c where they are used. spapr.h gets instead a 'NUMA_NODES_MAX_NUM' macro that is used to refer to the maximum number of NUMA nodes, including GPU nodes, that the machine can support; - MAX_DISTANCE_REF_POINTS and NUMA_ASSOC_SIZE are renamed to FORM1_DIST_REF_POINTS and FORM1_NUMA_ASSOC_SIZE. These FORM1 specific macros are used in FORM1 init functions; - code that uses MAX_DISTANCE_REF_POINTS now retrieves the max_dist_ref_points value using get_max_dist_ref_points(). NUMA_ASSOC_SIZE is replaced by get_numa_assoc_size() and VCPU_ASSOC_SIZE is replaced by get_vcpu_assoc_size(). These functions are used by the generic device tree functions and h_home_node_associativity() and will allow them to switch between FORM1 and FORM2 without changing their core logic. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20210920174947.556324-4-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 74 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 21 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index bf520d4..08e2d6a 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -19,6 +19,33 @@ /* Moved from hw/ppc/spapr_pci_nvlink2.c */ #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) +/* + * Retrieves max_dist_ref_points of the current NUMA affinity. + */ +static int get_max_dist_ref_points(SpaprMachineState *spapr) +{ + return FORM1_DIST_REF_POINTS; +} + +/* + * Retrieves numa_assoc_size of the current NUMA affinity. + */ +static int get_numa_assoc_size(SpaprMachineState *spapr) +{ + return FORM1_NUMA_ASSOC_SIZE; +} + +/* + * Retrieves vcpu_assoc_size of the current NUMA affinity. + * + * vcpu_assoc_size is the size of ibm,associativity array + * for CPUs, which has an extra element (vcpu_id) in the end. + */ +static int get_vcpu_assoc_size(SpaprMachineState *spapr) +{ + return get_numa_assoc_size(spapr) + 1; +} + static bool spapr_numa_is_symmetrical(MachineState *ms) { int src, dst; @@ -96,7 +123,7 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) * considered a match with associativity domains of node 0. */ for (i = 1; i < nb_numa_nodes; i++) { - for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { spapr->numa_assoc_array[i][j] = cpu_to_be32(i); } } @@ -134,7 +161,7 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) * * The Linux kernel will assume that the distance between src and * dst, in this case of no match, is 10 (local distance) doubled - * for each NUMA it didn't match. We have MAX_DISTANCE_REF_POINTS + * for each NUMA it didn't match. We have FORM1_DIST_REF_POINTS * levels (4), so this gives us 10*2*2*2*2 = 160. * * This logic can be seen in the Linux kernel source code, as of @@ -169,7 +196,7 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, /* * For all associativity arrays: first position is the size, - * position MAX_DISTANCE_REF_POINTS is always the numa_id, + * position FORM1_DIST_REF_POINTS is always the numa_id, * represented by the index 'i'. * * This will break on sparse NUMA setups, when/if QEMU starts @@ -177,8 +204,8 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, * 'i' will be a valid node_id set by the user. */ for (i = 0; i < nb_numa_nodes; i++) { - spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); - spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + spapr->numa_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); + spapr->numa_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); } /* @@ -192,15 +219,15 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM; for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) { - spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + spapr->numa_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); - for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ? SPAPR_GPU_NUMA_ID : cpu_to_be32(i); spapr->numa_assoc_array[i][j] = gpu_assoc; } - spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + spapr->numa_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); } /* @@ -234,13 +261,15 @@ void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, { _FDT((fdt_setprop(fdt, offset, "ibm,associativity", spapr->numa_assoc_array[nodeid], - sizeof(spapr->numa_assoc_array[nodeid])))); + get_numa_assoc_size(spapr) * sizeof(uint32_t)))); } static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, PowerPCCPU *cpu) { - uint32_t *vcpu_assoc = g_new(uint32_t, VCPU_ASSOC_SIZE); + int max_distance_ref_points = get_max_dist_ref_points(spapr); + int vcpu_assoc_size = get_vcpu_assoc_size(spapr); + uint32_t *vcpu_assoc = g_new(uint32_t, vcpu_assoc_size); int index = spapr_get_vcpu_id(cpu); /* @@ -249,10 +278,10 @@ static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, * 0, put cpu_id last, then copy the remaining associativity * domains. */ - vcpu_assoc[0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS + 1); - vcpu_assoc[VCPU_ASSOC_SIZE - 1] = cpu_to_be32(index); + vcpu_assoc[0] = cpu_to_be32(max_distance_ref_points + 1); + vcpu_assoc[vcpu_assoc_size - 1] = cpu_to_be32(index); memcpy(vcpu_assoc + 1, spapr->numa_assoc_array[cpu->node_id] + 1, - (VCPU_ASSOC_SIZE - 2) * sizeof(uint32_t)); + (vcpu_assoc_size - 2) * sizeof(uint32_t)); return vcpu_assoc; } @@ -261,12 +290,13 @@ int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, int offset, PowerPCCPU *cpu) { g_autofree uint32_t *vcpu_assoc = NULL; + int vcpu_assoc_size = get_vcpu_assoc_size(spapr); vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, cpu); /* Advertise NUMA via ibm,associativity */ return fdt_setprop(fdt, offset, "ibm,associativity", vcpu_assoc, - VCPU_ASSOC_SIZE * sizeof(uint32_t)); + vcpu_assoc_size * sizeof(uint32_t)); } @@ -274,17 +304,18 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, int offset) { MachineState *machine = MACHINE(spapr); + int max_distance_ref_points = get_max_dist_ref_points(spapr); int nb_numa_nodes = machine->numa_state->num_nodes; int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1; uint32_t *int_buf, *cur_index, buf_len; int ret, i; /* ibm,associativity-lookup-arrays */ - buf_len = (nr_nodes * MAX_DISTANCE_REF_POINTS + 2) * sizeof(uint32_t); + buf_len = (nr_nodes * max_distance_ref_points + 2) * sizeof(uint32_t); cur_index = int_buf = g_malloc0(buf_len); int_buf[0] = cpu_to_be32(nr_nodes); /* Number of entries per associativity list */ - int_buf[1] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + int_buf[1] = cpu_to_be32(max_distance_ref_points); cur_index += 2; for (i = 0; i < nr_nodes; i++) { /* @@ -293,8 +324,8 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, */ uint32_t *associativity = spapr->numa_assoc_array[i]; memcpy(cur_index, ++associativity, - sizeof(uint32_t) * MAX_DISTANCE_REF_POINTS); - cur_index += MAX_DISTANCE_REF_POINTS; + sizeof(uint32_t) * max_distance_ref_points); + cur_index += max_distance_ref_points; } ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf, (cur_index - int_buf) * sizeof(uint32_t)); @@ -383,6 +414,7 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, target_ulong procno = args[1]; PowerPCCPU *tcpu; int idx, assoc_idx; + int vcpu_assoc_size = get_vcpu_assoc_size(spapr); /* only support procno from H_REGISTER_VPA */ if (flags != 0x1) { @@ -401,7 +433,7 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, * 12 associativity domains for vcpus. Assert and bail if that's * not the case. */ - G_STATIC_ASSERT((VCPU_ASSOC_SIZE - 1) <= 12); + g_assert((vcpu_assoc_size - 1) <= 12); vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, tcpu); /* assoc_idx starts at 1 to skip associativity size */ @@ -422,9 +454,9 @@ static target_ulong h_home_node_associativity(PowerPCCPU *cpu, * macro. The ternary will fill the remaining registers with -1 * after we went through vcpu_assoc[]. */ - a = assoc_idx < VCPU_ASSOC_SIZE ? + a = assoc_idx < vcpu_assoc_size ? be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; - b = assoc_idx < VCPU_ASSOC_SIZE ? + b = assoc_idx < vcpu_assoc_size ? be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; args[idx] = ASSOCIATIVITY(a, b); -- cgit v1.1 From a165ac67c3ff8dc7065aa827de36da3785ec83fd Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 20 Sep 2021 14:49:44 -0300 Subject: spapr_numa.c: rename numa_assoc_array to FORM1_assoc_array Introducing a new NUMA affinity, FORM2, requires a new mechanism to switch between affinity modes after CAS. Also, we want FORM2 data structures and functions to be completely separated from the existing FORM1 code, allowing us to avoid adding new code that inherits the existing complexity of FORM1. The idea of switching values used by the write_dt() functions in spapr_numa.c was already introduced in the previous patch, and the same approach will be used when dealing with the FORM1 and FORM2 arrays. We can accomplish that by that by renaming the existing numa_assoc_array to FORM1_assoc_array, which now is used exclusively to handle FORM1 affinity data. A new helper get_associativity() is then introduced to be used by the write_dt() functions to retrieve the current ibm,associativity array of a given node, after considering affinity selection that might have been done during CAS. All code that was using numa_assoc_array now needs to retrieve the array by calling this function. This will allow for an easier plug of FORM2 data later on. Signed-off-by: Daniel Henrique Barboza Message-Id: <20210920174947.556324-5-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 1 + hw/ppc/spapr_numa.c | 38 +++++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 0e9a5b2..9056644 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -17,6 +17,7 @@ #include "kvm_ppc.h" #include "hw/ppc/fdt.h" #include "hw/ppc/spapr_ovec.h" +#include "hw/ppc/spapr_numa.h" #include "mmu-book3s-v3.h" #include "hw/mem/memory-device.h" diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 08e2d6a..dce9ce9 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -46,6 +46,15 @@ static int get_vcpu_assoc_size(SpaprMachineState *spapr) return get_numa_assoc_size(spapr) + 1; } +/* + * Retrieves the ibm,associativity array of NUMA node 'node_id' + * for the current NUMA affinity. + */ +static const uint32_t *get_associativity(SpaprMachineState *spapr, int node_id) +{ + return spapr->FORM1_assoc_array[node_id]; +} + static bool spapr_numa_is_symmetrical(MachineState *ms) { int src, dst; @@ -124,7 +133,7 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) */ for (i = 1; i < nb_numa_nodes; i++) { for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { - spapr->numa_assoc_array[i][j] = cpu_to_be32(i); + spapr->FORM1_assoc_array[i][j] = cpu_to_be32(i); } } @@ -176,8 +185,8 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) * and going up to 0x1. */ for (i = n_level; i > 0; i--) { - assoc_src = spapr->numa_assoc_array[src][i]; - spapr->numa_assoc_array[dst][i] = assoc_src; + assoc_src = spapr->FORM1_assoc_array[src][i]; + spapr->FORM1_assoc_array[dst][i] = assoc_src; } } } @@ -204,8 +213,8 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, * 'i' will be a valid node_id set by the user. */ for (i = 0; i < nb_numa_nodes; i++) { - spapr->numa_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); - spapr->numa_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); + spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); + spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); } /* @@ -219,15 +228,15 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM; for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) { - spapr->numa_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); + spapr->FORM1_assoc_array[i][0] = cpu_to_be32(FORM1_DIST_REF_POINTS); for (j = 1; j < FORM1_DIST_REF_POINTS; j++) { uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ? SPAPR_GPU_NUMA_ID : cpu_to_be32(i); - spapr->numa_assoc_array[i][j] = gpu_assoc; + spapr->FORM1_assoc_array[i][j] = gpu_assoc; } - spapr->numa_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); + spapr->FORM1_assoc_array[i][FORM1_DIST_REF_POINTS] = cpu_to_be32(i); } /* @@ -259,14 +268,17 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, int offset, int nodeid) { + const uint32_t *associativity = get_associativity(spapr, nodeid); + _FDT((fdt_setprop(fdt, offset, "ibm,associativity", - spapr->numa_assoc_array[nodeid], + associativity, get_numa_assoc_size(spapr) * sizeof(uint32_t)))); } static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, PowerPCCPU *cpu) { + const uint32_t *associativity = get_associativity(spapr, cpu->node_id); int max_distance_ref_points = get_max_dist_ref_points(spapr); int vcpu_assoc_size = get_vcpu_assoc_size(spapr); uint32_t *vcpu_assoc = g_new(uint32_t, vcpu_assoc_size); @@ -280,7 +292,7 @@ static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, */ vcpu_assoc[0] = cpu_to_be32(max_distance_ref_points + 1); vcpu_assoc[vcpu_assoc_size - 1] = cpu_to_be32(index); - memcpy(vcpu_assoc + 1, spapr->numa_assoc_array[cpu->node_id] + 1, + memcpy(vcpu_assoc + 1, associativity + 1, (vcpu_assoc_size - 2) * sizeof(uint32_t)); return vcpu_assoc; @@ -319,10 +331,10 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, cur_index += 2; for (i = 0; i < nr_nodes; i++) { /* - * For the lookup-array we use the ibm,associativity array, - * from numa_assoc_array. without the first element (size). + * For the lookup-array we use the ibm,associativity array of the + * current NUMA affinity, without the first element (size). */ - uint32_t *associativity = spapr->numa_assoc_array[i]; + const uint32_t *associativity = get_associativity(spapr, i); memcpy(cur_index, ++associativity, sizeof(uint32_t) * max_distance_ref_points); cur_index += max_distance_ref_points; -- cgit v1.1 From 5dab5abe623d97929382158b43b3fd545e8edd62 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 20 Sep 2021 14:49:45 -0300 Subject: spapr: move FORM1 verifications to post CAS FORM2 NUMA affinity is prepared to deal with empty (memory/cpu less) NUMA nodes. This is used by the DAX KMEM driver to locate a PAPR SCM device that has a different latency than the original NUMA node from the regular memory. FORM2 is also able to deal with asymmetric NUMA distances gracefully, something that our FORM1 implementation doesn't do. Move these FORM1 verifications to a new function and wait until after CAS, when we're sure that we're sticking with FORM1, to enforce them. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20210920174947.556324-6-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 33 -------------------------------- hw/ppc/spapr_hcall.c | 6 ++++++ hw/ppc/spapr_numa.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 39 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 2701069..524951d 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2774,39 +2774,6 @@ static void spapr_machine_init(MachineState *machine) /* init CPUs */ spapr_init_cpus(spapr); - /* - * check we don't have a memory-less/cpu-less NUMA node - * Firmware relies on the existing memory/cpu topology to provide the - * NUMA topology to the kernel. - * And the linux kernel needs to know the NUMA topology at start - * to be able to hotplug CPUs later. - */ - if (machine->numa_state->num_nodes) { - for (i = 0; i < machine->numa_state->num_nodes; ++i) { - /* check for memory-less node */ - if (machine->numa_state->nodes[i].node_mem == 0) { - CPUState *cs; - int found = 0; - /* check for cpu-less node */ - CPU_FOREACH(cs) { - PowerPCCPU *cpu = POWERPC_CPU(cs); - if (cpu->node_id == i) { - found = 1; - break; - } - } - /* memory-less and cpu-less node */ - if (!found) { - error_report( - "Memory-less/cpu-less nodes are not supported (node %d)", - i); - exit(1); - } - } - } - - } - spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine); /* Init numa_assoc_array */ diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 9056644..222c1b6 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1199,6 +1199,12 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, spapr_ovec_cleanup(ov1_guest); /* + * Check for NUMA affinity conditions now that we know which NUMA + * affinity the guest will use. + */ + spapr_numa_associativity_check(spapr); + + /* * Ensure the guest asks for an interrupt mode we support; * otherwise terminate the boot. */ diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index dce9ce9..6718c0f 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -193,6 +193,48 @@ static void spapr_numa_define_FORM1_domains(SpaprMachineState *spapr) } +static void spapr_numa_FORM1_affinity_check(MachineState *machine) +{ + int i; + + /* + * Check we don't have a memory-less/cpu-less NUMA node + * Firmware relies on the existing memory/cpu topology to provide the + * NUMA topology to the kernel. + * And the linux kernel needs to know the NUMA topology at start + * to be able to hotplug CPUs later. + */ + if (machine->numa_state->num_nodes) { + for (i = 0; i < machine->numa_state->num_nodes; ++i) { + /* check for memory-less node */ + if (machine->numa_state->nodes[i].node_mem == 0) { + CPUState *cs; + int found = 0; + /* check for cpu-less node */ + CPU_FOREACH(cs) { + PowerPCCPU *cpu = POWERPC_CPU(cs); + if (cpu->node_id == i) { + found = 1; + break; + } + } + /* memory-less and cpu-less node */ + if (!found) { + error_report( +"Memory-less/cpu-less nodes are not supported with FORM1 NUMA (node %d)", i); + exit(EXIT_FAILURE); + } + } + } + } + + if (!spapr_numa_is_symmetrical(machine)) { + error_report( +"Asymmetrical NUMA topologies aren't supported in the pSeries machine using FORM1 NUMA"); + exit(EXIT_FAILURE); + } +} + /* * Set NUMA machine state data based on FORM1 affinity semantics. */ @@ -250,12 +292,6 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, return; } - if (!spapr_numa_is_symmetrical(machine)) { - error_report("Asymmetrical NUMA topologies aren't supported " - "in the pSeries machine"); - exit(EXIT_FAILURE); - } - spapr_numa_define_FORM1_domains(spapr); } @@ -265,6 +301,11 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr, spapr_numa_FORM1_affinity_init(spapr, machine); } +void spapr_numa_associativity_check(SpaprMachineState *spapr) +{ + spapr_numa_FORM1_affinity_check(MACHINE(spapr)); +} + void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, int offset, int nodeid) { -- cgit v1.1 From e0eb84d4f51f26aff6210407ec658c4b9ecbc68f Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 20 Sep 2021 14:49:46 -0300 Subject: spapr_numa.c: FORM2 NUMA affinity support The main feature of FORM2 affinity support is the separation of NUMA distances from ibm,associativity information. This allows for a more flexible and straightforward NUMA distance assignment without relying on complex associations between several levels of NUMA via ibm,associativity matches. Another feature is its extensibility. This base support contains the facilities for NUMA distance assignment, but in the future more facilities will be added for latency, performance, bandwidth and so on. This patch implements the base FORM2 affinity support as follows: - the use of FORM2 associativity is indicated by using bit 2 of byte 5 of ibm,architecture-vec-5. A FORM2 aware guest can choose to use FORM1 or FORM2 affinity. Setting both forms will default to FORM2. We're not advertising FORM2 for pseries-6.1 and older machine versions to prevent guest visible changes in those; - ibm,associativity-reference-points has a new semantic. Instead of being used to calculate distances via NUMA levels, it's now used to indicate the primary domain index in the ibm,associativity domain of each resource. In our case it's set to {0x4}, matching the position where we already place logical_domain_id; - two new RTAS DT artifacts are introduced: ibm,numa-lookup-index-table and ibm,numa-distance-table. The index table is used to list all the NUMA logical domains of the platform, in ascending order, and allows for spartial NUMA configurations (although QEMU ATM doesn't support that). ibm,numa-distance-table is an array that contains all the distances from the first NUMA node to all other nodes, then the second NUMA node distances to all other nodes and so on; - get_max_dist_ref_points(), get_numa_assoc_size() and get_associativity() now checks for OV5_FORM2_AFFINITY and returns FORM2 values if the guest selected FORM2 affinity during CAS. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20210920174947.556324-7-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 8 +++ hw/ppc/spapr_numa.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) (limited to 'hw') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 524951d..b7bee5f 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2753,6 +2753,11 @@ static void spapr_machine_init(MachineState *machine) spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY); + /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */ + if (!smc->pre_6_2_numa_affinity) { + spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY); + } + /* advertise support for dedicated HP event source to guests */ if (spapr->use_hotplug_event_source) { spapr_ovec_set(spapr->ov5, OV5_HP_EVT); @@ -4675,8 +4680,11 @@ DEFINE_SPAPR_MACHINE(6_2, "6.2", true); */ static void spapr_machine_6_1_class_options(MachineClass *mc) { + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + spapr_machine_6_2_class_options(mc); compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len); + smc->pre_6_2_numa_affinity = true; } DEFINE_SPAPR_MACHINE(6_1, "6.1", false); diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 6718c0f..13db321 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -24,6 +24,10 @@ */ static int get_max_dist_ref_points(SpaprMachineState *spapr) { + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return FORM2_DIST_REF_POINTS; + } + return FORM1_DIST_REF_POINTS; } @@ -32,6 +36,10 @@ static int get_max_dist_ref_points(SpaprMachineState *spapr) */ static int get_numa_assoc_size(SpaprMachineState *spapr) { + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return FORM2_NUMA_ASSOC_SIZE; + } + return FORM1_NUMA_ASSOC_SIZE; } @@ -52,6 +60,9 @@ static int get_vcpu_assoc_size(SpaprMachineState *spapr) */ static const uint32_t *get_associativity(SpaprMachineState *spapr, int node_id) { + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return spapr->FORM2_assoc_array[node_id]; + } return spapr->FORM1_assoc_array[node_id]; } @@ -295,14 +306,50 @@ static void spapr_numa_FORM1_affinity_init(SpaprMachineState *spapr, spapr_numa_define_FORM1_domains(spapr); } +/* + * Init NUMA FORM2 machine state data + */ +static void spapr_numa_FORM2_affinity_init(SpaprMachineState *spapr) +{ + int i; + + /* + * For all resources but CPUs, FORM2 associativity arrays will + * be a size 2 array with the following format: + * + * ibm,associativity = {1, numa_id} + * + * CPUs will write an additional 'vcpu_id' on top of the arrays + * being initialized here. 'numa_id' is represented by the + * index 'i' of the loop. + * + * Given that this initialization is also valid for GPU associativity + * arrays, handle everything in one single step by populating the + * arrays up to NUMA_NODES_MAX_NUM. + */ + for (i = 0; i < NUMA_NODES_MAX_NUM; i++) { + spapr->FORM2_assoc_array[i][0] = cpu_to_be32(1); + spapr->FORM2_assoc_array[i][1] = cpu_to_be32(i); + } +} + void spapr_numa_associativity_init(SpaprMachineState *spapr, MachineState *machine) { spapr_numa_FORM1_affinity_init(spapr, machine); + spapr_numa_FORM2_affinity_init(spapr); } void spapr_numa_associativity_check(SpaprMachineState *spapr) { + /* + * FORM2 does not have any restrictions we need to handle + * at CAS time, for now. + */ + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + return; + } + spapr_numa_FORM1_affinity_check(MACHINE(spapr)); } @@ -447,6 +494,100 @@ static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr, maxdomains, sizeof(maxdomains))); } +static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr, + void *fdt, int rtas) +{ + MachineState *ms = MACHINE(spapr); + NodeInfo *numa_info = ms->numa_state->nodes; + int nb_numa_nodes = ms->numa_state->num_nodes; + int distance_table_entries = nb_numa_nodes * nb_numa_nodes; + g_autofree uint32_t *lookup_index_table = NULL; + g_autofree uint32_t *distance_table = NULL; + int src, dst, i, distance_table_size; + uint8_t *node_distances; + + /* + * ibm,numa-lookup-index-table: array with length and a + * list of NUMA ids present in the guest. + */ + lookup_index_table = g_new0(uint32_t, nb_numa_nodes + 1); + lookup_index_table[0] = cpu_to_be32(nb_numa_nodes); + + for (i = 0; i < nb_numa_nodes; i++) { + lookup_index_table[i + 1] = cpu_to_be32(i); + } + + _FDT(fdt_setprop(fdt, rtas, "ibm,numa-lookup-index-table", + lookup_index_table, + (nb_numa_nodes + 1) * sizeof(uint32_t))); + + /* + * ibm,numa-distance-table: contains all node distances. First + * element is the size of the table as uint32, followed up + * by all the uint8 distances from the first NUMA node, then all + * distances from the second NUMA node and so on. + * + * ibm,numa-lookup-index-table is used by guest to navigate this + * array because NUMA ids can be sparse (node 0 is the first, + * node 8 is the second ...). + */ + distance_table = g_new0(uint32_t, distance_table_entries + 1); + distance_table[0] = cpu_to_be32(distance_table_entries); + + node_distances = (uint8_t *)&distance_table[1]; + i = 0; + + for (src = 0; src < nb_numa_nodes; src++) { + for (dst = 0; dst < nb_numa_nodes; dst++) { + node_distances[i++] = numa_info[src].distance[dst]; + } + } + + distance_table_size = distance_table_entries * sizeof(uint8_t) + + sizeof(uint32_t); + _FDT(fdt_setprop(fdt, rtas, "ibm,numa-distance-table", + distance_table, distance_table_size)); +} + +/* + * This helper could be compressed in a single function with + * FORM1 logic since we're setting the same DT values, with the + * difference being a call to spapr_numa_FORM2_write_rtas_tables() + * in the end. The separation was made to avoid clogging FORM1 code + * which already has to deal with compat modes from previous + * QEMU machine types. + */ +static void spapr_numa_FORM2_write_rtas_dt(SpaprMachineState *spapr, + void *fdt, int rtas) +{ + MachineState *ms = MACHINE(spapr); + uint32_t number_nvgpus_nodes = spapr->gpu_numa_id - + spapr_numa_initial_nvgpu_numa_id(ms); + + /* + * In FORM2, ibm,associativity-reference-points will point to + * the element in the ibm,associativity array that contains the + * primary domain index (for FORM2, the first element). + * + * This value (in our case, the numa-id) is then used as an index + * to retrieve all other attributes of the node (distance, + * bandwidth, latency) via ibm,numa-lookup-index-table and other + * ibm,numa-*-table properties. + */ + uint32_t refpoints[] = { cpu_to_be32(1) }; + + uint32_t maxdomain = ms->numa_state->num_nodes + number_nvgpus_nodes; + uint32_t maxdomains[] = { cpu_to_be32(1), cpu_to_be32(maxdomain) }; + + _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", + refpoints, sizeof(refpoints))); + + _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", + maxdomains, sizeof(maxdomains))); + + spapr_numa_FORM2_write_rtas_tables(spapr, fdt, rtas); +} + /* * Helper that writes ibm,associativity-reference-points and * max-associativity-domains in the RTAS pointed by @rtas @@ -454,6 +595,11 @@ static void spapr_numa_FORM1_write_rtas_dt(SpaprMachineState *spapr, */ void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) { + if (spapr_ovec_test(spapr->ov5_cas, OV5_FORM2_AFFINITY)) { + spapr_numa_FORM2_write_rtas_dt(spapr, fdt, rtas); + return; + } + spapr_numa_FORM1_write_rtas_dt(spapr, fdt, rtas); } -- cgit v1.1 From 0d5ba48112daf820daddb5c952265fa23e092e69 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 20 Sep 2021 14:49:47 -0300 Subject: spapr_numa.c: handle auto NUMA node with no distance info numa_complete_configuration() in hw/core/numa.c always adds a NUMA node for the pSeries machine if none was specified, but without node distance information for the single node created. NUMA FORM1 affinity code didn't rely on numa_state information to do its job, but FORM2 does. As is now, this is the result of a pSeries guest with NUMA FORM2 affinity when no NUMA nodes is specified: $ numactl -H available: 1 nodes (0) node 0 cpus: 0 node 0 size: 16222 MB node 0 free: 15681 MB No distance information available. This can be amended in spapr_numa_FORM2_write_rtas_tables(). We're enforcing that the local distance (the distance to the node to itself) is always 10. This allows for the proper creation of the NUMA distance tables, fixing the output of 'numactl -H' in the guest: $ numactl -H available: 1 nodes (0) node 0 cpus: 0 node 0 size: 16222 MB node 0 free: 15685 MB node distances: node 0 0: 10 CC: Igor Mammedov Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20210920174947.556324-8-danielhb413@gmail.com> Acked-by: Igor Mammedov Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 13db321..58d5dc7 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -539,6 +539,17 @@ static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr, for (src = 0; src < nb_numa_nodes; src++) { for (dst = 0; dst < nb_numa_nodes; dst++) { + /* + * We need to be explicit with the local distance + * value to cover the case where the user didn't added any + * NUMA nodes, but QEMU adds the default NUMA node without + * adding the numa_info to retrieve distance info from. + */ + if (src == dst) { + node_distances[i++] = 10; + continue; + } + node_distances[i++] = numa_info[src].distance[dst]; } } -- cgit v1.1 From af96d2e69227fc25f663d840527dbe2a0c592400 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 20 Sep 2021 08:12:02 +0200 Subject: target/ppc: Convert debug to trace events (decrementer and IRQ) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédric Le Goater Message-Id: <20210920061203.989563-4-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/ppc.c | 169 +++++++++++++++++++--------------------------------- hw/ppc/trace-events | 22 ++++++- 2 files changed, 82 insertions(+), 109 deletions(-) (limited to 'hw') diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c index a327206..b813ef7 100644 --- a/hw/ppc/ppc.c +++ b/hw/ppc/ppc.c @@ -37,22 +37,6 @@ #include "migration/vmstate.h" #include "trace.h" -//#define PPC_DEBUG_IRQ -//#define PPC_DEBUG_TB - -#ifdef PPC_DEBUG_IRQ -# define LOG_IRQ(...) qemu_log_mask(CPU_LOG_INT, ## __VA_ARGS__) -#else -# define LOG_IRQ(...) do { } while (0) -#endif - - -#ifdef PPC_DEBUG_TB -# define LOG_TB(...) qemu_log(__VA_ARGS__) -#else -# define LOG_TB(...) do { } while (0) -#endif - static void cpu_ppc_tb_stop (CPUPPCState *env); static void cpu_ppc_tb_start (CPUPPCState *env); @@ -86,9 +70,8 @@ void ppc_set_irq(PowerPCCPU *cpu, int n_IRQ, int level) } - LOG_IRQ("%s: %p n_IRQ %d level %d => pending %08" PRIx32 - "req %08x\n", __func__, env, n_IRQ, level, - env->pending_interrupts, CPU(cpu)->interrupt_request); + trace_ppc_irq_set_exit(env, n_IRQ, level, env->pending_interrupts, + CPU(cpu)->interrupt_request); if (locked) { qemu_mutex_unlock_iothread(); @@ -102,8 +85,8 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level == 0) || (cur_level == 0 && level != 0)) { @@ -112,8 +95,7 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) switch (pin) { case PPC6xx_INPUT_TBEN: /* Level sensitive - active high */ - LOG_IRQ("%s: %s the time base\n", - __func__, level ? "start" : "stop"); + trace_ppc_irq_set_state("time base", level); if (level) { cpu_ppc_tb_start(env); } else { @@ -122,14 +104,12 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) break; case PPC6xx_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPC6xx_INPUT_SMI: /* Level sensitive - active high */ - LOG_IRQ("%s: set the SMI IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("SMI IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_SMI, level); break; case PPC6xx_INPUT_MCP: @@ -138,8 +118,7 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) * 603/604/740/750: check HID0[EMCP] */ if (cur_level == 1 && level == 0) { - LOG_IRQ("%s: raise machine check state\n", - __func__); + trace_ppc_irq_set_state("machine check", 1); ppc_set_irq(cpu, PPC_INTERRUPT_MCK, 1); } break; @@ -148,20 +127,19 @@ static void ppc6xx_set_irq(void *opaque, int pin, int level) /* XXX: TODO: relay the signal to CKSTP_OUT pin */ /* XXX: Note that the only way to restart the CPU is to reset it */ if (level) { - LOG_IRQ("%s: stop the CPU\n", __func__); + trace_ppc_irq_cpu("stop"); cs->halted = 1; } break; case PPC6xx_INPUT_HRESET: /* Level sensitive - active low */ if (level) { - LOG_IRQ("%s: reset the CPU\n", __func__); + trace_ppc_irq_reset("CPU"); cpu_interrupt(cs, CPU_INTERRUPT_RESET); } break; case PPC6xx_INPUT_SRESET: - LOG_IRQ("%s: set the RESET IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("RESET IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_RESET, level); break; default: @@ -190,8 +168,8 @@ static void ppc970_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level == 0) || (cur_level == 0 && level != 0)) { @@ -200,14 +178,12 @@ static void ppc970_set_irq(void *opaque, int pin, int level) switch (pin) { case PPC970_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPC970_INPUT_THINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the SMI IRQ state to %d\n", __func__, - level); + trace_ppc_irq_set_state("SMI IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_THERM, level); break; case PPC970_INPUT_MCP: @@ -216,8 +192,7 @@ static void ppc970_set_irq(void *opaque, int pin, int level) * 603/604/740/750: check HID0[EMCP] */ if (cur_level == 1 && level == 0) { - LOG_IRQ("%s: raise machine check state\n", - __func__); + trace_ppc_irq_set_state("machine check", 1); ppc_set_irq(cpu, PPC_INTERRUPT_MCK, 1); } break; @@ -225,10 +200,10 @@ static void ppc970_set_irq(void *opaque, int pin, int level) /* Level sensitive - active low */ /* XXX: TODO: relay the signal to CKSTP_OUT pin */ if (level) { - LOG_IRQ("%s: stop the CPU\n", __func__); + trace_ppc_irq_cpu("stop"); cs->halted = 1; } else { - LOG_IRQ("%s: restart the CPU\n", __func__); + trace_ppc_irq_cpu("restart"); cs->halted = 0; qemu_cpu_kick(cs); } @@ -240,13 +215,11 @@ static void ppc970_set_irq(void *opaque, int pin, int level) } break; case PPC970_INPUT_SRESET: - LOG_IRQ("%s: set the RESET IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("RESET IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_RESET, level); break; case PPC970_INPUT_TBEN: - LOG_IRQ("%s: set the TBEN state to %d\n", __func__, - level); + trace_ppc_irq_set_state("TBEN IRQ", level); /* XXX: TODO */ break; default: @@ -272,14 +245,12 @@ static void power7_set_irq(void *opaque, int pin, int level) { PowerPCCPU *cpu = opaque; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - &cpu->env, pin, level); + trace_ppc_irq_set(&cpu->env, pin, level); switch (pin) { case POWER7_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; default: @@ -300,24 +271,22 @@ static void power9_set_irq(void *opaque, int pin, int level) { PowerPCCPU *cpu = opaque; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - &cpu->env, pin, level); + trace_ppc_irq_set(&cpu->env, pin, level); switch (pin) { case POWER9_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case POWER9_INPUT_HINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("HV external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_HVIRT, level); break; default: g_assert_not_reached(); + return; } } @@ -393,8 +362,8 @@ static void ppc40x_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level == 0) || (cur_level == 0 && level != 0)) { @@ -403,51 +372,47 @@ static void ppc40x_set_irq(void *opaque, int pin, int level) switch (pin) { case PPC40x_INPUT_RESET_SYS: if (level) { - LOG_IRQ("%s: reset the PowerPC system\n", - __func__); + trace_ppc_irq_reset("system"); ppc40x_system_reset(cpu); } break; case PPC40x_INPUT_RESET_CHIP: if (level) { - LOG_IRQ("%s: reset the PowerPC chip\n", __func__); + trace_ppc_irq_reset("chip"); ppc40x_chip_reset(cpu); } break; case PPC40x_INPUT_RESET_CORE: /* XXX: TODO: update DBSR[MRR] */ if (level) { - LOG_IRQ("%s: reset the PowerPC core\n", __func__); + trace_ppc_irq_reset("core"); ppc40x_core_reset(cpu); } break; case PPC40x_INPUT_CINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the critical IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("critical IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_CEXT, level); break; case PPC40x_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the external IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("external IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPC40x_INPUT_HALT: /* Level sensitive - active low */ if (level) { - LOG_IRQ("%s: stop the CPU\n", __func__); + trace_ppc_irq_cpu("stop"); cs->halted = 1; } else { - LOG_IRQ("%s: restart the CPU\n", __func__); + trace_ppc_irq_cpu("restart"); cs->halted = 0; qemu_cpu_kick(cs); } break; case PPC40x_INPUT_DEBUG: /* Level sensitive - active high */ - LOG_IRQ("%s: set the debug pin state to %d\n", - __func__, level); + trace_ppc_irq_set_state("debug pin", level); ppc_set_irq(cpu, PPC_INTERRUPT_DEBUG, level); break; default: @@ -475,41 +440,37 @@ static void ppce500_set_irq(void *opaque, int pin, int level) CPUPPCState *env = &cpu->env; int cur_level; - LOG_IRQ("%s: env %p pin %d level %d\n", __func__, - env, pin, level); + trace_ppc_irq_set(env, pin, level); + cur_level = (env->irq_input_state >> pin) & 1; /* Don't generate spurious events */ if ((cur_level == 1 && level == 0) || (cur_level == 0 && level != 0)) { switch (pin) { case PPCE500_INPUT_MCK: if (level) { - LOG_IRQ("%s: reset the PowerPC system\n", - __func__); + trace_ppc_irq_reset("system"); qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); } break; case PPCE500_INPUT_RESET_CORE: if (level) { - LOG_IRQ("%s: reset the PowerPC core\n", __func__); + trace_ppc_irq_reset("core"); ppc_set_irq(cpu, PPC_INTERRUPT_MCK, level); } break; case PPCE500_INPUT_CINT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the critical IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("critical IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_CEXT, level); break; case PPCE500_INPUT_INT: /* Level sensitive - active high */ - LOG_IRQ("%s: set the core IRQ state to %d\n", - __func__, level); + trace_ppc_irq_set_state("core IRQ", level); ppc_set_irq(cpu, PPC_INTERRUPT_EXT, level); break; case PPCE500_INPUT_DEBUG: /* Level sensitive - active high */ - LOG_IRQ("%s: set the debug pin state to %d\n", - __func__, level); + trace_ppc_irq_set_state("debug pin", level); ppc_set_irq(cpu, PPC_INTERRUPT_DEBUG, level); break; default: @@ -564,7 +525,7 @@ uint64_t cpu_ppc_load_tbl (CPUPPCState *env) } tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb; } @@ -575,7 +536,7 @@ static inline uint32_t _cpu_ppc_load_tbu(CPUPPCState *env) uint64_t tb; tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb >> 32; } @@ -595,8 +556,7 @@ static inline void cpu_ppc_store_tb(ppc_tb_t *tb_env, uint64_t vmclk, *tb_offsetp = value - muldiv64(vmclk, tb_env->tb_freq, NANOSECONDS_PER_SECOND); - LOG_TB("%s: tb %016" PRIx64 " offset %08" PRIx64 "\n", - __func__, value, *tb_offsetp); + trace_ppc_tb_store(value, *tb_offsetp); } void cpu_ppc_store_tbl (CPUPPCState *env, uint32_t value) @@ -632,7 +592,7 @@ uint64_t cpu_ppc_load_atbl (CPUPPCState *env) uint64_t tb; tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb; } @@ -643,7 +603,7 @@ uint32_t cpu_ppc_load_atbu (CPUPPCState *env) uint64_t tb; tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset); - LOG_TB("%s: tb %016" PRIx64 "\n", __func__, tb); + trace_ppc_tb_load(tb); return tb >> 32; } @@ -762,7 +722,7 @@ static inline int64_t _cpu_ppc_load_decr(CPUPPCState *env, uint64_t next) } else { decr = -muldiv64(-diff, tb_env->decr_freq, NANOSECONDS_PER_SECOND); } - LOG_TB("%s: %016" PRIx64 "\n", __func__, decr); + trace_ppc_decr_load(decr); return decr; } @@ -821,7 +781,7 @@ uint64_t cpu_ppc_load_purr (CPUPPCState *env) static inline void cpu_ppc_decr_excp(PowerPCCPU *cpu) { /* Raise it */ - LOG_TB("raise decrementer exception\n"); + trace_ppc_decr_excp("raise"); ppc_set_irq(cpu, PPC_INTERRUPT_DECR, 1); } @@ -835,7 +795,7 @@ static inline void cpu_ppc_hdecr_excp(PowerPCCPU *cpu) CPUPPCState *env = &cpu->env; /* Raise it */ - LOG_TB("raise hv decrementer exception\n"); + trace_ppc_decr_excp("raise HV"); /* The architecture specifies that we don't deliver HDEC * interrupts in a PM state. Not only they don't cause a @@ -870,8 +830,7 @@ static void __cpu_ppc_store_decr(PowerPCCPU *cpu, uint64_t *nextp, value |= (0xFFFFFFFFULL << nr_bits); } - LOG_TB("%s: " TARGET_FMT_lx " => " TARGET_FMT_lx "\n", __func__, - decr, value); + trace_ppc_decr_store(nr_bits, decr, value); if (kvm_enabled()) { /* KVM handles decrementer exceptions, we don't need our own timer */ @@ -1199,9 +1158,8 @@ static void cpu_4xx_fit_cb (void *opaque) if ((env->spr[SPR_40x_TCR] >> 23) & 0x1) { ppc_set_irq(cpu, PPC_INTERRUPT_FIT, 1); } - LOG_TB("%s: ir %d TCR " TARGET_FMT_lx " TSR " TARGET_FMT_lx "\n", __func__, - (int)((env->spr[SPR_40x_TCR] >> 23) & 0x1), - env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); + trace_ppc4xx_fit((int)((env->spr[SPR_40x_TCR] >> 23) & 0x1), + env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); } /* Programmable interval timer */ @@ -1215,11 +1173,10 @@ static void start_stop_pit (CPUPPCState *env, ppc_tb_t *tb_env, int is_excp) !((env->spr[SPR_40x_TCR] >> 26) & 0x1) || (is_excp && !((env->spr[SPR_40x_TCR] >> 22) & 0x1))) { /* Stop PIT */ - LOG_TB("%s: stop PIT\n", __func__); + trace_ppc4xx_pit_stop(); timer_del(tb_env->decr_timer); } else { - LOG_TB("%s: start PIT %016" PRIx64 "\n", - __func__, ppc40x_timer->pit_reload); + trace_ppc4xx_pit_start(ppc40x_timer->pit_reload); now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); next = now + muldiv64(ppc40x_timer->pit_reload, NANOSECONDS_PER_SECOND, tb_env->decr_freq); @@ -1248,9 +1205,7 @@ static void cpu_4xx_pit_cb (void *opaque) ppc_set_irq(cpu, ppc40x_timer->decr_excp, 1); } start_stop_pit(env, tb_env, 1); - LOG_TB("%s: ar %d ir %d TCR " TARGET_FMT_lx " TSR " TARGET_FMT_lx " " - "%016" PRIx64 "\n", __func__, - (int)((env->spr[SPR_40x_TCR] >> 22) & 0x1), + trace_ppc4xx_pit((int)((env->spr[SPR_40x_TCR] >> 22) & 0x1), (int)((env->spr[SPR_40x_TCR] >> 26) & 0x1), env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR], ppc40x_timer->pit_reload); @@ -1290,8 +1245,7 @@ static void cpu_4xx_wdt_cb (void *opaque) next = now + muldiv64(next, NANOSECONDS_PER_SECOND, tb_env->decr_freq); if (next == now) next++; - LOG_TB("%s: TCR " TARGET_FMT_lx " TSR " TARGET_FMT_lx "\n", __func__, - env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); + trace_ppc4xx_wdt(env->spr[SPR_40x_TCR], env->spr[SPR_40x_TSR]); switch ((env->spr[SPR_40x_TSR] >> 30) & 0x3) { case 0x0: case 0x1: @@ -1334,7 +1288,7 @@ void store_40x_pit (CPUPPCState *env, target_ulong val) tb_env = env->tb_env; ppc40x_timer = tb_env->opaque; - LOG_TB("%s val" TARGET_FMT_lx "\n", __func__, val); + trace_ppc40x_store_pit(val); ppc40x_timer->pit_reload = val; start_stop_pit(env, tb_env, 0); } @@ -1349,8 +1303,7 @@ static void ppc_40x_set_tb_clk (void *opaque, uint32_t freq) CPUPPCState *env = opaque; ppc_tb_t *tb_env = env->tb_env; - LOG_TB("%s set new frequency to %" PRIu32 "\n", __func__, - freq); + trace_ppc40x_set_tb_clk(freq); tb_env->tb_freq = freq; tb_env->decr_freq = freq; /* XXX: we should also update all timers */ @@ -1369,7 +1322,7 @@ clk_setup_cb ppc_40x_timers_init (CPUPPCState *env, uint32_t freq, tb_env->tb_freq = freq; tb_env->decr_freq = freq; tb_env->opaque = ppc40x_timer; - LOG_TB("%s freq %" PRIu32 "\n", __func__, freq); + trace_ppc40x_timers_init(freq); if (ppc40x_timer != NULL) { /* We use decr timer for PIT */ tb_env->decr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_4xx_pit_cb, env); diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index da6e74b..3bf43fa 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -97,7 +97,27 @@ vof_claimed(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx6 # ppc.c ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" - +ppc_tb_load(uint64_t tb) "tb 0x%016" PRIx64 +ppc_tb_store(uint64_t tb, uint64_t offset) "tb 0x%016" PRIx64 " offset 0x%08" PRIx64 + +ppc_decr_load(uint64_t tb) "decr 0x%016" PRIx64 +ppc_decr_excp(const char *action) "%s decrementer" +ppc_decr_store(uint32_t nr_bits, uint64_t decr, uint64_t value) "%d-bit 0x%016" PRIx64 " => 0x%016" PRIx64 + +ppc4xx_fit(uint32_t ir, uint64_t tcr, uint64_t tsr) "ir %d TCR 0x%" PRIx64 " TSR 0x%" PRIx64 +ppc4xx_pit_stop(void) "" +ppc4xx_pit_start(uint64_t reload) "PIT 0x%016" PRIx64 +ppc4xx_pit(uint32_t ar, uint32_t ir, uint64_t tcr, uint64_t tsr, uint64_t reload) "ar %d ir %d TCR 0x%" PRIx64 " TSR 0x%" PRIx64 " PIT 0x%016" PRIx64 +ppc4xx_wdt(uint64_t tcr, uint64_t tsr) "TCR 0x%" PRIx64 " TSR 0x%" PRIx64 +ppc40x_store_pit(uint64_t value) "val 0x%" PRIx64 +ppc40x_set_tb_clk(uint32_t value) "new frequency %" PRIu32 +ppc40x_timers_init(uint32_t value) "frequency %" PRIu32 + +ppc_irq_set(void *env, uint32_t pin, uint32_t level) "env [%p] pin %d level %d" +ppc_irq_set_exit(void *env, uint32_t n_IRQ, uint32_t level, uint32_t pending, uint32_t request) "env [%p] n_IRQ %d level %d => pending 0x%08" PRIx32 " req 0x%08" PRIx32 +ppc_irq_set_state(const char *name, uint32_t level) "\"%s\" level %d" +ppc_irq_reset(const char *name) "%s" +ppc_irq_cpu(const char *action) "%s" # prep_systemio.c prep_systemio_read(uint32_t addr, uint32_t val) "read addr=0x%x val=0x%x" -- cgit v1.1 From 4d9b8ef9b5ab880d491b0c41d222b42ed83bdbfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 20 Sep 2021 08:12:03 +0200 Subject: target/ppc: Fix 64-bit decrementer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current way the mask is built can overflow with a 64-bit decrementer. Use sextract64() to extract the signed values and remove the logic to handle negative values which has become useless. Cc: Luis Fernando Fujita Pires Fixes: a8dafa525181 ("target/ppc: Implement large decrementer support for TCG") Signed-off-by: Cédric Le Goater Message-Id: <20210920061203.989563-5-clg@kaod.org> Reviewed-by: Luis Pires Signed-off-by: David Gibson --- hw/ppc/ppc.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'hw') diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c index b813ef7..f5d012f 100644 --- a/hw/ppc/ppc.c +++ b/hw/ppc/ppc.c @@ -821,14 +821,12 @@ static void __cpu_ppc_store_decr(PowerPCCPU *cpu, uint64_t *nextp, CPUPPCState *env = &cpu->env; ppc_tb_t *tb_env = env->tb_env; uint64_t now, next; - bool negative; + int64_t signed_value; + int64_t signed_decr; /* Truncate value to decr_width and sign extend for simplicity */ - value &= ((1ULL << nr_bits) - 1); - negative = !!(value & (1ULL << (nr_bits - 1))); - if (negative) { - value |= (0xFFFFFFFFULL << nr_bits); - } + signed_value = sextract64(value, 0, nr_bits); + signed_decr = sextract64(decr, 0, nr_bits); trace_ppc_decr_store(nr_bits, decr, value); @@ -850,16 +848,16 @@ static void __cpu_ppc_store_decr(PowerPCCPU *cpu, uint64_t *nextp, * On MSB edge based DEC implementations the MSB going from 0 -> 1 triggers * an edge interrupt, so raise it here too. */ - if ((value < 3) || - ((tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL) && negative) || - ((tb_env->flags & PPC_DECR_UNDERFLOW_TRIGGERED) && negative - && !(decr & (1ULL << (nr_bits - 1))))) { + if ((signed_value < 3) || + ((tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL) && signed_value < 0) || + ((tb_env->flags & PPC_DECR_UNDERFLOW_TRIGGERED) && signed_value < 0 + && signed_decr >= 0)) { (*raise_excp)(cpu); return; } /* On MSB level based systems a 0 for the MSB stops interrupt delivery */ - if (!negative && (tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL)) { + if (signed_value >= 0 && (tb_env->flags & PPC_DECR_UNDERFLOW_LEVEL)) { (*lower_excp)(cpu); } -- cgit v1.1 From 457279cb49d343b2c39a9efd2b28fe0fbde71f78 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Sat, 18 Sep 2021 11:26:51 +0800 Subject: hw/intc: openpic: Correct the reset value of IPIDR for FSL chipset The reset value of IPIDR should be zero for Freescale chipset, per the following 2 manuals I checked: - P2020RM (https://www.nxp.com/webapp/Download?colCode=P2020RM) - P4080RM (https://www.nxp.com/webapp/Download?colCode=P4080RM) Currently it is set to 1, which leaves the IPI enabled on core 0 after power-on reset. Such may cause unexpected interrupt to be delivered to core 0 if the IPI is triggered from core 0 to other cores later. Fixes: ffd5e9fe0276 ("openpic: Reset IRQ source private members") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/584 Signed-off-by: Bin Meng Message-Id: <20210918032653.646370-1-bin.meng@windriver.com> Signed-off-by: David Gibson --- hw/intc/openpic.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'hw') diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index 9b4c178..2790c67 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -1276,6 +1276,15 @@ static void openpic_reset(DeviceState *d) break; } + /* Mask all IPI interrupts for Freescale OpenPIC */ + if ((opp->model == OPENPIC_MODEL_FSL_MPIC_20) || + (opp->model == OPENPIC_MODEL_FSL_MPIC_42)) { + if (i >= opp->irq_ipi0 && i < opp->irq_tim0) { + write_IRQreg_idr(opp, i, 0); + continue; + } + } + write_IRQreg_idr(opp, i, opp->idr_reset); } /* Initialise IRQ destinations */ -- cgit v1.1 From 86229b68a28a8414797dd5548f3c5284f009fb53 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Sat, 18 Sep 2021 11:26:52 +0800 Subject: hw/intc: openpic: Drop Raven related codes There is no machine that uses Motorola MCP750 (aka Raven) model. Drop the related codes. While we are here, drop the mentioning of Intel GW80314 I/O companion chip in the comments as it has been obsolete for years, and correct a typo too. Signed-off-by: Bin Meng Message-Id: <20210918032653.646370-2-bin.meng@windriver.com> Signed-off-by: David Gibson --- hw/intc/openpic.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'hw') diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index 2790c67..23eafb3 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -25,12 +25,8 @@ /* * * Based on OpenPic implementations: - * - Intel GW80314 I/O companion chip developer's manual * - Motorola MPC8245 & MPC8540 user manuals. - * - Motorola MCP750 (aka Raven) programmer manual. - * - Motorola Harrier programmer manuel - * - * Serial interrupts, as implemented in Raven chipset are not supported yet. + * - Motorola Harrier programmer manual * */ @@ -1564,28 +1560,6 @@ static void openpic_realize(DeviceState *dev, Error **errp) break; - case OPENPIC_MODEL_RAVEN: - opp->nb_irqs = RAVEN_MAX_EXT; - opp->vid = VID_REVISION_1_3; - opp->vir = VIR_GENERIC; - opp->vector_mask = 0xFF; - opp->tfrr_reset = 4160000; - opp->ivpr_reset = IVPR_MASK_MASK | IVPR_MODE_MASK; - opp->idr_reset = 0; - opp->max_irq = RAVEN_MAX_IRQ; - opp->irq_ipi0 = RAVEN_IPI_IRQ; - opp->irq_tim0 = RAVEN_TMR_IRQ; - opp->brr1 = -1; - opp->mpic_mode_mask = GCR_MODE_MIXED; - - if (opp->nb_cpus != 1) { - error_setg(errp, "Only UP supported today"); - return; - } - - map_list(opp, list_le, &list_count); - break; - case OPENPIC_MODEL_KEYLARGO: opp->nb_irqs = KEYLARGO_MAX_EXT; opp->vid = VID_REVISION_1_2; -- cgit v1.1 From 06caae8af0926077338e9133dd95913aead28745 Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Sat, 18 Sep 2021 11:26:53 +0800 Subject: hw/intc: openpic: Clean up the styles Correct the multi-line comment format. No functional changes. Signed-off-by: Bin Meng Message-Id: <20210918032653.646370-3-bin.meng@windriver.com> Signed-off-by: David Gibson --- hw/intc/openpic.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'hw') diff --git a/hw/intc/openpic.c b/hw/intc/openpic.c index 23eafb3..49504e7 100644 --- a/hw/intc/openpic.c +++ b/hw/intc/openpic.c @@ -47,7 +47,7 @@ #include "qemu/timer.h" #include "qemu/error-report.h" -//#define DEBUG_OPENPIC +/* #define DEBUG_OPENPIC */ #ifdef DEBUG_OPENPIC static const int debug_openpic = 1; @@ -118,7 +118,8 @@ static FslMpicInfo fsl_mpic_42 = { #define ILR_INTTGT_CINT 0x01 /* critical */ #define ILR_INTTGT_MCP 0x02 /* machine check */ -/* The currently supported INTTGT values happen to be the same as QEMU's +/* + * The currently supported INTTGT values happen to be the same as QEMU's * openpic output codes, but don't depend on this. The output codes * could change (unlikely, but...) or support could be added for * more INTTGT values. @@ -177,10 +178,11 @@ static void openpic_cpu_write_internal(void *opaque, hwaddr addr, uint32_t val, int idx); static void openpic_reset(DeviceState *d); -/* Convert between openpic clock ticks and nanosecs. In the hardware the clock - frequency is driven by board inputs to the PIC which the PIC would then - divide by 4 or 8. For now hard code to 25MZ. -*/ +/* + * Convert between openpic clock ticks and nanosecs. In the hardware the clock + * frequency is driven by board inputs to the PIC which the PIC would then + * divide by 4 or 8. For now hard code to 25MZ. + */ #define OPENPIC_TIMER_FREQ_MHZ 25 #define OPENPIC_TIMER_NS_PER_TICK (1000 / OPENPIC_TIMER_FREQ_MHZ) static inline uint64_t ns_to_ticks(uint64_t ns) @@ -253,7 +255,8 @@ static void IRQ_local_pipe(OpenPICState *opp, int n_CPU, int n_IRQ, __func__, src->output, n_IRQ, active, was_active, dst->outputs_active[src->output]); - /* On Freescale MPIC, critical interrupts ignore priority, + /* + * On Freescale MPIC, critical interrupts ignore priority, * IACK, EOI, etc. Before MPIC v4.1 they also ignore * masking. */ @@ -276,7 +279,8 @@ static void IRQ_local_pipe(OpenPICState *opp, int n_CPU, int n_IRQ, priority = IVPR_PRIORITY(src->ivpr); - /* Even if the interrupt doesn't have enough priority, + /* + * Even if the interrupt doesn't have enough priority, * it is still raised, in case ctpr is lowered later. */ if (active) { @@ -408,7 +412,8 @@ static void openpic_set_irq(void *opaque, int n_IRQ, int level) } if (src->output != OPENPIC_OUTPUT_INT) { - /* Edge-triggered interrupts shouldn't be used + /* + * Edge-triggered interrupts shouldn't be used * with non-INT delivery, but just in case, * try to make it do something sane rather than * cause an interrupt storm. This is close to @@ -501,7 +506,8 @@ static inline void write_IRQreg_ivpr(OpenPICState *opp, int n_IRQ, uint32_t val) { uint32_t mask; - /* NOTE when implementing newer FSL MPIC models: starting with v4.0, + /* + * NOTE when implementing newer FSL MPIC models: starting with v4.0, * the polarity bit is read-only on internal interrupts. */ mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | @@ -511,7 +517,8 @@ static inline void write_IRQreg_ivpr(OpenPICState *opp, int n_IRQ, uint32_t val) opp->src[n_IRQ].ivpr = (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); - /* For FSL internal interrupts, The sense bit is reserved and zero, + /* + * For FSL internal interrupts, The sense bit is reserved and zero, * and the interrupt is always level-triggered. Timers and IPIs * have no sense or polarity bits, and are edge-triggered. */ @@ -695,16 +702,20 @@ static void qemu_timer_cb(void *opaque) openpic_set_irq(opp, n_IRQ, 0); } -/* If enabled is true, arranges for an interrupt to be raised val clocks into - the future, if enabled is false cancels the timer. */ +/* + * If enabled is true, arranges for an interrupt to be raised val clocks into + * the future, if enabled is false cancels the timer. + */ static void openpic_tmr_set_tmr(OpenPICTimer *tmr, uint32_t val, bool enabled) { uint64_t ns = ticks_to_ns(val & ~TCCR_TOG); - /* A count of zero causes a timer to be set to expire immediately. This - effectively stops the simulation since the timer is constantly expiring - which prevents guest code execution, so we don't honor that - configuration. On real hardware, this situation would generate an - interrupt on every clock cycle if the interrupt was unmasked. */ + /* + * A count of zero causes a timer to be set to expire immediately. This + * effectively stops the simulation since the timer is constantly expiring + * which prevents guest code execution, so we don't honor that + * configuration. On real hardware, this situation would generate an + * interrupt on every clock cycle if the interrupt was unmasked. + */ if ((ns == 0) || !enabled) { tmr->qemu_timer_active = false; tmr->tccr = tmr->tccr & TCCR_TOG; @@ -717,8 +728,10 @@ static void openpic_tmr_set_tmr(OpenPICTimer *tmr, uint32_t val, bool enabled) } } -/* Returns the currrent tccr value, i.e., timer value (in clocks) with - appropriate TOG. */ +/* + * Returns the currrent tccr value, i.e., timer value (in clocks) with + * appropriate TOG. + */ static uint64_t openpic_tmr_get_timer(OpenPICTimer *tmr) { uint64_t retval; @@ -1309,7 +1322,7 @@ static void openpic_reset(DeviceState *d) typedef struct MemReg { const char *name; MemoryRegionOps const *ops; - hwaddr start_addr; + hwaddr start_addr; ram_addr_t size; } MemReg; -- cgit v1.1 From 28d86252fc6d7ff0b48a0298014e4761d8580c70 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Wed, 22 Sep 2021 09:28:52 -0300 Subject: spapr_numa.c: fixes in spapr_numa_FORM2_write_rtas_tables() This patch has a handful of modifications for the recent added FORM2 support: - to not allocate more than the necessary size in 'distance_table'. At this moment the array is oversized due to allocating uint32_t for all elements, when most of them fits in an uint8_t. Fix it by changing the array to uint8_t and allocating the exact size; - use stl_be_p() to store the uint32_t at the start of 'distance_table'; - use sizeof(uint32_t) to skip the uint32_t length when populating the distances; - use the NUMA_DISTANCE_MIN macro from sysemu/numa.h to avoid hardcoding the local distance value. Signed-off-by: Daniel Henrique Barboza Message-Id: <20210922122852.130054-2-danielhb413@gmail.com> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr_numa.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'hw') diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c index 58d5dc7..5822938 100644 --- a/hw/ppc/spapr_numa.c +++ b/hw/ppc/spapr_numa.c @@ -502,9 +502,8 @@ static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr, int nb_numa_nodes = ms->numa_state->num_nodes; int distance_table_entries = nb_numa_nodes * nb_numa_nodes; g_autofree uint32_t *lookup_index_table = NULL; - g_autofree uint32_t *distance_table = NULL; + g_autofree uint8_t *distance_table = NULL; int src, dst, i, distance_table_size; - uint8_t *node_distances; /* * ibm,numa-lookup-index-table: array with length and a @@ -531,11 +530,13 @@ static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr, * array because NUMA ids can be sparse (node 0 is the first, * node 8 is the second ...). */ - distance_table = g_new0(uint32_t, distance_table_entries + 1); - distance_table[0] = cpu_to_be32(distance_table_entries); + distance_table_size = distance_table_entries * sizeof(uint8_t) + + sizeof(uint32_t); + distance_table = g_new0(uint8_t, distance_table_size); + stl_be_p(distance_table, distance_table_entries); - node_distances = (uint8_t *)&distance_table[1]; - i = 0; + /* Skip the uint32_t array length at the start */ + i = sizeof(uint32_t); for (src = 0; src < nb_numa_nodes; src++) { for (dst = 0; dst < nb_numa_nodes; dst++) { @@ -546,16 +547,14 @@ static void spapr_numa_FORM2_write_rtas_tables(SpaprMachineState *spapr, * adding the numa_info to retrieve distance info from. */ if (src == dst) { - node_distances[i++] = 10; + distance_table[i++] = NUMA_DISTANCE_MIN; continue; } - node_distances[i++] = numa_info[src].distance[dst]; + distance_table[i++] = numa_info[src].distance[dst]; } } - distance_table_size = distance_table_entries * sizeof(uint8_t) + - sizeof(uint32_t); _FDT(fdt_setprop(fdt, rtas, "ibm,numa-distance-table", distance_table, distance_table_size)); } -- cgit v1.1 From 179abc1fcf6f74e21be02c1e4ec54776354c7136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 22 Sep 2021 09:02:05 +0200 Subject: spapr/xive: Fix kvm_xive_source_reset trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trace event was placed in the wrong routine. Move it under kvmppc_xive_source_reset_one(). Fixes: 4e960974d4ee ("xive: Add trace events") Signed-off-by: Cédric Le Goater Message-Id: <20210922070205.1235943-1-clg@kaod.org> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'hw') diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 3e534b9..6d4909d 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -236,6 +236,8 @@ int kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) SpaprXive *xive = SPAPR_XIVE(xsrc->xive); uint64_t state = 0; + trace_kvm_xive_source_reset(srcno); + assert(xive->fd != -1); if (xive_source_irq_is_lsi(xsrc, srcno)) { @@ -311,8 +313,6 @@ uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset, return xive_esb_rw(xsrc, srcno, offset, data, 1); } - trace_kvm_xive_source_reset(srcno); - /* * Special Load EOI handling for LSI sources. Q bit is never set * and the interrupt should be re-triggered if the level is still -- cgit v1.1