diff options
-rw-r--r-- | hw/npu2-common.c | 7 | ||||
-rw-r--r-- | hw/npu2-opencapi.c | 21 | ||||
-rw-r--r-- | include/npu2.h | 5 |
3 files changed, 29 insertions, 4 deletions
diff --git a/hw/npu2-common.c b/hw/npu2-common.c index 6d5c35a..51ecd0c 100644 --- a/hw/npu2-common.c +++ b/hw/npu2-common.c @@ -406,6 +406,13 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn) p->chip_id, irq_name); free(irq_name); show_all_regs(p, brick); + /* + * P9 NPU doesn't support recovering a link going down + * unexpectedly. So we mark the device as broken and + * report it to the OS, so that the error is logged + * and the drivers notified. + */ + npu2_opencapi_set_broken(p, brick); opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR); break; diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c index 140b8ef..a8dd021 100644 --- a/hw/npu2-opencapi.c +++ b/hw/npu2-opencapi.c @@ -1465,14 +1465,12 @@ static int64_t npu2_opencapi_eeh_next_error(struct phb *phb, uint16_t *severity) { struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); - uint64_t reg; if (!first_frozen_pe || !pci_error_type || !severity) return OPAL_PARAMETER; - reg = npu2_read(dev->npu, NPU2_MISC_FENCE_STATE); - if (reg & PPC_BIT(dev->brick_index)) { - OCAPIERR(dev, "Brick %d fenced!\n", dev->brick_index); + if (dev->flags & NPU2_DEV_BROKEN) { + OCAPIDBG(dev, "Reporting device as broken\n"); *first_frozen_pe = dev->linux_pe; *pci_error_type = OPAL_EEH_PHB_ERROR; *severity = OPAL_EEH_SEV_PHB_DEAD; @@ -1822,6 +1820,21 @@ static const struct phb_ops npu2_opencapi_ops = { .tce_kill = NULL, }; +void npu2_opencapi_set_broken(struct npu2 *npu, int brick) +{ + struct phb *phb; + struct npu2_dev *dev; + + for_each_phb(phb) { + if (phb->phb_type == phb_type_npu_v2_opencapi) { + dev = phb_to_npu2_dev_ocapi(phb); + if (dev->npu == npu && + dev->brick_index == brick) + dev->flags |= NPU2_DEV_BROKEN; + } + } +} + static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn, uint64_t addr, uint64_t PE_mask) { diff --git a/include/npu2.h b/include/npu2.h index 6171cd3..d2a3430 100644 --- a/include/npu2.h +++ b/include/npu2.h @@ -118,6 +118,8 @@ struct npu2_dev_nvlink { const char *slot_label; }; +#define NPU2_DEV_BROKEN 0x1 + struct npu2_dev { enum npu2_dev_type type; uint32_t link_index; @@ -126,6 +128,7 @@ struct npu2_dev { struct dt_node *dt_node; struct npu2_pcie_bar bars[2]; struct npu2 *npu; + long flags; uint32_t bdfn; @@ -251,4 +254,6 @@ int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid, int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec, bool enable); +void npu2_opencapi_set_broken(struct npu2 *npu, int brick); + #endif /* __NPU2_H */ |