aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--hw/npu2-common.c7
-rw-r--r--hw/npu2-opencapi.c21
-rw-r--r--include/npu2.h5
3 files changed, 29 insertions, 4 deletions
diff --git a/hw/npu2-common.c b/hw/npu2-common.c
index 6d5c35a..51ecd0c 100644
--- a/hw/npu2-common.c
+++ b/hw/npu2-common.c
@@ -406,6 +406,13 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
p->chip_id, irq_name);
free(irq_name);
show_all_regs(p, brick);
+ /*
+ * P9 NPU doesn't support recovering a link going down
+ * unexpectedly. So we mark the device as broken and
+ * report it to the OS, so that the error is logged
+ * and the drivers notified.
+ */
+ npu2_opencapi_set_broken(p, brick);
opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
OPAL_EVENT_PCI_ERROR);
break;
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
index 140b8ef..a8dd021 100644
--- a/hw/npu2-opencapi.c
+++ b/hw/npu2-opencapi.c
@@ -1465,14 +1465,12 @@ static int64_t npu2_opencapi_eeh_next_error(struct phb *phb,
uint16_t *severity)
{
struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
- uint64_t reg;
if (!first_frozen_pe || !pci_error_type || !severity)
return OPAL_PARAMETER;
- reg = npu2_read(dev->npu, NPU2_MISC_FENCE_STATE);
- if (reg & PPC_BIT(dev->brick_index)) {
- OCAPIERR(dev, "Brick %d fenced!\n", dev->brick_index);
+ if (dev->flags & NPU2_DEV_BROKEN) {
+ OCAPIDBG(dev, "Reporting device as broken\n");
*first_frozen_pe = dev->linux_pe;
*pci_error_type = OPAL_EEH_PHB_ERROR;
*severity = OPAL_EEH_SEV_PHB_DEAD;
@@ -1822,6 +1820,21 @@ static const struct phb_ops npu2_opencapi_ops = {
.tce_kill = NULL,
};
+void npu2_opencapi_set_broken(struct npu2 *npu, int brick) /* Set NPU2_DEV_BROKEN on the OpenCAPI device matching (npu, brick); no-op if none matches. */
+{
+ struct phb *phb;
+ struct npu2_dev *dev;
+
+ for_each_phb(phb) { /* visit every PHB; the device is looked up, not passed in */
+ if (phb->phb_type == phb_type_npu_v2_opencapi) { /* PHBs of any other type are skipped */
+ dev = phb_to_npu2_dev_ocapi(phb);
+ if (dev->npu == npu && /* must belong to the caller's NPU ... */
+ dev->brick_index == brick) /* ... and sit on the requested brick */
+ dev->flags |= NPU2_DEV_BROKEN; /* flag is tested by npu2_opencapi_eeh_next_error() */
+ }
+ } /* no early exit: the scan continues after a match */
+}
+
static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn,
uint64_t addr, uint64_t PE_mask)
{
diff --git a/include/npu2.h b/include/npu2.h
index 6171cd3..d2a3430 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -118,6 +118,8 @@ struct npu2_dev_nvlink {
const char *slot_label;
};
+#define NPU2_DEV_BROKEN 0x1
+
struct npu2_dev {
enum npu2_dev_type type;
uint32_t link_index;
@@ -126,6 +128,7 @@ struct npu2_dev {
struct dt_node *dt_node;
struct npu2_pcie_bar bars[2];
struct npu2 *npu;
+ long flags;
uint32_t bdfn;
@@ -251,4 +254,6 @@ int64_t npu2_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
int64_t npu2_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
bool enable);
+void npu2_opencapi_set_broken(struct npu2 *npu, int brick);
+
#endif /* __NPU2_H */