diff options
-rw-r--r-- | hw/npu2-opencapi.c | 55 | ||||
-rw-r--r-- | include/npu2.h | 1 |
2 files changed, 52 insertions, 4 deletions
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c index 285615a..9df51b2 100644 --- a/hw/npu2-opencapi.c +++ b/hw/npu2-opencapi.c @@ -1434,18 +1434,64 @@ static int64_t npu2_opencapi_ioda_reset(struct phb __unused *phb, return OPAL_SUCCESS; } -static int64_t npu2_opencapi_set_pe(struct phb __unused *phb, - uint64_t __unused pe_num, +static int64_t npu2_opencapi_set_pe(struct phb *phb, + uint64_t pe_num, uint64_t __unused bdfn, uint8_t __unused bcompare, uint8_t __unused dcompare, uint8_t __unused fcompare, uint8_t __unused action) { + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); /* * Ignored on OpenCAPI - we use fixed PE assignments. May need * addressing when we support dual-link devices. + * + * We nonetheless store the PE reported by the OS so that we + * can send it back in case of error. If there are several PCI + * functions on the device, the OS can define many PEs, we + * only keep one, the OS will handle it. */ + dev->linux_pe = pe_num; + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_freeze_status(struct phb *phb __unused, + uint64_t pe_number __unused, + uint8_t *freeze_state, + uint16_t *pci_error_type, + uint16_t *severity) +{ + *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN; + *pci_error_type = OPAL_EEH_NO_ERROR; + if (severity) + *severity = OPAL_EEH_SEV_NO_ERROR; + + return OPAL_SUCCESS; +} + +static int64_t npu2_opencapi_eeh_next_error(struct phb *phb, + uint64_t *first_frozen_pe, + uint16_t *pci_error_type, + uint16_t *severity) +{ + struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb); + uint64_t reg; + + if (!first_frozen_pe || !pci_error_type || !severity) + return OPAL_PARAMETER; + + reg = npu2_read(dev->npu, NPU2_MISC_FENCE_STATE); + if (reg & PPC_BIT(dev->brick_index)) { + OCAPIERR(dev, "Brick %d fenced!\n", dev->brick_index); + *first_frozen_pe = dev->linux_pe; + *pci_error_type = OPAL_EEH_PHB_ERROR; + *severity = OPAL_EEH_SEV_PHB_DEAD; + } else { + *first_frozen_pe = -1; + *pci_error_type = OPAL_EEH_NO_ERROR; + *severity = OPAL_EEH_SEV_NO_ERROR; + } return OPAL_SUCCESS; } @@ -1646,6 +1692,7 @@ static void setup_device(struct npu2_dev *dev) dev->phb_ocapi.scan_map = 0; dev->bdfn = 0; + dev->linux_pe = -1; dev->train_need_fence = false; dev->train_fenced = false; @@ -1765,10 +1812,10 @@ static const struct phb_ops npu2_opencapi_ops = { .get_msi_64 = NULL, .set_pe = npu2_opencapi_set_pe, .set_peltv = NULL, - .eeh_freeze_status = npu2_freeze_status, /* TODO */ + .eeh_freeze_status = npu2_opencapi_freeze_status, .eeh_freeze_clear = NULL, .eeh_freeze_set = NULL, - .next_error = NULL, + .next_error = npu2_opencapi_eeh_next_error, .err_inject = NULL, .get_diag_data = NULL, .get_diag_data2 = NULL, diff --git a/include/npu2.h b/include/npu2.h index 6c73679..ef4e7af 100644 --- a/include/npu2.h +++ b/include/npu2.h @@ -157,6 +157,7 @@ struct npu2_dev { /* OpenCAPI */ struct phb phb_ocapi; + uint64_t linux_pe; bool train_need_fence; bool train_fenced; }; |