aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Russell Currey <ruscur@russell.cc> 2016-01-18 16:59:42 +1100
committer Stewart Smith <stewart@linux.vnet.ibm.com> 2016-01-21 11:59:47 +1100
commit aab4407bd58e2b9342c1f552860ce5a90c68f213 (patch)
tree cca17c7101b5985b9f63f5f6b74bda4271265dba
parent 526a1705aba57ac151f43245ae0628beb0a791ac (diff)
download skiboot-aab4407bd58e2b9342c1f552860ce5a90c68f213.zip
skiboot-aab4407bd58e2b9342c1f552860ce5a90c68f213.tar.gz
skiboot-aab4407bd58e2b9342c1f552860ce5a90c68f213.tar.bz2
nvlink: Add primitive EEH support for NPU devices
Implements Extended Error Handling callbacks for NVLink devices. At present, this supports fence mode emulation, and some easily detectable freezes. There is a lot of work still to be done here, but this enables EEH to work as expected in some specific scenarios.

Signed-off-by: Russell Currey <ruscur@russell.cc>
Acked-By: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
-rw-r--r-- hw/npu.c 45
1 files changed, 42 insertions, 3 deletions
diff --git a/hw/npu.c b/hw/npu.c
index 23facaf..a3898b1 100644
--- a/hw/npu.c
+++ b/hw/npu.c
@@ -766,7 +766,6 @@ static void npu_err_interrupt(void *data, uint32_t isn)
prerror("Invalid NPU error interrupt received\n");
break;
case 6 ... 7:
- NPUERR(p, "Error handling not implemented\n");
opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
OPAL_EVENT_PCI_ERROR);
}
@@ -992,6 +991,13 @@ static int64_t npu_power_state(struct phb *phb __unused)
return OPAL_SHPC_POWER_ON;
}
+static int64_t npu_hreset(struct phb *phb __unused)
+{ /*
+   * Hot-reset callback for the NPU PHB (installed as .hot_reset in
+   * npu_ops). It is a stub: no hardware is touched and the phb
+   * argument is unused. It only emits a PR_DEBUG log line and always
+   * returns OPAL_SUCCESS.
+   */
+ prlog(PR_DEBUG, "NPU: driver should call reset procedure here\n");
+
+ return OPAL_SUCCESS;
+}
+
static int64_t npu_freset(struct phb *phb __unused)
{
/* FIXME: PHB fundamental reset, which need to be
@@ -1021,6 +1027,39 @@ static int64_t npu_freeze_status(struct phb *phb,
return OPAL_SUCCESS;
}
+static int64_t npu_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{ /*
+   * EEH "next error" callback for the NPU PHB (installed as .next_error
+   * in npu_ops).
+   *
+   * Outputs, all written unconditionally before any check:
+   *   first_frozen_pe - index of the first PE whose table entry reads
+   *                     non-zero, or -1 (all bits set) if none
+   *   pci_error_type  - OPAL_EEH_NO_ERROR, OPAL_EEH_PHB_ERROR or
+   *                     OPAL_EEH_PE_ERROR
+   *   severity        - OPAL_EEH_SEV_NO_ERROR, OPAL_EEH_SEV_PHB_FENCED
+   *                     or OPAL_EEH_SEV_PE_ER
+   *
+   * Always returns OPAL_SUCCESS; the outcome is carried by the outputs.
+   */
+ struct npu *p = phb_to_npu(phb);
+ int i;
+ uint64_t result = 0;
+ *first_frozen_pe = -1; /* defaults: reported when nothing below matches */
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ if (p->fenced) { /* fenced flag set: report a PHB-level error, skip the PE scan */
+ *pci_error_type = OPAL_EEH_PHB_ERROR;
+ *severity = OPAL_EEH_SEV_PHB_FENCED;
+ return OPAL_SUCCESS; /* first_frozen_pe is left at -1 on this path */
+ }
+
+ npu_ioda_sel(p, NPU_IODA_TBL_PESTB, 0, true); /* select PESTB table, index 0;
+   * NOTE(review): the trailing `true` presumably enables auto-increment so
+   * that each DATA0 read in the loop returns the next entry - confirm
+   * against npu_ioda_sel() */
+ for (i = 0; i < NPU_NUM_OF_PES; i++) {
+ result = in_be64(p->at_regs + NPU_IODA_DATA0);
+ if (result > 0) { /* result is unsigned, so this is a non-zero test */
+ *first_frozen_pe = i;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break; /* only the first non-zero entry is reported */
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
+
/* Sets the NPU to trigger an error when a DMA occurs */
static int64_t npu_err_inject(struct phb *phb, uint32_t pe_num,
uint32_t type, uint32_t func __unused,
@@ -1093,14 +1132,14 @@ static const struct phb_ops npu_ops = {
.power_state = npu_power_state,
.slot_power_off = NULL,
.slot_power_on = NULL,
- .hot_reset = NULL,
+ .hot_reset = npu_hreset,
.fundamental_reset = npu_freset,
.complete_reset = NULL,
.poll = NULL,
.eeh_freeze_status = npu_freeze_status,
.eeh_freeze_clear = NULL,
.eeh_freeze_set = NULL,
- .next_error = NULL,
+ .next_error = npu_eeh_next_error,
.err_inject = npu_err_inject,
.get_diag_data = NULL,
.get_diag_data2 = NULL,