diff options
-rw-r--r-- | core/hmi.c | 58 | ||||
-rw-r--r-- | hw/npu2-common.c | 259 | ||||
-rw-r--r-- | include/npu2-regs.h | 10 | ||||
-rw-r--r-- | include/npu2.h | 1 |
4 files changed, 271 insertions, 57 deletions
@@ -594,60 +594,6 @@ static void find_nx_checkstop_reason(int flat_chip_id,
 	queue_hmi_event(hmi_evt, 0, out_flags);
 }
 
-/*
- * If the year is 2018 and you still see all these hardcoded, you
- * should really replace this with the neat macros that's in the
- * NPU2 code rather than this horrible listing of every single
- * NPU2 register hardcoded for a specific chip.
- *
- * I feel dirty having even written it.
- */
-static uint32_t npu2_scom_dump[] = {
-	0x5011017, 0x5011047, 0x5011077, 0x50110A7,
-	0x5011217, 0x5011247, 0x5011277, 0x50112A7,
-	0x5011417, 0x5011447, 0x5011477, 0x50114A7,
-	0x50110DA, 0x50112DA, 0x50114DA,
-	0x50110DB, 0x50112DB, 0x50114DB,
-	0x5011011, 0x5011041, 0x5011071, 0x50110A1,
-	0x5011211, 0x5011241, 0x5011271, 0x50112A1,
-	0x5011411, 0x5011441, 0x5011471, 0x50114A1,
-	0x5011018, 0x5011048, 0x5011078, 0x50110A8,
-	0x5011218, 0x5011248, 0x5011278, 0x50112A8,
-	0x5011418, 0x5011448, 0x5011478, 0x50114A8,
-	0x5011640,
-	0x5011114, 0x5011134, 0x5011314, 0x5011334,
-	0x5011514, 0x5011534, 0x5011118, 0x5011138,
-	0x5011318, 0x5011338, 0x5011518, 0x5011538,
-	0x50110D8, 0x50112D8, 0x50114D8,
-	0x50110D9, 0x50112D9, 0x50114D9,
-	0x5011019, 0x5011049, 0x5011079, 0x50110A9,
-	0x5011219, 0x5011249, 0x5011279, 0x50112A9,
-	0x5011419, 0x5011449, 0x5011479, 0x50114A9,
-	0x50110F4, 0x50112F4, 0x50114F4,
-	0x50110F5, 0x50112F5, 0x50114F5,
-	0x50110F6, 0x50112F6, 0x50114F6,
-	0x50110FD, 0x50112FD, 0x50114FD,
-	0x50110FE, 0x50112FE, 0x50114FE,
-	0x00
-};
-
-static void dump_scoms(int flat_chip_id, const char *unit, uint32_t *scoms,
-		       const char *loc)
-{
-	uint64_t value;
-	int r;
-
-	while (*scoms != 0) {
-		value = 0;
-		r = _xscom_read(flat_chip_id, *scoms, &value, false);
-		if (r != OPAL_SUCCESS)
-			continue;
-		prlog(PR_ERR, "%s: [Loc: %s] P:%d 0x%08x=0x%016llx\n",
-		      unit, loc, flat_chip_id, *scoms, value);
-		scoms++;
-	}
-}
-
 static bool phb_is_npu2(struct dt_node *dn)
 {
 	return (dt_node_is_compatible(dn, "ibm,power9-npu-pciex") ||
@@ -731,9 +677,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
 	npu2_hmi_verbose = true;
 
 	if (npu2_hmi_verbose) {
-		_xscom_lock();
-		dump_scoms(flat_chip_id, "NPU", npu2_scom_dump, loc);
-		_xscom_unlock();
+		npu2_dump_scoms(flat_chip_id);
 		prlog(PR_ERR, " _________________________ \n");
 		prlog(PR_ERR, "< It's Driver Debug time! >\n");
 		prlog(PR_ERR, " ------------------------- \n");
diff --git a/hw/npu2-common.c b/hw/npu2-common.c
index ccbbbbc..d4c0f85 100644
--- a/hw/npu2-common.c
+++ b/hw/npu2-common.c
@@ -103,6 +103,264 @@ void npu2_write_mask_4b(struct npu2 *p, uint64_t reg, uint32_t val, uint32_t mas
 			(uint64_t)new_val << 32);
 }
 
+/* Describes one register to dump: printable name, NPU2 block and offset */
+typedef struct {
+	const char *name;
+	uint32_t block;
+	uint32_t offset;
+} npu2_scom_dump_t;
+
+/* Registers dumped once for every NPU stack being reported */
+static const npu2_scom_dump_t npu2_scom_dump_global[] = {
+	/* CQ State Machine */
+	{ "CS.SM0.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG0 },
+	{ "CS.SM1.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG0 },
+	{ "CS.SM2.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG0 },
+	{ "CS.SM3.MISC.CERR_MESSAGE0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG0 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG1 },
+	{ "CS.SM1.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG1 },
+	{ "CS.SM2.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG1 },
+	{ "CS.SM3.MISC.CERR_MESSAGE1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG1 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG2 },
+	{ "CS.SM1.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG2 },
+	{ "CS.SM2.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG2 },
+	{ "CS.SM3.MISC.CERR_MESSAGE2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG2 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG3 },
+	{ "CS.SM1.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG3 },
+	{ "CS.SM2.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG3 },
+	{ "CS.SM3.MISC.CERR_MESSAGE3", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG3 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG4 },
+	{ "CS.SM1.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG4 },
+	{ "CS.SM2.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG4 },
+	{ "CS.SM3.MISC.CERR_MESSAGE4", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG4 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG5 },
+	{ "CS.SM1.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG5 },
+	{ "CS.SM2.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG5 },
+	{ "CS.SM3.MISC.CERR_MESSAGE5", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG5 },
+
+	{ "CS.SM0.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_MSG6 },
+	{ "CS.SM1.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_MSG6 },
+	{ "CS.SM2.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_MSG6 },
+	{ "CS.SM3.MISC.CERR_MESSAGE6", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_MSG6 },
+
+	{ "CS.SM0.MISC.CERR_FIRST0", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST0 },
+	{ "CS.SM1.MISC.CERR_FIRST0", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST0 },
+	{ "CS.SM2.MISC.CERR_FIRST0", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST0 },
+	{ "CS.SM3.MISC.CERR_FIRST0", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST0 },
+
+	{ "CS.SM0.MISC.CERR_FIRST1", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST1 },
+	{ "CS.SM1.MISC.CERR_FIRST1", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST1 },
+	{ "CS.SM2.MISC.CERR_FIRST1", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST1 },
+	{ "CS.SM3.MISC.CERR_FIRST1", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST1 },
+
+	{ "CS.SM0.MISC.CERR_FIRST2", NPU2_BLOCK_SM_0, NPU2_C_ERR_RPT_FIRST2 },
+	{ "CS.SM1.MISC.CERR_FIRST2", NPU2_BLOCK_SM_1, NPU2_C_ERR_RPT_FIRST2 },
+	{ "CS.SM2.MISC.CERR_FIRST2", NPU2_BLOCK_SM_2, NPU2_C_ERR_RPT_FIRST2 },
+	{ "CS.SM3.MISC.CERR_FIRST2", NPU2_BLOCK_SM_3, NPU2_C_ERR_RPT_FIRST2 },
+
+	/* CQ Control */
+	{ "CS.CTL.MISC.CERR_MESSAGE0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG0 },
+	{ "CS.CTL.MISC.CERR_MESSAGE1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_MSG1 },
+	{ "CS.CTL.MISC.CERR_FIRST0", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST0 },
+	{ "CS.CTL.MISC.CERR_FIRST1", NPU2_BLOCK_CTL, NPU2_CQ_C_ERR_RPT_FIRST1 },
+
+	/* CQ Data */
+	{ "DAT.MISC.CERR_ECC_HOLD", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_STATUS },
+	{ "DAT.MISC.CERR_ECC_MASK", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_MASK },
+	{ "DAT.MISC.CERR_ECC_FIRST", NPU2_BLOCK_DAT, NPU2_CQ_DAT_ECC_FIRST },
+	{ "DAT.MISC.REM0", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG0 },
+	{ "DAT.MISC.REM1", NPU2_BLOCK_DAT, NPU2_CQ_DAT_RAS_MSG1 },
+};
+
+/* NTL registers; only the entries matching a brick's NTL block are dumped */
+static const npu2_scom_dump_t npu2_scom_dump_nvlink[] = {
+	{ "NTL0.REGS.CERR_FIRST1", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST1_OFF },
+	{ "NTL1.REGS.CERR_FIRST1", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST1_OFF },
+	{ "NTL0.REGS.CERR_FIRST2", NPU2_BLOCK_NTL0, NPU2_NTL_ERR_FIRST2_OFF },
+	{ "NTL1.REGS.CERR_FIRST2", NPU2_BLOCK_NTL1, NPU2_NTL_ERR_FIRST2_OFF },
+};
+
+/* OTL registers; only the entries matching a brick's OTL block are dumped */
+static const npu2_scom_dump_t npu2_scom_dump_ocapi[] = {
+	{ "OTL0.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD0 },
+	{ "OTL1.MISC.C_ERR_RPT_HOLD0", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD0 },
+	{ "OTL0.MISC.OTL_REM0", NPU2_BLOCK_OTL0, NPU2_OTL_RAS_ERR_MSG0 },
+	{ "OTL1.MISC.OTL_REM0", NPU2_BLOCK_OTL1, NPU2_OTL_RAS_ERR_MSG0 },
+	{ "OTL0.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL0, NPU2_OTL_RXI_ERR_SIG },
+	{ "OTL1.MISC.ERROR_SIG_RXI", NPU2_BLOCK_OTL1, NPU2_OTL_RXI_ERR_SIG },
+	{ "OTL0.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL0, NPU2_OTL_RXO_ERR_SIG },
+	{ "OTL1.MISC.ERROR_SIG_RXO", NPU2_BLOCK_OTL1, NPU2_OTL_RXO_ERR_SIG },
+	{ "OTL0.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL0, NPU2_OTL_ERR_RPT_HOLD1 },
+	{ "OTL1.MISC.C_ERR_RPT_HOLD1", NPU2_BLOCK_OTL1, NPU2_OTL_ERR_RPT_HOLD1 },
+};
+
+/* Read one table entry through npu2_scom_read() for a stack and log it */
+static void print_one_npu_reg(struct npu2 *npu, const npu2_scom_dump_t *scom,
+			      int stack)
+{
+	uint64_t reg, val;
+
+	reg = NPU2_REG_OFFSET(stack, scom->block, scom->offset);
+	val = npu2_scom_read(npu->chip_id, npu->xscom_base,
+			     reg, NPU2_MISC_DA_LEN_8B);
+
+	prlog(PR_ERR, "NPU[%d] STCK%d.%s 0x%llx = 0x%016llx\n",
+	      npu->chip_id, stack - 4, scom->name, reg, val);
+}
+
+/* Same as above, but for registers read directly with xscom_read() */
+static void print_one_reg(int chip_id, int brick_index,
+			  uint64_t reg_addr, const char *reg_name)
+{
+	uint64_t val;
+	int rc;
+
+	rc = xscom_read(chip_id, reg_addr, &val);
+	if (rc != OPAL_SUCCESS) {
+		/* val may not have been written: report the failure instead */
+		prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx: xscom read failed (%d)\n",
+		      chip_id, reg_name, brick_index, reg_addr, rc);
+		return;
+	}
+
+	prlog(PR_ERR, "NPU[%d] %s brick %d 0x%llx = 0x%016llx\n",
+	      chip_id, reg_name, brick_index, reg_addr, val);
+}
+
+/* Dump the nvlink table entries whose NTL block belongs to brick_index */
+static void show_nvlink_regs(struct npu2 *npu, int brick_index)
+{
+	uint32_t stack, ntl;
+	int i;
+
+	stack = NPU2_STACK_STCK_0 + brick_index / 2;
+	ntl = NPU2_BLOCK_NTL0 + (brick_index % 2) * 2;
+
+	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_nvlink); i++) {
+		if (npu2_scom_dump_nvlink[i].block == ntl)
+			print_one_npu_reg(npu, &npu2_scom_dump_nvlink[i], stack);
+	}
+}
+
+/* Dump the OTL table entries and the ODL registers for brick_index */
+static void show_opencapi_regs(struct npu2 *npu, int brick_index)
+{
+	uint32_t stack, otl;
+	int i;
+
+	stack = NPU2_STACK_STCK_0 + brick_index / 2;
+	otl = NPU2_BLOCK_OTL0 + (brick_index % 2);
+
+	/* NPU registers */
+	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_ocapi); i++) {
+		if (npu2_scom_dump_ocapi[i].block == otl)
+			print_one_npu_reg(npu, &npu2_scom_dump_ocapi[i], stack);
+	}
+
+	/* Fabric registers */
+	print_one_reg(npu->chip_id, brick_index,
+		      OB_ODL_STATUS(brick_index), "ODL status");
+	print_one_reg(npu->chip_id, brick_index,
+		      OB_ODL_TRAINING_STATUS(brick_index), "ODL training status");
+	print_one_reg(npu->chip_id, brick_index,
+		      OB_ODL_ENDPOINT_INFO(brick_index), "ODL endpoint info");
+}
+
+/*
+ * Log the NPU FIRs, the per-stack and stack-independent error registers and
+ * the link-specific registers. brick_index selects one brick, -1 means all.
+ */
+static void show_all_regs(struct npu2 *npu, int brick_index)
+{
+	int i, stack, stack_min, stack_max;
+	uint64_t fir_val, mask_val, fir_addr, mask_addr;
+	struct npu2_dev *dev;
+	npu2_scom_dump_t scom_reg;
+
+	if (brick_index != -1) {
+		stack_min = stack_max = NPU2_STACK_STCK_0 + brick_index / 2;
+	} else {
+		stack_min = NPU2_STACK_STCK_0;
+		stack_max = NPU2_STACK_STCK_2;
+		/* Avoid dumping unused stacks for opencapi on Lagrange */
+		if (npu->total_devices == 2)
+			stack_min = stack_max = NPU2_STACK_STCK_1;
+	}
+
+	/* NPU FIRs */
+	for (i = 0; i < NPU2_TOTAL_FIR_REGISTERS; i++) {
+		fir_addr = NPU2_FIR_REGISTER_0 + i * NPU2_FIR_OFFSET;
+		mask_addr = fir_addr + NPU2_FIR_MASK_OFFSET;
+		if (xscom_read(npu->chip_id, fir_addr, &fir_val) != OPAL_SUCCESS ||
+		    xscom_read(npu->chip_id, mask_addr, &mask_val) != OPAL_SUCCESS) {
+			prlog(PR_ERR, "NPU[%d] FIR%d: xscom read failed\n",
+			      npu->chip_id, i);
+			continue;
+		}
+		prlog(PR_ERR, "NPU[%d] FIR%d = 0x%016llx (mask 0x%016llx => 0x%016llx)\n",
+		      npu->chip_id, i, fir_val, mask_val, fir_val & ~mask_val);
+	}
+
+	/* NPU global, per-stack registers */
+	for (i = 0; i < ARRAY_SIZE(npu2_scom_dump_global); i++) {
+		for (stack = stack_min; stack <= stack_max; stack++)
+			print_one_npu_reg(npu, &npu2_scom_dump_global[i], stack);
+	}
+
+	/*
+	 * NPU global registers, stack independent
+	 * We have only one for now, so dump it directly
+	 */
+	scom_reg.name = "XTS.REG.ERR_HOLD";
+	scom_reg.block = NPU2_BLOCK_XTS;
+	scom_reg.offset = 0;
+	print_one_npu_reg(npu, &scom_reg, NPU2_STACK_MISC);
+
+	/* nvlink- or opencapi-specific registers */
+	for (i = 0; i < npu->total_devices; i++) {
+		dev = &npu->devices[i];
+		if (brick_index == -1 || dev->brick_index == brick_index) {
+			if (dev->type == NPU2_DEV_TYPE_NVLINK)
+				show_nvlink_regs(npu, dev->brick_index);
+			else if (dev->type == NPU2_DEV_TYPE_OPENCAPI)
+				show_opencapi_regs(npu, dev->brick_index);
+		}
+	}
+}
+
+/* Dump all NPU2 error registers of the NPU found on chip_id, if any */
+void npu2_dump_scoms(int chip_id)
+{
+	struct npu2 *npu;
+	struct phb *phb;
+	struct npu2_dev *dev;
+
+	/*
+	 * Look for the npu2 structure for that chip ID. We can access it
+	 * through the array of phbs, looking for a nvlink or opencapi
+	 * phb. We can have several entries, but they all point
+	 * to the same npu2 structure
+	 */
+	for_each_phb(phb) {
+		npu = NULL;
+		if (phb->phb_type == phb_type_npu_v2) {
+			npu = phb_to_npu2_nvlink(phb);
+		} else if (phb->phb_type == phb_type_npu_v2_opencapi) {
+			dev = phb_to_npu2_dev_ocapi(phb);
+			npu = dev->npu;
+		}
+		if (npu && npu->chip_id == chip_id) {
+			show_all_regs(npu, -1 /* all bricks */);
+			break;
+		}
+	}
+}
+
 static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
 {
 	struct npu2 *p = is->data;
@@ -182,6 +440,7 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
 		brick = 2 + ((idx - 27) % 4);
 		prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
 		      p->chip_id, brick);
+		show_all_regs(p, brick);
 		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
 					OPAL_EVENT_PCI_ERROR);
 		break;
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 939a23f..ba10b8e 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -203,6 +203,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_PERF_MASK				0x110
 #define NPU2_DBG0_CFG				0x118
 #define NPU2_DBG1_CFG				0x120
+#define NPU2_C_ERR_RPT_MSG5			0x128
+#define NPU2_C_ERR_RPT_MSG6			0x130
 
 /* CTL block registers */
 #define NPU2_CQ_CTL_MISC_CFG			0x000
@@ -295,10 +297,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_NTL_MISC_CFG3(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x008)
 #define NPU2_NTL_ERR_HOLD1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x010)
 #define NPU2_NTL_ERR_MASK1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x018)
+#define NPU2_NTL_ERR_FIRST1_OFF			0x020
 #define NPU2_NTL_ERR_FIRST1(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x020)
 #define NPU2_NTL_ERR_FIRST1_MASK(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x028)
 #define NPU2_NTL_ERR_HOLD2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x030)
 #define NPU2_NTL_ERR_MASK2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x038)
+#define NPU2_NTL_ERR_FIRST2_OFF			0x040
 #define NPU2_NTL_ERR_FIRST2(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x040)
 #define NPU2_NTL_ERR_FIRST2_MASK(ndev)		NPU2_NTL_REG_OFFSET(ndev, 0x048)
 #define NPU2_NTL_SCRATCH2(ndev)			NPU2_NTL_REG_OFFSET(ndev, 0x050)
@@ -402,6 +406,12 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define NPU2_OTL_OSL_DAR(stack, block)		NPU2_REG_OFFSET(stack, block, 0x008)
 #define NPU2_OTL_OSL_TFC(stack, block)		NPU2_REG_OFFSET(stack, block, 0x010)
 #define NPU2_OTL_OSL_PEHANDLE(stack, block)	NPU2_REG_OFFSET(stack, block, 0x018)
+#define NPU2_OTL_ERR_RPT_HOLD0			0x30
+#define NPU2_OTL_RAS_ERR_MSG0			0x68
+#define NPU2_OTL_RXI_ERR_SIG			0x70
+#define NPU2_OTL_RXO_ERR_SIG			0x78
+#define NPU2_OTL_ERR_RPT_HOLD1			0xB0
+
 
 /* Misc block registers. Unlike the SM/CTL/DAT/NTL registers above
  * there is only a single instance of each of these in the NPU so we
diff --git a/include/npu2.h b/include/npu2.h
index ef4e7af..d58aab4 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -248,4 +248,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
 			   uint8_t *freeze_state,
 			   uint16_t *pci_error_type __unused,
 			   uint16_t *severity __unused);
+void npu2_dump_scoms(int chip_id);
 #endif /* __NPU2_H */