From c08f384bb22a414314217bf08edefc153668d8e3 Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Wed, 28 Feb 2018 17:52:12 +1100 Subject: NPU2 HMIs: dump out a *LOT* of npu2 registers for debugging This is not the way we want to end up doing this. This is a hack to make folk happy and not require crondump to debug nvidia/npu2 issues. Cc: stable Signed-off-by: Stewart Smith (cherry picked from commit fbdc91e693fc3103f7e2a65054ed32bfb26a2e17) Signed-off-by: Stewart Smith --- core/hmi.c | 38 +++++++++++++++++++++++++++++++++++++- hw/slw.c | 4 ++-- hw/xscom.c | 36 ++++++++++++++++++++++-------------- include/npu2-regs.h | 7 ++++++- include/xscom.h | 4 ++-- 5 files changed, 69 insertions(+), 20 deletions(-) diff --git a/core/hmi.c b/core/hmi.c index 00d0fb7..17983a3 100644 --- a/core/hmi.c +++ b/core/hmi.c @@ -1,4 +1,4 @@ -/* Copyright 2013-2014 IBM Corp. +/* Copyright 2013-2018 IBM Corp. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include #include +#include <nvram.h> /* * HMER register layout: @@ -544,7 +545,10 @@ static void find_npu2_checkstop_reason(int flat_chip_id, uint64_t npu2_fir_action0_addr; uint64_t npu2_fir_action1_addr; uint64_t fatal_errors; + uint64_t npu_scom_dump[2]; + bool npu2_hmi_verbose; int total_errors = 0; + uint64_t r; /* Find the NPU on the chip associated with the HMI. 
*/ for_each_phb(phb) { @@ -596,6 +600,38 @@ static void find_npu2_checkstop_reason(int flat_chip_id, if (!total_errors) return; + npu2_hmi_verbose = nvram_query_eq("npu2-hmi-verbose", "true"); + /* Force this for now until we sort out something better */ + npu2_hmi_verbose = true; + + if (npu2_hmi_verbose) { + _xscom_lock(); + for (r = NPU2_DEBUG_REG_START; r < NPU2_DEBUG_REG_END; r++) { + npu_scom_dump[0] = npu_scom_dump[1] = 0; + _xscom_read(flat_chip_id, r++, &npu_scom_dump[0], false, true); + _xscom_read(flat_chip_id, r, &npu_scom_dump[1], false, true); + prlog(PR_ERR, "NPU: 0x%016llx=0x%016llx 0x%016llx=0x%016llx\n", + r-1, npu_scom_dump[0], + r, npu_scom_dump[1]); + } + for (r = NPU2_FIR_REGISTER_0; r < NPU2_FIR_REGISTER_END; r++) { + npu_scom_dump[0] = npu_scom_dump[1] = 0; + _xscom_read(flat_chip_id, r++, &npu_scom_dump[0], false, true); + _xscom_read(flat_chip_id, r, &npu_scom_dump[1], false, true); + prlog(PR_ERR, "NPU: 0x%016llx=0x%016llx 0x%016llx=0x%016llx\n", + r-1, npu_scom_dump[0], + r, npu_scom_dump[1]); + } + _xscom_unlock(); + prlog(PR_ERR, " _________________________ \n"); + prlog(PR_ERR, "< It's Driver Debug time! 
>\n"); + prlog(PR_ERR, " ------------------------- \n"); + prlog(PR_ERR, " \\ ,__, \n"); + prlog(PR_ERR, " \\ (oo)____ \n"); + prlog(PR_ERR, " (__) )\\ \n"); + prlog(PR_ERR, " ||--|| * \n"); + } + /* Set up the HMI event */ hmi_evt->severity = OpalHMI_SEV_WARNING; hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT; diff --git a/hw/slw.c b/hw/slw.c index f3c8374..db238ec 100644 --- a/hw/slw.c +++ b/hw/slw.c @@ -1620,7 +1620,7 @@ void slw_update_timer_expiry(uint64_t new_target) /* Grab generation and spin if odd */ _xscom_lock(); for (;;) { - rc = _xscom_read(slw_timer_chip, 0xE0006, &gen, false); + rc = _xscom_read(slw_timer_chip, 0xE0006, &gen, false, false); if (rc) { prerror("SLW: Error %lld reading tmr gen " " count\n", rc); @@ -1664,7 +1664,7 @@ void slw_update_timer_expiry(uint64_t new_target) } /* Re-check gen count */ - rc = _xscom_read(slw_timer_chip, 0xE0006, &gen2, false); + rc = _xscom_read(slw_timer_chip, 0xE0006, &gen2, false, false); if (rc) { prerror("SLW: Error %lld re-reading tmr gen " " count\n", rc); diff --git a/hw/xscom.c b/hw/xscom.c index 0501278..1bcfd47 100644 --- a/hw/xscom.c +++ b/hw/xscom.c @@ -215,8 +215,9 @@ static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr) } static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr, - bool is_write, int64_t retries, - int64_t *xscom_clear_retries) + bool is_write, int64_t retries, + int64_t *xscom_clear_retries, + bool ignore_error) { unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer); int64_t rc = OPAL_HARDWARE; @@ -277,9 +278,12 @@ static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_add } /* XXX: Create error log entry ? */ - log_simple_error(&e_info(OPAL_RC_XSCOM_RW), - "XSCOM: %s error gcid=0x%x pcb_addr=0x%x stat=0x%x\n", - is_write ? "write" : "read", gcid, pcb_addr, stat); + if (!ignore_error) + log_simple_error(&e_info(OPAL_RC_XSCOM_RW), + "XSCOM: %s error gcid=0x%x " + "pcb_addr=0x%x stat=0x%x\n", + is_write ? 
"write" : "read", gcid, + pcb_addr, stat); /* We need to reset the XSCOM or we'll hang on the next access */ xscom_reset(gcid, false); @@ -322,14 +326,16 @@ static inline bool xscom_is_multicast_addr(uint32_t addr) * Low level XSCOM access functions, perform a single direct xscom * access via MMIO */ -static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) +static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val, + bool ignore_error) { uint64_t hmer; int64_t ret, retries; int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES; if (!xscom_gcid_ok(gcid)) { - prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); + if (!ignore_error) + prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid); return OPAL_PARAMETER; } @@ -351,7 +357,7 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) /* Handle error and possibly eventually retry */ ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries, - &xscom_clear_retries); + &xscom_clear_retries, ignore_error); if (ret != OPAL_BUSY) break; } @@ -370,7 +376,8 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val) if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF) return ret; - prerror("XSCOM: Read failed, ret = %lld\n", ret); + if (!ignore_error) + prerror("XSCOM: Read failed, ret = %lld\n", ret); return ret; } @@ -403,7 +410,7 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val) /* Handle error and possibly eventually retry */ ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries, - &xscom_clear_retries); + &xscom_clear_retries, false); if (ret != OPAL_BUSY) break; } @@ -451,7 +458,7 @@ static int xscom_indirect_read_form0(uint32_t gcid, uint64_t pcb_addr, /* Wait for completion */ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { - rc = __xscom_read(gcid, addr, &data); + rc = __xscom_read(gcid, addr, &data, false); if (rc) goto bail; if ((data & XSCOM_DATA_IND_COMPLETE) && @@ -513,7 
+520,7 @@ static int xscom_indirect_write_form0(uint32_t gcid, uint64_t pcb_addr, /* Wait for completion */ for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) { - rc = __xscom_read(gcid, addr, &data); + rc = __xscom_read(gcid, addr, &data, false); if (rc) goto bail; if ((data & XSCOM_DATA_IND_COMPLETE) && @@ -588,7 +595,8 @@ void _xscom_unlock(void) /* * External API */ -int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock) +int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, + bool take_lock, bool ignore_error) { uint32_t gcid; int rc; @@ -635,7 +643,7 @@ int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_loc if (pcb_addr & XSCOM_ADDR_IND_FLAG) rc = xscom_indirect_read(gcid, pcb_addr, val); else - rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val); + rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val, ignore_error); /* Unlock it */ if (take_lock) diff --git a/include/npu2-regs.h b/include/npu2-regs.h index c109273..73925f9 100644 --- a/include/npu2-regs.h +++ b/include/npu2-regs.h @@ -1,4 +1,4 @@ -/* Copyright 2013-2016 IBM Corp. +/* Copyright 2013-2018 IBM Corp. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +24,10 @@ uint64_t npu2_read(struct npu2 *p, uint64_t reg); void npu2_write(struct npu2 *p, uint64_t reg, uint64_t val); void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask); +/* SCOM Registers to dump on HMI to aid in debugging */ +#define NPU2_DEBUG_REG_START 0x5011000 +#define NPU2_DEBUG_REG_END 0x50110FF + /* These aren't really NPU specific registers but we initialise them in NPU * code */ #define MCD0_BANK0_CN3 0x301100d @@ -468,6 +472,7 @@ void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask); #define NPU2_FIR_REGISTER_0 0x0000000005013C00 #define NPU2_FIR_REGISTER_1 0x0000000005013C40 #define NPU2_FIR_REGISTER_2 0x0000000005013C80 +#define NPU2_FIR_REGISTER_END 0x0000000005013CFF #define NPU2_TOTAL_FIR_REGISTERS 3 diff --git a/include/xscom.h b/include/xscom.h index 9853224..3193abd 100644 --- a/include/xscom.h +++ b/include/xscom.h @@ -225,7 +225,7 @@ /* Use only in select places where multiple SCOMs are time/latency sensitive */ extern void _xscom_lock(void); -extern int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock); +extern int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock, bool ignore_error); extern int _xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val, bool take_lock); extern void _xscom_unlock(void); @@ -233,7 +233,7 @@ extern void _xscom_unlock(void); /* Targeted SCOM access */ static inline int xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val) { - return _xscom_read(partid, pcb_addr, val, true); + return _xscom_read(partid, pcb_addr, val, true, false); } static inline int xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val) { return _xscom_write(partid, pcb_addr, val, true); -- cgit v1.1