aboutsummaryrefslogtreecommitdiff
path: root/core/hmi.c
diff options
context:
space:
mode:
Diffstat (limited to 'core/hmi.c')
-rw-r--r--core/hmi.c221
1 files changed, 210 insertions, 11 deletions
diff --git a/core/hmi.c b/core/hmi.c
index 120fe4b..35b6090 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -15,6 +15,7 @@
#include <xscom.h>
#include <xscom-p8-regs.h>
#include <xscom-p9-regs.h>
+#include <xscom-p10-regs.h>
#include <pci.h>
#include <cpu.h>
#include <chip.h>
@@ -27,7 +28,7 @@
#include <cpu.h>
/*
- * HMER register layout:
+ * P9 HMER register layout:
* +===+==========+============================+========+===================+
* |Bit|Name |Description |PowerKVM|Action |
* | | | |HMI | |
@@ -147,6 +148,78 @@
* NOTE: Per Dave Larson, never enable 8,9,21-23
*/
+/*
+ * P10 HMER register layout:
+ * Bit Name Description
+ * 0 malfunction_alert A processor core in the system has checkstopped
+ * (failed recovery). This is broadcasted to every
+ * processor in the system
+ *
+ * 1 reserved reserved
+ *
+ * 2 proc_rcvy_done Processor recovery occurred error-bit in fir not
+ * masked (see bit 11)
+ *
+ * 3 reserved reserved
+ *
+ * 4 tfac_error Timer facility experienced an error. TB, DEC,
+ * HDEC, PURR or SPURR may be corrupted (details in
+ * TFMR)
+ *
+ * 5 tfx_error Error occurred on transfer from tfac shadow to
+ * core
+ *
+ * 6 spurr_scale_limit Nominal frequency exceeded 399 percent
+ *
+ * 7 reserved reserved
+ *
+ * 8 xscom_fail An XSCOM operation caused by a cache inhibited
+ * load/store from this thread failed. A trap
+ * register is available.
+ *
+ * 9 xscom_done An XSCOM operation caused by a cache inhibited
+ * load/store from this thread completed. If
+ * hypervisor intends to use this bit, it is
+ * responsible for clearing it before performing the
+ * xscom operation. NOTE: this bit should always be
+ * masked in HMEER
+ *
+ * 10 reserved reserved
+ *
+ * 11 proc_rcvy_again Processor recovery occurred again before bit 2
+ * was cleared
+ *
+ * 12-15 reserved reserved
+ *
+ * 16 scom_fir_hmi An error inject to PC FIR has occurred to set HMI.
+ * This error inject can also set FIR(61) to cause
+ * recovery.
+ *
+ * 17 reserved reserved
+ *
+ * 18 trig_fir_hmi Debug trigger has occurred to set HMI. This
+ * trigger can also set FIR(60) to cause recovery
+ *
+ * 19-20 reserved reserved
+ *
+ * 21-23 xscom_status If bit 8 is active, the reason will be detailed in
+ * these bits. These bits are information only and
+ * always masked (mask = ‘0’) If hypervisor intends
+ * to use this field, it is responsible for clearing
+ * it before performing the xscom operation.
+ *
+ * 24:63 Not implemented Not implemented.
+ *
+ * P10 HMEER enabled bits:
+ * Name Action
+ * malfunction_alert Decode and log FIR bits.
+ * proc_rcvy_done Log and continue.
+ * tfac_error Log and attempt to recover time facilities.
+ * tfx_error Log and attempt to recover time facilities.
+ * spurr_scale_limit Log and continue. XXX?
+ * proc_rcvy_again Log and continue.
+ */
+
/* Used for tracking cpu threads inside hmi handling. */
#define HMI_STATE_CLEANUP_DONE 0x100
#define CORE_THREAD_MASK 0x0ff
@@ -174,13 +247,17 @@
(SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \
SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \
SPR_TFMR_TFMR_CORRUPT | SPR_TFMR_TB_RESIDUE_ERR | \
- SPR_TFMR_HDEC_PARITY_ERROR)
+ SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_TFAC_XFER_ERROR)
/* TFMR "thread" errors */
#define SPR_TFMR_THREAD_ERRORS \
(SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \
SPR_TFMR_DEC_PARITY_ERR)
+/*
+ * Starting from p9, core inits are setup to escalate all core
+ * local checkstop to system checkstop. Review this list when that changes.
+ */
static const struct core_xstop_bit_info {
uint8_t bit; /* CORE FIR bit number */
enum OpalHMI_CoreXstopReason reason;
@@ -203,10 +280,12 @@ static const struct core_xstop_bit_info {
{ 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
};
-static const struct core_recoverable_bit_info {
+struct core_fir_bit_info {
uint8_t bit; /* CORE FIR bit number */
const char *reason;
-} recoverable_bits[] = {
+};
+
+static const struct core_fir_bit_info p9_recoverable_bits[] = {
{ 0, "IFU - SRAM (ICACHE parity, etc)" },
{ 2, "IFU - RegFile" },
{ 4, "IFU - Logic" },
@@ -226,6 +305,58 @@ static const struct core_recoverable_bit_info {
{ 43, "PC - Thread hang recovery" },
};
+static const struct core_fir_bit_info p10_core_fir_bits[] = {
+ { 0, "IFU - SRAM recoverable error (ICACHE parity error, etc.)" },
+ { 1, "PC - TC checkstop" },
+ { 2, "IFU - RegFile recoverable error" },
+ { 3, "IFU - RegFile core checkstop" },
+ { 4, "IFU - Logic recoverable error" },
+ { 5, "IFU - Logic core checkstop" },
+ { 7, "VSU - Inference accumulator recoverable error" },
+ { 8, "PC - Recovery core checkstop" },
+ { 9, "VSU - Slice Target File (STF) recoverable error" },
+ { 11, "ISU - Logic recoverable error" },
+ { 12, "ISU - Logic core checkstop" },
+ { 14, "ISU - Machine check received while ME=0 checkstop" },
+ { 15, "ISU - UE from L2" },
+ { 16, "ISU - Number of UEs from L2 above threshold" },
+ { 17, "ISU - UE on CI load" },
+ { 18, "MMU - TLB recoverable error" },
+ { 19, "MMU - SLB error" },
+ { 21, "MMU - CXT recoverable error" },
+ { 22, "MMU - Logic core checkstop" },
+ { 23, "MMU - MMU system checkstop" },
+ { 24, "VSU - Logic recoverable error" },
+ { 25, "VSU - Logic core checkstop" },
+ { 26, "PC - In maint mode and recovery in progress" },
+ { 28, "PC - PC system checkstop" },
+ { 29, "LSU - SRAM recoverable error (DCACHE parity error, etc.)" },
+ { 30, "LSU - Set deleted" },
+ { 31, "LSU - RegFile recoverable error" },
+ { 32, "LSU - RegFile core checkstop" },
+ { 33, "MMU - TLB multi hit error occurred" },
+ { 34, "MMU - SLB multi hit error occurred" },
+ { 35, "LSU - ERAT multi hit error occurred" },
+ { 36, "PC - Forward progress error" },
+ { 37, "LSU - Logic recoverable error" },
+ { 38, "LSU - Logic core checkstop" },
+ { 41, "LSU - System checkstop" },
+ { 43, "PC - Thread hang recoverable error" },
+ { 45, "PC - Logic core checkstop" },
+ { 47, "PC - TimeBase facility checkstop" },
+ { 52, "PC - Hang recovery failed core checkstop" },
+ { 53, "PC - Core internal hang detected" },
+ { 55, "PC - Nest hang detected" },
+ { 56, "PC - Other core chiplet recoverable error" },
+ { 57, "PC - Other core chiplet core checkstop" },
+ { 58, "PC - Other core chiplet system checkstop" },
+ { 59, "PC - SCOM satellite error detected" },
+ { 60, "PC - Debug trigger error inject" },
+ { 61, "PC - SCOM or firmware recoverable error inject" },
+ { 62, "PC - Firmware checkstop error inject" },
+ { 63, "PC - Firmware SPRC / SPRD checkstop" },
+};
+
static const struct nx_xstop_bit_info {
uint8_t bit; /* NX FIR bit number */
enum OpalHMI_NestAccelXstopReason reason;
@@ -270,6 +401,12 @@ static int setup_scom_addresses(void)
nx_dma_engine_fir = P9_NX_DMA_ENGINE_FIR;
nx_pbi_fir = P9_NX_PBI_FIR;
return 1;
+ case proc_gen_p10:
+ malf_alert_scom = P10_MALFUNC_ALERT;
+ nx_status_reg = P10_NX_STATUS_REG;
+ nx_dma_engine_fir = P10_NX_DMA_ENGINE_FIR;
+ nx_pbi_fir = P10_NX_PBI_FIR;
+ return 1;
default:
prerror("%s: Unknown CPU type\n", __func__);
break;
@@ -320,6 +457,10 @@ static int read_core_fir(uint32_t chip_id, uint32_t core_id, uint64_t *core_fir)
rc = xscom_read(chip_id,
XSCOM_ADDR_P9_EC(core_id, P9_CORE_FIR), core_fir);
break;
+ case proc_gen_p10:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P10_EC(core_id, P10_CORE_FIR), core_fir);
+ break;
default:
rc = OPAL_HARDWARE;
}
@@ -335,6 +476,10 @@ static int read_core_wof(uint32_t chip_id, uint32_t core_id, uint64_t *core_wof)
rc = xscom_read(chip_id,
XSCOM_ADDR_P9_EC(core_id, P9_CORE_WOF), core_wof);
break;
+ case proc_gen_p10:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P10_EC(core_id, P10_CORE_WOF), core_wof);
+ break;
default:
rc = OPAL_HARDWARE;
}
@@ -394,6 +539,13 @@ static bool decode_core_fir(struct cpu_thread *cpu,
loc ? loc : "Not Available",
cpu->chip_id, core_id, core_fir);
+ if (proc_gen == proc_gen_p10) {
+ for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) {
+ if (core_fir & PPC_BIT(p10_core_fir_bits[i].bit))
+ prlog(PR_INFO, " %s\n", p10_core_fir_bits[i].reason);
+ }
+ }
+
/* Check CORE FIR bits and populate HMI event with error info. */
for (i = 0; i < ARRAY_SIZE(xstop_bits); i++) {
if (core_fir & PPC_BIT(xstop_bits[i].bit)) {
@@ -910,6 +1062,7 @@ static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
if (!loc)
loc = "Not Available";
+ /* Also covers P10 SPR_HMER_TFAC_SHADOW_XFER_ERROR */
if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
loc, this_cpu()->chip_id, core_id, thread_index,
@@ -1231,10 +1384,16 @@ static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
int i;
prlog(PR_DEBUG, "Core WOF = 0x%016llx recovered error:\n", core_wof);
- for (i = 0; i < ARRAY_SIZE(recoverable_bits); i++) {
- if (core_wof & PPC_BIT(recoverable_bits[i].bit))
- prlog(PR_DEBUG, "%s\n",
- recoverable_bits[i].reason);
+ if (proc_gen <= proc_gen_p9) {
+ for (i = 0; i < ARRAY_SIZE(p9_recoverable_bits); i++) {
+ if (core_wof & PPC_BIT(p9_recoverable_bits[i].bit))
+ prlog(PR_DEBUG, " %s\n", p9_recoverable_bits[i].reason);
+ }
+ } else if (proc_gen == proc_gen_p10) {
+ for (i = 0; i < ARRAY_SIZE(p10_core_fir_bits); i++) {
+ if (core_wof & PPC_BIT(p10_core_fir_bits[i].bit))
+ prlog(PR_DEBUG, " %s\n", p10_core_fir_bits[i].reason);
+ }
}
}
@@ -1245,7 +1404,8 @@ static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
queue_hmi_event(hmi_evt, recover, out_flags);
}
}
- if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
+
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED)) {
handled |= SPR_HMER_PROC_RECV_ERROR_MASKED;
if (cpu_is_thread0(cpu) && hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
@@ -1254,6 +1414,7 @@ static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
}
hmi_print_debug("Processor recovery Done (masked).", hmer);
}
+
if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
handled |= SPR_HMER_PROC_RECV_AGAIN;
if (cpu_is_thread0(cpu) && hmi_evt) {
@@ -1264,17 +1425,30 @@ static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
hmi_print_debug("Processor recovery occurred again before"
"bit2 was cleared\n", hmer);
}
+
+ /* XXX: what to do with this? */
+ if (hmer & SPR_HMER_SPURR_SCALE_LIMIT) {
+ handled |= SPR_HMER_SPURR_SCALE_LIMIT;
+ if (cpu_is_thread0(cpu) && hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ hmi_print_debug("Turbo versus nominal frequency exceeded limit.", hmer);
+ }
+
/* Assert if we see malfunction alert, we can not continue. */
if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
handled |= SPR_HMER_MALFUNCTION_ALERT;
hmi_print_debug("Malfunction Alert", hmer);
+ recover = 0;
if (hmi_evt)
decode_malfunction(hmi_evt, out_flags);
}
/* Assert if we see Hypervisor resource error, we can not continue. */
- if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_HYP_RESOURCE_ERR)) {
handled |= SPR_HMER_HYP_RESOURCE_ERR;
hmi_print_debug("Hypervisor resource error", hmer);
@@ -1285,7 +1459,21 @@ static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
queue_hmi_event(hmi_evt, recover, out_flags);
}
}
- if (hmer & SPR_HMER_TRIG_FIR_HMI) {
+
+ /* XXX: what to do with this? */
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND)) {
+ handled |= SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND;
+ hmer &= ~SPR_HMER_THD_WAKE_BLOCKED_TM_SUSPEND;
+
+ hmi_print_debug("Attempted to wake thread when threads in TM suspend mode.", hmer);
+ if (hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE,
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ }
+
+ if ((proc_gen <= proc_gen_p9) && (hmer & SPR_HMER_TRIG_FIR_HMI)) {
handled |= SPR_HMER_TRIG_FIR_HMI;
hmer &= ~SPR_HMER_TRIG_FIR_HMI;
@@ -1296,6 +1484,17 @@ static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
queue_hmi_event(hmi_evt, recover, out_flags);
}
}
+ if ((proc_gen == proc_gen_p10) && (hmer & SPR_HMER_P10_TRIG_FIR_HMI)) {
+ handled |= SPR_HMER_P10_TRIG_FIR_HMI;
+ hmer &= ~SPR_HMER_P10_TRIG_FIR_HMI;
+
+ hmi_print_debug("Clearing unknown debug trigger", hmer);
+ if (hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+ hmi_evt->type = OpalHMI_ERROR_DEBUG_TRIG_FIR,
+ queue_hmi_event(hmi_evt, recover, out_flags);
+ }
+ }
if (recover == 0)
disable_fast_reboot("Unrecoverable HMI");