aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMahesh Salgaonkar <mahesh@linux.ibm.com>2023-03-16 00:26:08 +0530
committerReza Arbab <arbab@linux.ibm.com>2023-03-28 09:16:29 -0500
commit09fb9548e2e96a9958936486f4cb1e2594475aff (patch)
treebcd0db94b2462ef36bf144f2cafec0e0e7dc18bc
parent80e2b1dc7396d5a02d14b90cd6e86dfbacd85d1d (diff)
downloadskiboot-09fb9548e2e96a9958936486f4cb1e2594475aff.zip
skiboot-09fb9548e2e96a9958936486f4cb1e2594475aff.tar.gz
skiboot-09fb9548e2e96a9958936486f4cb1e2594475aff.tar.bz2
opal/hmi: Recover from unusual HMI with no TB error reported.
Timer facility HMIs are reported with the TB error reason set in the TFMR register. Using the reason set in the TFMR register, the OPAL HMI handler carries out the appropriate recovery procedure to successfully recover from Timer facility HMIs. However, on p10, in a very rare situation when a core is waking up from stop2 or a higher stop state, the timer facility goes into an error state due to a Missing step, causing an HMI with no error reason set in the TFMR register other than TFMR[41]=0 (tb_valid) and TFMR[28:31]=9 (tbst_encoded). Ideally, the "Missing step" error should be reported as TFMR[44]=1. It looks like in this rare case, while generating the HMI, HW fails to sync up the TFMR register with the core which is waking up from stop2. Hence, in the absence of a proper error reason, OPAL fails to recover from this unusual HMI, resulting in a system panic/crash. In order to recover from this HMI, the core level error "Missing step" needs to be reset. Handle this as a special case by treating it as a TFMR corrupt error (TFMR[60]), which will then force a reset of core level errors including Missing step. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com> Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
-rw-r--r--core/hmi.c40
1 files changed, 36 insertions, 4 deletions
diff --git a/core/hmi.c b/core/hmi.c
index ce5abd7..1564e4e 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -1121,6 +1121,7 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
struct cpu_thread *t, *t0;
int recover = -1;
struct cpu_job **hmi_jobs = NULL;
+ bool hmi_with_no_error = false;
t = this_cpu();
t0 = find_cpu_by_pir(cpu_get_thread0(t));
@@ -1128,6 +1129,19 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
if (t == t0 && t0->state == cpu_state_os)
hmi_jobs = hmi_kick_secondaries();
+ /*
+ * Handle special case: If TB is in invalid state and no TB error
+ * reported in TFMR for this HMI, then treat this as TFMR corrupt error
+ * to force the recovery procedure recover_corrupt_tfmr(). This will
+ * also reset the core level TB errors including Missing step. Do this
+ * only on thread 0, otherwise every thread will repeat the same
+ * procedure unnecessarily.
+ */
+ if (t == t0 && !(tfmr & SPR_TFMR_CORE_ERRORS) && this_cpu()->tb_invalid) {
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+ hmi_with_no_error = true;
+ }
+
/* Rendez vous all threads */
hmi_rendez_vous(1);
@@ -1142,7 +1156,7 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
*/
if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
/* Check if it's still in error state */
- if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
+ if (hmi_with_no_error || mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
if (!recover_corrupt_tfmr()) {
unlock(&hmi_lock);
recover = 0;
@@ -1311,13 +1325,31 @@ static int handle_tfac_errors(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags)
if (recover != 0)
recover = recover2;
} else if (this_cpu()->tb_invalid) {
- /* This shouldn't happen, TB is invalid and no global error
- * was reported. We just return for now assuming one will
- * be. We can't do a rendez vous without a core-global HMI.
+ int recover2;
+
+ /*
+ * This shouldn't happen, TB is invalid and no global error was
+ * reported. However, on p10, in a very rare situation when
+ * core is waking up from stop2 or higher stop state, timer
+ * facility goes into error state due to Missing step, causing
+ * an HMI with no error reason set in TFMR register other than
+ * TFMR[41]=0 (tb_valid) and TFMR[28:31]=9 (tbst_encoded).
+ * Ideally, "Missing step" error should be reported in
+ * TFMR[44]=1. It looks like in this rare case, while
+ * generating HMI, HW fails to sync up the TFMR register with
+ * the core which is waking up from stop2.
+ *
+ * To be able to recover, follow down to recovery method as if
+ * we got core level TB error and treat this as TFMR corrupt
+ * error and reset all core errors including Missing step.
*/
+
prlog(PR_ERR, "HMI: TB invalid without core error reported ! "
"CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir,
mfspr(SPR_TFMR));
+ recover2 = handle_all_core_tfac_error(tfmr, out_flags);
+ if (recover != 0)
+ recover = recover2;
}
if (recover != -1 && hmi_evt) {