aboutsummaryrefslogtreecommitdiff
path: root/core/hmi.c
diff options
context:
space:
mode:
Diffstat (limited to 'core/hmi.c')
-rw-r--r--core/hmi.c40
1 files changed, 36 insertions, 4 deletions
diff --git a/core/hmi.c b/core/hmi.c
index ce5abd7..1564e4e 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -1121,6 +1121,7 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
struct cpu_thread *t, *t0;
int recover = -1;
struct cpu_job **hmi_jobs = NULL;
+ bool hmi_with_no_error = false;
t = this_cpu();
t0 = find_cpu_by_pir(cpu_get_thread0(t));
@@ -1128,6 +1129,19 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
if (t == t0 && t0->state == cpu_state_os)
hmi_jobs = hmi_kick_secondaries();
+ /*
+ * Handle special case: If TB is in invalid state and no TB error
+ * reported in TFMR for this HMI, then treat this as TFMR corrupt error
+ * to force the recovery procedure recover_corrupt_tfmr(). This will
+ * also reset the core level TB erorrs including Missing step. Do this
+ * only on thread 0, otherwise every thread will repeat the same
+ * procedure unnecessarily.
+ */
+ if (t == t0 && !(tfmr & SPR_TFMR_CORE_ERRORS) && this_cpu()->tb_invalid) {
+ tfmr |= SPR_TFMR_TFMR_CORRUPT;
+ hmi_with_no_error = true;
+ }
+
/* Rendez vous all threads */
hmi_rendez_vous(1);
@@ -1142,7 +1156,7 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
*/
if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
/* Check if it's still in error state */
- if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
+ if (hmi_with_no_error || mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
if (!recover_corrupt_tfmr()) {
unlock(&hmi_lock);
recover = 0;
@@ -1311,13 +1325,31 @@ static int handle_tfac_errors(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags)
if (recover != 0)
recover = recover2;
} else if (this_cpu()->tb_invalid) {
- /* This shouldn't happen, TB is invalid and no global error
- * was reported. We just return for now assuming one will
- * be. We can't do a rendez vous without a core-global HMI.
+ int recover2;
+
+ /*
+ * This shouldn't happen, TB is invalid and no global error was
+ * reported. However, On p10, in a very rare situation when
+ * core is waking up from stop2 or higher stop state, timer
+ * facility goes into error state due to Missing step, causing
+ * an HMI with no error reason set in TFMR register other than
+ * TFMR[41]=0 (tb_valid) and TFMR[28:31]=9 (tbst_encoded).
+ * Ideally, "Missing step" error should be reported in
+ * TFMR[44]=1. It looks like in this rare case, while
+ * generating HMI, HW fails to sync up the TFMR register with
+ * the core which is waking up from stop2.
+ *
+ * To be able to recover, follow down to recovery method as if
+ * we got core level TB error and treat this as TFMR corrupt
+ * error and reset all core errors including Missing step.
*/
+
prlog(PR_ERR, "HMI: TB invalid without core error reported ! "
"CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir,
mfspr(SPR_TFMR));
+ recover2 = handle_all_core_tfac_error(tfmr, out_flags);
+ if (recover != 0)
+ recover = recover2;
}
if (recover != -1 && hmi_evt) {