diff options
Diffstat (limited to 'core/hmi.c')
-rw-r--r-- | core/hmi.c | 40 |
1 files changed, 36 insertions, 4 deletions
@@ -1121,6 +1121,7 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags) struct cpu_thread *t, *t0; int recover = -1; struct cpu_job **hmi_jobs = NULL; + bool hmi_with_no_error = false; t = this_cpu(); t0 = find_cpu_by_pir(cpu_get_thread0(t)); @@ -1128,6 +1129,19 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags) if (t == t0 && t0->state == cpu_state_os) hmi_jobs = hmi_kick_secondaries(); + /* + * Handle special case: If TB is in invalid state and no TB error + * reported in TFMR for this HMI, then treat this as TFMR corrupt error + * to force the recovery procedure recover_corrupt_tfmr(). This will + * also reset the core level TB erorrs including Missing step. Do this + * only on thread 0, otherwise every thread will repeat the same + * procedure unnecessarily. + */ + if (t == t0 && !(tfmr & SPR_TFMR_CORE_ERRORS) && this_cpu()->tb_invalid) { + tfmr |= SPR_TFMR_TFMR_CORRUPT; + hmi_with_no_error = true; + } + /* Rendez vous all threads */ hmi_rendez_vous(1); @@ -1142,7 +1156,7 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags) */ if (tfmr & SPR_TFMR_TFMR_CORRUPT) { /* Check if it's still in error state */ - if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT) + if (hmi_with_no_error || mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT) if (!recover_corrupt_tfmr()) { unlock(&hmi_lock); recover = 0; @@ -1311,13 +1325,31 @@ static int handle_tfac_errors(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags) if (recover != 0) recover = recover2; } else if (this_cpu()->tb_invalid) { - /* This shouldn't happen, TB is invalid and no global error - * was reported. We just return for now assuming one will - * be. We can't do a rendez vous without a core-global HMI. + int recover2; + + /* + * This shouldn't happen, TB is invalid and no global error was + * reported. However, On p10, in a very rare situation when + * core is waking up from stop2 or higher stop state, timer + * facility goes into error state due to Missing step, causing + * an HMI with no error reason set in TFMR register other than + * TFMR[41]=0 (tb_valid) and TFMR[28:31]=9 (tbst_encoded). + * Ideally, "Missing step" error should be reported in + * TFMR[44]=1. It looks like in this rare case, while + * generating HMI, HW fails to sync up the TFMR register with + * the core which is waking up from stop2. + * + * To be able to recover, follow down to recovery method as if + * we got core level TB error and treat this as TFMR corrupt + * error and reset all core errors including Missing step. */ + prlog(PR_ERR, "HMI: TB invalid without core error reported ! " "CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir, mfspr(SPR_TFMR)); + recover2 = handle_all_core_tfac_error(tfmr, out_flags); + if (recover != 0) + recover = recover2; } if (recover != -1 && hmi_evt) { |