diff options
-rw-r--r-- | core/hmi.c | 28 | ||||
-rw-r--r-- | hw/chiptod.c | 27 |
2 files changed, 54 insertions, 1 deletions
@@ -179,6 +179,14 @@ /* Number of iterations for the various timeouts */ #define TIMEOUT_LOOPS 20000000 +/* TFMR other errors (other than bits 26 and 45) */ +#define SPR_TFMR_OTHER_ERRORS \ + (SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \ + SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \ + SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \ + SPR_TFMR_DEC_PARITY_ERR | SPR_TFMR_TFMR_CORRUPT | \ + SPR_TFMR_CHIP_TOD_INTERRUPT) + static const struct core_xstop_bit_info { uint8_t bit; /* CORE FIR bit number */ enum OpalHMI_CoreXstopReason reason; @@ -654,7 +662,12 @@ static void wait_for_cleanup_complete(void) */ static void timer_facility_do_cleanup(uint64_t tfmr) { - if (tfmr & SPR_TFMR_TB_RESIDUE_ERR) { + /* + * Workaround for a HW logic bug in Power9. Do not reset the + * TB register if TB is valid and running. + */ + if ((tfmr & SPR_TFMR_TB_RESIDUE_ERR) && !(tfmr & SPR_TFMR_TB_VALID)) { + /* Reset the TB register to clear the dirty data. */ mtspr(SPR_TBWU, 0); mtspr(SPR_TBWL, 0); @@ -841,6 +854,19 @@ static void pre_recovery_cleanup_p9(void) } /* + * Due to a HW logic bug in p9, TFMR bits 26 and 45 stay set + * once a TB residue or HDEC error has occurred for the first time. + * Hence, for an HMI on subsequent TB errors, add an additional check + * as a workaround to identify the validity of the errors and decide + * whether pre-recovery is required or not. Exit pre-recovery if + * other TB errors are also present in TFMR. + */ + if (tfmr & SPR_TFMR_OTHER_ERRORS) { + unlock(&hmi_lock); + return; + } + + /* * First thread on the core ?
* if yes, setup the hmi cleanup state to !DONE */ diff --git a/hw/chiptod.c b/hw/chiptod.c index 1dd1b26..b9e4774 100644 --- a/hw/chiptod.c +++ b/hw/chiptod.c @@ -1478,6 +1478,7 @@ int chiptod_recover_tb_errors(void) { uint64_t tfmr; int rc = -1; + int thread_id; if (chiptod_primary < 0) return 0; @@ -1503,6 +1504,17 @@ int chiptod_recover_tb_errors(void) tfmr = mfspr(SPR_TFMR); /* + * Workaround for a HW logic bug in Power9. + * Even after one thread clears the TB residue error, the change is + * not reflected to the other threads on the same core. + * Check if TB is already valid and skip the checking of TB errors. + */ + + if ((proc_gen == proc_gen_p9) && (tfmr & SPR_TFMR_TB_RESIDUE_ERR) + && (tfmr & SPR_TFMR_TB_VALID)) + goto skip_tb_error_clear; + + /* * Check for TB errors. * On Sync check error, bit 44 of TFMR is set. Check for it and * clear it. @@ -1525,6 +1537,7 @@ int chiptod_recover_tb_errors(void) } } +skip_tb_error_clear: /* * Check for TOD sync check error. * On TOD errors, bit 51 of TFMR is set. If this bit is on then we @@ -1559,6 +1572,20 @@ int chiptod_recover_tb_errors(void) } /* + * Workaround for a HW logic bug in Power9. + * In the ideal case (without the HW bug) only one thread from the core + * would have fallen through tfmr_recover_non_tb_errors() to clear the + * HDEC parity error on TFMR. + * + * Hence, to achieve the same behavior, allow only thread 0 to clear the + * HDEC parity error. For the rest of the threads, just reset the bit to + * prevent them from falling through tfmr_recover_non_tb_errors(). + */ + thread_id = cpu_get_thread_index(this_cpu()); + if ((proc_gen == proc_gen_p9) && thread_id) + tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR; + + /* * Now that TB is running, check for TFMR non-TB errors. */ if ((tfmr & SPR_TFMR_HDEC_PARITY_ERROR) || |