diff options
author | Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> | 2015-09-14 16:39:44 +0530 |
---|---|---|
committer | Stewart Smith <stewart@linux.vnet.ibm.com> | 2015-09-15 11:15:16 +1000 |
commit | 1764f2452565bc7203f6e4523a3ea59bfffc35ee (patch) | |
tree | ff96c2b73dad358be5cdd2a0678d83faf96fb800 | |
parent | 1311068c72f82a0ccfd8cb539d7f7f062d077a23 (diff) | |
download | skiboot-1764f2452565bc7203f6e4523a3ea59bfffc35ee.zip skiboot-1764f2452565bc7203f6e4523a3ea59bfffc35ee.tar.gz skiboot-1764f2452565bc7203f6e4523a3ea59bfffc35ee.tar.bz2 |
opal: Fix hang in time_wait* calls on HMI for TB errors.
On TOD/TB errors, the timebase register stops/freezes until HMI error recovery
gets TOD/TB back into a running state. However, while HMI recovery is in
progress, there is a chance that some code path may invoke time_wait*()
calls, which depend on a running TB value. In the event of TB not moving,
time_wait* calls would keep looping, resulting in a hang on that CPU.
On OpenPower systems we are seeing a system hang on TOD/TB errors. The hang
is seen inside the OPAL HMI handler while invoking prlog/perror(). The reason
is that, on OpenPower systems, prlog/perror() depend on the LPC UART console
driver to flush log messages to the console. UART read/write calls invoke
time_wait_nopoll() inside the opb_[read|write]() functions. When TB is in
a stopped state, this causes a hang in prlog/perror() calls.
This patch fixes this issue by modifying time_wait_[no]poll() to check
for TB validity and return immediately if TB is invalid.
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
-rw-r--r-- | core/hmi.c | 8 | ||||
-rw-r--r-- | core/timebase.c | 10 | ||||
-rw-r--r-- | include/cpu.h | 1 |
3 files changed, 19 insertions, 0 deletions
@@ -610,6 +610,12 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt) pre_recovery_cleanup(); lock(&hmi_lock); + /* + * Not all HMIs would move TB into invalid state. Set the TB state + * looking at TFMR register. TFMR will tell us correct state of + * TB register. + */ + this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID); printf("HMI: Received HMI interrupt: HMER = 0x%016llx\n", hmer); if (hmi_evt) hmi_evt->hmer = hmer; @@ -697,6 +703,8 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt) */ mtspr(SPR_HMER, hmer); hmi_exit(); + /* Set the TB state looking at TFMR register before we head out. */ + this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID); unlock(&hmi_lock); return recover; } diff --git a/core/timebase.c b/core/timebase.c index b1d8196..4fcfae5 100644 --- a/core/timebase.c +++ b/core/timebase.c @@ -25,6 +25,11 @@ static void time_wait_poll(unsigned long duration) unsigned long end = mftb() + duration; unsigned long period = msecs_to_tb(5); + if (this_cpu()->tb_invalid) { + cpu_relax(); + return; + } + while (tb_compare(mftb(), end) != TB_AAFTERB) { /* Call pollers periodically but not continually to avoid * bouncing cachelines due to lock contention. */ @@ -57,6 +62,11 @@ void time_wait_nopoll(unsigned long duration) { unsigned long end = mftb() + duration; + if (this_cpu()->tb_invalid) { + cpu_relax(); + return; + } + while(tb_compare(mftb(), end) != TB_AAFTERB) cpu_relax(); } diff --git a/include/cpu.h b/include/cpu.h index d2c1825..03a51f9 100644 --- a/include/cpu.h +++ b/include/cpu.h @@ -85,6 +85,7 @@ struct cpu_thread { uint32_t *core_hmi_state_ptr; /* Mask to indicate thread id in core. */ uint8_t thread_mask; + bool tb_invalid; }; /* This global is set to 1 to allow secondaries to callin, |