aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>2015-09-14 16:39:44 +0530
committerStewart Smith <stewart@linux.vnet.ibm.com>2015-09-15 11:15:16 +1000
commit1764f2452565bc7203f6e4523a3ea59bfffc35ee (patch)
treeff96c2b73dad358be5cdd2a0678d83faf96fb800
parent1311068c72f82a0ccfd8cb539d7f7f062d077a23 (diff)
downloadskiboot-1764f2452565bc7203f6e4523a3ea59bfffc35ee.zip
skiboot-1764f2452565bc7203f6e4523a3ea59bfffc35ee.tar.gz
skiboot-1764f2452565bc7203f6e4523a3ea59bfffc35ee.tar.bz2
opal: Fix hang in time_wait* calls on HMI for TB errors.
On TOD/TB errors the timebase register stops/freezes until HMI error recovery gets the TOD/TB back into a running state. However, while HMI recovery is in progress, some code paths may invoke time_wait*() calls, which depend on a running TB value. If the TB is not moving, time_wait* calls keep looping, resulting in a hang on that CPU. On OpenPower systems we are seeing system hangs on TOD/TB errors. The hang is seen inside the OPAL HMI handler while invoking prlog/perror(). The reason is that, on OpenPower systems, prlog/perror() depends on the LPC UART console driver to flush log messages to the console. UART read/write calls invoke time_wait_nopoll() inside the opb_[read|write]() functions. When the TB is in a stopped state, this causes a hang in prlog/perror() calls. This patch fixes the issue by modifying time_wait_[no]poll() to check for TB validity and return immediately if the TB is invalid. Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
-rw-r--r--core/hmi.c8
-rw-r--r--core/timebase.c10
-rw-r--r--include/cpu.h1
3 files changed, 19 insertions, 0 deletions
diff --git a/core/hmi.c b/core/hmi.c
index cbd35e6..f4453c5 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -610,6 +610,12 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
pre_recovery_cleanup();
lock(&hmi_lock);
+ /*
+ * Not all HMIs would move TB into invalid state. Set the TB state
+ * looking at TFMR register. TFMR will tell us correct state of
+ * TB register.
+ */
+ this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
printf("HMI: Received HMI interrupt: HMER = 0x%016llx\n", hmer);
if (hmi_evt)
hmi_evt->hmer = hmer;
@@ -697,6 +703,8 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
*/
mtspr(SPR_HMER, hmer);
hmi_exit();
+ /* Set the TB state looking at TFMR register before we head out. */
+ this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
unlock(&hmi_lock);
return recover;
}
diff --git a/core/timebase.c b/core/timebase.c
index b1d8196..4fcfae5 100644
--- a/core/timebase.c
+++ b/core/timebase.c
@@ -25,6 +25,11 @@ static void time_wait_poll(unsigned long duration)
unsigned long end = mftb() + duration;
unsigned long period = msecs_to_tb(5);
+ if (this_cpu()->tb_invalid) {
+ cpu_relax();
+ return;
+ }
+
while (tb_compare(mftb(), end) != TB_AAFTERB) {
/* Call pollers periodically but not continually to avoid
* bouncing cachelines due to lock contention. */
@@ -57,6 +62,11 @@ void time_wait_nopoll(unsigned long duration)
{
unsigned long end = mftb() + duration;
+ if (this_cpu()->tb_invalid) {
+ cpu_relax();
+ return;
+ }
+
while(tb_compare(mftb(), end) != TB_AAFTERB)
cpu_relax();
}
diff --git a/include/cpu.h b/include/cpu.h
index d2c1825..03a51f9 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -85,6 +85,7 @@ struct cpu_thread {
uint32_t *core_hmi_state_ptr;
/* Mask to indicate thread id in core. */
uint8_t thread_mask;
+ bool tb_invalid;
};
/* This global is set to 1 to allow secondaries to callin,