From 1764f2452565bc7203f6e4523a3ea59bfffc35ee Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 14 Sep 2015 16:39:44 +0530 Subject: opal: Fix hang in time_wait* calls on HMI for TB errors. On TOD/TB errors the timebase register stops/freezes until HMI error recovery gets TOD/TB back into a running state. However, while HMI recovery is in progress there is a chance that some code path may invoke time_wait*() calls, which depend on a running TB value. In the event of the TB not moving, time_wait* calls would keep looping, resulting in a hang on that CPU. On OpenPower systems we are seeing system hangs on TOD/TB errors. The hang is seen inside the OPAL HMI handler while invoking prlog/perror(). The reason is that, on OpenPower systems, prlog/perror() depends on the LPC UART console driver to flush log messages to the console. UART read/write calls invoke time_wait_nopoll() inside the opb_[read|write]() functions. When the TB is in a stopped state, this causes a hang in prlog/perror() calls. This patch fixes this issue by modifying time_wait_[no]poll() to check for TB validity and return immediately if the TB is invalid. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Stewart Smith --- core/hmi.c | 8 ++++++++ core/timebase.c | 10 ++++++++++ include/cpu.h | 1 + 3 files changed, 19 insertions(+) diff --git a/core/hmi.c b/core/hmi.c index cbd35e6..f4453c5 100644 --- a/core/hmi.c +++ b/core/hmi.c @@ -610,6 +610,12 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt) pre_recovery_cleanup(); lock(&hmi_lock); + /* + * Not all HMIs would move TB into invalid state. Set the TB state + * looking at TFMR register. TFMR will tell us correct state of + * TB register. 
+ */ + this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID); printf("HMI: Received HMI interrupt: HMER = 0x%016llx\n", hmer); if (hmi_evt) hmi_evt->hmer = hmer; @@ -697,6 +703,8 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt) */ mtspr(SPR_HMER, hmer); hmi_exit(); + /* Set the TB state looking at TFMR register before we head out. */ + this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID); unlock(&hmi_lock); return recover; } diff --git a/core/timebase.c b/core/timebase.c index b1d8196..4fcfae5 100644 --- a/core/timebase.c +++ b/core/timebase.c @@ -25,6 +25,11 @@ static void time_wait_poll(unsigned long duration) unsigned long end = mftb() + duration; unsigned long period = msecs_to_tb(5); + if (this_cpu()->tb_invalid) { + cpu_relax(); + return; + } + while (tb_compare(mftb(), end) != TB_AAFTERB) { /* Call pollers periodically but not continually to avoid * bouncing cachelines due to lock contention. */ @@ -57,6 +62,11 @@ void time_wait_nopoll(unsigned long duration) { unsigned long end = mftb() + duration; + if (this_cpu()->tb_invalid) { + cpu_relax(); + return; + } + while(tb_compare(mftb(), end) != TB_AAFTERB) cpu_relax(); } diff --git a/include/cpu.h b/include/cpu.h index d2c1825..03a51f9 100644 --- a/include/cpu.h +++ b/include/cpu.h @@ -85,6 +85,7 @@ struct cpu_thread { uint32_t *core_hmi_state_ptr; /* Mask to indicate thread id in core. */ uint8_t thread_mask; + bool tb_invalid; }; /* This global is set to 1 to allow secondaries to callin, -- cgit v1.1