aboutsummaryrefslogtreecommitdiff
path: root/core/hmi.c
diff options
context:
space:
mode:
authorMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>2018-09-21 09:01:34 +0530
committerStewart Smith <stewart@linux.ibm.com>2018-09-27 00:41:48 -0500
commitc884f2d0cb921131737df99ed3aad9f5a2d2945f (patch)
tree9581965d2234fdf2e2a49b795fb9a04477624c0e /core/hmi.c
parent1355c312c3082841eae58726cd0c44ed760dde97 (diff)
downloadskiboot-c884f2d0cb921131737df99ed3aad9f5a2d2945f.zip
skiboot-c884f2d0cb921131737df99ed3aad9f5a2d2945f.tar.gz
skiboot-c884f2d0cb921131737df99ed3aad9f5a2d2945f.tar.bz2
opal/hmi: Handle early HMIs on thread0 when secondaries are still in OPAL.
When the primary thread receives a CORE level HMI for timer facility errors while secondaries are still in OPAL, thread 0 ends up in rendez-vous waiting for secondaries to get into HMI handling. This is because OPAL runs with MSR(EE=0) and hence HMIs are delayed on secondary threads until they are given to the Linux OS. Fix this by adding a check for secondary state and forcing them into HMI handling by queuing a job on secondary threads. I have tested this by injecting an HDEC parity error very early during Linux kernel boot. Recovery works fine for non-TB errors. But if TB is bad at this very early stage we are already doomed. Without this patch we see: [ 285.046347408,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c [ 285.051160609,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c [ 285.055359021,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000 [ 285.055361439,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e14000) Timer Facility Error [ 286.232183823,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc1) [ 287.409002056,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc1) [ 289.073820164,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc1) [ 290.250638683,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc2) [ 291.427456821,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc2) [ 293.092274807,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc2) [ 294.269092904,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc3) [ 295.445910944,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc3) [ 297.110728970,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc3) After this patch: [ 259.401719351,7] OPAL: Start CPU 0x0841 (PIR 0x0841) -> 0x000000000000a83c [ 259.406259572,7] 
OPAL: Start CPU 0x0842 (PIR 0x0842) -> 0x000000000000a83c [ 259.410615534,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c [ 259.415444519,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c [ 259.419641401,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000 [ 259.419644124,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e04000) Timer Facility Error [ 259.419650678,7] HMI: Sending hmi job to thread 1 [ 259.419652744,7] HMI: Sending hmi job to thread 2 [ 259.419653051,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000 [ 259.419654725,7] HMI: Sending hmi job to thread 3 [ 259.419654916,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000 [ 259.419658025,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000 [ 259.419658406,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:2: TFMR(2e12002870e04000) Timer Facility Error [ 259.419663095,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:3: TFMR(2e12002870e04000) Timer Facility Error [ 259.419655234,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:1: TFMR(2e12002870e04000) Timer Facility Error [ 259.425109779,7] OPAL: Start CPU 0x0845 (PIR 0x0845) -> 0x000000000000a83c [ 259.429870681,7] OPAL: Start CPU 0x0846 (PIR 0x0846) -> 0x000000000000a83c [ 259.434549250,7] OPAL: Start CPU 0x0847 (PIR 0x0847) -> 0x000000000000a83c Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
Diffstat (limited to 'core/hmi.c')
-rw-r--r--core/hmi.c49
1 files changed, 49 insertions, 0 deletions
diff --git a/core/hmi.c b/core/hmi.c
index 4d1c3a7..57f3fdb 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -32,6 +32,7 @@
#include <npu.h>
#include <capp.h>
#include <nvram.h>
+#include <cpu.h>
/*
* HMER register layout:
@@ -966,14 +967,54 @@ static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
return recover;
}
+static int64_t opal_handle_hmi(void);
+
+static void opal_handle_hmi_job(void *data __unused)
+{
+ opal_handle_hmi();
+}
+
+/*
+ * Queue an HMI handling job if secondaries are still in OPAL.
+ * This function is called by thread 0.
+ */
+static struct cpu_job **hmi_kick_secondaries(void)
+{
+ struct cpu_thread *ts = this_cpu();
+ struct cpu_job **hmi_jobs = NULL;
+ int job_sz = sizeof(struct cpu_job *) * cpu_thread_count;
+ int i;
+
+ for (i = 1; i < cpu_thread_count; i++) {
+ ts = next_cpu(ts);
+
+ /* Is this thread still in OPAL ? */
+ if (ts->state == cpu_state_active) {
+ if (!hmi_jobs) {
+ hmi_jobs = zalloc(job_sz);
+ assert(hmi_jobs);
+ }
+
+ prlog(PR_DEBUG, "Sending hmi job to thread %d\n", i);
+ hmi_jobs[i] = cpu_queue_job(ts, "handle_hmi_job",
+ opal_handle_hmi_job, NULL);
+ }
+ }
+ return hmi_jobs;
+}
+
static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
{
struct cpu_thread *t, *t0;
int recover = -1;
+ struct cpu_job **hmi_jobs = NULL;
t = this_cpu();
t0 = find_cpu_by_pir(cpu_get_thread0(t));
+ if (t == t0 && t0->state == cpu_state_os)
+ hmi_jobs = hmi_kick_secondaries();
+
/* Rendez vous all threads */
hmi_rendez_vous(1);
@@ -1055,6 +1096,14 @@ error_out:
if (t0->tb_resynced)
*out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
+ if (t == t0 && hmi_jobs) {
+ int i;
+ for (i = 1; i < cpu_thread_count; i++)
+ if (hmi_jobs[i])
+ cpu_wait_job(hmi_jobs[i], true);
+ free(hmi_jobs);
+ }
+
return recover;
}