diff options
author | Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> | 2018-09-21 09:01:34 +0530 |
---|---|---|
committer | Stewart Smith <stewart@linux.ibm.com> | 2018-09-27 00:41:48 -0500 |
commit | c884f2d0cb921131737df99ed3aad9f5a2d2945f (patch) | |
tree | 9581965d2234fdf2e2a49b795fb9a04477624c0e /core/hmi.c | |
parent | 1355c312c3082841eae58726cd0c44ed760dde97 (diff) | |
download | skiboot-c884f2d0cb921131737df99ed3aad9f5a2d2945f.zip skiboot-c884f2d0cb921131737df99ed3aad9f5a2d2945f.tar.gz skiboot-c884f2d0cb921131737df99ed3aad9f5a2d2945f.tar.bz2 |
opal/hmi: Handle early HMIs on thread0 when secondaries are still in OPAL.
When the primary thread receives a CORE level HMI for timer facility errors
while the secondaries are still in OPAL, thread 0 ends up in rendez-vous
waiting for the secondaries to get into HMI handling. This is because OPAL
runs with MSR(EE=0) and hence HMIs are delayed on secondary threads until
they are given to the Linux OS. Fix this by adding a check for the
secondaries' state and forcing them into HMI handling by queuing a job on
the secondary threads.
I have tested this by injecting an HDEC parity error very early during Linux
kernel boot. Recovery works fine for non-TB errors. But if the TB is bad at
this very early stage, we are already doomed.
Without this patch we see:
[ 285.046347408,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c
[ 285.051160609,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c
[ 285.055359021,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 285.055361439,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e14000) Timer Facility Error
[ 286.232183823,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc1)
[ 287.409002056,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc1)
[ 289.073820164,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc1)
[ 290.250638683,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc2)
[ 291.427456821,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc2)
[ 293.092274807,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc2)
[ 294.269092904,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc3)
[ 295.445910944,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc3)
[ 297.110728970,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc3)
After this patch:
[ 259.401719351,7] OPAL: Start CPU 0x0841 (PIR 0x0841) -> 0x000000000000a83c
[ 259.406259572,7] OPAL: Start CPU 0x0842 (PIR 0x0842) -> 0x000000000000a83c
[ 259.410615534,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c
[ 259.415444519,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c
[ 259.419641401,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419644124,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e04000) Timer Facility Error
[ 259.419650678,7] HMI: Sending hmi job to thread 1
[ 259.419652744,7] HMI: Sending hmi job to thread 2
[ 259.419653051,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419654725,7] HMI: Sending hmi job to thread 3
[ 259.419654916,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419658025,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419658406,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:2: TFMR(2e12002870e04000) Timer Facility Error
[ 259.419663095,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:3: TFMR(2e12002870e04000) Timer Facility Error
[ 259.419655234,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:1: TFMR(2e12002870e04000) Timer Facility Error
[ 259.425109779,7] OPAL: Start CPU 0x0845 (PIR 0x0845) -> 0x000000000000a83c
[ 259.429870681,7] OPAL: Start CPU 0x0846 (PIR 0x0846) -> 0x000000000000a83c
[ 259.434549250,7] OPAL: Start CPU 0x0847 (PIR 0x0847) -> 0x000000000000a83c
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
Diffstat (limited to 'core/hmi.c')
-rw-r--r-- | core/hmi.c | 49 |
1 files changed, 49 insertions, 0 deletions
@@ -32,6 +32,7 @@ #include <npu.h> #include <capp.h> #include <nvram.h> +#include <cpu.h> /* * HMER register layout: @@ -966,14 +967,54 @@ static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags) return recover; } +static int64_t opal_handle_hmi(void); + +static void opal_handle_hmi_job(void *data __unused) +{ + opal_handle_hmi(); +} + +/* + * Queue hmi handling job If secondaries are still in OPAL + * This function is called by thread 0. + */ +static struct cpu_job **hmi_kick_secondaries(void) +{ + struct cpu_thread *ts = this_cpu(); + struct cpu_job **hmi_jobs = NULL; + int job_sz = sizeof(struct cpu_job *) * cpu_thread_count; + int i; + + for (i = 1; i < cpu_thread_count; i++) { + ts = next_cpu(ts); + + /* Is this thread still in OPAL ? */ + if (ts->state == cpu_state_active) { + if (!hmi_jobs) { + hmi_jobs = zalloc(job_sz); + assert(hmi_jobs); + } + + prlog(PR_DEBUG, "Sending hmi job to thread %d\n", i); + hmi_jobs[i] = cpu_queue_job(ts, "handle_hmi_job", + opal_handle_hmi_job, NULL); + } + } + return hmi_jobs; +} + static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags) { struct cpu_thread *t, *t0; int recover = -1; + struct cpu_job **hmi_jobs = NULL; t = this_cpu(); t0 = find_cpu_by_pir(cpu_get_thread0(t)); + if (t == t0 && t0->state == cpu_state_os) + hmi_jobs = hmi_kick_secondaries(); + /* Rendez vous all threads */ hmi_rendez_vous(1); @@ -1055,6 +1096,14 @@ error_out: if (t0->tb_resynced) *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC; + if (t == t0 && hmi_jobs) { + int i; + for (i = 1; i < cpu_thread_count; i++) + if (hmi_jobs[i]) + cpu_wait_job(hmi_jobs[i], true); + free(hmi_jobs); + } + return recover; } |