aboutsummaryrefslogtreecommitdiff
path: root/core/lock.c
diff options
context:
space:
mode:
authorStewart Smith <stewart@linux.vnet.ibm.com>2015-01-21 17:46:33 +1100
committerStewart Smith <stewart@linux.vnet.ibm.com>2015-01-21 17:46:33 +1100
commit82f0a29720190172fe889af266ce62270bd46c10 (patch)
tree81764a34d4bd435747d428da16e05e498abb7522 /core/lock.c
parent162ec60743b66d15de281775176e8f051c4dfde5 (diff)
downloadskiboot-82f0a29720190172fe889af266ce62270bd46c10.zip
skiboot-82f0a29720190172fe889af266ce62270bd46c10.tar.gz
skiboot-82f0a29720190172fe889af266ce62270bd46c10.tar.bz2
Don't run pollers when sending fsp msg for op panel op_display(FATAL)
I got the following recursive lock crash: LOCK ERROR: Invalid recursive lock @0x30108108 (state: 0x0000000000000001) [315691375,0] Aborting! CPU 0000 Backtrace: S: 0000000031a03080 R: 0000000030013508 .backtrace+0x24 S: 0000000031a03100 R: 0000000030017128 .abort+0x64 S: 0000000031a03170 R: 0000000030015d34 .lock_error+0x54 S: 0000000031a031f0 R: 0000000030015de8 .lock+0x50 S: 0000000031a03270 R: 000000003004fc20 .elog_timeout_poll+0x28 <- take elog_write_lock S: 0000000031a03300 R: 0000000030017d48 .opal_run_pollers+0x54 S: 0000000031a03380 R: 0000000030043fdc .fsp_sync_msg+0x74 S: 0000000031a03410 R: 000000003004eaa0 .op_display+0xdc S: 0000000031a034c0 R: 0000000030015d08 .lock_error+0x28 <- printf"LOCK ERROR" S: 0000000031a03540 R: 0000000030015de8 .lock+0x50 <- S: 0000000031a035c0 R: 000000003002413c .rtc_cache_get+0x20 <- rtc_tod_lock INCORRECTLY KEPT LOCK S: 0000000031a03650 R: 000000003002421c .rtc_cache_get_datetime+0x24 S: 0000000031a03700 R: 0000000030023100 .create_pel_log+0x340 S: 0000000031a037f0 R: 000000003004f9dc .opal_send_elog_to_fsp+0x5c <- take elog_write_lock S: 0000000031a03880 R: 00000000300238b4 .log_simple_error+0xe0 S: 0000000031a03a10 R: 00000000300245c8 .__xscom_write+0xb8 S: 0000000031a03aa0 R: 00000000300248c0 .xscom_write+0x138 S: 0000000031a03b60 R: 000000003002af1c .occ_send_dummy_interrupt+0x34 S: 0000000031a03bd0 R: 0000000030017b9c .opal_update_pending_evt+0x68 <- take evt_lock S: 0000000031a03c60 R: 00000000300511c8 .update_opal_dump_notify+0x48 S: 0000000031a03cd0 R: 00000000300517f0 .add_dump_id_to_list+0xe8 S: 0000000031a03d80 R: 00000000300553e4 .ibm_fsp_init+0xd4 S: 0000000031a03e30 R: 00000000300140f0 .main_cpu_entry+0x3ec S: 0000000031a03f00 R: 0000000030002504 boot_entry+0x18c Which is wonderfully fascinating as there are *TWO* lock_error frames in the call stack, which means we lost the original error. This is because op_display() sends an FSP message with fsp_sync_msg() which runs opal_run_pollers() which ended up running a poller that tried to take a lock we already had when we ran into trouble. Basically, running pollers during this kind of fatal error handling is a bad idea. So, instead of doing that, just queue the FSP message and don't wait for a response (we don't really care about the response on a fatal error anyway). Additionally, move op_display() to *after* the printf call on the hope that printf ends up being a bit more foolproof than having to send FSP messages. Now, with this bug fixed, the correct lock_error() will be displayed. Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Diffstat (limited to 'core/lock.c')
-rw-r--r--core/lock.c4
1 files changed, 2 insertions, 2 deletions
diff --git a/core/lock.c b/core/lock.c
index 6a8282f..d916141 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -31,10 +31,10 @@ bool bust_locks = true;
static void lock_error(struct lock *l, const char *reason, uint16_t err)
{
- op_display(OP_FATAL, OP_MOD_LOCK, err);
-
fprintf(stderr, "LOCK ERROR: %s @%p (state: 0x%016lx)\n",
reason, l, l->lock_val);
+ op_display(OP_FATAL, OP_MOD_LOCK, err);
+
abort();
}