aboutsummaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
authorNicholas Piggin <npiggin@gmail.com>2020-04-27 21:08:08 +1000
committerOliver O'Halloran <oohall@gmail.com>2020-06-11 12:52:55 +1000
commit8c49753c04ae39cbd9b238484ccb8de88a4355df (patch)
tree1ef61b3c290fe7a1cae4fc70ed4ea364c74e2275 /core
parentdca0d5345631fb8d116eaf015416a6a51ead6028 (diff)
downloadskiboot-8c49753c04ae39cbd9b238484ccb8de88a4355df.zip
skiboot-8c49753c04ae39cbd9b238484ccb8de88a4355df.tar.gz
skiboot-8c49753c04ae39cbd9b238484ccb8de88a4355df.tar.bz2
core/mce: add support for decoding and handling machine checks
This provides an initial facility to decode machine checks into human-readable strings, plus a minimum amount of metadata that a handler has to understand in order to deal with the machine check. For now this is only used by skiboot itself, to make MCE reporting nicer and to attempt an ERAT flush recovery, which is more about code coverage than really being helpful.

***********************************************
Fatal MCE at 00000000300c9c0c .memcmp+0x3c MSR 9000000000141002
Cause: instruction fetch TLB multi-hit error
Effective address: 0x00000000300c9c0c
...

The intention is to subsequently provide an OPAL API with this information that will enable an OS to implement a machine-independent OPAL machine check driver.

The code and data tables are derived from Linux code that I wrote, so relicensing is okay.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Diffstat (limited to 'core')
-rw-r--r--core/Makefile.inc2
-rw-r--r--core/exceptions.c57
-rw-r--r--core/mce.c187
3 files changed, 240 insertions, 6 deletions
diff --git a/core/Makefile.inc b/core/Makefile.inc
index cc7d789..829800e 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -7,7 +7,7 @@ CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
-CORE_OBJS += vpd.o platform.o nvram.o nvram-format.o hmi.o
+CORE_OBJS += vpd.o platform.o nvram.o nvram-format.o hmi.o mce.o
CORE_OBJS += console-log.o ipmi.o time-utils.o pel.o pool.o errorlog.o
CORE_OBJS += timer.o i2c.o rtc.o flash.o sensor.o ipmi-opal.o
CORE_OBJS += flash-subpartition.o bitmap.o buddy.o pci-quirk.o powercap.o psr.o
diff --git a/core/exceptions.c b/core/exceptions.c
index fd069aa..389548d 100644
--- a/core/exceptions.c
+++ b/core/exceptions.c
@@ -10,6 +10,7 @@
#include <opal.h>
#include <processor.h>
#include <cpu.h>
+#include <ras.h>
#define REG "%016llx"
#define REG32 "%08x"
@@ -32,6 +33,54 @@ static void dump_regs(struct stack_frame *stack)
#define EXCEPTION_MAX_STR 320
+/*
+ * Report a machine check and try to recover from it.
+ *
+ * stack: exception frame; its srr0, srr1, dsisr and dar fields are decoded
+ * nip:   address printed (and symbolised) as the location of the MCE
+ * msr:   MSR value printed alongside the location
+ * fatal: set to true when no recovery is attempted; left unchanged otherwise
+ *
+ * The only recovery implemented is an ERAT flush for causes flagged
+ * MCE_ERAT_ERROR. Every other decode result is made fatal.
+ * NOTE(review): that includes MCE_NO_ERROR, which decode_mce() documents as
+ * a spurious machine check -- confirm that treating it as fatal is intended.
+ */
+static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal)
+{
+ uint64_t mce_flags, mce_addr;
+ const char *mce_err;
+ const char *mce_fix = NULL;
+ char buf[EXCEPTION_MAX_STR];
+ size_t l;
+
+ decode_mce(stack->srr0, stack->srr1, stack->dsisr, stack->dar,
+ &mce_flags, &mce_err, &mce_addr);
+
+ /* Try to recover. */
+ if (mce_flags & MCE_ERAT_ERROR) {
+ /* Real-mode still uses ERAT, flush transient bitflips */
+ flush_erat();
+ mce_fix = "ERAT flush";
+
+ } else {
+ *fatal = true;
+ }
+
+ prerror("***********************************************\n");
+ /*
+  * First line: severity, faulting address, symbol and MSR. The severity
+  * text reads *fatal, so it also reflects the value the caller passed in.
+  * NOTE(review): l is advanced by the formatters' return values; this
+  * presumably relies on the line always fitting in buf -- confirm.
+  */
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "%s MCE at "REG" ", *fatal ? "Fatal" : "Non-fatal", nip);
+ l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip);
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr);
+ prerror("%s\n", buf);
+
+ /* Second line: the decoded cause string. */
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Cause: %s", mce_err);
+ prerror("%s\n", buf);
+ /* The address is only printed when the decoder flagged it as relevant. */
+ if (mce_flags & MCE_INVOLVED_EA) {
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Effective address: 0x%016llx", mce_addr);
+ prerror("%s\n", buf);
+ }
+
+ /* mce_fix is only set on the ERAT path, the one path that stays non-fatal. */
+ if (!*fatal) {
+ l = 0;
+ l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+ "Attempting recovery: %s", mce_fix);
+ prerror("%s\n", buf);
+ }
+}
+
void exception_entry(struct stack_frame *stack)
{
bool fatal = false;
@@ -85,11 +134,8 @@ void exception_entry(struct stack_frame *stack)
break;
case 0x200:
- fatal = true;
- prerror("***********************************************\n");
- l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
- "Fatal MCE at "REG" ", nip);
- break;
+ handle_mce(stack, nip, msr, &fatal);
+ goto no_symbol;
case 0x700: {
struct trap_table_entry *tte;
@@ -130,6 +176,7 @@ void exception_entry(struct stack_frame *stack)
l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip);
l += snprintf(buf + l, EXCEPTION_MAX_STR - l, " MSR "REG, msr);
prerror("%s\n", buf);
+no_symbol:
dump_regs(stack);
backtrace_r1((uint64_t)stack);
if (fatal) {
diff --git a/core/mce.c b/core/mce.c
new file mode 100644
index 0000000..a07eeb6
--- /dev/null
+++ b/core/mce.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Machine Check Exceptions
+ *
+ * Copyright 2020 IBM Corp.
+ */
+
+#define pr_fmt(fmt) "MCE: " fmt
+
+#include <ras.h>
+#include <opal.h>
+#include <cpu.h>
+
+/*
+ * Non-zero when SRR1 bit 42 (as numbered by PPC_BIT) is set. decode_mce()
+ * uses this to choose between decoding DSISR/DAR (bit set) and decoding
+ * SRR1/SRR0 (bit clear).
+ */
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
+/*
+ * One row of an SRR1 decode table. A row matches when
+ * (srr1 & srr1_mask) == srr1_value, see decode_ierror().
+ */
+struct mce_ierror_table {
+ unsigned long srr1_mask; /* SRR1 bits to compare; 0 terminates the table */
+ unsigned long srr1_value; /* required value of the masked bits */
+ uint64_t type; /* MCE_* flags reported for this cause */
+ const char *error_str; /* human readable description of the cause */
+};
+
+/*
+ * POWER9 causes decoded from SRR1, used by decode_mce() when
+ * SRR1_MC_LOADSTORE() is clear. Every row uses the same mask
+ * (0x00000000081c0000) with a distinct value, so at most one row can match
+ * a given SRR1. Two rows describe asynchronous store errors and are flagged
+ * MCE_LOADSTORE | MCE_IMPRECISE rather than MCE_INSNFETCH. The final
+ * all-zero row terminates the table.
+ */
+static const struct mce_ierror_table mce_p9_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000,
+ MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch memory uncorrectable error", },
+{ 0x00000000081c0000, 0x0000000000080000,
+ MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch SLB parity error", },
+{ 0x00000000081c0000, 0x00000000000c0000,
+ MCE_INSNFETCH | MCE_SLB_ERROR | MCE_INVOLVED_EA,
+ "instruction fetch SLB multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000100000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
+ "instruction fetch ERAT multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000140000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA | MCE_TLB_ERROR,
+ "instruction fetch TLB multi-hit error", },
+{ 0x00000000081c0000, 0x0000000000180000,
+ MCE_INSNFETCH | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access memory uncorrectable error", },
+{ 0x00000000081c0000, 0x00000000001c0000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch to foreign address", },
+{ 0x00000000081c0000, 0x0000000008000000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch foreign link time-out", },
+{ 0x00000000081c0000, 0x0000000008040000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access foreign link time-out", },
+{ 0x00000000081c0000, 0x00000000080c0000,
+ MCE_INSNFETCH | MCE_INVOLVED_EA,
+ "instruction fetch real address error", },
+{ 0x00000000081c0000, 0x0000000008100000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access real address error", },
+{ 0x00000000081c0000, 0x0000000008140000,
+ MCE_LOADSTORE | MCE_IMPRECISE,
+ "store real address asynchronous error", },
+{ 0x00000000081c0000, 0x0000000008180000,
+ MCE_LOADSTORE | MCE_IMPRECISE,
+ "store foreign link time-out asynchronous error", },
+{ 0x00000000081c0000, 0x00000000081c0000,
+ MCE_INSNFETCH | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "instruction fetch page table access to foreign address", },
+{ 0 } };
+
+/*
+ * One row of a DSISR decode table. A row matches when
+ * (dsisr & dsisr_value) is non-zero, see decode_derror().
+ */
+struct mce_derror_table {
+ unsigned long dsisr_value; /* DSISR bit(s) to test; 0 terminates the table */
+ uint64_t type; /* MCE_* flags reported for this cause */
+ const char *error_str; /* human readable description of the cause */
+};
+
+/*
+ * POWER9 causes decoded from DSISR, used by decode_mce() when
+ * SRR1_MC_LOADSTORE() is set. Each row tests a single DSISR bit, listed
+ * from 0x00008000 down to 0x00000008, so more than one row can match a
+ * DSISR value that has several of these bits set. The final all-zero row
+ * terminates the table.
+ */
+static const struct mce_derror_table mce_p9_derror_table[] = {
+{ 0x00008000,
+ MCE_LOADSTORE | MCE_MEMORY_ERROR,
+ "load/store memory uncorrectable error", },
+{ 0x00004000,
+ MCE_LOADSTORE | MCE_MEMORY_ERROR | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "load/store page table access memory uncorrectable error", },
+{ 0x00002000,
+ MCE_LOADSTORE | MCE_INVOLVED_EA,
+ "load/store foreign link time-out", },
+{ 0x00001000,
+ MCE_LOADSTORE | MCE_TABLE_WALK | MCE_INVOLVED_EA,
+ "load/store page table access foreign link time-out", },
+{ 0x00000800,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_ERAT_ERROR,
+ "load/store ERAT multi-hit error", },
+{ 0x00000400,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_TLB_ERROR,
+ "load/store TLB multi-hit error", },
+{ 0x00000200,
+ MCE_LOADSTORE | MCE_TLBIE_ERROR,
+ "TLBIE or TLBIEL instruction programming error", },
+{ 0x00000100,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
+ "load/store SLB parity error", },
+{ 0x00000080,
+ MCE_LOADSTORE | MCE_INVOLVED_EA | MCE_SLB_ERROR,
+ "load/store SLB multi-hit error", },
+{ 0x00000040,
+ MCE_LOADSTORE | MCE_INVOLVED_EA,
+ "load real address error", },
+{ 0x00000020,
+ MCE_LOADSTORE | MCE_TABLE_WALK,
+ "load/store page table access real address error", },
+{ 0x00000010,
+ MCE_LOADSTORE | MCE_TABLE_WALK,
+ "load/store page table access to foreign address", },
+{ 0x00000008,
+ MCE_LOADSTORE,
+ "load/store to foreign address", },
+{ 0 } };
+
+/*
+ * Look up a machine check cause in an SRR1-keyed table.
+ *
+ * table:     rows terminated by one whose srr1_mask is 0
+ * srr1:      SRR1 value saved by the machine check
+ * type:      receives the MCE_* flags of the matching row
+ * error_str: receives the description of the matching row
+ *
+ * The first row whose masked SRR1 equals its srr1_value is reported and the
+ * scan stops there, so a later row can never override an earlier match.
+ * *type and *error_str are left untouched when no row matches; the caller
+ * is expected to have preset its defaults.
+ */
+static void decode_ierror(const struct mce_ierror_table table[],
+ uint64_t srr1,
+ uint64_t *type,
+ const char **error_str)
+{
+ int i;
+
+ for (i = 0; table[i].srr1_mask; i++) {
+ if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
+ continue;
+
+ *type = table[i].type;
+ *error_str = table[i].error_str;
+ /* First match wins; nothing further down may replace it. */
+ break;
+ }
+}
+
+/*
+ * Look up a machine check cause in a DSISR-keyed table.
+ *
+ * table:     rows terminated by one whose dsisr_value is 0
+ * dsisr:     DSISR value saved by the machine check
+ * type:      receives the MCE_* flags of the matching row
+ * error_str: receives the description of the matching row
+ *
+ * DSISR may have several cause bits set at once, but only one cause can be
+ * returned. The first matching row is reported, so rows must be ordered by
+ * priority with the uncorrectable errors first. Without the early exit a
+ * lower-priority row further down (for example an ERAT multi-hit) would
+ * overwrite a memory uncorrectable error and make it look recoverable.
+ * *type and *error_str are left untouched when no row matches; the caller
+ * is expected to have preset its defaults.
+ */
+static void decode_derror(const struct mce_derror_table table[],
+ uint32_t dsisr,
+ uint64_t *type,
+ const char **error_str)
+{
+ int i;
+
+ for (i = 0; table[i].dsisr_value; i++) {
+ if (!(dsisr & table[i].dsisr_value))
+ continue;
+
+ *type = table[i].type;
+ *error_str = table[i].error_str;
+ /* Highest-priority (earliest) match wins. */
+ break;
+ }
+}
+
+/*
+ * Decode the registers saved by a machine check into a cause.
+ *
+ * srr0, srr1: SRR0/SRR1 values saved by the machine check
+ * dsisr, dar: DSISR/DAR values saved by the machine check
+ * type:       receives MCE_* flags; MCE_UNKNOWN when nothing matches
+ * error_str:  receives a static, human readable description of the cause
+ * address:    receives DAR (load/store) or SRR0 (otherwise) when the cause
+ *             is flagged MCE_INVOLVED_EA, else 0
+ *
+ * All three outputs are always written. Only proc_gen_p9 is decoded; any
+ * other processor generation returns MCE_UNKNOWN with an explanatory string.
+ */
+void decode_mce(uint64_t srr0, uint64_t srr1,
+ uint32_t dsisr, uint64_t dar,
+ uint64_t *type, const char **error_str,
+ uint64_t *address)
+{
+ /* Defaults, kept whenever the table lookups below find no match. */
+ *type = MCE_UNKNOWN;
+ *error_str = "unknown error";
+ *address = 0;
+
+ if (proc_gen != proc_gen_p9) {
+ *error_str = "unknown error (processor not supported)";
+ return;
+ }
+
+ /*
+ * On POWER9 DD2.1 and below, it's possible to get a machine check
+ * caused by a paste instruction where only DSISR bit 25 is set. This
+ * will result in the MCE handler seeing an unknown event and the
+ * kernel crashing. An MCE that occurs like this is spurious, so we
+ * don't need to do anything in terms of servicing it. If there is
+ * something that needs to be serviced, the CPU will raise the MCE
+ * again with the correct DSISR so that it can be serviced properly.
+ * So detect this case and mark it as handled.
+ */
+ if (SRR1_MC_LOADSTORE(srr1) && dsisr == 0x02000000) {
+ *type = MCE_NO_ERROR;
+ *error_str = "no error (superfluous machine check)";
+ return;
+ }
+
+ /* Load/store causes are described by DSISR/DAR, the rest by SRR1/SRR0. */
+ if (SRR1_MC_LOADSTORE(srr1)) {
+ decode_derror(mce_p9_derror_table, dsisr, type, error_str);
+ if (*type & MCE_INVOLVED_EA)
+ *address = dar;
+ } else {
+ decode_ierror(mce_p9_ierror_table, srr1, type, error_str);
+ if (*type & MCE_INVOLVED_EA)
+ *address = srr0;
+ }
+}