Diffstat (limited to 'core/exceptions.c')
-rw-r--r--	core/exceptions.c	529
1 file changed, 529 insertions, 0 deletions
diff --git a/core/exceptions.c b/core/exceptions.c
new file mode 100644
index 0000000..995ca92
--- /dev/null
+++ b/core/exceptions.c
@@ -0,0 +1,529 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <stack.h>
+#include <opal.h>
+#include <processor.h>
+#include <cpu.h>
+
+static uint64_t client_mc_address;
+
+extern uint8_t exc_primary_start;
+extern uint8_t exc_primary_end;
+
+extern uint32_t exc_primary_patch_branch;
+
+extern uint8_t exc_secondary_start;
+extern uint8_t exc_secondary_end;
+
+extern uint32_t exc_secondary_patch_stack;
+extern uint32_t exc_secondary_patch_mfsrr0;
+extern uint32_t exc_secondary_patch_mfsrr1;
+extern uint32_t exc_secondary_patch_type;
+extern uint32_t exc_secondary_patch_mtsrr0;
+extern uint32_t exc_secondary_patch_mtsrr1;
+extern uint32_t exc_secondary_patch_rfid;
+
+static struct lock hmi_lock = LOCK_UNLOCKED;
+
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
+#define SRR1_MC_IFETCH(srr1) ((srr1) & PPC_BITMASK(43,45))
+#define SRR1_MC_IFETCH_UE (0x1 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_SLB_PARITY (0x2 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_SLB_MULTIHIT (0x3 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_SLB_BOTH (0x4 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_TLB_MULTIHIT (0x5 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_UE_TLB_RELOAD (0x6 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_UE_IFU_INTERNAL (0x7 << PPC_BITLSHIFT(45))
+
+#define DSISR_MC_UE (PPC_BIT(48))
+#define DSISR_MC_UE_TABLEWALK (PPC_BIT(49))
+#define DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52))
+#define DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53))
+#define DSISR_MC_TLB_MULTIHIT_MFSLB (PPC_BIT(55))
+#define DSISR_MC_TLB_MULTIHIT (PPC_BIT(53) | PPC_BIT(55))
+#define DSISR_MC_SLB_MULTIHIT (PPC_BIT(56))
+#define DSISR_MC_SLB_MULTIHIT_PARITY (PPC_BIT(57))
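+
+/* Note: the bit numbers above follow the Power ISA big-endian
+ * convention (bit 0 = MSB), which is what the PPC_BIT()/PPC_BITMASK()/
+ * PPC_BITLSHIFT() macros encode. */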
+
+static void mce_set_ierror(struct opal_machine_check_event *mce, uint64_t srr1)
+{
+ switch (SRR1_MC_IFETCH(srr1)) {
+ case SRR1_MC_IFETCH_SLB_PARITY:
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type = OpalMCE_SLB_ERROR_PARITY;
+ break;
+
+ case SRR1_MC_IFETCH_SLB_MULTIHIT:
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type = OpalMCE_SLB_ERROR_MULTIHIT;
+ break;
+
+ case SRR1_MC_IFETCH_SLB_BOTH:
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type =
+ OpalMCE_SLB_ERROR_INDETERMINATE;
+ break;
+
+ case SRR1_MC_IFETCH_TLB_MULTIHIT:
+ mce->error_type = OpalMCE_ERROR_TYPE_TLB;
+ mce->u.tlb_error.tlb_error_type = OpalMCE_TLB_ERROR_MULTIHIT;
+ break;
+
+ case SRR1_MC_IFETCH_UE:
+ case SRR1_MC_IFETCH_UE_IFU_INTERNAL:
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type = OpalMCE_UE_ERROR_IFETCH;
+ break;
+
+ case SRR1_MC_IFETCH_UE_TLB_RELOAD:
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type =
+ OpalMCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+ break;
+ }
+}
+
+static void mce_set_derror(struct opal_machine_check_event *mce, uint64_t dsisr)
+{
+ if (dsisr & DSISR_MC_UE) {
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type = OpalMCE_UE_ERROR_LOAD_STORE;
+
+ } else if (dsisr & DSISR_MC_UE_TABLEWALK) {
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type =
+ OpalMCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+
+ } else if (dsisr & DSISR_MC_ERAT_MULTIHIT) {
+ mce->error_type = OpalMCE_ERROR_TYPE_ERAT;
+ mce->u.erat_error.erat_error_type =
+ OpalMCE_ERAT_ERROR_MULTIHIT;
+
+ } else if (dsisr & DSISR_MC_TLB_MULTIHIT) {
+ mce->error_type = OpalMCE_ERROR_TYPE_TLB;
+ mce->u.tlb_error.tlb_error_type =
+ OpalMCE_TLB_ERROR_MULTIHIT;
+
+ } else if (dsisr & DSISR_MC_SLB_MULTIHIT) {
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type =
+ OpalMCE_SLB_ERROR_MULTIHIT;
+
+ } else if (dsisr & DSISR_MC_SLB_MULTIHIT_PARITY) {
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type =
+ OpalMCE_SLB_ERROR_INDETERMINATE;
+ }
+}
+
+/* Called from head.S, thus no prototype */
+void handle_machine_check(struct stack_frame *stack);
+
+void handle_machine_check(struct stack_frame *stack)
+{
+ struct opal_machine_check_event *mce;
+ uint64_t srr1, addr;
+
+ mce = &this_cpu()->mc_event;
+
+ /* This will fire if we take another MC between the time that
+ * we re-set MSR_ME and the time the OS clears the in_use flag.
+ *
+ * The alternative would be to keep MSR_ME cleared and let the
+ * OS re-set it (after clearing the flag), but then a second MC
+ * would checkstop the machine, so an opal assert() is the better
+ * option.
+ */
+ assert(!mce->in_use);
+
+ mce->in_use = 1;
+
+ /* Populate generic machine check info */
+ mce->version = OpalMCE_V1;
+ mce->srr0 = stack->srr0;
+ mce->srr1 = stack->srr1;
+ mce->gpr3 = stack->gpr[3];
+
+ mce->initiator = OpalMCE_INITIATOR_CPU;
+ mce->disposition = OpalMCE_DISPOSITION_NOT_RECOVERED;
+ mce->severity = OpalMCE_SEV_ERROR_SYNC;
+
+ srr1 = stack->srr1;
+
+ /* Populate the mce error_type and type-specific error_type from
+ * either DSISR or SRR1, depending on whether this was a load/store
+ * or an ifetch exception */
+ if (SRR1_MC_LOADSTORE(srr1)) {
+ mce_set_derror(mce, mfspr(SPR_DSISR));
+ addr = mfspr(SPR_DAR);
+ } else {
+ mce_set_ierror(mce, srr1);
+ addr = stack->srr0;
+ }
+
+ if (mce->error_type == OpalMCE_ERROR_TYPE_TLB) {
+ mce->u.tlb_error.effective_address_provided = true;
+ mce->u.tlb_error.effective_address = addr;
+
+ } else if (mce->error_type == OpalMCE_ERROR_TYPE_SLB) {
+ mce->u.slb_error.effective_address_provided = true;
+ mce->u.slb_error.effective_address = addr;
+
+ } else if (mce->error_type == OpalMCE_ERROR_TYPE_ERAT) {
+ mce->u.erat_error.effective_address_provided = true;
+ mce->u.erat_error.effective_address = addr;
+
+ } else if (mce->error_type == OpalMCE_ERROR_TYPE_UE) {
+ mce->u.ue_error.effective_address_provided = true;
+ mce->u.ue_error.effective_address = addr;
+ }
+
+ /* Set up the stack to rfi into the OS' handler, with ME re-enabled. */
+ stack->gpr[3] = (uint64_t)mce;
+ stack->srr0 = client_mc_address;
+ stack->srr1 = mfmsr() | MSR_ME;
+}
+
+#define REG "%016llx"
+#define REGS_PER_LINE 4
+#define LAST_VOLATILE 13
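+/* These dump-format constants match the ones Linux uses for its
+ * register dumps: only GPRs r0..r13 are printed below, since r14-r31
+ * are non-volatile and rarely interesting in a crash dump. */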
+
+static void dump_regs(struct stack_frame *stack, uint64_t hmer)
+{
+ int i;
+ uint64_t tfmr;
+
+ if (hmer & SPR_HMER_MALFUNCTION_ALERT)
+ printf("HMI: malfunction Alert\n");
+ if (hmer & SPR_HMER_HYP_RESOURCE_ERR)
+ printf("HMI: Hypervisor resource error.\n");
+ if (hmer & SPR_HMER_TFAC_ERROR) {
+ tfmr = mfspr(SPR_TFMR);
+ printf("HMI: TFAC error: SPRN_TFMR = 0x%016llx\n", tfmr);
+ }
+ if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
+ tfmr = mfspr(SPR_TFMR);
+ printf("HMI: TFMR parity error: SPRN_TFMR = 0x%016llx\n", tfmr);
+ }
+ printf("TRAP: %04llx\n", stack->type);
+ printf("SRR0: "REG" SRR1: "REG"\n", stack->srr0, stack->srr1);
+ printf("CFAR: "REG" LR: "REG" CTR: "REG"\n",
+ stack->cfar, stack->lr, stack->ctr);
+ printf(" CR: %08x XER: %08x\n", stack->cr, stack->xer);
+
+ for (i = 0; i < 32; i++) {
+ if ((i % REGS_PER_LINE) == 0)
+ printf("\nGPR%02d: ", i);
+ printf(REG " ", stack->gpr[i]);
+ if (i == LAST_VOLATILE)
+ break;
+ }
+ printf("\n");
+}
+
+/*
+ * HMER register layout:
+ * +===+==========+============================+========+===================+
+ * |Bit|Name      |Description                 |PowerKVM|Action             |
+ * |   |          |                            |HMI     |                   |
+ * |   |          |                            |enabled |                   |
+ * |   |          |                            |for this|                   |
+ * |   |          |                            |bit ?   |                   |
+ * +===+==========+============================+========+===================+
+ * |0  |malfunctio|A processor core in the     |Yes     |Raise attn from    |
+ * |   |n_alert   |system has checkstopped     |        |sapphire resulting |
+ * |   |          |(failed recovery) and has   |        |in xstop           |
+ * |   |          |requested a CP Sparing to   |        |                   |
+ * |   |          |occur. This is broadcast to |        |                   |
+ * |   |          |every processor in the      |        |                   |
+ * |   |          |system                      |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |1  |Reserved  |reserved                    |n/a     |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |2  |proc_recv_|Processor recovery occurred |Yes     |Log message and    |
+ * |   |done      |error-bit in fir not masked |        |continue working.  |
+ * |   |          |(see bit 11)                |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |3  |proc_recv_|Processor went through      |Yes     |Log message and    |
+ * |   |error_mask|recovery for an error which |        |continue working.  |
+ * |   |ed        |is actually masked for      |        |                   |
+ * |   |          |reporting                   |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |4  |tfac_error|Timer facility experienced  |Yes     |Raise attn from    |
+ * |   |          |an error. TB, DEC, HDEC,    |        |sapphire resulting |
+ * |   |          |PURR or SPURR may be        |        |in xstop           |
+ * |   |          |corrupted (details in TFMR) |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |5  |tfmr_parit|TFMR SPR itself is          |Yes     |Raise attn from    |
+ * |   |y_error   |corrupted. Entire timing    |        |sapphire resulting |
+ * |   |          |facility may be compromised.|        |in xstop           |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |6  |ha_overflo|UPS (Uninterrupted Power    |No      |N/A                |
+ * |   |w_warning |System) Overflow indication |        |                   |
+ * |   |          |that the UPS DirtyAddrTable |        |                   |
+ * |   |          |has reached a limit where it|        |                   |
+ * |   |          |requires PHYP unload support|        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |7  |reserved  |reserved                    |n/a     |n/a                |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |8  |xscom_fail|An XSCOM operation caused by|No      |We handle it by    |
+ * |   |          |a cache inhibited load/store|        |manually reading   |
+ * |   |          |from this thread failed. A  |        |HMER register.     |
+ * |   |          |trap register is available. |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |9  |xscom_done|An XSCOM operation caused by|No      |We handle it by    |
+ * |   |          |a cache inhibited load/store|        |manually reading   |
+ * |   |          |from this thread completed. |        |HMER register.     |
+ * |   |          |If hypervisor intends to use|        |                   |
+ * |   |          |this bit, it is responsible |        |                   |
+ * |   |          |for clearing it before      |        |                   |
+ * |   |          |performing the xscom        |        |                   |
+ * |   |          |operation.                  |        |                   |
+ * |   |          |NOTE: this bit should always|        |                   |
+ * |   |          |be masked in HMEER          |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |10 |reserved  |reserved                    |n/a     |n/a                |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |11 |proc_recv_|Processor recovery occurred |Yes     |Log message and    |
+ * |   |again     |again before bit2 or bit3   |        |continue working.  |
+ * |   |          |was cleared                 |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |12-|reserved  |was: temperature sensor     |n/a     |n/a                |
+ * |15 |          |passed the critical point on|        |                   |
+ * |   |          |the way up                  |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |16 |scom_fir_h|SCOM has set a reserved FIR |No      |n/a                |
+ * |   |mi        |bit to cause recovery       |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |17 |trig_fir_h|Debug trigger has set a     |No      |n/a                |
+ * |   |mi        |reserved FIR bit to cause   |        |                   |
+ * |   |          |recovery                    |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |18 |reserved  |reserved                    |n/a     |n/a                |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |19 |reserved  |reserved                    |n/a     |n/a                |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |20 |hyp_resour|A hypervisor resource error |Yes     |Raise attn from    |
+ * |   |ce_err    |occurred: data parity error |        |sapphire resulting |
+ * |   |          |on SPRC0:3, SPR_Modereg or  |        |in xstop           |
+ * |   |          |HMEER.                      |        |                   |
+ * |   |          |Note: this bit will cause a |        |                   |
+ * |   |          |checkstop when (HV=1, PR=0  |        |                   |
+ * |   |          |and EE=0)                   |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |21-|xscom_stat|If bit 8 is active, the     |No      |We handle it by    |
+ * |23 |us        |reason will be detailed in  |        |manually reading   |
+ * |   |          |these bits. See chapter 11.1|        |HMER register.     |
+ * |   |          |These bits are information  |        |                   |
+ * |   |          |only and always masked      |        |                   |
+ * |   |          |(mask = '0').               |        |                   |
+ * |   |          |If hypervisor intends to use|        |                   |
+ * |   |          |these bits, it is           |        |                   |
+ * |   |          |responsible for clearing    |        |                   |
+ * |   |          |them before performing the  |        |                   |
+ * |   |          |xscom operation.            |        |                   |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |24-|Not       |Not implemented             |n/a     |n/a                |
+ * |63 |implemente|                            |        |                   |
+ * |   |d         |                            |        |                   |
+ * +---+----------+----------------------------+--------+-------------------+
+ *
+ * The HMER bits above can be enabled/disabled by modifying the
+ * SPR_HMEER_HMI_ENABLE_MASK #define in include/processor.h.
+ * If you modify support for any of the bits listed above, please make
+ * sure you update the table to reflect that.
+ *
+ * NOTE: Per Dave Larson, never enable 8,9,21-23
+ */
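+
+/*
+ * Illustrative sketch only: the real SPR_HMEER_HMI_ENABLE_MASK is
+ * defined in include/processor.h. Going by the "Yes" rows above, it
+ * would OR together the bits this file acts on, i.e. something like:
+ *
+ * SPR_HMER_MALFUNCTION_ALERT | SPR_HMER_PROC_RECV_DONE |
+ * SPR_HMER_PROC_RECV_ERROR_MASKED | SPR_HMER_PROC_RECV_AGAIN |
+ * SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR |
+ * SPR_HMER_HYP_RESOURCE_ERR
+ */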
+
+/* make compiler happy with a prototype */
+void handle_hmi(struct stack_frame *stack);
+
+void handle_hmi(struct stack_frame *stack)
+{
+ uint64_t hmer, orig_hmer;
+ bool fatal = false;
+
+ orig_hmer = hmer = mfspr(SPR_HMER);
+ printf("HMI: Received HMI interrupt: HMER = 0x%016llx\n", hmer);
+ if (hmer & (SPR_HMER_PROC_RECV_DONE
+ | SPR_HMER_PROC_RECV_ERROR_MASKED)) {
+ hmer &= ~(SPR_HMER_PROC_RECV_DONE
+ | SPR_HMER_PROC_RECV_ERROR_MASKED);
+ printf("HMI: Processor recovery Done.\n");
+ }
+ if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
+ hmer &= ~SPR_HMER_PROC_RECV_AGAIN;
+ printf("HMI: Processor recovery occurred again before"
+ "bit2 was cleared\n");
+ }
+ /* Assert if we see a malfunction alert; we cannot continue. */
+ if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
+ hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
+ fatal = true;
+ }
+
+ /* Assert if we see a hypervisor resource error; we cannot continue. */
+ if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
+ hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
+ fatal = true;
+ }
+
+ /*
+ * Assert for now on all TOD errors. In the future we need to
+ * decode TFMR and take corrective action where required.
+ */
+ if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
+ hmer &= ~(SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR);
+ fatal = true;
+ }
+
+ /*
+ * HMER bits are sticky: once set to 1 they remain set until
+ * software writes them back to 0. Clear the bits we have
+ * handled, otherwise we would keep taking this HMI again and
+ * again.
+ */
+ mtspr(SPR_HMER, hmer);
+ if (!fatal)
+ return;
+
+ /*
+ * Raise attn to crash.
+ *
+ * We get the HMI on all threads at the same time, so take a lock
+ * to keep the printf output from getting jumbled up.
+ */
+ lock(&hmi_lock);
+ dump_regs(stack, orig_hmer);
+ /* Should we unlock? We are going down anyway. */
+ unlock(&hmi_lock);
+ assert(false);
+}
+
+/* Called from head.S, thus no prototype */
+void exception_entry(struct stack_frame *stack);
+
+void exception_entry(struct stack_frame *stack)
+{
+ switch(stack->type) {
+ case STACK_ENTRY_MCHECK:
+ handle_machine_check(stack);
+ break;
+ case STACK_ENTRY_HMI:
+ handle_hmi(stack);
+ break;
+ case STACK_ENTRY_SOFTPATCH:
+ /* XXX TODO : Implement softpatch ? */
+ break;
+ }
+}
+
+static int64_t patch_exception(uint64_t vector, uint64_t glue, bool hv)
+{
+ uint64_t iaddr;
+
+ /* Copy over primary exception handler */
+ memcpy((void *)vector, &exc_primary_start,
+ &exc_primary_end - &exc_primary_start);
+
+ /* Patch branch instruction in primary handler */
+ iaddr = vector + exc_primary_patch_branch;
+ *(uint32_t *)iaddr |= (glue - iaddr) & 0x03fffffc;
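+ /*
+ * Note: this assumes the pre-assembled instruction at
+ * exc_primary_patch_branch is an unconditional branch with a
+ * zero displacement, so OR'ing in the word-aligned offset
+ * (mask 0x03fffffc) turns it into a branch to the glue code.
+ */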
+
+ /* Copy over secondary exception handler */
+ memcpy((void *)glue, &exc_secondary_start,
+ &exc_secondary_end - &exc_secondary_start);
+
+ /* Patch-in the vector number */
+ *(uint32_t *)(glue + exc_secondary_patch_type) |= vector;
+
+ /*
+ * If machine check, patch GET_STACK to get to the MC stack
+ * instead of the normal stack.
+ *
+ * To simplify the arithmetic involved, I rely on the fact that
+ * the base of all CPU stacks is 64k aligned and that our stack
+ * size is < 32k, which means the "addi" instruction used in
+ * GET_STACK() always has a small (<32k) positive offset, which
+ * we can then easily fix up with a simple addition.
+ */
+ BUILD_ASSERT(STACK_SIZE < 0x8000);
+ BUILD_ASSERT(!(CPU_STACKS_BASE & 0xffff));
+
+ if (vector == 0x200) {
+ /*
+ * The addi we try to patch is the 3rd instruction
+ * of GET_STACK(). If you change the macro, you must
+ * update this code
+ */
+ iaddr = glue + exc_secondary_patch_stack + 8;
+ *(uint32_t *)iaddr += MC_STACK_SIZE;
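+ /*
+ * Because the addi offset is a small positive value (asserted
+ * above), this "+=" only modifies the instruction's 16-bit
+ * immediate field, turning "addi rN,rN,off" into
+ * "addi rN,rN,off+MC_STACK_SIZE" as long as the sum still fits,
+ * without spilling into the opcode or register fields.
+ */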
+ }
+
+ /* Standard exception? All done */
+ if (!hv)
+ goto flush;
+
+ /* HV exception: change the SRRs to HSRRs and rfid to hrfid.
+ *
+ * The magic is that mfspr/mtspr of SRR can be turned into the
+ * equivalent HSRR version by OR'ing 0x4800. For rfid to hrfid
+ * we OR 0x200.
+ */
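+ /*
+ * Worked example: SRR0 is SPR 26 and HSRR0 is SPR 314
+ * (= 26 + (9 << 5)). The mfspr/mtspr encoding swaps the two
+ * 5-bit halves of the SPR number (low half shifted left 16,
+ * high half shifted left 11), so the upper-half difference of 9
+ * becomes 9 << 11 = 0x4800. Likewise rfid (0x4c000024) differs
+ * from hrfid (0x4c000224) only in the extended opcode: 0x200.
+ */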
+ *(uint32_t *)(glue + exc_secondary_patch_mfsrr0) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_mfsrr1) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_mtsrr0) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_mtsrr1) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_rfid) |= 0x200;
+
+ flush:
+ /* On P7 and later all we need is: */
+ sync_icache();
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_register_exc_handler(uint64_t opal_exception,
+ uint64_t handler_address,
+ uint64_t glue_cache_line)
+{
+ switch(opal_exception) {
+ case OPAL_MACHINE_CHECK_HANDLER:
+ client_mc_address = handler_address;
+ return patch_exception(0x200, glue_cache_line, false);
+ case OPAL_HYPERVISOR_MAINTENANCE_HANDLER:
+ return patch_exception(0xe60, glue_cache_line, true);
+#if 0 /* We let Linux handle softpatch */
+ case OPAL_SOFTPATCH_HANDLER:
+ return patch_exception(0x1500, glue_cache_line, true);
+#endif
+ default:
+ break;
+ }
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_REGISTER_OPAL_EXCEPTION_HANDLER, opal_register_exc_handler, 3);
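+
+/*
+ * Hypothetical client-side sketch (the names below are assumptions,
+ * not defined in this file): a host OS would register its handler
+ * through the OPAL call interface roughly as
+ *
+ * rc = opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER,
+ * (uint64_t)mc_handler_entry,
+ * (uint64_t)glue_cache_line);
+ *
+ * where glue_cache_line is a cache line of memory that
+ * patch_exception() above fills with the patched secondary handler.
+ */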
+