path: root/core
Diffstat (limited to 'core')
-rw-r--r--  core/Makefile.inc  12
-rw-r--r--  core/affinity.c  132
-rw-r--r--  core/backtrace.c  41
-rw-r--r--  core/chip.c  85
-rw-r--r--  core/console.c  334
-rw-r--r--  core/cpu.c  672
-rw-r--r--  core/device.c  791
-rw-r--r--  core/exceptions.c  529
-rw-r--r--  core/fast-reboot.c  346
-rw-r--r--  core/fdt.c  208
-rw-r--r--  core/flash-nvram.c  76
-rw-r--r--  core/hostservices.c  826
-rw-r--r--  core/init.c  687
-rw-r--r--  core/interrupts.c  332
-rw-r--r--  core/lock.c  125
-rw-r--r--  core/malloc.c  84
-rw-r--r--  core/mem_region.c  956
-rw-r--r--  core/nvram.c  248
-rw-r--r--  core/opal-msg.c  167
-rw-r--r--  core/opal.c  308
-rw-r--r--  core/pci-opal.c  666
-rw-r--r--  core/pci.c  1388
-rw-r--r--  core/platform.c  78
-rw-r--r--  core/relocate.c  65
-rw-r--r--  core/test/Makefile.check  29
-rw-r--r--  core/test/run-device.c  118
-rw-r--r--  core/test/run-malloc-speed.c  94
-rw-r--r--  core/test/run-malloc.c  144
-rw-r--r--  core/test/run-mem_region.c  250
-rw-r--r--  core/test/run-mem_region_init.c  179
-rw-r--r--  core/test/run-mem_region_release_unused.c  177
-rw-r--r--  core/test/run-mem_region_release_unused_noalloc.c  159
-rw-r--r--  core/test/run-msg.c  256
-rw-r--r--  core/test/run-trace.c  386
-rw-r--r--  core/test/stubs.c  43
-rw-r--r--  core/timebase.c  67
-rw-r--r--  core/trace.c  244
-rw-r--r--  core/utils.c  59
-rw-r--r--  core/vpd.c  211
39 files changed, 11572 insertions, 0 deletions
diff --git a/core/Makefile.inc b/core/Makefile.inc
new file mode 100644
index 0000000..843ce05
--- /dev/null
+++ b/core/Makefile.inc
@@ -0,0 +1,12 @@
+# -*-Makefile-*-
+
+SUBDIRS += core
+CORE_OBJS = relocate.o console.o backtrace.o init.o chip.o mem_region.o
+CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o
+CORE_OBJS += timebase.o opal-msg.o pci.o pci-opal.o fast-reboot.o
+CORE_OBJS += device.o exceptions.o trace.o affinity.o vpd.o
+CORE_OBJS += hostservices.o platform.o nvram.o flash-nvram.o
+CORE=core/built-in.o
+
+$(CORE): $(CORE_OBJS:%=core/%)
+
diff --git a/core/affinity.c b/core/affinity.c
new file mode 100644
index 0000000..d5eea82
--- /dev/null
+++ b/core/affinity.c
@@ -0,0 +1,132 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ *
+ * We currently construct our associativity properties as follows:
+ *
+ * - For "chip" devices (bridges, memory, ...), 4 entries:
+ *
+ * - CCM node ID
+ * - HW card ID
+ * - HW module ID
+ * - Chip ID
+ *
+ * The information is constructed based on the chip ID which (unlike
+ * pHyp) is our HW chip ID (aka "XSCOM" chip ID). We use it to retrieve
+ * the other properties from the corresponding chip/xscom node in the
+ * device-tree. If those properties are absent, 0 is used.
+ *
+ * - For "core" devices, we add a 5th entry:
+ *
+ * - Core ID
+ *
+ * Here too, we do not use the "cooked" HW processor ID from HDAT but
+ * instead use the real HW core ID which is basically the interrupt
+ * server number of thread 0 on that core.
+ *
+ *
+ * The ibm,associativity-reference-points property is currently set to
+ * 4,4 indicating that the chip ID is our only reference point. This
+ * should be extended to encompass the node IDs eventually.
+ */
+#include <skiboot.h>
+#include <opal.h>
+#include <device.h>
+#include <console.h>
+#include <trace.h>
+#include <chip.h>
+#include <cpu.h>
+#include <affinity.h>
+
+static uint32_t get_chip_node_id(struct proc_chip *chip)
+{
+ /* If the xscom node has an ibm,ccm-node-id property, use it */
+ if (dt_has_node_property(chip->devnode, "ibm,ccm-node-id", NULL))
+ return dt_prop_get_u32(chip->devnode, "ibm,ccm-node-id");
+
+ /*
+ * Else use the 3 top bits of the chip ID which should be
+ * the node on both P7 and P8
+ */
+ return chip->id >> 3;
+}
+
+void add_associativity_ref_point(void)
+{
+ int ref2 = 0x4;
+
+ /*
+ * Note about our use of reference points:
+ *
+ * Linux currently supports two levels of NUMA. We use the first
+ * reference point for the node ID and the second reference point
+ * for a second level of affinity. We always use the chip ID (4)
+ * for the first reference point.
+ *
+ * Choosing the second level of affinity is model specific
+ * unfortunately. Current POWER8E models should use the DCM
+ * as a second level of NUMA.
+ *
+ * If there is a way to obtain this information from the FSP
+ * that would be ideal, but for now hardwire our POWER8E setting.
+ */
+ if (PVR_TYPE(mfspr(SPR_PVR)) == PVR_TYPE_P8E)
+ ref2 = 0x3;
+
+ dt_add_property_cells(opal_node, "ibm,associativity-reference-points",
+ 0x4, ref2);
+}
+
+void add_chip_dev_associativity(struct dt_node *dev)
+{
+ uint32_t chip_id = dt_get_chip_id(dev);
+ struct proc_chip *chip = get_chip(chip_id);
+ uint32_t hw_cid, hw_mid;
+
+ if (!chip)
+ return;
+
+ hw_cid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-card-id", 0);
+ hw_mid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-module-id", 0);
+
+ dt_add_property_cells(dev, "ibm,associativity", 4,
+ get_chip_node_id(chip),
+ hw_cid, hw_mid, chip_id);
+}
+
+void add_core_associativity(struct cpu_thread *cpu)
+{
+ struct proc_chip *chip = get_chip(cpu->chip_id);
+ uint32_t hw_cid, hw_mid, core_id;
+
+ if (!chip)
+ return;
+
+ if (proc_gen == proc_gen_p7)
+ core_id = (cpu->pir >> 2) & 0x7;
+ else if (proc_gen == proc_gen_p8)
+ core_id = (cpu->pir >> 3) & 0xf;
+ else
+ return;
+
+ hw_cid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-card-id", 0);
+ hw_mid = dt_prop_get_u32_def(chip->devnode, "ibm,hw-module-id", 0);
+
+ dt_add_property_cells(cpu->node, "ibm,associativity", 5,
+ get_chip_node_id(chip),
+ hw_cid, hw_mid, chip->id, core_id);
+}
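
A stand-alone sketch, for illustration only, of the associativity cell layout described in the header comment above, using invented IDs (ccm node 1, hw card 2, hw module 0, chip 0x8, core 0xc). In the firmware these values come from the xscom node properties; the sketch just prints the 4-entry chip form and the 5-entry core form side by side.

/* Illustration only: associativity cell layout with made-up IDs. */
#include <stdio.h>

int main(void)
{
	unsigned int chip_assoc[5] = { 4, 1, 2, 0, 0x8 };	/* chip device: 4 entries */
	unsigned int core_assoc[6] = { 5, 1, 2, 0, 0x8, 0xc };	/* core: extra core ID */

	printf("ibm,associativity (chip) = <%u %u %u %u 0x%x>\n",
	       chip_assoc[0], chip_assoc[1], chip_assoc[2],
	       chip_assoc[3], chip_assoc[4]);
	printf("ibm,associativity (core) = <%u %u %u %u 0x%x 0x%x>\n",
	       core_assoc[0], core_assoc[1], core_assoc[2],
	       core_assoc[3], core_assoc[4], core_assoc[5]);
	return 0;
}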
diff --git a/core/backtrace.c b/core/backtrace.c
new file mode 100644
index 0000000..3439db0
--- /dev/null
+++ b/core/backtrace.c
@@ -0,0 +1,41 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <skiboot.h>
+#include <processor.h>
+#include <cpu.h>
+
+void backtrace(void)
+{
+ unsigned int pir = mfspr(SPR_PIR);
+ unsigned long *sp;
+ unsigned long *bottom, *top;
+
+ /* Check if there's a __builtin_something instead */
+ asm("mr %0,1" : "=r" (sp));
+
+ bottom = cpu_stack_bottom(pir);
+ top = cpu_stack_top(pir);
+
+ /* XXX Handle SMP */
+ fprintf(stderr, "CPU %08x Backtrace:\n", pir);
+ while(sp > bottom && sp < top) {
+ fprintf(stderr, " S: %016lx R: %016lx\n",
+ (unsigned long)sp, sp[2]);
+ sp = (unsigned long *)sp[0];
+ }
+}
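
backtrace() above walks the ppc64 back chain: the word at sp[0] points to the caller's frame and sp[2] holds the saved LR, which is what gets printed. Below is a minimal stand-alone sketch of the same walk over a fabricated frame chain; the frame contents and LR values are invented for illustration.

/* Stand-alone sketch of a back-chain walk over fake frames. */
#include <stdio.h>

int main(void)
{
	/* Three fake frames: [0] = back chain, [2] = saved LR */
	unsigned long f2[3] = { 0, 0, 0x3000 };
	unsigned long f1[3] = { (unsigned long)f2, 0, 0x2000 };
	unsigned long f0[3] = { (unsigned long)f1, 0, 0x1000 };
	unsigned long *sp = f0;

	while (sp) {
		printf(" S: %016lx R: %016lx\n", (unsigned long)sp, sp[2]);
		sp = (unsigned long *)sp[0];	/* follow the back chain */
	}
	return 0;
}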
diff --git a/core/chip.c b/core/chip.c
new file mode 100644
index 0000000..e6eb81c
--- /dev/null
+++ b/core/chip.c
@@ -0,0 +1,85 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <skiboot.h>
+#include <chip.h>
+#include <device.h>
+
+static struct proc_chip *chips[MAX_CHIPS];
+
+uint32_t pir_to_chip_id(uint32_t pir)
+{
+ if (proc_gen == proc_gen_p8)
+ return P8_PIR2GCID(pir);
+ else
+ return P7_PIR2GCID(pir);
+}
+
+uint32_t pir_to_core_id(uint32_t pir)
+{
+ if (proc_gen == proc_gen_p8)
+ return P8_PIR2COREID(pir);
+ else
+ return P7_PIR2COREID(pir);
+}
+
+uint32_t pir_to_thread_id(uint32_t pir)
+{
+ if (proc_gen == proc_gen_p8)
+ return P8_PIR2THREADID(pir);
+ else
+ return P7_PIR2THREADID(pir);
+}
+
+struct proc_chip *next_chip(struct proc_chip *chip)
+{
+ unsigned int i;
+
+ for (i = chip ? (chip->id + 1) : 0; i < MAX_CHIPS; i++)
+ if (chips[i])
+ return chips[i];
+ return NULL;
+}
+
+
+struct proc_chip *get_chip(uint32_t chip_id)
+{
+ return chips[chip_id];
+}
+
+void init_chips(void)
+{
+ struct proc_chip *chip;
+ struct dt_node *xn;
+
+ /* We walk the chips based on xscom nodes in the tree */
+ dt_for_each_compatible(dt_root, xn, "ibm,xscom") {
+ uint32_t id = dt_get_chip_id(xn);
+
+ assert(id < MAX_CHIPS);
+
+ chip = zalloc(sizeof(struct proc_chip));
+ assert(chip);
+ chip->id = id;
+ chip->devnode = xn;
+ chips[id] = chip;
+ chip->dbob_id = dt_prop_get_u32_def(xn, "ibm,dbob-id",
+ 0xffffffff);
+ chip->pcid = dt_prop_get_u32_def(xn, "ibm,proc-chip-id",
+ 0xffffffff);
+ };
+}
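
A hypothetical caller, not part of this patch, showing how the iterator above is meant to be used: next_chip(NULL) returns the first populated chips[] entry and next_chip(chip) the following one, so a plain for loop visits every discovered chip exactly once. It assumes skiboot's chip.h declarations as added above.

static void print_all_chips(void)
{
	struct proc_chip *chip;

	/* Walk every chip discovered by init_chips() */
	for (chip = next_chip(NULL); chip; chip = next_chip(chip))
		printf("CHIP: id 0x%x dbob 0x%x pcid 0x%x\n",
		       chip->id, chip->dbob_id, chip->pcid);
}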
diff --git a/core/console.c b/core/console.c
new file mode 100644
index 0000000..b291b1b
--- /dev/null
+++ b/core/console.c
@@ -0,0 +1,334 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Console IO routine for use by libc
+ *
+ * fd is the classic posix 0,1,2 (stdin, stdout, stderr)
+ */
+#include <skiboot.h>
+#include <unistd.h>
+#include <console.h>
+#include <opal.h>
+#include <device.h>
+#include <processor.h>
+#include <cpu.h>
+
+static char *con_buf = (char *)INMEM_CON_START;
+static size_t con_in;
+static size_t con_out;
+static bool con_wrapped;
+static struct con_ops *con_driver;
+
+struct lock con_lock = LOCK_UNLOCKED;
+
+/* This is mapped via TCEs so we keep it alone in a page */
+struct memcons memcons __section(".data.memcons") = {
+ .magic = MEMCONS_MAGIC,
+ .obuf_phys = INMEM_CON_START,
+ .ibuf_phys = INMEM_CON_START + INMEM_CON_OUT_LEN,
+ .obuf_size = INMEM_CON_OUT_LEN,
+ .ibuf_size = INMEM_CON_IN_LEN,
+};
+
+bool dummy_console_enabled(void)
+{
+#ifdef FORCE_DUMMY_CONSOLE
+ return true;
+#else
+ return dt_has_node_property(dt_chosen,
+ "sapphire,enable-dummy-console", NULL);
+#endif
+}
+
+void force_dummy_console(void)
+{
+ dt_add_property(dt_chosen, "sapphire,enable-dummy-console", NULL, 0);
+}
+
+#ifdef MAMBO_CONSOLE
+static void mambo_write(const char *buf, size_t count)
+{
+#define SIM_WRITE_CONSOLE_CODE 0
+ register int c asm("r3") = 0; /* SIM_WRITE_CONSOLE_CODE */
+ register unsigned long a1 asm("r4") = (unsigned long)buf;
+ register unsigned long a2 asm("r5") = count;
+ register unsigned long a3 asm("r6") = 0;
+ asm volatile (".long 0x000eaeb0":"=r" (c):"r"(c), "r"(a1), "r"(a2),
+ "r"(a3));
+}
+#else
+static void mambo_write(const char *buf __unused, size_t count __unused) { }
+#endif /* MAMBO_CONSOLE */
+
+void clear_console(void)
+{
+ memset(con_buf, 0, INMEM_CON_LEN);
+}
+
+/*
+ * Flush the console buffer into the driver, returns true
+ * if there is more to go
+ */
+bool __flush_console(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+ size_t req, len = 0;
+ static bool in_flush, more_flush;
+
+ /* Is there anything to flush ? Bail out early if not */
+ if (con_in == con_out || !con_driver)
+ return false;
+
+ /*
+ * Console flushing is suspended on this CPU, typically because
+ * some critical locks are held that would potentially cause a
+ * flush to deadlock
+ */
+ if (cpu->con_suspend) {
+ cpu->con_need_flush = true;
+ return false;
+ }
+ cpu->con_need_flush = false;
+
+ /*
+ * We must call the underlying driver with the console lock
+ * dropped otherwise we get some deadlocks if anything down
+ * that path tries to printf() something.
+ *
+ * So instead what we do is we keep a static in_flush flag
+ * set/released with the lock held, which is used to prevent
+ * concurrent attempts at flushing the same chunk of buffer
+ * by other processors.
+ */
+ if (in_flush) {
+ more_flush = true;
+ return false;
+ }
+ in_flush = true;
+
+ do {
+ more_flush = false;
+ if (con_out > con_in) {
+ req = INMEM_CON_OUT_LEN - con_out;
+ unlock(&con_lock);
+ len = con_driver->write(con_buf + con_out, req);
+ lock(&con_lock);
+ con_out = (con_out + len) % INMEM_CON_OUT_LEN;
+ if (len < req)
+ goto bail;
+ }
+ if (con_out < con_in) {
+ unlock(&con_lock);
+ len = con_driver->write(con_buf + con_out,
+ con_in - con_out);
+ lock(&con_lock);
+ con_out = (con_out + len) % INMEM_CON_OUT_LEN;
+ }
+ } while(more_flush);
+bail:
+ in_flush = false;
+ return con_out != con_in;
+}
+
+bool flush_console(void)
+{
+ bool ret;
+
+ lock(&con_lock);
+ ret = __flush_console();
+ unlock(&con_lock);
+
+ return ret;
+}
+
+static void inmem_write(char c)
+{
+ uint32_t opos;
+
+ if (!c)
+ return;
+ con_buf[con_in++] = c;
+ if (con_in >= INMEM_CON_OUT_LEN) {
+ con_in = 0;
+ con_wrapped = true;
+ }
+
+ /*
+ * We must always re-generate memcons.out_pos because
+ * under some circumstances, the console script will
+ * use a broken putmemproc that does RMW on the full
+ * 8 bytes containing out_pos and in_prod, thus corrupting
+ * out_pos
+ */
+ opos = con_in;
+ if (con_wrapped)
+ opos |= MEMCONS_OUT_POS_WRAP;
+ lwsync();
+ memcons.out_pos = opos;
+
+ /* If head reaches tail, push tail around & drop chars */
+ if (con_in == con_out)
+ con_out = (con_in + 1) % INMEM_CON_OUT_LEN;
+}
+
+static size_t inmem_read(char *buf, size_t req)
+{
+ size_t read = 0;
+ char *ibuf = (char *)memcons.ibuf_phys;
+
+ while (req && memcons.in_prod != memcons.in_cons) {
+ *(buf++) = ibuf[memcons.in_cons];
+ lwsync();
+ memcons.in_cons = (memcons.in_cons + 1) % INMEM_CON_IN_LEN;
+ req--;
+ read++;
+ }
+ return read;
+}
+
+static void write_char(char c)
+{
+ mambo_write(&c, 1);
+ inmem_write(c);
+}
+
+ssize_t write(int fd __unused, const void *buf, size_t count)
+{
+ /* We use recursive locking here as we can get called
+ * from a fairly deep debug path
+ */
+ bool need_unlock = lock_recursive(&con_lock);
+ const char *cbuf = buf;
+
+ while(count--) {
+ char c = *(cbuf++);
+ if (c == 10)
+ write_char(13);
+ write_char(c);
+ }
+
+ __flush_console();
+
+ if (need_unlock)
+ unlock(&con_lock);
+
+ return count;
+}
+
+ssize_t read(int fd __unused, void *buf, size_t req_count)
+{
+ bool need_unlock = lock_recursive(&con_lock);
+ size_t count = 0;
+
+ if (con_driver && con_driver->read)
+ count = con_driver->read(buf, req_count);
+ if (!count)
+ count = inmem_read(buf, req_count);
+ if (need_unlock)
+ unlock(&con_lock);
+ return count;
+}
+
+void set_console(struct con_ops *driver)
+{
+ con_driver = driver;
+ if (driver)
+ flush_console();
+}
+
+void memcons_add_properties(void)
+{
+ uint64_t addr = (u64)&memcons;
+
+ dt_add_property_cells(opal_node, "ibm,opal-memcons",
+ hi32(addr), lo32(addr));
+}
+
+/*
+ * Default OPAL console provided if nothing else overrides it
+ */
+static int64_t dummy_console_write(int64_t term_number, int64_t *length,
+ const uint8_t *buffer)
+{
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+ write(0, buffer, *length);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_CONSOLE_WRITE, dummy_console_write, 3);
+
+static int64_t dummy_console_write_buffer_space(int64_t term_number,
+ int64_t *length)
+{
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+ if (length)
+ *length = INMEM_CON_OUT_LEN;
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_CONSOLE_WRITE_BUFFER_SPACE, dummy_console_write_buffer_space, 2);
+
+static int64_t dummy_console_read(int64_t term_number, int64_t *length,
+ uint8_t *buffer)
+{
+ if (term_number != 0)
+ return OPAL_PARAMETER;
+ *length = read(0, buffer, *length);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_CONSOLE_READ, dummy_console_read, 3);
+
+static void dummy_console_poll(void *data __unused)
+{
+ bool uart_has_data;
+
+ lock(&con_lock);
+ uart_has_data = uart_console_poll();
+
+ if (uart_has_data || memcons.in_prod != memcons.in_cons)
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT,
+ OPAL_EVENT_CONSOLE_INPUT);
+ else
+ opal_update_pending_evt(OPAL_EVENT_CONSOLE_INPUT, 0);
+ unlock(&con_lock);
+
+}
+
+void dummy_console_add_nodes(void)
+{
+ struct dt_node *con, *consoles;
+
+ consoles = dt_new(opal_node, "consoles");
+ assert(consoles);
+ dt_add_property_cells(consoles, "#address-cells", 1);
+ dt_add_property_cells(consoles, "#size-cells", 0);
+
+ con = dt_new_addr(consoles, "serial", 0);
+ assert(con);
+ dt_add_property_string(con, "compatible", "ibm,opal-console-raw");
+ dt_add_property_cells(con, "#write-buffer-size", INMEM_CON_OUT_LEN);
+ dt_add_property_cells(con, "reg", 0);
+ dt_add_property_string(con, "device_type", "serial");
+
+ dt_add_property_string(dt_chosen, "linux,stdout-path",
+ "/ibm,opal/consoles/serial@0");
+
+ opal_add_poller(dummy_console_poll, NULL);
+}
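
The in-memory console is a simple ring: inmem_write() above advances con_in, sets the wrap flag once it passes the end of the buffer, folds that flag into memcons.out_pos, and pushes con_out forward (dropping the oldest byte) when the head catches the tail. Below is a stand-alone sketch of that wrap logic with an invented 8-byte buffer and an invented wrap bit, for illustration only.

/* Stand-alone sketch of the output ring used by inmem_write(). */
#include <stdio.h>
#include <stdint.h>

#define BUF_LEN		8		/* made up for illustration */
#define OUT_POS_WRAP	0x80000000u	/* made up for illustration */

static char buf[BUF_LEN];
static uint32_t in, out, out_pos;
static int wrapped;

static void ring_putc(char c)
{
	buf[in++] = c;
	if (in >= BUF_LEN) {
		in = 0;
		wrapped = 1;
	}
	/* Re-generate the consumer-visible position every time */
	out_pos = in | (wrapped ? OUT_POS_WRAP : 0);
	if (in == out)			/* head caught the tail: drop oldest */
		out = (in + 1) % BUF_LEN;
}

int main(void)
{
	const char *msg = "hello, world";

	while (*msg)
		ring_putc(*msg++);
	printf("in=%u out=%u out_pos=0x%08x\n", in, out, out_pos);
	return 0;
}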
diff --git a/core/cpu.c b/core/cpu.c
new file mode 100644
index 0000000..0eea946
--- /dev/null
+++ b/core/cpu.c
@@ -0,0 +1,672 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * TODO: Index array by PIR to be able to catch them easily
+ * from assembly such as machine checks etc...
+ */
+#include <skiboot.h>
+#include <cpu.h>
+#include <fsp.h>
+#include <device.h>
+#include <opal.h>
+#include <stack.h>
+#include <trace.h>
+#include <affinity.h>
+#include <chip.h>
+#include <timebase.h>
+#include <ccan/str/str.h>
+#include <ccan/container_of/container_of.h>
+
+/* The cpu_threads array is static and indexed by PIR in
+ * order to speed up lookup from asm entry points
+ */
+struct cpu_stack {
+ union {
+ uint8_t stack[STACK_SIZE];
+ struct cpu_thread cpu;
+ };
+} __align(STACK_SIZE);
+
+static struct cpu_stack *cpu_stacks = (struct cpu_stack *)CPU_STACKS_BASE;
+unsigned int cpu_thread_count;
+unsigned int cpu_max_pir;
+struct cpu_thread *boot_cpu;
+static struct lock reinit_lock = LOCK_UNLOCKED;
+
+unsigned long cpu_secondary_start __force_data = 0;
+
+struct cpu_job {
+ struct list_node link;
+ void (*func)(void *data);
+ void *data;
+ bool complete;
+ bool no_return;
+};
+
+/* attribute const as cpu_stacks is constant. */
+void __attrconst *cpu_stack_bottom(unsigned int pir)
+{
+ return (void *)&cpu_stacks[pir] + sizeof(struct cpu_thread);
+}
+
+void __attrconst *cpu_stack_top(unsigned int pir)
+{
+ /* This is the top of the MC stack which is above the normal
+ * stack, which means a SP between cpu_stack_bottom() and
+ * cpu_stack_top() can either be a normal stack pointer or
+ * a Machine Check stack pointer
+ */
+ return (void *)&cpu_stacks[pir] + STACK_SIZE - STACK_TOP_GAP;
+}
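
cpu_stacks is a flat array of STACK_SIZE-sized blocks starting at CPU_STACKS_BASE, with each hardware thread's cpu_thread overlaid at the bottom of its own block, so a PIR maps to its structure with a single multiply. A stand-alone sketch of that address arithmetic, using made-up base and size values:

/* Illustration only: PIR-indexed stack/cpu_thread layout. */
#include <stdio.h>

#define FAKE_STACKS_BASE 0x01000000ul	/* invented for illustration */
#define FAKE_STACK_SIZE  0x4000ul	/* invented for illustration */

int main(void)
{
	unsigned int pir;

	for (pir = 0; pir < 4; pir++)
		printf("PIR 0x%02x -> cpu_thread at 0x%08lx\n",
		       pir, FAKE_STACKS_BASE + pir * FAKE_STACK_SIZE);
	return 0;
}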
+
+struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
+ void (*func)(void *data), void *data,
+ bool no_return)
+{
+ struct cpu_job *job;
+
+ if (!cpu_is_available(cpu)) {
+ prerror("CPU: Tried to queue job on unavailable CPU 0x%04x\n",
+ cpu->pir);
+ return NULL;
+ }
+
+ job = zalloc(sizeof(struct cpu_job));
+ if (!job)
+ return NULL;
+ job->func = func;
+ job->data = data;
+ job->complete = false;
+ job->no_return = no_return;
+
+ if (cpu != this_cpu()) {
+ lock(&cpu->job_lock);
+ list_add_tail(&cpu->job_queue, &job->link);
+ unlock(&cpu->job_lock);
+ } else {
+ func(data);
+ job->complete = true;
+ }
+
+ /* XXX Add poking of CPU with interrupt */
+
+ return job;
+}
+
+bool cpu_poll_job(struct cpu_job *job)
+{
+ lwsync();
+ return job->complete;
+}
+
+void cpu_wait_job(struct cpu_job *job, bool free_it)
+{
+ if (!job)
+ return;
+
+ while(!job->complete) {
+ /* Handle mbox if master CPU */
+ if (this_cpu() == boot_cpu)
+ fsp_poll();
+ else
+ smt_low();
+ lwsync();
+ }
+ lwsync();
+ smt_medium();
+
+ if (free_it)
+ free(job);
+}
+
+void cpu_free_job(struct cpu_job *job)
+{
+ if (!job)
+ return;
+
+ assert(job->complete);
+ free(job);
+}
+
+void cpu_process_jobs(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+ struct cpu_job *job;
+ void (*func)(void *);
+ void *data;
+
+ sync();
+ if (list_empty(&cpu->job_queue))
+ return;
+
+ lock(&cpu->job_lock);
+ while (true) {
+ bool no_return;
+
+ if (list_empty(&cpu->job_queue))
+ break;
+ smt_medium();
+ job = list_pop(&cpu->job_queue, struct cpu_job, link);
+ if (!job)
+ break;
+ func = job->func;
+ data = job->data;
+ no_return = job->no_return;
+ unlock(&cpu->job_lock);
+ if (no_return)
+ free(job);
+ func(data);
+ lock(&cpu->job_lock);
+ if (!no_return) {
+ lwsync();
+ job->complete = true;
+ }
+ }
+ unlock(&cpu->job_lock);
+}
+
+struct dt_node *get_cpu_node(u32 pir)
+{
+ struct cpu_thread *t = find_cpu_by_pir(pir);
+
+ return t ? t->node : NULL;
+}
+
+/* This only covers primary, active cpus */
+struct cpu_thread *find_cpu_by_chip_id(u32 chip_id)
+{
+ struct cpu_thread *t;
+
+ for_each_available_cpu(t) {
+ if (t->is_secondary)
+ continue;
+ if (t->chip_id == chip_id)
+ return t;
+ }
+ return NULL;
+}
+
+struct cpu_thread *find_cpu_by_node(struct dt_node *cpu)
+{
+ struct cpu_thread *t;
+
+ for_each_available_cpu(t) {
+ if (t->node == cpu)
+ return t;
+ }
+ return NULL;
+}
+
+struct cpu_thread *find_cpu_by_pir(u32 pir)
+{
+ if (pir > cpu_max_pir)
+ return NULL;
+ return &cpu_stacks[pir].cpu;
+}
+
+struct cpu_thread *find_cpu_by_server(u32 server_no)
+{
+ struct cpu_thread *t;
+
+ for_each_cpu(t) {
+ if (t->server_no == server_no)
+ return t;
+ }
+ return NULL;
+}
+
+struct cpu_thread *next_cpu(struct cpu_thread *cpu)
+{
+ struct cpu_stack *s = container_of(cpu, struct cpu_stack, cpu);
+ unsigned int index;
+
+ if (cpu == NULL)
+ index = 0;
+ else
+ index = s - cpu_stacks + 1;
+ for (; index <= cpu_max_pir; index++) {
+ cpu = &cpu_stacks[index].cpu;
+ if (cpu->state != cpu_state_no_cpu)
+ return cpu;
+ }
+ return NULL;
+}
+
+struct cpu_thread *first_cpu(void)
+{
+ return next_cpu(NULL);
+}
+
+struct cpu_thread *next_available_cpu(struct cpu_thread *cpu)
+{
+ do {
+ cpu = next_cpu(cpu);
+ } while(cpu && !cpu_is_available(cpu));
+
+ return cpu;
+}
+
+struct cpu_thread *first_available_cpu(void)
+{
+ return next_available_cpu(NULL);
+}
+
+struct cpu_thread *next_available_core_in_chip(struct cpu_thread *core,
+ u32 chip_id)
+{
+ do {
+ core = next_cpu(core);
+ } while(core && (!cpu_is_available(core) ||
+ core->chip_id != chip_id ||
+ core->is_secondary));
+ return core;
+}
+
+struct cpu_thread *first_available_core_in_chip(u32 chip_id)
+{
+ return next_available_core_in_chip(NULL, chip_id);
+}
+
+uint32_t cpu_get_core_index(struct cpu_thread *cpu)
+{
+ return pir_to_core_id(cpu->pir);
+}
+
+void cpu_remove_node(const struct cpu_thread *t)
+{
+ struct dt_node *i;
+
+ /* Find this cpu node */
+ dt_for_each_node(dt_root, i) {
+ const struct dt_property *p;
+
+ if (!dt_has_node_property(i, "device_type", "cpu"))
+ continue;
+ p = dt_find_property(i, "ibm,pir");
+ if (dt_property_get_cell(p, 0) == t->pir) {
+ dt_free(i);
+ return;
+ }
+ }
+ prerror("CPU: Could not find cpu node %i to remove!\n", t->pir);
+ abort();
+}
+
+void cpu_disable_all_threads(struct cpu_thread *cpu)
+{
+ unsigned int i;
+
+ for (i = 0; i <= cpu_max_pir; i++) {
+ struct cpu_thread *t = &cpu_stacks[i].cpu;
+
+ if (t->primary == cpu->primary)
+ t->state = cpu_state_disabled;
+ }
+
+ /* XXX Do something to actually stop the core */
+}
+
+static void init_cpu_thread(struct cpu_thread *t,
+ enum cpu_thread_state state,
+ unsigned int pir)
+{
+ init_lock(&t->job_lock);
+ list_head_init(&t->job_queue);
+ t->state = state;
+ t->pir = pir;
+ assert(pir == container_of(t, struct cpu_stack, cpu) - cpu_stacks);
+}
+
+void pre_init_boot_cpu(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ memset(cpu, 0, sizeof(struct cpu_thread));
+}
+
+void init_boot_cpu(void)
+{
+ unsigned int i, pir, pvr;
+
+ pir = mfspr(SPR_PIR);
+ pvr = mfspr(SPR_PVR);
+
+ /* Get a CPU thread count and an initial max PIR based on PVR */
+ switch(PVR_TYPE(pvr)) {
+ case PVR_TYPE_P7:
+ case PVR_TYPE_P7P:
+ cpu_thread_count = 4;
+ cpu_max_pir = SPR_PIR_P7_MASK;
+ proc_gen = proc_gen_p7;
+ printf("CPU: P7 generation processor\n");
+ break;
+ case PVR_TYPE_P8E:
+ case PVR_TYPE_P8:
+ cpu_thread_count = 8;
+ cpu_max_pir = SPR_PIR_P8_MASK;
+ proc_gen = proc_gen_p8;
+ printf("CPU: P8 generation processor\n");
+ break;
+ default:
+ prerror("CPU: Unknown PVR, assuming 1 thread\n");
+ cpu_thread_count = 1;
+ cpu_max_pir = mfspr(SPR_PIR);
+ proc_gen = proc_gen_unknown;
+ }
+
+ printf("CPU: Boot CPU PIR is 0x%04x PVR is 0x%08x\n", pir, pvr);
+ printf("CPU: Initial max PIR set to 0x%x\n", cpu_max_pir);
+ printf("CPU: Assuming max %d threads per core\n", cpu_thread_count);
+
+ /* Clear the CPU structs */
+ for (i = 0; i <= cpu_max_pir; i++)
+ memset(&cpu_stacks[i].cpu, 0, sizeof(struct cpu_thread));
+
+ /* Setup boot CPU state */
+ boot_cpu = &cpu_stacks[pir].cpu;
+ init_cpu_thread(boot_cpu, cpu_state_active, pir);
+ init_boot_tracebuf(boot_cpu);
+ assert(this_cpu() == boot_cpu);
+}
+
+void init_all_cpus(void)
+{
+ struct dt_node *cpus, *cpu;
+ unsigned int thread, new_max_pir = 0;
+
+ cpus = dt_find_by_path(dt_root, "/cpus");
+ assert(cpus);
+
+ /* Iterate all CPUs in the device-tree */
+ dt_for_each_child(cpus, cpu) {
+ unsigned int pir, server_no, chip_id;
+ enum cpu_thread_state state;
+ const struct dt_property *p;
+ struct cpu_thread *t, *pt;
+
+ /* Skip cache nodes */
+ if (strcmp(dt_prop_get(cpu, "device_type"), "cpu"))
+ continue;
+
+ server_no = dt_prop_get_u32(cpu, "reg");
+
+ /* If PIR property is absent, assume it's the same as the
+ * server number
+ */
+ pir = dt_prop_get_u32_def(cpu, "ibm,pir", server_no);
+
+ /* We should always have an ibm,chip-id property */
+ chip_id = dt_get_chip_id(cpu);
+
+ /* Only use operational CPUs */
+ if (!strcmp(dt_prop_get(cpu, "status"), "okay"))
+ state = cpu_state_present;
+ else
+ state = cpu_state_unavailable;
+
+ printf("CPU: CPU from DT PIR=0x%04x Server#=0x%x State=%d\n",
+ pir, server_no, state);
+
+ /* Setup thread 0 */
+ t = pt = &cpu_stacks[pir].cpu;
+ if (t != boot_cpu) {
+ init_cpu_thread(t, state, pir);
+ /* Each cpu gets its own later in init_trace_buffers */
+ t->trace = boot_cpu->trace;
+ }
+ t->server_no = server_no;
+ t->primary = t;
+ t->node = cpu;
+ t->chip_id = chip_id;
+ t->icp_regs = 0; /* Will be set later */
+
+ /* Add associativity properties */
+ add_core_associativity(t);
+
+ /* Adjust max PIR */
+ if (new_max_pir < (pir + cpu_thread_count - 1))
+ new_max_pir = pir + cpu_thread_count - 1;
+
+ /* Iterate threads */
+ p = dt_find_property(cpu, "ibm,ppc-interrupt-server#s");
+ if (!p)
+ continue;
+ for (thread = 1; thread < (p->len / 4); thread++) {
+ printf("CPU: secondary thread %d found\n", thread);
+ t = &cpu_stacks[pir + thread].cpu;
+ init_cpu_thread(t, state, pir + thread);
+ t->trace = boot_cpu->trace;
+ t->server_no = ((const u32 *)p->prop)[thread];
+ t->is_secondary = true;
+ t->primary = pt;
+ t->node = cpu;
+ t->chip_id = chip_id;
+ }
+ }
+ cpu_max_pir = new_max_pir;
+ printf("CPU: New max PIR set to 0x%x\n", new_max_pir);
+}
+
+void cpu_bringup(void)
+{
+ struct cpu_thread *t;
+
+ printf("CPU: Setting up secondary CPU state\n");
+
+ op_display(OP_LOG, OP_MOD_CPU, 0x0000);
+
+ /* Tell everybody to chime in ! */
+ printf("CPU: Calling in all processors...\n");
+ cpu_secondary_start = 1;
+ sync();
+
+ op_display(OP_LOG, OP_MOD_CPU, 0x0002);
+
+ for_each_cpu(t) {
+ if (t->state != cpu_state_present &&
+ t->state != cpu_state_active)
+ continue;
+
+ /* Add a callin timeout ? If so, call cpu_remove_node(t). */
+ while (t->state != cpu_state_active) {
+ smt_very_low();
+ sync();
+ }
+ smt_medium();
+ }
+
+ printf("CPU: All processors called in...\n");
+
+ op_display(OP_LOG, OP_MOD_CPU, 0x0003);
+}
+
+void cpu_callin(struct cpu_thread *cpu)
+{
+ cpu->state = cpu_state_active;
+}
+
+static void opal_start_thread_job(void *data)
+{
+ cpu_give_self_os();
+
+ /* We do not return, so let's mark the job as
+ * complete
+ */
+ start_kernel_secondary((uint64_t)data);
+}
+
+static int64_t opal_start_cpu_thread(uint64_t server_no, uint64_t start_address)
+{
+ struct cpu_thread *cpu;
+ struct cpu_job *job;
+
+ cpu = find_cpu_by_server(server_no);
+ if (!cpu) {
+ prerror("OPAL: Start invalid CPU 0x%04llx !\n", server_no);
+ return OPAL_PARAMETER;
+ }
+ printf("OPAL: Start CPU 0x%04llx (PIR 0x%04x) -> 0x%016llx\n",
+ server_no, cpu->pir, start_address);
+
+ lock(&reinit_lock);
+ if (!cpu_is_available(cpu)) {
+ unlock(&reinit_lock);
+ prerror("OPAL: CPU not active in OPAL !\n");
+ return OPAL_WRONG_STATE;
+ }
+ job = __cpu_queue_job(cpu, opal_start_thread_job, (void *)start_address,
+ true);
+ unlock(&reinit_lock);
+ if (!job) {
+ prerror("OPAL: Failed to create CPU start job !\n");
+ return OPAL_INTERNAL_ERROR;
+ }
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_START_CPU, opal_start_cpu_thread, 2);
+
+static int64_t opal_query_cpu_status(uint64_t server_no, uint8_t *thread_status)
+{
+ struct cpu_thread *cpu;
+
+ cpu = find_cpu_by_server(server_no);
+ if (!cpu) {
+ prerror("OPAL: Query invalid CPU 0x%04llx !\n", server_no);
+ return OPAL_PARAMETER;
+ }
+ if (!cpu_is_available(cpu) && cpu->state != cpu_state_os) {
+ prerror("OPAL: CPU not active in OPAL nor OS !\n");
+ return OPAL_PARAMETER;
+ }
+ switch(cpu->state) {
+ case cpu_state_os:
+ *thread_status = OPAL_THREAD_STARTED;
+ break;
+ case cpu_state_active:
+ /* Active in skiboot -> inactive in OS */
+ *thread_status = OPAL_THREAD_INACTIVE;
+ break;
+ default:
+ *thread_status = OPAL_THREAD_UNAVAILABLE;
+ }
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_QUERY_CPU_STATUS, opal_query_cpu_status, 2);
+
+static int64_t opal_return_cpu(void)
+{
+ printf("OPAL: Returning CPU 0x%04x\n", this_cpu()->pir);
+
+ __secondary_cpu_entry();
+
+ return OPAL_HARDWARE; /* Should not happen */
+}
+opal_call(OPAL_RETURN_CPU, opal_return_cpu, 0);
+
+static void cpu_change_hile(void *hilep)
+{
+ bool hile = *(bool *)hilep;
+ unsigned long hid0;
+
+ hid0 = mfspr(SPR_HID0);
+ if (hile)
+ hid0 |= SPR_HID0_HILE;
+ else
+ hid0 &= ~SPR_HID0_HILE;
+ printf("CPU: [%08x] HID0 set to 0x%016lx\n", this_cpu()->pir, hid0);
+ set_hid0(hid0);
+
+ this_cpu()->current_hile = hile;
+}
+
+static int64_t cpu_change_all_hile(bool hile)
+{
+ struct cpu_thread *cpu;
+
+ printf("CPU: Switching HILE on all CPUs to %d\n", hile);
+
+ for_each_available_cpu(cpu) {
+ if (cpu->current_hile == hile)
+ continue;
+ if (cpu == this_cpu()) {
+ cpu_change_hile(&hile);
+ continue;
+ }
+ cpu_wait_job(cpu_queue_job(cpu, cpu_change_hile, &hile), true);
+ }
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_reinit_cpus(uint64_t flags)
+{
+ struct cpu_thread *cpu;
+ int64_t rc = OPAL_SUCCESS;
+ int i;
+
+ lock(&reinit_lock);
+
+ prerror("OPAL: Trying a CPU re-init with flags: 0x%llx\n", flags);
+
+ for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu)) {
+ if (cpu == this_cpu())
+ continue;
+ if (cpu->state == cpu_state_os) {
+ /*
+ * This might be a race with the CPU being returned to
+ * OPAL during kexec while its state still reads
+ * cpu_state_os, so wait a bit and try again
+ */
+ for (i = 0; (i < 3) && (cpu->state == cpu_state_os); i++)
+ time_wait_ms(1);
+ if (cpu->state == cpu_state_os) {
+ prerror("OPAL: CPU 0x%x not in OPAL !\n", cpu->pir);
+ rc = OPAL_WRONG_STATE;
+ goto bail;
+ }
+ }
+ }
+ /*
+ * Now we need to mark ourselves "active" or we'll be skipped
+ * by the various "for_each_active_..." calls done by slw_reinit()
+ */
+ this_cpu()->state = cpu_state_active;
+
+ /*
+ * If the flags affect endianness and we are on P8 DD2 or later, then
+ * use the HID bit. We use the PVR (we could use the EC level in
+ * the chip but the PVR is more readily available).
+ */
+ if (proc_gen == proc_gen_p8 && PVR_VERS_MAJ(mfspr(SPR_PVR)) >= 2 &&
+ (flags & (OPAL_REINIT_CPUS_HILE_BE | OPAL_REINIT_CPUS_HILE_LE))) {
+ bool hile = !!(flags & OPAL_REINIT_CPUS_HILE_LE);
+
+ flags &= ~(OPAL_REINIT_CPUS_HILE_BE | OPAL_REINIT_CPUS_HILE_LE);
+ rc = cpu_change_all_hile(hile);
+ }
+
+ /* Any flags left ? */
+ if (flags != 0)
+ rc = slw_reinit(flags);
+
+ /* And undo the above */
+ this_cpu()->state = cpu_state_os;
+
+bail:
+ unlock(&reinit_lock);
+ return rc;
+}
+opal_call(OPAL_REINIT_CPUS, opal_reinit_cpus, 1);
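
A hypothetical caller of the job queue above, not part of the patch: queue do_work() on every other available CPU and wait for each to finish, the same pattern cpu_change_all_hile() uses. cpu_queue_job() is assumed to be the no_return=false wrapper around __cpu_queue_job() declared in cpu.h, as used above.

static void do_work(void *data)
{
	unsigned long *counter = data;

	(*counter)++;
}

static void run_on_all_cpus(void)
{
	struct cpu_thread *cpu;
	unsigned long count = 0;

	for_each_available_cpu(cpu) {
		if (cpu == this_cpu()) {
			do_work(&count);	/* run locally, no job needed */
			continue;
		}
		/* Queue on the remote CPU, wait for completion, free the job */
		cpu_wait_job(cpu_queue_job(cpu, do_work, &count), true);
	}
	printf("CPU: ran job on %lu threads\n", count);
}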
diff --git a/core/device.c b/core/device.c
new file mode 100644
index 0000000..28cccb7
--- /dev/null
+++ b/core/device.c
@@ -0,0 +1,791 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <device.h>
+#include <stdlib.h>
+#include <skiboot.h>
+#include <libfdt/libfdt.h>
+#include <libfdt/libfdt_internal.h>
+#include <ccan/str/str.h>
+#include <ccan/endian/endian.h>
+
+/* Used to give unique handles. */
+u32 last_phandle = 0;
+
+struct dt_node *dt_root;
+struct dt_node *dt_chosen;
+
+static const char *take_name(const char *name)
+{
+ if (!is_rodata(name) && !(name = strdup(name))) {
+ prerror("Failed to allocate copy of name");
+ abort();
+ }
+ return name;
+}
+
+static void free_name(const char *name)
+{
+ if (!is_rodata(name))
+ free((char *)name);
+}
+
+static struct dt_node *new_node(const char *name)
+{
+ struct dt_node *node = malloc(sizeof *node);
+ if (!node) {
+ prerror("Failed to allocate node\n");
+ abort();
+ }
+
+ node->name = take_name(name);
+ node->parent = NULL;
+ list_head_init(&node->properties);
+ list_head_init(&node->children);
+ /* FIXME: locking? */
+ node->phandle = ++last_phandle;
+ return node;
+}
+
+struct dt_node *dt_new_root(const char *name)
+{
+ return new_node(name);
+}
+
+bool dt_attach_root(struct dt_node *parent, struct dt_node *root)
+{
+ struct dt_node *node;
+
+ /* Look for duplicates */
+
+ assert(!root->parent);
+ dt_for_each_child(parent, node) {
+ if (!strcmp(node->name, root->name)) {
+ prerror("DT: %s failed, duplicate %s\n",
+ __func__, root->name);
+ return false;
+ }
+ }
+ list_add_tail(&parent->children, &root->list);
+ root->parent = parent;
+
+ return true;
+}
+
+struct dt_node *dt_new(struct dt_node *parent, const char *name)
+{
+ struct dt_node *new;
+ assert(parent);
+
+ new = new_node(name);
+ if (!dt_attach_root(parent, new)) {
+ free_name(new->name);
+ free(new);
+ return NULL;
+ }
+ return new;
+}
+
+struct dt_node *dt_new_addr(struct dt_node *parent, const char *name,
+ uint64_t addr)
+{
+ char *lname;
+ struct dt_node *new;
+ size_t len;
+
+ assert(parent);
+ len = strlen(name) + STR_MAX_CHARS(addr) + 2;
+ lname = malloc(len);
+ if (!lname)
+ return NULL;
+ snprintf(lname, len, "%s@%llx", name, (long long)addr);
+ new = new_node(lname);
+ free(lname);
+ if (!dt_attach_root(parent, new)) {
+ free_name(new->name);
+ free(new);
+ return NULL;
+ }
+ return new;
+}
+
+struct dt_node *dt_new_2addr(struct dt_node *parent, const char *name,
+ uint64_t addr0, uint64_t addr1)
+{
+ char *lname;
+ struct dt_node *new;
+ size_t len;
+ assert(parent);
+
+ len = strlen(name) + 2*STR_MAX_CHARS(addr0) + 3;
+ lname = malloc(len);
+ if (!lname)
+ return NULL;
+ snprintf(lname, len, "%s@%llx,%llx",
+ name, (long long)addr0, (long long)addr1);
+ new = new_node(lname);
+ free(lname);
+ if (!dt_attach_root(parent, new)) {
+ free_name(new->name);
+ free(new);
+ return NULL;
+ }
+ return new;
+}
+
+char *dt_get_path(const struct dt_node *node)
+{
+ unsigned int len = 0;
+ const struct dt_node *n;
+ char *path, *p;
+
+ /* Dealing with NULL is for test/debug purposes */
+ if (!node)
+ return strdup("<NULL>");
+
+ for (n = node; n; n = n->parent) {
+ len += strlen(n->name);
+ if (n->parent || n == node)
+ len++;
+ }
+ path = zalloc(len + 1);
+ assert(path);
+ p = path + len;
+ for (n = node; n; n = n->parent) {
+ len = strlen(n->name);
+ p -= len;
+ memcpy(p, n->name, len);
+ if (n->parent || n == node)
+ *(--p) = '/';
+ }
+ assert(p == path);
+
+ return p;
+}
+
+static const char *__dt_path_split(const char *p,
+ const char **namep, unsigned int *namel,
+ const char **addrp, unsigned int *addrl)
+{
+ const char *at, *sl;
+
+ *namel = *addrl = 0;
+
+ /* Skip initial '/' */
+ while (*p == '/')
+ p++;
+
+ /* Check empty path */
+ if (*p == 0)
+ return p;
+
+ at = strchr(p, '@');
+ sl = strchr(p, '/');
+ if (sl == NULL)
+ sl = p + strlen(p);
+ if (sl < at)
+ at = NULL;
+ if (at) {
+ *addrp = at + 1;
+ *addrl = sl - at - 1;
+ }
+ *namep = p;
+ *namel = at ? (at - p) : (sl - p);
+
+ return sl;
+}
+
+struct dt_node *dt_find_by_path(struct dt_node *root, const char *path)
+{
+ struct dt_node *n;
+ const char *pn, *pa, *p = path, *nn, *na;
+ unsigned int pnl, pal, nnl, nal;
+ bool match;
+
+ /* Walk path components */
+ while (*p) {
+ /* Extract next path component */
+ p = __dt_path_split(p, &pn, &pnl, &pa, &pal);
+ if (pnl == 0 && pal == 0)
+ break;
+
+ /* Compare with each child node */
+ match = false;
+ list_for_each(&root->children, n, list) {
+ match = true;
+ __dt_path_split(n->name, &nn, &nnl, &na, &nal);
+ if (pnl && (pnl != nnl || strncmp(pn, nn, pnl)))
+ match = false;
+ if (pal && (pal != nal || strncmp(pa, na, pal)))
+ match = false;
+ if (match) {
+ root = n;
+ break;
+ }
+ }
+
+ /* No child match */
+ if (!match)
+ return NULL;
+ }
+ return root;
+}
+
+struct dt_node *dt_find_by_phandle(struct dt_node *root, u32 phandle)
+{
+ struct dt_node *node;
+
+ dt_for_each_node(root, node)
+ if (node->phandle == phandle)
+ return node;
+ return NULL;
+}
+
+static struct dt_property *new_property(struct dt_node *node,
+ const char *name, size_t size)
+{
+ struct dt_property *p = malloc(sizeof(*p) + size);
+ if (!p) {
+ prerror("Failed to allocate property \"%s\" for %s of %zu bytes\n",
+ name, dt_get_path(node), size);
+ abort();
+ }
+ if (dt_find_property(node, name)) {
+ prerror("Duplicate property \"%s\" in node %s\n",
+ name, dt_get_path(node));
+ abort();
+
+ }
+
+ p->name = take_name(name);
+ p->len = size;
+ list_add_tail(&node->properties, &p->list);
+ return p;
+}
+
+struct dt_property *dt_add_property(struct dt_node *node,
+ const char *name,
+ const void *val, size_t size)
+{
+ struct dt_property *p;
+
+ /*
+ * Filter out phandle properties, we re-generate them
+ * when flattening
+ */
+ if (strcmp(name, "linux,phandle") == 0 ||
+ strcmp(name, "phandle") == 0) {
+ assert(size == 4);
+ node->phandle = *(const u32 *)val;
+ if (node->phandle >= last_phandle)
+ last_phandle = node->phandle;
+ return NULL;
+ }
+
+ p = new_property(node, name, size);
+ if (size)
+ memcpy(p->prop, val, size);
+ return p;
+}
+
+void dt_resize_property(struct dt_property **prop, size_t len)
+{
+ size_t new_len = sizeof(**prop) + len;
+
+ *prop = realloc(*prop, new_len);
+
+ /* Fix up linked lists in case we moved. (note: not an empty list). */
+ (*prop)->list.next->prev = &(*prop)->list;
+ (*prop)->list.prev->next = &(*prop)->list;
+}
+
+struct dt_property *dt_add_property_string(struct dt_node *node,
+ const char *name,
+ const char *value)
+{
+ return dt_add_property(node, name, value, strlen(value)+1);
+}
+
+struct dt_property *dt_add_property_nstr(struct dt_node *node,
+ const char *name,
+ const char *value, unsigned int vlen)
+{
+ struct dt_property *p;
+ char *tmp = zalloc(vlen + 1);
+
+ strncpy(tmp, value, vlen);
+ p = dt_add_property(node, name, tmp, strlen(tmp)+1);
+ free(tmp);
+
+ return p;
+}
+
+struct dt_property *__dt_add_property_cells(struct dt_node *node,
+ const char *name,
+ int count, ...)
+{
+ struct dt_property *p;
+ u32 *val;
+ unsigned int i;
+ va_list args;
+
+ p = new_property(node, name, count * sizeof(u32));
+ val = (u32 *)p->prop;
+ va_start(args, count);
+ for (i = 0; i < count; i++)
+ val[i] = cpu_to_fdt32(va_arg(args, u32));
+ va_end(args);
+ return p;
+}
+
+struct dt_property *__dt_add_property_u64s(struct dt_node *node,
+ const char *name,
+ int count, ...)
+{
+ struct dt_property *p;
+ u64 *val;
+ unsigned int i;
+ va_list args;
+
+ p = new_property(node, name, count * sizeof(u64));
+ val = (u64 *)p->prop;
+ va_start(args, count);
+ for (i = 0; i < count; i++)
+ val[i] = cpu_to_fdt64(va_arg(args, u64));
+ va_end(args);
+ return p;
+}
+
+struct dt_property *__dt_add_property_strings(struct dt_node *node,
+ const char *name,
+ int count, ...)
+{
+ struct dt_property *p;
+ unsigned int i, size;
+ va_list args;
+ const char *sstr;
+ char *s;
+
+ va_start(args, count);
+ for (i = size = 0; i < count; i++) {
+ sstr = va_arg(args, const char *);
+ if (sstr)
+ size += strlen(sstr) + 1;
+ }
+ va_end(args);
+ if (!size)
+ size = 1;
+ p = new_property(node, name, size);
+ s = (char *)p->prop;
+ *s = 0;
+ va_start(args, count);
+ for (i = 0; i < count; i++) {
+ sstr = va_arg(args, const char *);
+ if (sstr) {
+ strcpy(s, sstr);
+ s = s + strlen(sstr) + 1;
+ }
+ }
+ va_end(args);
+ return p;
+}
+
+void dt_del_property(struct dt_node *node, struct dt_property *prop)
+{
+ list_del_from(&node->properties, &prop->list);
+ free_name(prop->name);
+ free(prop);
+}
+
+u32 dt_property_get_cell(const struct dt_property *prop, u32 index)
+{
+ assert(prop->len >= (index+1)*sizeof(u32));
+ /* Always aligned, so this works. */
+ return fdt32_to_cpu(((const u32 *)prop->prop)[index]);
+}
+
+/* First child of this node. */
+struct dt_node *dt_first(const struct dt_node *root)
+{
+ return list_top(&root->children, struct dt_node, list);
+}
+
+/* Return next node, or NULL. */
+struct dt_node *dt_next(const struct dt_node *root,
+ const struct dt_node *prev)
+{
+ /* Children? */
+ if (!list_empty(&prev->children))
+ return dt_first(prev);
+
+ do {
+ /* More siblings? */
+ if (prev->list.next != &prev->parent->children.n)
+ return list_entry(prev->list.next, struct dt_node,list);
+
+ /* No more siblings, move up to parent. */
+ prev = prev->parent;
+ } while (prev != root);
+
+ return NULL;
+}
+
+struct dt_property *__dt_find_property(struct dt_node *node, const char *name)
+{
+ struct dt_property *i;
+
+ list_for_each(&node->properties, i, list)
+ if (strcmp(i->name, name) == 0)
+ return i;
+ return NULL;
+}
+
+const struct dt_property *dt_find_property(const struct dt_node *node,
+ const char *name)
+{
+ const struct dt_property *i;
+
+ list_for_each(&node->properties, i, list)
+ if (strcmp(i->name, name) == 0)
+ return i;
+ return NULL;
+}
+
+const struct dt_property *dt_require_property(const struct dt_node *node,
+ const char *name, int wanted_len)
+{
+ const struct dt_property *p = dt_find_property(node, name);
+
+ if (!p) {
+ const char *path = dt_get_path(node);
+
+ prerror("DT: Missing required property %s/%s\n",
+ path, name);
+ assert(false);
+ }
+ if (wanted_len >= 0 && p->len != wanted_len) {
+ const char *path = dt_get_path(node);
+
+ prerror("DT: Unexpected property length %s/%s\n",
+ path, name);
+ prerror("DT: Expected len: %d got len: %zu\n",
+ wanted_len, p->len);
+ assert(false);
+ }
+
+ return p;
+}
+
+bool dt_has_node_property(const struct dt_node *node,
+ const char *name, const char *val)
+{
+ const struct dt_property *p = dt_find_property(node, name);
+
+ if (!p)
+ return false;
+ if (!val)
+ return true;
+
+ return p->len == strlen(val) + 1 && memcmp(p->prop, val, p->len) == 0;
+}
+
+bool dt_prop_find_string(const struct dt_property *p, const char *s)
+{
+ const char *c, *end;
+
+ if (!p)
+ return false;
+ c = p->prop;
+ end = c + p->len;
+
+ while(c < end) {
+ if (!strcasecmp(s, c))
+ return true;
+ c += strlen(c) + 1;
+ }
+ return false;
+}
+
+bool dt_node_is_compatible(const struct dt_node *node, const char *compat)
+{
+ const struct dt_property *p = dt_find_property(node, "compatible");
+
+ return dt_prop_find_string(p, compat);
+}
+
+struct dt_node *dt_find_compatible_node(struct dt_node *root,
+ struct dt_node *prev,
+ const char *compat)
+{
+ struct dt_node *node;
+
+ node = prev ? dt_next(root, prev) : root;
+ for (; node; node = dt_next(root, node))
+ if (dt_node_is_compatible(node, compat))
+ return node;
+ return NULL;
+}
+
+u64 dt_prop_get_u64(const struct dt_node *node, const char *prop)
+{
+ const struct dt_property *p = dt_require_property(node, prop, 8);
+
+ return ((u64)dt_property_get_cell(p, 0) << 32)
+ | dt_property_get_cell(p, 1);
+}
+
+u64 dt_prop_get_u64_def(const struct dt_node *node, const char *prop, u64 def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ if (!p)
+ return def;
+
+ return ((u64)dt_property_get_cell(p, 0) << 32)
+ | dt_property_get_cell(p, 1);
+}
+
+u32 dt_prop_get_u32(const struct dt_node *node, const char *prop)
+{
+ const struct dt_property *p = dt_require_property(node, prop, 4);
+
+ return dt_property_get_cell(p, 0);
+}
+
+u32 dt_prop_get_u32_def(const struct dt_node *node, const char *prop, u32 def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ if (!p)
+ return def;
+
+ return dt_property_get_cell(p, 0);
+}
+
+const void *dt_prop_get(const struct dt_node *node, const char *prop)
+{
+ const struct dt_property *p = dt_require_property(node, prop, -1);
+
+ return p->prop;
+}
+
+const void *dt_prop_get_def(const struct dt_node *node, const char *prop,
+ void *def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ return p ? p->prop : def;
+}
+
+const void *dt_prop_get_def_size(const struct dt_node *node, const char *prop,
+ void *def, size_t *len)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+ *len = 0;
+ if (p)
+ *len = p->len;
+
+ return p ? p->prop : def;
+}
+
+u32 dt_prop_get_cell(const struct dt_node *node, const char *prop, u32 cell)
+{
+ const struct dt_property *p = dt_require_property(node, prop, -1);
+
+ return dt_property_get_cell(p, cell);
+}
+
+u32 dt_prop_get_cell_def(const struct dt_node *node, const char *prop,
+ u32 cell, u32 def)
+{
+ const struct dt_property *p = dt_find_property(node, prop);
+
+ if (!p)
+ return def;
+
+ return dt_property_get_cell(p, cell);
+}
+
+void dt_free(struct dt_node *node)
+{
+ struct dt_node *child;
+ struct dt_property *p;
+
+ while ((child = list_top(&node->children, struct dt_node, list)))
+ dt_free(child);
+
+ while ((p = list_pop(&node->properties, struct dt_property, list))) {
+ free_name(p->name);
+ free(p);
+ }
+
+ if (node->parent)
+ list_del_from(&node->parent->children, &node->list);
+ free_name(node->name);
+ free(node);
+}
+
+int dt_expand_node(struct dt_node *node, const void *fdt, int fdt_node)
+{
+ const struct fdt_property *prop;
+ int offset, nextoffset, err;
+ struct dt_node *child;
+ const char *name;
+ uint32_t tag;
+
+ if (((err = fdt_check_header(fdt)) != 0)
+ || ((err = _fdt_check_node_offset(fdt, fdt_node)) < 0)) {
+ prerror("FDT: Error %d parsing node 0x%x\n", err, fdt_node);
+ return -1;
+ }
+
+ nextoffset = err;
+ do {
+ offset = nextoffset;
+
+ tag = fdt_next_tag(fdt, offset, &nextoffset);
+ switch (tag) {
+ case FDT_PROP:
+ prop = _fdt_offset_ptr(fdt, offset);
+ name = fdt_string(fdt, fdt32_to_cpu(prop->nameoff));
+ dt_add_property(node, name, prop->data,
+ fdt32_to_cpu(prop->len));
+ break;
+ case FDT_BEGIN_NODE:
+ name = fdt_get_name(fdt, offset, NULL);
+ child = dt_new_root(name);
+ assert(child);
+ nextoffset = dt_expand_node(child, fdt, offset);
+
+ /*
+ * This may fail in case of duplicate, keep it
+ * going for now, we may ultimately want to
+ * assert
+ */
+ (void)dt_attach_root(node, child);
+ break;
+ case FDT_END:
+ return -1;
+ }
+ } while (tag != FDT_END_NODE);
+
+ return nextoffset;
+}
+
+void dt_expand(const void *fdt)
+{
+ printf("FDT: Parsing fdt @%p\n", fdt);
+
+ dt_root = dt_new_root("");
+
+ dt_expand_node(dt_root, fdt, 0);
+}
+
+u64 dt_get_number(const void *pdata, unsigned int cells)
+{
+ const u32 *p = pdata;
+ u64 ret = 0;
+
+ while(cells--)
+ ret = (ret << 32) | be32_to_cpu(*(p++));
+ return ret;
+}
+
+u32 dt_n_address_cells(const struct dt_node *node)
+{
+ if (!node->parent)
+ return 0;
+ return dt_prop_get_u32_def(node->parent, "#address-cells", 2);
+}
+
+u32 dt_n_size_cells(const struct dt_node *node)
+{
+ if (!node->parent)
+ return 0;
+ return dt_prop_get_u32_def(node->parent, "#size-cells", 1);
+}
+
+u64 dt_get_address(const struct dt_node *node, unsigned int index,
+ u64 *out_size)
+{
+ const struct dt_property *p;
+ u32 na = dt_n_address_cells(node);
+ u32 ns = dt_n_size_cells(node);
+ u32 pos, n;
+
+ p = dt_require_property(node, "reg", -1);
+ n = (na + ns) * sizeof(u32);
+ pos = n * index;
+ assert((pos + n) <= p->len);
+ if (out_size)
+ *out_size = dt_get_number(p->prop + pos + na * sizeof(u32), ns);
+ return dt_get_number(p->prop + pos, na);
+}
+
+static u32 __dt_get_chip_id(const struct dt_node *node)
+{
+ const struct dt_property *prop;
+
+ for (; node; node = node->parent) {
+ prop = dt_find_property(node, "ibm,chip-id");
+ if (prop)
+ return dt_property_get_cell(prop, 0);
+ }
+ return 0xffffffff;
+}
+
+u32 dt_get_chip_id(const struct dt_node *node)
+{
+ u32 id = __dt_get_chip_id(node);
+ assert(id != 0xffffffff);
+ return id;
+}
+
+struct dt_node *dt_find_compatible_node_on_chip(struct dt_node *root,
+ struct dt_node *prev,
+ const char *compat,
+ uint32_t chip_id)
+{
+ struct dt_node *node;
+
+ node = prev ? dt_next(root, prev) : root;
+ for (; node; node = dt_next(root, node)) {
+ u32 cid = __dt_get_chip_id(node);
+ if (cid == chip_id &&
+ dt_node_is_compatible(node, compat))
+ return node;
+ }
+ return NULL;
+}
+
+unsigned int dt_count_addresses(const struct dt_node *node)
+{
+ const struct dt_property *p;
+ u32 na = dt_n_address_cells(node);
+ u32 ns = dt_n_size_cells(node);
+ u32 n;
+
+ p = dt_require_property(node, "reg", -1);
+ n = (na + ns) * sizeof(u32);
+ return p->len / n;
+}
+
+u64 dt_translate_address(const struct dt_node *node, unsigned int index,
+ u64 *out_size)
+{
+ /* XXX TODO */
+ return dt_get_address(node, index, out_size);
+}
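
A hypothetical usage sketch of the device-tree helpers above (node name, compatible string and values are invented): create a node with a unit address, attach a couple of properties, then look the node back up by path and decode its first reg entry. It assumes the default 2 address cells and 1 size cell that dt_n_address_cells()/dt_n_size_cells() fall back to when the parent carries no #address-cells/#size-cells.

static void dt_example(void)
{
	struct dt_node *n;

	n = dt_new_addr(dt_root, "example", 0x1000);	/* "/example@1000" */
	assert(n);
	dt_add_property_string(n, "compatible", "ibm,example");
	/* 2 address cells + 1 size cell, matching the defaults */
	dt_add_property_cells(n, "reg", 0, 0x1000, 0x100);

	n = dt_find_by_path(dt_root, "/example@1000");
	assert(n);
	printf("DT: found %s, reg base 0x%llx\n", dt_get_path(n),
	       (unsigned long long)dt_get_address(n, 0, NULL));
}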
diff --git a/core/exceptions.c b/core/exceptions.c
new file mode 100644
index 0000000..995ca92
--- /dev/null
+++ b/core/exceptions.c
@@ -0,0 +1,529 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <stack.h>
+#include <opal.h>
+#include <processor.h>
+#include <cpu.h>
+
+static uint64_t client_mc_address;
+
+extern uint8_t exc_primary_start;
+extern uint8_t exc_primary_end;
+
+extern uint32_t exc_primary_patch_branch;
+
+extern uint8_t exc_secondary_start;
+extern uint8_t exc_secondary_end;
+
+extern uint32_t exc_secondary_patch_stack;
+extern uint32_t exc_secondary_patch_mfsrr0;
+extern uint32_t exc_secondary_patch_mfsrr1;
+extern uint32_t exc_secondary_patch_type;
+extern uint32_t exc_secondary_patch_mtsrr0;
+extern uint32_t exc_secondary_patch_mtsrr1;
+extern uint32_t exc_secondary_patch_rfid;
+
+struct lock hmi_lock = LOCK_UNLOCKED;
+
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
+#define SRR1_MC_IFETCH(srr1) ((srr1) & PPC_BITMASK(43,45))
+#define SRR1_MC_IFETCH_UE (0x1 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_SLB_PARITY (0x2 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_SLB_MULTIHIT (0x3 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_SLB_BOTH (0x4 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_TLB_MULTIHIT (0x5 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_UE_TLB_RELOAD (0x6 << PPC_BITLSHIFT(45))
+#define SRR1_MC_IFETCH_UE_IFU_INTERNAL (0x7 << PPC_BITLSHIFT(45))
+
+#define DSISR_MC_UE (PPC_BIT(48))
+#define DSISR_MC_UE_TABLEWALK (PPC_BIT(49))
+#define DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52))
+#define DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53))
+#define DSISR_MC_TLB_MULTIHIT_MFSLB (PPC_BIT(55))
+#define DSISR_MC_TLB_MULTIHIT (PPC_BIT(53) | PPC_BIT(55))
+#define DSISR_MC_SLB_MULTIHIT (PPC_BIT(56))
+#define DSISR_MC_SLB_MULTIHIT_PARITY (PPC_BIT(57))
+
+static void mce_set_ierror(struct opal_machine_check_event *mce, uint64_t srr1)
+{
+ switch (SRR1_MC_IFETCH(srr1)) {
+ case SRR1_MC_IFETCH_SLB_PARITY:
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type = OpalMCE_SLB_ERROR_PARITY;
+ break;
+
+ case SRR1_MC_IFETCH_SLB_MULTIHIT:
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type = OpalMCE_SLB_ERROR_MULTIHIT;
+ break;
+
+ case SRR1_MC_IFETCH_SLB_BOTH:
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type =
+ OpalMCE_SLB_ERROR_INDETERMINATE;
+ break;
+
+ case SRR1_MC_IFETCH_TLB_MULTIHIT:
+ mce->error_type = OpalMCE_ERROR_TYPE_TLB;
+ mce->u.tlb_error.tlb_error_type = OpalMCE_TLB_ERROR_MULTIHIT;
+ break;
+
+ case SRR1_MC_IFETCH_UE:
+ case SRR1_MC_IFETCH_UE_IFU_INTERNAL:
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type = OpalMCE_UE_ERROR_IFETCH;
+ break;
+
+ case SRR1_MC_IFETCH_UE_TLB_RELOAD:
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type =
+ OpalMCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+ break;
+ }
+
+}
+
+static void mce_set_derror(struct opal_machine_check_event *mce, uint64_t dsisr)
+{
+ if (dsisr & DSISR_MC_UE) {
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type = OpalMCE_UE_ERROR_LOAD_STORE;
+
+ } else if (dsisr & DSISR_MC_UE_TABLEWALK) {
+ mce->error_type = OpalMCE_ERROR_TYPE_UE;
+ mce->u.ue_error.ue_error_type =
+ OpalMCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+
+ } else if (dsisr & DSISR_MC_ERAT_MULTIHIT) {
+ mce->error_type = OpalMCE_ERROR_TYPE_ERAT;
+ mce->u.erat_error.erat_error_type =
+ OpalMCE_ERAT_ERROR_MULTIHIT;
+
+ } else if (dsisr & DSISR_MC_TLB_MULTIHIT) {
+ mce->error_type = OpalMCE_ERROR_TYPE_TLB;
+ mce->u.tlb_error.tlb_error_type =
+ OpalMCE_TLB_ERROR_MULTIHIT;
+
+ } else if (dsisr & DSISR_MC_SLB_MULTIHIT) {
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type =
+ OpalMCE_SLB_ERROR_MULTIHIT;
+
+ } else if (dsisr & DSISR_MC_SLB_MULTIHIT_PARITY) {
+ mce->error_type = OpalMCE_ERROR_TYPE_SLB;
+ mce->u.slb_error.slb_error_type =
+ OpalMCE_SLB_ERROR_INDETERMINATE;
+ }
+}
+
+/* Called from head.S, thus no prototype */
+void handle_machine_check(struct stack_frame *stack);
+
+void handle_machine_check(struct stack_frame *stack)
+{
+ struct opal_machine_check_event *mce;
+ uint64_t srr1, addr;
+
+ mce = &this_cpu()->mc_event;
+
+ /* This will occur if we get another MC between the time that
+ * we re-set MSR_ME, and the OS clears this flag.
+ *
+ * The alternative is keeping MSR_ME cleared and letting the
+ * OS re-set it (after clearing the flag), but then we risk a
+ * checkstop, so an opal assert() is the better option.
+ */
+ assert(!mce->in_use);
+
+ mce->in_use = 1;
+
+ /* Populate generic machine check info */
+ mce->version = OpalMCE_V1;
+ mce->srr0 = stack->srr0;
+ mce->srr1 = stack->srr1;
+ mce->gpr3 = stack->gpr[3];
+
+ mce->initiator = OpalMCE_INITIATOR_CPU;
+ mce->disposition = OpalMCE_DISPOSITION_NOT_RECOVERED;
+ mce->severity = OpalMCE_SEV_ERROR_SYNC;
+
+ srr1 = stack->srr1;
+
+ /* Populate the mce error_type and type-specific error_type from either
+ * SRR1 or DSISR, depending whether this was a load/store or ifetch
+ * exception */
+ if (SRR1_MC_LOADSTORE(srr1)) {
+ mce_set_derror(mce, srr1);
+ addr = stack->srr0;
+ } else {
+ mce_set_ierror(mce, mfspr(SPR_DSISR));
+ addr = mfspr(SPR_DAR);
+ }
+
+ if (mce->error_type == OpalMCE_ERROR_TYPE_TLB) {
+ mce->u.tlb_error.effective_address_provided = true;
+ mce->u.tlb_error.effective_address = addr;
+
+ } else if (mce->error_type == OpalMCE_ERROR_TYPE_SLB) {
+ mce->u.slb_error.effective_address_provided = true;
+ mce->u.slb_error.effective_address = addr;
+
+ } else if (mce->error_type == OpalMCE_ERROR_TYPE_ERAT) {
+ mce->u.erat_error.effective_address_provided = true;
+ mce->u.erat_error.effective_address = addr;
+
+ } else if (mce->error_type == OpalMCE_ERROR_TYPE_UE) {
+ mce->u.ue_error.effective_address_provided = true;
+ mce->u.ue_error.effective_address = addr;
+ }
+
+ /* Setup stack to rfi into the OS' handler, with ME re-enabled. */
+ stack->gpr[3] = (uint64_t)mce;
+ stack->srr0 = client_mc_address;
+ stack->srr1 = mfmsr() | MSR_ME;
+}
+
+#define REG "%016llx"
+#define REGS_PER_LINE 4
+#define LAST_VOLATILE 13
+
+static void dump_regs(struct stack_frame *stack, uint64_t hmer)
+{
+ int i;
+ uint64_t tfmr;
+
+ if (hmer & SPR_HMER_MALFUNCTION_ALERT)
+ printf("HMI: malfunction Alert\n");
+ if (hmer & SPR_HMER_HYP_RESOURCE_ERR)
+ printf("HMI: Hypervisor resource error.\n");
+ if (hmer & SPR_HMER_TFAC_ERROR) {
+ tfmr = mfspr(SPR_TFMR);
+ printf("HMI: TFAC error: SPRN_TFMR = 0x%016llx\n", tfmr);
+ }
+ if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
+ tfmr = mfspr(SPR_TFMR);
+ printf("HMI: TFMR parity error: SPRN_TFMR = 0x%016llx\n", tfmr);
+ }
+ printf("TRAP: %04llx\n", stack->type);
+ printf("SRR0: "REG" SRR1: "REG"\n", stack->srr0, stack->srr1);
+ printf("CFAR: "REG" LR: "REG" CTR: "REG"\n",
+ stack->cfar, stack->lr, stack->ctr);
+ printf(" CR: %08x XER: %08x\n", stack->cr, stack->xer);
+
+ for (i = 0; i < 32; i++) {
+ if ((i % REGS_PER_LINE) == 0)
+ printf("\nGPR%02d: ", i);
+ printf(REG " ", stack->gpr[i]);
+ if (i == LAST_VOLATILE)
+ break;
+ }
+ printf("\n");
+}
+
+/*
+ * HMER register layout:
+ * +===+==========+============================+========+===================+
+ * |Bit|Name |Description |PowerKVM|Action |
+ * | | | |HMI | |
+ * | | | |enabled | |
+ * | | | |for this| |
+ * | | | |bit ? | |
+ * +===+==========+============================+========+===================+
+ * |0 |malfunctio|A processor core in the |Yes |Raise attn from |
+ * | |n_alert  |system has checkstopped | |sapphire resulting |
+ * | | |(failed recovery) and has | |xstop |
+ * | | |requested a CP Sparing | | |
+ * | | |to occur. This is | | |
+ * | | |broadcasted to every | | |
+ * | | |processor in the system | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |1 |Reserved |reserved |n/a | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |2 |proc_recv_|Processor recovery occurred |Yes |Log message and |
+ * | |done |error-bit in fir not masked | |continue working. |
+ * | | |(see bit 11) | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |3 |proc_recv_|Processor went through |Yes |Log message and |
+ * | |error_mask|recovery for an error which | |continue working. |
+ * | |ed |is actually masked for | | |
+ * | | |reporting | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |4 | |Timer facility experienced |Yes |Raise attn from |
+ * | |tfac_error|an error. | |sapphire resulting |
+ * | | |TB, DEC, HDEC, PURR or SPURR| |xstop |
+ * | | |may be corrupted (details in| | |
+ * | | |TFMR) | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |5 | |TFMR SPR itself is |Yes |Raise attn from |
+ * | |tfmr_parit|corrupted. | |sapphire resulting |
+ * | |y_error |Entire timing facility may | |xstop |
+ * | | |be compromised. | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |6 |ha_overflo|UPS (Uninterruptible Power |No |N/A |
+ * | |w_warning |System) Overflow indication | | |
+ * | | |indicating that the UPS | | |
+ * | | |DirtyAddrTable has | | |
+ * | | |reached a limit where it | | |
+ * | | |requires PHYP unload support| | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |7 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |8 |xscom_fail|An XSCOM operation caused by|No |We handle it by |
+ * | | |a cache inhibited load/store| |manually reading |
+ * | | |from this thread failed. A | |HMER register. |
+ * | | |trap register is | | |
+ * | | |available. | | |
+ * | | | | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |9 |xscom_done|An XSCOM operation caused by|No |We handle it by |
+ * | | |a cache inhibited load/store| |manually reading |
+ * | | |from this thread completed. | |HMER register. |
+ * | | |If hypervisor | | |
+ * | | |intends to use this bit, it | | |
+ * | | |is responsible for clearing | | |
+ * | | |it before performing the | | |
+ * | | |xscom operation. | | |
+ * | | |NOTE: this bit should always| | |
+ * | | |be masked in HMEER | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |10 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |11 |proc_recv_|Processor recovery occurred |y |Log message and |
+ * | |again |again before bit2 or bit3 | |continue working. |
+ * | | |was cleared | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |12-|reserved |was temperature sensor |n/a |n/a |
+ * |15 | |passed the critical point on| | |
+ * | | |the way up | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |16 | |SCOM has set a reserved FIR |No |n/a |
+ * | |scom_fir_h|bit to cause recovery | | |
+ * | |m | | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |17 |trig_fir_h|Debug trigger has set a |No |n/a |
+ * | |mi |reserved FIR bit to cause | | |
+ * | | |recovery | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |18 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |19 |reserved |reserved |n/a |n/a |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |20 |hyp_resour|A hypervisor resource error |y |Raise attn from |
+ * | |ce_err |occurred: data parity error | |sapphire resulting |
+ * | | |on, SPRC0:3; SPR_Modereg or | |xstop. |
+ * | | |HMEER. | | |
+ * | | |Note: this bit will cause a | | |
+ * | | |checkstop when (HV=1, PR=0 | | |
+ * | | |and EE=0) | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |21-| |if bit 8 is active, the |No |We handle it by |
+ * |23 |xscom_stat|reason will be detailed in | |Manually reading |
+ * | |us |these bits. see chapter 11.1| |HMER register. |
+ * | | |These bits are information | | |
+ * | | |only and always masked | | |
+ * | | |(mask = '0') | | |
+ * | | |If hypervisor intends to use| | |
+ * | | |this bit, it is responsible | | |
+ * | | |for clearing it before | | |
+ * | | |performing the xscom | | |
+ * | | |operation. | | |
+ * |---+----------+----------------------------+--------+-------------------|
+ * |24-|Not |Not implemented |n/a |n/a |
+ * |63 |implemente| | | |
+ * | |d | | | |
+ * +---+----------+----------------------------+--------+-------------------+
+ *
+ * Above HMER bits can be enabled/disabled by modifying
+ * SPR_HMEER_HMI_ENABLE_MASK #define in include/processor.h
+ * If you modify support for any of the bits listed above, please make sure
+ * you change the above table to reflect that.
+ *
+ * NOTE: Per Dave Larson, never enable 8,9,21-23
+ */
+
+/* make compiler happy with a prototype */
+void handle_hmi(struct stack_frame *stack);
+
+void handle_hmi(struct stack_frame *stack)
+{
+ uint64_t hmer, orig_hmer;
+ bool assert = false;
+
+ orig_hmer = hmer = mfspr(SPR_HMER);
+ printf("HMI: Received HMI interrupt: HMER = 0x%016llx\n", hmer);
+ if (hmer & (SPR_HMER_PROC_RECV_DONE
+ | SPR_HMER_PROC_RECV_ERROR_MASKED)) {
+ hmer &= ~(SPR_HMER_PROC_RECV_DONE
+ | SPR_HMER_PROC_RECV_ERROR_MASKED);
+ printf("HMI: Processor recovery Done.\n");
+ }
+ if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
+ hmer &= ~SPR_HMER_PROC_RECV_AGAIN;
+ printf("HMI: Processor recovery occurred again before"
+ "bit2 was cleared\n");
+ }
+ /* Assert if we see malfunction alert, we can not continue. */
+ if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
+ hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
+ assert = true;
+ }
+
+ /* Assert if we see Hypervisor resource error, we can not continue. */
+ if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
+ hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
+ assert = true;
+ }
+
+ /*
+ * Assert for now for all TOD errors. In future we need to decode
+ * TFMR and take corrective action wherever required.
+ */
+ if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
+ hmer &= ~(SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR);
+ assert = true;
+ }
+
+ /*
+ * HMER bits are sticky, once set to 1 they remain set to 1 until
+ * they are set to 0. Reset the error source bits to 0, otherwise
+ * we keep getting the HMI interrupt again and again.
+ */
+ mtspr(SPR_HMER, hmer);
+ if (!assert)
+ return;
+
+ /*
+ * Raise attn to crash.
+ *
+ * We get the HMI on all threads at the same time. Use a lock to
+ * keep the printf messages from getting jumbled up.
+ */
+ lock(&hmi_lock);
+ dump_regs(stack, orig_hmer);
+ /* Should we unlock? We are going down anyway. */
+ unlock(&hmi_lock);
+ assert(false);
+}
+
+/* Called from head.S, thus no prototype */
+void exception_entry(struct stack_frame *stack);
+
+void exception_entry(struct stack_frame *stack)
+{
+ switch(stack->type) {
+ case STACK_ENTRY_MCHECK:
+ handle_machine_check(stack);
+ break;
+ case STACK_ENTRY_HMI:
+ handle_hmi(stack);
+ /* XXX TODO : Implement HMI recovery */
+ break;
+ case STACK_ENTRY_SOFTPATCH:
+ /* XXX TODO : Implement softpatch ? */
+ break;
+ }
+}
+
+static int64_t patch_exception(uint64_t vector, uint64_t glue, bool hv)
+{
+ uint64_t iaddr;
+
+ /* Copy over primary exception handler */
+ memcpy((void *)vector, &exc_primary_start,
+ &exc_primary_end - &exc_primary_start);
+
+ /* Patch branch instruction in primary handler */
+ iaddr = vector + exc_primary_patch_branch;
+ *(uint32_t *)iaddr |= (glue - iaddr) & 0x03fffffc;
+
+ /* Copy over secondary exception handler */
+ memcpy((void *)glue, &exc_secondary_start,
+ &exc_secondary_end - &exc_secondary_start);
+
+ /* Patch-in the vector number */
+ *(uint32_t *)(glue + exc_secondary_patch_type) |= vector;
+
+ /*
+ * If machine check, patch GET_STACK to get to the MC stack
+ * instead of the normal stack.
+ *
+ * To simplify the arithmetic involved I make assumptions
+ * on the fact that the base of all CPU stacks is 64k aligned
+ * and that our stack size is < 32k, which means that the
+ * "addi" instruction used in GET_STACK() is always using a
+ * small (<32k) positive offset, which we can then easily
+ * fixup with a simple addition
+ */
+ BUILD_ASSERT(STACK_SIZE < 0x8000);
+ BUILD_ASSERT(!(CPU_STACKS_BASE & 0xffff));
+
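+ /*
+ * Illustration of the fixup below: the addi generated by GET_STACK()
+ * carries its offset in the low 16 bits of the instruction word, and
+ * by the assumptions above that offset is a small positive number, so
+ * simply adding MC_STACK_SIZE to the instruction word only bumps the
+ * displacement field.
+ */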
+ if (vector == 0x200) {
+ /*
+ * The addi we try to patch is the 3rd instruction
+ * of GET_STACK(). If you change the macro, you must
+ * update this code
+ */
+ iaddr = glue + exc_secondary_patch_stack + 8;
+ *(uint32_t *)iaddr += MC_STACK_SIZE;
+ }
+
+ /* Standard exception ? All done */
+ if (!hv)
+ goto flush;
+
+ /* HV exception, change the SRR's to HSRRs and rfid to hrfid
+ *
+ * The magic is that mfspr/mtspr of SRR can be turned into the
+ * equivalent HSRR version by OR'ing 0x4800. For rfid to hrfid
+ * we OR 0x200.
+ */
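+ /*
+ * For illustration: SRR0/SRR1 are SPRs 26/27 while HSRR0/HSRR1 are
+ * SPRs 314/315, so the numbers differ only in the upper five bits of
+ * the SPR field (by 9). mfspr/mtspr encode the SPR number with its
+ * two 5-bit halves swapped, which puts that upper half at bit 11 of
+ * the instruction word: 9 << 11 == 0x4800. Likewise rfid and hrfid
+ * differ only in their extended opcode (18 vs 274, i.e. by 256),
+ * which sits at bit 1: 256 << 1 == 0x200.
+ */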
+ *(uint32_t *)(glue + exc_secondary_patch_mfsrr0) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_mfsrr1) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_mtsrr0) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_mtsrr1) |= 0x4800;
+ *(uint32_t *)(glue + exc_secondary_patch_rfid) |= 0x200;
+
+ flush:
+ /* On P7 and later all we need is : */
+ sync_icache();
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t opal_register_exc_handler(uint64_t opal_exception,
+ uint64_t handler_address,
+ uint64_t glue_cache_line)
+{
+ switch(opal_exception) {
+ case OPAL_MACHINE_CHECK_HANDLER:
+ client_mc_address = handler_address;
+ return patch_exception(0x200, glue_cache_line, false);
+ case OPAL_HYPERVISOR_MAINTENANCE_HANDLER:
+ return patch_exception(0xe60, glue_cache_line, true);
+#if 0 /* We let Linux handle softpatch */
+ case OPAL_SOFTPATCH_HANDLER:
+ return patch_exception(0x1500, glue_cache_line, true);
+#endif
+ default:
+ break;
+ }
+ return OPAL_PARAMETER;
+}
+opal_call(OPAL_REGISTER_OPAL_EXCEPTION_HANDLER, opal_register_exc_handler, 3);
+
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
new file mode 100644
index 0000000..49b80b6
--- /dev/null
+++ b/core/fast-reboot.c
@@ -0,0 +1,346 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <fsp.h>
+#include <psi.h>
+#include <opal.h>
+#include <xscom.h>
+#include <interrupts.h>
+#include <cec.h>
+#include <timebase.h>
+#include <memory.h>
+#include <pci.h>
+#include <chip.h>
+
+/*
+ * To get control of all threads, we sreset them via XSCOM after
+ * patching the 0x100 vector. This will work as long as the target
+ * HRMOR is 0. If Linux ever uses HRMOR, we'll have to consider
+ * a more messy approach.
+ *
+ * The SCOM register we want is called "Core RAS Control" in the doc
+ * and EX0.EC.PC.TCTL_GENERATE#0.TCTL.DIRECT_CONTROLS in the SCOM list
+ *
+ * Bits in there change from CPU rev to CPU rev but the bit we care
+ * about, bit 60 "sreset_request" appears to have stuck to the same
+ * place in both P7 and P7+. The register also has the same SCOM
+ * address
+ */
+#define EX0_TCTL_DIRECT_CONTROLS0 0x08010400
+#define EX0_TCTL_DIRECT_CONTROLS1 0x08010440
+#define EX0_TCTL_DIRECT_CONTROLS2 0x08010480
+#define EX0_TCTL_DIRECT_CONTROLS3 0x080104c0
+#define TCTL_DC_SRESET_REQUEST PPC_BIT(60)
+
+/* Flag tested by the OPAL entry code */
+uint8_t reboot_in_progress;
+static struct cpu_thread *resettor, *resettee;
+
+static void flush_caches(void)
+{
+ uint64_t base = SKIBOOT_BASE;
+ uint64_t end = base + SKIBOOT_SIZE;
+
+ /* Not sure what the effect of sreset is on cores, so let's
+ * shoot a series of dcbf's on all cachelines that make up
+ * our core memory just in case...
+ */
+ while(base < end) {
+ asm volatile("dcbf 0,%0" : : "r" (base) : "memory");
+ base += 128;
+ }
+ sync();
+}
+
+static bool do_reset_core_p7(struct cpu_thread *cpu)
+{
+ uint32_t xscom_addr, chip;
+ uint64_t ctl;
+ int rc;
+
+ /* Add the Core# */
+ xscom_addr = EX0_TCTL_DIRECT_CONTROLS0;
+ xscom_addr |= ((cpu->pir >> 2) & 7) << 24;
+
+ chip = pir_to_chip_id(cpu->pir);
+
+ ctl = TCTL_DC_SRESET_REQUEST;
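+ /* The four writes below hit the DIRECT_CONTROLS register of each
+ * thread of the core: the per-thread copies are 0x40 apart (see
+ * EX0_TCTL_DIRECT_CONTROLS0..3 above).
+ */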
+ rc = xscom_write(chip, xscom_addr, ctl);
+ rc |= xscom_write(chip, xscom_addr + 0x40, ctl);
+ rc |= xscom_write(chip, xscom_addr + 0x80, ctl);
+ rc |= xscom_write(chip, xscom_addr + 0xc0, ctl);
+ if (rc) {
+ prerror("RESET: Error %d resetting CPU 0x%04x\n",
+ rc, cpu->pir);
+ return false;
+ }
+ return true;
+}
+
+static void fast_reset_p7(void)
+{
+ struct cpu_thread *cpu;
+
+ resettee = this_cpu();
+ resettor = NULL;
+
+ /* Pick up a candidate resettor. We do that before we flush
+ * the caches
+ */
+ for_each_cpu(cpu) {
+ /*
+ * Some threads might still be in skiboot.
+ *
+ * But because we deal with entire cores and we don't want
+ * to special case things, we are just going to reset them
+ * too, on the assumption that this is safe because they hold
+ * no locks. That can only be true if they have no jobs
+ * scheduled, which is hopefully the case.
+ */
+ if (cpu->state != cpu_state_os &&
+ cpu->state != cpu_state_active)
+ continue;
+
+ /*
+ * Only hit cores and only if they aren't on the same core
+ * as ourselves
+ */
+ if (cpu_get_thread0(cpu) == cpu_get_thread0(this_cpu()) ||
+ cpu->pir & 0x3)
+ continue;
+
+ /* Pick up one of those guys as our "resettor". It will be
+ * in charge of resetting this CPU. We avoid resetting
+ * ourselves, not sure how well it would do with SCOM
+ */
+ resettor = cpu;
+ break;
+ }
+
+ if (!resettor) {
+ printf("RESET: Can't find a resettor !\n");
+ return;
+ }
+ printf("RESET: Resetting from 0x%04x, resettor 0x%04x\n",
+ this_cpu()->pir, resettor->pir);
+
+ printf("RESET: Flushing caches...\n");
+
+ /* Is that necessary ? */
+ flush_caches();
+
+ /* Reset everybody except self and except resettor */
+ for_each_cpu(cpu) {
+ if (cpu->state != cpu_state_os &&
+ cpu->state != cpu_state_active)
+ continue;
+ if (cpu_get_thread0(cpu) == cpu_get_thread0(this_cpu()) ||
+ cpu->pir & 0x3)
+ continue;
+ if (cpu_get_thread0(cpu) == cpu_get_thread0(resettor))
+ continue;
+
+ printf("RESET: Resetting CPU 0x%04x...\n", cpu->pir);
+
+ if (!do_reset_core_p7(cpu))
+ return;
+ }
+
+ /* Reset the resettor last because it's going to kill me ! */
+ printf("RESET: Resetting CPU 0x%04x...\n", resettor->pir);
+ if (!do_reset_core_p7(resettor))
+ return;
+
+ /* Don't return */
+ for (;;)
+ ;
+}
+
+void fast_reset(void)
+{
+ uint32_t pvr = mfspr(SPR_PVR);
+ extern uint32_t fast_reset_patch_start;
+ extern uint32_t fast_reset_patch_end;
+ uint32_t *dst, *src;
+
+ printf("RESET: Fast reboot request !\n");
+
+ /* XXX We need a way to ensure that no other CPU is in skiboot
+ * holding locks (via the OPAL APIs) and if they are, we need
+ * for them to get out
+ */
+ reboot_in_progress = 1;
+ time_wait_ms(200);
+
+ /* Copy reset trampoline */
+ printf("RESET: Copying reset trampoline...\n");
+ src = &fast_reset_patch_start;
+ dst = (uint32_t *)0x100;
+ while(src < &fast_reset_patch_end)
+ *(dst++) = *(src++);
+ sync_icache();
+
+ switch(PVR_TYPE(pvr)) {
+ case PVR_TYPE_P7:
+ case PVR_TYPE_P7P:
+ fast_reset_p7();
+ }
+}
+
+static void cleanup_cpu_state(void)
+{
+ if (cpu_is_thread0(this_cpu())) {
+ cleanup_tlb();
+ init_shared_sprs();
+ }
+ init_replicated_sprs();
+ reset_cpu_icp();
+}
+
+#ifdef FAST_REBOOT_CLEARS_MEMORY
+static void fast_mem_clear(uint64_t start, uint64_t end)
+{
+ printf("MEMORY: Clearing %llx..%llx\n", start, end);
+
+ while(start < end) {
+ asm volatile("dcbz 0,%0" : : "r" (start) : "memory");
+ start += 128;
+ }
+}
+
+static void memory_reset(void)
+{
+ struct address_range *i;
+ uint64_t skistart = SKIBOOT_BASE;
+ uint64_t skiend = SKIBOOT_BASE + SKIBOOT_SIZE;
+
+ printf("MEMORY: Clearing ...\n");
+
+ list_for_each(&address_ranges, i, list) {
+ uint64_t start = cleanup_addr(i->arange->start);
+ uint64_t end = cleanup_addr(i->arange->end);
+
+ if (start >= skiend || end <= skistart)
+ fast_mem_clear(start, end);
+ else {
+ if (start < skistart)
+ fast_mem_clear(start, skistart);
+ if (end > skiend)
+ fast_mem_clear(skiend, end);
+ }
+ }
+}
+#endif /* FAST_REBOOT_CLEARS_MEMORY */
+
+/* Entry from asm after a fast reset */
+void fast_reboot(void);
+
+void fast_reboot(void)
+{
+ static volatile bool fast_boot_release;
+ struct cpu_thread *cpu;
+
+ printf("INIT: CPU PIR 0x%04x reset in\n", this_cpu()->pir);
+
+ /* If this CPU was chosen as the resettor, it must reset the
+ * resettee (the one that initiated the whole process)
+ */
+ if (this_cpu() == resettor)
+ do_reset_core_p7(resettee);
+
+ /* Are we the original boot CPU ? If not, we spin waiting
+ * for a release signal from the boot CPU, then we clean
+ * ourselves up and go process jobs.
+ */
+ if (this_cpu() != boot_cpu) {
+ this_cpu()->state = cpu_state_present;
+ while (!fast_boot_release) {
+ smt_very_low();
+ sync();
+ }
+ smt_medium();
+ cleanup_cpu_state();
+ __secondary_cpu_entry();
+ }
+
+ /* We are the original boot CPU, wait for secondaries to
+ * be captured
+ */
+ for_each_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+
+ /* XXX Add a callin timeout ? */
+ while (cpu->state != cpu_state_present) {
+ smt_very_low();
+ sync();
+ }
+ smt_medium();
+ }
+
+ printf("INIT: Releasing secondaries...\n");
+
+ /* Release everybody */
+ fast_boot_release = true;
+ sync();
+
+ /* Wait for them to respond */
+ for_each_cpu(cpu) {
+ if (cpu == this_cpu())
+ continue;
+
+ /* XXX Add a callin timeout ? */
+ while (cpu->state == cpu_state_present) {
+ smt_very_low();
+ sync();
+ }
+ }
+
+ printf("INIT: All done, resetting everything else...\n");
+
+ /* Clear release flag for next time */
+ fast_boot_release = false;
+ reboot_in_progress = 0;
+
+ /* Cleanup ourselves */
+ cleanup_cpu_state();
+
+ /* Set our state to active */
+ this_cpu()->state = cpu_state_active;
+
+ /* Poke the consoles (see comments in the code there) */
+ fsp_console_reset();
+
+ /* Reset/EOI the PSI interrupt */
+ psi_irq_reset();
+
+ /* Remove all PCI devices */
+ pci_reset();
+
+ /* Reset IO Hubs */
+ cec_reset();
+
+ /* Re-Initialize all discovered PCI slots */
+ pci_init_slots();
+
+ /* Clear memory */
+#ifdef FAST_REBOOT_CLEARS_MEMORY
+ memory_reset();
+#endif
+ load_and_boot_kernel(true);
+}
diff --git a/core/fdt.c b/core/fdt.c
new file mode 100644
index 0000000..62e60fc
--- /dev/null
+++ b/core/fdt.c
@@ -0,0 +1,208 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <stdarg.h>
+#include <libfdt.h>
+#include <device.h>
+#include <cpu.h>
+#include <memory.h>
+#include <opal.h>
+#include <interrupts.h>
+#include <fsp.h>
+#include <cec.h>
+#include <vpd.h>
+#include <ccan/str/str.h>
+
+static int fdt_error;
+static void *fdt;
+
+#undef DEBUG_FDT
+
+static void __save_err(int err, const char *str)
+{
+#ifdef DEBUG_FDT
+ printf("FDT: rc: %d from \"%s\"\n", err, str);
+#endif
+ if (err && !fdt_error) {
+ prerror("FDT: Error %d from \"%s\"\n", err, str);
+ fdt_error = err;
+ }
+}
+
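+/*
+ * save_err(x) expands to __save_err(x, "x"), so the first libfdt error
+ * we hit is latched together with the text of the call that caused it.
+ */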
+#define save_err(...) __save_err(__VA_ARGS__, #__VA_ARGS__)
+
+static void dt_property_cell(const char *name, u32 cell)
+{
+ save_err(fdt_property_cell(fdt, name, cell));
+}
+
+static void dt_begin_node(const char *name, uint32_t phandle)
+{
+ save_err(fdt_begin_node(fdt, name));
+
+ /*
+ * We add both the new style "phandle" and the legacy
+ * "linux,phandle" properties
+ */
+ dt_property_cell("linux,phandle", phandle);
+ dt_property_cell("phandle", phandle);
+}
+
+static void dt_property(const char *name, const void *val, size_t size)
+{
+ save_err(fdt_property(fdt, name, val, size));
+}
+
+static void dt_end_node(void)
+{
+ save_err(fdt_end_node(fdt));
+}
+
+static void dump_fdt(void)
+{
+#ifdef DEBUG_FDT
+ int i, off, depth, err;
+
+ printf("Device tree %u@%p\n", fdt_totalsize(fdt), fdt);
+
+ err = fdt_check_header(fdt);
+ if (err) {
+ prerror("fdt_check_header: %s\n", fdt_strerror(err));
+ return;
+ }
+ printf("fdt_check_header passed\n");
+
+ printf("fdt_num_mem_rsv = %u\n", fdt_num_mem_rsv(fdt));
+ for (i = 0; i < fdt_num_mem_rsv(fdt); i++) {
+ u64 addr, size;
+
+ err = fdt_get_mem_rsv(fdt, i, &addr, &size);
+ if (err) {
+ printf(" ERR %s\n", fdt_strerror(err));
+ return;
+ }
+ printf(" mem_rsv[%i] = %lu@%#lx\n", i, (long)addr, (long)size);
+ }
+
+ for (off = fdt_next_node(fdt, 0, &depth);
+ off > 0;
+ off = fdt_next_node(fdt, off, &depth)) {
+ int len;
+ const char *name;
+
+ name = fdt_get_name(fdt, off, &len);
+ if (!name) {
+ prerror("fdt: offset %i no name!\n", off);
+ return;
+ }
+ printf("name: %s [%u]\n", name, off);
+ }
+#endif
+}
+
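+/*
+ * Recursively emit the properties and children of "root" into the fdt.
+ * The caller opens and closes the node for "root" itself; properties
+ * whose names start with DT_PRIVATE are skipped.
+ */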
+static void flatten_dt_node(const struct dt_node *root)
+{
+ const struct dt_node *i;
+ const struct dt_property *p;
+
+#ifdef DEBUG_FDT
+ printf("FDT: node: %s\n", root->name);
+#endif
+
+ list_for_each(&root->properties, p, list) {
+ if (strstarts(p->name, DT_PRIVATE))
+ continue;
+#ifdef DEBUG_FDT
+ printf("FDT: prop: %s size: %ld\n", p->name, p->len);
+#endif
+ dt_property(p->name, p->prop, p->len);
+ }
+
+ list_for_each(&root->children, i, list) {
+ dt_begin_node(i->name, i->phandle);
+ flatten_dt_node(i);
+ dt_end_node();
+ }
+}
+
+static void create_dtb_reservemap(const struct dt_node *root)
+{
+ uint64_t base, size;
+ const uint64_t *ranges;
+ const struct dt_property *prop;
+ int i;
+
+ /* Duplicate the reserved-ranges property into the fdt reservemap */
+ prop = dt_find_property(root, "reserved-ranges");
+ if (prop) {
+ ranges = (const void *)prop->prop;
+
+ for (i = 0; i < prop->len / (sizeof(uint64_t) * 2); i++) {
+ base = *(ranges++);
+ size = *(ranges++);
+ save_err(fdt_add_reservemap_entry(fdt, base, size));
+ }
+ }
+
+ save_err(fdt_finish_reservemap(fdt));
+}
+
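+/*
+ * Flatten the live device tree into a dtb. The final size isn't known
+ * up front, so we build into a buffer and double its size whenever
+ * fdt_finish() reports FDT_ERR_NOSPACE.
+ */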
+void *create_dtb(const struct dt_node *root)
+{
+ size_t len = DEVICE_TREE_MAX_SIZE;
+ uint32_t old_last_phandle = last_phandle;
+
+ do {
+ if (fdt)
+ free(fdt);
+ last_phandle = old_last_phandle;
+ fdt_error = 0;
+ fdt = malloc(len);
+ if (!fdt) {
+ prerror("dtb: could not malloc %lu\n", (long)len);
+ return NULL;
+ }
+
+ fdt_create(fdt, len);
+
+ create_dtb_reservemap(root);
+
+ /* Open root node */
+ dt_begin_node(root->name, root->phandle);
+
+ /* Flatten our live tree */
+ flatten_dt_node(root);
+
+ /* Close root node */
+ dt_end_node();
+
+ save_err(fdt_finish(fdt));
+
+ if (!fdt_error)
+ break;
+
+ len *= 2;
+ } while (fdt_error == -FDT_ERR_NOSPACE);
+
+ dump_fdt();
+
+ if (fdt_error) {
+ prerror("dtb: error %s\n", fdt_strerror(fdt_error));
+ return NULL;
+ }
+ return fdt;
+}
diff --git a/core/flash-nvram.c b/core/flash-nvram.c
new file mode 100644
index 0000000..7e261b1
--- /dev/null
+++ b/core/flash-nvram.c
@@ -0,0 +1,76 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <skiboot.h>
+#include <device.h>
+#include <console.h>
+#include <opal.h>
+#include <platform.h>
+#include <libflash/libflash.h>
+
+static struct flash_chip *fl_nv_chip;
+static uint32_t fl_nv_start, fl_nv_size;
+
+static int flash_nvram_info(uint32_t *total_size)
+{
+ if (!fl_nv_chip)
+ return OPAL_HARDWARE;
+ *total_size = fl_nv_size;
+ return OPAL_SUCCESS;
+}
+
+static int flash_nvram_start_read(void *dst, uint32_t src, uint32_t len)
+{
+ int rc;
+
+ if ((src + len) > fl_nv_size) {
+ prerror("FLASH_NVRAM: read out of bound (0x%x,0x%x)\n",
+ src, len);
+ return OPAL_PARAMETER;
+ }
+ rc = flash_read(fl_nv_chip, fl_nv_start + src, dst, len);
+ if (rc)
+ return rc;
+ nvram_read_complete(true);
+ return 0;
+}
+
+static int flash_nvram_write(uint32_t dst, void *src, uint32_t len)
+{
+ /* TODO: When we have async jobs for PRD, turn this into one */
+
+ if ((dst + len) > fl_nv_size) {
+ prerror("FLASH_NVRAM: write out of bound (0x%x,0x%x)\n",
+ dst, len);
+ return OPAL_PARAMETER;
+ }
+ return flash_smart_write(fl_nv_chip, fl_nv_start + dst, src, len);
+}
+
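+/* Register this flash window as the backend for the platform NVRAM hooks */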
+int flash_nvram_init(struct flash_chip *chip, uint32_t start, uint32_t size)
+{
+ fl_nv_chip = chip;
+ fl_nv_start = start;
+ fl_nv_size = size;
+
+ platform.nvram_info = flash_nvram_info;
+ platform.nvram_start_read = flash_nvram_start_read;
+ platform.nvram_write = flash_nvram_write;
+
+ return 0;
+}
+
diff --git a/core/hostservices.c b/core/hostservices.c
new file mode 100644
index 0000000..85e62e3
--- /dev/null
+++ b/core/hostservices.c
@@ -0,0 +1,826 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <lock.h>
+#include <device.h>
+#include <compiler.h>
+#include <hostservices.h>
+#include <mem_region.h>
+#include <xscom.h>
+#include <fsp.h>
+#include <chip.h>
+#include <console.h>
+#include <mem-map.h>
+#include <timebase.h>
+
+#define HOSTBOOT_RUNTIME_INTERFACE_VERSION 1
+
+struct host_interfaces {
+ /** Interface version. */
+ uint64_t interface_version;
+
+ /** Put a string to the console. */
+ void (*puts)(const char*);
+ /** Critical failure in runtime execution. */
+ void (*assert)(void);
+
+ /** OPTIONAL. Hint to environment that the page may be executed. */
+ int (*set_page_execute)(void*);
+
+ /** malloc */
+ void *(*malloc)(size_t);
+ /** free */
+ void (*free)(void*);
+ /** realloc */
+ void *(*realloc)(void*, size_t);
+
+ /** sendErrorLog
+ * @param[in] plid Platform Log identifier
+ * @param[in] data size in bytes
+ * @param[in] pointer to data
+ * @return 0 on success else error code
+ */
+ int (*send_error_log)(uint32_t,uint32_t,void *);
+
+ /** Scan communication read
+ * @param[in] chip_id (based on devtree defn)
+ * @param[in] address
+ * @param[in] pointer to 8-byte data buffer
+ * @return 0 on success else return code
+ */
+ int (*scom_read)(uint64_t, uint64_t, void*);
+
+ /** Scan communication write
+ * @param[in] chip_id (based on devtree defn)
+ * @param[in] address
+ * @param[in] pointer to 8-byte data buffer
+ * @return 0 on success else return code
+ */
+ int (*scom_write)(uint64_t, uint64_t, const void *);
+
+ /** lid_load
+ * Load a LID from PNOR, FSP, etc.
+ *
+ * @param[in] LID number.
+ * @param[out] Allocated buffer for LID.
+ * @param[out] Size of LID (in bytes).
+ *
+ * @return 0 on success, else RC.
+ */
+ int (*lid_load)(uint32_t lid, void **buf, size_t *len);
+
+ /** lid_unload
+ * Release memory from previously loaded LID.
+ *
+ * @param[in] Allocated buffer for LID to release.
+ *
+ * @return 0 on success, else RC.
+ */
+ int (*lid_unload)(void *buf);
+
+ /** Get the address of a reserved memory region by its devtree name.
+ *
+ * @param[in] Devtree name (ex. "ibm,hbrt-vpd-image")
+ * @return physical address of region (or NULL).
+ **/
+ uint64_t (*get_reserved_mem)(const char*);
+
+ /**
+ * @brief Force a core to be awake, or clear the force
+ * @param[in] i_core Core to wake up (pid)
+ * @param[in] i_mode 0=force awake
+ * 1=clear force
+ * 2=clear all previous forces
+ * @return rc non-zero on error
+ */
+ int (*wakeup)( uint32_t i_core, uint32_t i_mode );
+
+ /**
+ * @brief Delay/sleep for at least the time given
+ * @param[in] seconds
+ * @param[in] nano seconds
+ */
+ void (*nanosleep)(uint64_t i_seconds, uint64_t i_nano_seconds);
+
+ // Reserve some space for future growth.
+ void (*reserved[32])(void);
+};
+
+struct runtime_interfaces {
+ /** Interface version. */
+ uint64_t interface_version;
+
+ /** Execute CxxTests that may be contained in the image.
+ *
+ * @param[in] - Pointer to CxxTestStats structure for results reporting.
+ */
+ void (*cxxtestExecute)(void *);
+ /** Get a list of lids numbers of the lids known to HostBoot
+ *
+ * @param[out] o_num - the number of lids in the list
+ * @return a pointer to the list
+ */
+ const uint32_t * (*get_lid_list)(size_t * o_num);
+
+ /** Load OCC Image and common data into mainstore, also setup OCC BARSs
+ *
+ * @param[in] i_homer_addr_phys - The physical mainstore address of the
+ * start of the HOMER image
+ * @param[in] i_homer_addr_va - Virtual memory address of the HOMER image
+ * @param[in] i_common_addr_phys - The physical mainstore address of the
+ * OCC common area.
+ * @param[in] i_common_addr_va - Virtual memory address of the common area
+ * @param[in] i_chip - The HW chip id (XSCOM chip ID)
+ * @return 0 on success else return code
+ */
+ int(*loadOCC)(uint64_t i_homer_addr_phys,
+ uint64_t i_homer_addr_va,
+ uint64_t i_common_addr_phys,
+ uint64_t i_common_addr_va,
+ uint64_t i_chip);
+
+ /** Start OCC on all chips, by module
+ *
+ * @param[in] i_chip - Array of functional HW chip ids
+ * @Note The caller must include a complete module's worth of chips
+ * @param[in] i_num_chips - Number of chips in the array
+ * @return 0 on success else return code
+ */
+ int (*startOCCs)(uint64_t* i_chip,
+ size_t i_num_chips);
+
+ /** Stop OCC hold OCCs in reset
+ *
+ * @param[in] i_chip - Array of functional HW chip ids
+ * @Note The caller must include a complete module's worth of chips
+ * @param[in] i_num_chips - Number of chips in the array
+ * @return 0 on success else return code
+ */
+ int (*stopOCCs)(uint64_t* i_chip,
+ size_t i_num_chips);
+
+ /* Reserve some space for future growth. */
+ void (*reserved[32])(void);
+};
+
+static struct runtime_interfaces *hservice_runtime;
+
+static char *hbrt_con_buf = (char *)HBRT_CON_START;
+static size_t hbrt_con_pos;
+static bool hbrt_con_wrapped;
+
+#define HBRT_CON_IN_LEN 0
+#define HBRT_CON_OUT_LEN (HBRT_CON_LEN - HBRT_CON_IN_LEN)
+
+struct memcons hbrt_memcons __section(".data.memcons") = {
+ .magic = MEMCONS_MAGIC,
+ .obuf_phys = HBRT_CON_START,
+ .ibuf_phys = HBRT_CON_START + HBRT_CON_OUT_LEN,
+ .obuf_size = HBRT_CON_OUT_LEN,
+ .ibuf_size = HBRT_CON_IN_LEN,
+};
+
+static void hservice_putc(char c)
+{
+ uint32_t opos;
+
+ hbrt_con_buf[hbrt_con_pos++] = c;
+ if (hbrt_con_pos >= HBRT_CON_OUT_LEN) {
+ hbrt_con_pos = 0;
+ hbrt_con_wrapped = true;
+ }
+
+ /*
+ * We must always re-generate memcons.out_pos because
+ * under some circumstances, the console script will
+ * use a broken putmemproc that does RMW on the full
+ * 8 bytes containing out_pos and in_prod, thus corrupting
+ * out_pos
+ */
+ opos = hbrt_con_pos;
+ if (hbrt_con_wrapped)
+ opos |= MEMCONS_OUT_POS_WRAP;
+ lwsync();
+ hbrt_memcons.out_pos = opos;
+}
+
+static void hservice_puts(const char *str)
+{
+ char c;
+
+ while((c = *(str++)) != 0)
+ hservice_putc(c);
+ hservice_putc(10);
+}
+
+static void hservice_mark(void)
+{
+ hservice_puts("--------------------------------------------------"
+ "--------------------------------------------------\n");
+}
+
+static void hservice_assert(void)
+{
+ prerror("HBRT: Assertion from hostservices\n");
+ abort();
+}
+
+static void *hservice_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static void hservice_free(void *ptr)
+{
+ free(ptr);
+}
+
+
+static void *hservice_realloc(void *ptr, size_t size)
+{
+ return realloc(ptr, size);
+}
+
+struct hbrt_elog_ent {
+ void *buf;
+ unsigned int size;
+ unsigned int plid;
+ struct list_node link;
+};
+static LIST_HEAD(hbrt_elogs);
+static struct lock hbrt_elog_lock = LOCK_UNLOCKED;
+static bool hbrt_elog_sending;
+static void hservice_start_elog_send(void);
+
+static void hservice_elog_write_complete(struct fsp_msg *msg)
+{
+ struct hbrt_elog_ent *ent = msg->user_data;
+
+ lock(&hbrt_elog_lock);
+ printf("HBRT: Completed send of PLID 0x%08x\n", ent->plid);
+ hbrt_elog_sending = false;
+ fsp_tce_unmap(PSI_DMA_HBRT_LOG_WRITE_BUF,
+ PSI_DMA_HBRT_LOG_WRITE_BUF_SZ);
+ free(ent->buf);
+ free(ent);
+ fsp_freemsg(msg);
+ hservice_start_elog_send();
+ unlock(&hbrt_elog_lock);
+}
+
+static void hservice_start_elog_send(void)
+{
+ struct fsp_msg *msg;
+ struct hbrt_elog_ent *ent;
+
+ again:
+ if (list_empty(&hbrt_elogs))
+ return;
+ ent = list_pop(&hbrt_elogs, struct hbrt_elog_ent, link);
+
+ hbrt_elog_sending = true;
+
+ printf("HBRT: Starting send of PLID 0x%08x\n", ent->plid);
+
+ fsp_tce_map(PSI_DMA_HBRT_LOG_WRITE_BUF, ent->buf,
+ PSI_DMA_HBRT_LOG_WRITE_BUF_SZ);
+
+ msg = fsp_mkmsg(FSP_CMD_WRITE_SP_DATA, 6, FSP_DATASET_HBRT_BLOB,
+ 0, 0, 0, PSI_DMA_HBRT_LOG_WRITE_BUF,
+ ent->size);
+
+ if (!msg) {
+ prerror("HBRT: Failed to create error msg log to FSP\n");
+ goto error;
+ }
+ msg->user_data = ent;
+ if (!fsp_queue_msg(msg, hservice_elog_write_complete))
+ return;
+ prerror("FSP: Error queueing elog update\n");
+ error:
+ if (msg)
+ fsp_freemsg(msg);
+ fsp_tce_unmap(PSI_DMA_HBRT_LOG_WRITE_BUF,
+ PSI_DMA_HBRT_LOG_WRITE_BUF_SZ);
+ free(ent->buf);
+ free(ent);
+ hbrt_elog_sending = false;
+ goto again;
+}
+
+static int hservice_send_error_log(uint32_t plid, uint32_t dsize, void *data)
+{
+ struct hbrt_elog_ent *ent;
+ void *abuf;
+
+ printf("HBRT: Error log generated with plid 0x%08x\n", plid);
+
+ /* We only know how to send error logs to FSP */
+ if (!fsp_present()) {
+ prerror("HBRT: Warning, error log from HBRT discarded !\n");
+ return OPAL_UNSUPPORTED;
+ }
+ if (dsize > PSI_DMA_HBRT_LOG_WRITE_BUF_SZ) {
+ prerror("HBRT: Warning, error log from HBRT too big (%d) !\n",
+ dsize);
+ dsize = PSI_DMA_HBRT_LOG_WRITE_BUF_SZ;
+ }
+
+ lock(&hbrt_elog_lock);
+
+ /* Create and populate a tracking structure */
+ ent = zalloc(sizeof(struct hbrt_elog_ent));
+ if (!ent) {
+ unlock(&hbrt_elog_lock);
+ return OPAL_NO_MEM;
+ }
+
+ /* Grab a 4k aligned page */
+ abuf = memalign(0x1000, PSI_DMA_HBRT_LOG_WRITE_BUF_SZ);
+ if (!abuf) {
+ free(ent);
+ unlock(&hbrt_elog_lock);
+ return OPAL_NO_MEM;
+ }
+ memset(abuf, 0, PSI_DMA_HBRT_LOG_WRITE_BUF_SZ);
+ memcpy(abuf, data, dsize);
+ ent->buf = abuf;
+ ent->size = dsize;
+ ent->plid = plid;
+ list_add_tail(&hbrt_elogs, &ent->link);
+ if (!hbrt_elog_sending)
+ hservice_start_elog_send();
+ unlock(&hbrt_elog_lock);
+
+ return 0;
+}
+
+static int hservice_scom_read(uint64_t chip_id, uint64_t addr, void *buf)
+{
+ return xscom_read(chip_id, addr, buf);
+}
+
+static int hservice_scom_write(uint64_t chip_id, uint64_t addr,
+ const void *buf)
+{
+ uint64_t val;
+
+ memcpy(&val, buf, sizeof(val));
+ return xscom_write(chip_id, addr, val);
+}
+
+static int hservice_lid_load(uint32_t lid, void **buf, size_t *len)
+{
+ int rc;
+ static void *lid_cache;
+ static size_t lid_cache_len;
+ static uint32_t lid_cache_id;
+
+ printf("HBRT: LID load request for 0x%08x\n", lid);
+
+ /* Adjust LID side first or we get a cache mismatch */
+ lid = fsp_adjust_lid_side(lid);
+
+ /* Check for cache */
+ if (lid_cache && lid_cache_id == lid) {
+ *buf = lid_cache;
+ *len = lid_cache_len;
+ printf("HBRT: Serviced from cache, len=0x%lx\n", lid_cache_len);
+ return 0;
+ }
+
+ /* Cache mismatch, discard old one */
+ if (lid_cache) {
+ printf("HBRT: Cache mismatch, discarding old 0x%08x\n",
+ lid_cache_id);
+ free(lid_cache);
+ lid_cache = NULL;
+ }
+
+ /* Allocate a new buffer and load the LID into it */
+ *buf = malloc(HBRT_LOAD_LID_SIZE);
+ *len = HBRT_LOAD_LID_SIZE;
+ rc = fsp_fetch_data(0, FSP_DATASET_NONSP_LID, lid, 0, *buf, len);
+ if (rc != 0)
+ /* Take advantage of realloc corner case here. */
+ *len = 0;
+ *buf = realloc(*buf, *len);
+
+ /* We managed, let's cache it */
+ if (rc == 0 && *len) {
+ lid_cache = *buf;
+ lid_cache_len = *len;
+ lid_cache_id = lid;
+
+ printf("HBRT: LID 0x%08x successfully loaded and cached"
+ ", len=0x%lx\n", lid, lid_cache_len);
+ }
+
+ return rc;
+}
+
+static int hservice_lid_unload(void *buf __unused)
+{
+ /* We do nothing as the LID is held in cache */
+ return 0;
+}
+
+static uint64_t hservice_get_reserved_mem(const char *name)
+{
+ struct mem_region *region;
+ uint64_t ret;
+
+ /* We assume it doesn't change after we've unlocked it, but
+ * lock ensures list is safe to walk. */
+ lock(&mem_region_lock);
+ region = find_mem_region(name);
+ ret = region ? region->start : 0;
+ unlock(&mem_region_lock);
+
+ if (!ret)
+ prerror("HBRT: Mem region '%s' not found !\n", name);
+
+ return ret;
+}
+
+static void hservice_nanosleep(uint64_t i_seconds, uint64_t i_nano_seconds)
+{
+ struct timespec ts;
+
+ ts.tv_sec = i_seconds;
+ ts.tv_nsec = i_nano_seconds;
+ nanosleep(&ts, NULL);
+}
+
+static int hservice_set_special_wakeup(struct cpu_thread *cpu)
+{
+ uint64_t val, core_id, poll_target, stamp;
+ int rc;
+
+ /*
+ * Note: HWP checks for checkstops, but I assume we don't need to
+ * as we wouldn't be running if one was present
+ */
+
+ /* Grab core ID once */
+ core_id = pir_to_core_id(cpu->pir);
+
+ /*
+ * The original HWp reads the XSCOM first but ignores the result
+ * and error, let's do the same until I know for sure that is
+ * not necessary
+ */
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+
+ /* Then we write special wakeup */
+ rc = xscom_write(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id,
+ EX_PM_SPECIAL_WAKEUP_PHYP),
+ PPC_BIT(0));
+ if (rc) {
+ prerror("HBRT: XSCOM error %d asserting special"
+ " wakeup on 0x%x\n", rc, cpu->pir);
+ return rc;
+ }
+
+ /*
+ * HWP uses the history for Perf register here, dunno why it uses
+ * that one instead of the pHyp one, maybe to avoid clobbering it...
+ *
+ * In any case, it does that to check for run/nap vs. sleep/winkle/other
+ * to decide whether to poll on checkstop or not. Since we don't deal
+ * with checkstop conditions here, we ignore that part.
+ */
+
+ /*
+ * Now poll for completion of special wakeup. The HWP is nasty here,
+ * it will poll at 5ms intervals for up to 200ms. This is not quite
+ * acceptable for us at runtime, at least not until we have the
+ * ability to "context switch" HBRT. In practice, because we don't
+ * winkle, it will never take that long, so we increase the polling
+ * frequency to 1us per poll. However we do have to keep the same
+ * timeout.
+ *
+ * We don't use time_wait_ms() either for now as we don't want to
+ * poll the FSP here.
+ */
+ stamp = mftb();
+ poll_target = stamp + msecs_to_tb(200);
+ val = 0;
+ while (!(val & EX_PM_GP0_SPECIAL_WAKEUP_DONE)) {
+ /* Wait 1 us */
+ hservice_nanosleep(0, 1000);
+
+ /* Read PM state */
+ rc = xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_GP0),
+ &val);
+ if (rc) {
+ prerror("HBRT: XSCOM error %d reading PM state on"
+ " 0x%x\n", rc, cpu->pir);
+ return rc;
+ }
+ /* Check timeout */
+ if (mftb() > poll_target)
+ break;
+ }
+
+ /* Success ? */
+ if (val & EX_PM_GP0_SPECIAL_WAKEUP_DONE) {
+ uint64_t now = mftb();
+ printf("HBRT: Special wakeup complete after %ld us\n",
+ tb_to_usecs(now - stamp));
+ return 0;
+ }
+
+ /*
+ * We timed out ...
+ *
+ * HWP has a complex workaround for HW255321 which affects
+ * Murano DD1 and Venice DD1. Ignore that for now
+ *
+ * Instead we just dump some XSCOMs for error logging
+ */
+ prerror("HBRT: Timeout on special wakeup of 0x%0x\n", cpu->pir);
+ prerror("HBRT: PM0 = 0x%016llx\n", val);
+ val = -1;
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+ prerror("HBRT: SPC_WKUP = 0x%016llx\n", val);
+ val = -1;
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id,
+ EX_PM_IDLE_STATE_HISTORY_PHYP),
+ &val);
+ prerror("HBRT: HISTORY = 0x%016llx\n", val);
+
+ return OPAL_HARDWARE;
+}
+
+static int hservice_clr_special_wakeup(struct cpu_thread *cpu)
+{
+ uint64_t val, core_id;
+ int rc;
+
+ /*
+ * Note: HWP checks for checkstops, but I assume we don't need to
+ * as we wouldn't be running if one was present
+ */
+
+ /* Grab core ID once */
+ core_id = pir_to_core_id(cpu->pir);
+
+ /*
+ * The original HWp reads the XSCOM first but ignores the result
+ * and error, let's do the same until I know for sure that is
+ * not necessary
+ */
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+
+ /* Then we write special wakeup */
+ rc = xscom_write(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id,
+ EX_PM_SPECIAL_WAKEUP_PHYP), 0);
+ if (rc) {
+ prerror("HBRT: XSCOM error %d deasserting"
+ " special wakeup on 0x%x\n", rc, cpu->pir);
+ return rc;
+ }
+
+ /*
+ * The original HWp reads the XSCOM again with the comment
+ * "This puts an inherent delay in the propagation of the reset
+ * transition"
+ */
+ xscom_read(cpu->chip_id,
+ XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+ &val);
+
+ return 0;
+}
+
+static int hservice_wakeup(uint32_t i_core, uint32_t i_mode)
+{
+ struct cpu_thread *cpu;
+ int rc = OPAL_SUCCESS;
+
+ /*
+ * Mask out the top nibble of i_core since it may contain
+ * 0x4 (which we use for XSCOM targeting)
+ */
+ i_core &= 0x0fffffff;
+
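+ /* On P8 the low 3 bits of the PIR are the thread number, so
+ * "i_core << 3" below is thread 0 of the requested core.
+ */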
+ /* What do we need to do ? */
+ switch(i_mode) {
+ case 0: /* Assert special wakeup */
+ /* XXX Assume P8 */
+ cpu = find_cpu_by_pir(i_core << 3);
+ if (!cpu)
+ return OPAL_PARAMETER;
+ printf("HBRT: Special wakeup assert for core 0x%x, count=%d\n",
+ i_core, cpu->hbrt_spec_wakeup);
+ if (cpu->hbrt_spec_wakeup == 0)
+ rc = hservice_set_special_wakeup(cpu);
+ if (rc == 0)
+ cpu->hbrt_spec_wakeup++;
+ return rc;
+ case 1: /* Deassert special wakeup */
+ /* XXX Assume P8 */
+ cpu = find_cpu_by_pir(i_core << 3);
+ if (!cpu)
+ return OPAL_PARAMETER;
+ printf("HBRT: Special wakeup release for core 0x%x, count=%d\n",
+ i_core, cpu->hbrt_spec_wakeup);
+ if (cpu->hbrt_spec_wakeup == 0) {
+ prerror("HBRT: Special wakeup clear"
+ " on core 0x%x with count=0\n",
+ i_core);
+ return OPAL_WRONG_STATE;
+ }
+ /* What to do with count on errors ? */
+ cpu->hbrt_spec_wakeup--;
+ if (cpu->hbrt_spec_wakeup == 0)
+ rc = hservice_clr_special_wakeup(cpu);
+ return rc;
+ case 2: /* Clear all special wakeups */
+ printf("HBRT: Special wakeup release for all cores\n");
+ for_each_cpu(cpu) {
+ if (cpu->hbrt_spec_wakeup) {
+ cpu->hbrt_spec_wakeup = 0;
+ /* What to do on errors ? */
+ hservice_clr_special_wakeup(cpu);
+ }
+ }
+ return OPAL_SUCCESS;
+ default:
+ return OPAL_PARAMETER;
+ }
+}
+
+static struct host_interfaces hinterface = {
+ .interface_version = HOSTBOOT_RUNTIME_INTERFACE_VERSION,
+ .puts = hservice_puts,
+ .assert = hservice_assert,
+ .malloc = hservice_malloc,
+ .free = hservice_free,
+ .realloc = hservice_realloc,
+ .send_error_log = hservice_send_error_log,
+ .scom_read = hservice_scom_read,
+ .scom_write = hservice_scom_write,
+ .lid_load = hservice_lid_load,
+ .lid_unload = hservice_lid_unload,
+ .get_reserved_mem = hservice_get_reserved_mem,
+ .wakeup = hservice_wakeup,
+ .nanosleep = hservice_nanosleep,
+};
+
+int host_services_occ_load(void)
+{
+ struct proc_chip *chip;
+ int rc = 0;
+
+ printf("HBRT: OCC Load requested\n");
+
+ if (!(hservice_runtime && hservice_runtime->loadOCC)) {
+ prerror("HBRT: No hservice_runtime->loadOCC\n");
+ return -ENOENT;
+ }
+
+ for_each_chip(chip) {
+
+ printf("HBRT: [%16lx] Calling loadOCC() homer %016llx, occ_common_area %016llx, "
+ "chip %04x\n",
+ mftb(),
+ chip->homer_base,
+ chip->occ_common_base,
+ chip->id);
+
+ rc = hservice_runtime->loadOCC(chip->homer_base,
+ chip->homer_base,
+ chip->occ_common_base,
+ chip->occ_common_base,
+ chip->id);
+
+ hservice_mark();
+ printf("HBRT: [%16lx] -> rc = %d\n", mftb(), rc);
+ }
+ return rc;
+}
+
+int host_services_occ_start(void)
+{
+ struct proc_chip *chip;
+ int i, rc = 0, nr_chips=0;
+ uint64_t chipids[MAX_CHIPS];
+
+ printf("HBRT: OCC Start requested\n");
+
+ if (!(hservice_runtime && hservice_runtime->startOCCs)) {
+ prerror("HBRT: No hservice_runtime->startOCCs\n");
+ return -ENOENT;
+ }
+
+ for_each_chip(chip) {
+ chipids[nr_chips++] = chip->id;
+ }
+
+ printf("HBRT: [%16lx] Calling startOCC() for IDs: ", mftb());
+ for (i = 0; i < nr_chips; i++)
+ printf("%04llx ", chipids[i]);
+ printf("\n");
+
+ /* Let's start all OCCs */
+ rc = hservice_runtime->startOCCs(chipids, nr_chips);
+ hservice_mark();
+ printf("HBRT: [%16lx] -> rc = %d\n", mftb(), rc);
+ return rc;
+}
+
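+/*
+ * Carve out the per-chip HOMER images and a single OCC common area
+ * (allocated once on the first chip and shared by every chip) for the
+ * loadOCC() calls above to populate.
+ */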
+void host_services_occ_base_setup(void)
+{
+ struct proc_chip *chip;
+ uint64_t occ_common;
+
+ chip = next_chip(NULL); /* First chip */
+ occ_common = (uint64_t) local_alloc(chip->id, OCC_COMMON_SIZE, OCC_COMMON_SIZE);
+
+ for_each_chip(chip) {
+ chip->occ_common_base = occ_common;
+ chip->occ_common_size = OCC_COMMON_SIZE;
+
+ chip->homer_base = (uint64_t) local_alloc(chip->id, HOMER_IMAGE_SIZE,
+ HOMER_IMAGE_SIZE);
+ chip->homer_size = HOMER_IMAGE_SIZE;
+ memset((void *)chip->homer_base, 0, chip->homer_size);
+
+ printf("HBRT: Chip %d HOMER base %016llx : %08llx "
+ "OCC common base %016llx : %08llx\n",
+ chip->id, chip->homer_base, chip->homer_size,
+ chip->occ_common_base, chip->occ_common_size);
+ }
+}
+
+bool hservices_init(void)
+{
+ void *code = NULL;
+ struct runtime_interfaces *(*hbrt_init)(struct host_interfaces *);
+
+ struct function_descriptor {
+ void *addr;
+ void *toc;
+ } fdesc;
+
+ code = (void *)hservice_get_reserved_mem("ibm,hbrt-code-image");
+ if (!code) {
+ prerror("HBRT: No ibm,hbrt-code-image found.\n");
+ return false;
+ }
+
+ if (memcmp(code, "HBRTVERS", 8) != 0) {
+ prerror("HBRT: Bad eyecatcher for ibm,hbrt-code-image!\n");
+ return false;
+ }
+
+ printf("HBRT: Found HostBoot Runtime version %llu\n", ((u64 *)code)[1]);
+
+ /* We enter at 0x100 into the image. */
+ fdesc.addr = code + 0x100;
+ /* It doesn't care about TOC */
+ fdesc.toc = 0;
+
+ hbrt_init = (void *)&fdesc;
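+ /* Calling through &fdesc treats it as an ELFv1 function descriptor:
+ * the branch target comes from fdesc.addr and r2 from fdesc.toc,
+ * which is how we land 0x100 into the HBRT image.
+ */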
+
+ hservice_runtime = hbrt_init(&hinterface);
+ hservice_mark();
+ if (!hservice_runtime) {
+ prerror("HBRT: Host services init failed\n");
+ return false;
+ }
+
+ printf("HBRT: Interface version %llu\n",
+ hservice_runtime->interface_version);
+
+ return true;
+}
diff --git a/core/init.c b/core/init.c
new file mode 100644
index 0000000..3d72ce5
--- /dev/null
+++ b/core/init.c
@@ -0,0 +1,687 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <fsp-sysparam.h>
+#include <psi.h>
+#include <memory.h>
+#include <chiptod.h>
+#include <nx.h>
+#include <cpu.h>
+#include <processor.h>
+#include <xscom.h>
+#include <device_tree.h>
+#include <opal.h>
+#include <opal-msg.h>
+#include <elf.h>
+#include <io.h>
+#include <cec.h>
+#include <device.h>
+#include <pci.h>
+#include <lpc.h>
+#include <chip.h>
+#include <interrupts.h>
+#include <mem_region.h>
+#include <trace.h>
+#include <console.h>
+#include <fsi-master.h>
+#include <centaur.h>
+#include <libfdt/libfdt.h>
+#include <hostservices.h>
+
+/*
+ * Boot semaphore, incremented by each CPU calling in
+ *
+ * Forced into data section as it will be used before BSS is initialized
+ */
+enum ipl_state ipl_state = ipl_initial;
+enum proc_gen proc_gen;
+
+static uint64_t kernel_entry;
+static bool kernel_32bit;
+static void *fdt;
+
+struct debug_descriptor debug_descriptor = {
+ .eye_catcher = "OPALdbug",
+ .version = DEBUG_DESC_VERSION,
+ .memcons_phys = (uint64_t)&memcons,
+ .trace_mask = 0, /* All traces disabled by default */
+};
+
+static bool try_load_elf64_le(struct elf_hdr *header)
+{
+ struct elf64_hdr *kh = (struct elf64_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf64_phdr *ph;
+ unsigned int i;
+
+ printf("INIT: 64-bit LE kernel discovered\n");
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+ * obey the load information in the headers. This is expected
+ * to work for the Linux Kernel because it's a fairly dumb ELF
+ * but it will not work for arbitrary ELF binaries.
+ */
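+ /* The entry point is given as a virtual address; once we find the
+ * segment containing it we translate it to a file offset
+ * (e_entry - p_vaddr + p_offset) and add load_base, where the image
+ * actually sits in memory.
+ */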
+ ph = (struct elf64_phdr *)(load_base + le64_to_cpu(kh->e_phoff));
+ for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
+ if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
+ continue;
+ if (le64_to_cpu(ph->p_vaddr) > le64_to_cpu(kh->e_entry) ||
+ (le64_to_cpu(ph->p_vaddr) + le64_to_cpu(ph->p_memsz)) <
+ le64_to_cpu(kh->e_entry))
+ continue;
+
+ /* Get our entry */
+ kernel_entry = le64_to_cpu(kh->e_entry) -
+ le64_to_cpu(ph->p_vaddr) + le64_to_cpu(ph->p_offset);
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+ kernel_entry += load_base;
+ kernel_32bit = false;
+
+ printf("INIT: 64-bit kernel entry at 0x%llx\n", kernel_entry);
+
+ return true;
+}
+
+static bool try_load_elf64(struct elf_hdr *header)
+{
+ struct elf64_hdr *kh = (struct elf64_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf64_phdr *ph;
+ unsigned int i;
+
+ /* Check it's a ppc64 LE ELF */
+ if (kh->ei_ident == ELF_IDENT &&
+ kh->ei_data == ELF_DATA_LSB &&
+ kh->e_machine == le16_to_cpu(ELF_MACH_PPC64)) {
+ return try_load_elf64_le(header);
+ }
+
+ /* Check it's a ppc64 ELF */
+ if (kh->ei_ident != ELF_IDENT ||
+ kh->ei_data != ELF_DATA_MSB ||
+ kh->e_machine != ELF_MACH_PPC64) {
+ prerror("INIT: Kernel doesn't look like an ppc64 ELF\n");
+ return false;
+ }
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+ * obey the load information in the headers. This is expected
+ * to work for the Linux Kernel because it's a fairly dumb ELF
+ * but it will not work for arbitrary ELF binaries.
+ */
+ ph = (struct elf64_phdr *)(load_base + kh->e_phoff);
+ for (i = 0; i < kh->e_phnum; i++, ph++) {
+ if (ph->p_type != ELF_PTYPE_LOAD)
+ continue;
+ if (ph->p_vaddr > kh->e_entry ||
+ (ph->p_vaddr + ph->p_memsz) < kh->e_entry)
+ continue;
+
+ /* Get our entry */
+ kernel_entry = kh->e_entry - ph->p_vaddr + ph->p_offset;
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+ kernel_entry += load_base;
+ kernel_32bit = false;
+
+ printf("INIT: 64-bit kernel entry at 0x%llx\n", kernel_entry);
+
+ return true;
+}
+
+static bool try_load_elf32_le(struct elf_hdr *header)
+{
+ struct elf32_hdr *kh = (struct elf32_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf32_phdr *ph;
+ unsigned int i;
+
+ printf("INIT: 32-bit LE kernel discovered\n");
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+ * obey the load information in the headers. This is expected
+ * to work for the Linux Kernel because it's a fairly dumb ELF
+ * but it will not work for arbitrary ELF binaries.
+ */
+ ph = (struct elf32_phdr *)(load_base + le32_to_cpu(kh->e_phoff));
+ for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
+ if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
+ continue;
+ if (le32_to_cpu(ph->p_vaddr) > le32_to_cpu(kh->e_entry) ||
+ (le32_to_cpu(ph->p_vaddr) + le32_to_cpu(ph->p_memsz)) <
+ le32_to_cpu(kh->e_entry))
+ continue;
+
+ /* Get our entry */
+ kernel_entry = le32_to_cpu(kh->e_entry) -
+ le32_to_cpu(ph->p_vaddr) + le32_to_cpu(ph->p_offset);
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+
+ kernel_entry += load_base;
+ kernel_32bit = true;
+
+ printf("INIT: 32-bit kernel entry at 0x%llx\n", kernel_entry);
+
+ return true;
+}
+
+static bool try_load_elf32(struct elf_hdr *header)
+{
+ struct elf32_hdr *kh = (struct elf32_hdr *)header;
+ uint64_t load_base = (uint64_t)kh;
+ struct elf32_phdr *ph;
+ unsigned int i;
+
+ /* Check it's a ppc32 LE ELF */
+ if (header->ei_ident == ELF_IDENT &&
+ header->ei_data == ELF_DATA_LSB &&
+ header->e_machine == le16_to_cpu(ELF_MACH_PPC32)) {
+ return try_load_elf32_le(header);
+ }
+
+ /* Check it's a ppc32 ELF */
+ if (header->ei_ident != ELF_IDENT ||
+ header->ei_data != ELF_DATA_MSB ||
+ header->e_machine != ELF_MACH_PPC32) {
+ prerror("INIT: Kernel doesn't look like an ppc32 ELF\n");
+ return false;
+ }
+
+ /* Look for a loadable program header that has our entry in it
+ *
+ * Note that we execute the kernel in-place, we don't actually
+ * obey the load information in the headers. This is expected
+ * to work for the Linux Kernel because it's a fairly dumb ELF
+ * but it will not work for arbitrary ELF binaries.
+ */
+ ph = (struct elf32_phdr *)(load_base + kh->e_phoff);
+ for (i = 0; i < kh->e_phnum; i++, ph++) {
+ if (ph->p_type != ELF_PTYPE_LOAD)
+ continue;
+ if (ph->p_vaddr > kh->e_entry ||
+ (ph->p_vaddr + ph->p_memsz) < kh->e_entry)
+ continue;
+
+ /* Get our entry */
+ kernel_entry = kh->e_entry - ph->p_vaddr + ph->p_offset;
+ break;
+ }
+
+ if (!kernel_entry) {
+ prerror("INIT: Failed to find kernel entry !\n");
+ return false;
+ }
+
+ kernel_entry += load_base;
+ kernel_32bit = true;
+
+ printf("INIT: 32-bit kernel entry at 0x%llx\n", kernel_entry);
+
+ return true;
+}
+
+/* LID numbers. For now we hijack some of pHyp's own until I figure
+ * out the whole business with the MasterLID
+ */
+#define KERNEL_LID_PHYP 0x80a00701
+#define KERNEL_LID_OPAL 0x80f00101
+
+extern char __builtin_kernel_start[];
+extern char __builtin_kernel_end[];
+extern uint64_t boot_offset;
+
+static bool load_kernel(void)
+{
+ struct elf_hdr *kh;
+ uint32_t lid;
+ size_t ksize;
+ const char *ltype;
+
+ ltype = dt_prop_get_def(dt_root, "lid-type", NULL);
+
+ /* No lid-type, assume stradale, currently pre-loaded at fixed
+ * address
+ */
+ if (!ltype) {
+ printf("No lid-type property, assuming FSP-less setup\n");
+ ksize = __builtin_kernel_end - __builtin_kernel_start;
+ if (ksize) {
+ /* Move the built-in kernel up */
+ uint64_t builtin_base =
+ ((uint64_t)__builtin_kernel_start) -
+ SKIBOOT_BASE + boot_offset;
+ printf("Using built-in kernel\n");
+ memmove(KERNEL_LOAD_BASE, (void*)builtin_base, ksize);
+ } else
+ printf("Assuming kernel at 0x%p\n", KERNEL_LOAD_BASE);
+ } else {
+ ksize = KERNEL_LOAD_SIZE;
+
+ /* First try to load an OPAL secondary LID always */
+ lid = fsp_adjust_lid_side(KERNEL_LID_OPAL);
+ printf("Trying to load OPAL secondary LID...\n");
+ if (fsp_fetch_data(0, FSP_DATASET_NONSP_LID, lid, 0,
+ KERNEL_LOAD_BASE, &ksize) != 0) {
+ if (!strcmp(ltype, "opal")) {
+ prerror("Failed to load in OPAL mode...\n");
+ return false;
+ }
+ printf("Trying to load as PHYP LID...\n");
+ lid = fsp_adjust_lid_side(KERNEL_LID_PHYP);
+ ksize = KERNEL_LOAD_SIZE;
+ if (fsp_fetch_data(0, FSP_DATASET_NONSP_LID, lid, 0,
+ KERNEL_LOAD_BASE, &ksize) != 0) {
+ prerror("Failed to load kernel\n");
+ return false;
+ }
+ }
+ }
+
+ printf("INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
+ ksize);
+
+ kh = (struct elf_hdr *)KERNEL_LOAD_BASE;
+ if (kh->ei_class == ELF_CLASS_64)
+ return try_load_elf64(kh);
+ else if (kh->ei_class == ELF_CLASS_32)
+ return try_load_elf32(kh);
+
+	printf("INIT: Neither ELF32 nor ELF64 ?\n");
+ return false;
+}
+
+void __noreturn load_and_boot_kernel(bool is_reboot)
+{
+ const struct dt_property *memprop;
+ uint64_t mem_top;
+
+ memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
+ if (memprop)
+ mem_top = (u64)dt_property_get_cell(memprop, 0) << 32
+ | dt_property_get_cell(memprop, 1);
+ else /* XXX HB hack, might want to calc it */
+ mem_top = 0x40000000;
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x000A);
+
+ /* Load kernel LID */
+ if (!load_kernel()) {
+ op_display(OP_FATAL, OP_MOD_INIT, 1);
+ abort();
+ }
+
+ if (!is_reboot) {
+ /* We wait for the nvram read to complete here so we can
+ * grab stuff from there such as the kernel arguments
+ */
+ fsp_nvram_wait_open();
+
+ /* Wait for FW VPD data read to complete */
+ fsp_code_update_wait_vpd(true);
+ }
+ fsp_console_select_stdout();
+
+ /*
+	 * OCC takes a few seconds to boot. Call this as late
+	 * as possible to avoid delay.
+ */
+ occ_pstates_init();
+
+ /* Set kernel command line argument if specified */
+#ifdef KERNEL_COMMAND_LINE
+ dt_add_property_string(dt_chosen, "bootargs", KERNEL_COMMAND_LINE);
+#endif
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x000B);
+
+ /* Create the device tree blob to boot OS. */
+ fdt = create_dtb(dt_root);
+ if (!fdt) {
+ op_display(OP_FATAL, OP_MOD_INIT, 2);
+ abort();
+ }
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x000C);
+
+ /* Start the kernel */
+ if (!is_reboot)
+ op_panel_disable_src_echo();
+
+ /* Clear SRCs on the op-panel when Linux starts */
+ op_panel_clear_src();
+
+ cpu_give_self_os();
+
+ printf("INIT: Starting kernel at 0x%llx, fdt at %p (size 0x%x)\n",
+ kernel_entry, fdt, fdt_totalsize(fdt));
+
+ fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
+ if (kernel_32bit)
+ start_kernel32(kernel_entry, fdt, mem_top);
+ start_kernel(kernel_entry, fdt, mem_top);
+}
+
+static void dt_fixups(void)
+{
+ struct dt_node *n;
+ struct dt_node *primary_lpc = NULL;
+
+ /* lpc node missing #address/size cells. Also pick one as
+ * primary for now (TBD: How to convey that from HB)
+ */
+ dt_for_each_compatible(dt_root, n, "ibm,power8-lpc") {
+ if (!primary_lpc || dt_has_node_property(n, "primary", NULL))
+ primary_lpc = n;
+ if (dt_has_node_property(n, "#address-cells", NULL))
+ break;
+ dt_add_property_cells(n, "#address-cells", 2);
+ dt_add_property_cells(n, "#size-cells", 1);
+ dt_add_property_strings(n, "status", "ok");
+ }
+
+ /* Missing "primary" property in LPC bus */
+ if (primary_lpc && !dt_has_node_property(primary_lpc, "primary", NULL))
+ dt_add_property(primary_lpc, "primary", NULL, 0);
+
+ /* Missing "scom-controller" */
+ dt_for_each_compatible(dt_root, n, "ibm,xscom") {
+ if (!dt_has_node_property(n, "scom-controller", NULL))
+ dt_add_property(n, "scom-controller", NULL, 0);
+ }
+}
+
+static void add_arch_vector(void)
+{
+ /**
+ * vec5 = a PVR-list : Number-of-option-vectors :
+ * option-vectors[Number-of-option-vectors + 1]
+ */
+ uint8_t vec5[] = {0x05, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00};
+
+ if (dt_has_node_property(dt_chosen, "ibm,architecture-vec-5", NULL))
+ return;
+
+ dt_add_property(dt_chosen, "ibm,architecture-vec-5",
+ vec5, sizeof(vec5));
+}
+
+static void dt_init_misc(void)
+{
+ /* Check if there's a /chosen node, if not, add one */
+ dt_chosen = dt_find_by_path(dt_root, "/chosen");
+ if (!dt_chosen)
+ dt_chosen = dt_new(dt_root, "chosen");
+ assert(dt_chosen);
+
+ /* Add IBM architecture vectors if needed */
+ add_arch_vector();
+
+	/* Add the OPAL virtual ICS node */
+ add_ics_node();
+
+ /* Additional fixups. TODO: Move into platform */
+ dt_fixups();
+}
+
+/* Called from head.S, thus no prototype. */
+void main_cpu_entry(const void *fdt, u32 master_cpu);
+
+void __noreturn main_cpu_entry(const void *fdt, u32 master_cpu)
+{
+ /*
+	 * WARNING: At this point, the timebases have
+ * *not* been synchronized yet. Do not use any timebase
+ * related functions for timeouts etc... unless you can cope
+ * with the speed being some random core clock divider and
+ * the value jumping backward when the synchronization actually
+ * happens (in chiptod_init() below).
+ *
+ * Also the current cpu_thread() struct is not initialized
+	 * either, so we need to clear it out first thing (without
+	 * putting any other useful info in there just yet), otherwise
+	 * printf and locks are going to play funny games with "con_suspend"
+ */
+ pre_init_boot_cpu();
+
+ /*
+ * Before first printk, ensure console buffer is clear or
+ * reading tools might think it has wrapped
+ */
+ clear_console();
+
+ printf("SkiBoot %s starting...\n", gitid);
+
+ /* Initialize boot cpu's cpu_thread struct */
+ init_boot_cpu();
+
+ /* Now locks can be used */
+ init_locks();
+
+ /* Create the OPAL call table early on, entries can be overridden
+ * later on (FSP console code for example)
+ */
+ opal_table_init();
+
+ /*
+ * If we are coming in with a flat device-tree, we expand it
+ * now. Else look for HDAT and create a device-tree from them
+ *
+ * Hack alert: When entering via the OPAL entry point, fdt
+ * is set to -1, we record that and pass it to parse_hdat
+ */
+ if (fdt == (void *)-1ul)
+ parse_hdat(true, master_cpu);
+ else if (fdt == NULL)
+ parse_hdat(false, master_cpu);
+ else {
+ dt_expand(fdt);
+ }
+
+ /*
+ * From there, we follow a fairly strict initialization order.
+ *
+ * First we need to build up our chip data structures and initialize
+	 * XSCOM which will be needed for a number of subsequent things.
+ *
+ * We want XSCOM available as early as the platform probe in case the
+ * probe requires some HW accesses.
+ *
+ * We also initialize the FSI master at that point in case we need
+ * to access chips via that path early on.
+ */
+ init_chips();
+ xscom_init();
+ mfsi_init();
+
+ /*
+ * Put various bits & pieces in device-tree that might not
+ * already be there such as the /chosen node if not there yet,
+ * the ICS node, etc... This can potentially use XSCOM
+ */
+ dt_init_misc();
+
+ /*
+ * Initialize LPC (P8 only) so we can get to UART, BMC and
+ * other system controller. This is done before probe_platform
+ * so that the platform probing code can access an external
+ * BMC if needed.
+ */
+ lpc_init();
+
+ /*
+ * Now, we init our memory map from the device-tree, and immediately
+ * reserve areas which we know might contain data coming from
+ * HostBoot. We need to do these things before we start doing
+ * allocations outside of our heap, such as chip local allocs,
+ * otherwise we might clobber those data.
+ */
+ mem_region_init();
+
+ /* Reserve HOMER and OCC area */
+ homer_init();
+
+ /* Initialize host services. */
+ hservices_init();
+
+ /*
+ * We probe the platform now. This means the platform probe gets
+ * the opportunity to reserve additional areas of memory if needed.
+ *
+ * Note: Timebases still not synchronized.
+ */
+ probe_platform();
+
+ /* Initialize the rest of the cpu thread structs */
+ init_all_cpus();
+
+ /* Add the /opal node to the device-tree */
+ add_opal_node();
+
+	/* Allocate our split trace buffers now. Depends on add_opal_node() */
+ init_trace_buffers();
+
+ /* Get the ICPs and make sure they are in a sane state */
+ init_interrupts();
+
+ /* Grab centaurs from device-tree if present (only on FSP-less) */
+ centaur_init();
+
+ /* Initialize PSI (depends on probe_platform being called) */
+ psi_init();
+
+ /* Call in secondary CPUs */
+ cpu_bringup();
+
+ /*
+	 * Synchronize timebases. This resets all the TB values to a small
+	 * value (so they appear to go backward at this point), and synchronizes
+	 * all core timebases to the global ChipTOD network
+ */
+ chiptod_init(master_cpu);
+
+ /*
+ * We have initialized the basic HW, we can now call into the
+ * platform to perform subsequent inits, such as establishing
+ * communication with the FSP.
+ */
+ if (platform.init)
+ platform.init();
+
+ /* Init SLW related stuff, including fastsleep */
+ slw_init();
+
+ op_display(OP_LOG, OP_MOD_INIT, 0x0002);
+
+ /* Read in NVRAM and set it up */
+ nvram_init();
+
+ /* NX init */
+ nx_init();
+
+ /* Initialize the opal messaging */
+ opal_init_msg();
+
+ /* Probe IO hubs */
+ probe_p5ioc2();
+ probe_p7ioc();
+
+ /* Probe PHB3 on P8 */
+ probe_phb3();
+
+ /* Initialize PCI */
+ pci_init_slots();
+
+ /*
+ * These last few things must be done as late as possible
+	 * because they rely on various other things having been set up,
+ * for example, add_opal_interrupts() will add all the interrupt
+ * sources that are going to the firmware. We can't add a new one
+	 * after that call. Similarly, the mem_region calls will construct
+ * the reserve maps in the DT so we shouldn't affect the memory
+ * regions after that
+ */
+
+ /* Add the list of interrupts going to OPAL */
+ add_opal_interrupts();
+
+ /* Now release parts of memory nodes we haven't used ourselves... */
+ mem_region_release_unused();
+
+ /* ... and add remaining reservations to the DT */
+ mem_region_add_dt_reserved();
+
+ load_and_boot_kernel(false);
+}
+
+void __noreturn __secondary_cpu_entry(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ /* Secondary CPU called in */
+ cpu_callin(cpu);
+
+ /* Wait for work to do */
+ while(true) {
+ int i;
+
+ /* Process pending jobs on this processor */
+ cpu_process_jobs();
+
+ /* Relax a bit to give the simulator some breathing space */
+ i = 1000;
+ while (--i)
+ smt_very_low();
+ smt_low();
+ }
+}
+
+/* Called from head.S, thus no prototype. */
+void secondary_cpu_entry(void);
+
+void __noreturn secondary_cpu_entry(void)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ printf("INIT: CPU PIR 0x%04x called in\n", cpu->pir);
+
+ __secondary_cpu_entry();
+}
+
diff --git a/core/interrupts.c b/core/interrupts.c
new file mode 100644
index 0000000..cabebc2
--- /dev/null
+++ b/core/interrupts.c
@@ -0,0 +1,332 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <cpu.h>
+#include <fsp.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <io.h>
+#include <cec.h>
+#include <device.h>
+#include <ccan/str/str.h>
+
+/* ICP registers */
+#define ICP_XIRR 0x4 /* 32-bit access */
+#define ICP_CPPR 0x4 /* 8-bit access */
+#define ICP_MFRR 0xc /* 8-bit access */
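+
+/* Note on the XICS presentation controller (ICP) registers used below:
+ * a 32-bit XIRR value packs the CPPR in its top byte and the interrupt
+ * source (XISR) in the low 24 bits, so a single store both sets the
+ * priority mask and EOIs that source. MFRR holds the IPI priority:
+ * writing 0xff (least favored) clears a pending IPI, writing 0 sends one.
+ */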
+
+struct irq_source {
+ uint32_t start;
+ uint32_t end;
+ const struct irq_source_ops *ops;
+ void *data;
+ struct list_node link;
+};
+
+static LIST_HEAD(irq_sources);
+static struct lock irq_lock = LOCK_UNLOCKED;
+
+void register_irq_source(const struct irq_source_ops *ops, void *data,
+ uint32_t start, uint32_t count)
+{
+ struct irq_source *is, *is1;
+
+ is = zalloc(sizeof(struct irq_source));
+ assert(is);
+ is->start = start;
+ is->end = start + count;
+ is->ops = ops;
+ is->data = data;
+
+ printf("IRQ: Registering %04x..%04x ops @%p (data %p) %s\n",
+ start, start + count - 1, ops, data,
+ ops->interrupt ? "[Internal]" : "[OS]");
+
+ lock(&irq_lock);
+ list_for_each(&irq_sources, is1, link) {
+ if (is->end > is1->start && is->start < is1->end) {
+ prerror("register IRQ source overlap !\n");
+ prerror(" new: %x..%x old: %x..%x\n",
+ is->start, is->end - 1,
+ is1->start, is1->end - 1);
+ assert(0);
+ }
+ }
+ list_add_tail(&irq_sources, &is->link);
+ unlock(&irq_lock);
+}
+
+void unregister_irq_source(uint32_t start, uint32_t count)
+{
+ struct irq_source *is;
+
+ lock(&irq_lock);
+ list_for_each(&irq_sources, is, link) {
+ if (start >= is->start && start < is->end) {
+ if (start != is->start ||
+ count != (is->end - is->start)) {
+ prerror("unregister IRQ source mismatch !\n");
+ prerror("start:%x, count: %x match: %x..%x\n",
+ start, count, is->start, is->end);
+ assert(0);
+ }
+ list_del(&is->link);
+ unlock(&irq_lock);
+ /* XXX Add synchronize / RCU */
+ free(is);
+ return;
+ }
+ }
+ unlock(&irq_lock);
+ prerror("unregister IRQ source not found !\n");
+ prerror("start:%x, count: %x\n", start, count);
+ assert(0);
+}
+
+/*
+ * This takes a 6-bit chip id and returns a 20 bit value representing
+ * the PSI interrupt. This includes all the fields above, i.e., it is a
+ * global interrupt number.
+ *
+ * For P8, this returns the base of the 8-interrupts block for PSI
+ */
+uint32_t get_psi_interrupt(uint32_t chip_id)
+{
+ uint32_t irq;
+
+ switch(proc_gen) {
+ case proc_gen_p7:
+ /* Get the chip ID into position, it already has
+ * the T bit so all we need is room for the GX
+ * bit, 9 bit BUID and 4 bit level
+ */
+ irq = chip_id << (1 + 9 + 4);
+
+ /* Add in the BUID */
+ irq |= P7_PSI_IRQ_BUID << 4;
+ break;
+ case proc_gen_p8:
+ irq = P8_CHIP_IRQ_BLOCK_BASE(chip_id, P8_IRQ_BLOCK_MISC);
+ irq += P8_IRQ_MISC_PSI_BASE;
+ break;
+ default:
+ assert(false);
+ };
+
+ return irq;
+}
+
+
+struct dt_node *add_ics_node(void)
+{
+ struct dt_node *ics = dt_new_addr(dt_root, "interrupt-controller", 0);
+ if (!ics)
+ return NULL;
+
+ dt_add_property_cells(ics, "reg", 0, 0, 0, 0);
+ dt_add_property_strings(ics, "compatible", "IBM,ppc-xics",
+ "IBM,opal-xics");
+ dt_add_property_cells(ics, "#address-cells", 0);
+ dt_add_property_cells(ics, "#interrupt-cells", 1);
+ dt_add_property_string(ics, "device_type",
+ "PowerPC-Interrupt-Source-Controller");
+ dt_add_property(ics, "interrupt-controller", NULL, 0);
+
+ return ics;
+}
+
+uint32_t get_ics_phandle(void)
+{
+ struct dt_node *i;
+
+ for (i = dt_first(dt_root); i; i = dt_next(dt_root, i)) {
+ if (streq(i->name, "interrupt-controller@0")) {
+ return i->phandle;
+ }
+ }
+ abort();
+}
+
+void add_opal_interrupts(void)
+{
+ struct irq_source *is;
+ unsigned int i, count = 0;
+ uint32_t *irqs = NULL, isn;
+
+ lock(&irq_lock);
+ list_for_each(&irq_sources, is, link) {
+ /*
+ * Add a source to opal-interrupts if it has an
+ * ->interrupt callback
+ */
+ if (!is->ops->interrupt)
+ continue;
+ for (isn = is->start; isn < is->end; isn++) {
+ i = count++;
+ irqs = realloc(irqs, 4 * count);
+ irqs[i] = isn;
+ }
+ }
+ unlock(&irq_lock);
+
+ /* The opal-interrupts property has one cell per interrupt,
+ * it is not a standard interrupt property
+ */
+ if (irqs)
+ dt_add_property(opal_node, "opal-interrupts", irqs, count * 4);
+}
+
+/*
+ * This is called at init time (and on fast reboot) to sanitize the
+ * ICP. We set our priority to 0 to mask all interrupts and make sure
+ * no IPI is on the way.
+ */
+void reset_cpu_icp(void)
+{
+ void *icp = this_cpu()->icp_regs;
+
+ assert(icp);
+
+ /* Clear pending IPIs */
+ out_8(icp + ICP_MFRR, 0xff);
+
+ /* Set priority to max, ignore all incoming interrupts, EOI IPIs */
+ out_be32(icp + ICP_XIRR, 2);
+}
+
+/* Used by the PSI code to send an EOI during reset. This will also
+ * set the CPPR to 0 which should already be the case anyway
+ */
+void icp_send_eoi(uint32_t interrupt)
+{
+ void *icp = this_cpu()->icp_regs;
+
+ assert(icp);
+
+	/* CPPR byte of 0 masks all incoming interrupts; the low 24 bits EOI
+	 * the given source
+	 */
+ out_be32(icp + ICP_XIRR, interrupt & 0xffffff);
+}
+
+/* This is called before winkle, we clear pending IPIs and set our priority
+ * to 1 to mask all but the IPI
+ */
+void icp_prep_for_rvwinkle(void)
+{
+ void *icp = this_cpu()->icp_regs;
+
+ assert(icp);
+
+ /* Clear pending IPIs */
+ out_8(icp + ICP_MFRR, 0xff);
+
+ /* Set priority to 1, ignore all incoming interrupts, EOI IPIs */
+ out_be32(icp + ICP_XIRR, 0x01000002);
+}
+
+/* This is called to wakeup somebody from winkle */
+void icp_kick_cpu(struct cpu_thread *cpu)
+{
+ void *icp = cpu->icp_regs;
+
+ assert(icp);
+
+ /* Send high priority IPI */
+ out_8(icp + ICP_MFRR, 0);
+}
+
+static struct irq_source *irq_find_source(uint32_t isn)
+{
+ struct irq_source *is;
+
+ lock(&irq_lock);
+ list_for_each(&irq_sources, is, link) {
+ if (isn >= is->start && isn < is->end) {
+ unlock(&irq_lock);
+ return is;
+ }
+ }
+ unlock(&irq_lock);
+
+ return NULL;
+}
+
+static int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority)
+{
+ struct irq_source *is = irq_find_source(isn);
+
+ if (!is || !is->ops->set_xive)
+ return OPAL_PARAMETER;
+
+ return is->ops->set_xive(is->data, isn, server, priority);
+}
+opal_call(OPAL_SET_XIVE, opal_set_xive, 3);
+
+static int64_t opal_get_xive(uint32_t isn, uint16_t *server, uint8_t *priority)
+{
+ struct irq_source *is = irq_find_source(isn);
+
+ if (!is || !is->ops->get_xive)
+ return OPAL_PARAMETER;
+
+ return is->ops->get_xive(is->data, isn, server, priority);
+}
+opal_call(OPAL_GET_XIVE, opal_get_xive, 3);
+
+static int64_t opal_handle_interrupt(uint32_t isn, uint64_t *outstanding_event_mask)
+{
+ struct irq_source *is = irq_find_source(isn);
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!is || !is->ops->interrupt) {
+ rc = OPAL_PARAMETER;
+ goto bail;
+ }
+
+ is->ops->interrupt(is->data, isn);
+
+ /* Update output events */
+ bail:
+ if (outstanding_event_mask)
+ *outstanding_event_mask = opal_pending_events;
+
+ return rc;
+}
+opal_call(OPAL_HANDLE_INTERRUPT, opal_handle_interrupt, 2);
+
+void init_interrupts(void)
+{
+ struct dt_node *icp;
+ const struct dt_property *sranges;
+ struct cpu_thread *cpu;
+ u32 base, count, i;
+ u64 addr, size;
+
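+	/* Each ibm,ppc-xicp node carries "ibm,interrupt-server-ranges"
+	 * (first server number, count) and one "reg" entry per server;
+	 * map each ICP MMIO block to the matching cpu_thread.
+	 */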
+ dt_for_each_compatible(dt_root, icp, "ibm,ppc-xicp") {
+ sranges = dt_require_property(icp,
+ "ibm,interrupt-server-ranges",
+ -1);
+ base = dt_get_number(sranges->prop, 1);
+ count = dt_get_number(sranges->prop + 4, 1);
+ for (i = 0; i < count; i++) {
+ addr = dt_get_address(icp, i, &size);
+ cpu = find_cpu_by_server(base + i);
+ if (cpu)
+ cpu->icp_regs = (void *)addr;
+ }
+ }
+}
+
diff --git a/core/lock.c b/core/lock.c
new file mode 100644
index 0000000..fc4bf6b
--- /dev/null
+++ b/core/lock.c
@@ -0,0 +1,125 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <lock.h>
+#include <assert.h>
+#include <processor.h>
+#include <cpu.h>
+#include <console.h>
+
+/* Set to bust locks. Note, this is initialized to true because our
+ * lock debugging code is not going to work until we have the per
+ * CPU data initialized
+ */
+bool bust_locks = true;
+
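+/* The lock word encodes the holder's PIR in its upper 32 bits and the
+ * "held" flag in bit 0 (set by __try_lock()); the debug checks and
+ * lock_recursive() below rely on that layout.
+ */
+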
+#ifdef DEBUG_LOCKS
+
+static void lock_error(struct lock *l, const char *reason, uint16_t err)
+{
+ op_display(OP_FATAL, OP_MOD_LOCK, err);
+
+ fprintf(stderr, "LOCK ERROR: %s @%p (state: 0x%016lx)\n",
+ reason, l, l->lock_val);
+ abort();
+}
+
+static void lock_check(struct lock *l)
+{
+ if ((l->lock_val & 1) && (l->lock_val >> 32) == this_cpu()->pir)
+ lock_error(l, "Invalid recursive lock", 0);
+}
+
+static void unlock_check(struct lock *l)
+{
+ if (!(l->lock_val & 1))
+ lock_error(l, "Unlocking unlocked lock", 1);
+
+ if ((l->lock_val >> 32) != this_cpu()->pir)
+ lock_error(l, "Unlocked non-owned lock", 2);
+
+ if (l->in_con_path && this_cpu()->con_suspend == 0)
+ lock_error(l, "Unlock con lock with console not suspended", 3);
+}
+
+#else
+static inline void lock_check(struct lock *l) { };
+static inline void unlock_check(struct lock *l) { };
+#endif /* DEBUG_LOCKS */
+
+
+bool try_lock(struct lock *l)
+{
+ if (__try_lock(l)) {
+ if (l->in_con_path)
+ this_cpu()->con_suspend++;
+ return true;
+ }
+ return false;
+}
+
+void lock(struct lock *l)
+{
+ if (bust_locks)
+ return;
+
+ lock_check(l);
+ for (;;) {
+ if (try_lock(l))
+ break;
+ smt_low();
+ }
+ smt_medium();
+}
+
+void unlock(struct lock *l)
+{
+ struct cpu_thread *cpu = this_cpu();
+
+ if (bust_locks)
+ return;
+
+ unlock_check(l);
+
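+	/* lwsync acts as a release barrier: everything done while holding
+	 * the lock is made visible before the lock word is cleared.
+	 */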
+ lwsync();
+ l->lock_val = 0;
+
+ if (l->in_con_path) {
+ cpu->con_suspend--;
+ if (cpu->con_suspend == 0 && cpu->con_need_flush)
+ flush_console();
+ }
+}
+
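+/* Take the lock unless this CPU already holds it. Returns true only if the
+ * lock was actually taken here, i.e. only if the caller must unlock it.
+ */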
+bool lock_recursive(struct lock *l)
+{
+ if (bust_locks)
+ return false;
+
+ if ((l->lock_val & 1) &&
+ (l->lock_val >> 32) == this_cpu()->pir)
+ return false;
+
+ lock(l);
+ return true;
+}
+
+
+void init_locks(void)
+{
+ bust_locks = false;
+}
diff --git a/core/malloc.c b/core/malloc.c
new file mode 100644
index 0000000..692a501
--- /dev/null
+++ b/core/malloc.c
@@ -0,0 +1,84 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Wrappers for malloc, et. al. */
+#include <mem_region.h>
+#include <lock.h>
+#include <string.h>
+#include <mem_region-malloc.h>
+
+#define DEFAULT_ALIGN __alignof__(long)
+
+void *__memalign(size_t blocksize, size_t bytes, const char *location)
+{
+ void *p;
+
+ lock(&mem_region_lock);
+ p = mem_alloc(&skiboot_heap, bytes, blocksize, location);
+ unlock(&mem_region_lock);
+
+ return p;
+}
+
+void *__malloc(size_t bytes, const char *location)
+{
+ return __memalign(DEFAULT_ALIGN, bytes, location);
+}
+
+void __free(void *p, const char *location)
+{
+ lock(&mem_region_lock);
+ mem_free(&skiboot_heap, p, location);
+ unlock(&mem_region_lock);
+}
+
+void *__realloc(void *ptr, size_t size, const char *location)
+{
+ void *newptr;
+
+ /* Two classic malloc corner cases. */
+ if (!size) {
+ __free(ptr, location);
+ return NULL;
+ }
+ if (!ptr)
+ return __malloc(size, location);
+
+ lock(&mem_region_lock);
+ if (mem_resize(&skiboot_heap, ptr, size, location)) {
+ newptr = ptr;
+ } else {
+ newptr = mem_alloc(&skiboot_heap, size, DEFAULT_ALIGN,
+ location);
+ if (newptr) {
+ size_t copy = mem_size(&skiboot_heap, ptr);
+ if (copy > size)
+ copy = size;
+ memcpy(newptr, ptr, copy);
+ mem_free(&skiboot_heap, ptr, location);
+ }
+ }
+ unlock(&mem_region_lock);
+ return newptr;
+}
+
+void *__zalloc(size_t bytes, const char *location)
+{
+ void *p = __malloc(bytes, location);
+
+ if (p)
+ memset(p, 0, bytes);
+ return p;
+}
diff --git a/core/mem_region.c b/core/mem_region.c
new file mode 100644
index 0000000..8904a18
--- /dev/null
+++ b/core/mem_region.c
@@ -0,0 +1,956 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <mem-map.h>
+#include <libfdt_env.h>
+#include <lock.h>
+#include <device.h>
+#include <cpu.h>
+#include <affinity.h>
+#include <types.h>
+#include <mem_region.h>
+#include <mem_region-malloc.h>
+
+/* Memory poisoning on free (if POISON_MEM_REGION set to 1) */
+#define POISON_MEM_REGION 0
+#define POISON_MEM_REGION_WITH 0x99
+#define POISON_MEM_REGION_LIMIT 1*1024*1024*1024
+
+struct lock mem_region_lock = LOCK_UNLOCKED;
+
+static struct list_head regions = LIST_HEAD_INIT(regions);
+
+static struct mem_region skiboot_os_reserve = {
+ .name = "ibm,os-reserve",
+ .start = 0,
+ .len = SKIBOOT_BASE,
+ .type = REGION_OS,
+};
+
+struct mem_region skiboot_heap = {
+ .name = "ibm,firmware-heap",
+ .start = HEAP_BASE,
+ .len = HEAP_SIZE,
+ .type = REGION_SKIBOOT_HEAP,
+};
+
+static struct mem_region skiboot_code_and_text = {
+ .name = "ibm,firmware-code",
+ .start = SKIBOOT_BASE,
+ .len = HEAP_BASE - SKIBOOT_BASE,
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+static struct mem_region skiboot_after_heap = {
+ .name = "ibm,firmware-data",
+ .start = HEAP_BASE + HEAP_SIZE,
+ .len = SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+static struct mem_region skiboot_cpu_stacks = {
+ .name = "ibm,firmware-stacks",
+ .start = CPU_STACKS_BASE,
+ .len = 0, /* TBA */
+ .type = REGION_SKIBOOT_FIRMWARE,
+};
+
+struct alloc_hdr {
+ bool free : 1;
+ bool prev_free : 1;
+ unsigned long num_longs : BITS_PER_LONG-2; /* Including header. */
+ const char *location;
+};
+
+struct free_hdr {
+ struct alloc_hdr hdr;
+ struct list_node list;
+ /* ... unsigned long tailer; */
+};
+
+#define ALLOC_HDR_LONGS (sizeof(struct alloc_hdr) / sizeof(long))
+#define ALLOC_MIN_LONGS (sizeof(struct free_hdr) / sizeof(long) + 1)
+
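+/* The heap is a boundary-tag allocator: every block starts with an
+ * alloc_hdr sized in longs, and free blocks also keep their size in their
+ * last long (the "tailer") so make_free() can walk backwards and coalesce
+ * with a free predecessor via the prev_free flag.
+ */
+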
+/* Avoid ugly casts. */
+static void *region_start(const struct mem_region *region)
+{
+ return (void *)(unsigned long)region->start;
+}
+
+/* Each free block has a tailer, so we can walk backwards. */
+static unsigned long *tailer(struct free_hdr *f)
+{
+ return (unsigned long *)f + f->hdr.num_longs - 1;
+}
+
+/* This walks forward to the next hdr (or NULL if at the end). */
+static struct alloc_hdr *next_hdr(const struct mem_region *region,
+ const struct alloc_hdr *hdr)
+{
+ void *next;
+
+ next = ((unsigned long *)hdr + hdr->num_longs);
+ if (next >= region_start(region) + region->len)
+ next = NULL;
+ return next;
+}
+
+/* Creates free block covering entire region. */
+static void init_allocatable_region(struct mem_region *region)
+{
+ struct free_hdr *f = region_start(region);
+ assert(region->type == REGION_SKIBOOT_HEAP);
+ f->hdr.num_longs = region->len / sizeof(long);
+ f->hdr.free = true;
+ f->hdr.prev_free = false;
+ *tailer(f) = f->hdr.num_longs;
+ list_head_init(&region->free_list);
+ list_add(&region->free_list, &f->list);
+}
+
+static void make_free(struct mem_region *region, struct free_hdr *f,
+ const char *location)
+{
+ struct alloc_hdr *next;
+#if POISON_MEM_REGION == 1
+	size_t poison_size = (void *)tailer(f) - (void *)(f + 1);
+
+ /* We only poison up to a limit, as otherwise boot is kinda slow */
+ if (poison_size > POISON_MEM_REGION_LIMIT) {
+ poison_size = POISON_MEM_REGION_LIMIT;
+ }
+
+ memset(f+1, POISON_MEM_REGION_WITH, poison_size);
+#endif
+ if (f->hdr.prev_free) {
+ struct free_hdr *prev;
+ unsigned long *prev_tailer = (unsigned long *)f - 1;
+
+ assert(*prev_tailer);
+ prev = (void *)((unsigned long *)f - *prev_tailer);
+ assert(prev->hdr.free);
+ assert(!prev->hdr.prev_free);
+
+ /* Expand to cover the one we just freed. */
+ prev->hdr.num_longs += f->hdr.num_longs;
+ f = prev;
+ } else {
+ f->hdr.free = true;
+ f->hdr.location = location;
+ list_add(&region->free_list, &f->list);
+ }
+
+ /* Fix up tailer. */
+ *tailer(f) = f->hdr.num_longs;
+
+ /* If next is free, coalesce it */
+ next = next_hdr(region, &f->hdr);
+ if (next) {
+ next->prev_free = true;
+ if (next->free) {
+ struct free_hdr *next_free = (void *)next;
+ list_del_from(&region->free_list, &next_free->list);
+ /* Maximum of one level of recursion */
+ make_free(region, next_free, location);
+ }
+ }
+}
+
+/* Can we fit this many longs with this alignment in this free block? */
+static bool fits(struct free_hdr *f, size_t longs, size_t align, size_t *offset)
+{
+ *offset = 0;
+
+ while (f->hdr.num_longs >= *offset + longs) {
+ size_t addr;
+
+ addr = (unsigned long)f
+ + (*offset + ALLOC_HDR_LONGS) * sizeof(long);
+ if ((addr & (align - 1)) == 0)
+ return true;
+
+ /* Don't make tiny chunks! */
+ if (*offset == 0)
+ *offset = ALLOC_MIN_LONGS;
+ else
+ (*offset)++;
+ }
+ return false;
+}
+
+static void discard_excess(struct mem_region *region,
+ struct alloc_hdr *hdr, size_t alloc_longs,
+ const char *location)
+{
+ /* Do we have excess? */
+ if (hdr->num_longs > alloc_longs + ALLOC_MIN_LONGS) {
+ struct free_hdr *post;
+
+ /* Set up post block. */
+ post = (void *)hdr + alloc_longs * sizeof(long);
+ post->hdr.num_longs = hdr->num_longs - alloc_longs;
+ post->hdr.prev_free = false;
+
+ /* Trim our block. */
+ hdr->num_longs = alloc_longs;
+
+ /* This coalesces as required. */
+ make_free(region, post, location);
+ }
+}
+
+static const char *hdr_location(const struct alloc_hdr *hdr)
+{
+ /* Corrupt: step carefully! */
+ if (is_rodata(hdr->location))
+ return hdr->location;
+ return "*CORRUPT*";
+}
+
+static void bad_header(const struct mem_region *region,
+ const struct alloc_hdr *hdr,
+ const char *during,
+ const char *location)
+{
+ /* Corrupt: step carefully! */
+ if (is_rodata(hdr->location))
+ prerror("%p (in %s) %s at %s, previously %s\n",
+ hdr-1, region->name, during, location, hdr->location);
+ else
+ prerror("%p (in %s) %s at %s, previously %p\n",
+ hdr-1, region->name, during, location, hdr->location);
+ abort();
+}
+
+static bool region_is_reserved(struct mem_region *region)
+{
+ return region->type != REGION_OS;
+}
+
+static void mem_dump_allocs(void)
+{
+ struct mem_region *region;
+ struct alloc_hdr *hdr;
+
+	/* Walk each heap region and print its live allocations */
+ printf("Memory regions:\n");
+ list_for_each(&regions, region, list) {
+ if (region->type != REGION_SKIBOOT_HEAP)
+ continue;
+ printf(" 0x%012llx..%012llx : %s\n",
+ (long long)region->start,
+ (long long)(region->start + region->len - 1),
+ region->name);
+ if (region->free_list.n.next == NULL) {
+ printf(" no allocs\n");
+ continue;
+ }
+ for (hdr = region_start(region); hdr; hdr = next_hdr(region, hdr)) {
+ if (hdr->free)
+ continue;
+ printf(" 0x%.8lx %s\n", hdr->num_longs * sizeof(long),
+ hdr_location(hdr));
+ }
+ }
+}
+
+static void *__mem_alloc(struct mem_region *region, size_t size, size_t align,
+ const char *location)
+{
+ size_t alloc_longs, offset;
+ struct free_hdr *f;
+ struct alloc_hdr *next;
+
+ /* Align must be power of 2. */
+ assert(!((align - 1) & align));
+
+ /* This should be a constant. */
+ assert(is_rodata(location));
+
+ /* Unallocatable region? */
+ if (region->type != REGION_SKIBOOT_HEAP)
+ return NULL;
+
+ /* First allocation? */
+ if (region->free_list.n.next == NULL)
+ init_allocatable_region(region);
+
+ /* Don't do screwy sizes. */
+ if (size > region->len)
+ return NULL;
+
+ /* Don't do tiny alignments, we deal in long increments. */
+ if (align < sizeof(long))
+ align = sizeof(long);
+
+ /* Convert size to number of longs, too. */
+ alloc_longs = (size + sizeof(long)-1) / sizeof(long) + ALLOC_HDR_LONGS;
+
+ /* Can't be too small for when we free it, either. */
+ if (alloc_longs < ALLOC_MIN_LONGS)
+ alloc_longs = ALLOC_MIN_LONGS;
+
+ /* Walk free list. */
+ list_for_each(&region->free_list, f, list) {
+ /* We may have to skip some to meet alignment. */
+ if (fits(f, alloc_longs, align, &offset))
+ goto found;
+ }
+
+ return NULL;
+
+found:
+ assert(f->hdr.free);
+ assert(!f->hdr.prev_free);
+
+ /* This block is no longer free. */
+ list_del_from(&region->free_list, &f->list);
+ f->hdr.free = false;
+ f->hdr.location = location;
+
+ next = next_hdr(region, &f->hdr);
+ if (next) {
+ assert(next->prev_free);
+ next->prev_free = false;
+ }
+
+ if (offset != 0) {
+ struct free_hdr *pre = f;
+
+ f = (void *)f + offset * sizeof(long);
+ assert(f >= pre + 1);
+
+ /* Set up new header. */
+ f->hdr.num_longs = pre->hdr.num_longs - offset;
+ /* f->hdr.prev_free will be set by make_free below. */
+ f->hdr.free = false;
+ f->hdr.location = location;
+
+ /* Fix up old header. */
+ pre->hdr.num_longs = offset;
+ pre->hdr.prev_free = false;
+
+ /* This coalesces as required. */
+ make_free(region, pre, location);
+ }
+
+ /* We might be too long; put the rest back. */
+ discard_excess(region, &f->hdr, alloc_longs, location);
+
+ /* Clear tailer for debugging */
+ *tailer(f) = 0;
+
+ /* Their pointer is immediately after header. */
+ return &f->hdr + 1;
+}
+
+void *mem_alloc(struct mem_region *region, size_t size, size_t align,
+ const char *location)
+{
+ void *r = __mem_alloc(region, size, align, location);
+
+ if (r)
+ return r;
+
+ prerror("mem_alloc(0x%lx, 0x%lx, \"%s\") failed !\n",
+ size, align, location);
+ mem_dump_allocs();
+ return NULL;
+}
+
+void mem_free(struct mem_region *region, void *mem, const char *location)
+{
+ struct alloc_hdr *hdr;
+
+ /* This should be a constant. */
+ assert(is_rodata(location));
+
+ /* Freeing NULL is always a noop. */
+ if (!mem)
+ return;
+
+ /* Your memory is in the region, right? */
+ assert(mem >= region_start(region) + sizeof(*hdr));
+ assert(mem < region_start(region) + region->len);
+
+ /* Grab header. */
+ hdr = mem - sizeof(*hdr);
+
+ if (hdr->free)
+ bad_header(region, hdr, "re-freed", location);
+
+ make_free(region, (struct free_hdr *)hdr, location);
+}
+
+size_t mem_size(const struct mem_region *region __unused, const void *ptr)
+{
+ const struct alloc_hdr *hdr = ptr - sizeof(*hdr);
+ return hdr->num_longs * sizeof(long);
+}
+
+bool mem_resize(struct mem_region *region, void *mem, size_t len,
+ const char *location)
+{
+ struct alloc_hdr *hdr, *next;
+ struct free_hdr *f;
+
+ /* This should be a constant. */
+ assert(is_rodata(location));
+
+ /* Get header. */
+ hdr = mem - sizeof(*hdr);
+ if (hdr->free)
+ bad_header(region, hdr, "resize", location);
+
+ /* Round up size to multiple of longs. */
+ len = (sizeof(*hdr) + len + sizeof(long) - 1) / sizeof(long);
+
+ /* Can't be too small for when we free it, either. */
+ if (len < ALLOC_MIN_LONGS)
+ len = ALLOC_MIN_LONGS;
+
+ /* Shrinking is simple. */
+ if (len <= hdr->num_longs) {
+ hdr->location = location;
+ discard_excess(region, hdr, len, location);
+ return true;
+ }
+
+ /* Check if we can expand. */
+ next = next_hdr(region, hdr);
+ if (!next || !next->free || hdr->num_longs + next->num_longs < len)
+ return false;
+
+ /* OK, it's free and big enough, absorb it. */
+ f = (struct free_hdr *)next;
+ list_del_from(&region->free_list, &f->list);
+ hdr->num_longs += next->num_longs;
+ hdr->location = location;
+
+ /* Update next prev_free */
+ next = next_hdr(region, &f->hdr);
+ if (next) {
+ assert(next->prev_free);
+ next->prev_free = false;
+ }
+
+ /* Clear tailer for debugging */
+ *tailer(f) = 0;
+
+ /* Now we might have *too* much. */
+ discard_excess(region, hdr, len, location);
+ return true;
+}
+
+bool mem_check(const struct mem_region *region)
+{
+ size_t frees = 0;
+ struct alloc_hdr *hdr, *prev_free = NULL;
+ struct free_hdr *f;
+
+ /* Check it's sanely aligned. */
+ if (region->start % sizeof(struct alloc_hdr)) {
+ prerror("Region '%s' not sanely aligned (%llx)\n",
+ region->name, (unsigned long long)region->start);
+ return false;
+ }
+ if ((long)region->len % sizeof(struct alloc_hdr)) {
+ prerror("Region '%s' not sane length (%llu)\n",
+ region->name, (unsigned long long)region->len);
+ return false;
+ }
+
+ /* Not ours to play with, or empty? Don't do anything. */
+ if (region->type != REGION_SKIBOOT_HEAP ||
+ region->free_list.n.next == NULL)
+ return true;
+
+ /* Walk linearly. */
+ for (hdr = region_start(region); hdr; hdr = next_hdr(region, hdr)) {
+ if (hdr->num_longs < ALLOC_MIN_LONGS) {
+ prerror("Region '%s' %s %p (%s) size %zu\n",
+ region->name, hdr->free ? "free" : "alloc",
+ hdr, hdr_location(hdr),
+ hdr->num_longs * sizeof(long));
+ return false;
+ }
+ if ((unsigned long)hdr + hdr->num_longs * sizeof(long) >
+ region->start + region->len) {
+ prerror("Region '%s' %s %p (%s) oversize %zu\n",
+ region->name, hdr->free ? "free" : "alloc",
+ hdr, hdr_location(hdr),
+ hdr->num_longs * sizeof(long));
+ return false;
+ }
+ if (hdr->free) {
+ if (hdr->prev_free || prev_free) {
+ prerror("Region '%s' free %p (%s) has prev_free"
+ " %p (%s) %sset?\n",
+ region->name, hdr, hdr_location(hdr),
+ prev_free,
+ prev_free ? hdr_location(prev_free)
+ : "NULL",
+ hdr->prev_free ? "" : "un");
+ return false;
+ }
+ prev_free = hdr;
+ frees ^= (unsigned long)hdr - region->start;
+ } else {
+ if (hdr->prev_free != (bool)prev_free) {
+ prerror("Region '%s' alloc %p (%s) has"
+ " prev_free %p %sset?\n",
+ region->name, hdr, hdr_location(hdr),
+ prev_free, hdr->prev_free ? "" : "un");
+ return false;
+ }
+ prev_free = NULL;
+ }
+ }
+
+ /* Now walk free list. */
+ list_for_each(&region->free_list, f, list)
+ frees ^= (unsigned long)f - region->start;
+
+ if (frees) {
+ prerror("Region '%s' free list and walk do not match!\n",
+ region->name);
+ return false;
+ }
+ return true;
+}
+
+static struct mem_region *new_region(const char *name,
+ uint64_t start, uint64_t len,
+ struct dt_node *mem_node,
+ enum mem_region_type type)
+{
+ struct mem_region *region;
+
+ /* Avoid lock recursion, call mem_alloc directly. */
+ region = mem_alloc(&skiboot_heap,
+ sizeof(*region), __alignof__(*region), __location__);
+ if (!region)
+ return NULL;
+
+ region->name = name;
+ region->start = start;
+ region->len = len;
+ region->mem_node = mem_node;
+ region->type = type;
+ region->free_list.n.next = NULL;
+
+ return region;
+}
+
+/* We always split regions, so we only have to replace one. */
+static struct mem_region *split_region(struct mem_region *head,
+ uint64_t split_at,
+ enum mem_region_type type)
+{
+ struct mem_region *tail;
+ uint64_t end = head->start + head->len;
+
+ tail = new_region(head->name, split_at, end - split_at,
+ head->mem_node, type);
+ /* Original region becomes head. */
+ if (tail)
+ head->len -= tail->len;
+
+ return tail;
+}
+
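+/* True only if addr falls strictly inside the region: an address landing
+ * exactly on a boundary requires no split, hence the strict comparisons.
+ */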
+static bool intersects(const struct mem_region *region, uint64_t addr)
+{
+ return addr > region->start &&
+ addr < region->start + region->len;
+}
+
+static bool maybe_split(struct mem_region *r, uint64_t split_at)
+{
+ struct mem_region *tail;
+
+ if (!intersects(r, split_at))
+ return true;
+
+ tail = split_region(r, split_at, r->type);
+ if (!tail)
+ return false;
+
+ /* Tail add is important: we may need to split again! */
+ list_add_tail(&regions, &tail->list);
+ return true;
+}
+
+static bool overlaps(const struct mem_region *r1, const struct mem_region *r2)
+{
+ return (r1->start + r1->len > r2->start
+ && r1->start < r2->start + r2->len);
+}
+
+static struct mem_region *get_overlap(const struct mem_region *region)
+{
+ struct mem_region *i;
+
+ list_for_each(&regions, i, list) {
+ if (overlaps(region, i))
+ return i;
+ }
+ return NULL;
+}
+
+static bool add_region(struct mem_region *region)
+{
+ struct mem_region *r;
+
+ /* First split any regions which intersect. */
+ list_for_each(&regions, r, list)
+ if (!maybe_split(r, region->start) ||
+ !maybe_split(r, region->start + region->len))
+ return false;
+
+ /* Now we have only whole overlaps, if any. */
+ while ((r = get_overlap(region)) != NULL) {
+ assert(r->start == region->start);
+ assert(r->len == region->len);
+ list_del_from(&regions, &r->list);
+ /* We already hold mem_region lock */
+ mem_free(&skiboot_heap, r, __location__);
+ }
+
+ /* Finally, add in our own region. */
+ list_add(&regions, &region->list);
+ return true;
+}
+
+void mem_reserve(const char *name, uint64_t start, uint64_t len)
+{
+ struct mem_region *region;
+ bool added;
+
+ lock(&mem_region_lock);
+ region = new_region(name, start, len, NULL, REGION_RESERVED);
+ assert(region);
+ added = add_region(region);
+ assert(added);
+ unlock(&mem_region_lock);
+}
+
+static bool matches_chip_id(const __be32 ids[], size_t num, u32 chip_id)
+{
+ size_t i;
+
+ for (i = 0; i < num; i++)
+ if (be32_to_cpu(ids[i]) == chip_id)
+ return true;
+
+ return false;
+}
+
+void *__local_alloc(unsigned int chip_id, size_t size, size_t align,
+ const char *location)
+{
+ struct mem_region *region;
+ void *p = NULL;
+ bool use_local = true;
+
+ lock(&mem_region_lock);
+
+restart:
+ list_for_each(&regions, region, list) {
+ const struct dt_property *prop;
+ const __be32 *ids;
+
+ if (region->type != REGION_SKIBOOT_HEAP)
+ continue;
+
+ /* Don't allocate from normal heap. */
+ if (region == &skiboot_heap)
+ continue;
+
+ /* First pass, only match node local regions */
+ if (use_local) {
+ if (!region->mem_node)
+ continue;
+ prop = dt_find_property(region->mem_node, "ibm,chip-id");
+ ids = (const __be32 *)prop->prop;
+ if (!matches_chip_id(ids, prop->len/sizeof(u32),
+ chip_id))
+ continue;
+ }
+
+		/* Try the allocation (on the second pass, any heap region will do) */
+ p = mem_alloc(region, size, align, location);
+ if (p)
+ break;
+ }
+
+ /*
+ * If we can't allocate the memory block from the expected
+	 * node, we fall back to any region that can accommodate our request.
+ */
+ if (!p && use_local) {
+ use_local = false;
+ goto restart;
+ }
+
+ unlock(&mem_region_lock);
+
+ return p;
+}
+
+struct mem_region *find_mem_region(const char *name)
+{
+ struct mem_region *region;
+
+ list_for_each(&regions, region, list) {
+ if (streq(region->name, name))
+ return region;
+ }
+ return NULL;
+}
+
+/* Trawl through device tree, create memory regions from nodes. */
+void mem_region_init(void)
+{
+ const struct dt_property *names, *ranges;
+ struct mem_region *region;
+ struct dt_node *i;
+
+ /* Ensure we have no collision between skiboot core and our heap */
+ extern char _end[];
+ BUILD_ASSERT(HEAP_BASE >= (uint64_t)_end);
+
+ /*
+ * Add associativity properties outside of the lock
+ * to avoid recursive locking caused by allocations
+ * done by add_chip_dev_associativity()
+ */
+ dt_for_each_node(dt_root, i) {
+ if (!dt_has_node_property(i, "device_type", "memory"))
+ continue;
+
+ /* Add associativity properties */
+ add_chip_dev_associativity(i);
+ }
+
+ /* Add each memory node. */
+ dt_for_each_node(dt_root, i) {
+ uint64_t start, len;
+ char *rname;
+#define NODE_REGION_PREFIX "ibm,firmware-allocs-"
+
+ if (!dt_has_node_property(i, "device_type", "memory"))
+ continue;
+ rname = zalloc(strlen(i->name) + strlen(NODE_REGION_PREFIX) + 1);
+ strcat(rname, NODE_REGION_PREFIX);
+ strcat(rname, i->name);
+ start = dt_get_address(i, 0, &len);
+ lock(&mem_region_lock);
+ region = new_region(rname, start, len, i, REGION_SKIBOOT_HEAP);
+ if (!region) {
+ prerror("MEM: Could not add mem region %s!\n", i->name);
+ abort();
+ }
+ list_add(&regions, &region->list);
+ unlock(&mem_region_lock);
+ }
+
+ /* Now we know how many CPU stacks we have, fix that up. */
+ skiboot_cpu_stacks.len = (cpu_max_pir + 1) * STACK_SIZE;
+
+ lock(&mem_region_lock);
+
+ /* Now carve out our own reserved areas. */
+ if (!add_region(&skiboot_os_reserve) ||
+ !add_region(&skiboot_code_and_text) ||
+ !add_region(&skiboot_heap) ||
+ !add_region(&skiboot_after_heap) ||
+ !add_region(&skiboot_cpu_stacks)) {
+ prerror("Out of memory adding skiboot reserved areas\n");
+ abort();
+ }
+
+ /* Add reserved ranges from the DT */
+ names = dt_find_property(dt_root, "reserved-names");
+ ranges = dt_find_property(dt_root, "reserved-ranges");
+ if (names && ranges) {
+ const uint64_t *range;
+ int n, len;
+
+ range = (const void *)ranges->prop;
+
+ for (n = 0; n < names->len; n += len, range += 2) {
+ char *name;
+
+ len = strlen(names->prop + n) + 1;
+
+ name = mem_alloc(&skiboot_heap, len,
+ __alignof__(*name), __location__);
+ memcpy(name, names->prop + n, len);
+
+ region = new_region(name,
+ dt_get_number(range, 2),
+ dt_get_number(range + 1, 2),
+ NULL, REGION_RESERVED);
+ list_add(&regions, &region->list);
+ }
+ } else if (names || ranges) {
+ prerror("Invalid properties: reserved-names=%p "
+ "with reserved-ranges=%p\n",
+ names, ranges);
+ abort();
+ }
+
+ unlock(&mem_region_lock);
+
+ /* We generate the reservation properties from our own region list,
+ * which now includes the existing data.
+ */
+ if (names)
+ dt_del_property(dt_root, (struct dt_property *)names);
+ if (ranges)
+ dt_del_property(dt_root, (struct dt_property *)ranges);
+}
+
+static uint64_t allocated_length(const struct mem_region *r)
+{
+ struct free_hdr *f, *last = NULL;
+
+ /* No allocations at all? */
+ if (r->free_list.n.next == NULL)
+ return 0;
+
+ /* Find last free block. */
+ list_for_each(&r->free_list, f, list)
+ if (f > last)
+ last = f;
+
+ /* No free blocks? */
+ if (!last)
+ return r->len;
+
+ /* Last free block isn't at end? */
+ if (next_hdr(r, &last->hdr))
+ return r->len;
+ return (unsigned long)last - r->start;
+}
+
+/* Separate out allocated sections into their own region. */
+void mem_region_release_unused(void)
+{
+ struct mem_region *r;
+
+ lock(&mem_region_lock);
+
+ printf("Releasing unused memory:\n");
+ list_for_each(&regions, r, list) {
+ uint64_t used_len;
+
+ /* If it's not allocatable, ignore it. */
+ if (r->type != REGION_SKIBOOT_HEAP)
+ continue;
+
+ used_len = allocated_length(r);
+
+ printf(" %s: %llu/%llu used\n",
+ r->name, (long long)used_len, (long long)r->len);
+
+ /* We keep the skiboot heap. */
+ if (r == &skiboot_heap)
+ continue;
+
+ /* Nothing used? Whole thing is for Linux. */
+ if (used_len == 0)
+ r->type = REGION_OS;
+ /* Partially used? Split region. */
+ else if (used_len != r->len) {
+ struct mem_region *for_linux;
+ struct free_hdr *last = region_start(r) + used_len;
+
+ /* Remove the final free block. */
+ list_del_from(&r->free_list, &last->list);
+
+ for_linux = split_region(r, r->start + used_len,
+ REGION_OS);
+ if (!for_linux) {
+ prerror("OOM splitting mem node %s for linux\n",
+ r->name);
+ abort();
+ }
+ list_add(&regions, &for_linux->list);
+ }
+ }
+ unlock(&mem_region_lock);
+}
+
+void mem_region_add_dt_reserved(void)
+{
+ int names_len, ranges_len, len;
+ struct mem_region *region;
+ void *names, *ranges;
+ uint64_t *range;
+ char *name;
+
+ names_len = 0;
+ ranges_len = 0;
+
+ lock(&mem_region_lock);
+
+ /* First pass: calculate length of property data */
+ list_for_each(&regions, region, list) {
+ if (!region_is_reserved(region))
+ continue;
+ names_len += strlen(region->name) + 1;
+ ranges_len += 2 * sizeof(uint64_t);
+ }
+
+ /* Allocate property data with mem_alloc; malloc() acquires
+ * mem_region_lock */
+ names = mem_alloc(&skiboot_heap, names_len,
+ __alignof__(*names), __location__);
+ ranges = mem_alloc(&skiboot_heap, ranges_len,
+ __alignof__(*ranges), __location__);
+
+ name = names;
+ range = ranges;
+
+ printf("Reserved regions:\n");
+ /* Second pass: populate property data */
+ list_for_each(&regions, region, list) {
+ if (!region_is_reserved(region))
+ continue;
+ len = strlen(region->name) + 1;
+ memcpy(name, region->name, len);
+ name += len;
+
+ printf(" 0x%012llx..%012llx : %s\n",
+ (long long)region->start,
+ (long long)(region->start + region->len - 1),
+ region->name);
+
+ range[0] = cpu_to_fdt64(region->start);
+ range[1] = cpu_to_fdt64(region->len);
+ range += 2;
+ }
+ unlock(&mem_region_lock);
+
+ dt_add_property(dt_root, "reserved-names", names, names_len);
+ dt_add_property(dt_root, "reserved-ranges", ranges, ranges_len);
+
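+	/* dt_add_property() copies the property data, so the temporary
+	 * buffers can be freed again.
+	 */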
+ free(names);
+ free(ranges);
+}
diff --git a/core/nvram.c b/core/nvram.c
new file mode 100644
index 0000000..f25d6aa
--- /dev/null
+++ b/core/nvram.c
@@ -0,0 +1,248 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <fsp.h>
+#include <opal.h>
+#include <lock.h>
+#include <device.h>
+#include <platform.h>
+
+static void *nvram_image;
+static uint32_t nvram_size;
+static bool nvram_ready;
+
+static int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset)
+{
+ if (!nvram_ready)
+ return OPAL_HARDWARE;
+ if (offset >= nvram_size || (offset + size) > nvram_size)
+ return OPAL_PARAMETER;
+
+ memcpy((void *)buffer, nvram_image + offset, size);
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_READ_NVRAM, opal_read_nvram, 3);
+
+static int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset)
+{
+ if (!nvram_ready)
+ return OPAL_HARDWARE;
+ if (offset >= nvram_size || (offset + size) > nvram_size)
+ return OPAL_PARAMETER;
+ memcpy(nvram_image + offset, (void *)buffer, size);
+ if (platform.nvram_write)
+ platform.nvram_write(offset, nvram_image + offset, size);
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_WRITE_NVRAM, opal_write_nvram, 3);
+
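+/* CHRP-style NVRAM partition header. "len" is the partition length in
+ * 16-byte blocks and includes this header.
+ */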
+struct chrp_nvram_hdr {
+ uint8_t sig;
+ uint8_t cksum;
+ uint16_t len;
+ char name[12];
+};
+
+#define NVRAM_SIG_FW_PRIV 0x51
+#define NVRAM_SIG_SYSTEM 0x70
+#define NVRAM_SIG_FREE 0x7f
+
+#define NVRAM_NAME_COMMON "common"
+#define NVRAM_NAME_FW_PRIV "ibm,skiboot"
+#define NVRAM_NAME_FREE "wwwwwwwwwwww"
+
+/* 64k should be enough, famous last words... */
+#define NVRAM_SIZE_COMMON 0x10000
+
+/* 4k should be enough, famous last words... */
+#define NVRAM_SIZE_FW_PRIV 0x1000
+
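+/* Standard CHRP partition checksum: a byte-wise sum with end-around carry,
+ * computed over the header with the cksum field itself zeroed.
+ */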
+static uint8_t chrp_nv_cksum(struct chrp_nvram_hdr *hdr)
+{
+ struct chrp_nvram_hdr h_copy = *hdr;
+ uint8_t b_data, i_sum, c_sum;
+ uint8_t *p = (uint8_t *)&h_copy;
+ unsigned int nbytes = sizeof(h_copy);
+
+ h_copy.cksum = 0;
+ for (c_sum = 0; nbytes; nbytes--) {
+ b_data = *(p++);
+ i_sum = c_sum + b_data;
+ if (i_sum < c_sum)
+ i_sum++;
+ c_sum = i_sum;
+ }
+ return c_sum;
+}
+
+static void nvram_format(void)
+{
+ struct chrp_nvram_hdr *h;
+ unsigned int offset = 0;
+
+ prerror("NVRAM: Re-initializing\n");
+ memset(nvram_image, 0, nvram_size);
+
+ /* Create private partition */
+ h = nvram_image + offset;
+ h->sig = NVRAM_SIG_FW_PRIV;
+ h->len = NVRAM_SIZE_FW_PRIV >> 4;
+ strcpy(h->name, NVRAM_NAME_FW_PRIV);
+ h->cksum = chrp_nv_cksum(h);
+ offset += NVRAM_SIZE_FW_PRIV;
+
+ /* Create common partition */
+ h = nvram_image + offset;
+ h->sig = NVRAM_SIG_SYSTEM;
+ h->len = NVRAM_SIZE_COMMON >> 4;
+ strcpy(h->name, NVRAM_NAME_COMMON);
+ h->cksum = chrp_nv_cksum(h);
+ offset += NVRAM_SIZE_COMMON;
+
+ /* Create free space partition */
+ h = nvram_image + offset;
+ h->sig = NVRAM_SIG_FREE;
+ h->len = (nvram_size - offset) >> 4;
+ strncpy(h->name, NVRAM_NAME_FREE, 12);
+ h->cksum = chrp_nv_cksum(h);
+
+ /* Write the whole thing back */
+ if (platform.nvram_write)
+ platform.nvram_write(0, nvram_image, nvram_size);
+}
+
+/*
+ * Check that the nvram partition layout is sane and that it
+ * contains our required partitions. If not, we re-format the
+ * lot of it
+ */
+static void nvram_check(void)
+{
+ unsigned int offset = 0;
+ bool found_common = false;
+ bool found_skiboot = false;
+
+ while (offset + sizeof(struct chrp_nvram_hdr) < nvram_size) {
+ struct chrp_nvram_hdr *h = nvram_image + offset;
+
+ if (chrp_nv_cksum(h) != h->cksum) {
+ prerror("NVRAM: Partition at offset 0x%x"
+ " has bad checksum\n", offset);
+ goto failed;
+ }
+ if (h->len < 1) {
+ prerror("NVRAM: Partition at offset 0x%x"
+ " has incorrect 0 length\n", offset);
+ goto failed;
+ }
+
+ if (h->sig == NVRAM_SIG_SYSTEM &&
+ strcmp(h->name, NVRAM_NAME_COMMON) == 0)
+ found_common = true;
+
+ if (h->sig == NVRAM_SIG_FW_PRIV &&
+ strcmp(h->name, NVRAM_NAME_FW_PRIV) == 0)
+ found_skiboot = true;
+
+ offset += h->len << 4;
+ if (offset > nvram_size) {
+ prerror("NVRAM: Partition at offset 0x%x"
+ " extends beyond end of nvram !\n", offset);
+ goto failed;
+ }
+ }
+ if (!found_common) {
+ prerror("NVRAM: Common partition not found !\n");
+ goto failed;
+ }
+ if (!found_skiboot) {
+ prerror("NVRAM: Skiboot private partition "
+ "not found !\n");
+ goto failed;
+ }
+
+ prerror("NVRAM: Layout appears sane\n");
+ return;
+ failed:
+ nvram_format();
+}
+
+void nvram_read_complete(bool success)
+{
+ struct dt_node *np;
+
+ /* Read not successful, error out and free the buffer */
+ if (!success) {
+ free(nvram_image);
+ nvram_size = 0;
+ return;
+ }
+
+ /* Check and maybe format nvram */
+ nvram_check();
+
+ /* Add nvram node */
+ np = dt_new(opal_node, "nvram");
+ dt_add_property_cells(np, "#bytes", nvram_size);
+ dt_add_property_string(np, "compatible", "ibm,opal-nvram");
+
+ /* Mark ready */
+ nvram_ready = true;
+}
+
+void nvram_init(void)
+{
+ int rc;
+
+ if (!platform.nvram_info)
+ return;
+ rc = platform.nvram_info(&nvram_size);
+ if (rc) {
+ prerror("NVRAM: Error %d retrieving nvram info\n", rc);
+ return;
+ }
+ printf("NVRAM: Size is %d KB\n", nvram_size >> 10);
+ if (nvram_size > 0x100000) {
+ printf("NVRAM: Cropping to 1MB !\n");
+ nvram_size = 0x100000;
+ }
+
+ /*
+ * We allocate the nvram image with 4k alignment to make the
+	 * FSP backend's job easier
+ */
+ nvram_image = memalign(0x1000, nvram_size);
+ if (!nvram_image) {
+ prerror("NVRAM: Failed to allocate nvram image\n");
+ nvram_size = 0;
+ return;
+ }
+
+ /* Read it in */
+ rc = platform.nvram_start_read(nvram_image, 0, nvram_size);
+ if (rc) {
+ prerror("NVRAM: Failed to read NVRAM from FSP !\n");
+ nvram_size = 0;
+ free(nvram_image);
+ return;
+ }
+
+ /*
+ * We'll get called back later (or recursively from
+ * nvram_start_read) in nvram_read_complete()
+ */
+}
diff --git a/core/opal-msg.c b/core/opal-msg.c
new file mode 100644
index 0000000..f033b76
--- /dev/null
+++ b/core/opal-msg.c
@@ -0,0 +1,167 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <skiboot.h>
+#include <opal-msg.h>
+#include <lock.h>
+
+#define OPAL_MAX_MSGS (OPAL_MSG_TYPE_MAX + OPAL_MAX_ASYNC_COMP - 1)
+#define OPAL_MSG_PREFIX "opalmsg: "
+
+
+struct opal_msg_entry {
+ struct list_node link;
+ void (*consumed)(void *data);
+ void *data;
+ struct opal_msg msg;
+};
+
+static LIST_HEAD(msg_free_list);
+static LIST_HEAD(msg_pending_list);
+
+static struct lock opal_msg_lock = LOCK_UNLOCKED;
+
+int _opal_queue_msg(enum OpalMessageType msg_type, void *data,
+ void (*consumed)(void *data), size_t num_params,
+ const u64 *params)
+{
+ struct opal_msg_entry *entry;
+
+ lock(&opal_msg_lock);
+
+ entry = list_pop(&msg_free_list, struct opal_msg_entry, link);
+ if (!entry) {
+ prerror(OPAL_MSG_PREFIX "No available node in the free list, allocating\n");
+ entry = zalloc(sizeof(struct opal_msg_entry));
+ if (!entry) {
+ prerror(OPAL_MSG_PREFIX "Allocation failed\n");
+ unlock(&opal_msg_lock);
+ return OPAL_RESOURCE;
+ }
+ }
+
+ entry->consumed = consumed;
+ entry->data = data;
+ entry->msg.msg_type = msg_type;
+
+ if (num_params > ARRAY_SIZE(entry->msg.params)) {
+ prerror(OPAL_MSG_PREFIX "Discarding extra parameters\n");
+ num_params = ARRAY_SIZE(entry->msg.params);
+ }
+ memcpy(entry->msg.params, params, num_params*sizeof(u64));
+
+ list_add_tail(&msg_pending_list, &entry->link);
+ opal_update_pending_evt(OPAL_EVENT_MSG_PENDING,
+ OPAL_EVENT_MSG_PENDING);
+
+ unlock(&opal_msg_lock);
+
+ return 0;
+}
+
+static int64_t opal_get_msg(uint64_t *buffer, uint64_t size)
+{
+ struct opal_msg_entry *entry;
+ void (*callback)(void *data);
+ void *data;
+
+ if (size < sizeof(struct opal_msg) || !buffer)
+ return OPAL_PARAMETER;
+
+ lock(&opal_msg_lock);
+
+ entry = list_pop(&msg_pending_list, struct opal_msg_entry, link);
+ if (!entry) {
+ unlock(&opal_msg_lock);
+ return OPAL_RESOURCE;
+ }
+
+ memcpy(buffer, &entry->msg, sizeof(entry->msg));
+ callback = entry->consumed;
+ data = entry->data;
+
+ list_add(&msg_free_list, &entry->link);
+ if (list_empty(&msg_pending_list))
+ opal_update_pending_evt(OPAL_EVENT_MSG_PENDING, 0);
+
+ unlock(&opal_msg_lock);
+
+ if (callback)
+ callback(data);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_GET_MSG, opal_get_msg, 2);
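+
+/*
+ * Sketch of the expected flow (not tied to any one caller): a producer
+ * inside skiboot queues a message via _opal_queue_msg() (usually through
+ * a wrapper), which raises OPAL_EVENT_MSG_PENDING. The host sees the
+ * event from opal_poll_events() and drains the queue with OPAL_GET_MSG,
+ * at which point the optional "consumed" callback runs so the producer
+ * can release whatever data was associated with the message.
+ */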
+
+static int64_t opal_check_completion(uint64_t *buffer, uint64_t size,
+ uint64_t token)
+{
+ struct opal_msg_entry *entry, *next_entry;
+ void (*callback)(void *data) = NULL;
+ int rc = OPAL_BUSY;
+ void *data = NULL;
+
+ lock(&opal_msg_lock);
+ list_for_each_safe(&msg_pending_list, entry, next_entry, link) {
+ if (entry->msg.msg_type == OPAL_MSG_ASYNC_COMP &&
+ entry->msg.params[0] == token) {
+ list_del(&entry->link);
+ callback = entry->consumed;
+ data = entry->data;
+ list_add(&msg_free_list, &entry->link);
+ if (list_empty(&msg_pending_list))
+ opal_update_pending_evt(OPAL_EVENT_MSG_PENDING,
+ 0);
+ rc = OPAL_SUCCESS;
+ break;
+ }
+ }
+
+ if (rc == OPAL_SUCCESS && size >= sizeof(struct opal_msg))
+ memcpy(buffer, &entry->msg, sizeof(entry->msg));
+
+ unlock(&opal_msg_lock);
+
+ if (callback)
+ callback(data);
+
+ return rc;
+
+}
+opal_call(OPAL_CHECK_ASYNC_COMPLETION, opal_check_completion, 3);
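+
+/*
+ * For async completions, the token passed by the host is carried in
+ * params[0] of the OPAL_MSG_ASYNC_COMP message, which is how the lookup
+ * above matches a pending completion without draining the whole queue.
+ */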
+
+void opal_init_msg(void)
+{
+ struct opal_msg_entry *entry;
+ int i;
+
+	for (i = 0; i < OPAL_MAX_MSGS; i++) {
+ entry = zalloc(sizeof(*entry));
+ if (!entry)
+ goto err;
+ list_add_tail(&msg_free_list, &entry->link);
+ }
+ return;
+
+err:
+ for (; i > 0; i--) {
+ entry = list_pop(&msg_free_list, struct opal_msg_entry, link);
+ if (entry)
+ free(entry);
+ }
+}
+
diff --git a/core/opal.c b/core/opal.c
new file mode 100644
index 0000000..2727fd5
--- /dev/null
+++ b/core/opal.c
@@ -0,0 +1,308 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <stack.h>
+#include <lock.h>
+#include <fsp.h>
+#include <cpu.h>
+#include <interrupts.h>
+#include <op-panel.h>
+#include <device.h>
+#include <console.h>
+#include <trace.h>
+#include <timebase.h>
+#include <affinity.h>
+#include <opal-msg.h>
+
+/* Pending events to signal via opal_poll_events */
+uint64_t opal_pending_events;
+
+/* OPAL dispatch table defined in head.S */
+extern uint64_t opal_branch_table[];
+
+/* Number of args expected for each call. */
+static u8 opal_num_args[OPAL_LAST+1];
+
+/* OPAL anchor node */
+struct dt_node *opal_node;
+
+extern uint32_t attn_trigger;
+extern uint32_t hir_trigger;
+
+void opal_table_init(void)
+{
+ struct opal_table_entry *s = __opal_table_start;
+ struct opal_table_entry *e = __opal_table_end;
+
+ printf("OPAL table: %p .. %p, branch table: %p\n",
+ s, e, opal_branch_table);
+ while(s < e) {
+ uint64_t *func = s->func;
+ opal_branch_table[s->token] = *func;
+ opal_num_args[s->token] = s->nargs;
+ s++;
+ }
+}
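+
+/*
+ * Note: "*func" above assumes the ELFv1 function-descriptor ABI, where a
+ * function pointer refers to a descriptor whose first doubleword is the
+ * actual entry address; that address is what gets stored in the branch
+ * table consumed by the OPAL entry code in head.S.
+ */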
+
+/* Called from head.S, thus no prototype */
+long opal_bad_token(uint64_t token);
+
+long opal_bad_token(uint64_t token)
+{
+ prerror("OPAL: Called with bad token %lld !\n", token);
+
+ return OPAL_PARAMETER;
+}
+
+/* Called from head.S, thus no prototype */
+void opal_trace_entry(struct stack_frame *eframe);
+
+/* FIXME: Do this in asm */
+void opal_trace_entry(struct stack_frame *eframe)
+{
+ union trace t;
+ unsigned nargs;
+
+ if (this_cpu()->pir != mfspr(SPR_PIR)) {
+ printf("CPU MISMATCH ! PIR=%04lx cpu @%p -> pir=%04x\n",
+ mfspr(SPR_PIR), this_cpu(), this_cpu()->pir);
+ abort();
+ }
+ if (eframe->gpr[0] > OPAL_LAST)
+ nargs = 0;
+ else
+ nargs = opal_num_args[eframe->gpr[0]];
+
+ t.opal.token = eframe->gpr[0];
+ t.opal.lr = eframe->lr;
+ t.opal.sp = eframe->gpr[1];
+ memcpy(t.opal.r3_to_11, &eframe->gpr[3], nargs*sizeof(u64));
+
+ trace_add(&t, TRACE_OPAL, offsetof(struct trace_opal, r3_to_11[nargs]));
+}
+
+void __opal_register(uint64_t token, void *func, unsigned int nargs)
+{
+ uint64_t *opd = func;
+
+ assert(token <= OPAL_LAST);
+
+ opal_branch_table[token] = *opd;
+ opal_num_args[token] = nargs;
+}
+
+static void add_opal_firmware_node(void)
+{
+ struct dt_node *firmware = dt_new(opal_node, "firmware");
+
+ dt_add_property_string(firmware, "compatible", "ibm,opal-firmware");
+ dt_add_property_string(firmware, "name", "firmware");
+ dt_add_property_string(firmware, "git-id", gitid);
+}
+
+void add_opal_node(void)
+{
+ uint64_t base, entry, size;
+ extern uint32_t opal_entry;
+
+ /* XXX TODO: Reorg this. We should create the base OPAL
+ * node early on, and have the various sub modules populate
+ * their own entries (console etc...)
+ *
+ * The logic of which console backend to use should be
+ * extracted
+ */
+
+ entry = (uint64_t)&opal_entry;
+ base = SKIBOOT_BASE;
+ size = (CPU_STACKS_BASE +
+ (cpu_max_pir + 1) * STACK_SIZE) - SKIBOOT_BASE;
+
+ opal_node = dt_new(dt_root, "ibm,opal");
+ dt_add_property_cells(opal_node, "#address-cells", 0);
+ dt_add_property_cells(opal_node, "#size-cells", 0);
+ dt_add_property_strings(opal_node, "compatible", "ibm,opal-v2",
+ "ibm,opal-v3");
+ dt_add_property_cells(opal_node, "opal-msg-async-num", OPAL_MAX_ASYNC_COMP);
+ dt_add_property_cells(opal_node, "opal-msg-size", sizeof(struct opal_msg));
+ dt_add_property_u64(opal_node, "opal-base-address", base);
+ dt_add_property_u64(opal_node, "opal-entry-address", entry);
+ dt_add_property_u64(opal_node, "opal-runtime-size", size);
+
+ add_opal_firmware_node();
+ add_associativity_ref_point();
+ memcons_add_properties();
+ add_cpu_idle_state_properties();
+}
+
+void opal_update_pending_evt(uint64_t evt_mask, uint64_t evt_values)
+{
+ static struct lock evt_lock = LOCK_UNLOCKED;
+ uint64_t new_evts;
+
+ /* XXX FIXME: Use atomics instead ??? Or caller locks (con_lock ?) */
+ lock(&evt_lock);
+ new_evts = (opal_pending_events & ~evt_mask) | evt_values;
+#ifdef OPAL_TRACE_EVT_CHG
+ printf("OPAL: Evt change: 0x%016llx -> 0x%016llx\n",
+ opal_pending_events, new_evts);
+#endif
+ opal_pending_events = new_evts;
+ unlock(&evt_lock);
+}
+
+
+static uint64_t opal_test_func(uint64_t arg)
+{
+ printf("OPAL: Test function called with arg 0x%llx\n", arg);
+
+ return 0xfeedf00d;
+}
+opal_call(OPAL_TEST, opal_test_func, 1);
+
+struct opal_poll_entry {
+ struct list_node link;
+ void (*poller)(void *data);
+ void *data;
+};
+
+static struct list_head opal_pollers = LIST_HEAD_INIT(opal_pollers);
+static struct lock opal_poll_lock = LOCK_UNLOCKED;
+
+void opal_add_poller(void (*poller)(void *data), void *data)
+{
+ struct opal_poll_entry *ent;
+
+ ent = zalloc(sizeof(struct opal_poll_entry));
+ assert(ent);
+ ent->poller = poller;
+ ent->data = data;
+ lock(&opal_poll_lock);
+ list_add_tail(&opal_pollers, &ent->link);
+ unlock(&opal_poll_lock);
+}
+
+void opal_del_poller(void (*poller)(void *data))
+{
+ struct opal_poll_entry *ent;
+
+ lock(&opal_poll_lock);
+ list_for_each(&opal_pollers, ent, link) {
+ if (ent->poller == poller) {
+ list_del(&ent->link);
+ free(ent);
+ break;
+ }
+ }
+ unlock(&opal_poll_lock);
+}
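+
+/*
+ * Pollers registered here are run from opal_poll_events() below each time
+ * the host calls OPAL_POLL_EVENTS, giving drivers a regular context in
+ * which to make progress without relying on interrupts.
+ */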
+
+static int64_t opal_poll_events(uint64_t *outstanding_event_mask)
+{
+ struct opal_poll_entry *poll_ent;
+
+ /* Check if we need to trigger an attn for test use */
+ if (attn_trigger == 0xdeadbeef) {
+ printf("Triggering attn\n");
+ assert(false);
+ }
+
+ /* Test the host initiated reset */
+ if (hir_trigger == 0xdeadbeef) {
+ fsp_trigger_reset();
+ hir_trigger = 0;
+ }
+
+ /*
+ * Only run the pollers if they aren't already running
+ * on another CPU
+ */
+ if (try_lock(&opal_poll_lock)) {
+ list_for_each(&opal_pollers, poll_ent, link)
+ poll_ent->poller(poll_ent->data);
+ unlock(&opal_poll_lock);
+ }
+
+ if (outstanding_event_mask)
+ *outstanding_event_mask = opal_pending_events;
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_POLL_EVENTS, opal_poll_events, 1);
+
+static int64_t opal_check_token(uint64_t token)
+{
+ if (token > OPAL_LAST)
+ return OPAL_TOKEN_ABSENT;
+
+ if (opal_branch_table[token])
+ return OPAL_TOKEN_PRESENT;
+
+ return OPAL_TOKEN_ABSENT;
+}
+opal_call(OPAL_CHECK_TOKEN, opal_check_token, 1);
+
+struct opal_sync_entry {
+ struct list_node link;
+ bool (*notify)(void *data);
+ void *data;
+};
+
+static struct list_head opal_syncers = LIST_HEAD_INIT(opal_syncers);
+
+void opal_add_host_sync_notifier(bool (*notify)(void *data), void *data)
+{
+ struct opal_sync_entry *ent;
+
+ ent = zalloc(sizeof(struct opal_sync_entry));
+ assert(ent);
+ ent->notify = notify;
+ ent->data = data;
+ list_add_tail(&opal_syncers, &ent->link);
+}
+
+void opal_del_host_sync_notifier(bool (*notify)(void *data))
+{
+ struct opal_sync_entry *ent;
+
+ list_for_each(&opal_syncers, ent, link) {
+ if (ent->notify == notify) {
+ list_del(&ent->link);
+ free(ent);
+ return;
+ }
+ }
+}
+
+/*
+ * OPAL call to handle host kexec'ing scenario
+ */
+static int64_t opal_sync_host_reboot(void)
+{
+ struct opal_sync_entry *ent;
+ bool ret = true;
+
+ list_for_each(&opal_syncers, ent, link)
+ ret &= ent->notify(ent->data);
+
+ if (ret)
+ return OPAL_SUCCESS;
+ else
+ return OPAL_BUSY_EVENT;
+}
+opal_call(OPAL_SYNC_HOST_REBOOT, opal_sync_host_reboot, 0);
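+
+/*
+ * While any notifier still has work outstanding this returns
+ * OPAL_BUSY_EVENT, so the host is expected to keep calling
+ * OPAL_SYNC_HOST_REBOOT (polling events in between) until it gets
+ * OPAL_SUCCESS back.
+ */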
diff --git a/core/pci-opal.c b/core/pci-opal.c
new file mode 100644
index 0000000..ee534cc
--- /dev/null
+++ b/core/pci-opal.c
@@ -0,0 +1,666 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <timebase.h>
+#include <lock.h>
+
+#define OPAL_PCICFG_ACCESS(op, cb, type) \
+static int64_t opal_pci_config_##op(uint64_t phb_id, \
+ uint64_t bus_dev_func, \
+ uint64_t offset, type data) \
+{ \
+ struct phb *phb = pci_get_phb(phb_id); \
+ int64_t rc; \
+ \
+ if (!phb) \
+ return OPAL_PARAMETER; \
+ phb->ops->lock(phb); \
+ rc = phb->ops->cfg_##cb(phb, bus_dev_func, offset, data); \
+ phb->ops->unlock(phb); \
+ pci_put_phb(phb); \
+ \
+ return rc; \
+}
+
+OPAL_PCICFG_ACCESS(read_byte, read8, uint8_t *)
+OPAL_PCICFG_ACCESS(read_half_word, read16, uint16_t *)
+OPAL_PCICFG_ACCESS(read_word, read32, uint32_t *)
+OPAL_PCICFG_ACCESS(write_byte, write8, uint8_t)
+OPAL_PCICFG_ACCESS(write_half_word, write16, uint16_t)
+OPAL_PCICFG_ACCESS(write_word, write32, uint32_t)
+
+opal_call(OPAL_PCI_CONFIG_READ_BYTE, opal_pci_config_read_byte, 4);
+opal_call(OPAL_PCI_CONFIG_READ_HALF_WORD, opal_pci_config_read_half_word, 4);
+opal_call(OPAL_PCI_CONFIG_READ_WORD, opal_pci_config_read_word, 4);
+opal_call(OPAL_PCI_CONFIG_WRITE_BYTE, opal_pci_config_write_byte, 4);
+opal_call(OPAL_PCI_CONFIG_WRITE_HALF_WORD, opal_pci_config_write_half_word, 4);
+opal_call(OPAL_PCI_CONFIG_WRITE_WORD, opal_pci_config_write_word, 4);
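+
+/*
+ * For reference, each accessor generated above is a thin wrapper: e.g.
+ * opal_pci_config_read_byte(phb_id, bus_dev_func, offset, data) looks up
+ * the PHB, takes its lock, forwards to phb->ops->cfg_read8() and drops
+ * the reference before returning the backend's result.
+ */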
+
+static int64_t opal_pci_eeh_freeze_status(uint64_t phb_id, uint64_t pe_number,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint64_t *phb_status)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->eeh_freeze_status)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->eeh_freeze_status(phb, pe_number, freeze_state,
+ pci_error_type, NULL, phb_status);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_EEH_FREEZE_STATUS, opal_pci_eeh_freeze_status, 5);
+
+static int64_t opal_pci_eeh_freeze_clear(uint64_t phb_id, uint64_t pe_number,
+ uint64_t eeh_action_token)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->eeh_freeze_clear)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->eeh_freeze_clear(phb, pe_number, eeh_action_token);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_EEH_FREEZE_CLEAR, opal_pci_eeh_freeze_clear, 3);
+
+static int64_t opal_pci_phb_mmio_enable(uint64_t phb_id, uint16_t window_type,
+ uint16_t window_num, uint16_t enable)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->phb_mmio_enable)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->phb_mmio_enable(phb, window_type, window_num, enable);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_PHB_MMIO_ENABLE, opal_pci_phb_mmio_enable, 4);
+
+static int64_t opal_pci_set_phb_mem_window(uint64_t phb_id,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint64_t addr,
+ uint64_t pci_addr,
+ uint64_t size)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_phb_mem_window)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->set_phb_mem_window(phb, window_type, window_num,
+ addr, pci_addr, size);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PHB_MEM_WINDOW, opal_pci_set_phb_mem_window, 6);
+
+static int64_t opal_pci_map_pe_mmio_window(uint64_t phb_id, uint16_t pe_number,
+ uint16_t window_type,
+ uint16_t window_num,
+ uint16_t segment_num)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->map_pe_mmio_window)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->map_pe_mmio_window(phb, pe_number, window_type,
+ window_num, segment_num);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MAP_PE_MMIO_WINDOW, opal_pci_map_pe_mmio_window, 5);
+
+static int64_t opal_pci_set_phb_table_memory(uint64_t phb_id __unused,
+ uint64_t rtt_addr __unused,
+ uint64_t ivt_addr __unused,
+ uint64_t ivt_len __unused,
+ uint64_t rej_array_addr __unused,
+ uint64_t peltv_addr __unused)
+{
+ /* IODA2 (P8) stuff, TODO */
+ return OPAL_UNSUPPORTED;
+}
+opal_call(OPAL_PCI_SET_PHB_TABLE_MEMORY, opal_pci_set_phb_table_memory, 6);
+
+static int64_t opal_pci_set_pe(uint64_t phb_id, uint64_t pe_number,
+ uint64_t bus_dev_func, uint8_t bus_compare,
+ uint8_t dev_compare, uint8_t func_compare,
+ uint8_t pe_action)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_pe)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->set_pe(phb, pe_number, bus_dev_func, bus_compare,
+ dev_compare, func_compare, pe_action);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PE, opal_pci_set_pe, 7);
+
+static int64_t opal_pci_set_peltv(uint64_t phb_id, uint32_t parent_pe,
+ uint32_t child_pe, uint8_t state)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_peltv)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->set_peltv(phb, parent_pe, child_pe, state);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PELTV, opal_pci_set_peltv, 4);
+
+static int64_t opal_pci_set_mve(uint64_t phb_id, uint32_t mve_number,
+ uint32_t pe_number)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_mve)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->set_mve(phb, mve_number, pe_number);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_MVE, opal_pci_set_mve, 3);
+
+static int64_t opal_pci_set_mve_enable(uint64_t phb_id, uint32_t mve_number,
+ uint32_t state)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_mve_enable)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->set_mve_enable(phb, mve_number, state);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_MVE_ENABLE, opal_pci_set_mve_enable, 3);
+
+static int64_t opal_pci_get_xive_reissue(uint64_t phb_id __unused,
+ uint32_t xive_number __unused,
+ uint8_t *p_bit __unused,
+ uint8_t *q_bit __unused)
+{
+ /* IODA2 (P8) stuff, TODO */
+ return OPAL_UNSUPPORTED;
+}
+opal_call(OPAL_PCI_GET_XIVE_REISSUE, opal_pci_get_xive_reissue, 4);
+
+static int64_t opal_pci_set_xive_reissue(uint64_t phb_id __unused,
+ uint32_t xive_number __unused,
+ uint8_t p_bit __unused,
+ uint8_t q_bit __unused)
+{
+ /* IODA2 (P8) stuff, TODO */
+ return OPAL_UNSUPPORTED;
+}
+opal_call(OPAL_PCI_SET_XIVE_REISSUE, opal_pci_set_xive_reissue, 4);
+
+static int64_t opal_pci_msi_eoi(uint64_t phb_id,
+ uint32_t hwirq)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->pci_msi_eoi)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->pci_msi_eoi(phb, hwirq);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MSI_EOI, opal_pci_msi_eoi, 2);
+
+static int64_t opal_pci_set_xive_pe(uint64_t phb_id, uint32_t pe_number,
+ uint32_t xive_num)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_xive_pe)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->set_xive_pe(phb, pe_number, xive_num);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_XIVE_PE, opal_pci_set_xive_pe, 3);
+
+static int64_t opal_get_xive_source(uint64_t phb_id, uint32_t xive_num,
+ int32_t *interrupt_source_number)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_xive_source)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->get_xive_source(phb, xive_num, interrupt_source_number);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_GET_XIVE_SOURCE, opal_get_xive_source, 3);
+
+static int64_t opal_get_msi_32(uint64_t phb_id, uint32_t mve_number,
+ uint32_t xive_num, uint8_t msi_range,
+ uint32_t *msi_address, uint32_t *message_data)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_msi_32)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->get_msi_32(phb, mve_number, xive_num, msi_range,
+ msi_address, message_data);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_GET_MSI_32, opal_get_msi_32, 6);
+
+static int64_t opal_get_msi_64(uint64_t phb_id, uint32_t mve_number,
+ uint32_t xive_num, uint8_t msi_range,
+ uint64_t *msi_address, uint32_t *message_data)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_msi_64)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->get_msi_64(phb, mve_number, xive_num, msi_range,
+ msi_address, message_data);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_GET_MSI_64, opal_get_msi_64, 6);
+
+static int64_t opal_pci_map_pe_dma_window(uint64_t phb_id, uint16_t pe_number,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->map_pe_dma_window)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->map_pe_dma_window(phb, pe_number, window_id,
+ tce_levels, tce_table_addr,
+ tce_table_size, tce_page_size);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW, opal_pci_map_pe_dma_window, 7);
+
+	phbs[phb->opal_id] = NULL;
+ uint16_t pe_number,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->map_pe_dma_window_real)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->map_pe_dma_window_real(phb, pe_number, window_id,
+ pci_start_addr, pci_mem_size);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_MAP_PE_DMA_WINDOW_REAL, opal_pci_map_pe_dma_window_real, 5);
+
+static int64_t opal_pci_reset(uint64_t phb_id, uint8_t reset_scope,
+ uint8_t assert_state)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc = OPAL_SUCCESS;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops)
+ return OPAL_UNSUPPORTED;
+ if (assert_state != OPAL_ASSERT_RESET &&
+ assert_state != OPAL_DEASSERT_RESET)
+ return OPAL_PARAMETER;
+
+ phb->ops->lock(phb);
+
+ switch(reset_scope) {
+ case OPAL_RESET_PHB_COMPLETE:
+ if (!phb->ops->complete_reset) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+ rc = phb->ops->complete_reset(phb, assert_state);
+ if (rc < 0)
+ prerror("PHB#%d: Failure on complete reset, rc=%lld\n",
+ phb->opal_id, rc);
+ break;
+ case OPAL_RESET_PCI_FUNDAMENTAL:
+ if (!phb->ops->fundamental_reset) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+		/* Nothing to do at deassert time */
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+
+ rc = phb->ops->fundamental_reset(phb);
+ if (rc < 0)
+ prerror("PHB#%d: Failure on fundamental reset, rc=%lld\n",
+ phb->opal_id, rc);
+ break;
+ case OPAL_RESET_PCI_HOT:
+ if (!phb->ops->hot_reset) {
+ rc = OPAL_UNSUPPORTED;
+ break;
+ }
+
+		/* Nothing to do at deassert time */
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+
+ rc = phb->ops->hot_reset(phb);
+ if (rc < 0)
+ prerror("PHB#%d: Failure on hot reset, rc=%lld\n",
+ phb->opal_id, rc);
+ break;
+ case OPAL_RESET_PCI_IODA_TABLE:
+ if (assert_state != OPAL_ASSERT_RESET)
+ break;
+ if (phb->ops->ioda_reset)
+ phb->ops->ioda_reset(phb, true);
+ break;
+ default:
+ rc = OPAL_UNSUPPORTED;
+ }
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return (rc > 0) ? tb_to_msecs(rc) : rc;
+}
+opal_call(OPAL_PCI_RESET, opal_pci_reset, 3);
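+
+/*
+ * As with OPAL_PCI_POLL below, a positive return from the backend is a
+ * timebase delay; it is converted to milliseconds so the caller knows
+ * how long to wait before polling the PHB again.
+ */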
+
+static int64_t opal_pci_reinit(uint64_t phb_id,
+ uint64_t reinit_scope,
+ uint64_t data)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops || !phb->ops->pci_reinit)
+ return OPAL_UNSUPPORTED;
+
+ phb->ops->lock(phb);
+ rc = phb->ops->pci_reinit(phb, reinit_scope, data);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_REINIT, opal_pci_reinit, 3);
+
+static int64_t opal_pci_poll(uint64_t phb_id)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops || !phb->ops->poll)
+ return OPAL_UNSUPPORTED;
+
+ phb->ops->lock(phb);
+ rc = phb->ops->poll(phb);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ /* Return milliseconds for caller to sleep: round up */
+ if (rc > 0) {
+ rc = tb_to_msecs(rc);
+ if (rc == 0)
+ rc = 1;
+ }
+
+ return rc;
+}
+opal_call(OPAL_PCI_POLL, opal_pci_poll, 1);
+
+static int64_t opal_pci_set_phb_tce_memory(uint64_t phb_id,
+ uint64_t tce_mem_addr,
+ uint64_t tce_mem_size)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_phb_tce_memory)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->set_phb_tce_memory(phb, tce_mem_addr, tce_mem_size);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_SET_PHB_TCE_MEMORY, opal_pci_set_phb_tce_memory, 3);
+
+static int64_t opal_pci_get_phb_diag_data(uint64_t phb_id,
+ void *diag_buffer,
+ uint64_t diag_buffer_len)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_diag_data)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->get_diag_data(phb, diag_buffer, diag_buffer_len);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_GET_PHB_DIAG_DATA, opal_pci_get_phb_diag_data, 3);
+
+static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
+ void *diag_buffer,
+ uint64_t diag_buffer_len)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->get_diag_data2)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_GET_PHB_DIAG_DATA2, opal_pci_get_phb_diag_data2, 3);
+
+static int64_t opal_pci_next_error(uint64_t phb_id, uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type, uint16_t *severity)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->next_error)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+
+ /* Any call to this function clears the error event */
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, 0);
+ rc = phb->ops->next_error(phb, first_frozen_pe, pci_error_type,
+ severity);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_NEXT_ERROR, opal_pci_next_error, 4);
+
+static int64_t opal_pci_eeh_freeze_status2(uint64_t phb_id, uint64_t pe_number,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type,
+ uint16_t *severity,
+ uint64_t *phb_status)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->eeh_freeze_status)
+ return OPAL_UNSUPPORTED;
+ phb->ops->lock(phb);
+ rc = phb->ops->eeh_freeze_status(phb, pe_number, freeze_state,
+ pci_error_type, severity, phb_status);
+ phb->ops->unlock(phb);
+ pci_put_phb(phb);
+
+ return rc;
+}
+opal_call(OPAL_PCI_EEH_FREEZE_STATUS2, opal_pci_eeh_freeze_status2, 6);
+
+static int64_t opal_pci_set_phb_capi_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number)
+{
+ struct phb *phb = pci_get_phb(phb_id);
+ int64_t rc;
+
+ if (!phb)
+ return OPAL_PARAMETER;
+ if (!phb->ops->set_capi_mode)
+ return OPAL_UNSUPPORTED;
+ if (mode == 1) {
+ phb->ops->lock(phb);
+ rc = phb->ops->set_capi_mode(phb, mode, pe_number);
+ phb->ops->unlock(phb);
+ return rc;
+ }
+ if (mode == 0) {
+		/* FIXME: Add support for PCI mode */
+ }
+ return OPAL_UNSUPPORTED;
+}
+opal_call(OPAL_PCI_SET_PHB_CAPI_MODE, opal_pci_set_phb_capi_mode, 3);
diff --git a/core/pci.c b/core/pci.c
new file mode 100644
index 0000000..f07908b
--- /dev/null
+++ b/core/pci.c
@@ -0,0 +1,1388 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <timebase.h>
+#include <lock.h>
+#include <device.h>
+
+static struct lock pci_lock = LOCK_UNLOCKED;
+#define PCI_MAX_PHBs 64
+static struct phb *phbs[PCI_MAX_PHBs];
+
+#define DBG(fmt...) do { } while(0)
+
+/*
+ * Generic PCI utilities
+ */
+
+/* pci_find_cap - Find a PCI capability in a device config space
+ *
+ * This will return a config space offset (positive) or a negative
+ * error (OPAL error codes).
+ *
+ * OPAL_UNSUPPORTED is returned if the capability doesn't exist
+ */
+int64_t pci_find_cap(struct phb *phb, uint16_t bdfn, uint8_t want)
+{
+ int64_t rc;
+ uint16_t stat, cap;
+ uint8_t pos, next;
+
+ rc = pci_cfg_read16(phb, bdfn, PCI_CFG_STAT, &stat);
+ if (rc)
+ return rc;
+ if (!(stat & PCI_CFG_STAT_CAP))
+ return OPAL_UNSUPPORTED;
+ rc = pci_cfg_read8(phb, bdfn, PCI_CFG_CAP, &pos);
+ if (rc)
+ return rc;
+ pos &= 0xfc;
+ while(pos) {
+ rc = pci_cfg_read16(phb, bdfn, pos, &cap);
+ if (rc)
+ return rc;
+ if ((cap & 0xff) == want)
+ return pos;
+ next = (cap >> 8) & 0xfc;
+ if (next == pos) {
+ prerror("PHB%d: dev %04x pci_find_cap hit a loop !\n",
+ phb->opal_id, bdfn);
+ break;
+ }
+ pos = next;
+ }
+ return OPAL_UNSUPPORTED;
+}
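+
+/*
+ * Typical use (as in pci_scan_one() below): pci_find_cap(phb, bdfn,
+ * PCI_CFG_CAP_ID_EXP) returns the config-space offset of the PCI Express
+ * capability when positive, or OPAL_UNSUPPORTED for a legacy device.
+ */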
+
+/* pci_find_ecap - Find a PCIe extended capability in a device
+ * config space
+ *
+ * This will return a config space offset (positive) or a negative
+ * error (OPAL error code). Additionally, if the "version" argument
+ * is non-NULL, the capability version will be returned there.
+ *
+ * OPAL_UNSUPPORTED is returned if the capability doesn't exist
+ */
+int64_t pci_find_ecap(struct phb *phb, uint16_t bdfn, uint16_t want,
+ uint8_t *version)
+{
+ int64_t rc;
+ uint32_t cap;
+ uint16_t off, prev = 0;
+
+ for (off = 0x100; off && off < 0x1000; off = (cap >> 20) & 0xffc ) {
+ if (off == prev) {
+ prerror("PHB%d: dev %04x pci_find_ecap hit a loop !\n",
+ phb->opal_id, bdfn);
+ break;
+ }
+ prev = off;
+ rc = pci_cfg_read32(phb, bdfn, off, &cap);
+ if (rc)
+ return rc;
+ if ((cap & 0xffff) == want) {
+ if (version)
+ *version = (cap >> 16) & 0xf;
+ return off;
+ }
+ }
+ return OPAL_UNSUPPORTED;
+}
+
+static struct pci_device *pci_scan_one(struct phb *phb, struct pci_device *parent,
+ uint16_t bdfn)
+{
+ struct pci_device *pd = NULL;
+ uint32_t retries, vdid, val;
+ int64_t rc, ecap;
+ uint8_t htype;
+ uint16_t capreg;
+ bool had_crs = false;
+
+ for (retries = 40; retries; retries--) {
+ rc = pci_cfg_read32(phb, bdfn, 0, &vdid);
+ if (rc)
+ return NULL;
+ if (vdid == 0xffffffff || vdid == 0x00000000)
+ return NULL;
+ if (vdid != 0xffff0001)
+ break;
+ had_crs = true;
+ time_wait_ms(100);
+ }
+ if (vdid == 0xffff0001) {
+ prerror("PCI: Device %04x CRS timeout !\n", bdfn);
+ return NULL;
+ }
+ if (had_crs)
+ printf("PCI: Device %04x replied after CRS\n", bdfn);
+ pd = zalloc(sizeof(struct pci_device));
+ if (!pd) {
+ prerror("PCI: Failed to allocate structure pci_device !\n");
+ goto fail;
+ }
+ pd->bdfn = bdfn;
+ pd->parent = parent;
+ list_head_init(&pd->children);
+ rc = pci_cfg_read8(phb, bdfn, PCI_CFG_HDR_TYPE, &htype);
+ if (rc) {
+ prerror("PCI: Failed to read header type !\n");
+ goto fail;
+ }
+ pd->is_multifunction = !!(htype & 0x80);
+ pd->is_bridge = (htype & 0x7f) != 0;
+ pd->scan_map = 0xffffffff; /* Default */
+
+ ecap = pci_find_cap(phb, bdfn, PCI_CFG_CAP_ID_EXP);
+ if (ecap > 0) {
+ pci_set_cap(pd, PCI_CFG_CAP_ID_EXP, ecap, false);
+ pci_cfg_read16(phb, bdfn, ecap + PCICAP_EXP_CAPABILITY_REG,
+ &capreg);
+ pd->dev_type = GETFIELD(PCICAP_EXP_CAP_TYPE, capreg);
+
+ /*
+ * XXX We observe a problem on some PLX switches where one
+ * of the downstream ports appears as an upstream port, we
+ * fix that up here otherwise, other code will misbehave
+ */
+ if (pd->parent && pd->dev_type == PCIE_TYPE_SWITCH_UPPORT &&
+ pd->parent->dev_type == PCIE_TYPE_SWITCH_UPPORT &&
+ vdid == 0x874810b5) {
+ prerror("PCI: Fixing up bad PLX downstream port !\n");
+ pd->dev_type = PCIE_TYPE_SWITCH_DNPORT;
+ }
+
+ /* XXX Handle ARI */
+ if (pd->dev_type == PCIE_TYPE_SWITCH_DNPORT ||
+ pd->dev_type == PCIE_TYPE_ROOT_PORT)
+ pd->scan_map = 0x1;
+
+ /* Read MPS capability, whose maximal size is 4096 */
+ pci_cfg_read32(phb, bdfn, ecap + PCICAP_EXP_DEVCAP, &val);
+ pd->mps = (128 << GETFIELD(PCICAP_EXP_DEVCAP_MPSS, val));
+ if (pd->mps > 4096)
+ pd->mps = 4096;
+ } else {
+ pd->dev_type = PCIE_TYPE_LEGACY;
+ }
+
+ /* If it's a bridge, sanitize the bus numbers to avoid forwarding
+ *
+ * This will help when walking down those bridges later on
+ */
+ if (pd->is_bridge) {
+ pci_cfg_write8(phb, bdfn, PCI_CFG_PRIMARY_BUS, bdfn >> 8);
+ pci_cfg_write8(phb, bdfn, PCI_CFG_SECONDARY_BUS, 0);
+ pci_cfg_write8(phb, bdfn, PCI_CFG_SUBORDINATE_BUS, 0);
+ }
+
+ /* XXX Need to do some basic setups, such as MPSS, MRS,
+ * RCB, etc...
+ */
+
+ printf("PCI: Device %04x VID:%04x DEV:%04x TYP:%d MF%s BR%s EX%s\n",
+ bdfn, vdid & 0xffff, vdid >> 16, pd->dev_type,
+ pd->is_multifunction ? "+" : "-",
+ pd->is_bridge ? "+" : "-",
+ pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false) ? "+" : "-");
+
+ /*
+ * Call PHB hook
+ */
+ if (phb->ops->device_init)
+ phb->ops->device_init(phb, pd);
+
+ return pd;
+ fail:
+ if (pd)
+ free(pd);
+ return NULL;
+}
+
+/* pci_check_clear_freeze - Probing empty slot will result in an EEH
+ * freeze. Currently we have a single PE mapping
+ * everything (default state of our backend) so
+ * we just check and clear the state of PE#0
+ *
+ * NOTE: We currently only handle simple PE freeze, not PHB fencing
+ * (or rather our backend does)
+ */
+static void pci_check_clear_freeze(struct phb *phb)
+{
+ int64_t rc;
+ uint8_t freeze_state;
+ uint16_t pci_error_type, sev;
+
+ rc = phb->ops->eeh_freeze_status(phb, 0, &freeze_state,
+ &pci_error_type, &sev, NULL);
+ if (rc)
+ return;
+ if (freeze_state == OPAL_EEH_STOPPED_NOT_FROZEN)
+ return;
+ /* We can't handle anything worse than an ER here */
+ if (sev > OPAL_EEH_SEV_NO_ERROR &&
+ sev < OPAL_EEH_SEV_PE_ER) {
+ prerror("PCI: PHB%d fatal probe error !\n", phb->opal_id);
+ return;
+ }
+ phb->ops->eeh_freeze_clear(phb, 0, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+}
+
+/* pci_enable_bridge - Called before scanning a bridge
+ *
+ * Ensures error flags are clean, disable master abort, and
+ * check if the subordinate bus isn't reset, the slot is enabled
+ * on PCIe, etc...
+ */
+static bool pci_enable_bridge(struct phb *phb, struct pci_device *pd)
+{
+ uint16_t bctl;
+ bool was_reset = false;
+ int64_t ecap = 0;
+
+ /* Disable master aborts, clear errors */
+ pci_cfg_read16(phb, pd->bdfn, PCI_CFG_BRCTL, &bctl);
+ bctl &= ~PCI_CFG_BRCTL_MABORT_REPORT;
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_BRCTL, bctl);
+
+ /* PCI-E bridge, check the slot state */
+ if (pd->dev_type == PCIE_TYPE_ROOT_PORT ||
+ pd->dev_type == PCIE_TYPE_SWITCH_DNPORT) {
+ uint16_t slctl, slcap, slsta, lctl;
+
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+
+ /* Read the slot status & check for presence detect */
+ pci_cfg_read16(phb, pd->bdfn, ecap+PCICAP_EXP_SLOTSTAT, &slsta);
+ DBG(" slstat=%04x\n", slsta);
+ if (!(slsta & PCICAP_EXP_SLOTSTAT_PDETECTST)) {
+ printf("PCI: No card in slot\n");
+ return false;
+ }
+
+ /* Read the slot capabilities */
+ pci_cfg_read16(phb, pd->bdfn, ecap+PCICAP_EXP_SLOTCAP, &slcap);
+ DBG(" slcap=%04x\n", slcap);
+ if (!(slcap & PCICAP_EXP_SLOTCAP_PWCTRL))
+ goto power_is_on;
+
+ /* Read the slot control register, check if the slot is off */
+ pci_cfg_read16(phb, pd->bdfn, ecap+PCICAP_EXP_SLOTCTL, &slctl);
+ DBG(" slctl=%04x\n", slctl);
+ if (!(slctl & PCICAP_EXP_SLOTCTL_PWRCTLR))
+ goto power_is_on;
+
+ /* Turn power on
+ *
+ * XXX This is a "command", we should wait for it to complete
+ * etc... but just waiting 2s will do for now
+ */
+ DBG("PCI: Bridge power is off, turning on ...\n");
+ slctl &= ~PCICAP_EXP_SLOTCTL_PWRCTLR;
+ slctl |= SETFIELD(PCICAP_EXP_SLOTCTL_PWRI, 0, PCIE_INDIC_ON);
+ pci_cfg_write16(phb, pd->bdfn, ecap+PCICAP_EXP_SLOTCTL, slctl);
+
+ /* Wait a couple of seconds */
+ time_wait_ms(2000);
+
+ power_is_on:
+ /* Enable link */
+ pci_cfg_read16(phb, pd->bdfn, ecap+PCICAP_EXP_LCTL, &lctl);
+ DBG(" lctl=%04x\n", lctl);
+ lctl &= ~PCICAP_EXP_LCTL_LINK_DIS;
+ pci_cfg_write16(phb, pd->bdfn, ecap+PCICAP_EXP_LCTL, lctl);
+ }
+
+ /* Clear secondary reset */
+ if (bctl & PCI_CFG_BRCTL_SECONDARY_RESET) {
+ printf("PCI: Bridge secondary reset is on, clearing it ...\n");
+ bctl &= ~PCI_CFG_BRCTL_SECONDARY_RESET;
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_BRCTL, bctl);
+ time_wait_ms(1000);
+ was_reset = true;
+ }
+
+ /* PCI-E bridge, wait for link */
+ if (pd->dev_type == PCIE_TYPE_ROOT_PORT ||
+ pd->dev_type == PCIE_TYPE_SWITCH_DNPORT) {
+ uint32_t lcap;
+
+ /* Read link caps */
+ pci_cfg_read32(phb, pd->bdfn, ecap+PCICAP_EXP_LCAP, &lcap);
+
+ /* Did link capability say we got reporting ?
+ *
+ * If yes, wait up to 10s, if not, wait 1s if we didn't already
+ */
+ if (lcap & PCICAP_EXP_LCAP_DL_ACT_REP) {
+ uint32_t retries = 100;
+ uint16_t lstat;
+
+ printf("%016lx: waiting for link... \n", mftb());
+
+ while(retries--) {
+ pci_cfg_read16(phb, pd->bdfn,
+ ecap+PCICAP_EXP_LSTAT, &lstat);
+ if (lstat & PCICAP_EXP_LSTAT_DLLL_ACT)
+ break;
+ time_wait_ms(100);
+ }
+ printf("%016lx: end wait for link...\n", mftb());
+ if (!(lstat & PCICAP_EXP_LSTAT_DLLL_ACT)) {
+ prerror("PCI: Bridge %04x, timeout waiting"
+ " for downstream link\n", pd->bdfn);
+ return false;
+ }
+ /* Need to wait another 100ms before touching
+ * the config space
+ */
+ time_wait_ms(100);
+ } else if (!was_reset)
+ time_wait_ms(1000);
+ }
+
+ /* Clear error status */
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_STAT, 0xffff);
+
+ return true;
+}
+
+/* Clear up bridge resources */
+static void pci_cleanup_bridge(struct phb *phb, struct pci_device *pd)
+{
+ uint16_t cmd;
+
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_IO_BASE_U16, 0xffff);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_IO_BASE, 0xf0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_IO_LIMIT_U16, 0);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_IO_LIMIT, 0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_MEM_BASE, 0xfff0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_MEM_LIMIT, 0);
+ pci_cfg_write32(phb, pd->bdfn, PCI_CFG_PREF_MEM_BASE_U32, 0xffffffff);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_PREF_MEM_BASE, 0xfff0);
+ pci_cfg_write32(phb, pd->bdfn, PCI_CFG_PREF_MEM_LIMIT_U32, 0);
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_PREF_MEM_LIMIT, 0);
+
+ /* Note: This is a bit fishy but since we have closed all the
+ * bridge windows above, it shouldn't be a problem. Basically
+ * we enable Memory, IO and Bus Master on the bridge because
+ * some versions of Linux will fail to do it themselves.
+ */
+ pci_cfg_read16(phb, pd->bdfn, PCI_CFG_CMD, &cmd);
+ cmd |= PCI_CFG_CMD_IO_EN | PCI_CFG_CMD_MEM_EN;
+ cmd |= PCI_CFG_CMD_BUS_MASTER_EN;
+ pci_cfg_write16(phb, pd->bdfn, PCI_CFG_CMD, cmd);
+}
+
+
+/* pci_scan - Perform a recursive scan of the bus at bus_number
+ * populating the list passed as an argument. This also
+ * performs the bus numbering, so it returns the largest
+ * bus number that was assigned.
+ *
+ * Note: Eventually this might want to access some VPD information
+ * in order to know what slots to scan and what not etc..
+ *
+ * XXX NOTE: We might want to enable ARI along the way...
+ *
+ * XXX NOTE: We might also want to setup the PCIe MPS/MRSS properly
+ * here as Linux may or may not do it
+ */
+static uint8_t pci_scan(struct phb *phb, uint8_t bus, uint8_t max_bus,
+ struct list_head *list, struct pci_device *parent,
+ bool scan_downstream)
+{
+ struct pci_device *pd = NULL;
+ uint8_t dev, fn, next_bus, max_sub, save_max;
+ uint32_t scan_map;
+
+ /* Decide what to scan */
+ scan_map = parent ? parent->scan_map : phb->scan_map;
+
+ /* Do scan */
+ for (dev = 0; dev < 32; dev++) {
+ if (!(scan_map & (1ul << dev)))
+ continue;
+
+ /* Scan the device */
+ pd = pci_scan_one(phb, parent, (bus << 8) | (dev << 3));
+ pci_check_clear_freeze(phb);
+ if (!pd)
+ continue;
+
+ /* Get slot info if any */
+ if (platform.pci_get_slot_info)
+ platform.pci_get_slot_info(phb, pd);
+
+ /* Link it up */
+ list_add_tail(list, &pd->link);
+
+ /* XXX Handle ARI */
+ if (!pd->is_multifunction)
+ continue;
+ for (fn = 1; fn < 8; fn++) {
+ pd = pci_scan_one(phb, parent,
+ ((uint16_t)bus << 8) | (dev << 3) | fn);
+ pci_check_clear_freeze(phb);
+ if (pd) {
+ if (platform.pci_get_slot_info)
+ platform.pci_get_slot_info(phb, pd);
+ list_add_tail(list, &pd->link);
+ }
+ }
+ }
+
+ /*
+ * We only scan downstream if instructed to do so by the
+ * caller. Typically we avoid the scan when we know the
+ * link is down already, which happens for the top level
+ * root complex, and avoids a long secondary timeout
+ */
+ if (!scan_downstream)
+ return bus;
+
+ next_bus = bus + 1;
+ max_sub = bus;
+ save_max = max_bus;
+
+ /* Scan down bridges */
+ list_for_each(list, pd, link) {
+ bool use_max, do_scan;
+
+ if (!pd->is_bridge)
+ continue;
+
+ /* We need to figure out a new bus number to start from.
+ *
+ * This can be tricky due to our HW constraints which differ
+ * from bridge to bridge so we are going to let the phb
+ * driver decide what to do. This can return us a maximum
+ * bus number to assign as well
+ *
+ * This function will:
+ *
+ * - Return the bus number to use as secondary for the
+ * bridge or 0 for a failure
+ *
+ * - "max_bus" will be adjusted to represent the max
+ * subordinate that can be associated with the downstream
+ * device
+ *
+ * - "use_max" will be set to true if the returned max_bus
+ * *must* be used as the subordinate bus number of that
+	 *    bridge (when we need to give aligned powers of two
+	 *    on P7IOC). If it is set to false, we just adjust the
+ * subordinate bus number based on what we probed.
+ *
+ */
+ max_bus = save_max;
+ next_bus = phb->ops->choose_bus(phb, pd, next_bus,
+ &max_bus, &use_max);
+
+ /* Configure the bridge with the returned values */
+ if (next_bus <= bus) {
+ printf("PCI: Bridge %04x, out of bus numbers !\n",
+ pd->bdfn);
+ max_bus = next_bus = 0; /* Failure case */
+ }
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SECONDARY_BUS, next_bus);
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SUBORDINATE_BUS, max_bus);
+ if (!next_bus)
+ break;
+
+ printf("PCI: Bridge %04x, bus: %02x..%02x %s scanning...\n",
+ pd->bdfn, next_bus, max_bus, use_max ? "[use max]" : "");
+
+ /* Clear up bridge resources */
+ pci_cleanup_bridge(phb, pd);
+
+ /* Configure the bridge. This will enable power to the slot
+ * if it's currently disabled, lift reset, etc...
+ *
+ * Return false if we know there's nothing behind the bridge
+ */
+ do_scan = pci_enable_bridge(phb, pd);
+
+ /* Perform recursive scan */
+ if (do_scan) {
+ max_sub = pci_scan(phb, next_bus, max_bus,
+ &pd->children, pd, true);
+ } else if (!use_max) {
+ /* XXX Empty bridge... we leave room for hotplug
+ * slots etc.. but we should be smarter at figuring
+ * out if this is actually a hotpluggable one
+ */
+ max_sub = next_bus + 4;
+ if (max_sub > max_bus)
+ max_sub = max_bus;
+ }
+
+ /* Update the max subordinate as described previously */
+ if (use_max)
+ max_sub = max_bus;
+ pci_cfg_write8(phb, pd->bdfn, PCI_CFG_SUBORDINATE_BUS, max_sub);
+ next_bus = max_sub + 1;
+ }
+
+ return max_sub;
+}
+
+static int pci_get_mps(struct phb *phb,
+ struct pci_device *pd, void *userdata)
+{
+ uint32_t *mps = (uint32_t *)userdata;
+
+	/* Only check PCI devices that have an MPS capability */
+ if (phb && pd && pd->mps && *mps > pd->mps)
+ *mps = pd->mps;
+
+ return 0;
+}
+
+static int __pci_configure_mps(struct phb *phb,
+ struct pci_device *pd,
+ void *userdata __unused)
+{
+	uint32_t ecap, mps;
+	uint16_t val;
+
+	if (!phb || !pd)
+		return 0;
+
+	/* If the MPS isn't an acceptable value, bail immediately */
+	mps = phb->mps;
+	if (mps < 128 || mps > 4096)
+		return 1;
+
+	/* A PCIe device always has an MPS capability */
+ if (pd->mps) {
+ ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
+ mps = ilog2(mps) - 7;
+
+ pci_cfg_read16(phb, pd->bdfn, ecap + PCICAP_EXP_DEVCTL, &val);
+ val = SETFIELD(PCICAP_EXP_DEVCTL_MPS, val, mps);
+ pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_DEVCTL, val);
+ }
+
+ return 0;
+}
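+
+/*
+ * MPS encoding note: the Device Control MPS field holds log2(size) - 7,
+ * which is what "ilog2(mps) - 7" computes above. So 128 bytes encodes as
+ * 0, 256 as 1, and 4096 as 5; the inverse, 128 << field, is used when
+ * reading DEVCAP in pci_scan_one().
+ */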
+
+int32_t pci_configure_mps(struct phb *phb, struct pci_device *pd)
+{
+ return __pci_configure_mps(phb, pd, NULL);
+}
+
+/*
+ * Check the slot power state. If power is already on, issue a
+ * fundamental reset. Otherwise, power the slot on first; the
+ * power-on sequence performs the fundamental reset.
+ */
+static int64_t pci_reset_phb(struct phb *phb)
+{
+ const char *desc;
+ int64_t rc;
+
+ rc = phb->ops->power_state(phb);
+ if (rc < 0) {
+ printf("PHB%d: Failed to get power state, rc=%lld\n",
+ phb->opal_id, rc);
+ return rc;
+ }
+
+ if (rc == OPAL_SHPC_POWER_ON) {
+ desc = "fundamental reset";
+ rc = phb->ops->fundamental_reset(phb);
+ } else {
+ desc = "power on";
+ rc = phb->ops->slot_power_on(phb);
+ }
+
+ if (rc < 0) {
+ /* Don't warn if it's just an empty slot */
+ if (rc != OPAL_CLOSED)
+ printf("PHB%d: Failed to %s, rc=%lld\n",
+ phb->opal_id, desc, rc);
+ return rc;
+ }
+
+	/* Wait for the internal state machine to complete */
+ while (rc > 0) {
+ time_wait(rc);
+ rc = phb->ops->poll(phb);
+ }
+ if (rc < 0)
+ printf("PHB%d: Failed to %s, rc=%lld\n",
+ phb->opal_id, desc, rc);
+
+ return rc;
+}
+
+static void pci_init_slot(struct phb *phb)
+{
+ uint32_t mps = 0xffffffff;
+ int64_t rc;
+ bool has_link;
+
+ printf("PHB%d: Init slot\n", phb->opal_id);
+
+ /*
+ * For PCI/PCI-X, we get the slot info and we also
+ * check if the PHB has anything connected to it
+ */
+ if (phb->phb_type < phb_type_pcie_v1) {
+ if (platform.pci_get_slot_info)
+ platform.pci_get_slot_info(phb, NULL);
+ rc = phb->ops->presence_detect(phb);
+ if (rc != OPAL_SHPC_DEV_PRESENT) {
+ printf("PHB%d: Slot empty\n", phb->opal_id);
+ return;
+ }
+ }
+
+ /*
+	 * Power on the PHB; the PHB is reset in a fundamental
+	 * way while powering on. The reset state machine will
+	 * wait for the link to come up.
+ */
+ rc = pci_reset_phb(phb);
+ if (rc && rc != OPAL_CLOSED)
+ return;
+
+ /* It's up, print some things */
+ rc = phb->ops->link_state(phb);
+ if (rc < 0) {
+ printf("PHB%d: Failed to query link state, rc=%lld\n",
+ phb->opal_id, rc);
+ return;
+ }
+ has_link = rc != OPAL_SHPC_LINK_DOWN;
+
+	if (!has_link)
+ printf("PHB%d: Link down\n", phb->opal_id);
+ else if (phb->phb_type >= phb_type_pcie_v1)
+ printf("PHB%d: Link up at x%lld width\n", phb->opal_id, rc);
+
+ printf("PHB%d: Scanning (upstream%s)...\n", phb->opal_id,
+		has_link ? "+downstream" : " only");
+ pci_scan(phb, 0, 0xff, &phb->devices, NULL, has_link);
+
+	/* Configure MPS (Max Payload Size) for the PCIe domain */
+ pci_walk_dev(phb, pci_get_mps, &mps);
+ phb->mps = mps;
+ pci_walk_dev(phb, __pci_configure_mps, NULL);
+}
+
+int64_t pci_register_phb(struct phb *phb)
+{
+ int64_t rc = OPAL_SUCCESS;
+ unsigned int i;
+
+ lock(&pci_lock);
+ for (i = 0; i < PCI_MAX_PHBs; i++)
+ if (!phbs[i])
+ break;
+ if (i >= PCI_MAX_PHBs) {
+ prerror("PHB: Failed to find a free ID slot\n");
+ rc = OPAL_RESOURCE;
+ } else {
+ phbs[i] = phb;
+ phb->opal_id = i;
+ dt_add_property_cells(phb->dt_node, "ibm,opal-phbid",
+ 0, phb->opal_id);
+ printf("PCI: Registered PHB ID %d\n", i);
+ }
+ list_head_init(&phb->devices);
+ unlock(&pci_lock);
+
+ return rc;
+}
+
+int64_t pci_unregister_phb(struct phb *phb)
+{
+ /* XXX We want some kind of RCU or RWlock to make things
+ * like that happen while no OPAL callback is in progress,
+ * that way we avoid taking a lock in each of them.
+ *
+ * Right now we don't unregister so we are fine
+ */
+ lock(&pci_lock);
+ phbs[phb->opal_id] = phb;
+ unlock(&pci_lock);
+
+ return OPAL_SUCCESS;
+}
+
+struct phb *pci_get_phb(uint64_t phb_id)
+{
+ if (phb_id >= PCI_MAX_PHBs)
+ return NULL;
+
+ /* XXX See comment in pci_unregister_phb() about locking etc... */
+ return phbs[phb_id];
+}
+
+static const char *pci_class_name(uint32_t class_code)
+{
+ uint8_t class = class_code >> 16;
+ uint8_t sub = (class_code >> 8) & 0xff;
+ uint8_t pif = class_code & 0xff;
+
+ switch(class) {
+ case 0x00:
+ switch(sub) {
+ case 0x00: return "device";
+ case 0x01: return "vga";
+ }
+ break;
+ case 0x01:
+ switch(sub) {
+ case 0x00: return "scsi";
+ case 0x01: return "ide";
+ case 0x02: return "fdc";
+ case 0x03: return "ipi";
+ case 0x04: return "raid";
+ case 0x05: return "ata";
+ case 0x06: return "sata";
+ case 0x07: return "sas";
+ default: return "mass-storage";
+ }
+ case 0x02:
+ switch(sub) {
+ case 0x00: return "ethernet";
+ case 0x01: return "token-ring";
+ case 0x02: return "fddi";
+ case 0x03: return "atm";
+ case 0x04: return "isdn";
+ case 0x05: return "worldfip";
+ case 0x06: return "picmg";
+ default: return "network";
+ }
+ case 0x03:
+ switch(sub) {
+ case 0x00: return "vga";
+ case 0x01: return "xga";
+ case 0x02: return "3d-controller";
+ default: return "display";
+ }
+ case 0x04:
+ switch(sub) {
+ case 0x00: return "video";
+ case 0x01: return "sound";
+ case 0x02: return "telephony";
+ default: return "multimedia-device";
+ }
+ case 0x05:
+ switch(sub) {
+ case 0x00: return "memory";
+ case 0x01: return "flash";
+ default: return "memory-controller";
+ }
+ case 0x06:
+ switch(sub) {
+ case 0x00: return "host";
+ case 0x01: return "isa";
+ case 0x02: return "eisa";
+ case 0x03: return "mca";
+ case 0x04: return "pci";
+ case 0x05: return "pcmcia";
+ case 0x06: return "nubus";
+ case 0x07: return "cardbus";
+ case 0x08: return "raceway";
+ case 0x09: return "semi-transparent-pci";
+ case 0x0a: return "infiniband";
+ default: return "unknown-bridge";
+ }
+ case 0x07:
+ switch(sub) {
+ case 0x00:
+ switch(pif) {
+ case 0x01: return "16450-serial";
+ case 0x02: return "16550-serial";
+ case 0x03: return "16650-serial";
+ case 0x04: return "16750-serial";
+ case 0x05: return "16850-serial";
+ case 0x06: return "16950-serial";
+ default: return "serial";
+ }
+ case 0x01:
+ switch(pif) {
+ case 0x01: return "bi-directional-parallel";
+ case 0x02: return "ecp-1.x-parallel";
+ case 0x03: return "ieee1284-controller";
+ case 0xfe: return "ieee1284-device";
+ default: return "parallel";
+ }
+ case 0x02: return "multiport-serial";
+ case 0x03:
+ switch(pif) {
+ case 0x01: return "16450-modem";
+ case 0x02: return "16550-modem";
+ case 0x03: return "16650-modem";
+ case 0x04: return "16750-modem";
+ default: return "modem";
+ }
+ case 0x04: return "gpib";
+ case 0x05: return "smart-card";
+ default: return "communication-controller";
+ }
+ case 0x08:
+ switch(sub) {
+ case 0x00:
+ switch(pif) {
+ case 0x01: return "isa-pic";
+ case 0x02: return "eisa-pic";
+ case 0x10: return "io-apic";
+ case 0x20: return "iox-apic";
+ default: return "interrupt-controller";
+ }
+ case 0x01:
+ switch(pif) {
+ case 0x01: return "isa-dma";
+ case 0x02: return "eisa-dma";
+ default: return "dma-controller";
+ }
+ case 0x02:
+ switch(pif) {
+ case 0x01: return "isa-system-timer";
+ case 0x02: return "eisa-system-timer";
+ default: return "timer";
+ }
+ case 0x03:
+ switch(pif) {
+ case 0x01: return "isa-rtc";
+ default: return "rtc";
+ }
+ case 0x04: return "hotplug-controller";
+ case 0x05: return "sd-host-controller";
+ default: return "system-peripheral";
+ }
+ case 0x09:
+ switch(sub) {
+ case 0x00: return "keyboard";
+ case 0x01: return "pen";
+ case 0x02: return "mouse";
+ case 0x03: return "scanner";
+ case 0x04: return "gameport";
+ default: return "input-controller";
+ }
+ case 0x0a:
+ switch(sub) {
+ case 0x00: return "clock";
+ default: return "docking-station";
+ }
+ case 0x0b:
+ switch(sub) {
+ case 0x00: return "386";
+ case 0x01: return "486";
+ case 0x02: return "pentium";
+ case 0x10: return "alpha";
+ case 0x20: return "powerpc";
+ case 0x30: return "mips";
+ case 0x40: return "co-processor";
+ default: return "cpu";
+ }
+ case 0x0c:
+ switch(sub) {
+ case 0x00: return "firewire";
+ case 0x01: return "access-bus";
+ case 0x02: return "ssa";
+ case 0x03:
+ switch(pif) {
+ case 0x00: return "usb-uhci";
+ case 0x10: return "usb-ohci";
+ case 0x20: return "usb-ehci";
+ case 0x30: return "usb-xhci";
+ case 0xfe: return "usb-device";
+ default: return "usb";
+ }
+ case 0x04: return "fibre-channel";
+ case 0x05: return "smb";
+ case 0x06: return "infiniband";
+ case 0x07:
+ switch(pif) {
+ case 0x00: return "impi-smic";
+ case 0x01: return "impi-kbrd";
+ case 0x02: return "impi-bltr";
+ default: return "impi";
+ }
+ case 0x08: return "secos";
+ case 0x09: return "canbus";
+ default: return "serial-bus";
+ }
+ case 0x0d:
+ switch(sub) {
+ case 0x00: return "irda";
+ case 0x01: return "consumer-ir";
+ case 0x10: return "rf-controller";
+ case 0x11: return "bluetooth";
+ case 0x12: return "broadband";
+ case 0x20: return "enet-802.11a";
+ case 0x21: return "enet-802.11b";
+ default: return "wireless-controller";
+ }
+ case 0x0e: return "intelligent-controller";
+ case 0x0f:
+ switch(sub) {
+ case 0x01: return "satellite-tv";
+ case 0x02: return "satellite-audio";
+ case 0x03: return "satellite-voice";
+ case 0x04: return "satellite-data";
+ default: return "satellite-device";
+ }
+ case 0x10:
+ switch(sub) {
+ case 0x00: return "network-encryption";
+ case 0x01: return "entertainment-encryption";
+ default: return "encryption";
+ }
+	case 0x11:
+ switch(sub) {
+ case 0x00: return "dpio";
+ case 0x01: return "counter";
+ case 0x10: return "measurement";
+ case 0x20: return "management-card";
+ default: return "data-processing";
+ }
+ }
+ return "device";
+}
+
+void pci_std_swizzle_irq_map(struct dt_node *np,
+ struct pci_device *pd,
+ struct pci_lsi_state *lstate,
+ uint8_t swizzle)
+{
+ uint32_t *map, *p;
+ int dev, irq;
+ size_t map_size;
+
+ /* Size in bytes of a target interrupt */
+ size_t isize = lstate->int_size * sizeof(uint32_t);
+
+ /* Calculate the size of a map entry:
+ *
+ * 3 cells : PCI Address
+ * 1 cell : PCI IRQ
+ * 1 cell : PIC phandle
+ * n cells : PIC irq (n = lstate->int_size)
+ *
+ * Assumption: PIC address is 0-size
+ */
+ int esize = 3 + 1 + 1 + lstate->int_size;
+
+ /* Number of map "device" entries
+ *
+ * A PCI Express root or downstream port needs only one
+ * entry for device 0. Anything else will get a full map
+ * for all possible 32 child device numbers
+ *
+ * If we have been passed a host bridge (pd == NULL) we also
+ * do a simple per-pin map
+ */
+ int edevcount;
+
+ if (!pd || (pd->dev_type == PCIE_TYPE_ROOT_PORT ||
+ pd->dev_type == PCIE_TYPE_SWITCH_DNPORT)) {
+ edevcount = 1;
+ dt_add_property_cells(np, "interrupt-map-mask", 0, 0, 0, 7);
+ } else {
+ edevcount = 32;
+ dt_add_property_cells(np, "interrupt-map-mask",
+ 0xf800, 0, 0, 7);
+ }
+ map_size = esize * edevcount * 4 * sizeof(uint32_t);
+ map = p = zalloc(map_size);
+
+ for (dev = 0; dev < edevcount; dev++) {
+ for (irq = 0; irq < 4; irq++) {
+ /* Calculate pin */
+ uint32_t new_irq = (irq + dev + swizzle) % 4;
+
+ /* PCI address portion */
+ *(p++) = dev << (8 + 3);
+ *(p++) = 0;
+ *(p++) = 0;
+
+ /* PCI interrupt portion */
+ *(p++) = irq + 1;
+
+ /* Parent phandle */
+ *(p++) = lstate->int_parent[new_irq];
+
+ /* Parent desc */
+ memcpy(p, lstate->int_val[new_irq], isize);
+ p += lstate->int_size;
+ }
+ }
+
+ dt_add_property(np, "interrupt-map", map, map_size);
+ free(map);
+}
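+
+/*
+ * Worked example of the swizzle above (values are illustrative): with
+ * swizzle = 0, INTA (irq 0) of device 2 is routed to parent input
+ * (0 + 2 + 0) % 4 = 2, matching the standard PCI-to-PCI bridge INTx
+ * rotation; a non-zero swizzle just shifts that rotation.
+ */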
+
+static void pci_add_slot_properties(struct phb *phb, struct pci_slot_info *info,
+ struct dt_node *np)
+{
+ char loc_code[LOC_CODE_SIZE];
+ size_t base_loc_code_len, slot_label_len;
+
+ if (phb->base_loc_code) {
+ base_loc_code_len = strlen(phb->base_loc_code);
+ slot_label_len = strlen(info->label);
+ if ((base_loc_code_len + slot_label_len +1) < LOC_CODE_SIZE) {
+ strcpy(loc_code, phb->base_loc_code);
+ strcat(loc_code, "-");
+ strcat(loc_code, info->label);
+ dt_add_property(np, "ibm,slot-location-code",
+ loc_code, strlen(loc_code) + 1);
+ } else
+ prerror("PCI: Loc Code too long - %zu + %zu + 1\n",
+ base_loc_code_len, slot_label_len);
+ } else
+ DBG("PCI: Base Loc code not found...\n");
+
+ /* Add other slot information */
+ dt_add_property_cells(np, "ibm,slot-pluggable", info->pluggable);
+ dt_add_property_cells(np, "ibm,slot-power-ctl", info->power_ctl);
+ dt_add_property_cells(np, "ibm,slot-wired-lanes", info->wired_lanes);
+ /*dt_add_property(np, "ibm,slot-bus-clock", &pd->slot_info->bus_clock, sizeof(uint8_t));*/
+ dt_add_property_cells(np, "ibm,slot-connector-type", info->connector_type);
+ dt_add_property_cells(np, "ibm,slot-card-desc", info->card_desc);
+ dt_add_property_cells(np, "ibm,slot-card-mech", info->card_mech);
+ dt_add_property_cells(np, "ibm,slot-pwr-led-ctl", info->pwr_led_ctl);
+ dt_add_property_cells(np, "ibm,slot-attn-led-ctl", info->attn_led_ctl);
+ dt_add_property_string(np, "ibm,slot-label", info->label);
+}
+
+static void pci_add_loc_code(struct dt_node *np)
+{
+ struct dt_node *p = np->parent;
+ const char *blcode = NULL;
+
+ /* Look for a parent with a slot-location-code */
+ while (p && !blcode) {
+ blcode = dt_prop_get_def(p, "ibm,slot-location-code", NULL);
+ p = p->parent;
+ }
+ if (!blcode)
+ return;
+ dt_add_property_string(np, "ibm,loc-code", blcode);
+}
+
+static void pci_print_summary_line(struct phb *phb, struct pci_device *pd,
+ struct dt_node *np, u32 rev_class,
+ const char *cname)
+{
+ const char *label, *dtype, *s;
+ u32 vdid;
+#define MAX_SLOTSTR 32
+ char slotstr[MAX_SLOTSTR + 1] = { 0, };
+
+ pci_cfg_read32(phb, pd->bdfn, 0, &vdid);
+
+ /* If it's a slot, it has a slot-label */
+ label = dt_prop_get_def(np, "ibm,slot-label", NULL);
+ if (label) {
+ u32 lanes = dt_prop_get_u32_def(np, "ibm,slot-wired-lanes", 0);
+ static const char *lanestrs[] = {
+ "", " x1", " x2", " x4", " x8", "x16", "x32", "32b", "64b"
+ };
+ const char *lstr = lanes > PCI_SLOT_WIRED_LANES_PCIX_64 ? "" : lanestrs[lanes];
+ snprintf(slotstr, MAX_SLOTSTR, "SLOT=%3s %s", label, lstr);
+ /* XXX Add more slot info */
+ } else {
+ /*
+		 * No label: ignore downstream switch legs and the root
+		 * complex; those would essentially be non-populated.
+ */
+ if (pd->dev_type != PCIE_TYPE_ROOT_PORT &&
+ pd->dev_type != PCIE_TYPE_SWITCH_DNPORT) {
+ /* It's a mere device, get loc code */
+ s = dt_prop_get_def(np, "ibm,loc-code", NULL);
+ if (s)
+ snprintf(slotstr, MAX_SLOTSTR, "LOC_CODE=%s", s);
+ }
+ }
+
+ if (pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false)) {
+ static const char *pcie_types[] = {
+ "EP ", "LGCY", "????", "????", "ROOT", "SWUP", "SWDN",
+ "ETOX", "XTOE", "RINT", "EVTC" };
+ if (pd->dev_type >= ARRAY_SIZE(pcie_types))
+ dtype = "????";
+ else
+ dtype = pcie_types[pd->dev_type];
+ } else
+ dtype = pd->is_bridge ? "PCIB" : "PCID";
+
+ if (pd->is_bridge) {
+ uint8_t sec_bus, sub_bus;
+ pci_cfg_read8(phb, pd->bdfn, PCI_CFG_SECONDARY_BUS, &sec_bus);
+ pci_cfg_read8(phb, pd->bdfn, PCI_CFG_SUBORDINATE_BUS, &sub_bus);
+ printf(" %04x:%02x:%02x.%x [%s] %04x %04x R:%02x C:%06x B:%02x..%02x %s\n",
+ phb->opal_id, pd->bdfn >> 8, (pd->bdfn >> 3) & 0x1f,
+ pd->bdfn & 0x7, dtype, vdid & 0xffff, vdid >> 16,
+ rev_class & 0xff, rev_class >> 8, sec_bus, sub_bus, slotstr);
+ } else
+ printf(" %04x:%02x:%02x.%x [%s] %04x %04x R:%02x C:%06x (%14s) %s\n",
+ phb->opal_id, pd->bdfn >> 8, (pd->bdfn >> 3) & 0x1f,
+ pd->bdfn & 0x7, dtype, vdid & 0xffff, vdid >> 16,
+ rev_class & 0xff, rev_class >> 8, cname, slotstr);
+}
+
+
+static void pci_add_one_node(struct phb *phb, struct pci_device *pd,
+ struct dt_node *parent_node,
+ struct pci_lsi_state *lstate, uint8_t swizzle)
+{
+ struct pci_device *child;
+ struct dt_node *np;
+ const char *cname;
+#define MAX_NAME 256
+ char name[MAX_NAME];
+ char compat[MAX_NAME];
+ uint32_t rev_class, vdid;
+ uint32_t reg[5];
+ uint8_t intpin;
+
+ pci_cfg_read32(phb, pd->bdfn, 0, &vdid);
+ pci_cfg_read32(phb, pd->bdfn, PCI_CFG_REV_ID, &rev_class);
+ pci_cfg_read8(phb, pd->bdfn, PCI_CFG_INT_PIN, &intpin);
+
+ /*
+ * Quirk for IBM bridge bogus class on PCIe root complex.
+ * Without it, the PCI DN won't be created for its downstream
+ * devices in Linux.
+ */
+ if (pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false) &&
+ parent_node == phb->dt_node)
+ rev_class = (rev_class & 0xff) | 0x6040000;
+ cname = pci_class_name(rev_class >> 8);
+
+ if (pd->bdfn & 0x7)
+ snprintf(name, MAX_NAME - 1, "%s@%x,%x",
+ cname, (pd->bdfn >> 3) & 0x1f, pd->bdfn & 0x7);
+ else
+ snprintf(name, MAX_NAME - 1, "%s@%x",
+ cname, (pd->bdfn >> 3) & 0x1f);
+ np = dt_new(parent_node, name);
+
+ /* XXX FIXME: make proper "compatible" properties */
+ if (pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false)) {
+ snprintf(compat, MAX_NAME, "pciex%x,%x",
+ vdid & 0xffff, vdid >> 16);
+ dt_add_property_cells(np, "ibm,pci-config-space-type", 1);
+ } else {
+ snprintf(compat, MAX_NAME, "pci%x,%x",
+ vdid & 0xffff, vdid >> 16);
+ dt_add_property_cells(np, "ibm,pci-config-space-type", 0);
+ }
+ dt_add_property_cells(np, "class-code", rev_class >> 8);
+ dt_add_property_cells(np, "revision-id", rev_class & 0xff);
+ dt_add_property_cells(np, "vendor-id", vdid & 0xffff);
+ dt_add_property_cells(np, "device-id", vdid >> 16);
+ if (intpin)
+ dt_add_property_cells(np, "interrupts", intpin);
+
+ /* XXX FIXME: Add a few missing ones such as
+ *
+ * - devsel-speed (!express)
+ * - max-latency
+ * - min-grant
+ * - subsystem-id
+ * - subsystem-vendor-id
+ * - ...
+ */
+
+ /* Add slot properties if needed */
+ if (pd->slot_info)
+ pci_add_slot_properties(phb, pd->slot_info, np);
+
+ /* Make up location code */
+ pci_add_loc_code(np);
+
+ /* XXX FIXME: We don't look for BARs, we only put the config space
+ * entry in the "reg" property. That's enough for Linux and we might
+ * even want to make this legit in future ePAPR
+ */
+ reg[0] = pd->bdfn << 8;
+ reg[1] = reg[2] = reg[3] = reg[4] = 0;
+ dt_add_property(np, "reg", reg, sizeof(reg));
+
+ /* Print summary info about the device */
+ pci_print_summary_line(phb, pd, np, rev_class, cname);
+
+ if (!pd->is_bridge)
+ return;
+
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+
+ /* We want "device_type" for bridges */
+ if (pci_has_cap(pd, PCI_CFG_CAP_ID_EXP, false))
+ dt_add_property_string(np, "device_type", "pciex");
+ else
+ dt_add_property_string(np, "device_type", "pci");
+
+ /* Update the current interrupt swizzling level based on our own
+ * device number
+ */
+ swizzle = (swizzle + ((pd->bdfn >> 3) & 0x1f)) & 3;
+
+ /* We generate a standard-swizzling interrupt map. This is pretty
+ * big, we *could* try to be smarter for things that aren't hotplug
+ * slots at least and only populate those entries for which there's
+	 * an actual child (especially on PCI Express), but for now that
+ * will do
+ */
+ pci_std_swizzle_irq_map(np, pd, lstate, swizzle);
+
+	/* We do an empty ranges property for now, we haven't set up any
+	 * bridge windows; the kernel will deal with that
+ *
+ * XXX The kernel should probably fix that up
+ */
+ dt_add_property(np, "ranges", NULL, 0);
+
+ list_for_each(&pd->children, child, link)
+ pci_add_one_node(phb, child, np, lstate, swizzle);
+}
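For reference, the node name and "reg" cells generated above all come from the 16-bit bdfn. A small self-contained sketch (not part of the patch) decoding the same bit fields used by the snprintf and reg[0] lines:

#include <stdio.h>
#include <stdint.h>

static void decode_bdfn(uint16_t bdfn)
{
	unsigned int bus = bdfn >> 8;		/* pd->bdfn >> 8 */
	unsigned int dev = (bdfn >> 3) & 0x1f;	/* (pd->bdfn >> 3) & 0x1f */
	unsigned int fn  = bdfn & 0x7;		/* pd->bdfn & 0x7 */

	printf("bdfn 0x%04x -> %02x:%02x.%x, reg[0] = 0x%08x\n",
	       bdfn, bus, dev, fn, (unsigned int)bdfn << 8);
}

int main(void)
{
	decode_bdfn(0x0000);	/* 00:00.0 -> node "xxx@0"   */
	decode_bdfn(0x0159);	/* 01:0b.1 -> node "xxx@b,1" */
	return 0;
}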
+
+static void pci_add_nodes(struct phb *phb)
+{
+ struct pci_lsi_state *lstate = &phb->lstate;
+ struct pci_device *pd;
+
+ /* If the PHB has its own slot info, add them */
+ if (phb->slot_info)
+		pci_add_slot_properties(phb, phb->slot_info, phb->dt_node);
+
+ /* Add all child devices */
+ list_for_each(&phb->devices, pd, link)
+ pci_add_one_node(phb, pd, phb->dt_node, lstate, 0);
+}
+
+static void __pci_reset(struct list_head *list)
+{
+ struct pci_device *pd;
+
+ while ((pd = list_pop(list, struct pci_device, link)) != NULL) {
+ __pci_reset(&pd->children);
+ free(pd);
+ }
+}
+
+void pci_reset(void)
+{
+ unsigned int i;
+
+ printf("PCI: Clearing all devices...\n");
+
+ lock(&pci_lock);
+
+ /* XXX Do those in parallel (at least the power up
+ * state machine could be done in parallel)
+ */
+ for (i = 0; i < PCI_MAX_PHBs; i++) {
+ if (!phbs[i])
+ continue;
+ __pci_reset(&phbs[i]->devices);
+ }
+ unlock(&pci_lock);
+}
+
+void pci_init_slots(void)
+{
+ unsigned int i;
+
+ printf("PCI: Probing PHB slots...\n");
+
+ lock(&pci_lock);
+
+ /* XXX Do those in parallel (at least the power up
+ * state machine could be done in parallel)
+ */
+ for (i = 0; i < PCI_MAX_PHBs; i++) {
+ if (!phbs[i])
+ continue;
+ pci_init_slot(phbs[i]);
+ }
+
+ if (platform.pci_probe_complete)
+ platform.pci_probe_complete();
+
+ printf("PCI: Summary\n");
+ for (i = 0; i < PCI_MAX_PHBs; i++) {
+ if (!phbs[i])
+ continue;
+ pci_add_nodes(phbs[i]);
+ }
+ unlock(&pci_lock);
+}
+
+static struct pci_device *__pci_walk_dev(struct phb *phb,
+ struct list_head *l,
+ int (*cb)(struct phb *,
+ struct pci_device *,
+ void *),
+ void *userdata)
+{
+ struct pci_device *pd, *child;
+
+ if (list_empty(l))
+ return NULL;
+
+ list_for_each(l, pd, link) {
+ if (cb && cb(phb, pd, userdata))
+ return pd;
+
+ child = __pci_walk_dev(phb, &pd->children, cb, userdata);
+ if (child)
+ return child;
+ }
+
+ return NULL;
+}
+
+struct pci_device *pci_walk_dev(struct phb *phb,
+ int (*cb)(struct phb *,
+ struct pci_device *,
+ void *),
+ void *userdata)
+{
+ return __pci_walk_dev(phb, &phb->devices, cb, userdata);
+}
+
+static int __pci_find_dev(struct phb *phb,
+ struct pci_device *pd, void *userdata)
+{
+ uint16_t bdfn = *((uint16_t *)userdata);
+
+ if (!phb || !pd)
+ return 0;
+
+ if (pd->bdfn == bdfn)
+ return 1;
+
+ return 0;
+}
+
+struct pci_device *pci_find_dev(struct phb *phb, uint16_t bdfn)
+{
+ return pci_walk_dev(phb, __pci_find_dev, &bdfn);
+}
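As a usage note, pci_walk_dev() takes a per-device callback and stops as soon as the callback returns non-zero, exactly as __pci_find_dev() relies on above. A hypothetical callback that counts bridges under a PHB, sketched against the signatures in this file and meant to live alongside skiboot code (count_bridge_cb and count_bridges are illustrative names only):

static int count_bridge_cb(struct phb *phb, struct pci_device *pd,
			   void *userdata)
{
	unsigned int *count = userdata;

	(void)phb;
	if (pd->is_bridge)
		(*count)++;
	return 0;	/* 0 means "keep walking" */
}

static unsigned int count_bridges(struct phb *phb)
{
	unsigned int count = 0;

	pci_walk_dev(phb, count_bridge_cb, &count);
	return count;
}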
diff --git a/core/platform.c b/core/platform.c
new file mode 100644
index 0000000..e54b334
--- /dev/null
+++ b/core/platform.c
@@ -0,0 +1,78 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <skiboot.h>
+#include <opal.h>
+#include <console.h>
+
+/*
+ * Various wrappers for platform functions
+ */
+static int64_t opal_cec_power_down(uint64_t request)
+{
+ printf("OPAL: Shutdown request type 0x%llx...\n", request);
+
+ if (platform.cec_power_down)
+ return platform.cec_power_down(request);
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_CEC_POWER_DOWN, opal_cec_power_down, 1);
+
+static int64_t opal_cec_reboot(void)
+{
+ printf("OPAL: Reboot request...\n");
+
+#ifdef ENABLE_FAST_RESET
+ /* Try a fast reset first */
+ fast_reset();
+#endif
+ if (platform.cec_reboot)
+ return platform.cec_reboot();
+
+ return OPAL_SUCCESS;
+}
+opal_call(OPAL_CEC_REBOOT, opal_cec_reboot, 0);
+
+static void generic_platform_init(void)
+{
+	/* Do we want to unconditionally enable it? */
+ if (dummy_console_enabled())
+ dummy_console_add_nodes();
+}
+
+static struct platform generic_platform = {
+ .name = "generic",
+ .init = generic_platform_init,
+};
+
+void probe_platform(void)
+{
+ struct platform *platforms = &__platforms_start;
+ unsigned int i;
+
+ platform = generic_platform;
+
+ for (i = 0; &platforms[i] < &__platforms_end; i++) {
+ if (platforms[i].probe && platforms[i].probe()) {
+ platform = platforms[i];
+ break;
+ }
+ }
+
+ printf("PLAT: Detected %s platform\n", platform.name);
+}
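For orientation, probe_platform() walks the array of struct platform entries the linker lays out between __platforms_start and __platforms_end and keeps the first one whose probe() returns true. A hypothetical board entry is sketched below; the field names match those used in this file, the probe()/cec_reboot() signatures are assumed from their call sites, and the linker-section registration itself is outside the sketch:

/* Illustrative only -- "myboard" is not a real platform. */
static bool myboard_probe(void)
{
	/* A real probe would key off the device tree, e.g. a
	 * compatible string; this sketch never matches.
	 */
	return false;
}

static int64_t myboard_cec_reboot(void)
{
	return OPAL_SUCCESS;
}

static struct platform myboard_platform = {
	.name		= "myboard",
	.probe		= myboard_probe,
	.cec_reboot	= myboard_cec_reboot,
};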
diff --git a/core/relocate.c b/core/relocate.c
new file mode 100644
index 0000000..f6bda37
--- /dev/null
+++ b/core/relocate.c
@@ -0,0 +1,65 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdbool.h>
+#include <elf.h>
+
+/* WARNING: This code is used to self-relocate, it cannot have any
+ * global reference nor TOC reference. It's also called before BSS
+ * is cleared.
+ */
+
+/* Called from head.S, thus no header. */
+int relocate(uint64_t offset, struct elf64_dyn *dyn, struct elf64_rela *rela);
+
+/* Note: This code is simplified according to the assumptions
+ * that our link address is 0 and we are running at the
+ * target address already.
+ */
+int relocate(uint64_t offset, struct elf64_dyn *dyn, struct elf64_rela *rela)
+{
+ uint64_t dt_rela = 0;
+ uint64_t dt_relacount = 0;
+ unsigned int i;
+
+ /* Look for relocation table */
+ for (; dyn->d_tag != DT_NULL; dyn++) {
+ if (dyn->d_tag == DT_RELA)
+ dt_rela = dyn->d_val;
+ else if (dyn->d_tag == DT_RELACOUNT)
+ dt_relacount = dyn->d_val;
+ }
+
+ /* If we miss either rela or relacount, bail */
+ if (!dt_rela || !dt_relacount)
+ return false;
+
+ /* Check if the offset is consistent */
+ if ((offset + dt_rela) != (uint64_t)rela)
+ return false;
+
+ /* Perform relocations */
+ for (i = 0; i < dt_relacount; i++, rela++) {
+ uint64_t *t;
+
+ if (ELF64_R_TYPE(rela->r_info) != R_PPC64_RELATIVE)
+ return false;
+ t = (uint64_t *)(rela->r_offset + offset);
+ *t = rela->r_addend + offset;
+ }
+
+ return true;
+}
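Each R_PPC64_RELATIVE entry handled above simply stores "load offset + addend" at "load offset + r_offset". A self-contained sketch (not part of the patch; fake_rela is an illustrative stand-in for struct elf64_rela) applying one such entry to a local buffer:

#include <assert.h>
#include <stdint.h>

struct fake_rela {
	uint64_t r_offset;	/* patch location, relative to link address 0 */
	uint64_t r_info;	/* would encode R_PPC64_RELATIVE */
	int64_t  r_addend;	/* link-time value of the pointer */
};

int main(void)
{
	uint64_t image[4] = { 0, 0, 0, 0 };
	uint64_t offset = (uint64_t)(uintptr_t)image;	/* pretend load address */
	struct fake_rela rela = {
		.r_offset = 2 * sizeof(uint64_t),
		.r_addend = 0x100,
	};

	/* Same arithmetic as the loop in relocate() */
	uint64_t *t = (uint64_t *)(uintptr_t)(rela.r_offset + offset);
	*t = rela.r_addend + offset;

	assert(image[2] == offset + 0x100);
	return 0;
}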
diff --git a/core/test/Makefile.check b/core/test/Makefile.check
new file mode 100644
index 0000000..37dac46
--- /dev/null
+++ b/core/test/Makefile.check
@@ -0,0 +1,29 @@
+# -*-Makefile-*-
+CORE_TEST := core/test/run-device core/test/run-mem_region core/test/run-malloc core/test/run-malloc-speed core/test/run-mem_region_init core/test/run-mem_region_release_unused core/test/run-mem_region_release_unused_noalloc core/test/run-trace core/test/run-msg
+
+check: $(CORE_TEST:%=%-check)
+
+$(CORE_TEST:%=%-check) : %-check: %
+ $(VALGRIND) $<
+
+core/test/stubs.o: core/test/stubs.c
+ $(HOSTCC) $(HOSTCFLAGS) -g -c -o $@ $<
+
+$(CORE_TEST) : core/test/stubs.o
+
+$(CORE_TEST) : % : %.c
+ $(HOSTCC) $(HOSTCFLAGS) -O0 -g -I include -I . -I libfdt -o $@ $< core/test/stubs.o
+
+$(CORE_TEST): % : %.d
+
+core/test/stubs.o: core/test/stubs.d
+
+core/test/%.d: core/test/%.c
+ $(HOSTCC) $(HOSTCFLAGS) -I include -I . -I libfdt -M $< > $@
+
+-include core/test/*.d
+
+clean: core-test-clean
+
+core-test-clean:
+ $(RM) -f core/test/*.[od] $(CORE_TEST)
diff --git a/core/test/run-device.c b/core/test/run-device.c
new file mode 100644
index 0000000..fa9e951
--- /dev/null
+++ b/core/test/run-device.c
@@ -0,0 +1,118 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+
+/* Override this for testing. */
+#define is_rodata(p) fake_is_rodata(p)
+
+char __rodata_start[16];
+#define __rodata_end (__rodata_start + sizeof(__rodata_start))
+
+static inline bool fake_is_rodata(const void *p)
+{
+ return ((char *)p >= __rodata_start && (char *)p < __rodata_end);
+}
+
+#define zalloc(bytes) calloc((bytes), 1)
+
+#include "../device.c"
+#include "../../ccan/list/list.c" /* For list_check */
+#include <assert.h>
+
+int main(void)
+{
+ struct dt_node *root, *c1, *c2, *gc1, *gc2, *gc3, *ggc1, *i;
+ const struct dt_property *p;
+ struct dt_property *p2;
+ unsigned int n;
+
+ root = dt_new_root("root");
+ assert(!list_top(&root->properties, struct dt_property, list));
+ c1 = dt_new(root, "c1");
+ assert(!list_top(&c1->properties, struct dt_property, list));
+ c2 = dt_new(root, "c2");
+ assert(!list_top(&c2->properties, struct dt_property, list));
+ gc1 = dt_new(c1, "gc1");
+ assert(!list_top(&gc1->properties, struct dt_property, list));
+ gc2 = dt_new(c1, "gc2");
+ assert(!list_top(&gc2->properties, struct dt_property, list));
+ gc3 = dt_new(c1, "gc3");
+ assert(!list_top(&gc3->properties, struct dt_property, list));
+ ggc1 = dt_new(gc1, "ggc1");
+ assert(!list_top(&ggc1->properties, struct dt_property, list));
+
+ for (n = 0, i = dt_first(root); i; i = dt_next(root, i), n++) {
+ assert(!list_top(&i->properties, struct dt_property, list));
+ dt_add_property_cells(i, "visited", 1);
+ }
+ assert(n == 6);
+
+ for (n = 0, i = dt_first(root); i; i = dt_next(root, i), n++) {
+ p = list_top(&i->properties, struct dt_property, list);
+ assert(strcmp(p->name, "visited") == 0);
+ assert(p->len == sizeof(u32));
+ assert(fdt32_to_cpu(*(u32 *)p->prop) == 1);
+ }
+ assert(n == 6);
+
+ dt_add_property_cells(c1, "some-property", 1, 2, 3);
+ p = dt_find_property(c1, "some-property");
+ assert(p);
+ assert(strcmp(p->name, "some-property") == 0);
+ assert(p->len == sizeof(u32) * 3);
+ assert(fdt32_to_cpu(*(u32 *)p->prop) == 1);
+ assert(fdt32_to_cpu(*((u32 *)p->prop + 1)) == 2);
+ assert(fdt32_to_cpu(*((u32 *)p->prop + 2)) == 3);
+
+ /* Test freeing a single node */
+ assert(!list_empty(&gc1->children));
+ dt_free(ggc1);
+ assert(list_empty(&gc1->children));
+
+ /* Test rodata logic. */
+ assert(!is_rodata("hello"));
+ assert(is_rodata(__rodata_start));
+ strcpy(__rodata_start, "name");
+ ggc1 = dt_new(root, __rodata_start);
+ assert(ggc1->name == __rodata_start);
+
+ /* Test string node. */
+ dt_add_property_string(ggc1, "somestring", "someval");
+ assert(dt_has_node_property(ggc1, "somestring", "someval"));
+ assert(!dt_has_node_property(ggc1, "somestrin", "someval"));
+ assert(!dt_has_node_property(ggc1, "somestring", "someva"));
+ assert(!dt_has_node_property(ggc1, "somestring", "somevale"));
+
+ /* Test resizing property. */
+ p = p2 = __dt_find_property(c1, "some-property");
+ assert(p);
+ n = p2->len;
+ while (p2 == p) {
+ n *= 2;
+ dt_resize_property(&p2, n);
+ }
+
+ assert(dt_find_property(c1, "some-property") == p2);
+ list_check(&c1->properties, "properties after resizing");
+
+ dt_del_property(c1, p2);
+ list_check(&c1->properties, "properties after delete");
+
+ /* No leaks for valgrind! */
+ dt_free(root);
+ return 0;
+}
diff --git a/core/test/run-malloc-speed.c b/core/test/run-malloc-speed.c
new file mode 100644
index 0000000..edc7589
--- /dev/null
+++ b/core/test/run-malloc-speed.c
@@ -0,0 +1,94 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+/* Don't include this, it's PPC-specific */
+#define __CPU_H
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+
+#include <stdlib.h>
+
+/* Use these before we undefine them below. */
+static inline void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static inline void real_free(void *p)
+{
+ return free(p);
+}
+
+#include <skiboot.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../malloc.c"
+#include "../mem_region.c"
+#include "../device.c"
+
+#undef malloc
+#undef free
+#undef realloc
+
+#include <assert.h>
+#include <stdio.h>
+
+char __rodata_start[1], __rodata_end[1];
+struct dt_node *dt_root;
+
+void lock(struct lock *l)
+{
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+#define TEST_HEAP_ORDER 27
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+#define NUM_ALLOCS 4096
+
+int main(void)
+{
+ uint64_t i, len;
+ void *p[NUM_ALLOCS];
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)real_malloc(skiboot_heap.len);
+
+ len = skiboot_heap.len / NUM_ALLOCS - sizeof(struct alloc_hdr);
+ for (i = 0; i < NUM_ALLOCS; i++) {
+ p[i] = __malloc(len, __location__);
+ assert(p[i] > region_start(&skiboot_heap));
+ assert(p[i] + len <= region_start(&skiboot_heap)
+ + skiboot_heap.len);
+ }
+ assert(mem_check(&skiboot_heap));
+ assert(mem_region_lock.lock_val == 0);
+ free(region_start(&skiboot_heap));
+ return 0;
+}
diff --git a/core/test/run-malloc.c b/core/test/run-malloc.c
new file mode 100644
index 0000000..226ce75
--- /dev/null
+++ b/core/test/run-malloc.c
@@ -0,0 +1,144 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+
+/* Don't include this, it's PPC-specific */
+#define __CPU_H
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+
+#include <skiboot.h>
+
+#define is_rodata(p) true
+
+#include "../mem_region.c"
+#include "../malloc.c"
+#include "../device.c"
+
+#include "mem_region-malloc.h"
+
+#define TEST_HEAP_ORDER 12
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+struct dt_node *dt_root;
+
+void lock(struct lock *l)
+{
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+static bool heap_empty(void)
+{
+ const struct alloc_hdr *h = region_start(&skiboot_heap);
+ return h->num_longs == skiboot_heap.len / sizeof(long);
+}
+
+int main(void)
+{
+ char test_heap[TEST_HEAP_SIZE], *p, *p2, *p3, *p4;
+ size_t i;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)test_heap;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+
+ /* Allocations of various sizes. */
+ for (i = 0; i < TEST_HEAP_ORDER; i++) {
+ p = malloc(1ULL << i);
+ assert(p);
+ assert(p > (char *)test_heap);
+ assert(p + (1ULL << i) <= (char *)test_heap + TEST_HEAP_SIZE);
+ assert(!mem_region_lock.lock_val);
+ free(p);
+ assert(!mem_region_lock.lock_val);
+ assert(heap_empty());
+ }
+
+ /* Realloc as malloc. */
+ mem_region_lock.lock_val = 0;
+ p = realloc(NULL, 100);
+ assert(p);
+ assert(!mem_region_lock.lock_val);
+
+ /* Realloc as free. */
+ p = realloc(p, 0);
+ assert(!p);
+ assert(!mem_region_lock.lock_val);
+ assert(heap_empty());
+
+ /* Realloc longer. */
+ p = realloc(NULL, 100);
+ assert(p);
+ assert(!mem_region_lock.lock_val);
+ p2 = realloc(p, 200);
+ assert(p2 == p);
+ assert(!mem_region_lock.lock_val);
+ free(p);
+ assert(!mem_region_lock.lock_val);
+ assert(heap_empty());
+
+ /* Realloc shorter. */
+ mem_region_lock.lock_val = 0;
+ p = realloc(NULL, 100);
+ assert(!mem_region_lock.lock_val);
+ assert(p);
+ p2 = realloc(p, 1);
+ assert(!mem_region_lock.lock_val);
+ assert(p2 == p);
+ free(p);
+ assert(!mem_region_lock.lock_val);
+ assert(heap_empty());
+
+ /* Realloc with move. */
+ p2 = malloc(TEST_HEAP_SIZE - 64 - sizeof(struct alloc_hdr)*2);
+ assert(p2);
+ p = malloc(64);
+ assert(p);
+ free(p2);
+
+ p2 = realloc(p, 128);
+ assert(p2 != p);
+ free(p2);
+ assert(heap_empty());
+ assert(!mem_region_lock.lock_val);
+
+ /* Reproduce bug BZ109128/SW257364 */
+ p = malloc(100);
+ p2 = malloc(100);
+ p3 = malloc(100);
+ p4 = malloc(100);
+ free(p2);
+	realloc(p, 216);
+ free(p3);
+ free(p);
+ free(p4);
+ assert(heap_empty());
+ assert(!mem_region_lock.lock_val);
+
+ return 0;
+}
diff --git a/core/test/run-mem_region.c b/core/test/run-mem_region.c
new file mode 100644
index 0000000..f0ad2c2
--- /dev/null
+++ b/core/test/run-mem_region.c
@@ -0,0 +1,250 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+/* Don't include this, it's PPC-specific */
+#define __CPU_H
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+
+#include <stdlib.h>
+#include <string.h>
+
+/* Use these before we override definitions below. */
+static void *__malloc(size_t size, const char *location __attribute__((unused)))
+{
+ return malloc(size);
+}
+
+static void *__realloc(void *ptr, size_t size, const char *location __attribute__((unused)))
+{
+ return realloc(ptr, size);
+}
+
+static inline void __free(void *p, const char *location __attribute__((unused)))
+{
+ return free(p);
+}
+
+static void *__zalloc(size_t size, const char *location __attribute__((unused)))
+{
+ void *ptr = malloc(size);
+ memset(ptr, 0, size);
+ return ptr;
+}
+
+#include <skiboot.h>
+
+#define is_rodata(p) true
+
+#include "../mem_region.c"
+#include "../device.c"
+
+#include <assert.h>
+#include <stdio.h>
+
+struct dt_node *dt_root;
+
+void lock(struct lock *l)
+{
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ l->lock_val--;
+}
+
+#define TEST_HEAP_ORDER 12
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static bool heap_empty(void)
+{
+ const struct alloc_hdr *h = region_start(&skiboot_heap);
+ return h->num_longs == skiboot_heap.len / sizeof(long);
+}
+
+int main(void)
+{
+ char *test_heap;
+ void *p, *ptrs[100];
+ size_t i;
+ struct mem_region *r;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ test_heap = __malloc(TEST_HEAP_SIZE, __location__);
+ skiboot_heap.start = (unsigned long)test_heap;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+
+ /* Allocations of various sizes. */
+ for (i = 0; i < TEST_HEAP_ORDER; i++) {
+ p = mem_alloc(&skiboot_heap, 1ULL << i, 1, "here");
+ assert(p);
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "here"));
+ assert(p > (void *)test_heap);
+ assert(p + (1ULL << i) <= (void *)test_heap + TEST_HEAP_SIZE);
+ assert(mem_size(&skiboot_heap, p) >= 1ULL << i);
+ mem_free(&skiboot_heap, p, "freed");
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "freed"));
+ }
+ p = mem_alloc(&skiboot_heap, 1ULL << i, 1, "here");
+ assert(!p);
+ mem_free(&skiboot_heap, p, "freed");
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+
+ /* Allocations of various alignments: use small alloc first. */
+ ptrs[0] = mem_alloc(&skiboot_heap, 1, 1, "small");
+ for (i = 0; ; i++) {
+ p = mem_alloc(&skiboot_heap, 1, 1ULL << i, "here");
+ assert(mem_check(&skiboot_heap));
+ /* We will eventually fail... */
+ if (!p) {
+ assert(i >= TEST_HEAP_ORDER);
+ break;
+ }
+ assert(p);
+ assert((long)p % (1ULL << i) == 0);
+ assert(p > (void *)test_heap);
+ assert(p + 1 <= (void *)test_heap + TEST_HEAP_SIZE);
+ mem_free(&skiboot_heap, p, "freed");
+ assert(mem_check(&skiboot_heap));
+ }
+ mem_free(&skiboot_heap, ptrs[0], "small freed");
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+
+ /* Many little allocations, freed in reverse order. */
+ for (i = 0; i < 100; i++) {
+ ptrs[i] = mem_alloc(&skiboot_heap, sizeof(long), 1, "here");
+ assert(ptrs[i]);
+ assert(ptrs[i] > (void *)test_heap);
+ assert(ptrs[i] + sizeof(long)
+ <= (void *)test_heap + TEST_HEAP_SIZE);
+ assert(mem_check(&skiboot_heap));
+ }
+ for (i = 0; i < 100; i++)
+ mem_free(&skiboot_heap, ptrs[100 - 1 - i], "freed");
+
+ assert(heap_empty());
+ assert(mem_check(&skiboot_heap));
+
+ /* Check the prev_free gets updated properly. */
+ ptrs[0] = mem_alloc(&skiboot_heap, sizeof(long), 1, "ptrs[0]");
+ ptrs[1] = mem_alloc(&skiboot_heap, sizeof(long), 1, "ptrs[1]");
+ assert(ptrs[1] > ptrs[0]);
+ mem_free(&skiboot_heap, ptrs[0], "ptrs[0] free");
+ assert(mem_check(&skiboot_heap));
+ ptrs[0] = mem_alloc(&skiboot_heap, sizeof(long), 1, "ptrs[0] again");
+ assert(mem_check(&skiboot_heap));
+ mem_free(&skiboot_heap, ptrs[1], "ptrs[1] free");
+ mem_free(&skiboot_heap, ptrs[0], "ptrs[0] free");
+ assert(mem_check(&skiboot_heap));
+ assert(heap_empty());
+
+#if 0
+ printf("Heap map:\n");
+ for (i = 0; i < TEST_HEAP_SIZE / sizeof(long); i++) {
+ printf("%u", test_bit(skiboot_heap.bitmap, i));
+ if (i % 64 == 63)
+ printf("\n");
+ else if (i % 8 == 7)
+ printf(" ");
+ }
+#endif
+
+ /* Simple enlargement, then free */
+ p = mem_alloc(&skiboot_heap, 1, 1, "one byte");
+ assert(p);
+ assert(mem_resize(&skiboot_heap, p, 100, "hundred bytes"));
+ assert(mem_size(&skiboot_heap, p) >= 100);
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "hundred bytes"));
+ mem_free(&skiboot_heap, p, "freed");
+
+ /* Simple shrink, then free */
+ p = mem_alloc(&skiboot_heap, 100, 1, "100 bytes");
+ assert(p);
+ assert(mem_resize(&skiboot_heap, p, 1, "1 byte"));
+ assert(mem_size(&skiboot_heap, p) < 100);
+ assert(mem_check(&skiboot_heap));
+ assert(!strcmp(((struct alloc_hdr *)p)[-1].location, "1 byte"));
+ mem_free(&skiboot_heap, p, "freed");
+
+ /* Lots of resizing (enlarge). */
+ p = mem_alloc(&skiboot_heap, 1, 1, "one byte");
+ assert(p);
+ for (i = 1; i <= TEST_HEAP_SIZE - sizeof(struct alloc_hdr); i++) {
+ assert(mem_resize(&skiboot_heap, p, i, "enlarge"));
+ assert(mem_size(&skiboot_heap, p) >= i);
+ assert(mem_check(&skiboot_heap));
+ }
+
+ /* Can't make it larger though. */
+ assert(!mem_resize(&skiboot_heap, p, i, "enlarge"));
+
+ for (i = TEST_HEAP_SIZE - sizeof(struct alloc_hdr); i > 0; i--) {
+ assert(mem_resize(&skiboot_heap, p, i, "shrink"));
+ assert(mem_check(&skiboot_heap));
+ }
+
+ mem_free(&skiboot_heap, p, "freed");
+ assert(mem_check(&skiboot_heap));
+
+ /* Test splitting of a region. */
+ r = new_region("base", (unsigned long)test_heap,
+ TEST_HEAP_SIZE, NULL, REGION_SKIBOOT_HEAP);
+ assert(add_region(r));
+ r = new_region("splitter", (unsigned long)test_heap + TEST_HEAP_SIZE/4,
+ TEST_HEAP_SIZE/2, NULL, REGION_RESERVED);
+ assert(add_region(r));
+ /* Now we should have *three* regions. */
+ i = 0;
+ list_for_each(&regions, r, list) {
+ if (region_start(r) == test_heap) {
+ assert(r->len == TEST_HEAP_SIZE/4);
+ assert(strcmp(r->name, "base") == 0);
+ assert(r->type == REGION_SKIBOOT_HEAP);
+ } else if (region_start(r) == test_heap + TEST_HEAP_SIZE / 4) {
+ assert(r->len == TEST_HEAP_SIZE/2);
+ assert(strcmp(r->name, "splitter") == 0);
+ assert(r->type == REGION_RESERVED);
+ assert(!r->free_list.n.next);
+ } else if (region_start(r) == test_heap + TEST_HEAP_SIZE/4*3) {
+ assert(r->len == TEST_HEAP_SIZE/4);
+ assert(strcmp(r->name, "base") == 0);
+ assert(r->type == REGION_SKIBOOT_HEAP);
+ } else
+ abort();
+ assert(mem_check(r));
+ i++;
+ }
+ assert(i == 3);
+ while ((r = list_pop(&regions, struct mem_region, list)) != NULL) {
+ list_del(&r->list);
+ mem_free(&skiboot_heap, r, __location__);
+ }
+ assert(mem_region_lock.lock_val == 0);
+ __free(test_heap, "");
+ return 0;
+}
diff --git a/core/test/run-mem_region_init.c b/core/test/run-mem_region_init.c
new file mode 100644
index 0000000..a24cc7b
--- /dev/null
+++ b/core/test/run-mem_region_init.c
@@ -0,0 +1,179 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+/* Don't include this, it's PPC-specific */
+#define __CPU_H
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+
+#include <stdlib.h>
+
+/* Use these before we undefine them below. */
+static inline void *real_malloc(size_t size)
+{
+ return malloc(size);
+}
+
+static inline void real_free(void *p)
+{
+ return free(p);
+}
+
+#include "../malloc.c"
+
+#include <skiboot.h>
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+
+static inline char *skiboot_strdup(const char *str)
+{
+ char *ret = __malloc(strlen(str) + 1, "");
+ return memcpy(ret, str, strlen(str) + 1);
+}
+#undef strdup
+#define strdup skiboot_strdup
+
+#include "../device.c"
+
+#include <assert.h>
+#include <stdio.h>
+
+void lock(struct lock *l)
+{
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+/* We actually need a lot of room for the bitmaps! */
+#define TEST_HEAP_ORDER 27
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char name[sizeof("memory@") + STR_MAX_CHARS(reg[0])];
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+ sprintf(name, "memory@%llx", (unsigned long long)start);
+
+ mem = dt_new(dt_root, name);
+ assert(mem);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+int main(void)
+{
+ uint64_t end;
+ int builtins;
+ struct mem_region *r;
+ char *heap = real_malloc(TEST_HEAP_SIZE);
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)heap;
+ skiboot_heap.len = TEST_HEAP_SIZE;
+ skiboot_os_reserve.len = (unsigned long)heap;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ /* Make sure we overlap the heap, at least. */
+ add_mem_node(0, 0x100000000ULL);
+ add_mem_node(0x100000000ULL, 0x100000000ULL);
+ end = 0x200000000ULL;
+
+ /* Now convert. */
+ mem_region_init();
+ assert(mem_check(&skiboot_heap));
+
+ builtins = 0;
+ list_for_each(&regions, r, list) {
+ /* Regions must not overlap. */
+ struct mem_region *r2, *pre = NULL, *post = NULL;
+ list_for_each(&regions, r2, list) {
+ if (r == r2)
+ continue;
+ assert(!overlaps(r, r2));
+ }
+
+ /* But should have exact neighbours. */
+ list_for_each(&regions, r2, list) {
+ if (r == r2)
+ continue;
+ if (r2->start == r->start + r->len)
+ post = r2;
+ if (r2->start + r2->len == r->start)
+ pre = r2;
+ }
+ assert(r->start == 0 || pre);
+ assert(r->start + r->len == end || post);
+
+ if (r == &skiboot_code_and_text ||
+ r == &skiboot_heap ||
+ r == &skiboot_after_heap ||
+ r == &skiboot_cpu_stacks ||
+ r == &skiboot_os_reserve)
+ builtins++;
+ else
+ assert(r->type == REGION_SKIBOOT_HEAP);
+ assert(mem_check(r));
+ }
+ assert(builtins == 5);
+
+ dt_free(dt_root);
+
+ while ((r = list_pop(&regions, struct mem_region, list)) != NULL) {
+ list_del(&r->list);
+ if (r != &skiboot_code_and_text &&
+ r != &skiboot_heap &&
+ r != &skiboot_after_heap &&
+ r != &skiboot_os_reserve &&
+ r != &skiboot_cpu_stacks) {
+ free(r);
+ }
+ assert(mem_check(&skiboot_heap));
+ }
+ assert(mem_region_lock.lock_val == 0);
+ real_free(heap);
+ return 0;
+}
diff --git a/core/test/run-mem_region_release_unused.c b/core/test/run-mem_region_release_unused.c
new file mode 100644
index 0000000..e73cf25
--- /dev/null
+++ b/core/test/run-mem_region_release_unused.c
@@ -0,0 +1,177 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+/* Don't include this, it's PPC-specific */
+#define __CPU_H
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+
+#include <stdlib.h>
+
+static void *__malloc(size_t size, const char *location __attribute__((unused)))
+{
+ return malloc(size);
+}
+
+static void *__realloc(void *ptr, size_t size, const char *location __attribute__((unused)))
+{
+ return realloc(ptr, size);
+}
+
+static void *__zalloc(size_t size, const char *location __attribute__((unused)))
+{
+ return calloc(size, 1);
+}
+
+static inline void __free(void *p, const char *location __attribute__((unused)))
+{
+ return free(p);
+}
+
+#include <skiboot.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+
+#include "../device.c"
+#include <assert.h>
+#include <stdio.h>
+
+void lock(struct lock *l)
+{
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ l->lock_val--;
+}
+
+#define TEST_HEAP_ORDER 12
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char name[sizeof("memory@") + STR_MAX_CHARS(reg[0])];
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+	sprintf(name, "memory@%llx", (unsigned long long)start);
+
+ mem = dt_new(dt_root, name);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+int main(void)
+{
+ uint64_t i;
+ struct mem_region *r, *other = NULL;
+ void *other_mem;
+ const char *last;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)malloc(TEST_HEAP_SIZE);
+ skiboot_heap.len = TEST_HEAP_SIZE;
+ skiboot_os_reserve.len = skiboot_heap.start;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ other_mem = malloc(1024*1024);
+ add_mem_node((unsigned long)other_mem, 1024*1024);
+
+ /* Now convert. */
+ mem_region_init();
+
+ /* Find our node to allocate from */
+ list_for_each(&regions, r, list) {
+ if (region_start(r) == other_mem)
+ other = r;
+ }
+ /* This could happen if skiboot addresses clashed with our alloc. */
+ assert(other);
+ assert(mem_check(other));
+
+ /* Allocate 1k from other region. */
+ mem_alloc(other, 1024, 1, "1k");
+ mem_region_release_unused();
+
+ assert(mem_check(&skiboot_heap));
+
+ /* Now we expect it to be split. */
+ i = 0;
+ list_for_each(&regions, r, list) {
+ assert(mem_check(r));
+ i++;
+ if (r == &skiboot_os_reserve)
+ continue;
+ if (r == &skiboot_code_and_text)
+ continue;
+ if (r == &skiboot_heap)
+ continue;
+ if (r == &skiboot_after_heap)
+ continue;
+ if (r == &skiboot_cpu_stacks)
+ continue;
+ if (r == other) {
+ assert(r->type == REGION_SKIBOOT_HEAP);
+ assert(r->len < 1024 * 1024);
+ } else {
+ assert(r->type == REGION_OS);
+ assert(r->start == other->start + other->len);
+ assert(r->start + r->len == other->start + 1024*1024);
+ }
+ }
+ assert(i == 7);
+
+ last = NULL;
+ list_for_each(&regions, r, list) {
+ if (last != r->name &&
+ strncmp(r->name, NODE_REGION_PREFIX,
+ strlen(NODE_REGION_PREFIX)) == 0) {
+ /* It's safe to cast away const as this is
+ * only going to happen in test code */
+ free((void*)r->name);
+ break;
+ }
+ last = r->name;
+ }
+
+ dt_free(dt_root);
+ free((void *)(long)skiboot_heap.start);
+ free(other_mem);
+ return 0;
+}
diff --git a/core/test/run-mem_region_release_unused_noalloc.c b/core/test/run-mem_region_release_unused_noalloc.c
new file mode 100644
index 0000000..818e272
--- /dev/null
+++ b/core/test/run-mem_region_release_unused_noalloc.c
@@ -0,0 +1,159 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#define BITS_PER_LONG (sizeof(long) * 8)
+/* Don't include this, it's PPC-specific */
+#define __CPU_H
+static unsigned int cpu_max_pir = 1;
+struct cpu_thread {
+ unsigned int chip_id;
+};
+
+#include <stdlib.h>
+
+static void *__malloc(size_t size, const char *location __attribute__((unused)))
+{
+ return malloc(size);
+}
+
+static void *__realloc(void *ptr, size_t size, const char *location __attribute__((unused)))
+{
+ return realloc(ptr, size);
+}
+
+static void *__zalloc(size_t size, const char *location __attribute__((unused)))
+{
+ return calloc(size, 1);
+}
+
+static inline void __free(void *p, const char *location __attribute__((unused)))
+{
+ return free(p);
+}
+
+#include <skiboot.h>
+
+/* We need mem_region to accept __location__ */
+#define is_rodata(p) true
+#include "../mem_region.c"
+
+/* But we need device tree to make copies of names. */
+#undef is_rodata
+#define is_rodata(p) false
+
+#include "../device.c"
+#include <assert.h>
+#include <stdio.h>
+
+void lock(struct lock *l)
+{
+ l->lock_val++;
+}
+
+void unlock(struct lock *l)
+{
+ l->lock_val--;
+}
+
+#define TEST_HEAP_ORDER 12
+#define TEST_HEAP_SIZE (1ULL << TEST_HEAP_ORDER)
+
+static void add_mem_node(uint64_t start, uint64_t len)
+{
+ struct dt_node *mem;
+ u64 reg[2];
+ char name[sizeof("memory@") + STR_MAX_CHARS(reg[0])];
+
+ /* reg contains start and length */
+ reg[0] = cpu_to_be64(start);
+ reg[1] = cpu_to_be64(len);
+
+	sprintf(name, "memory@%llx", (unsigned long long)start);
+
+ mem = dt_new(dt_root, name);
+ dt_add_property_string(mem, "device_type", "memory");
+ dt_add_property(mem, "reg", reg, sizeof(reg));
+}
+
+void add_chip_dev_associativity(struct dt_node *dev __attribute__((unused)))
+{
+}
+
+int main(void)
+{
+ uint64_t i;
+ struct mem_region *r;
+ const char *last;
+
+ /* Use malloc for the heap, so valgrind can find issues. */
+ skiboot_heap.start = (unsigned long)malloc(TEST_HEAP_SIZE);
+ skiboot_heap.len = TEST_HEAP_SIZE;
+ skiboot_os_reserve.len = skiboot_heap.start;
+
+ dt_root = dt_new_root("");
+ dt_add_property_cells(dt_root, "#address-cells", 2);
+ dt_add_property_cells(dt_root, "#size-cells", 2);
+
+ add_mem_node(0, 0x100000000ULL);
+ add_mem_node(0x100000000ULL, 0x100000000ULL);
+
+ mem_region_init();
+
+ mem_region_release_unused();
+
+ assert(mem_check(&skiboot_heap));
+
+ /* Now we expect it to be split. */
+ i = 0;
+ list_for_each(&regions, r, list) {
+ assert(mem_check(r));
+ i++;
+ if (r == &skiboot_os_reserve)
+ continue;
+ if (r == &skiboot_code_and_text)
+ continue;
+ if (r == &skiboot_heap)
+ continue;
+ if (r == &skiboot_after_heap)
+ continue;
+ if (r == &skiboot_cpu_stacks)
+ continue;
+
+ /* the memory nodes should all be available to the OS now */
+ assert(r->type == REGION_OS);
+ }
+ assert(i == 9);
+
+ last = NULL;
+ list_for_each(&regions, r, list) {
+ if (last != r->name &&
+ strncmp(r->name, NODE_REGION_PREFIX,
+ strlen(NODE_REGION_PREFIX)) == 0) {
+ /* It's safe to cast away the const as
+ * this never happens at runtime,
+ * only in test and only for valgrind
+ */
+ free((void*)r->name);
+ }
+ last = r->name;
+ }
+
+ dt_free(dt_root);
+ free((void *)(long)skiboot_heap.start);
+ return 0;
+}
diff --git a/core/test/run-msg.c b/core/test/run-msg.c
new file mode 100644
index 0000000..cd36408
--- /dev/null
+++ b/core/test/run-msg.c
@@ -0,0 +1,256 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <skiboot.h>
+#include <inttypes.h>
+#include <assert.h>
+
+static bool zalloc_should_fail = false;
+static void *zalloc(size_t size)
+{
+ if (zalloc_should_fail) {
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ return calloc(size, 1);
+}
+
+#include "../opal-msg.c"
+
+void lock(struct lock *l)
+{
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+void opal_update_pending_evt(uint64_t evt_mask, uint64_t evt_values)
+{
+ (void)evt_mask;
+ (void)evt_values;
+}
+
+static long magic = 8097883813087437089UL;
+static void callback(void *data)
+{
+ assert(*(uint64_t *)data == magic);
+}
+
+static size_t list_count(struct list_head *list)
+{
+ size_t count = 0;
+ struct opal_msg_entry *dummy;
+
+ list_for_each(list, dummy, link)
+ count++;
+ return count;
+}
+
+int main(void)
+{
+ struct opal_msg_entry* entry;
+ int free_size = OPAL_MAX_MSGS;
+ int nfree = free_size;
+ int npending = 0;
+ int r;
+ static struct opal_msg m;
+ uint64_t *m_ptr = (uint64_t *)&m;
+
+ opal_init_msg();
+
+ assert(list_count(&msg_pending_list) == npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* Callback. */
+ r = opal_queue_msg(0, &magic, callback, (u64)0, (u64)1, (u64)2);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ assert(m.params[0] == 0);
+ assert(m.params[1] == 1);
+ assert(m.params[2] == 2);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == ++nfree);
+
+ /* No params. */
+ r = opal_queue_msg(0, NULL, NULL);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == ++nfree);
+
+	/* > 8 params (more than ARRAY_SIZE(entry->msg.params)) */
+ r = opal_queue_msg(0, NULL, NULL, 0, 1, 2, 3, 4, 5, 6, 7, 0xBADDA7A);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == ++nfree);
+
+ assert(m.params[0] == 0);
+ assert(m.params[1] == 1);
+ assert(m.params[2] == 2);
+ assert(m.params[3] == 3);
+ assert(m.params[4] == 4);
+ assert(m.params[5] == 5);
+ assert(m.params[6] == 6);
+ assert(m.params[7] == 7);
+
+	/* 8 params (exactly ARRAY_SIZE(entry->msg.params)) */
+ r = opal_queue_msg(0, NULL, NULL, 0, 10, 20, 30, 40, 50, 60, 70);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == --npending);
+ assert(list_count(&msg_free_list) == ++nfree);
+
+ assert(m.params[0] == 0);
+ assert(m.params[1] == 10);
+ assert(m.params[2] == 20);
+ assert(m.params[3] == 30);
+ assert(m.params[4] == 40);
+ assert(m.params[5] == 50);
+ assert(m.params[6] == 60);
+ assert(m.params[7] == 70);
+
+ /* Full list (no free nodes in pending). */
+ while (nfree > 0) {
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == 0);
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+ }
+ assert(list_count(&msg_free_list) == 0);
+ assert(nfree == 0);
+ assert(npending == OPAL_MAX_MSGS);
+
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == OPAL_MAX_MSGS+1);
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* Make zalloc fail to test error handling. */
+ zalloc_should_fail = true;
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == OPAL_RESOURCE);
+
+ assert(list_count(&msg_pending_list) == OPAL_MAX_MSGS+1);
+ assert(list_count(&msg_pending_list) == npending);
+ assert(list_count(&msg_free_list) == nfree);
+
+ /* Empty list (no nodes). */
+ while(!list_empty(&msg_pending_list)) {
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+ npending--;
+ nfree++;
+ }
+ assert(list_count(&msg_pending_list) == npending);
+ assert(list_count(&msg_free_list) == nfree);
+ assert(npending == 0);
+ assert(nfree == OPAL_MAX_MSGS+1);
+
+ r = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL);
+ assert(r == 0);
+
+ assert(list_count(&msg_pending_list) == ++npending);
+ assert(list_count(&msg_free_list) == --nfree);
+
+ /* Request invalid size. */
+ r = opal_get_msg(m_ptr, sizeof(m) - 1);
+ assert(r == OPAL_PARAMETER);
+
+ /* Pass null buffer. */
+ r = opal_get_msg(NULL, sizeof(m));
+ assert(r == OPAL_PARAMETER);
+
+ /* Get msg when none are pending. */
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == 0);
+
+ r = opal_get_msg(m_ptr, sizeof(m));
+ assert(r == OPAL_RESOURCE);
+
+#define test_queue_num(type, val) \
+ r = opal_queue_msg(0, NULL, NULL, \
+ (type)val, (type)val, (type)val, (type)val, \
+ (type)val, (type)val, (type)val, (type)val); \
+ assert(r == 0); \
+	r = opal_get_msg(m_ptr, sizeof(m)); \
+ assert(r == OPAL_SUCCESS); \
+ assert(m.params[0] == (type)val); \
+ assert(m.params[1] == (type)val); \
+ assert(m.params[2] == (type)val); \
+ assert(m.params[3] == (type)val); \
+ assert(m.params[4] == (type)val); \
+ assert(m.params[5] == (type)val); \
+ assert(m.params[6] == (type)val); \
+ assert(m.params[7] == (type)val)
+
+ /* Test types of various widths */
+ test_queue_num(u64, -1);
+ test_queue_num(s64, -1);
+ test_queue_num(u32, -1);
+ test_queue_num(s32, -1);
+ test_queue_num(u16, -1);
+ test_queue_num(s16, -1);
+ test_queue_num(u8, -1);
+ test_queue_num(s8, -1);
+
+ /* Clean up the list to keep valgrind happy. */
+ while(!list_empty(&msg_free_list)) {
+ entry = list_pop(&msg_free_list, struct opal_msg_entry, link);
+ assert(entry);
+ free(entry);
+ }
+
+ while(!list_empty(&msg_pending_list)) {
+ entry = list_pop(&msg_pending_list, struct opal_msg_entry, link);
+ assert(entry);
+ free(entry);
+ }
+
+ return 0;
+}
diff --git a/core/test/run-trace.c b/core/test/run-trace.c
new file mode 100644
index 0000000..7dabebd
--- /dev/null
+++ b/core/test/run-trace.c
@@ -0,0 +1,386 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sched.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+/* Don't include these: PPC-specific */
+#define __CPU_H
+#define __TIME_H
+#define __PROCESSOR_H
+
+#if defined(__i386__) || defined(__x86_64__)
+/* This is more than a lwsync, but it'll work */
+static void full_barrier(void)
+{
+ asm volatile("mfence" : : : "memory");
+}
+#define lwsync full_barrier
+#define sync full_barrier
+#else
+#error "Define sync & lwsync for this arch"
+#endif
+
+#define zalloc(size) calloc((size), 1)
+
+struct cpu_thread {
+ uint32_t pir;
+ uint32_t chip_id;
+ struct trace_info *trace;
+ int server_no;
+ bool is_secondary;
+ struct cpu_thread *primary;
+};
+static struct cpu_thread *this_cpu(void);
+
+#define CPUS 4
+
+static struct cpu_thread fake_cpus[CPUS];
+
+static inline struct cpu_thread *next_cpu(struct cpu_thread *cpu)
+{
+ if (cpu == NULL)
+ return &fake_cpus[0];
+ cpu++;
+ if (cpu == &fake_cpus[CPUS])
+ return NULL;
+ return cpu;
+}
+
+#define first_cpu() next_cpu(NULL)
+
+#define for_each_cpu(cpu) \
+ for (cpu = first_cpu(); cpu; cpu = next_cpu(cpu))
+
+static unsigned long timestamp;
+static unsigned long mftb(void)
+{
+ return timestamp;
+}
+
+static void *local_alloc(unsigned int chip_id,
+ size_t size, size_t align)
+{
+ void *p;
+
+ (void)chip_id;
+ if (posix_memalign(&p, align, size))
+ p = NULL;
+ return p;
+}
+
+struct dt_node;
+extern struct dt_node *opal_node;
+
+#include "../trace.c"
+
+#define rmb() lwsync()
+
+#include "../external/trace.c"
+#include "../device.c"
+
+char __rodata_start[1], __rodata_end[1];
+struct dt_node *opal_node;
+struct debug_descriptor debug_descriptor = {
+ .trace_mask = -1
+};
+
+void lock(struct lock *l)
+{
+ assert(!l->lock_val);
+ l->lock_val = 1;
+}
+
+void unlock(struct lock *l)
+{
+ assert(l->lock_val);
+ l->lock_val = 0;
+}
+
+struct cpu_thread *my_fake_cpu;
+static struct cpu_thread *this_cpu(void)
+{
+ return my_fake_cpu;
+}
+
+#include <sys/mman.h>
+#define PER_CHILD_TRACES (1024*1024)
+
+static void write_trace_entries(int id)
+{
+ void exit(int);
+ unsigned int i;
+ union trace trace;
+
+ timestamp = id;
+ for (i = 0; i < PER_CHILD_TRACES; i++) {
+ timestamp = i * CPUS + id;
+ assert(sizeof(trace.hdr) % 8 == 0);
+ /* First child never repeats, second repeats once, etc. */
+ trace_add(&trace, 3 + ((i / (id + 1)) % 0x40),
+ sizeof(trace.hdr));
+ }
+
+ /* Final entry has special type, so parent knows it's over. */
+ trace_add(&trace, 0x70, sizeof(trace.hdr));
+ exit(0);
+}
+
+static bool all_done(const bool done[])
+{
+ unsigned int i;
+
+ for (i = 0; i < CPUS; i++)
+ if (!done[i])
+ return false;
+ return true;
+}
+
+static void test_parallel(void)
+{
+ void *p;
+ unsigned int i, counts[CPUS] = { 0 }, overflows[CPUS] = { 0 };
+ unsigned int repeats[CPUS] = { 0 }, num_overflows[CPUS] = { 0 };
+ bool done[CPUS] = { false };
+ size_t len = sizeof(struct trace_info) + TBUF_SZ + sizeof(union trace);
+ int last = 0;
+
+ /* Use a shared mmap to test actual parallel buffers. */
+ i = (CPUS*len + getpagesize()-1)&~(getpagesize()-1);
+ p = mmap(NULL, i, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+
+ for (i = 0; i < CPUS; i++) {
+ fake_cpus[i].trace = p + i * len;
+ fake_cpus[i].trace->tb.mask = TBUF_SZ - 1;
+ fake_cpus[i].trace->tb.max_size = sizeof(union trace);
+ fake_cpus[i].is_secondary = false;
+ }
+
+ for (i = 0; i < CPUS; i++) {
+ if (!fork()) {
+ /* Child. */
+ my_fake_cpu = &fake_cpus[i];
+ write_trace_entries(i);
+ }
+ }
+
+ while (!all_done(done)) {
+ union trace t;
+
+ for (i = 0; i < CPUS; i++) {
+ if (trace_get(&t, &fake_cpus[(i+last) % CPUS].trace->tb))
+ break;
+ }
+
+ if (i == CPUS) {
+ sched_yield();
+ continue;
+ }
+ i = (i + last) % CPUS;
+ last = i;
+
+ assert(t.hdr.cpu < CPUS);
+ assert(!done[t.hdr.cpu]);
+
+ if (t.hdr.type == TRACE_OVERFLOW) {
+ /* Conveniently, each record is 16 bytes here. */
+ assert(t.overflow.bytes_missed % 16 == 0);
+ overflows[i] += t.overflow.bytes_missed / 16;
+ num_overflows[i]++;
+ continue;
+ }
+
+ assert(t.hdr.timestamp % CPUS == t.hdr.cpu);
+ if (t.hdr.type == TRACE_REPEAT) {
+ assert(t.hdr.len_div_8 * 8 == sizeof(t.repeat));
+ assert(t.repeat.num != 0);
+ assert(t.repeat.num <= t.hdr.cpu);
+ repeats[t.hdr.cpu] += t.repeat.num;
+ } else if (t.hdr.type == 0x70) {
+ done[t.hdr.cpu] = true;
+ } else {
+ counts[t.hdr.cpu]++;
+ }
+ }
+
+ /* Gather children. */
+ for (i = 0; i < CPUS; i++) {
+ int status;
+ wait(&status);
+ }
+
+ for (i = 0; i < CPUS; i++) {
+ printf("Child %i: %u produced, %u overflows, %llu total\n", i,
+ counts[i], overflows[i],
+ (long long)fake_cpus[i].trace->tb.end);
+ assert(counts[i] + repeats[i] <= PER_CHILD_TRACES);
+ }
+ /* Child 0 never repeats. */
+ assert(repeats[0] == 0);
+ assert(counts[0] + overflows[0] == PER_CHILD_TRACES);
+
+ /*
+ * FIXME: Other children have some fuzz, since overflows may
+ * include a repeat record we already read. And odd-numbered
+ * overflows may include more repeat records than normal
+ * records (they alternate).
+ */
+}
+
+int main(void)
+{
+ union trace minimal;
+ union trace large;
+ union trace trace;
+ unsigned int i, j;
+
+ opal_node = dt_new_root("opal");
+ for (i = 0; i < CPUS; i++) {
+ fake_cpus[i].server_no = i;
+ fake_cpus[i].is_secondary = (i & 0x1);
+ fake_cpus[i].primary = &fake_cpus[i & ~0x1];
+ }
+ init_trace_buffers();
+ my_fake_cpu = &fake_cpus[0];
+
+ for (i = 0; i < CPUS; i++) {
+ assert(trace_empty(&fake_cpus[i].trace->tb));
+ assert(!trace_get(&trace, &fake_cpus[i].trace->tb));
+ }
+
+ assert(sizeof(trace.hdr) % 8 == 0);
+ timestamp = 1;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.timestamp == timestamp);
+
+ /* Make it wrap once. */
+ for (i = 0; i < TBUF_SZ / (minimal.hdr.len_div_8 * 8) + 1; i++) {
+ timestamp = i;
+ trace_add(&minimal, 99 + (i%2), sizeof(trace.hdr));
+ }
+
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ /* First one must be an overflow marker. */
+ assert(trace.hdr.type == TRACE_OVERFLOW);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.overflow));
+ assert(trace.overflow.bytes_missed == minimal.hdr.len_div_8 * 8);
+
+ for (i = 0; i < TBUF_SZ / (minimal.hdr.len_div_8 * 8); i++) {
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.timestamp == i+1);
+ assert(trace.hdr.type == 99 + ((i+1)%2));
+ }
+ assert(!trace_get(&trace, &my_fake_cpu->trace->tb));
+
+ /* Now put in some weird-length ones, to test overlap.
+ * Use the largest power-of-2 length that still fits in an entry. */
+ for (j = 0; (1 << j) < sizeof(large); j++);
+ for (i = 0; i < TBUF_SZ; i++) {
+ timestamp = i;
+ trace_add(&large, 100 + (i%2), (1 << (j-1)));
+ }
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.type == TRACE_OVERFLOW);
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.len_div_8 == large.hdr.len_div_8);
+ i = trace.hdr.timestamp;
+ while (trace_get(&trace, &my_fake_cpu->trace->tb))
+ assert(trace.hdr.timestamp == ++i);
+
+ /* Test repeats. */
+ for (i = 0; i < 65538; i++) {
+ timestamp = i;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ }
+ timestamp = i;
+ trace_add(&minimal, 101, sizeof(trace.hdr));
+ timestamp = i+1;
+ trace_add(&minimal, 101, sizeof(trace.hdr));
+
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.timestamp == 0);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 100);
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(trace.repeat.num == 65535);
+ assert(trace.repeat.timestamp == 65535);
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.timestamp == 65536);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 100);
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(trace.repeat.num == 1);
+ assert(trace.repeat.timestamp == 65537);
+
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.timestamp == 65538);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 101);
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(trace.repeat.num == 1);
+ assert(trace.repeat.timestamp == 65539);
+
+ /* Now, test adding a repeat while we're reading... */
+ timestamp = 0;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ assert(trace.hdr.timestamp == 0);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ assert(trace.hdr.type == 100);
+
+ for (i = 1; i < TBUF_SZ; i++) {
+ timestamp = i;
+ trace_add(&minimal, 100, sizeof(trace.hdr));
+ assert(trace_get(&trace, &my_fake_cpu->trace->tb));
+ if (i % 65536 == 0) {
+ assert(trace.hdr.type == 100);
+ assert(trace.hdr.len_div_8 == minimal.hdr.len_div_8);
+ } else {
+ assert(trace.hdr.type == TRACE_REPEAT);
+ assert(trace.hdr.len_div_8 * 8 == sizeof(trace.repeat));
+ assert(trace.repeat.num == 1);
+ }
+ assert(trace.repeat.timestamp == i);
+ assert(!trace_get(&trace, &my_fake_cpu->trace->tb));
+ }
+
+ for (i = 0; i < CPUS; i++)
+ if (!fake_cpus[i].is_secondary)
+ free(fake_cpus[i].trace);
+
+ test_parallel();
+
+ return 0;
+}
diff --git a/core/test/stubs.c b/core/test/stubs.c
new file mode 100644
index 0000000..3233455
--- /dev/null
+++ b/core/test/stubs.c
@@ -0,0 +1,43 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Add any stub functions required for linking here. */
+#include <stdlib.h>
+
+static void stub_function(void)
+{
+ abort();
+}
+
+#define STUB(fnname) \
+ void fnname(void) __attribute__((weak, alias ("stub_function")))
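+
+/* For example, STUB(fdt_begin_node) expands to
+ *   void fdt_begin_node(void) __attribute__((weak, alias ("stub_function")));
+ * so an otherwise-unresolved call to it lands in stub_function() and aborts. */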
+
+STUB(fdt_begin_node);
+STUB(fdt_property);
+STUB(fdt_end_node);
+STUB(fdt_create);
+STUB(fdt_add_reservemap_entry);
+STUB(fdt_finish_reservemap);
+STUB(fdt_strerror);
+STUB(fdt_check_header);
+STUB(_fdt_check_node_offset);
+STUB(fdt_next_tag);
+STUB(fdt_string);
+STUB(fdt_get_name);
+STUB(dt_first);
+STUB(dt_next);
+STUB(dt_has_node_property);
+STUB(dt_get_address);
+STUB(add_chip_dev_associativity);
diff --git a/core/timebase.c b/core/timebase.c
new file mode 100644
index 0000000..d51e96b
--- /dev/null
+++ b/core/timebase.c
@@ -0,0 +1,67 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <timebase.h>
+#include <fsp.h>
+
+void time_wait(unsigned long duration)
+{
+ unsigned long end = mftb() + duration;
+
+ while(tb_compare(mftb(), end) != TB_AAFTERB)
+ fsp_poll();
+}
+
+void time_wait_ms(unsigned long ms)
+{
+ time_wait(msecs_to_tb(ms));
+}
+
+void time_wait_us(unsigned long us)
+{
+ time_wait(usecs_to_tb(us));
+}
+
+unsigned long timespec_to_tb(const struct timespec *ts)
+{
+ unsigned long ns;
+
+ /* First convert to ns */
+ ns = ts->tv_sec * 1000000000ul;
+ ns += ts->tv_nsec;
+
+ /*
+ * This is a very rough approximation; it works provided
+ * we never pass overly long delays here and the TB
+ * frequency isn't significantly lower than 512MHz.
+ *
+ * We could improve the precision by shifting fewer bits
+ * at the expense of range, or by doing 128-bit math, which
+ * I'm not eager to do :-)
+ */
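+ /*
+ * Illustrative numbers (not from the original source): with
+ * tb_hz = 512000000, tb_hz >> 24 == 30 and 1000000000ul >> 24 == 59,
+ * so 1ms (10^6 ns) maps to 10^6 * 30 / 59 = 508474 ticks versus an
+ * exact 512000, an error of well under 1%.
+ */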
+ return (ns * (tb_hz >> 24)) / (1000000000ul >> 24);
+}
+
+int nanosleep(const struct timespec *req, struct timespec *rem)
+{
+ time_wait(timespec_to_tb(req));
+
+ if (rem) {
+ rem->tv_sec = 0;
+ rem->tv_nsec = 0;
+ }
+ return 0;
+}
diff --git a/core/trace.c b/core/trace.c
new file mode 100644
index 0000000..76f3c30
--- /dev/null
+++ b/core/trace.c
@@ -0,0 +1,244 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <trace.h>
+#include <timebase.h>
+#include <lock.h>
+#include <string.h>
+#include <stdlib.h>
+#include <cpu.h>
+#include <device.h>
+#include <libfdt.h>
+#include <processor.h>
+#include <skiboot.h>
+
+#define DEBUG_TRACES
+
+#define MAX_SIZE (sizeof(union trace) + 7)
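+/* The "+ 7" leaves room to round an entry's length up to the next multiple
+ * of 8, since lengths are stored as len_div_8. */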
+
+/* Smaller trace buffer for early booting */
+#define BOOT_TBUF_SZ 65536
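+/* Buffer sizes must be powers of two: positions wrap with (pos & mask)
+ * and mask is set to size - 1. */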
+static struct {
+ struct trace_info trace_info;
+ char buf[BOOT_TBUF_SZ + MAX_SIZE];
+} boot_tracebuf __section(".data.boot_trace") = {
+ .trace_info = {
+ .lock = LOCK_UNLOCKED,
+ .tb = {
+ .mask = BOOT_TBUF_SZ - 1,
+ .max_size = MAX_SIZE
+ },
+ },
+ .buf = { 0 }
+};
+
+void init_boot_tracebuf(struct cpu_thread *boot_cpu)
+{
+ boot_cpu->trace = &boot_tracebuf.trace_info;
+}
+
+static size_t tracebuf_extra(void)
+{
+ /* We make room for the largest possible record */
+ return TBUF_SZ + MAX_SIZE;
+}
+
+/* To avoid bloating each entry, runs of identical entries are collapsed
+ * into dedicated TRACE_REPEAT records.  tb->last points to the last
+ * (non-repeat) entry. */
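+/* Example (illustrative): three identical trace_add() calls leave the buffer
+ * holding the original entry followed by a single TRACE_REPEAT record with
+ * num == 2 and the timestamp of the latest duplicate. */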
+static bool handle_repeat(struct tracebuf *tb, const union trace *trace)
+{
+ struct trace_hdr *prev;
+ struct trace_repeat *rpt;
+ u32 len;
+
+ prev = (void *)tb->buf + (tb->last & tb->mask);
+
+ if (prev->type != trace->hdr.type
+ || prev->len_div_8 != trace->hdr.len_div_8
+ || prev->cpu != trace->hdr.cpu)
+ return false;
+
+ len = prev->len_div_8 << 3;
+ if (memcmp(prev + 1, &trace->hdr + 1, len - sizeof(*prev)) != 0)
+ return false;
+
+ /* If the reader has already consumed the prev entry, don't repeat. */
+ if (tb->last < tb->start)
+ return false;
+
+ /* OK, it's a duplicate. Do we already have a repeat record? */
+ if (tb->last + len != tb->end) {
+ /* FIXME: Reader is not protected from seeing this! */
+ rpt = (void *)tb->buf + ((tb->last + len) & tb->mask);
+ assert(tb->last + len + rpt->len_div_8*8 == tb->end);
+ assert(rpt->type == TRACE_REPEAT);
+
+ /* If this repeat entry is full, don't repeat. */
+ if (rpt->num == 0xFFFF)
+ return false;
+
+ rpt->num++;
+ rpt->timestamp = trace->hdr.timestamp;
+ return true;
+ }
+
+ /*
+ * Generate a repeat entry: it's no larger than the entry we just
+ * made room for, so the space at the end is guaranteed to be free.
+ */
+ assert(trace->hdr.len_div_8 * 8 >= sizeof(*rpt));
+
+ rpt = (void *)tb->buf + (tb->end & tb->mask);
+ rpt->timestamp = trace->hdr.timestamp;
+ rpt->type = TRACE_REPEAT;
+ rpt->len_div_8 = sizeof(*rpt) >> 3;
+ rpt->cpu = trace->hdr.cpu;
+ rpt->prev_len = trace->hdr.len_div_8 << 3;
+ rpt->num = 1;
+ lwsync(); /* write barrier: complete repeat record before exposing */
+ tb->end += sizeof(*rpt);
+ return true;
+}
+
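+/* Append one entry to the current cpu's trace buffer: make room by
+ * discarding the oldest entries, then either fold the new entry into a
+ * repeat record or copy it in and advance tb.end. */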
+void trace_add(union trace *trace, u8 type, u16 len)
+{
+ struct trace_info *ti = this_cpu()->trace;
+ unsigned int tsz;
+
+ trace->hdr.type = type;
+ trace->hdr.len_div_8 = (len + 7) >> 3;
+
+ tsz = trace->hdr.len_div_8 << 3;
+
+#ifdef DEBUG_TRACES
+ assert(tsz >= sizeof(trace->hdr));
+ assert(tsz <= sizeof(*trace));
+ assert(trace->hdr.type != TRACE_REPEAT);
+ assert(trace->hdr.type != TRACE_OVERFLOW);
+#endif
+ /* Skip traces not enabled in the debug descriptor */
+ if (!((1ul << trace->hdr.type) & debug_descriptor.trace_mask))
+ return;
+
+ trace->hdr.timestamp = mftb();
+ trace->hdr.cpu = this_cpu()->server_no;
+
+ lock(&ti->lock);
+
+ /* Throw away old entries before we overwrite them. */
+ while ((ti->tb.start + ti->tb.mask + 1) < (ti->tb.end + tsz)) {
+ struct trace_hdr *hdr;
+
+ hdr = (void *)ti->tb.buf + (ti->tb.start & ti->tb.mask);
+ ti->tb.start += hdr->len_div_8 << 3;
+ }
+
+ /* Must make the ->start update visible before we overwrite old entries. */
+ lwsync(); /* write barrier */
+
+ /* Check for duplicates... */
+ if (!handle_repeat(&ti->tb, trace)) {
+ /* This may run off the end, which is why ti->tb.buf is oversized */
+ memcpy(ti->tb.buf + (ti->tb.end & ti->tb.mask), trace, tsz);
+ ti->tb.last = ti->tb.end;
+ lwsync(); /* write barrier: write entry before exposing */
+ ti->tb.end += tsz;
+ }
+ unlock(&ti->lock);
+}
+
+static void trace_add_dt_props(void)
+{
+ unsigned int i;
+ u64 *prop, tmask;
+
+ prop = malloc(sizeof(u64) * 2 * debug_descriptor.num_traces);
+
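+ /* The property is a list of (physical address, size) pairs of
+ * big-endian u64s, one pair per registered trace buffer. */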
+ for (i = 0; i < debug_descriptor.num_traces; i++) {
+ prop[i * 2] = cpu_to_fdt64(debug_descriptor.trace_phys[i]);
+ prop[i * 2 + 1] = cpu_to_fdt64(debug_descriptor.trace_size[i]);
+ }
+
+ dt_add_property(opal_node, "ibm,opal-traces",
+ prop, sizeof(u64) * 2 * i);
+ free(prop);
+
+ tmask = (uint64_t)&debug_descriptor.trace_mask;
+ dt_add_property_cells(opal_node, "ibm,opal-trace-mask",
+ hi32(tmask), lo32(tmask));
+}
+
+static void trace_add_desc(struct trace_info *t, uint64_t size)
+{
+ unsigned int i = debug_descriptor.num_traces;
+
+ if (i >= DEBUG_DESC_MAX_TRACES) {
+ prerror("TRACE: Debug descriptor trace list full !\n");
+ return;
+ }
+ debug_descriptor.num_traces++;
+
+ debug_descriptor.trace_phys[i] = (uint64_t)&t->tb;
+ debug_descriptor.trace_tce[i] = 0; /* populated later */
+ debug_descriptor.trace_size[i] = size;
+}
+
+/* Allocate trace buffers once we know memory topology */
+void init_trace_buffers(void)
+{
+ struct cpu_thread *t;
+ struct trace_info *any = &boot_tracebuf.trace_info;
+ uint64_t size;
+
+ /* Register the boot trace buffer in the debug descriptor */
+ trace_add_desc(any, sizeof(boot_tracebuf.buf));
+
+ /* Allocate a trace buffer for each primary cpu. */
+ for_each_cpu(t) {
+ if (t->is_secondary)
+ continue;
+
+ /* Use a 4K alignment for TCE mapping */
+ size = ALIGN_UP(sizeof(*t->trace) + tracebuf_extra(), 0x1000);
+ t->trace = local_alloc(t->chip_id, size, 0x1000);
+ if (t->trace) {
+ any = t->trace;
+ memset(t->trace, 0, size);
+ init_lock(&t->trace->lock);
+ t->trace->tb.mask = TBUF_SZ - 1;
+ t->trace->tb.max_size = MAX_SIZE;
+ trace_add_desc(any, sizeof(t->trace->tb) +
+ tracebuf_extra());
+ } else
+ prerror("TRACE: cpu 0x%x allocation failed\n", t->pir);
+ }
+
+ /* In case any allocations failed, share trace buffers. */
+ for_each_cpu(t) {
+ if (!t->is_secondary && !t->trace)
+ t->trace = any;
+ }
+
+ /* And copy those to the secondaries. */
+ for_each_cpu(t) {
+ if (!t->is_secondary)
+ continue;
+ t->trace = t->primary->trace;
+ }
+
+ /* Describe the trace buffers in the device tree. */
+ trace_add_dt_props();
+}
diff --git a/core/utils.c b/core/utils.c
new file mode 100644
index 0000000..2bc57b1
--- /dev/null
+++ b/core/utils.c
@@ -0,0 +1,59 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <lock.h>
+#include <fsp.h>
+#include <processor.h>
+
+void abort(void)
+{
+ static bool in_abort = false;
+ unsigned long hid0;
+
+ if (in_abort)
+ for (;;) ;
+ in_abort = true;
+
+ bust_locks = true;
+
+ op_display(OP_FATAL, OP_MOD_CORE, 0x6666);
+
+ fputs("Aborting!\n", stderr);
+ backtrace();
+
+ /* XXX FIXME: We should fsp_poll for a while to ensure any pending
+ * console writes have made it out, but until we have decent PSI
+ * link handling we must not do it forever. Polling can prevent the
+ * FSP from bringing the PSI link up and it can get stuck in a
+ * reboot loop.
+ */
+
+ hid0 = mfspr(SPR_HID0);
+ hid0 |= SPR_HID0_ENABLE_ATTN;
+ set_hid0(hid0);
+ trigger_attn();
+ for (;;) ;
+}
+
+char __attrconst tohex(uint8_t nibble)
+{
+ static const char __tohex[] = {'0','1','2','3','4','5','6','7','8','9',
+ 'A','B','C','D','E','F'};
+ if (nibble > 0xf)
+ return '?';
+ return __tohex[nibble];
+}
diff --git a/core/vpd.c b/core/vpd.c
new file mode 100644
index 0000000..deb552c
--- /dev/null
+++ b/core/vpd.c
@@ -0,0 +1,211 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <vpd.h>
+#include <string.h>
+#include <fsp.h>
+#include <device.h>
+
+#define CHECK_SPACE(_p, _n, _e) (((_e) - (_p)) >= (_n))
+
+/* Low level keyword search in a record. Can be used when we
+ * need to find the next keyword of a given type, for example
+ * when a record has multiple MF/SM keyword pairs.
+ */
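+/* Keywords are encoded as two name bytes, a one-byte length, then the data;
+ * e.g. (illustrative) the bytes 'R' 'T' 0x04 "VINI" form an RT keyword
+ * naming the VINI record. */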
+const void *vpd_find_keyword(const void *rec, size_t rec_sz,
+ const char *kw, uint8_t *kw_size)
+{
+ const uint8_t *p = rec, *end = rec + rec_sz;
+
+ while (CHECK_SPACE(p, 3, end)) {
+ uint8_t k1 = *(p++);
+ uint8_t k2 = *(p++);
+ uint8_t sz = *(p++);
+
+ if (k1 == kw[0] && k2 == kw[1]) {
+ if (kw_size)
+ *kw_size = sz;
+ return p;
+ }
+ p += sz;
+ }
+ return NULL;
+}
+
+/* Locate a record in a VPD blob
+ *
+ * Note: This works with VPD LIDs. It will scan until it finds
+ * the first 0x84, so it will skip all those 0's that the VPD
+ * LIDs seem to contain
+ */
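+/* As parsed below, a record looks like: 0x84, a little-endian 16-bit record
+ * size, <size> bytes of keywords (including an RT keyword holding the record
+ * name), then a trailing 0x78 byte. */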
+const void *vpd_find_record(const void *vpd, size_t vpd_size,
+ const char *record, size_t *sz)
+{
+ const uint8_t *p = vpd, *end = vpd + vpd_size;
+ bool first_start = true;
+ size_t rec_sz;
+ uint8_t namesz = 0;
+ const char *rec_name;
+
+ while (CHECK_SPACE(p, 4, end)) {
+ /* Get header byte */
+ if (*(p++) != 0x84) {
+ /* Skip initial crap in VPD LIDs */
+ if (first_start)
+ continue;
+ break;
+ }
+ first_start = false;
+ rec_sz = *(p++);
+ rec_sz |= *(p++) << 8;
+ if (!CHECK_SPACE(p, rec_sz, end)) {
+ prerror("VPD: Malformed or truncated VPD,"
+ " record size doesn't fit\n");
+ return NULL;
+ }
+
+ /* Find record name */
+ rec_name = vpd_find_keyword(p, rec_sz, "RT", &namesz);
+ if (rec_name && strncmp(record, rec_name, namesz) == 0) {
+ *sz = rec_sz;
+ return p;
+ }
+
+ p += rec_sz;
+ if (*(p++) != 0x78) {
+ prerror("VPD: Malformed or truncated VPD,"
+ " missing final 0x78 in record %.4s\n",
+ rec_name ? rec_name : "????");
+ return NULL;
+ }
+ }
+ return NULL;
+}
+
+/* Locate a keyword in a record in a VPD blob
+ *
+ * Note: This works with VPD LIDs. It will scan until it finds
+ * the first 0x84, so it will skip all those 0's that the VPD
+ * LIDs seem to contain
+ */
+const void *vpd_find(const void *vpd, size_t vpd_size,
+ const char *record, const char *keyword,
+ uint8_t *sz)
+{
+ size_t rec_sz;
+ const uint8_t *p;
+
+ p = vpd_find_record(vpd, vpd_size, record, &rec_sz);
+ if (p)
+ p = vpd_find_keyword(p, rec_sz, keyword, sz);
+ return p;
+}
+
+/* Helper to load a VPD LID. Pass a ptr to the corresponding LX keyword */
+static void *vpd_lid_load(const uint8_t *lx, uint8_t lxrn, size_t *size)
+{
+ /* Now this is a guessing game as we don't have the info from the
+ * pHyp folks. But basically, it seems to boil down to loading
+ * a LID whose name is 0x80e000yy where yy is the last 2 digits
+ * of the LX record in hex.
+ *
+ * [ Correction: After a chat with some folks, it looks like it's
+ * actually 4 digits, though the lid number is limited to fff
+ * so we weren't far off. ]
+ *
+ * For safety, we look for a matching LX record in an LXRn
+ * (n = lxrn argument) or in VINI if lxrn=0xff
+ */
+ uint32_t lid_no = 0x80e00000 | ((lx[6] & 0xf) << 8) | lx[7];
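+ /* For example (illustrative values): lx[6] == 0x02 and lx[7] == 0x34
+ * would give lid_no = 0x80e00234. */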
+
+ /* We don't quite know how to get to the LID directory so
+ * we don't know the size. Let's allocate 16K. All the VPD LIDs
+ * I've seen so far are much smaller.
+ */
+#define VPD_LID_MAX_SIZE 0x4000
+ void *data = malloc(VPD_LID_MAX_SIZE);
+ char record[4] = "LXR0";
+ const void *valid_lx;
+ uint8_t lx_size;
+ int rc;
+
+ if (!data) {
+ prerror("VPD: Failed to allocate memory for LID\n");
+ return NULL;
+ }
+
+ /* Adjust LID number for flash side */
+ lid_no = fsp_adjust_lid_side(lid_no);
+ printf("VPD: Trying to load VPD LID 0x%08x...\n", lid_no);
+
+ *size = VPD_LID_MAX_SIZE;
+
+ /* Load it from the FSP */
+ rc = fsp_fetch_data(0, FSP_DATASET_NONSP_LID, lid_no, 0, data, size);
+ if (rc) {
+ prerror("VPD: Error %d loading VPD LID\n", rc);
+ goto fail;
+ }
+
+ /* Validate it */
+ if (lxrn < 9)
+ record[3] = '0' + lxrn;
+ else
+ memcpy(record, "VINI", 4);
+
+ valid_lx = vpd_find(data, *size, record, "LX", &lx_size);
+ if (!valid_lx || lx_size != 8) {
+ prerror("VPD: Cannot find validation LX record\n");
+ goto fail;
+ }
+ if (memcmp(valid_lx, lx, 8) != 0) {
+ prerror("VPD: LX record mismatch !\n");
+ goto fail;
+ }
+
+ printf("VPD: Loaded %zu bytes\n", *size);
+
+ /* Got it ! */
+ return realloc(data, *size);
+ fail:
+ free(data);
+ return NULL;
+}
+
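+/* The "ibm,vpd-lx-info" property starts with a cell holding the LXRn index,
+ * followed by the 8-byte LX keyword value used to validate the LID. */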
+void vpd_iohub_load(struct dt_node *hub_node)
+{
+ void *vpd;
+ size_t sz;
+ const uint32_t *p;
+ unsigned int lx_idx;
+ const char *lxr;
+
+ p = dt_prop_get_def(hub_node, "ibm,vpd-lx-info", NULL);
+ if (!p)
+ return;
+
+ lx_idx = p[0];
+ lxr = (const char *)&p[1];
+
+ vpd = vpd_lid_load(lxr, lx_idx, &sz);
+ if (!vpd) {
+ prerror("VPD: Failed to load VPD LID\n");
+ } else {
+ dt_add_property(hub_node, "ibm,io-vpd", vpd, sz);
+ free(vpd);
+ }
+}