-rw-r--r--  core/init.c                 |    1
-rw-r--r--  doc/device-tree/nvlink.rst  |   27
-rw-r--r--  doc/nvlink.rst              |   22
-rw-r--r--  hw/Makefile.inc             |    2
-rw-r--r--  hw/npu2.c                   | 1388
-rw-r--r--  include/npu2.h              |  152
-rw-r--r--  include/pci.h               |    1
-rw-r--r--  include/skiboot.h           |    1
8 files changed, 1587 insertions, 7 deletions
diff --git a/core/init.c b/core/init.c
index d9d62ee..6b8137c 100644
--- a/core/init.c
+++ b/core/init.c
@@ -952,6 +952,7 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
/* Probe NPUs */
probe_npu();
+ probe_npu2();
/* Initialize PCI */
pci_init_slots();
diff --git a/doc/device-tree/nvlink.rst b/doc/device-tree/nvlink.rst
index 8375dc4..6ce44e9 100644
--- a/doc/device-tree/nvlink.rst
+++ b/doc/device-tree/nvlink.rst
@@ -66,6 +66,27 @@ NPU bindings:
};
};
+GPU memory bindings
+-------------------
+
+.. code-block:: dts
+
+    memory@100000000 {
+       device_type = "memory";
+       compatible = "ibm,coherent-device-memory";
+
+       /* Zero-sized usable region: denotes unplugged system memory */
+       linux,usable-memory = <0x0 0x100000000 0x0 0x0>;
+
+       reg = <0x0 0x100000000 0x0 0x80000000>;
+
+       /* NUMA associativity for the memory once it is hotplugged */
+       ibm,associativity = <0x4 0x0 0x0 0x0 0x64>;
+
+       phandle = <0x10000abc>;
+       linux,phandle = <0x10000abc>;
+    };
+
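As a rough illustration of how firmware could emit such a node, here is an
editor's sketch (not part of this patch) using device-tree helpers that
already appear in this series (dt_new_addr, dt_add_property_strings,
dt_add_property_cells, hi32/lo32); the function name and the base/size
values are hypothetical, taken from the example above::

    /* Editor's sketch: build the example coherent-device-memory node */
    static void add_gpu_memory_node(struct dt_node *root)
    {
        uint64_t base = 0x100000000ull;    /* matches "reg" above */
        uint64_t size = 0x80000000ull;     /* 2GB of GPU memory */
        struct dt_node *mem = dt_new_addr(root, "memory", base);

        dt_add_property_strings(mem, "device_type", "memory");
        dt_add_property_strings(mem, "compatible",
                                "ibm,coherent-device-memory");
        /* Zero-sized usable region: offline until hotplugged */
        dt_add_property_cells(mem, "linux,usable-memory",
                              hi32(base), lo32(base), 0, 0);
        dt_add_property_cells(mem, "reg",
                              hi32(base), lo32(base),
                              hi32(size), lo32(size));
        /* NUMA associativity used once the memory is hotplugged */
        dt_add_property_cells(mem, "ibm,associativity",
                              0x4, 0x0, 0x0, 0x0, 0x64);
    }
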
Emulated PCI device bindings
----------------------------
@@ -83,7 +104,8 @@ Emulated PCI device bindings
ibm,pci-config-space-type = <0x1>;
vendor-id = <0x1014>;
ibm,gpu = <0x100002f7>; /* phandle pointing to the associated GPU PCI device node */
- phandle = <0x100002fc>;
+ memory-region = <0x10000abc>; /* phandle pointing to the GPU memory */
+ phandle = <0x100002fc>;
};
pci@1 {
@@ -94,6 +116,7 @@ Emulated PCI device bindings
ibm,pci-config-space-type = <0x1>;
vendor-id = <0x1014>;
ibm,gpu = <0x100002f5>;
+ memory-region = <0x10000def>;
phandle = <0x100002fe>;
class-code = <0x60400>;
linux,phandle = <0x100002fe>;
@@ -107,6 +130,7 @@ Emulated PCI device bindings
ibm,pci-config-space-type = <0x1>;
vendor-id = <0x1014>;
ibm,gpu = <0x100002f7>;
+ memory-region = <0x10000abc>;
phandle = <0x100002fd>;
class-code = <0x60400>;
linux,phandle = <0x100002fd>;
@@ -120,6 +144,7 @@ Emulated PCI device bindings
ibm,pci-config-space-type = <0x1>;
vendor-id = <0x1014>;
ibm,gpu = <0x100002f5>;
+ memory-region = <0x10000def>;
phandle = <0x100002ff>;
class-code = <0x60400>;
linux,phandle = <0x100002ff>;
diff --git a/doc/nvlink.rst b/doc/nvlink.rst
index d035968..cba64df 100644
--- a/doc/nvlink.rst
+++ b/doc/nvlink.rst
@@ -38,14 +38,15 @@ related to the setup of DMA windows.
Configuration Space Parameters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-============ =============== =====
-============ =============== =====
-Vendor ID    0x1014 (IBM)
+============ =================== =====
+============ =================== =====
+Vendor ID    0x1014 (IBM)
Device ID    0x04ea
Revision ID  0x00
-Class        0x068000 (Bridge Device Other, ProgIf = 0x0)
+Class        0x068000 / 0x068001 (Bridge Device Other, ProgIf = 0x0 / 0x1)
BAR0/1       TL/DL Registers
-============ =============== =====
+BAR2/3       GEN-ID Registers (only for rev-id = 0x1)
+============ =================== =====
TL/DL Registers
^^^^^^^^^^^^^^^
@@ -59,6 +60,17 @@ to 64-bit BAR#0 of the emulated PCI device configuration space. ::
| DL (64K) |
BAR#0 +-----------+
+Generation Registers (GEN-ID)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+On POWER9 each link has 64K of generation ID registers used for
+relaxed ordering mode synchronisation. Refer to the programming guide
+for details of the register layout in this BAR.
+
+Relaxed ordering mode is disabled by default as it requires device
+driver support. Device drivers will need to request relaxed ordering
+mode through a yet-to-be-designed mechanism.
+
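To make the split concrete, here is an editor's sketch (not part of the
patch) of how a link's 64K generation ID window is located inside the
128K per-stack GENID BAR, mirroring the BAR carving done later in
hw/npu2.c; the helper name is made up::

    /* Each stack's 128K GENID BAR is shared by two bricks; the odd
     * brick of the pair owns the upper 64K half. */
    static uint64_t npu2_dev_genid_base(struct npu2_dev *dev)
    {
        struct npu2_bar *genid = dev->bars[1].npu2_bar;

        return genid->base + NPU2DEV_BRICK(dev) * 0x10000;
    }
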
Vendor Specific Capabilities
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
::
diff --git a/hw/Makefile.inc b/hw/Makefile.inc
index d87f85e..bcddcf0 100644
--- a/hw/Makefile.inc
+++ b/hw/Makefile.inc
@@ -6,7 +6,7 @@ HW_OBJS += nx.o nx-rng.o nx-crypto.o nx-842.o
HW_OBJS += p7ioc.o p7ioc-inits.o p7ioc-phb.o
HW_OBJS += phb3.o sfc-ctrl.o fake-rtc.o bt.o p8-i2c.o prd.o
HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o xive.o phb4.o
-HW_OBJS += fake-nvram.o lpc-mbox.o
+HW_OBJS += fake-nvram.o lpc-mbox.o npu2.o
HW=hw/built-in.o
# FIXME hack this for now
diff --git a/hw/npu2.c b/hw/npu2.c
new file mode 100644
index 0000000..ada6b38
--- /dev/null
+++ b/hw/npu2.c
@@ -0,0 +1,1388 @@
+/* Copyright 2013-2016 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci-cfg.h>
+#include <pci.h>
+#include <pci-slot.h>
+#include <pci-virt.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <affinity.h>
+#include <npu2-regs.h>
+#include <npu2.h>
+#include <lock.h>
+#include <xscom.h>
+#include <bitutils.h>
+#include <chip.h>
+
+/*
+ * NPU2 BAR layout definition. We have 3 stacks and each of them
+ * contains 2 bricks. So every NPU2 has 6 bricks in total. There are 2
+ * PHY BARs and each of them is shared by 3 bricks. Every brick has
+ * one NTL BAR and two bricks share one GENID BAR. There is also a
+ * global MMIO BAR. We only expose the NTL and GENID BARs to the OS;
+ * all other BARs are kept hidden inside skiboot.
+ *
+ * Before the global MMIO BAR is configured, scom is the only way to
+ * access the BAR registers. At NPU2 PHB probing time, we rely on scom
+ * to assign all BARs until the global MMIO BAR is established.
+ *
+ * We need to access 4 SM registers in the same stack in order to
+ * configure one particular BAR.
+ */
+#define NPU2_DEFINE_BAR(t, n, s) \
+ { .flags = 0, \
+ .type = t, \
+ .reg = NPU2_##n, \
+ .stack = s, \
+ .base = 0ul, \
+ .size = 0ul, \
+ }
+
+#define VENDOR_CAP_START 0x80
+#define VENDOR_CAP_END 0x90
+
+#define VENDOR_CAP_PCI_DEV_OFFSET 0x0d
+
+static struct npu2_bar npu2_bars[] = {
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_GLOBAL, PHY_BAR, NPU2_STACK_STCK_2),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_PHY, PHY_BAR, NPU2_STACK_STCK_0),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_PHY, PHY_BAR, NPU2_STACK_STCK_1),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_NTL, NTL0_BAR, NPU2_STACK_STCK_0),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_NTL, NTL1_BAR, NPU2_STACK_STCK_0),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_NTL, NTL0_BAR, NPU2_STACK_STCK_1),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_NTL, NTL1_BAR, NPU2_STACK_STCK_1),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_NTL, NTL0_BAR, NPU2_STACK_STCK_2),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_NTL, NTL1_BAR, NPU2_STACK_STCK_2),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_GENID, GENID_BAR, NPU2_STACK_STCK_0),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_GENID, GENID_BAR, NPU2_STACK_STCK_1),
+ NPU2_DEFINE_BAR(NPU2_BAR_TYPE_GENID, GENID_BAR, NPU2_STACK_STCK_2)
+};
+
+/*
+ * We use the indirect method because it uses the same addresses as
+ * the MMIO offsets (NPU RING)
+ */
+static void npu2_scom_set_addr(uint64_t gcid, uint64_t scom_base,
+ uint64_t addr, uint64_t size)
+{
+ addr = SETFIELD(NPU2_MISC_DA_ADDR, 0ull, addr);
+ addr = SETFIELD(NPU2_MISC_DA_LEN, addr, size);
+ xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_ADDR, addr);
+}
+
+static void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
+ uint64_t reg, uint64_t size,
+ uint64_t val)
+{
+ npu2_scom_set_addr(gcid, scom_base, reg, size);
+ xscom_write(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, val);
+}
+
+static uint64_t npu2_scom_read(uint64_t gcid, uint64_t scom_base,
+ uint64_t reg, uint64_t size)
+{
+ uint64_t val;
+
+ npu2_scom_set_addr(gcid, scom_base, reg, size);
+ xscom_read(gcid, scom_base + NPU2_MISC_SCOM_IND_SCOM_DATA, &val);
+
+ return val;
+}
+
+void npu2_write_4b(struct npu2 *p, uint64_t reg, uint64_t val)
+{
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B, val);
+}
+
+uint64_t npu2_read_4b(struct npu2 *p, uint64_t reg)
+{
+ return npu2_scom_read(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_4B);
+}
+
+void npu2_write(struct npu2 *p, uint64_t reg, uint64_t val)
+{
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, val);
+}
+
+uint64_t npu2_read(struct npu2 *p, uint64_t reg)
+{
+ return npu2_scom_read(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B);
+}
+
+void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask)
+{
+ uint64_t new_val;
+
+ new_val = npu2_read(p, reg);
+ new_val &= ~mask;
+ new_val |= val & mask;
+ npu2_scom_write(p->chip_id, p->xscom_base, reg, NPU2_MISC_DA_LEN_8B, new_val);
+}
+
+static inline void npu2_ioda_sel(struct npu2 *p, uint32_t table,
+ uint32_t index, bool autoinc)
+{
+ out_be64(p->regs + NPU2_ATS_IODA_TBL,
+ (autoinc ? NPU2_ATS_IODA_TBL_AUTOINC : 0ul) |
+ SETFIELD(NPU2_ATS_IODA_TBL_SELECT, 0ul, table) |
+ SETFIELD(NPU2_ATS_IODA_TBL_INDEX, 0ul, index));
+}
+
+static struct npu2_dev *npu2_bdf_to_dev(struct npu2 *p,
+ uint32_t bdfn)
+{
+ struct pci_virt_device *pvd;
+
+ /* All emulated devices are attached to the root bus */
+ if (bdfn & ~0xff)
+ return NULL;
+
+ pvd = pci_virt_find_device(&p->phb, bdfn);
+ if (pvd)
+ return pvd->data;
+
+ return NULL;
+}
+
+static void npu2_write_bar(struct npu2 *p,
+ struct npu2_bar *bar,
+ uint32_t gcid,
+ uint32_t scom)
+{
+ uint64_t reg, val, enable = !!(bar->flags & NPU2_BAR_FLAG_ENABLED);
+ int block;
+
+ switch (bar->type) {
+ case NPU2_BAR_TYPE_GLOBAL:
+ case NPU2_BAR_TYPE_PHY:
+ val = SETFIELD(NPU2_PHY_BAR_ADDR, 0ul, bar->base >> 21);
+ val = SETFIELD(NPU2_PHY_BAR_ENABLE, val, enable);
+ break;
+ case NPU2_BAR_TYPE_NTL:
+ val = SETFIELD(NPU2_NTL_BAR_ADDR, 0ul, bar->base >> 17);
+ val = SETFIELD(NPU2_NTL_BAR_ENABLE, val, enable);
+ break;
+ case NPU2_BAR_TYPE_GENID:
+ val = SETFIELD(NPU2_GENID_BAR_ADDR, 0ul, bar->base >> 17);
+ val = SETFIELD(NPU2_GENID_BAR_ENABLE, val, enable);
+ break;
+ default:
+ val = 0ul;
+ }
+
+ for (block = NPU2_BLOCK_SM_0; block <= NPU2_BLOCK_SM_3; block++) {
+ reg = NPU2_REG_OFFSET(bar->stack, block, bar->reg);
+ if (p)
+ npu2_write(p, reg, val);
+ else
+ npu2_scom_write(gcid, scom, reg, NPU2_MISC_DA_LEN_8B, val);
+ }
+}
+
+/* Trap for the PCI command register (0x4) to enable or disable the device's BARs */
+static int64_t npu2_cfg_write_cmd(void *dev,
+ struct pci_cfg_reg_filter *pcrf __unused,
+ uint32_t offset, uint32_t size,
+ uint32_t *data, bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = pvd->data;
+ struct npu2_bar *ntl_npu_bar, *genid_npu_bar;
+ bool enabled;
+
+ if (!write)
+ return OPAL_PARTIAL;
+
+ if (offset != PCI_CFG_CMD)
+ return OPAL_PARAMETER;
+ if (size != 1 && size != 2 && size != 4)
+ return OPAL_PARAMETER;
+
+ /*
+ * Enable or disable the NTL BAR based on the memory space
+ * enable bit of the PCI command register.
+ */
+ enabled = !!(*data & PCI_CFG_CMD_MEM_EN);
+ ntl_npu_bar = ndev->bars[0].npu2_bar;
+ genid_npu_bar = ndev->bars[1].npu2_bar;
+
+ ntl_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, ntl_npu_bar->flags, enabled);
+ npu2_write_bar(ndev->npu, ntl_npu_bar, 0, 0);
+
+ /*
+ * Enable/disable the GENID BAR. Two bricks share one GENID
+ * BAR which is exposed via the first brick so we need to
+ * track the enables separately.
+ */
+ if (NPU2DEV_BRICK(ndev))
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED1, genid_npu_bar->flags,
+ enabled);
+ else
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED0, genid_npu_bar->flags,
+ enabled);
+
+ /* Enable the BAR if either device requests it enabled, otherwise disable it */
+ genid_npu_bar->flags = SETFIELD(NPU2_BAR_FLAG_ENABLED, genid_npu_bar->flags,
+ !!(genid_npu_bar->flags & (NPU2_BAR_FLAG_ENABLED0 |
+ NPU2_BAR_FLAG_ENABLED1)));
+ npu2_write_bar(ndev->npu, genid_npu_bar, 0, 0);
+
+ return OPAL_PARTIAL;
+}
+
+static int64_t npu2_cfg_read_bar(struct npu2_dev *dev __unused,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t *data)
+{
+ struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
+
+ if (!(bar->flags & NPU2_PCIE_BAR_FLAG_TRAPPED))
+ return OPAL_PARTIAL;
+
+ if ((size != 4) ||
+ (offset != pcrf->start && offset != pcrf->start + 4))
+ return OPAL_PARAMETER;
+
+ if (bar->flags & NPU2_PCIE_BAR_FLAG_SIZE_HI)
+ *data = bar->size >> 32;
+ else
+ *data = bar->size;
+ bar->flags &= ~(NPU2_PCIE_BAR_FLAG_TRAPPED | NPU2_PCIE_BAR_FLAG_SIZE_HI);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_cfg_write_bar(struct npu2_dev *dev,
+ struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t size,
+ uint32_t data)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+ struct npu2_pcie_bar *bar = (struct npu2_pcie_bar *) pcrf->data;
+ uint32_t pci_cmd;
+
+ if ((size != 4) ||
+ (offset != pcrf->start && offset != pcrf->start + 4))
+ return OPAL_PARAMETER;
+
+ /* Return BAR size on next read */
+ if (data == 0xffffffff) {
+ bar->flags |= NPU2_PCIE_BAR_FLAG_TRAPPED;
+ if (offset == pcrf->start + 4)
+ bar->flags |= NPU2_PCIE_BAR_FLAG_SIZE_HI;
+
+ return OPAL_SUCCESS;
+ }
+
+ if (offset == pcrf->start) {
+ bar->base &= 0xffffffff00000000;
+ bar->base |= (data & 0xfffffff0);
+ } else {
+ bar->base &= 0x00000000ffffffff;
+ bar->base |= ((uint64_t)data << 32);
+
+ PCI_VIRT_CFG_NORMAL_RD(pvd, PCI_CFG_CMD, 4, &pci_cmd);
+
+ if (bar->npu2_bar->type == NPU2_BAR_TYPE_GENID && NPU2DEV_BRICK(dev))
+ bar->base -= 0x10000;
+
+ /* Only allow changing the base address while memory decoding
+ * is disabled for the device */
+ if ((pci_cmd & PCI_CFG_CMD_MEM_EN) &&
+ (bar->npu2_bar->base != bar->base))
+ return OPAL_HARDWARE;
+
+ bar->npu2_bar->base = bar->base;
+ npu2_write_bar(dev->npu, bar->npu2_bar, 0, 0);
+ }
+
+ /* To update the config cache */
+ return OPAL_PARTIAL;
+}
+
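For reference, an editor's sketch of the probing sequence this filter
serves, from the OS side. Note that unlike standard PCI BAR sizing, the
emulation returns the BAR size itself (not an address mask) on the read
following a 0xffffffff write; cfg_read32/cfg_write32 are hypothetical
stand-ins for the OS's config space accessors::

    static uint64_t probe_ntl_bar_size(uint32_t bdfn)
    {
        uint64_t size;

        /* Trap the low word, then read back size[31:0] */
        cfg_write32(bdfn, PCI_CFG_BAR0, 0xffffffff);
        size = cfg_read32(bdfn, PCI_CFG_BAR0);

        /* Trap the high word, then read back size[63:32] */
        cfg_write32(bdfn, PCI_CFG_BAR0 + 4, 0xffffffff);
        size |= (uint64_t)cfg_read32(bdfn, PCI_CFG_BAR0 + 4) << 32;

        return size;
    }
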
+static int64_t npu2_dev_cfg_bar(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write)
+{
+ struct pci_virt_device *pvd = dev;
+ struct npu2_dev *ndev = (struct npu2_dev *) pvd->data;
+
+ if (write)
+ return npu2_cfg_write_bar(ndev, pcrf, offset, len, *data);
+
+ return npu2_cfg_read_bar(ndev, pcrf, offset, len, data);
+}
+
+#define NPU2_CFG_READ(size, type) \
+static int64_t npu2_cfg_read##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type *data) \
+{ \
+ uint32_t val; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_read(phb, bdfn, offset, \
+ sizeof(*data), &val); \
+ *data = (type)val; \
+ return ret; \
+}
+#define NPU2_CFG_WRITE(size, type) \
+static int64_t npu2_cfg_write##size(struct phb *phb, uint32_t bdfn, \
+ uint32_t offset, type data) \
+{ \
+ uint32_t val = data; \
+ int64_t ret; \
+ \
+ ret = pci_virt_cfg_write(phb, bdfn, offset, \
+ sizeof(data), val); \
+ return ret; \
+}
+
+NPU2_CFG_READ(8, u8);
+NPU2_CFG_READ(16, u16);
+NPU2_CFG_READ(32, u32);
+NPU2_CFG_WRITE(8, u8);
+NPU2_CFG_WRITE(16, u16);
+NPU2_CFG_WRITE(32, u32);
+
+static int __npu2_dev_bind_pci_dev(struct phb *phb __unused,
+ struct pci_device *pd,
+ void *data)
+{
+ struct npu2_dev *dev = data;
+ struct dt_node *pci_dt_node;
+ char *pcislot;
+
+ /* Ignore non-NVIDIA PCI devices */
+ if ((pd->vdid & 0xffff) != 0x10de)
+ return 0;
+
+ /* Find the PCI device's slot location */
+ for (pci_dt_node = pd->dn;
+ pci_dt_node && !dt_find_property(pci_dt_node, "ibm,slot-label");
+ pci_dt_node = pci_dt_node->parent);
+
+ if (!pci_dt_node)
+ return 0;
+
+ pcislot = (char *)dt_prop_get(pci_dt_node, "ibm,slot-label");
+
+ prlog(PR_DEBUG, "NPU: comparing GPU %s and NPU %s\n",
+ pcislot, dev->slot_label);
+
+ if (streq(pcislot, dev->slot_label))
+ return 1;
+
+ return 0;
+}
+
+static void npu2_dev_bind_pci_dev(struct npu2_dev *dev)
+{
+ struct phb *phb;
+ uint32_t i;
+
+ if (dev->pd)
+ return;
+
+ for (i = 0; i < 64; i++) {
+ if (dev->npu->phb.opal_id == i)
+ continue;
+
+ phb = pci_get_phb(i);
+ if (!phb)
+ continue;
+
+ dev->pd = pci_walk_dev(phb, NULL, __npu2_dev_bind_pci_dev, dev);
+ if (dev->pd) {
+ dev->phb = phb;
+ /* Found the device, set the bit in config space */
+ PCI_VIRT_CFG_INIT_RO(dev->pvd, VENDOR_CAP_START +
+ VENDOR_CAP_PCI_DEV_OFFSET, 1, 0x01);
+ return;
+ }
+ }
+
+ prlog(PR_INFO, "%s: No PCI device for NPU device %04x:00:%02x.0 to bind to. If you expect a GPU to be there, this is a problem.\n",
+ __func__, dev->npu->phb.opal_id, dev->index);
+}
+
+static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
+
+static void npu2_append_phandle(struct dt_node *dn,
+ u32 phandle)
+{
+ struct dt_property *prop;
+ uint32_t *npu_phandles;
+ size_t len;
+
+ /*
+ * Use a lock to make sure no one else has a reference to an
+ * ibm,npu property (this assumes this is the only function
+ * that holds a reference to it)
+ */
+ lock(&pci_npu_phandle_lock);
+
+ /* This function shouldn't be called unless ibm,npu exists */
+ prop = (struct dt_property *)dt_require_property(dn, "ibm,npu", -1);
+
+ /* Need to append to the properties */
+ len = prop->len + sizeof(*npu_phandles);
+ dt_resize_property(&prop, len);
+ prop->len = len;
+
+ npu_phandles = (uint32_t *)prop->prop;
+ npu_phandles[len / sizeof(*npu_phandles) - 1] = phandle;
+ unlock(&pci_npu_phandle_lock);
+}
+
+static int npu2_dn_fixup(struct phb *phb,
+ struct pci_device *pd,
+ void *data __unused)
+{
+ struct npu2 *p = phb_to_npu2(phb);
+ struct npu2_dev *dev;
+
+ dev = npu2_bdf_to_dev(p, pd->bdfn);
+ assert(dev);
+ if (dev->phb || dev->pd)
+ return 0;
+
+ dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle);
+
+ /* NPU devices require a slot location to associate with GPUs */
+ dev->slot_label = dt_prop_get_def(pd->dn, "ibm,slot-label", NULL);
+ if (!dev->slot_label) {
+ /**
+ * @fwts-label NPUNoPHBSlotLabel
+ * @fwts-advice No GPU/NPU slot information was found.
+ * NVLink2 functionality will not work.
+ */
+ prlog(PR_ERR, "NPU: Cannot find GPU slot information\n");
+ return 0;
+ }
+
+ /*
+ * Bind the emulated PCI device to the real one, which can't
+ * be done until the PCI devices are populated. Once the real
+ * PCI device is identified, we also need to fix up its
+ * device-tree node
+ */
+ npu2_dev_bind_pci_dev(dev);
+ if (dev->phb && dev->pd && dev->pd->dn) {
+ if (dt_find_property(dev->pd->dn, "ibm,npu"))
+ npu2_append_phandle(dev->pd->dn, pd->dn->phandle);
+ else
+ dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle);
+
+ dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle);
+ dev->gpu_bdfn = dev->pd->bdfn;
+ }
+
+ return 0;
+}
+
+static void npu2_phb_final_fixup(struct phb *phb)
+{
+ pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL);
+}
+
+static void npu2_init_ioda_cache(struct npu2 *p)
+{
+ uint64_t val[2];
+ uint32_t i;
+
+ /*
+ * PE mapping: there are two sets of registers. One of them
+ * is used to map PEs for transactions. The other set is used
+ * for error routing. We should have consistent settings in
+ * both of them. Note that each brick can support at most 3
+ * PEs. For now, we just support one PE per brick.
+ */
+ val[0] = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE;
+ val[0] = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE,
+ val[0], NPU2_RESERVED_PE_NUM);
+ val[1] = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
+ val[1] = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE,
+ val[1], NPU2_RESERVED_PE_NUM);
+ for (i = 0; i < ARRAY_SIZE(p->bdf2pe_cache); i++) {
+ /* The first half of the cache mirrors the CQ mappings,
+ * the second half the MISC mappings */
+ if (i < ARRAY_SIZE(p->bdf2pe_cache) / 2)
+ p->bdf2pe_cache[i] = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF,
+ val[0], i / 3);
+ else
+ p->bdf2pe_cache[i] = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF,
+ val[1], (i - ARRAY_SIZE(p->bdf2pe_cache) / 2) / 3);
+
+ if (i % 3)
+ p->bdf2pe_cache[i] = 0ul;
+ }
+
+ /* TVT */
+ memset(p->tve_cache, 0, sizeof(p->tve_cache));
+}
+
+static int64_t npu2_ioda_reset(struct phb *phb, bool purge)
+{
+ struct npu2 *p = phb_to_npu2(phb);
+ uint32_t i;
+
+ if (purge) {
+ NPU2DBG(p, "Purging all IODA tables...\n");
+ npu2_init_ioda_cache(p);
+ }
+
+ /* TVT */
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, 0, true);
+ for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, p->tve_cache[i]);
+
+ return OPAL_SUCCESS;
+}
+
+static void npu2_hw_init(struct npu2 *p)
+{
+ uint64_t val;
+
+ npu2_ioda_reset(&p->phb, false);
+
+ /* Enable XTS retry mode */
+ val = npu2_read(p, NPU2_XTS_CFG);
+ npu2_write(p, NPU2_XTS_CFG, val | NPU2_XTS_CFG_TRY_ATR_RO);
+}
+
+static int64_t npu2_map_pe_dma_window_real(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint64_t pci_start_addr,
+ uint64_t pci_mem_size)
+{
+ struct npu2 *p = phb_to_npu2(phb);
+ uint64_t end;
+ uint64_t tve;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_num >= NPU2_MAX_PE_NUM ||
+ window_id != pe_num)
+ return OPAL_PARAMETER;
+
+ if (pci_mem_size) {
+ /* Enable */
+
+ end = pci_start_addr + pci_mem_size;
+
+ /* We have to be 16M aligned */
+ if ((pci_start_addr & 0x00ffffff) ||
+ (pci_mem_size & 0x00ffffff))
+ return OPAL_PARAMETER;
+
+ /*
+ * It *looks* like this is the max we can support (we need
+ * to verify this). Also, we are not checking for rollover,
+ * but then we aren't trying too hard to protect ourselves
+ * against a completely broken OS.
+ */
+ if (end > 0x0003ffffffffffffull)
+ return OPAL_PARAMETER;
+
+ /*
+ * Put start address bits 49:24 into TVE[52:53]||[0:23]
+ * and end address bits 49:24 into TVE[54:55]||[24:47]
+ * and set TVE[51]
+ */
+ tve = (pci_start_addr << 16) & (0xffffffull << 40);
+ tve |= (pci_start_addr >> 38) & (3ull << 10);
+ tve |= (end >> 8) & (0xfffffful << 16);
+ tve |= (end >> 40) & (3ull << 8);
+ tve |= PPC_BIT(51);
+ } else {
+ /* Disable */
+ tve = 0;
+ }
+
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, tve);
+ p->tve_cache[window_id] = tve;
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_map_pe_dma_window(struct phb *phb,
+ uint64_t pe_num,
+ uint16_t window_id,
+ uint16_t tce_levels,
+ uint64_t tce_table_addr,
+ uint64_t tce_table_size,
+ uint64_t tce_page_size)
+{
+ struct npu2 *p = phb_to_npu2(phb);
+ uint64_t tts_encoded;
+ uint64_t data64 = 0;
+
+ /* Sanity check. Each PE has one corresponding TVE */
+ if (pe_num >= NPU2_MAX_PE_NUM ||
+ window_id != pe_num)
+ return OPAL_PARAMETER;
+
+ /*
+ * Special condition, zero TCE table size used to disable
+ * the TVE.
+ */
+ if (!tce_table_size) {
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, 0ul);
+ p->tve_cache[window_id] = 0ul;
+ return OPAL_SUCCESS;
+ }
+
+ /* Additional arguments validation */
+ if (tce_levels < 1 ||
+ tce_levels > 4 ||
+ !is_pow2(tce_table_size) ||
+ tce_table_size < 0x1000)
+ return OPAL_PARAMETER;
+
+ /* TCE table size */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_TTA, 0ul, tce_table_addr >> 12);
+ tts_encoded = ilog2(tce_table_size) - 11;
+ if (tts_encoded > 39)
+ return OPAL_PARAMETER;
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_SIZE, data64, tts_encoded);
+
+ /* TCE page size */
+ switch (tce_page_size) {
+ case 0x10000: /* 64K */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 5);
+ break;
+ case 0x1000000: /* 16M */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 13);
+ break;
+ case 0x10000000: /* 256M */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 17);
+ break;
+ case 0x1000: /* 4K */
+ default:
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_PSIZE, data64, 1);
+ }
+
+ /* Number of levels */
+ data64 = SETFIELD(NPU2_ATS_IODA_TBL_TVT_LEVEL, data64, tce_levels - 1);
+
+ /* Update to hardware */
+ npu2_ioda_sel(p, NPU2_ATS_IODA_TBL_TVT, window_id, false);
+ out_be64(p->regs + NPU2_ATS_IODA_DATA, data64);
+ p->tve_cache[window_id] = data64;
+
+ return OPAL_SUCCESS;
+}
+
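The page-size encodings in the switch above follow a single pattern:
PSIZE = ilog2(page size) - 11, i.e. 4K -> 1, 64K -> 5, 16M -> 13,
256M -> 17. Editor's note: for the supported power-of-two sizes the
switch is equivalent to this helper (the name is made up, and the
original switch additionally forces unknown sizes down to the 4K
encoding, which this does not)::

    static inline uint64_t npu2_tvt_psize(uint64_t tce_page_size)
    {
        /* 0x1000 -> 1, 0x10000 -> 5, 0x1000000 -> 13, 0x10000000 -> 17 */
        return ilog2(tce_page_size) - 11;
    }
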
+static int64_t npu2_set_pe(struct phb *phb,
+ uint64_t pe_num,
+ uint64_t bdfn,
+ uint8_t bcompare,
+ uint8_t dcompare,
+ uint8_t fcompare,
+ uint8_t action)
+{
+ struct npu2 *p = phb_to_npu2(phb);
+ struct npu2_dev *dev;
+ uint64_t reg, val;
+
+ /* Sanity check */
+ if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
+ return OPAL_PARAMETER;
+ if (pe_num >= NPU2_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+ if (bdfn >> 8)
+ return OPAL_PARAMETER;
+ if (bcompare != OpalPciBusAll ||
+ dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+ fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+ return OPAL_UNSUPPORTED;
+
+ /* Get the NPU2 device */
+ dev = npu2_bdf_to_dev(p, bdfn);
+ if (!dev)
+ return OPAL_PARAMETER;
+
+ val = NPU2_CQ_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_PE, val, pe_num);
+ val = SETFIELD(NPU2_CQ_BRICK_BDF2PE_MAP_BDF, val, dev->gpu_bdfn);
+
+ if (!NPU2DEV_BRICK(dev))
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->index/2,
+ NPU2_BLOCK_CTL, NPU2_CQ_BRICK0_BDF2PE_MAP0);
+ else
+ reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + dev->index/2,
+ NPU2_BLOCK_CTL, NPU2_CQ_BRICK1_BDF2PE_MAP0);
+
+ npu2_write(p, reg, val);
+ val = NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE;
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_PE, val, pe_num);
+ val = SETFIELD(NPU2_MISC_BRICK_BDF2PE_MAP_BDF, val, dev->gpu_bdfn);
+ reg = NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC,
+ NPU2_MISC_BRICK0_BDF2PE_MAP0 + (dev->index * 0x18));
+ p->bdf2pe_cache[dev->index] = val;
+ npu2_write(p, reg, val);
+
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_get_link_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ /*
+ * As we're emulating all the PCI stuff, the link bandwidth
+ * isn't a big deal anyway.
+ */
+ *val = OPAL_SHPC_LINK_UP_x1;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_get_power_state(struct pci_slot *slot __unused, uint8_t *val)
+{
+ *val = PCI_SLOT_POWER_ON;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_hreset(struct pci_slot *slot __unused)
+{
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_freset(struct pci_slot *slot __unused)
+{
+ return OPAL_SUCCESS;
+}
+
+static struct pci_slot *npu2_slot_create(struct phb *phb)
+{
+ struct pci_slot *slot;
+
+ slot = pci_slot_alloc(phb, NULL);
+ if (!slot)
+ return slot;
+
+ /* Elementary functions */
+ slot->ops.get_presence_state = NULL;
+ slot->ops.get_link_state = npu2_get_link_state;
+ slot->ops.get_power_state = npu2_get_power_state;
+ slot->ops.get_attention_state = NULL;
+ slot->ops.get_latch_state = NULL;
+ slot->ops.set_power_state = NULL;
+ slot->ops.set_attention_state = NULL;
+
+ slot->ops.prepare_link_change = NULL;
+ slot->ops.poll_link = NULL;
+ slot->ops.hreset = npu2_hreset;
+ slot->ops.freset = npu2_freset;
+ slot->ops.creset = NULL;
+
+ return slot;
+}
+
+static int64_t npu2_freeze_status(struct phb *phb __unused,
+ uint64_t pe_number __unused,
+ uint8_t *freeze_state,
+ uint16_t *pci_error_type __unused,
+ uint16_t *severity __unused,
+ uint64_t *phb_status __unused)
+{
+ /*
+ * FIXME: When this is called by the skiboot PCI config accessor,
+ * the PE number is fixed to 0, which is incorrect. We need to
+ * introduce another PHB callback to translate it. For now,
+ * this keeps the skiboot PCI enumeration going.
+ */
+ *freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+ return OPAL_SUCCESS;
+}
+
+static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
+ uint64_t pe_number, uint32_t tce_size,
+ uint64_t dma_addr, uint32_t npages)
+{
+ struct npu2 *npu = phb_to_npu2(phb);
+ uint32_t tce_page_size;
+ uint64_t val;
+
+ if (pe_number >= NPU2_MAX_PE_NUM)
+ return OPAL_PARAMETER;
+
+ sync();
+ switch(kill_type) {
+ case OPAL_PCI_TCE_KILL_PAGES:
+ tce_page_size = GETFIELD(npu->tve_cache[pe_number], NPU2_ATS_IODA_TBL_TVT_PSIZE);
+ if (tce_page_size != tce_size) {
+ NPU2ERR(npu, "npu2_tce_kill: Unexpected TCE size (got 0x%x expected 0x%x)\n",
+ tce_size, tce_page_size);
+ return OPAL_PARAMETER;
+ }
+
+ while (npages--) {
+ val = SETFIELD(NPU2_ATS_TCE_KILL_PENUM, dma_addr, pe_number);
+ npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ONE | val);
+ }
+ break;
+ case OPAL_PCI_TCE_KILL_PE:
+ /*
+ * NPU2 doesn't support killing a PE so fall through
+ * and do a kill all instead.
+ */
+ case OPAL_PCI_TCE_KILL:
+ npu2_write(npu, NPU2_ATS_TCE_KILL, NPU2_ATS_TCE_KILL_ALL);
+ break;
+ default:
+ return OPAL_PARAMETER;
+ }
+
+ return OPAL_SUCCESS;
+}
+
+static const struct phb_ops npu_ops = {
+ .cfg_read8 = npu2_cfg_read8,
+ .cfg_read16 = npu2_cfg_read16,
+ .cfg_read32 = npu2_cfg_read32,
+ .cfg_write8 = npu2_cfg_write8,
+ .cfg_write16 = npu2_cfg_write16,
+ .cfg_write32 = npu2_cfg_write32,
+ .choose_bus = NULL,
+ .device_init = NULL,
+ .phb_final_fixup = npu2_phb_final_fixup,
+ .ioda_reset = npu2_ioda_reset,
+ .papr_errinjct_reset = NULL,
+ .pci_reinit = NULL,
+ .set_phb_mem_window = NULL,
+ .phb_mmio_enable = NULL,
+ .map_pe_mmio_window = NULL,
+ .map_pe_dma_window = npu2_map_pe_dma_window,
+ .map_pe_dma_window_real = npu2_map_pe_dma_window_real,
+ .pci_msi_eoi = NULL,
+ .set_xive_pe = NULL,
+ .get_msi_32 = NULL,
+ .get_msi_64 = NULL,
+ .set_pe = npu2_set_pe,
+ .set_peltv = NULL,
+ .eeh_freeze_status = npu2_freeze_status,
+ .eeh_freeze_clear = NULL,
+ .eeh_freeze_set = NULL,
+ .next_error = NULL,
+ .err_inject = NULL,
+ .get_diag_data = NULL,
+ .get_diag_data2 = NULL,
+ .set_capi_mode = NULL,
+ .set_capp_recovery = NULL,
+ .tce_kill = npu2_tce_kill,
+};
+
+static void assign_mmio_bars(uint32_t gcid,
+ uint32_t scom)
+{
+ uint64_t mem_start;
+ struct npu2_bar *bar;
+ uint32_t i;
+
+ mem_start = 0x6030200000000;
+ mem_start |= gcid << 21;
+
+ /*
+ * We're going to assign the BARs in order of decreasing size,
+ * which is exactly the order they appear in npu2_bars[].
+ * That way, all BARs are aligned perfectly without
+ * wasting resources, and the Linux kernel won't change
+ * anything even though it attempts to reassign the BARs
+ * it can see, which are the NTL and GENID BARs.
+ *
+ * GLOBAL MMIO (16MB)
+ * PHY0 (2MB)
+ * PHY1 (2MB)
+ * NTL0 (128KB)
+ * NTL1 (128KB)
+ * NTL2 (128KB)
+ * NTL3 (128KB)
+ * NTL4 (128KB)
+ * NTL5 (128KB)
+ * GENID0 (128KB)
+ * GENID1 (128KB)
+ * GENID2 (128KB)
+ */
+ for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) {
+ bar = &npu2_bars[i];
+ switch (bar->type) {
+ case NPU2_BAR_TYPE_GLOBAL:
+ bar->flags |= NPU2_BAR_FLAG_ENABLED;
+ bar->size = 0x1000000;
+ break;
+ case NPU2_BAR_TYPE_PHY:
+ bar->flags |= NPU2_BAR_FLAG_ENABLED;
+ bar->size = 0x200000;
+ break;
+ case NPU2_BAR_TYPE_NTL:
+ bar->flags &= ~NPU2_BAR_FLAG_ENABLED;
+ bar->size = 0x20000;
+ break;
+ case NPU2_BAR_TYPE_GENID:
+ bar->flags &= ~NPU2_BAR_FLAG_ENABLED;
+ bar->size = 0x20000;
+ break;
+ default:
+ bar->size = 0ul;
+ }
+
+ bar->base = mem_start;
+ mem_start += bar->size;
+ npu2_write_bar(NULL, bar, gcid, scom);
+ }
+}
+
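Because every BAR size here is a power of two and the sizes only
shrink, starting from a base aligned to the largest BAR keeps every
subsequent BAR naturally aligned with zero padding. A standalone check
of the layout above (editor's illustration, taking the gcid = 0 base)::

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Sizes in assignment order: global, 2 PHYs, 6 NTLs, 3 GENIDs */
        uint64_t sizes[] = { 0x1000000, 0x200000, 0x200000,
                             0x20000, 0x20000, 0x20000, 0x20000, 0x20000,
                             0x20000, 0x20000, 0x20000, 0x20000 };
        uint64_t base = 0x6030200000000ull;    /* 16MB aligned */
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
            assert((base & (sizes[i] - 1)) == 0);    /* naturally aligned */
            base += sizes[i];
        }
        return 0;
    }
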
+/*
+ * Probe the NPU device node and create a PCI root device node
+ * accordingly. The NPU device node should specify the number
+ * of links and the xscom base address used to access the links.
+ */
+static void npu2_probe_phb(struct dt_node *dn)
+{
+ struct dt_node *np;
+ uint32_t gcid, scom, index, phb_index, links;
+ uint64_t reg[2], mm_win[2];
+ char *path;
+
+ /* Retrieve chip id */
+ path = dt_get_path(dn);
+ gcid = dt_get_chip_id(dn);
+ index = dt_prop_get_u32(dn, "ibm,npu-index");
+ phb_index = dt_prop_get_u32(dn, "ibm,phb-index");
+ links = dt_prop_get_u32(dn, "ibm,npu-links");
+ prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n",
+ gcid, index, links, path);
+ free(path);
+
+ /* Retrieve scom base address */
+ scom = dt_get_address(dn, 0, NULL);
+ prlog(PR_INFO, " SCOM Base: %08x\n", scom);
+
+ /* Reassign the BARs */
+ assign_mmio_bars(gcid, scom);
+
+ /* Global MMIO BAR */
+ reg[0] = npu2_bars[0].base;
+ reg[1] = npu2_bars[0].size;
+ if (reg[0] && reg[1])
+ prlog(PR_INFO, " Global MMIO BAR: %016llx (%lldMB)\n",
+ reg[0], reg[1] >> 20);
+ else
+ prlog(PR_ERR, " Global MMIO BAR: Disabled\n");
+
+ /* The NTL and GENID BARs are exposed to the kernel */
+ mm_win[0] = npu2_bars[3].base;
+ mm_win[1] = npu2_bars[ARRAY_SIZE(npu2_bars) - 1].base +
+ npu2_bars[ARRAY_SIZE(npu2_bars) - 1].size -
+ mm_win[0];
+
+ /* Populate PCI root device node */
+ np = dt_new_addr(dt_root, "pciex", reg[0]);
+ assert(np);
+ dt_add_property_strings(np,
+ "compatible",
+ "ibm,power9-npu-pciex",
+ "ibm,ioda2-npu2-phb");
+ dt_add_property_strings(np, "device_type", "pciex");
+ dt_add_property(np, "reg", reg, sizeof(reg));
+ dt_add_property_cells(np, "ibm,phb-index", phb_index);
+ dt_add_property_cells(np, "ibm,npu-index", index);
+ dt_add_property_cells(np, "ibm,chip-id", gcid);
+ dt_add_property_cells(np, "ibm,xscom-base", scom);
+ dt_add_property_cells(np, "ibm,npcq", dn->phandle);
+ dt_add_property_cells(np, "ibm,links", links);
+ dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
+}
+
+static uint32_t npu2_populate_pcie_cap(struct npu2_dev *dev,
+ uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+ uint32_t val;
+
+ /* Add capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
+
+ /* 0x00 - ID/PCIE capability */
+ val = PCI_CFG_CAP_ID_EXP;
+ val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
+
+ /* 0x04 - Device capability
+ *
+ * We should support FLR. Otherwise, there might be problems
+ * passing the device through to userland via the Linux
+ * VFIO infrastructure
+ */
+ val = ((PCIE_MPSS_128) |
+ (PCIE_PHANTOM_NONE << 3) |
+ (PCIE_L0SL_MAX_NO_LIMIT << 6) |
+ (PCIE_L1L_MAX_NO_LIMIT << 9) |
+ (PCICAP_EXP_DEVCAP_FUNC_RESET));
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
+
+ /* 0x08 - Device control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+ 0xffff0000, 0x000f0000);
+
+ /* 0x0c - Link capability */
+ val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
+
+ /* 0x10 - Link control and status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
+ 0xfffff000, 0xc0000000);
+
+ /* 0x14 - Slot capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+ /* 0x18 - Slot control and status */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+ /* 0x1c - Root control and capability */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
+ 0xffffffe0, 0x00000000);
+
+ /* 0x20 - Root status */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
+ 0xffffffff, 0x00010000);
+
+ /* 0x24 - Device capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DCAP2, 4, 0x00000000);
+
+ /* 0x28 - Device Control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
+ 0xffff0000, 0x00000000);
+
+ /* 0x2c - Link capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+ /* 0x30 - Link control and status 2 */
+ PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
+ 0xffff0000, 0x00200000);
+
+ /* 0x34 - Slot capability 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+ /* 0x38 - Slot control and status 2 */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
+
+ return start + PCICAP_EXP_SCTL2 + 8;
+}
+
+static uint32_t npu2_populate_vendor_cap(struct npu2_dev *dev,
+ uint32_t start,
+ uint32_t prev_cap)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+
+#define NPU2_VENDOR_CAP_VERSION 0x00
+#define NPU2_VENDOR_CAP_LEN 0x10
+
+ /* Capability list */
+ PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
+ PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
+ dev->vendor_cap = start;
+
+ /* Length and version */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, NPU2_VENDOR_CAP_LEN);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, NPU2_VENDOR_CAP_VERSION);
+
+ /*
+ * Defaults when the trap can't handle the read/write (eg. due
+ * to reading/writing less than 4 bytes).
+ */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
+
+ /* Link index */
+ PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, dev->index);
+
+ return start + NPU2_VENDOR_CAP_LEN;
+}
+
+static void npu2_populate_cfg(struct npu2_dev *dev)
+{
+ struct pci_virt_device *pvd = dev->pvd;
+ struct npu2_pcie_bar *bar;
+ uint32_t pos;
+
+ /* 0x00 - Vendor/Device ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+ /* 0x04 - Command/Status */
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+ 0xf9000000);
+
+ pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
+ npu2_cfg_write_cmd, NULL);
+
+ /* 0x08 - Rev/Class/Cache */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800001);
+
+ /* 0x0c - CLS/Latency Timer/Header/BIST */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+ /* 0x10/14 - BAR#0, NTL BAR */
+ bar = &dev->bars[0];
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4,
+ (bar->base & 0xfffffff0) | (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, (bar->base >> 32),
+ 0x00000000, 0x00000000);
+ pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_bar, bar);
+
+ /* 0x18/1c - BAR#1, GENID BAR */
+ bar = &dev->bars[1];
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, (bar->base & 0xfffffff0) |
+ (bar->flags & 0xF),
+ 0x0000000f, 0x00000000);
+ PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, (bar->base >> 32), 0x00000000,
+ 0x00000000);
+ pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
+ PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
+ npu2_dev_cfg_bar, bar);
+
+ /* 0x20/0x24 - BARs, disabled */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
+
+ /* 0x28 - Cardbus CIS pointer */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+ /* 0x2c - Subsystem ID */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+ /* 0x30 - ROM BAR, zero sized */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+ /* 0x34 - PCI Capability */
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
+
+ /* 0x38 - Reserved */
+ PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
+
+ /* 0x3c - INT line/pin/Minimal grant/Maximal latency */
+ if (!NPU2DEV_BRICK(dev))
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100);
+ else
+ PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000200);
+
+ /* PCIE and vendor specific capability */
+ pos = npu2_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
+ npu2_populate_vendor_cap(dev, pos, 0x41);
+ PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
+}
+
+static uint32_t npu_allocate_bdfn(struct npu2 *p, uint32_t group)
+{
+ int i;
+ int bdfn = (group << 3);
+
+ for (i = 0; i < p->total_devices; i++) {
+ if ((p->devices[i].bdfn & 0xf8) == (bdfn & 0xf8))
+ bdfn++;
+ }
+
+ return bdfn;
+}
+
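In other words, the group id supplies the PCI device number (bdfn bits
7:3) and each extra link in the same group bumps the function number. A
self-contained rerun of the loop above for two links in group 1 and one
in group 2 (editor's illustration)::

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t groups[] = { 1, 1, 2 };
        uint32_t bdfn[3];
        uint32_t i, n;

        for (n = 0; n < 3; n++) {
            bdfn[n] = groups[n] << 3;
            for (i = 0; i < n; i++)
                if ((bdfn[i] & 0xf8) == (bdfn[n] & 0xf8))
                    bdfn[n]++;
        }
        assert(bdfn[0] == 0x08);    /* device 1, function 0 */
        assert(bdfn[1] == 0x09);    /* device 1, function 1 */
        assert(bdfn[2] == 0x10);    /* device 2, function 0 */
        return 0;
    }
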
+static void npu2_populate_devices(struct npu2 *p,
+ struct dt_node *dn)
+{
+ struct npu2_dev *dev;
+ struct dt_node *npu2_dn, *link;
+ uint32_t npu_phandle, index = 0;
+
+ /*
+ * Get the NPU node, whose links we expand here into
+ * PCI-like devices attached to our emulated PHB.
+ */
+ npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+ npu2_dn = dt_find_by_phandle(dt_root, npu_phandle);
+ assert(npu2_dn);
+
+ /* Walk the link@x nodes to initialize devices */
+ p->total_devices = 0;
+ p->phb.scan_map = 0;
+ dt_for_each_compatible(npu2_dn, link, "ibm,npu-link") {
+ uint32_t group_id;
+
+ dev = &p->devices[index];
+ dev->npu = p;
+ dev->dt_node = link;
+ dev->index = dt_prop_get_u32(link, "ibm,npu-link-index");
+
+ group_id = dt_prop_get_u32(link, "ibm,npu-group-id");
+ dev->bdfn = npu_allocate_bdfn(p, group_id);
+
+ /* This must be done after calling
+ * npu_allocate_bdfn() */
+ p->total_devices++;
+ p->phb.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);
+
+ dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+
+ /* Populate BARs. BAR0/1 is the NTL bar. */
+ dev->bars[0].npu2_bar = &npu2_bars[3 + dev->index];
+ dev->bars[0].base = dev->bars[0].npu2_bar->base;
+ dev->bars[0].size = dev->bars[0].npu2_bar->size;
+ dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
+
+ /* BAR2/3 is the GENID bar. */
+ dev->bars[1].npu2_bar = &npu2_bars[9 + dev->index / 2];
+ dev->bars[1].base = dev->bars[1].npu2_bar->base + (NPU2DEV_BRICK(dev) * 0x10000);
+ dev->bars[1].size = 0x10000;
+ dev->bars[1].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
+
+ /* Initialize PCI virtual device */
+ dev->pvd = pci_virt_add_device(&p->phb, dev->bdfn, 0x100, dev);
+ if (dev->pvd) {
+ p->phb.scan_map |=
+ 0x1 << ((dev->pvd->bdfn & 0xf8) >> 3);
+ npu2_populate_cfg(dev);
+ }
+
+ index++;
+ }
+}
+
+static void npu2_add_phb_properties(struct npu2 *p)
+{
+ struct dt_node *np = p->phb.dt_node;
+ uint32_t icsp = get_ics_phandle();
+ uint64_t mm_base, mm_size, mmio_atsd;
+
+ /*
+ * Add various properties that HB (hostboot) doesn't have to
+ * add, some of them simply because they result from
+ * policy decisions made in skiboot rather than in HB,
+ * such as the MMIO windows going to PCI, interrupts,
+ * etc.
+ */
+ dt_add_property_cells(np, "#address-cells", 3);
+ dt_add_property_cells(np, "#size-cells", 2);
+ dt_add_property_cells(np, "#interrupt-cells", 1);
+ dt_add_property_cells(np, "bus-range", 0, 0xff);
+ dt_add_property_cells(np, "clock-frequency", 0x200, 0);
+ dt_add_property_cells(np, "interrupt-parent", icsp);
+
+ /* NPU PHB properties */
+ dt_add_property_cells(np, "ibm,opal-num-pes",
+ NPU2_MAX_PE_NUM);
+ dt_add_property_cells(np, "ibm,opal-reserved-pe",
+ NPU2_RESERVED_PE_NUM);
+
+ mmio_atsd = (u64) p->regs +
+ NPU2_REG_OFFSET(NPU2_STACK_ATSD, NPU2_BLOCK_ATSD0, NPU2_XTS_MMIO_ATSD_LAUNCH);
+ dt_add_property_cells(np, "ibm,mmio-atsd", hi32(mmio_atsd),
+ lo32(mmio_atsd));
+
+ /*
+ * The memory window is exposed as a 64-bit non-prefetchable
+ * one because the kernel treats 64-bit prefetchable windows
+ * specially.
+ */
+ mm_base = p->mm_base;
+ mm_size = p->mm_size;
+ dt_add_property_cells(np, "ranges", 0x02000000,
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_base), lo32(mm_base),
+ hi32(mm_size), lo32(mm_size));
+}
+
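For reference (editor's note, following the standard PCI bus binding,
with a hypothetical struct name): each entry of the "ranges" property
built above is seven cells, a 3-cell PCI address whose phys.hi of
0x02000000 marks non-prefetchable memory space, a 2-cell parent (CPU)
address and a 2-cell size; the window is mapped 1:1 here::

    /* Shape of one entry of the "ranges" property built above
     * (all fields are big-endian u32 cells in the flattened tree) */
    struct pci_ranges_entry {
        uint32_t pci_hi;    /* 0x02000000: memory space, non-prefetch */
        uint32_t pci_mid;   /* PCI address bits 63:32 */
        uint32_t pci_lo;    /* PCI address bits 31:0 */
        uint32_t cpu_hi;    /* parent (CPU) address bits 63:32 */
        uint32_t cpu_lo;    /* parent (CPU) address bits 31:0 */
        uint32_t size_hi;   /* window size bits 63:32 */
        uint32_t size_lo;   /* window size bits 31:0 */
    };
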
+static void npu2_create_phb(struct dt_node *dn)
+{
+ const struct dt_property *prop;
+ struct npu2 *p;
+ struct pci_slot *slot;
+ uint32_t links;
+ void *pmem;
+
+ /* Retrieve number of devices */
+ links = dt_prop_get_u32(dn, "ibm,links");
+ pmem = zalloc(sizeof(struct npu2) + links * sizeof(struct npu2_dev));
+ assert(pmem);
+
+ /* Populate PHB */
+ p = pmem;
+ p->index = dt_prop_get_u32(dn, "ibm,phb-index");
+ p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id");
+ p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base");
+ p->total_devices = links;
+
+ p->regs = (void *)dt_get_address(dn, 0, NULL);
+
+ prop = dt_require_property(dn, "ibm,mmio-window", -1);
+ assert(prop->len >= (2 * sizeof(uint64_t)));
+ p->mm_base = ((const uint64_t *)prop->prop)[0];
+ p->mm_size = ((const uint64_t *)prop->prop)[1];
+
+ p->devices = pmem + sizeof(struct npu2);
+
+ /* Generic PHB */
+ p->phb.dt_node = dn;
+ p->phb.ops = &npu_ops;
+ p->phb.phb_type = phb_type_npu_v2;
+ init_lock(&p->lock);
+ init_lock(&p->phb.lock);
+ list_head_init(&p->phb.devices);
+ list_head_init(&p->phb.virt_devices);
+
+ npu2_populate_devices(p, dn);
+ npu2_add_phb_properties(p);
+
+ slot = npu2_slot_create(&p->phb);
+ if (!slot) {
+ /**
+ * @fwts-label NPUCannotCreatePHBSlot
+ * @fwts-advice Firmware probably ran out of memory creating
+ * NPU slot. NVLink functionality could be broken.
+ */
+ prlog(PR_ERR, "NPU: Cannot create PHB slot\n");
+ }
+
+ pci_register_phb(&p->phb, OPAL_DYNAMIC_PHB_ID);
+
+ npu2_init_ioda_cache(p);
+ npu2_hw_init(p);
+}
+
+void probe_npu2(void)
+{
+ struct dt_node *np;
+
+ /* Scan NPU XSCOM nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu")
+ npu2_probe_phb(np);
+
+ /* Scan newly created PHB nodes */
+ dt_for_each_compatible(dt_root, np, "ibm,power9-npu-pciex")
+ npu2_create_phb(np);
+}
diff --git a/include/npu2.h b/include/npu2.h
new file mode 100644
index 0000000..ec62ad2
--- /dev/null
+++ b/include/npu2.h
@@ -0,0 +1,152 @@
+/* Copyright 2013-2016 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NPU2_H
+#define __NPU2_H
+
+/* Debugging options */
+#define NPU2DBG(p, fmt, a...) prlog(PR_DEBUG, "NPU%d: " fmt, \
+ (p)->phb.opal_id, ##a)
+#define NPU2INF(p, fmt, a...) prlog(PR_INFO, "NPU%d: " fmt, \
+ (p)->phb.opal_id, ##a)
+#define NPU2ERR(p, fmt, a...) prlog(PR_ERR, "NPU%d: " fmt, \
+ (p)->phb.opal_id, ##a)
+
+/* Number of PEs supported */
+#define NPU2_MAX_PE_NUM 16
+#define NPU2_RESERVED_PE_NUM 15
+
+/* Return the stack (0-2) of a device */
+#define NPU2DEV_STACK(ndev) ((ndev)->index / 2)
+
+/* Return the brick number (0-1) within a stack */
+#define NPU2DEV_BRICK(ndev) ((ndev)->index % 2)
+
+/* This represents the state of the actual hardware BARs, not the
+ * emulated PCIe BARs. There is a subtle difference between the two as
+ * not all BARs are exposed outside of skiboot. */
+struct npu2_bar {
+#define NPU2_BAR_FLAG_ENABLED 0x0010
+
+/* Generation IDs are a single space in the hardware but we split
+ * them in two for the emulated PCIe devices, so we need to keep track
+ * of which one has been enabled/disabled. */
+#define NPU2_BAR_FLAG_ENABLED0 0x0080
+#define NPU2_BAR_FLAG_ENABLED1 0x0100
+ uint32_t flags;
+
+#define NPU2_BAR_TYPE_GLOBAL 0
+#define NPU2_BAR_TYPE_PHY 1
+#define NPU2_BAR_TYPE_NTL 2
+#define NPU2_BAR_TYPE_GENID 3
+#define NPU2_BAR_TYPE_MAX 4
+ uint32_t type;
+ uint64_t reg;
+ uint64_t stack;
+ uint64_t base;
+ uint64_t size;
+};
+
+/* Represents a BAR that is exposed via the emulated PCIe
+ * devices */
+struct npu2_pcie_bar {
+#define NPU2_PCIE_BAR_FLAG_SIZE_HI 0x0020
+#define NPU2_PCIE_BAR_FLAG_TRAPPED 0x0040
+ uint32_t flags;
+ struct npu2_bar *npu2_bar;
+ uint64_t base;
+ uint64_t size;
+};
+
+struct npu2;
+struct npu2_dev {
+ uint32_t index;
+ uint32_t flags;
+ uint64_t xscom;
+ void *regs;
+ struct dt_node *dt_node;
+ struct npu2_pcie_bar bars[2];
+ struct npu2 *npu;
+
+ /* Device and function numbers are allocated based on GPU
+ * association. Links connected to the same GPU will be
+ * exposed as different functions of the same bus/device. */
+ uint32_t bdfn;
+ uint32_t gpu_bdfn;
+
+ /* PCI virtual device and the associated GPU device */
+ struct pci_virt_device *pvd;
+ struct phb *phb;
+ struct pci_device *pd;
+
+ /* Vendor specific capability */
+ uint32_t vendor_cap;
+
+ /* Which PHY lanes this device is associated with */
+ uint16_t lane_mask;
+
+ /* Track currently running procedure and step number */
+ uint16_t procedure_number;
+ uint16_t procedure_step;
+ uint64_t procedure_data;
+ unsigned long procedure_tb;
+ uint32_t procedure_status;
+
+ /* Used to associate the NPU device with GPU PCI devices */
+ const char *slot_label;
+};
+
+struct npu2 {
+ uint32_t index;
+ uint32_t flags;
+ uint32_t chip_id;
+ uint64_t xscom_base;
+ uint64_t at_xscom;
+ void *regs;
+ uint64_t mm_base;
+ uint64_t mm_size;
+ uint32_t base_lsi;
+ uint32_t total_devices;
+ struct npu2_dev *devices;
+
+ /* IODA cache */
+ uint64_t lxive_cache[8];
+ uint64_t bdf2pe_cache[36];
+ uint64_t tve_cache[16];
+ bool tx_zcal_complete[2];
+
+ /* Used to protect global MMIO space, in particular the XTS
+ * tables. */
+ struct lock lock;
+
+ struct phb phb;
+};
+
+static inline struct npu2 *phb_to_npu2(struct phb *phb)
+{
+ return container_of(phb, struct npu2, phb);
+}
+
+void npu2_write_4b(struct npu2 *p, uint64_t reg, uint64_t val);
+uint64_t npu2_read_4b(struct npu2 *p, uint64_t reg);
+void npu2_write(struct npu2 *p, uint64_t reg, uint64_t val);
+uint64_t npu2_read(struct npu2 *p, uint64_t reg);
+void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
+int64_t npu2_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
+ uint32_t offset, uint32_t len, uint32_t *data,
+ bool write);
+
+#endif /* __NPU2_H */
diff --git a/include/pci.h b/include/pci.h
index 44bedf6..732c1a3 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -328,6 +328,7 @@ enum phb_type {
phb_type_pcie_v2,
phb_type_pcie_v3,
phb_type_pcie_v4,
+ phb_type_npu_v2,
};
struct phb {
diff --git a/include/skiboot.h b/include/skiboot.h
index c55995b..8bc767a 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -217,6 +217,7 @@ extern void phb3_preload_vpd(void);
extern int phb4_preload_capp_ucode(void);
extern void phb4_preload_vpd(void);
extern void probe_npu(void);
+extern void probe_npu2(void);
extern void uart_init(void);
extern void mbox_init(void);
extern void early_uart_init(void);