2 files changed, 198 insertions, 0 deletions
diff --git a/hw/npu2.c b/hw/npu2.c
index ada6b38..0215570 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -455,18 +455,209 @@ static void npu2_append_phandle(struct dt_node *dn,
 	unlock(&pci_npu_phandle_lock);
 }
 
+static struct dt_node *npu2_create_memory_dn(uint64_t addr, uint64_t size)
+{
+	struct dt_node *mem;
+	char *name;
+	size_t namesz;
+	static u32 chip_id = 255;	/* next id to assign; static, so it persists across calls */
+
+	/* Look up the "memory@<addr>" node from dt_root, creating it if absent.
+	 * An existing node is returned as-is, with no properties added;
+	 * NULL is returned if the name buffer or the node can't be allocated. */
+	namesz = sizeof("memory@") + STR_MAX_CHARS(addr);	/* sizeof counts the NUL; STR_MAX_CHARS presumably bounds the digits - confirm */
+	name = malloc(namesz);
+	if (!name)
+		return NULL;
+	snprintf(name, namesz, "memory@%llx", (long long)addr);
+	mem = dt_find_by_name(dt_root, name);
+	free(name);	/* the name is only needed for the lookup */
+	if (mem)
+		return mem;
+
+	mem = dt_new_addr(dt_root, "memory", addr);
+	if (!mem)
+		return NULL;
+	dt_add_property_string(mem, "device_type", "memory");
+	dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
+	dt_add_property_u64s(mem, "reg", addr, size);
+	dt_add_property_cells(mem, "ibm,chip-id", chip_id);
+	dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);	/* second value is 0; presumably a zero-length usable range - TODO confirm */
+	dt_add_property_cells(mem, "ibm,associativity", 4, 0, 0, 0, chip_id--);	/* post-decrement: the next new node gets the next lower id */
+
+	assert(chip_id);	/* fires once the ids have counted down to 0 */
+	return mem;
+}
+
+static void npu2_dn_fixup_gmb(struct dt_node *pd_dn, uint64_t gmb)
+{
+	uint64_t sel, gpu_base, gpu_size, gta;
+	struct dt_node *mem_dn;
+	/* Decode BAR value gmb; add "memory-region" and "ibm,device-tgt-addr" to pd_dn. */
+	sel = GETFIELD(NPU2_MEM_BAR_SEL_MEM, gmb);
+	switch (sel) {
+	case 0:
+		/* BAR disabled: leave pd_dn untouched */
+		return;
+	case 4:
+		gpu_base = 0;
+		break;
+	case 5:
+		gpu_base = 1UL << 49;
+		break;
+	case 6:
+		gpu_base = 2UL << 49;
+		break;
+	default:	/* any other select value: log it and add no properties */
+		prlog(PR_ERR, "unhandled NPU2_MEM_BAR_SEL_MEM 0x%llx\n", sel);
+		return;
+	}
+
+	gpu_base |= GETFIELD(NPU2_MEM_BAR_GROUP, gmb) << 43;	/* remaining BAR fields are ORed in at fixed bit offsets */
+	gpu_base |= GETFIELD(NPU2_MEM_BAR_CHIP, gmb) << 41;
+	gpu_base |= GETFIELD(NPU2_MEM_BAR_ADDR, gmb) << 30;
+	gpu_size = GETFIELD(NPU2_MEM_BAR_BAR_SIZE, gmb) << 32;	/* size field value scaled by 2^32 */
+
+	mem_dn = npu2_create_memory_dn(gpu_base, gpu_size);
+	assert(mem_dn);	/* failing to get the memory node is treated as fatal */
+	dt_add_property_cells(pd_dn, "memory-region", mem_dn->phandle);
+
+	gta  = ((gpu_base >> 42) & 0x1) << 41;	/* bit 42 of the base -> bit 41 */
+	gta |= ((gpu_base >> 45) & 0x3) << 42;	/* bits 45-46 -> bits 42-43 */
+	gta |= ((gpu_base >> 49) & 0x3) << 45;	/* bits 49-50 -> bits 45-46 */
+	gta |= gpu_base & ((1UL << 43) - 1);	/* low 43 bits ORed in unchanged */
+
+	dt_add_property_u64s(pd_dn, "ibm,device-tgt-addr", gta);
+}
+
+/* Allocation state used by npu2_assign_gmb() to hand out GPU base addresses */
+static uint64_t npu2_group_addr[NPU2_LINKS_PER_CHIP] = {0};	/* base per group; 0 means not yet assigned */
+static uint64_t npu2_base_addr;	/* most recently assigned base; set in npu2_phb_final_fixup() */
+static int npu2_assign_gmb(struct phb *phb, struct pci_device *pd,
+			 void *data __unused)
+{
+	struct npu2 *p = phb_to_npu2(phb);
+	struct npu2_dev *ndev;
+	int group, peers, mode;
+	uint32_t bdfn;
+	uint64_t reg, gmb, old_val;
+
+	ndev = npu2_bdf_to_dev(p, pd->bdfn);
+	assert(ndev);
+
+	/* pci_walk_dev() callback that assigns GPU memory base addresses
+	 * and programs the memory BAR; always returns 0. The current
+	 * strategy is to work backwards from maximum memory, assigning
+	 * 128GB per GPU as that is the minimum alignment requirement.
+	 * Each group gets one base address, then the hashing mode is
+	 * configured based on the number of links. */
+
+	/* Need to work out the number of link peers. This amounts to
+	 * working out the maximum function number, so start at
+	 * the highest bdfn (fn = 6) and count back until we find an
+	 * npu2_dev. */
+	for (bdfn = (ndev->bdfn & ~0x7) | NPU2_LINKS_PER_CHIP;
+	     (bdfn & 0x7) != 0x7; bdfn = (bdfn & ~0x7) | ((bdfn & 0x7) - 1))
+		if (npu2_bdf_to_dev(p, bdfn))
+			break;
+
+	peers = bdfn & 0x7;	/* highest function number found, or 7 if the search found nothing */
+	group = (bdfn >> 3) & 0x1f;	/* bits 3-7 of the bdfn */
+
+	/* These should never happen (peers == 7 means no device was found)
+	 * but would lead to memory corruption if they did, so best to check. */
+	assert(peers != 0x7);
+	assert(group < NPU2_LINKS_PER_CHIP);
+
+	if (!npu2_group_addr[group]) {
+		/* First device seen in this group: take the next 128 below the last base */
+		npu2_base_addr -= 128;
+		npu2_group_addr[group] = npu2_base_addr;
+	}
+
+	gmb = SETFIELD(NPU2_MEM_BAR_SEL_MEM, 0ULL, 4);
+	gmb = SETFIELD(PPC_BITMASK(2,21), gmb, npu2_group_addr[group]);	/* NOTE(review): raw bit range, not a named field - confirm against the BAR layout */
+	gmb = SETFIELD(NPU2_MEM_BAR_POISON, gmb, 1);
+	gmb = SETFIELD(NPU2_MEM_BAR_GRANULE, gmb, 0);
+
+	/* We don't know how much memory the GPU has, but since we
+	 * have to align it to 128GB boundaries we may as well
+	 * just pass the whole aperture through at this
+	 * point. */
+	gmb = SETFIELD(NPU2_MEM_BAR_BAR_SIZE, gmb, 7);
+
+	switch (peers) {	/* base mode for this peer count; own function number is added below */
+	case 0:
+		mode = 0;
+		break;
+	case 1:
+		mode = 1;
+		break;
+	case 2:
+		mode = 3;
+		break;
+	case 3:
+		mode = 6;
+		break;
+	case 5:
+		mode = 10;
+		break;
+	default:
+		/* Hardware does not support this configuration (peers of 4 or 6 land here) */
+		assert(0);
+	}
+
+	mode += ndev->bdfn & 0x7;	/* offset the base mode by this device's own function number */
+	gmb = SETFIELD(NPU2_MEM_BAR_MODE, gmb, mode);
+	if (NPU2DEV_BRICK(ndev))
+		gmb >>= 32;	/* nonzero brick: move the value into the low 32 bits of the register */
+	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+			      NPU2_BLOCK_SM_0,
+			      NPU2_GPU0_MEM_BAR);
+
+	old_val = npu2_read(p, reg);
+	gmb |= old_val;	/* keeps the other half of the register; note bits are only ever set, never cleared */
+
+	npu2_write(p, reg, gmb);	/* the value merged with SM_0's contents is written to SM_0..SM_3 alike */
+	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+			      NPU2_BLOCK_SM_1,
+			      NPU2_GPU0_MEM_BAR);
+	npu2_write(p, reg, gmb);
+	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+			      NPU2_BLOCK_SM_2,
+			      NPU2_GPU0_MEM_BAR);
+	npu2_write(p, reg, gmb);
+	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(ndev),
+			      NPU2_BLOCK_SM_3,
+			      NPU2_GPU0_MEM_BAR);
+	npu2_write(p, reg, gmb);
+
+	return 0;
+}
+
 static int npu2_dn_fixup(struct phb *phb,
 			 struct pci_device *pd,
 			 void *data __unused)
 {
 	struct npu2 *p = phb_to_npu2(phb);
 	struct npu2_dev *dev;
+	uint64_t reg, gmb;
 
 	dev = npu2_bdf_to_dev(p, pd->bdfn);
 	assert(dev);
 	if (dev->phb || dev->pd)
 		return 0;
 
+	reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev),
+			      NPU2_BLOCK_SM_0,
+			      NPU2_GPU0_MEM_BAR);
+	gmb = npu2_read(p, reg);
+	if (NPU2DEV_BRICK(dev))
+		gmb <<= 32;
+	else
+		gmb &= 0xffffffff00000000;
+
+	npu2_dn_fixup_gmb(pd->dn, gmb);
 	dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dt_node->phandle);
 
 	/* NPU devices require a slot location to associate with GPUs */
@@ -503,6 +694,11 @@ static int npu2_dn_fixup(struct phb *phb,
 
 static void npu2_phb_final_fixup(struct phb *phb)
 {
+	struct npu2 *p = phb_to_npu2(phb);
+
+	/* Seed the allocator so GPU memory is handed out from 4TB down. */
+	npu2_base_addr = (p->chip_id << 21) | 4*1024;	/* 4*1024 is presumably in GB units (confirm); chip id ORed in from bit 21 */
+	pci_walk_dev(phb, NULL, npu2_assign_gmb, NULL);	/* program the memory BARs before npu2_dn_fixup reads them back */
 	pci_walk_dev(phb, NULL, npu2_dn_fixup, NULL);
 }
 
diff --git a/include/npu2.h b/include/npu2.h
index ec62ad2..d535e65 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -29,6 +29,8 @@
 #define NPU2_MAX_PE_NUM		16
 #define NPU2_RESERVED_PE_NUM	15
 
+#define NPU2_LINKS_PER_CHIP 6	/* sizes npu2_group_addr[] and bounds the group number in hw/npu2.c */
+
 /* Return the stack (0-2) of a device */
 #define NPU2DEV_STACK(ndev) ((ndev)->index / 2)