// SPDX-License-Identifier: Apache-2.0 /* * Copyright 2019 IBM Corp. */ #include #include #include #include #include #include #include #define NPU3LOG(l, npu, fmt, a...) \ prlog(l, "NPU[%d:%d]: " fmt, (npu)->chip_id, (npu)->index, ##a) #define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a) #define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a) #define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a) #define NPU3DEVLOG(l, dev, fmt, a...) \ prlog(l, "NPU[%d:%d:%d]: " fmt, \ (dev)->npu->chip_id, \ (dev)->npu->index, \ (dev)->index, ##a) #define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a) #define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a) #define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a) static void npu3_dt_create_link(struct dt_node *npu, uint32_t npu_index, uint32_t dev_index) { struct dt_node *link; uint32_t phy_lane_mask, ob_chiplet; link = dt_new_addr(npu, "link", dev_index); dt_add_property_string(link, "compatible", "ibm,npu-link"); dt_add_property_cells(link, "reg", dev_index); dt_add_property_cells(link, "ibm,npu-link-index", dev_index); switch (npu_index) { case 0: /* fall through */ case 2: ob_chiplet = npu_index ? 3 : 0; switch (dev_index) { case 0: phy_lane_mask = PPC_BITMASK32(0, 3); break; case 1: phy_lane_mask = PPC_BITMASK32(13, 16); break; case 2: phy_lane_mask = PPC_BITMASK32(7, 10); break; case 3: phy_lane_mask = PPC_BITMASK32(20, 23); break; } break; case 1: switch (dev_index) { case 0: ob_chiplet = 1; phy_lane_mask = PPC_BITMASK32(0, 3); break; case 1: ob_chiplet = 2; phy_lane_mask = PPC_BITMASK32(0, 3); break; case 2: ob_chiplet = 1; phy_lane_mask = PPC_BITMASK32(7, 10); break; case 3: ob_chiplet = 2; phy_lane_mask = PPC_BITMASK32(7, 10); break; } break; default: return; } dt_add_property_cells(link, "ibm,npu-phy", ob_chiplet); dt_add_property_cells(link, "ibm,npu-lane-mask", phy_lane_mask); } static void npu3_dt_create_npu(struct dt_node *xscom, uint32_t npu_index) { const uint32_t npu_base[] = { 0x5011000, 0x5011400, 0x3011c00 }; struct dt_node *npu; npu = dt_new_addr(xscom, "npu", npu_base[npu_index]); dt_add_property_cells(npu, "#size-cells", 0); dt_add_property_cells(npu, "#address-cells", 1); dt_add_property_cells(npu, "reg", npu_base[npu_index], 0x2c); dt_add_property_string(npu, "compatible", "ibm,power9-npu3"); dt_add_property_cells(npu, "ibm,npu-index", npu_index); dt_add_property_cells(npu, "ibm,phb-index", 7 + npu_index); for (uint32_t i = 0; i < NPU3_LINKS_PER_NPU; i++) npu3_dt_create_link(npu, npu_index, i); } /* This can be removed when/if we decide to use HDAT instead */ static bool npu3_dt_create(void) { struct proc_chip *chip = next_chip(NULL); struct dt_node *xscom; /* npu3 chips only */ if (proc_gen < proc_gen_p9 || chip->type == PROC_CHIP_P9_NIMBUS || chip->type == PROC_CHIP_P9_CUMULUS) return false; dt_for_each_compatible(dt_root, xscom, "ibm,xscom") for (uint32_t i = 0; i < 3; i++) npu3_dt_create_npu(xscom, i); return true; } static struct npu3 *npu3_create(struct dt_node *dn) { struct npu3 *npu; struct dt_node *link; struct npu3_dev *dev; char *path; uint32_t i; npu = zalloc(sizeof(*npu)); assert(npu); init_lock(&npu->lock); npu->dt_node = dn; npu->index = dt_prop_get_u32(dn, "ibm,npu-index"); npu->xscom_base = dt_get_address(dn, 0, NULL); npu->chip_id = dt_get_chip_id(dn); assert(get_chip(npu->chip_id)); dt_for_each_compatible(dn, link, "ibm,npu-link") { i = dt_prop_get_u32(link, "ibm,npu-link-index"); assert(i < NPU3_LINKS_PER_NPU); dev = &npu->devices[i]; dev->index = i; dev->npu = npu; dev->dn = link; dev->ob_chiplet = dt_prop_get_u32(link, "ibm,npu-phy"); dev->phy_lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask"); dev->proc.status = NPU3_PROC_COMPLETE; }; path = dt_get_path(dn); NPU3INF(npu, "Found %s\n", path); NPU3INF(npu, "SCOM base: 0x%llx\n", npu->xscom_base); free(path); return npu; } struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev, enum npu3_dev_type type) { uint32_t i = 0; if (dev) i = dev->index + 1; for (; i < NPU3_LINKS_PER_NPU; i++) { dev = &npu->devices[i]; if (dev->type == type || type == NPU3_DEV_TYPE_ANY) return dev; } return NULL; } static void npu3_device_detect_fixup(struct npu3_dev *dev) { struct dt_node *dn = dev->dn; if (dev->type == NPU3_DEV_TYPE_NVLINK) { dt_add_property_strings(dn, "ibm,npu-link-type", "nvlink"); dev->link_speed = dt_prop_get_u32_def( dn, "nvidia,link-speed", 0xff); return; } NPU3DEVDBG(dev, "Link type unknown\n"); dt_add_property_strings(dn, "ibm,npu-link-type", "unknown"); } /* * We use the indirect method because it uses the same addresses as * the MMIO offsets (NPU RING) */ static void npu3_scom_sel(struct npu3 *npu, uint64_t reg, uint64_t size) { uint64_t val; val = SETFIELD(NPU3_MISC_DA_ADDR, 0ull, reg); val = SETFIELD(NPU3_MISC_DA_LEN, val, size); xscom_write(npu->chip_id, npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_ADDR, val); } static void npu3_scom_write(struct npu3 *npu, uint64_t reg, uint64_t size, uint64_t val) { npu3_scom_sel(npu, reg, size); xscom_write(npu->chip_id, npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA, val); } static uint64_t npu3_scom_read(struct npu3 *npu, uint64_t reg, uint64_t size) { uint64_t val; npu3_scom_sel(npu, reg, size); xscom_read(npu->chip_id, npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA, &val); return val; } void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val) { void *mmio = (void *)npu->regs[0]; if (mmio) out_be64(mmio + reg, val); else npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_8B, val); /* CQ_SM writes should be mirrored in all four blocks */ if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0)) return; for (uint32_t i = 1; i < 4; i++) npu3_write(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg), val); } uint64_t npu3_read(struct npu3 *npu, uint64_t reg) { void *mmio = (void *)npu->regs[0]; if (mmio) return in_be64(mmio + reg); return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_8B); } void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val) { void *mmio = (void *)npu->regs[0]; if (mmio) out_be32(mmio + reg, val); else npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_4B, (uint64_t)val << 32); if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0)) return; for (uint32_t i = 1; i < 4; i++) npu3_write_4b(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg), val); } uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg) { void *mmio = (void *)npu->regs[0]; if (mmio) return in_be32(mmio + reg); return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_4B) >> 32; } static void npu3_misc_config(struct npu3 *npu) { struct npu3_dev *dev; uint32_t typemap = 0; uint64_t reg, val; npu3_for_each_nvlink_dev(dev, npu) typemap |= 0x10 >> dev->index; reg = NPU3_MCP_MISC_CFG0; val = npu3_read(npu, reg); val |= NPU3_MCP_MISC_CFG0_ENABLE_PBUS; val &= ~NPU3_MCP_MISC_CFG0_ENABLE_SNARF_CPM; val = SETFIELD(NPU3_MCP_MISC_CFG0_NVLINK_MODE, val, typemap); val = SETFIELD(NPU3_MCP_MISC_CFG0_OCAPI_MODE, val, ~typemap); npu3_write(npu, reg, val); reg = NPU3_SNP_MISC_CFG0; val = npu3_read(npu, reg); val |= NPU3_SNP_MISC_CFG0_ENABLE_PBUS; val = SETFIELD(NPU3_SNP_MISC_CFG0_NVLINK_MODE, val, typemap); val = SETFIELD(NPU3_SNP_MISC_CFG0_OCAPI_MODE, val, ~typemap); npu3_write(npu, reg, val); reg = NPU3_CTL_MISC_CFG2; val = npu3_read(npu, reg); val = SETFIELD(NPU3_CTL_MISC_CFG2_NVLINK_MODE, val, typemap); val = SETFIELD(NPU3_CTL_MISC_CFG2_OCAPI_MODE, val, ~typemap); npu3_write(npu, reg, val); reg = NPU3_DAT_MISC_CFG1; val = npu3_read(npu, reg); val = SETFIELD(NPU3_DAT_MISC_CFG1_NVLINK_MODE, val, typemap); val = SETFIELD(NPU3_DAT_MISC_CFG1_OCAPI_MODE, val, ~typemap); npu3_write(npu, reg, val); } static void npu3_assign_bars(struct npu3 *npu) { struct npu3_dev *dev; uint64_t addr, size, val; /* Global MMIO bar (per npu) */ phys_map_get(npu->chip_id, NPU_REGS, npu->index, &addr, &size); val = SETFIELD(NPU3_MMIO_BAR_ADDR, 0ull, addr >> 24); val |= NPU3_MMIO_BAR_ENABLE; npu3_write(npu, NPU3_MMIO_BAR, val); NPU3INF(npu, "MMIO base: 0x%016llx (%lldMB)\n", addr, size >> 20); npu->regs[0] = addr; npu->regs[1] = size; /* NTL bar (per device) */ npu3_for_each_dev(dev, npu) { phys_map_get(npu->chip_id, NPU_NTL, npu3_chip_dev_index(dev), &addr, &size); val = SETFIELD(NPU3_NTL_BAR_ADDR, 0ull, addr >> 16); val = SETFIELD(NPU3_NTL_BAR_SIZE, val, ilog2(size >> 16)); npu3_write(npu, NPU3_NTL_BAR(dev->index), val); dev->ntl_bar.addr = addr; dev->ntl_bar.size = size; } /* GENID bar (logically divided per device) */ phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, NULL); val = SETFIELD(NPU3_GENID_BAR_ADDR, 0ull, addr >> 19); npu3_write(npu, NPU3_GENID_BAR, val); npu3_for_each_dev(dev, npu) { dev->genid_bar.addr = addr + (dev->index << 16); dev->genid_bar.size = 64 << 10; } } void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable) { struct npu3 *npu = dev->npu; uint64_t reg, val; if (dev->ntl_bar.enable == enable) /* No state change */ return; dev->ntl_bar.enable = enable; dev->genid_bar.enable = enable; reg = NPU3_NTL_BAR(dev->index); val = npu3_read(npu, reg); val = SETFIELD(NPU3_NTL_BAR_ENABLE, val, enable); npu3_write(npu, reg, val); /* * Generation IDs are a single space in the hardware but we split them * per device. Only disable in hardware if every device has disabled. */ if (!enable) npu3_for_each_dev(dev, npu) if (dev->genid_bar.enable) return; reg = NPU3_GENID_BAR; val = npu3_read(npu, reg); val = SETFIELD(NPU3_GENID_BAR_ENABLE, val, enable); npu3_write(npu, reg, val); } static uint64_t npu3_ipi_attributes(struct irq_source *is, uint32_t isn) { struct npu3 *npu = is->data; uint32_t level = isn - npu->irq_base; /* TCE interrupt is used to detect a frozen PE */ if (level == 18) return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_MSI; return IRQ_ATTR_TARGET_LINUX; } static void npu3_ipi_interrupt(struct irq_source *is, uint32_t isn) { struct npu3 *npu = is->data; uint32_t level = isn - npu->irq_base; if (level != 18) { NPU3ERR(npu, "Received unknown interrupt %d\n", level); return; } opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR); } #define NPU3_IRQ_LEVELS 60 static char *npu3_ipi_name(struct irq_source *is, uint32_t isn) { struct npu3 *npu = is->data; uint32_t level = isn - npu->irq_base; static const char *names[NPU3_IRQ_LEVELS] = { [0] = "NDL 0 Stall Event (brick 0)", [1] = "NDL 0 No-Stall Event (brick 0)", [2] = "NDL 1 Stall Event (brick 1)", [3] = "NDL 1 No-Stall Event (brick 1)", [4] = "NDL 2 Stall Event (brick 2)", [5] = "NDL 2 No-Stall Event (brick 2)", [6] = "NDL 3 Stall Event (brick 3)", [7] = "NDL 3 No-Stall Event (brick 3)", [8] = "NDL 4 Stall Event (brick 4)", [9] = "NDL 4 No-Stall Event (brick 4)", [10] = "NDL 5 Stall Event (brick 5)", [11] = "NDL 5 No-Stall Event (brick 5)", [12] = "NTL 0 Event", [13] = "NTL 1 Event", [14] = "NTL 2 Event", [15] = "NTL 3 Event", [16] = "NTL 4 Event", [17] = "NTL 5 Event", [18] = "TCE Event", [19] = "ATS Event", [20] = "CQ Event", [21] = "MISC Event", [41] = "Memory Controller Event", [42] = "NDL 6 Stall Event (brick 6)", [43] = "NDL 6 No-Stall Event (brick 6)", [44] = "NDL 7 Stall Event (brick 7)", [45] = "NDL 7 No-Stall Event (brick 7)", [46] = "NDL 8 Stall Event (brick 8)", [47] = "NDL 8 No-Stall Event (brick 8)", [48] = "NDL 9 Stall Event (brick 9)", [49] = "NDL 9 No-Stall Event (brick 9)", [50] = "NDL 10 Stall Event (brick 10)", [51] = "NDL 10 No-Stall Event (brick 10)", [52] = "NDL 11 Stall Event (brick 11)", [53] = "NDL 11 No-Stall Event (brick 11)", [54] = "NTL 6 Event", [55] = "NTL 7 Event", [56] = "NTL 8 Event", [57] = "NTL 9 Event", [58] = "NTL 10 Event", [59] = "NTL 11 Event", }; if (level >= NPU3_IRQ_LEVELS || !names[level]) return strdup("Unknown"); return strdup(names[level]); } static const struct irq_source_ops npu3_ipi_ops = { .attributes = npu3_ipi_attributes, .interrupt = npu3_ipi_interrupt, .name = npu3_ipi_name, }; static void npu3_setup_irqs(struct npu3 *npu) { uint64_t reg, val; uint32_t base; base = xive_alloc_ipi_irqs(npu->chip_id, NPU3_IRQ_LEVELS, 64); if (base == XIVE_IRQ_ERROR) { NPU3ERR(npu, "Failed to allocate interrupt sources\n"); return; } xive_register_ipi_source(base, NPU3_IRQ_LEVELS, npu, &npu3_ipi_ops); /* Set IPI configuration */ reg = NPU3_MISC_CFG; val = npu3_read(npu, reg); val = SETFIELD(NPU3_MISC_CFG_IPI_PS, val, NPU3_MISC_CFG_IPI_PS_64K); val = SETFIELD(NPU3_MISC_CFG_IPI_OS, val, NPU3_MISC_CFG_IPI_OS_AIX); npu3_write(npu, reg, val); /* Set IRQ base */ reg = NPU3_MISC_INT_BAR; val = SETFIELD(NPU3_MISC_INT_BAR_ADDR, 0ull, (uint64_t)xive_get_trigger_port(base) >> 12); npu3_write(npu, reg, val); npu->irq_base = base; } static void npu3_init(struct npu3 *npu) { struct npu3_dev *dev; platform.npu3_device_detect(npu); npu3_for_each_dev(dev, npu) npu3_device_detect_fixup(dev); npu3_misc_config(npu); npu3_assign_bars(npu); npu3_setup_irqs(npu); npu3_init_nvlink(npu); } void probe_npu3(void) { struct dt_node *dn; struct npu3 *npu; if (!npu3_dt_create()) return; if (!platform.npu3_device_detect) { prlog(PR_INFO, "NPU: Platform does not support NPU\n"); return; } dt_for_each_compatible(dt_root, dn, "ibm,power9-npu3") { npu = npu3_create(dn); npu3_init(npu); } }