aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrederic Barrat <fbarrat@linux.ibm.com>2019-03-12 21:35:10 +0100
committerStewart Smith <stewart@linux.ibm.com>2019-03-13 21:55:18 -0500
commite621b7b6795163d9f429d28fb120736fa4fad042 (patch)
treea5a65885c60908c58ae57465f1ccae151c3f6fb5
parent536c8fbf932d6a790f95cb8cf39dacf4a2de06cb (diff)
downloadskiboot-e621b7b6795163d9f429d28fb120736fa4fad042.zip
skiboot-e621b7b6795163d9f429d28fb120736fa4fad042.tar.gz
skiboot-e621b7b6795163d9f429d28fb120736fa4fad042.tar.bz2
npu2-opencapi: Setup perf counters to detect CRC errors
It's possible to set up performance counters for the DLL to detect
various conditions for the links in nvlink or opencapi mode. Since
those counters are currently unused, let's configure them when an obus
is in opencapi mode to detect CRC errors on the link. Each link has two
counters:
- CRC error detected by the host
- CRC error detected by the DLx (NAK received by the host)

We also dump the counters shortly after the link trains, but they can
be read multiple times through cronus, pdbg or linux. The counters are
configured to be reset after each read.

Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Reviewed-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
-rw-r--r--hw/npu2-opencapi.c62
-rw-r--r--include/npu2-regs.h17
2 files changed, 79 insertions, 0 deletions
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
index 6ad561c..6d642cd 100644
--- a/hw/npu2-opencapi.c
+++ b/hw/npu2-opencapi.c
@@ -909,6 +909,66 @@ static void reset_odl(uint32_t gcid, struct npu2_dev *dev)
xscom_write(gcid, config_xscom, reg);
}
+/*
+ * Program the OBUS DLL performance monitor so that the two counters
+ * belonging to this device's link count CRC errors.
+ *
+ * The config and select registers are read, modified and written back,
+ * so the fields of the other link on the same register are preserved.
+ * If a register cannot be read, nothing is written for it (and the
+ * remaining setup is skipped) rather than writing back a value derived
+ * from an uninitialised variable.
+ */
+static void setup_perf_counters(struct npu2_dev *dev)
+{
+	uint64_t addr, reg, link;
+	int64_t rc;
+
+	/*
+	 * setup the DLL perf counters to check CRC errors detected by
+	 * the NPU or the adapter.
+	 *
+	 * Counter 0: link 0/ODL0, CRC error detected by ODL
+	 * Counter 1: link 0/ODL0, CRC error detected by DLx
+	 * Counter 2: link 1/ODL1, CRC error detected by ODL
+	 * Counter 3: link 1/ODL1, CRC error detected by DLx
+	 */
+	if ((dev->brick_index == 2) || (dev->brick_index == 5))
+		link = 0;
+	else
+		link = 1;
+
+	addr = OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index);
+	rc = xscom_read(dev->npu->chip_id, addr, &reg);
+	if (rc) {
+		OCAPIERR(dev, "perf counter config read failed, addr %llx rc %lld\n",
+			 addr, (long long)rc);
+		return;
+	}
+
+	/* Each counter has a 2-bit enable field; fields are 2 bits apart */
+	if (link == 0) {
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 2, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+	} else {
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 4, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 6, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+	}
+	reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_SIZE, reg,
+		       OB_DLL_PERF_MONITOR_CONFIG_SIZE16);
+	/* Write back to the address that was read, not a recomputed one */
+	xscom_write(dev->npu->chip_id, addr, reg);
+	OCAPIDBG(dev, "perf counter config %llx = %llx\n", addr, reg);
+
+	addr = OB_DLL_PERF_MONITOR_SELECT(dev->brick_index);
+	rc = xscom_read(dev->npu->chip_id, addr, &reg);
+	if (rc) {
+		OCAPIERR(dev, "perf counter select read failed, addr %llx rc %lld\n",
+			 addr, (long long)rc);
+		return;
+	}
+
+	/*
+	 * Each counter has an 8-bit event select field; a link owns two
+	 * consecutive fields (16 bits): first the ODL event, then the DLx
+	 * event.
+	 */
+	reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> (link * 16),
+		       reg, OB_DLL_PERF_MONITOR_SELECT_CRC_ODL);
+	reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> ((link * 16) + 8),
+		       reg, OB_DLL_PERF_MONITOR_SELECT_CRC_DLX);
+	xscom_write(dev->npu->chip_id, addr, reg);
+	OCAPIDBG(dev, "perf counter select %llx = %llx\n", addr, reg);
+}
+
+/*
+ * Read the DLL perf counter register of the OBUS hosting this device
+ * and log an error if any CRC error was counted.
+ *
+ * The register value is split into two 32-bit halves, reported as
+ * link0 (PPC bits 0-31) and link1 (PPC bits 32-63).
+ * NOTE(review): with the counter layout programmed by
+ * setup_perf_counters(), each half presumably holds two 16-bit
+ * counters (ODL-detected, then DLx-detected) - confirm against the
+ * register specification.
+ *
+ * If the register cannot be read, nothing is reported: the value would
+ * otherwise come from an uninitialised variable.
+ */
+static void check_perf_counters(struct npu2_dev *dev)
+{
+	uint64_t addr, reg, link0, link1;
+	int64_t rc;
+
+	addr = OB_DLL_PERF_COUNTER0(dev->brick_index);
+	rc = xscom_read(dev->npu->chip_id, addr, &reg);
+	if (rc) {
+		OCAPIERR(dev, "perf counter read failed, addr %llx rc %lld\n",
+			 addr, (long long)rc);
+		return;
+	}
+
+	link0 = GETFIELD(PPC_BITMASK(0, 31), reg);
+	link1 = GETFIELD(PPC_BITMASK(32, 63), reg);
+	if (link0 || link1)
+		OCAPIERR(dev, "CRC error count link0=%08llx link1=%08llx\n",
+			 link0, link1);
+}
+
static void set_init_pattern(uint32_t gcid, struct npu2_dev *dev)
{
uint64_t reg, config_xscom;
@@ -1048,6 +1108,7 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
case OCAPI_SLOT_LINK_TRAINED:
otl_enabletx(chip_id, dev->npu->xscom_base, dev);
pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ check_perf_counters(dev);
dev->phb_ocapi.scan_map = 1;
return OPAL_SUCCESS;
@@ -1569,6 +1630,7 @@ static void setup_device(struct npu2_dev *dev)
setup_afu_mmio_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
/* Procedure 13.1.3.9 - AFU Config BARs */
setup_afu_config_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
+ setup_perf_counters(dev);
set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, dev->brick_index, 0b00);
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 5190aeb..ca31109 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -725,6 +725,23 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
#define PU_IOE_PB_FP_CFG_FP1_FMR_DISABLE PPC_BIT(52)
#define PU_IOE_PB_FP_CFG_FP1_PRS_DISABLE PPC_BIT(57)
+/*
+ * OBUS DLL performance monitor registers.
+ *
+ * The register address depends only on which pair of bricks is used:
+ * bricks 2 and 3 share one set of registers, bricks 4 and 5 use the set
+ * located 0x3000000 higher. The brick_index argument is parenthesised
+ * so that any expression can be passed safely.
+ */
+
+/* Config register: one 2-bit enable field per counter, plus a size field */
+#define OB_DLL_PERF_MONITOR_CONFIG(brick_index)				\
+	(0x901081C + (((brick_index) - 2) >> 1) * 0x3000000)
+/* Enable field of counter 0; shift right by 2 for each following counter */
+#define OB_DLL_PERF_MONITOR_CONFIG_ENABLE	PPC_BITMASK(0, 1)
+/* Enable field values used for the counters of link 0 and link 1 */
+#define OB_DLL_PERF_MONITOR_CONFIG_LINK0	0b10
+#define OB_DLL_PERF_MONITOR_CONFIG_LINK1	0b01
+#define OB_DLL_PERF_MONITOR_CONFIG_SIZE		PPC_BITMASK(16, 23)
+/* NOTE(review): presumably selects 16-bit wide counters - confirm */
+#define OB_DLL_PERF_MONITOR_CONFIG_SIZE16	0xFF
+
+/* Select register: one 8-bit event select field per counter */
+#define OB_DLL_PERF_MONITOR_SELECT(brick_index)				\
+	(0x901081D + (((brick_index) - 2) >> 1) * 0x3000000)
+/* Select field of counter 0; shift right by 8 for each following counter */
+#define OB_DLL_PERF_MONITOR_SELECT_COUNTER	PPC_BITMASK(0, 7)
+/* Event codes: CRC error detected by the ODL / by the DLx */
+#define OB_DLL_PERF_MONITOR_SELECT_CRC_ODL	0x44
+#define OB_DLL_PERF_MONITOR_SELECT_CRC_DLX	0x45
+
+/* Counter value register */
+#define OB_DLL_PERF_COUNTER0(brick_index)				\
+	(0x901081E + (((brick_index) - 2) >> 1) * 0x3000000)
+#define OB_DLL_PERF_COUNTER0_VAL		PPC_BITMASK(0, 31)
+
+
#define OB_ODL_OFFSET(brick_index) \
((((brick_index - 2) >> 1) * 0x3000000) + ((brick_index == 3 || brick_index == 4) ? 1 : 0))