aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOliver O'Halloran <oohall@gmail.com>2018-10-30 11:02:30 +1100
committerStewart Smith <stewart@linux.ibm.com>2018-11-02 18:17:44 +1100
commit125cecaa0f236cc01cf527b432f74e4de69f3d12 (patch)
treea375502806d9ecbf090ece52322e850d3a40afd0
parent9555cf21ba5bd90d2213020f8354dd7bd33cd4c3 (diff)
downloadskiboot-125cecaa0f236cc01cf527b432f74e4de69f3d12.zip
skiboot-125cecaa0f236cc01cf527b432f74e4de69f3d12.tar.gz
skiboot-125cecaa0f236cc01cf527b432f74e4de69f3d12.tar.bz2
phb4: Check for RX errors after link training
[ Upstream commit 9597a12ef4b3644e4b8644f659bec04ca139b7f9 ] Some PHB4 PHYs can get stuck in a bad state where they are constantly retraining the link. This happens transparently to skiboot and Linux but will cause PCIe to be slow. Resetting the PHB4 clears the problem. We can detect this case by looking at the RX errors count where we check for link stability. This patch does this by modifying the link optimal code to check for RX errors. If errors are occurring we retrain the link irrespective of the chip rev or card. Normally when this problem occurs, the RX error count is maxed out at 255. When there is no problem, the count is 0. We chose 8 as the max rx errors value to give us some margin for a few errors. There is also a knob that can be used to set the error threshold for when we should retrain the link, e.g.: nvram -p ibm,skiboot --update-config phb-rx-err-max=8 Signed-off-by: Oliver O'Halloran <oohall@gmail.com> Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
-rw-r--r--hw/phb4.c29
-rw-r--r--include/phb4-regs.h2
-rw-r--r--include/phb4.h2
3 files changed, 30 insertions, 3 deletions
diff --git a/hw/phb4.c b/hw/phb4.c
index 55d74dc..cd58acc 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -152,6 +152,7 @@ static bool verbose_eeh;
static bool pci_tracing;
static bool pci_eeh_mmio;
static bool pci_retry_all;
+static int rx_err_max = PHB4_RX_ERR_MAX;
/* Note: The "ASB" name is historical, practically this means access via
* the XSCOM backdoor
@@ -2535,11 +2536,12 @@ static void phb4_lane_eq_change(struct phb4 *p, uint32_t vdid)
static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
{
struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
uint32_t id;
- uint16_t bdfn;
- uint8_t trained_speed, phb_speed, dev_speed, target_speed;
+ uint16_t bdfn, lane_errs;
+ uint8_t trained_speed, phb_speed, dev_speed, target_speed, rx_errs;
uint8_t trained_width, phb_width, dev_width, target_width;
- bool optimal_speed, optimal_width, optimal, retry_enabled;
+ bool optimal_speed, optimal_width, optimal, retry_enabled, rx_err_ok;
/* Current trained state */
@@ -2565,6 +2567,11 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
retry_enabled = (phb4_chip_retry_workaround() &&
phb4_adapter_in_whitelist(id)) ||
phb4_lane_eq_retry_whitelist(id);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_COUNTERS);
+ rx_errs = GETFIELD(PHB_PCIE_DLP_RX_ERR_CNT, reg);
+ rx_err_ok = (rx_errs < rx_err_max);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_STATUS);
+ lane_errs = GETFIELD(PHB_PCIE_DLP_LANE_ERR, reg);
PHBDBG(p, "LINK: Card [%04x:%04x] %s Retry:%s\n", VENDOR(id),
DEVICE(id), optimal ? "Optimal" : "Degraded",
@@ -2573,10 +2580,16 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
trained_speed, phb_speed, dev_speed, optimal_speed ? "" : " *");
PHBDBG(p, "LINK: Width Train:x%02i PHB:x%02i DEV:x%02i%s\n",
trained_width, phb_width, dev_width, optimal_width ? "" : " *");
+ PHBDBG(p, "LINK: RX Errors Now:%i Max:%i Lane:0x%04x%s\n",
+ rx_errs, rx_err_max, lane_errs, rx_err_ok ? "" : " *");
if (vdid)
*vdid = id;
+ /* Always do RX error retry irrespective of chip and card */
+ if (!rx_err_ok)
+ return false;
+
if (!retry_enabled)
return true;
@@ -5580,6 +5593,7 @@ static void phb4_probe_pbcq(struct dt_node *pbcq)
void probe_phb4(void)
{
struct dt_node *np;
+ const char *s;
verbose_eeh = nvram_query_eq("pci-eeh-verbose", "true");
/* REMOVEME: force this for now until we stabalise PCIe */
@@ -5590,6 +5604,15 @@ void probe_phb4(void)
pci_tracing = nvram_query_eq("pci-tracing", "true");
pci_eeh_mmio = !nvram_query_eq("pci-eeh-mmio", "disabled");
pci_retry_all = nvram_query_eq("pci-retry-all", "true");
+ s = nvram_query("phb-rx-err-max");
+ if (s) {
+ rx_err_max = atoi(s);
+
+ /* Clip to uint8_t used by hardware */
+ rx_err_max = MAX(rx_err_max, 0);
+ rx_err_max = MIN(rx_err_max, 255);
+ }
+ prlog(PR_DEBUG, "PHB4: Maximum RX errors during training: %d\n", rx_err_max);
/* Look for PBCQ XSCOM nodes */
dt_for_each_compatible(dt_root, np, "ibm,power9-pbcq")
diff --git a/include/phb4-regs.h b/include/phb4-regs.h
index ef3cfa9..28f22f9 100644
--- a/include/phb4-regs.h
+++ b/include/phb4-regs.h
@@ -312,7 +312,9 @@
#define PHB_PCIE_DLP_ERRLOG1 0x1AA0
#define PHB_PCIE_DLP_ERRLOG2 0x1AA8
#define PHB_PCIE_DLP_ERR_STATUS 0x1AB0
+#define PHB_PCIE_DLP_LANE_ERR PPC_BITMASK(0,15)
#define PHB_PCIE_DLP_ERR_COUNTERS 0x1AB8
+#define PHB_PCIE_DLP_RX_ERR_CNT PPC_BITMASK(16,23)
#define PHB_PCIE_LANE_EQ_CNTL0 0x1AD0
#define PHB_PCIE_LANE_EQ_CNTL1 0x1AD8
diff --git a/include/phb4.h b/include/phb4.h
index 4ab2912..70bf6de 100644
--- a/include/phb4.h
+++ b/include/phb4.h
@@ -159,6 +159,8 @@ struct phb4_err {
#define PHB4_LINK_ELECTRICAL_RETRIES 100
#define PHB4_LINK_WAIT_RETRIES 200
+#define PHB4_RX_ERR_MAX 8
+
/* PHB4 flags */
#define PHB4_AIB_FENCED 0x00000001
#define PHB4_CFG_USE_ASB 0x00000002