aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Reza Arbab <arbab@linux.ibm.com> 2019-11-14 10:13:04 -0600
committer: Vasant Hegde <hegdevasant@linux.vnet.ibm.com> 2019-12-10 11:28:57 +0530
commit: 0ecfbdd01ac0f55ac304ca946f91f7f5297b30ff (patch)
tree: 4d216006274118fbbe80c83a66f643b20ab43d81
parent: a2dabb70693b0db556f112b0664bb6f4a28a79aa (diff)
download: skiboot-0ecfbdd01ac0f55ac304ca946f91f7f5297b30ff.zip
          skiboot-0ecfbdd01ac0f55ac304ca946f91f7f5297b30ff.tar.gz
          skiboot-0ecfbdd01ac0f55ac304ca946f91f7f5297b30ff.tar.bz2
npu2/hw-procedures: Remove assertion from check_credits()
[ Upstream commit 24664b48642845d620e225111bf6184f3c102f60 ]

The RX clock mux in the NVLink PHY can glitch, which will manifest in hard-to-diagnose behavior--at best, a checkstop during the first link traffic. The only reliable way we found to detect this was by checking for a discrepancy in the credits we expect to receive during link training.

Since the time the check was added, we've found that:

* Commit ac6f1599ff33 ("npu2: hw-procedures: Add phy_rx_clock_sel()") does work around the original glitch.
* Asserting is too harsh. Before the root cause was established, it was thought this could have been a manufacturing defect, and we wanted to loudly fail hardware acceptance boot cycle tests.
* It seems there is a valid situation in which credits are off from the expected value. During GPU hot reset, a CPU prefetch across the link can affect the credit count before we check.

Given all of the above, remove the assert().

Cc: stable # 6.0.x
Signed-off-by: Reza Arbab <arbab@linux.ibm.com>
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
-rw-r--r-- hw/npu2-hw-procedures.c 15
1 file changed, 6 insertions, 9 deletions
diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
index 8686462..2a03bed 100644
--- a/hw/npu2-hw-procedures.c
+++ b/hw/npu2-hw-procedures.c
@@ -756,17 +756,14 @@ static uint32_t check_credit(struct npu2_dev *ndev, uint64_t reg,
static uint32_t check_credits(struct npu2_dev *ndev)
{
- int fail = 0;
uint64_t val;
- fail += CHECK_CREDIT(ndev, NPU2_NTL_CRED_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
- fail += CHECK_CREDIT(ndev, NPU2_NTL_RSP_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
- fail += CHECK_CREDIT(ndev, NPU2_NTL_CRED_DATA_CREDIT_RX, 0x1001000000000000ULL);
- fail += CHECK_CREDIT(ndev, NPU2_NTL_RSP_DATA_CREDIT_RX, 0x1001000000000000ULL);
- fail += CHECK_CREDIT(ndev, NPU2_NTL_DBD_HDR_CREDIT_RX, 0x0640640000000000ULL);
- fail += CHECK_CREDIT(ndev, NPU2_NTL_ATSD_HDR_CREDIT_RX, 0x0200200000000000ULL);
-
- assert(!fail);
+ CHECK_CREDIT(ndev, NPU2_NTL_CRED_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_RSP_HDR_CREDIT_RX, 0x0BE0BE0000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_CRED_DATA_CREDIT_RX, 0x1001000000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_RSP_DATA_CREDIT_RX, 0x1001000000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_DBD_HDR_CREDIT_RX, 0x0640640000000000ULL);
+ CHECK_CREDIT(ndev, NPU2_NTL_ATSD_HDR_CREDIT_RX, 0x0200200000000000ULL);
val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
val &= 0xFF3FFFFFFFFFFFFF;