All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jon Mason <mason@myri.com>
To: davem@davemloft.net
Cc: netdev@vger.kernel.org, Andrew Gallatin <gallatin@myri.com>
Subject: [PATCH 3/9 v2] myri10ge: rework parity error check and cleanup
Date: Mon, 27 Jun 2011 15:54:34 -0500	[thread overview]
Message-ID: <20110627205432.GA18978@myri.com> (raw)
In-Reply-To: <1309187108-12715-3-git-send-email-mason@myri.com>

Clean up watchdog reset code:
 - move code that checks for stuck slice to a common routine
 - unless there is a confirmed h/w fault, verify that a stuck
   slice is still stuck in the watchdog worker; if the slice is no
   longer stuck, abort the reset.
 - this removes an egregious 2000ms pause in the watchdog worker that
   was a diagnostic aid (to look for spurious resets) the snuck into
   production code.

v2 includes corrections from Ben Hutchings and Joe Perches

Signed-off-by: Jon Mason <mason@myri.com>
---
 drivers/net/myri10ge/myri10ge.c |  100 +++++++++++++++++++++++---------------
 1 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 0f0f83d..c2574c5 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -193,6 +193,7 @@ struct myri10ge_slice_state {
 	int watchdog_tx_done;
 	int watchdog_tx_req;
 	int watchdog_rx_done;
+	int stuck;
 #ifdef CONFIG_MYRI10GE_DCA
 	int cached_dca_tag;
 	int cpu;
@@ -3442,6 +3443,42 @@ static u32 myri10ge_read_reboot(struct myri10ge_priv *mgp)
 	return reboot;
 }
 
+static void
+myri10ge_check_slice(struct myri10ge_slice_state *ss, int *reset_needed,
+		     int *busy_slice_cnt, u32 rx_pause_cnt)
+{
+	struct myri10ge_priv *mgp = ss->mgp;
+	int slice = ss - mgp->ss;
+
+	if (ss->tx.req != ss->tx.done &&
+	    ss->tx.done == ss->watchdog_tx_done &&
+	    ss->watchdog_tx_req != ss->watchdog_tx_done) {
+		/* nic seems like it might be stuck.. */
+		if (rx_pause_cnt != mgp->watchdog_pause) {
+			if (net_ratelimit())
+				netdev_warn(mgp->dev, "slice %d: TX paused, "
+					    "check link partner\n", slice);
+		} else {
+			netdev_warn(mgp->dev,
+				    "slice %d: TX stuck %d %d %d %d %d %d\n",
+				    slice, ss->tx.queue_active, ss->tx.req,
+				    ss->tx.done, ss->tx.pkt_start,
+				    ss->tx.pkt_done,
+				    (int)ntohl(mgp->ss[slice].fw_stats->
+					       send_done_count));
+			*reset_needed = 1;
+			ss->stuck = 1;
+		}
+	}
+	if (ss->watchdog_tx_done != ss->tx.done ||
+	    ss->watchdog_rx_done != ss->rx_done.cnt) {
+		*busy_slice_cnt += 1;
+	}
+	ss->watchdog_tx_done = ss->tx.done;
+	ss->watchdog_tx_req = ss->tx.req;
+	ss->watchdog_rx_done = ss->rx_done.cnt;
+}
+
 /*
  * This watchdog is used to check whether the board has suffered
  * from a parity error and needs to be recovered.
@@ -3450,10 +3487,12 @@ static void myri10ge_watchdog(struct work_struct *work)
 {
 	struct myri10ge_priv *mgp =
 	    container_of(work, struct myri10ge_priv, watchdog_work);
-	struct myri10ge_tx_buf *tx;
-	u32 reboot;
+	struct myri10ge_slice_state *ss;
+	u32 reboot, rx_pause_cnt;
 	int status, rebooted;
 	int i;
+	int reset_needed = 0;
+	int busy_slice_cnt = 0;
 	u16 cmd, vendor;
 
 	mgp->watchdog_resets++;
@@ -3465,8 +3504,7 @@ static void myri10ge_watchdog(struct work_struct *work)
 		 * For now, just report it */
 		reboot = myri10ge_read_reboot(mgp);
 		netdev_err(mgp->dev, "NIC rebooted (0x%x),%s resetting\n",
-			   reboot,
-			   myri10ge_reset_recover ? "" : " not");
+			   reboot, myri10ge_reset_recover ? " " : " not");
 		if (myri10ge_reset_recover == 0)
 			return;
 		rtnl_lock();
@@ -3498,23 +3536,24 @@ static void myri10ge_watchdog(struct work_struct *work)
 				return;
 			}
 		}
-		/* Perhaps it is a software error.  Try to reset */
-
-		netdev_err(mgp->dev, "device timeout, resetting\n");
+		/* Perhaps it is a software error. See if stuck slice
+		 * has recovered, reset if not */
+		rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
 		for (i = 0; i < mgp->num_slices; i++) {
-			tx = &mgp->ss[i].tx;
-			netdev_err(mgp->dev, "(%d): %d %d %d %d %d %d\n",
-				   i, tx->queue_active, tx->req,
-				   tx->done, tx->pkt_start, tx->pkt_done,
-				   (int)ntohl(mgp->ss[i].fw_stats->
-					      send_done_count));
-			msleep(2000);
-			netdev_info(mgp->dev, "(%d): %d %d %d %d %d %d\n",
-				    i, tx->queue_active, tx->req,
-				    tx->done, tx->pkt_start, tx->pkt_done,
-				    (int)ntohl(mgp->ss[i].fw_stats->
-					       send_done_count));
+			ss = mgp->ss;
+			if (ss->stuck) {
+				myri10ge_check_slice(ss, &reset_needed,
+						     &busy_slice_cnt,
+						     rx_pause_cnt);
+				ss->stuck = 0;
+			}
 		}
+		if (!reset_needed) {
+			netdev_dbg(mgp->dev, "not resetting\n");
+			return;
+		}
+
+		netdev_err(mgp->dev, "device timeout, resetting\n");
 	}
 
 	if (!rebooted) {
@@ -3567,27 +3606,8 @@ static void myri10ge_watchdog_timer(unsigned long arg)
 			    myri10ge_fill_thresh)
 				ss->rx_big.watchdog_needed = 0;
 		}
-
-		if (ss->tx.req != ss->tx.done &&
-		    ss->tx.done == ss->watchdog_tx_done &&
-		    ss->watchdog_tx_req != ss->watchdog_tx_done) {
-			/* nic seems like it might be stuck.. */
-			if (rx_pause_cnt != mgp->watchdog_pause) {
-				if (net_ratelimit())
-					netdev_err(mgp->dev, "slice %d: TX paused, check link partner\n",
-						   i);
-			} else {
-				netdev_warn(mgp->dev, "slice %d stuck:", i);
-				reset_needed = 1;
-			}
-		}
-		if (ss->watchdog_tx_done != ss->tx.done ||
-		    ss->watchdog_rx_done != ss->rx_done.cnt) {
-			busy_slice_cnt++;
-		}
-		ss->watchdog_tx_done = ss->tx.done;
-		ss->watchdog_tx_req = ss->tx.req;
-		ss->watchdog_rx_done = ss->rx_done.cnt;
+		myri10ge_check_slice(ss, &reset_needed, &busy_slice_cnt,
+				     rx_pause_cnt);
 	}
 	/* if we've sent or received no traffic, poll the NIC to
 	 * ensure it is still there.  Otherwise, we risk not noticing
-- 
1.7.5.4


  parent reply	other threads:[~2011-06-27 20:54 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-06-27 15:05 [PATCH 1/9] myri10ge: ensure tx queues remain stopped Jon Mason
2011-06-27 15:05 ` [PATCH 2/9] myri10ge: Mask PCI Surprise Link Down Events Jon Mason
2011-06-29 13:02   ` David Miller
2011-06-27 15:05 ` [PATCH 3/9] myri10ge: rework parity error check and cleanup Jon Mason
2011-06-27 15:27   ` Ben Hutchings
2011-06-27 20:26     ` Jon Mason
2011-06-27 16:01   ` Joe Perches
2011-06-27 20:54   ` Jon Mason [this message]
2011-06-28  2:17     ` [PATCH 3/9 v2] " Joe Perches
2011-06-28  3:31       ` Jon Mason
2011-06-28  3:57   ` [PATCH 3/9 v3] " Jon Mason
2011-06-29 13:02     ` David Miller
2011-06-27 15:05 ` [PATCH 4/9] myri10ge: allow small_bytes = 0 Jon Mason
2011-06-29 13:03   ` David Miller
2011-06-27 15:05 ` [PATCH 5/9] myri10ge: add support for set_phys_id Jon Mason
2011-06-29 13:03   ` David Miller
2011-06-27 15:05 ` [PATCH 6/9] myri10ge: remove unnecessary read of PCI_CAP_ID_EXP Jon Mason
2011-06-29 13:03   ` David Miller
2011-06-27 15:05 ` [PATCH 7/9] myri10ge: misc style cleanups Jon Mason
2011-06-27 16:05   ` Joe Perches
2011-06-27 20:56   ` [PATCH 7/9 v2] " Jon Mason
2011-06-29 13:03     ` David Miller
2011-06-27 15:05 ` [PATCH 8/9] myri10ge: update version Jon Mason
2011-06-29 13:04   ` David Miller
2011-06-27 15:05 ` [PATCH 9/9] myri10ge: Update MAINTAINERS Jon Mason
2011-06-29 13:04   ` David Miller
2011-06-29 13:02 ` [PATCH 1/9] myri10ge: ensure tx queues remain stopped David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110627205432.GA18978@myri.com \
    --to=mason@myri.com \
    --cc=davem@davemloft.net \
    --cc=gallatin@myri.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.