[PATCH] eepro100 - need testers

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] eepro100 - need testers
       [not found] <E167w6n-0001dz-00@fenrus.demon.nl>
@ 2001-12-04 22:57 ` Tim Hockin
  2001-12-04 23:15   ` Edward Muller
                     ` (5 more replies)
  0 siblings, 6 replies; 16+ messages in thread
From: Tim Hockin @ 2001-12-04 22:57 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: arjanv, saw, sparker

[-- Attachment #1: Type: text/plain, Size: 289 bytes --]

This patch was developed here to resolve a number of eepro100 issues we
were seeing. I'd like to get people to try this on their eepro100 chips and
beat on it for a while.

volunteers?

Tim
-- 
Tim Hockin
Systems Software Engineer
Sun Microsystems, Cobalt Server Appliances
thockin@sun.com

[-- Attachment #2: sparker-eepro100.diff --]
[-- Type: text/plain, Size: 6937 bytes --]

diff -ruN 2.4.14-orig/drivers/net/eepro100.c 2.4.14-cobalt/drivers/net/eepro100.c
--- 2.4.14-orig/drivers/net/eepro100.c	Tue Dec  4 14:30:09 2001
+++ 2.4.14-cobalt/drivers/net/eepro100.c	Tue Dec  4 14:03:35 2001
@@ -64,8 +64,8 @@
 
 /* A few values that may be tweaked. */
 /* The ring sizes should be a power of two for efficiency. */
-#define TX_RING_SIZE	32
-#define RX_RING_SIZE	32
+#define TX_RING_SIZE	64
+#define RX_RING_SIZE	1024
 /* How much slots multicast filter setup may take.
    Do not descrease without changing set_rx_mode() implementaion. */
 #define TX_MULTICAST_SIZE   2
@@ -1067,6 +1071,50 @@
 	outw(CUStart | SCBMaskEarlyRx | SCBMaskFlowCtl, ioaddr + SCBCmd);
 }
 
+/*
+ * Sometimes the receiver stops making progress.  This routine knows how to
+ * get it going again, without losing packets or being otherwise nasty like
+ * a chip reset would be.  Previously the driver had a whole sequence
+ * of if RxSuspended, if it's no buffers do one thing, if it's no resources,
+ * do another, etc.  But those things don't really matter.  Separate logic
+ * in the ISR provides for allocating buffers--the other half of operation
+ * is just making sure the receiver is active.  speedo_rx_soft_reset does that.
+ * This problem with the old, more involved algorithm is shown up under
+ * ping floods on the order of 60K packets/second on a 100Mbps fdx network.
+ */
+static void
+speedo_rx_soft_reset(struct net_device *dev)
+{
+	struct speedo_private *sp = dev->priv;
+	struct RxFD *rfd;
+	long ioaddr;
+
+	ioaddr = dev->base_addr;
+	wait_for_cmd_done(ioaddr + SCBCmd);
+	if (inb(ioaddr + SCBCmd) != 0) {
+		printk("%s: previous command stalled\n", dev->name);
+		return;
+	}
+	/*
+	* Put the hardware into a known state.
+	*/
+	outb(RxAbort, ioaddr + SCBCmd);
+
+	rfd = sp->rx_ringp[sp->cur_rx % RX_RING_SIZE];
+
+	rfd->rx_buf_addr = 0xffffffff;
+
+	wait_for_cmd_done(ioaddr + SCBCmd);
+
+	if (inb(ioaddr + SCBCmd) != 0) {
+		printk("%s: RxAbort command stalled\n", dev->name);
+		return;
+	}
+	outl(sp->rx_ring_dma[sp->cur_rx % RX_RING_SIZE],
+		ioaddr + SCBPointer);
+	outb(RxStart, ioaddr + SCBCmd);
+}
+
 /* Media monitoring and control. */
 static void speedo_timer(unsigned long data)
 {
@@ -1500,82 +1591,37 @@
 		if ((status & 0xfc00) == 0)
 			break;
 
-		/* Always check if all rx buffers are allocated.  --SAW */
-		speedo_refill_rx_buffers(dev, 0);
-
 		if ((status & 0x5000) ||	/* Packet received, or Rx error. */
 			(sp->rx_ring_state&(RrNoMem|RrPostponed)) == RrPostponed)
 									/* Need to gather the postponed packet. */
 			speedo_rx(dev);
 
-		if (status & 0x1000) {
-			spin_lock(&sp->lock);
-			if ((status & 0x003c) == 0x0028) {		/* No more Rx buffers. */
-				struct RxFD *rxf;
-				printk(KERN_WARNING "%s: card reports no RX buffers.\n",
-						dev->name);
-				rxf = sp->rx_ringp[sp->cur_rx % RX_RING_SIZE];
-				if (rxf == NULL) {
-					if (speedo_debug > 2)
-						printk(KERN_DEBUG
-								"%s: NULL cur_rx in speedo_interrupt().\n",
-								dev->name);
-					sp->rx_ring_state |= RrNoMem|RrNoResources;
-				} else if (rxf == sp->last_rxf) {
-					if (speedo_debug > 2)
-						printk(KERN_DEBUG
-								"%s: cur_rx is last in speedo_interrupt().\n",
-								dev->name);
-					sp->rx_ring_state |= RrNoMem|RrNoResources;
-				} else
-					outb(RxResumeNoResources, ioaddr + SCBCmd);
-			} else if ((status & 0x003c) == 0x0008) { /* No resources. */
-				struct RxFD *rxf;
-				printk(KERN_WARNING "%s: card reports no resources.\n",
-						dev->name);
-				rxf = sp->rx_ringp[sp->cur_rx % RX_RING_SIZE];
-				if (rxf == NULL) {
-					if (speedo_debug > 2)
-						printk(KERN_DEBUG
-								"%s: NULL cur_rx in speedo_interrupt().\n",
-								dev->name);
-					sp->rx_ring_state |= RrNoMem|RrNoResources;
-				} else if (rxf == sp->last_rxf) {
-					if (speedo_debug > 2)
-						printk(KERN_DEBUG
-								"%s: cur_rx is last in speedo_interrupt().\n",
-								dev->name);
-					sp->rx_ring_state |= RrNoMem|RrNoResources;
-				} else {
-					/* Restart the receiver. */
-					outl(sp->rx_ring_dma[sp->cur_rx % RX_RING_SIZE],
-						 ioaddr + SCBPointer);
-					outb(RxStart, ioaddr + SCBCmd);
-				}
-			}
-			sp->stats.rx_errors++;
-			spin_unlock(&sp->lock);
-		}
+		/* Always check if all rx buffers are allocated.  --SAW */
+		speedo_refill_rx_buffers(dev, 0);
 
-		if ((sp->rx_ring_state&(RrNoMem|RrNoResources)) == RrNoResources) {
-			printk(KERN_WARNING
-					"%s: restart the receiver after a possible hang.\n",
-					dev->name);
-			spin_lock(&sp->lock);
-			/* Restart the receiver.
-			   I'm not sure if it's always right to restart the receiver
-			   here but I don't know another way to prevent receiver hangs.
-			   1999/12/25 SAW */
-			outl(sp->rx_ring_dma[sp->cur_rx % RX_RING_SIZE],
-				 ioaddr + SCBPointer);
-			outb(RxStart, ioaddr + SCBCmd);
-			sp->rx_ring_state &= ~RrNoResources;
-			spin_unlock(&sp->lock);
+		spin_lock(&sp->lock);
+		/*
+		 * The chip may have suspended reception for various reasons.
+		 * Check for that, and re-prime it should this be the case.
+		 */
+		switch ((status >> 2) & 0xf) {
+		case 0: /* Idle */
+			break;
+		case 1:	/* Suspended */
+		case 2:	/* No resources (RxFDs) */
+		case 9:	/* Suspended with no more RBDs */
+		case 10: /* No resources due to no RBDs */
+		case 12: /* Ready with no RBDs */
+			speedo_rx_soft_reset(dev);
+			break;
+		case 3:  case 5:  case 6:  case 7:  case 8:
+		case 11:  case 13:  case 14:  case 15:
+			/* these are all reserved values */
+			break;
 		}
 
 		/* User interrupt, Command/Tx unit interrupt or CU not active. */
 		if (status & 0xA400) {
-			spin_lock(&sp->lock);
 			speedo_tx_buffer_gc(dev);
 			if (sp->tx_full
 				&& (int)(sp->cur_tx - sp->dirty_tx) < TX_QUEUE_UNFULL) {
@@ -1583,8 +1629,8 @@
 				sp->tx_full = 0;
 				netif_wake_queue(dev); /* Attention: under a spinlock.  --SAW */
 			}
-			spin_unlock(&sp->lock);
 		}
+		spin_unlock(&sp->lock);
 
 		if (--boguscnt < 0) {
 			printk(KERN_ERR "%s: Too much work at interrupt, status=0x%4.4x.\n",
@@ -1702,6 +1748,7 @@
 	int entry = sp->cur_rx % RX_RING_SIZE;
 	int rx_work_limit = sp->dirty_rx + RX_RING_SIZE - sp->cur_rx;
 	int alloc_ok = 1;
+	int npkts = 0;
 
 	if (speedo_debug > 4)
 		printk(KERN_DEBUG " In speedo_rx().\n");
@@ -1768,6 +1815,7 @@
 				memcpy(skb_put(skb, pkt_len), sp->rx_skbuff[entry]->tail,
 					   pkt_len);
 #endif
+				npkts++;
 			} else {
 				/* Pass up the already-filled skbuff. */
 				skb = sp->rx_skbuff[entry];
@@ -1778,6 +1826,7 @@
 				}
 				sp->rx_skbuff[entry] = NULL;
 				skb_put(skb, pkt_len);
+				npkts++;
 				sp->rx_ringp[entry] = NULL;
 				pci_unmap_single(sp->pdev, sp->rx_ring_dma[entry],
 						PKT_BUF_SZ + sizeof(struct RxFD), PCI_DMA_FROMDEVICE);
@@ -1798,7 +1847,8 @@
 	/* Try hard to refill the recently taken buffers. */
 	speedo_refill_rx_buffers(dev, 1);
 
-	sp->last_rx_time = jiffies;
+	if (npkts)
+		sp->last_rx_time = jiffies;
 
 	return 0;
 }

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-04 22:57 ` [PATCH] eepro100 - need testers Tim Hockin
@ 2001-12-04 23:15   ` Edward Muller
  2001-12-05  1:26   ` Kurt Roeckx
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 16+ messages in thread
From: Edward Muller @ 2001-12-04 23:15 UTC (permalink / raw)
  To: Tim Hockin; +Cc: Linux Kernel Mailing List, arjanv, saw, sparker

What are the eepro100 issues? I have two machines with two eepro100's in
each machine.

I am running 2.4.16 on them and I've been testing nbd and the errors
that I have been having MAY be caused by the network going out (as per a
discussion on the ENBD list).

On Tue, 2001-12-04 at 17:57, Tim Hockin wrote:
> This patch was developed here to resolve a number of eepro100 issues we
> were seeing. I'd like to get people to try this on their eepro100 chips and
> beat on it for a while.
> 
> volunteers?
> 
> Tim
> -- 
> Tim Hockin
> Systems Software Engineer
> Sun Microsystems, Cobalt Server Appliances
> thockin@sun.com
> ----
> 

> diff -ruN 2.4.14-orig/drivers/net/eepro100.c 2.4.14-cobalt/drivers/net/eepro100.c
> --- 2.4.14-orig/drivers/net/eepro100.c	Tue Dec  4 14:30:09 2001
> +++ 2.4.14-cobalt/drivers/net/eepro100.c	Tue Dec  4 14:03:35 2001
> @@ -64,8 +64,8 @@
>  
>  /* A few values that may be tweaked. */
>  /* The ring sizes should be a power of two for efficiency. */
> -#define TX_RING_SIZE	32
> -#define RX_RING_SIZE	32
> +#define TX_RING_SIZE	64
> +#define RX_RING_SIZE	1024
>  /* How much slots multicast filter setup may take.
>     Do not descrease without changing set_rx_mode() implementaion. */
>  #define TX_MULTICAST_SIZE   2
> @@ -1067,6 +1071,50 @@
>  	outw(CUStart | SCBMaskEarlyRx | SCBMaskFlowCtl, ioaddr + SCBCmd);
>  }
>  
> +/*
> + * Sometimes the receiver stops making progress.  This routine knows how to
> + * get it going again, without losing packets or being otherwise nasty like
> + * a chip reset would be.  Previously the driver had a whole sequence
> + * of if RxSuspended, if it's no buffers do one thing, if it's no resources,
> + * do another, etc.  But those things don't really matter.  Separate logic
> + * in the ISR provides for allocating buffers--the other half of operation
> + * is just making sure the receiver is active.  speedo_rx_soft_reset does that.
> + * This problem with the old, more involved algorithm is shown up under
> + * ping floods on the order of 60K packets/second on a 100Mbps fdx network.
> + */
> +static void
> +speedo_rx_soft_reset(struct net_device *dev)
> +{
> +	struct speedo_private *sp = dev->priv;
> +	struct RxFD *rfd;
> +	long ioaddr;
> +
> +	ioaddr = dev->base_addr;
> +	wait_for_cmd_done(ioaddr + SCBCmd);
> +	if (inb(ioaddr + SCBCmd) != 0) {
> +		printk("%s: previous command stalled\n", dev->name);
> +		return;
> +	}
> +	/*
> +	* Put the hardware into a known state.
> +	*/
> +	outb(RxAbort, ioaddr + SCBCmd);
> +
> +	rfd = sp->rx_ringp[sp->cur_rx % RX_RING_SIZE];
> +
> +	rfd->rx_buf_addr = 0xffffffff;
> +
> +	wait_for_cmd_done(ioaddr + SCBCmd);
> +
> +	if (inb(ioaddr + SCBCmd) != 0) {
> +		printk("%s: RxAbort command stalled\n", dev->name);
> +		return;
> +	}
> +	outl(sp->rx_ring_dma[sp->cur_rx % RX_RING_SIZE],
> +		ioaddr + SCBPointer);
> +	outb(RxStart, ioaddr + SCBCmd);
> +}
> +
>  /* Media monitoring and control. */
>  static void speedo_timer(unsigned long data)
>  {
> @@ -1500,82 +1591,37 @@
>  		if ((status & 0xfc00) == 0)
>  			break;
>  
> -		/* Always check if all rx buffers are allocated.  --SAW */
> -		speedo_refill_rx_buffers(dev, 0);
> -
>  		if ((status & 0x5000) ||	/* Packet received, or Rx error. */
>  			(sp->rx_ring_state&(RrNoMem|RrPostponed)) == RrPostponed)
>  									/* Need to gather the postponed packet. */
>  			speedo_rx(dev);
>  
> -		if (status & 0x1000) {
> -			spin_lock(&sp->lock);
> -			if ((status & 0x003c) == 0x0028) {		/* No more Rx buffers. */
> -				struct RxFD *rxf;
> -				printk(KERN_WARNING "%s: card reports no RX buffers.\n",
> -						dev->name);
> -				rxf = sp->rx_ringp[sp->cur_rx % RX_RING_SIZE];
> -				if (rxf == NULL) {
> -					if (speedo_debug > 2)
> -						printk(KERN_DEBUG
> -								"%s: NULL cur_rx in speedo_interrupt().\n",
> -								dev->name);
> -					sp->rx_ring_state |= RrNoMem|RrNoResources;
> -				} else if (rxf == sp->last_rxf) {
> -					if (speedo_debug > 2)
> -						printk(KERN_DEBUG
> -								"%s: cur_rx is last in speedo_interrupt().\n",
> -								dev->name);
> -					sp->rx_ring_state |= RrNoMem|RrNoResources;
> -				} else
> -					outb(RxResumeNoResources, ioaddr + SCBCmd);
> -			} else if ((status & 0x003c) == 0x0008) { /* No resources. */
> -				struct RxFD *rxf;
> -				printk(KERN_WARNING "%s: card reports no resources.\n",
> -						dev->name);
> -				rxf = sp->rx_ringp[sp->cur_rx % RX_RING_SIZE];
> -				if (rxf == NULL) {
> -					if (speedo_debug > 2)
> -						printk(KERN_DEBUG
> -								"%s: NULL cur_rx in speedo_interrupt().\n",
> -								dev->name);
> -					sp->rx_ring_state |= RrNoMem|RrNoResources;
> -				} else if (rxf == sp->last_rxf) {
> -					if (speedo_debug > 2)
> -						printk(KERN_DEBUG
> -								"%s: cur_rx is last in speedo_interrupt().\n",
> -								dev->name);
> -					sp->rx_ring_state |= RrNoMem|RrNoResources;
> -				} else {
> -					/* Restart the receiver. */
> -					outl(sp->rx_ring_dma[sp->cur_rx % RX_RING_SIZE],
> -						 ioaddr + SCBPointer);
> -					outb(RxStart, ioaddr + SCBCmd);
> -				}
> -			}
> -			sp->stats.rx_errors++;
> -			spin_unlock(&sp->lock);
> -		}
> +		/* Always check if all rx buffers are allocated.  --SAW */
> +		speedo_refill_rx_buffers(dev, 0);
>  
> -		if ((sp->rx_ring_state&(RrNoMem|RrNoResources)) == RrNoResources) {
> -			printk(KERN_WARNING
> -					"%s: restart the receiver after a possible hang.\n",
> -					dev->name);
> -			spin_lock(&sp->lock);
> -			/* Restart the receiver.
> -			   I'm not sure if it's always right to restart the receiver
> -			   here but I don't know another way to prevent receiver hangs.
> -			   1999/12/25 SAW */
> -			outl(sp->rx_ring_dma[sp->cur_rx % RX_RING_SIZE],
> -				 ioaddr + SCBPointer);
> -			outb(RxStart, ioaddr + SCBCmd);
> -			sp->rx_ring_state &= ~RrNoResources;
> -			spin_unlock(&sp->lock);
> +		spin_lock(&sp->lock);
> +		/*
> +		 * The chip may have suspended reception for various reasons.
> +		 * Check for that, and re-prime it should this be the case.
> +		 */
> +		switch ((status >> 2) & 0xf) {
> +		case 0: /* Idle */
> +			break;
> +		case 1:	/* Suspended */
> +		case 2:	/* No resources (RxFDs) */
> +		case 9:	/* Suspended with no more RBDs */
> +		case 10: /* No resources due to no RBDs */
> +		case 12: /* Ready with no RBDs */
> +			speedo_rx_soft_reset(dev);
> +			break;
> +		case 3:  case 5:  case 6:  case 7:  case 8:
> +		case 11:  case 13:  case 14:  case 15:
> +			/* these are all reserved values */
> +			break;
>  		}
>  
>  		/* User interrupt, Command/Tx unit interrupt or CU not active. */
>  		if (status & 0xA400) {
> -			spin_lock(&sp->lock);
>  			speedo_tx_buffer_gc(dev);
>  			if (sp->tx_full
>  				&& (int)(sp->cur_tx - sp->dirty_tx) < TX_QUEUE_UNFULL) {
> @@ -1583,8 +1629,8 @@
>  				sp->tx_full = 0;
>  				netif_wake_queue(dev); /* Attention: under a spinlock.  --SAW */
>  			}
> -			spin_unlock(&sp->lock);
>  		}
> +		spin_unlock(&sp->lock);
>  
>  		if (--boguscnt < 0) {
>  			printk(KERN_ERR "%s: Too much work at interrupt, status=0x%4.4x.\n",
> @@ -1702,6 +1748,7 @@
>  	int entry = sp->cur_rx % RX_RING_SIZE;
>  	int rx_work_limit = sp->dirty_rx + RX_RING_SIZE - sp->cur_rx;
>  	int alloc_ok = 1;
> +	int npkts = 0;
>  
>  	if (speedo_debug > 4)
>  		printk(KERN_DEBUG " In speedo_rx().\n");
> @@ -1768,6 +1815,7 @@
>  				memcpy(skb_put(skb, pkt_len), sp->rx_skbuff[entry]->tail,
>  					   pkt_len);
>  #endif
> +				npkts++;
>  			} else {
>  				/* Pass up the already-filled skbuff. */
>  				skb = sp->rx_skbuff[entry];
> @@ -1778,6 +1826,7 @@
>  				}
>  				sp->rx_skbuff[entry] = NULL;
>  				skb_put(skb, pkt_len);
> +				npkts++;
>  				sp->rx_ringp[entry] = NULL;
>  				pci_unmap_single(sp->pdev, sp->rx_ring_dma[entry],
>  						PKT_BUF_SZ + sizeof(struct RxFD), PCI_DMA_FROMDEVICE);
> @@ -1798,7 +1847,8 @@
>  	/* Try hard to refill the recently taken buffers. */
>  	speedo_refill_rx_buffers(dev, 1);
>  
> -	sp->last_rx_time = jiffies;
> +	if (npkts)
> +		sp->last_rx_time = jiffies;
>  
>  	return 0;
>  }
-- 
-------------------------------
Edward Muller
Director of IS

973-715-0230 (cell)
212-487-9064 x115 (NYC)

http://www.learningpatterns.com
-------------------------------


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-04 22:57 ` [PATCH] eepro100 - need testers Tim Hockin
  2001-12-04 23:15   ` Edward Muller
@ 2001-12-05  1:26   ` Kurt Roeckx
  2001-12-05 16:59   ` Steve Parker
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 16+ messages in thread
From: Kurt Roeckx @ 2001-12-05  1:26 UTC (permalink / raw)
  To: Tim Hockin; +Cc: Linux Kernel Mailing List, arjanv, saw, sparker

On Tue, Dec 04, 2001 at 02:57:35PM -0800, Tim Hockin wrote:
> -#define TX_RING_SIZE	32
> -#define RX_RING_SIZE	32
> +#define TX_RING_SIZE	64
> +#define RX_RING_SIZE	1024

Why do I have the feeling that you're just changing those values
so you get less chance of having the problem?  Are there any
other reason why you change this?  It might even be a good idea
to test it with lower values.


> -			} else if ((status & 0x003c) == 0x0008) { /* No resources. */
> -				struct RxFD *rxf;
> -				printk(KERN_WARNING "%s: card reports no resources.\n",
> -						dev->name);

[...]

> +		switch ((status >> 2) & 0xf) {
> +		case 0: /* Idle */
> +			break;
> +		case 1:	/* Suspended */
> +		case 2:	/* No resources (RxFDs) */
> +		case 9:	/* Suspended with no more RBDs */
> +		case 10: /* No resources due to no RBDs */
> +		case 12: /* Ready with no RBDs */
> +			speedo_rx_soft_reset(dev);
> +			break;

You can also argue that you're trying to fix the problem by
hiding it.  It would be useful that it still reported the same
error message, so you can see that if it happens again with the
patch that it no longer locks up.


Kurt


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-04 22:57 ` [PATCH] eepro100 - need testers Tim Hockin
  2001-12-04 23:15   ` Edward Muller
  2001-12-05  1:26   ` Kurt Roeckx
@ 2001-12-05 16:59   ` Steve Parker
  2001-12-05 19:36     ` Mike Fedyk
  2001-12-06 23:34   ` Alan Cox
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 16+ messages in thread
From: Steve Parker @ 2001-12-05 16:59 UTC (permalink / raw)
  To: Kurt Roeckx, Tim Hockin; +Cc: Linux Kernel Mailing List, arjanv, saw

At 05:26 PM 12/4/2001 , Kurt Roeckx wrote:
>On Tue, Dec 04, 2001 at 02:57:35PM -0800, Tim Hockin wrote:
> > -#define TX_RING_SIZE 32
> > -#define RX_RING_SIZE 32
> > +#define TX_RING_SIZE 64
> > +#define RX_RING_SIZE 1024
>
>Why do I have the feeling that you're just changing those values
>so you get less chance of having the problem?  Are there any
>other reason why you change this?  It might even be a good idea
>to test it with lower values.

If you test with lower values, I find that the problem happens so often that
bidirectional TCP bulk throughput tests on 100Mbits/sec ethernet are
significantly lower.  As Tim pointed out, the RX ring size is chosen based on
being large enough to receive steadily and only require the ISR to come by and
empty it once every jiffy.  In order to provide good performance and
survivability on maximum packet rate loads, it needs to be 1024, although it's
moderately good on 512, on my 300MHz K6 system.

> > -                     } else if ((status & 0x003c) == 0x0008) { /* No 
> resources. */
> > -                             struct RxFD *rxf;
> > -                             printk(KERN_WARNING "%s: card reports no 
> resources.\n",
> > -                                             dev->name);
>
>[...]
>
> > +             switch ((status >> 2) & 0xf) {
> > +             case 0: /* Idle */
> > +                     break;
> > +             case 1: /* Suspended */
> > +             case 2: /* No resources (RxFDs) */
> > +             case 9: /* Suspended with no more RBDs */
> > +             case 10: /* No resources due to no RBDs */
> > +             case 12: /* Ready with no RBDs */
> > +                     speedo_rx_soft_reset(dev);
> > +                     break;
>
>You can also argue that you're trying to fix the problem by
>hiding it.  It would be useful that it still reported the same
>error message, so you can see that if it happens again with the
>patch that it no longer locks up.

The printk significantly reduces TCP throughput on my tests, and it doesn't
tell me about an interesting condition, so I removed it.  This state happens
any time the chip receives more than a ring buffer full before the ISR can
empty it, which is something that is always possible.

And anyway, why would you care to know?  Is there something you'd imagine
doing because you saw the message?

Cheers,

	~sparker

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-05 16:59   ` Steve Parker
@ 2001-12-05 19:36     ` Mike Fedyk
  0 siblings, 0 replies; 16+ messages in thread
From: Mike Fedyk @ 2001-12-05 19:36 UTC (permalink / raw)
  To: Steve Parker
  Cc: Kurt Roeckx, Tim Hockin, Linux Kernel Mailing List, arjanv, saw

On Wed, Dec 05, 2001 at 08:59:45AM -0800, Steve Parker wrote:
> At 05:26 PM 12/4/2001 , Kurt Roeckx wrote:
> >On Tue, Dec 04, 2001 at 02:57:35PM -0800, Tim Hockin wrote:
> >> -#define TX_RING_SIZE 32
> >> -#define RX_RING_SIZE 32
> >> +#define TX_RING_SIZE 64
> >> +#define RX_RING_SIZE 1024
> >
> >Why do I have the feeling that you're just changing those values
> >so you get less chance of having the problem?  Are there any
> >other reason why you change this?  It might even be a good idea
> >to test it with lower values.
> 
> If you test with lower values, I find that the problem happens so often that
> bidirectional TCP bulk throughput tests on 100Mbits/sec ethernet are 
> significantly
> lower.  As Tim pointed out, the RX ring size is chosen based on being large 
> enough
> to receive steadily and only require the ISR to come by and empty it once 
> every jiffy.
> In order to provide good performance and survivability on maximum packet 
> rate loads,
> it needs to be 1024, although it's moderately good on 512, on my 300MHz K6 
> system.
> 

So, if I choose to plug an eepro100 into a pentium 75 (or comparable on
other PCI-based arch), am I going to get massive RX_RING overflows?  If so,
then the ring size should probably be sized based on bogomips...

mf

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-04 22:57 ` [PATCH] eepro100 - need testers Tim Hockin
                     ` (2 preceding siblings ...)
  2001-12-05 16:59   ` Steve Parker
@ 2001-12-06 23:34   ` Alan Cox
  2001-12-06 23:28     ` Tim Hockin
  2001-12-06 23:36     ` Jeff Garzik
  2001-12-10  3:42   ` Ben Greear
  2001-12-24  3:24   ` Ben Greear
  5 siblings, 2 replies; 16+ messages in thread
From: Alan Cox @ 2001-12-06 23:34 UTC (permalink / raw)
  To: Tim Hockin; +Cc: Linux Kernel Mailing List, arjanv, saw, sparker

> This patch was developed here to resolve a number of eepro100 issues we
> were seeing. I'd like to get people to try this on their eepro100 chips and
> beat on it for a while.

Works for me. It's the first eepro100 driver that won't choke eventually on
my i810 board, and it's also the only one that will recover the board after
a soft boot when it had previously started spewing errors.

Alan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-06 23:34   ` Alan Cox
@ 2001-12-06 23:28     ` Tim Hockin
  2001-12-06 23:36     ` Jeff Garzik
  1 sibling, 0 replies; 16+ messages in thread
From: Tim Hockin @ 2001-12-06 23:28 UTC (permalink / raw)
  To: Alan Cox; +Cc: Linux Kernel Mailing List, arjanv, saw, sparker

Alan Cox wrote:

> Works for me. Its the first eepro100 driver that wont choke eventually on
> my i810 board and its also the only one that will recover the board after
> a soft boot when it had previously started spewing errors


woohoo!  Glad to know, thanks Alan.

-- 
Tim Hockin
Systems Software Engineer
Sun Microsystems, Cobalt Server Appliances
thockin@sun.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-06 23:34   ` Alan Cox
  2001-12-06 23:28     ` Tim Hockin
@ 2001-12-06 23:36     ` Jeff Garzik
  2001-12-07  1:05       ` Tim Hockin
  1 sibling, 1 reply; 16+ messages in thread
From: Jeff Garzik @ 2001-12-06 23:36 UTC (permalink / raw)
  To: Alan Cox; +Cc: Tim Hockin, Linux Kernel Mailing List, arjanv, saw, sparker

Alan Cox wrote:
> 
> > This patch was developed here to resolve a number of eepro100 issues we
> > were seeing. I'd like to get people to try this on their eepro100 chips and
> > beat on it for a while.
> 
> Works for me. Its the first eepro100 driver that wont choke eventually on
> my i810 board and its also the only one that will recover the board after
> a soft boot when it had previously started spewing errors

This patch got me thinking about net driver ring sizes in general.  When
you are talking thousands of packets per second at 100 mbit, a larger
ring size than the average 32-64 seems to make sense too.

-- 
Jeff Garzik      | Only so many songs can be sung
Building 1024    | with two lips, two lungs, and one tongue.
MandrakeSoft     |         - nomeansno


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-06 23:36     ` Jeff Garzik
@ 2001-12-07  1:05       ` Tim Hockin
  0 siblings, 0 replies; 16+ messages in thread
From: Tim Hockin @ 2001-12-07  1:05 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Alan Cox, Linux Kernel Mailing List, arjanv, saw, sparker

Jeff Garzik wrote:
 
> This patch got me thinking about net driver ring sizes in general.  When
> you are talking thousands of packets per second at 100 mbit, a larger
> ring size than the average 32-64 seems to make sense too.

Well, the math for the very worst case is something like: 

100,000,000  bits/sec
/8 
= 12500000  bytes/sec
/64  bytes/ping
= 195312.5  ping/sec
/100
= 1953 ping/jiffy
rounded to 2048
/2 = 1024 rx buffers per 1/2 jiffy.  

1024 means you can withstand a wire-speed storm while interrupting twice
per jiffy.


-- 
Tim Hockin
Systems Software Engineer
Sun Microsystems, Cobalt Server Appliances
thockin@sun.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-04 22:57 ` [PATCH] eepro100 - need testers Tim Hockin
                     ` (3 preceding siblings ...)
  2001-12-06 23:34   ` Alan Cox
@ 2001-12-10  3:42   ` Ben Greear
  2001-12-24  3:24   ` Ben Greear
  5 siblings, 0 replies; 16+ messages in thread
From: Ben Greear @ 2001-12-10  3:42 UTC (permalink / raw)
  To: Tim Hockin; +Cc: Linux Kernel Mailing List, arjanv, saw, sparker

Before this patch, I would see out-of-resource messages when I ran
50Mbps+ traffic + bonnie++ on a P-III 550Mhz machine.  With this patch,
I see no error messages, and traffic is flowing fine...

So, seems like a winner to me!

PS.  I don't currently have any machines available to test the
cmd-timeout issues with the eepro driver and some NICs.  Has
anyone tested to see if this patch actually fixes those problems too?

Thanks,
Ben

Tim Hockin wrote:

> This patch was developed here to resolve a number of eepro100 issues we
> were seeing. I'd like to get people to try this on their eepro100 chips and
> beat on it for a while.
> 
> volunteers?
> 
> Tim

-- 
Ben Greear <greearb@candelatech.com>       <Ben_Greear AT excite.com>
President of Candela Technologies Inc      http://www.candelatech.com
ScryMUD:  http://scry.wanfear.com     http://scry.wanfear.com/~greear

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-04 22:57 ` [PATCH] eepro100 - need testers Tim Hockin
                     ` (4 preceding siblings ...)
  2001-12-10  3:42   ` Ben Greear
@ 2001-12-24  3:24   ` Ben Greear
  2001-12-28 18:52     ` Jeremy Jackson
  5 siblings, 1 reply; 16+ messages in thread
From: Ben Greear @ 2001-12-24  3:24 UTC (permalink / raw)
  To: Tim Hockin; +Cc: Linux Kernel Mailing List, arjanv, saw, sparker

I just tried this patch against the 2.4.17 kernel.  I was able to
completely freeze my D815EEA2 motherboard based computer by trying
to copy a large directory over NFS.  The machine is connected to a
10bt HUB, and this setup has shown lockups before with various
eepro100 drivers.  The e100 seems to work fine in this setup...

An older eepro driver (the one with RH's 2.4.9-13 kernel) does not
lock up the machine, but I do see incessant wait-for-cmd-done-timeout
messages, and the network is basically un-usable.

On other machines, connected to a 100bt-FD switch, the new patch
seems to work just fine, btw.

The eepro lockup is repeatable, so let me know if there is any
information I can get for you that will help.

Thanks,
Ben

Tim Hockin wrote:

> This patch was developed here to resolve a number of eepro100 issues we
> were seeing. I'd like to get people to try this on their eepro100 chips and
> beat on it for a while.
> 
> volunteers?

-- 
Ben Greear <greearb@candelatech.com>       <Ben_Greear AT excite.com>
President of Candela Technologies Inc      http://www.candelatech.com
ScryMUD:  http://scry.wanfear.com     http://scry.wanfear.com/~greear

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-24  3:24   ` Ben Greear
@ 2001-12-28 18:52     ` Jeremy Jackson
  2001-12-31  3:28       ` Ben Greear
  0 siblings, 1 reply; 16+ messages in thread
From: Jeremy Jackson @ 2001-12-28 18:52 UTC (permalink / raw)
  To: Ben Greear, Tim Hockin; +Cc: Linux Kernel Mailing List, arjanv, saw, sparker

I have a friend who has had headaches with a D815EEA board with a 2nd
eepro100+ nic installed.
He compiled a new kernel and used a driver from intel's website, and it's
all good now.
FYI
----- Original Message -----
From: "Ben Greear" <greearb@candelatech.com>
To: "Tim Hockin" <thockin@sun.com>
Cc: "Linux Kernel Mailing List" <linux-kernel@vger.kernel.org>;
<arjanv@redhat.com>; <saw@sw-soft.com>; <sparker@sparker.net>
Sent: Sunday, December 23, 2001 7:24 PM
Subject: Re: [PATCH] eepro100 - need testers


> I just tried this patch against the 2.4.17 kernel.  I was able to
> completely freeze my D815EEA2 motherboard based computer by trying
> to copy a large directory over NFS.  The machine is connected to a
> 10bt HUB, and this setup has shown lockups before with various
> eepro100 drivers.  The e100 seems to work fine in this setup...
>
> An older eepro driver (the one with RH's 2.4.9-13 kernel) does not
> lock up the machine, but I do see incessant wait-for-cmd-done-timeout
> messages, and the network is basically un-usable.
>
> On other machines, connected to a 100bt-FD switch, the new patch
> seems to work just fine, btw.
>
> The eepro lockup is repeatable, so let me know if there is any
> information I can get for you that will help.
>
> Thanks,
> Ben
>
> Tim Hockin wrote:
>
> > This patch was developed here to resolve a number of eepro100 issues we
> > were seeing. I'd like to get people to try this on their eepro100 chips
and
> > beat on it for a while.
> >
> > volunteers?
>
>
> --
> Ben Greear <greearb@candelatech.com>       <Ben_Greear AT excite.com>
> President of Candela Technologies Inc      http://www.candelatech.com
> ScryMUD:  http://scry.wanfear.com     http://scry.wanfear.com/~greear
>
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
  2001-12-28 18:52     ` Jeremy Jackson
@ 2001-12-31  3:28       ` Ben Greear
  0 siblings, 0 replies; 16+ messages in thread
From: Ben Greear @ 2001-12-31  3:28 UTC (permalink / raw)
  To: Jeremy Jackson
  Cc: Tim Hockin, Linux Kernel Mailing List, arjanv, saw, sparker



Jeremy Jackson wrote:

> I have a friend who has had headaches with a D815EEA board with a 2nd
> eepro100+ nic installed.
> He compiled a new kernel and used a driver from intel's website, and it's
> all good now.
> FYI


Yes, I have good results with e100 too.  However, I look forward
to the day that I don't have to download/compile/install 3rd party
drivers to get my machines working...

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>       <Ben_Greear AT excite.com>
President of Candela Technologies Inc      http://www.candelatech.com
ScryMUD:  http://scry.wanfear.com     http://scry.wanfear.com/~greear



^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH] eepro100 - need testers
@ 2001-12-07  1:30 Leif Sawyer
  0 siblings, 0 replies; 16+ messages in thread
From: Leif Sawyer @ 2001-12-07  1:30 UTC (permalink / raw)
  To: Tim Hockin, Jeff Garzik; +Cc: Linux Kernel Mailing List

Tim Hockin responded to
> Jeff Garzik who wrote:
>  
>> This patch got me thinking about net driver ring sizes in 
>> general.  When you are talking thousands of packets per second
>> at 100 mbit, a larger ring size than the average 32-64 seems
>> to make sense too.
> 
> Well, the math for the very worst case is something like: 
> 
> 100,000,000  bits/sec
> /8  = 12500000  bytes/sec
> /64 bytes/ping = 195312.5  ping/sec
> /100 = 1953 ping/jiffy
> rounded to 2048 /2 = 1024 rx buffers per 1/2 jiffy.
> 
> 1024 means you can withstand a wire-speed storm while 
> interrupting twice per jiffy.

Given this, and the ever-upward climb in ethernet speed,
what would be the dangers involved in making this a run-time
option?

As soon as we detect the device, we know what its max speed is,
and we can then build the ring size based on that knowledge.

just some ignorant thoughts..


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] eepro100 - need testers
@ 2001-12-11 15:00 Zwane Mwaikambo
  0 siblings, 0 replies; 16+ messages in thread
From: Zwane Mwaikambo @ 2001-12-11 15:00 UTC (permalink / raw)
  To: thockin; +Cc: Linux Kernel

The problem I was experiencing (albeit with 2.4.10-ac11) was losing
connectivity for 2-10s at a time, no messages in the logs, and the machine
would resume activity as normal afterwards. The machine is connected to
the network via two NICs (3c59x and eepro100) and I only get these freezes
when connecting to the IP address on the eepro100. Unfortunately, due to
the lack of error messages, this report doesn't help much. But I was
wondering whether this was what some people were experiencing.

connected via switch with moderately high network load (general purpose
server)

Cheers,
	Zwane Mwaikambo

PS I have 2.4.17-pre5 and so far haven't noticed it, but haven't done much
testing either.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] eepro100 - need testers
@ 2001-12-29 19:01 Peter Hartzler
  0 siblings, 0 replies; 16+ messages in thread
From: Peter Hartzler @ 2001-12-29 19:01 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: Tim Hockin

I just applied Tim Hockin's eepro100 patch of Tue, 04 Dec 2001 against an
otherwise stock 2.4.17.  Result summary: no joy.  Stock or patched, a ping
-f against a neighboring machine causes the driver to fail after a short
while (time < coffee-run) with that old standby:

	eepro100: wait_for_cmd_done timeout!

Doing the ping -f test with the unpatched OR patched module loaded as:

	modprobe eepro100 debug=6

gives very dubious output along the lines of (unpatched module gives
similar output....):

----- cut here -----

kernel: eepro100.c: Debug level is 6.
kernel: eepro100.c:v1.09j-t 9/29/99 Donald Becker http://cesdis.gsfc.nasa.gov/linux/drivers/eepro100.html
kernel: eepro100.c: $Revision: 1.36 $ 2000/11/17 Modified by Andrey V. Savochkin <saw@saw.sw.com.sg> and others
kernel: Found Intel i82557 PCI Speedo, MMIO at 0xff8ff000, IRQ 3.
kernel: PCI: Found IRQ 3 for device 01:08.0
kernel: eth0: Intel Corp. 82820 (ICH2) Chipset Ethernet Controller, 00:03:47:0E:62:F3, IRQ 3.
kernel:   Board assembly 000000-000, Physical connectors present: RJ45
kernel:   Primary interface chip i82555 PHY #1.
kernel:   General self-test: passed.
kernel:   Serial sub-system self-test: passed.
kernel:   Internal registers self-test: passed.
kernel:   ROM checksum self-test: passed (0x04f4518b).
sysctl: net.ipv4.ip_forward = 0
sysctl: net.ipv4.conf.default.rp_filter = 1
sysctl: kernel.sysrq = 0
network: Setting network parameters:  succeeded
network: Bringing up interface lo:  succeeded
network: Bringing up interface eth0:  succeeded
kernel: nterrupt  status=0x0050.
kernel: tatus=0x0050.
kernel: tatus=0x0050.
kernel: <nterrupt  status=0x0050.
kernel: eth0: exiting interrupt, status=0x0050.
kernel: <7nterrupt  status=0x0050.
kernel: nterrupt  status=0x0050.
kernel: <7rrupt  status=0x4050.
kernel: <x0050.
kernel:   status=0x0050.
kernel: nterrupt  status=0x0050.
kernel: x2050.
kernel: x0050.
kernel: t  status=0x0050.
kernel: nterrupt  status=0x2050.
kernel:  status=0x2050.
kernel: <7x0050.
kernel: nterrupt  status=0x0050.
kernel: nterrupt  status=0x0050.
kernel: <nterrupt  status=0x0050.
kernel: <x4050.
kernel: nterrupt  status=0x0050.
kernel: <7x2050.
kernel: th0: interrupt  status=0x0050.
kernel: <x0050.
kernel: e candidate 39 status 400ca000.
kernel: nterrupt  status=0x0050.
kernel: <7status=0x0050.
kernel: <7nterrupt  status=0x0050.
kernel: <7h0: interrupt  status=0x0050.
kernel: .
kernel: nterrupt, status=0x0050.
kernel: eepro100: wait_for_cmd_done timeout!
last message repeated 14 times
kernel: h0:    396 00000001.

----- cut here -----

Trying to balance the need to send details versus avoiding list-flood...  
Let me know if any other bits would be useful.  I wrote a cheezy network
watchdog script which *should* let me back in from home after the next
hare-brained experiment... :)

System Info:
 - Dell Dimension 4100 "EA81510A.10A.0022.P06.0008291722"
 - BIOS Version A04
 - i686 800MHz "Pentium(R)III 800EB MHz"
 - 256M PC133 RAM

 - Hub is 10Mb/s (no easy way to test w/ 100Mb/s hub.)
 - Int 3 (eth0) is not shared.

 - Fully patched RedHat 7.2
	gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)
	glibc-2.2.4-19.3 (i386)

Regards,

Pete.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2001-12-31  3:28 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <E167w6n-0001dz-00@fenrus.demon.nl>
2001-12-04 22:57 ` [PATCH] eepro100 - need testers Tim Hockin
2001-12-04 23:15   ` Edward Muller
2001-12-05  1:26   ` Kurt Roeckx
2001-12-05 16:59   ` Steve Parker
2001-12-05 19:36     ` Mike Fedyk
2001-12-06 23:34   ` Alan Cox
2001-12-06 23:28     ` Tim Hockin
2001-12-06 23:36     ` Jeff Garzik
2001-12-07  1:05       ` Tim Hockin
2001-12-10  3:42   ` Ben Greear
2001-12-24  3:24   ` Ben Greear
2001-12-28 18:52     ` Jeremy Jackson
2001-12-31  3:28       ` Ben Greear
2001-12-07  1:30 Leif Sawyer
  -- strict thread matches above, loose matches on Subject: below --
2001-12-11 15:00 Zwane Mwaikambo
2001-12-29 19:01 Peter Hartzler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox