* [TG3]: Add hw coalescing infrastructure.
@ 2005-05-11 21:15 David S. Miller
  2005-05-11 21:17 ` Michael Chan
  2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
  0 siblings, 2 replies; 64+ messages in thread
From: David S. Miller @ 2005-05-11 21:15 UTC (permalink / raw)
  To: netdev; +Cc: mchan


Ok, now that we have the tagged status stuff sorted I began
to work on putting the hw mitigation bits back into the
driver.  The discussion on the DMA rw-ctrl settings is still
ongoing, but I will get back to it shortly.

This is the first step: we cache the settings in the tg3
struct and put those values into the chip via tg3_set_coalesce().

ETHTOOL_GCOALESCE is supported; setting is not.

The idea is that if we add dynamic mitigation or ETHTOOL_SCOALESCE,
it will simply invoke this routine to program the hardware.
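
For illustration, here is a minimal sketch of such an ETHTOOL_SCOALESCE
hook (hypothetical: the function name, the simple tp->lock locking, and
the absence of any validation are assumptions for the example, not part
of the patch below):

static int tg3_set_coalesce_sketch(struct net_device *dev,
				   struct ethtool_coalesce *ec)
{
	struct tg3 *tp = netdev_priv(dev);

	/* Cache the new values, then let the common routine
	 * program the chip.
	 */
	memcpy(&tp->coal, ec, sizeof(tp->coal));

	spin_lock_irq(&tp->lock);
	tg3_set_coalesce(tp, &tp->coal);
	spin_unlock_irq(&tp->lock);

	return 0;
}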

It seems that we should also be avoiding the HOSTCC_{R,T}XCOAL_MAXF_INT
registers when TG3_FLG2_5705_PLUS is set.  I didn't make that change yet,
just preserving what we were doing previously.

Another thing which occurred to me is that these settings are line
rate dependent.  For example, the 20 usec value is for gigabit.
So we may wish to adjust the values we use based upon the negotiated
line speed.  But that's a future refinement that can wait.
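
As a rough sketch, a speed-dependent adjustment could look like the
following (hypothetical helper; the per-speed numbers are placeholders
invented for the example):

static void tg3_adjust_coal_for_speed(struct tg3 *tp)
{
	struct ethtool_coalesce *ec = &tp->coal;

	switch (tp->link_config.active_speed) {
	case SPEED_1000:
		ec->rx_coalesce_usecs = 20;	/* the gigabit value */
		break;
	case SPEED_100:
		ec->rx_coalesce_usecs = 50;	/* placeholder */
		break;
	default:
		/* at 10 Mbit, delaying costs too much wall-clock time */
		ec->rx_coalesce_usecs = 0;
		break;
	}
	tg3_set_coalesce(tp, ec);
}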

I think with the removal of that I/O readback done by the tagged
status changes, and this coalescing stuff below, the SGI NUMA
performance should be significantly higher.

Comments?

Signed-off-by: David S. Miller <davem@davemloft.net>

--- 1/drivers/net/tg3.c.~1~	2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.c	2005-05-11 13:47:17.000000000 -0700
@@ -2507,7 +2507,7 @@ static int tg3_setup_phy(struct tg3 *tp,
 	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
 		if (netif_carrier_ok(tp->dev)) {
 			tw32(HOSTCC_STAT_COAL_TICKS,
-			     DEFAULT_STAT_COAL_TICKS);
+			     tp->coal.stats_block_coalesce_usecs);
 		} else {
 			tw32(HOSTCC_STAT_COAL_TICKS, 0);
 		}
@@ -5094,6 +5094,22 @@ static void tg3_set_bdinfo(struct tg3 *t
 }
 
 static void __tg3_set_rx_mode(struct net_device *);
+static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
+{
+	tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
+	tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs);
+	tw32(HOSTCC_RXMAX_FRAMES, ec->rx_max_coalesced_frames);
+	tw32(HOSTCC_TXMAX_FRAMES, ec->tx_max_coalesced_frames);
+	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
+		tw32(HOSTCC_RXCOAL_TICK_INT, ec->rx_coalesce_usecs_irq);
+		tw32(HOSTCC_TXCOAL_TICK_INT, ec->tx_coalesce_usecs_irq);
+	}
+	tw32(HOSTCC_RXCOAL_MAXF_INT, ec->rx_max_coalesced_frames_irq);
+	tw32(HOSTCC_TXCOAL_MAXF_INT, ec->tx_max_coalesced_frames_irq);
+	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS))
+		tw32(HOSTCC_STAT_COAL_TICKS,
+		     ec->stats_block_coalesce_usecs);
+}
 
 /* tp->lock is held. */
 static int tg3_reset_hw(struct tg3 *tp)
@@ -5416,16 +5432,7 @@ static int tg3_reset_hw(struct tg3 *tp)
 		udelay(10);
 	}
 
-	tw32(HOSTCC_RXCOL_TICKS, 0);
-	tw32(HOSTCC_TXCOL_TICKS, LOW_TXCOL_TICKS);
-	tw32(HOSTCC_RXMAX_FRAMES, 1);
-	tw32(HOSTCC_TXMAX_FRAMES, LOW_RXMAX_FRAMES);
-	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
-		tw32(HOSTCC_RXCOAL_TICK_INT, 0);
-		tw32(HOSTCC_TXCOAL_TICK_INT, 0);
-	}
-	tw32(HOSTCC_RXCOAL_MAXF_INT, 1);
-	tw32(HOSTCC_TXCOAL_MAXF_INT, 0);
+	tg3_set_coalesce(tp, &tp->coal);
 
 	/* set status block DMA address */
 	tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
@@ -5438,8 +5445,6 @@ static int tg3_reset_hw(struct tg3 *tp)
 		 * the tg3_periodic_fetch_stats call there, and
 		 * tg3_get_stats to see how this works for 5705/5750 chips.
 		 */
-		tw32(HOSTCC_STAT_COAL_TICKS,
-		     DEFAULT_STAT_COAL_TICKS);
 		tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
 		     ((u64) tp->stats_mapping >> 32));
 		tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_LOW,
@@ -7284,6 +7289,14 @@ static void tg3_vlan_rx_kill_vid(struct 
 }
 #endif
 
+static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+{
+	struct tg3 *tp = netdev_priv(dev);
+
+	memcpy(ec, &tp->coal, sizeof(*ec));
+	return 0;
+}
+
 static struct ethtool_ops tg3_ethtool_ops = {
 	.get_settings		= tg3_get_settings,
 	.set_settings		= tg3_set_settings,
@@ -7316,6 +7329,7 @@ static struct ethtool_ops tg3_ethtool_op
 	.get_strings		= tg3_get_strings,
 	.get_stats_count	= tg3_get_stats_count,
 	.get_ethtool_stats	= tg3_get_ethtool_stats,
+	.get_coalesce		= tg3_get_coalesce,
 };
 
 static void __devinit tg3_get_eeprom_size(struct tg3 *tp)
@@ -9096,6 +9110,31 @@ static struct pci_dev * __devinit tg3_fi
 	return peer;
 }
 
+static void __devinit tg3_init_coal(struct tg3 *tp)
+{
+	struct ethtool_coalesce *ec = &tp->coal;
+
+	memset(ec, 0, sizeof(*ec));
+	ec->cmd = ETHTOOL_GCOALESCE;
+	ec->rx_coalesce_usecs = LOW_RXCOL_TICKS;
+	ec->tx_coalesce_usecs = LOW_TXCOL_TICKS;
+	ec->rx_max_coalesced_frames = LOW_RXMAX_FRAMES;
+	ec->tx_max_coalesced_frames = LOW_TXMAX_FRAMES;
+	ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT;
+	ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT;
+	ec->rx_max_coalesced_frames_irq = DEFAULT_RXCOAL_MAXF_INT;
+	ec->tx_max_coalesced_frames_irq = DEFAULT_TXCOAL_MAXF_INT;
+	ec->stats_block_coalesce_usecs = DEFAULT_STAT_COAL_TICKS;
+
+	if (tp->coalesce_mode & (HOSTCC_MODE_CLRTICK_RXBD |
+				 HOSTCC_MODE_CLRTICK_TXBD)) {
+		ec->rx_coalesce_usecs = LOW_RXCOL_TICKS_CLRTCKS;
+		ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT_CLRTCKS;
+		ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS;
+		ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS;
+	}
+}
+
 static int __devinit tg3_init_one(struct pci_dev *pdev,
 				  const struct pci_device_id *ent)
 {
@@ -9341,6 +9380,8 @@ static int __devinit tg3_init_one(struct
 	/* flow control autonegotiation is default behavior */
 	tp->tg3_flags |= TG3_FLAG_PAUSE_AUTONEG;
 
+	tg3_init_coal(tp);
+
 	err = register_netdev(dev);
 	if (err) {
 		printk(KERN_ERR PFX "Cannot register net device, "
--- 1/drivers/net/tg3.h.~1~	2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.h	2005-05-11 13:48:14.000000000 -0700
@@ -876,10 +876,12 @@
 #define  HOSTCC_STATUS_ERROR_ATTN	 0x00000004
 #define HOSTCC_RXCOL_TICKS		0x00003c08
 #define  LOW_RXCOL_TICKS		 0x00000032
+#define  LOW_RXCOL_TICKS_CLRTCKS	 0x00000014
 #define  DEFAULT_RXCOL_TICKS		 0x00000048
 #define  HIGH_RXCOL_TICKS		 0x00000096
 #define HOSTCC_TXCOL_TICKS		0x00003c0c
 #define  LOW_TXCOL_TICKS		 0x00000096
+#define  LOW_TXCOL_TICKS_CLRTCKS	 0x00000048
 #define  DEFAULT_TXCOL_TICKS		 0x0000012c
 #define  HIGH_TXCOL_TICKS		 0x00000145
 #define HOSTCC_RXMAX_FRAMES		0x00003c10
@@ -892,8 +894,10 @@
 #define  HIGH_TXMAX_FRAMES		 0x00000052
 #define HOSTCC_RXCOAL_TICK_INT		0x00003c18
 #define  DEFAULT_RXCOAL_TICK_INT	 0x00000019
+#define  DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
 #define HOSTCC_TXCOAL_TICK_INT		0x00003c1c
 #define  DEFAULT_TXCOAL_TICK_INT	 0x00000019
+#define  DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
 #define HOSTCC_RXCOAL_MAXF_INT		0x00003c20
 #define  DEFAULT_RXCOAL_MAXF_INT	 0x00000005
 #define HOSTCC_TXCOAL_MAXF_INT		0x00003c24
@@ -2227,7 +2231,7 @@ struct tg3 {
 
 #define SST_25VF0X0_PAGE_SIZE		4098
 
-
+	struct ethtool_coalesce		coal;
 };
 
 #endif /* !(_T3_H) */


* Re: [TG3]: Add hw coalescing infrastructure.
  2005-05-11 21:15 [TG3]: Add hw coalescing infrastructure David S. Miller
@ 2005-05-11 21:17 ` Michael Chan
  2005-05-12  2:28   ` David S. Miller
  2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
  1 sibling, 1 reply; 64+ messages in thread
From: Michael Chan @ 2005-05-11 21:17 UTC (permalink / raw)
  To: David S.Miller; +Cc: netdev

On Wed, 2005-05-11 at 14:15 -0700, David S.Miller wrote:

> It seems that we should also be avoiding the HOSTCC_{R,T}XCOAL_MAXF_INT
> registers when TG3_FLG2_5705_PLUS is set.  I didn't make that change yet,
> just preserving what we were doing previously.

These registers are still defined for 5705_PLUS chips, only the maximum
value has been reduced to 255. So we're ok.

> 
> Another thing which occurred to me is that these settings are line
> rate dependent.  For example, the 20 usec value is for gigabit.
> So we may wish to adjust the values we use based upon the negotiated
> line speed.  But that's a future refinement that can wait.
> 
Yes, and MTU size dependent too. But we may not want to coalesce the
same way when running at 10/100 Mbps. For example, if one packet has
arrived, we don't want to wait 120 usec or 1200 usec to see if another
packet will arrive before we interrupt at 100 and 10 Mbps respectively.
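
(Those two numbers are roughly one full-sized frame time on the wire:
a 1500-byte frame is 12,000 bits, which takes 12000 / 10^8 sec = 120
usec at 100 Mbit/s and 1200 usec at 10 Mbit/s.)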

> I think with the removal of that I/O readback done by the tagged
> status changes, and this coalescing stuff below, the SGI NUMA
> performance should be significantly higher.
> 
> Comments?

Other than adding a check for netif_carrier_ok() before setting the
HOSTCC_STAT_COAL_TICKS register in tg3_set_coalesce(), everything looks
good to me.


* Re: [TG3]: Add hw coalescing infrastructure.
  2005-05-11 21:17 ` Michael Chan
@ 2005-05-12  2:28   ` David S. Miller
  2005-05-12  7:53     ` Robert Olsson
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-05-12  2:28 UTC (permalink / raw)
  To: mchan; +Cc: netdev

From: "Michael Chan" <mchan@broadcom.com>
Subject: Re: [TG3]: Add hw coalescing infrastructure.
Date: Wed, 11 May 2005 14:17:36 -0700

> On Wed, 2005-05-11 at 14:15 -0700, David S.Miller wrote:
> 
> > Another thing which occurred to me is that these settings are line
> > rate dependent.  For example, the 20 usec value is for gigabit.
> > So we may wish to adjust the values we use based upon the negotiated
> > line speed.  But that's a future refinement that can wait.
> > 
> Yes, and MTU size dependent too. But we may not want to coalesce the
> same way when running at 10/100 Mbps. For example, if one packet has
> arrived, we don't want to wait 120 usec or 1200 usec to see if another
> packet will arrive before we interrupt at 100 and 10 Mbps respectively.

That's a good point.  I think it would be wise for us to try
and come up with a dynamic mitigation algorithm that is as MTU
and link speed agnostic as possible.

One thing to note is that what we're really trying to do is
maximize the amount of work done per interrupt, without
unduly adding latency to packet reception.

Work done per-interrupt, and maximum latency, are better goals
because formulas based upon those values will automatically take
differences in CPU and BUS I/O speed into account.
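
A crude sketch of a heuristic built on those two goals (purely
illustrative: the names and thresholds here are invented, and nothing
like this exists in the driver):

#define TARGET_PKTS_PER_IRQ	16	/* desired work per interrupt */
#define MAX_LATENCY_USECS	100	/* cap on added delay */

static void example_adjust_coal(struct tg3 *tp, u32 pkts_this_irq)
{
	struct ethtool_coalesce *ec = &tp->coal;

	if (pkts_this_irq < TARGET_PKTS_PER_IRQ) {
		u32 next = ec->rx_coalesce_usecs ?
			   ec->rx_coalesce_usecs * 2 : 1;

		/* too many interrupts: batch more, within the cap */
		if (next <= MAX_LATENCY_USECS)
			ec->rx_coalesce_usecs = next;
	} else if (pkts_this_irq > 2 * TARGET_PKTS_PER_IRQ &&
		   ec->rx_coalesce_usecs > 1) {
		/* batches are already large: trim latency */
		ec->rx_coalesce_usecs /= 2;
	}

	tg3_set_coalesce(tp, ec);
}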

> > I think with the removal of that I/O readback done by the tagged
> > status changes, and this coalescing stuff below, the SGI NUMA
> > performance should be significantly higher.
> > 
> > Comments?
> 
> Other than adding a check for netif_carrier_ok() before setting the
> HOSTCC_STAT_COAL_TICKS register in tg3_set_coalesce(), everything looks
> good to me.

Good catch.  Here is the final version.

I'm still debating whether to pursue the dynamic mitigation
setting idea.  I'd like to see some new numbers from the
SGI NUMA folks first.

I think we certainly should add ETHTOOL_SCOALESCE support.
It isn't in there now because I have to think a bit about
how to verify the user's settings.  As you know, some
combinations of values could wedge the driver and cause
no interrupts to be generated for TX or RX even when packets
are transmitted/received.  And there's that 256 limit on
5705_plus chips which you mentioned.
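
A sketch of the kind of checking such a handler would need (the 255
maximum on 5705+ chips is from Michael's note above; the "no trigger
at all" rule just illustrates the wedge concern, and the function is
hypothetical):

static int example_validate_coal(struct tg3 *tp,
				 struct ethtool_coalesce *ec)
{
	if ((tp->tg3_flags2 & TG3_FLG2_5705_PLUS) &&
	    (ec->rx_max_coalesced_frames_irq > 255 ||
	     ec->tx_max_coalesced_frames_irq > 255))
		return -EINVAL;

	/* With neither a time trigger nor a frame trigger the chip
	 * would never raise an interrupt for that direction.
	 */
	if (!ec->rx_coalesce_usecs && !ec->rx_max_coalesced_frames)
		return -EINVAL;
	if (!ec->tx_coalesce_usecs && !ec->tx_max_coalesced_frames)
		return -EINVAL;

	return 0;
}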

[TG3]: Set minimal hw interrupt mitigation.

Even though we do software interrupt mitigation
via NAPI, it still helps to have some minimal
hw assisted mitigation.

This helps, particularly, on systems where register
I/O overhead is much greater than the CPU horsepower.

For example, it helps on NUMA systems.  In such cases
the PIO overhead to disable interrupts for NAPI accounts
for the majority of the packet processing cost.  The
CPU is fast enough such that only a single packet is
processed by each NAPI poll call.

Thanks to Michael Chan for reviewing this patch.

Signed-off-by: David S. Miller <davem@davemloft.net>

--- 1/drivers/net/tg3.c.~1~	2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.c	2005-05-11 19:26:28.000000000 -0700
@@ -2507,7 +2507,7 @@ static int tg3_setup_phy(struct tg3 *tp,
 	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
 		if (netif_carrier_ok(tp->dev)) {
 			tw32(HOSTCC_STAT_COAL_TICKS,
-			     DEFAULT_STAT_COAL_TICKS);
+			     tp->coal.stats_block_coalesce_usecs);
 		} else {
 			tw32(HOSTCC_STAT_COAL_TICKS, 0);
 		}
@@ -5094,6 +5094,27 @@ static void tg3_set_bdinfo(struct tg3 *t
 }
 
 static void __tg3_set_rx_mode(struct net_device *);
+static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
+{
+	tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
+	tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs);
+	tw32(HOSTCC_RXMAX_FRAMES, ec->rx_max_coalesced_frames);
+	tw32(HOSTCC_TXMAX_FRAMES, ec->tx_max_coalesced_frames);
+	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
+		tw32(HOSTCC_RXCOAL_TICK_INT, ec->rx_coalesce_usecs_irq);
+		tw32(HOSTCC_TXCOAL_TICK_INT, ec->tx_coalesce_usecs_irq);
+	}
+	tw32(HOSTCC_RXCOAL_MAXF_INT, ec->rx_max_coalesced_frames_irq);
+	tw32(HOSTCC_TXCOAL_MAXF_INT, ec->tx_max_coalesced_frames_irq);
+	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
+		u32 val = ec->stats_block_coalesce_usecs;
+
+		if (!netif_carrier_ok(tp->dev))
+			val = 0;
+
+		tw32(HOSTCC_STAT_COAL_TICKS, val);
+	}
+}
 
 /* tp->lock is held. */
 static int tg3_reset_hw(struct tg3 *tp)
@@ -5416,16 +5437,7 @@ static int tg3_reset_hw(struct tg3 *tp)
 		udelay(10);
 	}
 
-	tw32(HOSTCC_RXCOL_TICKS, 0);
-	tw32(HOSTCC_TXCOL_TICKS, LOW_TXCOL_TICKS);
-	tw32(HOSTCC_RXMAX_FRAMES, 1);
-	tw32(HOSTCC_TXMAX_FRAMES, LOW_RXMAX_FRAMES);
-	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
-		tw32(HOSTCC_RXCOAL_TICK_INT, 0);
-		tw32(HOSTCC_TXCOAL_TICK_INT, 0);
-	}
-	tw32(HOSTCC_RXCOAL_MAXF_INT, 1);
-	tw32(HOSTCC_TXCOAL_MAXF_INT, 0);
+	tg3_set_coalesce(tp, &tp->coal);
 
 	/* set status block DMA address */
 	tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
@@ -5438,8 +5450,6 @@ static int tg3_reset_hw(struct tg3 *tp)
 		 * the tg3_periodic_fetch_stats call there, and
 		 * tg3_get_stats to see how this works for 5705/5750 chips.
 		 */
-		tw32(HOSTCC_STAT_COAL_TICKS,
-		     DEFAULT_STAT_COAL_TICKS);
 		tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
 		     ((u64) tp->stats_mapping >> 32));
 		tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_LOW,
@@ -7284,6 +7294,14 @@ static void tg3_vlan_rx_kill_vid(struct 
 }
 #endif
 
+static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+{
+	struct tg3 *tp = netdev_priv(dev);
+
+	memcpy(ec, &tp->coal, sizeof(*ec));
+	return 0;
+}
+
 static struct ethtool_ops tg3_ethtool_ops = {
 	.get_settings		= tg3_get_settings,
 	.set_settings		= tg3_set_settings,
@@ -7316,6 +7334,7 @@ static struct ethtool_ops tg3_ethtool_op
 	.get_strings		= tg3_get_strings,
 	.get_stats_count	= tg3_get_stats_count,
 	.get_ethtool_stats	= tg3_get_ethtool_stats,
+	.get_coalesce		= tg3_get_coalesce,
 };
 
 static void __devinit tg3_get_eeprom_size(struct tg3 *tp)
@@ -9096,6 +9115,31 @@ static struct pci_dev * __devinit tg3_fi
 	return peer;
 }
 
+static void __devinit tg3_init_coal(struct tg3 *tp)
+{
+	struct ethtool_coalesce *ec = &tp->coal;
+
+	memset(ec, 0, sizeof(*ec));
+	ec->cmd = ETHTOOL_GCOALESCE;
+	ec->rx_coalesce_usecs = LOW_RXCOL_TICKS;
+	ec->tx_coalesce_usecs = LOW_TXCOL_TICKS;
+	ec->rx_max_coalesced_frames = LOW_RXMAX_FRAMES;
+	ec->tx_max_coalesced_frames = LOW_TXMAX_FRAMES;
+	ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT;
+	ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT;
+	ec->rx_max_coalesced_frames_irq = DEFAULT_RXCOAL_MAXF_INT;
+	ec->tx_max_coalesced_frames_irq = DEFAULT_TXCOAL_MAXF_INT;
+	ec->stats_block_coalesce_usecs = DEFAULT_STAT_COAL_TICKS;
+
+	if (tp->coalesce_mode & (HOSTCC_MODE_CLRTICK_RXBD |
+				 HOSTCC_MODE_CLRTICK_TXBD)) {
+		ec->rx_coalesce_usecs = LOW_RXCOL_TICKS_CLRTCKS;
+		ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT_CLRTCKS;
+		ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS;
+		ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS;
+	}
+}
+
 static int __devinit tg3_init_one(struct pci_dev *pdev,
 				  const struct pci_device_id *ent)
 {
@@ -9341,6 +9385,8 @@ static int __devinit tg3_init_one(struct
 	/* flow control autonegotiation is default behavior */
 	tp->tg3_flags |= TG3_FLAG_PAUSE_AUTONEG;
 
+	tg3_init_coal(tp);
+
 	err = register_netdev(dev);
 	if (err) {
 		printk(KERN_ERR PFX "Cannot register net device, "
--- 1/drivers/net/tg3.h.~1~	2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.h	2005-05-11 13:48:14.000000000 -0700
@@ -876,10 +876,12 @@
 #define  HOSTCC_STATUS_ERROR_ATTN	 0x00000004
 #define HOSTCC_RXCOL_TICKS		0x00003c08
 #define  LOW_RXCOL_TICKS		 0x00000032
+#define  LOW_RXCOL_TICKS_CLRTCKS	 0x00000014
 #define  DEFAULT_RXCOL_TICKS		 0x00000048
 #define  HIGH_RXCOL_TICKS		 0x00000096
 #define HOSTCC_TXCOL_TICKS		0x00003c0c
 #define  LOW_TXCOL_TICKS		 0x00000096
+#define  LOW_TXCOL_TICKS_CLRTCKS	 0x00000048
 #define  DEFAULT_TXCOL_TICKS		 0x0000012c
 #define  HIGH_TXCOL_TICKS		 0x00000145
 #define HOSTCC_RXMAX_FRAMES		0x00003c10
@@ -892,8 +894,10 @@
 #define  HIGH_TXMAX_FRAMES		 0x00000052
 #define HOSTCC_RXCOAL_TICK_INT		0x00003c18
 #define  DEFAULT_RXCOAL_TICK_INT	 0x00000019
+#define  DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
 #define HOSTCC_TXCOAL_TICK_INT		0x00003c1c
 #define  DEFAULT_TXCOAL_TICK_INT	 0x00000019
+#define  DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
 #define HOSTCC_RXCOAL_MAXF_INT		0x00003c20
 #define  DEFAULT_RXCOAL_MAXF_INT	 0x00000005
 #define HOSTCC_TXCOAL_MAXF_INT		0x00003c24
@@ -2227,7 +2231,7 @@ struct tg3 {
 
 #define SST_25VF0X0_PAGE_SIZE		4098
 
-
+	struct ethtool_coalesce		coal;
 };
 
 #endif /* !(_T3_H) */


* Re: [TG3]: Add hw coalescing infrastructure.
  2005-05-12  2:28   ` David S. Miller
@ 2005-05-12  7:53     ` Robert Olsson
  0 siblings, 0 replies; 64+ messages in thread
From: Robert Olsson @ 2005-05-12  7:53 UTC (permalink / raw)
  To: David S. Miller; +Cc: mchan, netdev


David S. Miller writes:
 > From: "Michael Chan" <mchan@broadcom.com>
 > Subject: Re: [TG3]: Add hw coalescing infrastructure.
 > Date: Wed, 11 May 2005 14:17:36 -0700

 > > Yes, and MTU size dependent too. But we may not want to coalesce the
 > > same way when running at 10/100 Mbps. For example, if one packet has
 > > arrived, we don't want to wait 120 usec or 1200 usec to see if another
 > > packet will arrive before we interrupt at 100 and 10 Mbps respectively.
 > 
 > That's a good point.  I think it would be wise for us to try
 > and come up with a dynamic mitigation algorithm that is as MTU
 > and link speed agnostic as possible.

 Size of the RX ring is also to be taken into account. You probably have to design
 for the smallest packets, otherwise you risk RX-ring overrun and packet drop.
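
 As a rough worked example: a minimum-size 64-byte frame occupies 84
 bytes on the wire (preamble and inter-frame gap included), i.e. 672
 bits, so at 1 Gbit/s such frames can arrive every ~0.67 usec; a
 511-entry RX ring then absorbs only ~340 usec of back-to-back small
 packets, and any coalescing delay has to stay well below that.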

 > One thing to note is that what we're really trying to do is
 > maximize the amount of work done per interrupt, without
 > unduly adding latency to packet reception.
 > 
 > Work done per-interrupt, and maximum latency, are better goals
 > because formulas based upon those values will automatically take
 > differences in CPU and BUS I/O speed into account.

  Yes.

  Cheers.
					--ro
 


* [TG3]: About hw coalescing infrastructure.
  2005-05-11 21:15 [TG3]: Add hw coalescing infrastructure David S. Miller
  2005-05-11 21:17 ` Michael Chan
@ 2005-06-22 15:25 ` Eric Dumazet
  2005-06-22 19:03   ` Michael Chan
  1 sibling, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-06-22 15:25 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, mchan

David S. Miller wrote:
> Ok, now that we have the tagged status stuff sorted I began
> to work on putting the hw mitigation bits back into the
> driver.  The discussion on the DMA rw-ctrl settings is still
> ongoing, but I will get back to it shortly.
> 
> This is the first step: we cache the settings in the tg3
> struct and put those values into the chip via tg3_set_coalesce().
> 
> ETHTOOL_GCOALESCE is supported; setting is not.
> 
Hi David

I am using 2.6.12 now, but still experience a high number of interrupts per second on
my tg3 NIC, on a dual Opteron based machine.

(about 7300 interrupts per second generated by one interface eth0, 100Mbit/s link)

Is there anything I can try to tune the coalescing?
Being able to handle 100 packets per interrupt instead of one or two would certainly help.
I don't mind the latency. But of course I would like not to drop packets :)
But maybe the BCM5702 is not able to delay an interrupt?
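
(For scale, the stats below work out to rx_ucast_packets / nic_irqs
= 852685070 / 446148615, i.e. roughly 1.9 packets handled per
interrupt, which matches the "one or two" above.)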

Thank you
Eric Dumazet

----------------------------------------------------------------------------------------
# lspci -v
02:03.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5702 Gigabit Ethernet (rev 02)
         Subsystem: Broadcom Corporation BCM5702 1000Base-T
         Flags: bus master, 66Mhz, medium devsel, latency 64, IRQ 27
         Memory at 00000000fe000000 (64-bit, non-prefetchable) [size=64K]
         Expansion ROM at <unassigned> [disabled] [size=64K]
         Capabilities: [40] PCI-X non-bridge device.
         Capabilities: [48] Power Management version 2
         Capabilities: [50] Vital Product Data
         Capabilities: [58] Message Signalled Interrupts: 64bit+ Queue=0/3 Enable-

# ethtool -c eth0
Coalesce parameters for eth0:
Adaptive RX: off  TX: off
stats-block-usecs: 1000000
sample-interval: 0
pkt-rate-low: 0
pkt-rate-high: 0

rx-usecs: 20
rx-frames: 5
rx-usecs-irq: 20
rx-frames-irq: 5

tx-usecs: 72
tx-frames: 53
tx-usecs-irq: 20
tx-frames-irq: 5

rx-usecs-low: 0
rx-frame-low: 0
tx-usecs-low: 0
tx-frame-low: 0

rx-usecs-high: 0
rx-frame-high: 0
tx-usecs-high: 0
tx-frame-high: 0


# ethtool -S eth0
NIC statistics:
      rx_octets: 104634072366
      rx_fragments: 0
      rx_ucast_packets: 852685070
      rx_mcast_packets: 0
      rx_bcast_packets: 20429
      rx_fcs_errors: 0
      rx_align_errors: 0
      rx_xon_pause_rcvd: 0
      rx_xoff_pause_rcvd: 0
      rx_mac_ctrl_rcvd: 0
      rx_xoff_entered: 0
      rx_frame_too_long_errors: 0
      rx_jabbers: 0
      rx_undersize_packets: 0
      rx_in_length_errors: 0
      rx_out_length_errors: 0
      rx_64_or_less_octet_packets: 451956709
      rx_65_to_127_octet_packets: 272058231
      rx_128_to_255_octet_packets: 63364655
      rx_256_to_511_octet_packets: 35814973
      rx_512_to_1023_octet_packets: 11867701
      rx_1024_to_1522_octet_packets: 17643210
      rx_1523_to_2047_octet_packets: 0
      rx_2048_to_4095_octet_packets: 0
      rx_4096_to_8191_octet_packets: 0
      rx_8192_to_9022_octet_packets: 0
      tx_octets: 134640205605
      tx_collisions: 0
      tx_xon_sent: 0
      tx_xoff_sent: 0
      tx_flow_control: 0
      tx_mac_errors: 0
      tx_single_collisions: 0
      tx_mult_collisions: 0
      tx_deferred: 0
      tx_excessive_collisions: 0
      tx_late_collisions: 0
      tx_collide_2times: 0
      tx_collide_3times: 0
      tx_collide_4times: 0
      tx_collide_5times: 0
      tx_collide_6times: 0
      tx_collide_7times: 0
      tx_collide_8times: 0
      tx_collide_9times: 0
      tx_collide_10times: 0
      tx_collide_11times: 0
      tx_collide_12times: 0
      tx_collide_13times: 0
      tx_collide_14times: 0
      tx_collide_15times: 0
      tx_ucast_packets: 774312055
      tx_mcast_packets: 13
      tx_bcast_packets: 246
      tx_carrier_sense_errors: 0
      tx_discards: 0
      tx_errors: 0
      dma_writeq_full: 21375
      dma_write_prioq_full: 0
      rxbds_empty: 0
      rx_discards: 2644
      rx_errors: 0
      rx_threshold_hit: 57384403
      dma_readq_full: 27100189
      dma_read_prioq_full: 1557267
      tx_comp_queue_full: 35712755
      ring_set_send_prod_index: 747986769
      ring_status_update: 502110997
      nic_irqs: 446148615
      nic_avoided_irqs: 55962382
      nic_tx_threshold_hit: 37282069

# ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX:             511
RX Mini:        0
RX Jumbo:       255
TX:             0
Current hardware settings:
RX:             200
RX Mini:        0
RX Jumbo:       100
TX:             511


# ethtool eth0
Settings for eth0:
         Supported ports: [ MII ]
         Supported link modes:   10baseT/Half 10baseT/Full
                                 100baseT/Half 100baseT/Full
                                 1000baseT/Half 1000baseT/Full
         Supports auto-negotiation: Yes
         Advertised link modes:  10baseT/Half 10baseT/Full
                                 100baseT/Half 100baseT/Full
                                 1000baseT/Half 1000baseT/Full
         Advertised auto-negotiation: Yes
         Speed: 100Mb/s
         Duplex: Full
         Port: Twisted Pair
         PHYAD: 1
         Transceiver: internal
         Auto-negotiation: on
         Supports Wake-on: g
         Wake-on: d
         Current message level: 0x000000ff (255)
         Link detected: yes


# cat /proc/interrupts    (HZ=200)
            CPU0       CPU1
   0:     164055   14038348    IO-APIC-edge  timer
   2:          0          0          XT-PIC  cascade
   8:          0          0    IO-APIC-edge  rtc
  14:       4073     368224    IO-APIC-edge  ide0
  15:          0         20    IO-APIC-edge  ide1
  27:   35985951  421578656   IO-APIC-level  eth0, eth1
NMI:  874625217  905019517  (oprofile running)
LOC:   14201857   14201976
ERR:          0
MIS:          0

oprofile data:
# more /tmp/vmlinux.oprofile
CPU: Hammer, speed 2205.08 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000
samples  cum. samples  %        cum. %     symbol name
20208503 20208503       7.7982   7.7982    ipt_do_table
8336463  28544966       3.2169  11.0151    tcp_v4_rcv
7746814  36291780       2.9894  14.0045    handle_IRQ_event
7117968  43409748       2.7467  16.7512    tg3_poll
6585377  49995125       2.5412  19.2924    memcpy
5184695  55179820       2.0007  21.2931    ip_route_input
4346890  59526710       1.6774  22.9705    kfree
4214007  63740717       1.6261  24.5967    copy_user_generic_c
4093885  67834602       1.5798  26.1764    tcp_ack
4006753  71841355       1.5462  27.7226    tg3_interrupt_tagged
3778976  75620331       1.4583  29.1809    tcp_rcv_established
3756498  79376829       1.4496  30.6304    ip_queue_xmit
3418999  82795828       1.3193  31.9498    schedule
3274459  86070287       1.2636  33.2134    try_to_wake_up
3034809  89105096       1.1711  34.3844    tcp_sendmsg
2846436  91951532       1.0984  35.4828    kmem_cache_alloc
2745147  94696679       1.0593  36.5422    free_block
2679056  97375735       1.0338  37.5760    kmem_cache_free
2595289  99971024       1.0015  38.5775    fn_hash_lookup
2582072  102553096      0.9964  39.5738    __memset
2576462  105129558      0.9942  40.5681    tcp_transmit_skb
2528313  107657871      0.9756  41.5437    tcp_recvmsg
2392370  110050241      0.9232  42.4669    timer_interrupt
2365615  112415856      0.9129  43.3797    system_call
2358666  114774522      0.9102  44.2899    sockfd_lookup
2357192  117131714      0.9096  45.1995    tcp_poll
2340568  119472282      0.9032  46.1027    ip_rcv
2315805  121788087      0.8936  46.9964    tcp_match
2276212  124064299      0.8784  47.8747    sys_epoll_wait
2260913  126325212      0.8725  48.7472    __mod_timer
2173905  128499117      0.8389  49.5861    tg3_start_xmit
2057738  130556855      0.7941  50.3801    __switch_to
2022435  132579290      0.7804  51.1605    ep_poll_callback
2020449  134599739      0.7797  51.9402    sock_wfree
1913008  136512747      0.7382  52.6784    find_busiest_group
1891578  138404325      0.7299  53.4083    local_bh_enable
1860130  140264455      0.7178  54.1261    ip_local_deliver
1793639  142058094      0.6921  54.8183    __ip_route_output_key
1789287  143847381      0.6905  55.5087    alloc_skb
1770972  145618353      0.6834  56.1921    tcp_write_timer
1727286  147345639      0.6665  56.8587    __wake_up
1634111  148979750      0.6306  57.4893    skb_release_data
1625157  150604907      0.6271  58.1164    __kmalloc
1567198  152172105      0.6048  58.7211    tcp_v4_do_rcv
1562495  153734600      0.6029  59.3241    __kfree_skb


* Re: [TG3]: About hw coalescing infrastructure.
  2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
@ 2005-06-22 19:03   ` Michael Chan
  2005-07-04 21:22     ` Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: Michael Chan @ 2005-06-22 19:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

On Wed, 2005-06-22 at 17:25 +0200, Eric Dumazet wrote:

> Is there anything I can try to tune the coalescing?
> Being able to handle 100 packets per interrupt instead of one or two would certainly help.
> I don't mind the latency. But of course I would like not to drop packets :)
> But maybe the BCM5702 is not able to delay an interrupt?
> 

On the 5702 that supports CLRTCKS mode, you need to play around with the
following parameters in tg3.h. To reduce interrupts, you generally have
to increase the values.

#define  LOW_RXCOL_TICKS_CLRTCKS	 0x00000014
#define  LOW_TXCOL_TICKS_CLRTCKS	 0x00000048
#define  LOW_RXMAX_FRAMES		 0x00000005
#define  LOW_TXMAX_FRAMES		 0x00000035
#define  DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
#define  DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
#define  DEFAULT_RXCOAL_MAXF_INT	 0x00000005
#define  DEFAULT_TXCOAL_MAXF_INT	 0x00000005
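
For instance, a purely illustrative starting point (made-up numbers,
not a recommendation) would be along these lines:

#define  LOW_RXCOL_TICKS_CLRTCKS	 0x000000c8	/* 200 usec, was 0x14 (20) */
#define  LOW_RXMAX_FRAMES		 0x00000040	/* 64 frames, was 5 */
#define  DEFAULT_RXCOAL_MAXF_INT	 0x00000020	/* 32 frames, was 5 */

Values that are too aggressive risk RX-ring overrun under load, so it
is worth watching rx_discards in ethtool -S while experimenting.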


* Re: [TG3]: About hw coalescing infrastructure.
  2005-06-22 19:03   ` Michael Chan
@ 2005-07-04 21:22     ` Eric Dumazet
  2005-07-04 21:26       ` David S. Miller
  0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 21:22 UTC (permalink / raw)
  To: Michael Chan; +Cc: David S. Miller, netdev

Michael Chan wrote:
> On Wed, 2005-06-22 at 17:25 +0200, Eric Dumazet wrote:
> 
> 
>>Is there anything I can try to tune the coalescing?
>>Being able to handle 100 packets per interrupt instead of one or two would certainly help.
>>I don't mind the latency. But of course I would like not to drop packets :)
>>But maybe the BCM5702 is not able to delay an interrupt?
>>
> 
> 
> On the 5702 that supports CLRTCKS mode, you need to play around with the
> following parameters in tg3.h. To reduce interrupts, you generally have
> to increase the values.
> 
> #define  LOW_RXCOL_TICKS_CLRTCKS	 0x00000014
> #define  LOW_TXCOL_TICKS_CLRTCKS	 0x00000048
> #define  LOW_RXMAX_FRAMES		 0x00000005
> #define  LOW_TXMAX_FRAMES		 0x00000035
> #define  DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
> #define  DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
> #define  DEFAULT_RXCOAL_MAXF_INT	 0x00000005
> #define  DEFAULT_TXCOAL_MAXF_INT	 0x00000005
> 
> 
> 

Thanks Michael

I tried various settings.

# ethtool -c eth0
Coalesce parameters for eth0:
Adaptive RX: off  TX: off
stats-block-usecs: 1000000
sample-interval: 0
pkt-rate-low: 0
pkt-rate-high: 0

rx-usecs: 500
rx-frames: 20
rx-usecs-irq: 500
rx-frames-irq: 20

tx-usecs: 600
tx-frames: 53
tx-usecs-irq: 600
tx-frames-irq: 20

rx-usecs-low: 0
rx-frame-low: 0
tx-usecs-low: 0
tx-frame-low: 0

rx-usecs-high: 0
rx-frame-high: 0
tx-usecs-high: 0
tx-frame-high: 0


But it seems something is wrong because when network load becomes high, I get:

Jul  4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:29 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:34 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:34 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:39 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:39 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:44 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:44 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:49 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:49 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:54 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:54 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:01:59 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:01:59 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:04 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:04 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:09 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:09 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:14 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:14 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:29 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:34 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:34 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul  4 23:02:39 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul  4 23:02:39 dada1 kernel: tg3: eth0: transmit timed out, resetting


Then the machine crashes at this point.

tg3.c:

#define DRV_MODULE_VERSION  "3.31"
#define DRV_MODULE_RELDATE  "June 8, 2005"

# ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX:             511
RX Mini:        0
RX Jumbo:       255
TX:             0
Current hardware settings:
RX:             400
RX Mini:        0
RX Jumbo:       40
TX:             511


Thank you

Eric Dumazet


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 21:22     ` Eric Dumazet
@ 2005-07-04 21:26       ` David S. Miller
  2005-07-04 21:39         ` Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 21:26 UTC (permalink / raw)
  To: dada1; +Cc: mchan, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 04 Jul 2005 23:22:12 +0200

> But it seems something is wrong because when network load becomes high, I get:
> 
> Jul  4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
> Jul  4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
> Jul  4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
> Jul  4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
> Jul  4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out

Does this happen without changing the coalescing settings
at all?


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 21:26       ` David S. Miller
@ 2005-07-04 21:39         ` Eric Dumazet
  2005-07-04 21:49           ` David S. Miller
  2005-07-04 22:31           ` Eric Dumazet
  0 siblings, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: mchan, netdev

David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 04 Jul 2005 23:22:12 +0200
> 
> 
>>But it seems something is wrong because when network load becomes high, I get:
>>
>>Jul  4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>Jul  4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>Jul  4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>Jul  4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>Jul  4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
> 
> 
> Does this happen without changing the coalescing settings
> at all?
> 
> 
Not easy to answer because I have to rebuild a kernel and reboot. Will do that shortly.


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 21:39         ` Eric Dumazet
@ 2005-07-04 21:49           ` David S. Miller
  2005-07-04 22:31           ` Eric Dumazet
  1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-04 21:49 UTC (permalink / raw)
  To: dada1; +Cc: mchan, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 04 Jul 2005 23:39:35 +0200

> Not easy to answer because I have to rebuild a kernel and
> reboot. Will do that shortly.

Thanks.


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 21:39         ` Eric Dumazet
  2005-07-04 21:49           ` David S. Miller
@ 2005-07-04 22:31           ` Eric Dumazet
  2005-07-04 22:47             ` David S. Miller
  2005-07-04 22:47             ` Eric Dumazet
  1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:31 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, mchan, netdev

Eric Dumazet wrote:
> David S. Miller wrote:
> 
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Mon, 04 Jul 2005 23:22:12 +0200
>>
>>
>>> But it seems something is wrong because when network load becomes 
>>> high, I get:
>>>
>>> Jul  4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>> Jul  4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>> Jul  4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>> Jul  4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>> Jul  4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>
>>
>>
>> Does this happen without changing the coalescing settings
>> at all?
>>
>>
> Not easy to answer because I have to rebuild a kernel and reboot. Will 
> do that shortly.
> 
> 
> 

Oops, I forgot to tell you I applied the patch: [TG3]: Eliminate all hw IRQ handler spinlocks.
(Date: Fri, 03 Jun 2005 12:25:58 -0700 (PDT))

Maybe I should revert to the stock 2.6.12 tg3 driver?

Eric


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 22:31           ` Eric Dumazet
@ 2005-07-04 22:47             ` David S. Miller
  2005-07-04 22:55               ` Eric Dumazet
  2005-07-04 22:47             ` Eric Dumazet
  1 sibling, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 22:47 UTC (permalink / raw)
  To: dada1; +Cc: mchan, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 05 Jul 2005 00:31:55 +0200

> Oops, I forgot to tell you I applied the patch: [TG3]: Eliminate all hw IRQ handler spinlocks.
> (Date: Fri, 03 Jun 2005 12:25:58 -0700 (PDT))
> 
> Maybe I should revert to the stock 2.6.12 tg3 driver?

Please don't ever do stuff like that :-(  That makes the driver
version, and any other information you report completely meaningless
and useless.  You've just wasted a lot of our time.

I have no way to even know _WHICH_ IRQ spinlock patch you applied.

The one that ended up in Linus's tree has many bug fixes and
refinements.  The ones which were posted on netdev had many bugs and
deadlocks which needed to be cured before pushing the change upstream.


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 22:31           ` Eric Dumazet
  2005-07-04 22:47             ` David S. Miller
@ 2005-07-04 22:47             ` Eric Dumazet
  1 sibling, 0 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:47 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, mchan, netdev

Eric Dumazet wrote:
>
> 
> Oops, I forgot to tell you I applied the patch: [TG3]: Eliminate all 
> hw IRQ handler spinlocks.
> (Date: Fri, 03 Jun 2005 12:25:58 -0700 (PDT))
> 
> Maybe I should revert to the stock 2.6.12 tg3 driver?
> 
> Eric
> 
> 

Oh well, I checked 2.6.13-rc1 and it contains this line in tg3_netif_stop():

+   tp->dev->trans_start = jiffies; /* prevent tx timeout */

So I am trying driver 3.32, June 24, 2005, included in 2.6.13-rc1.


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 22:47             ` David S. Miller
@ 2005-07-04 22:55               ` Eric Dumazet
  2005-07-04 22:57                 ` Eric Dumazet
  2005-07-04 23:00                 ` [TG3]: About hw coalescing infrastructure David S. Miller
  0 siblings, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:55 UTC (permalink / raw)
  To: David S. Miller; +Cc: mchan, netdev

David S. Miller wrote:

> Please don't ever do stuff like that :-(  That makes the driver
> version, and any other information you report completely meaningless
> and useless.  You've just wasted a lot of our time.
> 

Yes. But if you don't want us to test your patches, don't send them to the list.

For your information, 2.6.13-rc1 locks too.

Very easy way to lock it:

ping -f some_destination.



> I have no idea to even know _WHICH_ IRQ spinlock patch you applied.
> 
> The one that ended up in Linus's tree has many bug fixes and
> refinements.  The ones which were posted on netdev had many bugs and
> deadlocks which needed to be cured before pushing the change upstream.
> 
> 


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 22:55               ` Eric Dumazet
@ 2005-07-04 22:57                 ` Eric Dumazet
  2005-07-04 23:01                   ` David S. Miller
  2005-07-04 23:00                 ` [TG3]: About hw coalescing infrastructure David S. Miller
  1 sibling, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, mchan, netdev

Eric Dumazet wrote:

> 
> For your information, 2.6.13-rc1 locks too.
> 
> Very easy way to lock it:
> 
> ping -f some_destination.


Arg. False alarm. Sorry.

Time for me to sleep.


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 22:55               ` Eric Dumazet
  2005-07-04 22:57                 ` Eric Dumazet
@ 2005-07-04 23:00                 ` David S. Miller
  2005-07-05 16:14                   ` Eric Dumazet
  1 sibling, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 23:00 UTC (permalink / raw)
  To: dada1; +Cc: mchan, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 05 Jul 2005 00:55:37 +0200

> David S. Miller wrote:
> 
> > Please don't ever do stuff like that :-(  That makes the driver
> > version, and any other information you report completely meaningless
> > and useless.  You've just wasted a lot of our time.
> > 
> 
> Yes. But if you dont want us to test your patches, dont send them to
> the list.

The case here is that you didn't even mention that you had
the patch applied.  If you don't say what changes you've applied
to the stock driver, we can't properly debug anything.

> For your information, 2.6.13-rc1 locks too.
> 
> Very easy way to lock it:
> 
> ping -f some_destination.

SMP system?  What platform and 570x chip revision?

Does the stock driver in 2.6.12 hang as well?


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 22:57                 ` Eric Dumazet
@ 2005-07-04 23:01                   ` David S. Miller
  2005-07-05  7:38                     ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 23:01 UTC (permalink / raw)
  To: dada1; +Cc: mchan, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 05 Jul 2005 00:57:58 +0200

> Eric Dumazet wrote:
> 
> > Very easy way to lock it:
> > 
> > ping -f some_destination.
> 
> 
> Arg. False alarm. Sorry.
> 
> Time for me to sleep.

Oh well...


* [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-04 23:01                   ` David S. Miller
@ 2005-07-05  7:38                     ` Eric Dumazet
  2005-07-05 11:51                       ` Thomas Graf
                                         ` (2 more replies)
  0 siblings, 3 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05  7:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev


[NET]: unroll a small loop in pfifo_fast_dequeue(). The compiler generates better code.
	(Using skb_queue_empty() to test the queue is faster than trying to __skb_dequeue())
	oprofile says this function now uses 0.29% instead of 1.22%, on an x86_64 target.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


--- linux-2.6.12/net/sched/sch_generic.c  2005-06-17 21:48:29.000000000 +0200
+++ linux-2.6.12-ed/net/sched/sch_generic.c       2005-07-05 09:11:30.000000000 +0200
@@ -333,18 +333,23 @@
 static struct sk_buff *
 pfifo_fast_dequeue(struct Qdisc* qdisc)
 {
-       int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);
        struct sk_buff *skb;

-       for (prio = 0; prio < 3; prio++, list++) {
-               skb = __skb_dequeue(list);
-               if (skb) {
-                       qdisc->q.qlen--;
-                       return skb;
-               }
+       for (;;) {
+               if (!skb_queue_empty(list))
+                       break;
+               list++;
+               if (!skb_queue_empty(list))
+                       break;
+               list++;
+               if (!skb_queue_empty(list))
+                       break;
+               return NULL;
        }
-       return NULL;
+       skb = __skb_dequeue(list);
+       qdisc->q.qlen--;
+       return skb;
 }

 static int


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05  7:38                     ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
@ 2005-07-05 11:51                       ` Thomas Graf
  2005-07-05 12:03                         ` Thomas Graf
  2005-07-05 13:04                         ` Eric Dumazet
  2005-07-05 21:26                       ` David S. Miller
  2005-07-28 15:52                       ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
  2 siblings, 2 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 11:51 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
> [NET]: unroll a small loop in pfifo_fast_dequeue(). The compiler generates
> better code.
> 	(Using skb_queue_empty() to test the queue is faster than trying to
> 	__skb_dequeue())
> 	oprofile says this function now uses 0.29% instead of 1.22%, on an
> 	x86_64 target.

I think this patch is pretty much pointless. __skb_dequeue() and
!skb_queue_empty() should produce almost the same code and as soon
as you disable profiling and debugging you'll see that the compiler
unrolls the loop itself if possible.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 11:51                       ` Thomas Graf
@ 2005-07-05 12:03                         ` Thomas Graf
  2005-07-05 13:04                         ` Eric Dumazet
  1 sibling, 0 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 12:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Thomas Graf <20050705115108.GE16076@postel.suug.ch> 2005-07-05 13:51
> * Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
> > [NET]: unroll a small loop in pfifo_fast_dequeue(). The compiler generates
> > better code.
> > 	(Using skb_queue_empty() to test the queue is faster than trying to
> > 	__skb_dequeue())
> > 	oprofile says this function now uses 0.29% instead of 1.22%, on an
> > 	x86_64 target.
> 
> I think this patch is pretty much pointless. __skb_dequeue() and
> !skb_queue_empty() should produce almost the same code and as soon
> as you disable profiling and debugging you'll see that the compiler
> unrolls the loop itself if possible.

... given one enables it of course.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 11:51                       ` Thomas Graf
  2005-07-05 12:03                         ` Thomas Graf
@ 2005-07-05 13:04                         ` Eric Dumazet
  2005-07-05 13:48                           ` Thomas Graf
  1 sibling, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 13:04 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David S. Miller, netdev

Thomas Graf wrote:
> * Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
> 
>>[NET]: unroll a small loop in pfifo_fast_dequeue(). The compiler generates
>>better code.
>>	(Using skb_queue_empty() to test the queue is faster than trying to
>>	__skb_dequeue())
>>	oprofile says this function now uses 0.29% instead of 1.22%, on an
>>	x86_64 target.
> 
> 
> I think this patch is pretty much pointless. __skb_dequeue() and
> !skb_queue_empty() should produce almost the same code and as soon
> as you disable profiling and debugging you'll see that the compiler
> unrolls the loop itself if possible.
> 
> 

OK. At least my compiler (gcc-3.3.1) does NOT unroll the loop:

Original 2.6.12 gives :

ffffffff802a9790 <pfifo_fast_dequeue>: /* pfifo_fast_dequeue total: 2904054  1.9531 */
258371  0.1738 :ffffffff802a9790:       lea    0xc0(%rdi),%rcx
273669  0.1841 :ffffffff802a9797:       xor    %esi,%esi
  12533  0.0084 :ffffffff802a9799:       mov    (%rcx),%rdx
292315  0.1966 :ffffffff802a979c:       cmp    %rcx,%rdx
  11717  0.0079 :ffffffff802a979f:       je     ffffffff802a97d1 <pfifo_fast_dequeue+0x41>
   4474  0.0030 :ffffffff802a97a1:       mov    %rdx,%rax
   6238  0.0042 :ffffffff802a97a4:       mov    (%rdx),%rdx
     41 2.8e-05 :ffffffff802a97a7:       decl   0x10(%rcx)
   6089  0.0041 :ffffffff802a97aa:       test   %rax,%rax
    126 8.5e-05 :ffffffff802a97ad:       movq   $0x0,0x10(%rax)
     39 2.6e-05 :ffffffff802a97b5:       mov    %rcx,0x8(%rdx)
   6974  0.0047 :ffffffff802a97b9:       mov    %rdx,(%rcx)
   2841  0.0019 :ffffffff802a97bc:       movq   $0x0,0x8(%rax)
    366 2.5e-04 :ffffffff802a97c4:       movq   $0x0,(%rax)
  14757  0.0099 :ffffffff802a97cb:       je     ffffffff802a97d1 <pfifo_fast_dequeue+0x41>
    288 1.9e-04 :ffffffff802a97cd:       decl   0x40(%rdi)
     94 6.3e-05 :ffffffff802a97d0:       retq
970400  0.6526 :ffffffff802a97d1:       inc    %esi
982402  0.6607 :ffffffff802a97d3:       add    $0x18,%rcx
      4 2.7e-06 :ffffffff802a97d7:       cmp    $0x2,%esi
      1 6.7e-07 :ffffffff802a97da:       jle    ffffffff802a9799 <pfifo_fast_dequeue+0x9>
  59754  0.0402 :ffffffff802a97dc:       xor    %eax,%eax
    561 3.8e-04 :ffffffff802a97de:       data16
                :ffffffff802a97df:       nop
                :ffffffff802a97e0:       retq


And new code (2.6.12-ed):

ffffffff802b1020 <pfifo_fast_dequeue>: /* pfifo_fast_dequeue total: 153139  0.2934 */
  27388  0.0525 :ffffffff802b1020:       lea    0xc0(%rdi),%rdx
  42091  0.0806 :ffffffff802b1027:       cmp    %rdx,0xc0(%rdi)
                :ffffffff802b102e:       jne    ffffffff802b1052 <pfifo_fast_dequeue+0x32>
    474 9.1e-04 :ffffffff802b1030:       lea    0xd8(%rdi),%rdx
   5571  0.0107 :ffffffff802b1037:       cmp    %rdx,0xd8(%rdi)
      2 3.8e-06 :ffffffff802b103e:       jne    ffffffff802b1052 <pfifo_fast_dequeue+0x32>
      1 1.9e-06 :ffffffff802b1040:       lea    0xf0(%rdi),%rdx
  20030  0.0384 :ffffffff802b1047:       xor    %eax,%eax
      6 1.1e-05 :ffffffff802b1049:       cmp    %rdx,0xf0(%rdi)
      6 1.1e-05 :ffffffff802b1050:       je     ffffffff802b1086 <pfifo_fast_dequeue+0x66>
                :ffffffff802b1052:       mov    (%rdx),%rcx
  11796  0.0226 :ffffffff802b1055:       xor    %eax,%eax
                :ffffffff802b1057:       cmp    %rdx,%rcx
      8 1.5e-05 :ffffffff802b105a:       je     ffffffff802b1083 <pfifo_fast_dequeue+0x63>
   3146  0.0060 :ffffffff802b105c:       mov    %rcx,%rax
     12 2.3e-05 :ffffffff802b105f:       mov    (%rcx),%rcx
    118 2.3e-04 :ffffffff802b1062:       decl   0x10(%rdx)
   4924  0.0094 :ffffffff802b1065:       movq   $0x0,0x10(%rax)
     65 1.2e-04 :ffffffff802b106d:       mov    %rdx,0x8(%rcx)
    725  0.0014 :ffffffff802b1071:       mov    %rcx,(%rdx)
  11493  0.0220 :ffffffff802b1074:       movq   $0x0,0x8(%rax)
    194 3.7e-04 :ffffffff802b107c:       movq   $0x0,(%rax)
   2995  0.0057 :ffffffff802b1083:       decl   0x40(%rdi)
  19607  0.0376 :ffffffff802b1086:       nop
   2487  0.0048 :ffffffff802b1087:       retq


Please give us the code your compiler produces, and explain to me how disabling oprofile can change the generated assembly. :)
Debugging has no impact on this code either.

Thank you

Eric


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 13:04                         ` Eric Dumazet
@ 2005-07-05 13:48                           ` Thomas Graf
  2005-07-05 15:58                             ` Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 13:48 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CA8555.9050607@cosmosbay.com> 2005-07-05 15:04
> Thomas Graf wrote:
> >* Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
> >
> >>[NET]: unroll a small loop in pfifo_fast_dequeue(). The compiler generates
> >>better code.
> >>	(Using skb_queue_empty() to test the queue is faster than trying to
> >>	__skb_dequeue())
> >>	oprofile says this function now uses 0.29% instead of 1.22%, on an
> >>	x86_64 target.
> >
> >
> >I think this patch is pretty much pointless. __skb_dequeue() and
> >!skb_queue_empty() should produce almost the same code and as soon
> >as you disable profiling and debugging you'll see that the compiler
> >unrolls the loop itself if possible.
> >
> >
> 
> OK. At least my compiler (gcc-3.3.1) does NOT unroll the loop:

Because you don't specify -funroll-loops

[...]

> Please give us the code your compiler produces,

Unrolled version:

pfifo_fast_dequeue:
	pushl	%esi
	xorl	%edx, %edx
	pushl	%ebx
	movl	12(%esp), %esi
	movl	128(%esi), %eax
	leal	128(%esi), %ecx
	cmpl	%ecx, %eax
	je	.L132
	movl	%eax, %edx
	movl	(%eax), %eax
	decl	8(%ecx)
	movl	$0, 8(%edx)
	movl	%ecx, 4(%eax)
	movl	%eax, 128(%esi)
	movl	$0, 4(%edx)
	movl	$0, (%edx)
.L132:
	testl	%edx, %edx
	je	.L131
	movl	96(%edx), %ebx
	movl	80(%esi), %eax
	decl	40(%esi)
	subl	%ebx, %eax
	movl	%eax, 80(%esi)
	movl	%edx, %eax
.L117:
	popl	%ebx
	popl	%esi
	ret
.L131:
	movl	20(%ecx), %eax
	leal	20(%ecx), %edx
	xorl	%ebx, %ebx
	cmpl	%edx, %eax
	je	.L137
	movl	%eax, %ebx
	movl	(%eax), %eax
	decl	8(%edx)
	movl	$0, 8(%ebx)
	movl	%edx, 4(%eax)
	movl	%eax, 20(%ecx)
	movl	$0, 4(%ebx)
	movl	$0, (%ebx)
.L137:
	testl	%ebx, %ebx
	je	.L147
.L146:
	movl	96(%ebx), %ecx
	movl	80(%esi), %eax
	decl	40(%esi)
	subl	%ecx, %eax
	movl	%eax, 80(%esi)
	movl	%ebx, %eax
	jmp	.L117
.L147:
	movl	40(%ecx), %eax
	leal	40(%ecx), %edx
	xorl	%ebx, %ebx
	cmpl	%edx, %eax
	je	.L142
	movl	%eax, %ebx
	movl	(%eax), %eax
	decl	8(%edx)
	movl	$0, 8(%ebx)
	movl	%edx, 4(%eax)
	movl	%eax, 40(%ecx)
	movl	$0, 4(%ebx)
	movl	$0, (%ebx)
.L142:
	xorl	%eax, %eax
	testl	%ebx, %ebx
	jne	.L146
	jmp	.L117

> and explain to me how 
> disabling oprofile can change the generated assembly. :)
> Debugging has no impact on this code either.

I just noticed that this is a local modification of my own, so in
the vanilla tree it indeed doesn't have any impact on the code
generated.

Still, your patch does not make sense to me. The latest tree
also includes my pfifo_fast changes which modified the code to
maintain a backlog and made it easy to add more fifos at compile
time.  If you want the loop unrolled then let the compiler do it
via -funroll-loops. This kind of optimization seems as unnecessary
to me as all the loopback optimizations.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 13:48                           ` Thomas Graf
@ 2005-07-05 15:58                             ` Eric Dumazet
  2005-07-05 17:34                               ` Thomas Graf
  0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 15:58 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David S. Miller, netdev

Thomas Graf wrote:

>>OK. At least my compiler (gcc-3.3.1) does NOT unroll the loop:
> 
> 
> Because you don't specify -funroll-loops

I'm using vanilla 2.6.12: no -funroll-loops in it. Maybe in your tree, but not in 99.9% of 2.6.12 trees.

Are you suggesting everybody should use this compiler flag?
Something like:

net/sched/Makefile:

CFLAGS_sch_generic.o := -funroll-loops

?

> 
> [...]
> 
> 
>>Please give us the code your compiler produces,
> 
> 
> Unrolled version:
> 
> pfifo_fast_dequeue:
> 	pushl	%esi
> 	xorl	%edx, %edx
> 	pushl	%ebx
> 	movl	12(%esp), %esi
> 	movl	128(%esi), %eax
> 	leal	128(%esi), %ecx
> 	cmpl	%ecx, %eax
> 	je	.L132
> 	movl	%eax, %edx
> 	movl	(%eax), %eax
> 	decl	8(%ecx)
> 	movl	$0, 8(%edx)
> 	movl	%ecx, 4(%eax)
> 	movl	%eax, 128(%esi)
> 	movl	$0, 4(%edx)
> 	movl	$0, (%edx)
> .L132:
> 	testl	%edx, %edx
> 	je	.L131
> 	movl	96(%edx), %ebx
> 	movl	80(%esi), %eax
> 	decl	40(%esi)
> 	subl	%ebx, %eax
> 	movl	%eax, 80(%esi)
> 	movl	%edx, %eax
> .L117:
> 	popl	%ebx
> 	popl	%esi
> 	ret
> .L131:
> 	movl	20(%ecx), %eax
> 	leal	20(%ecx), %edx
> 	xorl	%ebx, %ebx
> 	cmpl	%edx, %eax
> 	je	.L137
> 	movl	%eax, %ebx
> 	movl	(%eax), %eax
> 	decl	8(%edx)
> 	movl	$0, 8(%ebx)
> 	movl	%edx, 4(%eax)
> 	movl	%eax, 20(%ecx)
> 	movl	$0, 4(%ebx)
> 	movl	$0, (%ebx)
> .L137:
> 	testl	%ebx, %ebx
> 	je	.L147
> .L146:
> 	movl	96(%ebx), %ecx
> 	movl	80(%esi), %eax
> 	decl	40(%esi)
> 	subl	%ecx, %eax
> 	movl	%eax, 80(%esi)
> 	movl	%ebx, %eax
> 	jmp	.L117
> .L147:
> 	movl	40(%ecx), %eax
> 	leal	40(%ecx), %edx
> 	xorl	%ebx, %ebx
> 	cmpl	%edx, %eax
> 	je	.L142
> 	movl	%eax, %ebx
> 	movl	(%eax), %eax
> 	decl	8(%edx)
> 	movl	$0, 8(%ebx)
> 	movl	%edx, 4(%eax)
> 	movl	%eax, 40(%ecx)
> 	movl	$0, 4(%ebx)
> 	movl	$0, (%ebx)
> .L142:
> 	xorl	%eax, %eax
> 	testl	%ebx, %ebx
> 	jne	.L146
> 	jmp	.L117
> 

OK, thanks, but you didn't give the code for my version :) It is shorter and unrolled, as you can see, and with nicely predicted branches.

00000fc0 <pfifo_fast_dequeue>:
      fc0:       56                      push   %esi
      fc1:       89 c1                   mov    %eax,%ecx
      fc3:       53                      push   %ebx
      fc4:       8d 98 a0 00 00 00       lea    0xa0(%eax),%ebx
      fca:       39 98 a0 00 00 00       cmp    %ebx,0xa0(%eax)
      fd0:       89 da                   mov    %ebx,%edx
      fd2:       75 22                   jne    ff6 <pfifo_fast_dequeue+0x36>
      fd4:       8d 90 c4 00 00 00       lea    0xc4(%eax),%edx
      fda:       39 90 c4 00 00 00       cmp    %edx,0xc4(%eax)
      fe0:       89 d3                   mov    %edx,%ebx
      fe2:       75 12                   jne    ff6 <pfifo_fast_dequeue+0x36>
      fe4:       8d 98 e8 00 00 00       lea    0xe8(%eax),%ebx
      fea:       31 f6                   xor    %esi,%esi
      fec:       39 98 e8 00 00 00       cmp    %ebx,0xe8(%eax)
      ff2:       89 da                   mov    %ebx,%edx
      ff4:       74 27                   je     101d <pfifo_fast_dequeue+0x5d>
      ff6:       8b 32                   mov    (%edx),%esi
      ff8:       39 d6                   cmp    %edx,%esi
      ffa:       74 26                   je     1022 <pfifo_fast_dequeue+0x62>
      ffc:       8b 06                   mov    (%esi),%eax
      ffe:       ff 4b 08                decl   0x8(%ebx)
     1001:       c7 46 08 00 00 00 00    movl   $0x0,0x8(%esi)
     1008:       89 50 04                mov    %edx,0x4(%eax)
     100b:       89 02                   mov    %eax,(%edx)
     100d:       c7 46 04 00 00 00 00    movl   $0x0,0x4(%esi)
     1014:       c7 06 00 00 00 00       movl   $0x0,(%esi)
     101a:       ff 49 28                decl   0x28(%ecx)
     101d:       5b                      pop    %ebx
     101e:       89 f0                   mov    %esi,%eax
     1020:       5e                      pop    %esi
     1021:       c3                      ret
     1022:       ff 49 28                decl   0x28(%ecx)
     1025:       31 f6                   xor    %esi,%esi
     1027:       eb f4                   jmp    101d <pfifo_fast_dequeue+0x5d>


> 
> I just noticed that this is a local modification of my own, so in
> the vanilla tree it indeed doesn't have any impact on the code
> generated.
> 
> Still, your patch does not make sense to me. The latest tree
> also includes my pfifo_fast changes which modified the code to
> maintain a backlog and made it easy to add more fifos at compile
> time.  If you want the loop unrolled then let the compiler do it
> via -funroll-loops. This kind of optimization seems as unnecessary
> to me as all the loopback optimizations.
> 

I don't want to change compiler flags in my tree and lose this optimization when 2.6.13 is released.

I don't know about the loopback optimizations; I am not involved with that stuff. Maybe you are thinking of someone else?

It seems to me you are giving unrelated arguments.
I don't know what your plans are, but mine were not to say you are writing bad code,
just to give my performance analysis and feedback. I'm sorry if it hurts you.


Eric Dumazet


* Re: [TG3]: About hw coalescing infrastructure.
  2005-07-04 23:00                 ` [TG3]: About hw coalescing infrastructure David S. Miller
@ 2005-07-05 16:14                   ` Eric Dumazet
  0 siblings, 0 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 16:14 UTC (permalink / raw)
  To: David S. Miller; +Cc: mchan, netdev

David S. Miller wrote:

> SMP system?  What platform and 570x chip revision?
> 

Dual Opteron 248s,
Ethernet controller: Broadcom Corporation NetXtreme BCM5702 Gigabit Ethernet (rev 02)

> Does the stock driver in 2.6.12 hang as well?

The stock driver in 2.6.11 or 2.6.12 causes hangs (crashes?) too, but no kernel messages are logged to disk.
I think it is crashing in cache_grow() (slab code), but I am not sure because I don't have access to the console.

The 2.6.13-rc1 tg3 driver seems to survive (running for 18 hours now), and the CPU used by network activity
was cut down :)

rx-usecs: 500
rx-frames: 30
rx-usecs-irq: 500
rx-frames-irq: 20

tx-usecs: 490
tx-frames: 53
tx-usecs-irq: 490
tx-frames-irq: 5

About 1200 IRQs/second now, with 30000 received packets/s and 25000 transmitted packets/s.
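
(For scale, that is roughly (30000 + 25000) / 1200 ≈ 46 packets handled per interrupt, where an unmitigated NIC would take one interrupt per packet.)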

Thank you

Eric


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 15:58                             ` Eric Dumazet
@ 2005-07-05 17:34                               ` Thomas Graf
  2005-07-05 21:22                                 ` David S. Miller
  0 siblings, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 17:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CAAE2F.5070807@cosmosbay.com> 2005-07-05 17:58
> I'm using vanilla 2.6.12: no -funroll-loops in it. Maybe in your tree, but
> not in 99.9% of 2.6.12 trees.
> 
> Are you suggesting everybody should use this compiler flag?

> I don't know about the loopback optimizations; I am not involved with that
> stuff. Maybe you are thinking of someone else?
> 
> It seems to me you are giving unrelated arguments.

Do as you wish, I don't feel like arguing about micro-optimizations.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 17:34                               ` Thomas Graf
@ 2005-07-05 21:22                                 ` David S. Miller
  2005-07-05 21:33                                   ` Thomas Graf
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-05 21:22 UTC (permalink / raw)
  To: tgraf; +Cc: dada1, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 5 Jul 2005 19:34:11 +0200

> Do as you wish, I don't feel like arguing about micro-optimizations.

I bet the performance gain really comes from the mispredicted
branches in the loop.

A loop of fixed duration, say 5 or 6 iterations or less, totally
defeats the branch prediction logic in most processors.  By the
time the chip moves the I-cache branch state to "likely", the
loop has ended and we eat a mispredict.
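
As a toy illustration (a hypothetical sketch, not kernel code — struct band and both functions are made up for this example):

/* scan_loop() ends with the branch pattern taken/taken/not-taken,
 * which a simple predictor keeps mispredicting on such a short,
 * fixed trip count; scan_unrolled() turns the loop branch into
 * three independent, individually predictable branches. */
struct band { int len; };

static int scan_loop(const struct band *b)
{
	int i;

	for (i = 0; i < 3; i++)
		if (b[i].len)
			return i;
	return -1;
}

static int scan_unrolled(const struct band *b)
{
	if (b[0].len)
		return 0;
	if (b[1].len)
		return 1;
	if (b[2].len)
		return 2;
	return -1;
}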

I think the original patch is OK, hand unrolling the loop in
the C code.  Adding -funroll-loops to the CFLAGS has lots of
implications, and in particular the embedded folks might not
be happy with some things that result from that.

So I'll apply the original unrolling patch for now.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05  7:38                     ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
  2005-07-05 11:51                       ` Thomas Graf
@ 2005-07-05 21:26                       ` David S. Miller
  2005-07-28 15:52                       ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
  2 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-05 21:26 UTC (permalink / raw)
  To: dada1; +Cc: netdev


Eric, I've told you this before many times.  Please do something
so that your email client does not corrupt the patches.  Once again,
your email client turned all the tab characters into spaces and
thus made the patch unusable.

Even though you used an attachment, the tab-->space transformation
still happened somehow.

Please fix this, and make a serious mental note to prevent this
somehow in the future, thanks a lot.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 21:22                                 ` David S. Miller
@ 2005-07-05 21:33                                   ` Thomas Graf
  2005-07-05 21:35                                     ` David S. Miller
  0 siblings, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 21:33 UTC (permalink / raw)
  To: David S. Miller; +Cc: dada1, netdev

* David S. Miller <20050705.142210.14973612.davem@davemloft.net> 2005-07-05 14:22
> So I'll apply the original unrolling patch for now.

The patch must be changed to use __qdisc_dequeue_head() instead of
__skb_dequeue() or we screw up the backlog.
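
For context, __qdisc_dequeue_head() does roughly the following on top of a bare __skb_dequeue() — a sketch from memory, not a verbatim copy of the header:

static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch,
						   struct sk_buff_head *list)
{
	struct sk_buff *skb = __skb_dequeue(list);

	/* keep the qdisc's queued-bytes counter in sync; this is the
	 * backlog accounting that a bare __skb_dequeue() would skip */
	if (likely(skb != NULL))
		sch->qstats.backlog -= skb->len;

	return skb;
}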


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 21:33                                   ` Thomas Graf
@ 2005-07-05 21:35                                     ` David S. Miller
  2005-07-05 23:16                                       ` Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-05 21:35 UTC (permalink / raw)
  To: tgraf; +Cc: dada1, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 5 Jul 2005 23:33:55 +0200

> * David S. Miller <20050705.142210.14973612.davem@davemloft.net> 2005-07-05 14:22
> > So I'll apply the original unrolling patch for now.
> 
> The patch must be changed to use __qdisc_dequeue_head() instead of
> __skb_dequeue() or we screw up the backlog.

Ok, good thing the patch didn't apply correctly anyway :)


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 21:35                                     ` David S. Miller
@ 2005-07-05 23:16                                       ` Eric Dumazet
  2005-07-05 23:41                                         ` Thomas Graf
  0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 23:16 UTC (permalink / raw)
  To: David S. Miller, tgraf; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 921 bytes --]

David S. Miller wrote:
> From: Thomas Graf <tgraf@suug.ch>
> Date: Tue, 5 Jul 2005 23:33:55 +0200
> 
> 
>>* David S. Miller <20050705.142210.14973612.davem@davemloft.net> 2005-07-05 14:22
>>
>>>So I'll apply the original unrolling patch for now.
>>
>>The patch must be changed to use __qdisc_dequeue_head() instead of
>>__skb_dequeue() or we screw up the backlog.
> 
> 
> Ok, good thing the patch didn't apply correctly anyways :)
> 
> 
Oh well, I was unaware of the latest changes in 2.6.13-rc1 :(

Given that the PFIFO_FAST_BANDS macro was introduced, I wonder whether the patch should be this one or not...
Should we assume PFIFO_FAST_BANDS will stay at 3, or what?

[NET]: unroll a small loop in pfifo_fast_dequeue(). Compiler generates better code.
     oprofile says this function now uses 0.29% instead of 1.22%, on an x86_64 target.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


[-- Attachment #2: patch.sch_generic --]
[-- Type: text/plain, Size: 1015 bytes --]

--- linux-2.6.13-rc1/net/sched/sch_generic.c	2005-07-06 00:46:53.000000000 +0200
+++ linux-2.6.13-rc1-ed/net/sched/sch_generic.c	2005-07-06 01:05:04.000000000 +0200
@@ -328,18 +328,31 @@
 
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
 {
-	int prio;
 	struct sk_buff_head *list = qdisc_priv(qdisc);
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
-		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
-		if (skb) {
-			qdisc->q.qlen--;
-			return skb;
+#if PFIFO_FAST_BANDS == 3
+	for (;;) {
+		if (!skb_queue_empty(list))
+			break;
+		list++;
+		if (!skb_queue_empty(list))
+			break;
+		list++;
+		if (!skb_queue_empty(list))
+			break;
+		return NULL;
 		}
-	}
-
-	return NULL;
+#else
+	int prio;
+	for (prio = 0;; list++) {
+		if (!skb_queue_empty(list))
+			break;
+		if (++prio == PFIFO_FAST_BANDS)
+			return NULL;
+		}
+#endif
+	qdisc->q.qlen--;
+	return __qdisc_dequeue_head(qdisc, list);
 }
 
 static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 23:16                                       ` Eric Dumazet
@ 2005-07-05 23:41                                         ` Thomas Graf
  2005-07-05 23:45                                           ` David S. Miller
  2005-07-06  0:32                                           ` Eric Dumazet
  0 siblings, 2 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 23:41 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CB14B2.5090601@cosmosbay.com> 2005-07-06 01:16
> Oh well, I was unaware of the latest changes in 2.6.13-rc1 :(

Ok, this clarifies a lot for me, I was under the impression
you knew about these changes.

> Given the fact that the PFIFO_FAST_BANDS macro was introduced, I wonder if 
> the patch should be this one or not...
> Should we assume PFIFO_FAST_BANDS will stay at 3 or what ?

It is very unlikely to change within mainline, but the idea behind
it is to allow it to be changed at compile time.

I still think we can fix this performance issue without manually
unrolling the loop, or we should at least try to. In the end gcc
should notice the constant part of the loop and move it out, so
basically the only difference should be the additional prio++ and
possibly a failing branch prediction.

What about this? I'm still not sure where exactly all the time
is lost so this is a shot in the dark.

Index: net-2.6/net/sched/sch_generic.c
===================================================================
--- net-2.6.orig/net/sched/sch_generic.c
+++ net-2.6/net/sched/sch_generic.c
@@ -330,10 +330,11 @@ static struct sk_buff *pfifo_fast_dequeu
 {
 	int prio;
 	struct sk_buff_head *list = qdisc_priv(qdisc);
+	struct sk_buff *skb;
 
-	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
-		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
-		if (skb) {
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+		if (!skb_queue_empty(list + prio)) {
+			skb = __qdisc_dequeue_head(qdisc, list);
 			qdisc->q.qlen--;
 			return skb;
 		}


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 23:41                                         ` Thomas Graf
@ 2005-07-05 23:45                                           ` David S. Miller
  2005-07-05 23:55                                             ` Thomas Graf
  2005-07-06  0:32                                           ` Eric Dumazet
  1 sibling, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-05 23:45 UTC (permalink / raw)
  To: tgraf; +Cc: dada1, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 6 Jul 2005 01:41:04 +0200

> I still think we can fix this performance issue without manually
> unrolling the loop, or we should at least try to. In the end gcc
> should notice the constant part of the loop and move it out, so
> basically the only difference should be the additional prio++ and
> possibly a failing branch prediction.

But the branch prediction is where I personally think a lot
of the lossage is coming from.  These can cost upwards of 20
or 30 processor cycles, easily.  That's getting close to the
cost of a L2 cache miss.

I see the difficulties with this change now, why don't we revisit
this some time in the future?


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 23:45                                           ` David S. Miller
@ 2005-07-05 23:55                                             ` Thomas Graf
  0 siblings, 0 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 23:55 UTC (permalink / raw)
  To: David S. Miller; +Cc: dada1, netdev

* David S. Miller <20050705.164503.104035718.davem@davemloft.net> 2005-07-05 16:45
> From: Thomas Graf <tgraf@suug.ch>
> Date: Wed, 6 Jul 2005 01:41:04 +0200
> 
> > I still think we can fix this performance issue without manually
> > unrolling the loop, or we should at least try to. In the end gcc
> > should notice the constant part of the loop and move it out, so
> > basically the only difference should be the additional prio++ and
> > possibly a failing branch prediction.
> 
> But the branch prediction is where I personally think a lot
> of the lossage is coming from.  These can cost upwards of 20
> or 30 processor cycles, easily.  That's getting close to the
> cost of a L2 cache miss.

Absolutely. I think what happens is that we produce prediction
failures due to the logic within qdisc_dequeue_head(), though I
cannot back this up with numbers.

> I see the difficulties with this change now, why don't we revisit
> this some time in the future?

Fine with me.

Eric, the patch I just posted should result in the same branch
prediction as your loop unrolling. The only additional overhead
we still have is the list + prio thing and an additional conditional
jump to do the loop. If you have the cycles etc. it would be nice
to compare it with your numbers.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-05 23:41                                         ` Thomas Graf
  2005-07-05 23:45                                           ` David S. Miller
@ 2005-07-06  0:32                                           ` Eric Dumazet
  2005-07-06  0:51                                             ` Thomas Graf
  2005-07-06  0:53                                             ` Eric Dumazet
  1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06  0:32 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David S. Miller, netdev

Thomas Graf wrote:

> I still think we can fix this performance issue without manually
> unrolling the loop, or we should at least try to. In the end gcc
> should notice the constant part of the loop and move it out, so
> basically the only difference should be the additional prio++ and
> possibly a failing branch prediction.
> 
> What about this? I'm still not sure where exactly all the time
> is lost so this is a shot in the dark.
> 
> Index: net-2.6/net/sched/sch_generic.c
> ===================================================================
> --- net-2.6.orig/net/sched/sch_generic.c
> +++ net-2.6/net/sched/sch_generic.c
> @@ -330,10 +330,11 @@ static struct sk_buff *pfifo_fast_dequeu
>  {
>  	int prio;
>  	struct sk_buff_head *list = qdisc_priv(qdisc);
> +	struct sk_buff *skb;
>  
> -	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
> -		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
> -		if (skb) {
> +	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
> +		if (!skb_queue_empty(list + prio)) {
> +			skb = __qdisc_dequeue_head(qdisc, list);
>  			qdisc->q.qlen--;
>  			return skb;
>  		}
> 
> 

Hum... shouldn't it be:

+			skb = __qdisc_dequeue_head(qdisc, list + prio);


?

Anyway, the branch mispredictions come from the fact that most packets are queued in the prio=2 list.

So each time this function is called, a non-unrolled version has to pay 2 to 5 branch mispredictions.

if (!skb_queue_empty(list + prio))  /* branch not taken, mispredict when prio=0 */
if (!skb_queue_empty(list + prio))  /* branch not taken, mispredict when prio=1 */
if (!skb_queue_empty(list + prio))  /* branch taken (or not if queue is really empty), mispredict when prio=2 */

Maybe we can rewrite the whole thing without branches, examining prio from PFIFO_FAST_BANDS-1 down to 0, at least for modern CPUs with
conditional moves (cmov)

struct sk_buff_head *best = NULL;
struct sk_buff_head *list = qdisc_priv(qdisc) + PFIFO_FAST_BANDS - 1;
if (!skb_queue_empty(list)) best = list;
list--;
if (!skb_queue_empty(list)) best = list;
list--;
if (!skb_queue_empty(list)) best = list;
if (best != NULL) {
	qdisc->q.qlen--;
	return __qdisc_dequeue_head(qdisc, best);
}

This version should have one branch.
I will test this after some sleep :)
See you
Eric


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06  0:32                                           ` Eric Dumazet
@ 2005-07-06  0:51                                             ` Thomas Graf
  2005-07-06  1:04                                               ` Eric Dumazet
  2005-07-06  0:53                                             ` Eric Dumazet
  1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-06  0:51 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CB2698.2080904@cosmosbay.com> 2005-07-06 02:32
> Hum... shouldn't it be:
> 
> +			skb = __qdisc_dequeue_head(qdisc, list + prio);
> 

Correct.


> Anyway, the branch mispredictions come from the fact that most packets 
> are queued in the prio=2 list.
> 
> So each time this function is called, a non-unrolled version has to pay 2 
> to 5 branch mispredictions.
> 
> if (!skb_queue_empty(list + prio))  /* branch not taken, mispredict when 
> prio=0 */

The !expr implies an unlikely, so the prediction should be right and
equal to your unrolled version.
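
(For reference, the annotations in question are the kernel's branch-prediction macros from include/linux/compiler.h, which expand to gcc's __builtin_expect():)

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

gcc then lays out the expected path as the straight-line fall-through code.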

> Maybe we can rewrite the whole thing without branches, examining prio from 
> PFIFO_FAST_BANDS-1 down to 0, at least for modern CPUs with conditional 
> moves (cmov)

This would break the whole thing; the qdisc is supposed to try to
dequeue from the highest-priority queue (prio=0) first.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06  0:32                                           ` Eric Dumazet
  2005-07-06  0:51                                             ` Thomas Graf
@ 2005-07-06  0:53                                             ` Eric Dumazet
  2005-07-06  1:02                                               ` Thomas Graf
  2005-07-06 12:42                                               ` Thomas Graf
  1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06  0:53 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Thomas Graf, David S. Miller, netdev

Eric Dumazet wrote:
>
> 
> Maybe we can rewrite the whole thing without branches, examining prio 
> from PFIFO_FAST_BANDS-1 down to 0, at least for modern CPUs with 
> conditional moves (cmov)
> 
> struct sk_buff_head *best = NULL;
> struct sk_buff_head *list = qdisc_priv(qdisc) + PFIFO_FAST_BANDS - 1;
> if (!skb_queue_empty(list)) best = list;
> list--;
> if (!skb_queue_empty(list)) best = list;
> list--;
> if (!skb_queue_empty(list)) best = list;
> if (best != NULL) {
>     qdisc->q.qlen--;
>     return __qdisc_dequeue_head(qdisc, best);
> }
> 
> This version should have one branch.
> I will test this after some sleep :)
> See you
> Eric
> 
> 

(Sorry, still using 2.6.12, but the idea remains)

static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	struct sk_buff_head *best = NULL;

	list += 2;
	if (!skb_queue_empty(list))
		best = list;
	list--;
	if (!skb_queue_empty(list))
		best = list;
	list--;
	if (!skb_queue_empty(list))
		best = list;
	if (best) {
		qdisc->q.qlen--;
		return __skb_dequeue(best);
	}
	return NULL;
}



At least the compiler output seems promising:

0000000000000550 <pfifo_fast_dequeue>:
  550:   48 8d 97 f0 00 00 00    lea    0xf0(%rdi),%rdx
  557:   31 c9                   xor    %ecx,%ecx
  559:   48 8d 87 c0 00 00 00    lea    0xc0(%rdi),%rax
  560:   48 39 97 f0 00 00 00    cmp    %rdx,0xf0(%rdi)
  567:   48 0f 45 ca             cmovne %rdx,%rcx
  56b:   48 8d 97 d8 00 00 00    lea    0xd8(%rdi),%rdx
  572:   48 39 97 d8 00 00 00    cmp    %rdx,0xd8(%rdi)
  579:   48 0f 45 ca             cmovne %rdx,%rcx
  57d:   48 39 87 c0 00 00 00    cmp    %rax,0xc0(%rdi)
  584:   48 0f 45 c8             cmovne %rax,%rcx
  588:   31 c0                   xor    %eax,%eax
  58a:   48 85 c9                test   %rcx,%rcx
  58d:   74 32                   je     5c1 <pfifo_fast_dequeue+0x71> // one conditional branch
  58f:   ff 4f 40                decl   0x40(%rdi)
  592:   48 8b 11                mov    (%rcx),%rdx
  595:   48 39 ca                cmp    %rcx,%rdx
  598:   74 27                   je     5c1 <pfifo_fast_dequeue+0x71> // never-taken branch: always predicted OK
  59a:   48 89 d0                mov    %rdx,%rax
  59d:   48 8b 12                mov    (%rdx),%rdx
  5a0:   ff 49 10                decl   0x10(%rcx)
  5a3:   48 c7 40 10 00 00 00    movq   $0x0,0x10(%rax)
  5aa:   00
  5ab:   48 89 4a 08             mov    %rcx,0x8(%rdx)
  5af:   48 89 11                mov    %rdx,(%rcx)
  5b2:   48 c7 40 08 00 00 00    movq   $0x0,0x8(%rax)
  5b9:   00
  5ba:   48 c7 00 00 00 00 00    movq   $0x0,(%rax)
  5c1:   90                      nop
  5c2:   c3                      retq

I will post some profiling results tomorrow.
Eric


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06  0:53                                             ` Eric Dumazet
@ 2005-07-06  1:02                                               ` Thomas Graf
  2005-07-06  1:09                                                 ` Eric Dumazet
  2005-07-06 12:42                                               ` Thomas Graf
  1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-06  1:02 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CB2B84.50702@cosmosbay.com> 2005-07-06 02:53
> (Sorry, still using 2.6.12, but the idea remains)

I think you got me wrong; the whole point of this qdisc
is to prioritize, which means that we cannot dequeue from
prio 1 as long as the queue in prio 0 is not empty.

If you have no traffic at all for prio=0 and prio=1 then
the best solution is to replace the qdisc on the device
with a simple fifo.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06  0:51                                             ` Thomas Graf
@ 2005-07-06  1:04                                               ` Eric Dumazet
  2005-07-06  1:07                                                 ` Thomas Graf
  0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06  1:04 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David S. Miller, netdev

Thomas Graf wrote:

>>Maybe we can rewrite the whole thing without branches, examining prio from 
>>PFIFO_FAST_BANDS-1 down to 0, at least for modern CPUs with conditional 
>>moves (cmov)
> 
> 
> This would break the whole thing, the qdisc is supposed to try and
> dequeue from the highest priority queue (prio=0) first.
> 
> 

I still dequeue a packet from the highest-priority queue.

But nothing prevents us from looking at the three queues in reverse order, if that avoids the conditional branches.

No memory penalty, since most of the time we were looking at all three queues anyway, and the three sk_buff_heads are in the same cache line.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06  1:04                                               ` Eric Dumazet
@ 2005-07-06  1:07                                                 ` Thomas Graf
  0 siblings, 0 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-06  1:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CB2E24.6010303@cosmosbay.com> 2005-07-06 03:04
> Thomas Graf wrote:
> 
> >>Maybe we can rewrite the whole thing without branches, examining prio 
> >>from PFIFO_FAST_BANDS-1 down to 0, at least for modern CPUs with 
> >>conditional moves (cmov)
> >
> >
> >This would break the whole thing, the qdisc is supposed to try and
> >dequeue from the highest priority queue (prio=0) first.
> >
> >
> 
> I still dequeue a packet from the highest priority queue.

Ahh... sorry, I misread your patch, interesting idea. I'll
be waiting for your numbers.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06  1:02                                               ` Thomas Graf
@ 2005-07-06  1:09                                                 ` Eric Dumazet
  0 siblings, 0 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06  1:09 UTC (permalink / raw)
  To: Thomas Graf; +Cc: David S. Miller, netdev

Thomas Graf wrote:
> I think you got me wrong, the whole point of this qdisc
> is to prioritize which means that we cannot dequeue from
> prio 1 as long as the queue in prio 0 is not empty. 

if prio 0 is not empty, then the last

if (!skb_queue_empty(list))
      best = list;

will set 'best' to the prio 0 list, and we dequeue the packet from the prio 0 list, not from prio 1 or prio 2.

> 
> If you have no traffic at all for prio=0 and prio=1 then
> the best solution is to replace the qdisc on the device
> with a simple fifo.

Yes, sure, but I know that already. Unfortunately I have some traffic on prio=1 and prio=0 (about 5%).

Thank you
Eric


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06  0:53                                             ` Eric Dumazet
  2005-07-06  1:02                                               ` Thomas Graf
@ 2005-07-06 12:42                                               ` Thomas Graf
  2005-07-07 21:17                                                 ` David S. Miller
  1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-06 12:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev

* Eric Dumazet <42CB2B84.50702@cosmosbay.com> 2005-07-06 02:53

A short recap after some coffee and sleep:

The initial issue you brought up, and backed up with numbers,
is probably caused by multiple wrong branch predictions due
to the fact that I wrote skb = dequeue(); if (skb), which was
assumed to be likely by the compiler. In your patch you fixed
this wrong prediction with !skb_queue_empty(), which also acts
as a little optimization because skb_queue_empty() is really
simple for the compiler to implement. The patch I posted should
give almost the same result: aside from the additional wrong
branch prediction for the loop, it always has one wrong
prediction, namely when we hit a non-empty queue.
In your unrolled version you could optimize even more by
taking advantage of the fact that prio=2 is the most likely
non-empty queue: change that check to likely() and save a wrong
branch prediction in the common case, at the cost of a branch
misprediction if all queues are empty.

The patch I posted results in something like this:

pfifo_fast_dequeue:
	pushl	%ebx
	xorl	%ecx, %ecx
	movl	8(%esp), %ebx
	leal	128(%ebx), %edx
.L129:
	movl	(%edx), %eax
	cmpl	%edx, %eax
	jne	.L132		; if (!skb_queue_empty())
	incl	%ecx
	addl	$20, %edx
	cmpl	$2, %ecx
	jle	.L129		; end of loop
	xorl	%eax, %eax	; all queues empty
.L117:
	popl	%ebx
	ret

I regard the miss here as acceptable for the increased flexibility
we get. It can be optimized with loop unrolling, but my opinion
is to try to avoid that if possible.

Now your second thought is quite interesting, although it heavily
depends on the fact that prio=2 is the most often used band. It
will be interesting to see some numbers.

> static struct sk_buff *
> pfifo_fast_dequeue(struct Qdisc* qdisc)
> {
>         struct sk_buff_head *list = qdisc_priv(qdisc);
>         struct sk_buff_head *best = NULL;
> 
> 	list += 2;
>         if (!skb_queue_empty(list))
>                 best = list;
>         list--;
>         if (!skb_queue_empty(list))
>                 best = list;
>         list--;
>         if (!skb_queue_empty(list))
>                 best = list;

Here is what I mean; a likely() should be even better (see the sketch after the quoted code).

>         if (best) {
>                 qdisc->q.qlen--;
>                 return __skb_dequeue(best);
>                 }
>         return NULL;
> }
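
Concretely, the suggestion amounts to something like this for the prio=2 check — a sketch, not a tested patch:

	list += 2;
	/* prio=2 carries most of the traffic in this workload, so
	 * mark non-empty as the expected case */
	if (likely(!skb_queue_empty(list)))
		best = list;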


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-06 12:42                                               ` Thomas Graf
@ 2005-07-07 21:17                                                 ` David S. Miller
  2005-07-07 21:34                                                   ` Thomas Graf
       [not found]                                                   ` <42CE22CE.7030902@cosmosbay.com>
  0 siblings, 2 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-07 21:17 UTC (permalink / raw)
  To: tgraf; +Cc: dada1, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 6 Jul 2005 14:42:06 +0200

> The initial issue you brought up, and backed up with numbers,
> is probably caused by multiple wrong branch predictions due
> to the fact that I wrote skb = dequeue(); if (skb), which was
> assumed to be likely by the compiler. In your patch you fixed
> this wrong prediction with !skb_queue_empty(), which also acts
> as a little optimization because skb_queue_empty() is really
> simple for the compiler to implement.

As an aside, this reminds me that as part of my quest to make
sk_buff smaller, I intend to walk across the tree and change
all tests of the form:

	if (!skb_queue_len(list))

into:

	if (skb_queue_empty(list))

It would be really nice, after the above transformation and some
others, to get rid of sk_buff_head->qlen.  Why?  Because that
also allows us to remove the skb->list member, as its only
reason for existing is so that the SKB queue removal routines
can decrement the queue length.

That's kind of silly, and most SKB lists in the kernel do not care
about the queue length at all.  Rather, they care about empty and
non-empty.  The cases that do care (mostly packet schedulers) can
keep track of the queue length themselves in their private data
structures.  When they remove packets, they _know_ which queue to
decrement the queue length of.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-07 21:17                                                 ` David S. Miller
@ 2005-07-07 21:34                                                   ` Thomas Graf
  2005-07-07 22:24                                                     ` David S. Miller
       [not found]                                                   ` <42CE22CE.7030902@cosmosbay.com>
  1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-07 21:34 UTC (permalink / raw)
  To: David S. Miller; +Cc: dada1, netdev

* David S. Miller <20050707.141718.85410359.davem@davemloft.net> 2005-07-07 14:17
> As an aside, this reminds me that as part of my quest to make
> sk_buff smaller, I intend to walk across the tree and change
> all tests of the form:
> 
> 	if (!skb_queue_len(list))
> 
> into:
> 
> 	if (skb_queue_empty(list))
> 
> [...]
> 
> The cases that do care (mostly packet schedulers) can
> keep track of the queue length themselves in their private data
> structures.  When they remove packets, they _know_ which queue to
> decrement the queue length of.

Since I'm changing the classful qdiscs to use a generic API
for queue management anyway, I could take care of this if you want.
WRT the leaf qdiscs it's a bit more complicated, since we have
to change the new API to take a new struct which includes the qlen
and the sk_buff_head, but that's not a problem either.
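
A minimal sketch of the shape I have in mind (hypothetical naming; nothing like this exists in the tree yet):

/* The length accounting moves out of sk_buff_head and next to the
 * list that the qdisc actually owns and updates. */
struct qdisc_skb_queue {
	struct sk_buff_head	list;	/* head/tail linkage only */
	__u32			qlen;	/* maintained by the qdisc */
};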


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-07 21:34                                                   ` Thomas Graf
@ 2005-07-07 22:24                                                     ` David S. Miller
  0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-07 22:24 UTC (permalink / raw)
  To: tgraf; +Cc: dada1, netdev

From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 7 Jul 2005 23:34:50 +0200

> Since I'm changing the classful qdiscs to use a generic API
> for queue management anyway I could take care of this if you want.
> WRT the leaf qdiscs it's a bit more complicated since we have
> to change the new API to take a new struct which includes the qlen
> and the sk_buff_head but not a problem either.

Ok.  I'm going to check something like the following into
my tree.  It takes care of the obvious cases of direct
binary test of queue length being zero vs. non-zero.

This uncovered some seriously questionable stuff along the way.

For example, take a look at drivers/usb/net/usbnet.c:usbnet_stop().
That code seems to want to wait until all the SKB queues are empty,
but the way it is coded it only waits if all the queues have at least
one packet.

I preserved the behavior there, but if someone could verify my
analysis and post a bug fix, I'd really appreciate it.
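
For what it's worth, the presumed intent would read something like this — an untested sketch, not part of the patch below:

	/* Presumed intent: keep waiting while ANY of the queues still
	 * holds an entry, i.e. || where the current code has &&. */
	while (!skb_queue_empty(&dev->rxq) ||
	       !skb_queue_empty(&dev->txq) ||
	       !skb_queue_empty(&dev->done)) {
		msleep(UNLINK_TIMEOUT_MS);
		if (netif_msg_ifdown (dev))
			devdbg (dev, "waited for %d urb completions", temp);
	}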

Thanks.

diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -120,7 +120,7 @@ static unsigned int hci_vhci_chr_poll(st
 
 	poll_wait(file, &hci_vhci->read_wait, wait);
  
-	if (skb_queue_len(&hci_vhci->readq))
+	if (!skb_queue_empty(&hci_vhci->readq))
 		return POLLIN | POLLRDNORM;
 
 	return POLLOUT | POLLWRNORM;
diff --git a/drivers/isdn/hisax/isdnl1.c b/drivers/isdn/hisax/isdnl1.c
--- a/drivers/isdn/hisax/isdnl1.c
+++ b/drivers/isdn/hisax/isdnl1.c
@@ -279,7 +279,8 @@ BChannel_proc_xmt(struct BCState *bcs)
 	if (test_and_clear_bit(FLG_L1_PULL_REQ, &st->l1.Flags))
 		st->l1.l1l2(st, PH_PULL | CONFIRM, NULL);
 	if (!test_bit(BC_FLG_ACTIV, &bcs->Flag)) {
-		if (!test_bit(BC_FLG_BUSY, &bcs->Flag) && (!skb_queue_len(&bcs->squeue))) {
+		if (!test_bit(BC_FLG_BUSY, &bcs->Flag) &&
+		    skb_queue_empty(&bcs->squeue)) {
 			st->l2.l2l1(st, PH_DEACTIVATE | CONFIRM, NULL);
 		}
 	}
diff --git a/drivers/isdn/hisax/isdnl2.c b/drivers/isdn/hisax/isdnl2.c
--- a/drivers/isdn/hisax/isdnl2.c
+++ b/drivers/isdn/hisax/isdnl2.c
@@ -108,7 +108,8 @@ static int l2addrsize(struct Layer2 *l2)
 static void
 set_peer_busy(struct Layer2 *l2) {
 	test_and_set_bit(FLG_PEER_BUSY, &l2->flag);
-	if (skb_queue_len(&l2->i_queue) || skb_queue_len(&l2->ui_queue))
+	if (!skb_queue_empty(&l2->i_queue) ||
+	    !skb_queue_empty(&l2->ui_queue))
 		test_and_set_bit(FLG_L2BLOCK, &l2->flag);
 }
 
@@ -754,7 +755,7 @@ l2_restart_multi(struct FsmInst *fi, int
 		st->l2.l2l3(st, DL_ESTABLISH | INDICATION, NULL);
 
 	if ((ST_L2_7==state) || (ST_L2_8 == state))
-		if (skb_queue_len(&st->l2.i_queue) && cansend(st))
+		if (!skb_queue_empty(&st->l2.i_queue) && cansend(st))
 			st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
 }
 
@@ -810,7 +811,7 @@ l2_connected(struct FsmInst *fi, int eve
 	if (pr != -1)
 		st->l2.l2l3(st, pr, NULL);
 
-	if (skb_queue_len(&st->l2.i_queue) && cansend(st))
+	if (!skb_queue_empty(&st->l2.i_queue) && cansend(st))
 		st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
 }
 
@@ -1014,7 +1015,7 @@ l2_st7_got_super(struct FsmInst *fi, int
 			if(typ != RR) FsmDelTimer(&st->l2.t203, 9);
 			restart_t200(st, 12);
 		}
-		if (skb_queue_len(&st->l2.i_queue) && (typ == RR))
+		if (!skb_queue_empty(&st->l2.i_queue) && (typ == RR))
 			st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
 	} else
 		nrerrorrecovery(fi);
@@ -1120,7 +1121,7 @@ l2_got_iframe(struct FsmInst *fi, int ev
 		return;
 	}
 
-	if (skb_queue_len(&st->l2.i_queue) && (fi->state == ST_L2_7))
+	if (!skb_queue_empty(&st->l2.i_queue) && (fi->state == ST_L2_7))
 		st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
 	if (test_and_clear_bit(FLG_ACK_PEND, &st->l2.flag))
 		enquiry_cr(st, RR, RSP, 0);
@@ -1138,7 +1139,7 @@ l2_got_tei(struct FsmInst *fi, int event
 		test_and_set_bit(FLG_L3_INIT, &st->l2.flag);
 	} else
 		FsmChangeState(fi, ST_L2_4);
-	if (skb_queue_len(&st->l2.ui_queue))
+	if (!skb_queue_empty(&st->l2.ui_queue))
 		tx_ui(st);
 }
 
@@ -1301,7 +1302,7 @@ l2_pull_iqueue(struct FsmInst *fi, int e
 		FsmDelTimer(&st->l2.t203, 13);
 		FsmAddTimer(&st->l2.t200, st->l2.T200, EV_L2_T200, NULL, 11);
 	}
-	if (skb_queue_len(&l2->i_queue) && cansend(st))
+	if (!skb_queue_empty(&l2->i_queue) && cansend(st))
 		st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
 }
 
@@ -1347,7 +1348,7 @@ l2_st8_got_super(struct FsmInst *fi, int
 			}
 			invoke_retransmission(st, nr);
 			FsmChangeState(fi, ST_L2_7);
-			if (skb_queue_len(&l2->i_queue) && cansend(st))
+			if (!skb_queue_empty(&l2->i_queue) && cansend(st))
 				st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
 		} else
 			nrerrorrecovery(fi);
diff --git a/drivers/isdn/hisax/isdnl3.c b/drivers/isdn/hisax/isdnl3.c
--- a/drivers/isdn/hisax/isdnl3.c
+++ b/drivers/isdn/hisax/isdnl3.c
@@ -302,7 +302,7 @@ release_l3_process(struct l3_process *p)
 				!test_bit(FLG_PTP, &p->st->l2.flag)) {
 				if (p->debug)
 					l3_debug(p->st, "release_l3_process: last process");
-				if (!skb_queue_len(&p->st->l3.squeue)) {
+				if (skb_queue_empty(&p->st->l3.squeue)) {
 					if (p->debug)
 						l3_debug(p->st, "release_l3_process: release link");
 					if (p->st->protocol != ISDN_PTYPE_NI1)
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -1223,7 +1223,7 @@ isdn_tty_write(struct tty_struct *tty, c
 		total += c;
 	}
 	atomic_dec(&info->xmit_lock);
-	if ((info->xmit_count) || (skb_queue_len(&info->xmit_queue))) {
+	if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue)) {
 		if (m->mdmreg[REG_DXMT] & BIT_DXMT) {
 			isdn_tty_senddown(info);
 			isdn_tty_tint(info);
@@ -1284,7 +1284,7 @@ isdn_tty_flush_chars(struct tty_struct *
 
 	if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_chars"))
 		return;
-	if ((info->xmit_count) || (skb_queue_len(&info->xmit_queue)))
+	if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue))
 		isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1);
 }
 
diff --git a/drivers/isdn/icn/icn.c b/drivers/isdn/icn/icn.c
--- a/drivers/isdn/icn/icn.c
+++ b/drivers/isdn/icn/icn.c
@@ -304,12 +304,12 @@ icn_pollbchan_send(int channel, icn_card
 	isdn_ctrl cmd;
 
 	if (!(card->sndcount[channel] || card->xskb[channel] ||
-	      skb_queue_len(&card->spqueue[channel])))
+	      !skb_queue_empty(&card->spqueue[channel])))
 		return;
 	if (icn_trymaplock_channel(card, mch)) {
 		while (sbfree && 
 		       (card->sndcount[channel] ||
-			skb_queue_len(&card->spqueue[channel]) ||
+			!skb_queue_empty(&card->spqueue[channel]) ||
 			card->xskb[channel])) {
 			spin_lock_irqsave(&card->lock, flags);
 			if (card->xmit_lock[channel]) {
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -304,7 +304,7 @@ static inline void scc_discard_buffers(s
 		scc->tx_buff = NULL;
 	}
 	
-	while (skb_queue_len(&scc->tx_queue))
+	while (!skb_queue_empty(&scc->tx_queue))
 		dev_kfree_skb(skb_dequeue(&scc->tx_queue));
 
 	spin_unlock_irqrestore(&scc->lock, flags);
@@ -1126,8 +1126,7 @@ static void t_dwait(unsigned long channe
 	
 	if (scc->stat.tx_state == TXS_WAIT)	/* maxkeyup or idle timeout */
 	{
-		if (skb_queue_len(&scc->tx_queue) == 0)	/* nothing to send */
-		{
+		if (skb_queue_empty(&scc->tx_queue)) {	/* nothing to send */
 			scc->stat.tx_state = TXS_IDLE;
 			netif_wake_queue(scc->dev);	/* t_maxkeyup locked it. */
 			return;
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -364,7 +364,7 @@ ppp_asynctty_receive(struct tty_struct *
 	spin_lock_irqsave(&ap->recv_lock, flags);
 	ppp_async_input(ap, buf, cflags, count);
 	spin_unlock_irqrestore(&ap->recv_lock, flags);
-	if (skb_queue_len(&ap->rqueue))
+	if (!skb_queue_empty(&ap->rqueue))
 		tasklet_schedule(&ap->tsk);
 	ap_put(ap);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1237,8 +1237,8 @@ static int ppp_mp_explode(struct ppp *pp
 		pch = list_entry(list, struct channel, clist);
 		navail += pch->avail = (pch->chan != NULL);
 		if (pch->avail) {
-			if (skb_queue_len(&pch->file.xq) == 0
-			    || !pch->had_frag) {
+			if (skb_queue_empty(&pch->file.xq) ||
+			    !pch->had_frag) {
 				pch->avail = 2;
 				++nfree;
 			}
@@ -1374,8 +1374,8 @@ static int ppp_mp_explode(struct ppp *pp
 
 		/* try to send it down the channel */
 		chan = pch->chan;
-		if (skb_queue_len(&pch->file.xq)
-		    || !chan->ops->start_xmit(chan, frag))
+		if (!skb_queue_empty(&pch->file.xq) ||
+		    !chan->ops->start_xmit(chan, frag))
 			skb_queue_tail(&pch->file.xq, frag);
 		pch->had_frag = 1;
 		p += flen;
@@ -1412,7 +1412,7 @@ ppp_channel_push(struct channel *pch)
 
 	spin_lock_bh(&pch->downl);
 	if (pch->chan != 0) {
-		while (skb_queue_len(&pch->file.xq) > 0) {
+		while (!skb_queue_empty(&pch->file.xq)) {
 			skb = skb_dequeue(&pch->file.xq);
 			if (!pch->chan->ops->start_xmit(pch->chan, skb)) {
 				/* put the packet back and try again later */
@@ -1426,7 +1426,7 @@ ppp_channel_push(struct channel *pch)
 	}
 	spin_unlock_bh(&pch->downl);
 	/* see if there is anything from the attached unit to be sent */
-	if (skb_queue_len(&pch->file.xq) == 0) {
+	if (skb_queue_empty(&pch->file.xq)) {
 		read_lock_bh(&pch->upl);
 		ppp = pch->ppp;
 		if (ppp != 0)
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -406,7 +406,7 @@ ppp_sync_receive(struct tty_struct *tty,
 	spin_lock_irqsave(&ap->recv_lock, flags);
 	ppp_sync_input(ap, buf, cflags, count);
 	spin_unlock_irqrestore(&ap->recv_lock, flags);
-	if (skb_queue_len(&ap->rqueue))
+	if (!skb_queue_empty(&ap->rqueue))
 		tasklet_schedule(&ap->tsk);
 	sp_put(ap);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -215,7 +215,7 @@ static unsigned int tun_chr_poll(struct 
 
 	poll_wait(file, &tun->read_wait, wait);
  
-	if (skb_queue_len(&tun->readq))
+	if (!skb_queue_empty(&tun->readq))
 		mask |= POLLIN | POLLRDNORM;
 
 	return mask;
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2374,7 +2374,7 @@ void stop_airo_card( struct net_device *
 	/*
 	 * Clean out tx queue
 	 */
-	if (test_bit(FLAG_MPI, &ai->flags) && skb_queue_len (&ai->txq) > 0) {
+	if (test_bit(FLAG_MPI, &ai->flags) && !skb_queue_empty(&ai->txq)) {
 		struct sk_buff *skb = NULL;
 		for (;(skb = skb_dequeue(&ai->txq));)
 			dev_kfree_skb(skb);
@@ -3287,7 +3287,7 @@ exitrx:
 				if (status & EV_TXEXC)
 					get_tx_error(apriv, -1);
 				spin_lock_irqsave(&apriv->aux_lock, flags);
-				if (skb_queue_len (&apriv->txq)) {
+				if (!skb_queue_empty(&apriv->txq)) {
 					spin_unlock_irqrestore(&apriv->aux_lock,flags);
 					mpi_send_packet (dev);
 				} else {
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c
--- a/drivers/s390/net/claw.c
+++ b/drivers/s390/net/claw.c
@@ -428,7 +428,7 @@ claw_pack_skb(struct claw_privbk *privpt
 	new_skb = NULL;		/* assume no dice */
 	pkt_cnt = 0;
 	CLAW_DBF_TEXT(4,trace,"PackSKBe");
-	if (skb_queue_len(&p_ch->collect_queue) > 0) {
+	if (!skb_queue_empty(&p_ch->collect_queue)) {
 	/* some data */
 		held_skb = skb_dequeue(&p_ch->collect_queue);
 		if (p_env->packing != DO_PACKED)
@@ -1254,7 +1254,7 @@ claw_write_next ( struct chbk * p_ch )
 	privptr = (struct claw_privbk *) dev->priv;
         claw_free_wrt_buf( dev );
 	if ((privptr->write_free_count > 0) &&
-	    (skb_queue_len(&p_ch->collect_queue) > 0)) {
+	    !skb_queue_empty(&p_ch->collect_queue)) {
 	  	pk_skb = claw_pack_skb(privptr);
 		while (pk_skb != NULL) {
 			rc = claw_hw_tx( pk_skb, dev,1);
diff --git a/drivers/s390/net/ctctty.c b/drivers/s390/net/ctctty.c
--- a/drivers/s390/net/ctctty.c
+++ b/drivers/s390/net/ctctty.c
@@ -156,7 +156,7 @@ ctc_tty_readmodem(ctc_tty_info *info)
 					skb_queue_head(&info->rx_queue, skb);
 				else {
 					kfree_skb(skb);
-					ret = skb_queue_len(&info->rx_queue);
+					ret = !skb_queue_empty(&info->rx_queue);
 				}
 			}
 		}
@@ -530,7 +530,7 @@ ctc_tty_write(struct tty_struct *tty, co
 		total += c;
 		count -= c;
 	}
-	if (skb_queue_len(&info->tx_queue)) {
+	if (!skb_queue_empty(&info->tx_queue)) {
 		info->lsr &= ~UART_LSR_TEMT;
 		tasklet_schedule(&info->tasklet);
 	}
@@ -594,7 +594,7 @@ ctc_tty_flush_chars(struct tty_struct *t
 		return;
 	if (ctc_tty_paranoia_check(info, tty->name, "ctc_tty_flush_chars"))
 		return;
-	if (tty->stopped || tty->hw_stopped || (!skb_queue_len(&info->tx_queue)))
+	if (tty->stopped || tty->hw_stopped || skb_queue_empty(&info->tx_queue))
 		return;
 	tasklet_schedule(&info->tasklet);
 }
diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c
--- a/drivers/usb/net/usbnet.c
+++ b/drivers/usb/net/usbnet.c
@@ -3227,9 +3227,9 @@ static int usbnet_stop (struct net_devic
 	temp = unlink_urbs (dev, &dev->txq) + unlink_urbs (dev, &dev->rxq);
 
 	// maybe wait for deletions to finish.
-	while (skb_queue_len (&dev->rxq)
-			&& skb_queue_len (&dev->txq)
-			&& skb_queue_len (&dev->done)) {
+	while (!skb_queue_empty(&dev->rxq) &&
+	       !skb_queue_empty(&dev->txq) &&
+	       !skb_queue_empty(&dev->done)) {
 		msleep(UNLINK_TIMEOUT_MS);
 		if (netif_msg_ifdown (dev))
 			devdbg (dev, "waited for %d urb completions", temp);
diff --git a/include/net/irda/irda_device.h b/include/net/irda/irda_device.h
--- a/include/net/irda/irda_device.h
+++ b/include/net/irda/irda_device.h
@@ -224,7 +224,7 @@ int  irda_device_is_receiving(struct net
 /* Interface for internal use */
 static inline int irda_device_txqueue_empty(const struct net_device *dev)
 {
-	return (skb_queue_len(&dev->qdisc->q) == 0);
+	return skb_queue_empty(&dev->qdisc->q);
 }
 int  irda_device_set_raw_mode(struct net_device* self, int status);
 struct net_device *alloc_irdadev(int sizeof_priv);
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -991,7 +991,7 @@ static __inline__ void tcp_fast_path_on(
 
 static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp)
 {
-	if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
+	if (skb_queue_empty(&tp->out_of_order_queue) &&
 	    tp->rcv_wnd &&
 	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
 	    !tp->urg_data)
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_s
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int cmtp_process_transmit(struct cmtp_session *session)
+static void cmtp_process_transmit(struct cmtp_session *session)
 {
 	struct sk_buff *skb, *nskb;
 	unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct 
 
 	if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
 		BT_ERR("Can't allocate memory for new frame");
-		return -ENOMEM;
+		return;
 	}
 
 	while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct 
 	cmtp_send_frame(session, nskb->data, nskb->len);
 
 	kfree_skb(nskb);
-
-	return skb_queue_len(&session->transmit);
 }
 
 static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int hidp_process_transmit(struct hidp_session *session)
+static void hidp_process_transmit(struct hidp_session *session)
 {
 	struct sk_buff *skb;
 
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct 
 		hidp_set_timer(session);
 		kfree_skb(skb);
 	}
-
-	return skb_queue_len(&session->ctrl_transmit) +
-				skb_queue_len(&session->intr_transmit);
 }
 
 static int hidp_session(void *arg)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
-				signal_pending(current) || !timeo)
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    sk->sk_err ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current) ||
+		    !timeo)
 			break;
 
 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(st
 
 	BT_DBG("tty %p dev %p", tty, dev);
 
-	if (skb_queue_len(&dlc->tx_queue))
+	if (!skb_queue_empty(&dlc->tx_queue))
 		return dlc->mtu;
 
 	return 0;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk
 	 * we are double checking that we are not sending too
 	 * many of these keepalive frames.
 	 */
-	if (skb_queue_len(&scp->other_xmit_queue) == 0)
+	if (skb_queue_empty(&scp->other_xmit_queue))
 		dn_nsp_send_link(sk, DN_NOCHANGE, 0);
 }
 
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file 
 	struct dn_scp *scp = DN_SK(sk);
 	int mask = datagram_poll(file, sock, wait);
 
-	if (skb_queue_len(&scp->other_receive_queue))
+	if (!skb_queue_empty(&scp->other_receive_queue))
 		mask |= POLLRDBAND;
 
 	return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock,
 
 	case SIOCATMARK:
 		lock_sock(sk);
-		val = (skb_queue_len(&scp->other_receive_queue) != 0);
+		val = !skb_queue_empty(&scp->other_receive_queue);
 		if (scp->state != DN_RUN)
 			val = -ENOTCONN;
 		release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk
 	int len = 0;
 
 	if (flags & MSG_OOB)
-		return skb_queue_len(q) ? 1 : 0;
+		return !skb_queue_empty(q) ? 1 : 0;
 
 	while(skb != (struct sk_buff *)q) {
 		struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb
 		if (sk->sk_err)
 			goto out;
 
-		if (skb_queue_len(&scp->other_receive_queue)) {
+		if (!skb_queue_empty(&scp->other_receive_queue)) {
 			if (!(flags & MSG_OOB)) {
 				msg->msg_flags |= MSG_OOB;
 				if (!scp->other_report) {
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
 
 	dn_nsp_output(sk);
 
-	if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue))
+	if (!skb_queue_empty(&scp->data_xmit_queue) ||
+	    !skb_queue_empty(&scp->other_xmit_queue))
 		scp->persist = dn_nsp_persist(sk);
 
 	return 0;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1105,7 +1105,7 @@ static void tcp_prequeue_process(struct 
 	struct sk_buff *skb;
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
+	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
 
 	/* RX process wants to run with disabled BHs, though it is not
 	 * necessary */
@@ -1369,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 			 * is not empty. It is more elegant, but eats cycles,
 			 * unfortunately.
 			 */
-			if (skb_queue_len(&tp->ucopy.prequeue))
+			if (!skb_queue_empty(&tp->ucopy.prequeue))
 				goto do_prequeue;
 
 			/* __ Set realtime policy in scheduler __ */
@@ -1394,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 			}
 
 			if (tp->rcv_nxt == tp->copied_seq &&
-			    skb_queue_len(&tp->ucopy.prequeue)) {
+			    !skb_queue_empty(&tp->ucopy.prequeue)) {
 do_prequeue:
 				tcp_prequeue_process(sk);
 
@@ -1476,7 +1476,7 @@ skip_copy:
 	} while (len > 0);
 
 	if (user_recv) {
-		if (skb_queue_len(&tp->ucopy.prequeue)) {
+		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 			int chunk;
 
 			tp->ucopy.len = copied > 0 ? len : 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2802,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_s
 	int this_sack;
 
 	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
-	if (skb_queue_len(&tp->out_of_order_queue) == 0) {
+	if (skb_queue_empty(&tp->out_of_order_queue)) {
 		tp->rx_opt.num_sacks = 0;
 		tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
 		return;
@@ -2935,13 +2935,13 @@ queue_and_out:
 		if(th->fin)
 			tcp_fin(skb, sk, th);
 
-		if (skb_queue_len(&tp->out_of_order_queue)) {
+		if (!skb_queue_empty(&tp->out_of_order_queue)) {
 			tcp_ofo_queue(sk);
 
 			/* RFC2581. 4.2. SHOULD send immediate ACK, when
 			 * gap in queue is filled.
 			 */
-			if (!skb_queue_len(&tp->out_of_order_queue))
+			if (skb_queue_empty(&tp->out_of_order_queue))
 				tp->ack.pingpong = 0;
 		}
 
@@ -3249,9 +3249,8 @@ static int tcp_prune_queue(struct sock *
 	 * This must not ever occur. */
 
 	/* First, purge the out_of_order queue. */
-	if (skb_queue_len(&tp->out_of_order_queue)) {
-		NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 
-				 skb_queue_len(&tp->out_of_order_queue));
+	if (!skb_queue_empty(&tp->out_of_order_queue)) {
+		NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
 		__skb_queue_purge(&tp->out_of_order_queue);
 
 		/* Reset SACK state.  A conforming SACK implementation will
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned lo
 	}
 	tp->ack.pending &= ~TCP_ACK_TIMER;
 
-	if (skb_queue_len(&tp->ucopy.prequeue)) {
+	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 		struct sk_buff *skb;
 
-		NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 
-				 skb_queue_len(&tp->ucopy.prequeue));
+		NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
 
 		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 			sk->sk_backlog_rcv(sk, skb);
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irl
 	IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
 
 	/* Don't disconnect until all data frames are successfully sent */
-	if (skb_queue_len(&self->txq) > 0) {
+	if (!skb_queue_empty(&self->txq)) {
 		self->disconnect_pending = TRUE;
-
 		return;
 	}
 
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struc
 	 * Send out the RR frames faster if our own transmit queue is empty, or
 	 * if the peer is busy. The effect is a much faster conversation
 	 */
-	if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) {
+	if (skb_queue_empty(&self->txq) || self->remote_busy) {
 		if (self->fast_RR == TRUE) {
 			/*
 			 *  Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *sel
 		IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
 			   skb_queue_len(&self->txq));
 
-		if (skb_queue_len(&self->txq)) {
+		if (!skb_queue_empty(&self->txq)) {
 			/* Prevent race conditions with irlap_data_request() */
 			self->local_busy = TRUE;
 
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irl
 #else	/* CONFIG_IRDA_DYNAMIC_WINDOW */
 			/* Window has been adjusted for the max packet
 			 * size, so much simpler... - Jean II */
-			nextfit = (skb_queue_len(&self->txq) > 0);
+			nextfit = !skb_queue_empty(&self->txq);
 #endif	/* CONFIG_IRDA_DYNAMIC_WINDOW */
 			/*
 			 *  Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irl
 #else	/* CONFIG_IRDA_DYNAMIC_WINDOW */
 			/* Window has been adjusted for the max packet
 			 * size, so much simpler... - Jean II */
-			nextfit = (skb_queue_len(&self->txq) > 0);
+			nextfit = !skb_queue_empty(&self->txq);
 #endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
 			/*
 			 *  Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irla
 				irlap_data_indication(self, skb, FALSE);
 
 				/* Any pending data requests?  */
-				if ((skb_queue_len(&self->txq) > 0) &&
+				if (!skb_queue_empty(&self->txq) &&
 				    (self->window > 0))
 				{
 					self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irla
 			/*
 			 *  Any pending data requests?
 			 */
-			if ((skb_queue_len(&self->txq) > 0) &&
+			if (!skb_queue_empty(&self->txq) &&
 			    (self->window > 0) && !self->remote_busy)
 			{
 				irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irla
 		 */
 		nr_status = irlap_validate_nr_received(self, info->nr);
 		if (nr_status == NR_EXPECTED) {
-			if ((skb_queue_len( &self->txq) > 0) &&
+			if (!skb_queue_empty(&self->txq) &&
 			    (self->window > 0)) {
 				self->remote_busy = FALSE;
 
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct
 	/*
 	 *  We can now fill the window with additional data frames
 	 */
-	while (skb_queue_len( &self->txq) > 0) {
+	while (!skb_queue_empty(&self->txq)) {
 
 		IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
-		if ((skb_queue_len( &self->txq) > 0) &&
-		    (self->window > 0)) {
+		if (self->window > 0) {
 			skb = skb_dequeue( &self->txq);
 			IRDA_ASSERT(skb != NULL, return;);
 
@@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct
 			 *  bit cleared
 			 */
 			if ((self->window > 1) &&
-			    skb_queue_len(&self->txq) > 0)
-			{
+			    !skb_queue_empty(&self->txq)) {
 				irlap_send_data_primary(self, skb);
 			} else {
 				irlap_send_data_primary_poll(self, skb);
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap
 	/*
 	 *  Check if there is still data segments in the transmit queue
 	 */
-	if (skb_queue_len(&self->tx_queue) > 0) {
+	if (!skb_queue_empty(&self->tx_queue)) {
 		if (priority == P_HIGH) {
 			/*
 			 *  No need to send the queued data, if we are
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(
 	if (llc->dev->flags & IFF_LOOPBACK)
 		goto out;
 	rc = 1;
-	if (!skb_queue_len(&llc->pdu_unack_q))
+	if (skb_queue_empty(&llc->pdu_unack_q))
 		goto out;
 	skb = skb_peek(&llc->pdu_unack_q);
 	pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -858,7 +858,7 @@ static inline void netlink_rcv_wake(stru
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 
-	if (!skb_queue_len(&sk->sk_receive_queue))
+	if (skb_queue_empty(&sk->sk_receive_queue))
 		clear_bit(0, &nlk->state);
 	if (!test_bit(0, &nlk->state))
 		wake_up_interruptible(&nlk->wait);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch,
 	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
 
 	q->qcount = -1;
-	if (skb_queue_len(&sch->q) == 0)
+	if (skb_queue_empty(&sch->q))
 		PSCHED_SET_PASTPERFECT(q->qidlestart);
 	sch_tree_unlock(sch);
 	return 0;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock
  * may receive messages only from that peer. */
 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 {
-	if (skb_queue_len(&sk->sk_receive_queue)) {
+	if (!skb_queue_empty(&sk->sk_receive_queue)) {
 		skb_queue_purge(&sk->sk_receive_queue);
 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct
 	for (;;) {
 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 
-		if (skb_queue_len(&sk->sk_receive_queue) ||
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
 		    sk->sk_err ||
 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
 		    signal_pending(current) ||


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
       [not found] <C925F8B43D79CC49ACD0601FB68FF50C045E0FB0@orsmsx408>
@ 2005-07-07 22:30 ` David S. Miller
  0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-07 22:30 UTC (permalink / raw)
  To: jesse.brandeburg; +Cc: tgraf, dada1, netdev

From: "Brandeburg, Jesse" <jesse.brandeburg@intel.com>
Date: Thu, 7 Jul 2005 15:02:17 -0700

> Arg, this thread wasn't on the new list, is there any chance we can just
> get netdev@oss.sgi.com to forward to netdev@vger.kernel.org?

It does already.


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
       [not found]                                                   ` <42CE22CE.7030902@cosmosbay.com>
@ 2005-07-08  7:30                                                     ` David S. Miller
  2005-07-08  8:19                                                       ` Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-08  7:30 UTC (permalink / raw)
  To: dada1; +Cc: tgraf, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 08 Jul 2005 08:53:02 +0200

> About making sk_buff smaller, I use this patch to declare 'struct
> sec_path *sp' only ifdef CONFIG_XFRM, what do you think?  I also
> use a patch to declare nfcache, nfctinfo and nfct only if
> CONFIG_NETFILTER_CONNTRACK or CONFIG_NETFILTER_CONNTRACK_MODULE are
> defined, but that's more intrusive.  Also, tc_index is not used if
> only CONFIG_NET_SCHED is defined but none of CONFIG_NET_SCH_*.  In my
> case, I am using CONFIG_NET_SCHED only to be able to do: tc -s -d
> qdisc

Distributions enable all of the ifdefs, and that is thus the
size and resultant performance most users see.

That's why I'm working on shrinking the size assuming all the
config options are enabled, because that is the reality for most
installations.

For all of this stuff we could consider stealing some ideas from BSD,
namely doing something similar to their MBUF tags.

If a subsystem wants to add a cookie to a networking buffer, it
allocates a tag and links it into the struct.  So, you basically get
away with only one pointer (a struct hlist_head).  We could use this
for the security, netfilter, and TC stuff.  I don't know exactly what
our tags would look like, but perhaps:

struct skb_tag;
struct skb_tag_type {
	void (*destructor)(struct skb_tag *);
	kmem_cache_t	*slab_cache;
	const char	*name;
};

struct skb_tag {
	struct hlist_node	list;
	struct skb_tag_type	*owner;
	int			tag_id;
	char			data[0];
};

struct sk_buff {
	...
	struct hlist_head	tag_list;
	...
};

Then netfilter does stuff like:

	struct sk_buff *skb;
	struct skb_tag *tag;
	struct conntrack_skb_info *info;

	tag = skb_find_tag(skb, SKB_TAG_NETFILTER_CONNTRACK);
	info = (struct conntrack_skb_info *) tag->data;

etc. etc.
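
For concreteness, a minimal, untested sketch of what the two helpers
used above might look like, given the structures from this proposal.
skb_find_tag(), skb_add_tag() and the SKB_TAG_* ids are hypothetical
names; nothing like this exists in the tree today:

static inline struct skb_tag *skb_find_tag(struct sk_buff *skb,
					   int tag_id)
{
	struct skb_tag *tag;
	struct hlist_node *n;

	/* Walk the per-skb tag list looking for a matching id. */
	hlist_for_each_entry(tag, n, &skb->tag_list, list) {
		if (tag->tag_id == tag_id)
			return tag;
	}
	return NULL;
}

static inline struct skb_tag *skb_add_tag(struct sk_buff *skb,
					  struct skb_tag_type *type,
					  int tag_id)
{
	/* One allocation per tag, which is downside 1) below. */
	struct skb_tag *tag = kmem_cache_alloc(type->slab_cache,
					       GFP_ATOMIC);

	if (tag) {
		tag->owner = type;
		tag->tag_id = tag_id;
		hlist_add_head(&tag->list, &skb->tag_list);
	}
	return tag;
}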

The downsides to this approach are:

1) Tagging an SKB eats a memory allocation, which isn't nice.
   This is mainly why I haven't mentioned this idea before.

   It may be that, on an active system, the per-cpu SLAB caches
   for such tag objects might keep the allocation costs real low.
   Another factor is that tags are relatively tiny, so a large
   number of them fit in one SLAB.

   But on the other hand we've been trying to reduce per-packet
   kmalloc() counts, see the SKB fast-clone discussions about that.
   And people ask for SKB recycling all the time.

2) skb_clone() would get more expensive.  This is because you'd
   need to clone the SKB tags as well.

   There is the possibility to hang the tags off of the
   skb_shinfo() area.  I know this idea sounds crazy, but
   the theory goes that if the netfilter et al. info were to
   change (and thus, so would the associated tags), then
   you'd need to COW the SKB anyway.

   This is actually an idea worth considering regardless of
   whether we do tags or not.  It would result in less reference
   counting when we clone an SKB with netfilter stuff or
   security stuff attached.

Overall I'm not too thrilled with the idea, but I'm enthusiastic
about being convinced otherwise since this would shrink sk_buff
dramatically. :-)


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-08  7:30                                                     ` David S. Miller
@ 2005-07-08  8:19                                                       ` Eric Dumazet
  2005-07-08 11:08                                                         ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-08  8:19 UTC (permalink / raw)
  To: David S. Miller; +Cc: tgraf, netdev

David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Fri, 08 Jul 2005 08:53:02 +0200
> 
> 
>>About making sk_buff smaller, I use this patch to declare 'struct
>>sec_path *sp' only ifdef CONFIG_XFRM, what do you think?  I also
>>use a patch to declare nfcache, nfctinfo and nfct only if
>>CONFIG_NETFILTER_CONNTRACK or CONFIG_NETFILTER_CONNTRACK_MODULE are
>>defined, but that's more intrusive.  Also, tc_index is not used if
>>only CONFIG_NET_SCHED is defined but none of CONFIG_NET_SCH_*.  In my
>>case, I am using CONFIG_NET_SCHED only to be able to do: tc -s -d
>>qdisc
> 
> 
> Distributions enable all of the ifdefs, and that is thus the
> size and resultant performance most users see.

Well, I had this idea because I found another similar use in include/linux/ip.h

struct inet_sock {
     /* sk and pinet6 has to be the first two members of inet_sock */
     struct sock     sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
     struct ipv6_pinfo   *pinet6;
#endif


You are right, such conditions are a nightmare for distributions :(

Eric


* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-08  8:19                                                       ` Eric Dumazet
@ 2005-07-08 11:08                                                         ` Arnaldo Carvalho de Melo
  2005-07-12  4:02                                                           ` David S. Miller
  0 siblings, 1 reply; 64+ messages in thread
From: Arnaldo Carvalho de Melo @ 2005-07-08 11:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, tgraf, netdev

On 7/8/05, Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> David S. Miller wrote:
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Fri, 08 Jul 2005 08:53:02 +0200
> >
> >
> >>About making sk_buff smaller, I use this patch to declare 'struct
> >>sec_path *sp' only ifdef CONFIG_XFRM, what do you think?  I also
> >>use a patch to declare nfcache, nfctinfo and nfct only if
> >>CONFIG_NETFILTER_CONNTRACK or CONFIG_NETFILTER_CONNTRACK_MODULE are
> >>defined, but that's more intrusive.  Also, tc_index is not used if
> >>only CONFIG_NET_SCHED is defined but none of CONFIG_NET_SCH_*.  In my
> >>case, I am using CONFIG_NET_SCHED only to be able to do: tc -s -d
> >>qdisc
> >
> >
> > Distributions enable all of the ifdefs, and that is thus the
> > size and resultant performance most users see.
> 
> Well, I had this idea because I found another similar use in 
> include/linux/ip.h
> 
> struct inet_sock {
>      /* sk and pinet6 has to be the first two members of inet_sock */
>      struct sock     sk;
> #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
>      struct ipv6_pinfo   *pinet6;
> #endif



/me pleads guilty.  Dave, any problem with removing this #ifdef?  Humm,
I'll think about using the skb_alloc_extension() idea for struct sock,
but this pinet6 sucker is a bit more difficult, I guess...

- Arnaldo



* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
  2005-07-08 11:08                                                         ` Arnaldo Carvalho de Melo
@ 2005-07-12  4:02                                                           ` David S. Miller
  0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-12  4:02 UTC (permalink / raw)
  To: arnaldo.melo; +Cc: dada1, tgraf, netdev

From: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Date: Fri, 8 Jul 2005 08:08:54 -0300

> > struct inet_sock {
> >      /* sk and pinet6 has to be the first two members of inet_sock */
> >      struct sock     sk;
> > #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> >      struct ipv6_pinfo   *pinet6;
> > #endif
> 
> /me pleads guilty, Dave, any problem with removing this #ifdef? Humm, I'll 
> think about using

Just leave it for now.  If we come up with some spectacularly
nice way to deal with this in the future, we can change it then.


* [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-05  7:38                     ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
  2005-07-05 11:51                       ` Thomas Graf
  2005-07-05 21:26                       ` David S. Miller
@ 2005-07-28 15:52                       ` Eric Dumazet
  2005-07-28 19:39                         ` David S. Miller
  2 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-28 15:52 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev


[NET]: Add prefetches in route hash list traversals.

The current code doesn't use a prefetch-enabled macro like
list_for_each_rcu(), so manually add prefetch() hints.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


diff -Nru linux-2.6.13-rc3/net/ipv4/route.c linux-2.6.13-rc3-ed/net/ipv4/route.c
--- linux-2.6.13-rc3/net/ipv4/route.c	2005-07-13 06:46:46.000000000 +0200
+++ linux-2.6.13-rc3-ed/net/ipv4/route.c	2005-07-28 17:20:21.000000000 +0200
@@ -1148,6 +1148,7 @@
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
 				struct rtable *rt;
 
+				prefetch(rth->u.rt_next);
 				if (rth->fl.fl4_dst != daddr ||
 				    rth->fl.fl4_src != skeys[i] ||
 				    rth->fl.fl4_tos != tos ||
@@ -1401,6 +1402,7 @@
 		rcu_read_lock();
 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 		     rth = rcu_dereference(rth->u.rt_next)) {
+			prefetch(rth->u.rt_next);
 			if (rth->fl.fl4_dst == daddr &&
 			    rth->fl.fl4_src == skeys[i] &&
 			    rth->rt_dst  == daddr &&
@@ -2094,6 +2096,7 @@
 	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 	     rth = rcu_dereference(rth->u.rt_next)) {
+		prefetch(rth->u.rt_next);
 		if (rth->fl.fl4_dst == daddr &&
 		    rth->fl.fl4_src == saddr &&
 		    rth->fl.iif == iif &&
@@ -2565,6 +2568,7 @@
 	rcu_read_lock_bh();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 		rth = rcu_dereference(rth->u.rt_next)) {
+		prefetch(rth->u.rt_next);
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
 		    rth->fl.fl4_src == flp->fl4_src &&
 		    rth->fl.iif == 0 &&
@@ -2819,6 +2823,7 @@
 		rcu_read_lock_bh();
 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
+			prefetch(rt->u.rt_next);
 			if (idx < s_idx)
 				continue;
 			skb->dst = dst_clone(&rt->u.dst);


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-28 15:52                       ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
@ 2005-07-28 19:39                         ` David S. Miller
  2005-07-28 20:56                           ` Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-28 19:39 UTC (permalink / raw)
  To: dada1; +Cc: netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 28 Jul 2005 17:52:04 +0200

> [NET]: Add prefetches in route hash list traversals.
>
> The current code doesn't use a prefetch-enabled macro like
> list_for_each_rcu(), so manually add prefetch() hints.

and the measured performance improvement is?


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-28 19:39                         ` David S. Miller
@ 2005-07-28 20:56                           ` Eric Dumazet
  2005-07-28 20:58                             ` David S. Miller
  0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-28 20:56 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Thu, 28 Jul 2005 17:52:04 +0200
> 
> 
>>[NET]: Add prefetches in route hash list traversals.
>>
>>The current code doesn't use a prefetch-enabled macro like
>>list_for_each_rcu(), so manually add prefetch() hints.
> 
> 
> and the measured performance improvement is?
> 
> 
Half the improvement we could get if only fl.fl4_dst and the other key
fields were not so far away from the u.rt_next field (0xE8 bytes away on
x86_64).

For good performance, one should of course choose a big route cache hash
size, and in that case prefetches are useless and even cost an extra
load: prefetch(rth->u.rt_next) implies loading the rt_next pointer at the
start of the rth structure, while the fl fields are on a different cache
line.

But in case of DDOS, prefetches are a win.

I did not test a solution using two prefetches...
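
Just as a strawman, a two-prefetch variant of one of the hunks above
might look like this (completely untested sketch against the same
2.6.13-rc3 structures; note the second prefetch needs the rt_next
pointer loaded first, which is exactly the extra load mentioned above):

	struct rtable *rth, *next;

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = next) {
		next = rcu_dereference(rth->u.rt_next);
		if (next) {
			/* cache line holding the next entry's link */
			prefetch(next);
			/* cache line holding its flow keys, ~0xE8 in */
			prefetch(&next->fl);
		}
		/* ... existing fl4_dst/fl4_src/... comparisons on rth ... */
	}
	rcu_read_unlock();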

Other patches using prefetches will follow.

Eric


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-28 20:56                           ` Eric Dumazet
@ 2005-07-28 20:58                             ` David S. Miller
  2005-07-28 21:24                               ` Eric Dumazet
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-28 20:58 UTC (permalink / raw)
  To: dada1; +Cc: netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 28 Jul 2005 22:56:32 +0200

> But in case of DDOS, prefetches are a win.

Numbers please, I'm simply curious.


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-28 20:58                             ` David S. Miller
@ 2005-07-28 21:24                               ` Eric Dumazet
  2005-07-28 22:44                                 ` David S. Miller
  2005-07-29 14:50                                 ` Robert Olsson
  0 siblings, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-28 21:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Thu, 28 Jul 2005 22:56:32 +0200
> 
> 
>>But in case of DDOS, prefetches are a win.
> 
> 
> Numbers please, I'm simply curious.
> 
> 

I have no profiling info for this exact patch, I'm sorry David.

On a dual Opteron machine, this line from ip_route_input() is very expensive:

  RT_CACHE_STAT_INC(in_hlist_search);

ip_route_input() uses a total of 3.4563% of one CPU, but this 'increment' alone takes 1.20%!

0.0047   mov    2123529(%rip),%rax        # ffffffff804b4a60 <rt_cache_stat>
1.1898   not    %rax
          mov    %gs:0x34,%edx
0.0042   movslq %edx,%rdx
          mov    (%rax,%rdx,8),%rax
          incl   0x38(%rax)

Sometimes I wonder if oprofile can be trusted :(

Maybe we should increment a counter on the stack and do a final
    if (counter != 0)
        RT_CACHE_STAT_ADD(in_hlist_search, counter);
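
In full, an untested sketch of that idea, accumulating on the stack and
folding into the per-cpu counter once per lookup (RT_CACHE_STAT_ADD is
the hypothetical macro suggested above; only RT_CACHE_STAT_INC exists
today):

	unsigned int searched = 0;

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		/* ... existing match tests, goto hit on success ... */

		/* was: RT_CACHE_STAT_INC(in_hlist_search); */
		searched++;
	}
	rcu_read_unlock();

	if (searched)
		RT_CACHE_STAT_ADD(in_hlist_search, searched);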

Eric


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-28 21:24                               ` Eric Dumazet
@ 2005-07-28 22:44                                 ` David S. Miller
  2005-07-29 14:50                                 ` Robert Olsson
  1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-28 22:44 UTC (permalink / raw)
  To: dada1; +Cc: netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 28 Jul 2005 23:24:33 +0200

> On a dual Opteron machine, this line from ip_route_input() is very expensive:
> 
>   RT_CACHE_STAT_INC(in_hlist_search);

That's amazing since it's per-cpu.  I don't have any suggestions
besides your idea to increment it once using a local accumulator
variable.


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-28 21:24                               ` Eric Dumazet
  2005-07-28 22:44                                 ` David S. Miller
@ 2005-07-29 14:50                                 ` Robert Olsson
  2005-07-29 17:06                                   ` Rick Jones
  2005-07-31  3:44                                   ` David S. Miller
  1 sibling, 2 replies; 64+ messages in thread
From: Robert Olsson @ 2005-07-29 14:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev


Eric Dumazet writes:

 > I have no profiling info for this exact patch, I'm sorry David.
 > On a dual Opteron machine, this line from ip_route_input() is very expensive:
 > 
 >   RT_CACHE_STAT_INC(in_hlist_search);
 > 
 > ip_route_input() uses a total of 3.4563% of one CPU, but this 'increment' alone takes 1.20%!

 Very weird if the stats counter is taking a third of ip_route_input.

 > Sometimes I wonder if oprofile can be trusted :(
 > 
 > Maybe we should increment a counter on the stack and do a final
 >     if (counter != 0)
 >         RT_CACHE_STAT_ADD(in_hlist_search, counter);

 My experience from playing with prefetching of eth_type_trans is
 relevant here.  One must look at the total performance, not just where
 the prefetching is done.  In that case I was able to get eth_type_trans
 down in the profile list, but other functions increased, so performance
 was the same or lower.  This needs to be sorted out...

 Cheers.
					--ro


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-29 14:50                                 ` Robert Olsson
@ 2005-07-29 17:06                                   ` Rick Jones
  2005-07-29 17:44                                     ` Robert Olsson
  2005-07-29 17:57                                     ` Eric Dumazet
  2005-07-31  3:44                                   ` David S. Miller
  1 sibling, 2 replies; 64+ messages in thread
From: Rick Jones @ 2005-07-29 17:06 UTC (permalink / raw)
  To: netdev

Robert Olsson wrote:
> Eric Dumazet writes:
> 
>  > I have no profiling info for this exact patch, I'm sorry David.
>  > On a dual opteron machine, this thing from ip_route_input() is very expensive :
>  > 
>  >   RT_CACHE_STAT_INC(in_hlist_search);
>  > 
>  > ip_route_input() use a total of 3.4563 % of one cpu, but this 'increment' takes 1.20 % !!!
> 
>  Very weird if the statscounter taking a third of ip_route_input.
> 
>  > Sometime I wonder if oprofile can be trusted :(
>  > 
>  > Maybe we should increment a counter on the stack and do a final
>  >     if (counter != 0)
>  >         RT_CACHE_STAT_ADD(in_hlist_search, counter);
> 
>  My experience from playing with prefetching of eth_type_trans is
>  relevant here.  One must look at the total performance, not just where
>  the prefetching is done.  In that case I was able to get eth_type_trans
>  down in the profile list, but other functions increased, so performance
>  was the same or lower.  This needs to be sorted out...

How many of the architectures have PMUs that can give us cache miss
statistics?  Itanium does, and can go so far as to tell us which
addresses and instructions are involved - do the others?

That sort of data would seem to be desirable in this sort of situation.

rick jones


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-29 17:06                                   ` Rick Jones
@ 2005-07-29 17:44                                     ` Robert Olsson
  2005-07-29 17:57                                     ` Eric Dumazet
  1 sibling, 0 replies; 64+ messages in thread
From: Robert Olsson @ 2005-07-29 17:44 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev


Rick Jones writes:

 > >  My experience from playing with prefetching of eth_type_trans is
 > >  relevant here.  One must look at the total performance, not just
 > >  where the prefetching is done.  In that case I was able to get
 > >  eth_type_trans down in the profile list, but other functions
 > >  increased, so performance was the same or lower.  This needs to be
 > >  sorted out...
 > 
 > How many of the architectures have PMUs that can give us cache miss
 > statistics?  Itanium does, and can go so far as to tell us which
 > addresses and instructions are involved - do the others?

 I've seen that Xeons and Opterons have performance counters for this
 that can be used with oprofile.  Intel has a document (was it an
 application note?) describing prefetching.  There are really a lot of
 things to consider to be successful.

 > That sort of data would seem to be desirable in this sort of situation.

 Also, what scenario, code path, and load are we optimizing for?  Eric
 mentioned this briefly.

 Cheers.
					--ro


 


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-29 17:06                                   ` Rick Jones
  2005-07-29 17:44                                     ` Robert Olsson
@ 2005-07-29 17:57                                     ` Eric Dumazet
  2005-07-29 18:25                                       ` Rick Jones
  2005-07-31  3:51                                       ` David S. Miller
  1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-29 17:57 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev, David S. Miller, Robert Olsson

Rick Jones wrote:
> Robert Olsson wrote:
> 
>> Eric Dumazet writes:
>>
>>  > I have no profiling info for this exact patch, I'm sorry David.
>>  > On a dual Opteron machine, this line from ip_route_input() is very
>>  > expensive:
>>  >
>>  >   RT_CACHE_STAT_INC(in_hlist_search);
>>  >
>>  > ip_route_input() uses a total of 3.4563% of one CPU, but this
>>  > 'increment' alone takes 1.20%!
>>
>>  Very weird if the stats counter is taking a third of ip_route_input.
>>
>>  > Sometimes I wonder if oprofile can be trusted :(
>>  >
>>  > Maybe we should increment a counter on the stack and do a final
>>  >     if (counter != 0)
>>  >         RT_CACHE_STAT_ADD(in_hlist_search, counter);
>>
>>  My experience from playing with prefetching of eth_type_trans is
>>  relevant here.  One must look at the total performance, not just
>>  where the prefetching is done.  In that case I was able to get
>>  eth_type_trans down in the profile list, but other functions
>>  increased, so performance was the same or lower.  This needs to be
>>  sorted out...
> 
> 
> How many of the architectures have PMUs that can give us cache miss
> statistics?  Itanium does, and can go so far as to tell us which
> addresses and instructions are involved - do the others?
> 
> That sort of data would seem to be desirable in this sort of situation.
> 
> rick jones
> 
> 

oprofile on AMD64 can gather lots of data, DATA_CACHE_MISSES for example...

But I think I know what happens...

nm -v /usr/src/linux/vmlinux | grep -5 rt_cache_stat

ffffffff804c6a80 b rover.5
ffffffff804c6a88 b last_gc.2
ffffffff804c6a90 b rover.3
ffffffff804c6a94 b equilibrium.4
ffffffff804c6a98 b ip_fallback_id.7
ffffffff804c6aa0 B rt_cache_stat
ffffffff804c6aa8 b ip_rt_max_size
ffffffff804c6aac b ip_rt_debug
ffffffff804c6ab0 b rt_deadline

So rt_cache_stat (which is a read-only pointer) is in the middle of a
hot cache line (parts of which are written over and over), which
probably ping-pongs between CPUs.

Time to provide a patch that carefully splits the static data in
net/ipv4/route.c into two groups: mostly read-only, and the rest... :)

Eric


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-29 17:57                                     ` Eric Dumazet
@ 2005-07-29 18:25                                       ` Rick Jones
  2005-07-31  3:52                                         ` David S. Miller
  2005-07-31  3:51                                       ` David S. Miller
  1 sibling, 1 reply; 64+ messages in thread
From: Rick Jones @ 2005-07-29 18:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David S. Miller, Robert Olsson


> oprofile on AMD64 can gather lots of data, DATA_CACHE_MISSES for example...
> 
> But I think I know what happens...
> 
> nm -v /usr/src/linux/vmlinux | grep -5 rt_cache_stat
> 
> ffffffff804c6a80 b rover.5
> ffffffff804c6a88 b last_gc.2
> ffffffff804c6a90 b rover.3
> ffffffff804c6a94 b equilibrium.4
> ffffffff804c6a98 b ip_fallback_id.7
> ffffffff804c6aa0 B rt_cache_stat
> ffffffff804c6aa8 b ip_rt_max_size
> ffffffff804c6aac b ip_rt_debug
> ffffffff804c6ab0 b rt_deadline
> 
> So rt_cache_stat (which is a read-only pointer) is in the middle of a
> hot cache line (parts of which are written over and over), which
> probably ping-pongs between CPUs.
> 
> Time to provide a patch that carefully splits the static data in
> net/ipv4/route.c into two groups: mostly read-only, and the rest... :)

Which of course begs the question - what cache line size should be ass-u-me-d 
when blocking these things?  I'll put forth 128 bytes.

rick jones


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-29 14:50                                 ` Robert Olsson
  2005-07-29 17:06                                   ` Rick Jones
@ 2005-07-31  3:44                                   ` David S. Miller
  1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-31  3:44 UTC (permalink / raw)
  To: Robert.Olsson; +Cc: dada1, netdev

From: Robert Olsson <Robert.Olsson@data.slu.se>
Date: Fri, 29 Jul 2005 16:50:31 +0200

>  My experience from playing with prefetching of eth_type_trans is
>  relevant here.  One must look at the total performance, not just where
>  the prefetching is done.  In that case I was able to get eth_type_trans
>  down in the profile list, but other functions increased, so performance
>  was the same or lower.  This needs to be sorted out...

The problem is that if you just barely fit in the cache for a
workload, prefetches can hurt if done too early.  Let's say that your
code path needs to access data items A, B, and C, in that order.  If
you need to access A in order to know C, and subsequently if you
prefetch C before you use B, you might kick out B and end up making
performance worse (since you'll thus need to bring in C twice).

I really do not want to merge in any prefetch patches until
there is hard data showing an improvement, instead of some
shamanistic justification :-)

When the witch doctor comes to town and starts adding prefetches
all over the place without performance metrics, I become rightly
concerned.


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-29 17:57                                     ` Eric Dumazet
  2005-07-29 18:25                                       ` Rick Jones
@ 2005-07-31  3:51                                       ` David S. Miller
  1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-31  3:51 UTC (permalink / raw)
  To: dada1; +Cc: rick.jones2, netdev, davem, Robert.Olsson

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 29 Jul 2005 19:57:35 +0200

> nm -v /usr/src/linux/vmlinux | grep -5 rt_cache_stat
> 
> ffffffff804c6a80 b rover.5
> ffffffff804c6a88 b last_gc.2
> ffffffff804c6a90 b rover.3
> ffffffff804c6a94 b equilibrium.4
> ffffffff804c6a98 b ip_fallback_id.7
> ffffffff804c6aa0 B rt_cache_stat
> ffffffff804c6aa8 b ip_rt_max_size
> ffffffff804c6aac b ip_rt_debug
> ffffffff804c6ab0 b rt_deadline
> 
> So rt_cache_stat (which is a read-only pointer) is in the middle of a hot
> cache line (parts of which are written over and over), which probably
> ping-pongs between CPUs.

One cure for this would be to declare it as "__read_mostly", which
should help a lot.  But it's not the best idea, I think.

Instead, we can do away with the pointer and use DECLARE_PERCPU() and
__get_cpu_var() for rt_cache_stat.  That would emit the most efficient
code on every architecture.
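
Roughly like this (just a sketch; the in-tree spellings are
DEFINE_PER_CPU() for the definition and DECLARE_PER_CPU() for headers):

	/* Static per-cpu storage, no pointer indirection at all. */
	static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);

	#define RT_CACHE_STAT_INC(field) \
		(__get_cpu_var(rt_cache_stat).field++)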


* Re: [PATCH] Add prefetches in net/ipv4/route.c
  2005-07-29 18:25                                       ` Rick Jones
@ 2005-07-31  3:52                                         ` David S. Miller
       [not found]                                           ` <42EDDA50.4010405@cosmosbay.com>
  0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-31  3:52 UTC (permalink / raw)
  To: rick.jones2; +Cc: dada1, netdev, davem, Robert.Olsson

From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 29 Jul 2005 11:25:21 -0700

> Which of course begs the question - what cache line size should be
> ass-u-me-d when blocking these things?  I'll put forth 128 bytes.

Why guess?  There is no need to.

Use "__read_mostly" which is an attribute designed explicitly for
this.
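
For example (sketch; __read_mostly just drops the object into a
separate .data.read_mostly section, so the linker does the grouping
and no cache line size needs to be assumed in the source):

	static struct rt_cache_stat *rt_cache_stat __read_mostly;
	static int ip_rt_max_size __read_mostly;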


* Re: [PATCH] Add prefetches in net/ipv4/route.c
       [not found]                                           ` <42EDDA50.4010405@cosmosbay.com>
@ 2005-08-01 15:39                                             ` David S. Miller
  0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-08-01 15:39 UTC (permalink / raw)
  To: dada1; +Cc: davem, rick.jones2, netdev, Robert.Olsson

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 01 Aug 2005 10:16:16 +0200

> Last time I checked, read_mostly had a meaning only if CONFIG_X86 ||
> CONFIG_SPARC64

Meaning that other platforms need to implement this,
big deal :)

I do agree that the implementor of __read_mostly should
have just let the build break instead so folks could
simply be forced to fix things up.

It's actually a trivial change, so the author could also have
updated all the platforms' vmlinux.lds.S files instead of
ifdef'ing this stuff.


end of thread

Thread overview: 64+ messages
2005-05-11 21:15 [TG3]: Add hw coalescing infrastructure David S. Miller
2005-05-11 21:17 ` Michael Chan
2005-05-12  2:28   ` David S. Miller
2005-05-12  7:53     ` Robert Olsson
2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
2005-06-22 19:03   ` Michael Chan
2005-07-04 21:22     ` Eric Dumazet
2005-07-04 21:26       ` David S. Miller
2005-07-04 21:39         ` Eric Dumazet
2005-07-04 21:49           ` David S. Miller
2005-07-04 22:31           ` Eric Dumazet
2005-07-04 22:47             ` David S. Miller
2005-07-04 22:55               ` Eric Dumazet
2005-07-04 22:57                 ` Eric Dumazet
2005-07-04 23:01                   ` David S. Miller
2005-07-05  7:38                     ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
2005-07-05 11:51                       ` Thomas Graf
2005-07-05 12:03                         ` Thomas Graf
2005-07-05 13:04                         ` Eric Dumazet
2005-07-05 13:48                           ` Thomas Graf
2005-07-05 15:58                             ` Eric Dumazet
2005-07-05 17:34                               ` Thomas Graf
2005-07-05 21:22                                 ` David S. Miller
2005-07-05 21:33                                   ` Thomas Graf
2005-07-05 21:35                                     ` David S. Miller
2005-07-05 23:16                                       ` Eric Dumazet
2005-07-05 23:41                                         ` Thomas Graf
2005-07-05 23:45                                           ` David S. Miller
2005-07-05 23:55                                             ` Thomas Graf
2005-07-06  0:32                                           ` Eric Dumazet
2005-07-06  0:51                                             ` Thomas Graf
2005-07-06  1:04                                               ` Eric Dumazet
2005-07-06  1:07                                                 ` Thomas Graf
2005-07-06  0:53                                             ` Eric Dumazet
2005-07-06  1:02                                               ` Thomas Graf
2005-07-06  1:09                                                 ` Eric Dumazet
2005-07-06 12:42                                               ` Thomas Graf
2005-07-07 21:17                                                 ` David S. Miller
2005-07-07 21:34                                                   ` Thomas Graf
2005-07-07 22:24                                                     ` David S. Miller
     [not found]                                                   ` <42CE22CE.7030902@cosmosbay.com>
2005-07-08  7:30                                                     ` David S. Miller
2005-07-08  8:19                                                       ` Eric Dumazet
2005-07-08 11:08                                                         ` Arnaldo Carvalho de Melo
2005-07-12  4:02                                                           ` David S. Miller
2005-07-05 21:26                       ` David S. Miller
2005-07-28 15:52                       ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
2005-07-28 19:39                         ` David S. Miller
2005-07-28 20:56                           ` Eric Dumazet
2005-07-28 20:58                             ` David S. Miller
2005-07-28 21:24                               ` Eric Dumazet
2005-07-28 22:44                                 ` David S. Miller
2005-07-29 14:50                                 ` Robert Olsson
2005-07-29 17:06                                   ` Rick Jones
2005-07-29 17:44                                     ` Robert Olsson
2005-07-29 17:57                                     ` Eric Dumazet
2005-07-29 18:25                                       ` Rick Jones
2005-07-31  3:52                                         ` David S. Miller
     [not found]                                           ` <42EDDA50.4010405@cosmosbay.com>
2005-08-01 15:39                                             ` David S. Miller
2005-07-31  3:51                                       ` David S. Miller
2005-07-31  3:44                                   ` David S. Miller
2005-07-04 23:00                 ` [TG3]: About hw coalescing infrastructure David S. Miller
2005-07-05 16:14                   ` Eric Dumazet
2005-07-04 22:47             ` Eric Dumazet
     [not found] <C925F8B43D79CC49ACD0601FB68FF50C045E0FB0@orsmsx408>
2005-07-07 22:30 ` [PATCH] loop unrolling in net/sched/sch_generic.c David S. Miller
