* [TG3]: Add hw coalescing infrastructure.
@ 2005-05-11 21:15 David S. Miller
2005-05-11 21:17 ` Michael Chan
2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
0 siblings, 2 replies; 64+ messages in thread
From: David S. Miller @ 2005-05-11 21:15 UTC (permalink / raw)
To: netdev; +Cc: mchan
Ok, now that we have the tagged status stuff sorted I began
to work on putting the hw mitigation bits back into the
driver. The discussion on the DMA rw-ctrl settings is still
ongoing, but I will get back to it shortly.
This is the first step, we cache the settings in the tg3
struct and put those values into the chip via tg3_set_coalesce().
ETHTOOL_GCOALESCE is supported, setting is not.
The idea is that if we add dynamic mitigation or ETHTOOL_SCOALESCE,
it will simply invoke this routine to program the hardware.
It seems that we should also be avoiding the HOSTCC_{R,T}XCOAL_MAXF_INT
registers when TG3_FLG2_5705_PLUS. I didn't make that change yet, just
preserving what we were doing previously.
Another thing which occurred to me is that these settings are line
rate dependent. For example, the 20 usec value is for gigabit.
So we may wish to adjust the values we use based upon the negotiated
line speed. But that's a future refinement that can wait.
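(As a rough illustration of that idea, and nothing more: a helper along
the lines below could scale the cached gigabit-tuned tick values by the
negotiated speed before programming the chip. The helper name and the
scale factors are made up for this sketch, not part of the patch.)

/* Hypothetical sketch: shrink the coalescing window on slower links,
 * since far fewer packets can arrive per usec at 10/100 speeds.
 */
static void tg3_scale_coal_for_speed(struct tg3 *tp,
                                     struct ethtool_coalesce *ec)
{
        u32 divisor = 1;

        if (tp->link_config.active_speed == SPEED_100)
                divisor = 4;
        else if (tp->link_config.active_speed == SPEED_10)
                divisor = 8;

        ec->rx_coalesce_usecs = max_t(u32, 1, ec->rx_coalesce_usecs / divisor);
        ec->tx_coalesce_usecs = max_t(u32, 1, ec->tx_coalesce_usecs / divisor);
}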
I think with the removal of that I/O readback done by the tagged
status changes, and this coalescing stuff below, the SGI NUMA
performance should be significantly higher.
Comments?
Signed-off-by: David S. Miller <davem@davemloft.net>
--- 1/drivers/net/tg3.c.~1~ 2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.c 2005-05-11 13:47:17.000000000 -0700
@@ -2507,7 +2507,7 @@ static int tg3_setup_phy(struct tg3 *tp,
if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
if (netif_carrier_ok(tp->dev)) {
tw32(HOSTCC_STAT_COAL_TICKS,
- DEFAULT_STAT_COAL_TICKS);
+ tp->coal.stats_block_coalesce_usecs);
} else {
tw32(HOSTCC_STAT_COAL_TICKS, 0);
}
@@ -5094,6 +5094,22 @@ static void tg3_set_bdinfo(struct tg3 *t
}
static void __tg3_set_rx_mode(struct net_device *);
+static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
+{
+ tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
+ tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs);
+ tw32(HOSTCC_RXMAX_FRAMES, ec->rx_max_coalesced_frames);
+ tw32(HOSTCC_TXMAX_FRAMES, ec->tx_max_coalesced_frames);
+ if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
+ tw32(HOSTCC_RXCOAL_TICK_INT, ec->rx_coalesce_usecs_irq);
+ tw32(HOSTCC_TXCOAL_TICK_INT, ec->tx_coalesce_usecs_irq);
+ }
+ tw32(HOSTCC_RXCOAL_MAXF_INT, ec->rx_max_coalesced_frames_irq);
+ tw32(HOSTCC_TXCOAL_MAXF_INT, ec->tx_max_coalesced_frames_irq);
+ if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS))
+ tw32(HOSTCC_STAT_COAL_TICKS,
+ ec->stats_block_coalesce_usecs);
+}
/* tp->lock is held. */
static int tg3_reset_hw(struct tg3 *tp)
@@ -5416,16 +5432,7 @@ static int tg3_reset_hw(struct tg3 *tp)
udelay(10);
}
- tw32(HOSTCC_RXCOL_TICKS, 0);
- tw32(HOSTCC_TXCOL_TICKS, LOW_TXCOL_TICKS);
- tw32(HOSTCC_RXMAX_FRAMES, 1);
- tw32(HOSTCC_TXMAX_FRAMES, LOW_RXMAX_FRAMES);
- if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
- tw32(HOSTCC_RXCOAL_TICK_INT, 0);
- tw32(HOSTCC_TXCOAL_TICK_INT, 0);
- }
- tw32(HOSTCC_RXCOAL_MAXF_INT, 1);
- tw32(HOSTCC_TXCOAL_MAXF_INT, 0);
+ tg3_set_coalesce(tp, &tp->coal);
/* set status block DMA address */
tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
@@ -5438,8 +5445,6 @@ static int tg3_reset_hw(struct tg3 *tp)
* the tg3_periodic_fetch_stats call there, and
* tg3_get_stats to see how this works for 5705/5750 chips.
*/
- tw32(HOSTCC_STAT_COAL_TICKS,
- DEFAULT_STAT_COAL_TICKS);
tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
((u64) tp->stats_mapping >> 32));
tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_LOW,
@@ -7284,6 +7289,14 @@ static void tg3_vlan_rx_kill_vid(struct
}
#endif
+static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+{
+ struct tg3 *tp = netdev_priv(dev);
+
+ memcpy(ec, &tp->coal, sizeof(*ec));
+ return 0;
+}
+
static struct ethtool_ops tg3_ethtool_ops = {
.get_settings = tg3_get_settings,
.set_settings = tg3_set_settings,
@@ -7316,6 +7329,7 @@ static struct ethtool_ops tg3_ethtool_op
.get_strings = tg3_get_strings,
.get_stats_count = tg3_get_stats_count,
.get_ethtool_stats = tg3_get_ethtool_stats,
+ .get_coalesce = tg3_get_coalesce,
};
static void __devinit tg3_get_eeprom_size(struct tg3 *tp)
@@ -9096,6 +9110,31 @@ static struct pci_dev * __devinit tg3_fi
return peer;
}
+static void __devinit tg3_init_coal(struct tg3 *tp)
+{
+ struct ethtool_coalesce *ec = &tp->coal;
+
+ memset(ec, 0, sizeof(*ec));
+ ec->cmd = ETHTOOL_GCOALESCE;
+ ec->rx_coalesce_usecs = LOW_RXCOL_TICKS;
+ ec->tx_coalesce_usecs = LOW_TXCOL_TICKS;
+ ec->rx_max_coalesced_frames = LOW_RXMAX_FRAMES;
+ ec->tx_max_coalesced_frames = LOW_TXMAX_FRAMES;
+ ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT;
+ ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT;
+ ec->rx_max_coalesced_frames_irq = DEFAULT_RXCOAL_MAXF_INT;
+ ec->tx_max_coalesced_frames_irq = DEFAULT_TXCOAL_MAXF_INT;
+ ec->stats_block_coalesce_usecs = DEFAULT_STAT_COAL_TICKS;
+
+ if (tp->coalesce_mode & (HOSTCC_MODE_CLRTICK_RXBD |
+ HOSTCC_MODE_CLRTICK_TXBD)) {
+ ec->rx_coalesce_usecs = LOW_RXCOL_TICKS_CLRTCKS;
+ ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT_CLRTCKS;
+ ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS;
+ ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS;
+ }
+}
+
static int __devinit tg3_init_one(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
@@ -9341,6 +9380,8 @@ static int __devinit tg3_init_one(struct
/* flow control autonegotiation is default behavior */
tp->tg3_flags |= TG3_FLAG_PAUSE_AUTONEG;
+ tg3_init_coal(tp);
+
err = register_netdev(dev);
if (err) {
printk(KERN_ERR PFX "Cannot register net device, "
--- 1/drivers/net/tg3.h.~1~ 2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.h 2005-05-11 13:48:14.000000000 -0700
@@ -876,10 +876,12 @@
#define HOSTCC_STATUS_ERROR_ATTN 0x00000004
#define HOSTCC_RXCOL_TICKS 0x00003c08
#define LOW_RXCOL_TICKS 0x00000032
+#define LOW_RXCOL_TICKS_CLRTCKS 0x00000014
#define DEFAULT_RXCOL_TICKS 0x00000048
#define HIGH_RXCOL_TICKS 0x00000096
#define HOSTCC_TXCOL_TICKS 0x00003c0c
#define LOW_TXCOL_TICKS 0x00000096
+#define LOW_TXCOL_TICKS_CLRTCKS 0x00000048
#define DEFAULT_TXCOL_TICKS 0x0000012c
#define HIGH_TXCOL_TICKS 0x00000145
#define HOSTCC_RXMAX_FRAMES 0x00003c10
@@ -892,8 +894,10 @@
#define HIGH_TXMAX_FRAMES 0x00000052
#define HOSTCC_RXCOAL_TICK_INT 0x00003c18
#define DEFAULT_RXCOAL_TICK_INT 0x00000019
+#define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
#define HOSTCC_TXCOAL_TICK_INT 0x00003c1c
#define DEFAULT_TXCOAL_TICK_INT 0x00000019
+#define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
#define HOSTCC_RXCOAL_MAXF_INT 0x00003c20
#define DEFAULT_RXCOAL_MAXF_INT 0x00000005
#define HOSTCC_TXCOAL_MAXF_INT 0x00003c24
@@ -2227,7 +2231,7 @@ struct tg3 {
#define SST_25VF0X0_PAGE_SIZE 4098
-
+ struct ethtool_coalesce coal;
};
#endif /* !(_T3_H) */
* Re: [TG3]: Add hw coalescing infrastructure.
2005-05-11 21:15 [TG3]: Add hw coalescing infrastructure David S. Miller
@ 2005-05-11 21:17 ` Michael Chan
2005-05-12 2:28 ` David S. Miller
2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
1 sibling, 1 reply; 64+ messages in thread
From: Michael Chan @ 2005-05-11 21:17 UTC (permalink / raw)
To: David S.Miller; +Cc: netdev
On Wed, 2005-05-11 at 14:15 -0700, David S.Miller wrote:
> It seems that we should also be avoiding the HOSTCC_{R,T}XCOAL_MAXF_INT
> registers when TG3_FLG2_5705_PLUS. I didn't make that change yet, just
> preserving what we were doing previously.
These registers are still defined for 5705_PLUS chips; only the maximum
value has been reduced to 255. So we're OK.
>
> Another thing which occurred to me is that these settings are line
> rate dependent. For example, the 20 usec value is for gigabit.
> So we may wish to adjust the values we use based upon the negotiated
> line speed. But that's a future refinement that can wait.
>
Yes, and MTU size dependent too. But we may not want to coalesce the
same way when running at 10/100 Mbps. For example, if one packet has
arrived, we don't want to wait 120 usec or 1200 usec to see if another
packet will arrive before we interrupt at 100 and 10 Mbps respectively.
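(Those two figures are just the wire time of a full-sized frame; assuming
a ~1500-byte Ethernet frame:

        1500 bytes * 8          =  12000 bits
        12000 bits / 100 Mbit/s =   120 usec
        12000 bits /  10 Mbit/s =  1200 usec

so the "wait a little for one more packet" window that is cheap at gigabit
becomes a very long stall at 10/100 speeds.)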
> I think with the removal of that I/O readback done by the tagged
> status changes, and this coalescing stuff below, the SGI NUMA
> performance should be significantly higher.
>
> Comments?
Other than adding a check for netif_carrier_ok() before setting the
HOSTCC_STAT_COAL_TICKS register in tg3_set_coalesce(), everything looks
good to me.
* Re: [TG3]: Add hw coalescing infrastructure.
2005-05-11 21:17 ` Michael Chan
@ 2005-05-12 2:28 ` David S. Miller
2005-05-12 7:53 ` Robert Olsson
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-05-12 2:28 UTC (permalink / raw)
To: mchan; +Cc: netdev
From: "Michael Chan" <mchan@broadcom.com>
Subject: Re: [TG3]: Add hw coalescing infrastructure.
Date: Wed, 11 May 2005 14:17:36 -0700
> On Wed, 2005-05-11 at 14:15 -0700, David S.Miller wrote:
>
> > Another thing which occurred to me is that these settings are line
> > rate dependent. For example, the 20 usec value is for gigabit.
> > So we may wish to adjust the values we use based upon the negotiated
> > line speed. But that's a future refinement that can wait.
> >
> Yes, and MTU size dependent too. But we may not want to coalesce the
> same way when running at 10/100 Mbps. For example, if one packet has
> arrived, we don't want to wait 120 usec or 1200 usec to see if another
> packet will arrive before we interrupt at 100 and 10 Mbps respectively.
That's a good point. I think it would be wise for us to try
and come up with a dynamic mitigation algorithm that is as MTU
and link speed agnostic as possible.
One thing to note is that what we're really trying to do is
maximize the amount of work done per interrupt, without
unduly adding latency to packet reception.
Work done per-interrupt, and maximum latency, are better goals
because formulas based upon those values will automatically take
differences in CPU and BUS I/O speed into account.
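(To make the goal concrete, a very rough sketch of such a feedback loop is
below. Everything here is hypothetical: the helper name, the thresholds,
and the latency budget are illustrative only, not a proposed implementation.)

/* Sketch only: grow the coalescing parameters while interrupts are
 * saturated with work, back off when little work arrives per interrupt,
 * and never let the tick value exceed a fixed latency budget.
 */
static void tg3_adjust_coal_sketch(struct tg3 *tp, u32 pkts_this_irq)
{
        struct ethtool_coalesce *ec = &tp->coal;
        const u32 latency_budget_usecs = 100;   /* example bound */

        if (pkts_this_irq >= ec->rx_max_coalesced_frames &&
            ec->rx_coalesce_usecs < latency_budget_usecs) {
                /* every interrupt is full of work: coalesce harder */
                ec->rx_max_coalesced_frames++;
                ec->rx_coalesce_usecs++;
        } else if (pkts_this_irq < ec->rx_max_coalesced_frames / 2) {
                /* little work per interrupt: back off to reduce latency */
                if (ec->rx_max_coalesced_frames > 1)
                        ec->rx_max_coalesced_frames--;
                if (ec->rx_coalesce_usecs > 1)
                        ec->rx_coalesce_usecs--;
        }

        /* a real version would then call tg3_set_coalesce(tp, ec) */
}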
> > I think with the removal of that I/O readback done by the tagged
> > status changes, and this coalescing stuff below, the SGI NUMA
> > performance should be significantly higher.
> >
> > Comments?
>
> Other than adding a check for netif_carrier_ok() before setting the
> HOSTCC_STAT_COAL_TICKS register in tg3_set_coalesce(), everything looks
> good to me.
Good catch. Here is the final version.
I'm still debating whether to pursue the dynamic mitigation
setting idea. I'd like to see some new numbers from the
SGI NUMA folks first.
I think we certainly should add ETHTOOL_SCOALESCE support.
It isn't in there now because I have to think a bit about
how to verify the user's settings. As you know, some
combinations of values could wedge the driver and cause
no interrupts to be generated for TX or RX even when packets
are transmitted/received. And there's that 256 limit on
5705_plus chips which you mentioned.
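(A sketch of the kind of validation that handler would need is below; the
helper name and the exact limits are illustrative, and the real bounds would
have to come from the hardware documentation.)

static int tg3_check_coalesce_sketch(struct tg3 *tp,
                                     struct ethtool_coalesce *ec)
{
        /* 5705_PLUS chips cap the MAXF_INT registers at 255, per
         * Michael's note earlier in the thread.
         */
        if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) {
                if (ec->rx_max_coalesced_frames_irq > 255 ||
                    ec->tx_max_coalesced_frames_irq > 255)
                        return -EINVAL;
        }

        /* Refuse combinations that can never raise an interrupt:
         * both the usec and the max-frames trigger set to zero would
         * wedge the RX (or TX) path entirely.
         */
        if (!ec->rx_coalesce_usecs && !ec->rx_max_coalesced_frames)
                return -EINVAL;
        if (!ec->tx_coalesce_usecs && !ec->tx_max_coalesced_frames)
                return -EINVAL;

        return 0;
}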
[TG3]: Set minimal hw interrupt mitigation.
Even though we do software interrupt mitigation
via NAPI, it still helps to have some minimal
hw assisted mitigation.
This helps, particularly, on systems where register
I/O overhead is much greater than the CPU horsepower.
For example, it helps on NUMA systems. In such cases
the PIO overhead to disable interrupts for NAPI accounts
for the majority of the packet processing cost. The
CPU is fast enough such that only a single packet is
processed by each NAPI poll call.
Thanks to Michael Chan for reviewing this patch.
Signed-off-by: David S. Miller <davem@davemloft.net>
--- 1/drivers/net/tg3.c.~1~ 2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.c 2005-05-11 19:26:28.000000000 -0700
@@ -2507,7 +2507,7 @@ static int tg3_setup_phy(struct tg3 *tp,
if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
if (netif_carrier_ok(tp->dev)) {
tw32(HOSTCC_STAT_COAL_TICKS,
- DEFAULT_STAT_COAL_TICKS);
+ tp->coal.stats_block_coalesce_usecs);
} else {
tw32(HOSTCC_STAT_COAL_TICKS, 0);
}
@@ -5094,6 +5094,27 @@ static void tg3_set_bdinfo(struct tg3 *t
}
static void __tg3_set_rx_mode(struct net_device *);
+static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
+{
+ tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
+ tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs);
+ tw32(HOSTCC_RXMAX_FRAMES, ec->rx_max_coalesced_frames);
+ tw32(HOSTCC_TXMAX_FRAMES, ec->tx_max_coalesced_frames);
+ if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
+ tw32(HOSTCC_RXCOAL_TICK_INT, ec->rx_coalesce_usecs_irq);
+ tw32(HOSTCC_TXCOAL_TICK_INT, ec->tx_coalesce_usecs_irq);
+ }
+ tw32(HOSTCC_RXCOAL_MAXF_INT, ec->rx_max_coalesced_frames_irq);
+ tw32(HOSTCC_TXCOAL_MAXF_INT, ec->tx_max_coalesced_frames_irq);
+ if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
+ u32 val = ec->stats_block_coalesce_usecs;
+
+ if (!netif_carrier_ok(tp->dev))
+ val = 0;
+
+ tw32(HOSTCC_STAT_COAL_TICKS, val);
+ }
+}
/* tp->lock is held. */
static int tg3_reset_hw(struct tg3 *tp)
@@ -5416,16 +5437,7 @@ static int tg3_reset_hw(struct tg3 *tp)
udelay(10);
}
- tw32(HOSTCC_RXCOL_TICKS, 0);
- tw32(HOSTCC_TXCOL_TICKS, LOW_TXCOL_TICKS);
- tw32(HOSTCC_RXMAX_FRAMES, 1);
- tw32(HOSTCC_TXMAX_FRAMES, LOW_RXMAX_FRAMES);
- if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
- tw32(HOSTCC_RXCOAL_TICK_INT, 0);
- tw32(HOSTCC_TXCOAL_TICK_INT, 0);
- }
- tw32(HOSTCC_RXCOAL_MAXF_INT, 1);
- tw32(HOSTCC_TXCOAL_MAXF_INT, 0);
+ tg3_set_coalesce(tp, &tp->coal);
/* set status block DMA address */
tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
@@ -5438,8 +5450,6 @@ static int tg3_reset_hw(struct tg3 *tp)
* the tg3_periodic_fetch_stats call there, and
* tg3_get_stats to see how this works for 5705/5750 chips.
*/
- tw32(HOSTCC_STAT_COAL_TICKS,
- DEFAULT_STAT_COAL_TICKS);
tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH,
((u64) tp->stats_mapping >> 32));
tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_LOW,
@@ -7284,6 +7294,14 @@ static void tg3_vlan_rx_kill_vid(struct
}
#endif
+static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
+{
+ struct tg3 *tp = netdev_priv(dev);
+
+ memcpy(ec, &tp->coal, sizeof(*ec));
+ return 0;
+}
+
static struct ethtool_ops tg3_ethtool_ops = {
.get_settings = tg3_get_settings,
.set_settings = tg3_set_settings,
@@ -7316,6 +7334,7 @@ static struct ethtool_ops tg3_ethtool_op
.get_strings = tg3_get_strings,
.get_stats_count = tg3_get_stats_count,
.get_ethtool_stats = tg3_get_ethtool_stats,
+ .get_coalesce = tg3_get_coalesce,
};
static void __devinit tg3_get_eeprom_size(struct tg3 *tp)
@@ -9096,6 +9115,31 @@ static struct pci_dev * __devinit tg3_fi
return peer;
}
+static void __devinit tg3_init_coal(struct tg3 *tp)
+{
+ struct ethtool_coalesce *ec = &tp->coal;
+
+ memset(ec, 0, sizeof(*ec));
+ ec->cmd = ETHTOOL_GCOALESCE;
+ ec->rx_coalesce_usecs = LOW_RXCOL_TICKS;
+ ec->tx_coalesce_usecs = LOW_TXCOL_TICKS;
+ ec->rx_max_coalesced_frames = LOW_RXMAX_FRAMES;
+ ec->tx_max_coalesced_frames = LOW_TXMAX_FRAMES;
+ ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT;
+ ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT;
+ ec->rx_max_coalesced_frames_irq = DEFAULT_RXCOAL_MAXF_INT;
+ ec->tx_max_coalesced_frames_irq = DEFAULT_TXCOAL_MAXF_INT;
+ ec->stats_block_coalesce_usecs = DEFAULT_STAT_COAL_TICKS;
+
+ if (tp->coalesce_mode & (HOSTCC_MODE_CLRTICK_RXBD |
+ HOSTCC_MODE_CLRTICK_TXBD)) {
+ ec->rx_coalesce_usecs = LOW_RXCOL_TICKS_CLRTCKS;
+ ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT_CLRTCKS;
+ ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS;
+ ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS;
+ }
+}
+
static int __devinit tg3_init_one(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
@@ -9341,6 +9385,8 @@ static int __devinit tg3_init_one(struct
/* flow control autonegotiation is default behavior */
tp->tg3_flags |= TG3_FLAG_PAUSE_AUTONEG;
+ tg3_init_coal(tp);
+
err = register_netdev(dev);
if (err) {
printk(KERN_ERR PFX "Cannot register net device, "
--- 1/drivers/net/tg3.h.~1~ 2005-05-11 11:46:11.000000000 -0700
+++ 2/drivers/net/tg3.h 2005-05-11 13:48:14.000000000 -0700
@@ -876,10 +876,12 @@
#define HOSTCC_STATUS_ERROR_ATTN 0x00000004
#define HOSTCC_RXCOL_TICKS 0x00003c08
#define LOW_RXCOL_TICKS 0x00000032
+#define LOW_RXCOL_TICKS_CLRTCKS 0x00000014
#define DEFAULT_RXCOL_TICKS 0x00000048
#define HIGH_RXCOL_TICKS 0x00000096
#define HOSTCC_TXCOL_TICKS 0x00003c0c
#define LOW_TXCOL_TICKS 0x00000096
+#define LOW_TXCOL_TICKS_CLRTCKS 0x00000048
#define DEFAULT_TXCOL_TICKS 0x0000012c
#define HIGH_TXCOL_TICKS 0x00000145
#define HOSTCC_RXMAX_FRAMES 0x00003c10
@@ -892,8 +894,10 @@
#define HIGH_TXMAX_FRAMES 0x00000052
#define HOSTCC_RXCOAL_TICK_INT 0x00003c18
#define DEFAULT_RXCOAL_TICK_INT 0x00000019
+#define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
#define HOSTCC_TXCOAL_TICK_INT 0x00003c1c
#define DEFAULT_TXCOAL_TICK_INT 0x00000019
+#define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
#define HOSTCC_RXCOAL_MAXF_INT 0x00003c20
#define DEFAULT_RXCOAL_MAXF_INT 0x00000005
#define HOSTCC_TXCOAL_MAXF_INT 0x00003c24
@@ -2227,7 +2231,7 @@ struct tg3 {
#define SST_25VF0X0_PAGE_SIZE 4098
-
+ struct ethtool_coalesce coal;
};
#endif /* !(_T3_H) */
* Re: [TG3]: Add hw coalescing infrastructure.
2005-05-12 2:28 ` David S. Miller
@ 2005-05-12 7:53 ` Robert Olsson
0 siblings, 0 replies; 64+ messages in thread
From: Robert Olsson @ 2005-05-12 7:53 UTC (permalink / raw)
To: David S. Miller; +Cc: mchan, netdev
David S. Miller writes:
> From: "Michael Chan" <mchan@broadcom.com>
> Subject: Re: [TG3]: Add hw coalescing infrastructure.
> Date: Wed, 11 May 2005 14:17:36 -0700
> > Yes, and MTU size dependent too. But we may not want to coalesce the
> > same way when running at 10/100 Mbps. For example, if one packet has
> > arrived, we don't want to wait 120 usec or 1200 usec to see if another
> > packet will arrive before we interrupt at 100 and 10 Mbps respectively.
>
> That's a good point. I think it would be wise for us to try
> and come up with a dynamic mitigation algorithm that is as MTU
> and link speed agnostic as possible.
Size of the RX ring also has to be taken into account. You probably have to design
for the smallest packets, otherwise you risk RX-ring overrun and packet drops.
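(Rough numbers for the minimum-size case: a 64-byte frame occupies 84 bytes
on the wire once preamble and inter-frame gap are counted, so

        84 bytes * 8           =   672 bits/frame
        1 Gbit/s   / 672 bits ~= 1,488,000 frames/s  (one every ~0.67 usec)
        100 Mbit/s / 672 bits ~=   148,800 frames/s  (one every ~6.7 usec)

which means a few hundred usec of coalescing delay at gigabit can already be
several hundred minimum-size frames, more than a 511-entry RX ring can hold.)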
> One thing to note is that what we're really trying to do is
> maximize the amount of work done per interrupt, without
> unduly adding latency to packet reception.
>
> Work done per-interrupt, and maximum latency, are better goals
> because formulas based upon those values will automatically take
> differences in CPU and BUS I/O speed into account.
Yes.
Cheers.
--ro
* [TG3]: About hw coalescing infrastructure.
2005-05-11 21:15 [TG3]: Add hw coalescing infrastructure David S. Miller
2005-05-11 21:17 ` Michael Chan
@ 2005-06-22 15:25 ` Eric Dumazet
2005-06-22 19:03 ` Michael Chan
1 sibling, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-06-22 15:25 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, mchan
David S. Miller wrote:
> Ok, now that we have the tagged status stuff sorted I began
> to work on putting the hw mitigation bits back into the
> driver. The discussion on the DMA rw-ctrl settings is still
> ongoing, but I will get back to it shortly.
>
> This is the first step, we cache the settings in the tg3
> struct and put those values into the chip via tg3_set_coalesce().
>
> ETHTOOL_GCOALESCE is supported, setting is not.
>
Hi David
I am using 2.6.12 now, but still experience a high number of interrupts per second on
my tg3 NIC, on a dual Opteron based machine.
(about 7300 interrupts per second generated by one interface eth0, 100Mbit/s link)
Is there anything I can try to tune the coalescing?
Being able to handle 100 packets each interrupt instead of one or two would certainly help.
I don't mind latency, but of course I would like not to drop packets :)
But maybe the BCM5702 is not able to delay an interrupt?
Thank you
Eric Dumazet
----------------------------------------------------------------------------------------
# lspci -v
02:03.0 Ethernet controller: Broadcom Corporation NetXtreme BCM5702 Gigabit Ethernet (rev 02)
Subsystem: Broadcom Corporation BCM5702 1000Base-T
Flags: bus master, 66Mhz, medium devsel, latency 64, IRQ 27
Memory at 00000000fe000000 (64-bit, non-prefetchable) [size=64K]
Expansion ROM at <unassigned> [disabled] [size=64K]
Capabilities: [40] PCI-X non-bridge device.
Capabilities: [48] Power Management version 2
Capabilities: [50] Vital Product Data
Capabilities: [58] Message Signalled Interrupts: 64bit+ Queue=0/3 Enable-
# ethtool -c eth0
Coalesce parameters for eth0:
Adaptive RX: off TX: off
stats-block-usecs: 1000000
sample-interval: 0
pkt-rate-low: 0
pkt-rate-high: 0
rx-usecs: 20
rx-frames: 5
rx-usecs-irq: 20
rx-frames-irq: 5
tx-usecs: 72
tx-frames: 53
tx-usecs-irq: 20
tx-frames-irq: 5
rx-usecs-low: 0
rx-frame-low: 0
tx-usecs-low: 0
tx-frame-low: 0
rx-usecs-high: 0
rx-frame-high: 0
tx-usecs-high: 0
tx-frame-high: 0
# ethtool -S eth0
NIC statistics:
rx_octets: 104634072366
rx_fragments: 0
rx_ucast_packets: 852685070
rx_mcast_packets: 0
rx_bcast_packets: 20429
rx_fcs_errors: 0
rx_align_errors: 0
rx_xon_pause_rcvd: 0
rx_xoff_pause_rcvd: 0
rx_mac_ctrl_rcvd: 0
rx_xoff_entered: 0
rx_frame_too_long_errors: 0
rx_jabbers: 0
rx_undersize_packets: 0
rx_in_length_errors: 0
rx_out_length_errors: 0
rx_64_or_less_octet_packets: 451956709
rx_65_to_127_octet_packets: 272058231
rx_128_to_255_octet_packets: 63364655
rx_256_to_511_octet_packets: 35814973
rx_512_to_1023_octet_packets: 11867701
rx_1024_to_1522_octet_packets: 17643210
rx_1523_to_2047_octet_packets: 0
rx_2048_to_4095_octet_packets: 0
rx_4096_to_8191_octet_packets: 0
rx_8192_to_9022_octet_packets: 0
tx_octets: 134640205605
tx_collisions: 0
tx_xon_sent: 0
tx_xoff_sent: 0
tx_flow_control: 0
tx_mac_errors: 0
tx_single_collisions: 0
tx_mult_collisions: 0
tx_deferred: 0
tx_excessive_collisions: 0
tx_late_collisions: 0
tx_collide_2times: 0
tx_collide_3times: 0
tx_collide_4times: 0
tx_collide_5times: 0
tx_collide_6times: 0
tx_collide_7times: 0
tx_collide_8times: 0
tx_collide_9times: 0
tx_collide_10times: 0
tx_collide_11times: 0
tx_collide_12times: 0
tx_collide_13times: 0
tx_collide_14times: 0
tx_collide_15times: 0
tx_ucast_packets: 774312055
tx_mcast_packets: 13
tx_bcast_packets: 246
tx_carrier_sense_errors: 0
tx_discards: 0
tx_errors: 0
dma_writeq_full: 21375
dma_write_prioq_full: 0
rxbds_empty: 0
rx_discards: 2644
rx_errors: 0
rx_threshold_hit: 57384403
dma_readq_full: 27100189
dma_read_prioq_full: 1557267
tx_comp_queue_full: 35712755
ring_set_send_prod_index: 747986769
ring_status_update: 502110997
nic_irqs: 446148615
nic_avoided_irqs: 55962382
nic_tx_threshold_hit: 37282069
# ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX: 511
RX Mini: 0
RX Jumbo: 255
TX: 0
Current hardware settings:
RX: 200
RX Mini: 0
RX Jumbo: 100
TX: 511
# ethtool eth0
Settings for eth0:
Supported ports: [ MII ]
Supported link modes: 10baseT/Half 10baseT/Full
100baseT/Half 100baseT/Full
1000baseT/Half 1000baseT/Full
Supports auto-negotiation: Yes
Advertised link modes: 10baseT/Half 10baseT/Full
100baseT/Half 100baseT/Full
1000baseT/Half 1000baseT/Full
Advertised auto-negotiation: Yes
Speed: 100Mb/s
Duplex: Full
Port: Twisted Pair
PHYAD: 1
Transceiver: internal
Auto-negotiation: on
Supports Wake-on: g
Wake-on: d
Current message level: 0x000000ff (255)
Link detected: yes
# cat /proc/interrupts (HZ=200)
CPU0 CPU1
0: 164055 14038348 IO-APIC-edge timer
2: 0 0 XT-PIC cascade
8: 0 0 IO-APIC-edge rtc
14: 4073 368224 IO-APIC-edge ide0
15: 0 20 IO-APIC-edge ide1
27: 35985951 421578656 IO-APIC-level eth0, eth1
NMI: 874625217 905019517 (oprofile running)
LOC: 14201857 14201976
ERR: 0
MIS: 0
oprofile data :
# more /tmp/vmlinux.oprofile
CPU: Hammer, speed 2205.08 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 100000
samples cum. samples % cum. % symbol name
20208503 20208503 7.7982 7.7982 ipt_do_table
8336463 28544966 3.2169 11.0151 tcp_v4_rcv
7746814 36291780 2.9894 14.0045 handle_IRQ_event
7117968 43409748 2.7467 16.7512 tg3_poll
6585377 49995125 2.5412 19.2924 memcpy
5184695 55179820 2.0007 21.2931 ip_route_input
4346890 59526710 1.6774 22.9705 kfree
4214007 63740717 1.6261 24.5967 copy_user_generic_c
4093885 67834602 1.5798 26.1764 tcp_ack
4006753 71841355 1.5462 27.7226 tg3_interrupt_tagged
3778976 75620331 1.4583 29.1809 tcp_rcv_established
3756498 79376829 1.4496 30.6304 ip_queue_xmit
3418999 82795828 1.3193 31.9498 schedule
3274459 86070287 1.2636 33.2134 try_to_wake_up
3034809 89105096 1.1711 34.3844 tcp_sendmsg
2846436 91951532 1.0984 35.4828 kmem_cache_alloc
2745147 94696679 1.0593 36.5422 free_block
2679056 97375735 1.0338 37.5760 kmem_cache_free
2595289 99971024 1.0015 38.5775 fn_hash_lookup
2582072 102553096 0.9964 39.5738 __memset
2576462 105129558 0.9942 40.5681 tcp_transmit_skb
2528313 107657871 0.9756 41.5437 tcp_recvmsg
2392370 110050241 0.9232 42.4669 timer_interrupt
2365615 112415856 0.9129 43.3797 system_call
2358666 114774522 0.9102 44.2899 sockfd_lookup
2357192 117131714 0.9096 45.1995 tcp_poll
2340568 119472282 0.9032 46.1027 ip_rcv
2315805 121788087 0.8936 46.9964 tcp_match
2276212 124064299 0.8784 47.8747 sys_epoll_wait
2260913 126325212 0.8725 48.7472 __mod_timer
2173905 128499117 0.8389 49.5861 tg3_start_xmit
2057738 130556855 0.7941 50.3801 __switch_to
2022435 132579290 0.7804 51.1605 ep_poll_callback
2020449 134599739 0.7797 51.9402 sock_wfree
1913008 136512747 0.7382 52.6784 find_busiest_group
1891578 138404325 0.7299 53.4083 local_bh_enable
1860130 140264455 0.7178 54.1261 ip_local_deliver
1793639 142058094 0.6921 54.8183 __ip_route_output_key
1789287 143847381 0.6905 55.5087 alloc_skb
1770972 145618353 0.6834 56.1921 tcp_write_timer
1727286 147345639 0.6665 56.8587 __wake_up
1634111 148979750 0.6306 57.4893 skb_release_data
1625157 150604907 0.6271 58.1164 __kmalloc
1567198 152172105 0.6048 58.7211 tcp_v4_do_rcv
1562495 153734600 0.6029 59.3241 __kfree_skb
* Re: [TG3]: About hw coalescing infrastructure.
2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
@ 2005-06-22 19:03 ` Michael Chan
2005-07-04 21:22 ` Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: Michael Chan @ 2005-06-22 19:03 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
On Wed, 2005-06-22 at 17:25 +0200, Eric Dumazet wrote:
> Is there anything I can try to tune the coalescing?
> Being able to handle 100 packets each interrupt instead of one or two would certainly help.
> I don't mind latency, but of course I would like not to drop packets :)
> But maybe the BCM5702 is not able to delay an interrupt?
>
On the 5702, which supports CLRTCKS mode, you need to play around with the
following parameters in tg3.h. To reduce interrupts, you generally have
to increase the values.
#define LOW_RXCOL_TICKS_CLRTCKS 0x00000014
#define LOW_TXCOL_TICKS_CLRTCKS 0x00000048
#define LOW_RXMAX_FRAMES 0x00000005
#define LOW_TXMAX_FRAMES 0x00000035
#define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
#define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
#define DEFAULT_RXCOAL_MAXF_INT 0x00000005
#define DEFAULT_TXCOAL_MAXF_INT 0x00000005
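For example (illustrative numbers only, not tested recommendations), making
the chip wait roughly four times longer and batch several times more frames
per interrupt could look like this, keeping the MAXF_INT values well under
the 255 limit of the newer chips:

#define LOW_RXCOL_TICKS_CLRTCKS         0x00000050      /* ~80 usec instead of ~20 */
#define LOW_TXCOL_TICKS_CLRTCKS         0x00000120      /* ~288 usec instead of ~72 */
#define LOW_RXMAX_FRAMES                0x00000020      /* 32 frames instead of 5 */
#define DEFAULT_RXCOAL_MAXF_INT         0x00000020      /* 32 frames instead of 5 */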
* Re: [TG3]: About hw coalescing infrastructure.
2005-06-22 19:03 ` Michael Chan
@ 2005-07-04 21:22 ` Eric Dumazet
2005-07-04 21:26 ` David S. Miller
0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 21:22 UTC (permalink / raw)
To: Michael Chan; +Cc: David S. Miller, netdev
Michael Chan wrote:
> On Wed, 2005-06-22 at 17:25 +0200, Eric Dumazet wrote:
>
>
>>Is there anything I can try to tune the coalescing?
>>Being able to handle 100 packets each interrupt instead of one or two would certainly help.
>>I don't mind latency, but of course I would like not to drop packets :)
>>But maybe the BCM5702 is not able to delay an interrupt?
>>
>
>
> On the 5702 that supports CLRTCKS mode, you need to play around with the
> following parameters in tg3.h. To reduce interrupts, you generally have
> to increase the values.
>
> #define LOW_RXCOL_TICKS_CLRTCKS 0x00000014
> #define LOW_TXCOL_TICKS_CLRTCKS 0x00000048
> #define LOW_RXMAX_FRAMES 0x00000005
> #define LOW_TXMAX_FRAMES 0x00000035
> #define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014
> #define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014
> #define DEFAULT_RXCOAL_MAXF_INT 0x00000005
> #define DEFAULT_TXCOAL_MAXF_INT 0x00000005
>
>
>
Thanks Michael
I tried various settings.
# ethtool -c eth0
Coalesce parameters for eth0:
Adaptive RX: off TX: off
stats-block-usecs: 1000000
sample-interval: 0
pkt-rate-low: 0
pkt-rate-high: 0
rx-usecs: 500
rx-frames: 20
rx-usecs-irq: 500
rx-frames-irq: 20
tx-usecs: 600
tx-frames: 53
tx-usecs-irq: 600
tx-frames-irq: 20
rx-usecs-low: 0
rx-frame-low: 0
tx-usecs-low: 0
tx-frame-low: 0
rx-usecs-high: 0
rx-frame-high: 0
tx-usecs-high: 0
tx-frame-high: 0
But it seems something is wrong because when network load becomes high I get :
Jul 4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:29 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:34 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:34 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:39 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:39 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:44 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:44 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:49 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:49 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:54 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:54 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:01:59 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:01:59 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:04 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:04 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:09 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:09 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:14 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:14 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:29 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:34 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:34 dada1 kernel: tg3: eth0: transmit timed out, resetting
Jul 4 23:02:39 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Jul 4 23:02:39 dada1 kernel: tg3: eth0: transmit timed out, resetting
Then the machine crashes at this point.
tg3.c :
#define DRV_MODULE_VERSION "3.31"
#define DRV_MODULE_RELDATE "June 8, 2005"
# ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX: 511
RX Mini: 0
RX Jumbo: 255
TX: 0
Current hardware settings:
RX: 400
RX Mini: 0
RX Jumbo: 40
TX: 511
Thank you
Eric Dumazet
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 21:22 ` Eric Dumazet
@ 2005-07-04 21:26 ` David S. Miller
2005-07-04 21:39 ` Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 21:26 UTC (permalink / raw)
To: dada1; +Cc: mchan, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 04 Jul 2005 23:22:12 +0200
> But it seems something is wrong because when network load becomes high I get :
>
> Jul 4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
> Jul 4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
> Jul 4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
> Jul 4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
> Jul 4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
Does this happen without changing the coalescing settings
at all?
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 21:26 ` David S. Miller
@ 2005-07-04 21:39 ` Eric Dumazet
2005-07-04 21:49 ` David S. Miller
2005-07-04 22:31 ` Eric Dumazet
0 siblings, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 21:39 UTC (permalink / raw)
To: David S. Miller; +Cc: mchan, netdev
David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 04 Jul 2005 23:22:12 +0200
>
>
>>But it seems something is wrong because when network load becomes high I get :
>>
>>Jul 4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>Jul 4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>Jul 4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>Jul 4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>Jul 4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>
>
> Does this happen without changing the coalescing settings
> at all?
>
>
Not easy to answer because I have to rebuild a kernel and reboot. Will do that shortly.
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 21:39 ` Eric Dumazet
@ 2005-07-04 21:49 ` David S. Miller
2005-07-04 22:31 ` Eric Dumazet
1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-04 21:49 UTC (permalink / raw)
To: dada1; +Cc: mchan, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 04 Jul 2005 23:39:35 +0200
> Not easy to answer because I have to rebuild a kernel and
> reboot. Will do that shortly.
Thanks.
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 21:39 ` Eric Dumazet
2005-07-04 21:49 ` David S. Miller
@ 2005-07-04 22:31 ` Eric Dumazet
2005-07-04 22:47 ` David S. Miller
2005-07-04 22:47 ` Eric Dumazet
1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:31 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, mchan, netdev
Eric Dumazet wrote:
> David S. Miller wrote:
>
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Mon, 04 Jul 2005 23:22:12 +0200
>>
>>
>>> But it seems something is wrong because when network load becomes
>>> high I get :
>>>
>>> Jul 4 23:01:19 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>> Jul 4 23:01:19 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>> Jul 4 23:01:24 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>> Jul 4 23:01:24 dada1 kernel: tg3: eth0: transmit timed out, resetting
>>> Jul 4 23:01:29 dada1 kernel: NETDEV WATCHDOG: eth0: transmit timed out
>>
>>
>>
>> Does this happen without changing the coalescing settings
>> at all?
>>
>>
> Not easy to answer because I have to rebuild a kernel and reboot. Will
> do that shortly.
>
>
>
Oops, I forgot to tell you I applied the patch : [TG3]: Eliminate all hw IRQ handler spinlocks.
(Date: Fri, 03 Jun 2005 12:25:58 -0700 (PDT))
Maybe I should revert to stock 2.6.12 tg3 driver ?
Eric
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 22:31 ` Eric Dumazet
@ 2005-07-04 22:47 ` David S. Miller
2005-07-04 22:55 ` Eric Dumazet
2005-07-04 22:47 ` Eric Dumazet
1 sibling, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 22:47 UTC (permalink / raw)
To: dada1; +Cc: mchan, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 05 Jul 2005 00:31:55 +0200
> Oops, I forgot to tell you I applied the patch : [TG3]: Eliminate all hw IRQ handler spinlocks.
> (Date: Fri, 03 Jun 2005 12:25:58 -0700 (PDT))
>
> Maybe I should revert to stock 2.6.12 tg3 driver ?
Please don't ever do stuff like that :-( That makes the driver
version, and any other information you report completely meaningless
and useless. You've just wasted a lot of our time.
I have no way to even know _WHICH_ IRQ spinlock patch you applied.
The one that ended up in Linus's tree has many bug fixes and
refinements. The ones which were posted on netdev had many bugs and
deadlocks which needed to be cured before pushing the change upstream.
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 22:31 ` Eric Dumazet
2005-07-04 22:47 ` David S. Miller
@ 2005-07-04 22:47 ` Eric Dumazet
1 sibling, 0 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:47 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, mchan, netdev
Eric Dumazet wrote:
>
>
> Oops, I forgot to tell you I applied the patch : [TG3]: Eliminate all
> hw IRQ handler spinlocks.
> (Date: Fri, 03 Jun 2005 12:25:58 -0700 (PDT))
>
> Maybe I should revert to stock 2.6.12 tg3 driver ?
>
> Eric
>
>
Oh well, I checked 2.6.13-rc1 and it contains this line in tg3_netif_stop() :
+ tp->dev->trans_start = jiffies; /* prevent tx timeout */
So I am trying driver 3.32 June 24, 2005, included in 2.6.13-rc1
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 22:47 ` David S. Miller
@ 2005-07-04 22:55 ` Eric Dumazet
2005-07-04 22:57 ` Eric Dumazet
2005-07-04 23:00 ` [TG3]: About hw coalescing infrastructure David S. Miller
0 siblings, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:55 UTC (permalink / raw)
To: David S. Miller; +Cc: mchan, netdev
David S. Miller wrote:
> Please don't ever do stuff like that :-( That makes the driver
> version, and any other information you report completely meaningless
> and useless. You've just wasted a lot of our time.
>
Yes. But if you don't want us to test your patches, don't send them to the list.
For your information, 2.6.13-rc1 locks too.
Very easy way to lock it :
ping -f some_destination.
> I have no idea to even know _WHICH_ IRQ spinlock patch you applied.
>
> The one that ended up in Linus's tree has many bug fixes and
> refinements. The ones which were posted on netdev had many bugs and
> deadlocks which needed to be cured before pushing the change upstream.
>
>
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 22:55 ` Eric Dumazet
@ 2005-07-04 22:57 ` Eric Dumazet
2005-07-04 23:01 ` David S. Miller
2005-07-04 23:00 ` [TG3]: About hw coalescing infrastructure David S. Miller
1 sibling, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-04 22:57 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, mchan, netdev
Eric Dumazet wrote:
>
> For your information, 2.6.13-rc1 locks too.
>
> Very easy way to lock it :
>
> ping -f some_destination.
Arg. False alarm. Sorry.
Time for me to sleep.
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 22:55 ` Eric Dumazet
2005-07-04 22:57 ` Eric Dumazet
@ 2005-07-04 23:00 ` David S. Miller
2005-07-05 16:14 ` Eric Dumazet
1 sibling, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 23:00 UTC (permalink / raw)
To: dada1; +Cc: mchan, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 05 Jul 2005 00:55:37 +0200
> David S. Miller wrote:
>
> > Please don't ever do stuff like that :-( That makes the driver
> > version, and any other information you report completely meaningless
> > and useless. You've just wasted a lot of our time.
> >
>
> Yes. But if you don't want us to test your patches, don't send them to
> the list.
The case here is that you didn't even mention that you had
the patch applied. If you don't say what changes you've applied
to the stock driver we can't properly debug anything.
> For your information, 2.6.13-rc1 locks too.
>
> Very easy way to lock it :
>
> ping -f some_destination.
SMP system? What platform and 570x chip revision?
Does the stock driver in 2.6.12 hang as well?
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 22:57 ` Eric Dumazet
@ 2005-07-04 23:01 ` David S. Miller
2005-07-05 7:38 ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-04 23:01 UTC (permalink / raw)
To: dada1; +Cc: mchan, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 05 Jul 2005 00:57:58 +0200
> Eric Dumazet wrote:
>
> > Very easy way to lock it :
> >
> > ping -f some_destination.
>
>
> Arg. False alarm. Sorry.
>
> Time for me to sleep.
Oh well...
* [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-04 23:01 ` David S. Miller
@ 2005-07-05 7:38 ` Eric Dumazet
2005-07-05 11:51 ` Thomas Graf
` (2 more replies)
0 siblings, 3 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 7:38 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: Type: text/plain, Size: 305 bytes --]
[NET] : unroll a small loop in pfifo_fast_dequeue(). Compiler generates better code.
(Using skb_queue_empty() to test the queue is faster than trying to __skb_dequeue())
oprofile says this function now uses 0.29% instead of 1.22%, on an x86_64 target.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: patch.sch_generic --]
[-- Type: text/plain, Size: 1022 bytes --]
--- linux-2.6.12/net/sched/sch_generic.c 2005-06-17 21:48:29.000000000 +0200
+++ linux-2.6.12-ed/net/sched/sch_generic.c 2005-07-05 09:11:30.000000000 +0200
@@ -333,18 +333,23 @@
static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
- int prio;
struct sk_buff_head *list = qdisc_priv(qdisc);
struct sk_buff *skb;
- for (prio = 0; prio < 3; prio++, list++) {
- skb = __skb_dequeue(list);
- if (skb) {
- qdisc->q.qlen--;
- return skb;
- }
+ for (;;) {
+ if (!skb_queue_empty(list))
+ break;
+ list++;
+ if (!skb_queue_empty(list))
+ break;
+ list++;
+ if (!skb_queue_empty(list))
+ break;
+ return NULL;
}
- return NULL;
+ skb = __skb_dequeue(list);
+ qdisc->q.qlen--;
+ return skb;
}
static int
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 7:38 ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
@ 2005-07-05 11:51 ` Thomas Graf
2005-07-05 12:03 ` Thomas Graf
2005-07-05 13:04 ` Eric Dumazet
2005-07-05 21:26 ` David S. Miller
2005-07-28 15:52 ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
2 siblings, 2 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 11:51 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
> [NET] : unroll a small loop in pfifo_fast_dequeue(). Compiler generates
> better code.
> (Using skb_queue_empty() to test the queue is faster than trying to
> __skb_dequeue())
> oprofile says this function uses now 0.29% instead of 1.22 %, on a
> x86_64 target.
I think this patch is pretty much pointless. __skb_dequeue() and
!skb_queue_empty() should produce almost the same code and as soon
as you disable profiling and debugging you'll see that the compiler
unrolls the loop itself if possible.
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 11:51 ` Thomas Graf
@ 2005-07-05 12:03 ` Thomas Graf
2005-07-05 13:04 ` Eric Dumazet
1 sibling, 0 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 12:03 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Thomas Graf <20050705115108.GE16076@postel.suug.ch> 2005-07-05 13:51
> * Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
> > [NET] : unroll a small loop in pfifo_fast_dequeue(). Compiler generates
> > better code.
> > (Using skb_queue_empty() to test the queue is faster than trying to
> > __skb_dequeue())
> > oprofile says this function uses now 0.29% instead of 1.22 %, on a
> > x86_64 target.
>
> I think this patch is pretty much pointless. __skb_dequeue() and
> !skb_queue_empty() should produce almost the same code and as soon
> as you disable profiling and debugging you'll see that the compiler
> unrolls the loop itself if possible.
... given one enables it of course.
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 11:51 ` Thomas Graf
2005-07-05 12:03 ` Thomas Graf
@ 2005-07-05 13:04 ` Eric Dumazet
2005-07-05 13:48 ` Thomas Graf
1 sibling, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 13:04 UTC (permalink / raw)
To: Thomas Graf; +Cc: David S. Miller, netdev
Thomas Graf wrote:
> * Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
>
>>[NET] : unroll a small loop in pfifo_fast_dequeue(). Compiler generates
>>better code.
>> (Using skb_queue_empty() to test the queue is faster than trying to
>> __skb_dequeue())
>> oprofile says this function uses now 0.29% instead of 1.22 %, on a
>> x86_64 target.
>
>
> I think this patch is pretty much pointless. __skb_dequeue() and
> !skb_queue_empty() should produce almost the same code and as soon
> as you disable profiling and debugging you'll see that the compiler
> unrolls the loop itself if possible.
>
>
OK. At least my compiler (gcc-3.3.1) does NOT unroll the loop :
Original 2.6.12 gives :
ffffffff802a9790 <pfifo_fast_dequeue>: /* pfifo_fast_dequeue total: 2904054 1.9531 */
258371 0.1738 :ffffffff802a9790: lea 0xc0(%rdi),%rcx
273669 0.1841 :ffffffff802a9797: xor %esi,%esi
12533 0.0084 :ffffffff802a9799: mov (%rcx),%rdx
292315 0.1966 :ffffffff802a979c: cmp %rcx,%rdx
11717 0.0079 :ffffffff802a979f: je ffffffff802a97d1 <pfifo_fast_dequeue+0x41>
4474 0.0030 :ffffffff802a97a1: mov %rdx,%rax
6238 0.0042 :ffffffff802a97a4: mov (%rdx),%rdx
41 2.8e-05 :ffffffff802a97a7: decl 0x10(%rcx)
6089 0.0041 :ffffffff802a97aa: test %rax,%rax
126 8.5e-05 :ffffffff802a97ad: movq $0x0,0x10(%rax)
39 2.6e-05 :ffffffff802a97b5: mov %rcx,0x8(%rdx)
6974 0.0047 :ffffffff802a97b9: mov %rdx,(%rcx)
2841 0.0019 :ffffffff802a97bc: movq $0x0,0x8(%rax)
366 2.5e-04 :ffffffff802a97c4: movq $0x0,(%rax)
14757 0.0099 :ffffffff802a97cb: je ffffffff802a97d1 <pfifo_fast_dequeue+0x41>
288 1.9e-04 :ffffffff802a97cd: decl 0x40(%rdi)
94 6.3e-05 :ffffffff802a97d0: retq
970400 0.6526 :ffffffff802a97d1: inc %esi
982402 0.6607 :ffffffff802a97d3: add $0x18,%rcx
4 2.7e-06 :ffffffff802a97d7: cmp $0x2,%esi
1 6.7e-07 :ffffffff802a97da: jle ffffffff802a9799 <pfifo_fast_dequeue+0x9>
59754 0.0402 :ffffffff802a97dc: xor %eax,%eax
561 3.8e-04 :ffffffff802a97de: data16
:ffffffff802a97df: nop
:ffffffff802a97e0: retq
And new code (2.6.12-ed):
ffffffff802b1020 <pfifo_fast_dequeue>: /* pfifo_fast_dequeue total: 153139 0.2934 */
27388 0.0525 :ffffffff802b1020: lea 0xc0(%rdi),%rdx
42091 0.0806 :ffffffff802b1027: cmp %rdx,0xc0(%rdi)
:ffffffff802b102e: jne ffffffff802b1052 <pfifo_fast_dequeue+0x32>
474 9.1e-04 :ffffffff802b1030: lea 0xd8(%rdi),%rdx
5571 0.0107 :ffffffff802b1037: cmp %rdx,0xd8(%rdi)
2 3.8e-06 :ffffffff802b103e: jne ffffffff802b1052 <pfifo_fast_dequeue+0x32>
1 1.9e-06 :ffffffff802b1040: lea 0xf0(%rdi),%rdx
20030 0.0384 :ffffffff802b1047: xor %eax,%eax
6 1.1e-05 :ffffffff802b1049: cmp %rdx,0xf0(%rdi)
6 1.1e-05 :ffffffff802b1050: je ffffffff802b1086 <pfifo_fast_dequeue+0x66>
:ffffffff802b1052: mov (%rdx),%rcx
11796 0.0226 :ffffffff802b1055: xor %eax,%eax
:ffffffff802b1057: cmp %rdx,%rcx
8 1.5e-05 :ffffffff802b105a: je ffffffff802b1083 <pfifo_fast_dequeue+0x63>
3146 0.0060 :ffffffff802b105c: mov %rcx,%rax
12 2.3e-05 :ffffffff802b105f: mov (%rcx),%rcx
118 2.3e-04 :ffffffff802b1062: decl 0x10(%rdx)
4924 0.0094 :ffffffff802b1065: movq $0x0,0x10(%rax)
65 1.2e-04 :ffffffff802b106d: mov %rdx,0x8(%rcx)
725 0.0014 :ffffffff802b1071: mov %rcx,(%rdx)
11493 0.0220 :ffffffff802b1074: movq $0x0,0x8(%rax)
194 3.7e-04 :ffffffff802b107c: movq $0x0,(%rax)
2995 0.0057 :ffffffff802b1083: decl 0x40(%rdi)
19607 0.0376 :ffffffff802b1086: nop
2487 0.0048 :ffffffff802b1087: retq
Please give us the code your compiler produces, and explain to me how disabling oprofile can change the generated assembly. :)
Debugging has no impact on this code either.
Thank you
Eric
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 13:04 ` Eric Dumazet
@ 2005-07-05 13:48 ` Thomas Graf
2005-07-05 15:58 ` Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 13:48 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CA8555.9050607@cosmosbay.com> 2005-07-05 15:04
> Thomas Graf wrote:
> >* Eric Dumazet <42CA390C.9000801@cosmosbay.com> 2005-07-05 09:38
> >
> >>[NET] : unroll a small loop in pfifo_fast_dequeue(). Compiler generates
> >>better code.
> >> (Using skb_queue_empty() to test the queue is faster than trying to
> >> __skb_dequeue())
> >> oprofile says this function uses now 0.29% instead of 1.22 %, on a
> >> x86_64 target.
> >
> >
> >I think this patch is pretty much pointless. __skb_dequeue() and
> >!skb_queue_empty() should produce almost the same code and as soon
> >as you disable profiling and debugging you'll see that the compiler
> >unrolls the loop itself if possible.
> >
> >
>
> OK. At least my compiler (gcc-3.3.1) does NOT unroll the loop :
Because you don't specify -funroll-loops
[...]
> Please give us the code your compiler produces,
Unrolled version:
pfifo_fast_dequeue:
pushl %esi
xorl %edx, %edx
pushl %ebx
movl 12(%esp), %esi
movl 128(%esi), %eax
leal 128(%esi), %ecx
cmpl %ecx, %eax
je .L132
movl %eax, %edx
movl (%eax), %eax
decl 8(%ecx)
movl $0, 8(%edx)
movl %ecx, 4(%eax)
movl %eax, 128(%esi)
movl $0, 4(%edx)
movl $0, (%edx)
.L132:
testl %edx, %edx
je .L131
movl 96(%edx), %ebx
movl 80(%esi), %eax
decl 40(%esi)
subl %ebx, %eax
movl %eax, 80(%esi)
movl %edx, %eax
.L117:
popl %ebx
popl %esi
ret
.L131:
movl 20(%ecx), %eax
leal 20(%ecx), %edx
xorl %ebx, %ebx
cmpl %edx, %eax
je .L137
movl %eax, %ebx
movl (%eax), %eax
decl 8(%edx)
movl $0, 8(%ebx)
movl %edx, 4(%eax)
movl %eax, 20(%ecx)
movl $0, 4(%ebx)
movl $0, (%ebx)
.L137:
testl %ebx, %ebx
je .L147
.L146:
movl 96(%ebx), %ecx
movl 80(%esi), %eax
decl 40(%esi)
subl %ecx, %eax
movl %eax, 80(%esi)
movl %ebx, %eax
jmp .L117
.L147:
movl 40(%ecx), %eax
leal 40(%ecx), %edx
xorl %ebx, %ebx
cmpl %edx, %eax
je .L142
movl %eax, %ebx
movl (%eax), %eax
decl 8(%edx)
movl $0, 8(%ebx)
movl %edx, 4(%eax)
movl %eax, 40(%ecx)
movl $0, 4(%ebx)
movl $0, (%ebx)
.L142:
xorl %eax, %eax
testl %ebx, %ebx
jne .L146
jmp .L117
> and explain to me how
> disabling oprofile can change the generated assembly. :)
> Debugging has no impact on this code either.
I just noticed that this is a local modification of my own, so in
the vanilla tree it indeed doesn't have any impact on the code
generated.
Still, your patch does not make sense to me. The latest tree
also includes my pfifo_fast changes which modified the code to
maintain a backlog and made it easy to add more fifos at compile
time. If you want the loop unrolled then let the compiler do it
via -funroll-loops. This kind of optimization seems as unnecessary
to me as all the loopback optimizations.
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 13:48 ` Thomas Graf
@ 2005-07-05 15:58 ` Eric Dumazet
2005-07-05 17:34 ` Thomas Graf
0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 15:58 UTC (permalink / raw)
To: Thomas Graf; +Cc: David S. Miller, netdev
Thomas Graf wrote:
>>OK. At least my compiler (gcc-3.3.1) does NOT unroll the loop :
>
>
> Because you don't specify -funroll-loops
I'm using vanilla 2.6.12: no -funroll-loops in it. Maybe in your tree, but not in 99.9% of 2.6.12 trees.
Are you suggesting everybody should use this compiler flag?
Something like :
net/sched/Makefile:
CFLAGS_sch_generic.o := -funroll-loops
?
>
> [...]
>
>
>>Please give us the code your compiler produces,
>
>
> Unrolled version:
>
> pfifo_fast_dequeue:
> pushl %esi
> xorl %edx, %edx
> pushl %ebx
> movl 12(%esp), %esi
> movl 128(%esi), %eax
> leal 128(%esi), %ecx
> cmpl %ecx, %eax
> je .L132
> movl %eax, %edx
> movl (%eax), %eax
> decl 8(%ecx)
> movl $0, 8(%edx)
> movl %ecx, 4(%eax)
> movl %eax, 128(%esi)
> movl $0, 4(%edx)
> movl $0, (%edx)
> .L132:
> testl %edx, %edx
> je .L131
> movl 96(%edx), %ebx
> movl 80(%esi), %eax
> decl 40(%esi)
> subl %ebx, %eax
> movl %eax, 80(%esi)
> movl %edx, %eax
> .L117:
> popl %ebx
> popl %esi
> ret
> .L131:
> movl 20(%ecx), %eax
> leal 20(%ecx), %edx
> xorl %ebx, %ebx
> cmpl %edx, %eax
> je .L137
> movl %eax, %ebx
> movl (%eax), %eax
> decl 8(%edx)
> movl $0, 8(%ebx)
> movl %edx, 4(%eax)
> movl %eax, 20(%ecx)
> movl $0, 4(%ebx)
> movl $0, (%ebx)
> .L137:
> testl %ebx, %ebx
> je .L147
> .L146:
> movl 96(%ebx), %ecx
> movl 80(%esi), %eax
> decl 40(%esi)
> subl %ecx, %eax
> movl %eax, 80(%esi)
> movl %ebx, %eax
> jmp .L117
> .L147:
> movl 40(%ecx), %eax
> leal 40(%ecx), %edx
> xorl %ebx, %ebx
> cmpl %edx, %eax
> je .L142
> movl %eax, %ebx
> movl (%eax), %eax
> decl 8(%edx)
> movl $0, 8(%ebx)
> movl %edx, 4(%eax)
> movl %eax, 40(%ecx)
> movl $0, 4(%ebx)
> movl $0, (%ebx)
> .L142:
> xorl %eax, %eax
> testl %ebx, %ebx
> jne .L146
> jmp .L117
>
OK thanks, but you didn't give the code for my version :) It is shorter and unrolled, as you can see, and with nicely predicted branches.
00000fc0 <pfifo_fast_dequeue>:
fc0: 56 push %esi
fc1: 89 c1 mov %eax,%ecx
fc3: 53 push %ebx
fc4: 8d 98 a0 00 00 00 lea 0xa0(%eax),%ebx
fca: 39 98 a0 00 00 00 cmp %ebx,0xa0(%eax)
fd0: 89 da mov %ebx,%edx
fd2: 75 22 jne ff6 <pfifo_fast_dequeue+0x36>
fd4: 8d 90 c4 00 00 00 lea 0xc4(%eax),%edx
fda: 39 90 c4 00 00 00 cmp %edx,0xc4(%eax)
fe0: 89 d3 mov %edx,%ebx
fe2: 75 12 jne ff6 <pfifo_fast_dequeue+0x36>
fe4: 8d 98 e8 00 00 00 lea 0xe8(%eax),%ebx
fea: 31 f6 xor %esi,%esi
fec: 39 98 e8 00 00 00 cmp %ebx,0xe8(%eax)
ff2: 89 da mov %ebx,%edx
ff4: 74 27 je 101d <pfifo_fast_dequeue+0x5d>
ff6: 8b 32 mov (%edx),%esi
ff8: 39 d6 cmp %edx,%esi
ffa: 74 26 je 1022 <pfifo_fast_dequeue+0x62>
ffc: 8b 06 mov (%esi),%eax
ffe: ff 4b 08 decl 0x8(%ebx)
1001: c7 46 08 00 00 00 00 movl $0x0,0x8(%esi)
1008: 89 50 04 mov %edx,0x4(%eax)
100b: 89 02 mov %eax,(%edx)
100d: c7 46 04 00 00 00 00 movl $0x0,0x4(%esi)
1014: c7 06 00 00 00 00 movl $0x0,(%esi)
101a: ff 49 28 decl 0x28(%ecx)
101d: 5b pop %ebx
101e: 89 f0 mov %esi,%eax
1020: 5e pop %esi
1021: c3 ret
1022: ff 49 28 decl 0x28(%ecx)
1025: 31 f6 xor %esi,%esi
1027: eb f4 jmp 101d <pfifo_fast_dequeue+0x5d>
>
> I just noticed that this is a local modification of my own, so in
> the vanilla tree it indeed doesn't have any impact on the code
> generated.
>
> Still, your patch does not make sense to me. The latest tree
> also includes my pfifo_fast changes which modified the code to
> maintain a backlog and made it easy to add more fifos at compile
> time. If you want the loop unrolled then let the compiler do it
> via -funroll-loops. This kind of optimization seems as unnecessary
> to me as all the loopback optimizations.
>
I don't want to change compiler flags in my tree and lose this optimization when 2.6.13 is released.
I don't know about the loopback optimizations; I am not involved with that stuff, maybe you are thinking of another guy?
It seems to me you are giving unrelated arguments.
I don't know what your plans are, but mine was not to say you are writing bad code.
I just wanted to give my performance analysis and feedback; I'm sorry if it hurts you.
Eric Dumazet
* Re: [TG3]: About hw coalescing infrastructure.
2005-07-04 23:00 ` [TG3]: About hw coalescing infrastructure David S. Miller
@ 2005-07-05 16:14 ` Eric Dumazet
0 siblings, 0 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 16:14 UTC (permalink / raw)
To: David S. Miller; +Cc: mchan, netdev
David S. Miller wrote:
> SMP system? What platform and 570x chip revision?
>
dual opterons 248,
Ethernet controller: Broadcom Corporation NetXtreme BCM5702 Gigabit Ethernet (rev 02)
> Does the stock driver in 2.6.12 hang as well?
The stock driver in 2.6.11 or 2.6.12 causes hangs (crashes?) too, but no kernel messages are logged on disk.
I think it is crashing in cache_grow() (slab code), but I am not sure because I don't have access to the console.
The 2.6.13-rc1 tg3 driver seems to survive (running for 18 hours now), and the CPU used by network activity
was cut down :)
rx-usecs: 500
rx-frames: 30
rx-usecs-irq: 500
rx-frames-irq: 20
tx-usecs: 490
tx-frames: 53
tx-usecs-irq: 490
tx-frames-irq: 5
About 1200 IRQs/second now, with 30000 received packets/s and 25000 transmitted packets/s.
Thank you
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 15:58 ` Eric Dumazet
@ 2005-07-05 17:34 ` Thomas Graf
2005-07-05 21:22 ` David S. Miller
0 siblings, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 17:34 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CAAE2F.5070807@cosmosbay.com> 2005-07-05 17:58
> I'm using vanilla 2.6.12: no -funroll-loops in it. Maybe in your tree, not
> on 99.9% of 2.6.12 trees.
>
> Are you suggesting everybody should use this compiler flag?
> I don't know about the loopback optimization; I am not involved with this
> stuff. Maybe you are confusing me with someone else?
>
> It seems to me you are giving unrelated arguments.
Do as you wish, I don't feel like arguing about micro-optimizations.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 17:34 ` Thomas Graf
@ 2005-07-05 21:22 ` David S. Miller
2005-07-05 21:33 ` Thomas Graf
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-05 21:22 UTC (permalink / raw)
To: tgraf; +Cc: dada1, netdev
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 5 Jul 2005 19:34:11 +0200
> Do as you wish, I don't feel like argueing about micro optimizations.
I bet the performance gain really comes from the mispredicted
branches in the loop.
For loops of fixed duration, say 5 or 6 iterations or less, such a loop
totally defeats the branch prediction logic in most processors.
By the time the chip moves the I-cache branch state to "likely"
the loop has ended and we eat a mispredict.
I think the original patch is OK, hand unrolling the loop in
the C code. Adding -funroll-loops to the CFLAGS has lots of
implications, and in particular the embedded folks might not
be happy with some things that result from that.
So I'll apply the original unrolling patch for now.
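To make the hand unrolling concrete, here is a minimal sketch (not the actual pfifo_fast code, just an illustration of the technique under discussion) of scanning a small fixed number of queues without a loop; each test gets its own branch, so the predictor is not trained against a loop that almost never iterates:
/* Sketch only: find the first non-empty of three queues with three
 * independent, individually predictable branches instead of one
 * short loop whose exit branch keeps mispredicting. */
static struct sk_buff_head *first_nonempty(struct sk_buff_head *q)
{
	if (!skb_queue_empty(&q[0]))
		return &q[0];
	if (!skb_queue_empty(&q[1]))
		return &q[1];
	if (!skb_queue_empty(&q[2]))
		return &q[2];
	return NULL;
}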
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 7:38 ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
2005-07-05 11:51 ` Thomas Graf
@ 2005-07-05 21:26 ` David S. Miller
2005-07-28 15:52 ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
2 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-05 21:26 UTC (permalink / raw)
To: dada1; +Cc: netdev
Eric, I've told you this before many times. Please do something
so that your email client does not corrupt the patches. Once again,
your email client turned all the tab characters into spaces and
thus made the patch unusable.
Even though you used an attachment, the tab-->space transformation
still happened somehow.
Please fix this, and make a serious mental note to prevent this
somehow in the future, thanks a lot.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 21:22 ` David S. Miller
@ 2005-07-05 21:33 ` Thomas Graf
2005-07-05 21:35 ` David S. Miller
0 siblings, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 21:33 UTC (permalink / raw)
To: David S. Miller; +Cc: dada1, netdev
* David S. Miller <20050705.142210.14973612.davem@davemloft.net> 2005-07-05 14:22
> So I'll apply the original unrolling patch for now.
The patch must be changed to use __qdisc_dequeue_head() instead of
__skb_dequeue() or we screw up the backlog.
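For context, the difference between the two helpers is the backlog accounting: __qdisc_dequeue_head() also keeps the qdisc's byte backlog in sync, which a bare __skb_dequeue() skips. A rough sketch of what the 2.6.13-rc1 helper does (from memory, not a verbatim copy of include/net/sch_generic.h; the exact stats field is an assumption):
/* Sketch only: dequeue from the given band and, if a packet was
 * there, subtract its length from the qdisc's backlog statistics. */
static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch,
						   struct sk_buff_head *list)
{
	struct sk_buff *skb = __skb_dequeue(list);

	if (likely(skb != NULL))
		sch->qstats.backlog -= skb->len;
	return skb;
}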
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 21:33 ` Thomas Graf
@ 2005-07-05 21:35 ` David S. Miller
2005-07-05 23:16 ` Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-05 21:35 UTC (permalink / raw)
To: tgraf; +Cc: dada1, netdev
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 5 Jul 2005 23:33:55 +0200
> * David S. Miller <20050705.142210.14973612.davem@davemloft.net> 2005-07-05 14:22
> > So I'll apply the original unrolling patch for now.
>
> The patch must be changed to use __qdisc_dequeue_head() instead of
> __skb_dequeue() or we screw up the backlog.
Ok, good thing the patch didn't apply correctly anyways :)
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 21:35 ` David S. Miller
@ 2005-07-05 23:16 ` Eric Dumazet
2005-07-05 23:41 ` Thomas Graf
0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-05 23:16 UTC (permalink / raw)
To: David S. Miller, tgraf; +Cc: netdev
[-- Attachment #1: Type: text/plain, Size: 921 bytes --]
David S. Miller a écrit :
> From: Thomas Graf <tgraf@suug.ch>
> Date: Tue, 5 Jul 2005 23:33:55 +0200
>
>
>>* David S. Miller <20050705.142210.14973612.davem@davemloft.net> 2005-07-05 14:22
>>
>>>So I'll apply the original unrolling patch for now.
>>
>>The patch must be changed to use __qdisc_dequeue_head() instead of
>>__skb_dequeue() or we screw up the backlog.
>
>
> Ok, good thing the patch didn't apply correctly anyways :)
>
>
Oh well, I was unaware of the latest changes in 2.6.13-rc1 :(
Given the fact that the PFIFO_FAST_BANDS macro was introduced, I wonder if the patch should be this one or not...
Should we assume PFIFO_FAST_BANDS will stay at 3 or what ?
[NET]: unroll a small loop in pfifo_fast_dequeue(). The compiler generates better code.
oprofile says this function now uses 0.29% instead of 1.22%, on an x86_64 target.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: patch.sch_generic --]
[-- Type: text/plain, Size: 1015 bytes --]
--- linux-2.6.13-rc1/net/sched/sch_generic.c 2005-07-06 00:46:53.000000000 +0200
+++ linux-2.6.13-rc1-ed/net/sched/sch_generic.c 2005-07-06 01:05:04.000000000 +0200
@@ -328,18 +328,31 @@
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
- int prio;
struct sk_buff_head *list = qdisc_priv(qdisc);
- for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
- struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
- if (skb) {
- qdisc->q.qlen--;
- return skb;
+#if PFIFO_FAST_BANDS == 3
+ for (;;) {
+ if (!skb_queue_empty(list))
+ break;
+ list++;
+ if (!skb_queue_empty(list))
+ break;
+ list++;
+ if (!skb_queue_empty(list))
+ break;
+ return NULL;
}
- }
-
- return NULL;
+#else
+ int prio;
+ for (prio = 0;; list++) {
+ if (!skb_queue_empty(list))
+ break;
+ if (++prio == PFIFO_FAST_BANDS)
+ return NULL;
+ }
+#endif
+ qdisc->q.qlen--;
+ return __qdisc_dequeue_head(qdisc, list);
}
static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 23:16 ` Eric Dumazet
@ 2005-07-05 23:41 ` Thomas Graf
2005-07-05 23:45 ` David S. Miller
2005-07-06 0:32 ` Eric Dumazet
0 siblings, 2 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 23:41 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CB14B2.5090601@cosmosbay.com> 2005-07-06 01:16
> Oh well, I was unaware of last changes in 2.6.13-rc1 :(
Ok, this clarifies a lot for me, I was under the impression
you knew about these changes.
> Given the fact that the PFIFO_FAST_BANDS macro was introduced, I wonder if
> the patch should be this one or not...
> Should we assume PFIFO_FAST_BANDS will stay at 3 or what ?
It is very unlikely to change within mainline, but the idea behind
it is to allow it to be changed at compile time.
I still think we can fix this performance issue without manually
unrolling the loop, or we should at least try to. In the end gcc
should notice the constant part of the loop and move it out, so
basically the only differences should be the additional prio++ and
possibly a failing branch prediction.
What about this? I'm still not sure where exactly all the time
is lost so this is a shot in the dark.
Index: net-2.6/net/sched/sch_generic.c
===================================================================
--- net-2.6.orig/net/sched/sch_generic.c
+++ net-2.6/net/sched/sch_generic.c
@@ -330,10 +330,11 @@ static struct sk_buff *pfifo_fast_dequeu
{
int prio;
struct sk_buff_head *list = qdisc_priv(qdisc);
+ struct sk_buff *skb;
- for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
- struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
- if (skb) {
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ if (!skb_queue_empty(list + prio)) {
+ skb = __qdisc_dequeue_head(qdisc, list);
qdisc->q.qlen--;
return skb;
}
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 23:41 ` Thomas Graf
@ 2005-07-05 23:45 ` David S. Miller
2005-07-05 23:55 ` Thomas Graf
2005-07-06 0:32 ` Eric Dumazet
1 sibling, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-05 23:45 UTC (permalink / raw)
To: tgraf; +Cc: dada1, netdev
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 6 Jul 2005 01:41:04 +0200
> I still think we can fix this performance issue without manually
> unrolling the loop or we should at least try to. In the end gcc
> should notice the constant part of the loop and move it out so
> basically the only difference should the additional prio++ and
> possibly a failing branch prediction.
But the branch prediction is where I personally think a lot
of the lossage is coming from. Mispredicts can cost upwards of 20
or 30 processor cycles, easily. That's getting close to the
cost of an L2 cache miss.
I see the difficulties with this change now, why don't we revisit
this some time in the future?
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 23:45 ` David S. Miller
@ 2005-07-05 23:55 ` Thomas Graf
0 siblings, 0 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-05 23:55 UTC (permalink / raw)
To: David S. Miller; +Cc: dada1, netdev
* David S. Miller <20050705.164503.104035718.davem@davemloft.net> 2005-07-05 16:45
> From: Thomas Graf <tgraf@suug.ch>
> Date: Wed, 6 Jul 2005 01:41:04 +0200
>
> > I still think we can fix this performance issue without manually
> > unrolling the loop or we should at least try to. In the end gcc
> > should notice the constant part of the loop and move it out so
> > basically the only difference should the additional prio++ and
> > possibly a failing branch prediction.
>
> But the branch prediction is where I personally think a lot
> of the lossage is coming from. These can cost upwards of 20
> or 30 processor cycles, easily. That's getting close to the
> cost of a L2 cache miss.
Absolutely. I think what happens is that we produce prediction
failures due to the logic within qdisc_dequeue_head(); I
cannot back this up with numbers, though.
> I see the difficulties with this change now, why don't we revisit
> this some time in the future?
Fine with me.
Eric, the patch I just posted should result in the same branch
prediction as your loop unrolling. The only additional overhead
we still have is the list + prio thing and an additional conditional
jump to do the loop. If you have the cycles etc. it would be nice
to compare it with your numbers.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-05 23:41 ` Thomas Graf
2005-07-05 23:45 ` David S. Miller
@ 2005-07-06 0:32 ` Eric Dumazet
2005-07-06 0:51 ` Thomas Graf
2005-07-06 0:53 ` Eric Dumazet
1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06 0:32 UTC (permalink / raw)
To: Thomas Graf; +Cc: David S. Miller, netdev
Thomas Graf a écrit :
> I still think we can fix this performance issue without manually
> unrolling the loop or we should at least try to. In the end gcc
> should notice the constant part of the loop and move it out so
> basically the only difference should the additional prio++ and
> possibly a failing branch prediction.
>
> What about this? I'm still not sure where exactly all the time
> is lost so this is a shot in the dark.
>
> Index: net-2.6/net/sched/sch_generic.c
> ===================================================================
> --- net-2.6.orig/net/sched/sch_generic.c
> +++ net-2.6/net/sched/sch_generic.c
> @@ -330,10 +330,11 @@ static struct sk_buff *pfifo_fast_dequeu
> {
> int prio;
> struct sk_buff_head *list = qdisc_priv(qdisc);
> + struct sk_buff *skb;
>
> - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
> - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
> - if (skb) {
> + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
> + if (!skb_queue_empty(list + prio)) {
> + skb = __qdisc_dequeue_head(qdisc, list);
> qdisc->q.qlen--;
> return skb;
> }
>
>
Hum... shouldn't it be:
+ skb = __qdisc_dequeue_head(qdisc, list + prio);
?
Anyway, the branch mispredictions come from the fact that most packets are queued in the prio=2 list.
So each time this function is called, a non-unrolled version has to pay 2 to 5 branch mispredictions.
if (!skb_queue_empty(list + prio)) /* branch not taken, mispredicted when prio=0 */
if (!skb_queue_empty(list + prio)) /* branch not taken, mispredicted when prio=1 */
if (!skb_queue_empty(list + prio)) /* branch taken (or not if the queue is really empty), mispredicted when prio=2 */
Maybe we can rewrite the whole thing without branches, examining prio from PFIFO_FAST_BANDS-1 down to 0, at least for modern cpu with
conditional mov (cmov)
struct sk_buff_head *best = NULL;
struct sk_buff_head *list = qdisc_priv(qdisc)+PFIFO_FAST_BANDS-1;
if (!skb_queue_empty(list)) best = list;
list--;
if (!skb_queue_empty(list)) best = list;
list--;
if (!skb_queue_empty(list)) best = list;
if (best != NULL) {
qdisc->q.qlen--;
return __qdisc_dequeue_head(qdisc, best);
}
This version should have one branch.
I will test this after some sleep :)
See you
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 0:32 ` Eric Dumazet
@ 2005-07-06 0:51 ` Thomas Graf
2005-07-06 1:04 ` Eric Dumazet
2005-07-06 0:53 ` Eric Dumazet
1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-06 0:51 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CB2698.2080904@cosmosbay.com> 2005-07-06 02:32
> Hum... shouldnt it be :
>
> + skb = __qdisc_dequeue_head(qdisc, list + prio);
>
Correct.
> Anyway, the branch mispredictions come from the fact that most packets
> are queued in the prio=2 list.
>
> So each time this function is called, a non-unrolled version has to pay
> 2 to 5 branch mispredictions.
>
> if (!skb_queue_empty(list + prio)) /* branch not taken, mispredicted
> when prio=0 */
The !expr implies an unlikely, so the prediction should be right and
equal to your unrolled version.
> Maybe we can rewrite the whole thing without branches, examining prio from
> PFIFO_FAST_BANDS-1 down to 0, at least for modern cpu with conditional mov
> (cmov)
This would break the whole thing, the qdisc is supposed to try and
dequeue from the highest priority queue (prio=0) first.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 0:32 ` Eric Dumazet
2005-07-06 0:51 ` Thomas Graf
@ 2005-07-06 0:53 ` Eric Dumazet
2005-07-06 1:02 ` Thomas Graf
2005-07-06 12:42 ` Thomas Graf
1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06 0:53 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Thomas Graf, David S. Miller, netdev
Eric Dumazet a écrit :
>
>
> Maybe we can rewrite the whole thing without branches, examining prio
> from PFIFO_FAST_BANDS-1 down to 0, at least for modern cpu with
> conditional mov (cmov)
>
> struct sk_buff_head *best = NULL;
> struct sk_buff_head *list = qdisc_priv(qdisc)+PFIFO_FAST_BANDS-1;
> if (!skb_queue_empty(list)) best = list;
> list--;
> if (!skb_queue_empty(list)) best = list;
> list--;
> if (!skb_queue_empty(list)) best = list;
> if (best != NULL) {
> qdisc->q.qlen--;
> return __qdisc_dequeue_head(qdisc, best);
> }
>
> This version should have one branch.
> I will test this after some sleep :)
> See you
> Eric
>
>
(Sorry, still using 2.6.12, but the idea remains)
static struct sk_buff *
pfifo_fast_dequeue(struct Qdisc* qdisc)
{
struct sk_buff_head *list = qdisc_priv(qdisc);
struct sk_buff_head *best = NULL;
list += 2;
if (!skb_queue_empty(list))
best = list;
list--;
if (!skb_queue_empty(list))
best = list;
list--;
if (!skb_queue_empty(list))
best = list;
if (best) {
qdisc->q.qlen--;
return __skb_dequeue(best);
}
return NULL;
}
At least the compiler output seems promising:
0000000000000550 <pfifo_fast_dequeue>:
550: 48 8d 97 f0 00 00 00 lea 0xf0(%rdi),%rdx
557: 31 c9 xor %ecx,%ecx
559: 48 8d 87 c0 00 00 00 lea 0xc0(%rdi),%rax
560: 48 39 97 f0 00 00 00 cmp %rdx,0xf0(%rdi)
567: 48 0f 45 ca cmovne %rdx,%rcx
56b: 48 8d 97 d8 00 00 00 lea 0xd8(%rdi),%rdx
572: 48 39 97 d8 00 00 00 cmp %rdx,0xd8(%rdi)
579: 48 0f 45 ca cmovne %rdx,%rcx
57d: 48 39 87 c0 00 00 00 cmp %rax,0xc0(%rdi)
584: 48 0f 45 c8 cmovne %rax,%rcx
588: 31 c0 xor %eax,%eax
58a: 48 85 c9 test %rcx,%rcx
58d: 74 32 je 5c1 <pfifo_fast_dequeue+0x71> // one conditional branch
58f: ff 4f 40 decl 0x40(%rdi)
592: 48 8b 11 mov (%rcx),%rdx
595: 48 39 ca cmp %rcx,%rdx
598: 74 27 je 5c1 <pfifo_fast_dequeue+0x71> // never taken branch : always predicted OK
59a: 48 89 d0 mov %rdx,%rax
59d: 48 8b 12 mov (%rdx),%rdx
5a0: ff 49 10 decl 0x10(%rcx)
5a3: 48 c7 40 10 00 00 00 movq $0x0,0x10(%rax)
5aa: 00
5ab: 48 89 4a 08 mov %rcx,0x8(%rdx)
5af: 48 89 11 mov %rdx,(%rcx)
5b2: 48 c7 40 08 00 00 00 movq $0x0,0x8(%rax)
5b9: 00
5ba: 48 c7 00 00 00 00 00 movq $0x0,(%rax)
5c1: 90 nop
5c2: c3 retq
I will post some profiling results tomorrow.
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 0:53 ` Eric Dumazet
@ 2005-07-06 1:02 ` Thomas Graf
2005-07-06 1:09 ` Eric Dumazet
2005-07-06 12:42 ` Thomas Graf
1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-06 1:02 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CB2B84.50702@cosmosbay.com> 2005-07-06 02:53
> (Sorry, still using 2.6.12, but the idea remains)
I think you got me wrong, the whole point of this qdisc
is to prioritize which means that we cannot dequeue from
prio 1 as long as the queue in prio 0 is not empty.
If you have no traffic at all for prio=0 and prio=1 then
the best solution is to replace the qdisc on the device
with a simple fifo.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 0:51 ` Thomas Graf
@ 2005-07-06 1:04 ` Eric Dumazet
2005-07-06 1:07 ` Thomas Graf
0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06 1:04 UTC (permalink / raw)
To: Thomas Graf; +Cc: David S. Miller, netdev
Thomas Graf a écrit :
>>Maybe we can rewrite the whole thing without branches, examining prio from
>>PFIFO_FAST_BANDS-1 down to 0, at least for modern cpu with conditional mov
>>(cmov)
>
>
> This would break the whole thing, the qdisc is supposed to try and
> dequeue from the highest priority queue (prio=0) first.
>
>
I still dequeue a packet from the highest priority queue.
But nothing prevents us from looking at the three queues in reverse order, if that lets us avoid the conditional branches.
There is no memory penalty, since most of the time we were looking at all three queues anyway, and the 3 sk_buff_head structures are in the same cache line.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 1:04 ` Eric Dumazet
@ 2005-07-06 1:07 ` Thomas Graf
0 siblings, 0 replies; 64+ messages in thread
From: Thomas Graf @ 2005-07-06 1:07 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CB2E24.6010303@cosmosbay.com> 2005-07-06 03:04
> Thomas Graf a écrit :
>
> >>Maybe we can rewrite the whole thing without branches, examining prio
> >>from PFIFO_FAST_BANDS-1 down to 0, at least for modern cpu with
> >>conditional mov (cmov)
> >
> >
> >This would break the whole thing, the qdisc is supposed to try and
> >dequeue from the highest priority queue (prio=0) first.
> >
> >
>
> I still dequeue a packet from the highest priority queue.
Ahh... sorry, I misread your patch, interesting idea. I'll
be waiting for your numbers.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 1:02 ` Thomas Graf
@ 2005-07-06 1:09 ` Eric Dumazet
0 siblings, 0 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-06 1:09 UTC (permalink / raw)
To: Thomas Graf; +Cc: David S. Miller, netdev
Thomas Graf a écrit :
> I think you got me wrong, the whole point of this qdisc
> is to prioritize which means that we cannot dequeue from
> prio 1 as long as the queue in prio 0 is not empty.
If prio 0 is not empty, then the last
if (!skb_queue_empty(list))
best = list;
will set 'best' to the prio 0 list, and we dequeue the packet from the prio 0 list, not from prio 1 or prio 2.
>
> If you have no traffic at all for prio=0 and prio=1 then
> the best solution is to replace the qdisc on the device
> with a simple fifo.
Yes sure, but I know that already. Unfortunately I have some traffic on prio=1 and prio=0 (about 5%).
Thank you
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 0:53 ` Eric Dumazet
2005-07-06 1:02 ` Thomas Graf
@ 2005-07-06 12:42 ` Thomas Graf
2005-07-07 21:17 ` David S. Miller
1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-06 12:42 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
* Eric Dumazet <42CB2B84.50702@cosmosbay.com> 2005-07-06 02:53
A short recap after some coffee and sleep:
The initial issue you brought up, and backed up with numbers, is
probably caused by multiple wrong branch predictions due to the fact
that I wrote skb = dequeue(); if (skb), which the compiler assumed to
be likely. In your patch you fixed this with !skb_queue_empty(), which
fixed the wrong prediction and also acts as a little optimization,
since skb_queue_empty() is really simple for the compiler to implement.
The patch I posted should give almost the same result: apart from the
additional wrong branch prediction for the loop, it always has one
wrong prediction, namely when we hit a non-empty queue.
In your unrolled version you could optimize even more by taking
advantage of the fact that prio=2 is the most likely non-empty queue:
change that check to likely() and save a wrong branch prediction in the
common case, at the cost of a misprediction if all queues are empty.
The patch I posted results in something like this:
pfifo_fast_dequeue:
pushl %ebx
xorl %ecx, %ecx
movl 8(%esp), %ebx
leal 128(%ebx), %edx
.L129:
movl (%edx), %eax
cmpl %edx, %eax
jne .L132 ; if (!skb_queue_empty())
incl %ecx
addl $20, %edx
cmpl $2, %ecx
jle .L129 ; end of loop
xorl %eax, %eax ; all queues empty
.L117:
popl %ebx
ret
I regard the miss here as acceptable for the increased flexibility
we get. It could be optimized by unrolling the loop, but my opinion
is to try to avoid that if possible.
Now your second thought is quite interesting, although it heavily
depends on the fact that prio=2 is the most frequently used band. It
will be interesting to see some numbers.
> static struct sk_buff *
> pfifo_fast_dequeue(struct Qdisc* qdisc)
> {
> struct sk_buff_head *list = qdisc_priv(qdisc);
> struct sk_buff_head *best = NULL;
>
> list += 2;
> if (!skb_queue_empty(list))
> best = list;
> list--;
> if (!skb_queue_empty(list))
> best = list;
> list--;
> if (!skb_queue_empty(list))
> best = list;
Here is what I mean (see the sketch after the quoted code); a likely() should be even better.
> if (best) {
> qdisc->q.qlen--;
> return __skb_dequeue(best);
> }
> return NULL;
> }
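A minimal sketch of that suggestion, reusing the list/best locals from Eric's function and assuming prio=2 really is the band that carries most of the traffic:
/* Sketch only: annotate the band assumed to be busiest so the
 * compiler lays out the common path first; the other two checks
 * are left unannotated. */
list += 2;
if (likely(!skb_queue_empty(list)))
	best = list;
list--;
if (!skb_queue_empty(list))
	best = list;
list--;
if (!skb_queue_empty(list))
	best = list;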
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-06 12:42 ` Thomas Graf
@ 2005-07-07 21:17 ` David S. Miller
2005-07-07 21:34 ` Thomas Graf
[not found] ` <42CE22CE.7030902@cosmosbay.com>
0 siblings, 2 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-07 21:17 UTC (permalink / raw)
To: tgraf; +Cc: dada1, netdev
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 6 Jul 2005 14:42:06 +0200
> The initial issue you brought up, and backed up with numbers, is
> probably caused by multiple wrong branch predictions due to the fact
> that I wrote skb = dequeue(); if (skb), which the compiler assumed to
> be likely. In your patch you fixed this with !skb_queue_empty(), which
> fixed the wrong prediction and also acts as a little optimization,
> since skb_queue_empty() is really simple for the compiler to implement.
As an aside, this reminds me that as part of my quest to make
sk_buff smaller, I intend to walk across the tree and change
all tests of the form:
if (!skb_queue_len(list))
into:
if (skb_queue_empty(list))
It would be really nice, after the above transformation and some
others, to get rid of sk_buff_head->qlen. Why? Because that
also allows us to remove the skb->list member as well, since its
only reason for existing is so that the SKB queue removal
routines can decrement the queue length.
That's kind of silly, and most SKB lists in the kernel do not care
about the queue length at all. Rather, they care about empty and
non-empty. The cases that do care (mostly packet schedulers) can
keep track of the queue length themselves in their private data
structures. When they remove packets, they _know_ which queue to
decrement the queue length of.
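As an illustration of that last point, a hypothetical leaf qdisc could carry the counter in its own private area; the struct and function names below are made up for the sketch:
/* Sketch only: the qdisc owns both the list and its length, so a
 * dequeue decrements exactly the counter that belongs to the queue
 * it touched, and sk_buff_head no longer needs a qlen of its own. */
struct myfifo_priv {
	struct sk_buff_head	queue;
	unsigned int		qlen;
};

static struct sk_buff *myfifo_dequeue(struct Qdisc *sch)
{
	struct myfifo_priv *priv = qdisc_priv(sch);
	struct sk_buff *skb = __skb_dequeue(&priv->queue);

	if (skb)
		priv->qlen--;
	return skb;
}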
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-07 21:17 ` David S. Miller
@ 2005-07-07 21:34 ` Thomas Graf
2005-07-07 22:24 ` David S. Miller
[not found] ` <42CE22CE.7030902@cosmosbay.com>
1 sibling, 1 reply; 64+ messages in thread
From: Thomas Graf @ 2005-07-07 21:34 UTC (permalink / raw)
To: David S. Miller; +Cc: dada1, netdev
* David S. Miller <20050707.141718.85410359.davem@davemloft.net> 2005-07-07 14:17
> As an aside, this reminds me that as part of my quest to make
> sk_buff smaller, I intend to walk across the tree and change
> all tests of the form:
>
> if (!skb_queue_len(list))
>
> into:
>
> if (skb_queue_empty(list))
>
> [...]
>
> The cases that do care (mostly packet schedulers) can
> keep track of the queue length themselves in their private data
> structures. When they remove packets, they _know_ which queue to
> decrement the queue length of.
Since I'm changing the classful qdiscs to use a generic API
for queue management anyway I could take care of this if you want.
WRT the leaf qdiscs it's a bit more complicated, since we have
to change the new API to take a new struct which includes the qlen
and the sk_buff_head, but that's not a problem either.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-07 21:34 ` Thomas Graf
@ 2005-07-07 22:24 ` David S. Miller
0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-07 22:24 UTC (permalink / raw)
To: tgraf; +Cc: dada1, netdev
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 7 Jul 2005 23:34:50 +0200
> Since I'm changing the classful qdiscs to use a generic API
> for queue management anyway I could take care of this if you want.
> WRT the leaf qdiscs it's a bit more complicated since we have
> to change the new API to take a new struct which includes the qlen
> and the sk_buff_head but not a problem either.
Ok. I'm going to check something like the following into
my tree. It takes care of the obvious cases: direct
binary tests of the queue length being zero vs. non-zero.
This uncovered some seriously questionable stuff along the way.
For example, take a look at drivers/usb/net/usbnet.c:usbnet_stop().
That code seems to want to wait until all the SKB queues are empty,
but the way it is coded it only waits if all the queues have at least
one packet.
I preserved the behavior there, but if someone could verify my
analysis and post a bug fix, I'd really appreciate it.
Thanks.
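For reference, a sketch of what the usbnet_stop() wait loop would look like if it really waited until every queue drained; this is only the suspected intent, not what the patch below does, since the patch deliberately preserves the existing && logic:
/* Sketch only: keep sleeping while *any* of the three queues still
 * holds entries, using the same identifiers the driver already has. */
while (!skb_queue_empty(&dev->rxq) ||
       !skb_queue_empty(&dev->txq) ||
       !skb_queue_empty(&dev->done)) {
	msleep(UNLINK_TIMEOUT_MS);
	if (netif_msg_ifdown(dev))
		devdbg(dev, "waited for %d urb completions", temp);
}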
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -120,7 +120,7 @@ static unsigned int hci_vhci_chr_poll(st
poll_wait(file, &hci_vhci->read_wait, wait);
- if (skb_queue_len(&hci_vhci->readq))
+ if (!skb_queue_empty(&hci_vhci->readq))
return POLLIN | POLLRDNORM;
return POLLOUT | POLLWRNORM;
diff --git a/drivers/isdn/hisax/isdnl1.c b/drivers/isdn/hisax/isdnl1.c
--- a/drivers/isdn/hisax/isdnl1.c
+++ b/drivers/isdn/hisax/isdnl1.c
@@ -279,7 +279,8 @@ BChannel_proc_xmt(struct BCState *bcs)
if (test_and_clear_bit(FLG_L1_PULL_REQ, &st->l1.Flags))
st->l1.l1l2(st, PH_PULL | CONFIRM, NULL);
if (!test_bit(BC_FLG_ACTIV, &bcs->Flag)) {
- if (!test_bit(BC_FLG_BUSY, &bcs->Flag) && (!skb_queue_len(&bcs->squeue))) {
+ if (!test_bit(BC_FLG_BUSY, &bcs->Flag) &&
+ skb_queue_empty(&bcs->squeue)) {
st->l2.l2l1(st, PH_DEACTIVATE | CONFIRM, NULL);
}
}
diff --git a/drivers/isdn/hisax/isdnl2.c b/drivers/isdn/hisax/isdnl2.c
--- a/drivers/isdn/hisax/isdnl2.c
+++ b/drivers/isdn/hisax/isdnl2.c
@@ -108,7 +108,8 @@ static int l2addrsize(struct Layer2 *l2)
static void
set_peer_busy(struct Layer2 *l2) {
test_and_set_bit(FLG_PEER_BUSY, &l2->flag);
- if (skb_queue_len(&l2->i_queue) || skb_queue_len(&l2->ui_queue))
+ if (!skb_queue_empty(&l2->i_queue) ||
+ !skb_queue_empty(&l2->ui_queue))
test_and_set_bit(FLG_L2BLOCK, &l2->flag);
}
@@ -754,7 +755,7 @@ l2_restart_multi(struct FsmInst *fi, int
st->l2.l2l3(st, DL_ESTABLISH | INDICATION, NULL);
if ((ST_L2_7==state) || (ST_L2_8 == state))
- if (skb_queue_len(&st->l2.i_queue) && cansend(st))
+ if (!skb_queue_empty(&st->l2.i_queue) && cansend(st))
st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
}
@@ -810,7 +811,7 @@ l2_connected(struct FsmInst *fi, int eve
if (pr != -1)
st->l2.l2l3(st, pr, NULL);
- if (skb_queue_len(&st->l2.i_queue) && cansend(st))
+ if (!skb_queue_empty(&st->l2.i_queue) && cansend(st))
st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
}
@@ -1014,7 +1015,7 @@ l2_st7_got_super(struct FsmInst *fi, int
if(typ != RR) FsmDelTimer(&st->l2.t203, 9);
restart_t200(st, 12);
}
- if (skb_queue_len(&st->l2.i_queue) && (typ == RR))
+ if (!skb_queue_empty(&st->l2.i_queue) && (typ == RR))
st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
} else
nrerrorrecovery(fi);
@@ -1120,7 +1121,7 @@ l2_got_iframe(struct FsmInst *fi, int ev
return;
}
- if (skb_queue_len(&st->l2.i_queue) && (fi->state == ST_L2_7))
+ if (!skb_queue_empty(&st->l2.i_queue) && (fi->state == ST_L2_7))
st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
if (test_and_clear_bit(FLG_ACK_PEND, &st->l2.flag))
enquiry_cr(st, RR, RSP, 0);
@@ -1138,7 +1139,7 @@ l2_got_tei(struct FsmInst *fi, int event
test_and_set_bit(FLG_L3_INIT, &st->l2.flag);
} else
FsmChangeState(fi, ST_L2_4);
- if (skb_queue_len(&st->l2.ui_queue))
+ if (!skb_queue_empty(&st->l2.ui_queue))
tx_ui(st);
}
@@ -1301,7 +1302,7 @@ l2_pull_iqueue(struct FsmInst *fi, int e
FsmDelTimer(&st->l2.t203, 13);
FsmAddTimer(&st->l2.t200, st->l2.T200, EV_L2_T200, NULL, 11);
}
- if (skb_queue_len(&l2->i_queue) && cansend(st))
+ if (!skb_queue_empty(&l2->i_queue) && cansend(st))
st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
}
@@ -1347,7 +1348,7 @@ l2_st8_got_super(struct FsmInst *fi, int
}
invoke_retransmission(st, nr);
FsmChangeState(fi, ST_L2_7);
- if (skb_queue_len(&l2->i_queue) && cansend(st))
+ if (!skb_queue_empty(&l2->i_queue) && cansend(st))
st->l2.l2l1(st, PH_PULL | REQUEST, NULL);
} else
nrerrorrecovery(fi);
diff --git a/drivers/isdn/hisax/isdnl3.c b/drivers/isdn/hisax/isdnl3.c
--- a/drivers/isdn/hisax/isdnl3.c
+++ b/drivers/isdn/hisax/isdnl3.c
@@ -302,7 +302,7 @@ release_l3_process(struct l3_process *p)
!test_bit(FLG_PTP, &p->st->l2.flag)) {
if (p->debug)
l3_debug(p->st, "release_l3_process: last process");
- if (!skb_queue_len(&p->st->l3.squeue)) {
+ if (skb_queue_empty(&p->st->l3.squeue)) {
if (p->debug)
l3_debug(p->st, "release_l3_process: release link");
if (p->st->protocol != ISDN_PTYPE_NI1)
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -1223,7 +1223,7 @@ isdn_tty_write(struct tty_struct *tty, c
total += c;
}
atomic_dec(&info->xmit_lock);
- if ((info->xmit_count) || (skb_queue_len(&info->xmit_queue))) {
+ if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue)) {
if (m->mdmreg[REG_DXMT] & BIT_DXMT) {
isdn_tty_senddown(info);
isdn_tty_tint(info);
@@ -1284,7 +1284,7 @@ isdn_tty_flush_chars(struct tty_struct *
if (isdn_tty_paranoia_check(info, tty->name, "isdn_tty_flush_chars"))
return;
- if ((info->xmit_count) || (skb_queue_len(&info->xmit_queue)))
+ if ((info->xmit_count) || !skb_queue_empty(&info->xmit_queue))
isdn_timer_ctrl(ISDN_TIMER_MODEMXMIT, 1);
}
diff --git a/drivers/isdn/icn/icn.c b/drivers/isdn/icn/icn.c
--- a/drivers/isdn/icn/icn.c
+++ b/drivers/isdn/icn/icn.c
@@ -304,12 +304,12 @@ icn_pollbchan_send(int channel, icn_card
isdn_ctrl cmd;
if (!(card->sndcount[channel] || card->xskb[channel] ||
- skb_queue_len(&card->spqueue[channel])))
+ !skb_queue_empty(&card->spqueue[channel])))
return;
if (icn_trymaplock_channel(card, mch)) {
while (sbfree &&
(card->sndcount[channel] ||
- skb_queue_len(&card->spqueue[channel]) ||
+ !skb_queue_empty(&card->spqueue[channel]) ||
card->xskb[channel])) {
spin_lock_irqsave(&card->lock, flags);
if (card->xmit_lock[channel]) {
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -304,7 +304,7 @@ static inline void scc_discard_buffers(s
scc->tx_buff = NULL;
}
- while (skb_queue_len(&scc->tx_queue))
+ while (!skb_queue_empty(&scc->tx_queue))
dev_kfree_skb(skb_dequeue(&scc->tx_queue));
spin_unlock_irqrestore(&scc->lock, flags);
@@ -1126,8 +1126,7 @@ static void t_dwait(unsigned long channe
if (scc->stat.tx_state == TXS_WAIT) /* maxkeyup or idle timeout */
{
- if (skb_queue_len(&scc->tx_queue) == 0) /* nothing to send */
- {
+ if (skb_queue_empty(&scc->tx_queue)) { /* nothing to send */
scc->stat.tx_state = TXS_IDLE;
netif_wake_queue(scc->dev); /* t_maxkeyup locked it. */
return;
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -364,7 +364,7 @@ ppp_asynctty_receive(struct tty_struct *
spin_lock_irqsave(&ap->recv_lock, flags);
ppp_async_input(ap, buf, cflags, count);
spin_unlock_irqrestore(&ap->recv_lock, flags);
- if (skb_queue_len(&ap->rqueue))
+ if (!skb_queue_empty(&ap->rqueue))
tasklet_schedule(&ap->tsk);
ap_put(ap);
if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1237,8 +1237,8 @@ static int ppp_mp_explode(struct ppp *pp
pch = list_entry(list, struct channel, clist);
navail += pch->avail = (pch->chan != NULL);
if (pch->avail) {
- if (skb_queue_len(&pch->file.xq) == 0
- || !pch->had_frag) {
+ if (skb_queue_empty(&pch->file.xq) ||
+ !pch->had_frag) {
pch->avail = 2;
++nfree;
}
@@ -1374,8 +1374,8 @@ static int ppp_mp_explode(struct ppp *pp
/* try to send it down the channel */
chan = pch->chan;
- if (skb_queue_len(&pch->file.xq)
- || !chan->ops->start_xmit(chan, frag))
+ if (!skb_queue_empty(&pch->file.xq) ||
+ !chan->ops->start_xmit(chan, frag))
skb_queue_tail(&pch->file.xq, frag);
pch->had_frag = 1;
p += flen;
@@ -1412,7 +1412,7 @@ ppp_channel_push(struct channel *pch)
spin_lock_bh(&pch->downl);
if (pch->chan != 0) {
- while (skb_queue_len(&pch->file.xq) > 0) {
+ while (!skb_queue_empty(&pch->file.xq)) {
skb = skb_dequeue(&pch->file.xq);
if (!pch->chan->ops->start_xmit(pch->chan, skb)) {
/* put the packet back and try again later */
@@ -1426,7 +1426,7 @@ ppp_channel_push(struct channel *pch)
}
spin_unlock_bh(&pch->downl);
/* see if there is anything from the attached unit to be sent */
- if (skb_queue_len(&pch->file.xq) == 0) {
+ if (skb_queue_empty(&pch->file.xq)) {
read_lock_bh(&pch->upl);
ppp = pch->ppp;
if (ppp != 0)
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -406,7 +406,7 @@ ppp_sync_receive(struct tty_struct *tty,
spin_lock_irqsave(&ap->recv_lock, flags);
ppp_sync_input(ap, buf, cflags, count);
spin_unlock_irqrestore(&ap->recv_lock, flags);
- if (skb_queue_len(&ap->rqueue))
+ if (!skb_queue_empty(&ap->rqueue))
tasklet_schedule(&ap->tsk);
sp_put(ap);
if (test_and_clear_bit(TTY_THROTTLED, &tty->flags)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -215,7 +215,7 @@ static unsigned int tun_chr_poll(struct
poll_wait(file, &tun->read_wait, wait);
- if (skb_queue_len(&tun->readq))
+ if (!skb_queue_empty(&tun->readq))
mask |= POLLIN | POLLRDNORM;
return mask;
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2374,7 +2374,7 @@ void stop_airo_card( struct net_device *
/*
* Clean out tx queue
*/
- if (test_bit(FLAG_MPI, &ai->flags) && skb_queue_len (&ai->txq) > 0) {
+ if (test_bit(FLAG_MPI, &ai->flags) && !skb_queue_empty(&ai->txq)) {
struct sk_buff *skb = NULL;
for (;(skb = skb_dequeue(&ai->txq));)
dev_kfree_skb(skb);
@@ -3287,7 +3287,7 @@ exitrx:
if (status & EV_TXEXC)
get_tx_error(apriv, -1);
spin_lock_irqsave(&apriv->aux_lock, flags);
- if (skb_queue_len (&apriv->txq)) {
+ if (!skb_queue_empty(&apriv->txq)) {
spin_unlock_irqrestore(&apriv->aux_lock,flags);
mpi_send_packet (dev);
} else {
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c
--- a/drivers/s390/net/claw.c
+++ b/drivers/s390/net/claw.c
@@ -428,7 +428,7 @@ claw_pack_skb(struct claw_privbk *privpt
new_skb = NULL; /* assume no dice */
pkt_cnt = 0;
CLAW_DBF_TEXT(4,trace,"PackSKBe");
- if (skb_queue_len(&p_ch->collect_queue) > 0) {
+ if (!skb_queue_empty(&p_ch->collect_queue)) {
/* some data */
held_skb = skb_dequeue(&p_ch->collect_queue);
if (p_env->packing != DO_PACKED)
@@ -1254,7 +1254,7 @@ claw_write_next ( struct chbk * p_ch )
privptr = (struct claw_privbk *) dev->priv;
claw_free_wrt_buf( dev );
if ((privptr->write_free_count > 0) &&
- (skb_queue_len(&p_ch->collect_queue) > 0)) {
+ !skb_queue_empty(&p_ch->collect_queue)) {
pk_skb = claw_pack_skb(privptr);
while (pk_skb != NULL) {
rc = claw_hw_tx( pk_skb, dev,1);
diff --git a/drivers/s390/net/ctctty.c b/drivers/s390/net/ctctty.c
--- a/drivers/s390/net/ctctty.c
+++ b/drivers/s390/net/ctctty.c
@@ -156,7 +156,7 @@ ctc_tty_readmodem(ctc_tty_info *info)
skb_queue_head(&info->rx_queue, skb);
else {
kfree_skb(skb);
- ret = skb_queue_len(&info->rx_queue);
+ ret = !skb_queue_empty(&info->rx_queue);
}
}
}
@@ -530,7 +530,7 @@ ctc_tty_write(struct tty_struct *tty, co
total += c;
count -= c;
}
- if (skb_queue_len(&info->tx_queue)) {
+ if (!skb_queue_empty(&info->tx_queue)) {
info->lsr &= ~UART_LSR_TEMT;
tasklet_schedule(&info->tasklet);
}
@@ -594,7 +594,7 @@ ctc_tty_flush_chars(struct tty_struct *t
return;
if (ctc_tty_paranoia_check(info, tty->name, "ctc_tty_flush_chars"))
return;
- if (tty->stopped || tty->hw_stopped || (!skb_queue_len(&info->tx_queue)))
+ if (tty->stopped || tty->hw_stopped || skb_queue_empty(&info->tx_queue))
return;
tasklet_schedule(&info->tasklet);
}
diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c
--- a/drivers/usb/net/usbnet.c
+++ b/drivers/usb/net/usbnet.c
@@ -3227,9 +3227,9 @@ static int usbnet_stop (struct net_devic
temp = unlink_urbs (dev, &dev->txq) + unlink_urbs (dev, &dev->rxq);
// maybe wait for deletions to finish.
- while (skb_queue_len (&dev->rxq)
- && skb_queue_len (&dev->txq)
- && skb_queue_len (&dev->done)) {
+ while (!skb_queue_empty(&dev->rxq) &&
+ !skb_queue_empty(&dev->txq) &&
+ !skb_queue_empty(&dev->done)) {
msleep(UNLINK_TIMEOUT_MS);
if (netif_msg_ifdown (dev))
devdbg (dev, "waited for %d urb completions", temp);
diff --git a/include/net/irda/irda_device.h b/include/net/irda/irda_device.h
--- a/include/net/irda/irda_device.h
+++ b/include/net/irda/irda_device.h
@@ -224,7 +224,7 @@ int irda_device_is_receiving(struct net
/* Interface for internal use */
static inline int irda_device_txqueue_empty(const struct net_device *dev)
{
- return (skb_queue_len(&dev->qdisc->q) == 0);
+ return skb_queue_empty(&dev->qdisc->q);
}
int irda_device_set_raw_mode(struct net_device* self, int status);
struct net_device *alloc_irdadev(int sizeof_priv);
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -991,7 +991,7 @@ static __inline__ void tcp_fast_path_on(
static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp)
{
- if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
+ if (skb_queue_empty(&tp->out_of_order_queue) &&
tp->rcv_wnd &&
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
!tp->urg_data)
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_s
return kernel_sendmsg(sock, &msg, &iv, 1, len);
}
-static int cmtp_process_transmit(struct cmtp_session *session)
+static void cmtp_process_transmit(struct cmtp_session *session)
{
struct sk_buff *skb, *nskb;
unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct
if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
BT_ERR("Can't allocate memory for new frame");
- return -ENOMEM;
+ return;
}
while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct
cmtp_send_frame(session, nskb->data, nskb->len);
kfree_skb(nskb);
-
- return skb_queue_len(&session->transmit);
}
static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket
return kernel_sendmsg(sock, &msg, &iv, 1, len);
}
-static int hidp_process_transmit(struct hidp_session *session)
+static void hidp_process_transmit(struct hidp_session *session)
{
struct sk_buff *skb;
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct
hidp_set_timer(session);
kfree_skb(skb);
}
-
- return skb_queue_len(&session->ctrl_transmit) +
- skb_queue_len(&session->intr_transmit);
}
static int hidp_session(void *arg)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
- signal_pending(current) || !timeo)
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ sk->sk_err ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current) ||
+ !timeo)
break;
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(st
BT_DBG("tty %p dev %p", tty, dev);
- if (skb_queue_len(&dlc->tx_queue))
+ if (!skb_queue_empty(&dlc->tx_queue))
return dlc->mtu;
return 0;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk
* we are double checking that we are not sending too
* many of these keepalive frames.
*/
- if (skb_queue_len(&scp->other_xmit_queue) == 0)
+ if (skb_queue_empty(&scp->other_xmit_queue))
dn_nsp_send_link(sk, DN_NOCHANGE, 0);
}
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file
struct dn_scp *scp = DN_SK(sk);
int mask = datagram_poll(file, sock, wait);
- if (skb_queue_len(&scp->other_receive_queue))
+ if (!skb_queue_empty(&scp->other_receive_queue))
mask |= POLLRDBAND;
return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock,
case SIOCATMARK:
lock_sock(sk);
- val = (skb_queue_len(&scp->other_receive_queue) != 0);
+ val = !skb_queue_empty(&scp->other_receive_queue);
if (scp->state != DN_RUN)
val = -ENOTCONN;
release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk
int len = 0;
if (flags & MSG_OOB)
- return skb_queue_len(q) ? 1 : 0;
+ return !skb_queue_empty(q) ? 1 : 0;
while(skb != (struct sk_buff *)q) {
struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb
if (sk->sk_err)
goto out;
- if (skb_queue_len(&scp->other_receive_queue)) {
+ if (!skb_queue_empty(&scp->other_receive_queue)) {
if (!(flags & MSG_OOB)) {
msg->msg_flags |= MSG_OOB;
if (!scp->other_report) {
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
dn_nsp_output(sk);
- if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue))
+ if (!skb_queue_empty(&scp->data_xmit_queue) ||
+ !skb_queue_empty(&scp->other_xmit_queue))
scp->persist = dn_nsp_persist(sk);
return 0;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1105,7 +1105,7 @@ static void tcp_prequeue_process(struct
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
+ NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
/* RX process wants to run with disabled BHs, though it is not
* necessary */
@@ -1369,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
* is not empty. It is more elegant, but eats cycles,
* unfortunately.
*/
- if (skb_queue_len(&tp->ucopy.prequeue))
+ if (!skb_queue_empty(&tp->ucopy.prequeue))
goto do_prequeue;
/* __ Set realtime policy in scheduler __ */
@@ -1394,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
}
if (tp->rcv_nxt == tp->copied_seq &&
- skb_queue_len(&tp->ucopy.prequeue)) {
+ !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
tcp_prequeue_process(sk);
@@ -1476,7 +1476,7 @@ skip_copy:
} while (len > 0);
if (user_recv) {
- if (skb_queue_len(&tp->ucopy.prequeue)) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
int chunk;
tp->ucopy.len = copied > 0 ? len : 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2802,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_s
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
- if (skb_queue_len(&tp->out_of_order_queue) == 0) {
+ if (skb_queue_empty(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
return;
@@ -2935,13 +2935,13 @@ queue_and_out:
if(th->fin)
tcp_fin(skb, sk, th);
- if (skb_queue_len(&tp->out_of_order_queue)) {
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC2581. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
- if (!skb_queue_len(&tp->out_of_order_queue))
+ if (skb_queue_empty(&tp->out_of_order_queue))
tp->ack.pingpong = 0;
}
@@ -3249,9 +3249,8 @@ static int tcp_prune_queue(struct sock *
* This must not ever occur. */
/* First, purge the out_of_order queue. */
- if (skb_queue_len(&tp->out_of_order_queue)) {
- NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED,
- skb_queue_len(&tp->out_of_order_queue));
+ if (!skb_queue_empty(&tp->out_of_order_queue)) {
+ NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
__skb_queue_purge(&tp->out_of_order_queue);
/* Reset SACK state. A conforming SACK implementation will
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned lo
}
tp->ack.pending &= ~TCP_ACK_TIMER;
- if (skb_queue_len(&tp->ucopy.prequeue)) {
+ if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
- NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED,
- skb_queue_len(&tp->ucopy.prequeue));
+ NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk->sk_backlog_rcv(sk, skb);
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irl
IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
/* Don't disconnect until all data frames are successfully sent */
- if (skb_queue_len(&self->txq) > 0) {
+ if (!skb_queue_empty(&self->txq)) {
self->disconnect_pending = TRUE;
-
return;
}
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struc
* Send out the RR frames faster if our own transmit queue is empty, or
* if the peer is busy. The effect is a much faster conversation
*/
- if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) {
+ if (skb_queue_empty(&self->txq) || self->remote_busy) {
if (self->fast_RR == TRUE) {
/*
* Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *sel
IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
skb_queue_len(&self->txq));
- if (skb_queue_len(&self->txq)) {
+ if (!skb_queue_empty(&self->txq)) {
/* Prevent race conditions with irlap_data_request() */
self->local_busy = TRUE;
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irl
#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
/* Window has been adjusted for the max packet
* size, so much simpler... - Jean II */
- nextfit = (skb_queue_len(&self->txq) > 0);
+ nextfit = !skb_queue_empty(&self->txq);
#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
/*
* Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irl
#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
/* Window has been adjusted for the max packet
* size, so much simpler... - Jean II */
- nextfit = (skb_queue_len(&self->txq) > 0);
+ nextfit = !skb_queue_empty(&self->txq);
#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
/*
* Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irla
irlap_data_indication(self, skb, FALSE);
/* Any pending data requests? */
- if ((skb_queue_len(&self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0))
{
self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irla
/*
* Any pending data requests?
*/
- if ((skb_queue_len(&self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0) && !self->remote_busy)
{
irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irla
*/
nr_status = irlap_validate_nr_received(self, info->nr);
if (nr_status == NR_EXPECTED) {
- if ((skb_queue_len( &self->txq) > 0) &&
+ if (!skb_queue_empty(&self->txq) &&
(self->window > 0)) {
self->remote_busy = FALSE;
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct
/*
* We can now fill the window with additional data frames
*/
- while (skb_queue_len( &self->txq) > 0) {
+ while (!skb_queue_empty(&self->txq)) {
IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
- if ((skb_queue_len( &self->txq) > 0) &&
- (self->window > 0)) {
+ if (self->window > 0) {
skb = skb_dequeue( &self->txq);
IRDA_ASSERT(skb != NULL, return;);
@@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct
* bit cleared
*/
if ((self->window > 1) &&
- skb_queue_len(&self->txq) > 0)
- {
+ !skb_queue_empty(&self->txq)) {
irlap_send_data_primary(self, skb);
} else {
irlap_send_data_primary_poll(self, skb);
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap
/*
* Check if there is still data segments in the transmit queue
*/
- if (skb_queue_len(&self->tx_queue) > 0) {
+ if (!skb_queue_empty(&self->tx_queue)) {
if (priority == P_HIGH) {
/*
* No need to send the queued data, if we are
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(
if (llc->dev->flags & IFF_LOOPBACK)
goto out;
rc = 1;
- if (!skb_queue_len(&llc->pdu_unack_q))
+ if (skb_queue_empty(&llc->pdu_unack_q))
goto out;
skb = skb_peek(&llc->pdu_unack_q);
pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -858,7 +858,7 @@ static inline void netlink_rcv_wake(stru
{
struct netlink_sock *nlk = nlk_sk(sk);
- if (!skb_queue_len(&sk->sk_receive_queue))
+ if (skb_queue_empty(&sk->sk_receive_queue))
clear_bit(0, &nlk->state);
if (!test_bit(0, &nlk->state))
wake_up_interruptible(&nlk->wait);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch,
memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
q->qcount = -1;
- if (skb_queue_len(&sch->q) == 0)
+ if (skb_queue_empty(&sch->q))
PSCHED_SET_PASTPERFECT(q->qidlestart);
sch_tree_unlock(sch);
return 0;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock
* may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
- if (skb_queue_len(&sk->sk_receive_queue)) {
+ if (!skb_queue_empty(&sk->sk_receive_queue)) {
skb_queue_purge(&sk->sk_receive_queue);
wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct
for (;;) {
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
- if (skb_queue_len(&sk->sk_receive_queue) ||
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
sk->sk_err ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
signal_pending(current) ||
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
[not found] <C925F8B43D79CC49ACD0601FB68FF50C045E0FB0@orsmsx408>
@ 2005-07-07 22:30 ` David S. Miller
0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-07 22:30 UTC (permalink / raw)
To: jesse.brandeburg; +Cc: tgraf, dada1, netdev
From: "Brandeburg, Jesse" <jesse.brandeburg@intel.com>
Date: Thu, 7 Jul 2005 15:02:17 -0700
> Arg, this thread wasn't on the new list, is there any chance we can just
> get netdev@oss.sgi.com to forward to netdev@vger.kernel.org?
It does already.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
[not found] ` <42CE22CE.7030902@cosmosbay.com>
@ 2005-07-08 7:30 ` David S. Miller
2005-07-08 8:19 ` Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-08 7:30 UTC (permalink / raw)
To: dada1; +Cc: tgraf, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 08 Jul 2005 08:53:02 +0200
> About making sk_buff smaller, I use this patch to declare 'struct
> sec_path *sp' only ifdef CONFIG_XFRM, what do you think ? I also
> use a patch to declare nfcache, nfctinfo and nfct only if
> CONFIG_NETFILTER_CONNTRACK or CONFIG_NETFILTER_CONNTRACK_MODULE are
> defined, but thats more intrusive. Also, tc_index is not used if
> CONFIG_NET_SCHED only is declared but none of CONFIG_NET_SCH_* In my
> case, I am using CONFIG_NET_SCHED only to be able to do : tc -s -d
> qdisc
Distributions enable all of the ifdefs, and that is thus the
size and resultant performance most users see.
That's why I'm working on shrinking the size assuming all the
config options are enabled, because that is the reality for most
installations.
For all of this stuff we could consider stealing some ideas from BSD,
namely doing something similar to their MBUF tags.
If a subsystem wants to add a cookie to a networking buffer, it
allocates a tag and links it into the struct. So, you basically get
away with only one pointer (a struct hlist_head). We could use this
for the security, netfilter, and TC stuff. I don't know exactly what
our tags would look like, but perhaps:
struct skb_tag;

struct skb_tag_type {
	void (*destructor)(struct skb_tag *);
	kmem_cache_t *slab_cache;
	const char *name;
};

struct skb_tag {
	struct hlist_node list;
	struct skb_tag_type *owner;
	int tag_id;
	char data[0];
};

struct sk_buff {
	...
	struct hlist_head tag_list;
	...
};
Then netfilter does stuff like:
	struct sk_buff *skb;
	struct skb_tag *tag;
	struct conntrack_skb_info *info;

	tag = skb_find_tag(skb, SKB_TAG_NETFILTER_CONNTRACK);
	info = (struct conntrack_skb_info *) tag->data;
etc. etc.
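The lookup helper itself could be something as simple as this (completely
untested sketch, and remember that skb_find_tag(), tag_list, and the
SKB_TAG_* ids are all made-up names from this mail, not existing kernel API):

	static struct skb_tag *skb_find_tag(struct sk_buff *skb, int tag_id)
	{
		struct skb_tag *tag;
		struct hlist_node *node;

		hlist_for_each_entry(tag, node, &skb->tag_list, list) {
			if (tag->tag_id == tag_id)
				return tag;
		}
		return NULL;
	}

A subsystem would register its struct skb_tag_type once, so the destructor
and slab cache are known, and then attach a tag to an SKB only when it
actually has per-packet state to store.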
The downsides to this approach are:
1) Tagging an SKB eats a memory allocation, which isn't nice.
This is mainly why I haven't mentioned this idea before.
It may be that, on an active system, the per-cpu SLAB caches
for such tag objects keep the allocation cost quite low.
Another factor is that tags are relatively tiny, so a large
number of them fit in one SLAB.
But on the other hand we've been trying to reduce per-packet
kmalloc() counts; see the SKB fast-clone discussions about that.
And people ask for SKB recycling all the time.
2) skb_clone() would get more expensive. This is because you'd
need to clone the SKB tags as well.
One possibility is to hang the tags off of the
skb_shinfo() area. I know this idea sounds crazy, but
the theory goes that if the netfilter et al. info were to
change (and thus, so would the associated tags), then
you'd need to COW the SKB anyway.
This is actually an idea worth considering regardless of
whether we do tags or not. It would result in less reference
counting when we clone an SKB with netfilter stuff or
security stuff attached.
Overall I'm not too thrilled with the idea, but I'm enthusiastic
about being convinced otherwise, since this would shrink sk_buff
dramatically. :-)
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-08 7:30 ` David S. Miller
@ 2005-07-08 8:19 ` Eric Dumazet
2005-07-08 11:08 ` Arnaldo Carvalho de Melo
0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-08 8:19 UTC (permalink / raw)
To: David S. Miller; +Cc: tgraf, netdev
David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Fri, 08 Jul 2005 08:53:02 +0200
>
>
>>About making sk_buff smaller, I use this patch to declare 'struct
>>sec_path *sp' only ifdef CONFIG_XFRM, what do you think ? I also
>>use a patch to declare nfcache, nfctinfo and nfct only if
>>CONFIG_NETFILTER_CONNTRACK or CONFIG_NETFILTER_CONNTRACK_MODULE are
>>defined, but thats more intrusive. Also, tc_index is not used if
>>CONFIG_NET_SCHED only is declared but none of CONFIG_NET_SCH_* In my
>>case, I am using CONFIG_NET_SCHED only to be able to do : tc -s -d
>>qdisc
>
>
> Distributions enable all of the ifdefs, and that is thus the
> size and resultant performance most users see.
Well, I had this idea because I found another similar use in include/linux/ip.h
struct inet_sock {
	/* sk and pinet6 has to be the first two members of inet_sock */
	struct sock		sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	struct ipv6_pinfo	*pinet6;
#endif
You are right, such conditions are a nightmare for distributions :(
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-08 8:19 ` Eric Dumazet
@ 2005-07-08 11:08 ` Arnaldo Carvalho de Melo
2005-07-12 4:02 ` David S. Miller
0 siblings, 1 reply; 64+ messages in thread
From: Arnaldo Carvalho de Melo @ 2005-07-08 11:08 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, tgraf, netdev
[-- Attachment #1: Type: text/plain, Size: 1345 bytes --]
On 7/8/05, Eric Dumazet <dada1@cosmosbay.com> wrote:
>
> David S. Miller a écrit :
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Fri, 08 Jul 2005 08:53:02 +0200
> >
> >
> >>About making sk_buff smaller, I use this patch to declare 'struct
> >>sec_path *sp' only ifdef CONFIG_XFRM, what do you think ? I also
> >>use a patch to declare nfcache, nfctinfo and nfct only if
> >>CONFIG_NETFILTER_CONNTRACK or CONFIG_NETFILTER_CONNTRACK_MODULE are
> >>defined, but thats more intrusive. Also, tc_index is not used if
> >>CONFIG_NET_SCHED only is declared but none of CONFIG_NET_SCH_* In my
> >>case, I am using CONFIG_NET_SCHED only to be able to do : tc -s -d
> >>qdisc
> >
> >
> > Distributions enable all of the ifdefs, and that is thus the
> > size and resultant performance most users see.
>
> Well, I had this idea because I found another similar use in
> include/linux/ip.h
>
> struct inet_sock {
> /* sk and pinet6 has to be the first two members of inet_sock */
> struct sock sk;
> #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> struct ipv6_pinfo *pinet6;
> #endif
/me pleads guilty, Dave, any problem with removing this #ifdef? Hmm, I'll
think about using the skb_alloc_extension() idea for struct sock, but this
pinet6 sucker is a bit more difficult, I guess...
- Arnaldo
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] loop unrolling in net/sched/sch_generic.c
2005-07-08 11:08 ` Arnaldo Carvalho de Melo
@ 2005-07-12 4:02 ` David S. Miller
0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-12 4:02 UTC (permalink / raw)
To: arnaldo.melo; +Cc: dada1, tgraf, netdev
From: Arnaldo Carvalho de Melo <arnaldo.melo@gmail.com>
Date: Fri, 8 Jul 2005 08:08:54 -0300
> > struct inet_sock {
> > /* sk and pinet6 has to be the first two members of inet_sock */
> > struct sock sk;
> > #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> > struct ipv6_pinfo *pinet6;
> > #endif
>
> /me pleads guilty, Dave, any problem with removing this #ifdef? Humm, I'll
> think about using
Just leave it for now. If we come up with some spectacularly
nice way to deal with this in the future, we can change it then.
^ permalink raw reply [flat|nested] 64+ messages in thread
* [PATCH] Add prefetches in net/ipv4/route.c
2005-07-05 7:38 ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
2005-07-05 11:51 ` Thomas Graf
2005-07-05 21:26 ` David S. Miller
@ 2005-07-28 15:52 ` Eric Dumazet
2005-07-28 19:39 ` David S. Miller
2 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-28 15:52 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: Type: text/plain, Size: 219 bytes --]
[NET]: Add prefetches in route hash list traversals.
The current code doesn't use a prefetch-enabled macro like list_for_each_rcu(), so manually
add prefetch() hints.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: route.prefetches --]
[-- Type: text/plain, Size: 1606 bytes --]
diff -Nru linux-2.6.13-rc3/net/ipv4/route.c linux-2.6.13-rc3-ed/net/ipv4/route.c
--- linux-2.6.13-rc3/net/ipv4/route.c 2005-07-13 06:46:46.000000000 +0200
+++ linux-2.6.13-rc3-ed/net/ipv4/route.c 2005-07-28 17:20:21.000000000 +0200
@@ -1148,6 +1148,7 @@
while ((rth = rcu_dereference(*rthp)) != NULL) {
struct rtable *rt;
+ prefetch(rth->u.rt_next);
if (rth->fl.fl4_dst != daddr ||
rth->fl.fl4_src != skeys[i] ||
rth->fl.fl4_tos != tos ||
@@ -1401,6 +1402,7 @@
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
+ prefetch(rth->u.rt_next);
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == skeys[i] &&
rth->rt_dst == daddr &&
@@ -2094,6 +2096,7 @@
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
+ prefetch(rth->u.rt_next);
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == saddr &&
rth->fl.iif == iif &&
@@ -2565,6 +2568,7 @@
rcu_read_lock_bh();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
+ prefetch(rth->u.rt_next);
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
rth->fl.iif == 0 &&
@@ -2819,6 +2823,7 @@
rcu_read_lock_bh();
for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
rt = rcu_dereference(rt->u.rt_next), idx++) {
+ prefetch(rt->u.rt_next);
if (idx < s_idx)
continue;
skb->dst = dst_clone(&rt->u.dst);
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-28 15:52 ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
@ 2005-07-28 19:39 ` David S. Miller
2005-07-28 20:56 ` Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-28 19:39 UTC (permalink / raw)
To: dada1; +Cc: netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 28 Jul 2005 17:52:04 +0200
> [NET] : Adds prefetches in route hash list traversals.
>
> The actual code doesnt use a prefetch enabled macro like
> list_for_each_rcu(), so manually add prefetch() hints.
and the measured performance improvement is?
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-28 19:39 ` David S. Miller
@ 2005-07-28 20:56 ` Eric Dumazet
2005-07-28 20:58 ` David S. Miller
0 siblings, 1 reply; 64+ messages in thread
From: Eric Dumazet @ 2005-07-28 20:56 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Thu, 28 Jul 2005 17:52:04 +0200
>
>
>>[NET] : Adds prefetches in route hash list traversals.
>>
>>The actual code doesnt use a prefetch enabled macro like
>>list_for_each_rcu(), so manually add prefetch() hints.
>
>
> and the measured performance improvement is?
>
>
Half the improvement we could get if only fl.fl4_dst and the other fields
were not so far away from the u.rt_next field (0xE8 on x86_64).
For good performance one should of course choose a big route cache hash size,
and in that case the prefetches are useless and even cost an extra load:
prefetch(rth->u.rt_next) implies loading the rt_next pointer at the start of
the rth structure, while the fl fields are on a different cache line.
But in case of DDOS, prefetches are a win.
I did not test a solution using two prefetches...
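A two-prefetch variant would look roughly like this per iteration, before the
key comparison (untested sketch, only to illustrate touching both the line
holding the next entry's chain pointer and the line holding its flow keys):

	struct rtable *next = rcu_dereference(rth->u.rt_next);

	if (next) {
		prefetch(next);		/* line holding next->u.rt_next */
		prefetch(&next->fl);	/* flow keys, ~0xE8 bytes further in */
	}

Whether the second prefetch pays for itself would of course need to be
measured.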
Other patches using prefetches will follow.
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-28 20:56 ` Eric Dumazet
@ 2005-07-28 20:58 ` David S. Miller
2005-07-28 21:24 ` Eric Dumazet
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-28 20:58 UTC (permalink / raw)
To: dada1; +Cc: netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 28 Jul 2005 22:56:32 +0200
> But in case of DDOS, prefetches are a win.
Numbers please, I'm simply curious.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-28 20:58 ` David S. Miller
@ 2005-07-28 21:24 ` Eric Dumazet
2005-07-28 22:44 ` David S. Miller
2005-07-29 14:50 ` Robert Olsson
0 siblings, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-28 21:24 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Thu, 28 Jul 2005 22:56:32 +0200
>
>
>>But in case of DDOS, prefetches are a win.
>
>
> Numbers please, I'm simply curious.
>
>
I have no profiling info for this exact patch, I'm sorry David.
On a dual Opteron machine, this thing from ip_route_input() is very expensive:
RT_CACHE_STAT_INC(in_hlist_search);
ip_route_input() uses a total of 3.4563 % of one CPU, but this 'increment' alone takes 1.20 % !!!
0.0047 mov 2123529(%rip),%rax # ffffffff804b4a60 <rt_cache_stat>
1.1898 not %rax
mov %gs:0x34,%edx
0.0042 movslq %edx,%rdx
mov (%rax,%rdx,8),%rax
incl 0x38(%rax)
Sometimes I wonder if oprofile can be trusted :(
Maybe we should increment a counter on the stack and do a final
if (counter != 0)
RT_CACHE_STAT_ADD(in_hlist_search, counter);
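For the in_hlist_search case that would look something like this in the
ip_route_input() lookup loop (sketch only: RT_CACHE_STAT_ADD() is the
hypothetical bulk counterpart of RT_CACHE_STAT_INC(), the key comparison is
shortened to the first two flow fields, and daddr/saddr come from the
enclosing function):

	unsigned int searched = 0;

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr) {
			/* ... cache hit, update hit stats and return ... */
			break;
		}
		searched++;
	}
	rcu_read_unlock();

	if (searched)
		RT_CACHE_STAT_ADD(in_hlist_search, searched);

That keeps the cost down to one read-modify-write of the counter per lookup
instead of one per chain entry walked.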
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-28 21:24 ` Eric Dumazet
@ 2005-07-28 22:44 ` David S. Miller
2005-07-29 14:50 ` Robert Olsson
1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-28 22:44 UTC (permalink / raw)
To: dada1; +Cc: netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 28 Jul 2005 23:24:33 +0200
> On a dual opteron machine, this thing from ip_route_input() is very expensive :
>
> RT_CACHE_STAT_INC(in_hlist_search);
That's amazing, since it's per-cpu. I don't have any suggestions
besides your idea of incrementing it once using a local accumulator
variable.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-28 21:24 ` Eric Dumazet
2005-07-28 22:44 ` David S. Miller
@ 2005-07-29 14:50 ` Robert Olsson
2005-07-29 17:06 ` Rick Jones
2005-07-31 3:44 ` David S. Miller
1 sibling, 2 replies; 64+ messages in thread
From: Robert Olsson @ 2005-07-29 14:50 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
Eric Dumazet writes:
> I have no profiling info for this exact patch, I'm sorry David.
> On a dual opteron machine, this thing from ip_route_input() is very expensive :
>
> RT_CACHE_STAT_INC(in_hlist_search);
>
> ip_route_input() use a total of 3.4563 % of one cpu, but this 'increment' takes 1.20 % !!!
Very weird if the stats counter is taking a third of ip_route_input.
> Sometime I wonder if oprofile can be trusted :(
>
> Maybe we should increment a counter on the stack and do a final
> if (counter != 0)
> RT_CACHE_STAT_ADD(in_hlist_search, counter);
My experience here comes from playing with prefetching in eth_type_trans.
One must look at the total performance, not just where the
prefetching is done. In that case I was able to get eth_type_trans
down in the profile list, but other functions increased, so performance
was the same or lower. This needs to be sorted out...
Cheers.
--ro
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-29 14:50 ` Robert Olsson
@ 2005-07-29 17:06 ` Rick Jones
2005-07-29 17:44 ` Robert Olsson
2005-07-29 17:57 ` Eric Dumazet
2005-07-31 3:44 ` David S. Miller
1 sibling, 2 replies; 64+ messages in thread
From: Rick Jones @ 2005-07-29 17:06 UTC (permalink / raw)
To: netdev
Robert Olsson wrote:
> Eric Dumazet writes:
>
> > I have no profiling info for this exact patch, I'm sorry David.
> > On a dual opteron machine, this thing from ip_route_input() is very expensive :
> >
> > RT_CACHE_STAT_INC(in_hlist_search);
> >
> > ip_route_input() use a total of 3.4563 % of one cpu, but this 'increment' takes 1.20 % !!!
>
> Very weird if the statscounter taking a third of ip_route_input.
>
> > Sometime I wonder if oprofile can be trusted :(
> >
> > Maybe we should increment a counter on the stack and do a final
> > if (counter != 0)
> > RT_CACHE_STAT_ADD(in_hlist_search, counter);
>
> My experiences from playing with prefetching eth_type_trans in this
> case. One must look in the total performance not just were the
> prefetching is done. In this case I was able to get eth_type_trans
> down in the profile list but other functions increased so performance
> was the same or lower. This needs to be sorted out...
How many of the architectures have PMUs that can give us cache-miss statistics?
Itanium does, and can go so far as to tell us which addresses and instructions
are involved - do the others?
That sort of data would seem to be desirable in this sort of situation.
rick jones
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-29 17:06 ` Rick Jones
@ 2005-07-29 17:44 ` Robert Olsson
2005-07-29 17:57 ` Eric Dumazet
1 sibling, 0 replies; 64+ messages in thread
From: Robert Olsson @ 2005-07-29 17:44 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev
Rick Jones writes:
> > My experiences from playing with prefetching eth_type_trans in this
> > case. One must look in the total performance not just were the
> > prefetching is done. In this case I was able to get eth_type_trans
> > down in the profile list but other functions increased so performance
> > was the same or lower. This needs to be sorted out...
>
> How many of the architectures have PMU's that can give us cache miss statistics?
> Itanium does, and can go so far as to tell us which addresses and instructions
> are involved - do the others?
I've seen that Xeons and Opterons have performance counters for this that can
be used with oprofile. Intel has a document (was it an application note?)
describing prefetching. There are really a lot of things to consider to get
this right.
> That sort of data would seem to be desirable in this sort of situation.
Also, what scenario, code path and load are we optimizing for? Eric mentioned
this briefly.
Cheers.
--ro
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-29 17:06 ` Rick Jones
2005-07-29 17:44 ` Robert Olsson
@ 2005-07-29 17:57 ` Eric Dumazet
2005-07-29 18:25 ` Rick Jones
2005-07-31 3:51 ` David S. Miller
1 sibling, 2 replies; 64+ messages in thread
From: Eric Dumazet @ 2005-07-29 17:57 UTC (permalink / raw)
To: Rick Jones; +Cc: netdev, David S. Miller, Robert Olsson
Rick Jones wrote:
> Robert Olsson wrote:
>
>> Eric Dumazet writes:
>>
>> > I have no profiling info for this exact patch, I'm sorry David.
>> > On a dual opteron machine, this thing from ip_route_input() is very
>> expensive :
>> > > RT_CACHE_STAT_INC(in_hlist_search);
>> > > ip_route_input() use a total of 3.4563 % of one cpu, but this
>> 'increment' takes 1.20 % !!!
>>
>> Very weird if the statscounter taking a third of ip_route_input.
>>
>> > Sometime I wonder if oprofile can be trusted :(
>> > > Maybe we should increment a counter on the stack and do a final
>> > if (counter != 0)
>> > RT_CACHE_STAT_ADD(in_hlist_search, counter);
>>
>> My experiences from playing with prefetching eth_type_trans in this
>> case. One must look in the total performance not just were the
>> prefetching is done. In this case I was able to get eth_type_trans
>> down in the profile list but other functions increased so performance
>> was the same or lower. This needs to be sorted out...
>
>
> How many of the architectures have PMU's that can give us cache miss
> statistics? Itanium does, and can go so far as to tell us which
> addresses and instructions are involved - do the others?
>
> That sort of data would seem to be desirable in this sort of situation.
>
> rick jones
>
>
oprofile on AMD64 can gather lots of data, DATA_CACHE_MISSES for example...
But I think I know what happens...
nm -v /usr/src/linux/vmlinux | grep -5 rt_cache_stat
ffffffff804c6a80 b rover.5
ffffffff804c6a88 b last_gc.2
ffffffff804c6a90 b rover.3
ffffffff804c6a94 b equilibrium.4
ffffffff804c6a98 b ip_fallback_id.7
ffffffff804c6aa0 B rt_cache_stat
ffffffff804c6aa8 b ip_rt_max_size
ffffffff804c6aac b ip_rt_debug
ffffffff804c6ab0 b rt_deadline
So rt_cache_stat (which is a read-only pointer) is in the middle of a hot
cache line (some parts of it are written over and over) that probably
ping-pongs between CPUs.
Time to provide a patch that carefully splits the static data from
net/ipv4/route.c into two parts: mostly read-only, and the rest... :)
Eric
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-29 17:57 ` Eric Dumazet
@ 2005-07-29 18:25 ` Rick Jones
2005-07-31 3:52 ` David S. Miller
2005-07-31 3:51 ` David S. Miller
1 sibling, 1 reply; 64+ messages in thread
From: Rick Jones @ 2005-07-29 18:25 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, David S. Miller, Robert Olsson
> oprofile on AMD64 can gather lots of data, DATA_CACHE_MISSES for example...
>
> But I think I know what happens...
>
> nm -v /usr/src/linux/vmlinux | grep -5 rt_cache_stat
>
> ffffffff804c6a80 b rover.5
> ffffffff804c6a88 b last_gc.2
> ffffffff804c6a90 b rover.3
> ffffffff804c6a94 b equilibrium.4
> ffffffff804c6a98 b ip_fallback_id.7
> ffffffff804c6aa0 B rt_cache_stat
> ffffffff804c6aa8 b ip_rt_max_size
> ffffffff804c6aac b ip_rt_debug
> ffffffff804c6ab0 b rt_deadline
>
> So rt_cache_stat (which is a read only pointer) is in the middle of a
> hot cache line (some parts of it are written over and over), that
> probably ping pong between CPUS.
>
> Time to provide a patch to carefully place all the static data from
> net/ipv4/route.c into 2 parts : mostly readonly, and others... :)
Which of course begs the question - what cache line size should be ass-u-me-d
when blocking these things? I'll put forth 128 bytes.
rick jones
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-29 14:50 ` Robert Olsson
2005-07-29 17:06 ` Rick Jones
@ 2005-07-31 3:44 ` David S. Miller
1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-31 3:44 UTC (permalink / raw)
To: Robert.Olsson; +Cc: dada1, netdev
From: Robert Olsson <Robert.Olsson@data.slu.se>
Date: Fri, 29 Jul 2005 16:50:31 +0200
> My experiences from playing with prefetching eth_type_trans in this
> case. One must look in the total performance not just were the
> prefetching is done. In this case I was able to get eth_type_trans
> down in the profile list but other functions increased so performance
> was the same or lower. This needs to be sorted out...
The problem is that if you just barely fit in the cache for a
workload, prefetches can hurt if done too early. Let's say that your
code path needs to access data items A, B, and C, in that order, and
that you need A in order to know C's address. If you then prefetch C
before you use B, you might kick B out of the cache; fetching B back in
can in turn evict C, so C ends up being brought in twice and performance
gets worse.
I really do not want to merge in any prefetch patches until
there is hard data showing an improvement, instead of some
shamanistic justification :-)
When the witch doctor comes to town and starts adding prefetches
all over the place without performance metrics, I become rightly
concerned.
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-29 17:57 ` Eric Dumazet
2005-07-29 18:25 ` Rick Jones
@ 2005-07-31 3:51 ` David S. Miller
1 sibling, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-07-31 3:51 UTC (permalink / raw)
To: dada1; +Cc: rick.jones2, netdev, davem, Robert.Olsson
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 29 Jul 2005 19:57:35 +0200
> nm -v /usr/src/linux/vmlinux | grep -5 rt_cache_stat
>
> ffffffff804c6a80 b rover.5
> ffffffff804c6a88 b last_gc.2
> ffffffff804c6a90 b rover.3
> ffffffff804c6a94 b equilibrium.4
> ffffffff804c6a98 b ip_fallback_id.7
> ffffffff804c6aa0 B rt_cache_stat
> ffffffff804c6aa8 b ip_rt_max_size
> ffffffff804c6aac b ip_rt_debug
> ffffffff804c6ab0 b rt_deadline
>
> So rt_cache_stat (which is a read only pointer) is in the middle of a hot cache line (some parts of it are written over and over), that
> probably ping pong between CPUS.
One cure for this would be to declare it as "__read_mostly"; that
should help a lot. But it's not the best idea, I think.
Instead, we can do away with the pointer and use DEFINE_PER_CPU() and
__get_cpu_var() for rt_cache_stat. That would emit the most efficient
code on every architecture.
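Roughly like this, I mean (just a sketch: the struct is cut down to the one
counter discussed here, and the macro keeps its current name):

	#include <linux/percpu.h>

	struct rt_cache_stat {
		unsigned int in_hlist_search;
		/* ... the other counters ... */
	};

	static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);

	/* no pointer load, no cache line shared with other statics */
	#define RT_CACHE_STAT_INC(field) \
		(__get_cpu_var(rt_cache_stat).field++)

Since each CPU gets its own copy in the per-cpu area, the false sharing Eric
saw in the nm output goes away as a side effect.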
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
2005-07-29 18:25 ` Rick Jones
@ 2005-07-31 3:52 ` David S. Miller
[not found] ` <42EDDA50.4010405@cosmosbay.com>
0 siblings, 1 reply; 64+ messages in thread
From: David S. Miller @ 2005-07-31 3:52 UTC (permalink / raw)
To: rick.jones2; +Cc: dada1, netdev, davem, Robert.Olsson
From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 29 Jul 2005 11:25:21 -0700
> Which of course begs the question - what cache line size should be
> ass-u-me-d when blocking these things? I'll put-forth 128 bytes.
Why guess? There is no need to.
Use "__read_mostly" which is an attribute designed explicitly for
this.
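For the case at hand that is a one-liner (illustration only, assuming the
pointer declaration route.c has today):

	static struct rt_cache_stat *rt_cache_stat __read_mostly;

The linker script then places it in the read-mostly data section, away from
the frequently written statics that currently share its cache line.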
^ permalink raw reply [flat|nested] 64+ messages in thread
* Re: [PATCH] Add prefetches in net/ipv4/route.c
[not found] ` <42EDDA50.4010405@cosmosbay.com>
@ 2005-08-01 15:39 ` David S. Miller
0 siblings, 0 replies; 64+ messages in thread
From: David S. Miller @ 2005-08-01 15:39 UTC (permalink / raw)
To: dada1; +Cc: davem, rick.jones2, netdev, Robert.Olsson
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 01 Aug 2005 10:16:16 +0200
> Last time I checked, read_mostly had a meaning only if CONFIG_X86 ||
> CONFIG_SPARC64
Meaning that other platforms need to implement this,
big deal :)
I do agree that the implementor of __read_mostly should
have just let the build break instead so folks could
simply be forced to fix things up.
It's actually a trivial change, and the author therefore
could also have updated all the platforms' vmlinux.lds.S files
instead of ifdef'ing this stuff.
^ permalink raw reply [flat|nested] 64+ messages in thread
end of thread, other threads:[~2005-08-01 15:39 UTC | newest]
Thread overview: 64+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-05-11 21:15 [TG3]: Add hw coalescing infrastructure David S. Miller
2005-05-11 21:17 ` Michael Chan
2005-05-12 2:28 ` David S. Miller
2005-05-12 7:53 ` Robert Olsson
2005-06-22 15:25 ` [TG3]: About " Eric Dumazet
2005-06-22 19:03 ` Michael Chan
2005-07-04 21:22 ` Eric Dumazet
2005-07-04 21:26 ` David S. Miller
2005-07-04 21:39 ` Eric Dumazet
2005-07-04 21:49 ` David S. Miller
2005-07-04 22:31 ` Eric Dumazet
2005-07-04 22:47 ` David S. Miller
2005-07-04 22:55 ` Eric Dumazet
2005-07-04 22:57 ` Eric Dumazet
2005-07-04 23:01 ` David S. Miller
2005-07-05 7:38 ` [PATCH] loop unrolling in net/sched/sch_generic.c Eric Dumazet
2005-07-05 11:51 ` Thomas Graf
2005-07-05 12:03 ` Thomas Graf
2005-07-05 13:04 ` Eric Dumazet
2005-07-05 13:48 ` Thomas Graf
2005-07-05 15:58 ` Eric Dumazet
2005-07-05 17:34 ` Thomas Graf
2005-07-05 21:22 ` David S. Miller
2005-07-05 21:33 ` Thomas Graf
2005-07-05 21:35 ` David S. Miller
2005-07-05 23:16 ` Eric Dumazet
2005-07-05 23:41 ` Thomas Graf
2005-07-05 23:45 ` David S. Miller
2005-07-05 23:55 ` Thomas Graf
2005-07-06 0:32 ` Eric Dumazet
2005-07-06 0:51 ` Thomas Graf
2005-07-06 1:04 ` Eric Dumazet
2005-07-06 1:07 ` Thomas Graf
2005-07-06 0:53 ` Eric Dumazet
2005-07-06 1:02 ` Thomas Graf
2005-07-06 1:09 ` Eric Dumazet
2005-07-06 12:42 ` Thomas Graf
2005-07-07 21:17 ` David S. Miller
2005-07-07 21:34 ` Thomas Graf
2005-07-07 22:24 ` David S. Miller
[not found] ` <42CE22CE.7030902@cosmosbay.com>
2005-07-08 7:30 ` David S. Miller
2005-07-08 8:19 ` Eric Dumazet
2005-07-08 11:08 ` Arnaldo Carvalho de Melo
2005-07-12 4:02 ` David S. Miller
2005-07-05 21:26 ` David S. Miller
2005-07-28 15:52 ` [PATCH] Add prefetches in net/ipv4/route.c Eric Dumazet
2005-07-28 19:39 ` David S. Miller
2005-07-28 20:56 ` Eric Dumazet
2005-07-28 20:58 ` David S. Miller
2005-07-28 21:24 ` Eric Dumazet
2005-07-28 22:44 ` David S. Miller
2005-07-29 14:50 ` Robert Olsson
2005-07-29 17:06 ` Rick Jones
2005-07-29 17:44 ` Robert Olsson
2005-07-29 17:57 ` Eric Dumazet
2005-07-29 18:25 ` Rick Jones
2005-07-31 3:52 ` David S. Miller
[not found] ` <42EDDA50.4010405@cosmosbay.com>
2005-08-01 15:39 ` David S. Miller
2005-07-31 3:51 ` David S. Miller
2005-07-31 3:44 ` David S. Miller
2005-07-04 23:00 ` [TG3]: About hw coalescing infrastructure David S. Miller
2005-07-05 16:14 ` Eric Dumazet
2005-07-04 22:47 ` Eric Dumazet
[not found] <C925F8B43D79CC49ACD0601FB68FF50C045E0FB0@orsmsx408>
2005-07-07 22:30 ` [PATCH] loop unrolling in net/sched/sch_generic.c David S. Miller
This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).