Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 3/8] [Blackfin] EMAC driver: bf537 MAC multicast hash filtering patch
From: Bryan Wu @ 2008-01-30  8:52 UTC (permalink / raw)
  To: jeff, netdev; +Cc: linux-kernel, Aidan Williams, Bryan Wu
In-Reply-To: <1201683148-23931-1-git-send-email-bryan.wu@analog.com>

From: Aidan Williams <aidan@nicta.com.au>

The bf537 Ethernet MAC driver in the 2007R1.1-RC3 kernel (and the
current kernel) does not implement multicast hash filtering. This
is a performance problem if you have lots of multicast on your network.

This patch plugs the right bits into the multicast hash registers.

Signed-off-by: Aidan Williams <aidan@nicta.com.au>
Signed-off-by: Bryan Wu <bryan.wu@analog.com>
---
 drivers/net/bfin_mac.c |   42 +++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 41 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index ee39819..c6586cd 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -11,6 +11,7 @@
  * Description:
  *
  * Modified:
+ * 		2006-12-19 Aidan Williams, multicast hash support
  *		Copyright 2004-2006 Analog Devices Inc.
  *
  * Bugs:	Enter bugs at http://blackfin.uclinux.org/
@@ -800,6 +801,39 @@ static void bf537mac_timeout(struct net_device *dev)
 	netif_wake_queue(dev);
 }
 
+static void bf537mac_multicast_hash(struct net_device *dev)
+{
+	u32 emac_hashhi, emac_hashlo;
+	struct dev_mc_list *dmi = dev->mc_list;
+	char *addrs;
+	int i;
+	u32 crc;
+
+	emac_hashhi = emac_hashlo = 0;
+
+	for (i = 0; i < dev->mc_count; i++) {
+		addrs = dmi->dmi_addr;
+		dmi = dmi->next;
+
+		/* skip non-multicast addresses */
+		if (!(*addrs & 1))
+			continue;
+
+		crc = ether_crc(ETH_ALEN, addrs);
+		crc >>= 26;
+
+		if (crc & 0x20)
+			emac_hashhi |= 1 << (crc & 0x1f);
+		else
+			emac_hashlo |= 1 << (crc & 0x1f);
+	}
+
+	bfin_write_EMAC_HASHHI(emac_hashhi);
+	bfin_write_EMAC_HASHLO(emac_hashlo);
+
+	return;
+}
+
 /*
  * This routine will, depending on the values passed to it,
  * either make it accept multicast packets, go into
@@ -815,11 +849,17 @@ static void bf537mac_set_multicast_list(struct net_device *dev)
 		sysctl = bfin_read_EMAC_OPMODE();
 		sysctl |= RAF;
 		bfin_write_EMAC_OPMODE(sysctl);
-	} else if (dev->flags & IFF_ALLMULTI || dev->mc_count) {
+	} else if (dev->flags & IFF_ALLMULTI) {
 		/* accept all multicast */
 		sysctl = bfin_read_EMAC_OPMODE();
 		sysctl |= PAM;
 		bfin_write_EMAC_OPMODE(sysctl);
+	} else if (dev->mc_count) {
+		/* set up multicast hash table */
+		sysctl = bfin_read_EMAC_OPMODE();
+		sysctl |= HM;
+		bfin_write_EMAC_OPMODE(sysctl);
+		bf537mac_multicast_hash(dev);
 	} else {
 		/* clear promisc or multicast mode */
 		sysctl = bfin_read_EMAC_OPMODE();
-- 
1.5.3.4

^ permalink raw reply related

* [PATCH 1/8] [Blackfin] EMAC driver: shorten the mdelay value to solve netperf performance issue
From: Bryan Wu @ 2008-01-30  8:52 UTC (permalink / raw)
  To: jeff, netdev; +Cc: linux-kernel, Bryan Wu
In-Reply-To: <1201683148-23931-1-git-send-email-bryan.wu@analog.com>

Signed-off-by: Bryan Wu <bryan.wu@analog.com>
---
 drivers/net/bfin_mac.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index eb97175..4006a5d 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -296,7 +296,7 @@ static void mdio_poll(void)
 
 	/* poll the STABUSY bit */
 	while ((bfin_read_EMAC_STAADD()) & STABUSY) {
-		mdelay(10);
+		udelay(1);
 		if (timeout_cnt-- < 0) {
 			printk(KERN_ERR DRV_NAME
 			": wait MDC/MDIO transaction to complete timeout\n");
@@ -551,7 +551,7 @@ static void adjust_tx_list(void)
 	 */
 	if (current_tx_ptr->next->next == tx_list_head) {
 		while (tx_list_head->status.status_word == 0) {
-			mdelay(10);
+			mdelay(1);
 			if (tx_list_head->status.status_word != 0
 			    || !(bfin_read_DMA2_IRQ_STATUS() & 0x08)) {
 				goto adjust_head;
-- 
1.5.3.4

^ permalink raw reply related

* [PATCH 2/8] [Blackfin] EMAC driver: define MDC_CLK=2.5MHz and calculate mdc_div according to SCLK.
From: Bryan Wu @ 2008-01-30  8:52 UTC (permalink / raw)
  To: jeff, netdev; +Cc: linux-kernel, Bryan Wu
In-Reply-To: <1201683148-23931-1-git-send-email-bryan.wu@analog.com>

Signed-off-by: Bryan Wu <bryan.wu@analog.com>
---
 drivers/net/bfin_mac.c |   16 ++++++++++++----
 1 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bfin_mac.c b/drivers/net/bfin_mac.c
index 4006a5d..ee39819 100644
--- a/drivers/net/bfin_mac.c
+++ b/drivers/net/bfin_mac.c
@@ -412,20 +412,26 @@ static void bf537_adjust_link(struct net_device *dev)
 	spin_unlock_irqrestore(&lp->lock, flags);
 }
 
+/* MDC  = 2.5 MHz */
+#define MDC_CLK 2500000
+
 static int mii_probe(struct net_device *dev)
 {
 	struct bf537mac_local *lp = netdev_priv(dev);
 	struct phy_device *phydev = NULL;
 	unsigned short sysctl;
 	int i;
+	u32 sclk, mdc_div;
 
 	/* Enable PHY output early */
 	if (!(bfin_read_VR_CTL() & PHYCLKOE))
 		bfin_write_VR_CTL(bfin_read_VR_CTL() | PHYCLKOE);
 
-	/* MDC  = 2.5 MHz */
+	sclk = get_sclk();
+	mdc_div = ((sclk / MDC_CLK) / 2) - 1;
+
 	sysctl = bfin_read_EMAC_SYSCTL();
-	sysctl |= SET_MDCDIV(24);
+	sysctl |= SET_MDCDIV(mdc_div);
 	bfin_write_EMAC_SYSCTL(sysctl);
 
 	/* search for connect PHY device */
@@ -477,8 +483,10 @@ static int mii_probe(struct net_device *dev)
 	lp->phydev = phydev;
 
 	printk(KERN_INFO "%s: attached PHY driver [%s] "
-	       "(mii_bus:phy_addr=%s, irq=%d)\n",
-	       DRV_NAME, phydev->drv->name, phydev->dev.bus_id, phydev->irq);
+	       "(mii_bus:phy_addr=%s, irq=%d, mdc_clk=%dHz(mdc_div=%d)"
+	       "@sclk=%dMHz)\n",
+	       DRV_NAME, phydev->drv->name, phydev->dev.bus_id, phydev->irq,
+	       MDC_CLK, mdc_div, sclk/1000000);
 
 	return 0;
 }
-- 
1.5.3.4

^ permalink raw reply related

* [PATCH 0/8] [Blackfin] EMAC driver updates
From: Bryan Wu @ 2008-01-30  8:52 UTC (permalink / raw)
  To: jeff, netdev; +Cc: linux-kernel

Several bug fixes for this driver.


^ permalink raw reply

* Re: [PATCH 1/2] sky2: restore multicast addresses after recovery
From: Jeff Garzik @ 2008-01-30  8:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20080123191151.1423a045@deepthought>

Stephen Hemminger wrote:
> If the sky2 deadman timer forces a recovery, the multicast hash
> list is lost. Move the call to sky2_set_multicast to the end
> of sky2_up() so all paths that bring device up will restore multicast.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
> 
> ---
> Please apply for 2.6.24

applied 1-2

You'll want to send this to stable@kernel.org, since by the time I read 
your mail, 2.6.24 had been released, just around 24 hours thereafter.



^ permalink raw reply

* Re: [PATCH] pci-skeleton: Misc fixes to build neatly
From: Jeff Garzik @ 2008-01-30  8:49 UTC (permalink / raw)
  To: Jike Song; +Cc: netdev, linux-kernel
In-Reply-To: <1201157496-20665-2-git-send-email-albcamus@gmail.com>

Jike Song wrote:
> The pci-skeleton.c has several problems with compilation, such as missing args
> when calling synchronize_irq(). Fix it.
> 
> Signed-off-by: Jike Song <albcamus@gmail.com>
> ---
>  drivers/net/pci-skeleton.c |   49 ++++++++++++++++++++++---------------------
>  1 files changed, 25 insertions(+), 24 deletions(-)

applied



^ permalink raw reply

* Re: [PATCH] phylib: Add Realtek 821x eth PHY support
From: Jeff Garzik @ 2008-01-30  8:49 UTC (permalink / raw)
  To: Kim Phillips; +Cc: netdev, Johnson Leung, Kevin Lam, Joe D'Abbraccio
In-Reply-To: <20080124202826.a50fa29c.kim.phillips@freescale.com>

Kim Phillips wrote:
> this PHY present on the MPC8315E and MPC837xE RDB boards.
> 
> Signed-off-by: Johnson Leung <r58129@freescale.com>
> Signed-off-by: Kevin Lam <r43770@freescale.com>
> Signed-off-by: Joe D'Abbraccio <ljd015@freescale.com>
> Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
> ---
>  drivers/net/phy/Kconfig   |    5 +++
>  drivers/net/phy/Makefile  |    1 +
>  drivers/net/phy/realtek.c |   80 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 86 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/net/phy/realtek.c

applied (though admittedly the filename seems a bit too generic)



^ permalink raw reply

* Re: [PATCH] natsemi: Update locking documentation
From: Jeff Garzik @ 2008-01-30  8:48 UTC (permalink / raw)
  To: Mark Brown; +Cc: Tim Hockin, netdev, linux-kernel
In-Reply-To: <1201442291-13668-1-git-send-email-broonie@sirena.org.uk>

Mark Brown wrote:
> The documentation regarding synchronisation at the head of the natsemi
> driver was badly bitrotted so replace it with a general statement about
> the techniques used which is less likely to bitrot.
> 
> Also remove the note saying these chips are uncommon - it makes little
> difference but they were used in a number of laptops and at least one mass
> market PCI ethernet card.
> 
> Signed-off-by: Mark Brown <broonie@sirena.org.uk>
> ---
>  drivers/net/natsemi.c |   18 ++----------------
>  1 files changed, 2 insertions(+), 16 deletions(-)

applied



^ permalink raw reply

* Re: [PATCH] PHYLIB: Locking fixes for PHY I/O potentially sleeping
From: Jeff Garzik @ 2008-01-30  8:48 UTC (permalink / raw)
  To: Nate Case; +Cc: Andy Fleming, David S. Miller, netdev
In-Reply-To: <1201622709.12444.118.camel@localhost.localdomain>

Nate Case wrote:
> PHY read/write functions can potentially sleep (e.g., a PHY accessed
> via I2C).  The following changes were made to account for this:
> 
>     * Change spin locks to mutex locks
>     * Add a BUG_ON() to phy_read() phy_write() to warn against
>       calling them from an interrupt context.
>     * Use work queue for PHY state machine handling since
>       it can potentially sleep
>     * Change phydev lock from spinlock to mutex
> 
> Signed-off-by: Nate Case <ncase@xes-inc.com>
> Acked-by: Andy Fleming <afleming@freescale.com>
> 
> ---
> Note: This is a resend of the patch submitted on January 3rd, 2008
> 
>  drivers/net/phy/mdio_bus.c   |    2 +-
>  drivers/net/phy/phy.c        |   68 ++++++++++++++++++++++++++++-------------
>  drivers/net/phy/phy_device.c |   11 +++----
>  include/linux/phy.h          |    5 ++-
>  4 files changed, 55 insertions(+), 31 deletions(-)

applied



^ permalink raw reply

* Re: [PATCH] forcedeth: mac address mcp77/79
From: Jeff Garzik @ 2008-01-30  8:48 UTC (permalink / raw)
  To: Ayaz Abdulla; +Cc: Andrew Morton, nedev, stable
In-Reply-To: <479DF3B8.2000204@nvidia.com>

Ayaz Abdulla wrote:
> This patch is a critical fix for MCP77 and MCP79 devices. The feature 
> flags were missing the define for correct mac address 
> (DEV_HAS_CORRECT_MACADDR).
> 
> Signed-off-by: Ayaz Abdulla <aabdulla@nvidia.com>

applied (upstream)



^ permalink raw reply

* Re: [PATCH] [NET]: Remove PowerPC code from fec.c
From: Jeff Garzik @ 2008-01-30  8:41 UTC (permalink / raw)
  To: Jochen Friedrich
  Cc: Vitaly Bordug, Scott Wood, Kumar Gala, Geert Uytterhoeven,
	Kernel, Linux, netdev@vger.kernel.org, linuxppc-dev list,
	linux-m68k
In-Reply-To: <4799F349.9090102@scram.de>

Jochen Friedrich wrote:
> fec.c is only used on M68k Coldfire CPUs. Remove leftover
> PowerPC code from this driver.
> 
> Signed-off-by: Jochen Friedrich <jochen@scram.de>
> ---
>  drivers/net/fec.c |  136 +---------------------------------------------------
>  1 files changed, 3 insertions(+), 133 deletions(-)

Seems OK to me, but I feel I don't have enough knowledge to ACK or NAK. 
  Please pass through an arch tree, thanks.

	Jeff




^ permalink raw reply

* Re: [PATCH 1/5] forcedeth: reset register fix
From: Jeff Garzik @ 2008-01-30  8:39 UTC (permalink / raw)
  To: Ayaz Abdulla; +Cc: Manfred Spraul, Andrew Morton, nedev
In-Reply-To: <478A7C72.8060903@nvidia.com>

Ayaz Abdulla wrote:
> This patch fixes the reset register definition from 0x3C to 0x34.
> 
> Signed-off-by: Ayaz Abdulla <aabdulla@nvidia.com>

applied 1-5



^ permalink raw reply

* Re: [PATCH 1/1 resend][arm/at91_ether.c]  logical/bitand typo in function reset_phy() (inactive), drivers/net/arm/at91_ether.c
From: Jeff Garzik @ 2008-01-30  8:39 UTC (permalink / raw)
  To: Roel Kluin; +Cc: andrew, netdev, linux-arm-kernel
In-Reply-To: <479F8940.7030301@tiscali.nl>

Roel Kluin wrote:
> include/linux/mii.h:48:#define BMCR_RESET 0x8000
> 
> The function reset_phy() is in "#if 0" inactivated code
> --
> Replace logical "&&" by bit "&" before BMCR_RESET
> 
> Signed-off-by: Roel Kluin <12o3l@tiscali.nl>
> ---
> diff --git a/drivers/net/arm/at91_ether.c b/drivers/net/arm/at91_ether.c
> index 25b114a..0ae0d83 100644
> --- a/drivers/net/arm/at91_ether.c
> +++ b/drivers/net/arm/at91_ether.c
> @@ -384,7 +384,7 @@ static void reset_phy(struct net_device *dev)
>  	/* Wait until PHY reset is complete */
>  	do {
>  		read_phy(lp->phy_address, MII_BMCR, &bmcr);
> -	} while (!(bmcr && BMCR_RESET));
> +	} while (!(bmcr & BMCR_RESET));
>  
>  	disable_mdi();
>  	spin_unlock_irq(&lp->lock);

applied



^ permalink raw reply

* Re: [PATCH] cxgb3: Remove incorrect __devinit annotations
From: Jeff Garzik @ 2008-01-30  8:38 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Divy Le Ray, netdev
In-Reply-To: <adak5lsdzew.fsf@cisco.com>

Roland Dreier wrote:
> When PCI error recovery was added to cxgb3, a function t3_io_slot_reset()
> was added.  This function can call back into t3_prep_adapter() at any
> time, so t3_prep_adapter() can no longer be marked __devinit.
> This patch removes the __devinit annotation from t3_prep_adapter() and
> all the functions that it calls, which fixes
> 
>     WARNING: drivers/net/cxgb3/built-in.o(.text+0x2427): Section mismatch in reference from the function t3_io_slot_reset() to the function .devinit.text:t3_prep_adapter()
> 
> Signed-off-by: Roland Dreier <rolandd@cisco.com>
> ---
>  drivers/net/cxgb3/mc5.c   |    2 +-
>  drivers/net/cxgb3/sge.c   |    2 +-
>  drivers/net/cxgb3/t3_hw.c |   22 ++++++++++------------
>  3 files changed, 12 insertions(+), 14 deletions(-)

applied



^ permalink raw reply

* Re: [PATCH 1/7] bonding: fix parameter parsing
From: Jeff Garzik @ 2008-01-30  8:38 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev
In-Reply-To: <12016588701127-git-send-email-fubar@us.ibm.com>

Jay Vosburgh wrote:
> 	My last fix (commit ece95f7fefe3afae19e641e1b3f5e64b00d5b948)
> didn't handle one case correctly.  This resolves that, and it will now
> correctly parse parameters with arbitrary white space, and either text
> names or mode values.
> 
> Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
> ---
>  drivers/net/bonding/bond_main.c |   17 +++++++++++------
>  1 files changed, 11 insertions(+), 6 deletions(-)

applied 1-7



^ permalink raw reply

* Re: [PATCH] [2/2] Remove some unnecessary gotos in established_get_first()
From: Andi Kleen @ 2008-01-30  8:27 UTC (permalink / raw)
  To: Oliver Neukum; +Cc: netdev, davem
In-Reply-To: <200801300925.12397.oliver@neukum.org>

Oliver Neukum <oliver@neukum.org> writes:

> Am Mittwoch, 30. Januar 2008 09:01:10 schrieb Andi Kleen:
>> 
>> gcc does not generate different code for return foo vs bar = foo; goto x;
>> x: return bar; So convert it all to direct returns for better readability.
>
> Now suppose somebody needs to change locking. He'll have to convert
> it back. 

Please take a look at the overall /proc/net/tcp logic. Any locking 
change will be a major change to the code flow of the whole family
of functions.

-Andi

^ permalink raw reply

* Re: sis190 build breakage
From: Jeff Garzik @ 2008-01-30  8:22 UTC (permalink / raw)
  To: Sam Ravnborg; +Cc: Francois Romieu, maximilian attems, netdev
In-Reply-To: <20080130032838.GA17881@uranus.ravnborg.org>

Sam Ravnborg wrote:
> On Tue, Jan 29, 2008 at 11:03:10PM +0100, Francois Romieu wrote:
>> maximilian attems <max@stro.at> :
>>>   CC [M]  drivers/net/sis190.o
>>>   drivers/net/sis190.c:329: error: sis190_pci_tbl causes a section type conflict
>>>   make[5]: *** [drivers/net/sis190.o] Error 1
>>>
>>> gcc --version
>>> gcc (GCC) 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)
> 
> Looks like a bug where __initdata has been used
> for const data.
> Searching:
> static int __devinit sis190_get_mac_addr_from_apc(struct pci_dev *pdev,
>                                                   struct net_device *dev)
> {
>         static const u16 __devinitdata ids[] = { 0x0965, 0x0966, 0x0968 };
>         struct sis190_private *tp = netdev_priv(dev);
>         struct pci_dev *isa_bridge;
>         u8 reg, tmp8;
> 
> Try to change this to __initconst and it should be fixed.

We have __initconst now?

Three cheers, and a beer, to whomever did that...

	Jeff




^ permalink raw reply

* Re: [PATCH] [2/2] Remove some unnecessary gotos in established_get_first()
From: Oliver Neukum @ 2008-01-30  8:25 UTC (permalink / raw)
  To: Andi Kleen; +Cc: netdev, davem
In-Reply-To: <20080130080110.35B691B416F@basil.firstfloor.org>

Am Mittwoch, 30. Januar 2008 09:01:10 schrieb Andi Kleen:
> 
> gcc does not generate different code for return foo vs bar = foo; goto x;
> x: return bar; So convert it all to direct returns for better readability.

Now suppose somebody needs to change locking. He'll have to convert
it back. IMHO a conditional return is worse than "goto clearly_named_label"

	Regards
		Oliver

^ permalink raw reply

* [PATCH] [1/2] Skip empty hash buckets faster in /proc/net/tcp
From: Andi Kleen @ 2008-01-30  8:01 UTC (permalink / raw)
  To: meissner, netdev, davem

On most systems most of the TCP established/time-wait hash buckets are empty.
When walking the hash table for /proc/net/tcp their read locks would
always be acquired just to find out they're empty. This patch changes the code
to check first if the buckets have any entries before taking the lock, which
is much cheaper than taking a lock. Since the hash tables are large
this makes a measurable difference on processing /proc/net/tcp, 
especially on architectures with slow read_lock (e.g. PPC) 

On a 2GB Core2 system here I see a time cat /proc/net/tcp > /dev/null
consistently dropping from 0.44s to 0.4-0.8s system time with this change.
This is with mostly empty hash tables.

On systems with slower atomics (like P4 or POWER4) or larger hash tables
(more RAM) the difference is much higher.

This can be noticeable because there are some daemons around who regularly
scan /proc/net/tcp.

Original idea for this patch from Marcus Meissner, but redone by me.

Cc: meissner@suse.de
Signed-off-by: Andi Kleen <ak@suse.de>

---
 net/ipv4/tcp_ipv4.c |   30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

Index: linux/net/ipv4/tcp_ipv4.c
===================================================================
--- linux.orig/net/ipv4/tcp_ipv4.c
+++ linux/net/ipv4/tcp_ipv4.c
@@ -2039,6 +2039,12 @@ static void *listening_get_idx(struct se
 	return rc;
 }

+static inline int empty_bucket(struct tcp_iter_state *st)
+{
+	return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
+		hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+}
+
 static void *established_get_first(struct seq_file *seq)
 {
 	struct tcp_iter_state* st = seq->private;
@@ -2050,6 +2056,10 @@ static void *established_get_first(struc
 		struct inet_timewait_sock *tw;
 		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

+		/* Lockless fast path for the common case of empty buckets */
+		if (empty_bucket(st))
+			continue;
+
 		read_lock_bh(lock);
 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
@@ -2097,13 +2107,15 @@ get_tw:
 		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		st->state = TCP_SEQ_STATE_ESTABLISHED;

-		if (++st->bucket < tcp_hashinfo.ehash_size) {
-			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
-		} else {
-			cur = NULL;
-			goto out;
-		}
+		/* Look for next non empty bucket */
+		while (++st->bucket < tcp_hashinfo.ehash_size &&
+				empty_bucket(st))
+			;
+		if (st->bucket >= tcp_hashinfo.ehash_size)
+			return NULL;
+
+		read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+		sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
 	} else
 		sk = sk_next(sk);

^ permalink raw reply

* [PATCH] [2/2] Remove some unnecessary gotos in established_get_first()
From: Andi Kleen @ 2008-01-30  8:01 UTC (permalink / raw)
  To: netdev, davem
In-Reply-To: <20080130901.138686634@suse.de>


gcc does not generate different code for return foo vs bar = foo; goto x;
x: return bar; So convert it all to direct returns for better readability.

Signed-off-by: Andi Kleen <ak@suse.de>

Index: linux/net/ipv4/tcp_ipv4.c
===================================================================
--- linux.orig/net/ipv4/tcp_ipv4.c
+++ linux/net/ipv4/tcp_ipv4.c
@@ -2065,8 +2065,7 @@ static void *established_get_first(struc
 			if (sk->sk_family != st->family) {
 				continue;
 			}
-			rc = sk;
-			goto out;
+			return sk;
 		}
 		st->state = TCP_SEQ_STATE_TIME_WAIT;
 		inet_twsk_for_each(tw, node,
@@ -2074,13 +2073,11 @@ static void *established_get_first(struc
 			if (tw->tw_family != st->family) {
 				continue;
 			}
-			rc = tw;
-			goto out;
+			return tw;
 		}
 		read_unlock_bh(lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
-out:
 	return rc;
 }
 
@@ -2100,10 +2097,8 @@ get_tw:
 		while (tw && tw->tw_family != st->family) {
 			tw = tw_next(tw);
 		}
-		if (tw) {
-			cur = tw;
-			goto out;
-		}
+		if (tw)
+			return tw;
 		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
@@ -2121,16 +2116,12 @@ get_tw:
 
 	sk_for_each_from(sk, node) {
 		if (sk->sk_family == st->family)
-			goto found;
+			return sk;
 	}
 
 	st->state = TCP_SEQ_STATE_TIME_WAIT;
 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
 	goto get_tw;
-found:
-	cur = sk;
-out:
-	return cur;
 }
 
 static void *established_get_idx(struct seq_file *seq, loff_t pos)

^ permalink raw reply

* Re: 2.6.24-git6 net build failure when SYSFS=n, PROC_FS=n
From: Daniel Lezcano @ 2008-01-30  8:05 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: netdev, lkml, David Miller
In-Reply-To: <20080129160254.b7ff01bc.randy.dunlap@oracle.com>

Randy Dunlap wrote:
> linux-2.6.24-git6/net/ipv4/fib_frontend.c: In function 'fib_net_init':
> linux-2.6.24-git6/net/ipv4/fib_frontend.c:1024: error: implicit declaration of function 'fib_proc_init'
> linux-2.6.24-git6/net/ipv4/fib_frontend.c: In function 'fib_net_exit':
> linux-2.6.24-git6/net/ipv4/fib_frontend.c:1039: error: implicit declaration of function 'fib_proc_exit'
> 
> linux-2.6.24-git6/net/ipv6/sysctl_net_ipv6.c: In function 'ipv6_sysctl_net_init':
> linux-2.6.24-git6/net/ipv6/sysctl_net_ipv6.c:71: error: implicit declaration of function 'ipv6_route_sysctl_init'
> linux-2.6.24-git6/net/ipv6/sysctl_net_ipv6.c:71: warning: assignment makes pointer from integer without a cast
> linux-2.6.24-git6/net/ipv6/sysctl_net_ipv6.c:75: error: implicit declaration of function 'ipv6_icmp_sysctl_init'
> linux-2.6.24-git6/net/ipv6/sysctl_net_ipv6.c:75: warning: assignment makes pointer from integer without a cast
> 
> 
> config attached.
> 
> ---
> ~Randy

Hi,

thanks for catching this. I sent a fix for these two compilation errors 
a few days ago. I guess they will be merged very soon by Dave.

   -- Daniel

^ permalink raw reply

* [IPV4] route cache: Introduce rt_genid for smooth cache invalidation
From: Eric Dumazet @ 2008-01-30  8:08 UTC (permalink / raw)
  To: David S. Miller; +Cc: Linux Netdev List, Robert Olsson

[-- Attachment #1: Type: text/plain, Size: 1975 bytes --]

Current ip route cache implementation is not suited to large caches.

We can consume a lot of CPU when cache must be invalidated, since we
currently need to evict all cache entries, and this eviction is
sometimes asynchronous. min_delay & max_delay can somewhat control this
asynchronism behavior, but whole thing is a kludge, regularly triggering
infamous soft lockup messages. When entries are still in use, this also
consumes a lot of ram, filling dst_garbage.list.

A better scheme is to use a generation identifier on each entry,
so that cache invalidation can be performed by changing the table
identifier, without having to scan all entries.
No more delayed flushing, no more stalling when secret_interval expires.

Invalidated entries will then be freed at GC time (controlled by
ip_rt_gc_timeout or stress), or when an invalidated entry is found
in a chain when an insert is done.
Thus we keep a normal equilibrium.

This patch :
- renames rt_hash_rnd to rt_genid (and makes it an atomic_t)
- Adds a new rt_genid field to 'struct rtable' (filling a hole on 64bit)
- Checks entry->rt_genid at appropriate places :
--- Readers have to ignore invalidated entries.
--- Writers can delete invalidated entries.
- Removes rt_flush_timer timer
- Removes unused /proc/sys/net/ipv4/{min_delay,max_delay}

We even reduce size of route.o

# size net/ipv4/route.o
    text    data     bss     dec     hex filename
   20038    1331     160   21529    5419 net/ipv4/route.o.before
   19991    1203     104   21298    5332 net/ipv4/route.o

Next step will be to audit all rt_cache_flush(0) (aka flushes) users, see
if they can be converted to "invalidate the cache" users.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

  Documentation/filesystems/proc.txt |    4
  include/linux/sysctl.h             |    4
  include/net/route.h                |    1
  net/ipv4/route.c                   |  209 +++++++++++----------------
  4 files changed, 92 insertions(+), 126 deletions(-)

[-- Attachment #2: rt_genid.patch --]
[-- Type: text/plain, Size: 14825 bytes --]

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 4413a2d..11fe51c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1919,11 +1919,6 @@ max_size
 Maximum size  of  the routing cache. Old entries will be purged once the cache
 reached has this size.
 
-max_delay, min_delay
---------------------
-
-Delays for flushing the routing cache.
-
 redirect_load, redirect_number
 ------------------------------
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 89faebf..bf4ae4e 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -440,8 +440,8 @@ enum
 
 enum {
 	NET_IPV4_ROUTE_FLUSH=1,
-	NET_IPV4_ROUTE_MIN_DELAY=2,
-	NET_IPV4_ROUTE_MAX_DELAY=3,
+	NET_IPV4_ROUTE_MIN_DELAY=2, /* obsolete since 2.6.25 */
+	NET_IPV4_ROUTE_MAX_DELAY=3, /* obsolete since 2.6.25 */
 	NET_IPV4_ROUTE_GC_THRESH=4,
 	NET_IPV4_ROUTE_MAX_SIZE=5,
 	NET_IPV4_ROUTE_GC_MIN_INTERVAL=6,
diff --git a/include/net/route.h b/include/net/route.h
index 4eabf00..c0675a4 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -61,6 +61,7 @@ struct rtable
 
 	struct in_device	*idev;
 	
+	int			rt_genid;
 	unsigned		rt_flags;
 	__u16			rt_type;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 896c768..922f2ae 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -117,8 +117,6 @@
 
 #define RT_GC_TIMEOUT (300*HZ)
 
-static int ip_rt_min_delay		= 2 * HZ;
-static int ip_rt_max_delay		= 10 * HZ;
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
 static int ip_rt_gc_interval		= 60 * HZ;
@@ -133,12 +131,9 @@ static int ip_rt_mtu_expires		= 10 * 60 * HZ;
 static int ip_rt_min_pmtu		= 512 + 20 + 20;
 static int ip_rt_min_advmss		= 256;
 static int ip_rt_secret_interval	= 10 * 60 * HZ;
-static int ip_rt_flush_expected;
-static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
 
-static struct timer_list rt_flush_timer;
 static void rt_worker_func(struct work_struct *work);
 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 static struct timer_list rt_secret_timer;
@@ -259,19 +254,16 @@ static inline void rt_hash_lock_init(void)
 static struct rt_hash_bucket 	*rt_hash_table;
 static unsigned			rt_hash_mask;
 static unsigned int		rt_hash_log;
-static unsigned int		rt_hash_rnd;
+static atomic_t			rt_genid;
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) \
 	(__raw_get_cpu_var(rt_cache_stat).field++)
 
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
-				struct rtable **res);
-
 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 {
-	return (jhash_2words(daddr, saddr, rt_hash_rnd)
-		& rt_hash_mask);
+	return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
+		& rt_hash_mask;
 }
 
 #define rt_hash(daddr, saddr, idx) \
@@ -281,27 +273,28 @@ static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
 	int bucket;
+	int genid;
 };
 
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
+static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
 {
 	struct rtable *r = NULL;
-	struct rt_cache_iter_state *st = seq->private;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
-		if (r)
-			break;
+		r = rcu_dereference(rt_hash_table[st->bucket].chain);
+		while (r) {
+			if (r->rt_genid == st->genid)
+				return r;
+			r = rcu_dereference(r->u.dst.rt_next);
+		}
 		rcu_read_unlock_bh();
 	}
-	return rcu_dereference(r);
+	return r;
 }
 
-static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
+static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
 {
-	struct rt_cache_iter_state *st = seq->private;
-
 	r = r->u.dst.rt_next;
 	while (!r) {
 		rcu_read_unlock_bh();
@@ -313,29 +306,38 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 	return rcu_dereference(r);
 }
 
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
+static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
 {
-	struct rtable *r = rt_cache_get_first(seq);
+	struct rtable *r = rt_cache_get_first(st);
 
 	if (r)
-		while (pos && (r = rt_cache_get_next(seq, r)))
+		while (pos && (r = rt_cache_get_next(st, r))) {
+			if (r->rt_genid != st->genid)
+				continue;
 			--pos;
+		}
 	return pos ? NULL : r;
 }
 
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+	struct rt_cache_iter_state *st = seq->private;
+
+	if (*pos)
+		return rt_cache_get_idx(st, *pos - 1);
+	st->genid = atomic_read(&rt_genid);
+	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r = NULL;
+	struct rtable *r;
+	struct rt_cache_iter_state *st = seq->private;
 
 	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(seq);
+		r = rt_cache_get_first(st);
 	else
-		r = rt_cache_get_next(seq, v);
+		r = rt_cache_get_next(st, v);
 	++*pos;
 	return r;
 }
@@ -708,6 +710,11 @@ static void rt_check_expire(void)
 			continue;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
+			if (rth->rt_genid != atomic_read(&rt_genid)) {
+				*rthp = rth->u.dst.rt_next;
+				rt_free(rth);
+				continue;
+			}
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
@@ -732,83 +739,45 @@ static void rt_check_expire(void)
 
 /*
  * rt_worker_func() is run in process context.
- * If a whole flush was scheduled, it is done.
- * Else, we call rt_check_expire() to scan part of the hash table
+ * It calls rt_check_expire() to scan part of the hash table.
  */
 static void rt_worker_func(struct work_struct *work)
 {
-	if (ip_rt_flush_expected) {
-		ip_rt_flush_expected = 0;
-		rt_do_flush(1);
-	} else
-		rt_check_expire();
+	rt_check_expire();
 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 }
 
-/* This can run from both BH and non-BH contexts, the latter
- * in the case of a forced flush event.
+/*
+ * Perturbation of rt_genid by a small quantity [1..256].
+ * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
+ * many times (2^24) without reusing a recent rt_genid.
+ * Jenkins hash is strong enough that little changes of rt_genid are OK.
  */
-static void rt_run_flush(unsigned long process_context)
+static void rt_cache_invalidate(void)
 {
-	rt_deadline = 0;
-
-	get_random_bytes(&rt_hash_rnd, 4);
+	unsigned char shuffle;
 
-	rt_do_flush(process_context);
+	get_random_bytes(&shuffle, sizeof(shuffle));
+	atomic_add(shuffle + 1U, &rt_genid);
 }
 
-static DEFINE_SPINLOCK(rt_flush_lock);
-
+/*
+ * delay < 0  : invalidate cache (fast : entries will be deleted later)
+ * delay >= 0 : invalidate & flush cache (can be long)
+ */
 void rt_cache_flush(int delay)
 {
-	unsigned long now = jiffies;
-	int user_mode = !in_softirq();
-
-	if (delay < 0)
-		delay = ip_rt_min_delay;
-
-	spin_lock_bh(&rt_flush_lock);
-
-	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
-		long tmo = (long)(rt_deadline - now);
-
-		/* If flush timer is already running
-		   and flush request is not immediate (delay > 0):
-
-		   if deadline is not achieved, prolongate timer to "delay",
-		   otherwise fire it at deadline time.
-		 */
-
-		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
-			tmo = 0;
-
-		if (delay > tmo)
-			delay = tmo;
-	}
-
-	if (delay <= 0) {
-		spin_unlock_bh(&rt_flush_lock);
-		rt_run_flush(user_mode);
-		return;
-	}
-
-	if (rt_deadline == 0)
-		rt_deadline = now + ip_rt_max_delay;
-
-	mod_timer(&rt_flush_timer, now+delay);
-	spin_unlock_bh(&rt_flush_lock);
+	rt_cache_invalidate();
+	if (delay >= 0)
+		rt_do_flush(!in_softirq());
 }
 
 /*
- * We change rt_hash_rnd and ask next rt_worker_func() invocation
- * to perform a flush in process context
+ * We change rt_genid and let gc do the cleanup
  */
 static void rt_secret_rebuild(unsigned long dummy)
 {
-	get_random_bytes(&rt_hash_rnd, 4);
-	ip_rt_flush_expected = 1;
-	cancel_delayed_work(&expires_work);
-	schedule_delayed_work(&expires_work, HZ/10);
+	rt_cache_invalidate();
 	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
@@ -885,7 +854,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
 			rthp = &rt_hash_table[k].chain;
 			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
-				if (!rt_may_expire(rth, tmo, expire)) {
+				if (rth->rt_genid == atomic_read(&rt_genid) &&
+					!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
 					continue;
@@ -966,6 +936,11 @@ restart:
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
+		if (rth->rt_genid != atomic_read(&rt_genid)) {
+			*rthp = rth->u.dst.rt_next;
+			rt_free(rth);
+			continue;
+		}
 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 			/* Put it first */
 			*rthp = rth->u.dst.rt_next;
@@ -1131,17 +1106,19 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 
 static void rt_del(unsigned hash, struct rtable *rt)
 {
-	struct rtable **rthp;
+	struct rtable **rthp, *aux;
 
+	rthp = &rt_hash_table[hash].chain;
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
-	for (rthp = &rt_hash_table[hash].chain; *rthp;
-	     rthp = &(*rthp)->u.dst.rt_next)
-		if (*rthp == rt) {
-			*rthp = rt->u.dst.rt_next;
-			rt_free(rt);
-			break;
+	while ((aux = *rthp) != NULL) {
+		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
+			*rthp = aux->u.dst.rt_next;
+			rt_free(aux);
+			continue;
 		}
+		rthp = &aux->u.dst.rt_next;
+	}
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
@@ -1186,7 +1163,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				if (rth->fl.fl4_dst != daddr ||
 				    rth->fl.fl4_src != skeys[i] ||
 				    rth->fl.oif != ikeys[k] ||
-				    rth->fl.iif != 0) {
+				    rth->fl.iif != 0 ||
+				    rth->rt_genid != atomic_read(&rt_genid)) {
 					rthp = &rth->u.dst.rt_next;
 					continue;
 				}
@@ -1224,7 +1202,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				rt->u.dst.neighbour	= NULL;
 				rt->u.dst.hh		= NULL;
 				rt->u.dst.xfrm		= NULL;
-
+				rt->rt_genid		= atomic_read(&rt_genid);
 				rt->rt_flags		|= RTCF_REDIRECTED;
 
 				/* Gateway is different ... */
@@ -1445,7 +1423,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 			    rth->rt_src  == iph->saddr &&
 			    rth->fl.iif == 0 &&
 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
-			    rth->u.dst.dev->nd_net == net) {
+			    rth->u.dst.dev->nd_net == net &&
+			    rth->rt_genid == atomic_read(&rt_genid)) {
 				unsigned short mtu = new_mtu;
 
 				if (new_mtu < 68 || new_mtu >= old_mtu) {
@@ -1680,8 +1659,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->fl.oif	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
-	rth->rt_type	= RTN_MULTICAST;
+	rth->rt_genid	= atomic_read(&rt_genid);
 	rth->rt_flags	= RTCF_MULTICAST;
+	rth->rt_type	= RTN_MULTICAST;
 	if (our) {
 		rth->u.dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1820,6 +1800,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
 
 	rth->u.dst.input = ip_forward;
 	rth->u.dst.output = ip_output;
+	rth->rt_genid = atomic_read(&rt_genid);
 
 	rt_set_nexthop(rth, res, itag);
 
@@ -1980,6 +1961,7 @@ local_input:
 		goto e_nobufs;
 
 	rth->u.dst.output= ip_rt_bug;
+	rth->rt_genid = atomic_read(&rt_genid);
 
 	atomic_set(&rth->u.dst.__refcnt, 1);
 	rth->u.dst.flags= DST_HOST;
@@ -2071,7 +2053,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		    rth->fl.oif == 0 &&
 		    rth->fl.mark == skb->mark &&
 		    rth->fl.fl4_tos == tos &&
-		    rth->u.dst.dev->nd_net == net) {
+		    rth->u.dst.dev->nd_net == net &&
+		    rth->rt_genid == atomic_read(&rt_genid)) {
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
@@ -2199,6 +2182,7 @@ static inline int __mkroute_output(struct rtable **result,
 	rth->rt_spec_dst= fl->fl4_src;
 
 	rth->u.dst.output=ip_output;
+	rth->rt_genid = atomic_read(&rt_genid);
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
@@ -2471,7 +2455,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    rth->u.dst.dev->nd_net == net) {
+		    rth->u.dst.dev->nd_net == net &&
+		    rth->rt_genid == atomic_read(&rt_genid)) {
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
@@ -2525,6 +2510,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock
 		rt->idev = ort->idev;
 		if (rt->idev)
 			in_dev_hold(rt->idev);
+		rt->rt_genid = atomic_read(&rt_genid);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_dst = ort->rt_dst;
@@ -2779,6 +2765,8 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
 			if (idx < s_idx)
 				continue;
+			if (rt->rt_genid != atomic_read(&rt_genid))
+				continue;
 			skb->dst = dst_clone(&rt->u.dst);
 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
@@ -2848,24 +2836,6 @@ ctl_table ipv4_route_table[] = {
 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
-		.procname	= "min_delay",
-		.data		= &ip_rt_min_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-	{
-		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
-		.procname	= "max_delay",
-		.data		= &ip_rt_max_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-	{
 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
 		.procname	= "gc_thresh",
 		.data		= &ipv4_dst_ops.gc_thresh,
@@ -3023,8 +2993,8 @@ int __init ip_rt_init(void)
 {
 	int rc = 0;
 
-	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
-			     (jiffies ^ (jiffies >> 7)));
+	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
+			     (jiffies ^ (jiffies >> 7))));
 
 #ifdef CONFIG_NET_CLS_ROUTE
 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
@@ -3057,7 +3027,6 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
-	setup_timer(&rt_flush_timer, rt_run_flush, 0);
 	setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
 
 	/* All the timers, started at system startup tend

^ permalink raw reply related

* Re: [BUILD FAILURE]2.6.24-git6 build failure on sis190 ethernet driver
From: Kamalesh Babulal @ 2008-01-30  8:03 UTC (permalink / raw)
  To: Sam Ravnborg; +Cc: LKML, netdev, romieu, Andy Whitcroft
In-Reply-To: <20080130053451.GA18641@uranus.ravnborg.org>

Sam Ravnborg wrote:
> On Wed, Jan 30, 2008 at 09:11:36AM +0530, Kamalesh Babulal wrote:
>> Hi,
>>
>> The 2.6.24-git6 kernel build fails on various x86_64 machines with the build failure
>>
>> drivers/net/sis190.c:329: error: sis190_pci_tbl causes a section type conflict
>> make[2]: *** [drivers/net/sis190.o] Error 1
>>
>> # gcc --version (machine1)
>> gcc (GCC) 4.1.1 20070105 (Red Hat 4.1.1-52)
>>
>> # gcc --version (machine2)
>> gcc (GCC) 4.1.1 20060525 (Red Hat 4.1.1-1)
> 
> Hi Kamalesh
> 
> I know another patch is circulating, but please try the following.

Hi Sam,

Thanks, the patch fixes the build failure.

Tested-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>

> diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c
> index b570402..0a5e024 100644
> --- a/drivers/net/sis190.c
> +++ b/drivers/net/sis190.c
> @@ -1556,7 +1556,7 @@ static int __devinit sis190_get_mac_addr_from_eeprom(struct pci_dev *pdev,
>  static int __devinit sis190_get_mac_addr_from_apc(struct pci_dev *pdev,
>  						   struct net_device *dev)
>  {
> -	static const u16 __devinitdata ids[] = { 0x0965, 0x0966, 0x0968 };
> +	static const u16 __devinitconst ids[] = { 0x0965, 0x0966, 0x0968 };
>  	struct sis190_private *tp = netdev_priv(dev);
>  	struct pci_dev *isa_bridge;
>  	u8 reg, tmp8;
> 
> It is the better fix if you can confirm it working.
> The section conflict issued by gcc happens because we try to
> mix const and non-const data in the same section.
> 
> 	Sam


-- 
Thanks & Regards,
Kamalesh Babulal,
Linux Technology Center,
IBM, ISTL.

^ permalink raw reply

* Re: [BUILD FAILURE]2.6.24-git6 build failure on sis190 ethernet driver
From: Kamalesh Babulal @ 2008-01-30  8:01 UTC (permalink / raw)
  To: Gabriel C; +Cc: LKML, netdev, romieu, Andy Whitcroft
In-Reply-To: <47A00057.9090306@googlemail.com>

Gabriel C wrote:
> Kamalesh Babulal wrote:
>> Hi,
>>
>> The 2.6.24-git6 kernel build fails on various x86_64 machines with the build failure
>>
>> drivers/net/sis190.c:329: error: sis190_pci_tbl causes a section type conflict
>> make[2]: *** [drivers/net/sis190.o] Error 1
>>
>> # gcc --version (machine1)
>> gcc (GCC) 4.1.1 20070105 (Red Hat 4.1.1-52)
>>
>> # gcc --version (machine2)
>> gcc (GCC) 4.1.1 20060525 (Red Hat 4.1.1-1)
>>
> 
> Heh :) vger.kernel.org does not like emails directly from gmail , it seems =)
> 
> ( sorry for sending this 3 time now )
> 
> The following patch should fix the build failure.
> 
> diff --git a/drivers/net/sis190.c b/drivers/net/sis190.c
> index b570402..e48e4ad 100644
> --- a/drivers/net/sis190.c
> +++ b/drivers/net/sis190.c
> @@ -326,7 +326,7 @@ static const struct {
>  	{ "SiS 191 PCI Gigabit Ethernet adapter" },
>  };
> 
> -static struct pci_device_id sis190_pci_tbl[] __devinitdata = {
> +static const struct pci_device_id sis190_pci_tbl[] __devinitdata = {
>  	{ PCI_DEVICE(PCI_VENDOR_ID_SI, 0x0190), 0, 0, 0 },
>  	{ PCI_DEVICE(PCI_VENDOR_ID_SI, 0x0191), 0, 0, 1 },
>  	{ 0, },
> 
> 
> Gabriel

Hi Gabriel,

Thanks, the patch fixes the build failure.


-- 
Thanks & Regards,
Kamalesh Babulal,
Linux Technology Center,
IBM, ISTL.

^ permalink raw reply

* Re: [PATCH] Optimize cxgb3 xmit path (a bit)
From: Krishna Kumar2 @ 2008-01-30  7:05 UTC (permalink / raw)
  To: jeff; +Cc: davem, netdev
In-Reply-To: <20080130070016.29078.94125.sendpatchset@N20wks267652wss.in.ibm.com>

I forgot to mention that the patch is only compile-tested, as I don't have
the hardware to test it.

Krishna Kumar2/India/IBM@IBMIN wrote on 01/30/2008 12:30:16 PM:

> Changes:
>    1. Add common code for stopping queue.
>    2. No need to call netif_stop_queue followed by netif_wake_queue (and
>       in fact a netif_start_queue could have been used instead), instead
>       call stop_queue if required, and remove code under USE_GTS macro.
>    3. There is no need to check for netif_queue_stopped, as the network
>       core guarantees that for us (I am sure every driver could remove
>       that check, eg e1000 - I have tested that path a few billion times
>       with about a few hundred thousand qstops but the condition never
>       hit even once).
>
> Thanks,
>
> - KK
>
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> ---
>  sge.c |   35 +++++++++++++++--------------------
>  1 files changed, 15 insertions(+), 20 deletions(-)
>
> diff -ruNp a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
> --- a/drivers/net/cxgb3/sge.c   2008-01-30 11:42:39.000000000 +0530
> +++ b/drivers/net/cxgb3/sge.c   2008-01-30 12:15:28.000000000 +0530
> @@ -1059,6 +1059,14 @@ static void write_tx_pkt_wr(struct adapt
>            htonl(V_WR_TID(q->token)));
>  }
>
> +static inline void t3_stop_queue(struct net_device *dev, struct sge_qset *qs,
> +             struct sge_txq *q)
> +{
> +   netif_stop_queue(dev);
> +   set_bit(TXQ_ETH, &qs->txq_stopped);
> +   q->stops++;
> +}
> +
>  /**
>   *   eth_xmit - add a packet to the Ethernet Tx queue
>   *   @skb: the packet
> @@ -1090,31 +1098,18 @@ int t3_eth_xmit(struct sk_buff *skb, str
>     ndesc = calc_tx_descs(skb);
>
>     if (unlikely(credits < ndesc)) {
> -      if (!netif_queue_stopped(dev)) {
> -         netif_stop_queue(dev);
> -         set_bit(TXQ_ETH, &qs->txq_stopped);
> -         q->stops++;
> -         dev_err(&adap->pdev->dev,
> -            "%s: Tx ring %u full while queue awake!\n",
> -            dev->name, q->cntxt_id & 7);
> -      }
> +      t3_stop_queue(dev, qs, q);
> +      dev_err(&adap->pdev->dev,
> +         "%s: Tx ring %u full while queue awake!\n",
> +         dev->name, q->cntxt_id & 7);
>        spin_unlock(&q->lock);
>        return NETDEV_TX_BUSY;
>     }
>
>     q->in_use += ndesc;
> -   if (unlikely(credits - ndesc < q->stop_thres)) {
> -      q->stops++;
> -      netif_stop_queue(dev);
> -      set_bit(TXQ_ETH, &qs->txq_stopped);
> -#if !USE_GTS
> -      if (should_restart_tx(q) &&
> -          test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
> -         q->restarts++;
> -         netif_wake_queue(dev);
> -      }
> -#endif
> -   }
> +   if (unlikely(credits - ndesc < q->stop_thres))
> +      if (USE_GTS || !should_restart_tx(q))
> +         t3_stop_queue(dev, qs, q);
>
>     gen = q->gen;
>     q->unacked += ndesc;


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox