Netdev List
 help / color / mirror / Atom feed
* ipvs 27/31: Use atomic operations atomicly
From: Patrick McHardy @ 2009-09-10 16:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, Patrick McHardy, netfilter-devel
In-Reply-To: <20090910161142.31179.5256.sendpatchset@x2.localnet>

commit 1e66dafc75f40a08b2addb82779987b269b4ca23
Author: Simon Horman <horms@verge.net.au>
Date:   Mon Aug 31 14:18:48 2009 +0200

    ipvs: Use atomic operations atomicly
    
    A pointed out by Shin Hong, IPVS doesn't always use atomic operations
    in an atomic manner. While this seems unlikely to be manifest in
    strange behaviour, it seems appropriate to clean this up.
    
    Cc: shin hong <hongshin@gmail.com>
    Signed-off-by: Simon Horman <horms@verge.net.au>
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b227750..a986ee2 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1259,7 +1259,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int ret, restart, af;
+	int ret, restart, af, pkts;
 
 	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
 
@@ -1346,12 +1346,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	 * Sync connection if it is about to close to
 	 * encorage the standby servers to update the connections timeout
 	 */
-	atomic_inc(&cp->in_pkts);
+	pkts = atomic_add_return(1, &cp->in_pkts);
 	if (af == AF_INET &&
 	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
-	      (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+	      (pkts % sysctl_ip_vs_sync_threshold[1]
 	       == sysctl_ip_vs_sync_threshold[0])) ||
 	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
 	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 70ff82c..6182e8e 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -77,11 +77,12 @@ static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
 static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
 {
 	struct ip_vs_dest *dest;
-	int weight = 0;
+	int new_weight, weight = 0;
 
 	list_for_each_entry(dest, &svc->destinations, n_list) {
-		if (atomic_read(&dest->weight) > weight)
-			weight = atomic_read(&dest->weight);
+		new_weight = atomic_read(&dest->weight);
+		if (new_weight > weight)
+			weight = new_weight;
 	}
 
 	return weight;

^ permalink raw reply related

* netfilter 29/31: ip6t_eui: fix read outside array bounds
From: Patrick McHardy @ 2009-09-10 16:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, Patrick McHardy, netfilter-devel
In-Reply-To: <20090910161142.31179.5256.sendpatchset@x2.localnet>

commit 488908696971c5ea1dcc5d13f29c158ba4f6ae7d
Author: Patrick McHardy <kaber@trash.net>
Date:   Mon Aug 31 15:30:31 2009 +0200

    netfilter: ip6t_eui: fix read outside array bounds
    
    Use memcmp() instead of open coded comparison that reads one byte past
    the intended end.
    
    Based on patch from Roel Kluin <roel.kluin@gmail.com>
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index db610ba..ca287f6 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -23,7 +23,6 @@ static bool
 eui64_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	unsigned char eui64[8];
-	int i = 0;
 
 	if (!(skb_mac_header(skb) >= skb->head &&
 	      skb_mac_header(skb) + ETH_HLEN <= skb->data) &&
@@ -42,12 +41,8 @@ eui64_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 			eui64[4] = 0xfe;
 			eui64[0] ^= 0x02;
 
-			i = 0;
-			while (ipv6_hdr(skb)->saddr.s6_addr[8 + i] == eui64[i]
-			       && i < 8)
-				i++;
-
-			if (i == 8)
+			if (!memcmp(ipv6_hdr(skb)->saddr.s6_addr + 8, eui64,
+				    sizeof(eui64)))
 				return true;
 		}
 	}

^ permalink raw reply related

* netfilter 28/31: nf_conntrack: netns fix re reliable conntrack event delivery
From: Patrick McHardy @ 2009-09-10 16:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, Patrick McHardy, netfilter-devel
In-Reply-To: <20090910161142.31179.5256.sendpatchset@x2.localnet>

commit ee254fa44d902ab89fd0d66851701098f07872a7
Author: Alexey Dobriyan <adobriyan@gmail.com>
Date:   Mon Aug 31 14:23:15 2009 +0200

    netfilter: nf_conntrack: netns fix re reliable conntrack event delivery
    
    Conntracks in netns other than init_net dying list were never killed.
    
    Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
    Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 565c3a8..b371098 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1089,14 +1089,14 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
-static void nf_ct_release_dying_list(void)
+static void nf_ct_release_dying_list(struct net *net)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
 
 	spin_lock_bh(&nf_conntrack_lock);
-	hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		/* never fails to remove them, no listeners at this point */
 		nf_ct_kill(ct);
@@ -1115,7 +1115,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
 {
  i_see_dead_people:
 	nf_ct_iterate_cleanup(net, kill_all, NULL);
-	nf_ct_release_dying_list();
+	nf_ct_release_dying_list(net);
 	if (atomic_read(&net->ct.count) != 0) {
 		schedule();
 		goto i_see_dead_people;

^ permalink raw reply related

* IPVS 30/31: Add handling of incoming ICMPV6 messages
From: Patrick McHardy @ 2009-09-10 16:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, Patrick McHardy, netfilter-devel
In-Reply-To: <20090910161142.31179.5256.sendpatchset@x2.localnet>

commit 94b265514a8398ba3cfecb5a821a027b68a5c38e
Author: Julius Volz <julius.volz@gmail.com>
Date:   Mon Aug 31 16:22:23 2009 +0200

    IPVS: Add handling of incoming ICMPV6 messages
    
    Add handling of incoming ICMPv6 messages.
    This follows the handling of IPv4 ICMP messages.
    
    Amongst ther things this problem allows IPVS to behave sensibly
    when an ICMPV6_PKT_TOOBIG message is received:
    
    This message is received when a realserver sends a packet >PMTU to the
    client. The hop on this path with insufficient MTU will generate an
    ICMPv6 Packet Too Big message back to the VIP. The LVS server receives
    this message, but the call to the function handling this has been
    missing. Thus, IPVS fails to forward the message to the real server,
    which then does not adjust the path MTU. This patch adds the missing
    call to ip_vs_in_icmp_v6() in ip_vs_in() to handle this situation.
    
    Thanks to Rob Gallagher from HEAnet for reporting this issue and for
    testing this patch in production (with direct routing mode).
    
    [horms@verge.net.au: tweaked changelog]
    Signed-off-by: Julius Volz <julius.volz@gmail.com>
    Tested-by: Rob Gallagher <robert.gallagher@heanet.ie>
    Signed-off-by: Simon Horman <horms@verge.net.au>
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index a986ee2..b95699f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1277,13 +1277,24 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 		return NF_ACCEPT;
 	}
 
-	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-		int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+			int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
 
-		if (related)
-			return verdict;
-		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-	}
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
+	} else
+#endif
+		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+			int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+
+			if (related)
+				return verdict;
+			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		}
 
 	/* Protocol supported? */
 	pp = ip_vs_proto_get(iph.protocol);

^ permalink raw reply related

* netfilter 31/31: ebt_ulog: fix checkentry return value
From: Patrick McHardy @ 2009-09-10 16:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, Patrick McHardy, netfilter-devel
In-Reply-To: <20090910161142.31179.5256.sendpatchset@x2.localnet>

commit 8a56df0ae1690f8f42a3c6c4532f4b06f93febea
Author: Patrick McHardy <kaber@trash.net>
Date:   Tue Sep 1 14:34:01 2009 +0200

    netfilter: ebt_ulog: fix checkentry return value
    
    Commit 19eda87 (netfilter: change return types of check functions for
    Ebtables extensions) broke the ebtables ulog module by missing a return
    value conversion.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 133eeae..ce50688 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -266,7 +266,7 @@ static bool ebt_ulog_tg_check(const struct xt_tgchk_param *par)
 	if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN)
 		uloginfo->qthreshold = EBT_ULOG_MAX_QLEN;
 
-	return 0;
+	return true;
 }
 
 static struct xt_target ebt_ulog_tg_reg __read_mostly = {

^ permalink raw reply related

* Re: [RFC] net/core: Delay neighbor only if it has been used after confirmed
From: Jens Rosenboom @ 2009-09-10 16:21 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki; +Cc: Linux Network Developers
In-Reply-To: <4A9E639B.20907@linux-ipv6.org>

On Wed, 2009-09-02 at 21:22 +0900, YOSHIFUJI Hideaki wrote:
[...]
> And, this "if" for REACHABLE->DELAY may be completely needless.
> Timer in REACHABLE is only for state transition for toward REACHABLE
> or STALE.

I did some testing with the following patch, which works fine for me, so
I propose this one now instead of my previous one. I still have no real
idea about the non-IPv6 implications of this, though.

---

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e587e68..f61926f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -819,13 +819,6 @@ static void neigh_timer_handler(unsigned long arg)
 				   neigh->confirmed + neigh->parms->reachable_time)) {
 			NEIGH_PRINTK2("neigh %p is still alive.\n", neigh);
 			next = neigh->confirmed + neigh->parms->reachable_time;
-		} else if (time_before_eq(now,
-					  neigh->used + neigh->parms->delay_probe_time)) {
-			NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
-			neigh->nud_state = NUD_DELAY;
-			neigh->updated = jiffies;
-			neigh_suspect(neigh);
-			next = now + neigh->parms->delay_probe_time;
 		} else {
 			NEIGH_PRINTK2("neigh %p is suspected.\n", neigh);
 			neigh->nud_state = NUD_STALE;



^ permalink raw reply related

* [PATCH net-next] ipv6: Ignore route option with ROUTER_PREF_INVALID
From: Jens Rosenboom @ 2009-09-10 16:25 UTC (permalink / raw)
  To: Linux Network Developers

RFC4191 says that "If the Reserved (10) value is received, the Route
Information Option MUST be ignored.", so this patch makes us conform
to the RFC. This is different to the usage of the Default Router
Preference, where an invalid value must indeed be treated as
PREF_MEDIUM.

Signed-off-by: Jens Rosenboom <me@jayr.de>

---

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1473ee0..f3635b7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -481,7 +481,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt,
int len,
 
 	pref = rinfo->route_pref;
 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
-		pref = ICMPV6_ROUTER_PREF_MEDIUM;
+		return -EINVAL;
 
 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 



^ permalink raw reply related

* Re: [PATCH] [NIU] VLAN does not work with niu driver
From: Rick Jones @ 2009-09-10 16:35 UTC (permalink / raw)
  To: Matheos Worku; +Cc: David Miller, Joyce.Yu, netdev
In-Reply-To: <4AA85404.5020300@sun.com>

> 
> We did throughput testing (netperf) and didn't notice any performance 
> degradation. We haven't done forwarding testing however.
> 

With my "Mr. Netperf" had on I'll ask if you just measured throughput, or if you 
also included CPU utilization/service demand?

rick jones

^ permalink raw reply

* Re: L2 switching in igb
From: Alexander Duyck @ 2009-09-10 17:30 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Alexander Duyck, Kirsher, Jeffrey T, Fischer, Anna,
	netdev@vger.kernel.org, David Miller, Stephen Hemminger
In-Reply-To: <4AA8B317.2080706@voltaire.com>

Or Gerlitz wrote:
> Note that VEPA mode is a characteristic of the PF, correct? and the PF 
> resides in the host kernel. Also, as I wrote you earlier, I do see many 
> schemes where a VF spawned in the host kernel IS very useful, and as 
> such I'd be happy to continue the discussion on the approach suggested 
> by Dave and Stephen, can you provide a pointer? (thanks).
> 
> Or.

You are correct, the vSwitch can basically do VEPA by disabling local 
loopback enable bit in the DTXSWC register.  This would force all 
traffic from the PF/VFs out the lan physical port and from the lan 
physical port to the appropriate PF/VFs without doing any switching in 
between PF/VFs.

I am currently thinking what probably needs to be done is to add an 
rtnl_link_ops interface to handle vSwitch configuration that could then 
be applied to the igb netdevs that support VEPA/vSwitch technologies.  A 
subset of that interface could then be dedicated to VF configuration to 
handle things such as spawning VFs, setting the default mac addresses, 
security controls, etc.

Thanks,

Alex


^ permalink raw reply

* [PATCH v2 3/3] ucc_geth: Fix hangs after switching from full to half duplex
From: Anton Vorontsov @ 2009-09-10 17:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linuxppc-dev, Andy Fleming, Timur Tabi
In-Reply-To: <20090910020145.GC31083@oksana.dev.rtsoft.ru>

MPC8360 QE UCC ethernet controllers hang when changing link duplex
under a load (a bit of NFS activity is enough).

  PHY: mdio@e0102120:00 - Link is Up - 1000/Full
  sh-3.00# ethtool -s eth0 speed 100 duplex half autoneg off
  PHY: mdio@e0102120:00 - Link is Down
  PHY: mdio@e0102120:00 - Link is Up - 100/Half
  NETDEV WATCHDOG: eth0 (ucc_geth): transmit queue 0 timed out
  ------------[ cut here ]------------
  Badness at c01fcbd0 [verbose debug info unavailable]
  NIP: c01fcbd0 LR: c01fcbd0 CTR: c0194e44
  ...

The cure is to disable the controller before changing speed/duplex
and enable it afterwards.

Since ugeth_graceful_stop_{tx,rx} now may be called from an atomic
context, switch the two functions from msleep() to mdelay().

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
---

v2: Switch ugeth_graceful_stop_{tx,rx} to mdelay. Noticed with
    CONFIG_DEBUG_SPINLOCK_SLEEP.

 drivers/net/ucc_geth.c |    8 ++++++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
index 2a2c973..5a0803f 100644
--- a/drivers/net/ucc_geth.c
+++ b/drivers/net/ucc_geth.c
@@ -1432,7 +1432,7 @@ static int ugeth_graceful_stop_tx(struct ucc_geth_private *ugeth)
 
 	/* Wait for command to complete */
 	do {
-		msleep(10);
+		mdelay(10);
 		temp = in_be32(uccf->p_ucce);
 	} while (!(temp & UCC_GETH_UCCE_GRA) && --i);
 
@@ -1464,7 +1464,7 @@ static int ugeth_graceful_stop_rx(struct ucc_geth_private *ugeth)
 						ucc_num);
 		qe_issue_cmd(QE_GRACEFUL_STOP_RX, cecr_subblock,
 			     QE_CR_PROTOCOL_ETHERNET, 0);
-		msleep(10);
+		mdelay(10);
 		temp = in_8(&ugeth->p_rx_glbl_pram->rxgstpack);
 	} while (!(temp & GRACEFUL_STOP_ACKNOWLEDGE_RX) && --i);
 
@@ -1631,9 +1631,13 @@ static void adjust_link(struct net_device *dev)
 			ugeth->oldspeed = phydev->speed;
 		}
 
+		ugeth_disable(ugeth, COMM_DIR_RX_AND_TX);
+
 		out_be32(&ug_regs->maccfg2, tempval);
 		out_be32(&uf_regs->upsmr, upsmr);
 
+		ugeth_enable(ugeth, COMM_DIR_RX_AND_TX);
+
 		if (!ugeth->oldlink) {
 			new_state = 1;
 			ugeth->oldlink = 1;
-- 
1.6.3.3


^ permalink raw reply related

* Re: [PATCH v2 3/3] ucc_geth: Fix hangs after switching from full to half duplex
From: Scott Wood @ 2009-09-10 18:04 UTC (permalink / raw)
  To: avorontsov; +Cc: David Miller, netdev, Andy Fleming, Timur Tabi, linuxppc-dev
In-Reply-To: <20090910175852.GA18948@oksana.dev.rtsoft.ru>

Anton Vorontsov wrote:
> MPC8360 QE UCC ethernet controllers hang when changing link duplex
> under a load (a bit of NFS activity is enough).
> 
>   PHY: mdio@e0102120:00 - Link is Up - 1000/Full
>   sh-3.00# ethtool -s eth0 speed 100 duplex half autoneg off
>   PHY: mdio@e0102120:00 - Link is Down
>   PHY: mdio@e0102120:00 - Link is Up - 100/Half
>   NETDEV WATCHDOG: eth0 (ucc_geth): transmit queue 0 timed out
>   ------------[ cut here ]------------
>   Badness at c01fcbd0 [verbose debug info unavailable]
>   NIP: c01fcbd0 LR: c01fcbd0 CTR: c0194e44
>   ...
> 
> The cure is to disable the controller before changing speed/duplex
> and enable it afterwards.
> 
> Since ugeth_graceful_stop_{tx,rx} now may be called from an atomic
> context, switch the two functions from msleep() to mdelay().

Ouch.  Can we put this in a workqueue or something?

-Scott

^ permalink raw reply

* [PATCH net-next-2.6 v2] bonding: introduce primary_passive option
From: Jiri Pirko @ 2009-09-10 18:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, fubar, bonding-devel, nicolas.2p.debian

(updated 2)

In some cases there is not desirable to switch back to primary interface when
it's link recovers and rather stay with currently active one. We need to avoid
packetloss as much as we can in some cases. This is solved by introducing
primary_passive option. Note that enslaved primary slave is set as current
active no matter what.

This patch depends on the following one:
[net-next-2.6] bonding: make ab_arp select active slaves as other modes
http://patchwork.ozlabs.org/patch/32684/

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index d5181ce..e5f2c31 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -614,6 +614,32 @@ primary
 
 	The primary option is only valid for active-backup mode.
 
+primary_passive
+
+	Specifies the behavior of the current active slave when the primary was
+	down and comes back up.  This option is designed to prevent
+	flip-flopping between the primary slave and other slaves.  The possible
+	values and their respective effects are:
+
+	disabled or 0 (default)
+
+		The primary slave becomes the active slave whenever it comes
+		back up.
+
+	better or 1
+
+		The primary slave becomes the active slave when it comes back
+		up, if the speed and duplex of the primary slave is better
+		than the speed and duplex of the current active slave.
+
+	always or 2
+
+		The primary slave becomes the active slave only if the current
+		active slave fails and the primary slave is up.
+
+	When no slave are active, if the primary comes back up, it becomes the
+	active slave, regardless of the value of primary_return
+
 updelay
 
 	Specifies the time, in milliseconds, to wait before enabling a
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a7e731f..193eca4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -94,6 +94,7 @@ static int downdelay;
 static int use_carrier	= 1;
 static char *mode;
 static char *primary;
+static char *primary_passive;
 static char *lacp_rate;
 static char *ad_select;
 static char *xmit_hash_policy;
@@ -126,6 +127,13 @@ MODULE_PARM_DESC(mode, "Mode of operation : 0 for balance-rr, "
 		       "6 for balance-alb");
 module_param(primary, charp, 0);
 MODULE_PARM_DESC(primary, "Primary network device to use");
+module_param(primary_passive, charp, 0);
+MODULE_PARM_DESC(primary_passive, "Do not set primary slave active "
+				  "once it comes up; "
+				  "0 for disabled (default), "
+				  "1 for on only if speed of primary is not "
+				  "better, "
+				  "2 for always on");
 module_param(lacp_rate, charp, 0);
 MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner "
 			    "(slow/fast)");
@@ -200,6 +208,13 @@ const struct bond_parm_tbl fail_over_mac_tbl[] = {
 {	NULL,			-1},
 };
 
+const struct bond_parm_tbl pri_passive_tbl[] = {
+{	"disabled",		BOND_PRI_PASSIVE_DISABLED},
+{	"better",		BOND_PRI_PASSIVE_BETTER},
+{	"always",		BOND_PRI_PASSIVE_ALWAYS},
+{	NULL,			-1},
+};
+
 struct bond_parm_tbl ad_select_tbl[] = {
 {	"stable",	BOND_AD_STABLE},
 {	"bandwidth",	BOND_AD_BANDWIDTH},
@@ -1070,6 +1085,25 @@ out:
 
 }
 
+static bool bond_should_loose_active(struct bonding *bond)
+{
+	struct slave *prim = bond->primary_slave;
+	struct slave *curr = bond->curr_active_slave;
+
+	if (!prim || !curr || curr->link != BOND_LINK_UP)
+		return true;
+	if (bond->force_primary) {
+		bond->force_primary = false;
+		return true;
+	}
+	if (bond->params.primary_passive == 1 &&
+	    (prim->speed < curr->speed ||
+	     (prim->speed == curr->speed && prim->duplex <= curr->duplex)))
+		return false;
+	if (bond->params.primary_passive == 2)
+		return false;
+	return true;
+}
 
 /**
  * find_best_interface - select the best available slave to be the active one
@@ -1093,15 +1127,9 @@ static struct slave *bond_find_best_slave(struct bonding *bond)
 			return NULL; /* still no slave, return NULL */
 	}
 
-	/*
-	 * first try the primary link; if arping, a link must tx/rx
-	 * traffic before it can be considered the curr_active_slave.
-	 * also, we would skip slaves between the curr_active_slave
-	 * and primary_slave that may be up and able to arp
-	 */
 	if ((bond->primary_slave) &&
-	    (!bond->params.arp_interval) &&
-	    (IS_UP(bond->primary_slave->dev))) {
+	    bond->primary_slave->link == BOND_LINK_UP &&
+	    bond_should_loose_active(bond)) {
 		new_active = bond->primary_slave;
 	}
 
@@ -1109,15 +1137,14 @@ static struct slave *bond_find_best_slave(struct bonding *bond)
 	old_active = new_active;
 
 	bond_for_each_slave_from(bond, new_active, i, old_active) {
-		if (IS_UP(new_active->dev)) {
-			if (new_active->link == BOND_LINK_UP) {
-				return new_active;
-			} else if (new_active->link == BOND_LINK_BACK) {
-				/* link up, but waiting for stabilization */
-				if (new_active->delay < mintime) {
-					mintime = new_active->delay;
-					bestslave = new_active;
-				}
+		if (new_active->link == BOND_LINK_UP) {
+			return new_active;
+		} else if (new_active->link == BOND_LINK_BACK &&
+			   IS_UP(new_active->dev)) {
+			/* link up, but waiting for stabilization */
+			if (new_active->delay < mintime) {
+				mintime = new_active->delay;
+				bestslave = new_active;
 			}
 		}
 	}
@@ -1683,8 +1710,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 
 	if (USES_PRIMARY(bond->params.mode) && bond->params.primary[0]) {
 		/* if there is a primary slave, remember it */
-		if (strcmp(bond->params.primary, new_slave->dev->name) == 0)
+		if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
 			bond->primary_slave = new_slave;
+			bond->force_primary = true;
+		}
 	}
 
 	write_lock_bh(&bond->curr_slave_lock);
@@ -2929,18 +2958,6 @@ static int bond_ab_arp_inspect(struct bonding *bond, int delta_in_ticks)
 		}
 	}
 
-	read_lock(&bond->curr_slave_lock);
-
-	/*
-	 * Trigger a commit if the primary option setting has changed.
-	 */
-	if (bond->primary_slave &&
-	    (bond->primary_slave != bond->curr_active_slave) &&
-	    (bond->primary_slave->link == BOND_LINK_UP))
-		commit++;
-
-	read_unlock(&bond->curr_slave_lock);
-
 	return commit;
 }
 
@@ -2961,90 +2978,58 @@ static void bond_ab_arp_commit(struct bonding *bond, int delta_in_ticks)
 			continue;
 
 		case BOND_LINK_UP:
-			write_lock_bh(&bond->curr_slave_lock);
-
-			if (!bond->curr_active_slave &&
-			    time_before_eq(jiffies, dev_trans_start(slave->dev) +
-					   delta_in_ticks)) {
+			if ((!bond->curr_active_slave &&
+			     time_before_eq(jiffies,
+					    dev_trans_start(slave->dev) +
+					    delta_in_ticks)) ||
+			    bond->curr_active_slave != slave) {
 				slave->link = BOND_LINK_UP;
-				bond_change_active_slave(bond, slave);
 				bond->current_arp_slave = NULL;
 
 				pr_info(DRV_NAME
-				       ": %s: %s is up and now the "
-				       "active interface\n",
-				       bond->dev->name, slave->dev->name);
-
-			} else if (bond->curr_active_slave != slave) {
-				/* this slave has just come up but we
-				 * already have a current slave; this can
-				 * also happen if bond_enslave adds a new
-				 * slave that is up while we are searching
-				 * for a new slave
-				 */
-				slave->link = BOND_LINK_UP;
-				bond_set_slave_inactive_flags(slave);
-				bond->current_arp_slave = NULL;
+					": %s: link status definitely "
+					"up for interface %s.\n",
+					bond->dev->name, slave->dev->name);
 
-				pr_info(DRV_NAME
-				       ": %s: backup interface %s is now up\n",
-				       bond->dev->name, slave->dev->name);
-			}
+				if (!bond->curr_active_slave ||
+				    (slave == bond->primary_slave))
+					goto do_failover;
 
-			write_unlock_bh(&bond->curr_slave_lock);
+			}
 
-			break;
+			continue;
 
 		case BOND_LINK_DOWN:
 			if (slave->link_failure_count < UINT_MAX)
 				slave->link_failure_count++;
 
 			slave->link = BOND_LINK_DOWN;
+			bond_set_slave_inactive_flags(slave);
 
-			if (slave == bond->curr_active_slave) {
-				pr_info(DRV_NAME
-				       ": %s: link status down for active "
-				       "interface %s, disabling it\n",
-				       bond->dev->name, slave->dev->name);
-
-				bond_set_slave_inactive_flags(slave);
-
-				write_lock_bh(&bond->curr_slave_lock);
-
-				bond_select_active_slave(bond);
-				if (bond->curr_active_slave)
-					bond->curr_active_slave->jiffies =
-						jiffies;
-
-				write_unlock_bh(&bond->curr_slave_lock);
+			pr_info(DRV_NAME
+				": %s: link status definitely down for "
+				"interface %s, disabling it\n",
+				bond->dev->name, slave->dev->name);
 
+			if (slave == bond->curr_active_slave) {
 				bond->current_arp_slave = NULL;
-
-			} else if (slave->state == BOND_STATE_BACKUP) {
-				pr_info(DRV_NAME
-				       ": %s: backup interface %s is now down\n",
-				       bond->dev->name, slave->dev->name);
-
-				bond_set_slave_inactive_flags(slave);
+				goto do_failover;
 			}
-			break;
+
+			continue;
 
 		default:
 			pr_err(DRV_NAME
 			       ": %s: impossible: new_link %d on slave %s\n",
 			       bond->dev->name, slave->new_link,
 			       slave->dev->name);
+			continue;
 		}
-	}
 
-	/*
-	 * No race with changes to primary via sysfs, as we hold rtnl.
-	 */
-	if (bond->primary_slave &&
-	    (bond->primary_slave != bond->curr_active_slave) &&
-	    (bond->primary_slave->link == BOND_LINK_UP)) {
+do_failover:
+		ASSERT_RTNL();
 		write_lock_bh(&bond->curr_slave_lock);
-		bond_change_active_slave(bond, bond->primary_slave);
+		bond_select_active_slave(bond);
 		write_unlock_bh(&bond->curr_slave_lock);
 	}
 
@@ -4695,7 +4680,7 @@ int bond_parse_parm(const char *buf, const struct bond_parm_tbl *tbl)
 
 static int bond_check_params(struct bond_params *params)
 {
-	int arp_validate_value, fail_over_mac_value;
+	int arp_validate_value, fail_over_mac_value, primary_passive_value;
 
 	/*
 	 * Convert string parameters.
@@ -4994,6 +4979,20 @@ static int bond_check_params(struct bond_params *params)
 		primary = NULL;
 	}
 
+	if (primary && primary_passive) {
+		primary_passive_value = bond_parse_parm(primary_passive,
+							pri_passive_tbl);
+		if (primary_passive_value == -1) {
+			pr_err(DRV_NAME
+			       ": Error: Invalid primary_passive \"%s\"\n",
+			       primary_passive ==
+					NULL ? "NULL" : primary_passive);
+			return -EINVAL;
+		}
+	} else {
+		primary_passive_value = BOND_PRI_PASSIVE_DISABLED;
+	}
+
 	if (fail_over_mac) {
 		fail_over_mac_value = bond_parse_parm(fail_over_mac,
 						      fail_over_mac_tbl);
@@ -5025,6 +5024,7 @@ static int bond_check_params(struct bond_params *params)
 	params->use_carrier = use_carrier;
 	params->lacp_fast = lacp_fast;
 	params->primary[0] = 0;
+	params->primary_passive = primary_passive_value;
 	params->fail_over_mac = fail_over_mac_value;
 
 	if (primary) {
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 6044e12..84ce933 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -1212,6 +1212,59 @@ static DEVICE_ATTR(primary, S_IRUGO | S_IWUSR,
 		   bonding_show_primary, bonding_store_primary);
 
 /*
+ * Show and set the primary_passive flag.
+ */
+static ssize_t bonding_show_primary_passive(struct device *d,
+					    struct device_attribute *attr,
+					    char *buf)
+{
+	struct bonding *bond = to_bond(d);
+
+	return sprintf(buf, "%s %d\n",
+		       pri_passive_tbl[bond->params.primary_passive].modename,
+		       bond->params.primary_passive);
+}
+
+static ssize_t bonding_store_primary_passive(struct device *d,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
+{
+	int new_value, ret = count;
+	struct bonding *bond = to_bond(d);
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	new_value = bond_parse_parm(buf, pri_passive_tbl);
+	if (new_value < 0)  {
+		pr_err(DRV_NAME
+		       ": %s: Ignoring invalid primary_passive value %.*s.\n",
+		       bond->dev->name,
+		       (int) strlen(buf) - 1, buf);
+		ret = -EINVAL;
+		goto out;
+	} else {
+		bond->params.primary_passive = new_value;
+		pr_info(DRV_NAME ": %s: setting primary_passive to %s (%d).\n",
+		       bond->dev->name, pri_passive_tbl[new_value].modename,
+		       new_value);
+		if (new_value == 0 || new_value == 1) {
+			bond->force_primary = true;
+			read_lock(&bond->lock);
+			write_lock_bh(&bond->curr_slave_lock);
+			bond_select_active_slave(bond);
+			write_unlock_bh(&bond->curr_slave_lock);
+			read_unlock(&bond->lock);
+		}
+	}
+out:
+	rtnl_unlock();
+	return ret;
+}
+static DEVICE_ATTR(primary_passive, S_IRUGO | S_IWUSR,
+		   bonding_show_primary_passive, bonding_store_primary_passive);
+
+/*
  * Show and set the use_carrier flag.
  */
 static ssize_t bonding_show_carrier(struct device *d,
@@ -1500,6 +1553,7 @@ static struct attribute *per_bond_attrs[] = {
 	&dev_attr_num_unsol_na.attr,
 	&dev_attr_miimon.attr,
 	&dev_attr_primary.attr,
+	&dev_attr_primary_passive.attr,
 	&dev_attr_use_carrier.attr,
 	&dev_attr_active_slave.attr,
 	&dev_attr_mii_status.attr,
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 6824771..9a9e0c7 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -131,6 +131,7 @@ struct bond_params {
 	int lacp_fast;
 	int ad_select;
 	char primary[IFNAMSIZ];
+	int primary_passive;
 	__be32 arp_targets[BOND_MAX_ARP_TARGETS];
 };
 
@@ -190,6 +191,7 @@ struct bonding {
 	struct   slave *curr_active_slave;
 	struct   slave *current_arp_slave;
 	struct   slave *primary_slave;
+	bool     force_primary;
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
@@ -258,6 +260,10 @@ static inline bool bond_is_lb(const struct bonding *bond)
 		|| bond->params.mode == BOND_MODE_ALB;
 }
 
+#define BOND_PRI_PASSIVE_DISABLED	0
+#define BOND_PRI_PASSIVE_BETTER		1
+#define BOND_PRI_PASSIVE_ALWAYS		2
+
 #define BOND_FOM_NONE			0
 #define BOND_FOM_ACTIVE			1
 #define BOND_FOM_FOLLOW			2
@@ -348,6 +354,7 @@ extern const struct bond_parm_tbl bond_mode_tbl[];
 extern const struct bond_parm_tbl xmit_hashtype_tbl[];
 extern const struct bond_parm_tbl arp_validate_tbl[];
 extern const struct bond_parm_tbl fail_over_mac_tbl[];
+extern const struct bond_parm_tbl pri_passive_tbl[];
 extern struct bond_parm_tbl ad_select_tbl[];
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)

^ permalink raw reply related

* iproute2 tools for 2.6.31
From: Stephen Hemminger @ 2009-09-10 19:04 UTC (permalink / raw)
  To: netdev

I am putting together release for 2.6.31 based tools.
The only open issue is how to deal with the error handling in commands
that do monitoring filtering.  Right now leaning towards the two socket
solution.

So if you have anything else that you have been waiting for,
please drop me a note.

^ permalink raw reply

* Re: [PATCH] ipv6: Add IFA_F_DADFAILED flag
From: Brian Haley @ 2009-09-10 19:14 UTC (permalink / raw)
  To: Jens Rosenboom; +Cc: david Miller, netdev@vger.kernel.org, YOSHIFUJI Hideaki
In-Reply-To: <1252599080.5980.3.camel@fnki-nb00130>

Hi Jens,

Jens Rosenboom wrote:
>> Ok, how does this look?  I changed it to set the tentative flag as it did
>> before, plus clear the dad_failed flag if the device got restarted,
>> triggering DAD to happen again for any tentative address, that was an
>> oversight on my part.
> 
> Looks fine to me so far, can you also send the patch for userspace? That
> would making testing this a bit easier. ;-)

Iproute2 patch below, I'll re-post both once you have a chance to test.

>> I'd still like to know if using this last ifa_flag is going to be an issue,
>> I actually finished a similar patch that uses a new IFA_ADDRFLAGS structure
>> to pass in/out this additional info.
> 
> IMHO you should stick to this version, if any future feature needs
> another bit, it may happen also to need two of them and so will need a
> new structure then anyway, but why not keep it simple for now?

I'll leave it for now, I might just post as an RFC to get some feedback on it.

Thanks,

-Brian


diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h
index a60c821..fd97404 100644
--- a/include/linux/if_addr.h
+++ b/include/linux/if_addr.h
@@ -41,6 +41,7 @@ enum
 
 #define	IFA_F_NODAD		0x02
 #define IFA_F_OPTIMISTIC	0x04
+#define IFA_F_DADFAILED		0x08
 #define	IFA_F_HOMEADDRESS	0x10
 #define IFA_F_DEPRECATED	0x20
 #define IFA_F_TENTATIVE		0x40
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 267ecb3..97c7a8b 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -508,6 +508,10 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
 		fprintf(fp, "dynamic ");
 	} else
 		ifa->ifa_flags &= ~IFA_F_PERMANENT;
+	if (ifa->ifa_flags&IFA_F_DADFAILED) {
+		ifa->ifa_flags &= ~IFA_F_DADFAILED;
+		fprintf(fp, "dadfailed ");
+	}
 	if (ifa->ifa_flags)
 		fprintf(fp, "flags %02x ", ifa->ifa_flags);
 	if (rta_tb[IFA_LABEL])

^ permalink raw reply related

* Re: [PATCH v2 3/3] ucc_geth: Fix hangs after switching from full to half duplex
From: Anton Vorontsov @ 2009-09-10 19:40 UTC (permalink / raw)
  To: Scott Wood; +Cc: David Miller, netdev, Andy Fleming, Timur Tabi, linuxppc-dev
In-Reply-To: <4AA93FB0.5060802@freescale.com>

On Thu, Sep 10, 2009 at 01:04:32PM -0500, Scott Wood wrote:
> Anton Vorontsov wrote:
> >MPC8360 QE UCC ethernet controllers hang when changing link duplex
> >under a load (a bit of NFS activity is enough).
> >
> >  PHY: mdio@e0102120:00 - Link is Up - 1000/Full
> >  sh-3.00# ethtool -s eth0 speed 100 duplex half autoneg off
> >  PHY: mdio@e0102120:00 - Link is Down
> >  PHY: mdio@e0102120:00 - Link is Up - 100/Half
> >  NETDEV WATCHDOG: eth0 (ucc_geth): transmit queue 0 timed out
> >  ------------[ cut here ]------------
> >  Badness at c01fcbd0 [verbose debug info unavailable]
> >  NIP: c01fcbd0 LR: c01fcbd0 CTR: c0194e44
> >  ...
> >
> >The cure is to disable the controller before changing speed/duplex
> >and enable it afterwards.
> >
> >Since ugeth_graceful_stop_{tx,rx} now may be called from an atomic
> >context, switch the two functions from msleep() to mdelay().
> 
> Ouch.

Yeah, right... delaying for 10ms with irqs off isn't good.

> Can we put this in a workqueue or something?

adjust_link() itself isn't called from an atomic context.

It's we are grabbing ugeth->lock, i.e. a spinlock. I don't see
why the lock is needed in adjust_link() in its current form,
but if we're going to disable the controller for some time,
we'll have to make sure that no start_xmit() or NAPI is running,
scheduled or will be scheduled until we say so.

I think that lock-less, and thus completely sleep-able variant
of adjust_link is doable.

Thanks,

-- 
Anton Vorontsov
email: cbouatmailru@gmail.com
irc://irc.freenode.net/bd2

^ permalink raw reply

* Re: [Linux-ATM-General] [PATCH] atm/br2684: netif_stop_queue() when atm device busy and netif_wake_queue() when we can send packets again.
From: Philip A. Prindeville @ 2009-09-10 19:49 UTC (permalink / raw)
  To: Karl Hiramoto; +Cc: netdev, linux-atm-general
In-Reply-To: <1251545092-18081-1-git-send-email-karl@hiramoto.org>

[-- Attachment #1: Type: text/plain, Size: 3504 bytes --]

I had noticed that with my Solos card, and using Qwest 7062/896kb/s
service that I was typically only getting ~400kb/s upstream, so I
thought that delayed transmitter restarts might be the culprit and
decided to try out this patch.

I'm running 2.6.27.26, and I modified the patch as below.

Has anyone confirmed this patch (Karl's) against 2.6.27?

Thanks,

-Philip


Karl Hiramoto wrote:
> This patch removes the call to dev_kfree_skb() when the atm device is busy.
> Calling dev_kfree_skb() causes heavy packet loss then the device is under
> heavy load, the more correct behavior should be to stop the upper layers,
> then when the lower device can queue packets again wake the upper layers.
>
> Signed-off-by: Karl Hiramoto <karl@hiramoto.org>
> ---
>  net/atm/br2684.c |   37 +++++++++++++++++++++++++++----------
>  1 files changed, 27 insertions(+), 10 deletions(-)
>
> diff --git a/net/atm/br2684.c b/net/atm/br2684.c
> index 2912665..5c42225 100644
> --- a/net/atm/br2684.c
> +++ b/net/atm/br2684.c
> @@ -69,7 +69,7 @@ struct br2684_vcc {
>  	struct net_device *device;
>  	/* keep old push, pop functions for chaining */
>  	void (*old_push) (struct atm_vcc * vcc, struct sk_buff * skb);
> -	/* void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); */
> +	void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb);
>  	enum br2684_encaps encaps;
>  	struct list_head brvccs;
>  #ifdef CONFIG_ATM_BR2684_IPFILTER
> @@ -142,6 +142,22 @@ static struct net_device *br2684_find_dev(const struct br2684_if_spec *s)
>  	return NULL;
>  }
>  
> +/* chained vcc->pop function.  Check if we should wake the netif_queue */
> +static void br2684_pop(struct atm_vcc *vcc, struct sk_buff *skb)
> +{
> +	struct br2684_vcc *brvcc = BR2684_VCC(vcc);
> +	struct net_device *net_dev = skb->dev;
> +
> +	pr_debug("br2684_pop(vcc %p ; net_dev %p )\n", vcc, net_dev);
> +	brvcc->old_pop(vcc, skb);
> +
> +	if (!net_dev)
> +		return;
> +
> +	if (atm_may_send(vcc, 0))
> +		netif_wake_queue(net_dev);
> +
> +}
>  /*
>   * Send a packet out a particular vcc.  Not to useful right now, but paves
>   * the way for multiple vcc's per itf.  Returns true if we can send,
> @@ -200,20 +216,19 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev,
>  
>  	ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
>  	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
> -	if (!atm_may_send(atmvcc, skb->truesize)) {
> -		/*
> -		 * We free this here for now, because we cannot know in a higher
> -		 * layer whether the skb pointer it supplied wasn't freed yet.
> -		 * Now, it always is.
> -		 */
> -		dev_kfree_skb(skb);
> -		return 0;
> -	}
>  	atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc);
>  	ATM_SKB(skb)->atm_options = atmvcc->atm_options;
>  	dev->stats.tx_packets++;
>  	dev->stats.tx_bytes += skb->len;
>  	atmvcc->send(atmvcc, skb);
> +
> +	if (!atm_may_send(atmvcc, 0)) {
> +		netif_stop_queue(brvcc->device);
> +		/*check for race with br2684_pop*/
> +		if (atm_may_send(atmvcc, 0))
> +			netif_start_queue(brvcc->device);
> +	}
> +
>  	return 1;
>  }
>  
> @@ -503,8 +518,10 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
>  	atmvcc->user_back = brvcc;
>  	brvcc->encaps = (enum br2684_encaps)be.encaps;
>  	brvcc->old_push = atmvcc->push;
> +	brvcc->old_pop = atmvcc->pop;
>  	barrier();
>  	atmvcc->push = br2684_push;
> +	atmvcc->pop = br2684_pop;
>  
>  	__skb_queue_head_init(&queue);
>  	rq = &sk_atm(atmvcc)->sk_receive_queue;
>   


[-- Attachment #2: linux-2.6.27-br2684.patch --]
[-- Type: text/plain, Size: 2396 bytes --]

--- linux-2.6.27.29-astlinux/net/atm/br2684.c.orig	2009-07-30 16:06:41.000000000 -0700
+++ linux-2.6.27.29-astlinux/net/atm/br2684.c	2009-09-10 12:42:50.000000000 -0700
@@ -69,7 +69,7 @@ struct br2684_vcc {
 	struct net_device *device;
 	/* keep old push, pop functions for chaining */
 	void (*old_push) (struct atm_vcc * vcc, struct sk_buff * skb);
-	/* void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); */
+	void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb);
 	enum br2684_encaps encaps;
 	struct list_head brvccs;
 #ifdef CONFIG_ATM_BR2684_IPFILTER
@@ -143,6 +143,22 @@ static struct net_device *br2684_find_de
 	return NULL;
 }
 
+/* chained vcc->pop function.  Check if we should wake the netif_queue */
+static void br2684_pop(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	struct br2684_vcc *brvcc = BR2684_VCC(vcc);
+	struct net_device *net_dev = skb->dev;
+
+	pr_debug("br2684_pop(vcc %p ; net_dev %p )\n", vcc, net_dev);
+	brvcc->old_pop(vcc, skb);
+
+	if (!net_dev)
+		return;
+
+	if (atm_may_send(vcc, 0))
+		netif_wake_queue(net_dev);
+
+}
 /*
  * Send a packet out a particular vcc.  Not to useful right now, but paves
  * the way for multiple vcc's per itf.  Returns true if we can send,
@@ -200,20 +216,19 @@ static int br2684_xmit_vcc(struct sk_buf
 
 	ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
 	pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
-	if (!atm_may_send(atmvcc, skb->truesize)) {
-		/*
-		 * We free this here for now, because we cannot know in a higher
-		 * layer whether the skb pointer it supplied wasn't freed yet.
-		 * Now, it always is.
-		 */
-		dev_kfree_skb(skb);
-		return 0;
-	}
 	atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc);
 	ATM_SKB(skb)->atm_options = atmvcc->atm_options;
 	brdev->stats.tx_packets++;
 	brdev->stats.tx_bytes += skb->len;
 	atmvcc->send(atmvcc, skb);
+
+	if (!atm_may_send(atmvcc, 0)) {
+		netif_stop_queue(brvcc->device);
+		/*check for race with br2684_pop*/
+		if (atm_may_send(atmvcc, 0))
+			netif_start_queue(brvcc->device);
+	}
+
 	return 1;
 }
 
@@ -509,8 +524,10 @@ static int br2684_regvcc(struct atm_vcc 
 	atmvcc->user_back = brvcc;
 	brvcc->encaps = (enum br2684_encaps)be.encaps;
 	brvcc->old_push = atmvcc->push;
+	brvcc->old_pop = atmvcc->pop;
 	barrier();
 	atmvcc->push = br2684_push;
+	atmvcc->pop = br2684_pop;
 
 	rq = &sk_atm(atmvcc)->sk_receive_queue;
 

^ permalink raw reply

* Re: UDP regression with packets rates < 10k per sec
From: Eric Dumazet @ 2009-09-10 20:37 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev
In-Reply-To: <alpine.DEB.1.10.0909091350590.32067@V090114053VZO-1>

Something must be wrong with program or whatever...

On the receiver I did this to trace the latency messages only

# tcpdump -i eth0 not host 239.0.192.2 and port 9002 -n

22:28:07.223300 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
22:28:07.223403 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32

22:28:08.223301 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
22:28:08.223416 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32

22:28:09.223380 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
22:28:09.223494 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32

22:28:10.223481 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
22:28:10.223597 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32

22:28:11.223581 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
22:28:11.223678 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32


See how the answer is *very* slow ? Something like > 100 us ?


^ permalink raw reply

* Re: [PATCH v2 3/3] ucc_geth: Fix hangs after switching from full to half duplex
From: Anton Vorontsov @ 2009-09-10 21:09 UTC (permalink / raw)
  To: Scott Wood; +Cc: David Miller, netdev, Andy Fleming, Timur Tabi, linuxppc-dev
In-Reply-To: <20090910194053.GA24363@oksana.dev.rtsoft.ru>

On Thu, Sep 10, 2009 at 11:40:53PM +0400, Anton Vorontsov wrote:
> On Thu, Sep 10, 2009 at 01:04:32PM -0500, Scott Wood wrote:
> > Anton Vorontsov wrote:
> > >MPC8360 QE UCC ethernet controllers hang when changing link duplex
> > >under a load (a bit of NFS activity is enough).
> > >
> > >  PHY: mdio@e0102120:00 - Link is Up - 1000/Full
> > >  sh-3.00# ethtool -s eth0 speed 100 duplex half autoneg off
> > >  PHY: mdio@e0102120:00 - Link is Down
> > >  PHY: mdio@e0102120:00 - Link is Up - 100/Half
> > >  NETDEV WATCHDOG: eth0 (ucc_geth): transmit queue 0 timed out
> > >  ------------[ cut here ]------------
> > >  Badness at c01fcbd0 [verbose debug info unavailable]
> > >  NIP: c01fcbd0 LR: c01fcbd0 CTR: c0194e44
> > >  ...
> > >
> > >The cure is to disable the controller before changing speed/duplex
> > >and enable it afterwards.
> > >
> > >Since ugeth_graceful_stop_{tx,rx} now may be called from an atomic
> > >context, switch the two functions from msleep() to mdelay().
> > 
> > Ouch.
> 
> Yeah, right... delaying for 10ms with irqs off isn't good.
> 
> > Can we put this in a workqueue or something?
> 
> adjust_link() itself isn't called from an atomic context.

Oops. I though that phylib calls us from a workqueue, not a timer. Hm...

Will be a little bit more work..

-- 
Anton Vorontsov
email: cbouatmailru@gmail.com
irc://irc.freenode.net/bd2

^ permalink raw reply

* Re: [Linux-ATM-General] [PATCH] atm/br2684: netif_stop_queue() when atm device busy and netif_wake_queue() when we can send packets again.
From: Karl Hiramoto @ 2009-09-10 21:30 UTC (permalink / raw)
  To: Philip A. Prindeville; +Cc: netdev, linux-atm-general
In-Reply-To: <4AA95838.4010007@redfish-solutions.com>

Philip A. Prindeville wrote:
> I had noticed that with my Solos card, and using Qwest 7062/896kb/s
> service that I was typically only getting ~400kb/s upstream, so I
> thought that delayed transmitter restarts might be the culprit and
> decided to try out this patch.
>
> I'm running 2.6.27.26, and I modified the patch as below.
>
> Has anyone confirmed this patch (Karl's) against 2.6.27?
>
> Thanks,
>
> -Philip
I'd be interested in hearing comparisons with/without the patch.    What 
i have is no upstream packet loss with this patch, however slightly 
lower total throughput.  I think because the upper networking layers 
take time to restart the packet flow.   I'm not really sure if or how 
many packets to upper layers buffer.  I haven't had time to debug it 
further.

I don't have a solos card, but you may have to tweak the solos driver 
sk_sndbuf  value.

--
Karl

^ permalink raw reply

* Re: UDP regression with packets rates < 10k per sec
From: Christoph Lameter @ 2009-09-10 21:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <4AA963A4.5080509@gmail.com>

On Thu, 10 Sep 2009, Eric Dumazet wrote:

> On the receiver I did this to trace the latency messages only
>
> # tcpdump -i eth0 not host 239.0.192.2 and port 9002 -n

You do not show the multicast traffic. The latency timestamps requests are
send unicast and at larger intervals. About one per second per mc channel.


^ permalink raw reply

* Re: UDP regression with packets rates < 10k per sec
From: Eric Dumazet @ 2009-09-10 21:37 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev
In-Reply-To: <4AA963A4.5080509@gmail.com>

Eric Dumazet a écrit :
> Something must be wrong with program or whatever...
> 
> On the receiver I did this to trace the latency messages only
> 
> # tcpdump -i eth0 not host 239.0.192.2 and port 9002 -n
> 
> 22:28:07.223300 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
> 22:28:07.223403 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32
> 
> 22:28:08.223301 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
> 22:28:08.223416 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32
> 
> 22:28:09.223380 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
> 22:28:09.223494 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32
> 
> 22:28:10.223481 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
> 22:28:10.223597 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32
> 
> 22:28:11.223581 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
> 22:28:11.223678 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32
> 
> 
> See how the answer is *very* slow ? Something like > 100 us ?
> 

Never mind, this is OK, after proper irq affinity and tweaks on receiver :
(I should do same on other machine as well...)

(echo 1 >/proc/irq/default_smp_affinity) before up-ing bnx2 NIC

echo performance >/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor

and idle=mwait  (or hpet and C2/C3 states are used on this machine and 
wakeups are slow)

# tcpdump -i eth0 not host 239.0.192.2 and port 9002 -n
23:25:02.137143 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
23:25:02.137212 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32

23:25:03.137204 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
23:25:03.137288 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32

23:25:04.137276 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
23:25:04.137355 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32


I presume powersaving improvements in last kernels are against
latencies goals...


^ permalink raw reply

* Re: UDP regression with packets rates < 10k per sec
From: Christoph Lameter @ 2009-09-10 21:39 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <4AA963A4.5080509@gmail.com>

On Thu, 10 Sep 2009, Eric Dumazet wrote:

> 22:28:11.223581 IP 55.225.18.7.9002 > 55.225.18.5.9002: UDP, length 32
> 22:28:11.223678 IP 55.225.18.5.9002 > 55.225.18.7.9002: UDP, length 32
>
> See how the answer is *very* slow ? Something like > 100 us ?

If you run tcpdump then you create some overhead I guess. 100 usec should
show up as latencies > 50usec in the report.

The latency request includes the timestamp from the source. So the
destination can take another timestamp at receive time. The sending back
of the reply to the timestamp request is not critical at all. The receiver
will just register the time differential established by the sender.


^ permalink raw reply

* Re: UDP regression with packets rates < 10k per sec
From: Christoph Lameter @ 2009-09-10 21:42 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <4AA97183.3030008@gmail.com>

On Thu, 10 Sep 2009, Eric Dumazet wrote:

> and idle=mwait  (or hpet and C2/C3 states are used on this machine and
> wakeups are slow)

We disabled C states here.

^ permalink raw reply

* [PATCH v3 3/3] ucc_geth: Fix hangs after switching from full to half duplex
From: Anton Vorontsov @ 2009-09-10 21:48 UTC (permalink / raw)
  To: Scott Wood; +Cc: David Miller, netdev, Andy Fleming, Timur Tabi, linuxppc-dev
In-Reply-To: <20090910210935.GA26037@oksana.dev.rtsoft.ru>

MPC8360 QE UCC ethernet controllers hang when changing link duplex
under a load (a bit of NFS activity is enough).

  PHY: mdio@e0102120:00 - Link is Up - 1000/Full
  sh-3.00# ethtool -s eth0 speed 100 duplex half autoneg off
  PHY: mdio@e0102120:00 - Link is Down
  PHY: mdio@e0102120:00 - Link is Up - 100/Half
  NETDEV WATCHDOG: eth0 (ucc_geth): transmit queue 0 timed out
  ------------[ cut here ]------------
  Badness at c01fcbd0 [verbose debug info unavailable]
  NIP: c01fcbd0 LR: c01fcbd0 CTR: c0194e44
  ...

The cure is to disable the controller before changing speed/duplex
and enable it afterwards.

Though, disabling the controller might take quite a while, so we
better not grab any spinlocks in adjust_link(). Instead, we quiesce
the driver's activity, and only then disable the controller.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
---

On Fri, Sep 11, 2009 at 01:09:36AM +0400, Anton Vorontsov wrote:
> On Thu, Sep 10, 2009 at 11:40:53PM +0400, Anton Vorontsov wrote:
> > On Thu, Sep 10, 2009 at 01:04:32PM -0500, Scott Wood wrote:
> > > Anton Vorontsov wrote:
> > > >MPC8360 QE UCC ethernet controllers hang when changing link duplex
> > > >under a load (a bit of NFS activity is enough).
> > > >
> > > >  PHY: mdio@e0102120:00 - Link is Up - 1000/Full
> > > >  sh-3.00# ethtool -s eth0 speed 100 duplex half autoneg off
> > > >  PHY: mdio@e0102120:00 - Link is Down
> > > >  PHY: mdio@e0102120:00 - Link is Up - 100/Half
> > > >  NETDEV WATCHDOG: eth0 (ucc_geth): transmit queue 0 timed out
> > > >  ------------[ cut here ]------------
> > > >  Badness at c01fcbd0 [verbose debug info unavailable]
> > > >  NIP: c01fcbd0 LR: c01fcbd0 CTR: c0194e44
> > > >  ...
> > > >
> > > >The cure is to disable the controller before changing speed/duplex
> > > >and enable it afterwards.
> > > >
> > > >Since ugeth_graceful_stop_{tx,rx} now may be called from an atomic
> > > >context, switch the two functions from msleep() to mdelay().
> > > 
> > > Ouch.
> > 
> > Yeah, right... delaying for 10ms with irqs off isn't good.
> > 
> > > Can we put this in a workqueue or something?
> > 
> > adjust_link() itself isn't called from an atomic context.
> 
> Oops. I though that phylib calls us from a workqueue, not a timer. Hm...
> 
> Will be a little bit more work..

Ignore me. I'm working on two kernel versions in parallel (2.6.21 and
mainline), and it's 2.6.21 where phylib uses a timer. Mainline is OK.

How about this patch?

 drivers/net/ucc_geth.c |   36 +++++++++++++++++++++++++++++++-----
 1 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
index 2a2c973..232fef9 100644
--- a/drivers/net/ucc_geth.c
+++ b/drivers/net/ucc_geth.c
@@ -1560,6 +1560,25 @@ static int ugeth_disable(struct ucc_geth_private *ugeth, enum comm_dir mode)
 	return 0;
 }
 
+static void ugeth_quiesce(struct ucc_geth_private *ugeth)
+{
+	/* Wait for and prevent any further xmits. */
+	netif_tx_disable(ugeth->ndev);
+
+	/* Disable the interrupt to avoid NAPI rescheduling. */
+	disable_irq(ugeth->ug_info->uf_info.irq);
+
+	/* Stop NAPI, and possibly wait for its completion. */
+	napi_disable(&ugeth->napi);
+}
+
+static void ugeth_activate(struct ucc_geth_private *ugeth)
+{
+	napi_enable(&ugeth->napi);
+	enable_irq(ugeth->ug_info->uf_info.irq);
+	netif_tx_wake_all_queues(ugeth->ndev);
+}
+
 /* Called every time the controller might need to be made
  * aware of new link state.  The PHY code conveys this
  * information through variables in the ugeth structure, and this
@@ -1573,14 +1592,11 @@ static void adjust_link(struct net_device *dev)
 	struct ucc_geth __iomem *ug_regs;
 	struct ucc_fast __iomem *uf_regs;
 	struct phy_device *phydev = ugeth->phydev;
-	unsigned long flags;
 	int new_state = 0;
 
 	ug_regs = ugeth->ug_regs;
 	uf_regs = ugeth->uccf->uf_regs;
 
-	spin_lock_irqsave(&ugeth->lock, flags);
-
 	if (phydev->link) {
 		u32 tempval = in_be32(&ug_regs->maccfg2);
 		u32 upsmr = in_be32(&uf_regs->upsmr);
@@ -1631,9 +1647,21 @@ static void adjust_link(struct net_device *dev)
 			ugeth->oldspeed = phydev->speed;
 		}
 
+		/*
+		 * To change the MAC configuration we need to disable the
+		 * controller. To do so, we have to either grab ugeth->lock,
+		 * which is a bad idea since 'graceful stop' commands might
+		 * take quite a while, or we can quiesce driver's activity.
+		 */
+		ugeth_quiesce(ugeth);
+		ugeth_disable(ugeth, COMM_DIR_RX_AND_TX);
+
 		out_be32(&ug_regs->maccfg2, tempval);
 		out_be32(&uf_regs->upsmr, upsmr);
 
+		ugeth_enable(ugeth, COMM_DIR_RX_AND_TX);
+		ugeth_activate(ugeth);
+
 		if (!ugeth->oldlink) {
 			new_state = 1;
 			ugeth->oldlink = 1;
@@ -1647,8 +1675,6 @@ static void adjust_link(struct net_device *dev)
 
 	if (new_state && netif_msg_link(ugeth))
 		phy_print_status(phydev);
-
-	spin_unlock_irqrestore(&ugeth->lock, flags);
 }
 
 /* Initialize TBI PHY interface for communicating with the
-- 
1.6.3.3


^ permalink raw reply related

* [PATCH] net: force bridge module(s) to be GPL
From: Stephen Hemminger @ 2009-09-10 21:51 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel

The only valid usage for the bridge frame hooks are by a
GPL components (such as the bridge module).
The kernel should not leave a crack in the door for proprietary
networking stacks to slip in.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/core/dev.c	2009-09-10 14:27:59.076074483 -0700
+++ b/net/core/dev.c	2009-09-10 14:28:43.659314263 -0700
@@ -2116,7 +2116,7 @@ static inline int deliver_skb(struct sk_
 /* This hook is defined here for ATM LANE */
 int (*br_fdb_test_addr_hook)(struct net_device *dev,
 			     unsigned char *addr) __read_mostly;
-EXPORT_SYMBOL(br_fdb_test_addr_hook);
+EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
 /*
@@ -2125,7 +2125,7 @@ EXPORT_SYMBOL(br_fdb_test_addr_hook);
  */
 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
 					struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL(br_handle_frame_hook);
+EXPORT_SYMBOL_GPL(br_handle_frame_hook);
 
 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
 					    struct packet_type **pt_prev, int *ret,

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox