Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 08/15] ipv4: Kill routes during PMTU/redirect updates.
From: Joe Perches @ 2012-07-18 19:15 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120718.112356.1409220904008377845.davem@davemloft.net>

On Wed, 2012-07-18 at 11:23 -0700, David Miller wrote:
> Mark them obsolete so there will be a re-lookup to fetch the
> FIB nexthop exception info.
[]
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
[]
> @@ -716,8 +717,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
>  					fnhe->fnhe_gw = new_gw;
>  				spin_unlock_bh(&fnhe_lock);
>  			}
> -			rt->rt_gateway = new_gw;
> -			rt->rt_flags |= RTCF_REDIRECTED;
> +			if (kill_route)
> +				rt->dst.obsolete = -2;

Perhaps -2 should be a #define?

Perhaps struct dst_entry.obsolete could be a char instead of
a short and a pad byte could be added for some future use.

Maybe:

 include/net/dst.h |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 5161046..6c40490 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -65,7 +65,8 @@ struct dst_entry {
 	unsigned short		pending_confirm;
 
 	short			error;
-	short			obsolete;
+	char			obsolete;
+	char			__pad3;
 	unsigned short		header_len;	/* more space at head required */
 	unsigned short		trailer_len;	/* space to reserve at tail */
 #ifdef CONFIG_IP_ROUTE_CLASSID

^ permalink raw reply related

* Re: [PATCH 0/3 v2] net: various tilegx networking fixes
From: Chris Metcalf @ 2012-07-18 19:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20120718.113623.984635805289135415.davem@davemloft.net>

On 7/18/2012 2:36 PM, David Miller wrote:
> From: Chris Metcalf <cmetcalf@tilera.com>
> Date: Sun, 1 Jul 2012 14:43:47 -0400
>
>> The tree is at:
>>
>>   git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile.git net
>>
>> Chris Metcalf (3):
>>       net: tilegx driver bugfix (be explicit about percpu queue number)
>>       tilegx net driver: handle payload data not in frags
>>       tilegx net: use eth_hw_addr_random(), not random_ether_addr()
> These changes look fine, but when I pull from your tree I get tons of
> totally unrelated stuff and a merge conflict in this driver.
>
> Can you put together a clean pull against net-next?

The merge conflict was against Joe Perches' bombing of random_ether_addr()
to eth_random_addr().  I left in my change to convert that again to be
eth_hw_addr_random(), which naively seems like a better API, and sets
NET_ADDR_RANDOM, which is presumably a good thing.

I recreated the tree to be branched off of net-next. (I had originally
created it off of Linus's tree, which in retrospect doesn't make much
sense.)  Please try to pull again - thanks!

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com

^ permalink raw reply

* Re: [PATCH 10/15] ipv4: Cache input routes in fib_info nexthops.
From: Joe Perches @ 2012-07-18 19:27 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120718.112413.1969496621247659288.davem@davemloft.net>

On Wed, 2012-07-18 at 11:24 -0700, David Miller wrote:
> Caching input routes is slightly simpler than output routes, since we
> don't need to be concerned with nexthop exceptions.  (locally
> destined, and routed packets, never trigger PMTU events or redirects
> that will be processed by us).
[]
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
[]
> @@ -1355,11 +1357,11 @@ static int __mkroute_input(struct sk_buff *skb,
[]
> +	do_cache = false;
> +	if (res->fi) {
> +		if (!(flags & RTCF_DIRECTSRC) && !itag) {
> +			rth = FIB_RES_NH(*res).nh_rth_input;
> +			if (rth) {
> +				dst_use(&rth->dst, jiffies);
> +				goto out;
> +			}
> +			do_cache = true;
> +		}
> +	}
[]
> @@ -1568,8 +1580,20 @@ brd_input:
[]
> +	do_cache = false;
> +	if (res.fi) {
> +		if (!(flags & RTCF_DIRECTSRC) && !itag) {
> +			rth = FIB_RES_NH(res).nh_rth_input;
> +			if (rth) {
> +				dst_use(&rth->dst, jiffies);
> +				goto set_and_out;
> +			}
> +			do_cache = true;
> +		}
> +	}

Maybe a helper like:

	if (some_do_cache_name(rth, res, itag, flags, &do_cache))
		goto foo;

^ permalink raw reply

* [PATCH] cxgb3: Set vlan_feature on net_device
From: brenohl @ 2012-07-18 19:29 UTC (permalink / raw)
  To: divy; +Cc: netdev, Breno Leitao

cxgb3 interface has a bad performance when VLAN is set. On my current
setup, a PowerLinux 7R2, I am able to get around 7 Gbps on a TCP_STREAM
(8 instances, 4k message).
With this patch, I am able to reach 9.5 Gbps.

Signed-off-by: Breno Leitao <brenohl@br.ibm.com>

diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index abb6ce7..fcf4b31 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -3173,6 +3173,9 @@ static void __devinit cxgb3_init_iscsi_mac(struct net_device *dev)
 	pi->iscsic.mac_addr[3] |= 0x80;
 }
 
+#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
+#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \
+			NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA)
 static int __devinit init_one(struct pci_dev *pdev,
 			      const struct pci_device_id *ent)
 {
@@ -3293,6 +3296,7 @@ static int __devinit init_one(struct pci_dev *pdev,
 		netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM |
 			NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_RX;
 		netdev->features |= netdev->hw_features | NETIF_F_HW_VLAN_TX;
+		netdev->vlan_features |= netdev->features & VLAN_FEAT;
 		if (pci_using_dac)
 			netdev->features |= NETIF_F_HIGHDMA;
 
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH 08/15] ipv4: Kill routes during PMTU/redirect updates.
From: David Miller @ 2012-07-18 19:30 UTC (permalink / raw)
  To: joe; +Cc: netdev
In-Reply-To: <1342638944.2013.10.camel@joe2Laptop>

From: Joe Perches <joe@perches.com>
Date: Wed, 18 Jul 2012 12:15:44 -0700

> On Wed, 2012-07-18 at 11:23 -0700, David Miller wrote:
>> Mark them obsolete so there will be a re-lookup to fetch the
>> FIB nexthop exception info.
> []
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> []
>> @@ -716,8 +717,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
>>  					fnhe->fnhe_gw = new_gw;
>>  				spin_unlock_bh(&fnhe_lock);
>>  			}
>> -			rt->rt_gateway = new_gw;
>> -			rt->rt_flags |= RTCF_REDIRECTED;
>> +			if (kill_route)
>> +				rt->dst.obsolete = -2;
> 
> Perhaps -2 should be a #define?
> 
> Perhaps struct dst_entry.obsolete could be a char instead of
> a short and a pad byte could be added for some future use.

First thing, char is not signed by default on all systems :-)

But yes, this should be cleaned up.  Also with a big fat comment above
the struct member detailing its usage.

I'll put this on my TODO list, thanks a lot Joe.

^ permalink raw reply

* Re: [PATCH 10/15] ipv4: Cache input routes in fib_info nexthops.
From: David Miller @ 2012-07-18 19:30 UTC (permalink / raw)
  To: joe; +Cc: netdev
In-Reply-To: <1342639675.2013.18.camel@joe2Laptop>

From: Joe Perches <joe@perches.com>
Date: Wed, 18 Jul 2012 12:27:55 -0700

> Maybe a helper like:
> 
> 	if (some_do_cache_name(rth, res, itag, flags, &do_cache))
> 		goto foo;

Yep that would make a lot of sense.

^ permalink raw reply

* Re: r8169: link up, link down
From: Francois Romieu @ 2012-07-18 19:23 UTC (permalink / raw)
  To: J. Christopher Pereira; +Cc: netdev
In-Reply-To: <038001cd6512$613125c0$23937140$@cl>

J. Christopher Pereira <kripper@imatronix.cl> :
[...]
> dmesg says "eth0: RTL8110s at 0xffffc2000067ec00, 00:4f:4a:10:1e:cf, XID
> 04000000 IRQ 16".

It's an old chipset (RTL_GIGA_MAC_VER_03). The PCI remove / rescan trick
may or may not work.

> > Building a modern kernel is strongly suggested if the hardware includes a
> recent 816x chipset.
> 
> Is there any particular patch I could apply and just recompile the driver?

Your kernel is more than three years old. It is not _that_ long but there
are ~182 r8169 patches between v2.6.30 and v3.4. There will still be a lot
even after the trivial ones are factored out.

See git log -p v2.6.30..v3.4 --follow -- drivers/net/ethernet/realtek/r8169.c
for the whole gore.

> My hope was to first receive feedback and identify some probably related
> known bug, in order to avoid searching for a solution by trial and error or
> by updating the whole environment.

You should really try 3.4 and revert 036dafa28da1e2565a8529de2ae663c37b7a0060.
It's the best I can suggest.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH 08/15] ipv4: Kill routes during PMTU/redirect updates.
From: Joe Perches @ 2012-07-18 19:51 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120718.123015.476222169838022819.davem@davemloft.net>

On Wed, 2012-07-18 at 12:30 -0700, David Miller wrote:
> From: Joe Perches <joe@perches.com>
> > Perhaps struct dst_entry.obsolete could be a char instead of
> > a short and a pad byte could be added for some future use.
> 
> First thing, char is not signed by default on all systems :-)

yeah, yeah. I'm sure you'll dtrt :)

^ permalink raw reply

* Re: [PATCH 1/7] net-tcp: Fast Open base
From: Eric Dumazet @ 2012-07-18 19:55 UTC (permalink / raw)
  To: David Miller; +Cc: ycheng, hkchu, edumazet, ncardwell, sivasankar, netdev
In-Reply-To: <20120716.231644.1189536600250332545.davem@davemloft.net>

On Mon, 2012-07-16 at 23:16 -0700, David Miller wrote:
> From: Yuchung Cheng <ycheng@google.com>
> Date: Mon, 16 Jul 2012 14:16:44 -0700
> 
> > +#define TCPOPT_EXP		254	/* Experimental */
> > +/* Magic number to be after the option value for sharing TCP
> > + * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
> > + */
> > +#define TCPOPT_FASTOPEN_MAGIC	0xF989
> 
> If I apply this, we're stuck supporting this experimental number
> forever.
> 
> Because somewhere, someone will have a kernel running using this
> number, so we have to support this option value as well as whatever
> the official one is.
> 
> Therefore I think the only logical thing we can do is only deploy
> this once an official option number is chosen.

Hi David

This is a chicken and egg problem.

IANA won't grant an official number like that in 2012+. Maybe if billions
of Android/Linux devices use TFO in 2015 IANA will grant an official
number.

So we chose to follow Joe Touch's proposal
(http://tools.ietf.org/html/draft-ietf-tcpm-experimental-options-01), and
the magic 0xF989 was generated according to section 3 to avoid possible
clashes with other experimental options using code option 254.

(Code options 253 & 254 are reserved for experimental use.
Linux Cookie extension uses 253 without a magic cookie so 253 cannot be
shared. By the way I wonder if anybody uses it... oh well...) 

Only servers will need to cope with this experimental option plus the
official one (_if_ IANA accepts to unblock one of the many reserved
options, in two or three years)

Yuchung only posted the Client side in this patch series. But we already
run the server side, and supporting the official TFO option plus the
experimental one is adding less than 10 lines of code.

So the plan would be :

1) Use the experimental 254 + magic on TFO Clients/Servers in 2012

2) When/If IANA grants an official number, add its support to servers
   (keeping support for experimental option as well)

3) One/two years later, switch client side to use this official number

4) Ten years later, remove experimental from server side.

Thanks !

PS :

TFO is not mandatory : If the initial SYN TFO option is not understood
by a server, it will reply with a SYN/ACK without the option and cookie,
and client will proceed as today.

^ permalink raw reply

* Re: [PATCH] SUNRPC: Prevent kernel stack corruption on long values of flush
From: Jim Rees @ 2012-07-18 20:00 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Sasha Levin, Trond.Myklebust, davem, davej, linux-nfs, netdev,
	linux-kernel
In-Reply-To: <20120718173913.GA1298@fieldses.org>

J. Bruce Fields wrote:

  On Tue, Jul 17, 2012 at 12:01:26AM +0200, Sasha Levin wrote:
  > The buffer size in read_flush() is too small for the longest possible values
  > for it. This can lead to a kernel stack corruption:
  
  Thanks!
  
  > 
  > diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
  > index 2afd2a8..f86d95e 100644
  > --- a/net/sunrpc/cache.c
  > +++ b/net/sunrpc/cache.c
  > @@ -1409,11 +1409,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
  >  			  size_t count, loff_t *ppos,
  >  			  struct cache_detail *cd)
  >  {
  > -	char tbuf[20];
  > +	char tbuf[22];
  
  I wonder how common this sort of calculation is in the kernel?  It might
  provide some peace of mind to be able to write this something like
  
  	char tbuf[MAXLEN_BASE10_UL + 2]  /* + 2 for final "\n\0" */

You could use something like:

    char tbuf[sizeof (unsigned long) * 24 / 10 + 1 + 2]; /* + 2 for final "\n\0" */

since there are roughly 10 bits for every 3 decimal digits.

But I'm obviously confused, because I don't understand why tbuf needs to be
any more than 10 + 2.

^ permalink raw reply

* [PATCH v3] ipv4: use seqlock for nh_exceptions
From: Julian Anastasov @ 2012-07-18 20:15 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

	Use global seqlock for the nh_exceptions. Call
fnhe_oldest with the right hash chain. Correct the diff
value for dst_set_expires.

v2: after suggestions from Eric Dumazet:
* get rid of spin lock fnhe_lock, rearrange update_or_create_fnhe
* continue daddr search in rt_bind_exception

v3:
* remove the daddr check before seqlock in rt_bind_exception
* restart lookup in rt_bind_exception on detected seqlock change,
as suggested by David Miller

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 include/net/ip_fib.h |    2 +-
 net/ipv4/route.c     |  118 +++++++++++++++++++++++++++++---------------------
 2 files changed, 69 insertions(+), 51 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e9ee1ca..2daf096 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -51,7 +51,7 @@ struct fib_nh_exception {
 	struct fib_nh_exception __rcu	*fnhe_next;
 	__be32				fnhe_daddr;
 	u32				fnhe_pmtu;
-	u32				fnhe_gw;
+	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
 	unsigned long			fnhe_stamp;
 };
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f67e702..e9802d8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1333,9 +1333,9 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 		build_sk_flow_key(fl4, sk);
 }
 
-static DEFINE_SPINLOCK(fnhe_lock);
+static DEFINE_SEQLOCK(fnhe_seqlock);
 
-static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
+static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 {
 	struct fib_nh_exception *fnhe, *oldest;
 
@@ -1358,47 +1358,63 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 	return hval & (FNHE_HASH_SIZE - 1);
 }
 
-static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
+static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
+				  u32 pmtu, unsigned long expires)
 {
-	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
+	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
 	int depth;
-	u32 hval;
+	u32 hval = fnhe_hashfun(daddr);
+
+	write_seqlock_bh(&fnhe_seqlock);
 
+	hash = nh->nh_exceptions;
 	if (!hash) {
-		hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
-						   GFP_ATOMIC);
+		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 		if (!hash)
-			return NULL;
+			goto out_unlock;
+		nh->nh_exceptions = hash;
 	}
 
-	hval = fnhe_hashfun(daddr);
 	hash += hval;
 
 	depth = 0;
 	for (fnhe = rcu_dereference(hash->chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 		if (fnhe->fnhe_daddr == daddr)
-			goto out;
+			break;
 		depth++;
 	}
 
-	if (depth > FNHE_RECLAIM_DEPTH) {
-		fnhe = fnhe_oldest(hash + hval, daddr);
-		goto out_daddr;
+	if (fnhe) {
+		if (gw)
+			fnhe->fnhe_gw = gw;
+		if (pmtu) {
+			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_expires = expires;
+		}
+	} else {
+		if (depth > FNHE_RECLAIM_DEPTH)
+			fnhe = fnhe_oldest(hash);
+		else {
+			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+			if (!fnhe)
+				goto out_unlock;
+
+			fnhe->fnhe_next = hash->chain;
+			rcu_assign_pointer(hash->chain, fnhe);
+		}
+		fnhe->fnhe_daddr = daddr;
+		fnhe->fnhe_gw = gw;
+		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_expires = expires;
 	}
-	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
-	if (!fnhe)
-		return NULL;
-
-	fnhe->fnhe_next = hash->chain;
-	rcu_assign_pointer(hash->chain, fnhe);
 
-out_daddr:
-	fnhe->fnhe_daddr = daddr;
-out:
 	fnhe->fnhe_stamp = jiffies;
-	return fnhe;
+
+out_unlock:
+	write_sequnlock_bh(&fnhe_seqlock);
+	return;
 }
 
 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
@@ -1452,13 +1468,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 		} else {
 			if (fib_lookup(net, fl4, &res) == 0) {
 				struct fib_nh *nh = &FIB_RES_NH(res);
-				struct fib_nh_exception *fnhe;
 
-				spin_lock_bh(&fnhe_lock);
-				fnhe = find_or_create_fnhe(nh, fl4->daddr);
-				if (fnhe)
-					fnhe->fnhe_gw = new_gw;
-				spin_unlock_bh(&fnhe_lock);
+				update_or_create_fnhe(nh, fl4->daddr, new_gw,
+						      0, 0);
 			}
 			rt->rt_gateway = new_gw;
 			rt->rt_flags |= RTCF_REDIRECTED;
@@ -1663,15 +1675,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
-		struct fib_nh_exception *fnhe;
 
-		spin_lock_bh(&fnhe_lock);
-		fnhe = find_or_create_fnhe(nh, fl4->daddr);
-		if (fnhe) {
-			fnhe->fnhe_pmtu = mtu;
-			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
-		}
-		spin_unlock_bh(&fnhe_lock);
+		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+				      jiffies + ip_rt_mtu_expires);
 	}
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
@@ -1902,23 +1908,35 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr
 
 	hval = fnhe_hashfun(daddr);
 
+restart:
 	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
-		if (fnhe->fnhe_daddr == daddr) {
-			if (fnhe->fnhe_pmtu) {
-				unsigned long expires = fnhe->fnhe_expires;
-				unsigned long diff = jiffies - expires;
-
-				if (time_before(jiffies, expires)) {
-					rt->rt_pmtu = fnhe->fnhe_pmtu;
-					dst_set_expires(&rt->dst, diff);
-				}
+		__be32 fnhe_daddr, gw;
+		u32 pmtu;
+		unsigned long expires;
+		unsigned int seq;
+
+		seq = read_seqbegin(&fnhe_seqlock);
+		fnhe_daddr = fnhe->fnhe_daddr;
+		gw = fnhe->fnhe_gw;
+		pmtu = fnhe->fnhe_pmtu;
+		expires = fnhe->fnhe_expires;
+		if (read_seqretry(&fnhe_seqlock, seq))
+			goto restart;
+		if (daddr != fnhe_daddr)
+			continue;
+		if (pmtu) {
+			unsigned long diff = expires - jiffies;
+
+			if (time_before(jiffies, expires)) {
+				rt->rt_pmtu = pmtu;
+				dst_set_expires(&rt->dst, diff);
 			}
-			if (fnhe->fnhe_gw)
-				rt->rt_gateway = fnhe->fnhe_gw;
-			fnhe->fnhe_stamp = jiffies;
-			break;
 		}
+		if (gw)
+			rt->rt_gateway = gw;
+		fnhe->fnhe_stamp = jiffies;
+		break;
 	}
 }
 
-- 
1.7.3.4

^ permalink raw reply related

* Re: [PATCH v2] net: cgroup: null ptr dereference in netprio cgroup during init
From: Neil Horman @ 2012-07-18 20:10 UTC (permalink / raw)
  To: John Fastabend; +Cc: davem, gaofeng, mark.d.rustad, netdev, eric.dumazet
In-Reply-To: <20120718183408.27037.16130.stgit@jf-dev1-dcblab>

On Wed, Jul 18, 2012 at 11:34:09AM -0700, John Fastabend wrote:
> When the netprio cgroup is built in the kernel cgroup_init will call
> cgrp_create which eventually calls update_netdev_tables. This is
> being called before do_initcalls() so a null ptr dereference occurs
> on init_net.
> 
> This patch adds a check on init_net.count to verify the structure
> has been initialized. The failure was introduced here,
> 
> commit ef209f15980360f6945873df3cd710c5f62f2a3e
> Author: Gao feng <gaofeng@cn.fujitsu.com>
> Date:   Wed Jul 11 21:50:15 2012 +0000
> 
>     net: cgroup: fix access the unallocated memory in netprio cgroup
> 
> Tested with ping with netprio_cgroup as a module and built in.
> 
> [    0.256451] Initializing cgroup subsys net_prio
> [    0.269948] BUG: unable to handle kernel NULL pointer dereference at
> 0000000000000698
> [    0.293303] IP: [<ffffffff81512e37>] cgrp_create+0x107/0x1c0
> [    0.310175] PGD 0
> [    0.316157] Oops: 0000 [#1] SMP
> [    0.325775] CPU 0
> [    0.331227] Modules linked in:
> [    0.340846]
> [    0.345264] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc7+ #1 AMD Dinar/Dinar
> [    0.366555] RIP: 0010:[<ffffffff81512e37>]  [<ffffffff81512e37>]
> cgrp_create+0x107/0x1c0
> [    0.390681] RSP: 0000:ffffffff81c01ea8  EFLAGS: 00010213
> [    0.406501] RAX: 0000000000000000 RBX: ffffffffffffff10 RCX: 0000000000000000
> [    0.427764] RDX: 0000000000000000 RSI: 0000000000000246 RDI: ffffffff81c9d840
> [    0.449026] RBP: ffffffff81c01ed8 R08: 00000000000164e0 R09: 0000000000000000
> [    0.470289] R10: ffff8804278303c0 R11: 0000000000000000 R12: 0000000000000001
> [    0.491553] R13: ffff8804278303c0 R14: ffff881036fd0700 R15: 0000000000000000
> [    0.512819] FS:  0000000000000000(0000) GS:ffff880427c00000(0000)
> knlGS:0000000000000000
> [    0.536932] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [    0.554049] CR2: 0000000000000698 CR3: 0000000001c0b000 CR4: 00000000000406b0
> [    0.575311] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [    0.596574] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [    0.617838] Process swapper/0 (pid: 0, threadinfo ffffffff81c00000, task
> ffffffff81c13420)
> [    0.642471] Stack:
> [    0.648442]  ffffffff81c01eb8 ffffffff81c9f320 ffffffff81c9f320
> ffffffff81c9f320
> [    0.670522]  ffffffff81c9f320 ffffffff81d482c0 ffffffff81c01ef8
> ffffffff81d10397
> [    0.692604]  ffffffff81e99790 0000000000000048 ffffffff81c01f18
> ffffffff81d1062e
> [    0.714687] Call Trace:
> [    0.721960]  [<ffffffff81d10397>] cgroup_init_subsys+0x51/0xdf
> [    0.739337]  [<ffffffff81d1062e>] cgroup_init+0x36/0x119
> [    0.755160]  [<ffffffff81cf5c02>] start_kernel+0x38f/0x3c4
> [    0.771501]  [<ffffffff81cf5672>] ? repair_env_string+0x5e/0x5e
> [    0.789138]  [<ffffffff81cf5356>] x86_64_start_reservations+0x131/0x135
> [    0.808849]  [<ffffffff81cf545a>] x86_64_start_kernel+0x100/0x10f
> 
> 
> Reported-by: Mark Rustad <mark.d.rustad@intel.com>
> Cc: Neil Horman <nhorman@tuxdriver.com>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Gao feng <gaofeng@cn.fujitsu.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
> 
>  net/core/net_namespace.c  |    4 +++-
>  net/core/netprio_cgroup.c |    3 +++
>  2 files changed, 6 insertions(+), 1 deletions(-)
> 
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index dddbacb..faa33bb 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -27,7 +27,9 @@ static DEFINE_MUTEX(net_mutex);
>  LIST_HEAD(net_namespace_list);
>  EXPORT_SYMBOL_GPL(net_namespace_list);
>  
> -struct net init_net;
> +struct net init_net = {
> +	.count = ATOMIC_INIT(0),
> +};
>  EXPORT_SYMBOL(init_net);
>  
>  #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
> diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
> index b2e9caa..e9fd7fd 100644
> --- a/net/core/netprio_cgroup.c
> +++ b/net/core/netprio_cgroup.c
> @@ -116,6 +116,9 @@ static int update_netdev_tables(void)
>  	u32 max_len;
>  	struct netprio_map *map;
>  
> +	if (!atomic_read(&init_net.count))
> +		return ret;
> +
>  	rtnl_lock();
>  	max_len = atomic_read(&max_prioidx) + 1;
>  	for_each_netdev(&init_net, dev) {
> 
> 
Acked-by: Neil Horman <nhorman@tuxdriver.com>

^ permalink raw reply

* Re: [PATCH] net: Statically initialize init_net.dev_base_head
From: Neil Horman @ 2012-07-18 20:11 UTC (permalink / raw)
  To: Mark Rustad; +Cc: netdev, davem, gaofeng, eric.dumazet
In-Reply-To: <20120718190607.22923.77935.stgit@host1-mdrustad.localdomain>

On Wed, Jul 18, 2012 at 12:06:07PM -0700, Mark Rustad wrote:
> This change eliminates an initialization-order hazard most
> recently seen when netprio_cgroup is built into the kernel.
> 
> With thanks to Eric Dumazet for catching a bug.
> 
> Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
> ---
> 
>  net/core/dev.c           |    3 ++-
>  net/core/net_namespace.c |    4 +++-
>  2 files changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 0f28a9e..1cb0d8a 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6283,7 +6283,8 @@ static struct hlist_head *netdev_create_hash(void)
>  /* Initialize per network namespace state */
>  static int __net_init netdev_init(struct net *net)
>  {
> -	INIT_LIST_HEAD(&net->dev_base_head);
> +	if (net != &init_net)
> +		INIT_LIST_HEAD(&net->dev_base_head);
>  
>  	net->dev_name_head = netdev_create_hash();
>  	if (net->dev_name_head == NULL)
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index dddbacb..42f1e1c 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -27,7 +27,9 @@ static DEFINE_MUTEX(net_mutex);
>  LIST_HEAD(net_namespace_list);
>  EXPORT_SYMBOL_GPL(net_namespace_list);
>  
> -struct net init_net;
> +struct net init_net = {
> +	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
> +};
>  EXPORT_SYMBOL(init_net);
>  
>  #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
> 
> 

I think Dave was going to take John Fastabend's patch from earlier today, but
this works just as well.  Long term I'm going to look into delaying
initialization for cgroups, as it creates a strange initialization state when you
have a module_init routine registered.
Neil

^ permalink raw reply

* Re: [PATCH] cxgb3: Set vlan_feature on net_device
From: Rick Jones @ 2012-07-18 20:12 UTC (permalink / raw)
  To: brenohl@br.ibm.com; +Cc: divy@chelsio.com, netdev@vger.kernel.org
In-Reply-To: <1342639748-16276-1-git-send-email-brenohl@br.ibm.com>

On 07/18/2012 12:29 PM, brenohl@br.ibm.com wrote:
> cxgb3 interface has a bad performance when VLAN is set. On my current
> setup, a PowerLinux 7R2, I am able to get around 7 Gbps on a TCP_STREAM
> (8 instances, 4k message).
> With this patch, I am able to reach 9.5 Gbps.
Getting service demand out of an aggregate netperf test is a chore, but 
reporting the change in CPU utilization should be pretty 
straightforward.   Since you ended-up being constrained by link-rate, 
showing the CPU utilization change (and calculating service demand 
manually if you feel up to it) may help show the change has an even 
greater effect than (9.5-7)/7 or 35%.

What does the change do for latency and/or maximum min-sized packets
per second?

rick jones
there is more to the network than just bits/s :)

>
> Signed-off-by: Breno Leitao <brenohl@br.ibm.com>
>
> diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
> index abb6ce7..fcf4b31 100644
> --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
> @@ -3173,6 +3173,9 @@ static void __devinit cxgb3_init_iscsi_mac(struct net_device *dev)
>   	pi->iscsic.mac_addr[3] |= 0x80;
>   }
>   
> +#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
> +#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \
> +			NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA)
>   static int __devinit init_one(struct pci_dev *pdev,
>   			      const struct pci_device_id *ent)
>   {
> @@ -3293,6 +3296,7 @@ static int __devinit init_one(struct pci_dev *pdev,
>   		netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM |
>   			NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_RX;
>   		netdev->features |= netdev->hw_features | NETIF_F_HW_VLAN_TX;
> +		netdev->vlan_features |= netdev->features & VLAN_FEAT;
>   		if (pci_using_dac)
>   			netdev->features |= NETIF_F_HIGHDMA;
>   

^ permalink raw reply

* Re: [PATCH 1/7] net-tcp: Fast Open base
From: David Miller @ 2012-07-18 20:18 UTC (permalink / raw)
  To: eric.dumazet; +Cc: ycheng, hkchu, edumazet, ncardwell, sivasankar, netdev
In-Reply-To: <1342641349.2626.3555.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 18 Jul 2012 21:55:49 +0200

> So the plan would be :
> 
> 1) Use the experimental 254 + magic on TFO Clients/Servers in 2012
> 
> 2) When/If IANA grants an official number, add its support to servers
>    (keeping support for experimental option as well)
> 
> 3) One/two years later, switch client side to use this official number
> 
> 4) Ten years later, remove experimental from server side.

Fair enough.

^ permalink raw reply

* Re: [PATCH] net: Statically initialize init_net.dev_base_head
From: David Miller @ 2012-07-18 20:20 UTC (permalink / raw)
  To: nhorman; +Cc: mark.d.rustad, netdev, gaofeng, eric.dumazet
In-Reply-To: <20120718201149.GB22057@hmsreliant.think-freely.org>

From: Neil Horman <nhorman@tuxdriver.com>
Date: Wed, 18 Jul 2012 16:11:49 -0400

> On Wed, Jul 18, 2012 at 12:06:07PM -0700, Mark Rustad wrote:
>> This change eliminates an initialization-order hazard most
>> recently seen when netprio_cgroup is built into the kernel.
>> 
>> With thanks to Eric Dumazet for catching a bug.
>> 
>> Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
 ...
> I think Dave was going to take John Fastabend's patch from earlier today, but
> this works just as well.  Long term I'm going to look into delaying
> initialization for cgroups, as it creates a strange initialization state when you
> have a module_init routine registered.

Neil, any particular preference between John's and Mark's version
of the fix?

^ permalink raw reply

* Re: [PATCH] net: Statically initialize init_net.dev_base_head
From: Neil Horman @ 2012-07-18 20:21 UTC (permalink / raw)
  To: David Miller; +Cc: mark.d.rustad, netdev, gaofeng, eric.dumazet
In-Reply-To: <20120718.132010.1765790775051953381.davem@davemloft.net>

On Wed, Jul 18, 2012 at 01:20:10PM -0700, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Wed, 18 Jul 2012 16:11:49 -0400
> 
> > On Wed, Jul 18, 2012 at 12:06:07PM -0700, Mark Rustad wrote:
> >> This change eliminates an initialization-order hazard most
> >> recently seen when netprio_cgroup is built into the kernel.
> >> 
> >> With thanks to Eric Dumazet for catching a bug.
> >> 
> >> Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
>  ...
> > I think Dave was going to take John Fastabend's patch from earlier today, but
> > this works just as well.  Long term I'm going to look into delaying
> > initialization for cgroups, as it creates a strange initialization state when you
> > have a module_init routine registered.
> 
> Neil, any particular preference between John's and Mark's version
> of the fix?
> 
I think they're both perfectly good.  If I had to choose I'd say Mark's, just
because it's done by initializing data, rather than adding more code to run every
time we create a cgroup.

Neil

^ permalink raw reply

* Re: [RFC] r8169 : why SG / TX checksum are default disabled
From: Francois Romieu @ 2012-07-18 20:12 UTC (permalink / raw)
  To: David Miller; +Cc: hayeswang, eric.dumazet, netdev
In-Reply-To: <20120718.092346.1263036873056516097.davem@davemloft.net>

David Miller <davem@davemloft.net> :
> From: hayeswang <hayeswang@realtek.com>
> > Francois Romieu [mailto:romieu@fr.zoreil.com] 
> > [...]
> > 
> >> Hayes, should we not add into the kernel driver something similar to
> >> the rtl8168_start_xmit::skb_checksum_help stuff in Realtek's 
> >> 8168 driver ?
> >> There seems to be a bug for (skb->len < 60 && RTL_GIGA_MAC_VER_34.
> > 
> > For RTL8168E-VL (RTL_GIGA_MAC_VER_34), the hardware wouldn't send the packet
> > with the length less than 60 bytes. The hardware should pad this kind of packet
> > to 60 bytes, but it wouldn't. Therefore, the software has to pad the packet to
> > 60 bytes. However, the hw checksum would be incorrect for the modified packet,
> > so the software checksum is necessary.
> 
> I wonder how the hardware checksum can be incorrectly calculated if the padding
> is done with zeros?

A part of the apparent problem may stem from the fact that Realtek's 8168
driver claims a modified length but it does not really skb_padto... 

Hayes, would the patch below fix the original problem ?

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index be4e00f..a463697 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5740,7 +5740,7 @@ err_out:
 	return -EIO;
 }
 
-static inline void rtl8169_tso_csum(struct rtl8169_private *tp,
+static inline bool rtl8169_tso_csum(struct rtl8169_private *tp,
 				    struct sk_buff *skb, u32 *opts)
 {
 	const struct rtl_tx_desc_info *info = tx_desc_info + tp->txd_version;
@@ -5753,6 +5753,12 @@ static inline void rtl8169_tso_csum(struct rtl8169_private *tp,
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		const struct iphdr *ip = ip_hdr(skb);
 
+		if (unlikely(skb->len < 60 &&
+		    (tp->mac_version == RTL_GIGA_MAC_VER_34) &&
+		    skb_padto(skb, ETH_ZLEN))) {
+			return false;
+		}
+
 		if (ip->protocol == IPPROTO_TCP)
 			opts[offset] |= info->checksum.tcp;
 		else if (ip->protocol == IPPROTO_UDP)
@@ -5760,6 +5766,7 @@ static inline void rtl8169_tso_csum(struct rtl8169_private *tp,
 		else
 			WARN_ON_ONCE(1);
 	}
+	return true;
 }
 
 static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
@@ -5797,7 +5804,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 	opts[1] = cpu_to_le32(rtl8169_tx_vlan_tag(tp, skb));
 	opts[0] = DescOwn;
 
-	rtl8169_tso_csum(tp, skb, opts);
+	if (!rtl8169_tso_csum(tp, skb, opts))
+		goto err_update_stats;
 
 	frags = rtl8169_xmit_frags(tp, skb, opts);
 	if (frags < 0)
@@ -5853,6 +5861,7 @@ err_dma_1:
 	rtl8169_unmap_tx_skb(d, tp->tx_skb + entry, txd);
 err_dma_0:
 	dev_kfree_skb(skb);
+err_update_stats:
 	dev->stats.tx_dropped++;
 	return NETDEV_TX_OK;
 

^ permalink raw reply related

* Re: [RFC] r8169 : why SG / TX checksum are default disabled
From: David Miller @ 2012-07-18 20:28 UTC (permalink / raw)
  To: romieu; +Cc: hayeswang, eric.dumazet, netdev
In-Reply-To: <20120718201201.GC14149@electric-eye.fr.zoreil.com>

From: Francois Romieu <romieu@fr.zoreil.com>
Date: Wed, 18 Jul 2012 22:12:01 +0200

> David Miller <davem@davemloft.net> :
>> From: hayeswang <hayeswang@realtek.com>
>> > Francois Romieu [mailto:romieu@fr.zoreil.com] 
>> > [...]
>> > 
>> >> Hayes, should we not add into the kernel driver something similar to
>> >> the rtl8168_start_xmit::skb_checksum_help stuff in Realtek's 
>> >> 8168 driver ?
>> >> There seems to be a bug for (skb->len < 60 && RTL_GIGA_MAC_VER_34).
>> > 
>> > For RTL8168E-VL (RTL_GIGA_MAC_VER_34), the hardware wouldn't send the packet
>> > with the length less than 60 bytes. The hardware should pad this kind of packet
>> > to 60 bytes, but it wouldn't. Therefore, the software has to pad the packet to
>> > 60 bytes. However, the hw checksum would be incorrect for the modified packet,
>> > so the software checksum is necessary.
>> 
>> I wonder how the hardware checksum can be incorrectly calculated if the padding
>> is done with zeros?
> 
> A part of the apparent problem may stem from the fact that Realtek's 8168
> driver claims a modified length but it does not really skb_padto... 
> 
> Hayes, would the patch below fix the original problem ?

A NETDEV_TX_OK return means we accepted the SKB, it doesn't look like
that's what you are doing in the skb_padto() failure path.

^ permalink raw reply

* Re: [PATCH v2] sctp: Implement quick failover draft from tsvwg
From: Joe Perches @ 2012-07-18 20:30 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Vlad Yasevich, Sridhar Samudrala, David S. Miller,
	linux-sctp
In-Reply-To: <1342634466-17930-1-git-send-email-nhorman@tuxdriver.com>

On Wed, 2012-07-18 at 14:01 -0400, Neil Horman wrote:
> I've seen several attempts recently made to do quick failover of sctp transports
> by reducing various retransmit timers and counters.  While it's possible to
> implement a faster failover on multihomed sctp associations, it's not
> particularly robust, in that it can lead to unneeded retransmits, as well as
> false connection failures due to intermittent latency on a network.

trivia:

> diff --git a/net/sctp/associola.c b/net/sctp/associola.c

> @@ -871,6 +885,10 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
>  		spc_state = SCTP_ADDR_UNREACHABLE;
>  		break;
>  
> +	case SCTP_TRANSPORT_PF:
> +		transport->state = SCTP_PF;
> +		ulp_notify = false;
> +		break;

nicer to add a newline here

>  	default:
>  		return;
>  	}
> @@ -878,12 +896,15 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
[]
> +	if (ulp_notify) {
> +		memset(&addr, 0, sizeof(struct sockaddr_storage));
> +		memcpy(&addr, &transport->ipaddr,
> +		       transport->af_specific->sockaddr_len);

Perhaps it's better to do the memcpy then the memset of the
space left instead.

		memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
		memset((char *)&addr + transport->af_specific->sockaddr_len, 0,
		       sizeof(struct sockaddr_storage) - transport->af_specific->sockaddr_len);
		       

^ permalink raw reply

* Re: [PATCH] net: Statically initialize init_net.dev_base_head
From: John Fastabend @ 2012-07-18 20:31 UTC (permalink / raw)
  To: Neil Horman, David Miller; +Cc: mark.d.rustad, netdev, gaofeng, eric.dumazet
In-Reply-To: <20120718202159.GA30706@hmsreliant.think-freely.org>

On 7/18/2012 1:21 PM, Neil Horman wrote:
> On Wed, Jul 18, 2012 at 01:20:10PM -0700, David Miller wrote:
>> From: Neil Horman <nhorman@tuxdriver.com>
>> Date: Wed, 18 Jul 2012 16:11:49 -0400
>>
>>> On Wed, Jul 18, 2012 at 12:06:07PM -0700, Mark Rustad wrote:
>>>> This change eliminates an initialization-order hazard most
>>>> recently seen when netprio_cgroup is built into the kernel.
>>>>
>>>> With thanks to Eric Dumazet for catching a bug.
>>>>
>>>> Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
>>   ...
>>> I think Dave was going to take John Fastabend's patch from earlier today, but
>>> this works just as well.  Long term I'm going to look into delaying
>>> initialization for cgroups, as it creates a strange initialization state when you
>>> have a module_init routine registered.
>>
>> Neil, any particular preference between John's and Mark's version
>> of the fix?
>>
> I think they're both perfectly good.  If I had to choose I'd say Mark's, just
> because it's done by initializing data, rather than adding more code to run every
> time we create a cgroup.
>
> Neil
>

Fine by me if we take this version instead.

^ permalink raw reply

* [net-next 0/9][pull request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann

This series contains updates to ixgbevf & ixgbe.

The following are changes since commit ddbe503203855939946430e39bae58de11b70b69:
  ipv6: add ipv6_addr_hash() helper
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-next master

Alexander Duyck (8):
  ixgbevf: Do not rewind the Rx ring before bumping tail
  ixgbevf: Add netdev to ring structure
  ixgbevf: Consolidate Tx context descriptor creation code
  ixgbevf: Fix multiple issues in ixgbevf_get/set_ringparam
  ixgbe: Update configure virtualization to allow for multiple PF pools
  ixgbe: Add support for SR-IOV w/ DCB or RSS
  ixgbe: Retire RSS enabled and capable flags
  ixgbe: Cleanup holes in flags after removing several of them

Pascal Bouchareine (1):
  ixgbevf: fix VF untagging when 802.1 prio is set

 drivers/net/ethernet/intel/ixgbe/ixgbe.h          |   56 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c  |    4 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c      |  387 ++++++++++++++++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |   90 +++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c    |   52 ++-
 drivers/net/ethernet/intel/ixgbevf/defines.h      |    1 +
 drivers/net/ethernet/intel/ixgbevf/ethtool.c      |  159 ++++----
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |    2 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  405 ++++++++++-----------
 9 files changed, 745 insertions(+), 411 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* [net-next 1/9] ixgbevf: fix VF untagging when 802.1 prio is set
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
  To: davem; +Cc: Pascal Bouchareine, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1342643516-2696-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Pascal Bouchareine <pascal@gandi.net>

We have had an issue when using ixgbe+ixgbevf and 802.1 VLAN tagging.

When attaching a VLAN to a VF, frames with a 802.1q priority appeared
untagged on the VF hence not reaching the VLAN, whereas frames with
priority 0 were tagged as expected and seen by the VLAN device.

This seems due to the way ixgbevf is looking up the full tag
(prio+cfi+vlan) against the adapter active_vlans, as a condition to mark
the skb tagged.

Signed-off-by: Pascal Bouchareine <pascal@gandi.net>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index c98cdf7..b88218c 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -279,7 +279,7 @@ static void ixgbevf_receive_skb(struct ixgbevf_q_vector *q_vector,
 	bool is_vlan = (status & IXGBE_RXD_STAT_VP);
 	u16 tag = le16_to_cpu(rx_desc->wb.upper.vlan);
 
-	if (is_vlan && test_bit(tag, adapter->active_vlans))
+	if (is_vlan && test_bit(tag & VLAN_VID_MASK, adapter->active_vlans))
 		__vlan_hwaccel_put_tag(skb, tag);
 
 	napi_gro_receive(&q_vector->napi, skb);
-- 
1.7.10.4

^ permalink raw reply related

* [net-next 2/9] ixgbevf: Do not rewind the Rx ring before bumping tail
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Greg Rose, Jeff Kirsher
In-Reply-To: <1342643516-2696-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

The driver is going back one step from its previous location before
bumping tail. This is incorrect.  We should just be writing the value of
next_to_use into the tail register.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |    7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index b88218c..c27ce44 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -375,8 +375,6 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter,
 no_buffers:
 	if (rx_ring->next_to_use != i) {
 		rx_ring->next_to_use = i;
-		if (i-- == 0)
-			i = (rx_ring->count - 1);
 
 		ixgbevf_release_rx_desc(&adapter->hw, rx_ring, i);
 	}
@@ -1240,9 +1238,8 @@ static void ixgbevf_configure(struct ixgbevf_adapter *adapter)
 	ixgbevf_configure_rx(adapter);
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		struct ixgbevf_ring *ring = &adapter->rx_ring[i];
-		ixgbevf_alloc_rx_buffers(adapter, ring, ring->count);
-		ring->next_to_use = ring->count - 1;
-		writel(ring->next_to_use, adapter->hw.hw_addr + ring->tail);
+		ixgbevf_alloc_rx_buffers(adapter, ring,
+					 IXGBE_DESC_UNUSED(ring));
 	}
 }
 
-- 
1.7.10.4

^ permalink raw reply related

* [net-next 3/9] ixgbevf: Add netdev to ring structure
From: Jeff Kirsher @ 2012-07-18 20:31 UTC (permalink / raw)
  To: davem; +Cc: Alexander Duyck, netdev, gospo, sassmann, Greg Rose, Jeff Kirsher
In-Reply-To: <1342643516-2696-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This change adds the netdev to the ring structure.  This allows for a
quicker transition from ring to netdev without having to go from ring to
adapter to netdev.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ethtool.c      |    6 +--
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |    2 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   54 +++++++++------------
 3 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index 15947c9..2c3b20ed 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -359,8 +359,7 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 		if (err) {
 			while (i) {
 				i--;
-				ixgbevf_free_tx_resources(adapter,
-							  &tx_ring[i]);
+				ixgbevf_free_tx_resources(adapter, &tx_ring[i]);
 			}
 			goto err_tx_ring_setup;
 		}
@@ -374,8 +373,7 @@ static int ixgbevf_set_ringparam(struct net_device *netdev,
 		if (err) {
 			while (i) {
 				i--;
-				ixgbevf_free_rx_resources(adapter,
-							  &rx_ring[i]);
+				ixgbevf_free_rx_resources(adapter, &rx_ring[i]);
 			}
 				goto err_rx_ring_setup;
 		}
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 1f13765..e167d1b 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -56,6 +56,8 @@ struct ixgbevf_rx_buffer {
 
 struct ixgbevf_ring {
 	struct ixgbevf_ring *next;
+	struct net_device *netdev;
+	struct device *dev;
 	struct ixgbevf_adapter *adapter;  /* backlink */
 	void *desc;			/* descriptor ring memory */
 	dma_addr_t dma;			/* phys. address of descriptor ring */
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index c27ce44..1c53e13 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -187,7 +187,6 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector,
 				 struct ixgbevf_ring *tx_ring)
 {
 	struct ixgbevf_adapter *adapter = q_vector->adapter;
-	struct net_device *netdev = adapter->netdev;
 	union ixgbe_adv_tx_desc *tx_desc, *eop_desc;
 	struct ixgbevf_tx_buffer *tx_buffer_info;
 	unsigned int i, eop, count = 0;
@@ -241,15 +240,17 @@ cont_loop:
 	tx_ring->next_to_clean = i;
 
 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
-	if (unlikely(count && netif_carrier_ok(netdev) &&
+	if (unlikely(count && netif_carrier_ok(tx_ring->netdev) &&
 		     (IXGBE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
 		/* Make sure that anybody stopping the queue after this
 		 * sees the new next_to_clean.
 		 */
 		smp_mb();
-		if (__netif_subqueue_stopped(netdev, tx_ring->queue_index) &&
+		if (__netif_subqueue_stopped(tx_ring->netdev,
+					     tx_ring->queue_index) &&
 		    !test_bit(__IXGBEVF_DOWN, &adapter->state)) {
-			netif_wake_subqueue(netdev, tx_ring->queue_index);
+			netif_wake_subqueue(tx_ring->netdev,
+					    tx_ring->queue_index);
 			++adapter->restart_queue;
 		}
 	}
@@ -292,12 +293,13 @@ static void ixgbevf_receive_skb(struct ixgbevf_q_vector *q_vector,
  * @skb: skb currently being received and modified
  **/
 static inline void ixgbevf_rx_checksum(struct ixgbevf_adapter *adapter,
+				       struct ixgbevf_ring *ring,
 				       u32 status_err, struct sk_buff *skb)
 {
 	skb_checksum_none_assert(skb);
 
 	/* Rx csum disabled */
-	if (!(adapter->netdev->features & NETIF_F_RXCSUM))
+	if (!(ring->netdev->features & NETIF_F_RXCSUM))
 		return;
 
 	/* if IP and error */
@@ -332,31 +334,21 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter,
 	union ixgbe_adv_rx_desc *rx_desc;
 	struct ixgbevf_rx_buffer *bi;
 	struct sk_buff *skb;
-	unsigned int i;
-	unsigned int bufsz = rx_ring->rx_buf_len + NET_IP_ALIGN;
+	unsigned int i = rx_ring->next_to_use;
 
-	i = rx_ring->next_to_use;
 	bi = &rx_ring->rx_buffer_info[i];
 
 	while (cleaned_count--) {
 		rx_desc = IXGBEVF_RX_DESC(rx_ring, i);
 		skb = bi->skb;
 		if (!skb) {
-			skb = netdev_alloc_skb(adapter->netdev,
-							       bufsz);
-
+			skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
+							rx_ring->rx_buf_len);
 			if (!skb) {
 				adapter->alloc_rx_buff_failed++;
 				goto no_buffers;
 			}
 
-			/*
-			 * Make buffer alignment 2 beyond a 16 byte boundary
-			 * this will result in a 16 byte aligned IP header after
-			 * the 14 byte MAC header is removed
-			 */
-			skb_reserve(skb, NET_IP_ALIGN);
-
 			bi->skb = skb;
 		}
 		if (!bi->dma) {
@@ -449,7 +441,7 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 			goto next_desc;
 		}
 
-		ixgbevf_rx_checksum(adapter, staterr, skb);
+		ixgbevf_rx_checksum(adapter, rx_ring, staterr, skb);
 
 		/* probably a little skewed due to removing CRC */
 		total_rx_bytes += skb->len;
@@ -464,7 +456,7 @@ static bool ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
 			if (header_fixup_len < 14)
 				skb_push(skb, header_fixup_len);
 		}
-		skb->protocol = eth_type_trans(skb, adapter->netdev);
+		skb->protocol = eth_type_trans(skb, rx_ring->netdev);
 
 		ixgbevf_receive_skb(q_vector, skb, staterr, rx_ring, rx_desc);
 
@@ -1669,12 +1661,16 @@ static int ixgbevf_alloc_queues(struct ixgbevf_adapter *adapter)
 		adapter->tx_ring[i].count = adapter->tx_ring_count;
 		adapter->tx_ring[i].queue_index = i;
 		adapter->tx_ring[i].reg_idx = i;
+		adapter->tx_ring[i].dev = &adapter->pdev->dev;
+		adapter->tx_ring[i].netdev = adapter->netdev;
 	}
 
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		adapter->rx_ring[i].count = adapter->rx_ring_count;
 		adapter->rx_ring[i].queue_index = i;
 		adapter->rx_ring[i].reg_idx = i;
+		adapter->rx_ring[i].dev = &adapter->pdev->dev;
+		adapter->rx_ring[i].netdev = adapter->netdev;
 	}
 
 	return 0;
@@ -2721,12 +2717,11 @@ static void ixgbevf_tx_queue(struct ixgbevf_adapter *adapter,
 	writel(i, adapter->hw.hw_addr + tx_ring->tail);
 }
 
-static int __ixgbevf_maybe_stop_tx(struct net_device *netdev,
-				   struct ixgbevf_ring *tx_ring, int size)
+static int __ixgbevf_maybe_stop_tx(struct ixgbevf_ring *tx_ring, int size)
 {
-	struct ixgbevf_adapter *adapter = netdev_priv(netdev);
+	struct ixgbevf_adapter *adapter = netdev_priv(tx_ring->netdev);
 
-	netif_stop_subqueue(netdev, tx_ring->queue_index);
+	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
 	/* Herbert's original patch had:
 	 *  smp_mb__after_netif_stop_queue();
 	 * but since that doesn't exist yet, just open code it. */
@@ -2738,17 +2733,16 @@ static int __ixgbevf_maybe_stop_tx(struct net_device *netdev,
 		return -EBUSY;
 
 	/* A reprieve! - use start_queue because it doesn't call schedule */
-	netif_start_subqueue(netdev, tx_ring->queue_index);
+	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
 	++adapter->restart_queue;
 	return 0;
 }
 
-static int ixgbevf_maybe_stop_tx(struct net_device *netdev,
-				 struct ixgbevf_ring *tx_ring, int size)
+static int ixgbevf_maybe_stop_tx(struct ixgbevf_ring *tx_ring, int size)
 {
 	if (likely(IXGBE_DESC_UNUSED(tx_ring) >= size))
 		return 0;
-	return __ixgbevf_maybe_stop_tx(netdev, tx_ring, size);
+	return __ixgbevf_maybe_stop_tx(tx_ring, size);
 }
 
 static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
@@ -2779,7 +2773,7 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 #else
 	count += skb_shinfo(skb)->nr_frags;
 #endif
-	if (ixgbevf_maybe_stop_tx(netdev, tx_ring, count + 3)) {
+	if (ixgbevf_maybe_stop_tx(tx_ring, count + 3)) {
 		adapter->tx_busy++;
 		return NETDEV_TX_BUSY;
 	}
@@ -2810,7 +2804,7 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 			 ixgbevf_tx_map(adapter, tx_ring, skb, tx_flags, first),
 			 skb->len, hdr_len);
 
-	ixgbevf_maybe_stop_tx(netdev, tx_ring, DESC_NEEDED);
+	ixgbevf_maybe_stop_tx(tx_ring, DESC_NEEDED);
 
 	return NETDEV_TX_OK;
 }
-- 
1.7.10.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox