Netdev List
 help / color / mirror / Atom feed
* Re: [RFC net-next PATCH V1 7/9] net: frag queue locking per hash bucket
From: Jesper Dangaard Brouer @ 2012-11-27 15:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Florian Westphal, netdev, Pablo Neira Ayuso,
	Thomas Graf, Cong Wang, Patrick McHardy, Paul E. McKenney,
	Herbert Xu
In-Reply-To: <20121123130836.18764.9297.stgit@dragon>

On Fri, 2012-11-23 at 14:08 +0100, Jesper Dangaard Brouer wrote:
> DO NOT apply - patch not finished, can cause on OOPS/PANIC during hash rebuild
> 
> This patch implements per hash bucket locking for the frag queue
> hash.  This removes two write locks, and the only remaining write
> lock is for protecting hash rebuild.  This essentially reduce the
> readers-writer lock to a rebuild lock.
> 
> NOT-Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>

Last bug mentioned, were not the only one... fixing hopefully the last bug in this patch.


> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 1620a21..447423f 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -35,20 +35,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
>  	unsigned long now = jiffies;
>  	int i;
>  
> +	/* Per bucket lock NOT needed here, due to write lock protection */
>  	write_lock(&f->lock);
> +
>  	get_random_bytes(&f->rnd, sizeof(u32));
>  	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
> +		struct inet_frag_bucket *hb;
>  		struct inet_frag_queue *q;
>  		struct hlist_node *p, *n;
>  
> -		hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) {
> +		hb = &f->hash[i];
> +		hlist_for_each_entry_safe(q, p, n, &hb->chain, list) {
>  			unsigned int hval = f->hashfn(q);
>  
>  			if (hval != i) {
> +				struct inet_frag_bucket *hb_dest;
> +
>  				hlist_del(&q->list);
>  
>  				/* Relink to new hash chain. */
> -				hlist_add_head(&q->list, &f->hash[hval]);
> +				hb_dest = &f->hash[hval];
> +				hlist_add_head(&q->list, &hb->chain);

The above line were wrong, it should have been:
   hlist_add_head(&q->list, &hb_dest->chain);

>  			}
>  		}
>  	}

The patch seem quite stable now.  My test is to adjust to rebuild
interval to 2 sec and then run 4x 10G with two fragments (packet size
1472*2) to create as many fragments as possible (approx 300
inet_frag_queue elements).

30 min test run:
 3726+3896+3960+3608 = 15190 Mbit/s

(For reproducers, note, that changing ipfrag_secret_interval (e.g.
sysctl -w net/ipv4/ipfrag_secret_interval=2), first take effect after
the first interval/timer expires, which default is 10 min)


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH net-next] net: move inet_dport/inet_num in sock_common
From: Eric Dumazet @ 2012-11-27 15:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Ling Ma

From: Eric Dumazet <edumazet@google.com>

commit 68835aba4d9b (net: optimize INET input path further)
moved some fields used for tcp/udp sockets lookup in the first cache
line of struct sock_common.

This patch moves inet_dport/inet_num as well, filling a 32bit hole
on 64 bit arches and reducing number of cache line misses.

Also change INET_MATCH()/INET_TW_MATCH() to perform the ports match
before addresses match, as this check is more discriminant.
The namespace check can also be done at last.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ling Ma <ling.ma.program@gmail.com>
---
 include/linux/ipv6.h             |   26 ++++++++++---------
 include/net/inet_hashtables.h    |   38 ++++++++++++++++-------------
 include/net/inet_sock.h          |    4 +--
 include/net/inet_timewait_sock.h |    5 ++-
 include/net/sock.h               |   10 ++++++-
 5 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5e11905..196ede4 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -365,19 +365,21 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
 #define INET6_MATCH(__sk, __net, __hash, __saddr, __daddr, __ports, __dif)\
-	(((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net)	&& \
-	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports)) && \
-	 ((__sk)->sk_family		== AF_INET6)		&& \
-	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&& \
-	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&& \
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+	(((__sk)->sk_hash == (__hash)) &&					\
+	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports)) &&	\
+	 ((__sk)->sk_family		== AF_INET6)		&&		\
+	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&&		\
+	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&&		\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) && \
+	 net_eq(sock_net(__sk), (__net)))
 
 #define INET6_TW_MATCH(__sk, __net, __hash, __saddr, __daddr, __ports, __dif) \
-	(((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net)	&& \
-	 (*((__portpair *)&(inet_twsk(__sk)->tw_dport)) == (__ports))	&& \
-	 ((__sk)->sk_family	       == PF_INET6)			&& \
-	 (ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_daddr, (__saddr)))	&& \
-	 (ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_rcv_saddr, (__daddr))) && \
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+	(((__sk)->sk_hash == (__hash)) &&					\
+	 (*((__portpair *)&(inet_twsk(__sk)->tw_dport)) == (__ports))	&&	\
+	 ((__sk)->sk_family	       == PF_INET6)			&&	\
+	 (ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_daddr, (__saddr)))	&&	\
+	 (ipv6_addr_equal(&inet6_twsk(__sk)->tw_v6_rcv_saddr, (__daddr))) &&	\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) && \
+	 net_eq(sock_net(__sk), (__net)))
 
 #endif /* _IPV6_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 54be028..cd6766a 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -300,29 +300,33 @@ typedef __u64 __bitwise __addrpair;
 				   ((__force __u64)(__be32)(__saddr)));
 #endif /* __BIG_ENDIAN */
 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
-	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
-	 ((*((__addrpair *)&(inet_sk(__sk)->inet_daddr))) == (__cookie))  &&	\
-	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports))   &&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+	(((__sk)->sk_hash == (__hash)) &&					\
+	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports)) &&	\
+	 ((*((__addrpair *)&(inet_sk(__sk)->inet_daddr))) == (__cookie)) &&	\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) && \
+	 net_eq(sock_net(__sk), (__net)))
 #define INET_TW_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
-	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
-	 ((*((__addrpair *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) &&	\
+	(((__sk)->sk_hash == (__hash)) &&					\
 	 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+	 ((*((__addrpair *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) &&	\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) && \
+	 net_eq(sock_net(__sk), (__net)))
 #else /* 32-bit arch */
 #define INET_ADDR_COOKIE(__name, __saddr, __daddr)
 #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)	\
-	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net))	&&	\
-	 (inet_sk(__sk)->inet_daddr	== (__saddr))		&&	\
-	 (inet_sk(__sk)->inet_rcv_saddr	== (__daddr))		&&	\
-	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports))	&&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+	(((__sk)->sk_hash == (__hash)) &&						\
+	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports))	&&		\
+	 (inet_sk(__sk)->inet_daddr	== (__saddr))		&&			\
+	 (inet_sk(__sk)->inet_rcv_saddr	== (__daddr))		&&			\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) &&	\
+	 net_eq(sock_net(__sk), (__net)))
 #define INET_TW_MATCH(__sk, __net, __hash,__cookie, __saddr, __daddr, __ports, __dif)	\
-	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net))	&&	\
-	 (inet_twsk(__sk)->tw_daddr	== (__saddr))		&&	\
-	 (inet_twsk(__sk)->tw_rcv_saddr	== (__daddr))		&&	\
-	 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+	(((__sk)->sk_hash == (__hash)) &&						\
+	 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&		\
+	 (inet_twsk(__sk)->tw_daddr	== (__saddr))		&&			\
+	 (inet_twsk(__sk)->tw_rcv_saddr	== (__daddr))		&&			\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) &&	\
+	 net_eq(sock_net(__sk), (__net)))
 #endif /* 64-bit arch */
 
 /*
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 256c1ed..ae6cfa5 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -144,9 +144,9 @@ struct inet_sock {
 	/* Socket demultiplex comparisons on incoming packets. */
 #define inet_daddr		sk.__sk_common.skc_daddr
 #define inet_rcv_saddr		sk.__sk_common.skc_rcv_saddr
+#define inet_dport		sk.__sk_common.skc_dport
+#define inet_num		sk.__sk_common.skc_num
 
-	__be16			inet_dport;
-	__u16			inet_num;
 	__be32			inet_saddr;
 	__s16			uc_ttl;
 	__u16			cmsg_flags;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index ba52c83..671dbb7 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -112,6 +112,9 @@ struct inet_timewait_sock {
 #define tw_net			__tw_common.skc_net
 #define tw_daddr        	__tw_common.skc_daddr
 #define tw_rcv_saddr    	__tw_common.skc_rcv_saddr
+#define tw_dport		__tw_common.skc_dport
+#define tw_num			__tw_common.skc_num
+
 	int			tw_timeout;
 	volatile unsigned char	tw_substate;
 	unsigned char		tw_rcv_wscale;
@@ -119,8 +122,6 @@ struct inet_timewait_sock {
 	/* Socket demultiplex comparisons on incoming packets. */
 	/* these three are in inet_sock */
 	__be16			tw_sport;
-	__be16			tw_dport;
-	__u16			tw_num;
 	kmemcheck_bitfield_begin(flags);
 	/* And these are ours. */
 	unsigned int		tw_ipv6only     : 1,
diff --git a/include/net/sock.h b/include/net/sock.h
index c945fba..e4bab2e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,8 @@ struct net;
  *	@skc_rcv_saddr: Bound local IPv4 addr
  *	@skc_hash: hash value used with various protocol lookup tables
  *	@skc_u16hashes: two u16 hash values used by UDP lookup tables
+ *	@skc_dport: placeholder for inet_dport/tw_dport
+ *	@skc_num: placeholder for inet_num/tw_num
  *	@skc_family: network address family
  *	@skc_state: Connection state
  *	@skc_reuse: %SO_REUSEADDR setting
@@ -149,8 +151,8 @@ struct net;
  *	for struct sock and struct inet_timewait_sock.
  */
 struct sock_common {
-	/* skc_daddr and skc_rcv_saddr must be grouped :
-	 * cf INET_MATCH() and INET_TW_MATCH()
+	/* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
+	 * address on 64bit arches : cf INET_MATCH() and INET_TW_MATCH()
 	 */
 	__be32			skc_daddr;
 	__be32			skc_rcv_saddr;
@@ -159,6 +161,10 @@ struct sock_common {
 		unsigned int	skc_hash;
 		__u16		skc_u16hashes[2];
 	};
+	/* skc_dport && skc_num must be grouped as well */
+	__be16			skc_dport;
+	__u16			skc_num;
+
 	unsigned short		skc_family;
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse;

^ permalink raw reply related

* VPN traffic leaks in IPv6/IPv4 dual-stack networks/hosts
From: Fernando Gont @ 2012-11-27 14:54 UTC (permalink / raw)
  To: netdev

Folks,

FYI. This is might affect Linux users employing e.g. OpenVPN:
<http://tools.ietf.org/html/draft-gont-opsec-vpn-leakages>.

For a project such as OpenVPN, a (portable) fix might be non-trivial.
However, I guess Linux might hook some iptables rules when establishing
the VPN tunnel, such that e.g. all v6 traffic is filtered (yes, this is
certainly not the most desirable fix, but still probably better than
having your supposedly-secured traffic being sent in the clear).

P.S.: Not sure if this is the right list to send this note. Please
advice of a more appropriate one and/or feel free to forward this note
if deemed appropriate...

Thanks,
-- 
Fernando Gont
e-mail: fernando@gont.com.ar || fgont@si6networks.com
PGP Fingerprint: 7809 84F5 322E 45C7 F1C9 3945 96EE A9EF D076 FFF1

^ permalink raw reply

* Re: [PATCH v3 8/7] pppoatm: fix missing wakeup in pppoatm_send()
From: chas williams - CONTRACTOR @ 2012-11-27 15:23 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Krzysztof Mazur, netdev, linux-kernel, davem
In-Reply-To: <1354022867.26346.334.camel@shinybook.infradead.org>

On Tue, 27 Nov 2012 13:27:47 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> > i really would prefer not to use a strange name since it might confuse
> > larger group of people who are more familiar with the traditional meaning
> > of this function.  vcc_release() isnt exported so we could rename it if
> > things get too confusing.
> > 
> > i have to look at this a bit more but we might be able to use release_cb
> > to get rid of the null push to detach the underlying protocol.  that would
> > be somewhat nice.
> 
> In the meantime, should I resend this patch with the name 'release_cb'
> instead of 'unlock_cb'? I'll just put a comment in to make sure it isn't
> confused with vcc_release(), and if we need to change vcc_release()
> later we can.
> 

yes, but dont call it 8/7 since that doesnt make sense.

^ permalink raw reply

* Re: [PATCH 2/2] smsc75xx: support PHY wakeup source
From: Joe Perches @ 2012-11-27 15:56 UTC (permalink / raw)
  To: Steve Glendinning; +Cc: netdev
In-Reply-To: <1354026482-10443-3-git-send-email-steve.glendinning@shawell.net>

On Tue, 2012-11-27 at 14:28 +0000, Steve Glendinning wrote:
> This patch enables LAN7500 family devices to wake from suspend
> on either link up or link down events.
[]
> diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c
[]
> +static int smsc75xx_enter_suspend1(struct usbnet *dev)
> +{
> +	u32 val;
> +	int ret;
> +
> +	ret = smsc75xx_read_reg_nopm(dev, PMT_CTL, &val);
> +	check_warn_return(ret, "Error reading PMT_CTL");

Hi Steve, can you please add newlines to these new
check_warn_<foo> messages and the netdev_<level> ones too?
It helps avoid message interleaving.

> +		if (!link_up) {
> +			struct mii_if_info *mii = &dev->mii;
> +			netdev_info(dev->net, "entering SUSPEND1 mode");

etc..

^ permalink raw reply

* Re: VPN traffic leaks in IPv6/IPv4 dual-stack networks/hosts
From: Eric Dumazet @ 2012-11-27 16:04 UTC (permalink / raw)
  To: Fernando Gont; +Cc: netdev
In-Reply-To: <50B4D43A.7030208@gont.com.ar>

On Tue, 2012-11-27 at 11:54 -0300, Fernando Gont wrote:
> Folks,
> 
> FYI. This is might affect Linux users employing e.g. OpenVPN:
> <http://tools.ietf.org/html/draft-gont-opsec-vpn-leakages>.
> 
> For a project such as OpenVPN, a (portable) fix might be non-trivial.
> However, I guess Linux might hook some iptables rules when establishing
> the VPN tunnel, such that e.g. all v6 traffic is filtered (yes, this is
> certainly not the most desirable fix, but still probably better than
> having your supposedly-secured traffic being sent in the clear).
> 
> P.S.: Not sure if this is the right list to send this note. Please
> advice of a more appropriate one and/or feel free to forward this note
> if deemed appropriate...

This seems a user space issue to me.

accept_ra on linux is set to 1, meaning that as soon as forwarding is
enabled, RA are ignored.

^ permalink raw reply

* Re: VPN traffic leaks in IPv6/IPv4 dual-stack networks/hosts
From: Fernando Gont @ 2012-11-27 16:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1354032252.14302.37.camel@edumazet-glaptop>

Hi, Eric,

On 11/27/2012 01:04 PM, Eric Dumazet wrote:
>> P.S.: Not sure if this is the right list to send this note. Please
>> advice of a more appropriate one and/or feel free to forward this note
>> if deemed appropriate...
> 
> This seems a user space issue to me.
> 
> accept_ra on linux is set to 1, meaning that as soon as forwarding is
> enabled, RA are ignored.

I don't follow. Why would RAs be ignored if accept_ra is set to 1??

Cheers,
-- 
Fernando Gont
e-mail: fernando@gont.com.ar || fgont@si6networks.com
PGP Fingerprint: 7809 84F5 322E 45C7 F1C9 3945 96EE A9EF D076 FFF1

^ permalink raw reply

* Re: VPN traffic leaks in IPv6/IPv4 dual-stack networks/hosts
From: Jan Engelhardt @ 2012-11-27 16:10 UTC (permalink / raw)
  To: Fernando Gont; +Cc: netdev
In-Reply-To: <50B4D43A.7030208@gont.com.ar>

On Tuesday 2012-11-27 15:54, Fernando Gont wrote:

>Folks,
>
>FYI. This is might affect Linux users employing e.g. OpenVPN:
><http://tools.ietf.org/html/draft-gont-opsec-vpn-leakages>.
>
>For a project such as OpenVPN, a (portable) fix might be non-trivial.

If the VPN server does not even advertise to-be-secured IPv6 prefixes, 
any client-side fix is questionable. Disabling all of IPv6 on the client 
just because no IPv6 prefixes were sent is bogus.. it is like disabling 
all my IPv4 internet just because the server only gave me one IPv6 route 
into $internal_company_network.

^ permalink raw reply

* Re: VPN traffic leaks in IPv6/IPv4 dual-stack networks/hosts
From: Michal Kubeček @ 2012-11-27 16:22 UTC (permalink / raw)
  To: netdev
In-Reply-To: <50B4E52C.8090507@gont.com.ar>

On Tuesday 27 of November 2012 13:07EN, Fernando Gont wrote:
> > accept_ra on linux is set to 1, meaning that as soon as forwarding
> > is
> > enabled, RA are ignored.
> 
> I don't follow. Why would RAs be ignored if accept_ra is set to 1??

Value of 1 means "Accept Router Advertisements if forwarding is 
disabled.", see Documentation/networking/ip-sysct.txt

                                                        Michal Kubeček

^ permalink raw reply

* [PATCH net-next 1/1] ip6tnl/sit: drop packet if ECN present with not-ECT
From: Nicolas Dichtel @ 2012-11-27 13:07 UTC (permalink / raw)
  To: davem; +Cc: netdev, shemminger, Nicolas Dichtel

This patch reports the change made by Stephen Hemminger in ipip and gre[6] in
commit eccc1bb8d4b4 (tunnel: drop packet if ECN present with not-ECT).

Goal is to handle RFC6040, Section 4.2:

Default Tunnel Egress Behaviour.
 o If the inner ECN field is Not-ECT, the decapsulator MUST NOT
      propagate any other ECN codepoint onwards.  This is because the
      inner Not-ECT marking is set by transports that rely on dropped
      packets as an indication of congestion and would not understand or
      respond to any other ECN codepoint [RFC4774].  Specifically:

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         CE, the decapsulator MUST drop the packet.

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         Not-ECT, ECT(0), or ECT(1), the decapsulator MUST forward the
         outgoing packet with the ECN field cleared to Not-ECT.

The patch takes benefits from common function added in net/inet_ecn.h.

Like it was done for Xin4 tunnels, it adds logging to allow detecting broken
systems that set ECN bits incorrectly when tunneling (or an intermediate
router might be changing the header). Errors are also tracked via
rx_frame_error.

CC: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 net/ipv6/ip6_tunnel.c | 49 ++++++++++++++++++++++++++++++++-----------------
 net/ipv6/sit.c        | 33 +++++++++++++++++++++------------
 2 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index fb828e9..a14f28b 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -74,6 +74,10 @@ MODULE_ALIAS_NETDEV("ip6tnl0");
 #define HASH_SIZE_SHIFT  5
 #define HASH_SIZE (1 << HASH_SIZE_SHIFT)
 
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
 static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
 {
 	u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
@@ -683,28 +687,26 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	return 0;
 }
 
-static void ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
-					const struct ipv6hdr *ipv6h,
-					struct sk_buff *skb)
+static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+				       const struct ipv6hdr *ipv6h,
+				       struct sk_buff *skb)
 {
 	__u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK;
 
 	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
 		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield);
 
-	if (INET_ECN_is_ce(dsfield))
-		IP_ECN_set_ce(ip_hdr(skb));
+	return IP6_ECN_decapsulate(ipv6h, skb);
 }
 
-static void ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
-					const struct ipv6hdr *ipv6h,
-					struct sk_buff *skb)
+static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+				       const struct ipv6hdr *ipv6h,
+				       struct sk_buff *skb)
 {
 	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
 		ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb));
 
-	if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h)))
-		IP6_ECN_set_ce(ipv6_hdr(skb));
+	return IP6_ECN_decapsulate(ipv6h, skb);
 }
 
 __u32 ip6_tnl_get_cap(struct ip6_tnl *t,
@@ -768,12 +770,13 @@ EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl);
 
 static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		       __u8 ipproto,
-		       void (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
-						    const struct ipv6hdr *ipv6h,
-						    struct sk_buff *skb))
+		       int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+						   const struct ipv6hdr *ipv6h,
+						   struct sk_buff *skb))
 {
 	struct ip6_tnl *t;
 	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	int err;
 
 	rcu_read_lock();
 
@@ -803,14 +806,26 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		skb->pkt_type = PACKET_HOST;
 		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
 
+		__skb_tunnel_rx(skb, t->dev);
+
+		err = dscp_ecn_decapsulate(t, ipv6h, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n",
+						     &ipv6h->saddr,
+						     ipv6_get_dsfield(ipv6h));
+			if (err > 1) {
+				++t->dev->stats.rx_frame_errors;
+				++t->dev->stats.rx_errors;
+				rcu_read_unlock();
+				goto discard;
+			}
+		}
+
 		tstats = this_cpu_ptr(t->dev->tstats);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 
-		__skb_tunnel_rx(skb, t->dev);
-
-		dscp_ecn_decapsulate(t, ipv6h, skb);
-
 		netif_rx(skb);
 
 		rcu_read_unlock();
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 80cb382..cfba99b 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -65,6 +65,10 @@
 #define HASH_SIZE  16
 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
 static int ipip6_tunnel_init(struct net_device *dev);
 static void ipip6_tunnel_setup(struct net_device *dev);
 static void ipip6_dev_free(struct net_device *dev);
@@ -106,6 +110,7 @@ static struct rtnl_link_stats64 *ipip6_get_stats64(struct net_device *dev,
 	}
 
 	tot->rx_errors = dev->stats.rx_errors;
+	tot->rx_frame_errors = dev->stats.rx_frame_errors;
 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
 	tot->tx_dropped = dev->stats.tx_dropped;
@@ -585,16 +590,11 @@ out:
 	return err;
 }
 
-static inline void ipip6_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
-{
-	if (INET_ECN_is_ce(iph->tos))
-		IP6_ECN_set_ce(ipv6_hdr(skb));
-}
-
 static int ipip6_rcv(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
 	struct ip_tunnel *tunnel;
+	int err;
 
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto out;
@@ -616,18 +616,27 @@ static int ipip6_rcv(struct sk_buff *skb)
 		if ((tunnel->dev->priv_flags & IFF_ISATAP) &&
 		    !isatap_chksrc(skb, iph, tunnel)) {
 			tunnel->dev->stats.rx_errors++;
-			kfree_skb(skb);
-			return 0;
+			goto out;
+		}
+
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		err = IP_ECN_decapsulate(iph, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &iph->saddr, iph->tos);
+			if (err > 1) {
+				++tunnel->dev->stats.rx_frame_errors;
+				++tunnel->dev->stats.rx_errors;
+				goto out;
+			}
 		}
 
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		ipip6_ecn_decapsulate(iph, skb);
-
 		netif_rx(skb);
 
 		return 0;
-- 
1.7.12

^ permalink raw reply related

* Arp problems after upgrade to 3.6
From: Phil Oester @ 2012-11-27 16:29 UTC (permalink / raw)
  To: netdev

Upgraded a box to 3.6.7 yesterday, and started noticing it becoming
unreachable occasionally.  Watching arp traffic, it appears it is
sometimes arping for the gateway using the wrong source IP in the requests. 
Sample trace from bond1:

11:10:13.489388 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28
11:10:14.489372 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28
11:10:15.489358 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28
11:10:16.489517 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28
11:10:16.489518 ARP, Request who-has 10.253.128.13 tell 10.253.128.14, length 28
11:10:16.490025 ARP, Reply 10.253.128.13 is-at 00:d0:04:b1:d4:00, length 46
11:11:43.618851 ARP, Request who-has 10.253.128.13 tell 10.253.128.14, length 28
11:11:43.619406 ARP, Reply 10.253.128.13 is-at 00:d0:04:b1:d4:00, length 46
11:15:05.792656 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28
11:15:06.789369 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28
11:15:07.789338 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28
11:15:08.789939 ARP, Request who-has 10.253.128.13 tell 10.253.128.6, length 28 

Seems to be getting bond0 and bond1 IPs confused here:

12: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP 
    link/ether xx:xx:xx:xx:xx:20 brd ff:ff:ff:ff:ff:ff
    inet 10.253.128.6/30 brd 10.253.128.7 scope global bond0
17: bond1: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP 
    link/ether xx:xx:xx:xx:xx:22 brd ff:ff:ff:ff:ff:ff
    inet 10.253.128.14/30 brd 10.253.128.15 scope global bond1 

This box is still up, so if there is any additional information I can provide
before rebooting it, please let me know.

Phil Oester

^ permalink raw reply

* Re: [net-next RFC] pktgen: don't wait for the device who doesn't free skb immediately after sent
From: Stephen Hemminger @ 2012-11-27 16:49 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, netdev, linux-kernel, virtualization, davem
In-Reply-To: <50B46179.4040809@redhat.com>

On Tue, 27 Nov 2012 14:45:13 +0800
Jason Wang <jasowang@redhat.com> wrote:

> On 11/27/2012 01:37 AM, Stephen Hemminger wrote:
> > On Mon, 26 Nov 2012 15:56:52 +0800
> > Jason Wang <jasowang@redhat.com> wrote:
> >
> >> Some deivces do not free the old tx skbs immediately after it has been sent
> >> (usually in tx interrupt). One such example is virtio-net which optimizes for
> >> virt and only free the possible old tx skbs during the next packet sending. This
> >> would lead the pktgen to wait forever in the refcount of the skb if no other
> >> pakcet will be sent afterwards.
> >>
> >> Solving this issue by introducing a new flag IFF_TX_SKB_FREE_DELAY which could
> >> notify the pktgen that the device does not free skb immediately after it has
> >> been sent and let it not to wait for the refcount to be one.
> >>
> >> Signed-off-by: Jason Wang <jasowang@redhat.com>
> > Another alternative would be using skb_orphan() and skb->destructor.
> > There are other cases where skb's are not freed right away.
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Hi Stephen:
> 
> Do you mean registering a skb->destructor for pktgen then set and check 
> bits in skb->tx_flag?

Yes. Register a destructor that does something like update a counter (number of packets pending),
then just spin while number of packets pending is over threshold.

^ permalink raw reply

* Re: TCP and reordering
From: Rick Jones @ 2012-11-27 17:05 UTC (permalink / raw)
  To: Saku Ytti; +Cc: netdev
In-Reply-To: <CAAeewD8zsfb6QPFecvPab_oK3D3T7eG_M53MVvfJ2QrxUFxWSQ@mail.gmail.com>

On 11/27/2012 01:32 AM, Saku Ytti wrote:
> Today due to fast retransmit performance on links which cause
> reordering is appalling.
>
> Is it too esoteric situation to handle gracefully? Couldn't we
> maintain 'reorder' counter in socket, which is increment when we get
> two copies of same packet after duplicate ack, if this counter is
> sufficiently high in relation to packet loss, we could start delaying
> duplicate acks as we'd expect to receive the sequence very soon.

Packet reordering is supposed to be the exception, not the rule.  Links 
which habitually/constantly introduce reordering are, in my opinion, 
broken.  Optimizing for them would be optimizing an error case.

That said, there is net.ipv4.tcp_reordering which you can increase from 
the default of three to desensitize TCP to such broken links.  That will 
though be on the sending rather than receiving side.

rick jones

^ permalink raw reply

* Re: TCP and reordering
From: Saku Ytti @ 2012-11-27 17:15 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev
In-Reply-To: <50B4F2DA.8020206@hp.com>

On 27 November 2012 19:05, Rick Jones <rick.jones2@hp.com> wrote:

> Packet reordering is supposed to be the exception, not the rule.  Links
> which habitually/constantly introduce reordering are, in my opinion, broken.
> Optimizing for them would be optimizing an error case.

TCP used to be friendly to reordering before fast retransmit
optimization was implemented.

It seems like minimal complexity in TCP algorithm and would
dynamically work correctly depending on situation. It is rather slim
comfort that network should work, when it does not, and you cannot
affect it.

But if the complexity is higher than I expect, then I fully agree,
makes no sense to add it. Reason why reordering can happen in modern
MPLS network is that you have to essentially duck type your traffic,
and sometimes you duck type them wrong and you are then calculating
ECMP on incorrect values, causing packets inside flow to take
different ports.

--
  ++ytti

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: David Woodhouse @ 2012-11-27 17:16 UTC (permalink / raw)
  To: Krzysztof Mazur; +Cc: davem, netdev, linux-kernel, nathan
In-Reply-To: <1350926091-12642-3-git-send-email-krzysiek@podlesie.net>

[-- Attachment #1: Type: text/plain, Size: 901 bytes --]

Krzysztof, you've fixed a bunch of races... but I think there's one
still left.

An ATM driver will often have code like this, which gets called from
arbitrary contexts:
        if (vcc->pop)
                vcc->pop(vcc, skb);

Now, what happens if pppoatm_send(vcc, NULL) happens after the address
of vcc->pop (currently pppoatm_pop) has been loaded, but before the
function is actually called?

You tear down all the setup and set vcc->user_back to NULL. And then
pppoatm_pop() gets called. And promptly crashes because pvcc is NULL.

A lot of these problems exist for br2684 too, and in prodding at it a
little I can consistently crash the system by sending a flood of
outbound packets while I kill the br2684ctl program. I end up in
br2684_pop() with vcc->user_back == NULL. In looking to see how you'd
fixed that in pppoatm, I realised that you haven't... :)

-- 
dwmw2


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* Re: [PATCH 1/2] smsc75xx: refactor entering suspend modes
From: Steve Glendinning @ 2012-11-27 17:21 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: netdev, linux-usb-u79uwXL29TY76Z2rM5mHXA, David Miller
In-Reply-To: <87624r9k1f.fsf-lbf33ChDnrE/G1V5fR+Y7Q@public.gmane.org>

Hi Bjorn,

>> +     smsc75xx_set_feature(dev, USB_DEVICE_REMOTE_WAKEUP);
>
> As mentioned in another comment to the smsc95xx driver: This is weird.
> Do you really need to do that?
>
> This is an USB interface driver.  The USB device is handled by the
> generic "usb" driver, which will do the right thing.  See
> drivers/usb/generic.c and drivers/usb/core/hub.c

Thanks, I've tested removing all these calls from the driver and
wakeup functionality seems to still work.

I'll resubmit my smsc75xx enhancement patchset with this change once
I've done some more testing.

David: please discard this patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next] net: move inet_dport/inet_num in sock_common
From: Joe Perches @ 2012-11-27 17:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Ling Ma
In-Reply-To: <1354028815.14302.35.camel@edumazet-glaptop>

On Tue, 2012-11-27 at 07:06 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> commit 68835aba4d9b (net: optimize INET input path further)
> moved some fields used for tcp/udp sockets lookup in the first cache
> line of struct sock_common.
[]
> diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
> index 5e11905..196ede4 100644
> --- a/include/linux/ipv6.h
> +++ b/include/linux/ipv6.h
> @@ -365,19 +365,21 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
>  #endif /* IS_ENABLED(CONFIG_IPV6) */
>  
>  #define INET6_MATCH(__sk, __net, __hash, __saddr, __daddr, __ports, __dif)\
> +	(((__sk)->sk_hash == (__hash)) &&					\
> +	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports)) &&	\
> +	 ((__sk)->sk_family		== AF_INET6)		&&		\

Perhaps these could be |'d together to avoid the test/jump
after each comparison by using some bit operations instead.

> +	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&&		\
> +	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&&		\
> +	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))) && \
> +	 net_eq(sock_net(__sk), (__net)))

^ permalink raw reply

* Re: [PATCH 2/2] smsc75xx: support PHY wakeup source
From: Steve Glendinning @ 2012-11-27 17:23 UTC (permalink / raw)
  To: Joe Perches; +Cc: netdev, David Miller
In-Reply-To: <1354031804.2116.8.camel@joe-AO722>

Hi Joe,

On 27 November 2012 15:56, Joe Perches <joe@perches.com> wrote:
> On Tue, 2012-11-27 at 14:28 +0000, Steve Glendinning wrote:
>> +     ret = smsc75xx_read_reg_nopm(dev, PMT_CTL, &val);
>> +     check_warn_return(ret, "Error reading PMT_CTL");
>
> Hi Steve, can you please add newlines to these new
> check_warn_<foo> messages and the netdev_<level> ones too?
> It helps avoid message interleaving.

Sorry about that, I did see your earlier patch to fix existing instances.

David: please discard this patch too, I'll resubmit.

^ permalink raw reply

* [patch] atm: forever loop loading ambassador firmware
From: Dan Carpenter @ 2012-11-27 17:29 UTC (permalink / raw)
  To: Chas Williams, David Woodhouse; +Cc: linux-atm-general, netdev, kernel-janitors

There was a forever loop introduced here when we converted this to
request_firmware() back in 2008.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
Spotted in code reading.  Untested.

diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c
index 89b30f3..ff7bb8a 100644
--- a/drivers/atm/ambassador.c
+++ b/drivers/atm/ambassador.c
@@ -1961,6 +1961,7 @@ static int __devinit ucode_init (loader_block * lb, amb_dev * dev) {
     res = loader_verify(lb, dev, rec);
     if (res)
       break;
+    rec = ihex_next_binrec(rec);
   }
   release_firmware(fw);
   if (!res)

^ permalink raw reply related

* RE: [E1000-devel] 82571EB: Detected Hardware Unit Hang
From: Fujinaka, Todd @ 2012-11-27 17:32 UTC (permalink / raw)
  To: Mary Mcgrath, Joe Jin
  Cc: netdev@vger.kernel.org, e1000-devel@lists.sf.net,
	linux-kernel@vger.kernel.org
In-Reply-To: <b46b84cf-3688-4fd7-b741-4e96f2663517@default>

Forgive me if I'm being too repetitious as I think some of this has been mentioned in the past.

We (and by we I mean the Ethernet part and driver) can only change the advertised availability of a larger MaxPayloadSize. The size is negotiated by both sides of the link when the link is established. The driver should not change the size of the link as it would be poking at registers outside of its scope and is controlled by the upstream bridge (not us).

You also need to check all the PCIe links to get to the device. There can be several to get from the root complex, through bridges, to the endpoint Ethernet controller. The Ethernet part and driver has no control over any other links. You'll have to talk to the motherboard manufacturer about those links.

Your original problem appears to be hangs and Tushar asked you to the entire path of PCIe connections from the root complex to the endpoint. Any mismatches in payload can cause hangs and I believe you have had the problem in the past. I'm sure you remember all the lspci commands to list the tree view and to dump all the details from each of the links and I would suggest you do that to check to see that the payload sizes match. What I do is "lspci -tvvv" to see what's connected, then "lspci -s xx:xx.x -vvv" to check the devices on the link.

Thanks.

Todd Fujinaka
Technical Marketing Engineer
LAN Access Division (LAD)
Intel Corporation
todd.fujinaka@intel.com
(503) 712-4565


-----Original Message-----
From: Mary Mcgrath [mailto:mary.mcgrath@oracle.com] 
Sent: Monday, November 26, 2012 6:07 PM
To: Joe Jin
Cc: netdev@vger.kernel.org; e1000-devel@lists.sf.net; linux-kernel@vger.kernel.org
Subject: Re: [E1000-devel] 82571EB: Detected Hardware Unit Hang

Joe
Thank you for working this.
I would love to find out how they expect a customer to make the modification To  "word  0x1A, and see if the 8th bit is 0 or 1, and to change to 0."

I have in turn asked the ct for the lspci command on eth3, maybe the incorrect setting is upstream.

Again,  thank you.
Regards
Mary



-----Original Message-----
From: Joe Jin
Sent: Monday, November 26, 2012 8:00 PM
To: Fujinaka, Todd
Cc: Dave, Tushar N; netdev@vger.kernel.org; e1000-devel@lists.sf.net; linux-kernel@vger.kernel.org; Mary Mcgrath
Subject: Re: [E1000-devel] 82571EB: Detected Hardware Unit Hang

On 11/27/12 00:23, Fujinaka, Todd wrote:
> If you look at the previous section, DevCap, you'll see that it's 
> correctly advertising 256 bytes but the system is negotiating 128 for 
> the link to the Ethernet controller. Things on the "other" side of the 
> link are controlled outside of the e1000 driver.
> 
> Tushar's first suggestion was to check the PCIe payload settings in 
> the entire chain. Have you done that? Mismatches will cause hangs.

Hi Todd,

So far I had to know how to modify the maxpayload size, since BIOS have not entry to change this, so I had to use ethtool, now I need to get the offset of MaxPayload size in eeprom, I ever tried to find from Intel online document but failed, any idea?

Thanks in advance,
Joe

------------------------------------------------------------------------------
Monitor your physical, virtual and cloud infrastructure from a single web console. Get in-depth insight into apps, servers, databases, vmware, SAP, cloud infrastructure, etc. Download 30-day Free Trial.
Pricing starts from $795 for 25 servers or applications!
http://p.sf.net/sfu/zoho_dev2dev_nov
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH v2 3/3] pppoatm: protect against freeing of vcc
From: Krzysztof Mazur @ 2012-11-27 17:39 UTC (permalink / raw)
  To: David Woodhouse; +Cc: davem, netdev, linux-kernel, nathan
In-Reply-To: <1354036592.2534.6.camel@shinybook.infradead.org>

On Tue, Nov 27, 2012 at 05:16:32PM +0000, David Woodhouse wrote:
> Krzysztof, you've fixed a bunch of races... but I think there's one
> still left.
> 
> An ATM driver will often have code like this, which gets called from
> arbitrary contexts:
>         if (vcc->pop)
>                 vcc->pop(vcc, skb);
> 
> Now, what happens if pppoatm_send(vcc, NULL) happens after the address
> of vcc->pop (currently pppoatm_pop) has been loaded, but before the
> function is actually called?
> 
> You tear down all the setup and set vcc->user_back to NULL. And then
> pppoatm_pop() gets called. And promptly crashes because pvcc is NULL.
> 
> A lot of these problems exist for br2684 too, and in prodding at it a
> little I can consistently crash the system by sending a flood of
> outbound packets while I kill the br2684ctl program. I end up in
> br2684_pop() with vcc->user_back == NULL. In looking to see how you'd
> fixed that in pppoatm, I realised that you haven't... :)
> 

Yes, I missed that one - it's even worse, I introduced that bug 
in "[PATCH 1/7] atm: detach protocol before closing vcc". Before that
patch that scenario shouldn't happen because vcc was closed before
calling pppoatm_send(vcc, NULL) - the driver should provide appropriate
synchronization.

I think that we should just drop that patch. With later changes it's not
necessary - the pppoatm_send() can be safely called while closing vcc.

Krzysiek

^ permalink raw reply

* Re: [PATCH] smsc95xx: fix suspend buffer overflow
From: Joe Perches @ 2012-11-27 17:42 UTC (permalink / raw)
  To: Steve Glendinning; +Cc: netdev, dan.carpenter
In-Reply-To: <1354022623-7317-1-git-send-email-steve.glendinning@shawell.net>

On Tue, 2012-11-27 at 13:23 +0000, Steve Glendinning wrote:
> This patch fixes a buffer overflow introduced by bbd9f9e, where
> the filter_mask array is accessed beyond its bounds.
> 
> Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
> Signed-off-by: Steve Glendinning <steve.glendinning@shawell.net>
> ---
>  drivers/net/usb/smsc95xx.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
> index 79d495d..6cdc504 100644
> --- a/drivers/net/usb/smsc95xx.c
> +++ b/drivers/net/usb/smsc95xx.c
> @@ -1281,7 +1281,7 @@ static int smsc95xx_suspend(struct usb_interface *intf, pm_message_t message)
>  	}
>  
>  	if (pdata->wolopts & (WAKE_BCAST | WAKE_MCAST | WAKE_ARP | WAKE_UCAST)) {
> -		u32 *filter_mask = kzalloc(32, GFP_KERNEL);
> +		u32 *filter_mask = kzalloc(sizeof(u32) * 32, GFP_KERNEL);

It's also unchecked for alloc failure.

>  		u32 command[2];
>  		u32 offset[2];
>  		u32 crc[4];

^ permalink raw reply

* [PATCH 1/1] net: ethernet: cpsw: fix build warnings for CPSW when CPTS not selected
From: Mugunthan V N @ 2012-11-27 17:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, linux-arm-kernel, linux-omap, richardcochran,
	Mugunthan V N

  CC      drivers/net/ethernet/ti/cpsw.o
drivers/net/ethernet/ti/cpsw.c: In function 'cpsw_ndo_ioctl':
drivers/net/ethernet/ti/cpsw.c:881:20: warning: unused variable 'priv'

The build warning is generated when CPTS is not selected in Kernel Build.
Fixing by passing the net_device pointer to cpts IOCTL instead of passing priv

Signed-off-by: Mugunthan V N <mugunthanvnm@ti.com>
---
 drivers/net/ethernet/ti/cpsw.c |    7 +++----
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 02c2477..c9714e1 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -812,8 +812,9 @@ static void cpsw_hwtstamp_v2(struct cpsw_priv *priv)
 	__raw_writel(ETH_P_1588, &priv->regs->ts_ltype);
 }
 
-static int cpsw_hwtstamp_ioctl(struct cpsw_priv *priv, struct ifreq *ifr)
+static int cpsw_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 {
+	struct cpsw_priv *priv = netdev_priv(dev);
 	struct cpts *cpts = &priv->cpts;
 	struct hwtstamp_config cfg;
 
@@ -878,14 +879,12 @@ static int cpsw_hwtstamp_ioctl(struct cpsw_priv *priv, struct ifreq *ifr)
 
 static int cpsw_ndo_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 {
-	struct cpsw_priv *priv = netdev_priv(dev);
-
 	if (!netif_running(dev))
 		return -EINVAL;
 
 #ifdef CONFIG_TI_CPTS
 	if (cmd == SIOCSHWTSTAMP)
-		return cpsw_hwtstamp_ioctl(priv, req);
+		return cpsw_hwtstamp_ioctl(dev, req);
 #endif
 	return -ENOTSUPP;
 }
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH 1/2] smsc75xx: refactor entering suspend modes
From: Steve Glendinning @ 2012-11-27 17:55 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: netdev, linux-usb-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CAKh2mn4OiN7ZAX_0SJ_XHwRNPoyKUk=+4ha5esBMAajGD4BX=w@mail.gmail.com>

Hi Bjorn,

On 27 November 2012 17:21, Steve Glendinning <steve-nksJyM/082jR7s880joybQ@public.gmane.org> wrote:
> Hi Bjorn,
>
>>> +     smsc75xx_set_feature(dev, USB_DEVICE_REMOTE_WAKEUP);
>>
>> As mentioned in another comment to the smsc95xx driver: This is weird.
>> Do you really need to do that?
>>
>> This is an USB interface driver.  The USB device is handled by the
>> generic "usb" driver, which will do the right thing.  See
>> drivers/usb/generic.c and drivers/usb/core/hub.c
>
> Thanks, I've tested removing all these calls from the driver and
> wakeup functionality seems to still work.
>
> I'll resubmit my smsc75xx enhancement patchset with this change once
> I've done some more testing.

Further testing shows that removing these calls stop wakeup from
system suspend working (although don't appear to impact runtime
autosuspend).  Have I missed a flag or somewhere that causes
udev->do_remote_wakeup to be set in the code you posted?
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH for 3.8] iproute2: Add "ip netns pids" and "ip netns identify"
From: Ben Hutchings @ 2012-11-27 18:00 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Stephen Hemminger, netdev, Serge E. Hallyn
In-Reply-To: <87a9u4q7k9.fsf@xmission.com>

On Mon, 2012-11-26 at 17:16 -0600, Eric W. Biederman wrote:
> Add command that go between network namespace names and process
> identifiers.  The code builds and runs agains older kernels but
> only works on Linux 3.8+ kernels where I have fixed stat to work
> properly.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> 
> I don't know if this is too soon to send this patch to iproute as the
> kernel code that fixes stat is currently sitting in my for-next branch
> of:
> git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git
> 
> and has not hit Linus's tree yet.  Still the code runs and is harmless
> on older kernels so it should be harmless whatever happens with it.
> 
>  ip/ipnetns.c        |  141 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  man/man8/ip-netns.8 |    5 ++-
>  2 files changed, 145 insertions(+), 1 deletions(-)
> 
> diff --git a/ip/ipnetns.c b/ip/ipnetns.c
> index e41a598..c55fe3a 100644
> --- a/ip/ipnetns.c
> +++ b/ip/ipnetns.c
> @@ -13,6 +13,7 @@
>  #include <dirent.h>
>  #include <errno.h>
>  #include <unistd.h>
> +#include <ctype.h>
>  
>  #include "utils.h"
>  #include "ip_common.h"
> @@ -48,6 +49,8 @@ static void usage(void)
>  	fprintf(stderr, "Usage: ip netns list\n");
>  	fprintf(stderr, "       ip netns add NAME\n");
>  	fprintf(stderr, "       ip netns delete NAME\n");
> +	fprintf(stderr, "       ip netns identify PID\n");
> +	fprintf(stderr, "       ip netns pids NAME\n");
>  	fprintf(stderr, "       ip netns exec NAME cmd ...\n");
>  	fprintf(stderr, "       ip netns monitor\n");
>  	exit(-1);
> @@ -171,6 +174,138 @@ static int netns_exec(int argc, char **argv)
>  	exit(-1);
>  }
>  
> +static int is_pid(const char *str)
> +{
> +	int ch;
> +	for (; (ch = *str); str++) {
> +		if (!isdigit(ch))

ch must be cast to unsigned char before passing to isdigit().

> +			return 0;
> +	}
> +	return 1;
> +}
> +
> +static int netns_pids(int argc, char **argv)
> +{
> +	const char *name;
> +	char net_path[MAXPATHLEN];
> +	int netns;
> +	struct stat netst;
> +	DIR *dir;
> +	struct dirent *entry;
> +
> +	if (argc < 1) {
> +		fprintf(stderr, "No netns name specified\n");
> +		return -1;
> +	}
> +	if (argc > 1) {
> +		fprintf(stderr, "extra arguments specified\n");
> +		return -1;
> +	}

These, and many other return statements in this file which set the
process exit code, should return 1 (general failure) or 2 (user error)
rather than -1 (likely to be interpreted as command not found).

> +	name = argv[0];
> +	snprintf(net_path, sizeof(net_path), "%s/%s", NETNS_RUN_DIR, name);

No check for truncation?

> +	netns = open(net_path, O_RDONLY);

This file descriptor is leaked, though that probably doesn't really
matter.

[...]
> +static int netns_identify(int argc, char **argv)
> +{
> +	const char *pidstr;
> +	char net_path[MAXPATHLEN];
> +	int netns;
> +	struct stat netst;
> +	DIR *dir;
> +	struct dirent *entry;
> +
> +	if (argc < 1) {
> +		fprintf(stderr, "No pid specified\n");
> +		return -1;
> +	}
> +	if (argc > 1) {
> +		fprintf(stderr, "extra arguments specified\n");
> +		return -1;
> +	}
> +	pidstr = argv[0];
> +
> +	if (!is_pid(pidstr)) {
> +		fprintf(stderr, "Specified string '%s' is not a pid\n",
> +			pidstr);
> +		return -1;
> +	}
> +
> +	snprintf(net_path, sizeof(net_path), "/proc/%s/ns/net", pidstr);
> +	netns = open(net_path, O_RDONLY);
> +	if (netns < 0) {
> +		fprintf(stderr, "Cannot open network namespace: %s\n",
> +			strerror(errno));
> +		return -1;
> +	}
> +	if (fstat(netns, &netst) < 0) {
> +		fprintf(stderr, "Stat of netns failed: %s\n",
> +			strerror(errno));
> +		return -1;
> +	}
> +	dir = opendir(NETNS_RUN_DIR);
> +	if (!dir)
> +		return 0;

Shouldn't this be treated as an error?  Or, if you want it to succeed
when the kernel does not have netns functionality, then treat it as an
error if !dir && errno != ENOENT.

> +	while((entry = readdir(dir))) {
> +		char name_path[MAXPATHLEN];
> +		struct stat st;
> +
> +		if (strcmp(entry->d_name, ".") == 0)
> +			continue;
> +		if (strcmp(entry->d_name, "..") == 0)
> +			continue;
> +
> +		snprintf(name_path, sizeof(name_path), "%s/%s",	NETNS_RUN_DIR,
> +			entry->d_name);
> +
> +		if (stat(name_path, &st) != 0)
> +			continue;
> +
> +		if ((st.st_dev == netst.st_dev) &&
> +		    (st.st_ino == netst.st_ino)) {
> +			printf("%s\n", entry->d_name);
> +		}
> +	}
> +	closedir(dir);
> +	return 0;
> +	
> +}
[...]

It's a shame there isn't a more efficient way to do these lookups.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox