Netdev List
 help / color / mirror / Atom feed
* [PATCH 04/11] nf_conntrack_netlink: pass nf_conntrack_netlink module to netlink_dump_start
From: Gao feng @ 2012-09-26  4:52 UTC (permalink / raw)
  To: davem
  Cc: netfilter-devel, linux-rdma, netdev, eric.dumazet, pablo,
	steffen.klassert, linux-crypto, jengelh, stephen.hemminger,
	Gao feng
In-Reply-To: <1348635140-20225-1-git-send-email-gaofeng@cn.fujitsu.com>

use proper netlink_dump_control.done and .module to avoid panic.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_netlink.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9807f32..509a257 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -706,6 +706,7 @@ static int ctnetlink_done(struct netlink_callback *cb)
 		nf_ct_put((struct nf_conn *)cb->args[1]);
 	if (cb->data)
 		kfree(cb->data);
+	netlink_dump_done(cb);
 	return 0;
 }
 
@@ -1022,6 +1023,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_dump_table,
 			.done = ctnetlink_done,
+			.module = THIS_MODULE,
 		};
 #ifdef CONFIG_NF_CONNTRACK_MARK
 		if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
@@ -1706,6 +1708,8 @@ ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb,
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_ct_stat_cpu_dump,
+			.done = netlink_dump_done,
+			.module = THIS_MODULE,
 		};
 		return netlink_dump_start(ctnl, skb, nlh, &c);
 	}
@@ -2141,6 +2145,7 @@ static int ctnetlink_exp_done(struct netlink_callback *cb)
 {
 	if (cb->args[1])
 		nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
+	netlink_dump_done(cb);
 	return 0;
 }
 
@@ -2222,6 +2227,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_exp_dump_table,
 			.done = ctnetlink_exp_done,
+			.module = THIS_MODULE,
 		};
 		return netlink_dump_start(ctnl, skb, nlh, &c);
 	}
@@ -2660,6 +2666,8 @@ ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb,
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnetlink_exp_stat_cpu_dump,
+			.done = netlink_dump_done,
+			.module = THIS_MODULE,
 		};
 		return netlink_dump_start(ctnl, skb, nlh, &c);
 	}
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 10/11] ipset: pass ipset module to netlink_dump_start
From: Gao feng @ 2012-09-26  4:52 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netfilter-devel-u79uwXL29TY76Z2rM5mHXA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w, pablo-Cap9r6Oaw4JrovVCs/uTlw,
	steffen.klassert-opNxpl+3fjRBDgjK7y7TUQ,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, jengelh-9+2X+4sQBs8,
	stephen.hemminger-ZtmgI6mnKB3QT0dZR+AlfA, Gao feng,
	Jozsef Kadlecsik
In-Reply-To: <1348635140-20225-1-git-send-email-gaofeng-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>

use proper netlink_dump_control.done and .module to avoid panic.

Signed-off-by: Gao feng <gaofeng-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
Cc: Jozsef Kadlecsik <kadlec-K40Dz/62t/MgiyqX0sVFJYdd74u8MsAO@public.gmane.org>
---
 net/netfilter/ipset/ip_set_core.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 9730882..4c9c892 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -986,6 +986,7 @@ ip_set_dump_done(struct netlink_callback *cb)
 		pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name);
 		ip_set_put_byindex((ip_set_id_t) cb->args[1]);
 	}
+	netlink_dump_done(cb);
 	return 0;
 }
 
@@ -1176,6 +1177,7 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
 		struct netlink_dump_control c = {
 			.dump = ip_set_dump_start,
 			.done = ip_set_dump_done,
+			.module = THIS_MODULE,
 		};
 		return netlink_dump_start(ctnl, skb, nlh, &c);
 	}
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 07/11] nfnetlink_cttimeout: pass nfnetlink_cttimeout module to netlink_dump_start
From: Gao feng @ 2012-09-26  4:52 UTC (permalink / raw)
  To: davem
  Cc: netfilter-devel, linux-rdma, netdev, eric.dumazet, pablo,
	steffen.klassert, linux-crypto, jengelh, stephen.hemminger,
	Gao feng
In-Reply-To: <1348635140-20225-1-git-send-email-gaofeng@cn.fujitsu.com>

use proper netlink_dump_control.done and .module to avoid panic.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nfnetlink_cttimeout.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index cdecbc8..bc3e730 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -248,6 +248,8 @@ cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = ctnl_timeout_dump,
+			.done = netlink_dump_done,
+			.module = THIS_MODULE,
 		};
 		return netlink_dump_start(ctnl, skb, nlh, &c);
 	}
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 03/11] unix_diag: pass unix_diag module to netlink_dump_start
From: Gao feng @ 2012-09-26  4:52 UTC (permalink / raw)
  To: davem
  Cc: netfilter-devel, linux-rdma, netdev, eric.dumazet, pablo,
	steffen.klassert, linux-crypto, jengelh, stephen.hemminger,
	Gao feng
In-Reply-To: <1348635140-20225-1-git-send-email-gaofeng@cn.fujitsu.com>

use proper netlink_dump_control.done and .module to avoid panic.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/unix/diag.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/unix/diag.c b/net/unix/diag.c
index 750b134..5e09553 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -299,6 +299,8 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 	if (h->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = unix_diag_dump,
+			.done = netlink_dump_done,
+			.module = THIS_MODULE,
 		};
 		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
 	} else
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 02/11] inet_diag: pass inet_diag module to netlink_dump_start
From: Gao feng @ 2012-09-26  4:52 UTC (permalink / raw)
  To: davem
  Cc: netfilter-devel, linux-rdma, netdev, eric.dumazet, pablo,
	steffen.klassert, linux-crypto, jengelh, stephen.hemminger,
	Gao feng
In-Reply-To: <1348635140-20225-1-git-send-email-gaofeng@cn.fujitsu.com>

use proper netlink_dump_control.done and .module to avoid panic.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/ipv4/inet_diag.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 570e61f..36d4be5 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -972,6 +972,8 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 		{
 			struct netlink_dump_control c = {
 				.dump = inet_diag_dump_compat,
+				.done = netlink_dump_done,
+				.module = THIS_MODULE,
 			};
 			return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
 		}
@@ -1001,6 +1003,8 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		{
 			struct netlink_dump_control c = {
 				.dump = inet_diag_dump,
+				.done = netlink_dump_done,
+				.module = THIS_MODULE,
 			};
 			return netlink_dump_start(net->diag_nlsk, skb, h, &c);
 		}
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 01/11] netlink: add reference of module in netlink_dump_start
From: Gao feng @ 2012-09-26  4:52 UTC (permalink / raw)
  To: davem
  Cc: netfilter-devel, linux-rdma, netdev, eric.dumazet, pablo,
	steffen.klassert, linux-crypto, jengelh, stephen.hemminger,
	Gao feng

I get a panic when I use ss -a and rmmod inet_diag at the
same time.

it's because netlink_dump use inet_diag_dump witch function
belongs to module inet_diag.

I search the codes and find many modules have the same problem.
We need add reference of the module witch the cb->dump belongs
to.

since CONFIG_NET is bool,so netlink_dump_start in rtnetlink.c
and genetlink.c will never trigger this problem.

Thanks for all help from Stephen,Jan and Eric.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 include/linux/netlink.h  |    4 ++++
 net/netlink/af_netlink.c |   25 +++++++++++++++++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index f74dd13..a3641e3 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -232,6 +232,8 @@ struct netlink_callback {
 					struct netlink_callback *cb);
 	int			(*done)(struct netlink_callback *cb);
 	void			*data;
+	/* the module that dump function belong to */
+	struct module		*module;
 	u16			family;
 	u16			min_dump_alloc;
 	unsigned int		prev_seq, seq;
@@ -251,9 +253,11 @@ struct netlink_dump_control {
 	int (*dump)(struct sk_buff *skb, struct netlink_callback *);
 	int (*done)(struct netlink_callback*);
 	void *data;
+	struct module *module;
 	u16 min_dump_alloc;
 };
 
+extern int netlink_dump_done(struct netlink_callback *cb);
 extern int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 			      const struct nlmsghdr *nlh,
 			      struct netlink_dump_control *control);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5270238..011091c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1769,6 +1769,14 @@ errout_skb:
 	return err;
 }
 
+int netlink_dump_done(struct netlink_callback *cb)
+{
+	if (cb->module)
+		module_put(cb->module);
+	return 0;
+}
+EXPORT_SYMBOL(netlink_dump_done);
+
 int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 		       const struct nlmsghdr *nlh,
 		       struct netlink_dump_control *control)
@@ -1786,6 +1794,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->done = control->done;
 	cb->nlh = nlh;
 	cb->data = control->data;
+	cb->module = control->module;
 	cb->min_dump_alloc = control->min_dump_alloc;
 	atomic_inc(&skb->users);
 	cb->skb = skb;
@@ -1796,19 +1805,27 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 		return -ECONNREFUSED;
 	}
 	nlk = nlk_sk(sk);
-	/* A dump is in progress... */
+
 	mutex_lock(nlk->cb_mutex);
+	/* A dump is in progress... */
 	if (nlk->cb) {
 		mutex_unlock(nlk->cb_mutex);
 		netlink_destroy_callback(cb);
-		sock_put(sk);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto out;
+	}
+	/* add reference of module witch cb->dump belone to */
+	if (cb->module && !try_module_get(cb->module)) {
+		mutex_unlock(nlk->cb_mutex);
+		ret = -EPROTONOSUPPORT;
+		goto out;
 	}
+
 	nlk->cb = cb;
 	mutex_unlock(nlk->cb_mutex);
 
 	ret = netlink_dump(sk);
-
+out:
 	sock_put(sk);
 
 	if (ret)
-- 
1.7.7.6

^ permalink raw reply related

* Re: [PATCHv4 net-next] vxlan: virtual extensible lan
From: Stephen Hemminger @ 2012-09-26  4:36 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Chris Wright, David Miller, netdev
In-Reply-To: <CAEP_g=_PQSx_OVcU47OJFKgO_MpWSKStmGnVB76vSm=Z7r6mvg@mail.gmail.com>

On Tue, 25 Sep 2012 14:55:13 -0700
Jesse Gross <jesse@nicira.com> wrote:

> On Mon, Sep 24, 2012 at 2:50 PM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
> > +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
> [...]
> > +       /* Do PMTU */
> > +       if (skb->protocol == htons(ETH_P_IP)) {
> > +               df |= old_iph->frag_off & htons(IP_DF);
> > +               if (df && mtu < pkt_len) {
> > +                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> > +                                 htonl(mtu));
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +       else if (skb->protocol == htons(ETH_P_IPV6)) {
> > +               if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
> > +                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#endif
> 
> Won't this black hole packets if we need to generate ICMP messages?
> Since we're doing switching and not routing here icmp_send() doesn't
> necessarily have a route to the relevant endpoint.  It looks like
> Ethernet over GRE has this issue as well.

It is an interesting question about what is the correct way to handle packets
where the inner header is IPv6 or IPv4 with Don't Fragment set. As you mention
sending an ICMP response won't work because the tunnel endpoint is not part
of that IP network.

The simple option is to fragment it in the tunnel and since the fragmentation
is not visible to the overlay network, that is okay. But for PMTU discovery
it might be better to just drop the packet and not send a fragmented payload.

Some backbone networks don't allow fragmentation at all (in a futile attempt
to block DoS attacks and protect fragile Windows hosts). Fragmentation
brings all sorts of evil problems like the potential of corrupted assembly
because of sequence wrap; the checksum in the inner packet will defend against
that but tunnels are not supposed to rely on inner protocol data protection.

Or you can just do what Cisco and Microsoft do and just tell everyone
to set larger MTU on the backbone.

^ permalink raw reply

* Re: [PATCH net-next] net: remove sk_init() helper
From: Shan Wei @ 2012-09-26  3:51 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1348626205.26828.3563.camel@edumazet-glaptop>

Eric Dumazet said, at 2012/9/26 10:23:
> On Wed, 2012-09-26 at 10:07 +0800, Shan Wei wrote:
>> Eric Dumazet said, at 2012/9/26 5:32:
>>> From: Eric Dumazet <edumazet@google.com>
>>>
>>> It seems sk_init() has no value today and even does strange things :
>>>
>>> # grep . /proc/sys/net/core/?mem_*
>>> /proc/sys/net/core/rmem_default:212992
>>> /proc/sys/net/core/rmem_max:131071
>>> /proc/sys/net/core/wmem_default:212992
>>> /proc/sys/net/core/wmem_max:131071
>>>
>>> We can remove it completely.
>>
>> Ye, I have reported this problem.
>> http://lists.openwall.net/netdev/2012/04/27/49
>>
>> How about for small RAM ~16M?
> 
> It was maybe a concern 20 years ago, ( I ran linux on [34]86 computers
> with 12 or 16 MB of memory) but this makes no sense today with linux-3.7

I never play so lower memory computer. :-(
And now so lower memory device die out.

Reviewed-by: Shan Wei <davidshan@tencent.com>

^ permalink raw reply

* Re: [PATCH net-next] net: remove sk_init() helper
From: Eric Dumazet @ 2012-09-26  2:23 UTC (permalink / raw)
  To: Shan Wei; +Cc: David Miller, netdev
In-Reply-To: <5062636E.2060200@gmail.com>

On Wed, 2012-09-26 at 10:07 +0800, Shan Wei wrote:
> Eric Dumazet said, at 2012/9/26 5:32:
> > From: Eric Dumazet <edumazet@google.com>
> > 
> > It seems sk_init() has no value today and even does strange things :
> > 
> > # grep . /proc/sys/net/core/?mem_*
> > /proc/sys/net/core/rmem_default:212992
> > /proc/sys/net/core/rmem_max:131071
> > /proc/sys/net/core/wmem_default:212992
> > /proc/sys/net/core/wmem_max:131071
> > 
> > We can remove it completely.
> 
> Ye, I have reported this problem.
> http://lists.openwall.net/netdev/2012/04/27/49
> 
> How about for small RAM ~16M?

It was maybe a concern 20 years ago, ( I ran linux on [34]86 computers
with 12 or 16 MB of memory) but this makes no sense today with linux-3.7

^ permalink raw reply

* Re: [PATCH net-next] net: remove sk_init() helper
From: Shan Wei @ 2012-09-26  2:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1348608733.26828.3492.camel@edumazet-glaptop>

Eric Dumazet said, at 2012/9/26 5:32:
> From: Eric Dumazet <edumazet@google.com>
> 
> It seems sk_init() has no value today and even does strange things :
> 
> # grep . /proc/sys/net/core/?mem_*
> /proc/sys/net/core/rmem_default:212992
> /proc/sys/net/core/rmem_max:131071
> /proc/sys/net/core/wmem_default:212992
> /proc/sys/net/core/wmem_max:131071
> 
> We can remove it completely.

Ye, I have reported this problem.
http://lists.openwall.net/netdev/2012/04/27/49

How about for small RAM ~16M?


> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/net/sock.h |    2 --
>  net/core/sock.c    |   13 -------------
>  net/socket.c       |    6 ------
>  3 files changed, 21 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 42053759..02456b6 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -2229,8 +2229,6 @@ extern int net_msg_warn;
>  extern __u32 sysctl_wmem_max;
>  extern __u32 sysctl_rmem_max;
>  
> -extern void sk_init(void);
> -
>  extern int sysctl_optmem_max;
>  
>  extern __u32 sysctl_wmem_default;
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 727114c..f5a4260 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1464,19 +1464,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
>  }
>  EXPORT_SYMBOL_GPL(sk_setup_caps);
>  
> -void __init sk_init(void)
> -{
> -	if (totalram_pages <= 4096) {
> -		sysctl_wmem_max = 32767;
> -		sysctl_rmem_max = 32767;
> -		sysctl_wmem_default = 32767;
> -		sysctl_rmem_default = 32767;
> -	} else if (totalram_pages >= 131072) {
> -		sysctl_wmem_max = 131071;
> -		sysctl_rmem_max = 131071;
> -	}
> -}
> -
>  /*
>   *	Simple resource managers for sockets.
>   */
> diff --git a/net/socket.c b/net/socket.c
> index c641549..80dc7e8 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -2601,12 +2601,6 @@ static int __init sock_init(void)
>  		goto out;
>  
>  	/*
> -	 *      Initialize sock SLAB cache.
> -	 */
> -
> -	sk_init();
> -
> -	/*
>  	 *      Initialize skbuff SLAB cache
>  	 */
>  	skb_init();
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH linux-next] nf_defrag_ipv6: fix oops on module unloading
From: Cong Wang @ 2012-09-26  1:48 UTC (permalink / raw)
  To: Konstantin Khlebnikov; +Cc: netdev, David S. Miller
In-Reply-To: <20120925160750.30475.77562.stgit@zurg>

On Tue, 2012-09-25 at 20:07 +0400, Konstantin Khlebnikov wrote:
> fix copy-paste error introduced in linux-next commit
> "ipv6: add a new namespace for nf_conntrack_reasm"
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
> Cc: Amerigo Wang <amwang@redhat.com>
> Cc: David S. Miller <davem@davemloft.net>
> 

Oops! Sorry, my bad...

Acked-by: Cong Wang <amwang@redhat.com>

^ permalink raw reply

* linux-next: manual merge of the net-next tree with Linus' tree
From: Stephen Rothwell @ 2012-09-26  1:46 UTC (permalink / raw)
  To: David Miller, netdev
  Cc: linux-next, linux-kernel, "Linus Lüssing",
	Antonio Quartulli, Sven Eckelmann

[-- Attachment #1: Type: text/plain, Size: 1076 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got a conflict in
net/batman-adv/bat_iv_ogm.c between commit 7caf69fb9c50 ("batman-adv: Fix
symmetry check / route flapping in multi interface setups") from Linus'
tree and commit bbb1f90efba8 ("batman-adv: Don't break statements after
assignment operator") from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary (no action
is required).

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc net/batman-adv/bat_iv_ogm.c
index 469daab,df79300..0000000
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@@ -642,9 -652,9 +652,10 @@@ batadv_iv_ogm_orig_update(struct batadv
  	struct batadv_neigh_node *router = NULL;
  	struct batadv_orig_node *orig_node_tmp;
  	struct hlist_node *node;
 +	int if_num;
  	uint8_t sum_orig, sum_neigh;
  	uint8_t *neigh_addr;
+ 	uint8_t tq_avg;
  
  	batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
  		   "update_originator(): Searching and updating originator entry of received packet\n");

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* [PATCH -next] device.h: Add missing inline to #ifndef CONFIG_PRINTK dev_vprintk_emit
From: Joe Perches @ 2012-09-26  1:19 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Andrew Morton, Greg Kroah-Hartman, David S. Miller, Jason Baron,
	Jim Cromie, Kay Sievers, netdev, linux-kernel
In-Reply-To: <CAMuHMdVH-ni1NhbTDXgh82WUSAV85fahUjp1BoizVy6D2Xyvaw@mail.gmail.com>

Also add __printf() verification for format string.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Joe Perches <joe@perches.com>

---

 include/linux/device.h |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/device.h b/include/linux/device.h
index 8873603..86ef6ab 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -902,8 +902,9 @@ extern const char *dev_driver_string(const struct device *dev);
 
 #ifdef CONFIG_PRINTK
 
-extern int dev_vprintk_emit(int level, const struct device *dev,
-			    const char *fmt, va_list args);
+extern __printf(3, 0)
+int dev_vprintk_emit(int level, const struct device *dev,
+		     const char *fmt, va_list args);
 extern __printf(3, 4)
 int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...);
 
@@ -927,8 +928,9 @@ int _dev_info(const struct device *dev, const char *fmt, ...);
 
 #else
 
-static int dev_vprintk_emit(int level, const struct device *dev,
-			    const char *fmt, va_list args)
+static inline __printf(3, 0)
+int dev_vprintk_emit(int level, const struct device *dev,
+		     const char *fmt, va_list args)
 { return 0; }
 static inline __printf(3, 4)
 int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...)

^ permalink raw reply related

* [PATCH net] ipv6: return errno pointers consistently for fib6_add_1()
From: Lin Ming @ 2012-09-26  1:17 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

fib6_add_1() should consistently return errno pointers,
rather than a mixture of NULL and errno pointers.

Signed-off-by: Lin Ming <mlin@ss.pku.edu.cn>
---
 net/ipv6/ip6_fib.c |   20 ++++++--------------
 1 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 286acfc..24995a9 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -514,7 +514,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
 	ln = node_alloc();
 
 	if (!ln)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	ln->fn_bit = plen;
 
 	ln->parent = pn;
@@ -561,7 +561,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
 				node_free(in);
 			if (ln)
 				node_free(ln);
-			return NULL;
+			return ERR_PTR(-ENOMEM);
 		}
 
 		/*
@@ -611,7 +611,7 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
 		ln = node_alloc();
 
 		if (!ln)
-			return NULL;
+			return ERR_PTR(-ENOMEM);
 
 		ln->fn_bit = plen;
 
@@ -777,11 +777,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
 
 	if (IS_ERR(fn)) {
 		err = PTR_ERR(fn);
-		fn = NULL;
-	}
-
-	if (!fn)
 		goto out;
+	}
 
 	pn = fn;
 
@@ -820,15 +817,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
 					allow_create, replace_required);
 
 			if (IS_ERR(sn)) {
-				err = PTR_ERR(sn);
-				sn = NULL;
-			}
-			if (!sn) {
 				/* If it is failed, discard just allocated
 				   root, and then (in st_failure) stale node
 				   in main tree.
 				 */
 				node_free(sfn);
+				err = PTR_ERR(sn);
 				goto st_failure;
 			}
 
@@ -843,10 +837,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
 
 			if (IS_ERR(sn)) {
 				err = PTR_ERR(sn);
-				sn = NULL;
-			}
-			if (!sn)
 				goto st_failure;
+			}
 		}
 
 		if (!fn->leaf) {
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH] netfilter: xt_limit: have r->cost != 0 case work
From: pablo @ 2012-09-25 23:34 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1348616051-5694-1-git-send-email-pablo@netfilter.org>

From: Jan Engelhardt <jengelh@inai.de>

Commit v2.6.19-rc1~1272^2~41 tells us that r->cost != 0 can happen when
a running state is saved to userspace and then reinstated from there.

Make sure that private xt_limit area is initialized with correct values.
Otherwise, random matchings due to use of uninitialized memory.

Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_limit.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 5c22ce8..a4c1e45 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -117,11 +117,11 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
 
 	/* For SMP, we only want to use one set of state. */
 	r->master = priv;
+	/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
+	   128. */
+	priv->prev = jiffies;
+	priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
 	if (r->cost == 0) {
-		/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
-		   128. */
-		priv->prev = jiffies;
-		priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
 		r->credit_cap = priv->credit; /* Credits full. */
 		r->cost = user2credits(r->avg);
 	}
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH] netfilter fix for 3.6-rc7
From: pablo @ 2012-09-25 23:34 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

From: Pablo Neira Ayuso <pablo@netfilter.org>

Hi David,

If time allows, I'd appreciate if you can take the following fix
for the xt_limit match.

As Jan indicates, random things may occur while using the xt_limit
match due to use of uninitialized memory.

You can pull this change from:

git://1984.lsi.us.es/nf master

Thanks!

Jan Engelhardt (1):
  netfilter: xt_limit: have r->cost != 0 case work

 net/netfilter/xt_limit.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

-- 
1.7.10.4


^ permalink raw reply

* [PATCHv5 net-next] vxlan: virtual extensible lan
From: Stephen Hemminger @ 2012-09-25 22:09 UTC (permalink / raw)
  To: Jesse Gross, Chris Wright, David Miller; +Cc: netdev
In-Reply-To: <CAEP_g=_PQSx_OVcU47OJFKgO_MpWSKStmGnVB76vSm=Z7r6mvg@mail.gmail.com>

This is an implementation of Virtual eXtensible Local Area Network
as described in draft RFC:
  http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02

The driver integrates a Virtual Tunnel Endpoint (VTEP) functionality
that learns MAC to IP address mapping. 

This implementation has not been tested for Interoperation with
other equipment.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v5 - drop MTU discovery since network is overlaid
     use common code to do ECN decapsulation
v4 - fix ecn and set state of fdb entries
v3 - fix ordering of change versus migration message
v2 - fix use of ip header after pskb_may_pull

 Documentation/networking/vxlan.txt |   36 +
 drivers/net/Kconfig                |   13 
 drivers/net/Makefile               |    1 
 drivers/net/vxlan.c                | 1164 +++++++++++++++++++++++++++++++++++++
 include/linux/if_link.h            |   14 
 5 files changed, 1228 insertions(+)

--- a/drivers/net/Kconfig	2012-09-24 17:23:51.888577602 -0700
+++ b/drivers/net/Kconfig	2012-09-25 14:03:41.564362855 -0700
@@ -149,6 +149,19 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+config VXLAN
+       tristate "Virtual eXtensible Local Area Network (VXLAN)"
+       depends on EXPERIMENTAL
+       ---help---
+	  This allows one to create vxlan virtual interfaces that provide
+	  Layer 2 Networks over Layer 3 Networks. VXLAN is often used
+	  to tunnel virtual network infrastructure in virtualized environments.
+	  For more information see:
+	    http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-02
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called vxlan.
+
 config NETCONSOLE
 	tristate "Network console logging support"
 	---help---
--- a/drivers/net/Makefile	2012-09-24 17:23:51.888577602 -0700
+++ b/drivers/net/Makefile	2012-09-25 14:03:41.564362855 -0700
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_TEAM) += team/
 obj-$(CONFIG_TUN) += tun.o
 obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VXLAN) += vxlan.o
 
 #
 # Networking Drivers
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/drivers/net/vxlan.c	2012-09-25 15:08:03.546231656 -0700
@@ -0,0 +1,1164 @@
+/*
+ * VXLAN: Virtual eXtensiable Local Area Network
+ *
+ * Copyright (c) 2012 Vyatta Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * TODO
+ *  - use IANA UDP port number (when defined)
+ *  - IPv6 (not in RFC)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/version.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#define VXLAN_VERSION	"0.0"
+
+#define VNI_HASH_BITS	10
+#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS	8
+#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
+
+#define VXLAN_N_VID	(1u << 24)
+#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
+
+#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */
+#define FDB_AGE_TIME	 (300 * HZ)	/* drop if not used in 5 min */
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+#define VXLAN_HEADROOM (sizeof(struct iphdr)		\
+			+ sizeof(struct udphdr)		\
+			+ sizeof(struct vxlanhdr))
+
+/* UDP port for VXLAN traffic. */
+static unsigned int vxlan_port __read_mostly = 8472;
+module_param_named(port, vxlan_port, uint, 0);
+MODULE_PARM_DESC(vxlan_port, "Destination UDP port");
+
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
+/* per-net private data for this module */
+static unsigned int vxlan_net_id;
+struct vxlan_net {
+	struct socket	  *sock;	/* UDP encap socket */
+	struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+	struct hlist_node hlist;	/* linked list of entries */
+	struct rcu_head	  rcu;
+	unsigned long	  updated;	/* jiffies */
+	unsigned long	  used;
+	__be32		  remote_ip;
+	u16		  state;	/* see ndm_state */
+	u8		  eth_addr[ETH_ALEN];
+};
+
+/* Per-cpu network traffic stats */
+struct vxlan_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	struct vxlan_stats __percpu *stats;
+	__u32		  vni;		/* virtual network id */
+	__be32	          gaddr;	/* multicast group */
+	__be32		  saddr;	/* source address */
+	unsigned int      link;		/* link to multicast over */
+	__u8		  tos;		/* TOS override */
+	__u8		  ttl;
+	bool		  learn;
+
+	struct timer_list age_timer;
+	spinlock_t	  hash_lock;
+	struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+
+static inline struct hlist_head *vni_head(struct net *net, u32 id)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	return &vn->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id)
+{
+	struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(vxlan, node, vni_head(net, id), hlist) {
+		if (vxlan->vni == id)
+			return vxlan;
+	}
+
+	return NULL;
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+			   const struct vxlan_fdb *fdb,
+			   u32 portid, u32 seq, int type, unsigned int flags)
+{
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	memset(ndm, 0, sizeof(*ndm));
+	ndm->ndm_family	= AF_BRIDGE;
+	ndm->ndm_state = fdb->state;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+		goto nla_put_failure;
+
+	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt	 = 0;
+
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct ndmsg))
+		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
+			     const struct vxlan_fdb *fdb, int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+	if (skb == NULL)
+		goto errout;
+
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	return;
+errout:
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+	/* could be optimized for unaligned access */
+	u32 a = addr[5] << 8 | addr[4];
+	u32 b = addr[3] << 24 | addr[2] << 16 | addr[1] << 8 | addr[0];
+
+	return jhash_2words(a, b, vxlan_salt);
+}
+
+/* Hash chain to use given mac address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+						const u8 *mac)
+{
+	return &vxlan->fdb_head[hash_32(eth_hash(mac), FDB_HASH_BITS)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+					const u8 *mac)
+
+{
+	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+	struct vxlan_fdb *f;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_rcu(f, node, head, hlist) {
+		if (compare_ether_addr(mac, f->eth_addr) == 0)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+			    const u8 *mac, __be32 ip,
+			    __u16 state, __u16 flags)
+{
+	struct vxlan_fdb *f;
+
+	f = vxlan_find_mac(vxlan, mac);
+	if (f) {
+		if (flags & NLM_F_EXCL) {
+			netdev_dbg(vxlan->dev,
+				   "lost race to create %pM\n", mac);
+			return -EEXIST;
+		}
+	} else {
+		if (!(flags & NLM_F_CREATE))
+			return -ENOENT;
+
+		netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+		f = kmalloc(sizeof(*f), GFP_ATOMIC);
+		if (!f)
+			return -ENOMEM;
+
+		f->remote_ip = ip;
+		memcpy(f->eth_addr, mac, ETH_ALEN);
+		hlist_add_head_rcu(&f->hlist,
+				   vxlan_fdb_head(vxlan, mac));
+		f->state = 0;
+	}
+
+	if (f->state != state) {
+		f->state = state;
+		f->updated = f->used = jiffies;
+		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
+	}
+
+	return 0;
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+	netdev_dbg(vxlan->dev,
+		    "delete %pM\n", f->eth_addr);
+
+	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
+
+	hlist_del_rcu(&f->hlist);
+	kfree_rcu(f, rcu);
+}
+
+/* Add static entry (via netlink) */
+static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			 struct net_device *dev,
+			 const unsigned char *addr, u16 flags)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__be32 ip;
+	int err;
+
+	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
+		pr_info("RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
+		return -EINVAL;
+	}
+
+	if (tb[NDA_DST] == NULL)
+		return -EINVAL;
+
+	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
+		return -EAFNOSUPPORT;
+
+	ip = nla_get_be32(tb[NDA_DST]);
+
+	spin_lock_bh(&vxlan->hash_lock);
+	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Delete entry (via netlink) */
+static int vxlan_fdb_delete(struct ndmsg *ndm, struct net_device *dev,
+			    const unsigned char *addr)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err = -ENOENT;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	f = vxlan_find_mac(vxlan, addr);
+	if (f) {
+		vxlan_fdb_destroy(vxlan, f);
+		err = 0;
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	return err;
+}
+
+/* Dump forwarding table */
+static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  struct net_device *dev, int idx)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned int h;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct vxlan_fdb *f;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_rcu(f, n, &vxlan->fdb_head[h], hlist) {
+			if (idx < cb->args[0])
+				goto skip;
+
+			if (vxlan_fdb_info(skb, vxlan, f,
+					    NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq,
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) < 0)
+				break;
+skip:
+			++idx;
+		}
+	}
+
+	return idx;
+}
+
+/* Watch incoming packets to learn mapping between Ethernet address
+ * and Tunnel endpoint.
+ */
+static void vxlan_snoop(struct net_device *dev,
+			__be32 src_ip, const u8 *src_mac)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_fdb *f;
+	int err;
+
+	f = vxlan_find_mac(vxlan, src_mac);
+	if (likely(f)) {
+		f->used = jiffies;
+		if (likely(f->remote_ip == src_ip))
+			return;
+
+		if (net_ratelimit())
+			netdev_info(dev,
+				    "%pM migrated from %pI4 to %pI4\n",
+				    src_mac, &f->remote_ip, &src_ip);
+
+		f->remote_ip = src_ip;
+		f->updated = jiffies;
+	} else {
+		/* learned new entry */
+		spin_lock(&vxlan->hash_lock);
+		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
+				       NUD_REACHABLE,
+				       NLM_F_EXCL|NLM_F_CREATE);
+		spin_unlock(&vxlan->hash_lock);
+	}
+}
+
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn,
+			     const struct vxlan_dev *this)
+{
+	const struct vxlan_dev *vxlan;
+	struct hlist_node *node;
+	unsigned h;
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		hlist_for_each_entry(vxlan, node, &vn->vni_list[h], hlist) {
+			if (vxlan == this)
+				continue;
+
+			if (!netif_running(vxlan->dev))
+				continue;
+
+			if (vxlan->gaddr == this->gaddr)
+				return true;
+		}
+
+	return false;
+}
+
+/* kernel equivalent to IP_ADD_MEMBERSHIP */
+static int vxlan_join_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+	int err;
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast join */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+
+/* kernel equivalent to IP_DROP_MEMBERSHIP */
+static int vxlan_leave_group(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+	int err = 0;
+	struct sock *sk = vn->sock->sk;
+	struct ip_mreqn mreq = {
+		.imr_multiaddr.s_addr = vxlan->gaddr,
+	};
+
+	if (vxlan_group_used(vn, vxlan))
+		return 0;
+
+	/* Need to drop RTNL to call multicast leave */
+	rtnl_unlock();
+	lock_sock(sk);
+	err = ip_mc_leave_group(sk, &mreq);
+	release_sock(sk);
+	rtnl_lock();
+
+	return err;
+}
+
+/* Callback from net/ipv4/udp.c to receive packets */
+static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct iphdr *oip;
+	struct vxlanhdr *vxh;
+	struct vxlan_dev *vxlan;
+	struct vxlan_stats *stats;
+	__u32 vni;
+	int err;
+
+	/* pop off outer UDP header */
+	__skb_pull(skb, sizeof(struct udphdr));
+
+	/* Need Vxlan and inner Ethernet header to be present */
+	if (!pskb_may_pull(skb,
+			   sizeof(struct vxlanhdr) + sizeof(struct ethhdr)))
+		goto error;
+
+	oip = ip_hdr(skb);
+
+	/* Drop packets with reserved bits set */
+	vxh = (struct vxlanhdr *) skb->data;
+	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+	    (vxh->vx_vni & htonl(0xff)))
+		goto error;
+
+	__skb_pull(skb, sizeof(struct vxlanhdr));
+
+	/* Is this VNI defined? */
+	vni = ntohl(vxh->vx_vni) >> 8;
+	vxlan = vxlan_find_vni(sock_net(sk), vni);
+	if (!vxlan)
+		goto drop;
+
+	/* Ignore packets if device is not up */
+	if (!netif_running(vxlan->dev))
+		goto drop;
+
+	/* Re-examine inner Ethernet packet */
+	skb->protocol = eth_type_trans(skb, vxlan->dev);
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Ignore packet loops (and multicast echo) */
+	if (compare_ether_addr(eth_hdr(skb)->h_source,
+			       vxlan->dev->dev_addr) == 0)
+		goto drop;
+
+	/* Check for multicast group configuration errors */
+	if (IN_MULTICAST(ntohl(oip->daddr)) &&
+	    oip->daddr != vxlan->gaddr) {
+		if (net_ratelimit())
+			netdev_notice(vxlan->dev,
+				      "group address %pI4 does not match\n",
+				      &oip->daddr);
+		goto drop;
+	}
+
+	if (vxlan->learn)
+		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);
+
+	__skb_tunnel_rx(skb, vxlan->dev);
+	skb_reset_network_header(skb);
+
+	err = IP_ECN_decapsulate(oip, skb);
+	if (unlikely(err)) {
+		if (log_ecn_error)
+			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+					     &oip->saddr, oip->tos);
+		if (err > 1) {
+			++vxlan->dev->stats.rx_frame_errors;
+			++vxlan->dev->stats.rx_errors;
+			goto drop;
+		}
+	}
+
+	stats = this_cpu_ptr(vxlan->stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += skb->len;
+	u64_stats_update_end(&stats->syncp);
+
+	netif_rx(skb);
+
+	return 0;
+error:
+	/* Put UDP header back */
+	__skb_push(skb, sizeof(struct udphdr));
+
+	return 1;
+drop:
+	/* Consume bad packet */
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Extract dsfield from inner protocol */
+static inline u8 vxlan_get_dsfield(const struct iphdr *iph,
+				   const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+	else
+		return 0;
+}
+
+/* Propogate ECN bits out */
+static inline u8 vxlan_ecn_encap(u8 tos,
+				 const struct iphdr *iph,
+				 const struct sk_buff *skb)
+{
+	u8 inner = vxlan_get_dsfield(iph, skb);
+
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on hash of flow if available
+ *                       otherwise use a random value
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	const struct ethhdr *eth;
+	const struct iphdr *old_iph;
+	struct iphdr *iph;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	struct flowi4 fl4;
+	struct vxlan_fdb *f;
+	unsigned int pkt_len = skb->len;
+	u32 hash;
+	__be32 dst;
+	__be16 df = 0;
+	__u8 tos, ttl;
+	int err;
+
+	/* Need space for new headers (invalidates iph ptr) */
+	if (skb_cow_head(skb, VXLAN_HEADROOM))
+		goto drop;
+
+	eth = (void *)skb->data;
+	old_iph = ip_hdr(skb);
+
+	if (!is_multicast_ether_addr(eth->h_dest) &&
+	    (f = vxlan_find_mac(vxlan, eth->h_dest)))
+		dst = f->remote_ip;
+	else if (vxlan->gaddr) {
+		dst = vxlan->gaddr;
+	} else
+		goto drop;
+
+	ttl = vxlan->ttl;
+	if (!ttl && IN_MULTICAST(ntohl(dst)))
+		ttl = 1;
+
+	tos = vxlan->tos;
+	if (tos == 1)
+		tos = vxlan_get_dsfield(old_iph, skb);
+
+	hash = skb_get_rxhash(skb);
+
+	rt = ip_route_output_gre(dev_net(dev), &fl4, dst,
+				 vxlan->saddr, vxlan->vni,
+				 RT_TOS(tos), vxlan->link);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &dst);
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to %pI4\n", &dst);
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(vxlan->vni << 8);
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = htons(vxlan_port);
+	uh->source = hash ? :random32();
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	__skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	iph		= ip_hdr(skb);
+	iph->version	= 4;
+	iph->ihl	= sizeof(struct iphdr) >> 2;
+	iph->frag_off	= df;
+	iph->protocol	= IPPROTO_UDP;
+	iph->tos	= vxlan_ecn_encap(tos, old_iph, skb);
+	iph->daddr	= fl4.daddr;
+	iph->saddr	= fl4.saddr;
+	iph->ttl	= ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+	/* See __IPTUNNEL_XMIT */
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+
+	err = ip_local_out(skb);
+	if (likely(net_xmit_eval(err) == 0)) {
+		struct vxlan_stats *stats = this_cpu_ptr(vxlan->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += pkt_len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_errors++;
+		dev->stats.tx_aborted_errors++;
+	}
+	return NETDEV_TX_OK;
+
+drop:
+	dev->stats.tx_dropped++;
+	goto tx_free;
+
+tx_error:
+	dev->stats.tx_errors++;
+tx_free:
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+	unsigned int h;
+
+	if (!netif_running(vxlan->dev))
+		return;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			unsigned long timeout = f->used + FDB_AGE_TIME;
+
+			if (f->state == NUD_PERMANENT)
+				continue;
+
+			if (time_before_eq(timeout, jiffies)) {
+				netdev_dbg(vxlan->dev,
+					   "garbage collect %pM\n",
+					   f->eth_addr);
+				f->state = NUD_STALE;
+				vxlan_fdb_destroy(vxlan, f);
+			} else if (time_before(timeout, next_timer))
+				next_timer = timeout;
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+
+	mod_timer(&vxlan->age_timer, next_timer);
+}
+
+/* Setup stats when device is created */
+static int vxlan_init(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	vxlan->stats = alloc_percpu(struct vxlan_stats);
+	if (!vxlan->stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;
+
+	if (vxlan->gaddr) {
+		err = vxlan_join_group(dev);
+		if (err)
+			return err;
+	}
+
+	mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+	return 0;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+	unsigned h;
+
+	spin_lock_bh(&vxlan->hash_lock);
+	for (h = 0; h < FDB_HASH_SIZE; ++h) {
+		struct hlist_node *p, *n;
+		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+			struct vxlan_fdb *f
+				= container_of(p, struct vxlan_fdb, hlist);
+			vxlan_fdb_destroy(vxlan, f);
+		}
+	}
+	spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (vxlan->gaddr)
+		vxlan_leave_group(dev);
+
+	del_timer_sync(&vxlan->age_timer);
+
+	vxlan_flush(vxlan);
+
+	return 0;
+}
+
+/* Merge per-cpu statistics */
+static struct rtnl_link_stats64 *vxlan_stats64(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct vxlan_stats tmp, sum = { 0 };
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		unsigned int start;
+		const struct vxlan_stats *stats
+			= per_cpu_ptr(vxlan->stats, cpu);
+
+		do {
+			start = u64_stats_fetch_begin_bh(&stats->syncp);
+			memcpy(&tmp, stats, sizeof(tmp));
+		} while (u64_stats_fetch_retry_bh(&stats->syncp, start));
+
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
+	}
+
+	stats->tx_bytes   = sum.tx_bytes;
+	stats->tx_packets = sum.tx_packets;
+	stats->rx_bytes   = sum.rx_bytes;
+	stats->rx_packets = sum.rx_packets;
+
+	stats->tx_dropped = dev->stats.tx_dropped;
+	stats->tx_errors  = dev->stats.tx_errors;
+	stats->tx_carrier_errors  = dev->stats.tx_carrier_errors;
+	stats->collisions  = dev->stats.collisions;
+
+	return stats;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+	.ndo_init		= vxlan_init,
+	.ndo_open		= vxlan_open,
+	.ndo_stop		= vxlan_stop,
+	.ndo_start_xmit		= vxlan_xmit,
+	.ndo_get_stats64	= vxlan_stats64,
+	.ndo_set_rx_mode	= vxlan_set_multicast_list,
+	.ndo_change_mtu		= eth_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fdb_add		= vxlan_fdb_add,
+	.ndo_fdb_del		= vxlan_fdb_delete,
+	.ndo_fdb_dump		= vxlan_fdb_dump,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+	.name = "vxlan",
+};
+
+static void vxlan_free(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	free_percpu(vxlan->stats);
+	free_netdev(dev);
+}
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	unsigned h;
+
+	eth_hw_addr_random(dev);
+	ether_setup(dev);
+
+	dev->netdev_ops = &vxlan_netdev_ops;
+	dev->destructor = vxlan_free;
+	SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+	dev->tx_queue_len = 0;
+	dev->features	|= NETIF_F_LLTX;
+	dev->features	|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
+
+	spin_lock_init(&vxlan->hash_lock);
+
+	init_timer_deferrable(&vxlan->age_timer);
+	vxlan->age_timer.function = vxlan_cleanup;
+	vxlan->age_timer.data = (unsigned long) vxlan;
+
+	vxlan->dev = dev;
+
+	for (h = 0; h < FDB_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
+	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
+	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			pr_debug("invalid link address (not ethernet)\n");
+			return -EINVAL;
+		}
+
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			pr_debug("invalid all zero ethernet address\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	if (!data)
+		return -EINVAL;
+
+	if (data[IFLA_VXLAN_ID]) {
+		__u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+		if (id >= VXLAN_VID_MASK)
+			return -ERANGE;
+	}
+
+	if (data[IFLA_VXLAN_GROUP]) {
+		__be32 gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		if (!IN_MULTICAST(ntohl(gaddr))) {
+			pr_debug("group address is not IPv4 multicast\n");
+			return -EADDRNOTAVAIL;
+		}
+	}
+
+	return 0;
+}
+
+static int vxlan_newlink(struct net *net, struct net_device *dev,
+			 struct nlattr *tb[], struct nlattr *data[])
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	__u32 vni;
+	int err;
+
+	if (!data[IFLA_VXLAN_ID])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
+	if (vxlan_find_vni(net, vni)) {
+		pr_info("duplicate VNI %u\n", vni);
+		return -EEXIST;
+	}
+	vxlan->vni = vni;
+
+	if (data[IFLA_VXLAN_GROUP])
+		vxlan->gaddr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+
+	if (data[IFLA_VXLAN_LOCAL])
+		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+
+	if (data[IFLA_VXLAN_LINK])
+		vxlan->link = nla_get_u32(data[IFLA_VXLAN_LINK]);
+
+	if (data[IFLA_VXLAN_TOS])
+		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
+
+	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
+		vxlan->learn = true;
+
+	err = register_netdevice(dev);
+	if (!err)
+		hlist_add_head_rcu(&vxlan->hlist, vni_head(net, vxlan->vni));
+
+	return err;
+}
+
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	hlist_del_rcu(&vxlan->hlist);
+
+	unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+
+	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
+		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
+		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
+		0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	if (nla_put_u32(skb, IFLA_VXLAN_ID, vxlan->vni))
+		goto nla_put_failure;
+
+	if (vxlan->gaddr && nla_put_u32(skb, IFLA_VXLAN_GROUP, vxlan->gaddr))
+		goto nla_put_failure;
+
+	if (vxlan->link && nla_put_u32(skb, IFLA_VXLAN_LINK, vxlan->link))
+		goto nla_put_failure;
+
+	if (vxlan->saddr && nla_put_u32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
+		goto nla_put_failure;
+
+	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
+	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+	.kind		= "vxlan",
+	.maxtype	= IFLA_VXLAN_MAX,
+	.policy		= vxlan_policy,
+	.priv_size	= sizeof(struct vxlan_dev),
+	.setup		= vxlan_setup,
+	.validate	= vxlan_validate,
+	.newlink	= vxlan_newlink,
+	.dellink	= vxlan_dellink,
+	.get_size	= vxlan_get_size,
+	.fill_info	= vxlan_fill_info,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct sock *sk;
+	struct sockaddr_in addr;
+	int rc;
+	unsigned h;
+
+	/* Create UDP socket for encapsulation receive. */
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vn->sock);
+	if (rc < 0) {
+		pr_debug("UDP socket create failed\n");
+		return rc;
+	}
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = INADDR_ANY;
+	addr.sin_port = htons(vxlan_port);
+
+	rc = kernel_bind(vn->sock, (struct sockaddr *) &addr, sizeof(addr));
+	if (rc < 0) {
+		pr_debug("bind for port %u failed %d\n", vxlan_port, rc);
+		sock_release(vn->sock);
+		vn->sock = NULL;
+		return rc;
+	}
+
+	/* Disable multicast loopback */
+	sk = vn->sock->sk;
+	inet_sk(sk)->mc_loop = 0;
+
+	/* Mark socket as an encapsulation socket. */
+	udp_sk(sk)->encap_type = 1;
+	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
+	udp_encap_enable();
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vn->vni_list[h]);
+
+	return 0;
+}
+
+static __net_exit void vxlan_exit_net(struct net *net)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+	if (vn->sock) {
+		sock_release(vn->sock);
+		vn->sock = NULL;
+	}
+}
+
+static struct pernet_operations vxlan_net_ops = {
+	.init = vxlan_init_net,
+	.exit = vxlan_exit_net,
+	.id   = &vxlan_net_id,
+	.size = sizeof(struct vxlan_net),
+};
+
+static int __init vxlan_init_module(void)
+{
+	int rc;
+
+	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+	rc = register_pernet_device(&vxlan_net_ops);
+	if (rc)
+		goto out1;
+
+	rc = rtnl_link_register(&vxlan_link_ops);
+	if (rc)
+		goto out2;
+
+	return 0;
+
+out2:
+	unregister_pernet_device(&vxlan_net_ops);
+out1:
+	return rc;
+}
+module_init(vxlan_init_module);
+
+static void __exit vxlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&vxlan_link_ops);
+	unregister_pernet_device(&vxlan_net_ops);
+}
+module_exit(vxlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(VXLAN_VERSION);
+MODULE_AUTHOR("Stephen Hemminger <shemminger@vyatta.com>");
+MODULE_ALIAS_RTNL_LINK("vxlan");
--- a/include/linux/if_link.h	2012-09-24 17:23:51.888577602 -0700
+++ b/include/linux/if_link.h	2012-09-25 14:03:41.568362815 -0700
@@ -272,6 +272,20 @@ enum macvlan_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VXLAN section */
+enum {
+	IFLA_VXLAN_UNSPEC,
+	IFLA_VXLAN_ID,
+	IFLA_VXLAN_GROUP,
+	IFLA_VXLAN_LINK,
+	IFLA_VXLAN_LOCAL,
+	IFLA_VXLAN_TTL,
+	IFLA_VXLAN_TOS,
+	IFLA_VXLAN_LEARNING,
+	__IFLA_VXLAN_MAX
+};
+#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/Documentation/networking/vxlan.txt	2012-09-25 14:03:41.568362815 -0700
@@ -0,0 +1,36 @@
+Virtual eXtensible Local Area Networking documentation
+======================================================
+
+The VXLAN protocol is a tunnelling protocol that is designed to
+solve the problem of limited number of available VLAN's (4096).
+With VXLAN identifier is expanded to 24 bits.
+
+It is a draft RFC standard, that is implemented by Cisco Nexus,
+Vmware and Brocade. The protocol runs over UDP using a single
+destination port (still not standardized by IANA).
+This document describes the Linux kernel tunnel device,
+there is also an implantation of VXLAN for Openvswitch.
+
+Unlike most tunnels, a VXLAN is a 1 to N network, not just point
+to point. A VXLAN device can either dynamically learn the IP address
+of the other end, in a manner similar to a learning bridge, or the
+forwarding entries can be configured statically.
+
+The management of vxlan is done in a similar fashion to it's
+too closest neighbors GRE and VLAN. Configuring VXLAN requires
+the version of iproute2 that matches the kernel release
+where VXLAN was first merged upstream.
+
+1. Create vxlan device
+  # ip li add vxlan0 type vxlan id 42 group 239.1.1.1 dev eth1
+
+This creates a new device (vxlan0). The device uses the
+the multicast group 239.1.1.1 over eth1 to handle packets where
+no entry is in the forwarding table.
+
+2. Delete vxlan device
+  # ip link delete vxlan0
+
+3. Show vxlan info
+  # ip -d show vxlan0
+

^ permalink raw reply

* Re: [PATCHv4 net-next] vxlan: virtual extensible lan
From: Stephen Hemminger @ 2012-09-25 22:03 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Chris Wright, David Miller, netdev
In-Reply-To: <CAEP_g=_PQSx_OVcU47OJFKgO_MpWSKStmGnVB76vSm=Z7r6mvg@mail.gmail.com>

On Tue, 25 Sep 2012 14:55:13 -0700
Jesse Gross <jesse@nicira.com> wrote:

> On Mon, Sep 24, 2012 at 2:50 PM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
> > +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
> [...]
> > +       /* Do PMTU */
> > +       if (skb->protocol == htons(ETH_P_IP)) {
> > +               df |= old_iph->frag_off & htons(IP_DF);
> > +               if (df && mtu < pkt_len) {
> > +                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> > +                                 htonl(mtu));
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +       else if (skb->protocol == htons(ETH_P_IPV6)) {
> > +               if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
> > +                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
> > +                       ip_rt_put(rt);
> > +                       goto tx_error;
> > +               }
> > +       }
> > +#endif
> 
> Won't this black hole packets if we need to generate ICMP messages?
> Since we're doing switching and not routing here icmp_send() doesn't
> necessarily have a route to the relevant endpoint.  It looks like
> Ethernet over GRE has this issue as well.

Good catch. The IP network space of the tunneled network will likely be
completely separate from the overlay network.

^ permalink raw reply

* Re: [PATCHv4 net-next] vxlan: virtual extensible lan
From: Jesse Gross @ 2012-09-25 21:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Chris Wright, David Miller, netdev
In-Reply-To: <20120924145031.3b0122e6@nehalam.linuxnetplumber.net>

On Mon, Sep 24, 2012 at 2:50 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> +static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
[...]
> +       /* Do PMTU */
> +       if (skb->protocol == htons(ETH_P_IP)) {
> +               df |= old_iph->frag_off & htons(IP_DF);
> +               if (df && mtu < pkt_len) {
> +                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> +                                 htonl(mtu));
> +                       ip_rt_put(rt);
> +                       goto tx_error;
> +               }
> +       }
> +#if IS_ENABLED(CONFIG_IPV6)
> +       else if (skb->protocol == htons(ETH_P_IPV6)) {
> +               if (mtu >= IPV6_MIN_MTU && mtu < pkt_len) {
> +                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
> +                       ip_rt_put(rt);
> +                       goto tx_error;
> +               }
> +       }
> +#endif

Won't this black hole packets if we need to generate ICMP messages?
Since we're doing switching and not routing here icmp_send() doesn't
necessarily have a route to the relevant endpoint.  It looks like
Ethernet over GRE has this issue as well.

^ permalink raw reply

* [PATCH NEXT V3] rtlwifi: rtl8192c: rtl8192ce: Add support for B-CUT version of RTL8188CE
From: Larry Finger @ 2012-09-25 21:51 UTC (permalink / raw)
  To: linville; +Cc: Larry Finger, netdev, Anisse Astier, Li Chaoming

Realtek devices with designation RTL8188CE-VL have the so-called B-cut
of the wireless chip. This patch adds the special programming needed by
these devices. In addition, a variable that was static has been moved into
the private data area as it is now needed in two different routines. This
change also fixes a minor bug that would be present if a system had more
than one RTL81{88,92}CE devices. Other drivers in the rtlwifi family had
already made this change, thus the variable already exists in the private
data structure.

Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Anisse Astier <anisse@astier.eu>
Cc: Li Chaoming <chaoming_li@realsil.com.cn>
---
 rtl8192c/phy_common.c |   21 +++++++++++++++++
 rtl8192ce/def.h       |    3 ++
 rtl8192ce/hw.c        |   60 +++++++++++++++++++++++++++++++++++++++++++++-----
 rtl8192ce/phy.c       |    2 +
 rtl8192ce/sw.c        |    6 +----
 rtl8192ce/trx.c       |    4 +--
 6 files changed, 85 insertions(+), 11 deletions(-)
---
V1 => V2	Remove extraneous white space.
V2 => V3	A change that is not part of the B-cut change and was introduced
		in V2 is moved to a separate patch. 

John,

This patch is too invasive to backport to the stable kernels, thus it should
be applied to 3.7.

Thanks,

Larry
---

Index: wireless-next/drivers/net/wireless/rtlwifi/rtl8192c/phy_common.c
===================================================================
--- wireless-next.orig/drivers/net/wireless/rtlwifi/rtl8192c/phy_common.c
+++ wireless-next/drivers/net/wireless/rtlwifi/rtl8192c/phy_common.c
@@ -724,6 +724,26 @@ u8 rtl92c_phy_sw_chnl(struct ieee80211_h
 }
 EXPORT_SYMBOL(rtl92c_phy_sw_chnl);
 
+static void _rtl92c_phy_sw_rf_setting(struct ieee80211_hw *hw, u8 channel)
+{
+	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct rtl_phy *rtlphy = &(rtlpriv->phy);
+	struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw));
+
+	if (IS_81xxC_VENDOR_UMC_B_CUT(rtlhal->version)) {
+		if (channel == 6 && rtlphy->current_chan_bw ==
+		    HT_CHANNEL_WIDTH_20)
+			rtl_set_rfreg(hw, RF90_PATH_A, RF_RX_G1, MASKDWORD,
+				      0x00255);
+		else{
+			u32 backupRF0x1A = (u32)rtl_get_rfreg(hw, RF90_PATH_A,
+					    RF_RX_G1, RFREG_OFFSET_MASK);
+			rtl_set_rfreg(hw, RF90_PATH_A, RF_RX_G1, MASKDWORD,
+				      backupRF0x1A);
+		}
+	}
+}
+
 static bool _rtl92c_phy_set_sw_chnl_cmdarray(struct swchnlcmd *cmdtable,
 					     u32 cmdtableidx, u32 cmdtablesz,
 					     enum swchnlcmd_id cmdid,
@@ -837,6 +857,7 @@ bool _rtl92c_phy_sw_chnl_step_by_step(st
 					      currentcmd->para1,
 					      RFREG_OFFSET_MASK,
 					      rtlphy->rfreg_chnlval[rfpath]);
+			_rtl92c_phy_sw_rf_setting(hw, channel);
 			}
 			break;
 		default:
Index: wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/def.h
===================================================================
--- wireless-next.orig/drivers/net/wireless/rtlwifi/rtl8192ce/def.h
+++ wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/def.h
@@ -116,6 +116,9 @@
 	LE_BITS_TO_4BYTE(((__pcmdfbhdr) + 4), 20, 12)
 
 #define CHIP_VER_B			BIT(4)
+#define CHIP_BONDING_IDENTIFIER(_value) (((_value) >> 22) & 0x3)
+#define CHIP_BONDING_92C_1T2R		0x1
+#define RF_TYPE_1T2R			BIT(1)
 #define CHIP_92C_BITMASK		BIT(0)
 #define CHIP_92C_1T2R			0x03
 #define CHIP_92C			0x01
Index: wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c
===================================================================
--- wireless-next.orig/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c
+++ wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c
@@ -896,7 +896,6 @@ int rtl92ce_hw_init(struct ieee80211_hw
 	struct rtl_phy *rtlphy = &(rtlpriv->phy);
 	struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
 	struct rtl_ps_ctl *ppsc = rtl_psc(rtl_priv(hw));
-	static bool iqk_initialized; /* initialized to false */
 	bool rtstatus = true;
 	bool is92c;
 	int err;
@@ -921,9 +920,28 @@ int rtl92ce_hw_init(struct ieee80211_hw
 
 	rtlhal->last_hmeboxnum = 0;
 	rtl92c_phy_mac_config(hw);
+	/* because last function modify RCR, so we update
+	 * rcr var here, or TP will unstable for receive_config
+	 * is wrong, RX RCR_ACRC32 will cause TP unstabel & Rx
+	 * RCR_APP_ICV will cause mac80211 unassoc for cisco 1252*/
+	rtlpci->receive_config = rtl_read_dword(rtlpriv, REG_RCR);
+	rtlpci->receive_config &= ~(RCR_ACRC32 | RCR_AICV);
+	rtl_write_dword(rtlpriv, REG_RCR, rtlpci->receive_config);
 	rtl92c_phy_bb_config(hw);
 	rtlphy->rf_mode = RF_OP_BY_SW_3WIRE;
 	rtl92c_phy_rf_config(hw);
+	if (IS_VENDOR_UMC_A_CUT(rtlhal->version) &&
+	    !IS_92C_SERIAL(rtlhal->version)) {
+		rtl_set_rfreg(hw, RF90_PATH_A, RF_RX_G1, MASKDWORD, 0x30255);
+		rtl_set_rfreg(hw, RF90_PATH_A, RF_RX_G2, MASKDWORD, 0x50a00);
+	} else if (IS_81xxC_VENDOR_UMC_B_CUT(rtlhal->version)) {
+		rtl_set_rfreg(hw, RF90_PATH_A, 0x0C, MASKDWORD, 0x894AE);
+		rtl_set_rfreg(hw, RF90_PATH_A, 0x0A, MASKDWORD, 0x1AF31);
+		rtl_set_rfreg(hw, RF90_PATH_A, RF_IPA, MASKDWORD, 0x8F425);
+		rtl_set_rfreg(hw, RF90_PATH_A, RF_SYN_G2, MASKDWORD, 0x4F200);
+		rtl_set_rfreg(hw, RF90_PATH_A, RF_RCK1, MASKDWORD, 0x44053);
+		rtl_set_rfreg(hw, RF90_PATH_A, RF_RCK2, MASKDWORD, 0x80201);
+	}
 	rtlphy->rfreg_chnlval[0] = rtl_get_rfreg(hw, (enum radio_path)0,
 						 RF_CHNLBW, RFREG_OFFSET_MASK);
 	rtlphy->rfreg_chnlval[1] = rtl_get_rfreg(hw, (enum radio_path)1,
@@ -945,11 +963,11 @@ int rtl92ce_hw_init(struct ieee80211_hw
 
 	if (ppsc->rfpwr_state == ERFON) {
 		rtl92c_phy_set_rfpath_switch(hw, 1);
-		if (iqk_initialized) {
+		if (rtlphy->iqk_initialized) {
 			rtl92c_phy_iq_calibrate(hw, true);
 		} else {
 			rtl92c_phy_iq_calibrate(hw, false);
-			iqk_initialized = true;
+			rtlphy->iqk_initialized = true;
 		}
 
 		rtl92c_dm_check_txpower_tracking(hw);
@@ -1011,12 +1029,30 @@ static enum version_8192c _rtl92ce_read_
 	case VERSION_A_CHIP_88C:
 		versionid = "A_CHIP_88C";
 		break;
+	case VERSION_NORMAL_UMC_CHIP_92C_1T2R_A_CUT:
+		versionid = "A_CUT_92C_1T2R";
+		break;
+	case VERSION_NORMAL_UMC_CHIP_92C_A_CUT:
+		versionid = "A_CUT_92C";
+		break;
+	case VERSION_NORMAL_UMC_CHIP_88C_A_CUT:
+		versionid = "A_CUT_88C";
+		break;
+	case VERSION_NORMAL_UMC_CHIP_92C_1T2R_B_CUT:
+		versionid = "B_CUT_92C_1T2R";
+		break;
+	case VERSION_NORMAL_UMC_CHIP_92C_B_CUT:
+		versionid = "B_CUT_92C";
+		break;
+	case VERSION_NORMAL_UMC_CHIP_88C_B_CUT:
+		versionid = "B_CUT_88C";
+		break;
 	default:
 		versionid = "Unknown. Bug?";
 		break;
 	}
 
-	RT_TRACE(rtlpriv, COMP_INIT, DBG_TRACE,
+	RT_TRACE(rtlpriv, COMP_INIT, DBG_EMERG,
 		 "Chip Version ID: %s\n", versionid);
 
 	switch (version & 0x3) {
@@ -1189,6 +1225,7 @@ static void _rtl92ce_poweroff_adapter(st
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct rtl_pci_priv *rtlpcipriv = rtl_pcipriv(hw);
+	struct rtl_hal *rtlhal = rtl_hal(rtlpriv);
 	u8 u1b_tmp;
 	u32 u4b_tmp;
 
@@ -1217,7 +1254,8 @@ static void _rtl92ce_poweroff_adapter(st
 	rtl_write_word(rtlpriv, REG_GPIO_IO_SEL, 0x0790);
 	rtl_write_word(rtlpriv, REG_LEDCFG0, 0x8080);
 	rtl_write_byte(rtlpriv, REG_AFE_PLL_CTRL, 0x80);
-	rtl_write_byte(rtlpriv, REG_SPS0_CTRL, 0x23);
+	if (!IS_81xxC_VENDOR_UMC_B_CUT(rtlhal->version))
+		rtl_write_byte(rtlpriv, REG_SPS0_CTRL, 0x23);
 	if (rtlpcipriv->bt_coexist.bt_coexistence) {
 		u4b_tmp = rtl_read_dword(rtlpriv, REG_AFE_XTAL_CTRL);
 		u4b_tmp |= 0x03824800;
@@ -1246,6 +1284,9 @@ void rtl92ce_card_disable(struct ieee802
 		rtlpriv->cfg->ops->led_control(hw, LED_CTL_POWER_OFF);
 	RT_SET_PS_LEVEL(ppsc, RT_RF_OFF_LEVL_HALT_NIC);
 	_rtl92ce_poweroff_adapter(hw);
+
+	/* after power off we should do iqk again */
+	rtlpriv->phy.iqk_initialized = false;
 }
 
 void rtl92ce_interrupt_recognized(struct ieee80211_hw *hw,
@@ -1340,6 +1381,13 @@ static void _rtl92ce_read_txpower_info_f
 				    EEPROM_DEFAULT_TXPOWERLEVEL;
 			}
 		}
+		if (IS_92C_SERIAL(version)) {
+			value32 = rtl_read_dword(rtlpriv, REG_HPON_FSM);
+			version = (enum version_8192c)(version |
+				   ((CHIP_BONDING_IDENTIFIER(value32)
+				   == CHIP_BONDING_92C_1T2R) ?
+				   RF_TYPE_1T2R : 0));
+		}
 	}
 
 	for (i = 0; i < 3; i++) {
@@ -1904,6 +1952,8 @@ static void rtl92ce_update_hal_rate_mask
 			ratr_bitmap &= 0x0f0ff0ff;
 		break;
 	}
+	sta_entry->ratr_index = ratr_index;
+
 	RT_TRACE(rtlpriv, COMP_RATR, DBG_DMESG,
 		 "ratr_bitmap :%x\n", ratr_bitmap);
 	*(u32 *)&rate_mask = (ratr_bitmap & 0x0fffffff) |
Index: wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/phy.c
===================================================================
--- wireless-next.orig/drivers/net/wireless/rtlwifi/rtl8192ce/phy.c
+++ wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/phy.c
@@ -82,6 +82,8 @@ bool rtl92c_phy_mac_config(struct ieee80
 
 	if (is92c)
 		rtl_write_byte(rtlpriv, 0x14, 0x71);
+	else
+		rtl_write_byte(rtlpriv, 0x04CA, 0x0A);
 	return rtstatus;
 }
 
Index: wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/trx.c
===================================================================
--- wireless-next.orig/drivers/net/wireless/rtlwifi/rtl8192ce/trx.c
+++ wireless-next/drivers/net/wireless/rtlwifi/rtl8192ce/trx.c
@@ -127,11 +127,11 @@ static void _rtl92ce_query_rxphystatus(s
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct phy_sts_cck_8192s_t *cck_buf;
+	struct rtl_ps_ctl *ppsc = rtl_psc(rtlpriv);
 	s8 rx_pwr_all = 0, rx_pwr[4];
 	u8 evm, pwdb_all, rf_rx_num = 0;
 	u8 i, max_spatial_stream;
 	u32 rssi, total_rssi = 0;
-	bool in_powersavemode = false;
 	bool is_cck_rate;
 
 	is_cck_rate = RX_HAL_IS_CCK_RATE(pdesc);
@@ -147,7 +147,7 @@ static void _rtl92ce_query_rxphystatus(s
 		u8 report, cck_highpwr;
 		cck_buf = (struct phy_sts_cck_8192s_t *)p_drvinfo;
 
-		if (!in_powersavemode)
+		if (ppsc->rfpwr_state == ERFON)
 			cck_highpwr = (u8) rtl_get_bbreg(hw,
 						 RFPGA0_XA_HSSIPARAMETER2,
 						 BIT(9));

^ permalink raw reply

* [PATCH] 8139too: add 1013:1211 PCI ID for a strange SMC1211TX.
From: W. Trevor King @ 2012-09-25 20:35 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller, Jason Wang, Ben Greear, Joe Perches,
	linux-kernel

The FCC ID on the board is HEDEN1207DTXR01, which belongs to Accton
Technology Corporation.  This matches the expected 1113 ID.  Perhaps
my board just has a dying EEPROM?

Signed-off-by: W. Trevor King <wking@tremily.us>
---
I'm not sure if this qualifies as a patch-able issue, but I thought
I'd send it in in case someone else gets bitten by this.

 drivers/net/ethernet/realtek/8139too.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index 1d83565..b7cf947 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -238,6 +238,7 @@ static DEFINE_PCI_DEVICE_TABLE(rtl8139_pci_tbl) = {
 	{0x10ec, 0x8139, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 },
 	{0x10ec, 0x8138, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 },
 	{0x1113, 0x1211, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 },
+	{0x1013, 0x1211, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 },
 	{0x1500, 0x1360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 },
 	{0x4033, 0x1360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 },
 	{0x1186, 0x1300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 },
-- 
1.7.8.6

^ permalink raw reply related

* [PATCH net-next] net: remove sk_init() helper
From: Eric Dumazet @ 2012-09-25 21:32 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

It seems sk_init() has no value today and even does strange things :

# grep . /proc/sys/net/core/?mem_*
/proc/sys/net/core/rmem_default:212992
/proc/sys/net/core/rmem_max:131071
/proc/sys/net/core/wmem_default:212992
/proc/sys/net/core/wmem_max:131071

We can remove it completely.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/sock.h |    2 --
 net/core/sock.c    |   13 -------------
 net/socket.c       |    6 ------
 3 files changed, 21 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 42053759..02456b6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2229,8 +2229,6 @@ extern int net_msg_warn;
 extern __u32 sysctl_wmem_max;
 extern __u32 sysctl_rmem_max;
 
-extern void sk_init(void);
-
 extern int sysctl_optmem_max;
 
 extern __u32 sysctl_wmem_default;
diff --git a/net/core/sock.c b/net/core/sock.c
index 727114c..f5a4260 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1464,19 +1464,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
-void __init sk_init(void)
-{
-	if (totalram_pages <= 4096) {
-		sysctl_wmem_max = 32767;
-		sysctl_rmem_max = 32767;
-		sysctl_wmem_default = 32767;
-		sysctl_rmem_default = 32767;
-	} else if (totalram_pages >= 131072) {
-		sysctl_wmem_max = 131071;
-		sysctl_rmem_max = 131071;
-	}
-}
-
 /*
  *	Simple resource managers for sockets.
  */
diff --git a/net/socket.c b/net/socket.c
index c641549..80dc7e8 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2601,12 +2601,6 @@ static int __init sock_init(void)
 		goto out;
 
 	/*
-	 *      Initialize sock SLAB cache.
-	 */
-
-	sk_init();
-
-	/*
 	 *      Initialize skbuff SLAB cache
 	 */
 	skb_init();

^ permalink raw reply related

* [GIT] Networking
From: David Miller @ 2012-09-25 21:07 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) Eric Dumazet discovered and fixed what turned out to be a family of
   bugs.  These functions were using pskb_may_pull() which might need
   to reallocate the linear SKB data buffer, but the callers were not
   expecting this possibility.  The callers have cached pointers to
   the packet header areas, and would need to reload them if we were
   to continue using pskb_may_pull().

   So they could end up reading garbage.

   It's easier to just change these RAW4/RAW6/MIP6 routines to use
   skb_header_pointer() instead of pskb_may_pull(), which won't
   modify the linear SKB data area.

2) Dave Jone's syscall spammer caught a case where a non-TCP socket
   can call down into the TCP keepalive code.  The case basically
   involves creating a raw socket with sk_protocol == IPPROTO_TCP,
   then calling setsockopt(sock_fd, SO_KEEPALIVE, ...)

   Fixed by Eric Dumazet.

3) Bluetooth devices do not get configured properly while being
   powered on, resulting in always using legacy pairing instead
   of SSP.  Fix from Andrzej Kaczmarek.

4) Bluetooth cancels delayed work erroneously, put stricter
   checks in place.  From Andrei Emeltchenko.

5) Fix deadlock between cfg80211_mutex and reg_regdb_search_mutex
   in cfg80211, from Luis R. Rodriguez.

6) Fix interrupt double release in iwlwifi, from Emmanuel Grumbach.

7) Missing module license in bcm87xx driver, from Peter Huewe.

8) Team driver can lose port changed events when adding devices to a
   team, fix from Jiri Pirko.

9) Fix endless loop when trying ot unregister PPPOE device in
   zombie state, from Xiaodong Xu.

10) batman-adv layer needs to set MAC address of software device
    earlier, otherwise we call tt_local_add with it uninitialized.

11) Fix handling of KSZ8021 PHYs, it's matched currently by KS8051
    but that doesn't program the device properly.  From Marek
    Vasut.

Please pull, thanks a lot!

The following changes since commit abef3bd71029b80ec1bdd6c6244b5b0b99f56633:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2012-09-21 14:32:55 -0700)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git master

for you to fetch changes up to 96af69ea2a83d292238bdba20e4508ee967cf8cb:

  ipv6: mip6: fix mip6_mh_filter() (2012-09-25 16:04:44 -0400)

----------------------------------------------------------------
Andrei Emeltchenko (1):
      Bluetooth: Fix freeing uninitialized delayed works

Andrzej Kaczmarek (2):
      Bluetooth: mgmt: Fix enabling SSP while powered off
      Bluetooth: mgmt: Fix enabling LE while powered off

David S. Miller (2):
      Merge branch 'for-davem' of git://git.kernel.org/.../linville/wireless
      Merge tag 'batman-adv-fix-for-davem' of git://git.open-mesh.org/linux-merge

Def (1):
      batman-adv: Fix change mac address of soft iface.

Emmanuel Grumbach (1):
      iwlwifi: don't double free the interrupt in failure path

Eric Dumazet (4):
      ipv4: raw: fix icmp_filter()
      net: guard tcp_set_keepalive() to tcp sockets
      ipv6: raw: fix icmpv6_filter()
      ipv6: mip6: fix mip6_mh_filter()

Jiri Pirko (1):
      team: send port changed when added

John W. Linville (1):
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless into for-davem

Linus Lüssing (1):
      batman-adv: Fix symmetry check / route flapping in multi interface setups

Luis R. Rodriguez (1):
      cfg80211: fix possible circular lock on reg_regdb_search()

Marek Vasut (3):
      phy/micrel: Implement support for KSZ8021
      phy/micrel: Rename KS80xx to KSZ80xx
      phy/micrel: Add missing header to micrel_phy.h

Peter Hüwe (1):
      net/phy/bcm87xx: Add MODULE_LICENSE("GPL") to GPL driver

Vinicius Costa Gomes (1):
      Bluetooth: Fix not removing power_off delayed work

Xiaodong Xu (1):
      pppoe: drop PPPOX_ZOMBIEs in pppoe_release

 arch/arm/mach-mxs/mach-mxs.c              |  2 +-
 drivers/net/phy/bcm87xx.c                 |  2 ++
 drivers/net/phy/micrel.c                  | 45 ++++++++++++++++++++++++++++++++++++---------
 drivers/net/ppp/pppoe.c                   |  2 +-
 drivers/net/team/team.c                   | 32 ++++++++++++++++++++++++--------
 drivers/net/wireless/iwlwifi/pcie/trans.c |  1 +
 include/linux/micrel_phy.h                | 19 ++++++++++++++++---
 net/batman-adv/bat_iv_ogm.c               | 13 +++++++------
 net/batman-adv/soft-interface.c           |  7 +++++--
 net/bluetooth/hci_core.c                  |  2 ++
 net/bluetooth/l2cap_core.c                |  2 +-
 net/bluetooth/mgmt.c                      | 16 ++++++++++++++++
 net/core/sock.c                           |  3 ++-
 net/ipv4/raw.c                            | 14 ++++++++------
 net/ipv6/mip6.c                           | 20 +++++++++++---------
 net/ipv6/raw.c                            | 21 ++++++++++-----------
 net/wireless/reg.c                        | 12 +++++++++---
 17 files changed, 152 insertions(+), 61 deletions(-)

^ permalink raw reply

* Re: [PATCH 4/5] dev: Add dev_vprintk_emit and dev_printk_emit
From: Geert Uytterhoeven @ 2012-09-25 21:05 UTC (permalink / raw)
  To: Joe Perches
  Cc: Andrew Morton, Greg Kroah-Hartman, David S. Miller, Jason Baron,
	Jim Cromie, Kay Sievers, netdev, linux-kernel
In-Reply-To: <7cfb202968f0928bba66965146a828a1cff87950.1345978012.git.joe@perches.com>

On Sun, Aug 26, 2012 at 1:25 PM, Joe Perches <joe@perches.com> wrote:
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -914,6 +918,13 @@ int _dev_info(const struct device *dev, const char *fmt, ...);
>
>  #else
>
> +static int dev_vprintk_emit(int level, const struct device *dev,

Missing "inline", cfr. http://kisskb.ellerman.id.au/kisskb/buildresult/7271354/

include/linux/device.h:930:12: error: 'dev_vprintk_emit' defined but
not used [-Werror=unused-function]
cc1: all warnings being treated as errors
make[2]: *** [arch/sh/kernel/dma-nommu.o] Error 1

> +                           const char *fmt, va_list args)
> +{ return 0; }

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCHv3 net-next 4/4] tunnel: drop packet if ECN present with not-ECT
From: Stephen Hemminger @ 2012-09-25 21:02 UTC (permalink / raw)
  To: Chris Wright, davem; +Cc: netdev
In-Reply-To: <20120925041259.337491375@vyatta.com>

Linux tunnels were written before RFC6040 and therefore never
implemented the corner case of ECN getting set in the outer header
and the inner header not being ready for it.

Section 4.2.  Default Tunnel Egress Behaviour.
 o If the inner ECN field is Not-ECT, the decapsulator MUST NOT
      propagate any other ECN codepoint onwards.  This is because the
      inner Not-ECT marking is set by transports that rely on dropped
      packets as an indication of congestion and would not understand or
      respond to any other ECN codepoint [RFC4774].  Specifically:

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         CE, the decapsulator MUST drop the packet.

      *  If the inner ECN field is Not-ECT and the outer ECN field is
         Not-ECT, ECT(0), or ECT(1), the decapsulator MUST forward the
         outgoing packet with the ECN field cleared to Not-ECT.

This patch moves the ECN decap logic out of the individual tunnels
into a common place.

It also adds logging to allow detecting broken systems that
set ECN bits incorrectly when tunneling (or an intermediate
router might be changing the header).

Overloads rx_frame_error to keep track of ECN related error.

Thanks to Chris Wright who caught this while reviewing the new VXLAN
tunnel.

This code was tested by injecting faulty logic in other end GRE
to send incorrectly encapsulated packets.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
v3 - add common code and logging for ECN broken encapsulation
v2 - supersedes earlier GRE only version

 include/net/inet_ecn.h |   76 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_gre.c      |   38 ++++++++++++++----------
 net/ipv4/ipip.c        |   42 ++++++++++++++++-----------
 net/ipv6/ip6_gre.c     |   54 +++++++++++++++-------------------
 4 files changed, 147 insertions(+), 63 deletions(-)

--- a/net/ipv4/ip_gre.c	2012-09-25 08:37:13.995876570 -0700
+++ b/net/ipv4/ip_gre.c	2012-09-25 11:00:46.918340181 -0700
@@ -120,6 +120,10 @@
    Alexey Kuznetsov.
  */
 
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 static void ipgre_tunnel_setup(struct net_device *dev);
@@ -204,7 +208,9 @@ static struct rtnl_link_stats64 *ipgre_g
 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
+	tot->rx_frame_errors = dev->stats.rx_frame_errors;
 	tot->rx_errors = dev->stats.rx_errors;
+
 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
 	tot->tx_dropped = dev->stats.tx_dropped;
@@ -587,17 +593,6 @@ static void ipgre_err(struct sk_buff *sk
 	t->err_time = jiffies;
 }
 
-static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
-{
-	if (INET_ECN_is_ce(iph->tos)) {
-		if (skb->protocol == htons(ETH_P_IP)) {
-			IP_ECN_set_ce(ip_hdr(skb));
-		} else if (skb->protocol == htons(ETH_P_IPV6)) {
-			IP6_ECN_set_ce(ipv6_hdr(skb));
-		}
-	}
-}
-
 static inline u8
 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 {
@@ -620,6 +615,7 @@ static int ipgre_rcv(struct sk_buff *skb
 	struct ip_tunnel *tunnel;
 	int    offset = 4;
 	__be16 gre_proto;
+	int    err;
 
 	if (!pskb_may_pull(skb, 16))
 		goto drop;
@@ -723,17 +719,27 @@ static int ipgre_rcv(struct sk_buff *skb
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 		}
 
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		skb_reset_network_header(skb);
+		err = IP_ECN_decapsulate(iph, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &iph->saddr, iph->tos);
+			if (err > 1) {
+				++tunnel->dev->stats.rx_frame_errors;
+				++tunnel->dev->stats.rx_errors;
+				goto drop;
+			}
+		}
+
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		skb_reset_network_header(skb);
-		ipgre_ecn_decapsulate(iph, skb);
-
 		netif_rx(skb);
 
 		return 0;
--- a/net/ipv4/ipip.c	2012-09-25 08:37:14.791868701 -0700
+++ b/net/ipv4/ipip.c	2012-09-25 11:01:41.733794349 -0700
@@ -120,6 +120,10 @@
 #define HASH_SIZE  16
 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
 static int ipip_net_id __read_mostly;
 struct ipip_net {
 	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
@@ -400,28 +404,18 @@ out:
 	return err;
 }
 
-static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
-					struct sk_buff *skb)
-{
-	struct iphdr *inner_iph = ip_hdr(skb);
-
-	if (INET_ECN_is_ce(outer_iph->tos))
-		IP_ECN_set_ce(inner_iph);
-}
-
 static int ipip_rcv(struct sk_buff *skb)
 {
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph = ip_hdr(skb);
+	int err;
 
 	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
 		struct pcpu_tstats *tstats;
 
-		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-			kfree_skb(skb);
-			return 0;
-		}
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+			goto drop;
 
 		secpath_reset(skb);
 
@@ -430,21 +424,35 @@ static int ipip_rcv(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_IP);
 		skb->pkt_type = PACKET_HOST;
 
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		err = IP_ECN_decapsulate(iph, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &iph->saddr, iph->tos);
+			if (err > 1) {
+				++tunnel->dev->stats.rx_frame_errors;
+				++tunnel->dev->stats.rx_errors;
+				goto drop;
+			}
+		}
+
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		ipip_ecn_decapsulate(iph, skb);
-
 		netif_rx(skb);
 		return 0;
 	}
 
 	return -1;
+
+drop:
+	kfree_skb(skb);
+	return 0;
 }
 
 /*
--- a/net/ipv6/ip6_gre.c	2012-09-25 08:37:13.995876570 -0700
+++ b/net/ipv6/ip6_gre.c	2012-09-25 11:00:35.478454105 -0700
@@ -56,6 +56,10 @@
 #include <net/ip6_tunnel.h>
 
 
+static bool log_ecn_error = true;
+module_param(log_ecn_error, bool, 0644);
+MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
+
 #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
 #define IPV6_TCLASS_SHIFT 20
 
@@ -149,7 +153,9 @@ static struct rtnl_link_stats64 *ip6gre_
 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
+	tot->rx_frame_errors = dev->stats.rx_frame_errors;
 	tot->rx_errors = dev->stats.rx_errors;
+
 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
 	tot->tx_dropped = dev->stats.tx_dropped;
@@ -489,28 +495,6 @@ static void ip6gre_err(struct sk_buff *s
 	t->err_time = jiffies;
 }
 
-static inline void ip6gre_ecn_decapsulate_ipv4(const struct ip6_tnl *t,
-		const struct ipv6hdr *ipv6h, struct sk_buff *skb)
-{
-	__u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK;
-
-	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
-		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield);
-
-	if (INET_ECN_is_ce(dsfield))
-		IP_ECN_set_ce(ip_hdr(skb));
-}
-
-static inline void ip6gre_ecn_decapsulate_ipv6(const struct ip6_tnl *t,
-		const struct ipv6hdr *ipv6h, struct sk_buff *skb)
-{
-	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
-		ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb));
-
-	if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h)))
-		IP6_ECN_set_ce(ipv6_hdr(skb));
-}
-
 static int ip6gre_rcv(struct sk_buff *skb)
 {
 	const struct ipv6hdr *ipv6h;
@@ -522,6 +506,7 @@ static int ip6gre_rcv(struct sk_buff *sk
 	struct ip6_tnl *tunnel;
 	int    offset = 4;
 	__be16 gre_proto;
+	int err;
 
 	if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
 		goto drop;
@@ -625,20 +610,29 @@ static int ip6gre_rcv(struct sk_buff *sk
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 		}
 
+		__skb_tunnel_rx(skb, tunnel->dev);
+
+		skb_reset_network_header(skb);
+
+		err = IP6_ECN_decapsulate(ipv6h, skb);
+		if (unlikely(err)) {
+			if (log_ecn_error)
+				net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n",
+						     &ipv6h->saddr,
+						     ipv6_get_dsfield(ipv6h));
+			if (err > 1) {
+				++tunnel->dev->stats.rx_frame_errors;
+				++tunnel->dev->stats.rx_errors;
+				goto drop;
+			}
+		}
+
 		tstats = this_cpu_ptr(tunnel->dev->tstats);
 		u64_stats_update_begin(&tstats->syncp);
 		tstats->rx_packets++;
 		tstats->rx_bytes += skb->len;
 		u64_stats_update_end(&tstats->syncp);
 
-		__skb_tunnel_rx(skb, tunnel->dev);
-
-		skb_reset_network_header(skb);
-		if (skb->protocol == htons(ETH_P_IP))
-			ip6gre_ecn_decapsulate_ipv4(tunnel, ipv6h, skb);
-		else if (skb->protocol == htons(ETH_P_IPV6))
-			ip6gre_ecn_decapsulate_ipv6(tunnel, ipv6h, skb);
-
 		netif_rx(skb);
 
 		return 0;
--- a/include/net/inet_ecn.h	2012-08-15 08:59:22.766706151 -0700
+++ b/include/net/inet_ecn.h	2012-09-25 10:50:45.408313935 -0700
@@ -15,6 +15,8 @@ enum {
 	INET_ECN_MASK = 3,
 };
 
+extern int sysctl_tunnel_ecn_log;
+
 static inline int INET_ECN_is_ce(__u8 dsfield)
 {
 	return (dsfield & INET_ECN_MASK) == INET_ECN_CE;
@@ -145,4 +147,78 @@ static inline int INET_ECN_set_ce(struct
 	return 0;
 }
 
+/*
+ * RFC 6080 4.2
+ *  To decapsulate the inner header at the tunnel egress, a compliant
+ *  tunnel egress MUST set the outgoing ECN field to the codepoint at the
+ *  intersection of the appropriate arriving inner header (row) and outer
+ *  header (column) in Figure 4
+ *
+ *      +---------+------------------------------------------------+
+ *      |Arriving |            Arriving Outer Header               |
+ *      |   Inner +---------+------------+------------+------------+
+ *      |  Header | Not-ECT | ECT(0)     | ECT(1)     |     CE     |
+ *      +---------+---------+------------+------------+------------+
+ *      | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)|
+ *      |  ECT(0) |  ECT(0) | ECT(0)     | ECT(1)     |     CE     |
+ *      |  ECT(1) |  ECT(1) | ECT(1) (!) | ECT(1)     |     CE     |
+ *      |    CE   |      CE |     CE     |     CE(!!!)|     CE     |
+ *      +---------+---------+------------+------------+------------+
+ *
+ *             Figure 4: New IP in IP Decapsulation Behaviour
+ *
+ *  returns 0 on success
+ *          1 if something is broken and should be logged (!!! above)
+ *          2 if packet should be dropped
+ */
+static inline int INET_ECN_decapsulate(struct sk_buff *skb,
+				       __u8 outer, __u8 inner)
+{
+	if (INET_ECN_is_not_ect(inner)) {
+		switch (outer & INET_ECN_MASK) {
+		case INET_ECN_NOT_ECT:
+			return 0;
+		case INET_ECN_ECT_0:
+		case INET_ECN_ECT_1:
+			return 1;
+		case INET_ECN_CE:
+			return 2;
+		}
+	}
+
+	if (INET_ECN_is_ce(outer))
+		INET_ECN_set_ce(skb);
+
+	return 0;
+}
+
+static inline int IP_ECN_decapsulate(const struct iphdr *oiph,
+				     struct sk_buff *skb)
+{
+	__u8 inner;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		inner = ip_hdr(skb)->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		inner = ipv6_get_dsfield(ipv6_hdr(skb));
+	else
+		return 0;
+
+	return INET_ECN_decapsulate(skb, oiph->tos, inner);
+}
+
+static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h,
+				      struct sk_buff *skb)
+{
+	__u8 inner;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		inner = ip_hdr(skb)->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		inner = ipv6_get_dsfield(ipv6_hdr(skb));
+	else
+		return 0;
+
+	return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner);
+}
 #endif

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox