Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net 3/4] sock: use lockdep_sock_is_held were appropriate
From: Hannes Frederic Sowa @ 2016-03-31 23:29 UTC (permalink / raw)
  To: davem; +Cc: netdev, sasha.levin, daniel, alexei.starovoitov, mkubecek
In-Reply-To: <1459466982-20432-1-git-send-email-hannes@stressinduktion.org>

Also make lockdep_sock_is_held accept const arguments, so we don't need to
modify too many callers.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/net/sock.h       | 7 ++++---
 net/dccp/ipv4.c          | 2 +-
 net/dccp/ipv6.c          | 2 +-
 net/ipv4/af_inet.c       | 2 +-
 net/ipv4/cipso_ipv4.c    | 3 ++-
 net/ipv4/ip_sockglue.c   | 4 ++--
 net/ipv4/tcp_ipv4.c      | 8 +++-----
 net/ipv6/ipv6_sockglue.c | 6 ++++--
 net/ipv6/tcp_ipv6.c      | 2 +-
 net/socket.c             | 2 +-
 10 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 30f9b5ad0a82ef..bbea02fdaaa0fd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1382,8 +1382,9 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 		spin_unlock_bh(&sk->sk_lock.slock);
 }
 
-static bool lockdep_sock_is_held(struct sock *sk)
+static bool lockdep_sock_is_held(const struct sock *csk)
 {
+	struct sock *sk = (struct sock *)csk;
 	return lockdep_is_held(&sk->sk_lock) ||
 	       lockdep_is_held(&sk->sk_lock.slock);
 }
@@ -1589,8 +1590,8 @@ static inline void sk_rethink_txhash(struct sock *sk)
 static inline struct dst_entry *
 __sk_dst_get(struct sock *sk)
 {
-	return rcu_dereference_check(sk->sk_dst_cache, sock_owned_by_user(sk) ||
-						       lockdep_is_held(&sk->sk_lock.slock));
+	return rcu_dereference_check(sk->sk_dst_cache,
+				     lockdep_sock_is_held(sk));
 }
 
 static inline struct dst_entry *
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9c67a961ba5382..0ea298d849383f 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -62,7 +62,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	nexthop = daddr = usin->sin_addr.s_addr;
 
 	inet_opt = rcu_dereference_protected(inet->inet_opt,
-					     sock_owned_by_user(sk));
+					     lockdep_sock_is_held(sk));
 	if (inet_opt != NULL && inet_opt->opt.srr) {
 		if (daddr == 0)
 			return -EINVAL;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4663a01d503991..6ea214fa5499c5 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -865,7 +865,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl6.fl6_sport = inet->inet_sport;
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
 
-	opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+	opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
 	final_p = fl6_update_dst(&fl6, opt, &final);
 
 	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 9e481992dbaef2..7e37ebb5af396e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1106,7 +1106,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	struct ip_options_rcu *inet_opt;
 
 	inet_opt = rcu_dereference_protected(inet->inet_opt,
-					     sock_owned_by_user(sk));
+					     lockdep_sock_is_held(sk));
 	if (inet_opt && inet_opt->opt.srr)
 		daddr = inet_opt->opt.faddr;
 
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index bdb2a07ec363b7..40d6b87713a132 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1933,7 +1933,8 @@ int cipso_v4_sock_setattr(struct sock *sk,
 
 	sk_inet = inet_sk(sk);
 
-	old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+	old = rcu_dereference_protected(sk_inet->inet_opt,
+					lockdep_sock_is_held(sk));
 	if (sk_inet->is_icsk) {
 		sk_conn = inet_csk(sk);
 		if (old)
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 035ad645a8d9d8..cf073059192d99 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -635,7 +635,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		if (err)
 			break;
 		old = rcu_dereference_protected(inet->inet_opt,
-						sock_owned_by_user(sk));
+						lockdep_sock_is_held(sk));
 		if (inet->is_icsk) {
 			struct inet_connection_sock *icsk = inet_csk(sk);
 #if IS_ENABLED(CONFIG_IPV6)
@@ -1295,7 +1295,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		struct ip_options_rcu *inet_opt;
 
 		inet_opt = rcu_dereference_protected(inet->inet_opt,
-						     sock_owned_by_user(sk));
+						     lockdep_sock_is_held(sk));
 		opt->optlen = 0;
 		if (inet_opt)
 			memcpy(optbuf, &inet_opt->opt,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ad450509029bce..17cc1840337756 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -157,7 +157,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	nexthop = daddr = usin->sin_addr.s_addr;
 	inet_opt = rcu_dereference_protected(inet->inet_opt,
-					     sock_owned_by_user(sk));
+					     lockdep_sock_is_held(sk));
 	if (inet_opt && inet_opt->opt.srr) {
 		if (!daddr)
 			return -EINVAL;
@@ -882,8 +882,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 
 	/* caller either holds rcu_read_lock() or socket lock */
 	md5sig = rcu_dereference_check(tp->md5sig_info,
-				       sock_owned_by_user(sk) ||
-				       lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
+				       lockdep_sock_is_held(sk));
 	if (!md5sig)
 		return NULL;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -928,8 +927,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 	}
 
 	md5sig = rcu_dereference_protected(tp->md5sig_info,
-					   sock_owned_by_user(sk) ||
-					   lockdep_is_held(&sk->sk_lock.slock));
+					   lockdep_sock_is_held(sk));
 	if (!md5sig) {
 		md5sig = kmalloc(sizeof(*md5sig), gfp);
 		if (!md5sig)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4449ad1f81147c..516b6a31c30f7b 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -407,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
 			break;
 
-		opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+		opt = rcu_dereference_protected(np->opt,
+						lockdep_sock_is_held(sk));
 		opt = ipv6_renew_options(sk, opt, optname,
 					 (struct ipv6_opt_hdr __user *)optval,
 					 optlen);
@@ -1123,7 +1124,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		struct ipv6_txoptions *opt;
 
 		lock_sock(sk);
-		opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+		opt = rcu_dereference_protected(np->opt,
+						lockdep_sock_is_held(sk));
 		len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
 		release_sock(sk);
 		/* check if ipv6_getsockopt_sticky() returns err code */
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 711d209f912473..bd16dc4b6ba71b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -234,7 +234,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl6.fl6_dport = usin->sin6_port;
 	fl6.fl6_sport = inet->inet_sport;
 
-	opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+	opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
 	final_p = fl6_update_dst(&fl6, opt, &final);
 
 	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
diff --git a/net/socket.c b/net/socket.c
index 5f77a8e93830bd..e3299cdfe9db39 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1046,7 +1046,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
 		return -EINVAL;
 
 	lock_sock(sk);
-	wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk));
+	wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
 	fasync_helper(fd, filp, on, &wq->fasync_list);
 
 	if (!wq->fasync_list)
-- 
2.5.5

^ permalink raw reply related

* [PATCH net 2/4] net: proper check if we hold the socket lock during dereference
From: Hannes Frederic Sowa @ 2016-03-31 23:29 UTC (permalink / raw)
  To: davem; +Cc: netdev, sasha.levin, daniel, alexei.starovoitov, mkubecek
In-Reply-To: <1459466982-20432-1-git-send-email-hannes@stressinduktion.org>

lockdep_sock_is_held makes sure that we currently own the
lock. sock_owned_by_user simply checks if a user holds the socket. This
could lead to non deterministic lock checks.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/net/sock.h | 5 +++++
 net/core/filter.c  | 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 255d3e03727b73..30f9b5ad0a82ef 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1382,6 +1382,11 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 		spin_unlock_bh(&sk->sk_lock.slock);
 }
 
+static bool lockdep_sock_is_held(struct sock *sk)
+{
+	return lockdep_is_held(&sk->sk_lock) ||
+	       lockdep_is_held(&sk->sk_lock.slock);
+}
 
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		      struct proto *prot, int kern);
diff --git a/net/core/filter.c b/net/core/filter.c
index 4b81b71171b4ce..e8486ba601eae7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1166,7 +1166,7 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
 	}
 
 	old_fp = rcu_dereference_protected(sk->sk_filter,
-					   sock_owned_by_user(sk));
+					   lockdep_sock_is_held(sk));
 	rcu_assign_pointer(sk->sk_filter, fp);
 
 	if (old_fp)
@@ -2259,7 +2259,7 @@ int sk_detach_filter(struct sock *sk)
 		return -EPERM;
 
 	filter = rcu_dereference_protected(sk->sk_filter,
-					   sock_owned_by_user(sk));
+					   lockdep_sock_is_held(sk));
 	if (filter) {
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
 		sk_filter_uncharge(sk, filter);
@@ -2279,7 +2279,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 
 	lock_sock(sk);
 	filter = rcu_dereference_protected(sk->sk_filter,
-					   sock_owned_by_user(sk));
+					   lockdep_sock_is_held(sk));
 	if (!filter)
 		goto out;
 
-- 
2.5.5

^ permalink raw reply related

* [PATCH net 1/4] tun: add socket locking around sk_{attach,detach}_filter
From: Hannes Frederic Sowa @ 2016-03-31 23:29 UTC (permalink / raw)
  To: davem; +Cc: netdev, sasha.levin, daniel, alexei.starovoitov, mkubecek
In-Reply-To: <1459466982-20432-1-git-send-email-hannes@stressinduktion.org>

[Changelog stolen from Daniel Borkmann:]

Sasha Levin reported a suspicious rcu_dereference_protected() warning
found while fuzzing with trinity that is similar to this one:

  [   52.765684] net/core/filter.c:2262 suspicious rcu_dereference_protected() usage!
  [   52.765688] other info that might help us debug this:
  [   52.765695] rcu_scheduler_active = 1, debug_locks = 1
  [   52.765701] 1 lock held by a.out/1525:
  [   52.765704]  #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff816a64b7>] rtnl_lock+0x17/0x20
  [   52.765721] stack backtrace:
  [   52.765728] CPU: 1 PID: 1525 Comm: a.out Not tainted 4.5.0+ #264
  [...]
  [   52.765768] Call Trace:
  [   52.765775]  [<ffffffff813e488d>] dump_stack+0x85/0xc8
  [   52.765784]  [<ffffffff810f2fa5>] lockdep_rcu_suspicious+0xd5/0x110
  [   52.765792]  [<ffffffff816afdc2>] sk_detach_filter+0x82/0x90
  [   52.765801]  [<ffffffffa0883425>] tun_detach_filter+0x35/0x90 [tun]
  [   52.765810]  [<ffffffffa0884ed4>] __tun_chr_ioctl+0x354/0x1130 [tun]
  [   52.765818]  [<ffffffff8136fed0>] ? selinux_file_ioctl+0x130/0x210
  [   52.765827]  [<ffffffffa0885ce3>] tun_chr_ioctl+0x13/0x20 [tun]
  [   52.765834]  [<ffffffff81260ea6>] do_vfs_ioctl+0x96/0x690
  [   52.765843]  [<ffffffff81364af3>] ? security_file_ioctl+0x43/0x60
  [   52.765850]  [<ffffffff81261519>] SyS_ioctl+0x79/0x90
  [   52.765858]  [<ffffffff81003ba2>] do_syscall_64+0x62/0x140
  [   52.765866]  [<ffffffff817d563f>] entry_SYSCALL64_slow_path+0x25/0x25

Same can be triggered with PROVE_RCU (+ PROVE_RCU_REPEATEDLY) enabled
from tun_attach_filter() when user space calls ioctl(tun_fd, TUN{ATTACH,
DETACH}FILTER, ...) for adding/removing a BPF filter on tap devices.

Since the fix in commit f91ff5b9ff52 ("net: sk_{detach|attach}_filter()
rcu fixes") sk_attach_filter()/sk_detach_filter() now dereferences the
filter with rcu_dereference_protected(), checking whether socket lock
is held in control path.

Since its introduction in commit 994051625981 ("tun: socket
filter support"), tap filters are managed under RTNL lock from
__tun_chr_ioctl(). Thus the sock_owned_by_user(sk) doesn't apply in this
specific case and therefore triggers the false positive.

Simply holding the locks during the filter updates will fix this problem.

Fixes: 994051625981 ("tun: socket filter support")
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 drivers/net/tun.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index afdf950617c36e..dccbbacbbc7f02 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -622,7 +622,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 
 	/* Re-attach the filter to persist device */
 	if (!skip_filter && (tun->filter_attached == true)) {
+		bool slow;
+
+		slow = lock_sock_fast(tfile->socket.sk);
 		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		unlock_sock_fast(tfile->socket.sk, slow);
 		if (!err)
 			goto out;
 	}
@@ -1821,8 +1825,12 @@ static void tun_detach_filter(struct tun_struct *tun, int n)
 	struct tun_file *tfile;
 
 	for (i = 0; i < n; i++) {
+		bool slow;
+
 		tfile = rtnl_dereference(tun->tfiles[i]);
+		slow = lock_sock_fast(tfile->socket.sk);
 		sk_detach_filter(tfile->socket.sk);
+		unlock_sock_fast(tfile->socket.sk, slow);
 	}
 
 	tun->filter_attached = false;
@@ -1834,8 +1842,12 @@ static int tun_attach_filter(struct tun_struct *tun)
 	struct tun_file *tfile;
 
 	for (i = 0; i < tun->numqueues; i++) {
+		bool slow;
+
 		tfile = rtnl_dereference(tun->tfiles[i]);
+		slow = lock_sock_fast(tfile->socket.sk);
 		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+		unlock_sock_fast(tfile->socket.sk, slow);
 		if (ret) {
 			tun_detach_filter(tun, i);
 			return ret;
-- 
2.5.5

^ permalink raw reply related

* [PATCH net 0/4] net: fix and tighten rcu dereference checks
From: Hannes Frederic Sowa @ 2016-03-31 23:29 UTC (permalink / raw)
  To: davem; +Cc: netdev, sasha.levin, daniel, alexei.starovoitov, mkubecek

Only the first patch is really applicable for stable. It adds appropriate
socket locks so lockdep doesn't complain if tuntap's ioctls modify the
filters on the socket.

Rest of the patches tighten the rcu dereference socket lock checks.

Last patch fixes missing rcu_read_locks which were discovered by this
change. Certainly there are more to come.

Hannes Frederic Sowa (4):
  tun: add socket locking around sk_{attach,detach}_filter
  net: proper check if we hold the socket lock during dereference
  sock: use lockdep_sock_is_held were appropriate
  tcp: various missing rcu_read_lock around __sk_dst_get

 drivers/net/tun.c        | 12 ++++++++++++
 include/net/sock.h       | 10 ++++++++--
 include/net/tcp.h        |  5 ++++-
 net/core/filter.c        |  6 +++---
 net/core/sock.c          |  7 +++++--
 net/dccp/ipv4.c          |  2 +-
 net/dccp/ipv6.c          |  2 +-
 net/ipv4/af_inet.c       |  2 +-
 net/ipv4/cipso_ipv4.c    |  3 ++-
 net/ipv4/ip_sockglue.c   |  4 ++--
 net/ipv4/tcp_input.c     | 18 ++++++++++++++----
 net/ipv4/tcp_ipv4.c      |  8 +++-----
 net/ipv4/tcp_metrics.c   | 12 +++++-------
 net/ipv4/tcp_output.c    | 22 ++++++++++++++++++++--
 net/ipv6/ipv6_sockglue.c |  6 ++++--
 net/ipv6/tcp_ipv6.c      |  2 +-
 net/socket.c             |  2 +-
 17 files changed, 87 insertions(+), 36 deletions(-)

-- 
2.5.5

^ permalink raw reply

* Urgent Founds!!
From: Mr Tony Hope @ 2016-03-31 22:21 UTC (permalink / raw)
  To: DHS

Urgent Founds!!

We wish to inform you that your over due Inheritance funds which we agreed to pay you in cash is already sealed and package with a security proof box. The funds worth of $7.5 millions US Dollar,in the package will be conveyed to you by an Int'l diplomatic agent, Mr. Tony Hope.

He will be leaving for your country any time from now, therefore reach us with the details below.

Using a Diplomatic agent this time is because of the failure that were recorded in the other transfer options. Just try and give the diplomat your information and offer him all assistance he may need, especially directive assistance so that he will be able to get your consignment box to you in the couple of days.

Please contact him with your full information, such as your.

1) Full name: ......
2) Resident address:......
3) Phone number:.....
4) The name of your nearest airport:....
Send him the information above for him to locate your home with your package.

Below is some of his contact information:

Name: Mr Tony Hope
Email: (dipltonyhope@photofile.ru)Phone numbers:
+229-99 22 69 25

---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus

^ permalink raw reply

* Re: possible bug in latest network tree
From: Cong Wang @ 2016-03-31 22:34 UTC (permalink / raw)
  To: Light, John J; +Cc: David S. Miller, netdev@vger.kernel.org
In-Reply-To: <660C7E53FDDFB64C8741117E0D659AAAB8463B9F@ORSMSX116.amr.corp.intel.com>

On Wed, Mar 30, 2016 at 2:05 PM, Light, John J <john.j.light@intel.com> wrote:
>
> Maybe the problem is that the default shouldn't be to a TCP value, but should be a 'transport' value.
>

The code is fine because net->ipv4 is always there, it doesn't
depend on any CONFIG, so it is safe for dccp to use too, although
I agree it is a bit ugly.

^ permalink raw reply

* Re: Best way to reduce system call overhead for tun device I/O?
From: Guus Sliepen @ 2016-03-31 22:28 UTC (permalink / raw)
  To: David Miller; +Cc: tom, netdev
In-Reply-To: <20160331.172050.1423948167839485574.davem@davemloft.net>

On Thu, Mar 31, 2016 at 05:20:50PM -0400, David Miller wrote:

> >> I'm trying to reduce system call overhead when reading/writing to/from a
> >> tun device in userspace. [...] What would be the right way to do this?
> >>
> > Personally I think tun could benefit greatly if it were implemented as
> > a socket instead of character interface. One thing that could be much
> > better is sending/receiving of meta data attached to skbuf. For
> > instance GSO data could be in ancillary data in a socket instead of
> > inline with packet data as tun seems to be doing now.
> 
> Agreed.

Ok. So how should the userspace API work? Creating an AF_PACKET socket
and then using a tun ioctl to create a tun interface and bind it to the
socket?

int fd = socket(AF_PACKET, ...)
struct ifreq ifr = {...};
ioctl(fd, TUNSETIFF, &ifr);

-- 
Met vriendelijke groet / with kind regards,
     Guus Sliepen <guus@tinc-vpn.org>

^ permalink raw reply

* [PATCH v2 net-next] net: ipv4: Consider unreachable nexthops in multipath routes
From: David Ahern @ 2016-03-31 22:24 UTC (permalink / raw)
  To: netdev; +Cc: ja, David Ahern

Multipath route lookups should consider knowledge about next hops and not
select a hop that is known to be failed.

Example:

                     [h2]                   [h3]   15.0.0.5
                      |                      |
                     3|                     3|
                    [SP1]                  [SP2]--+
                     1  2                   1     2
                     |  |     /-------------+     |
                     |   \   /                    |
                     |     X                      |
                     |    / \                     |
                     |   /   \---------------\    |
                     1  2                     1   2
         12.0.0.2  [TOR1] 3-----------------3 [TOR2] 12.0.0.3
                     4                         4
                      \                       /
                        \                    /
                         \                  /
                          -------|   |-----/
                                 1   2
                                [TOR3]
                                  3|
                                   |
                                  [h1]  12.0.0.1

host h1 with IP 12.0.0.1 has 2 paths to host h3 at 15.0.0.5:

    root@h1:~# ip ro ls
    ...
    12.0.0.0/24 dev swp1  proto kernel  scope link  src 12.0.0.1
    15.0.0.0/16
            nexthop via 12.0.0.2  dev swp1 weight 1
            nexthop via 12.0.0.3  dev swp1 weight 1
    ...

If the link between tor3 and tor1 is down and the link between tor1
and tor2 then tor1 is effectively cut-off from h1. Yet the route lookups
in h1 are alternating between the 2 routes: ping 15.0.0.5 gets one and
ssh 15.0.0.5 gets the other. Connections that attempt to use the
12.0.0.2 nexthop fail since that neighbor is not reachable:

    root@h1:~# ip neigh show
    ...
    12.0.0.3 dev swp1 lladdr 00:02:00:00:00:1b REACHABLE
    12.0.0.2 dev swp1  FAILED
    ...

The failed path can be avoided by considering known neighbor information
when selecting next hops. If the neighbor lookup fails we have no
knowledge about the nexthop, so give it a shot. If there is an entry
then only select the nexthop if the state is sane. This is similar to
what fib_detect_death does.

To maintain backward compatibility use of the neighbor information is
based on a new sysctl, fib_multipath_use_neigh.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v2
- use rcu locking to avoid refcnts per Eric's suggestion
- only consider neighbor info for nh_scope == RT_SCOPE_LINK per Julian's
  comment
- drop the 'state == NUD_REACHABLE' from the state check since it is
  part of NUD_VALID (comment from Julian)
- wrapped the use of the neigh in a sysctl

 Documentation/networking/ip-sysctl.txt | 10 ++++++++++
 include/net/netns/ipv4.h               |  3 +++
 net/ipv4/fib_semantics.c               | 32 ++++++++++++++++++++++++++++++--
 net/ipv4/sysctl_net_ipv4.c             | 11 +++++++++++
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index b183e2b606c8..5b316d33a23f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -63,6 +63,16 @@ fwmark_reflect - BOOLEAN
 	fwmark of the packet they are replying to.
 	Default: 0
 
+fib_multipath_use_neigh - BOOLEAN
+	Use status of existing neighbor entry when determining nexthop for
+	multipath routes. If disabled neighbor information is not used then
+	packets could be directed to a dead nexthop. Only valid for kernels
+	built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+	Default: 0 (disabled)
+	Possible values:
+	0 - disabled
+	1 - enabled
+
 route/max_size - INTEGER
 	Maximum number of routes allowed in the kernel.  Increase
 	this when using large numbers of interfaces and/or routes.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index a69cde3ce460..d061ffeb1e71 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -133,6 +133,9 @@ struct netns_ipv4 {
 	struct fib_rules_ops	*mr_rules_ops;
 #endif
 #endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	int sysctl_fib_multipath_use_neigh;
+#endif
 	atomic_t	rt_genid;
 };
 #endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d97268e8ff10..6d423faff0ce 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1559,17 +1559,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 }
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
+static bool fib_good_nh(const struct fib_nh *nh, struct net_device *dev)
+{
+	struct neighbour *n = NULL;
+	int state = NUD_NONE;
+
+	if (nh->nh_scope == RT_SCOPE_LINK) {
+		rcu_read_lock_bh();
+
+		n = __neigh_lookup_noref(&arp_tbl, &nh->nh_gw, dev);
+		if (n)
+			state = n->nud_state;
+
+		rcu_read_unlock_bh();
+	}
+
+	/* outside of rcu locking using n only as a boolean
+	 * on whether a neighbor entry existed
+	 */
+	if (!n || (state & NUD_VALID))
+		return true;
+
+	return false;
+}
 
 void fib_select_multipath(struct fib_result *res, int hash)
 {
 	struct fib_info *fi = res->fi;
+	struct net_device *dev = fi->fib_dev;
+	struct net *net = fi->fib_net;
 
 	for_nexthops(fi) {
 		if (hash > atomic_read(&nh->nh_upper_bound))
 			continue;
 
-		res->nh_sel = nhsel;
-		return;
+		if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
+		    fib_good_nh(nh, dev)) {
+			res->nh_sel = nhsel;
+			return;
+		}
 	} endfor_nexthops(fi);
 
 	/* Race condition: route has just become dead. */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1e1fe6086dd9..bb0419582b8d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	{
+		.procname	= "fib_multipath_use_neigh",
+		.data		= &init_net.ipv4.sysctl_fib_multipath_use_neigh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{ }
 };
 
-- 
1.9.1

^ permalink raw reply related

* Re: qdisc spin lock
From: Cong Wang @ 2016-03-31 22:16 UTC (permalink / raw)
  To: Michael Ma; +Cc: Linux Kernel Network Developers
In-Reply-To: <CAAmHdhw9bQkCm7uehRZ9mTetMzafdXxWhYj16f8W-YvSz8V4=g@mail.gmail.com>

On Wed, Mar 30, 2016 at 12:20 AM, Michael Ma <make0818@gmail.com> wrote:
> As far as I understand the design of TC is to simplify locking schema
> and minimize the work in __qdisc_run so that throughput won’t be
> affected, especially with large packets. However if the scenario is
> that multiple classes in the queueing discipline only have the shaping
> limit, there isn’t really a necessary correlation between different
> classes. The only synchronization point should be when the packet is
> dequeued from the qdisc queue and enqueued to the transmit queue of
> the device. My question is – is it worth investing on avoiding the
> locking contention by partitioning the queue/lock so that this
> scenario is addressed with relatively smaller latency?

If your HTB classes don't share bandwidth, why do you still make them
under the same hierarchy? IOW, you can just isolate them either with some
other qdisc or just separated interfaces.

^ permalink raw reply

* Re: [net PATCH v2] i40e/i40evf: Limit TSO to 7 descriptors for payload instead of 8 per packet
From: Alexander Duyck @ 2016-03-31 22:09 UTC (permalink / raw)
  To: Jesse Brandeburg
  Cc: Alexander Duyck, Netdev, intel-wired-lan, Jeff Kirsher,
	Sowmini Varadhan
In-Reply-To: <20160331150438.00005617@unknown>

On Thu, Mar 31, 2016 at 3:04 PM, Jesse Brandeburg
<jesse.brandeburg@intel.com> wrote:
> On Wed, 30 Mar 2016 16:15:37 -0700
> Alexander Duyck <aduyck@mirantis.com> wrote:
>
>> This patch addresses a bug introduced based on my interpretation of the
>> XL710 datasheet.  Specifically section 8.4.1 states that "A single transmit
>> packet may span up to 8 buffers (up to 8 data descriptors per packet
>> including both the header and payload buffers)."  It then later goes on to
>> say that each segment for a TSO obeys the previous rule, however it then
>> refers to TSO header and the segment payload buffers.
>>
>> I believe the actual limit for fragments with TSO and a skbuff that has
>> payload data in the header portion of the buffer is actually only 7
>> fragments as the skb->data portion counts as 2 buffers, one for the TSO
>> header, and one for a segment payload buffer.
>>
>> Fixes: 2d37490b82af ("i40e/i40evf: Rewrite logic for 8 descriptor per packet check")
>> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
>> ---
>>
>> v2: I realized that I overlooked the check in the inline function and as a
>>     result we were still allowing for cases where 8 descriptors were being
>>     used per packet and this would result in 9 DMA buffers.  I updated the
>>     code so that we only allow 8 in the case of a single send, otherwise we
>>     go into the function that walks the frags to verify each block.
>>
>> I have tested this using rds-stress and it seems to run traffic without
>> throwing any errors.
>
> Looking like it is working for me too with at least the PF.

I was testing PF <-> VF in my environment so I think I ended up
covering both in my test at least.

- Alex

^ permalink raw reply

* Re: [net PATCH v2] i40e/i40evf: Limit TSO to 7 descriptors for payload instead of 8 per packet
From: Jesse Brandeburg @ 2016-03-31 22:04 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: netdev, alexander.duyck, intel-wired-lan, jeffrey.t.kirsher,
	sowmini.varadhan
In-Reply-To: <20160330231116.12643.59554.stgit@localhost.localdomain>

On Wed, 30 Mar 2016 16:15:37 -0700
Alexander Duyck <aduyck@mirantis.com> wrote:

> This patch addresses a bug introduced based on my interpretation of the
> XL710 datasheet.  Specifically section 8.4.1 states that "A single transmit
> packet may span up to 8 buffers (up to 8 data descriptors per packet
> including both the header and payload buffers)."  It then later goes on to
> say that each segment for a TSO obeys the previous rule, however it then
> refers to TSO header and the segment payload buffers.
> 
> I believe the actual limit for fragments with TSO and a skbuff that has
> payload data in the header portion of the buffer is actually only 7
> fragments as the skb->data portion counts as 2 buffers, one for the TSO
> header, and one for a segment payload buffer.
> 
> Fixes: 2d37490b82af ("i40e/i40evf: Rewrite logic for 8 descriptor per packet check")
> Signed-off-by: Alexander Duyck <aduyck@mirantis.com>
> ---
> 
> v2: I realized that I overlooked the check in the inline function and as a
>     result we were still allowing for cases where 8 descriptors were being
>     used per packet and this would result in 9 DMA buffers.  I updated the
>     code so that we only allow 8 in the case of a single send, otherwise we
>     go into the function that walks the frags to verify each block.
> 
> I have tested this using rds-stress and it seems to run traffic without
> throwing any errors.

Looking like it is working for me too with at least the PF.

Acked-by: Jesse Brandeburg <jesse.brandeburg@intel.com>

Should also add:
Reported-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>

^ permalink raw reply

* Re: [PATCH net-next 0/8] add TX timestamping via cmsg
From: Martin KaFai Lau @ 2016-03-31 21:57 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Soheil Hassas Yeganeh, Network Development
In-Reply-To: <CAF=yD-Lpx2nBq1bL64-pzOqHh5E-a4Bbr2goDE59ApWvzFri_Q@mail.gmail.com>

On Wed, Mar 30, 2016 at 11:50:31PM -0400, Willem de Bruijn wrote:
> >> Nice patches!
>
> This does not yet solve the append issue that your MSG_EOR patch
> addresses, of course.
Yes.  I have been thinking about both approaches.

>
> The straightforward jump to new_segment that I proposed as
> simplification is more properly a "start-of-record" than
> "end-of-record" signal. It is probably preferable to indeed be able to
> pass EOR as signal that the last skb must not be appended to in
> subsequent calls.
I suspect we could do better than only checking MSG_EOR and jump.
Before entering the loop, we may be able to check if the
last-not-yet-written out skb has tskey set and the current
tcp_sendmsg may need to overwrite it before jumping.

Also, the 2nd sendmsg may not be called with MSG_EOR set but
the per-write-knob is turned on.  It could overwrite the
1st sendmsg with both per-write-knob on and MSG_EOR set.

Note that there is another collapse-case during tcp retrans
where the MSG_EOR bit is already loss.

Hence, having EOR passed as signal (as you mentioned) and stored
is needed.

I think another bit in the TCP_SKB_CB may help here.
The semantic of this bit could be 'no skb merge under some rare conditions'.
For now, it is limited to tskey.

[Another side note is, the split/fragment case should be fine as it is.
 When splitting a skb into two smaller skbs, the tskey should fall
 into either of them and no information loss.]

> I think that the record bounds issue is best solved independently from
> the interface for intermittent timestamps because
I understand that users may want to selectively do timestamping on a
particular sendmsg (per-write-knob), while do not care if the tskey
will be overwritten (no-tskey-overwritten) by the future
sendmsg/retrans.  Separating them gives the end-user a choice.

On the other hand, if the caller has specifically asked to do tstamp in
a particular tcp_sendmsg, it is a strong intention that the caller wants to
specifically track this message alone and not expecting this tstmap to
include anything else sent after it.  Beside, TLS user needs to make more
effort to pass the per-write-knob to tcp_sendmsg.  Hence, when per-write-knob
is used, I think the no-tskey-overwritten should be at least allowed (if
not the default).

> (a) changing the tcp
> bytestream packetization for timestamping introduces subtle
> differences between tracked and untracked data that are not always
> acceptable and (b) EOR can also be useful outside timestamps. A
> zerocopy sendmsg patchset that I sent for RFC last year encountered a
> similar requirement, to give one example: each skb with user data must
> point to a completion notification structure (ubuf_info), and can only
> point to one at a time. Appends that cause a conflict in skb->uarg
> pointers had to be blocked, at the cost of possibly different
> packetization compared to regular sends.
With no-tskey-overwritten turned on in production, we don't found the
end-user's performance has been impacted in meaningful way. I believe
it is the proper tradeoff as an opt-in feature to get measurement
data for existing tcp-socket users without adding a lot of burden
to the TCP stack which is a byte-oriented socket to begin with.

^ permalink raw reply

* Re: [PATCH net] tun, bpf: fix suspicious RCU usage in tun_{attach,detach}_filter
From: Daniel Borkmann @ 2016-03-31 21:52 UTC (permalink / raw)
  To: Hannes Frederic Sowa, David Miller
  Cc: eric.dumazet, alexei.starovoitov, mkubecek, sasha.levin, jslaby,
	mst, netdev
In-Reply-To: <56FD7F0B.5090602@stressinduktion.org>

On 03/31/2016 09:48 PM, Hannes Frederic Sowa wrote:
[...]
> Tightest solution would probably be to combine both patches.
>
> bool called_by_tuntap;
>
> old_fp = rcu_dereference_protected(sk->sk_filter, called_by_tuntap ? lockdep_rtnl_is_held() : lockdep_sock_is_held());

If I understand you correctly with combining them, you mean you'd still
need the API change to pass the bool 'called_by_tuntap' down, right?

If so, your main difference is, after all, to replace the sock_owned_by_user()
with the lockdep_sock_is_held() construction instead, correct?

But then, isn't it already sufficient when you pass the bool itself down
that 'demuxes' in this case between the sock_owned_by_user() vs
lockdep_rtnl_is_held() check?

Thanks,
Daniel

^ permalink raw reply

* [PATCH] net: thunderx: Fix broken of_node_put() code.
From: David Daney @ 2016-03-31 21:42 UTC (permalink / raw)
  To: David S. Miller, netdev
  Cc: linux-kernel, linux-arm-kernel, Robert Richter, Sunil Goutham,
	David Daney

From: David Daney <david.daney@cavium.com>

commit b7d3e3d3d21a ("net: thunderx: Don't leak phy device references
on -EPROBE_DEFER condition.") incorrectly moved the call to
of_node_put() outside of the loop.  Under normal loop exit, the node
has already had of_node_put() called, so the extra call results in:

[    8.228020] ERROR: Bad of_node_put() on /soc@0/pci@848000000000/mrml-bridge0@1,0/bgx0/xlaui00
[    8.239433] CPU: 16 PID: 608 Comm: systemd-udevd Not tainted 4.6.0-rc1-numa+ #157
[    8.247380] Hardware name: www.cavium.com EBB8800/EBB8800, BIOS 0.3 Mar  2 2016
[    8.273541] Call trace:
[    8.273550] [<fffffc0008097364>] dump_backtrace+0x0/0x210
[    8.273557] [<fffffc0008097598>] show_stack+0x24/0x2c
[    8.273560] [<fffffc0008399ed0>] dump_stack+0x8c/0xb4
[    8.273566] [<fffffc00085aa828>] of_node_release+0xa8/0xac
[    8.273570] [<fffffc000839cad8>] kobject_cleanup+0x8c/0x194
[    8.273573] [<fffffc000839c97c>] kobject_put+0x44/0x6c
[    8.273576] [<fffffc00085a9ab0>] of_node_put+0x24/0x30
[    8.273587] [<fffffc0000bd0f74>] bgx_probe+0x17c/0xcd8 [thunder_bgx]
[    8.273591] [<fffffc00083ed220>] pci_device_probe+0xa0/0x114
[    8.273596] [<fffffc0008473fbc>] driver_probe_device+0x178/0x418
[    8.273599] [<fffffc000847435c>] __driver_attach+0x100/0x118
[    8.273602] [<fffffc0008471b58>] bus_for_each_dev+0x6c/0xac
[    8.273605] [<fffffc0008473884>] driver_attach+0x30/0x38
[    8.273608] [<fffffc00084732f4>] bus_add_driver+0x1f8/0x29c
[    8.273611] [<fffffc0008475028>] driver_register+0x70/0x110
[    8.273617] [<fffffc00083ebf08>] __pci_register_driver+0x60/0x6c
[    8.273623] [<fffffc0000bf0040>] bgx_init_module+0x40/0x48 [thunder_bgx]
[    8.273626] [<fffffc0008090d04>] do_one_initcall+0xcc/0x1c0
[    8.273631] [<fffffc0008198abc>] do_init_module+0x68/0x1c8
[    8.273635] [<fffffc0008125668>] load_module+0xf44/0x11f4
[    8.273638] [<fffffc0008125b64>] SyS_finit_module+0xb8/0xe0
[    8.273641] [<fffffc0008093b30>] el0_svc_naked+0x24/0x28

Go back to the previous (correct) code that only did the extra
of_node_put() call on early exit from the loop.

Signed-off-by: David Daney <david.daney@cavium.com>
---
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 9679515..d20539a 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -1011,10 +1011,11 @@ static int bgx_init_of_phy(struct bgx *bgx)
 		}
 
 		lmac++;
-		if (lmac == MAX_LMAC_PER_BGX)
+		if (lmac == MAX_LMAC_PER_BGX) {
+			of_node_put(node);
 			break;
+		}
 	}
-	of_node_put(node);
 	return 0;
 
 defer:
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH] sctp: avoid refreshing heartbeat timer too often
From: 'Marcelo Ricardo Leitner' @ 2016-03-31 21:25 UTC (permalink / raw)
  To: David Laight
  Cc: netdev@vger.kernel.org, Neil Horman, Vlad Yasevich,
	linux-sctp@vger.kernel.org
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6D54DC1DB0@AcuExch.aculab.com>

On Thu, Mar 31, 2016 at 11:16:52AM +0000, David Laight wrote:
> From: Marcelo Ricardo Leitner
> > Sent: 30 March 2016 13:13
> > Em 30-03-2016 06:37, David Laight escreveu:
> > > From: Marcelo Ricardo Leitner
> > >> Sent: 29 March 2016 14:42
> > >>
> > >> Currently on high rate SCTP streams the heartbeat timer refresh can
> > >> consume quite a lot of resources as timer updates are costly and it
> > >> contains a random factor, which a) is also costly and b) invalidates
> > >> mod_timer() optimization for not editing a timer to the same value.
> > >> It may even cause the timer to be slightly advanced, for no good reason.
> > >
> > > Interesting thoughts:
> > > 1) Is it necessary to use a different 'random factor' until the timer actually
> > >     expires?
> > 
> > I don't understand you fully here, but we have to have a random factor
> > on timer expire. As noted by Daniel Borkmann on his commit 8f61059a96c2
> > ("net: sctp: improve timer slack calculation for transport HBs"):
> 
> When a HEARTBEAT chunk is sent determine the new interval, use that
> interval until the timer actually expires when a new interval is
> calculated. So the random number is only generated once per heartbeat.
> 
> >      RFC4960, section 8.3 says:
> > 
> >        On an idle destination address that is allowed to heartbeat,
> >        it is recommended that a HEARTBEAT chunk is sent once per RTO
> >        of that destination address plus the protocol parameter
> >        'HB.interval', with jittering of +/- 50% of the RTO value,
> >        and exponential backoff of the RTO if the previous HEARTBEAT
> >        is unanswered.
> > 
> > Previous to his commit, it was using a random factor based on jiffies.
> > 
> > This patch then assumes that random_A+2 is just as random as random_B as
> > long as it is within the allowed range, avoiding the unnecessary updates.
> > 
> > > 2) It might be better to allow the heartbeat timer to expire, on expiry work
> > >     out the new interval based on when the last 'refresh' was done.
> > 
> > Cool, I thought about this too. It would introduce some extra complexity
> > that is not really worth I think, specially because now we may be doing
> > more timer updates even with this patch but it's not triggering any wake
> > ups and we would need at least 2 wake ups then: one for the first
> > timeout event, and then re-schedule the timer for the next updated one,
> > and maybe again, and again.. less timer updates but more wake ups, one
> > at every heartbeat interval even on a busy transport. Seems it's cheaper
> > to just update the timer then.
> 
> One wakeup per heartbeat interval on a busy connection is probably noise.
> Probably much less than the 1000s of timer updates that would otherwise happen.

I was thinking more on near-idle systems, as the overhead for this
refresh looked rather small now even for busy transports if compared to
other points, a worth trade-off for reducing wake ups, imho.

But then I checked tcp, and it does what you're suggesting.
I'll rework the patch. Thanks

> A further optimisation would be to restart the timer if more than (say) 80%
> of the way through the timeout period.
> 
> Similarly the HEARTBEAT could be sent if the 2nd wakeup would be almost immediate.

^ permalink raw reply

* Re: Best way to reduce system call overhead for tun device I/O?
From: David Miller @ 2016-03-31 21:20 UTC (permalink / raw)
  To: tom; +Cc: guus, netdev
In-Reply-To: <CALx6S37xsTeDyMHOEiGzjpbR_5zFw6XSDEgPkZqpUzEa9WDStg@mail.gmail.com>

From: Tom Herbert <tom@herbertland.com>
Date: Thu, 31 Mar 2016 17:18:48 -0400

> On Tue, Mar 29, 2016 at 6:40 PM, Guus Sliepen <guus@tinc-vpn.org> wrote:
>> I'm trying to reduce system call overhead when reading/writing to/from a
>> tun device in userspace. For sockets, one can use sendmmsg()/recvmmsg(),
>> but a tun fd is not a socket fd, so this doesn't work. I'm see several
>> options to allow userspace to read/write multiple packets with one
>> syscall:
>>
>> - Implement a TX/RX ring buffer that is mmap()ed, like with AF_PACKET
>>   sockets.
>>
>> - Implement a ioctl() to emulate sendmmsg()/recvmmsg().
>>
>> - Add a flag that can be set using TUNSETIFF that makes regular
>>   read()/write() calls handle multiple packets in one go.
>>
>> - Expose a socket fd to userspace, so regular sendmmsg()/recvmmsg() can
>>   be used. There is tun_get_socket() which is used internally in the
>>   kernel, but this is not exposed to userspace, and doesn't look trivial
>>   to do either.
>>
>> What would be the right way to do this?
>>
> Personally I think tun could benefit greatly if it were implemented as
> a socket instead of character interface. One thing that could be much
> better is sending/receiving of meta data attached to skbuf. For
> instance GSO data could be in ancillary data in a socket instead of
> inline with packet data as tun seems to be doing now.

Agreed.

^ permalink raw reply

* Re: [PATCH 1/1] net: usb: cdc_ncm: adding Telit LE910 V2 mobile broadband card
From: David Miller @ 2016-03-31 21:20 UTC (permalink / raw)
  To: bjorn-yOkvZcmFvRU
  Cc: dnlplm-Re5JQEeQqe8AvxtiuMwx3w, linux-usb-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <87lh4ycq5p.fsf-lbf33ChDnrE/G1V5fR+Y7Q@public.gmane.org>

From: Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org>
Date: Thu, 31 Mar 2016 23:07:30 +0200

> David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> writes:
>> From: Daniele Palmas <dnlplm-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> Date: Thu, 31 Mar 2016 15:16:47 +0200
>>
>>> Telit LE910 V2 is a mobile broadband card with no ARP capabilities:
>>> the patch makes this device to use wwan_noarp_info struct
>>> 
>>> Signed-off-by: Daniele Palmas <dnlplm-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>
>> Bjorn, can you take a quick look at this?
> 
> Looks fine to me.
> 
> Reviewed-by: Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org>

Thanks, applied.
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Best way to reduce system call overhead for tun device I/O?
From: Tom Herbert @ 2016-03-31 21:18 UTC (permalink / raw)
  To: Guus Sliepen; +Cc: Linux Kernel Network Developers
In-Reply-To: <20160329224043.GY3784@sliepen.org>

On Tue, Mar 29, 2016 at 6:40 PM, Guus Sliepen <guus@tinc-vpn.org> wrote:
> I'm trying to reduce system call overhead when reading/writing to/from a
> tun device in userspace. For sockets, one can use sendmmsg()/recvmmsg(),
> but a tun fd is not a socket fd, so this doesn't work. I'm see several
> options to allow userspace to read/write multiple packets with one
> syscall:
>
> - Implement a TX/RX ring buffer that is mmap()ed, like with AF_PACKET
>   sockets.
>
> - Implement a ioctl() to emulate sendmmsg()/recvmmsg().
>
> - Add a flag that can be set using TUNSETIFF that makes regular
>   read()/write() calls handle multiple packets in one go.
>
> - Expose a socket fd to userspace, so regular sendmmsg()/recvmmsg() can
>   be used. There is tun_get_socket() which is used internally in the
>   kernel, but this is not exposed to userspace, and doesn't look trivial
>   to do either.
>
> What would be the right way to do this?
>
Personally I think tun could benefit greatly if it were implemented as
a socket instead of character interface. One thing that could be much
better is sending/receiving of meta data attached to skbuf. For
instance GSO data could be in ancillary data in a socket instead of
inline with packet data as tun seems to be doing now.

Tom

> --
> Met vriendelijke groet / with kind regards,
>      Guus Sliepen <guus@tinc-vpn.org>

^ permalink raw reply

* Re: [PATCH 1/1] net: usb: cdc_ncm: adding Telit LE910 V2 mobile broadband card
From: Bjørn Mork @ 2016-03-31 21:07 UTC (permalink / raw)
  To: David Miller
  Cc: dnlplm-Re5JQEeQqe8AvxtiuMwx3w, linux-usb-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20160331.165138.1846092223088368562.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> writes:
> From: Daniele Palmas <dnlplm-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> Date: Thu, 31 Mar 2016 15:16:47 +0200
>
>> Telit LE910 V2 is a mobile broadband card with no ARP capabilities:
>> the patch makes this device to use wwan_noarp_info struct
>> 
>> Signed-off-by: Daniele Palmas <dnlplm-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>
> Bjorn, can you take a quick look at this?

Looks fine to me.

Reviewed-by: Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org>


Bjørn
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net-next v2 6/6] net: dsa: mv88e6131: enable hardware bridging
From: Vivien Didelot @ 2016-03-31 20:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn,
	Vivien Didelot
In-Reply-To: <1459457626-30082-1-git-send-email-vivien.didelot@savoirfairelinux.com>

By adding support for bridge operations, FDB operations, and optionally
VLAN operations (for 802.1Q and VLAN filtering aware systems), the
switch bridges ports correctly, the CPU is able to populate the hardware
address databases, and thus hardware bridging becomes functional within
the 88E6185 family of switches.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6131.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c
index a92ca65..2407028 100644
--- a/drivers/net/dsa/mv88e6131.c
+++ b/drivers/net/dsa/mv88e6131.c
@@ -169,6 +169,17 @@ struct dsa_switch_driver mv88e6131_switch_driver = {
 	.get_ethtool_stats	= mv88e6xxx_get_ethtool_stats,
 	.get_sset_count		= mv88e6xxx_get_sset_count,
 	.adjust_link		= mv88e6xxx_adjust_link,
+	.port_bridge_join	= mv88e6xxx_port_bridge_join,
+	.port_bridge_leave	= mv88e6xxx_port_bridge_leave,
+	.port_vlan_filtering	= mv88e6xxx_port_vlan_filtering,
+	.port_vlan_prepare	= mv88e6xxx_port_vlan_prepare,
+	.port_vlan_add		= mv88e6xxx_port_vlan_add,
+	.port_vlan_del		= mv88e6xxx_port_vlan_del,
+	.port_vlan_dump		= mv88e6xxx_port_vlan_dump,
+	.port_fdb_prepare       = mv88e6xxx_port_fdb_prepare,
+	.port_fdb_add           = mv88e6xxx_port_fdb_add,
+	.port_fdb_del           = mv88e6xxx_port_fdb_del,
+	.port_fdb_dump          = mv88e6xxx_port_fdb_dump,
 };
 
 MODULE_ALIAS("platform:mv88e6085");
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v2 5/6] net: dsa: mv88e6xxx: map destination addresses for 6185
From: Vivien Didelot @ 2016-03-31 20:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn,
	Vivien Didelot
In-Reply-To: <1459457626-30082-1-git-send-email-vivien.didelot@savoirfairelinux.com>

The 88E6185 switch also has a MapDA bit in its Port Control 2 register.
When this bit is cleared, all frames are sent out to the CPU port.

Set this bit to rely on address databases (ATU) hits and direct frames
out of the correct ports, and thus allow hardware bridging.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 52c3e17..56f5657 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2455,7 +2455,8 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
 	reg = 0;
 	if (mv88e6xxx_6352_family(ds) || mv88e6xxx_6351_family(ds) ||
 	    mv88e6xxx_6165_family(ds) || mv88e6xxx_6097_family(ds) ||
-	    mv88e6xxx_6095_family(ds) || mv88e6xxx_6320_family(ds))
+	    mv88e6xxx_6095_family(ds) || mv88e6xxx_6320_family(ds) ||
+	    mv88e6xxx_6185_family(ds))
 		reg = PORT_CONTROL_2_MAP_DA;
 
 	if (mv88e6xxx_6352_family(ds) || mv88e6xxx_6351_family(ds) ||
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v2 4/6] net: dsa: mv88e6xxx: support 256 databases
From: Vivien Didelot @ 2016-03-31 20:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn,
	Vivien Didelot
In-Reply-To: <1459457626-30082-1-git-send-email-vivien.didelot@savoirfairelinux.com>

The 6185 family of devices has only 256 address databases. Their 8-bit
FID for ATU and VTU operations are split into ATU Control and ATU/VTU
Operation registers.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 888c66b..52c3e17 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1003,6 +1003,20 @@ static int _mv88e6xxx_atu_cmd(struct dsa_switch *ds, u16 fid, u16 cmd)
 		ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_FID, fid);
 		if (ret < 0)
 			return ret;
+	} else if (mv88e6xxx_num_databases(ds) == 256) {
+		/* ATU DBNum[7:4] are located in ATU Control 15:12 */
+		ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL, GLOBAL_ATU_CONTROL);
+		if (ret < 0)
+			return ret;
+
+		ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_CONTROL,
+					   (ret & 0xfff) |
+					   ((fid << 8) & 0xf000));
+		if (ret < 0)
+			return ret;
+
+		/* ATU DBNum[3:0] are located in ATU Operation 3:0 */
+		cmd |= fid & 0xf;
 	}
 
 	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_OP, cmd);
@@ -1373,6 +1387,17 @@ static int _mv88e6xxx_vtu_getnext(struct dsa_switch *ds,
 				return ret;
 
 			next.fid = ret & GLOBAL_VTU_FID_MASK;
+		} else if (mv88e6xxx_num_databases(ds) == 256) {
+			/* VTU DBNum[7:4] are located in VTU Operation 11:8, and
+			 * VTU DBNum[3:0] are located in VTU Operation 3:0
+			 */
+			ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL,
+						  GLOBAL_VTU_OP);
+			if (ret < 0)
+				return ret;
+
+			next.fid = (ret & 0xf00) >> 4;
+			next.fid |= ret & 0xf;
 		}
 
 		if (mv88e6xxx_has_stu(ds)) {
@@ -1443,6 +1468,7 @@ unlock:
 static int _mv88e6xxx_vtu_loadpurge(struct dsa_switch *ds,
 				    struct mv88e6xxx_vtu_stu_entry *entry)
 {
+	u16 op = GLOBAL_VTU_OP_VTU_LOAD_PURGE;
 	u16 reg = 0;
 	int ret;
 
@@ -1470,6 +1496,12 @@ static int _mv88e6xxx_vtu_loadpurge(struct dsa_switch *ds,
 		ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_VTU_FID, reg);
 		if (ret < 0)
 			return ret;
+	} else if (mv88e6xxx_num_databases(ds) == 256) {
+		/* VTU DBNum[7:4] are located in VTU Operation 11:8, and
+		 * VTU DBNum[3:0] are located in VTU Operation 3:0
+		 */
+		op |= (entry->fid & 0xf0) << 8;
+		op |= entry->fid & 0xf;
 	}
 
 	reg = GLOBAL_VTU_VID_VALID;
@@ -1479,7 +1511,7 @@ loadpurge:
 	if (ret < 0)
 		return ret;
 
-	return _mv88e6xxx_vtu_cmd(ds, GLOBAL_VTU_OP_VTU_LOAD_PURGE);
+	return _mv88e6xxx_vtu_cmd(ds, op);
 }
 
 static int _mv88e6xxx_stu_getnext(struct dsa_switch *ds, u8 sid,
@@ -1564,6 +1596,8 @@ static int _mv88e6xxx_port_fid(struct dsa_switch *ds, int port, u16 *new,
 
 	if (mv88e6xxx_num_databases(ds) == 4096)
 		upper_mask = 0xff;
+	else if (mv88e6xxx_num_databases(ds) == 256)
+		upper_mask = 0xf;
 	else
 		return -EOPNOTSUPP;
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v2 3/6] net: dsa: mv88e6xxx: variable number of databases
From: Vivien Didelot @ 2016-03-31 20:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn,
	Vivien Didelot
In-Reply-To: <1459457626-30082-1-git-send-email-vivien.didelot@savoirfairelinux.com>

Marvell switch chips have different number of address databases.

The code currently only supports models with 4096 databases. Such switch
has dedicated FID registers for ATU and VTU operations. Models with
fewer databases have their FID split in several registers.

List them all but only support models with 4096 databases at the moment.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index cc066dc..888c66b 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -482,6 +482,30 @@ static bool mv88e6xxx_6352_family(struct dsa_switch *ds)
 	return false;
 }
 
+static unsigned int mv88e6xxx_num_databases(struct dsa_switch *ds)
+{
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+
+	/* The following devices have 4-bit identifiers for 16 databases */
+	if (ps->id == PORT_SWITCH_ID_6061)
+		return 16;
+
+	/* The following devices have 6-bit identifiers for 64 databases */
+	if (ps->id == PORT_SWITCH_ID_6065)
+		return 64;
+
+	/* The following devices have 8-bit identifiers for 256 databases */
+	if (mv88e6xxx_6095_family(ds) || mv88e6xxx_6185_family(ds))
+		return 256;
+
+	/* The following devices have 12-bit identifiers for 4096 databases */
+	if (mv88e6xxx_6097_family(ds) || mv88e6xxx_6165_family(ds) ||
+	    mv88e6xxx_6351_family(ds) || mv88e6xxx_6352_family(ds))
+		return 4096;
+
+	return 0;
+}
+
 static bool mv88e6xxx_has_fid_reg(struct dsa_switch *ds)
 {
 	/* Does the device have dedicated FID registers for ATU and VTU ops? */
@@ -1534,9 +1558,15 @@ loadpurge:
 static int _mv88e6xxx_port_fid(struct dsa_switch *ds, int port, u16 *new,
 			       u16 *old)
 {
+	u16 upper_mask;
 	u16 fid;
 	int ret;
 
+	if (mv88e6xxx_num_databases(ds) == 4096)
+		upper_mask = 0xff;
+	else
+		return -EOPNOTSUPP;
+
 	/* Port's default FID bits 3:0 are located in reg 0x06, offset 12 */
 	ret = _mv88e6xxx_reg_read(ds, REG_PORT(port), PORT_BASE_VLAN);
 	if (ret < 0)
@@ -1559,11 +1589,11 @@ static int _mv88e6xxx_port_fid(struct dsa_switch *ds, int port, u16 *new,
 	if (ret < 0)
 		return ret;
 
-	fid |= (ret & PORT_CONTROL_1_FID_11_4_MASK) << 4;
+	fid |= (ret & upper_mask) << 4;
 
 	if (new) {
-		ret &= ~PORT_CONTROL_1_FID_11_4_MASK;
-		ret |= (*new >> 4) & PORT_CONTROL_1_FID_11_4_MASK;
+		ret &= ~upper_mask;
+		ret |= (*new >> 4) & upper_mask;
 
 		ret = _mv88e6xxx_reg_write(ds, REG_PORT(port), PORT_CONTROL_1,
 					   ret);
@@ -1627,7 +1657,7 @@ static int _mv88e6xxx_fid_new(struct dsa_switch *ds, u16 *fid)
 	 * databases are not needed. Return the next positive available.
 	 */
 	*fid = find_next_zero_bit(fid_bitmap, MV88E6XXX_N_FID, 1);
-	if (unlikely(*fid == MV88E6XXX_N_FID))
+	if (unlikely(*fid >= mv88e6xxx_num_databases(ds)))
 		return -ENOSPC;
 
 	/* Clear the database */
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v2 2/6] net: dsa: mv88e6xxx: protect FID registers access
From: Vivien Didelot @ 2016-03-31 20:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn,
	Vivien Didelot
In-Reply-To: <1459457626-30082-1-git-send-email-vivien.didelot@savoirfairelinux.com>

Only switch families with 4096 address databases have dedicated FID
registers for ATU and VTU operations.

Factorize the access to the GLOBAL_ATU_FID register and introduce a
mv88e6xxx_has_fid_reg() helper function to protect the access to
GLOBAL_ATU_FID and GLOBAL_VTU_FID.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 19a8208..cc066dc 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -482,6 +482,16 @@ static bool mv88e6xxx_6352_family(struct dsa_switch *ds)
 	return false;
 }
 
+static bool mv88e6xxx_has_fid_reg(struct dsa_switch *ds)
+{
+	/* Does the device have dedicated FID registers for ATU and VTU ops? */
+	if (mv88e6xxx_6097_family(ds) || mv88e6xxx_6165_family(ds) ||
+	    mv88e6xxx_6351_family(ds) || mv88e6xxx_6352_family(ds))
+		return true;
+
+	return false;
+}
+
 static bool mv88e6xxx_has_stu(struct dsa_switch *ds)
 {
 	/* Does the device have STU and dedicated SID registers for VTU ops? */
@@ -961,10 +971,16 @@ out:
 	return ret;
 }
 
-static int _mv88e6xxx_atu_cmd(struct dsa_switch *ds, u16 cmd)
+static int _mv88e6xxx_atu_cmd(struct dsa_switch *ds, u16 fid, u16 cmd)
 {
 	int ret;
 
+	if (mv88e6xxx_has_fid_reg(ds)) {
+		ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_FID, fid);
+		if (ret < 0)
+			return ret;
+	}
+
 	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_OP, cmd);
 	if (ret < 0)
 		return ret;
@@ -1011,11 +1027,6 @@ static int _mv88e6xxx_atu_flush_move(struct dsa_switch *ds,
 		return err;
 
 	if (entry->fid) {
-		err = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_FID,
-					   entry->fid);
-		if (err)
-			return err;
-
 		op = static_too ? GLOBAL_ATU_OP_FLUSH_MOVE_ALL_DB :
 			GLOBAL_ATU_OP_FLUSH_MOVE_NON_STATIC_DB;
 	} else {
@@ -1023,7 +1034,7 @@ static int _mv88e6xxx_atu_flush_move(struct dsa_switch *ds,
 			GLOBAL_ATU_OP_FLUSH_MOVE_NON_STATIC;
 	}
 
-	return _mv88e6xxx_atu_cmd(ds, op);
+	return _mv88e6xxx_atu_cmd(ds, entry->fid, op);
 }
 
 static int _mv88e6xxx_atu_flush(struct dsa_switch *ds, u16 fid, bool static_too)
@@ -1331,8 +1342,7 @@ static int _mv88e6xxx_vtu_getnext(struct dsa_switch *ds,
 		if (ret < 0)
 			return ret;
 
-		if (mv88e6xxx_6097_family(ds) || mv88e6xxx_6165_family(ds) ||
-		    mv88e6xxx_6351_family(ds) || mv88e6xxx_6352_family(ds)) {
+		if (mv88e6xxx_has_fid_reg(ds)) {
 			ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL,
 						  GLOBAL_VTU_FID);
 			if (ret < 0)
@@ -1429,7 +1439,9 @@ static int _mv88e6xxx_vtu_loadpurge(struct dsa_switch *ds,
 		ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_VTU_SID, reg);
 		if (ret < 0)
 			return ret;
+	}
 
+	if (mv88e6xxx_has_fid_reg(ds)) {
 		reg = entry->fid & GLOBAL_VTU_FID_MASK;
 		ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_VTU_FID, reg);
 		if (ret < 0)
@@ -1976,11 +1988,7 @@ static int _mv88e6xxx_atu_load(struct dsa_switch *ds,
 	if (ret < 0)
 		return ret;
 
-	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_FID, entry->fid);
-	if (ret < 0)
-		return ret;
-
-	return _mv88e6xxx_atu_cmd(ds, GLOBAL_ATU_OP_LOAD_DB);
+	return _mv88e6xxx_atu_cmd(ds, entry->fid, GLOBAL_ATU_OP_LOAD_DB);
 }
 
 static int _mv88e6xxx_port_fdb_load(struct dsa_switch *ds, int port,
@@ -2063,11 +2071,7 @@ static int _mv88e6xxx_atu_getnext(struct dsa_switch *ds, u16 fid,
 	if (ret < 0)
 		return ret;
 
-	ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_FID, fid);
-	if (ret < 0)
-		return ret;
-
-	ret = _mv88e6xxx_atu_cmd(ds, GLOBAL_ATU_OP_GET_NEXT_DB);
+	ret = _mv88e6xxx_atu_cmd(ds, fid, GLOBAL_ATU_OP_GET_NEXT_DB);
 	if (ret < 0)
 		return ret;
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v2 1/6] net: dsa: mv88e6xxx: protect SID register access
From: Vivien Didelot @ 2016-03-31 20:53 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, kernel, David S. Miller, Andrew Lunn,
	Vivien Didelot
In-Reply-To: <1459457626-30082-1-git-send-email-vivien.didelot@savoirfairelinux.com>

Introduce a mv88e6xxx_has_stu() helper to protect the access to the
GLOBAL_VTU_SID register, instead of checking switch families.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index fa086e0..19a8208 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -482,6 +482,16 @@ static bool mv88e6xxx_6352_family(struct dsa_switch *ds)
 	return false;
 }
 
+static bool mv88e6xxx_has_stu(struct dsa_switch *ds)
+{
+	/* Does the device have STU and dedicated SID registers for VTU ops? */
+	if (mv88e6xxx_6097_family(ds) || mv88e6xxx_6165_family(ds) ||
+	    mv88e6xxx_6351_family(ds) || mv88e6xxx_6352_family(ds))
+		return true;
+
+	return false;
+}
+
 /* We expect the switch to perform auto negotiation if there is a real
  * phy. However, in the case of a fixed link phy, we force the port
  * settings from the fixed link settings.
@@ -1329,7 +1339,9 @@ static int _mv88e6xxx_vtu_getnext(struct dsa_switch *ds,
 				return ret;
 
 			next.fid = ret & GLOBAL_VTU_FID_MASK;
+		}
 
+		if (mv88e6xxx_has_stu(ds)) {
 			ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL,
 						  GLOBAL_VTU_SID);
 			if (ret < 0)
@@ -1412,8 +1424,7 @@ static int _mv88e6xxx_vtu_loadpurge(struct dsa_switch *ds,
 	if (ret < 0)
 		return ret;
 
-	if (mv88e6xxx_6097_family(ds) || mv88e6xxx_6165_family(ds) ||
-	    mv88e6xxx_6351_family(ds) || mv88e6xxx_6352_family(ds)) {
+	if (mv88e6xxx_has_stu(ds)) {
 		reg = entry->sid & GLOBAL_VTU_SID_MASK;
 		ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_VTU_SID, reg);
 		if (ret < 0)
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox