Netdev List
 help / color / mirror / Atom feed
* [PATCH RFC net-next 1/7] sock: add sk_dst_pending_confirm flag
From: Julian Anastasov @ 2016-12-18 20:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, YueHaibing
In-Reply-To: <1482094596-26046-1-git-send-email-ja@ssi.bg>

Add new sock flag to allow sockets to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
As not all call paths lock the socket use full word for
the flag.

Add sk_dst_confirm as replacement for dst_confirm when
called for received packets.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 include/net/sock.h | 14 +++++++++++++-
 net/core/sock.c    |  2 ++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 282d065..e83bb01 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -239,6 +239,7 @@ struct sock_common {
   *	@sk_wq: sock wait queue and async head
   *	@sk_rx_dst: receive input route used by early demux
   *	@sk_dst_cache: destination cache
+  *	@sk_dst_pending_confirm: need to confirm neighbour
   *	@sk_policy: flow policy
   *	@sk_receive_queue: incoming packets
   *	@sk_wmem_alloc: transmit queue bytes committed
@@ -381,7 +382,7 @@ struct sock {
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
-	atomic_t		sk_omem_alloc;
+	int			sk_dst_pending_confirm;
 	int			sk_sndbuf;
 
 	/* ===== cache line for TX ===== */
@@ -405,6 +406,8 @@ struct sock {
 	unsigned int		sk_gso_max_size;
 	gfp_t			sk_allocation;
 	__u32			sk_txhash;
+	atomic_t		sk_omem_alloc;
+	/* Note: 32bit hole on 64bit arches */
 
 	/*
 	 * Because of non atomicity rules, all
@@ -1761,6 +1764,7 @@ static inline void dst_negative_advice(struct sock *sk)
 		if (ndst != dst) {
 			rcu_assign_pointer(sk->sk_dst_cache, ndst);
 			sk_tx_queue_clear(sk);
+			sk->sk_dst_pending_confirm = 0;
 		}
 	}
 }
@@ -1771,6 +1775,7 @@ static inline void dst_negative_advice(struct sock *sk)
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
+	sk->sk_dst_pending_confirm = 0;
 	/*
 	 * This can be called while sk is owned by the caller only,
 	 * with no state that can be checked in a rcu_dereference_check() cond
@@ -1786,6 +1791,7 @@ static inline void dst_negative_advice(struct sock *sk)
 	struct dst_entry *old_dst;
 
 	sk_tx_queue_clear(sk);
+	sk->sk_dst_pending_confirm = 0;
 	old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
 	dst_release(old_dst);
 }
@@ -1806,6 +1812,12 @@ static inline void dst_negative_advice(struct sock *sk)
 
 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+static inline void sk_dst_confirm(struct sock *sk)
+{
+	if (!sk->sk_dst_pending_confirm)
+		sk->sk_dst_pending_confirm = 1;
+}
+
 bool sk_mc_loop(struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
diff --git a/net/core/sock.c b/net/core/sock.c
index 9fa46b9..8af5296 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 
 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
+		sk->sk_dst_pending_confirm = 0;
 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 		dst_release(dst);
 		return NULL;
@@ -1522,6 +1523,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 				af_family_clock_key_strings[newsk->sk_family]);
 
 		newsk->sk_dst_cache	= NULL;
+		newsk->sk_dst_pending_confirm = 0;
 		newsk->sk_wmem_queued	= 0;
 		newsk->sk_forward_alloc = 0;
 		atomic_set(&newsk->sk_drops, 0);
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFC net-next 6/7] net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP
From: Julian Anastasov @ 2016-12-18 20:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, YueHaibing
In-Reply-To: <1482094596-26046-1-git-send-email-ja@ssi.bg>

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.

The datagram protocols can use MSG_CONFIRM to confirm the
neighbour. When used with MSG_PROBE we do not reach the
code where neighbour is confirmed, so we have to do the
same slow lookup by using the dst_confirm_neigh() helper.
When MSG_PROBE is not used, ip_append_data/ip6_append_data
will set the skb flag dst_pending_confirm.

Reported-by: YueHaibing <yuehaibing@huawei.com>
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 net/ipv4/ip_output.c  |  6 ++++++
 net/ipv4/ping.c       |  3 ++-
 net/ipv4/raw.c        |  6 +++++-
 net/ipv4/udp.c        |  3 ++-
 net/ipv6/ip6_output.c |  6 ++++++
 net/ipv6/raw.c        |  6 +++++-
 net/ipv6/route.c      | 27 ++++++++++++++-------------
 net/ipv6/udp.c        |  3 ++-
 net/l2tp/l2tp_ip6.c   |  3 ++-
 9 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index fbe63cc..3afab90 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -889,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk,
 
 		skb->csum = 0;
 
+		if (flags & MSG_CONFIRM)
+			skb->dst_pending_confirm = 1;
+
 		__skb_queue_tail(queue, skb);
 	} else if (skb_is_gso(skb)) {
 		goto append;
@@ -1089,6 +1092,9 @@ static int __ip_append_data(struct sock *sk,
 			exthdrlen = 0;
 			csummode = CHECKSUM_NONE;
 
+			if ((flags & MSG_CONFIRM) && !skb_prev)
+				skb->dst_pending_confirm = 1;
+
 			/*
 			 * Put the packet on the pending queue.
 			 */
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 86cca61..f3f6f50 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -848,7 +848,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	return err;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2300fae..47c5451 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
 
+	if (flags & MSG_CONFIRM)
+		skb->dst_pending_confirm = 1;
+
 	skb->transport_header = skb->network_header;
 	err = -EFAULT;
 	if (memcpy_from_msg(iph, msg, length))
@@ -666,7 +669,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	return len;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9ca279b..22a1eb1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1113,7 +1113,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	return err;
 
 do_confirm:
-	dst_confirm(&rt->dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(&rt->dst, &fl4->daddr);
 	if (!(msg->msg_flags&MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 285aa9f..efe8b3f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1145,6 +1145,9 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		skb->protocol = htons(ETH_P_IPV6);
 		skb->csum = 0;
 
+		if (flags & MSG_CONFIRM)
+			skb->dst_pending_confirm = 1;
+
 		__skb_queue_tail(queue, skb);
 	} else if (skb_is_gso(skb)) {
 		goto append;
@@ -1517,6 +1520,9 @@ static int __ip6_append_data(struct sock *sk,
 			exthdrlen = 0;
 			dst_exthdrlen = 0;
 
+			if ((flags & MSG_CONFIRM) && !skb_prev)
+				skb->dst_pending_confirm = 1;
+
 			/*
 			 * Put the packet on the pending queue
 			 */
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 291ebc2..0597548 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -650,6 +650,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 
 	skb->ip_summed = CHECKSUM_NONE;
 
+	if (flags & MSG_CONFIRM)
+		skb->dst_pending_confirm = 1;
+
 	skb->transport_header = skb->network_header;
 	err = memcpy_from_msg(iph, msg, length);
 	if (err)
@@ -930,7 +933,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	txopt_put(opt_to_free);
 	return err < 0 ? err : len;
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4056d0e..c8e3730 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1375,6 +1375,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 				 const struct ipv6hdr *iph, u32 mtu)
 {
+	const struct in6_addr *daddr, *saddr;
 	struct rt6_info *rt6 = (struct rt6_info *)dst;
 
 	if (rt6->rt6i_flags & RTF_LOCAL)
@@ -1383,26 +1384,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	if (dst_metric_locked(dst, RTAX_MTU))
 		return;
 
-	dst_confirm(dst);
+	if (iph) {
+		daddr = &iph->daddr;
+		saddr = &iph->saddr;
+	} else if (sk) {
+		daddr = &sk->sk_v6_daddr;
+		saddr = &inet6_sk(sk)->saddr;
+	} else {
+		daddr = NULL;
+		saddr = NULL;
+	}
+	dst_confirm_neigh(dst, daddr);
 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
 	if (mtu >= dst_mtu(dst))
 		return;
 
 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
 		rt6_do_update_pmtu(rt6, mtu);
-	} else {
-		const struct in6_addr *daddr, *saddr;
+	} else if (daddr) {
 		struct rt6_info *nrt6;
 
-		if (iph) {
-			daddr = &iph->daddr;
-			saddr = &iph->saddr;
-		} else if (sk) {
-			daddr = &sk->sk_v6_daddr;
-			saddr = &inet6_sk(sk)->saddr;
-		} else {
-			return;
-		}
 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
 		if (nrt6) {
 			rt6_do_update_pmtu(nrt6, mtu);
@@ -2274,7 +2275,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 * Look, redirects are sent only in response to data packets,
 	 * so that this nexthop apparently is reachable. --ANK
 	 */
-	dst_confirm(&rt->dst);
+	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
 
 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
 	if (!neigh)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 649efc2..9956b7e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1295,7 +1295,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	return err;
 
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags&MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index f092ac4..060dbae 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -658,7 +658,8 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	return err < 0 ? err : len;
 
 do_confirm:
-	dst_confirm(dst);
+	if (msg->msg_flags & MSG_PROBE)
+		dst_confirm_neigh(dst, &fl6.daddr);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFC net-next 0/7] net: dst_confirm replacement
From: Julian Anastasov @ 2016-12-18 20:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, YueHaibing

	This patchset addresses the problem of neighbour
confirmation where received replies from one nexthop
can cause confirmation of different nexthop when using
the same dst. Thanks to YueHaibing <yuehaibing@huawei.com>
for tracking the dst->pending_confirm problem.

	Sockets can obtain cached output route. Such
routes can be to known nexthop (rt_gateway=IP) or to be
used simultaneously for different nexthop IPs by different
subnet prefixes (nh->nh_scope = RT_SCOPE_HOST, rt_gateway=0).

	At first look, there are more problems:

- dst_confirm() sets flag on dst and not on dst->path,
as result, indication is lost when XFRM is used

- DNAT can change the nexthop, so the really used nexthop is
not confirmed

	So, the following solution is to avoid using
dst->pending_confirm.

	The current dst_confirm() usage is as follows:

Protocols confirming dst on received packets:
- TCP (1 dst per socket)
- SCTP (1 dst per transport)
- CXGB*

Protocols supporting sendmsg with MSG_CONFIRM [ | MSG_PROBE ] to
confirm neighbour:
- UDP IPv4/IPv6
- ICMPv4 PING
- RAW IPv4/IPv6
- L2TP/IPv6

MSG_CONFIRM for other purposes (fix not needed):
- CAN

Sending without locking the socket:
- UDP (when no cork)
- RAW (when hdrincl=1)

Redirects from old to new GW:
- rt6_do_redirect


	The patchset includes the following changes:

1. sock: add sk_dst_pending_confirm flag

- used only by TCP with patch 4 to remember the received
indication in sk->sk_dst_pending_confirm

2. net: add dst_pending_confirm flag to skbuff

- skb->dst_pending_confirm will be used by all protocols
in following patches

3. sctp: add dst_pending_confirm flag

- SCTP uses per-transport dsts and can not use
sk->sk_dst_pending_confirm like TCP

4. tcp: replace dst_confirm with sk_dst_confirm

5. net: add confirm_neigh method to dst_ops

- IPv4 and IPv6 provision for slow neigh lookups for MSG_PROBE users.
I decided to use neigh lookup only for this case because on
MSG_PROBE the skb may pass MTU checks but it does not reach
the neigh confirmation code. This patch will be used from patch 6.

- xfrm_confirm_neigh: support is incomplete here, only routes with
known nexthops (gateway) are supported because the tunnel address
is slow to obtain. Or there is solution to this problem?

6. net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP

- dst_confirm conversion for UDP, RAW, ICMP and L2TP/IPv6

- these protocols use MSG_CONFIRM propagated by ip*_append_data
to skb->dst_pending_confirm. sk->sk_dst_pending_confirm is not
used because some sending paths do not lock the socket. For
MSG_PROBE we use the slow lookup (dst_confirm_neigh).

- there are also 2 cases that need the slow lookup:
__ip6_rt_update_pmtu and rt6_do_redirect. I hope
&ipv6_hdr(skb)->saddr is the correct nexthop address to use here.

7. net: pending_confirm is not used anymore

- I failed to understand the CXGB* code, I see dst_confirm()
calls but I'm not sure dst_neigh_output() was called. For now
I just removed the dst->pending_confirm flag and left all
dst_confirm() calls there. Any better idea?

- Now may be old function neigh_output() should be restored
instead of dst_neigh_output?


Julian Anastasov (7):
  sock: add sk_dst_pending_confirm flag
  net: add dst_pending_confirm flag to skbuff
  sctp: add dst_pending_confirm flag
  tcp: replace dst_confirm with sk_dst_confirm
  net: add confirm_neigh method to dst_ops
  net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP
  net: pending_confirm is not used anymore

 drivers/net/vrf.c          |  5 ++++-
 include/linux/skbuff.h     |  4 +++-
 include/net/arp.h          | 16 ++++++++++++++++
 include/net/dst.h          | 21 +++++++++------------
 include/net/dst_ops.h      |  2 ++
 include/net/ndisc.h        | 17 +++++++++++++++++
 include/net/sctp/sctp.h    |  6 ++----
 include/net/sctp/structs.h |  4 ++++
 include/net/sock.h         | 28 +++++++++++++++++++++++++++-
 net/core/dst.c             |  1 -
 net/core/sock.c            |  2 ++
 net/ipv4/ip_output.c       | 11 ++++++++++-
 net/ipv4/ping.c            |  3 ++-
 net/ipv4/raw.c             |  6 +++++-
 net/ipv4/route.c           | 19 +++++++++++++++++++
 net/ipv4/tcp_input.c       | 12 +++---------
 net/ipv4/tcp_metrics.c     |  7 ++-----
 net/ipv4/tcp_output.c      |  2 ++
 net/ipv4/udp.c             |  3 ++-
 net/ipv6/ip6_output.c      |  7 +++++++
 net/ipv6/raw.c             |  6 +++++-
 net/ipv6/route.c           | 43 ++++++++++++++++++++++++++++++-------------
 net/ipv6/udp.c             |  3 ++-
 net/l2tp/l2tp_ip6.c        |  3 ++-
 net/sctp/associola.c       |  3 +--
 net/sctp/output.c          | 10 +++++++++-
 net/sctp/outqueue.c        |  2 +-
 net/sctp/sm_make_chunk.c   |  6 ++----
 net/sctp/sm_sideeffect.c   |  2 +-
 net/sctp/socket.c          |  4 ++--
 net/sctp/transport.c       | 18 +++++++++++++++++-
 net/xfrm/xfrm_policy.c     | 16 ++++++++++++++++
 32 files changed, 226 insertions(+), 66 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH RFC net-next 3/7] sctp: add dst_pending_confirm flag
From: Julian Anastasov @ 2016-12-18 20:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, YueHaibing
In-Reply-To: <1482094596-26046-1-git-send-email-ja@ssi.bg>

Add new transport flag to allow sockets to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
The flag is propagated from transport to every packet.
It is reset when cached dst is reset.

Reported-by: YueHaibing <yuehaibing@huawei.com>
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 include/net/sctp/sctp.h    |  6 ++----
 include/net/sctp/structs.h |  4 ++++
 net/sctp/associola.c       |  3 +--
 net/sctp/output.c          | 10 +++++++++-
 net/sctp/outqueue.c        |  2 +-
 net/sctp/sm_make_chunk.c   |  6 ++----
 net/sctp/sm_sideeffect.c   |  2 +-
 net/sctp/socket.c          |  4 ++--
 net/sctp/transport.c       | 18 +++++++++++++++++-
 9 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index f0dcaeb..85d9083 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -586,10 +586,8 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr)
  */
 static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t)
 {
-	if (t->dst && !dst_check(t->dst, t->dst_cookie)) {
-		dst_release(t->dst);
-		t->dst = NULL;
-	}
+	if (t->dst && !dst_check(t->dst, t->dst_cookie))
+		sctp_transport_dst_release(t);
 
 	return t->dst;
 }
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 92daabd..e842e84 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -838,6 +838,8 @@ struct sctp_transport {
 
 	__u32 burst_limited;	/* Holds old cwnd when max.burst is applied */
 
+	__u32 dst_pending_confirm;	/* need to confirm neighbour */
+
 	/* Destination */
 	struct dst_entry *dst;
 	/* Source address. */
@@ -980,6 +982,8 @@ void sctp_transport_route(struct sctp_transport *, union sctp_addr *,
 void sctp_transport_reset(struct sctp_transport *);
 void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
+void sctp_transport_dst_release(struct sctp_transport *t);
+void sctp_transport_dst_confirm(struct sctp_transport *t);
 
 
 /* This is the structure we use to queue packets as they come into
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 68428e1..7bd26e0 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -820,8 +820,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		if (transport->state != SCTP_UNCONFIRMED)
 			transport->state = SCTP_INACTIVE;
 		else {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 			ulp_notify = false;
 		}
 
diff --git a/net/sctp/output.c b/net/sctp/output.c
index f5320a8..4684a00 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -550,6 +550,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 	struct sctp_association *asoc = tp->asoc;
 	struct sctp_chunk *chunk, *tmp;
 	int pkt_count, gso = 0;
+	int confirm;
 	struct dst_entry *dst;
 	struct sk_buff *head;
 	struct sctphdr *sh;
@@ -628,7 +629,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
 			asoc->peer.last_sent_to = tp;
 	}
 	head->ignore_df = packet->ipfragok;
-	tp->af_specific->sctp_xmit(head, tp);
+	confirm = tp->dst_pending_confirm;
+	if (confirm)
+		head->dst_pending_confirm = 1;
+	/* neighbour should be confirmed on successful transmission or
+	 * positive error
+	 */
+	if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm)
+		tp->dst_pending_confirm = 0;
 
 out:
 	list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index e540826..8234799 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1641,7 +1641,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 
 		if (forward_progress) {
 			if (transport->dst)
-				dst_confirm(transport->dst);
+				sctp_transport_dst_confirm(transport);
 		}
 	}
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9e9690b..6fb15bf 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3317,8 +3317,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 		}
 		break;
 	case SCTP_PARAM_DEL_IP:
@@ -3332,8 +3331,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc,
 		local_bh_enable();
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 				transports) {
-			dst_release(transport->dst);
-			transport->dst = NULL;
+			sctp_transport_dst_release(transport);
 		}
 		break;
 	default:
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index c345bf1..9255b29 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -723,7 +723,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 	 * forward progress.
 	 */
 	if (t->dst)
-		dst_confirm(t->dst);
+		sctp_transport_dst_confirm(t);
 
 	/* The receiver of the HEARTBEAT ACK should also perform an
 	 * RTT measurement for that destination transport address
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 318c678..9ee676a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -588,7 +588,7 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 			list_for_each_entry(trans,
 			    &asoc->peer.transport_addr_list, transports) {
 				/* Clear the source and route cache */
-				dst_release(trans->dst);
+				sctp_transport_dst_release(trans);
 				trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
 				    2*asoc->pathmtu, 4380));
 				trans->ssthresh = asoc->peer.i.a_rwnd;
@@ -839,7 +839,7 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
 		 */
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 					transports) {
-			dst_release(transport->dst);
+			sctp_transport_dst_release(transport);
 			sctp_transport_route(transport, NULL,
 					     sctp_sk(asoc->base.sk));
 		}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index ce54dce..db66514 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -227,7 +227,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 {
 	/* If we don't have a fresh route, look one up */
 	if (!transport->dst || transport->dst->obsolete) {
-		dst_release(transport->dst);
+		sctp_transport_dst_release(transport);
 		transport->af_specific->get_dst(transport, &transport->saddr,
 						&transport->fl, sk);
 	}
@@ -659,3 +659,19 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t)
 			sctp_transport_hold(t);
 	}
 }
+
+/* Drop dst */
+void sctp_transport_dst_release(struct sctp_transport *t)
+{
+	dst_release(t->dst);
+	t->dst = NULL;
+	t->dst_pending_confirm = 0;
+}
+
+/* Schedule neighbour confirm */
+void sctp_transport_dst_confirm(struct sctp_transport *t)
+{
+	if (!t->dst_pending_confirm)
+		t->dst_pending_confirm = 1;
+}
+
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFC net-next 4/7] tcp: replace dst_confirm with sk_dst_confirm
From: Julian Anastasov @ 2016-12-18 20:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, YueHaibing
In-Reply-To: <1482094596-26046-1-git-send-email-ja@ssi.bg>

When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.
Use the new sk_dst_confirm() helper to propagate the
indication from received packets to sock_confirm_neigh().

Reported-by: YueHaibing <yuehaibing@huawei.com>
Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.")
Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 net/ipv4/tcp_input.c   | 12 +++---------
 net/ipv4/tcp_metrics.c |  7 ++-----
 net/ipv4/tcp_output.c  |  2 ++
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6c79075..005c428 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3698,11 +3698,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
-	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-		struct dst_entry *dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
-	}
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
+		sk_dst_confirm(sk);
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
@@ -6022,7 +6019,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		break;
 
 	case TCP_FIN_WAIT1: {
-		struct dst_entry *dst;
 		int tmo;
 
 		/* If we enter the TCP_FIN_WAIT1 state and we are a
@@ -6049,9 +6045,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tcp_set_state(sk, TCP_FIN_WAIT2);
 		sk->sk_shutdown |= SEND_SHUTDOWN;
 
-		dst = __sk_dst_get(sk);
-		if (dst)
-			dst_confirm(dst);
+		sk_dst_confirm(sk);
 
 		if (!sock_flag(sk, SOCK_DEAD)) {
 			/* Wake up lingering close() */
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index d46f4d5..1b335d6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk)
 	u32 val;
 	int m;
 
+	sk_dst_confirm(sk);
 	if (sysctl_tcp_nometrics_save || !dst)
 		return;
 
-	if (dst->flags & DST_HOST)
-		dst_confirm(dst);
-
 	rcu_read_lock();
 	if (icsk->icsk_backoff || !tp->srtt_us) {
 		/* This session failed to estimate rtt. Why?
@@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk)
 	struct tcp_metrics_block *tm;
 	u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
+	sk_dst_confirm(sk);
 	if (!dst)
 		goto reset;
 
-	dst_confirm(dst);
-
 	rcu_read_lock();
 	tm = tcp_get_metrics(sk, dst, true);
 	if (!tm) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b45101f..dc01f35 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -974,6 +974,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
+	skb->dst_pending_confirm = sk->sk_dst_pending_confirm;
+
 	/* Build TCP header and checksum it. */
 	th = (struct tcphdr *)skb->data;
 	th->source		= inet->inet_sport;
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFC net-next 2/7] net: add dst_pending_confirm flag to skbuff
From: Julian Anastasov @ 2016-12-18 20:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, YueHaibing
In-Reply-To: <1482094596-26046-1-git-send-email-ja@ssi.bg>

Add new skbuff flag to allow protocols to confirm neighbour.
When same struct dst_entry can be used for many different
neighbours we can not use it for pending confirmations.

Add sock_confirm_neigh() helper to confirm the neighbour and
use it for IPv4, IPv6 and VRF before dst_neigh_output.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 drivers/net/vrf.c      |  5 ++++-
 include/linux/skbuff.h |  4 +++-
 include/net/sock.h     | 14 ++++++++++++++
 net/ipv4/ip_output.c   |  5 ++++-
 net/ipv6/ip6_output.c  |  1 +
 5 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 7532646..b118d2b 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -377,6 +377,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk,
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
@@ -573,8 +574,10 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
 	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
-	if (!IS_ERR(neigh))
+	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
+	}
 
 	rcu_read_unlock_bh();
 err:
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ac7fa34..94d7c36 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -610,6 +610,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@wifi_acked_valid: wifi_acked was set
  *	@wifi_acked: whether frame was acked on wifi or not
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
+ *	@dst_pending_confirm: need to confirm neighbour
   *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
  *	@mark: Generic packet mark
@@ -740,6 +741,7 @@ struct sk_buff {
 	__u8			csum_level:2;
 	__u8			csum_bad:1;
 
+	__u8			dst_pending_confirm:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
@@ -749,7 +751,7 @@ struct sk_buff {
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 #endif
-	/* 2, 4 or 5 bit hole */
+	/* 1, 3 or 4 bit hole */
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/include/net/sock.h b/include/net/sock.h
index e83bb01..bd63d4d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1818,6 +1818,20 @@ static inline void sk_dst_confirm(struct sock *sk)
 		sk->sk_dst_pending_confirm = 1;
 }
 
+static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
+{
+	if (unlikely(skb->dst_pending_confirm)) {
+		struct sock *sk = skb->sk;
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+		if (sk && sk->sk_dst_pending_confirm)
+			sk->sk_dst_pending_confirm = 0;
+	}
+}
+
 bool sk_mc_loop(struct sock *sk);
 
 static inline bool sk_can_gso(const struct sock *sk)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6c9615c..fbe63cc 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
 	if (!IS_ERR(neigh)) {
-		int res = dst_neigh_output(dst, neigh, skb);
+		int res;
+
+		sock_confirm_neigh(skb, neigh);
+		res = dst_neigh_output(dst, neigh, skb);
 
 		rcu_read_unlock_bh();
 		return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 70d0de40..285aa9f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -119,6 +119,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 	if (!IS_ERR(neigh)) {
+		sock_confirm_neigh(skb, neigh);
 		ret = dst_neigh_output(dst, neigh, skb);
 		rcu_read_unlock_bh();
 		return ret;
-- 
1.9.3

^ permalink raw reply related

* [PATCH RFC net-next 5/7] net: add confirm_neigh method to dst_ops
From: Julian Anastasov @ 2016-12-18 20:56 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, YueHaibing
In-Reply-To: <1482094596-26046-1-git-send-email-ja@ssi.bg>

Add confirm_neigh method to dst_ops and use it from IPv4 and IPv6
to lookup and confirm the neighbour. Its usage via the new helper
dst_confirm_neigh() should be restricted to MSG_PROBE users for
performance reasons.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
 include/net/arp.h      | 16 ++++++++++++++++
 include/net/dst.h      |  7 +++++++
 include/net/dst_ops.h  |  2 ++
 include/net/ndisc.h    | 17 +++++++++++++++++
 net/ipv4/route.c       | 19 +++++++++++++++++++
 net/ipv6/route.c       | 16 ++++++++++++++++
 net/xfrm/xfrm_policy.c | 16 ++++++++++++++++
 7 files changed, 93 insertions(+)

diff --git a/include/net/arp.h b/include/net/arp.h
index 5e0f891..65619a2 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -35,6 +35,22 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32
 	return n;
 }
 
+static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __ipv4_neigh_lookup_noref(dev, key);
+	if (n) {
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+	}
+	rcu_read_unlock_bh();
+}
+
 void arp_init(void);
 int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
 void arp_send(int type, int ptype, __be32 dest_ip,
diff --git a/include/net/dst.h b/include/net/dst.h
index 6835d22..3a3b34b 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -477,6 +477,13 @@ static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst
 	return IS_ERR(n) ? NULL : n;
 }
 
+static inline void dst_confirm_neigh(const struct dst_entry *dst,
+				     const void *daddr)
+{
+	if (dst->ops->confirm_neigh)
+		dst->ops->confirm_neigh(dst, daddr);
+}
+
 static inline void dst_link_failure(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index a0d443c..fc1bdb4 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -33,6 +33,8 @@ struct dst_ops {
 	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst,
 						struct sk_buff *skb,
 						const void *daddr);
+	void			(*confirm_neigh)(const struct dst_entry *dst,
+						const void *daddr);
 
 	struct kmem_cache	*kmem_cachep;
 
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index d562a2f..8a02146 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -391,6 +391,23 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons
 	return n;
 }
 
+static inline void __ipv6_confirm_neigh(struct net_device *dev,
+					const void *pkey)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __ipv6_neigh_lookup_noref(dev, pkey);
+	if (n) {
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+	}
+	rcu_read_unlock_bh();
+}
+
 int ndisc_init(void);
 int ndisc_late_init(void);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fa5c037..682ff24 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 					   struct sk_buff *skb,
 					   const void *daddr);
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
@@ -168,6 +169,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	.redirect =		ip_do_redirect,
 	.local_out =		__ip_local_out,
 	.neigh_lookup =		ipv4_neigh_lookup,
+	.confirm_neigh =	ipv4_confirm_neigh,
 };
 
 #define ECN_OR_COST(class)	TC_PRIO_##class
@@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&arp_tbl, pkey, dev);
 }
 
+static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	struct net_device *dev = dst->dev;
+	const __be32 *pkey = daddr;
+	const struct rtable *rt;
+
+	rt = (const struct rtable *)dst;
+	if (rt->rt_gateway)
+		pkey = (const __be32 *)&rt->rt_gateway;
+	else if (!daddr ||
+		 (rt->rt_flags &
+		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+		return;
+
+	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
+}
+
 #define IP_IDENTS_SZ 2048u
 
 static atomic_t *ip_idents __read_mostly;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 890acac..4056d0e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -217,6 +217,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
 	return neigh_create(&nd_tbl, daddr, dst->dev);
 }
 
+static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	struct net_device *dev = dst->dev;
+	struct rt6_info *rt = (struct rt6_info *)dst;
+
+	daddr = choose_neigh_daddr(rt, NULL, daddr);
+	if (!daddr)
+		return;
+	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
+		return;
+	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
+		return;
+	__ipv6_confirm_neigh(dev, daddr);
+}
+
 static struct dst_ops ip6_dst_ops_template = {
 	.family			=	AF_INET6,
 	.gc			=	ip6_dst_gc,
@@ -233,6 +248,7 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
 	.redirect		=	rt6_do_redirect,
 	.local_out		=	__ip6_local_out,
 	.neigh_lookup		=	ip6_neigh_lookup,
+	.confirm_neigh		=	ip6_confirm_neigh,
 };
 
 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 177e208..c010ee0 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2856,6 +2856,20 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
 	return dst->path->ops->neigh_lookup(dst, skb, daddr);
 }
 
+static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
+{
+	const struct dst_entry *path = dst->path;
+
+	if (path == dst) {
+		dst->ops->confirm_neigh(dst, daddr);
+	} else {
+		/* daddr can be from different family and we need the
+		 * tunnel address. How to get it?
+		 */
+		path->ops->confirm_neigh(path, NULL);
+	}
+}
+
 int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
 	int err = 0;
@@ -2882,6 +2896,8 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 			dst_ops->link_failure = xfrm_link_failure;
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
+		if (likely(!dst_ops->confirm_neigh))
+			dst_ops->confirm_neigh = xfrm_confirm_neigh;
 		if (likely(afinfo->garbage_collect == NULL))
 			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
 		rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo);
-- 
1.9.3

^ permalink raw reply related

* [PATCH] stmmac: fix memory barriers
From: Pavel Machek @ 2016-12-18 20:38 UTC (permalink / raw)
  To: LinoSanfilippo, peppe.cavallaro, alexandre.torgue, davem,
	linux-kernel, netdev, niklas.cassel, Joao.Pinto

[-- Attachment #1: Type: text/plain, Size: 3034 bytes --]


Fix up memory barriers in stmmac driver. They are meant to protect
against DMA engine, so smp_ variants are certainly wrong, and dma_
variants are preferable.
    
Signed-off-by: Pavel Machek <pavel@denx.de>

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index a340fc8..8816515 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -334,7 +334,7 @@ static void dwmac4_rd_prepare_tx_desc(struct dma_desc *p, int is_fs, int len,
 		 * descriptors for the same frame has to be set before, to
 		 * avoid race condition.
 		 */
-		wmb();
+		dma_wmb();
 
 	p->des3 = cpu_to_le32(tdes3);
 }
@@ -377,7 +377,7 @@ static void dwmac4_rd_prepare_tso_tx_desc(struct dma_desc *p, int is_fs,
 		 * descriptors for the same frame has to be set before, to
 		 * avoid race condition.
 		 */
-		wmb();
+		dma_wmb();
 
 	p->des3 = cpu_to_le32(tdes3);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index ce97e52..f0d8632 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -350,7 +350,7 @@ static void enh_desc_prepare_tx_desc(struct dma_desc *p, int is_fs, int len,
 		 * descriptors for the same frame has to be set before, to
 		 * avoid race condition.
 		 */
-		wmb();
+		dma_wmb();
 
 	p->des0 = cpu_to_le32(tdes0);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3e40578..bb40382 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2125,7 +2125,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * descriptor and then barrier is needed to make sure that
 	 * all is coherent before granting the DMA engine.
 	 */
-	smp_wmb();
+	dma_wmb();
 
 	if (netif_msg_pktdata(priv)) {
 		pr_info("%s: curr=%d dirty=%d f=%d, e=%d, f_p=%p, nfrags %d\n",
@@ -2338,7 +2338,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * descriptor and then barrier is needed to make sure that
 		 * all is coherent before granting the DMA engine.
 		 */
-		smp_wmb();
+		dma_wmb();
 	}
 
 	netdev_sent_queue(dev, skb->len);
@@ -2443,14 +2443,14 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv)
 			netif_dbg(priv, rx_status, priv->dev,
 				  "refill entry #%d\n", entry);
 		}
-		wmb();
+		dma_wmb();
 
 		if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00))
 			priv->hw->desc->init_rx_desc(p, priv->use_riwt, 0, 0);
 		else
 			priv->hw->desc->set_rx_owner(p);
 
-		wmb();
+		dma_wmb();
 
 		entry = STMMAC_GET_ENTRY(entry, DMA_RX_SIZE);
 	}

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply related

* Re: [PATCHv2 1/5] sh_eth: add generic wake-on-lan support via magic packet
From: Sergei Shtylyov @ 2016-12-18 20:26 UTC (permalink / raw)
  To: Niklas Söderlund, Simon Horman, netdev, linux-renesas-soc
  Cc: Geert Uytterhoeven
In-Reply-To: <20161212160931.6478-2-niklas.soderlund+renesas@ragnatech.se>

Hello.

On 12/12/2016 07:09 PM, Niklas Söderlund wrote:

> Add generic functionality to support Wake-on-Lan using MagicPacket which
> are supported by at least a few versions of sh_eth. Only add
> functionality for WoL, no specific sh_eth version are marked to support

    Versions.

> WoL yet.
>
> WoL is enabled in the suspend callback by setting MagicPacket detection
> and disabling all interrupts expect MagicPacket. In the resume path the
> driver needs to reset the hardware to rearm the WoL logic, this prevents
> the driver from simply restoring the registers and to take advantage of
> that sh_eth was not suspended to reduce resume time. To reset the
> hardware the driver close and reopens the device just like it would do

    Closes.

> in a normal suspend/resume scenario without WoL enabled, but it both
> close and open the device in the resume callback since the device needs

    Closes and opens.

> to be open for WoL to work.

> One quirk needed for WoL is that the module clock needs to be prevented
> from being switched off by Runtime PM. To keep the clock alive the

    I tried to find the code in question and failed, getting muddled in the 
RPM maze. Could you point at this code for my education? :-)

> suspend callback need to call clk_enable() directly to increase the

    My main concern is why we need to manipulate the clock directly --
can't you call RPM to achieve the same effect?

> usage count of the clock. Then when Runtime PM decreases the clock usage
> count it won't reach 0 and be switched off.

    You mean it does this even though we don't call pr_runtime_put_sync()
as done in sh_eth_close()?

> Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
[...]

MBR, Sergei

^ permalink raw reply

* Re: [PATCH 1/2] net: ethernet: sxgbe: remove private tx queue lock
From: Pavel Machek @ 2016-12-18 20:16 UTC (permalink / raw)
  To: Lino Sanfilippo
  Cc: Francois Romieu, bh74.an, ks.giri, vipul.pandya, peppe.cavallaro,
	alexandre.torgue, davem, linux-kernel, netdev
In-Reply-To: <a2cbcfcd-f377-565c-a21c-3daa3abce519@gmx.de>

[-- Attachment #1: Type: text/plain, Size: 1266 bytes --]

Hi!

> > For the same reason it's broken if it races with the transmit path: it
> > can release driver resources while the transmit path uses these.
> > 
> > Btw the points below may not matter/hurt much for a proof a concept
> > but they would need to be addressed as well:
> > 1) unchecked (and avoidable) extra error paths due to stmmac_release()
> > 2) racy cancel_work_sync. Low probability as it is, an irq + error could
> >    take place right after cancel_work_sync
> 
> It was indeed only meant as a proof of concept. Nevertheless the race is not 
> good, since one can run into it when faking the tx error for testings purposes.
> So below is a slightly improved version of the restart handling.
> Its not meant as a final version either. But maybe we can use it as a starting
> point.

Certainly works better than version we currently have in tree. I'm
running it in a loop, and it survived 10 minutes of testing so
far. (Previous version killed the hardware at first iteration.)

> Again the patch is only compile tested.

Tested-by: Pavel Machek <pavel@denx.de>

Thanks!
								Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: regression: ath_tx_edma_tasklet() Illegal idle entry in RCU read-side critical section
From: Paul E. McKenney @ 2016-12-18 20:14 UTC (permalink / raw)
  To: Valo, Kalle
  Cc: Tobias Klausmann, Gabriel C, lkml, ath9k-devel,
	linux-wireless@vger.kernel.org, ath9k-devel@lists.ath9k.org,
	netdev@vger.kernel.org, nbd@nbd.name
In-Reply-To: <87pokpc8ng.fsf@qca.qualcomm.com>

On Sun, Dec 18, 2016 at 07:57:42PM +0000, Valo, Kalle wrote:
> Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de> writes:
> 
> > A patch for this is already floating on the ML for a while now latest:
> > (ath9k: do not return early to fix rcu unlocking)
> 
> It's here:
> 
> https://patchwork.kernel.org/patch/9472709/

Feel free to add:

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

							Thanx, Paul

> > Hopefully Kalle will include it in one of his upcoming pull requests.
> 
> Yes, I'll try to get it to 4.10-rc2.
> 
> -- 
> Kalle Valo

^ permalink raw reply

* Re: wl1251 & mac address & calibration data
From: Arend Van Spriel @ 2016-12-18 20:08 UTC (permalink / raw)
  To: Pali Rohár
  Cc: Daniel Wagner, Luis R. Rodriguez, Tom Gundersen, Johannes Berg,
	Ming Lei, Mimi Zohar, Bjorn Andersson, Rafał Miłecki,
	Kalle Valo, Sebastian Reichel, Pavel Machek, Michal Kazior,
	Ivaylo Dimitrov, Aaro Koskinen, Tony Lindgren, linux-wireless,
	Network Development, linux-kernel@vger.kernel.org
In-Reply-To: <201612181309.01298@pali>

On 18-12-2016 13:09, Pali Rohár wrote:
> On Sunday 18 December 2016 12:54:00 Arend Van Spriel wrote:
>> On 18-12-2016 12:04, Pali Rohár wrote:
>>> On Sunday 18 December 2016 11:49:53 Arend Van Spriel wrote:
>>>> On 16-12-2016 11:40, Pali Rohár wrote:
>>>>> On Friday 16 December 2016 08:25:44 Daniel Wagner wrote:
>>>>>> On 12/16/2016 03:03 AM, Luis R. Rodriguez wrote:
>>>>>>> For the new API a solution for "fallback mechanisms" should be
>>>>>>> clean though and I am looking to stay as far as possible from
>>>>>>> the existing mess. A solution to help both the old API and new
>>>>>>> API is possible for the "fallback mechanism" though -- but for
>>>>>>> that I can only refer you at this point to some of Daniel
>>>>>>> Wagner and Tom Gunderson's firmwared deamon prospect. It
>>>>>>> should help pave the way for a clean solution and help address
>>>>>>> other stupid issues.
>>>>>>
>>>>>> The firmwared project is hosted here
>>>>>>
>>>>>> https://github.com/teg/firmwared
>>>>>>
>>>>>> As Luis pointed out, firmwared relies on
>>>>>> FW_LOADER_USER_HELPER_FALLBACK, which is not enabled by default.
>>>>>
>>>>> I know. But it does not mean that I cannot enable this option at
>>>>> kernel compile time.
>>>>>
>>>>> Bigger problem is that currently request_firmware() first try to
>>>>> load firmware directly from VFS and after that (if fails)
>>>>> fallback to user helper.
>>>>>
>>>>> So I would need to extend kernel firmware code with new function
>>>>> (or flag) to not use VFS and try only user mode helper.
>>>>
>>>> Why do you need the user-mode helper anyway. This is all static
>>>> data, right?
>>>
>>> Those are static data, but device specific!
>>
>> So what?
>>
>>>> So why not cook up a firmware file in user-space once and put
>>>> it in /lib/firmware for the driver to request directly.
>>>
>>> 1. Violates FHS
>>
>> How?
>>
>>> 2. Does not work for readonly /, readonly /lib, readonly
>>> /lib/firmware
>>
>> Que?
>>
>>> 3. Backup & restore of rootfs between same devices does not work
>>> (as rootfs now contains device specific data).
>>
>> True.
>>
>>> 4. Sharing one rootfs (either via nfs or other technology) does not
>>> work for more devices (even in state when rootfs is used only by
>>> one device at one time).
>>
>> Indeed.
>>
>>> And it is common that N900 developers have rootfs in laptop and via
>>> usb (cdc_ether) exports it over nfs to N900 device and boot
>>> system. It basically break booting from one nfs-exported rootfs,
>>> as that export become model specific...
>>
>> These are all you choices and more a logistic issue. If your take is
>> that udev is the way to solve those, fine by me.
>>
>>>> Seems a bit
>>>> overkill to have a {e,}udev or whatever daemon running if the
>>>> result is always the same. Just my 2 cents.
>>>
>>> No it is not. It will break couple of other things in Linux and
>>> device
>>
>> Now I am curious. What "couple of other things" will be broken.
>>
>>> and model specific calibration data should not be in /lib/firmware!
>>> That directory is used for firmware files, not calibration.
>>
>> What is "firmware"? Really. These are binary blobs required to make
>> the device work. And guess what, your device needs calibration data.
>> Why make the distinction.
>>
>> Regards,
>> Arend
> 
> File wl1251-nvs.bin is provided by linux-firmware package and contains 
> default data which should be overriden by model specific calibrated 
> data.

Ah. Someone thought it was a good idea to provide the "one ring to rule
them all". Nice.

> But overwriting that one file is not possible as it next update of 
> linux-firmware package will overwrite it back. It break any normal usage 
> of package management.
> 
> Also it is ridiculously broken by design if some "boot" files needs to 
> be overwritten to initialize hardware properly. To not break booting you 
> need to overwrite that file before first boot. But without booting 
> device you cannot read calibration data. So some hack with autoreboot 
> after boot is needed. And how to detect that we have real overwritten 
> calibration data and not default one from linux-firmware? Any heuristic 
> or checks will be broken here. And no, nothing like you need to reboot 
> your device now (and similar concept) from windows world is not 
> accepted.

Well. After reading and creating calibration data you could just rebind
the driver to the device to have it probed again. But yeah, the default
one from linux-firmware should never have been there in the first place.

> "firmware" is one for chip. Any N900 device with wl1251 chip needs 
> exactly same firmware "wl1251-fw.bin". But every N900 needs different 
> calibration data which is not firmware.

Ok. This is exactly why Luis is giving the new API different name just
calling it "data".

Regards,
Arend

^ permalink raw reply

* Re: regression: ath_tx_edma_tasklet() Illegal idle entry in RCU read-side critical section
From: Tobias Klausmann @ 2016-12-18 20:04 UTC (permalink / raw)
  To: Valo, Kalle
  Cc: paulmck@linux.vnet.ibm.com, Gabriel C, lkml, ath9k-devel,
	linux-wireless@vger.kernel.org, ath9k-devel@lists.ath9k.org,
	netdev@vger.kernel.org, nbd@nbd.name
In-Reply-To: <87pokpc8ng.fsf@qca.qualcomm.com>


On 18.12.2016 20:57, Valo, Kalle wrote:
> Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de> writes:
>
>> A patch for this is already floating on the ML for a while now latest:
>> (ath9k: do not return early to fix rcu unlocking)
> It's here:
>
> https://patchwork.kernel.org/patch/9472709/

Good to know!
>
>> Hopefully Kalle will include it in one of his upcoming pull requests.
> Yes, I'll try to get it to 4.10-rc2.

Thanks for the update!

^ permalink raw reply

* Re: regression: ath_tx_edma_tasklet() Illegal idle entry in RCU read-side critical section
From: Valo, Kalle @ 2016-12-18 19:57 UTC (permalink / raw)
  To: Tobias Klausmann
  Cc: Gabriel C, netdev@vger.kernel.org, linux-wireless@vger.kernel.org,
	ath9k-devel, lkml, ath9k-devel@lists.ath9k.org,
	paulmck@linux.vnet.ibm.com, nbd@nbd.name
In-Reply-To: <58b67d5b-0275-f80f-479f-78cf748b4319@mni.thm.de>

Tobias Klausmann <tobias.johannes.klausmann@mni.thm.de> writes:

> A patch for this is already floating on the ML for a while now latest:
> (ath9k: do not return early to fix rcu unlocking)

It's here:

https://patchwork.kernel.org/patch/9472709/

> Hopefully Kalle will include it in one of his upcoming pull requests.

Yes, I'll try to get it to 4.10-rc2.

-- 
Kalle Valo

^ permalink raw reply

* [PATCH net v2] ipvlan: fix crash when master is set in loopback mode
From: Mahesh Bandewar @ 2016-12-18 19:00 UTC (permalink / raw)
  To: netdev, Eric Dumazet, David Miller; +Cc: Mahesh Bandewar

From: Mahesh Bandewar <maheshb@google.com>

In an IPvlan setup when master is set in loopback mode e.g.

  ethtool -K eth0 set loopback on

  where eth0 is master device for IPvlan setup.

The failure actually happens while processing mulitcast packets
but that's a result of unconditionally queueing packets without
ensuring ether-header is part of the linear part of skb.

This patch forces this check at the reception and drops packets
which fail this check before queuing them.

------------[ cut here ]------------
kernel BUG at include/linux/skbuff.h:1737!
Call Trace:
 [<ffffffff921fbbc2>] dev_forward_skb+0x92/0xd0
 [<ffffffffc031ac65>] ipvlan_process_multicast+0x395/0x4c0 [ipvlan]
 [<ffffffffc031a9a7>] ? ipvlan_process_multicast+0xd7/0x4c0 [ipvlan]
 [<ffffffff91cdfea7>] ? process_one_work+0x147/0x660
 [<ffffffff91cdff09>] process_one_work+0x1a9/0x660
 [<ffffffff91cdfea7>] ? process_one_work+0x147/0x660
 [<ffffffff91ce086d>] worker_thread+0x11d/0x360
 [<ffffffff91ce0750>] ? rescuer_thread+0x350/0x350
 [<ffffffff91ce960b>] kthread+0xdb/0xe0
 [<ffffffff91c05c70>] ? _raw_spin_unlock_irq+0x30/0x50
 [<ffffffff91ce9530>] ? flush_kthread_worker+0xc0/0xc0
 [<ffffffff92348b7a>] ret_from_fork+0x9a/0xd0
 [<ffffffff91ce9530>] ? flush_kthread_worker+0xc0/0xc0

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
v1->v2: commit log update

 drivers/net/ipvlan/ipvlan_core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b4e990743e1d..4294fc1f5564 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -660,6 +660,9 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
 	if (!port)
 		return RX_HANDLER_PASS;
 
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+		goto out;
+
 	switch (port->mode) {
 	case IPVLAN_MODE_L2:
 		return ipvlan_handle_mode_l2(pskb, port);
@@ -672,6 +675,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
 	/* Should not reach here */
 	WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
 			  port->mode);
+
+out:
 	kfree_skb(skb);
 	return RX_HANDLER_CONSUMED;
 }
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: [PATCH 1/2] net: ethernet: sxgbe: remove private tx queue lock
From: Pavel Machek @ 2016-12-18 18:30 UTC (permalink / raw)
  To: Lino Sanfilippo
  Cc: Francois Romieu, bh74.an, ks.giri, vipul.pandya, peppe.cavallaro,
	alexandre.torgue, davem, linux-kernel, netdev
In-Reply-To: <a2cbcfcd-f377-565c-a21c-3daa3abce519@gmx.de>

[-- Attachment #1: Type: text/plain, Size: 811 bytes --]

Hi!

> > - e1efa87241272104d6a12c8b9fcdc4f62634d447
> 
> Yep, a sync of the dma descriptors before the hardware gets ownership of the tx tail
> idx is missing in the stmmac, too.

I can reproduce failure with 4.4 fairly easily. I tried with dma_
variant of barriers, and it failed, too

[ 1018.410012] stmmac: early irq
[ 1023.939702] fpga_cmd_read:wait_event timed out!
[ 1033.128692] ------------[ cut here ]------------
[ 1033.133329] WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:303
dev_watchdog+0x264/0x284()
[ 1033.141748] NETDEV WATCHDOG: eth0 (socfpga-dwmac): transmit queue 0
timed out
[ 1033.148861] Modules linked in:

									Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: mlx4: Bug in XDP_TX + 16 rx-queues
From: Martin KaFai Lau @ 2016-12-18 18:14 UTC (permalink / raw)
  To: Tariq Toukan
  Cc: Saeed Mahameed, Tariq Toukan, netdev@vger.kernel.org,
	Alexei Starovoitov
In-Reply-To: <fd3b36fa-8365-3873-b3e5-fff47939c330@gmail.com>

On Sun, Dec 18, 2016 at 12:31:30PM +0200, Tariq Toukan wrote:
> Hi Martin,
>
>
> On 17/12/2016 12:18 PM, Martin KaFai Lau wrote:
> >Hi All,
> >
> >I have been debugging with XDP_TX and 16 rx-queues.
> >
> >1) When 16 rx-queues is used and an XDP prog is doing XDP_TX,
> >it seems that the packet cannot be XDP_TX out if the pkt
> >is received from some particular CPUs (/rx-queues).
> Does the rx_xdp_tx_full counter increase?
The rx_xdp_tx_full counter did not increase.  A capture of
ethtool -S eth0:

[root@kerneltest003.14.prn2 ~]# ethtool -S eth0 | egrep 'rx.*_xdp_tx.*:'
rx_xdp_tx: 1024
rx_xdp_tx_full: 0
rx0_xdp_tx: 64
rx0_xdp_tx_full: 0
rx1_xdp_tx: 64
rx1_xdp_tx_full: 0
rx2_xdp_tx: 64
rx2_xdp_tx_full: 0
rx3_xdp_tx: 64
rx3_xdp_tx_full: 0
rx4_xdp_tx: 64
rx4_xdp_tx_full: 0
rx5_xdp_tx: 64
rx5_xdp_tx_full: 0
rx6_xdp_tx: 64
rx6_xdp_tx_full: 0
rx7_xdp_tx: 64
rx7_xdp_tx_full: 0
rx8_xdp_tx: 64
rx8_xdp_tx_full: 0
rx9_xdp_tx: 63
rx9_xdp_tx_full: 0
rx10_xdp_tx: 65
rx10_xdp_tx_full: 0
rx11_xdp_tx: 64
rx11_xdp_tx_full: 0
rx12_xdp_tx: 64
rx12_xdp_tx_full: 0
rx13_xdp_tx: 64
rx13_xdp_tx_full: 0
rx14_xdp_tx: 64
rx14_xdp_tx_full: 0
rx15_xdp_tx: 64
rx15_xdp_tx_full: 0

> Does the problem repro if you turn off PFC?
>     ethtool -A <intf> rx off tx off
Turning pause off does not help.

> >
> >2) If 8 rx-queues is used, it does not have problem.
> >
> >3) The 16 rx-queues problem also went away after reverting these
> >two patches:
> >15fca2c8eb41 net/mlx4_en: Add ethtool statistics for XDP cases
> >67f8b1dcb9ee net/mlx4_en: Refactor the XDP forwarding rings scheme
> >
> >4) I can reproduce the problem by running samples/bof/xdp_ip_tunnel at
> >the receiver side.  The sender side sends out TCP packets with
> >source port ranging from 1 to 1024.  At the sender side also, do
> >a tcpdump to capture the ip-tunnel packet reflected by xdp_ip_tunnel.
> >With 8 rx-queues,  I can get all 1024 packets back.  With 16 rx-queues,
> >I can only get 512 packets back.  It is a 40 CPUs machine.
> >I also checked the rx*_xdp_tx counters (from ethtool -S eth0) to ensure
> >the xdp prog has XDP_TX-ed it out.
> So all packets were transmitted (according to rx*_xdp_tx), and only half the
> of them received on the other side?
Correct.  The XDP program 'samples/bpf/xdp_tx_iptunnel' received,
processed and sent out 1024 packets.  The rx*_xdp_tx also showed all of the
1024 packets.  However, only half of them reached to the other side (by
observing the tcpdump) when 16 rx-queues was used.

Thanks,
--Martin

^ permalink raw reply

* Re: [PATCH net] ipvlan: fix crash
From: Mahesh Bandewar (महेश बंडेवार) @ 2016-12-18 18:10 UTC (permalink / raw)
  To: David Miller; +Cc: mahesh, linux-netdev, Eric Dumazet
In-Reply-To: <20161217.235452.1604434909844387069.davem@davemloft.net>

On Sat, Dec 17, 2016 at 8:54 PM, David Miller <davem@davemloft.net> wrote:
> From: Mahesh Bandewar <mahesh@bandewar.net>
> Date: Sat, 17 Dec 2016 18:16:19 -0800
>
>> diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
>> index b4e990743e1d..4294fc1f5564 100644
>> --- a/drivers/net/ipvlan/ipvlan_core.c
>> +++ b/drivers/net/ipvlan/ipvlan_core.c
>> @@ -660,6 +660,9 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
>>       if (!port)
>>               return RX_HANDLER_PASS;
>>
>> +     if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
>> +             goto out;
>> +
>>       switch (port->mode) {
>
> ipvlan only allows non-loopback ethernet devices to register
> this RX handler.
>
> Such situations being tested here should therefore be completely
> impossible.
>
Yes, correct. This happens when the master device is set in loopback mode.

> Every such device must send the SKB through eth_type_trans(), which
> unconditionally accesses the ethernet header, therefore it must
> be pulled into the linear SKB area already, long before this RX
> handler is invoked.
>
> If this really can legitimately happen, you must explain how so.
>
OK, will update the commit log.

> Just showing the crash that later happens in some (completely
> unrelated BTW) ipvlan multicast workqueue handling function, is
> really an insufficient commit log message for a bug like this.

^ permalink raw reply

* [PATCH iproute2 1/1] tc: updated man page to reflect filter-id use in filter GET command.
From: Roman Mashak @ 2016-12-18 17:25 UTC (permalink / raw)
  To: stephen; +Cc: netdev, jhs, daniel, xiyou.wangcong, Roman Mashak

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 man/man8/tc.8 | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index 583b72f..f96911a 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -32,7 +32,8 @@ tc \- show / manipulate traffic control settings
 \fIDEV\fR
 .B [ parent
 \fIqdisc-id\fR
-.B | root ] protocol
+.B | root ] [ handle \fIfilter-id\fR ]
+.B protocol
 \fIprotocol\fR
 .B prio
 \fIpriority\fR filtertype
@@ -577,7 +578,8 @@ it is created.
 
 .TP
 get
-Displays a single filter given the interface, parent ID, priority, protocol and handle ID.
+Displays a single filter given the interface \fIDEV\fR, \fIqdisc-id\fR,
+\fIpriority\fR, \fIprotocol\fR and \fIfilter-id\fR.
 
 .TP
 show
-- 
1.9.1

^ permalink raw reply related

* [PATCH iproute2 1/1] tc: fixed man page fonts for keywords and variable values
From: Roman Mashak @ 2016-12-18 17:25 UTC (permalink / raw)
  To: stephen; +Cc: netdev, jhs, daniel, xiyou.wangcong, Roman Mashak

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 man/man8/tc.8 | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/man/man8/tc.8 b/man/man8/tc.8
index 8a47a2b..583b72f 100644
--- a/man/man8/tc.8
+++ b/man/man8/tc.8
@@ -5,58 +5,58 @@ tc \- show / manipulate traffic control settings
 .B tc
 .RI "[ " OPTIONS " ]"
 .B qdisc [ add | change | replace | link | delete ] dev
-DEV
+\fIDEV\fR
 .B
 [ parent
-qdisc-id
+\fIqdisc-id\fR
 .B | root ]
 .B [ handle
-qdisc-id ] qdisc
+\fIqdisc-id\fR ] qdisc
 [ qdisc specific parameters ]
 .P
 
 .B tc
 .RI "[ " OPTIONS " ]"
 .B class [ add | change | replace | delete ] dev
-DEV
+\fIDEV\fR
 .B parent
-qdisc-id
+\fIqdisc-id\fR
 .B [ classid
-class-id ] qdisc
+\fIclass-id\fR ] qdisc
 [ qdisc specific parameters ]
 .P
 
 .B tc
 .RI "[ " OPTIONS " ]"
 .B filter [ add | change | replace | delete | get ] dev
-DEV
+\fIDEV\fR
 .B [ parent
-qdisc-id
+\fIqdisc-id\fR
 .B | root ] protocol
-protocol
+\fIprotocol\fR
 .B prio
-priority filtertype
+\fIpriority\fR filtertype
 [ filtertype specific parameters ]
 .B flowid
-flow-id
+\fIflow-id\fR
 
 .B tc
 .RI "[ " OPTIONS " ]"
 .RI "[ " FORMAT " ]"
 .B qdisc show [ dev
-DEV
+\fIDEV\fR
 .B ]
 .P
 .B tc
 .RI "[ " OPTIONS " ]"
 .RI "[ " FORMAT " ]"
 .B class show dev
-DEV
+\fIDEV\fR
 .P
 .B tc
 .RI "[ " OPTIONS " ]"
 .B filter show dev
-DEV
+\fIDEV\fR
 
 .P
 .ti 8
@@ -294,14 +294,14 @@ In the absence of classful qdiscs, classless qdiscs can only be attached at
 the root of a device. Full syntax:
 .P
 .B tc qdisc add dev
-DEV
+\fIDEV\fR
 .B root
 QDISC QDISC-PARAMETERS
 
 To remove, issue
 .P
 .B tc qdisc del dev
-DEV
+\fIDEV\fR
 .B root
 
 The
@@ -386,7 +386,7 @@ Type of Service
 Some qdiscs have built in rules for classifying packets based on the TOS field.
 .TP
 skb->priority
-Userspace programs can encode a class-id in the 'skb->priority' field using
+Userspace programs can encode a \fIclass-id\fR in the 'skb->priority' field using
 the SO_PRIORITY option.
 .P
 Each node within the tree can have its own filters but higher level filters
@@ -554,7 +554,7 @@ must be passed, either by passing its ID or by attaching directly to the root of
 When creating a qdisc or a filter, it can be named with the
 .B handle
 parameter. A class is named with the
-.B classid
+.B \fBclassid\fR
 parameter.
 
 .TP
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH 1/2] net: ethernet: sxgbe: remove private tx queue lock
From: Pavel Machek @ 2016-12-18 17:23 UTC (permalink / raw)
  To: Lino Sanfilippo
  Cc: Francois Romieu, bh74.an, ks.giri, vipul.pandya, peppe.cavallaro,
	alexandre.torgue, davem, linux-kernel, netdev
In-Reply-To: <a2cbcfcd-f377-565c-a21c-3daa3abce519@gmx.de>

[-- Attachment #1: Type: text/plain, Size: 590 bytes --]

Hi!

> > - e1efa87241272104d6a12c8b9fcdc4f62634d447
> 
> Yep, a sync of the dma descriptors before the hardware gets ownership of the tx tail
> idx is missing in the stmmac, too. 

Thanks for the hint. Actually, the driver uses smp_wmb() which is
completely crazy, and probably misses rmb() in clean_tx, too. Anyway,
I did not notice there are dma_ variants, too... we clearly need them.

Thanks and best regards,
								Pavel
								
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: regression: ath_tx_edma_tasklet() Illegal idle entry in RCU read-side critical section
From: Tobias Klausmann @ 2016-12-18 16:17 UTC (permalink / raw)
  To: paulmck, Gabriel C
  Cc: lkml, ath9k-devel, linux-wireless, ath9k-devel, netdev, nbd,
	kvalo
In-Reply-To: <20161218155938.GP3924@linux.vnet.ibm.com>

Hi,

A patch for this is already floating on the ML for a while now latest: 
(ath9k: do not return early to fix rcu unlocking)

Hopefully Kalle will include it in one of his upcoming pull requests.

Greetings,

Tobias


On 18.12.2016 16:59, Paul E. McKenney wrote:
> On Sun, Dec 18, 2016 at 02:52:48PM +0100, Gabriel C wrote:
>> Hello,
>>
>> while testing kernel 4.9 I run into a weird issue with the ath9k driver.
>>
>> I can boot the box in console mode and it stay up sometime but is not usable.
> Looks to me like someone forgot an rcu_read_unlock() somewhere.  Given that
> the unmatched rcu_read_lock() appears in ath_tx_edma_tasklet(), perhaps
> that is also where the missing rcu_read_unlock() is.  And sure enough,
> in the middle of this function we have the following:
>
> 		fifo_list = &txq->txq_fifo[txq->txq_tailidx];
> 		if (list_empty(fifo_list)) {
> 			ath_txq_unlock(sc, txq);
> 			return;
> 		}
>
> This will of course return while still in an RCU read-side critical
> section.  The caller cannot tell the difference between a return here
> and falling off the end of the function, so this is likely the bug.
> Or one of the bugs, anyway.  Copying the author and committer for
> their thoughts.
>
> Please try the patch at the end of this email.
>
> 							Thanx, Paul
>
>> from dmesg :
>>
>> ===============================
>> [ INFO: suspicious RCU usage. ]
>> 4.9-fw1 #1 Tainted: G          I
>> -------------------------------
>> kernel/rcu/tree.c:705 Illegal idle entry in RCU read-side critical section.!
>>
>> other info that might help us debug this:
>>
>>
>> RCU used illegally from idle CPU!
>> rcu_scheduler_active = 1, debug_locks = 1
>> RCU used illegally from extended quiescent state!
>> 1 lock held by swapper/0/0:
>>   #0:  (rcu_read_lock){......}, at: [<ffffffffa0ee0240>] ath_tx_edma_tasklet+0x0/0x460 [ath9k]
>>
>> stack backtrace:
>> CPU: 0 PID: 0 Comm: swapper/0 Tainted: G          I     4.9-fw1 #1
>> Hardware name: FUJITSU                          PRIMERGY TX200 S5             /D2709, BIOS 6.00 Rev. 1.14.2709              02/04/2013
>>   ffff88043ee03f38 ffffffff812cf0f3 ffffffff81a11540 0000000000000001
>>   ffff88043ee03f68 ffffffff810b7865 ffffffff81a55d58 ffff88043efcedc0
>>   ffff88083cb1ca00 00000000000000d1 ffff88043ee03f88 ffffffff810dbfe8
>> Call Trace:
>>   <IRQ>
>>   [<ffffffff812cf0f3>] dump_stack+0x86/0xc3
>>   [<ffffffff810b7865>] lockdep_rcu_suspicious+0xc5/0x100
>>   [<ffffffff810dbfe8>] rcu_eqs_enter_common.constprop.62+0x128/0x130
>>   [<ffffffff810ddc78>] rcu_irq_exit+0x38/0x70
>>   [<ffffffff81067ec4>] irq_exit+0x74/0xd0
>>   [<ffffffff8101e561>] do_IRQ+0x71/0x130
>>   [<ffffffff8158700c>] common_interrupt+0x8c/0x8c
>>   <EOI>
>>   [<ffffffff81472836>] ? cpuidle_enter_state+0x156/0x220
>>   [<ffffffff81472922>] cpuidle_enter+0x12/0x20
>>   [<ffffffff810ad23e>] call_cpuidle+0x1e/0x40
>>   [<ffffffff810ad46d>] cpu_startup_entry+0x11d/0x210
>>   [<ffffffff8157892c>] rest_init+0x12c/0x140
>>   [<ffffffff81d02ec3>] start_kernel+0x40f/0x41c
>>   [<ffffffff81d02120>] ? early_idt_handler_array+0x120/0x120
>>   [<ffffffff81d02299>] x86_64_start_reservations+0x2a/0x2c
>>   [<ffffffff81d02386>] x86_64_start_kernel+0xeb/0xf8
> ------------------------------------------------------------------------
>
> commit 5a16fed76936184a7ac22e466cf39bd8bb5ee65e
> Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Date:   Sun Dec 18 07:49:00 2016 -0800
>
>      drivers/ath: Add missing rcu_read_unlock() to ath_tx_edma_tasklet()
>      
>      Commit d94a461d7a7d ("ath9k: use ieee80211_tx_status_noskb where possible")
>      added rcu_read_lock() and rcu_read_unlock() around the body of
>      ath_tx_edma_tasklet(), but failed to add the needed rcu_read_unlock()
>      before a "return" in the middle of this function.  This commit therefore
>      adds the missing rcu_read_unlock().
>      
>      Reported-by: Gabriel C <nix.or.die@gmail.com>
>      Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>      Cc: Felix Fietkau <nbd@nbd.name>
>      Cc: Kalle Valo <kvalo@qca.qualcomm.com>
>      Cc: QCA ath9k Development <ath9k-devel@qca.qualcomm.com>
>      Cc: <linux-wireless@vger.kernel.org?
>      Cc: <ath9k-devel@lists.ath9k.org>
>
> diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
> index 52bfbb988611..857d5ae09a1d 100644
> --- a/drivers/net/wireless/ath/ath9k/xmit.c
> +++ b/drivers/net/wireless/ath/ath9k/xmit.c
> @@ -2787,6 +2787,7 @@ void ath_tx_edma_tasklet(struct ath_softc *sc)
>   		fifo_list = &txq->txq_fifo[txq->txq_tailidx];
>   		if (list_empty(fifo_list)) {
>   			ath_txq_unlock(sc, txq);
> +			rcu_read_unlock();
>   			return;
>   		}
>   
>

^ permalink raw reply

* Re: [PATCH 1/2] net: ethernet: sxgbe: remove private tx queue lock
From: Lino Sanfilippo @ 2016-12-18 16:15 UTC (permalink / raw)
  To: Francois Romieu, Pavel Machek
  Cc: bh74.an, ks.giri, vipul.pandya, peppe.cavallaro, alexandre.torgue,
	davem, linux-kernel, netdev
In-Reply-To: <20161218001507.GA5343@electric-eye.fr.zoreil.com>

Hi,

On 18.12.2016 01:15, Francois Romieu wrote:
> Pavel Machek <pavel@ucw.cz> :
> [...]
>> Won't this up/down the interface, in a way userspace can observe?
> 
> It won't up/down the interface as it doesn't exactly mimic what the
> network code does (there's more than rtnl_lock).
> 

Right. Userspace wont see link down/up, but it will see carrier off/on.
But this is AFAIK acceptable for a rare situation like a tx error.

> For the same reason it's broken if it races with the transmit path: it
> can release driver resources while the transmit path uses these.
> 
> Btw the points below may not matter/hurt much for a proof a concept
> but they would need to be addressed as well:
> 1) unchecked (and avoidable) extra error paths due to stmmac_release()
> 2) racy cancel_work_sync. Low probability as it is, an irq + error could
>    take place right after cancel_work_sync

It was indeed only meant as a proof of concept. Nevertheless the race is not 
good, since one can run into it when faking the tx error for testings purposes.
So below is a slightly improved version of the restart handling.
Its not meant as a final version either. But maybe we can use it as a starting
point.

> Lino, have you considered via-rhine.c since its "move work from irq to
> workqueue context" changes that started in
> 7ab87ff4c770eed71e3777936299292739fcd0fe [*] ?
> It's a shameless plug - originated in r8169.c - but it should be rather
> close to what the sxgbe and friends require. Thought / opinion ?
> 

Not really. There are a few drivers that I use to look into if I want to know
how certain things are done correctly (e.g the sky2 or the intel drivers) because
I think they are well implemented.
But you seem to have put some thoughts into various race condition problems
in the via-rhine driver and I can image that sxgbe and stmmac still have some
of these issues, too.

> [*] Including fixes/changes in:
> - 3a5a883a8a663b930908cae4abe5ec913b9b2fd2

Ok, the issues you fixed here are concerning the tx_curr and tx_dirty
pointers. For now this is not needed in stmmac and sxgbe since the
tx completion handlers in both drivers are not lock-free like in 
the via-rhine.c but are synchronized with xmit by means of the xmit_lock.

> - e1efa87241272104d6a12c8b9fcdc4f62634d447

Yep, a sync of the dma descriptors before the hardware gets ownership of the tx tail
idx is missing in the stmmac, too. 

> - 810f19bcb862f8889b27e0c9d9eceac9593925dd
> - e45af497950a89459a0c4b13ffd91e1729fffef4
> - a926592f5e4e900f3fa903298c4619a131e60963

I think we should use netif_tx_disable() instead of netif_stop_queue(), too, in 
case of restart to avoid a possible schedule of the xmit function while we restart. 
So this is also part of the new patch.

Again the patch is only compile tested.

Regards,
Lino

---
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      |  1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 95 +++++++++++++++--------
 2 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index eab04ae..9c240d7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -131,6 +131,7 @@ struct stmmac_priv {
 	u32 rx_tail_addr;
 	u32 tx_tail_addr;
 	u32 mss;
+	struct work_struct tx_err_work;
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dbgfs_dir;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3e40578..5762750 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1403,37 +1403,6 @@ static inline void stmmac_disable_dma_irq(struct stmmac_priv *priv)
 }
 
 /**
- * stmmac_tx_err - to manage the tx error
- * @priv: driver private structure
- * Description: it cleans the descriptors and restarts the transmission
- * in case of transmission errors.
- */
-static void stmmac_tx_err(struct stmmac_priv *priv)
-{
-	int i;
-	netif_stop_queue(priv->dev);
-
-	priv->hw->dma->stop_tx(priv->ioaddr);
-	dma_free_tx_skbufs(priv);
-	for (i = 0; i < DMA_TX_SIZE; i++)
-		if (priv->extend_desc)
-			priv->hw->desc->init_tx_desc(&priv->dma_etx[i].basic,
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
-		else
-			priv->hw->desc->init_tx_desc(&priv->dma_tx[i],
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
-	priv->dirty_tx = 0;
-	priv->cur_tx = 0;
-	netdev_reset_queue(priv->dev);
-	priv->hw->dma->start_tx(priv->ioaddr);
-
-	priv->dev->stats.tx_errors++;
-	netif_wake_queue(priv->dev);
-}
-
-/**
  * stmmac_dma_interrupt - DMA ISR
  * @priv: driver private structure
  * Description: this is the DMA ISR. It is called by the main ISR.
@@ -1466,7 +1435,7 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
 			priv->xstats.threshold = tc;
 		}
 	} else if (unlikely(status == tx_hard_error))
-		stmmac_tx_err(priv);
+		schedule_work(&priv->tx_err_work);
 }
 
 /**
@@ -1902,6 +1871,7 @@ static int stmmac_release(struct net_device *dev)
 	if (priv->lpi_irq > 0)
 		free_irq(priv->lpi_irq, dev);
 
+	cancel_work_sync(&priv->tx_err_work);
 	/* Stop TX/RX DMA and clear the descriptors */
 	priv->hw->dma->stop_tx(priv->ioaddr);
 	priv->hw->dma->stop_rx(priv->ioaddr);
@@ -1920,9 +1890,67 @@ static int stmmac_release(struct net_device *dev)
 
 	stmmac_release_ptp(priv);
 
+
 	return 0;
 }
 
+static void stmmac_shutdown(struct net_device *dev)
+{
+	struct stmmac_priv *priv = netdev_priv(dev);
+
+	/* make sure xmit is not scheduled any more */
+	netif_tx_disable(dev);
+
+	if (priv->eee_enabled)
+		del_timer_sync(&priv->eee_ctrl_timer);
+
+	/* Stop and disconnect the PHY */
+	if (dev->phydev) {
+		phy_stop(dev->phydev);
+		phy_disconnect(dev->phydev);
+	}
+
+	napi_disable(&priv->napi);
+
+	del_timer_sync(&priv->txtimer);
+
+	/* Free the IRQ lines */
+	free_irq(dev->irq, dev);
+	if (priv->wol_irq != dev->irq)
+		free_irq(priv->wol_irq, dev);
+	if (priv->lpi_irq > 0)
+		free_irq(priv->lpi_irq, dev);
+
+	/* Stop TX/RX DMA and clear the descriptors */
+	priv->hw->dma->stop_tx(priv->ioaddr);
+	priv->hw->dma->stop_rx(priv->ioaddr);
+
+	/* Release and free the Rx/Tx resources */
+	free_dma_desc_resources(priv);
+
+	/* Disable the MAC Rx/Tx */
+	stmmac_set_mac(priv->ioaddr, false);
+
+	netif_carrier_off(dev);
+
+#ifdef CONFIG_DEBUG_FS
+	stmmac_exit_fs(dev);
+#endif
+
+	stmmac_release_ptp(priv);
+}
+
+static void stmmac_tx_err_work(struct work_struct *work)
+{
+	struct stmmac_priv *priv = container_of(work, struct stmmac_priv,
+						tx_err_work);
+	/* restart netdev */
+	rtnl_lock();
+	stmmac_shutdown(priv->dev);
+	stmmac_open(priv->dev);
+	rtnl_unlock();
+}
+
 /**
  *  stmmac_tso_allocator - close entry point of the driver
  *  @priv: driver private structure
@@ -2688,7 +2716,7 @@ static void stmmac_tx_timeout(struct net_device *dev)
 	struct stmmac_priv *priv = netdev_priv(dev);
 
 	/* Clear Tx resources and restart transmitting again */
-	stmmac_tx_err(priv);
+	schedule_work(&priv->tx_err_work);
 }
 
 /**
@@ -3338,6 +3366,7 @@ int stmmac_dvr_probe(struct device *device,
 	netif_napi_add(ndev, &priv->napi, stmmac_poll, 64);
 
 	spin_lock_init(&priv->lock);
+	INIT_WORK(&priv->tx_err_work, stmmac_tx_err_work);
 
 	ret = register_netdev(ndev);
 	if (ret) {
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH net-next v4 2/2] net: stmmac: dwmac-meson8b: make the RGMII TX delay configurable
From: Martin Blumenstingl @ 2016-12-18 16:13 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, devicetree, linux-amlogic, robh+dt, mark.rutland, carlo,
	khilman, peppe.cavallaro, alexandre.torgue, linux-arm-kernel
In-Reply-To: <20161218.104950.1013829528388480468.davem@davemloft.net>

On Sun, Dec 18, 2016 at 4:49 PM, David Miller <davem@davemloft.net> wrote:
> From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
> Date: Sat, 17 Dec 2016 19:21:19 +0100
>
>> Prior to this patch we were using a hardcoded RGMII TX clock delay of
>> 2ns (= 1/4 cycle of the 125MHz RGMII TX clock). This value works for
>> many boards, but unfortunately not for all (due to the way the actual
>> circuit is designed, sometimes because the TX delay is enabled in the
>> PHY, etc.). Making the TX delay on the MAC side configurable allows us
>> to support all possible hardware combinations.
>>
>> This allows fixing a compatibility issue on some boards, where the
>> RTL8211F PHY is configured to generate the TX delay. We can now turn
>> off the TX delay in the MAC, because otherwise we would be applying the
>> delay twice (which results in non-working TX traffic).
>>
>> Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
>> Tested-by: Neil Armstrong <narmstrong@baylibre.com>
>
> Is this really the safest thing to do?
>
> If you say the existing hard-coded setting of 1/4 cycle works on most
> boards, and what you're trying to do is override it with an OF
> property value for boards where the existing setting does not work,
> then you _must_ use a default value that corresponds to what the
> existing code does not when you don't see this new OF property.
it's a bit more complicated in reality: 1/4 cycle works when the TX
delay of the RTL8211F PHY is turned off (until recently it was always
enabled for phy-mode RGMII).

> So please retain the current behavior of the 1/4 cycle TX delay
> setting when you don't see the amlogic,tx-delay-ns property.
>
> I really think you risk breaking existing boards by not doing so,
> unless you can have this patch tested on every such board that exists
> and I don't think you really can feasibly and rigorously do that.
there's a patch in my follow-up series which adds the 2ns to the .dts
for all RGMII based boards: [0] (and I would keep these even if we had
a default value, just to make it explicit and thus easier to
understand for other people).
however, we can add the 2ns default back (I can do this if you want -
Rob Herring was unhappy with the missing documentation of this default
value [1] - so note to myself: take care of that as well). but then we
have to decide when to apply this default value: only when we're in
RGMII mode or also in any of the RGMII_*ID modes?

please let me know how we should proceed


Regards,
Martin


[0] http://lists.infradead.org/pipermail/linux-amlogic/2016-December/001838.html
[1] http://lists.infradead.org/pipermail/linux-amlogic/2016-November/001817.html

^ permalink raw reply

* Re: regression: ath_tx_edma_tasklet() Illegal idle entry in RCU read-side critical section
From: Paul E. McKenney @ 2016-12-18 15:59 UTC (permalink / raw)
  To: Gabriel C
  Cc: lkml, ath9k-devel, linux-wireless, ath9k-devel, netdev, nbd,
	kvalo
In-Reply-To: <23a2a3ab-974a-ed26-6afa-aafab9bb972e@gmail.com>

On Sun, Dec 18, 2016 at 02:52:48PM +0100, Gabriel C wrote:
> Hello,
> 
> while testing kernel 4.9 I run into a weird issue with the ath9k driver.
> 
> I can boot the box in console mode and it stay up sometime but is not usable.

Looks to me like someone forgot an rcu_read_unlock() somewhere.  Given that
the unmatched rcu_read_lock() appears in ath_tx_edma_tasklet(), perhaps
that is also where the missing rcu_read_unlock() is.  And sure enough,
in the middle of this function we have the following:

		fifo_list = &txq->txq_fifo[txq->txq_tailidx];
		if (list_empty(fifo_list)) {
			ath_txq_unlock(sc, txq);
			return;
		}

This will of course return while still in an RCU read-side critical
section.  The caller cannot tell the difference between a return here
and falling off the end of the function, so this is likely the bug.
Or one of the bugs, anyway.  Copying the author and committer for
their thoughts.

Please try the patch at the end of this email.

							Thanx, Paul

> from dmesg :
> 
> ===============================
> [ INFO: suspicious RCU usage. ]
> 4.9-fw1 #1 Tainted: G          I
> -------------------------------
> kernel/rcu/tree.c:705 Illegal idle entry in RCU read-side critical section.!
> 
> other info that might help us debug this:
> 
> 
> RCU used illegally from idle CPU!
> rcu_scheduler_active = 1, debug_locks = 1
> RCU used illegally from extended quiescent state!
> 1 lock held by swapper/0/0:
>  #0:  (rcu_read_lock){......}, at: [<ffffffffa0ee0240>] ath_tx_edma_tasklet+0x0/0x460 [ath9k]
> 
> stack backtrace:
> CPU: 0 PID: 0 Comm: swapper/0 Tainted: G          I     4.9-fw1 #1
> Hardware name: FUJITSU                          PRIMERGY TX200 S5             /D2709, BIOS 6.00 Rev. 1.14.2709              02/04/2013
>  ffff88043ee03f38 ffffffff812cf0f3 ffffffff81a11540 0000000000000001
>  ffff88043ee03f68 ffffffff810b7865 ffffffff81a55d58 ffff88043efcedc0
>  ffff88083cb1ca00 00000000000000d1 ffff88043ee03f88 ffffffff810dbfe8
> Call Trace:
>  <IRQ>
>  [<ffffffff812cf0f3>] dump_stack+0x86/0xc3
>  [<ffffffff810b7865>] lockdep_rcu_suspicious+0xc5/0x100
>  [<ffffffff810dbfe8>] rcu_eqs_enter_common.constprop.62+0x128/0x130
>  [<ffffffff810ddc78>] rcu_irq_exit+0x38/0x70
>  [<ffffffff81067ec4>] irq_exit+0x74/0xd0
>  [<ffffffff8101e561>] do_IRQ+0x71/0x130
>  [<ffffffff8158700c>] common_interrupt+0x8c/0x8c
>  <EOI>
>  [<ffffffff81472836>] ? cpuidle_enter_state+0x156/0x220
>  [<ffffffff81472922>] cpuidle_enter+0x12/0x20
>  [<ffffffff810ad23e>] call_cpuidle+0x1e/0x40
>  [<ffffffff810ad46d>] cpu_startup_entry+0x11d/0x210
>  [<ffffffff8157892c>] rest_init+0x12c/0x140
>  [<ffffffff81d02ec3>] start_kernel+0x40f/0x41c
>  [<ffffffff81d02120>] ? early_idt_handler_array+0x120/0x120
>  [<ffffffff81d02299>] x86_64_start_reservations+0x2a/0x2c
>  [<ffffffff81d02386>] x86_64_start_kernel+0xeb/0xf8

------------------------------------------------------------------------

commit 5a16fed76936184a7ac22e466cf39bd8bb5ee65e
Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Date:   Sun Dec 18 07:49:00 2016 -0800

    drivers/ath: Add missing rcu_read_unlock() to ath_tx_edma_tasklet()
    
    Commit d94a461d7a7d ("ath9k: use ieee80211_tx_status_noskb where possible")
    added rcu_read_lock() and rcu_read_unlock() around the body of
    ath_tx_edma_tasklet(), but failed to add the needed rcu_read_unlock()
    before a "return" in the middle of this function.  This commit therefore
    adds the missing rcu_read_unlock().
    
    Reported-by: Gabriel C <nix.or.die@gmail.com>
    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
    Cc: Felix Fietkau <nbd@nbd.name>
    Cc: Kalle Valo <kvalo@qca.qualcomm.com>
    Cc: QCA ath9k Development <ath9k-devel@qca.qualcomm.com>
    Cc: <linux-wireless@vger.kernel.org?
    Cc: <ath9k-devel@lists.ath9k.org>

diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 52bfbb988611..857d5ae09a1d 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -2787,6 +2787,7 @@ void ath_tx_edma_tasklet(struct ath_softc *sc)
 		fifo_list = &txq->txq_fifo[txq->txq_tailidx];
 		if (list_empty(fifo_list)) {
 			ath_txq_unlock(sc, txq);
+			rcu_read_unlock();
 			return;
 		}
 

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox