Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 25/30] net: add rb_to_skb() and other rb tree helpers
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

Generalize private netem_rb_to_skb()

TCP rtx queue will soon be converted to rb-tree,
so we will need skb_rbtree_walk() helpers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 18a4c0eab2623cc95be98a1e6af1ad18e7695977)
---
 include/linux/skbuff.h  | 18 ++++++++++++++++++
 net/ipv4/tcp_fastopen.c |  8 +++-----
 net/ipv4/tcp_input.c    | 33 ++++++++++++---------------------
 net/sched/sch_netem.c   | 14 ++++----------
 4 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 758084b434c8..2837e55df03e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3169,6 +3169,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 
 #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     skb != (struct sk_buff *)(queue);				\
@@ -3183,6 +3189,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 		for (; skb != (struct sk_buff *)(queue);			\
 		     skb = skb->next)
 
+#define skb_rbtree_walk(skb, root)						\
+		for (skb = skb_rb_first(root); skb != NULL;			\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from(skb)						\
+		for (; skb != NULL;						\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from_safe(skb, tmp)					\
+		for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);	\
+		     skb = tmp)
+
 #define skb_queue_walk_from_safe(queue, skb, tmp)				\
 		for (tmp = skb->next;						\
 		     skb != (struct sk_buff *)(queue);				\
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index fbbeda647774..0567edb76522 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -458,17 +458,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node *p;
-	struct sk_buff *skb;
 	struct dst_entry *dst;
+	struct sk_buff *skb;
 
 	if (!tp->syn_fastopen)
 		return;
 
 	if (!tp->data_segs_in) {
-		p = rb_first(&tp->out_of_order_queue);
-		if (p && !rb_next(p)) {
-			skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = skb_rb_first(&tp->out_of_order_queue);
+		if (skb && !skb_rb_next(skb)) {
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
 				tcp_fastopen_active_disable(sk);
 				return;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bdabd748f4bc..991f382afc1b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4372,7 +4372,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 	p = rb_first(&tp->out_of_order_queue);
 	while (p) {
-		skb = rb_entry(p, struct sk_buff, rbnode);
+		skb = rb_to_skb(p);
 		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 
@@ -4440,7 +4440,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct rb_node **p, *q, *parent;
+	struct rb_node **p, *parent;
 	struct sk_buff *skb1;
 	u32 seq, end_seq;
 	bool fragstolen;
@@ -4503,7 +4503,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	parent = NULL;
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(seq, TCP_SKB_CB(skb1)->seq)) {
 			p = &parent->rb_left;
 			continue;
@@ -4548,9 +4548,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 
 merge_right:
 	/* Remove other segments covered by skb. */
-	while ((q = rb_next(&skb->rbnode)) != NULL) {
-		skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+	while ((skb1 = skb_rb_next(skb)) != NULL) {
 		if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
 			break;
 		if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4565,7 +4563,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_drop(sk, skb1);
 	}
 	/* If there is no skb after us, we are the last_skb ! */
-	if (!q)
+	if (!skb1)
 		tp->ooo_last_skb = skb;
 
 add_sack:
@@ -4749,7 +4747,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
 	if (list)
 		return !skb_queue_is_last(list, skb) ? skb->next : NULL;
 
-	return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+	return skb_rb_next(skb);
 }
 
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4778,7 +4776,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 
 	while (*p) {
 		parent = *p;
-		skb1 = rb_entry(parent, struct sk_buff, rbnode);
+		skb1 = rb_to_skb(parent);
 		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
 			p = &parent->rb_left;
 		else
@@ -4898,19 +4896,12 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 range_truesize, sum_tiny = 0;
 	struct sk_buff *skb, *head;
-	struct rb_node *p;
 	u32 start, end;
 
-	p = rb_first(&tp->out_of_order_queue);
-	skb = rb_entry_safe(p, struct sk_buff, rbnode);
+	skb = skb_rb_first(&tp->out_of_order_queue);
 new_range:
 	if (!skb) {
-		p = rb_last(&tp->out_of_order_queue);
-		/* Note: This is possible p is NULL here. We do not
-		 * use rb_entry_safe(), as ooo_last_skb is valid only
-		 * if rbtree is not empty.
-		 */
-		tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+		tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
 		return;
 	}
 	start = TCP_SKB_CB(skb)->seq;
@@ -4918,7 +4909,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	range_truesize = skb->truesize;
 
 	for (head = skb;;) {
-		skb = tcp_skb_next(skb, NULL);
+		skb = skb_rb_next(skb);
 
 		/* Range is terminated when we see a gap or when
 		 * we are at the queue end.
@@ -4974,7 +4965,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
 		goal -= rb_to_skb(node)->truesize;
-		tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+		tcp_drop(sk, rb_to_skb(node));
 		if (!prev || goal <= 0) {
 			sk_mem_reclaim(sk);
 			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -4984,7 +4975,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 		}
 		node = prev;
 	} while (node);
-	tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+	tp->ooo_last_skb = rb_to_skb(prev);
 
 	/* Reset SACK state.  A conforming SACK implementation will
 	 * do the same at a timeout based retransmit.  When a connection
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 8c8df75dbead..2a2ab6bfe5d8 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -149,12 +149,6 @@ struct netem_skb_cb {
 	ktime_t		tstamp_save;
 };
 
-
-static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
-{
-	return rb_entry(rb, struct sk_buff, rbnode);
-}
-
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
@@ -365,7 +359,7 @@ static void tfifo_reset(struct Qdisc *sch)
 	struct rb_node *p;
 
 	while ((p = rb_first(&q->t_root))) {
-		struct sk_buff *skb = netem_rb_to_skb(p);
+		struct sk_buff *skb = rb_to_skb(p);
 
 		rb_erase(p, &q->t_root);
 		rtnl_kfree_skbs(skb, skb);
@@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		struct sk_buff *skb;
 
 		parent = *p;
-		skb = netem_rb_to_skb(parent);
+		skb = rb_to_skb(parent);
 		if (tnext >= netem_skb_cb(skb)->time_to_send)
 			p = &parent->rb_right;
 		else
@@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				struct sk_buff *t_skb;
 				struct netem_skb_cb *t_last;
 
-				t_skb = netem_rb_to_skb(rb_last(&q->t_root));
+				t_skb = skb_rb_last(&q->t_root);
 				t_last = netem_skb_cb(t_skb);
 				if (!last ||
 				    t_last->time_to_send > last->time_to_send) {
@@ -618,7 +612,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 	if (p) {
 		psched_time_t time_to_send;
 
-		skb = netem_rb_to_skb(p);
+		skb = rb_to_skb(p);
 
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 22/30] net: modify skb_rbtree_purge to return the truesize of all purged skbs.
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet, Peter Oskolkov, Florian Westphal
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Peter Oskolkov <posk@google.com>

Tested: see the next patch in the series.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 385114dec8a49b5e5945e77ba7de6356106713f4)
---
 include/linux/skbuff.h | 2 +-
 net/core/skbuff.c      | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f4749678b7ee..9c8457375aee 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2581,7 +2581,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 		kfree_skb(skb);
 }
 
-void skb_rbtree_purge(struct rb_root *root);
+unsigned int skb_rbtree_purge(struct rb_root *root);
 
 void *netdev_alloc_frag(unsigned int fragsz);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c7c5f05f2af1..8fd690def5c1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2842,23 +2842,27 @@ EXPORT_SYMBOL(skb_queue_purge);
 /**
  *	skb_rbtree_purge - empty a skb rbtree
  *	@root: root of the rbtree to empty
+ *	Return value: the sum of truesizes of all purged skbs.
  *
  *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
  *	the list and one reference dropped. This function does not take
  *	any lock. Synchronization should be handled by the caller (e.g., TCP
  *	out-of-order queue is protected by the socket lock).
  */
-void skb_rbtree_purge(struct rb_root *root)
+unsigned int skb_rbtree_purge(struct rb_root *root)
 {
 	struct rb_node *p = rb_first(root);
+	unsigned int sum = 0;
 
 	while (p) {
 		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 
 		p = rb_next(p);
 		rb_erase(&skb->rbnode, root);
+		sum += skb->truesize;
 		kfree_skb(skb);
 	}
+	return sum;
 }
 
 /**
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 21/30] net: speed up skb_rbtree_purge()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

As measured in my prior patch ("sch_netem: faster rb tree removal"),
rbtree_postorder_for_each_entry_safe() is nice looking but much slower
than using rb_next() directly, except when tree is small enough
to fit in CPU caches (then the cost is the same)

Also note that there is not even an increase of text size :
$ size net/core/skbuff.o.before net/core/skbuff.o
   text	   data	    bss	    dec	    hex	filename
  40711	   1298	      0	  42009	   a419	net/core/skbuff.o.before
  40711	   1298	      0	  42009	   a419	net/core/skbuff.o

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 7c90584c66cc4b033a3b684b0e0950f79e7b7166)
---
 net/core/skbuff.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2e5eeba97de9..c7c5f05f2af1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2850,12 +2850,15 @@ EXPORT_SYMBOL(skb_queue_purge);
  */
 void skb_rbtree_purge(struct rb_root *root)
 {
-	struct sk_buff *skb, *next;
+	struct rb_node *p = rb_first(root);
 
-	rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
-		kfree_skb(skb);
+	while (p) {
+		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 
-	*root = RB_ROOT;
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, root);
+		kfree_skb(skb);
+	}
 }
 
 /**
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 16/30] rhashtable: reorganize struct rhashtable layout
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

While under frags DDOS I noticed unfortunate false sharing between
@nelems and @params.automatic_shrinking

Move @nelems to the end of struct rhashtable so that the first cache line
is shared between all cpus, because it is almost never dirtied.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit e5d672a0780d9e7118caad4c171ec88b8299398d)
---
 include/linux/rhashtable.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 7fd514f36e74..a4be6388a980 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -152,25 +152,25 @@ struct rhashtable_params {
 /**
  * struct rhashtable - Hash table handle
  * @tbl: Bucket table
- * @nelems: Number of elements in table
  * @key_len: Key length for hashfn
- * @p: Configuration parameters
  * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
  * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
  */
 struct rhashtable {
 	struct bucket_table __rcu	*tbl;
-	atomic_t			nelems;
 	unsigned int			key_len;
-	struct rhashtable_params	p;
 	unsigned int			max_elems;
+	struct rhashtable_params	p;
 	bool				rhlist;
 	struct work_struct		run_work;
 	struct mutex                    mutex;
 	spinlock_t			lock;
+	atomic_t			nelems;
 };
 
 /**
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 12/30] inet: frags: remove inet_frag_maybe_warn_overflow()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

This function is obsolete, after rhashtable addition to inet defrag.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 2d44ed22e607f9a285b049de2263e3840673a260)
---
 include/net/inet_frag.h                 |  2 --
 net/ieee802154/6lowpan/reassembly.c     |  5 ++---
 net/ipv4/inet_fragment.c                | 11 -----------
 net/ipv4/ip_fragment.c                  |  5 ++---
 net/ipv6/netfilter/nf_conntrack_reasm.c |  5 ++---
 net/ipv6/reassembly.c                   |  5 ++---
 6 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 0e8e159d88f7..95e353e3305b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -110,8 +110,6 @@ void inet_frags_exit_net(struct netns_frags *nf);
 void inet_frag_kill(struct inet_frag_queue *q);
 void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix);
 
 static inline void inet_frag_put(struct inet_frag_queue *q)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 0fa0121f85d4..1aec71a3f904 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -84,10 +84,9 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct lowpan_frag_queue, q);
 }
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index ebb8f411e0db..c9e35b81d093 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -218,14 +218,3 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
 	return inet_frag_create(nf, key);
 }
 EXPORT_SYMBOL(inet_frag_find);
-
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix)
-{
-	static const char msg[] = "inet_frag_find: Fragment hash bucket"
-		" list length grew over limit. Dropping fragment.\n";
-
-	if (PTR_ERR(q) == -ENOBUFS)
-		net_dbg_ratelimited("%s%s", prefix, msg);
-}
-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1222aee3e5ee..38cbf56bb48e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&net->ipv4.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct ipq, q);
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 8b12431ae296..54ce1d2a9a9d 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -178,10 +178,9 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&net->nf_frag.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 70acad126d04..2a77fda5e3bc 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -155,10 +155,9 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 		key.iif = 0;
 
 	q = inet_frag_find(&net->ipv6.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 11/30] inet: frags: get rid of inet_frag_evicting()
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

This refactors ip_expire() since one indentation level is removed.

Note: in the future, we should try hard to avoid the skb_clone()
since this is a serious performance cost.
Under DDOS, the ICMP message won't be sent because of rate limits.

The fact that ip6_expire_frag_queue() does not use skb_clone() is
disturbing too. Presumably IPv6 should have the same
issue as the one we fixed in commit ec4fbd64751d
("inet: frag: release spinlock before calling icmp_send()")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 399d1404be660d355192ff4df5ccc3f4159ec1e4)
---
 include/net/inet_frag.h |  5 ----
 net/ipv4/ip_fragment.c  | 65 ++++++++++++++++++++---------------------
 net/ipv6/reassembly.c   |  4 ---
 3 files changed, 32 insertions(+), 42 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 4b5449df0aad..0e8e159d88f7 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -119,11 +119,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
 		inet_frag_destroy(q);
 }
 
-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
-{
-	return false;
-}
-
 /* Memory Tracking Functions. */
 
 static inline int frag_mem_limit(struct netns_frags *nf)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 94451fad9994..1222aee3e5ee 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 user)
 static void ip_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
-	struct ipq *qp;
+	struct sk_buff *clone, *head;
+	const struct iphdr *iph;
 	struct net *net;
+	struct ipq *qp;
+	int err;
 
 	qp = container_of(frag, struct ipq, q);
 	net = container_of(qp->q.net, struct net, ipv4.frags);
@@ -158,45 +161,41 @@ static void ip_expire(struct timer_list *t)
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 
-	if (!inet_frag_evicting(&qp->q)) {
-		struct sk_buff *clone, *head = qp->q.fragments;
-		const struct iphdr *iph;
-		int err;
+	head = qp->q.fragments;
 
-		__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
-			goto out;
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+		goto out;
 
-		head->dev = dev_get_by_index_rcu(net, qp->iif);
-		if (!head->dev)
-			goto out;
+	head->dev = dev_get_by_index_rcu(net, qp->iif);
+	if (!head->dev)
+		goto out;
 
 
-		/* skb has no dst, perform route lookup again */
-		iph = ip_hdr(head);
-		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+	/* skb has no dst, perform route lookup again */
+	iph = ip_hdr(head);
+	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
 					   iph->tos, head->dev);
-		if (err)
-			goto out;
+	if (err)
+		goto out;
 
-		/* Only an end host needs to send an ICMP
-		 * "Fragment Reassembly Timeout" message, per RFC792.
-		 */
-		if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
-		    (skb_rtable(head)->rt_type != RTN_LOCAL))
-			goto out;
-
-		clone = skb_clone(head, GFP_ATOMIC);
-
-		/* Send an ICMP "Fragment Reassembly Timeout" message. */
-		if (clone) {
-			spin_unlock(&qp->q.lock);
-			icmp_send(clone, ICMP_TIME_EXCEEDED,
-				  ICMP_EXC_FRAGTIME, 0);
-			consume_skb(clone);
-			goto out_rcu_unlock;
-		}
+	/* Only an end host needs to send an ICMP
+	 * "Fragment Reassembly Timeout" message, per RFC792.
+	 */
+	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+	    (skb_rtable(head)->rt_type != RTN_LOCAL))
+		goto out;
+
+	clone = skb_clone(head, GFP_ATOMIC);
+
+	/* Send an ICMP "Fragment Reassembly Timeout" message. */
+	if (clone) {
+		spin_unlock(&qp->q.lock);
+		icmp_send(clone, ICMP_TIME_EXCEEDED,
+			  ICMP_EXC_FRAGTIME, 0);
+		consume_skb(clone);
+		goto out_rcu_unlock;
 	}
 out:
 	spin_unlock(&qp->q.lock);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3fc853e4492a..70acad126d04 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 		goto out_rcu_unlock;
 
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
-
-	if (inet_frag_evicting(&fq->q))
-		goto out_rcu_unlock;
-
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
 
 	/* Don't send error if the first segment did not arrive. */
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 08/30] rhashtable: add schedule points
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

Rehashing and destroying a large hash table takes a lot of time,
and happens in process context. It is safe to add cond_resched()
in rhashtable_rehash_table() and rhashtable_free_and_destroy()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit ae6da1f503abb5a5081f9f6c4a6881de97830f3e)
---
 lib/rhashtable.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 39215c724fc7..cebbcec877d7 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -364,6 +364,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 		err = rhashtable_rehash_chain(ht, old_hash);
 		if (err)
 			return err;
+		cond_resched();
 	}
 
 	/* Publish the new table pointer. */
@@ -1073,6 +1074,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 		for (i = 0; i < tbl->size; i++) {
 			struct rhash_head *pos, *next;
 
+			cond_resched();
 			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
 			     next = !rht_is_a_nulls(pos) ?
 					rht_dereference(pos->next, ht) : NULL;
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 07/30] ipv6: export ip6 fragments sysctl to unprivileged users
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet, Nikolay Borisov
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

IPv4 was changed in commit 52a773d645e9 ("net: Export ip fragment
sysctl to unprivileged users")

The only sysctl that is not per-netns is not used :
ip6frag_secret_interval

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 18dcbe12fe9fca0ab825f7eff993060525ac2503)
---
 net/ipv6/reassembly.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 42b6b2ba447a..f0071b113a92 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -649,10 +649,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
 		table[1].data = &net->ipv6.frags.low_thresh;
 		table[1].extra2 = &net->ipv6.frags.high_thresh;
 		table[2].data = &net->ipv6.frags.timeout;
-
-		/* Don't export sysctls to unprivileged users */
-		if (net->user_ns != &init_user_ns)
-			table[0].procname = NULL;
 	}
 
 	hdr = register_net_sysctl(net, "net/ipv6", table);
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 02/30] inet: frags: add a pointer to struct netns_frags
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

In order to simplify the API, add a pointer to struct inet_frags in struct netns_frags.
This will allow us to make things less complex.

These functions no longer have a struct inet_frags parameter :

inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */)
ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 093ba72914b696521e4885756a68a3332782c8de)
---
 include/net/inet_frag.h                 | 11 ++++++-----
 include/net/ipv6.h                      |  3 +--
 net/ieee802154/6lowpan/reassembly.c     | 13 +++++++------
 net/ipv4/inet_fragment.c                | 17 ++++++++++-------
 net/ipv4/ip_fragment.c                  |  9 +++++----
 net/ipv6/netfilter/nf_conntrack_reasm.c | 16 +++++++++-------
 net/ipv6/reassembly.c                   | 20 ++++++++++----------
 7 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 2ad894e446ac..fd338293a095 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -10,6 +10,7 @@ struct netns_frags {
 	int			high_thresh;
 	int			low_thresh;
 	int			max_dist;
+	struct inet_frags	*f;
 };
 
 /**
@@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags *nf)
 	atomic_set(&nf->mem, 0);
 	return 0;
 }
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+void inet_frags_exit_net(struct netns_frags *nf);
 
-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
+void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash);
 
 void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
 				   const char *prefix);
 
-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
+static inline void inet_frag_put(struct inet_frag_queue *q)
 {
 	if (refcount_dec_and_test(&q->refcnt))
-		inet_frag_destroy(q, f);
+		inet_frag_destroy(q);
 }
 
 static inline bool inet_frag_evicting(struct inet_frag_queue *q)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f280c61e019a..ff8407b19d05 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -560,8 +560,7 @@ struct frag_queue {
 	u8			ecn;
 };
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-			   struct inet_frags *frags);
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
 
 static inline bool ipv6_addr_any(const struct in6_addr *a)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 9757ce6c077a..9ccb8458b5c3 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -93,10 +93,10 @@ static void lowpan_frag_expire(unsigned long data)
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, &lowpan_frags);
+	inet_frag_put(&fq->q);
 }
 
 static inline struct lowpan_frag_queue *
@@ -229,7 +229,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 	struct sk_buff *fp, *head = fq->q.fragments;
 	int sum_truesize;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 
 	/* Make the one we just received the head. */
 	if (prev) {
@@ -437,7 +437,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 		ret = lowpan_frag_queue(fq, skb, frag_type);
 		spin_unlock(&fq->q.lock);
 
-		inet_frag_put(&fq->q, &lowpan_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -585,13 +585,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+	ieee802154_lowpan->frags.f = &lowpan_frags;
 
 	res = inet_frags_init_net(&ieee802154_lowpan->frags);
 	if (res < 0)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+		inet_frags_exit_net(&ieee802154_lowpan->frags);
 	return res;
 }
 
@@ -601,7 +602,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 		net_ieee802154_lowpan(net);
 
 	lowpan_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+	inet_frags_exit_net(&ieee802154_lowpan->frags);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index ba4454ecdf0f..4b44f973c37f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+void inet_frags_exit_net(struct netns_frags *nf)
 {
+	struct inet_frags *f = nf->f;
 	unsigned int seq;
 	int i;
 
@@ -264,33 +265,34 @@ __acquires(hb->chain_lock)
 	return hb;
 }
 
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+static inline void fq_unlink(struct inet_frag_queue *fq)
 {
 	struct inet_frag_bucket *hb;
 
-	hb = get_frag_bucket_locked(fq, f);
+	hb = get_frag_bucket_locked(fq, fq->net->f);
 	hlist_del(&fq->list);
 	fq->flags |= INET_FRAG_COMPLETE;
 	spin_unlock(&hb->chain_lock);
 }
 
-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
+void inet_frag_kill(struct inet_frag_queue *fq)
 {
 	if (del_timer(&fq->timer))
 		refcount_dec(&fq->refcnt);
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
-		fq_unlink(fq, f);
+		fq_unlink(fq);
 		refcount_dec(&fq->refcnt);
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
 
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
+void inet_frag_destroy(struct inet_frag_queue *q)
 {
 	struct sk_buff *fp;
 	struct netns_frags *nf;
 	unsigned int sum, sum_truesize = 0;
+	struct inet_frags *f;
 
 	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
 	WARN_ON(del_timer(&q->timer) != 0);
@@ -298,6 +300,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 	/* Release all fragment data. */
 	fp = q->fragments;
 	nf = q->net;
+	f = nf->f;
 	while (fp) {
 		struct sk_buff *xp = fp->next;
 
@@ -333,7 +336,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 			refcount_inc(&qp->refcnt);
 			spin_unlock(&hb->chain_lock);
 			qp_in->flags |= INET_FRAG_COMPLETE;
-			inet_frag_put(qp_in, f);
+			inet_frag_put(qp_in);
 			return qp;
 		}
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 916def2e2afe..c32718b00761 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -168,7 +168,7 @@ static void ip4_frag_free(struct inet_frag_queue *q)
 
 static void ipq_put(struct ipq *ipq)
 {
-	inet_frag_put(&ipq->q, &ip4_frags);
+	inet_frag_put(&ipq->q);
 }
 
 /* Kill ipq entry. It is not destroyed immediately,
@@ -176,7 +176,7 @@ static void ipq_put(struct ipq *ipq)
  */
 static void ipq_kill(struct ipq *ipq)
 {
-	inet_frag_kill(&ipq->q, &ip4_frags);
+	inet_frag_kill(&ipq->q);
 }
 
 static bool frag_expire_skip_icmp(u32 user)
@@ -876,20 +876,21 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	net->ipv4.frags.timeout = IP_FRAG_TIME;
 
 	net->ipv4.frags.max_dist = 64;
+	net->ipv4.frags.f = &ip4_frags;
 
 	res = inet_frags_init_net(&net->ipv4.frags);
 	if (res < 0)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+		inet_frags_exit_net(&net->ipv4.frags);
 	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
-	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+	inet_frags_exit_net(&net->ipv4.frags);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index afa9ea76155d..7ea2b4490672 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -177,7 +177,7 @@ static void nf_ct_frag6_expire(unsigned long data)
 	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, nf_frag.frags);
 
-	ip6_expire_frag_queue(net, fq, &nf_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 /* Creation primitives. */
@@ -263,7 +263,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
-			inet_frag_kill(&fq->q, &nf_frags);
+			inet_frag_kill(&fq->q);
 			return -EPROTO;
 		}
 		if (end > fq->q.len) {
@@ -356,7 +356,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return 0;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 err:
 	return -EINVAL;
 }
@@ -378,7 +378,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 	int    payload_len;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 
 	WARN_ON(head == NULL);
 	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
@@ -623,7 +623,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 out_unlock:
 	spin_unlock_bh(&fq->q.lock);
-	inet_frag_put(&fq->q, &nf_frags);
+	inet_frag_put(&fq->q);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
@@ -635,19 +635,21 @@ static int nf_ct_net_init(struct net *net)
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->nf_frag.frags.f = &nf_frags;
+
 	res = inet_frags_init_net(&net->nf_frag.frags);
 	if (res < 0)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+		inet_frags_exit_net(&net->nf_frag.frags);
 	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
-	inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+	inet_frags_exit_net(&net->nf_frag.frags);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 38bf38a9717f..26f737c3fc7b 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -128,8 +128,7 @@ void ip6_frag_init(struct inet_frag_queue *q, const void *a)
 }
 EXPORT_SYMBOL(ip6_frag_init);
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-			   struct inet_frags *frags)
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 {
 	struct net_device *dev = NULL;
 
@@ -138,7 +137,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, frags);
+	inet_frag_kill(&fq->q);
 
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, fq->iif);
@@ -166,7 +165,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	rcu_read_unlock();
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, frags);
+	inet_frag_put(&fq->q);
 }
 EXPORT_SYMBOL(ip6_expire_frag_queue);
 
@@ -178,7 +177,7 @@ static void ip6_frag_expire(unsigned long data)
 	fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, ipv6.frags);
 
-	ip6_expire_frag_queue(net, fq, &ip6_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 static struct frag_queue *
@@ -363,7 +362,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return -1;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 err:
 	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			IPSTATS_MIB_REASMFAILS);
@@ -390,7 +389,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	int sum_truesize;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 
 	ecn = ip_frag_ecn_table[fq->ecn];
 	if (unlikely(ecn == 0xff))
@@ -568,7 +567,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		spin_unlock(&fq->q.lock);
-		inet_frag_put(&fq->q, &ip6_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -719,6 +718,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->ipv6.frags.f = &ip6_frags;
 
 	res = inet_frags_init_net(&net->ipv6.frags);
 	if (res < 0)
@@ -726,14 +726,14 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 
 	res = ip6_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+		inet_frags_exit_net(&net->ipv6.frags);
 	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+	inet_frags_exit_net(&net->ipv6.frags);
 }
 
 static struct pernet_operations ip6_frags_ops = {
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 01/30] inet: frags: change inet_frags_init_net() return value
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet
In-Reply-To: <20180913145902.17531-1-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

We will soon initialize one rhashtable per struct netns_frags
in inet_frags_init_net().

This patch changes the return value to eventually propagate an
error.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 787bea7748a76130566f881c2342a0be4127d182)
---
 include/net/inet_frag.h                 |  3 ++-
 net/ieee802154/6lowpan/reassembly.c     | 11 ++++++++---
 net/ipv4/ip_fragment.c                  | 12 +++++++++---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++++++++---
 net/ipv6/reassembly.c                   | 11 +++++++++--
 5 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index a6e4edd8d4a2..2ad894e446ac 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -104,9 +104,10 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline void inet_frags_init_net(struct netns_frags *nf)
+static inline int inet_frags_init_net(struct netns_frags *nf)
 {
 	atomic_set(&nf->mem, 0);
+	return 0;
 }
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index f85b08baff16..9757ce6c077a 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -580,14 +580,19 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 {
 	struct netns_ieee802154_lowpan *ieee802154_lowpan =
 		net_ieee802154_lowpan(net);
+	int res;
 
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
 
-	inet_frags_init_net(&ieee802154_lowpan->frags);
-
-	return lowpan_frags_ns_sysctl_register(net);
+	res = inet_frags_init_net(&ieee802154_lowpan->frags);
+	if (res < 0)
+		return res;
+	res = lowpan_frags_ns_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+	return res;
 }
 
 static void __net_exit lowpan_frags_exit_net(struct net *net)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 4cb1befc3949..916def2e2afe 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -850,6 +850,8 @@ static void __init ip4_frags_ctl_register(void)
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
+	int res;
+
 	/* Fragment cache limits.
 	 *
 	 * The fragment memory accounting code, (tries to) account for
@@ -875,9 +877,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 
 	net->ipv4.frags.max_dist = 64;
 
-	inet_frags_init_net(&net->ipv4.frags);
-
-	return ip4_frags_ns_ctl_register(net);
+	res = inet_frags_init_net(&net->ipv4.frags);
+	if (res < 0)
+		return res;
+	res = ip4_frags_ns_ctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index ee33a6743f3b..afa9ea76155d 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -630,12 +630,18 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 static int nf_ct_net_init(struct net *net)
 {
+	int res;
+
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
-	inet_frags_init_net(&net->nf_frag.frags);
-
-	return nf_ct_frag6_sysctl_register(net);
+	res = inet_frags_init_net(&net->nf_frag.frags);
+	if (res < 0)
+		return res;
+	res = nf_ct_frag6_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 846012eae526..38bf38a9717f 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -714,13 +714,20 @@ static void ip6_frags_sysctl_unregister(void)
 
 static int __net_init ipv6_frags_init_net(struct net *net)
 {
+	int res;
+
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 
-	inet_frags_init_net(&net->ipv6.frags);
+	res = inet_frags_init_net(&net->ipv6.frags);
+	if (res < 0)
+		return res;
 
-	return ip6_frags_ns_sysctl_register(net);
+	res = ip6_frags_ns_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
-- 
2.18.0

^ permalink raw reply related

* [PATCH v3 00/30] backport of IP fragmentation fixes
From: Stephen Hemminger @ 2018-09-13 14:58 UTC (permalink / raw)
  To: davem, gregkh; +Cc: netdev, stable, edumazet, Stephen Hemminger

Took the set of patches from 4.19 to handle IP fragmentation DoS
and applied them against 4.14.69.  Most of these are from Eric.
In a couple of cases, it required some manual merge conflict resolution.

Tested normal IP fragmentation with iperf3 and malicious IP fragments
with fragmentsmack. Under fragmentation attack (700Kpps) the original
4.14.69 consumes 97% CPU; with this patch it drops to 5%.

v3 - send to wider audience
v2 - added patch from 4.19 linux-next to fix ip fragmentation crash

Dan Carpenter (1):
  ipv4: frags: precedence bug in ip_expire()

Eric Dumazet (22):
  inet: frags: change inet_frags_init_net() return value
  inet: frags: add a pointer to struct netns_frags
  inet: frags: refactor ipfrag_init()
  inet: frags: refactor ipv6_frag_init()
  inet: frags: refactor lowpan_net_frag_init()
  ipv6: export ip6 fragments sysctl to unprivileged users
  rhashtable: add schedule points
  inet: frags: use rhashtables for reassembly units
  inet: frags: remove some helpers
  inet: frags: get rif of inet_frag_evicting()
  inet: frags: remove inet_frag_maybe_warn_overflow()
  inet: frags: break the 2GB limit for frags storage
  inet: frags: do not clone skb in ip_expire()
  ipv6: frags: rewrite ip6_expire_frag_queue()
  rhashtable: reorganize struct rhashtable layout
  inet: frags: reorganize struct netns_frags
  inet: frags: get rid of ipfrag_skb_cb/FRAG_CB
  inet: frags: fix ip6frag_low_thresh boundary
  net: speed up skb_rbtree_purge()
  net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends
  net: add rb_to_skb() and other rb tree helpers
  net: sk_buff rbnode reorg

Florian Westphal (1):
  ipv6: defrag: drop non-last frags smaller than min mtu

Kees Cook (1):
  inet: frags: Convert timers to use timer_setup()

Peter Oskolkov (4):
  ip: discard IPv4 datagrams with overlapping segments.
  net: modify skb_rbtree_purge to return the truesize of all purged
    skbs.
  ip: add helpers to process in-order fragments faster.
  ip: process in-order fragments efficiently

Taehee Yoo (1):
  ip: frags: fix crash in ip_do_fragment()

 Documentation/networking/ip-sysctl.txt  |  13 +-
 include/linux/rhashtable.h              |   8 +-
 include/linux/skbuff.h                  |  50 +-
 include/net/inet_frag.h                 | 135 +++---
 include/net/ip.h                        |   1 -
 include/net/ipv6.h                      |  26 +-
 include/uapi/linux/snmp.h               |   1 +
 lib/rhashtable.c                        |   2 +
 net/core/skbuff.c                       |  31 +-
 net/ieee802154/6lowpan/6lowpan_i.h      |  26 +-
 net/ieee802154/6lowpan/reassembly.c     | 153 ++++---
 net/ipv4/inet_fragment.c                | 378 ++++------------
 net/ipv4/ip_fragment.c                  | 578 +++++++++++++-----------
 net/ipv4/proc.c                         |   7 +-
 net/ipv4/tcp_fastopen.c                 |   8 +-
 net/ipv4/tcp_input.c                    |  33 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 105 ++---
 net/ipv6/proc.c                         |   5 +-
 net/ipv6/reassembly.c                   | 217 ++++-----
 net/sched/sch_netem.c                   |  14 +-
 20 files changed, 802 insertions(+), 989 deletions(-)

-- 
2.18.0

^ permalink raw reply

* pull request: bluetooth 2018-09-13
From: Johan Hedberg @ 2018-09-13  9:45 UTC (permalink / raw)
  To: davem; +Cc: linux-bluetooth, netdev

[-- Attachment #1: Type: text/plain, Size: 1097 bytes --]

Hi Dave,

A few Bluetooth fixes for the 4.19-rc series:

 - Fixed rw_semaphore leak in hci_ldisc
 - Fixed local Out-of-Band pairing data handling

Let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit 7c5cca3588545e7f255171e28e0dd6e384ebb91d:

  qmi_wwan: Support dynamic config on Quectel EP06 (2018-09-10 10:48:54 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git for-upstream

for you to fetch changes up to e6a57d22f787e73635ce0d29eef0abb77928b3e9:

  Bluetooth: hci_ldisc: Free rw_semaphore on close (2018-09-11 13:33:57 +0200)

----------------------------------------------------------------
Hermes Zhang (1):
      Bluetooth: hci_ldisc: Free rw_semaphore on close

Johan Hedberg (1):
      Bluetooth: SMP: Fix trying to use non-existent local OOB data

Matias Karhumaa (1):
      Bluetooth: Use correct tfm to generate OOB data

 drivers/bluetooth/hci_ldisc.c |  2 ++
 net/bluetooth/smp.c           | 16 +++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 14:34 UTC (permalink / raw)
  To: gmazyland
  Cc: Andrew Lutomirski, Ard Biesheuvel, LKML, Netdev, David Miller,
	Greg Kroah-Hartman, Samuel Neves, Jean-Philippe Aumasson,
	Linux Crypto Mailing List
In-Reply-To: <c5855aa4-1561-8387-aebb-0a7587ac6d4f@gmail.com>

Hi Milan,

On Thu, Sep 13, 2018 at 8:40 AM Milan Broz <gmazyland@gmail.com> wrote:
> Please note, that dm-crypt now uses not only block ciphers and modes,
> but also authenticated encryption and hashes (for ESSIV and HMAC
> in authenticated composed modes) and RNG (for random IV).
> We use crypto API, including async variants (I hope correctly :)
>
> There is a long time battle to move initialization vectors generators
> from dm-crypt to crypto API. If there are any plans to use a new library,
> this issue should be discussed as well.
> (Some dm-crypt IV generators are disk encryption specific, some do more
> than just IV so porting is not straightforward etc).
>
> Related problem here is an optimization of chain of sectors encryption -
> if we have new crypto API, it would be nice if can take chain of sectors
> so possible implementation can process this chain in one batch
> (every sector need to be tweaked by differently generated IV - and we
> are back in problem above).
> I think filesystem encryption uses the same pattern.
>
> And btw, we use the same algorithms through AF_ALG in userspace (cryptsetup).
>
> So please, if you mention dm-crypt, note that it is very complex
> crypto API consumer :) And everything is dynamic, configurable through
> dm-crypt options.
>
> That said, I would be more than happy to help in experiments to porting dm-crypt
> to any other crypto library, but if it doesn't help with problems
> mentioned above, I do not see any compelling reason for the new library for dm-crypt...

dm-crypt is probably a good consumer of the existing crypto API and
won't be impacted by the introduction of Zinc, which is really just
the exposure of a couple low level simple crypto functions, and not a
fancy API like the crypto API which dm-crypt happily uses.

Jason

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 14:32 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Andrew Lutomirski, LKML, Netdev, David Miller, Greg Kroah-Hartman,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAKv+Gu8z1r7F_wbR6-e0Vbryqby7djDy2OR6JJURCsTo6ZMa2g@mail.gmail.com>

On Thu, Sep 13, 2018 at 7:41 AM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> But one of the supposed selling points of this crypto library is that
> it gives engineers who are frightened of crypto in general and the
> crypto API in particular simple and easy to use crypto primitives
> rather than having to jump through the crypto API's hoops.

The goal is for engineers who want to specifically use algorithm X
from within the kernel in a non-dynamic way to be able to then use
algorithm X with a simple function call. The goal is not to open it up
to people who have no idea what they're doing; for that a NaCL-like
library with functions like "crypto_box_open" or something would fit
the bill; but that's also not what we're trying to do here. Please
don't confuse the design goals. The rest of your email is therefore a
bit of a straw man; cut the rhetoric out.

> A crypto library whose only encryption algorithm is a stream cipher
> does *not* deliver on that promise, since it is only suitable for
> cases where IVs are guaranteed not to be reused.

False. We also offer XChaCha20Poly1305, which takes a massive nonce,
suitable for random generation.

If there became a useful case for AES-PMAC-SIV or even AES-GCM or
something to that extent, then Zinc would add that as required. But
we're not going to start adding random ciphers unless they're needed.

> You yourself were
> bitten by the clunkiness of the crypto API when attempting to use the
> SHA256 code, right? So shouldn't we move that into this crypto library
> as well?

As stated in the initial commit, and in numerous other emails
stretching back a year, yes, sha256 and other things in lib/ are going
to be put into Zinc following the initial merge of Zinc. These changes
will happen incrementally, like everything else that happens in the
kernel. Sha256, in particular, is probably the first thing I'll port
post-merge.

> I think it is reasonable for WireGuard to standardize on
> ChaCha20/Poly1305 only, although I have my concerns about the flag day
> that will be required if this 'one true cipher' ever does turn out to
> be compromised (either that, or we will have to go back in time and
> add some kind of protocol versioning to existing deployments of
> WireGuard)

Those concerns are not valid and have already been addressed (to you,
I believe) on this mailing list and elsewhere. WireGuard is versioned,
hence there's no need to "add" versioning, and it is prepared to roll
out new cryptography in a subsequent version should there be any
issues. In other words, your concern is based on a misunderstanding of
the protocol. If you have issues, however, with the design decisions
of WireGuard, something that's been heavily discussed with members of
the linux kernel community, networking community, cryptography
community, and so forth, for the last 3 years, I invite you to bring
them up on <wireguard@lists.zx2c4.com>.

> And frankly, if the code were as good as the prose, we wouldn't be
> having this discussion.

Please cut out this rhetoric. That's an obviously unprovable
statement, but it probably isn't true anyway. I wish you'd stick to
technical concerns only, rather than what appears to be a desire to
derail this by any means necessary.

> Zinc adds its own clunky ways to mix arch and
> generic code, involving GCC -include command line arguments and
> #ifdefs everywhere. My review comments on this were completely ignored
> by Jason.

No, they were not ignored. v2 cleaned up the #ifdefs. v4 has already
cleaned up the makefile stuff and will be even cleaner. Good things
await, don't worry.

Jason

^ permalink raw reply

* Re: [PATCH v8 0/4] gpiolib: speed up GPIO array processing
From: Linus Walleij @ 2018-09-13  9:22 UTC (permalink / raw)
  To: Janusz Krzysztofik
  Cc: Jonathan Corbet, Miguel Ojeda Sandonis, Peter Korsgaard,
	Peter Rosin, Ulf Hansson, Andrew Lunn, Florian Fainelli,
	David S. Miller, Dominik Brodowski, Greg KH, kishon,
	Lars-Peter Clausen, Michael Hennerich, Jonathan Cameron,
	Hartmut Knaack, Peter Meerwald, Jiri Slaby, Willy Tarreau,
	Geert Uytterhoeven, Sebastien Bourdelin
In-Reply-To: <20180905215008.1649-1-jmkrzyszt@gmail.com>

On Wed, Sep 5, 2018 at 11:49 PM Janusz Krzysztofik <jmkrzyszt@gmail.com> wrote:

> The goal is to boost performance of get/set array functions while
> processing GPIO arrays which represent pins of a single chip in
> hardware order.  If resulting performance is close to PIO, GPIO API
> can be used for data I/O without much loss of speed.

I applied the v8 to an immutable branch and pushed to kernelorg
so the build servers can churn it a bit, and if it works fine
then we can merge this into the devel branch and also set up
that as something other subsystems can pull in if they need it.

I'm really excited to merge this!

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH net-next RFC] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-13  9:13 UTC (permalink / raw)
  To: Willem de Bruijn, netdev; +Cc: mst, f.fainelli, Willem de Bruijn
In-Reply-To: <20180912232911.218610-1-willemdebruijn.kernel@gmail.com>



On 2018年09月13日 07:29, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
>
> Implement ethtool .set_priv_flags and .get_priv_flags handlers
> and use ethtool private flags to toggle transmit napi:
>
>    ethtool --set-priv-flags eth0 tx-napi on
>    ethtool --show-priv-flags eth0
>
> Link: https://patchwork.ozlabs.org/patch/948149/
> Suggested-by: Jason Wang <jasowang@redhat.com>
> Suggested-by: Florian Fainelli <f.fainelli@gmail.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>   drivers/net/virtio_net.c | 49 ++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 49 insertions(+)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 765920905226..9ca7e0a0f0d9 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -73,6 +73,10 @@ static const unsigned long guest_offloads[] = {
>   	VIRTIO_NET_F_GUEST_UFO
>   };
>   
> +static const char virtnet_ethtool_priv_flags[][ETH_GSTRING_LEN] = {
> +	"tx-napi",
> +};
> +
>   struct virtnet_stat_desc {
>   	char desc[ETH_GSTRING_LEN];
>   	size_t offset;
> @@ -2059,6 +2063,9 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
>   			}
>   		}
>   		break;
> +	case ETH_SS_PRIV_FLAGS:
> +		memcpy(data, virtnet_ethtool_priv_flags,
> +		       sizeof(virtnet_ethtool_priv_flags));
>   	}
>   }
>   
> @@ -2070,6 +2077,9 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
>   	case ETH_SS_STATS:
>   		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
>   					       VIRTNET_SQ_STATS_LEN);
> +	case ETH_SS_PRIV_FLAGS:
> +		return ARRAY_SIZE(virtnet_ethtool_priv_flags);
> +
>   	default:
>   		return -EOPNOTSUPP;
>   	}
> @@ -2181,6 +2191,43 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
>   	return 0;
>   }
>   
> +static int virtnet_set_priv_flags(struct net_device *dev, u32 priv_flags)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i, napi_weight;
> +
> +	napi_weight = priv_flags & 0x1 ? NAPI_POLL_WEIGHT : 0;
> +
> +	if (napi_weight ^ vi->sq[0].napi.weight) {
> +		for (i = 0; i < vi->max_queue_pairs; i++) {
> +			struct netdev_queue *txq =
> +				netdev_get_tx_queue(vi->dev, i);
> +
> +			virtnet_napi_tx_disable(&vi->sq[i].napi);
> +			__netif_tx_lock_bh(txq);
> +			vi->sq[i].napi.weight = napi_weight;
> +			if (!napi_weight)
> +				virtqueue_enable_cb(vi->sq[i].vq);

I don't get why we need to disable enable cb here.

Thanks

> +			__netif_tx_unlock_bh(txq);
> +			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> +					       &vi->sq[i].napi);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static u32 virtnet_get_priv_flags(struct net_device *dev)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int priv_flags = 0;
> +
> +	if (vi->sq[0].napi.weight)
> +		priv_flags |= 0x1;
> +
> +	return priv_flags;
> +}
> +
>   static void virtnet_init_settings(struct net_device *dev)
>   {
>   	struct virtnet_info *vi = netdev_priv(dev);
> @@ -2219,6 +2266,8 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>   	.get_ts_info = ethtool_op_get_ts_info,
>   	.get_link_ksettings = virtnet_get_link_ksettings,
>   	.set_link_ksettings = virtnet_set_link_ksettings,
> +	.set_priv_flags = virtnet_set_priv_flags,
> +	.get_priv_flags = virtnet_get_priv_flags,
>   };
>   
>   static void virtnet_freeze_down(struct virtio_device *vdev)

^ permalink raw reply

* Re: [PATCH net-next 1/8] devlink: Add generic parameter hw_tc_offload
From: Jiri Pirko @ 2018-09-13  9:08 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Vasundhara Volam, David Miller, michael.chan@broadcom.com, Netdev
In-Reply-To: <20180912083422.004b21a0@cakuba>

Wed, Sep 12, 2018 at 08:34:22AM CEST, jakub.kicinski@netronome.com wrote:
>On Wed, 12 Sep 2018 11:47:59 +0530, Vasundhara Volam wrote:
>> On Tue, Sep 11, 2018 at 3:25 PM Jiri Pirko <jiri@resnulli.us> wrote:
>> >
>> > Tue, Sep 11, 2018 at 10:44:58AM CEST, vasundhara-v.volam@broadcom.com wrote:  
>> > >hw_tc_offload - Enable/Disable TC flower offload in the device.
>> > >
>> > >Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
>> > >---
>> > > include/net/devlink.h | 4 ++++
>> > > net/core/devlink.c    | 5 +++++
>> > > 2 files changed, 9 insertions(+)
>> > >
>> > >diff --git a/include/net/devlink.h b/include/net/devlink.h
>> > >index b9b89d6..a0e9ce9 100644
>> > >--- a/include/net/devlink.h
>> > >+++ b/include/net/devlink.h
>> > >@@ -362,6 +362,7 @@ enum devlink_param_generic_id {
>> > >       DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
>> > >       DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
>> > >       DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
>> > >+      DEVLINK_PARAM_GENERIC_ID_HW_TC_OFFLOAD,  
>> >
>> > Could you please describe why do you need this here and why the
>> > tc_offload flag in ethtool is not enough. How do you imagine the user
>> > should use them together?  
>> Jiri, tc_offload flag in ethtool will modify feature in driver at
>> runtime. But I am adding
>> tc_offload param here to toggle this feature in NVM Config of our
>> adapter, whose configuration
>> mode is permanent and will be effective only with a reboot of the server.
>> 
>> User has to turn on tc_offload feature in NVM config of the adapter and then
>> enabling the tc_offload flag in ethtool will completely enable the
>> feature in driver.
>
>Thanks for explaining, however, I don't think we have trouble
>understanding *what* you are doing, but rather *why* you're doing it.

Exactly. Why ethtool tc_offload flag is not enough?

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 14:18 UTC (permalink / raw)
  To: Andrew Lutomirski
  Cc: Ard Biesheuvel, LKML, Netdev, David Miller, Greg Kroah-Hartman,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CALCETrU2qkLVaxC=cpU5iAeT5B7xGsR+m2ZWtLVK37jMMWtcAA@mail.gmail.com>

On Thu, Sep 13, 2018 at 1:45 AM Andy Lutomirski <luto@kernel.org> wrote:
> I'm not convinced that there's any real need for *all* crypto
> algorithms to move into lib/zinc or to move at all.  As I see it,
> there are two classes of crypto algorithms in the kernel:
>
> a) Crypto that is used by code that chooses its algorithm statically
> and wants synchronous operations.  These include everything in
> drivers/char/random.c, but also a bunch of various networking things
> that are hardcoded and basically everything that uses stack buffers.
> (This means it includes all the code that I broke when I did
> VMAP_STACK.  Sigh.)

Right, exactly. This is what will wind up using Zinc. I'm working on
an example usage of this for v4 of the patch submission, which you can
ogle in a preview here if you're curious:

https://git.zx2c4.com/linux-dev/commit/?h=big_key_rewrite

28 insertions, 206 deletions :-D

> b) Crypto that is used dynamically.  This includes dm-crypt
> (aes-xts-plain64, aes-cbc-essiv, etc), all the ALG_IF interfaces, a
> lot of IPSEC stuff, possibly KCM, and probably many more.  These will
> get comparatively little benefit from being converted to a zinc-like
> interface.  For some of these cases, it wouldn't make any sense at all
> to convert them.  Certainly the ones that do async hardware crypto
> using DMA engines will never look at all like zinc, even under the
> hood.

Right, this is what the crypto API will continue to be used for.


> I think that, as a short-term goal, it makes a lot of sense to have
> implementations of the crypto that *new* kernel code (like Wireguard)
> wants to use in style (a) that live in /lib, and it obviously makes
> sense to consolidate their implementations with the crypto/
> implementations in a timely manner.  As a medium-term goal, adding
> more algorithms as needed for things that could use the simpler APIs
> (Bluetooth, perhaps) would make sense.

Agreed 100%. With regards to "consolidate their implementations" --
I've actually already done this after your urging yesterday, and so
that will be a part of v4.

> But I see no reason at all that /lib should ever contain a grab-bag of
> crypto implementations just for the heck of it.  They should have real
> in-kernel users IMO.  And this means that there will probably always
> be some crypto implementations in crypto/ for things like aes-xts.

Right, precisely.

Jason

^ permalink raw reply

* Re: [PATCH net-next v3 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-09-13 14:15 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: LKML, Netdev, David Miller, Greg Kroah-Hartman, Andrew Lutomirski,
	Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <CAKv+Gu9RGo9rtGsmwMLZ_=-WSQ_h7har4agXP-2XOKupq-KYtA@mail.gmail.com>

Hi Ard,

On Thu, Sep 13, 2018 at 12:56 AM Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> In this series, you are dumping a huge volume of unannotated,
> generated asm into the kernel which has been modified [by you] to
> [among other things?] adhere to the kernel API (without documenting
> what the changes are exactly). How does that live up to the promise of
> better, peer reviewed code?

The code still benefits from the review that's gone into OpenSSL. It's
not modified in ways that would affect the cryptographic operations
being done. It's modified to be suitable for kernel space.

> Then there is the performance claim. We know for instance that the
> OpenSSL ARM NEON code for ChaCha20 is faster on cores that happen to
> possess a micro-architectural property that ALU instructions are
> essentially free when they are interleaved with SIMD instructions. But
> we also know that a) Cortex-A7, which is a relevant target, is not one
> of those cores, and b) that chip designers are not likely to optimize
> for that particular usage pattern so relying on it in generic code is
> unwise in general.

That's interesting. I'll bring this up with AndyP. FWIW, if you think
you have a real and compelling claim here, I'd be much more likely to
accept a different ChaCha20 implementation than I would be to accept a
different Poly1305 implementation. (It's a *lot* harder to screw up
ChaCha20 than it is to screw up Poly1305.)

> I am also concerned about your claim that all software algorithms will
> be moved into this crypto library.

I'll defer to Andy's response here, which I think is a correct one:
https://lkml.org/lkml/2018/9/13/27

The short answer is that Zinc is going to be adding the ciphers that
people want to use for normal reasons from normal code. For example,
after this merges, we'll next be working on moving the remaining
non-optimized C code out of lib/ that's called by places (such as
SHA2).

> You are not specific about whose
> responsibility it will be that this is going to happen in a timely
> fashion.

I thought I laid out the roadmap for this in the commit message. In
case I wasn't clear: my plan is to tackle lib/ after merging, and I
plan to do so in a timely manner. It's a pretty common tactic to keep
layering on tasks, "what about X?", "what about Y?", "I won't agree
unless Z!" -- when in reality kernel development and refactorings are
done incrementally. I've been around on this list contributing code
for long enough that you should have a decent amount of confidence
that I'm not just going to disappear working on this or something
insane like that. And neither are the two academic cryptographers CC'd
on this thread. So, as Andy said, we're going to be porting to Zinc
the primitives that are useful for the various applications of Zinc.
This means yes, we'll have SHA2 in there.

> chaining modes
> What are the APIs
> going to look like for block ciphers, taking chaining modes into
> account?

As mentioned in the commit message and numerous times, we're not
trying to make a win32-like crypto API here or to remake the existing
Linux crypto API. Rather we're providing libraries of specific
functions that are useful for various circumstances. For example, if
AES-GCM is desired at some point, then we'll have a similar API for
that as we do for ChaPoly -- one that takes buffers and one that takes
sg. Likewise, hash functions use the familiar init/update/final.
"Generic" chaining modes aren't really part of the equation or design
goals.

Again, I realize you've spent a long time working on the existing
crypto API, and so your questions and concerns are in the line of,
"how are we going to make Zinc look like the existing crypto API in
functionality?" But that's not what we're up to here. We have a
different and complementary design goal. I understand why you're
squirming, but please recognize we're working on different things.

> I'm sure it is rather simple to port the crypto API implementation of
> ChaCha20 to use your library. I am more concerned about how your
> library is going to expand to cover all other software algorithms that
> we currently use in the kernel.

The subset of algorithms we add will be developed with the same
methodology as the present ones. There is nothing making this
particularly difficult or even more difficult for other primitives
than it was for ChaCha20. It's especially easy, in fact, since we're
following similar design methodologies as the vast majority of other
cryptography libraries that have been developed. Namely, we're
creating simple things called "functions".

> Of course. But please respond to all the concerns,
> You have not
> responded to that concern yet.

Sorry, it's certainly not my intention. I've been on vacation with my
family for the last several weeks, and only returned home
sleep-deprived last night after 4 days of plane delays. I've now
rested and will resume working on this full-time and I'll try my best
to address concerns, and also go back through emails to find things I
might have missed. (First, though, I'm going to deal with getting back
the three suitcases the airline lost in transit...)

> > Anyway, it sounds like this whole thing may have ruffled your feathers
> > a bit. Will you be at Linux Plumbers Conference in November? I'm
> > planning on attending, and perhaps we could find some time there to
> > sit down and talk one on one a bit.
>
> That would be good, yes. I will be there.

Looking forward to talking to you there, and hopefully we can put to
rest any lingering concerns.

Jason

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-13  9:05 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <CAF=yD-+pf0JictDed5A3TM-6PTgnsiW=XFON5OPuLY=cL0ojKw@mail.gmail.com>



On 2018年09月12日 21:43, Willem de Bruijn wrote:
> On Tue, Sep 11, 2018 at 11:35 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>>
>> On 2018年09月11日 09:14, Willem de Bruijn wrote:
>>>>>> I cooked up a fixup, and it looks like it works in my setup:
>>>>>>
>>>>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>>>> index b320b6b14749..9181c3f2f832 100644
>>>>>> --- a/drivers/net/virtio_net.c
>>>>>> +++ b/drivers/net/virtio_net.c
>>>>>> @@ -2204,10 +2204,17 @@ static int virtnet_set_coalesce(struct
>>>>>> net_device *dev,
>>>>>>                     return -EINVAL;
>>>>>>
>>>>>>             if (napi_weight ^ vi->sq[0].napi.weight) {
>>>>>> -               if (dev->flags & IFF_UP)
>>>>>> -                       return -EBUSY;
>>>>>> -               for (i = 0; i < vi->max_queue_pairs; i++)
>>>>>> +               for (i = 0; i < vi->max_queue_pairs; i++) {
>>>>>> +                       struct netdev_queue *txq =
>>>>>> +                              netdev_get_tx_queue(vi->dev, i);
>>>>>> +
>>>>>> + virtnet_napi_tx_disable(&vi->sq[i].napi);
>>>>>> +                       __netif_tx_lock_bh(txq);
>>>>>>                             vi->sq[i].napi.weight = napi_weight;
>>>>>> +                       __netif_tx_unlock_bh(txq);
>>>>>> +                       virtnet_napi_tx_enable(vi, vi->sq[i].vq,
>>>>>> + &vi->sq[i].napi);
>>>>>> +               }
>>>>>>             }
>>>>>>
>>>>>>             return 0;
>>>>> Thanks! It passes my simple stress test, too. Which consists of two
>>>>> concurrent loops, one toggling the ethtool option, another running
>>>>> TCP_RR.
>>>>>
>>>>>> The only left case is the speculative tx polling in RX NAPI. I think we
>>>>>> don't need to care in this case since it was not a must for correctness.
>>>>> As long as the txq lock is held that will be a noop, anyway. The other
>>>>> concurrent action is skb_xmit_done. It looks correct to me, but need
>>>>> to think about it a bit. The tricky transition is coming out of napi without
>>>>> having >= 2 + MAX_SKB_FRAGS clean descriptors. If the queue is
>>>>> stopped it may deadlock transmission in no-napi mode.
>>>> Yes, maybe we can enable tx queue when napi weight is zero in
>>>> virtnet_poll_tx().
>>> Yes, that precaution should resolve that edge case.
>>>
>> I've done a stress test and it passes. The test contains:
>>
>> - vm with 2 queues
>> - a bash script to enable and disable tx napi
>> - two netperf UDP_STREAM sessions to send small packets
> Great. That matches my results. Do you want to send the v2?

Some mails were blocked, so I did not receive some replies in time. So I
posted a V2 (but as you've pointed out, it's buggy).

Thanks

^ permalink raw reply

* [PATCH 2/2] net: qcom/emac: add shared mdio bus support
From: Wang Dongsheng @ 2018-09-13  9:04 UTC (permalink / raw)
  To: timur; +Cc: davem, yu.zheng, Wang Dongsheng, netdev
In-Reply-To: <1536829493-10088-1-git-send-email-dongsheng.wang@hxt-semitech.com>

Share the mii_bus with other MAC devices because the QDF2400 EMAC
includes MDIO, and the motherboard has more than one PHY connected
to an MDIO bus.

Tested: QDF2400 (ACPI), built-in/insmod/rmmod

Signed-off-by: Wang Dongsheng <dongsheng.wang@hxt-semitech.com>
---
 drivers/net/ethernet/qualcomm/emac/emac-phy.c | 211 ++++++++++++++----
 drivers/net/ethernet/qualcomm/emac/emac.c     |   7 +-
 2 files changed, 174 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-phy.c b/drivers/net/ethernet/qualcomm/emac/emac-phy.c
index 53dbf1e163a8..327362f6b673 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-phy.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-phy.c
@@ -13,6 +13,7 @@
 /* Qualcomm Technologies, Inc. EMAC PHY Controller driver.
  */
 
+#include <linux/of_platform.h>
 #include <linux/of_mdio.h>
 #include <linux/phy.h>
 #include <linux/iopoll.h>
@@ -96,15 +97,14 @@ static int emac_mdio_write(struct mii_bus *bus, int addr, int regnum, u16 val)
 	return 0;
 }
 
-/* Configure the MDIO bus and connect the external PHY */
-int emac_phy_config(struct platform_device *pdev, struct emac_adapter *adpt)
+static int __do_emac_mido_bus_create(struct platform_device *pdev,
+				     struct emac_adapter *adpt)
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct mii_bus *mii_bus;
 	int ret;
 
-	/* Create the mii_bus object for talking to the MDIO bus */
-	adpt->mii_bus = mii_bus = devm_mdiobus_alloc(&pdev->dev);
+	mii_bus = devm_mdiobus_alloc(&pdev->dev);
 	if (!mii_bus)
 		return -ENOMEM;
 
@@ -115,50 +115,177 @@ int emac_phy_config(struct platform_device *pdev, struct emac_adapter *adpt)
 	mii_bus->parent = &pdev->dev;
 	mii_bus->priv = adpt;
 
-	if (has_acpi_companion(&pdev->dev)) {
-		u32 phy_addr;
+	ret = of_mdiobus_register(mii_bus, has_acpi_companion(&pdev->dev) ?
+				  NULL : np);
+	if (!ret) {
+		adpt->mii_bus = mii_bus;
+		return 0;
+	}
 
-		ret = mdiobus_register(mii_bus);
-		if (ret) {
-			dev_err(&pdev->dev, "could not register mdio bus\n");
-			return ret;
-		}
-		ret = device_property_read_u32(&pdev->dev, "phy-channel",
-					       &phy_addr);
-		if (ret)
-			/* If we can't read a valid phy address, then assume
-			 * that there is only one phy on this mdio bus.
-			 */
-			adpt->phydev = phy_find_first(mii_bus);
-		else
-			adpt->phydev = mdiobus_get_phy(mii_bus, phy_addr);
-
-		/* of_phy_find_device() claims a reference to the phydev,
-		 * so we do that here manually as well. When the driver
-		 * later unloads, it can unilaterally drop the reference
-		 * without worrying about ACPI vs DT.
-		 */
-		if (adpt->phydev)
-			get_device(&adpt->phydev->mdio.dev);
-	} else {
-		struct device_node *phy_np;
-
-		ret = of_mdiobus_register(mii_bus, np);
-		if (ret) {
-			dev_err(&pdev->dev, "could not register mdio bus\n");
-			return ret;
-		}
+	dev_err(&pdev->dev, "Could not register mdio bus\n");
+	return ret;
+}
 
-		phy_np = of_parse_phandle(np, "phy-handle", 0);
-		adpt->phydev = of_phy_find_device(phy_np);
-		of_node_put(phy_np);
+static int acpi_device_match(struct device *dev, void *fwnode)
+{
+	return dev->fwnode == fwnode;
+}
+
+static int emac_acpi_get_shared_bus(struct platform_device *pdev,
+				    struct mii_bus **bus)
+{
+	acpi_handle shared_handle;
+	struct acpi_device *adev;
+	const union acpi_object *obj;
+	union acpi_object *obj_e;
+	struct device *shared_dev;
+	struct net_device *shared_netdev;
+	struct emac_adapter *shared_adpt;
+	int ret;
+
+	adev = ACPI_COMPANION(&pdev->dev);
+	if (!adev)
+		return -ENODEV;
+
+	ret = acpi_dev_get_property(adev, "mdio-device", ACPI_TYPE_ANY, &obj);
+	if (ret) {
+		dev_err(&pdev->dev, "Missing mdio-device property\n");
+		return -ENODEV;
 	}
 
-	if (!adpt->phydev) {
-		dev_err(&pdev->dev, "could not find external phy\n");
-		mdiobus_unregister(mii_bus);
+	if (obj->package.count != 1)
+		return -ENODEV;
+
+	obj_e = &obj->package.elements[0];
+	if (obj_e->type != ACPI_TYPE_LOCAL_REFERENCE)
+		return -ENODEV;
+
+	if (obj_e->reference.actual_type != ACPI_TYPE_DEVICE)
+		return -ENODEV;
+
+	shared_handle = obj_e->reference.handle;
+	if (!shared_handle || acpi_bus_get_device(shared_handle, &adev))
+		return -ENODEV;
+
+	shared_dev = bus_find_device(&platform_bus_type, NULL,
+				     acpi_fwnode_handle(adev),
+				     acpi_device_match);
+	if (!shared_dev)
+		return -EPROBE_DEFER;
+
+	shared_netdev = dev_get_drvdata(shared_dev);
+	if (!shared_netdev)
+		return -EPROBE_DEFER;
+
+	shared_adpt = netdev_priv(shared_netdev);
+	if (!shared_adpt->mii_bus)
+		return -EPROBE_DEFER;
+
+	*bus = shared_adpt->mii_bus;
+	return 0;
+}
+
+static int emac_of_get_shared_bus(struct platform_device *pdev,
+				  struct mii_bus **bus)
+{
+	struct device_node *shared_node;
+	struct platform_device *shared_pdev;
+	struct net_device *shared_netdev;
+	struct emac_adapter *shared_adpt;
+	struct device_node *np = pdev->dev.of_node;
+
+	const phandle *prop;
+
+	prop = of_get_property(np, "mdio-device", NULL);
+	if (!prop) {
+		dev_err(&pdev->dev, "Missing mdio-device property\n");
 		return -ENODEV;
 	}
 
+	shared_node = of_find_node_by_phandle(*prop);
+	if (!shared_node)
+		return -ENODEV;
+
+	shared_pdev = of_find_device_by_node(shared_node);
+	if (!shared_pdev)
+		return -ENODEV;
+
+	shared_netdev = dev_get_drvdata(&shared_pdev->dev);
+	if (!shared_netdev)
+		return -EPROBE_DEFER;
+
+	shared_adpt = netdev_priv(shared_netdev);
+	if (!shared_adpt->mii_bus)
+		return -EPROBE_DEFER;
+
+	*bus = shared_adpt->mii_bus;
 	return 0;
 }
+
+static int __do_get_emac_mido_shared_bus(struct platform_device *pdev,
+					 struct emac_adapter *adpt)
+{
+	int ret = -ENODEV;
+
+	if (IS_ENABLED(CONFIG_ACPI)) {
+		ret = emac_acpi_get_shared_bus(pdev, &adpt->mii_bus);
+		if (adpt->mii_bus || ret == -EPROBE_DEFER)
+			return ret;
+	}
+
+	if (IS_ENABLED(CONFIG_OF)) {
+		ret = emac_of_get_shared_bus(pdev, &adpt->mii_bus);
+		if (adpt->mii_bus || ret == -EPROBE_DEFER)
+			return ret;
+	}
+
+	return ret;
+}
+
+static int emac_mdio_bus_create(struct platform_device *pdev,
+				struct emac_adapter *adpt)
+{
+	bool shared_mdio;
+
+	shared_mdio = device_property_read_bool(&pdev->dev, "mdio-device");
+	if (shared_mdio)
+		return __do_get_emac_mido_shared_bus(pdev, adpt);
+
+	return __do_emac_mido_bus_create(pdev, adpt);
+}
+
+/* Configure the MDIO bus and connect the external PHY */
+int emac_phy_config(struct platform_device *pdev, struct emac_adapter *adpt)
+{
+	struct device *dev = &pdev->dev;
+	u32 phy_addr = PHY_MAX_ADDR;
+	int ret;
+
+	ret = emac_mdio_bus_create(pdev, adpt);
+	if (ret)
+		return ret;
+
+	ret = device_property_read_u32(dev,
+				       has_acpi_companion(dev) ?
+				       "phy-channel" : "phy-handle",
+				       &phy_addr);
+	if (ret || phy_addr == PHY_MAX_ADDR)
+		/* If we can't read a valid phy address, then assume
+		 * that there is only one phy on this mdio bus.
+		 */
+		adpt->phydev = phy_find_first(adpt->mii_bus);
+	else
+		adpt->phydev = mdiobus_get_phy(adpt->mii_bus, phy_addr);
+
+	if (adpt->phydev) {
+		get_device(&adpt->phydev->mdio.dev);
+		return 0;
+	}
+
+	dev_err(dev, "Could not find external phy\n");
+	/* Only the bus creator can unregister mdio bus */
+	if (dev == adpt->mii_bus->parent)
+		mdiobus_unregister(adpt->mii_bus);
+
+	return -ENODEV;
+}
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 2a0cbc535a2e..11d0fe795616 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -738,7 +738,8 @@ static int emac_probe(struct platform_device *pdev)
 
 static int emac_remove(struct platform_device *pdev)
 {
-	struct net_device *netdev = dev_get_drvdata(&pdev->dev);
+	struct device *dev = &pdev->dev;
+	struct net_device *netdev = dev_get_drvdata(dev);
 	struct emac_adapter *adpt = netdev_priv(netdev);
 
 	unregister_netdev(netdev);
@@ -747,7 +748,9 @@ static int emac_remove(struct platform_device *pdev)
 	emac_clks_teardown(adpt);
 
 	put_device(&adpt->phydev->mdio.dev);
-	mdiobus_unregister(adpt->mii_bus);
+	/* Only the bus creator can unregister mdio bus */
+	if (dev == adpt->mii_bus->parent)
+		mdiobus_unregister(adpt->mii_bus);
 	free_netdev(netdev);
 
 	if (adpt->phy.digital)
-- 
2.18.0

^ permalink raw reply related

* [PATCH 0/2] net: qcom/emac: add shared mdio bus support
From: Wang Dongsheng @ 2018-09-13  9:04 UTC (permalink / raw)
  To: timur; +Cc: davem, yu.zheng, Wang Dongsheng, netdev, devicetree

Share the mii_bus with other MAC devices because the QDF2400 EMAC
includes MDIO, and the motherboard has more than one PHY connected
to an MDIO bus.

Tested: QDF2400 (ACPI), built-in/insmod/rmmod

Wang Dongsheng (2):
  dt-bindings: net: qcom: Add binding for shared mdio bus
  net: qcom/emac: add shared mdio bus support

 .../devicetree/bindings/net/qcom-emac.txt     |   4 +
 drivers/net/ethernet/qualcomm/emac/emac-phy.c | 211 ++++++++++++++----
 drivers/net/ethernet/qualcomm/emac/emac.c     |   7 +-
 3 files changed, 178 insertions(+), 44 deletions(-)

-- 
2.18.0

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-13  9:04 UTC (permalink / raw)
  To: Willem de Bruijn, f.fainelli
  Cc: Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <CAF=yD-+Aqe=9GbKGo_n748D97W2rJHdsYL+cay1gyR4eA2Hc=w@mail.gmail.com>



On 2018年09月13日 03:11, Willem de Bruijn wrote:
> On Wed, Sep 12, 2018 at 2:16 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>
>>
>> On 9/12/2018 11:07 AM, Willem de Bruijn wrote:
>>> On Wed, Sep 12, 2018 at 1:42 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>>>
>>>>
>>>> On 9/9/2018 3:44 PM, Willem de Bruijn wrote:
>>>>> From: Willem de Bruijn <willemb@google.com>
>>>>>
>>>>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
>>>>> Interrupt moderation is currently not supported, so these accept and
>>>>> display the default settings of 0 usec and 1 frame.
>>>>>
>>>>> Toggle tx napi through a bit in tx-frames. So as to not interfere
>>>>> with possible future interrupt moderation, use bit 10, well outside
>>>>> the reasonable range of real interrupt moderation values.
>>>>>
>>>>> Changes are not atomic. The tx IRQ, napi BH and transmit path must
>>>>> be quiesced when switching modes. Only allow changing this setting
>>>>> when the device is down.
>>>> Humm, would not a private ethtool flag to switch TX NAPI on/off be more
>>>> appropriate rather than use the coalescing configuration API here?
>>> What do you mean by private ethtool flag? A new field in ethtool
>>> --features (-k)?
>> I meant using ethtool_drvinfo::n_priv_flags, ETH_SS_PRIV_FLAGS and then
>> ETHTOOL_GFPFLAGS and ETHTOOL_SPFLAGS to control the toggling of that
>> private flag. mlx5 has a number of privates flags for instance.
> Interesting, thanks! I was not at all aware of those ethtool flags.
> Am having a look. It definitely looks promising.
>
>>> Configurable napi-tx is not a common feature across devices. We really
>>> want virtio-net to also just convert to napi-tx as default, but need a
>>> way to gradually convert with application opt-out if some workloads
>>> see regressions.
>> The rationale makes sense, no questions about it.
>>
>>> There is prior art in interpreting coalesce values as
>>> more than a direct mapping to usec. The e1000 is one example.
>>>
>> Looked at both e1000 and e1000e and they both have a similar programming
>> of the HW's interrupt target rate register, which is relevant to
>> interrupt coalescing, what part of these drivers do you see as doing
>> something not quite coalescing related?
> It's all coalescing related, for sure. e1000_set_coalesce just does not
> translate the tx-usecs values into microsecond latency directly.
>
> It modifies both the interrupt throttle rate adapter->itr and interrupt mode
> adapter->itr_setting, which are initially set in e1000_check_options from
> module param InterruptThrottleRate.
>
> Value 0 disables interrupt moderation. 1 and 3 program a dynamic mode.
> 2 is an illegal value as is 5..9. 10..10000 converts from usec to interrupt
> rate/sec.
>
> I took tx-napi to be a similar interrupt related option as, say, dynamic
> conservative mode interrupt moderation.

Considering that we may have interrupt moderation in the future, I tend to
prefer set_coalesce. Otherwise we may need two steps to enable moderation:

- tx-napi on
- set_coalesce

Thanks

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-13  9:02 UTC (permalink / raw)
  To: Willem de Bruijn, f.fainelli
  Cc: Network Development, David Miller, caleb.raitto,
	Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <CAF=yD-KhFpzb8APxZypu-C8rKVkUUKarA1fhB=9JFT7SXCpuXA@mail.gmail.com>



On 2018年09月13日 07:27, Willem de Bruijn wrote:
> On Wed, Sep 12, 2018 at 3:11 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Wed, Sep 12, 2018 at 2:16 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>>
>>>
>>> On 9/12/2018 11:07 AM, Willem de Bruijn wrote:
>>>> On Wed, Sep 12, 2018 at 1:42 PM Florian Fainelli <f.fainelli@gmail.com> wrote:
>>>>>
>>>>>
>>>>> On 9/9/2018 3:44 PM, Willem de Bruijn wrote:
>>>>>> From: Willem de Bruijn <willemb@google.com>
>>>>>>
>>>>>> Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
>>>>>> Interrupt moderation is currently not supported, so these accept and
>>>>>> display the default settings of 0 usec and 1 frame.
>>>>>>
>>>>>> Toggle tx napi through a bit in tx-frames. So as to not interfere
>>>>>> with possible future interrupt moderation, use bit 10, well outside
>>>>>> the reasonable range of real interrupt moderation values.
>>>>>>
>>>>>> Changes are not atomic. The tx IRQ, napi BH and transmit path must
>>>>>> be quiesced when switching modes. Only allow changing this setting
>>>>>> when the device is down.
>>>>> Humm, would not a private ethtool flag to switch TX NAPI on/off be more
>>>>> appropriate rather than use the coalescing configuration API here?
>>>> What do you mean by private ethtool flag? A new field in ethtool
>>>> --features (-k)?
>>> I meant using ethtool_drvinfo::n_priv_flags, ETH_SS_PRIV_FLAGS and then
>>> ETHTOOL_GFPFLAGS and ETHTOOL_SPFLAGS to control the toggling of that
>>> private flag. mlx5 has a number of privates flags for instance.
>> Interesting, thanks! I was not at all aware of those ethtool flags.
>> Am having a look. It definitely looks promising.
> Okay, I made that change. That is indeed much cleaner, thanks.
> Let me send the patch, initially as RFC.
>
> I've observed one issue where if we toggle the flag before bringing
> up the device, it hits a kernel BUG at include/linux/netdevice.h:515
>
>          BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

This reminds me that we need to check netif_running() before trying to 
enable and disable tx napi in ethtool_set_coalesce().

Thanks

^ permalink raw reply

* [PATCH] ath10k: Remove duplicated includes
From: YueHaibing @ 2018-09-13 14:01 UTC (permalink / raw)
  To: ath10k, kvalo; +Cc: linux-kernel, netdev, davem, linux-wireless, YueHaibing

Remove duplicated includes from the ath10k driver.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 drivers/net/wireless/ath/ath10k/htt.h     | 1 -
 drivers/net/wireless/ath/ath10k/mac.c     | 1 -
 drivers/net/wireless/ath/ath10k/wmi-tlv.c | 1 -
 3 files changed, 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h
index 5d3ff80..eaf3a00 100644
--- a/drivers/net/wireless/ath/ath10k/htt.h
+++ b/drivers/net/wireless/ath/ath10k/htt.h
@@ -29,7 +29,6 @@
 #include "htc.h"
 #include "hw.h"
 #include "rx_desc.h"
-#include "hw.h"
 
 enum htt_dbg_stats_type {
 	HTT_DBG_STATS_WAL_PDEV_TXRX = 1 << 0,
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 496772d..e37ea39 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -30,7 +30,6 @@
 #include "htt.h"
 #include "txrx.h"
 #include "testmode.h"
-#include "wmi.h"
 #include "wmi-tlv.h"
 #include "wmi-ops.h"
 #include "wow.h"
diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
index cdc1e64..6c6656d6 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
@@ -19,7 +19,6 @@
 #include "debug.h"
 #include "mac.h"
 #include "hw.h"
-#include "mac.h"
 #include "wmi.h"
 #include "wmi-ops.h"
 #include "wmi-tlv.h"
-- 
2.7.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox