Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next:master 42/46] net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
From: kbuild test robot @ 2017-09-21 23:27 UTC (permalink / raw)
  To: Paolo Abeni; +Cc: kbuild-all, netdev

[-- Attachment #1: Type: text/plain, Size: 2244 bytes --]

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   b6cd4b5895848968e8fee93fc5e3dc8babc40b9e
commit: 6e617de84e87d626d1e976fc30e1322239fd4d2d [42/46] net: avoid a full fib lookup when rp_filter is disabled.
config: x86_64-kexec (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        git checkout 6e617de84e87d626d1e976fc30e1322239fd4d2d
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   net/ipv4/fib_frontend.c: In function 'fib_validate_source':
>> net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
      if (net->ipv4.fib_has_custom_local_routes)
                   ^
   net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute':
   net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
      net->ipv4.fib_has_custom_local_routes = true;
               ^

vim +411 net/ipv4/fib_frontend.c

   395	
   396	/* Ignore rp_filter for packets protected by IPsec. */
   397	int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
   398				u8 tos, int oif, struct net_device *dev,
   399				struct in_device *idev, u32 *itag)
   400	{
   401		int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
   402		struct net *net = dev_net(dev);
   403	
   404		if (!r && !fib_num_tclassid_users(net) &&
   405		    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
   406			if (IN_DEV_ACCEPT_LOCAL(idev))
   407				goto ok;
   408			/* if no local routes are added from user space we can check
   409			 * for local addresses looking-up the ifaddr table
   410			 */
 > 411			if (net->ipv4.fib_has_custom_local_routes)
   412				goto full_check;
   413			if (inet_lookup_ifaddr_rcu(net, src))
   414				return -EINVAL;
   415	
   416	ok:
   417			*itag = 0;
   418			return 0;
   419		}
   420	
   421	full_check:
   422		return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
   423	}
   424	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 26118 bytes --]

^ permalink raw reply

* [Patch net-next] net_sched: use idr to allocate bpf filter handles
From: Cong Wang @ 2017-09-21 23:43 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Daniel Borkmann, Chris Mi, Jamal Hadi Salim

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Chris Mi <chrism@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/sched/cls_bpf.c | 57 ++++++++++++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 520c5027646a..4fd352a1778e 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/idr.h>
 
 #include <net/rtnetlink.h>
 #include <net/pkt_cls.h>
@@ -32,7 +33,7 @@ MODULE_DESCRIPTION("TC BPF based classifier");
 
 struct cls_bpf_head {
 	struct list_head plist;
-	u32 hgen;
+	struct idr handle_idr;
 	struct rcu_head rcu;
 };
 
@@ -238,6 +239,7 @@ static int cls_bpf_init(struct tcf_proto *tp)
 		return -ENOBUFS;
 
 	INIT_LIST_HEAD_RCU(&head->plist);
+	idr_init(&head->handle_idr);
 	rcu_assign_pointer(tp->root, head);
 
 	return 0;
@@ -264,6 +266,9 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
 
 static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog)
 {
+	struct cls_bpf_head *head = rtnl_dereference(tp->root);
+
+	idr_remove_ext(&head->handle_idr, prog->handle);
 	cls_bpf_stop_offload(tp, prog);
 	list_del_rcu(&prog->link);
 	tcf_unbind_filter(tp, &prog->res);
@@ -287,6 +292,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp)
 	list_for_each_entry_safe(prog, tmp, &head->plist, link)
 		__cls_bpf_delete(tp, prog);
 
+	idr_destroy(&head->handle_idr);
 	kfree_rcu(head, rcu);
 }
 
@@ -421,27 +427,6 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 	return 0;
 }
 
-static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
-				   struct cls_bpf_head *head)
-{
-	unsigned int i = 0x80000000;
-	u32 handle;
-
-	do {
-		if (++head->hgen == 0x7FFFFFFF)
-			head->hgen = 1;
-	} while (--i > 0 && cls_bpf_get(tp, head->hgen));
-
-	if (unlikely(i == 0)) {
-		pr_err("Insufficient number of handles\n");
-		handle = 0;
-	} else {
-		handle = head->hgen;
-	}
-
-	return handle;
-}
-
 static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 			  struct tcf_proto *tp, unsigned long base,
 			  u32 handle, struct nlattr **tca,
@@ -451,6 +436,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_bpf_prog *oldprog = *arg;
 	struct nlattr *tb[TCA_BPF_MAX + 1];
 	struct cls_bpf_prog *prog;
+	unsigned long idr_index;
 	int ret;
 
 	if (tca[TCA_OPTIONS] == NULL)
@@ -476,21 +462,30 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		}
 	}
 
-	if (handle == 0)
-		prog->handle = cls_bpf_grab_new_handle(tp, head);
-	else
+	if (handle == 0) {
+		ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
+				    1, 0x80000000, GFP_KERNEL);
+		if (ret)
+			goto errout;
+		prog->handle = idr_index;
+	} else {
+		if (!oldprog) {
+			ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index,
+					    handle, handle + 1, GFP_KERNEL);
+			if (ret)
+				goto errout;
+		}
 		prog->handle = handle;
-	if (prog->handle == 0) {
-		ret = -EINVAL;
-		goto errout;
 	}
 
 	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
 	if (ret < 0)
-		goto errout;
+		goto errout_idr;
 
 	ret = cls_bpf_offload(tp, prog, oldprog);
 	if (ret) {
+		if (!oldprog)
+			idr_remove_ext(&head->handle_idr, prog->handle);
 		__cls_bpf_delete_prog(prog);
 		return ret;
 	}
@@ -499,6 +494,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
 
 	if (oldprog) {
+		idr_replace_ext(&head->handle_idr, prog, handle);
 		list_replace_rcu(&oldprog->link, &prog->link);
 		tcf_unbind_filter(tp, &oldprog->res);
 		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
@@ -509,6 +505,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 	*arg = prog;
 	return 0;
 
+errout_idr:
+	if (!oldprog)
+		idr_remove_ext(&head->handle_idr, prog->handle);
 errout:
 	tcf_exts_destroy(&prog->exts);
 	kfree(prog);
-- 
2.13.0

^ permalink raw reply related

* [Patch net-next] net_sched: use idr to allocate basic filter handles
From: Cong Wang @ 2017-09-21 23:43 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Chris Mi, Jamal Hadi Salim

Cc: Chris Mi <chrism@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/sched/cls_basic.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index d89ebafd2239..29da19eb6447 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -17,13 +17,14 @@
 #include <linux/errno.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff.h>
+#include <linux/idr.h>
 #include <net/netlink.h>
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 
 struct basic_head {
-	u32			hgenerator;
 	struct list_head	flist;
+	struct idr		handle_idr;
 	struct rcu_head		rcu;
 };
 
@@ -78,6 +79,7 @@ static int basic_init(struct tcf_proto *tp)
 	if (head == NULL)
 		return -ENOBUFS;
 	INIT_LIST_HEAD(&head->flist);
+	idr_init(&head->handle_idr);
 	rcu_assign_pointer(tp->root, head);
 	return 0;
 }
@@ -99,8 +101,10 @@ static void basic_destroy(struct tcf_proto *tp)
 	list_for_each_entry_safe(f, n, &head->flist, link) {
 		list_del_rcu(&f->link);
 		tcf_unbind_filter(tp, &f->res);
+		idr_remove_ext(&head->handle_idr, f->handle);
 		call_rcu(&f->rcu, basic_delete_filter);
 	}
+	idr_destroy(&head->handle_idr);
 	kfree_rcu(head, rcu);
 }
 
@@ -111,6 +115,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last)
 
 	list_del_rcu(&f->link);
 	tcf_unbind_filter(tp, &f->res);
+	idr_remove_ext(&head->handle_idr, f->handle);
 	call_rcu(&f->rcu, basic_delete_filter);
 	*last = list_empty(&head->flist);
 	return 0;
@@ -154,6 +159,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 	struct nlattr *tb[TCA_BASIC_MAX + 1];
 	struct basic_filter *fold = (struct basic_filter *) *arg;
 	struct basic_filter *fnew;
+	unsigned long idr_index;
 
 	if (tca[TCA_OPTIONS] == NULL)
 		return -EINVAL;
@@ -179,30 +185,31 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 	err = -EINVAL;
 	if (handle) {
 		fnew->handle = handle;
-	} else if (fold) {
-		fnew->handle = fold->handle;
+		if (!fold) {
+			err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
+					    handle, handle + 1, GFP_KERNEL);
+			if (err)
+				goto errout;
+		}
 	} else {
-		unsigned int i = 0x80000000;
-		do {
-			if (++head->hgenerator == 0x7FFFFFFF)
-				head->hgenerator = 1;
-		} while (--i > 0 && basic_get(tp, head->hgenerator));
-
-		if (i <= 0) {
-			pr_err("Insufficient number of handles\n");
+		err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index,
+				    1, 0x80000000, GFP_KERNEL);
+		if (err)
 			goto errout;
-		}
-
-		fnew->handle = head->hgenerator;
+		fnew->handle = idr_index;
 	}
 
 	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
-	if (err < 0)
+	if (err < 0) {
+		if (!fold)
+			idr_remove_ext(&head->handle_idr, fnew->handle);
 		goto errout;
+	}
 
 	*arg = fnew;
 
 	if (fold) {
+		idr_replace_ext(&head->handle_idr, fnew, fnew->handle);
 		list_replace_rcu(&fold->link, &fnew->link);
 		tcf_unbind_filter(tp, &fold->res);
 		call_rcu(&fold->rcu, basic_delete_filter);
-- 
2.13.0

^ permalink raw reply related

* [Patch net-next] net_sched: use idr to allocate u32 filter handles
From: Cong Wang @ 2017-09-21 23:43 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, Chris Mi, Jamal Hadi Salim

Cc: Chris Mi <chrism@mellanox.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/sched/cls_u32.c | 108 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 67 insertions(+), 41 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 10b8d851fc6b..316b8a791b13 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -46,6 +46,7 @@
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 #include <linux/netdevice.h>
+#include <linux/idr.h>
 
 struct tc_u_knode {
 	struct tc_u_knode __rcu	*next;
@@ -82,6 +83,7 @@ struct tc_u_hnode {
 	struct tc_u_common	*tp_c;
 	int			refcnt;
 	unsigned int		divisor;
+	struct idr		handle_idr;
 	struct rcu_head		rcu;
 	/* The 'ht' field MUST be the last field in structure to allow for
 	 * more entries allocated at end of structure.
@@ -93,7 +95,7 @@ struct tc_u_common {
 	struct tc_u_hnode __rcu	*hlist;
 	struct Qdisc		*q;
 	int			refcnt;
-	u32			hgenerator;
+	struct idr		handle_idr;
 	struct hlist_node	hnode;
 	struct rcu_head		rcu;
 };
@@ -311,19 +313,19 @@ static void *u32_get(struct tcf_proto *tp, u32 handle)
 	return u32_lookup_key(ht, handle);
 }
 
-static u32 gen_new_htid(struct tc_u_common *tp_c)
+static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr)
 {
-	int i = 0x800;
+	unsigned long idr_index;
+	int err;
 
-	/* hgenerator only used inside rtnl lock it is safe to increment
+	/* This is only used inside rtnl lock it is safe to increment
 	 * without read _copy_ update semantics
 	 */
-	do {
-		if (++tp_c->hgenerator == 0x7FF)
-			tp_c->hgenerator = 1;
-	} while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
-
-	return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
+	err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index,
+			    1, 0x800, GFP_KERNEL);
+	if (err)
+		return 0;
+	return (u32)(idr_index | 0x800) << 20;
 }
 
 static struct hlist_head *tc_u_common_hash;
@@ -366,8 +368,9 @@ static int u32_init(struct tcf_proto *tp)
 		return -ENOBUFS;
 
 	root_ht->refcnt++;
-	root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000;
+	root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
 	root_ht->prio = tp->prio;
+	idr_init(&root_ht->handle_idr);
 
 	if (tp_c == NULL) {
 		tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL);
@@ -377,6 +380,7 @@ static int u32_init(struct tcf_proto *tp)
 		}
 		tp_c->q = tp->q;
 		INIT_HLIST_NODE(&tp_c->hnode);
+		idr_init(&tp_c->handle_idr);
 
 		h = tc_u_hash(tp);
 		hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
@@ -565,6 +569,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 					 rtnl_dereference(n->next));
 			tcf_unbind_filter(tp, &n->res);
 			u32_remove_hw_knode(tp, n->handle);
+			idr_remove_ext(&ht->handle_idr, n->handle);
 			call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
 		}
 	}
@@ -586,6 +591,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 	     hn = &phn->next, phn = rtnl_dereference(*hn)) {
 		if (phn == ht) {
 			u32_clear_hw_hnode(tp, ht);
+			idr_destroy(&ht->handle_idr);
+			idr_remove_ext(&tp_c->handle_idr, ht->handle);
 			RCU_INIT_POINTER(*hn, ht->next);
 			kfree_rcu(ht, rcu);
 			return 0;
@@ -633,6 +640,7 @@ static void u32_destroy(struct tcf_proto *tp)
 			kfree_rcu(ht, rcu);
 		}
 
+		idr_destroy(&tp_c->handle_idr);
 		kfree(tp_c);
 	}
 
@@ -701,27 +709,21 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last)
 	return ret;
 }
 
-#define NR_U32_NODE (1<<12)
-static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
+static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid)
 {
-	struct tc_u_knode *n;
-	unsigned long i;
-	unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long),
-					GFP_KERNEL);
-	if (!bitmap)
-		return handle | 0xFFF;
-
-	for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
-	     n;
-	     n = rtnl_dereference(n->next))
-		set_bit(TC_U32_NODE(n->handle), bitmap);
-
-	i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800);
-	if (i >= NR_U32_NODE)
-		i = find_next_zero_bit(bitmap, NR_U32_NODE, 1);
+	unsigned long idr_index;
+	u32 start = htid | 0x800;
+	u32 max = htid | 0xFFF;
+	u32 min = htid;
+
+	if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index,
+			  start, max + 1, GFP_KERNEL)) {
+		if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index,
+				  min + 1, max + 1, GFP_KERNEL))
+			return max;
+	}
 
-	kfree(bitmap);
-	return handle | (i >= NR_U32_NODE ? 0xFFF : i);
+	return (u32)idr_index;
 }
 
 static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
@@ -806,6 +808,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
 		if (pins->handle == n->handle)
 			break;
 
+	idr_replace_ext(&ht->handle_idr, n, n->handle);
 	RCU_INIT_POINTER(n->next, pins->next);
 	rcu_assign_pointer(*ins, n);
 }
@@ -937,22 +940,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return -EINVAL;
 		if (TC_U32_KEY(handle))
 			return -EINVAL;
-		if (handle == 0) {
-			handle = gen_new_htid(tp->data);
-			if (handle == 0)
-				return -ENOMEM;
-		}
 		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
 		if (ht == NULL)
 			return -ENOBUFS;
+		if (handle == 0) {
+			handle = gen_new_htid(tp->data, ht);
+			if (handle == 0) {
+				kfree(ht);
+				return -ENOMEM;
+			}
+		} else {
+			err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL,
+					    handle, handle + 1, GFP_KERNEL);
+			if (err) {
+				kfree(ht);
+				return err;
+			}
+		}
 		ht->tp_c = tp_c;
 		ht->refcnt = 1;
 		ht->divisor = divisor;
 		ht->handle = handle;
 		ht->prio = tp->prio;
+		idr_init(&ht->handle_idr);
 
 		err = u32_replace_hw_hnode(tp, ht, flags);
 		if (err) {
+			idr_remove_ext(&tp_c->handle_idr, handle);
 			kfree(ht);
 			return err;
 		}
@@ -986,24 +1000,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
 			return -EINVAL;
 		handle = htid | TC_U32_NODE(handle);
+		err = idr_alloc_ext(&ht->handle_idr, NULL, NULL,
+				    handle, handle + 1,
+				    GFP_KERNEL);
+		if (err)
+			return err;
 	} else
 		handle = gen_new_kid(ht, htid);
 
-	if (tb[TCA_U32_SEL] == NULL)
-		return -EINVAL;
+	if (tb[TCA_U32_SEL] == NULL) {
+		err = -EINVAL;
+		goto erridr;
+	}
 
 	s = nla_data(tb[TCA_U32_SEL]);
 
 	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
-	if (n == NULL)
-		return -ENOBUFS;
+	if (n == NULL) {
+		err = -ENOBUFS;
+		goto erridr;
+	}
 
 #ifdef CONFIG_CLS_U32_PERF
 	size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64);
 	n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt));
 	if (!n->pf) {
-		kfree(n);
-		return -ENOBUFS;
+		err = -ENOBUFS;
+		goto errfree;
 	}
 #endif
 
@@ -1066,9 +1089,12 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 errout:
 	tcf_exts_destroy(&n->exts);
 #ifdef CONFIG_CLS_U32_PERF
+errfree:
 	free_percpu(n->pf);
 #endif
 	kfree(n);
+erridr:
+	idr_remove_ext(&ht->handle_idr, handle);
 	return err;
 }
 
-- 
2.13.0

^ permalink raw reply related

* [PATCH net] net: qualcomm: rmnet: Fix rcu splat in rmnet_is_real_dev_registered
From: Subash Abhinov Kasiviswanathan @ 2017-09-22  0:00 UTC (permalink / raw)
  To: xiaolong.ye, davem, netdev; +Cc: Subash Abhinov Kasiviswanathan

Xiaolong reported a suspicious rcu_dereference_check in the device
unregister notifier callback. Since we do not dereference the
rx_handler_data, it's ok to just check for the value of the pointer.
Note that this section is already protected by rtnl_lock.

[  101.364846] WARNING: suspicious RCU usage
[  101.365654] 4.13.0-rc6-01701-gceed73a #1 Not tainted
[  101.370873] -----------------------------
[  101.372472] drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c:57 suspicious rcu_dereference_check() usage!
[  101.374427]
[  101.374427] other info that might help us debug this:
[  101.374427]
[  101.387491]
[  101.387491] rcu_scheduler_active = 2, debug_locks = 1
[  101.389368] 1 lock held by trinity-main/2809:
[  101.390736]  #0:  (rtnl_mutex){+.+.+.}, at: [<8146085b>] rtnl_lock+0xf/0x11
[  101.395482]
[  101.395482] stack backtrace:
[  101.396948] CPU: 0 PID: 2809 Comm: trinity-main Not tainted 4.13.0-rc6-01701-gceed73a #1
[  101.398857] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-20161025_171302-gandalf 04/01/2014
[  101.401079] Call Trace:
[  101.401656]  dump_stack+0xa1/0xeb
[  101.402871]  lockdep_rcu_suspicious+0xc7/0xd0
[  101.403665]  rmnet_is_real_dev_registered+0x40/0x4e
[  101.405199]  rmnet_config_notify_cb+0x2c/0x142
[  101.406344]  ? wireless_nlevent_flush+0x47/0x71
[  101.407385]  notifier_call_chain+0x2d/0x47
[  101.408645]  raw_notifier_call_chain+0xc/0xe
[  101.409882]  call_netdevice_notifiers_info+0x41/0x49
[  101.411402]  call_netdevice_notifiers+0xc/0xe
[  101.412713]  rollback_registered_many+0x268/0x36e
[  101.413702]  rollback_registered+0x39/0x56
[  101.414965]  unregister_netdevice_queue+0x79/0x88
[  101.415908]  unregister_netdev+0x16/0x1d

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation")
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Reported-by: kernel test robot <xiaolong.ye@intel.com>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 98f2255..1e33aea 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -51,10 +51,7 @@ struct rmnet_walk_data {
 
 static int rmnet_is_real_dev_registered(const struct net_device *real_dev)
 {
-	rx_handler_func_t *rx_handler;
-
-	rx_handler = rcu_dereference(real_dev->rx_handler);
-	return (rx_handler == rmnet_rx_handler);
+	return rcu_access_pointer(real_dev->rx_handler) == rmnet_rx_handler;
 }
 
 /* Needs rtnl lock */
-- 
1.9.1

^ permalink raw reply related

* Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on
From: Eric Dumazet @ 2017-09-22  0:37 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Paweł Staszewski, Paolo Abeni, Jesper Dangaard Brouer,
	Linux Kernel Network Developers, Alexander Duyck
In-Reply-To: <7dbc2dd1-eec9-0bd3-7255-4fe6026847aa@gmail.com>

On Thu, 2017-09-21 at 15:07 -0700, Florian Fainelli wrote:
> On 09/21/2017 02:54 PM, Eric Dumazet wrote:
> > On Thu, 2017-09-21 at 14:41 -0700, Florian Fainelli wrote:
> > 
> >> Would not this apply to pretty much any stacked device setup though? It
> >> seems like any network device that just queues up its packet on another
> >> physical device for actual transmission may need that (e.g: DSA, bond,
> >> team, more.?)
> > 
> > We support bonding and team already.
> 
> Right, so that seems to mostly leave us with DSA at least. What about
> other devices that also have IFF_NO_QUEUE set?

It wont work.

loopback has IFF_NO_QUEUE, but you need to keep dst on skbs...

^ permalink raw reply

* Re: [PATCH net] net/ncsi: Don't assume last available channel exists
From: Samuel Mendoza-Jonas @ 2017-09-22  1:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20170920.160519.764603868579556859.davem@davemloft.net>

On Wed, 2017-09-20 at 16:05 -0700, David Miller wrote:
> From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> Date: Wed, 20 Sep 2017 14:12:51 +1000
> 
> > When handling new VLAN tags in NCSI we check the maximum allowed number
> > of filters on the last active ("hot") channel. However if the 'add'
> > callback is called before NCSI has configured a channel, this causes a
> > NULL dereference.
> > 
> > Check that we actually have a hot channel, and warn if it is missing.
> > 
> > Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> 
> Well, a few things.
> 
> We should not allow this driver method to be invoked in the first
> place if the device is not in a state where the operation is legal
> yet.
> 
> Second of all, if !ndp is true, you should not return 0 to indicate
> success.
> 
> But more fundamentally, we should block this method from being
> callable unless the device is in the proper state and can complete the
> operation.
> 
> Seriously, these checks should be completely unnecessary if those
> issues are handled properly.

Good point, this made me step back and reconsider the problem here.

The ncsi_vlan_rx_add_vid() callback exists because the NCSI driver needs
to know which VLAN IDs are set, but we don't have a straightforward way
of accessing that information later in ncsi_configure_channel() - as
opposed to the MAC address for example which is just accessed via
dev->dev_addr. Instead they're kept track of in the ndp->vlan_vids list
and the NCSI driver reconfigures any channels it knows about.

So in that sense the NCSI device *is* ready to handle the operation. The
hot_channel fix here was to check if we were exceeding the maximum number
of VLAN filters supported by the remote channel. That itself is bit
debatable since different channels could support different numbers of
filters but right now the NCSI driver only supports one active channel at
a time.

If we haven't configured a channel yet (or are in the process of doing
so) we won't have a hot_channel - does it make more sense to
- check against the hot_channel as currently done,
- only check the filter size at configure time for /each/ channel,
- only conditionally enable the .ndo_vlan_rx_add_vid net_device callback
once we've configured a channel (eg. for ftgmac100 in the
ftgmac100_ncsi_handler() callback?)

Thanks,
Sam

^ permalink raw reply

* linux-next: build failure after merge of the net-next tree
From: Stephen Rothwell @ 2017-09-22  1:03 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Paolo Abeni

Hi all,

After merging the net-next tree, today's linux-next build (arm
multi_v7_defconfig) failed like this:

net/ipv4/fib_frontend.c: In function 'fib_validate_source':
net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
   if (net->ipv4.fib_has_custom_local_routes)
                ^
net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute':
net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
   net->ipv4.fib_has_custom_local_routes = true;
            ^

Caused by commit

  6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.")

This build has CONFIG_IP_MULTIPLE_TABLES unset.

I have reverted that commit for today.

-- 
Cheers,
Stephen Rothwell

^ permalink raw reply

* Re: [PATCH net] net/ncsi: Don't assume last available channel exists
From: David Miller @ 2017-09-22  1:11 UTC (permalink / raw)
  To: sam; +Cc: netdev, linux-kernel
In-Reply-To: <1506042000.1377.21.camel@mendozajonas.com>

From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Fri, 22 Sep 2017 11:00:00 +1000

> If we haven't configured a channel yet (or are in the process of doing
> so) we won't have a hot_channel - does it make more sense to
> - check against the hot_channel as currently done,
> - only check the filter size at configure time for /each/ channel,
> - only conditionally enable the .ndo_vlan_rx_add_vid net_device callback
> once we've configured a channel (eg. for ftgmac100 in the
> ftgmac100_ncsi_handler() callback?)

The last isn't so feasible.

The device shouldn't be marked attached until a channel is available,
because it seems like communication cannot occur until one is.  Right?

You could experiment with netif_device_detach()/netif_device_attach().

When the device is in the detached state, callbacks such as
->ndo_vlan_rx_add_vid() will not be invoked.

^ permalink raw reply

* (unknown), 
From: unsubscribe.me @ 2017-09-22  1:22 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: 7172596099.doc --]
[-- Type: application/msword, Size: 55811 bytes --]

^ permalink raw reply

* Re: linux-next: build failure after merge of the net-next tree
From: David Miller @ 2017-09-22  1:37 UTC (permalink / raw)
  To: sfr; +Cc: netdev, linux-next, linux-kernel, pabeni
In-Reply-To: <20170922110355.6a631fc5@canb.auug.org.au>

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 22 Sep 2017 11:03:55 +1000

> After merging the net-next tree, today's linux-next build (arm
> multi_v7_defconfig) failed like this:
> 
> net/ipv4/fib_frontend.c: In function 'fib_validate_source':
> net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
>    if (net->ipv4.fib_has_custom_local_routes)
>                 ^
> net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute':
> net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
>    net->ipv4.fib_has_custom_local_routes = true;
>             ^
> 
> Caused by commit
> 
>   6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.")

Paolo, it seems this struct member should be placed outside of
IP_MULTIPLE_TABLES protection, since users can insert custom
local routes even without that set.

So I'm installing the following fix for this:

====================
[PATCH] ipv4: Move fib_has_custom_local_routes outside of IP_MULTIPLE_TABLES.

> net/ipv4/fib_frontend.c: In function 'fib_validate_source':
> net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
>    if (net->ipv4.fib_has_custom_local_routes)
>                 ^
> net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute':
> net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes'
>    net->ipv4.fib_has_custom_local_routes = true;
>             ^

Fixes: 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 20720721da4b..8387f099115e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -49,10 +49,10 @@ struct netns_ipv4 {
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rules_ops	*rules_ops;
 	bool			fib_has_custom_rules;
-	bool			fib_has_custom_local_routes;
 	struct fib_table __rcu	*fib_main;
 	struct fib_table __rcu	*fib_default;
 #endif
+	bool			fib_has_custom_local_routes;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	int			fib_num_tclassid_users;
 #endif
-- 
2.13.5

^ permalink raw reply related

* Re: [PATCH net-next 01/10] net: hns3: Support for dynamically assigning tx buffer to TC
From: David Miller @ 2017-09-22  1:41 UTC (permalink / raw)
  To: linyunsheng
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, salil.mehta, lipeng321, netdev,
	linux-kernel
In-Reply-To: <1505992913-107256-2-git-send-email-linyunsheng@huawei.com>

From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Thu, 21 Sep 2017 19:21:44 +0800

> @@ -1324,23 +1324,28 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
>  	return 0;
>  }
>  
> -static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
> +static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
>  {
>  /* TX buffer size is unit by 128 byte */
>  #define HCLGE_BUF_SIZE_UNIT_SHIFT	7
>  #define HCLGE_BUF_SIZE_UPDATE_EN_MSK	BIT(15)
>  	struct hclge_tx_buff_alloc *req;
> +	struct hclge_priv_buf *priv;
>  	struct hclge_desc desc;
> +	u32 buf_size;
>  	int ret;
>  	u8 i;
>  
>  	req = (struct hclge_tx_buff_alloc *)desc.data;
>  
>  	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
> -	for (i = 0; i < HCLGE_TC_NUM; i++)
> +	for (i = 0; i < HCLGE_TC_NUM; i++) {
> +		priv = &hdev->priv_buf[i];
> +		buf_size = priv->tx_buf_size;
>  		req->tx_pkt_buff[i] =
>  			cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) |
>  				     HCLGE_BUF_SIZE_UPDATE_EN_MSK);
> +	}
>  
>  	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
>  	if (ret) {

Local variable 'buf_size' is assigned but never used in this function.
And with 'buf_size' removed, 'priv' also becomes unused.

If it gets used in a later patch, add it in that later patch.

You can also declare the variables locally in the basic block of
the for() loop.

^ permalink raw reply

* Re: [v2,3/3] can: m_can: Add PM Runtime
From: Yang, Wenyou @ 2017-09-22  1:54 UTC (permalink / raw)
  To: Franklin S Cooper Jr, Sekhar Nori, linux-kernel, devicetree,
	netdev, linux-can, wg, mkl, robh+dt, quentin.schulz
  Cc: Linux OMAP List, Alexandre Belloni, Wenyou Yang,
	Mario Hüttel
In-Reply-To: <bc29f9d2-db3a-1757-41d8-f19eda10a6f1@ti.com>

Hi Franklin,


On 2017/9/21 8:36, Franklin S Cooper Jr wrote:
> On 08/24/2017 03:30 AM, Sekhar Nori wrote:
>> + OMAP mailing list
>>
>> On Tuesday 25 July 2017 04:21 AM, Franklin Cooper wrote:
>>> Add support for PM Runtime which is the new way to handle managing clocks.
>>> However, to avoid breaking SoCs not using PM_RUNTIME leave the old clk
>>> management approach in place.
>> Since hclk is the interface clock, I think at a minimum there needs to
>> be an assumption that pm_runtime_get_sync() will enable that clock and
>> make the module ready for access.
>>
>> The only in-kernel user of this driver seems to be an atmel SoC. I am
>> ccing folks who added support for M_CAN on that SoC to see if hclk
>> management can be left to pm_runtime*() on their SoC.
>>
>> If they are okay, I think its a good to atleast drop explicit hclk
>> enable disable in the driver. That way, we avoid double enable/disable
>> of interface clock (hclk).
> Wenyou,
>
> Do you foresee this being a problem on your board? If not I can send a
> v3 removing these clk_enable and clk_disable calls and it would be great
> if you can test it out and provide your tested by.
Please send a v3 patch.  I would like test it on my side.

>
>>> PM_RUNTIME is required by OMAP based devices to handle clock management.
>>> Therefore, this allows future Texas Instruments SoCs that have the MCAN IP
>>> to work with this driver.
>>>
>>> Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
>> Thanks,
>> Sekhar
>>
>>> ---
>>>   drivers/net/can/m_can/m_can.c | 13 +++++++++++++
>>>   1 file changed, 13 insertions(+)
>>>
>>> diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
>>> index ea48e59..93e23f5 100644
>>> --- a/drivers/net/can/m_can/m_can.c
>>> +++ b/drivers/net/can/m_can/m_can.c
>>> @@ -23,6 +23,7 @@
>>>   #include <linux/of.h>
>>>   #include <linux/of_device.h>
>>>   #include <linux/platform_device.h>
>>> +#include <linux/pm_runtime.h>
>>>   #include <linux/iopoll.h>
>>>   #include <linux/can/dev.h>
>>>   
>>> @@ -633,11 +634,15 @@ static int m_can_clk_start(struct m_can_priv *priv)
>>>   	if (err)
>>>   		clk_disable_unprepare(priv->hclk);
>>>   
>>> +	pm_runtime_get_sync(priv->device);
>>> +
>>>   	return err;
>>>   }
>>>   
>>>   static void m_can_clk_stop(struct m_can_priv *priv)
>>>   {
>>> +	pm_runtime_put_sync(priv->device);
>>> +
>>>   	clk_disable_unprepare(priv->cclk);
>>>   	clk_disable_unprepare(priv->hclk);
>>>   }
>>> @@ -1582,6 +1587,8 @@ static int m_can_plat_probe(struct platform_device *pdev)
>>>   	/* Enable clocks. Necessary to read Core Release in order to determine
>>>   	 * M_CAN version
>>>   	 */
>>> +	pm_runtime_enable(&pdev->dev);
>>> +
>>>   	ret = clk_prepare_enable(hclk);
>>>   	if (ret)
>>>   		goto disable_hclk_ret;
>>> @@ -1626,6 +1633,8 @@ static int m_can_plat_probe(struct platform_device *pdev)
>>>   	 */
>>>   	tx_fifo_size = mram_config_vals[7];
>>>   
>>> +	pm_runtime_get_sync(&pdev->dev);
>>> +
>>>   	/* allocate the m_can device */
>>>   	dev = alloc_m_can_dev(pdev, addr, tx_fifo_size);
>>>   	if (!dev) {
>>> @@ -1670,6 +1679,7 @@ static int m_can_plat_probe(struct platform_device *pdev)
>>>   disable_hclk_ret:
>>>   	clk_disable_unprepare(hclk);
>>>   failed_ret:
>>> +	pm_runtime_put_sync(&pdev->dev);
>>>   	return ret;
>>>   }
>>>   
>>> @@ -1726,6 +1736,9 @@ static int m_can_plat_remove(struct platform_device *pdev)
>>>   	struct net_device *dev = platform_get_drvdata(pdev);
>>>   
>>>   	unregister_m_can_dev(dev);
>>> +
>>> +	pm_runtime_disable(&pdev->dev);
>>> +
>>>   	platform_set_drvdata(pdev, NULL);
>>>   
>>>   	free_m_can_dev(dev);
>>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-can" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Best Regards,
Wenyou Yang

^ permalink raw reply

* Re: [PATCH net-next 01/10] net: hns3: Support for dynamically assigning tx buffer to TC
From: Yunsheng Lin @ 2017-09-22  1:57 UTC (permalink / raw)
  To: David Miller
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, salil.mehta, lipeng321, netdev,
	linux-kernel
In-Reply-To: <20170921.184102.278153022575280817.davem@davemloft.net>

Hi, David

On 2017/9/22 9:41, David Miller wrote:
> From: Yunsheng Lin <linyunsheng@huawei.com>
> Date: Thu, 21 Sep 2017 19:21:44 +0800
> 
>> @@ -1324,23 +1324,28 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
>>  	return 0;
>>  }
>>  
>> -static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
>> +static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
>>  {
>>  /* TX buffer size is unit by 128 byte */
>>  #define HCLGE_BUF_SIZE_UNIT_SHIFT	7
>>  #define HCLGE_BUF_SIZE_UPDATE_EN_MSK	BIT(15)
>>  	struct hclge_tx_buff_alloc *req;
>> +	struct hclge_priv_buf *priv;
>>  	struct hclge_desc desc;
>> +	u32 buf_size;
>>  	int ret;
>>  	u8 i;
>>  
>>  	req = (struct hclge_tx_buff_alloc *)desc.data;
>>  
>>  	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
>> -	for (i = 0; i < HCLGE_TC_NUM; i++)
>> +	for (i = 0; i < HCLGE_TC_NUM; i++) {
>> +		priv = &hdev->priv_buf[i];
>> +		buf_size = priv->tx_buf_size;
>>  		req->tx_pkt_buff[i] =
>>  			cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) |

buf_size is used here

>>  				     HCLGE_BUF_SIZE_UPDATE_EN_MSK);
>> +	}
>>  
>>  	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
>>  	if (ret) {
> 
> Local variable 'buf_size' is assigned but never used in this function.
> And with 'buf_size' removed, 'priv' also becomes unused.

> 
> If it gets used in a later patch, add it in that later patch.
> 
> You can also declare the variables locally in the basic block of
> the for() loop.

You are right. Will do it if there is more comment coming, thanks for
reviewing .

> 
> .
> 

^ permalink raw reply

* [PATCH,v2,net-next 0/2] Improve code coverage of syzkaller
From: Petar Penkov @ 2017-09-22  2:17 UTC (permalink / raw)
  To: netdev; +Cc: Petar Penkov

This patch series is intended to improve code coverage of syzkaller on
the early receive path, specifically including flow dissector, GRO,
and GRO with frags parts of the networking stack. Syzkaller exercises
the stack through the TUN driver and this is therefore where changes
reside. Current coverage through netif_receive_skb() is limited as it
does not touch on any of the aforementioned code paths. Furthermore,
for full coverage, it is necessary to have more flexibility over the
linear and non-linear data of the skbs.

The following patches address this by providing the user(syzkaller)
with the ability to send via napi_gro_receive() and napi_gro_frags().
Additionally, syzkaller can specify how many fragments there are and
how much data per fragment there is. This is done by exploiting the
convenient structure of iovecs. Finally, this patch series adds
support for exercising the flow dissector during fuzzing.

The code path including napi_gro_receive() can be enabled via the
IFF_NAPI flag.  The remainder of the changes in this patch series give
the user significantly more control over packets entering the kernel.
To avoid potential security vulnerabilities, hide the ability to send
custom skbs and the flow dissector code paths behind a
capable(CAP_NET_ADMIN) check to require special user privileges.

Changes since v1:
Patch 1/ Remove compile-time CONFIG_TUN_NAPI flag and replace it by a
	 run-time IFF_NAPI flag that can be set via TUN_SET_IFF.
Patch 2/ Push IFF_NAPI_FRAGS to 0x0020 to accommodate for IFF_NAPI,
	 Require capable(CAP_NET_ADMIN) to set IFF_NAPI_FRAGS.

Petar Penkov (2):
  tun: enable NAPI for TUN/TAP driver
  tun: enable napi_gro_frags() for TUN/TAP driver

 drivers/net/tun.c           | 258 ++++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/if_tun.h |   2 +
 2 files changed, 242 insertions(+), 18 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH,v2,net-next 1/2] tun: enable NAPI for TUN/TAP driver
From: Petar Penkov @ 2017-09-22  2:17 UTC (permalink / raw)
  To: netdev
  Cc: Petar Penkov, Eric Dumazet, Mahesh Bandewar, Willem de Bruijn,
	davem, ppenkov
In-Reply-To: <20170922021715.2618-1-peterpenkov96@gmail.com>

Changes TUN driver to use napi_gro_receive() upon receiving packets
rather than netif_rx_ni(). Adds flag IFF_NAPI that enables these
changes and operation is not affected if the flag is disabled.  SKBs
are constructed upon packet arrival and are queued to be processed
later.

The new path was evaluated with a benchmark with the following setup:
Open two tap devices and a receiver thread that reads in a loop for
each device. Start one sender thread and pin all threads to different
CPUs. Send 1M minimum UDP packets to each device and measure sending
time for each of the sending methods:
	napi_gro_receive():	4.90s
	netif_rx_ni():		4.90s
	netif_receive_skb():	7.20s

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
---
 drivers/net/tun.c           | 133 +++++++++++++++++++++++++++++++++++++++-----
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3c9985f29950..f16407242b18 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -121,7 +121,7 @@ do {								\
 #define TUN_VNET_BE     0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE)
+		      IFF_MULTI_QUEUE | IFF_NAPI)
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -172,6 +172,7 @@ struct tun_file {
 		u16 queue_index;
 		unsigned int ifindex;
 	};
+	struct napi_struct napi;
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -229,6 +230,68 @@ struct tun_struct {
 	struct bpf_prog __rcu *xdp_prog;
 };
 
+static int tun_napi_receive(struct napi_struct *napi, int budget)
+{
+	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
+	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+	struct sk_buff_head process_queue;
+	struct sk_buff *skb;
+	int received = 0;
+
+	__skb_queue_head_init(&process_queue);
+
+	spin_lock(&queue->lock);
+	skb_queue_splice_tail_init(queue, &process_queue);
+	spin_unlock(&queue->lock);
+
+	while (received < budget && (skb = __skb_dequeue(&process_queue))) {
+		napi_gro_receive(napi, skb);
+		++received;
+	}
+
+	if (!skb_queue_empty(&process_queue)) {
+		spin_lock(&queue->lock);
+		skb_queue_splice(&process_queue, queue);
+		spin_unlock(&queue->lock);
+	}
+
+	return received;
+}
+
+static int tun_napi_poll(struct napi_struct *napi, int budget)
+{
+	unsigned int received;
+
+	received = tun_napi_receive(napi, budget);
+
+	if (received < budget)
+		napi_complete_done(napi, received);
+
+	return received;
+}
+
+static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
+			  bool napi_en)
+{
+	if (napi_en) {
+		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
+			       NAPI_POLL_WEIGHT);
+		napi_enable(&tfile->napi);
+	}
+}
+
+static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile)
+{
+	if (tun->flags & IFF_NAPI)
+		napi_disable(&tfile->napi);
+}
+
+static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
+{
+	if (tun->flags & IFF_NAPI)
+		netif_napi_del(&tfile->napi);
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -541,6 +604,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 
 	tun = rtnl_dereference(tfile->tun);
 
+	if (tun && clean) {
+		tun_napi_disable(tun, tfile);
+		tun_napi_del(tun, tfile);
+	}
+
 	if (tun && !tfile->detached) {
 		u16 index = tfile->queue_index;
 		BUG_ON(index >= tun->numqueues);
@@ -598,6 +666,7 @@ static void tun_detach_all(struct net_device *dev)
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
 		BUG_ON(!tfile);
+		tun_napi_disable(tun, tfile);
 		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
 		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
 		RCU_INIT_POINTER(tfile->tun, NULL);
@@ -613,6 +682,7 @@ static void tun_detach_all(struct net_device *dev)
 	synchronize_net();
 	for (i = 0; i < n; i++) {
 		tfile = rtnl_dereference(tun->tfiles[i]);
+		tun_napi_del(tun, tfile);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
 		sock_put(&tfile->sk);
@@ -631,7 +701,8 @@ static void tun_detach_all(struct net_device *dev)
 		module_put(THIS_MODULE);
 }
 
-static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
+static int tun_attach(struct tun_struct *tun, struct file *file,
+		      bool skip_filter, bool napi)
 {
 	struct tun_file *tfile = file->private_data;
 	struct net_device *dev = tun->dev;
@@ -677,10 +748,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 	tun->numqueues++;
 
-	if (tfile->detached)
+	if (tfile->detached) {
 		tun_enable_queue(tfile);
-	else
+	} else {
 		sock_hold(&tfile->sk);
+		tun_napi_init(tun, tfile, napi);
+	}
 
 	tun_set_real_num_queues(tun);
 
@@ -956,13 +1029,28 @@ static void tun_poll_controller(struct net_device *dev)
 	 * Tun only receives frames when:
 	 * 1) the char device endpoint gets data from user space
 	 * 2) the tun socket gets a sendmsg call from user space
-	 * Since both of those are synchronous operations, we are guaranteed
-	 * never to have pending data when we poll for it
-	 * so there is nothing to do here but return.
+	 * If NAPI is not enabled, since both of those are synchronous
+	 * operations, we are guaranteed never to have pending data when we poll
+	 * for it so there is nothing to do here but return.
 	 * We need this though so netpoll recognizes us as an interface that
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
+	 * If NAPI is enabled, however, we need to schedule polling for all
+	 * queues.
 	 */
+	struct tun_struct *tun = netdev_priv(dev);
+
+	if (tun->flags & IFF_NAPI) {
+		struct tun_file *tfile;
+		int i;
+
+		rcu_read_lock();
+		for (i = 0; i < tun->numqueues; i++) {
+			tfile = rcu_dereference(tun->tfiles[i]);
+			napi_schedule(&tfile->napi);
+		}
+		rcu_read_unlock();
+	}
 	return;
 }
 #endif
@@ -1549,11 +1637,25 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	}
 
 	rxhash = __skb_get_hash_symmetric(skb);
-#ifndef CONFIG_4KSTACKS
-	tun_rx_batched(tun, tfile, skb, more);
-#else
-	netif_rx_ni(skb);
-#endif
+
+	if (tun->flags & IFF_NAPI) {
+		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
+		int queue_len;
+
+		spin_lock_bh(&queue->lock);
+		__skb_queue_tail(queue, skb);
+		queue_len = skb_queue_len(queue);
+		spin_unlock(&queue->lock);
+
+		if (!more || queue_len > NAPI_POLL_WEIGHT)
+			napi_schedule(&tfile->napi);
+
+		local_bh_enable();
+	} else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
+		tun_rx_batched(tun, tfile, skb, more);
+	} else {
+		netif_rx_ni(skb);
+	}
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -1980,7 +2082,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		if (err < 0)
 			return err;
 
-		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);
+		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
+				 ifr->ifr_flags & IFF_NAPI);
 		if (err < 0)
 			return err;
 
@@ -2066,7 +2169,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 				       NETIF_F_HW_VLAN_STAG_TX);
 
 		INIT_LIST_HEAD(&tun->disabled);
-		err = tun_attach(tun, file, false);
+		err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI);
 		if (err < 0)
 			goto err_free_flow;
 
@@ -2216,7 +2319,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
 		ret = security_tun_dev_attach_queue(tun->security);
 		if (ret < 0)
 			goto unlock;
-		ret = tun_attach(tun, file, false);
+		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI);
 	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
 		tun = rtnl_dereference(tfile->tun);
 		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d85ddd..30b6184884eb 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,7 @@
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
+#define IFF_NAPI	0x0010
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
-- 
2.11.0

^ permalink raw reply related

* [PATCH,v2,net-next 2/2] tun: enable napi_gro_frags() for TUN/TAP driver
From: Petar Penkov @ 2017-09-22  2:17 UTC (permalink / raw)
  To: netdev
  Cc: Petar Penkov, Eric Dumazet, Mahesh Bandewar, Willem de Bruijn,
	davem, ppenkov
In-Reply-To: <20170922021715.2618-1-peterpenkov96@gmail.com>

Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iovec_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities. This flag is accepted only if the
device is in TAP mode and has the IFF_NAPI flag set as well. This is
done because both of these are explicit requirements for correct
operation in this mode.

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
---
 drivers/net/tun.c           | 131 ++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f16407242b18..e398f019c590 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -121,7 +122,8 @@ do {								\
 #define TUN_VNET_BE     0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE | IFF_NAPI)
+		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@ struct tun_file {
 		unsigned int ifindex;
 	};
 	struct napi_struct napi;
+	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 			       NAPI_POLL_WEIGHT);
 		napi_enable(&tfile->napi);
+		mutex_init(&tfile->napi_mutex);
 	}
 }
 
@@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
 		netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev)
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
 	 * If NAPI is enabled, however, we need to schedule polling for all
-	 * queues.
+	 * queues unless we are using napi_gro_frags(), which we call in
+	 * process context and not in NAPI context.
 	 */
 	struct tun_struct *tun = netdev_priv(dev);
 
@@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev)
 		struct tun_file *tfile;
 		int i;
 
+		if (tun_napi_frags_enabled(tun))
+			return;
+
 		rcu_read_lock();
 		for (i = 0; i < tun->numqueues; i++) {
 			tfile = rcu_dereference(tun->tfiles[i]);
@@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+					    size_t len,
+					    const struct iov_iter *it)
+{
+	struct sk_buff *skb;
+	size_t linear;
+	int err;
+	int i;
+
+	if (it->nr_segs > MAX_SKB_FRAGS + 1)
+		return ERR_PTR(-ENOMEM);
+
+	local_bh_disable();
+	skb = napi_get_frags(&tfile->napi);
+	local_bh_enable();
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	linear = iov_iter_single_seg_count(it);
+	err = __skb_grow(skb, linear);
+	if (err)
+		goto free;
+
+	skb->len = len;
+	skb->data_len = len - linear;
+	skb->truesize += skb->data_len;
+
+	for (i = 1; i < it->nr_segs; i++) {
+		size_t fragsz = it->iov[i].iov_len;
+		unsigned long offset;
+		struct page *page;
+		void *data;
+
+		if (fragsz == 0 || fragsz > PAGE_SIZE) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		local_bh_disable();
+		data = napi_alloc_frag(fragsz);
+		local_bh_enable();
+		if (!data) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		page = virt_to_head_page(data);
+		offset = data - page_address(page);
+		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+	}
+
+	return skb;
+free:
+	/* frees skb and all frags allocated with napi_alloc_frag() */
+	napi_free_frags(&tfile->napi);
+	return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int err;
 	u32 rxhash;
 	int skb_xdp = 1;
+	bool frags = tun_napi_frags_enabled(tun);
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 		/* For the packet that is not easy to be processed
 		 * (e.g gso or jumbo packet), we will do it at after
 		 * skb was created with generic XDP routine.
@@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 				linear = tun16_to_cpu(tun, gso.hdr_len);
 		}
 
-		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (frags) {
+			mutex_lock(&tfile->napi_mutex);
+			skb = tun_napi_alloc_frags(tfile, copylen, from);
+			/* tun_napi_alloc_frags() enforces a layout for the skb.
+			 * If zerocopy is enabled, then this layout will be
+			 * overwritten by zerocopy_sg_from_iter().
+			 */
+			zerocopy = false;
+		} else {
+			skb = tun_alloc_skb(tfile, align, copylen, linear,
+					    noblock);
+		}
+
 		if (IS_ERR(skb)) {
 			if (PTR_ERR(skb) != -EAGAIN)
 				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			if (frags)
+				mutex_unlock(&tfile->napi_mutex);
 			return PTR_ERR(skb);
 		}
 
@@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			kfree_skb(skb);
+			if (frags) {
+				tfile->napi.skb = NULL;
+				mutex_unlock(&tfile->napi_mutex);
+			}
+
 			return -EFAULT;
 		}
 	}
@@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
+		if (frags) {
+			tfile->napi.skb = NULL;
+			mutex_unlock(&tfile->napi_mutex);
+		}
+
 		return -EINVAL;
 	}
 
@@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb->dev = tun->dev;
 		break;
 	case IFF_TAP:
-		skb->protocol = eth_type_trans(skb, tun->dev);
+		if (!frags)
+			skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	rxhash = __skb_get_hash_symmetric(skb);
 
-	if (tun->flags & IFF_NAPI) {
+	if (frags) {
+		/* Exercise flow dissector code path. */
+		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+		if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			napi_free_frags(&tfile->napi);
+			mutex_unlock(&tfile->napi_mutex);
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+
+		local_bh_disable();
+		napi_gro_frags(&tfile->napi);
+		local_bh_enable();
+		mutex_unlock(&tfile->napi_mutex);
+	} else if (tun->flags & IFF_NAPI) {
 		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 		int queue_len;
 
@@ -2061,6 +2174,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	if (tfile->detached)
 		return -EINVAL;
 
+	if ((ifr->ifr_flags & IFF_NAPI_FRAGS) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	dev = __dev_get_by_name(net, ifr->ifr_name);
 	if (dev) {
 		if (ifr->ifr_flags & IFF_TUN_EXCL)
@@ -2185,6 +2301,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	tun->flags = (tun->flags & ~TUN_FEATURES) |
 		(ifr->ifr_flags & TUN_FEATURES);
 
+	if (!(tun->flags & IFF_NAPI) || (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+		tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
+
 	/* Make sure persistent devices do not get stuck in
 	 * xoff state.
 	 */
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 30b6184884eb..365ade5685c9 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,7 @@
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
 #define IFF_NAPI	0x0010
+#define IFF_NAPI_FRAGS	0x0020
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next v2] net: Remove useless function skb_header_release
From: gfree.wind @ 2017-09-22  2:25 UTC (permalink / raw)
  To: davem, netdev; +Cc: Gao Feng

From: Gao Feng <gfree.wind@vip.163.com>

There is no one which would invokes the function skb_header_release.
So just remove it now.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
---
 v2: Correct some comments, per Joe
 v1: initial version

 drivers/net/usb/asix_common.c   |  2 +-
 include/linux/skbuff.h          | 19 -------------------
 net/batman-adv/soft-interface.c |  2 +-
 3 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c
index 522d290..f4d7362 100644
--- a/drivers/net/usb/asix_common.c
+++ b/drivers/net/usb/asix_common.c
@@ -245,7 +245,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 	 * - We are allowed to put 4 bytes at tail if skb_cloned()
 	 *   is false (and if we have 4 bytes of tailroom)
 	 *
-	 * TCP packets for example are cloned, but skb_header_release()
+	 * TCP packets for example are cloned, but __skb_header_release()
 	 * was called in tcp stack, allowing us to use headroom for our needs.
 	 */
 	if (!skb_header_cloned(skb) &&
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4928288..f9db553 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1457,27 +1457,8 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
 }
 
 /**
- *	skb_header_release - release reference to header
- *	@skb: buffer to operate on
- *
- *	Drop a reference to the header part of the buffer.  This is done
- *	by acquiring a payload reference.  You must not read from the header
- *	part of skb->data after this.
- *	Note : Check if you can use __skb_header_release() instead.
- */
-static inline void skb_header_release(struct sk_buff *skb)
-{
-	BUG_ON(skb->nohdr);
-	skb->nohdr = 1;
-	atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref);
-}
-
-/**
  *	__skb_header_release - release reference to header
  *	@skb: buffer to operate on
- *
- *	Variant of skb_header_release() assuming skb is private to caller.
- *	We can avoid one atomic operation.
  */
 static inline void __skb_header_release(struct sk_buff *skb)
 {
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 10f7edf..c2c9867 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
 	int result;
 
 	/* TODO: We must check if we can release all references to non-payload
-	 * data using skb_header_release in our skbs to allow skb_cow_header to
+	 * data using __skb_header_release in our skbs to allow skb_cow_header to
 	 * work optimally. This means that those skbs are not allowed to read
 	 * or write any data which is before the current position of skb->data
 	 * after that call and thus allow other skbs with the same data buffer
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v3 09/31] jfs: Define usercopy region in jfs_ip slab cache
From: Dave Kleikamp @ 2017-09-22  2:54 UTC (permalink / raw)
  To: Kees Cook, linux-kernel
  Cc: David Windsor, jfs-discussion, linux-fsdevel, netdev, linux-mm,
	kernel-hardening
In-Reply-To: <1505940337-79069-10-git-send-email-keescook@chromium.org>

Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>

On 09/20/2017 03:45 PM, Kees Cook wrote:
> From: David Windsor <dave@nullcore.net>
> 
> The jfs symlink pathnames, stored in struct jfs_inode_info.i_inline and
> therefore contained in the jfs_ip slab cache, need to be copied to/from
> userspace.
> 
> cache object allocation:
>     fs/jfs/super.c:
>         jfs_alloc_inode(...):
>             ...
>             jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
>             ...
>             return &jfs_inode->vfs_inode;
> 
>     fs/jfs/jfs_incore.h:
>         JFS_IP(struct inode *inode):
>             return container_of(inode, struct jfs_inode_info, vfs_inode);
> 
>     fs/jfs/inode.c:
>         jfs_iget(...):
>             ...
>             inode->i_link = JFS_IP(inode)->i_inline;
> 
> example usage trace:
>     readlink_copy+0x43/0x70
>     vfs_readlink+0x62/0x110
>     SyS_readlinkat+0x100/0x130
> 
>     fs/namei.c:
>         readlink_copy(..., link):
>             ...
>             copy_to_user(..., link, len);
> 
>         (inlined in vfs_readlink)
>         generic_readlink(dentry, ...):
>             struct inode *inode = d_inode(dentry);
>             const char *link = inode->i_link;
>             ...
>             readlink_copy(..., link);
> 
> In support of usercopy hardening, this patch defines a region in the
> jfs_ip slab cache in which userspace copy operations are allowed.
> 
> This region is known as the slab cache's usercopy region. Slab caches can
> now check that each copy operation involving cache-managed memory falls
> entirely within the slab's usercopy region.
> 
> This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
> whitelisting code in the last public patch of grsecurity/PaX based on my
> understanding of the code. Changes or omissions from the original code are
> mine and don't reflect the original grsecurity/PaX code.
> 
> Signed-off-by: David Windsor <dave@nullcore.net>
> [kees: adjust commit log, provide usage trace]
> Cc: Dave Kleikamp <shaggy@kernel.org>
> Cc: jfs-discussion@lists.sourceforge.net
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
>  fs/jfs/super.c | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/jfs/super.c b/fs/jfs/super.c
> index 2f14677169c3..e018412608d4 100644
> --- a/fs/jfs/super.c
> +++ b/fs/jfs/super.c
> @@ -966,9 +966,11 @@ static int __init init_jfs_fs(void)
>  	int rc;
>  
>  	jfs_inode_cachep =
> -	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
> -			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
> -			    init_once);
> +	    kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info),
> +			0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
> +			offsetof(struct jfs_inode_info, i_inline),
> +			sizeof_field(struct jfs_inode_info, i_inline),
> +			init_once);
>  	if (jfs_inode_cachep == NULL)
>  		return -ENOMEM;
>  
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* RE: [net-next 1/2] dummy: add device MTU validation check
From: 张胜举 @ 2017-09-22  3:28 UTC (permalink / raw)
  To: 'Eric Dumazet'; +Cc: davem, willemb, stephen, netdev
In-Reply-To: <1506006138.29839.132.camel@edumazet-glaptop3.roam.corp.google.com>

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: 2017年9月21日 23:02
> To: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> Cc: davem@davemloft.net; willemb@google.com;
> stephen@networkplumber.org; netdev@vger.kernel.org
> Subject: Re: [net-next 1/2] dummy: add device MTU validation check
> 
> On Thu, 2017-09-21 at 21:32 +0800, Zhang Shengju wrote:
> > Currently, any mtu value can be assigned when adding a new dummy device:
> > [~]# ip link add name dummy1 mtu 100000 type dummy [~]# ip link show
> > dummy1
> > 15: dummy1: <BROADCAST,NOARP> mtu 100000 qdisc noop state DOWN
> mode DEFAULT group default qlen 1000
> >     link/ether 0a:61:6b:16:14:ce brd ff:ff:ff:ff:ff:ff
> >
> > This patch adds device MTU validation check.
> 
> What is wrong with big MTU on dummy ?
> 
> If this is a generic rule, this check should belong in core network stack.
> 

dummy_setup() function setup mtu range: [0, ETH_MAX_MTU]. 
This will be checked at dev_set_mtu() function in core network stack.

So if you add a new dummy device without specify mtu value, you can't set a value out of range [0, ETH_MAX_MTU] afterward.
BUT you can set any mtu when adding new device. This cause an inconsistence.

> >
> > Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> > ---
> >  drivers/net/dummy.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index
> > e31ab3b..0276b2b 100644
> > --- a/drivers/net/dummy.c
> > +++ b/drivers/net/dummy.c
> > @@ -365,6 +365,14 @@ static int dummy_validate(struct nlattr *tb[], struct
> nlattr *data[],
> >  		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
> >  			return -EADDRNOTAVAIL;
> >  	}
> > +
> > +	if (tb[IFLA_MTU]) {
> > +		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
> 
> You do not verify/validate nla_len(tb[IFLA_MTU]).
> 
> Do not ever trust user space.
MTU attribute is just u32, do you think it's necessary to check the length? 
Actually I don't see any place to check the length of mtu attribute in network stack code. 

> 
> > +
> > +		if (mtu > ETH_MAX_MTU)
> > +			return -EINVAL;
> > +	}
> > +
> >  	return 0;
> >  }
> >
> 

^ permalink raw reply

* RE: [net-next 2/2] ifb: add device MTU validation check
From: 张胜举 @ 2017-09-22  3:35 UTC (permalink / raw)
  To: 'Stephen Hemminger'; +Cc: davem, willemb, netdev
In-Reply-To: <20170921081010.4d6f5731@xeon-e3>

> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: 2017年9月21日 23:10
> To: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> Cc: davem@davemloft.net; willemb@google.com; netdev@vger.kernel.org
> Subject: Re: [net-next 2/2] ifb: add device MTU validation check
> 
> On Thu, 21 Sep 2017 21:32:02 +0800
> Zhang Shengju <zhangshengju@cmss.chinamobile.com> wrote:
> 
> > Currently, any mtu value can be assigned when adding a new ifb device:
> > [~]# ip link add name ifb2 mtu 100000 type ifb [~]# ip link show ifb2
> > 18: ifb2: <BROADCAST,NOARP> mtu 100000 qdisc noop state DOWN mode
> DEFAULT group default qlen 32
> >     link/ether 7a:bf:f4:63:da:d1 brd ff:ff:ff:ff:ff:ff
> >
> > This patch adds device MTU validation check.
> >
> > Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> > ---
> >  drivers/net/ifb.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index
> > 8870bd2..ce84ad2 100644
> > --- a/drivers/net/ifb.c
> > +++ b/drivers/net/ifb.c
> > @@ -282,6 +282,14 @@ static int ifb_validate(struct nlattr *tb[], struct
> nlattr *data[],
> >  		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
> >  			return -EADDRNOTAVAIL;
> >  	}
> > +
> > +	if (tb[IFLA_MTU]) {
> > +		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
> > +
> > +		if (mtu < ETH_MIN_MTU || mtu > ETH_DATA_LEN)
> > +			return -EINVAL;
> > +	}
> > +
> >  	return 0;
> >  }
> >
> 
> What about running ifb with packets coming from devices with jumbo frames?
> Also since ifb is an input only device, MTU doesn't matter.

Actually ifb_setup() function setup ifb valid MTU range: [68, 1500], and
this will be check at dev_set_mtu() function. 
You can't setup mtu out of this range after creation, but you can set any
value when adding new ifb device.
This is inconsistent, and I think we should add this check at creation time 

BRs,
ZSJ

^ permalink raw reply

* Re: [PATCH] net: phy: Fix truncation of large IRQ numbers in phy_attached_print()
From: David Miller @ 2017-09-22  3:36 UTC (permalink / raw)
  To: geert+renesas; +Cc: andrew, f.fainelli, romain.perier, netdev, linux-kernel
In-Reply-To: <1505993222-16721-1-git-send-email-geert+renesas@glider.be>

From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 21 Sep 2017 13:27:02 +0200

> Given NR_IRQS is 2048 on sparc64, and even 32784 on alpha, 3 digits is
> not enough to represent interrupt numbers on all architectures.  Hence
> PHY interrupt numbers may be truncated during printing.
> 
> Increase the buffer size from 4 to 8 bytes to fix this.
> 
> Fixes: 5e369aefdce4818c ("net: stmmac: Delete dead code for MDIO registration")
> Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] net: vrf: remove skb_dst_force() after skb_dst_set()
From: David Miller @ 2017-09-22  3:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, dsa, shm
In-Reply-To: <1506005428.29839.129.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 21 Sep 2017 07:50:28 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> skb_dst_set(skb, dst) installs a normal (refcounted) dst, there is no
> point using skb_dst_force(skb)
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied, thanks Eric.

^ permalink raw reply

* Re: [PATCH net-next] test_rhashtable: remove initdata annotation
From: David Miller @ 2017-09-22  3:42 UTC (permalink / raw)
  To: fw; +Cc: netdev
In-Reply-To: <20170921153608.8789-1-fw@strlen.de>

From: Florian Westphal <fw@strlen.de>
Date: Thu, 21 Sep 2017 17:36:08 +0200

> kbuild test robot reported a section mismatch warning w. gcc 4.x:
> WARNING: lib/test_rhashtable.o(.text+0x139e):
> Section mismatch in reference from the function rhltable_insert.clone.3() to the variable .init.data:rhlt
> 
> so remove this annotation.
> 
> Fixes: cdd4de372ea06 ("test_rhashtable: add test case for rhl_table interface")
> Reported-by: kbuild test robot <lkp@intel.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>

Applied.

^ permalink raw reply

* Re: [PATCH net] net: prevent dst uses after free
From: David Miller @ 2017-09-22  3:42 UTC (permalink / raw)
  To: eric.dumazet; +Cc: pstaszewski, weiwan, xiyou.wangcong, netdev, edumazet
In-Reply-To: <1506010546.29839.148.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 21 Sep 2017 09:15:46 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> In linux-4.13, Wei worked hard to convert dst to a traditional
> refcounted model, removing GC.
> 
> We now want to make sure a dst refcount can not transition from 0 back
> to 1.
> 
> The problem here is that input path attached a not refcounted dst to an
> skb. Then later, because packet is forwarded and hits skb_dst_force()
> before exiting RCU section, we might try to take a refcount on one dst
> that is about to be freed, if another cpu saw 1 -> 0 transition in
> dst_release() and queued the dst for freeing after one RCU grace period.
> 
> Lets unify skb_dst_force() and skb_dst_force_safe(), since we should
> always perform the complete check against dst refcount, and not assume
> it is not zero.
> 
> Bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=197005
 ...
> Similarly dst_clone() can use dst_hold() helper to have additional
> debugging, as a follow up to commit 44ebe79149ff ("net: add debug
> atomic_inc_not_zero() in dst_hold()")
> 
> In net-next we will convert dst atomic_t to refcount_t for peace of
> mind.
> 
> Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Wei Wang <weiwan@google.com>
> Reported-by: Paweł Staszewski <pstaszewski@itcare.pl>
> Bisected-by: Paweł Staszewski <pstaszewski@itcare.pl>

Applied and queued up for -stable, thanks Eric.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox