Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 net-next 5/5] ipv6: don't walk node's leaf during serial number update
From: Hannes Frederic Sowa @ 2014-10-06 17:58 UTC (permalink / raw)
  To: netdev; +Cc: hideaki, kafai, cwang
In-Reply-To: <cover.1412618014.git.hannes@stressinduktion.org>

Cc: YOSHIFUJI Hideaki <hideaki@yoshifuji.org>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 net/ipv6/ip6_fib.c | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6f9beb1..b2d1838 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -50,6 +50,7 @@ struct fib6_cleaner {
 	struct fib6_walker w;
 	struct net *net;
 	int (*func)(struct rt6_info *, void *arg);
+	int sernum;
 	void *arg;
 };
 
@@ -105,6 +106,10 @@ static int fib6_new_sernum(struct net *net)
 	return new;
 }
 
+enum {
+	FIB6_NO_SERNUM_CHANGE = 0,
+};
+
 /*
  *	Auxiliary address test functions for the radix tree.
  *
@@ -1514,6 +1519,16 @@ static int fib6_clean_node(struct fib6_walker *w)
 		.nl_net = c->net,
 	};
 
+	if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
+	    w->node->fn_sernum != c->sernum)
+		w->node->fn_sernum = c->sernum;
+
+	if (!c->func) {
+		WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
+		w->leaf = NULL;
+		return 0;
+	}
+
 	for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
 		res = c->func(rt, c->arg);
 		if (res < 0) {
@@ -1547,7 +1562,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 			    int (*func)(struct rt6_info *, void *arg),
-			    bool prune, void *arg)
+			    bool prune, int sernum, void *arg)
 {
 	struct fib6_cleaner c;
 
@@ -1557,14 +1572,16 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 	c.w.count = 0;
 	c.w.skip = 0;
 	c.func = func;
+	c.sernum = sernum;
 	c.arg = arg;
 	c.net = net;
 
 	fib6_walk(&c.w);
 }
 
-void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
-		    void *arg)
+static void __fib6_clean_all(struct net *net,
+			     int (*func)(struct rt6_info *, void *),
+			     int sernum, void *arg)
 {
 	struct fib6_table *table;
 	struct hlist_head *head;
@@ -1576,13 +1593,19 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
 			write_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(net, &table->tb6_root,
-					func, false, arg);
+					func, false, sernum, arg);
 			write_unlock_bh(&table->tb6_lock);
 		}
 	}
 	rcu_read_unlock();
 }
 
+void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
+		    void *arg)
+{
+	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
+}
+
 static int fib6_prune_clone(struct rt6_info *rt, void *arg)
 {
 	if (rt->rt6i_flags & RTF_CACHE) {
@@ -1595,25 +1618,15 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg)
 
 static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
 {
-	fib6_clean_tree(net, fn, fib6_prune_clone, true, NULL);
-}
-
-static int fib6_update_sernum(struct rt6_info *rt, void *arg)
-{
-	int sernum = *(int *)arg;
-
-	if (rt->rt6i_node &&
-	    rt->rt6i_node->fn_sernum != sernum)
-		rt->rt6i_node->fn_sernum = sernum;
-
-	return 0;
+	fib6_clean_tree(net, fn, fib6_prune_clone, true,
+			FIB6_NO_SERNUM_CHANGE, NULL);
 }
 
 static void fib6_flush_trees(struct net *net)
 {
 	int new_sernum = fib6_new_sernum(net);
 
-	fib6_clean_all(net, fib6_update_sernum, &new_sernum);
+	__fib6_clean_all(net, NULL, new_sernum, NULL);
 }
 
 /*
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 3/5] ipv6: only generate one new serial number per fib mutation
From: Hannes Frederic Sowa @ 2014-10-06 17:58 UTC (permalink / raw)
  To: netdev; +Cc: hideaki, kafai, cwang
In-Reply-To: <cover.1412618014.git.hannes@stressinduktion.org>

Cc: YOSHIFUJI Hideaki <hideaki@yoshifuji.org>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 net/ipv6/ip6_fib.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 332f1e0..be9cb09 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -417,14 +417,13 @@ out:
 static struct fib6_node *fib6_add_1(struct fib6_node *root,
 				     struct in6_addr *addr, int plen,
 				     int offset, int allow_create,
-				     int replace_required)
+				     int replace_required, int sernum)
 {
 	struct fib6_node *fn, *in, *ln;
 	struct fib6_node *pn = NULL;
 	struct rt6key *key;
 	int	bit;
 	__be32	dir = 0;
-	int	sernum = fib6_new_sernum();
 
 	RT6_TRACE("fib6_add_1\n");
 
@@ -842,6 +841,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 	int err = -ENOMEM;
 	int allow_create = 1;
 	int replace_required = 0;
+	int sernum = fib6_new_sernum();
 
 	if (info->nlh) {
 		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -854,7 +854,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 
 	fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
 			offsetof(struct rt6_info, rt6i_dst), allow_create,
-			replace_required);
+			replace_required, sernum);
 	if (IS_ERR(fn)) {
 		err = PTR_ERR(fn);
 		fn = NULL;
@@ -888,14 +888,14 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 			sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
 			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
 			sfn->fn_flags = RTN_ROOT;
-			sfn->fn_sernum = fib6_new_sernum();
+			sfn->fn_sernum = sernum;
 
 			/* Now add the first leaf node to new subtree */
 
 			sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
 					rt->rt6i_src.plen,
 					offsetof(struct rt6_info, rt6i_src),
-					allow_create, replace_required);
+					allow_create, replace_required, sernum);
 
 			if (IS_ERR(sn)) {
 				/* If it is failed, discard just allocated
@@ -914,7 +914,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 			sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
 					rt->rt6i_src.plen,
 					offsetof(struct rt6_info, rt6i_src),
-					allow_create, replace_required);
+					allow_create, replace_required, sernum);
 
 			if (IS_ERR(sn)) {
 				err = PTR_ERR(sn);
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 4/5] ipv6: make fib6 serial number per namespace
From: Hannes Frederic Sowa @ 2014-10-06 17:58 UTC (permalink / raw)
  To: netdev; +Cc: hideaki, kafai, cwang
In-Reply-To: <cover.1412618014.git.hannes@stressinduktion.org>

Try to reduce the number of possible fn_sernum mutations by constraining
them to their namespace.

Also remove rt_genid which I forgot to remove in 705f1c869d577c ("ipv6:
remove rt6i_genid").

Cc: YOSHIFUJI Hideaki <hideaki@yoshifuji.org>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/net/netns/ipv6.h |  2 +-
 net/ipv6/af_inet6.c      |  2 +-
 net/ipv6/ip6_fib.c       | 13 ++++++-------
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index eade27a..69ae41f 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -76,7 +76,7 @@ struct netns_ipv6 {
 #endif
 #endif
 	atomic_t		dev_addr_genid;
-	atomic_t		rt_genid;
+	atomic_t		fib6_sernum;
 };
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 34f726f..e8c4400 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -766,7 +766,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.icmpv6_time = 1*HZ;
 	net->ipv6.sysctl.flowlabel_consistency = 1;
 	net->ipv6.sysctl.auto_flowlabels = 0;
-	atomic_set(&net->ipv6.rt_genid, 0);
+	atomic_set(&net->ipv6.fib6_sernum, 1);
 
 	err = ipv6_init_mibs(net);
 	if (err)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index be9cb09..6f9beb1 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -74,8 +74,6 @@ static int fib6_walk_continue(struct fib6_walker *w);
  *	result of redirects, path MTU changes, etc.
  */
 
-static atomic_t rt_sernum = ATOMIC_INIT(1);
-
 static void fib6_gc_timer_cb(unsigned long arg);
 
 static LIST_HEAD(fib6_walkers);
@@ -95,14 +93,15 @@ static void fib6_walker_unlink(struct fib6_walker *w)
 	write_unlock_bh(&fib6_walker_lock);
 }
 
-static int fib6_new_sernum(void)
+static int fib6_new_sernum(struct net *net)
 {
 	int new, old;
 
 	do {
-		old = atomic_read(&rt_sernum);
+		old = atomic_read(&net->ipv6.fib6_sernum);
 		new = old < INT_MAX ? old + 1 : 1;
-	} while (atomic_cmpxchg(&rt_sernum, old, new) != old);
+	} while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
+				old, new) != old);
 	return new;
 }
 
@@ -841,7 +840,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
 	int err = -ENOMEM;
 	int allow_create = 1;
 	int replace_required = 0;
-	int sernum = fib6_new_sernum();
+	int sernum = fib6_new_sernum(info->nl_net);
 
 	if (info->nlh) {
 		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1612,7 +1611,7 @@ static int fib6_update_sernum(struct rt6_info *rt, void *arg)
 
 static void fib6_flush_trees(struct net *net)
 {
-	int new_sernum = fib6_new_sernum();
+	int new_sernum = fib6_new_sernum(net);
 
 	fib6_clean_all(net, fib6_update_sernum, &new_sernum);
 }
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 1/5] ipv6: minor fib6 cleanups like type safety, bool conversion, inline removal
From: Hannes Frederic Sowa @ 2014-10-06 17:58 UTC (permalink / raw)
  To: netdev; +Cc: hideaki, kafai, cwang
In-Reply-To: <cover.1412618014.git.hannes@stressinduktion.org>

Also renamed struct fib6_walker_t to fib6_walker and enum fib_walk_state_t
to fib6_walk_state as recommended by Cong Wang.

Cc: Cong Wang <cwang@twopensource.com>
Cc: YOSHIFUJI Hideaki <hideaki@yoshifuji.org>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/net/ip6_fib.h | 18 ++++++++++---
 net/ipv6/ip6_fib.c    | 71 ++++++++++++++++++++++-----------------------------
 2 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index cf485f9..9221bf4 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -202,15 +202,25 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 	dst_release(&rt->dst);
 }
 
-struct fib6_walker_t {
+enum fib6_walk_state {
+#ifdef CONFIG_IPV6_SUBTREES
+	FWS_S,
+#endif
+	FWS_L,
+	FWS_R,
+	FWS_C,
+	FWS_U
+};
+
+struct fib6_walker {
 	struct list_head lh;
 	struct fib6_node *root, *node;
 	struct rt6_info *leaf;
-	unsigned char state;
-	unsigned char prune;
+	enum fib6_walk_state state;
+	bool prune;
 	unsigned int skip;
 	unsigned int count;
-	int (*func)(struct fib6_walker_t *);
+	int (*func)(struct fib6_walker *);
 	void *args;
 };
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 97b9fa8..e8d7465 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -46,18 +46,8 @@
 
 static struct kmem_cache *fib6_node_kmem __read_mostly;
 
-enum fib_walk_state_t {
-#ifdef CONFIG_IPV6_SUBTREES
-	FWS_S,
-#endif
-	FWS_L,
-	FWS_R,
-	FWS_C,
-	FWS_U
-};
-
-struct fib6_cleaner_t {
-	struct fib6_walker_t w;
+struct fib6_cleaner {
+	struct fib6_walker w;
 	struct net *net;
 	int (*func)(struct rt6_info *, void *arg);
 	void *arg;
@@ -74,8 +64,8 @@ static DEFINE_RWLOCK(fib6_walker_lock);
 static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
 static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
 static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
-static int fib6_walk(struct fib6_walker_t *w);
-static int fib6_walk_continue(struct fib6_walker_t *w);
+static int fib6_walk(struct fib6_walker *w);
+static int fib6_walk_continue(struct fib6_walker *w);
 
 /*
  *	A routing update causes an increase of the serial number on the
@@ -91,20 +81,21 @@ static void fib6_gc_timer_cb(unsigned long arg);
 static LIST_HEAD(fib6_walkers);
 #define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
 
-static inline void fib6_walker_link(struct fib6_walker_t *w)
+static void fib6_walker_link(struct fib6_walker *w)
 {
 	write_lock_bh(&fib6_walker_lock);
 	list_add(&w->lh, &fib6_walkers);
 	write_unlock_bh(&fib6_walker_lock);
 }
 
-static inline void fib6_walker_unlink(struct fib6_walker_t *w)
+static void fib6_walker_unlink(struct fib6_walker *w)
 {
 	write_lock_bh(&fib6_walker_lock);
 	list_del(&w->lh);
 	write_unlock_bh(&fib6_walker_lock);
 }
-static __inline__ u32 fib6_new_sernum(void)
+
+static u32 fib6_new_sernum(void)
 {
 	u32 n = ++rt_sernum;
 	if ((__s32)n <= 0)
@@ -128,7 +119,7 @@ static __inline__ u32 fib6_new_sernum(void)
 # define BITOP_BE32_SWIZZLE	0
 #endif
 
-static __inline__ __be32 addr_bit_set(const void *token, int fn_bit)
+static __be32 addr_bit_set(const void *token, int fn_bit)
 {
 	const __be32 *addr = token;
 	/*
@@ -142,7 +133,7 @@ static __inline__ __be32 addr_bit_set(const void *token, int fn_bit)
 	       addr[fn_bit >> 5];
 }
 
-static __inline__ struct fib6_node *node_alloc(void)
+static struct fib6_node *node_alloc(void)
 {
 	struct fib6_node *fn;
 
@@ -151,12 +142,12 @@ static __inline__ struct fib6_node *node_alloc(void)
 	return fn;
 }
 
-static __inline__ void node_free(struct fib6_node *fn)
+static void node_free(struct fib6_node *fn)
 {
 	kmem_cache_free(fib6_node_kmem, fn);
 }
 
-static __inline__ void rt6_release(struct rt6_info *rt)
+static void rt6_release(struct rt6_info *rt)
 {
 	if (atomic_dec_and_test(&rt->rt6i_ref))
 		dst_free(&rt->dst);
@@ -267,7 +258,7 @@ static void __net_init fib6_tables_init(struct net *net)
 
 #endif
 
-static int fib6_dump_node(struct fib6_walker_t *w)
+static int fib6_dump_node(struct fib6_walker *w)
 {
 	int res;
 	struct rt6_info *rt;
@@ -287,7 +278,7 @@ static int fib6_dump_node(struct fib6_walker_t *w)
 
 static void fib6_dump_end(struct netlink_callback *cb)
 {
-	struct fib6_walker_t *w = (void *)cb->args[2];
+	struct fib6_walker *w = (void *)cb->args[2];
 
 	if (w) {
 		if (cb->args[4]) {
@@ -310,7 +301,7 @@ static int fib6_dump_done(struct netlink_callback *cb)
 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
-	struct fib6_walker_t *w;
+	struct fib6_walker *w;
 	int res;
 
 	w = (void *)cb->args[2];
@@ -355,7 +346,7 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct rt6_rtnl_dump_arg arg;
-	struct fib6_walker_t *w;
+	struct fib6_walker *w;
 	struct fib6_table *tb;
 	struct hlist_head *head;
 	int res = 0;
@@ -627,7 +618,7 @@ insert_above:
 	return ln;
 }
 
-static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt)
+static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
 {
 	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
 	       RTF_GATEWAY;
@@ -820,7 +811,7 @@ add:
 	return 0;
 }
 
-static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt)
+static void fib6_start_gc(struct net *net, struct rt6_info *rt)
 {
 	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
 	    (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
@@ -1174,7 +1165,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 	int children;
 	int nstate;
 	struct fib6_node *child, *pn;
-	struct fib6_walker_t *w;
+	struct fib6_walker *w;
 	int iter = 0;
 
 	for (;;) {
@@ -1276,7 +1267,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 			   struct nl_info *info)
 {
-	struct fib6_walker_t *w;
+	struct fib6_walker *w;
 	struct rt6_info *rt = *rtp;
 	struct net *net = info->nl_net;
 
@@ -1414,7 +1405,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
  *	<0  -> walk is terminated by an error.
  */
 
-static int fib6_walk_continue(struct fib6_walker_t *w)
+static int fib6_walk_continue(struct fib6_walker *w)
 {
 	struct fib6_node *fn, *pn;
 
@@ -1498,7 +1489,7 @@ skip:
 	}
 }
 
-static int fib6_walk(struct fib6_walker_t *w)
+static int fib6_walk(struct fib6_walker *w)
 {
 	int res;
 
@@ -1512,11 +1503,11 @@ static int fib6_walk(struct fib6_walker_t *w)
 	return res;
 }
 
-static int fib6_clean_node(struct fib6_walker_t *w)
+static int fib6_clean_node(struct fib6_walker *w)
 {
 	int res;
 	struct rt6_info *rt;
-	struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w);
+	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
 	struct nl_info info = {
 		.nl_net = c->net,
 	};
@@ -1554,9 +1545,9 @@ static int fib6_clean_node(struct fib6_walker_t *w)
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 			    int (*func)(struct rt6_info *, void *arg),
-			    int prune, void *arg)
+			    bool prune, void *arg)
 {
-	struct fib6_cleaner_t c;
+	struct fib6_cleaner c;
 
 	c.w.root = root;
 	c.w.func = fib6_clean_node;
@@ -1583,7 +1574,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
 			write_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(net, &table->tb6_root,
-					func, 0, arg);
+					func, false, arg);
 			write_unlock_bh(&table->tb6_lock);
 		}
 	}
@@ -1602,7 +1593,7 @@ static int fib6_prune_clone(struct rt6_info *rt, void *arg)
 
 static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
 {
-	fib6_clean_tree(net, fn, fib6_prune_clone, 1, NULL);
+	fib6_clean_tree(net, fn, fib6_prune_clone, true, NULL);
 }
 
 static int fib6_update_sernum(struct rt6_info *rt, void *arg)
@@ -1828,7 +1819,7 @@ void fib6_gc_cleanup(void)
 
 struct ipv6_route_iter {
 	struct seq_net_private p;
-	struct fib6_walker_t w;
+	struct fib6_walker w;
 	loff_t skip;
 	struct fib6_table *tbl;
 	__u32 sernum;
@@ -1859,7 +1850,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int ipv6_route_yield(struct fib6_walker_t *w)
+static int ipv6_route_yield(struct fib6_walker *w)
 {
 	struct ipv6_route_iter *iter = w->args;
 
@@ -1980,7 +1971,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
 
 static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
 {
-	struct fib6_walker_t *w = &iter->w;
+	struct fib6_walker *w = &iter->w;
 	return w->node && !(w->state == FWS_U && w->node == w->root);
 }
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 2/5] ipv6: make rt_sernum atomic and serial number fields ordinary ints
From: Hannes Frederic Sowa @ 2014-10-06 17:58 UTC (permalink / raw)
  To: netdev; +Cc: hideaki, kafai, cwang
In-Reply-To: <cover.1412618014.git.hannes@stressinduktion.org>

Cc: YOSHIFUJI Hideaki <hideaki@yoshifuji.org>
Cc: Martin Lau <kafai@fb.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 include/net/ip6_fib.h |  2 +-
 net/ipv6/ip6_fib.c    | 23 +++++++++++++----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 9221bf4..8eea35d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -64,7 +64,7 @@ struct fib6_node {
 
 	__u16			fn_bit;		/* bit key */
 	__u16			fn_flags;
-	__u32			fn_sernum;
+	int			fn_sernum;
 	struct rt6_info		*rr_ptr;
 };
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e8d7465..332f1e0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -74,7 +74,7 @@ static int fib6_walk_continue(struct fib6_walker *w);
  *	result of redirects, path MTU changes, etc.
  */
 
-static __u32 rt_sernum;
+static atomic_t rt_sernum = ATOMIC_INIT(1);
 
 static void fib6_gc_timer_cb(unsigned long arg);
 
@@ -95,12 +95,15 @@ static void fib6_walker_unlink(struct fib6_walker *w)
 	write_unlock_bh(&fib6_walker_lock);
 }
 
-static u32 fib6_new_sernum(void)
+static int fib6_new_sernum(void)
 {
-	u32 n = ++rt_sernum;
-	if ((__s32)n <= 0)
-		rt_sernum = n = 1;
-	return n;
+	int new, old;
+
+	do {
+		old = atomic_read(&rt_sernum);
+		new = old < INT_MAX ? old + 1 : 1;
+	} while (atomic_cmpxchg(&rt_sernum, old, new) != old);
+	return new;
 }
 
 /*
@@ -421,7 +424,7 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
 	struct rt6key *key;
 	int	bit;
 	__be32	dir = 0;
-	__u32	sernum = fib6_new_sernum();
+	int	sernum = fib6_new_sernum();
 
 	RT6_TRACE("fib6_add_1\n");
 
@@ -1598,7 +1601,7 @@ static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
 
 static int fib6_update_sernum(struct rt6_info *rt, void *arg)
 {
-	__u32 sernum = *(__u32 *)arg;
+	int sernum = *(int *)arg;
 
 	if (rt->rt6i_node &&
 	    rt->rt6i_node->fn_sernum != sernum)
@@ -1609,7 +1612,7 @@ static int fib6_update_sernum(struct rt6_info *rt, void *arg)
 
 static void fib6_flush_trees(struct net *net)
 {
-	__u32 new_sernum = fib6_new_sernum();
+	int new_sernum = fib6_new_sernum();
 
 	fib6_clean_all(net, fib6_update_sernum, &new_sernum);
 }
@@ -1822,7 +1825,7 @@ struct ipv6_route_iter {
 	struct fib6_walker w;
 	loff_t skip;
 	struct fib6_table *tbl;
-	__u32 sernum;
+	int sernum;
 };
 
 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 0/5] ipv6: cleanup after rt6_genid removal
From: Hannes Frederic Sowa @ 2014-10-06 17:58 UTC (permalink / raw)
  To: netdev; +Cc: hideaki, kafai, cwang

Leftover patches after rt6_genid removal after 705f1c869d577c ("ipv6:
remove rt6i_genid").

The two major changes are:
* keep fib6_sernum per namespace to reduce the number of flushes in case
  the system has a high number of namespaces
* make fn_sernum updates cheaper

v2: Incorporated feedback from Cong Wang, thanks a lot!

Hannes Frederic Sowa (5):
  ipv6: minor fib6 cleanups like type safety, bool conversion, inline
    removal
  ipv6: make rt_sernum atomic and serial number fields ordinary ints
  ipv6: only generate one new serial number per fib mutation
  ipv6: make fib6 serial number per namespace
  ipv6: don't walk node's leaf during serial number update

 include/net/ip6_fib.h    |  20 +++++--
 include/net/netns/ipv6.h |   2 +-
 net/ipv6/af_inet6.c      |   2 +-
 net/ipv6/ip6_fib.c       | 142 ++++++++++++++++++++++++-----------------------
 4 files changed, 91 insertions(+), 75 deletions(-)

-- 
1.9.3

^ permalink raw reply

* Re: [PATCH net-next 5/5] ipv6: don't walk node's leaf during serial number update
From: Cong Wang @ 2014-10-06 17:58 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: netdev, hideaki, kafai
In-Reply-To: <60321af8a7305bec398e62858ef53bb6605878fb.1412585163.git.hannes@stressinduktion.org>

On Mon, Oct 6, 2014 at 1:52 AM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> @@ -105,6 +106,10 @@ static int fib6_new_sernum(struct net *net)
>         return new;
>  }
>
> +enum {
> +       FIB6_NO_SERNUM_CHANGE = 0,
> +};
> +

Not sure if it's worth an enum definition... seems like overkill to me.

^ permalink raw reply

* Re: randconfig build error with next-20141001, in drivers/i2c/algos/i2c-algo-bit.c
From: Oliver Hartkopp @ 2014-10-06 17:39 UTC (permalink / raw)
  To: Randy Dunlap, Jim Davis, Stephen Rothwell
  Cc: linux-next, Stephane Grosjean, linux-i2c, netdev@vger.kernel.org,
	linux-can
In-Reply-To: <5432C8C6.7060506@infradead.org>



On 10/06/2014 06:52 PM, Randy Dunlap wrote:
> On 10/06/14 01:06, Oliver Hartkopp wrote:
>> Hello all,
>>
>> just to get it right:
>>
>> So far it looks like this in linux/drivers/net/can/sja1000/Kconfig
>>
>> config CAN_PEAK_PCIEC
>>         bool "PEAK PCAN-ExpressCard Cards"
>>         depends on CAN_PEAK_PCI
>>         select I2C
>>         select I2C_ALGOBIT
>>
>> If one would change the
>>
>>         select I2C
>>
>> into
>>
>>         depends on I2C
>>
>> IMHO the CAN_PEAK_PCIEC hardware would *only* be visible and selectable when
>> I2C was selected before (from anyone else?).
> 
> That is correct.
> 
> >> So what is wrong with the current Kconfig entry?
>> Is 'select' deprecated?
> 
> No, it's not deprecated.  It's just dangerous, and driver configs should not
> enable entire subsystems via 'select'.
> 
>> Or did randconfig generate a configuration that would not be possible by
>> properly generating the config file with 'make menuconfig' ??
> 
> randconfig generated a config for another driver which causes a build error,
> not for a CAN driver.  The CAN driver does not have a build error AFAIK.
> Its Kconfig is just doing something with a very big & ugly stick.

But when it is not done like this, we might have an invisible config option in
the corner case that I2C is not enabled by anyone else.

So what would you propose then?

AFAICS there is 'just' a style problem as 'configs should not enable entire
subsystems'. But it finally is a correct and valid Kconfig, right?

When I2C is already enabled - fine. If (unlikely) I2C is not enabled, we need
to pull the ugly stick. So what is dangerous about this? Was there any misuse of
select statements before?

Best regards,
Oliver

^ permalink raw reply

* Re: [PATCH net-next 2/5] ipv6: make rt_sernum atomic and serial number fields ordinary ints
From: Cong Wang @ 2014-10-06 17:26 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: netdev, hideaki, kafai
In-Reply-To: <c98d73f367912e2fa164ffa7f6cc0c7ecbd21de1.1412585163.git.hannes@stressinduktion.org>

On Mon, Oct 6, 2014 at 1:52 AM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> @@ -1598,7 +1601,7 @@ static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
>
>  static int fib6_update_sernum(struct rt6_info *rt, void *arg)
>  {
> -       __u32 sernum = *(__u32 *)arg;
> +       int sernum = *(__u32 *)arg;

Should be  *(int *) arg?

^ permalink raw reply

* Re: [PATCH net-next 1/5] ipv6: minor fib6 cleanups like type safety, bool conversion, inline removal
From: Cong Wang @ 2014-10-06 17:23 UTC (permalink / raw)
  To: Hannes Frederic Sowa; +Cc: netdev, hideaki, kafai
In-Reply-To: <d848631ba3331e6e18c3208b217d8ecf9b69d650.1412585163.git.hannes@stressinduktion.org>

On Mon, Oct 6, 2014 at 1:52 AM, Hannes Frederic Sowa
<hannes@stressinduktion.org> wrote:
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index cf485f9..f4e6b3e 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -202,12 +202,22 @@ static inline void ip6_rt_put(struct rt6_info *rt)
>         dst_release(&rt->dst);
>  }
>
> +enum fib_walk_state_t {
> +#ifdef CONFIG_IPV6_SUBTREES
> +       FWS_S,
> +#endif
> +       FWS_L,
> +       FWS_R,
> +       FWS_C,
> +       FWS_U
> +};
> +
>  struct fib6_walker_t {

While you are at it, I think these *_t types don't actually need "_t";
they are not typedefs anyway. But this could definitely be a follow-up patch.

^ permalink raw reply

* Re: [net-next PATCH v1 3/3] net: sched: do not use tcf_proto 'tp' argument from call_rcu
From: Cong Wang @ 2014-10-06 17:05 UTC (permalink / raw)
  To: John Fastabend
  Cc: Cong Wang, David Miller, netdev, Jamal Hadi Salim, Eric Dumazet
In-Reply-To: <20141006042850.6010.176.stgit@nitbit.x32>

On Sun, Oct 5, 2014 at 9:28 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> Using the tcf_proto pointer 'tp' from inside the classifiers callback
> is not valid because it may have been cleaned up by another call_rcu
> occurring on another CPU.
>
> 'tp' is currently being used by tcf_unbind_filter(). In this patch we
> move instances of tcf_unbind_filter outside of the call_rcu() context.
> This is safe to do because any running schedulers will either read the
> valid class field or it will be zeroed.
>
> And all schedulers today when the class is 0 do a lookup using the
> same call used by the tcf_exts_bind(). So even if we have a running
> classifier hit the null class pointer it will do a lookup and get
> to the same result. This is particularly fragile at the moment because
> the only way to verify this is to audit the schedulers call sites.
>
> Reported-by: Cong Wang <xiyou.wangconf@gmail.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

Acked-by: Cong Wang <cwang@twopensource.com>

^ permalink raw reply

* Re: [net-next PATCH v1 1/3] net: sched: af_packet support for direct ring access
From: Hannes Frederic Sowa @ 2014-10-06 17:03 UTC (permalink / raw)
  To: John Fastabend
  Cc: Daniel Borkmann, John Fastabend, Jesper Dangaard Brouer,
	John W. Linville, Neil Horman, Florian Westphal, gerlitz.or,
	netdev, john.ronciak, amirv, eric.dumazet, danny.zhou
In-Reply-To: <5432AEE0.9000600@intel.com>

Hi John,

On Mo, 2014-10-06 at 08:01 -0700, John Fastabend wrote:
> On 10/06/2014 02:49 AM, Daniel Borkmann wrote:
> > Hi John,
> > 
> > On 10/06/2014 03:12 AM, John Fastabend wrote:
> >> On 10/05/2014 05:29 PM, Florian Westphal wrote:
> >>> John Fastabend <john.fastabend@gmail.com> wrote:
> >>>> There is one critical difference when running with these interfaces
> >>>> vs running without them. In the normal case the af_packet module
> >>>> uses a standard descriptor format exported by the af_packet user
> >>>> space headers. In this model because we are working directly with
> >>>> driver queues the descriptor format maps to the descriptor format
> >>>> used by the device. User space applications can learn device
> >>>> information from the socket option PACKET_DEV_DESC_INFO which
> >>>> should provide enough details to extrapolate the descriptor formats.
> >>>> Although this adds some complexity to user space it removes the
> >>>> requirement to copy descriptor fields around.
> >>>
> >>> I find it very disappointing that we seem to have to expose such
> >>> hardware specific details to userspace via hw-independent interface.
> >>
> >> Well it was only for convenience if it doesn't fit as a socket
> >> option we can remove it. We can look up the device using the netdev
> >> name from the bind call. I see your point though so if there is
> >> consensus that this is not needed that is fine.
> >>
> >>> How big of a cost are we talking about when you say that it 'removes
> >>> the requirement to copy descriptor fields'?
> >>
> >> This was likely a poor description. If you want to let user space
> >> poll on the ring (without using system calls or interrupts) then
> >> I don't see how you can _not_ expose the ring directly complete with
> >> the vendor descriptor formats.
> > 
> > But how big is the concrete performance degradation you're seeing if you
> > use an e.g. `netmap-alike` Linux-own variant as a hw-neutral interface
> > that does *not* directly expose hw descriptor formats to user space?
> 
> If we don't directly expose the hardware descriptor formats then we
> need to somehow kick the driver when we want it to do the copy from
> the driver descriptor format to the common descriptor format.
>
> This requires a system call as far as I can tell. Which has unwanted
> overhead. I can micro-benchmark this if it's helpful. But if we dredge
> up Jesper's slides here we are really counting cycles so even small
> numbers count if we want to hit line rate in a user space application
> with 40Gbps hardware.

I agree, it seems pretty hard to achieve non-syscall sending on the same
core, as we somehow must transfer control over to the kernel without
doing a syscall.

The only other idea would be to export machine code up to user space,
which you can mmap(MAP_EXEC) from the socket somehow to make this API
truly NIC agnostic without recompiling. This code then would transform
the generic descriptors to the hardware specific ones. Seems also pretty
hairy to do that correctly, maybe.

> > With 1 core netmap does 10G line-rate on 64b; I don't know their numbers
> > on 40G when run on decent hardware though.
> > 
> > It would really be great if we have something vendor neutral exposed as
> > a stable ABI and could leverage emerging infrastructure we already have
> > in the kernel such as eBPF and recent qdisc batching for raw sockets
> > instead of reinventing the wheels. (Don't get me wrong, I would love to
> > see AF_PACKET improved ...)
> 
> I don't think the interface is vendor specific. It does require some
> knowledge of the hardware descriptor layout though. It is though vendor
> neutral from my point of view. I provided the ixgbe patch simply because
> I'm most familiar with it and have a NIC here. If someone wants to send me
> a Mellanox NIC I can give it a try although I was hoping to recruit Or or
> Amir? The only hardware feature required is flow classification to queues
> which seems to be common across 10Gbps and 40/100Gbps devices. So most
> of the drivers should be able to support this.

Does flow classification work at the same level as registering network
addresses? Do I have to bind e.g. a multicast address via ip maddr and
then set up flow director/ntuple to get the packets on the correct user
space facing queue or is it in case of the ixgbe card enough to just add
those addresses via fdir? Have you thought about letting the
kernel/driver handle that? In case one would like to connect their
virtual machines via this interface to the network maybe we need central
policing and resource constraints for queue management here?

Do other drivers need a separate af-packet managed way to bind addresses
to the queue? Maybe there are other quirks we might need to add to
actually build support for other network interface cards. Would be great
to at least examine one other driver in regard to this.
        
What other properties of the NIC must be exported? I think we also have
to deal with MTUs currently configured in the NIC, promisc mode and
maybe TSO?

> If you're worried driver writers will implement the interface but not make
> their descriptor formats easily available I considered putting the layout
> in a header file in the uapi somewhere. Then we could just reject any
> implementation that doesn't include the header file needed to use it
> from user space.
> 
> With regards to leveraging eBPF and qdisc batching I don't see how this
> works with direct DMA and polling. Needed to give the lowest overhead
> between kernel and user space. In this case we want to use the hardware
> to do the filtering that would normally be done for eBPF and for many
> use cases the hardware flow classifiers are sufficient.

I agree, those features are hard to connect.

> We already added a qdisc bypass option I see this as taking this path
> further. I believe there is room for a continuum here. For basic cases
> use af_packet v1,v2 for mmap rings but using common descriptors use
> af_packet v3 and set QOS_BYPASS. For absolute lowest overhead and
> specific applications that don't need QOS, eBPF use this interface.

You can simply write C code instead of eBPF code, yes.

I find the six additional ndo ops a bit worrisome as we are adding more
and more subsystem specific ndo ops to this struct. I would like to see
some unification here, but currently cannot make concrete proposals,
sorry.

Patch 2/3 does not yet expose hw ring descriptors in uapi headers it
seems?

Are there plans to push a user space framework (maybe even into the
kernel), too? Will this be dpdk (alike) in the end?

Bye,
Hannes

^ permalink raw reply

* [PATCH net] bna: allow transmit tagged frames
From: Ivan Vecera @ 2014-10-06 17:02 UTC (permalink / raw)
  To: netdev; +Cc: Rasesh Mody

When Tx VLAN offloading is disabled frames with size ~ MTU are not
transmitted as the driver does not account for 4 bytes of VLAN header added
by stack. It should use VLAN_ETH_HLEN instead of ETH_HLEN.

The second problem is with newer BNA chips (BNA 1860). These chips filter
out any VLAN tagged frames in Tx path. This is a problem when Tx VLAN
offloading is disabled and frames are tagged by stack. Older chips like
1010/1020 are not affected as they probably don't do such filtering.

Cc: Rasesh Mody <rasesh.mody@qlogic.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
---
 drivers/net/ethernet/brocade/bna/bna_tx_rx.c | 2 +-
 drivers/net/ethernet/brocade/bna/bnad.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/brocade/bna/bna_tx_rx.c b/drivers/net/ethernet/brocade/bna/bna_tx_rx.c
index 85e6354..9949bd9 100644
--- a/drivers/net/ethernet/brocade/bna/bna_tx_rx.c
+++ b/drivers/net/ethernet/brocade/bna/bna_tx_rx.c
@@ -3410,7 +3410,7 @@ bna_bfi_tx_enet_start(struct bna_tx *tx)
 
 	cfg_req->tx_cfg.vlan_mode = BFI_ENET_TX_VLAN_WI;
 	cfg_req->tx_cfg.vlan_id = htons((u16)tx->txf_vlan_id);
-	cfg_req->tx_cfg.admit_tagged_frame = BNA_STATUS_T_DISABLED;
+	cfg_req->tx_cfg.admit_tagged_frame = BNA_STATUS_T_ENABLED;
 	cfg_req->tx_cfg.apply_vlan_filter = BNA_STATUS_T_DISABLED;
 
 	bfa_msgq_cmd_set(&tx->msgq_cmd, NULL, NULL,
diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c
index ffc92a4..153cafa 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.c
+++ b/drivers/net/ethernet/brocade/bna/bnad.c
@@ -2864,7 +2864,7 @@ bnad_txq_wi_prepare(struct bnad *bnad, struct bna_tcb *tcb,
 		txqent->hdr.wi.opcode =	htons(BNA_TXQ_WI_SEND);
 		txqent->hdr.wi.lso_mss = 0;
 
-		if (unlikely(skb->len > (bnad->netdev->mtu + ETH_HLEN))) {
+		if (unlikely(skb->len > (bnad->netdev->mtu + VLAN_ETH_HLEN))) {
 			BNAD_UPDATE_CTR(bnad, tx_skb_non_tso_too_long);
 			return -EINVAL;
 		}
-- 
2.0.4

^ permalink raw reply related

* Re: [iproute2 1/1] RFC: obsolete direct invocation of police
From: Stephen Hemminger @ 2014-10-06 17:01 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: netdev, xiyou.wangcong, john.r.fastabend
In-Reply-To: <1412595960-7862-1-git-send-email-jhs@emojatatu.com>

On Mon,  6 Oct 2014 07:46:00 -0400
Jamal Hadi Salim <jhs@mojatatu.com> wrote:

> From: Jamal Hadi Salim <jhs@mojatatu.com>
> 
> I realize this may be controversial. I don't think people are still using
> the 1990s syntax anymore. This is one way to test it.
> We want to eventually phase out the kernel hacks for backward compat.
> 
> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>

I think iproute utilities needs to accept the old syntax and warn about
deprecated syntax use. Later (like 2yr +) the code can be removed.

The old syntax can be removed from all documentation and help messages
now though.

^ permalink raw reply

* Re: [net-next PATCH v1 2/3] net: sched: cls_cgroup tear down exts and ematch from rcu callback
From: Cong Wang @ 2014-10-06 17:01 UTC (permalink / raw)
  To: John Fastabend
  Cc: Cong Wang, David Miller, netdev, Jamal Hadi Salim, Eric Dumazet
In-Reply-To: <20141006042819.6010.32857.stgit@nitbit.x32>

On Sun, Oct 5, 2014 at 9:28 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> It is not RCU safe to destroy the action chain while there
> is a possibility of readers accessing it. Move this code
> into the rcu callback using the same rcu callback used in the
> code path to make a change to head.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

Acked-by: Cong Wang <cwang@twopensource.com>

^ permalink raw reply

* Re: [Patch net-next] net_sched: refactor out tcf_exts
From: Cong Wang @ 2014-10-06 16:56 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: Cong Wang, netdev, John Fastabend, David S. Miller
In-Reply-To: <54327B09.9070204@mojatatu.com>

On Mon, Oct 6, 2014 at 4:20 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> Sorry for being annoying.
> Cong - here's a simple test that will test for both .police and .action
> for one classifier. If you have time please run it when updating your
> patch.
>

Sure. Thanks for sharing it!

^ permalink raw reply

* Re: [net-next PATCH v1 1/3] net: sched: af_packet support for direct ring access
From: Stephen Hemminger @ 2014-10-06 16:55 UTC (permalink / raw)
  To: John Fastabend
  Cc: dborkman, fw, gerlitz.or, hannes, netdev, john.ronciak, amirv,
	eric.dumazet, danny.zhou
In-Reply-To: <20141006000629.32055.2295.stgit@nitbit.x32>

On Sun, 05 Oct 2014 17:06:31 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> This patch adds a net_device ops to split off a set of driver queues
> from the driver and map the queues into user space via mmap. This
> allows the queues to be directly manipulated from user space. For
> raw packet interface this removes any overhead from the kernel network
> stack.
> 
> Typically in an af_packet interface a packet_type handler is
> registered and used to filter traffic to the socket and do other
> things such as fan out traffic to multiple sockets. In this case the
> networking stack is being bypassed so this code is not run. So the
> hardware must push the correct traffic to the queues obtained from
> the ndo callback ndo_split_queue_pairs().
> 
> Fortunately there is already a flow classification interface which
> is part of the ethtool command set, ETHTOOL_SRXCLSRLINS. It is
> currently supported by multiple drivers including sfc, mlx4, niu,
> ixgbe, and i40e. Supporting some way to steer traffic to a queue
> is the _only_ hardware requirement to support the interface, plus
> the driver needs to implement the correct ndo ops. A follow on
> patch adds support for ixgbe but we expect at least the subset of
> drivers implementing ETHTOOL_SRXCLSRLINS to be implemented later.
> 
> The interface is driven over an af_packet socket which we believe
> is the most natural interface to use. Because it is already used
> for raw packet interfaces which is what we are providing here.
>  The high level flow for this interface looks like:
> 
> 	bind(fd, &sockaddr, sizeof(sockaddr));
> 
> 	/* Get the device type and info */
> 	getsockopt(fd, SOL_PACKET, PACKET_DEV_DESC_INFO, &def_info,
> 		   &optlen);
> 
> 	/* With device info we can look up descriptor format */
> 
> 	/* Get the layout of ring space offset, page_sz, cnt */
> 	getsockopt(fd, SOL_PACKET, PACKET_DEV_QPAIR_MAP_REGION_INFO,
> 		   &info, &optlen);
> 
> 	/* request some queues from the driver */
> 	setsockopt(fd, SOL_PACKET, PACKET_RXTX_QPAIRS_SPLIT,
> 		   &qpairs_info, sizeof(qpairs_info));
> 
> 	/* if we let the driver pick us queues learn which queues
>          * we were given
>          */
> 	getsockopt(fd, SOL_PACKET, PACKET_RXTX_QPAIRS_SPLIT,
> 		   &qpairs_info, sizeof(qpairs_info));
> 
> 	/* And mmap queue pairs to user space */
> 	mmap(NULL, info.tp_dev_bar_sz, PROT_READ | PROT_WRITE,
> 	     MAP_SHARED, fd, 0);
> 
> 	/* Now we have some user space queues to read/write to*/
> 
> There is one critical difference when running with these interfaces
> vs running without them. In the normal case the af_packet module
> uses a standard descriptor format exported by the af_packet user
> space headers. In this model because we are working directly with
> driver queues the descriptor format maps to the descriptor format
> used by the device. User space applications can learn device
> information from the socket option PACKET_DEV_DESC_INFO which
> should provide enough details to extrapolate the descriptor formats.
> Although this adds some complexity to user space it removes the
> requirement to copy descriptor fields around.
> 
> The formats are usually provided by the device vendor documentation.
> If folks want I can provide a follow up patch to provide the formats
> in a .h file in ./include/uapi/linux/ for ease of use. I have access
> to formats for ixgbe and mlx drivers other driver owners would need to
> provide their formats.
> 
> We tested this interface using traffic generators and doing basic
> L2 forwarding tests on ixgbe devices. Our tests use a set of patches
> to DPDK to enable an interface using this socket interface. With
> this interface we can xmit/receive @ line rate from a test user space
> application on a single core.
> 
> Additionally we have a set of DPDK patches to enable DPDK with this
> interface. DPDK can be downloaded @ dpdk.org although as I hope is
> clear from above DPDK is just our particular test environment we
> expect other libraries could be built on this interface.
> 
> Signed-off-by: Danny Zhou <danny.zhou@intel.com>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

I like the ability to share a device between kernel and user mode networking.
The model used for DPDK for this is really ugly and fragile/broken.
Your proposal assumes that you fully trust the user mode networking application
which is not a generally safe assumption.

A device can DMA from/to any arbitrary physical memory. 
And it would be hard to use IOMMU to protect because the
IOMMU doesn't know the difference between the application's queue and
the rest of the queues.

At least with DPDK you can use VFIO, and you are claiming the whole device to
allow protection against random memory being read/written.

^ permalink raw reply

* Re: [Patch net-next] net_sched: refactor out tcf_exts
From: Cong Wang @ 2014-10-06 16:56 UTC (permalink / raw)
  To: John Fastabend
  Cc: Cong Wang, netdev, Jamal Hadi Salim, John Fastabend,
	David S. Miller
In-Reply-To: <543206B4.7090504@gmail.com>

On Sun, Oct 5, 2014 at 8:04 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> On 10/05/2014 06:47 PM, John Fastabend wrote:
>>
>> On 10/03/2014 03:51 PM, Cong Wang wrote:
>>>
>>> As Jamal pointed it out, tcf_exts is really unnecessary,
>>> we can definitely refactor it out without losing any functionality.
>>> This could also remove an indirect layer which makes the code
>>> much easier to read.
>>>
>>> This patch:
>>>
>>> 1) moves exts->action and exts->police into tp->ops, since they
>>> are statically assigned
>>>
>>> 2) moves exts->actions list head out
>>>
>>> 3) removes exts->type, act->type does the same thing
>>>
>>> 4) renames tcf_exts_*() functions to tcf_act_*()
>>>
>>> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
>>> Cc: John Fastabend <john.r.fastabend@intel.com>
>>> Cc: "David S. Miller" <davem@davemloft.net>
>>> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
>>> ---
>>
>>
>> Looks OK to me and removes a layer of abstraction without changing
>> the code much. This is going to conflict with my series so I'll hold
>> off resubmitting it until this is dealt with. I need to respin that
>> ematch fix up to drop the ingress lock.
>>
>> Acked-by: John Fastabend <john.r.fastabend@intel.com>
>>
>> [...]
>>
>
> But after running my test kit I see a null pointer dereference
> in cls_cgroup in tcf_act_change().
>
> Looks like you dropped an initializer...

Oops, yeah, should have an INIT_LIST_HEAD()...

I will send an update.

Thanks!

^ permalink raw reply

* Re: [net-next PATCH v1 1/3] net: sched: remove tcf_proto from ematch calls
From: Cong Wang @ 2014-10-06 16:52 UTC (permalink / raw)
  To: John Fastabend
  Cc: Cong Wang, David Miller, netdev, Jamal Hadi Salim, Eric Dumazet
In-Reply-To: <20141006042751.6010.86647.stgit@nitbit.x32>

On Sun, Oct 5, 2014 at 9:27 PM, John Fastabend <john.fastabend@gmail.com> wrote:
> This removes the tcf_proto argument from the ematch code paths that
> only need it to reference the net namespace. This allows simplifying
> qdisc code paths especially when we need to tear down the ematch
> from an RCU callback. In this case we can not guarantee that the
> tcf_proto structure is still valid.
>
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

The code looks cleaner now. :)

Acked-by: Cong Wang <cwang@twopensource.com>

^ permalink raw reply

* Re: randconfig build error with next-20141001, in drivers/i2c/algos/i2c-algo-bit.c
From: Randy Dunlap @ 2014-10-06 16:52 UTC (permalink / raw)
  To: Oliver Hartkopp, Jim Davis, Stephen Rothwell
  Cc: linux-next, Stephane Grosjean, linux-i2c, netdev@vger.kernel.org,
	linux-can
In-Reply-To: <54324D68.6070309@hartkopp.net>

On 10/06/14 01:06, Oliver Hartkopp wrote:
> Hello all,
> 
> just to get it right:
> 
> So far it looks like this in linux/drivers/net/can/sja1000/Kconfig
> 
> config CAN_PEAK_PCIEC
>         bool "PEAK PCAN-ExpressCard Cards"
>         depends on CAN_PEAK_PCI
>         select I2C
>         select I2C_ALGOBIT
> 
> If one would change the
> 
>         select I2C
> 
> into
> 
>         depends on I2C
> 
> IMHO the CAN_PEAK_PCIEC hardware would *only* be visible and selectable when
> I2C was selected before (from anyone else?).

That is correct.

> So what is wrong on the current Kconfig entry?
> Is 'select' deprecated?

No, it's not deprecated.  It's just dangerous, and driver configs should not
enable entire subsystems via 'select'.

> Or did randconfig generate a configuration that would not be possible by
> properly generating the config file with 'make menuconfig' ??

randconfig generated a config for another driver which causes a build error,
not for a CAN driver.  The CAN driver does not have a build error AFAIK.
Its Kconfig is just doing something with a very big & ugly stick.


> Please explain.
> 
> Thanks,
> Oliver
> 
> On 10/02/2014 01:26 AM, Randy Dunlap wrote:
>> On 10/01/14 14:37, Jim Davis wrote:
>>> Building with the attached random configuration file,
>>
>> Also:
>> warning: (CAN_PEAK_PCIEC && SFC && IGB && VIDEO_TW68 && DRM && FB_DDC && FB_VIA) selects I2C_ALGOBIT which has unmet direct dependencies (I2C)
>>
>>> drivers/i2c/algos/i2c-algo-bit.c: In function ‘i2c_bit_add_bus’:
>>> drivers/i2c/algos/i2c-algo-bit.c:658:33: error: ‘i2c_add_adapter’
>>> undeclared (first use in this function)
>>>   return __i2c_bit_add_bus(adap, i2c_add_adapter);
>>>                                  ^
>>> drivers/i2c/algos/i2c-algo-bit.c:658:33: note: each undeclared
>>> identifier is reported only once for each function it appears in
>>> drivers/i2c/algos/i2c-algo-bit.c: In function ‘i2c_bit_add_numbered_bus’:
>>> drivers/i2c/algos/i2c-algo-bit.c:664:33: error:
>>> ‘i2c_add_numbered_adapter’ undeclared (first use in this function)
>>>   return __i2c_bit_add_bus(adap, i2c_add_numbered_adapter);
>>>                                  ^
>>>   CC      net/openvswitch/actions.o
>>> drivers/i2c/algos/i2c-algo-bit.c: In function ‘i2c_bit_add_bus’:
>>> drivers/i2c/algos/i2c-algo-bit.c:659:1: warning: control reaches end of non-void
>>>  function [-Wreturn-type]
>>>  }
>>>  ^
>>> drivers/i2c/algos/i2c-algo-bit.c: In function ‘i2c_bit_add_numbered_bus’:
>>> drivers/i2c/algos/i2c-algo-bit.c:665:1: warning: control reaches end of non-void
>>>  function [-Wreturn-type]
>>>  }
>>>  ^
>>> make[3]: *** [drivers/i2c/algos/i2c-algo-bit.o] Error 1
>>
>> In drivers/media/pci/tw68/Kconfig, VIDEO_TW68 should depend on I2C in order
>> to make it safe to select I2C_ALGOBIT.
>>
>> In drivers/net/can/sja1000/Kconfig, CAN_PEAK_PCIEC should depend on I2C
>> instead of selecting I2C (and change the help text).


-- 
~Randy

^ permalink raw reply

* Re: [PATCH] net: bcmgenet: fix increase rx_read_ptr
From: Florian Fainelli @ 2014-10-06 16:45 UTC (permalink / raw)
  To: Jaedon Shin; +Cc: netdev
In-Reply-To: <1412564726-40192-1-git-send-email-jaedon.shin@gmail.com>

On 10/05/2014 08:05 PM, Jaedon Shin wrote:
> The rx_read_ptr must increase after using it.

Your commit message is too terse, you need to explain why you think the
current code is bad, and how your patch is fixing it.

One possible thing that I see is that we might be off by one in how we
use the enet_cb versus how we read the HW packet descriptor.

> 
> Signed-off-by: Jaedon Shin <jaedon.shin@gmail.com>
> ---
>  drivers/net/ethernet/broadcom/genet/bcmgenet.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> index 5cc9cae..b47db5e 100644
> --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> @@ -1282,9 +1282,6 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_priv *priv,
>  
>  		rxpktprocessed++;
>  
> -		priv->rx_read_ptr++;
> -		priv->rx_read_ptr &= (priv->num_rx_bds - 1);
> -
>  		/* We do not have a backing SKB, so we do not have a
>  		 * corresponding DMA mapping for this incoming packet since
>  		 * bcmgenet_rx_refill always either has both skb and mapping or
> @@ -1399,6 +1396,9 @@ refill:
>  		err = bcmgenet_rx_refill(priv, cb);
>  		if (err)
>  			netif_err(priv, rx_err, dev, "Rx refill failed\n");
> +
> +		priv->rx_read_ptr++;
> +		priv->rx_read_ptr &= (priv->num_rx_bds - 1);
>  	}
>  
>  	return rxpktprocessed;
> 

^ permalink raw reply

* Re: [net-next PATCH v1 1/3] net: sched: af_packet support for direct ring access
From: Jesper Dangaard Brouer @ 2014-10-06 16:35 UTC (permalink / raw)
  To: John Fastabend
  Cc: brouer, Daniel Borkmann, John Fastabend, John W. Linville,
	Neil Horman, Florian Westphal, gerlitz.or, hannes, netdev,
	john.ronciak, amirv, eric.dumazet, danny.zhou
In-Reply-To: <5432AEE0.9000600@intel.com>


On Mon, 06 Oct 2014 08:01:52 -0700 John Fastabend <john.r.fastabend@intel.com> wrote:
 
> This requires a system call as far as I can tell. Which has unwanted
> overhead. I can micro-benchmark this if it's helpful. But if we dredge
> up Jesper's slides here we are really counting cycles so even small
> numbers count if we want to hit line rate in a user space application
> with 40Gbps hardware.

The micro-benchmarked syscall[2] cost is approx 42 ns [1] (when
disabling CONFIG_AUDITSYSCALL else it's approx 88ns), which is
significant compared to the 10G wirespeed smallest packet size budget
of 67.2ns.

See:
 [1] http://netoptimizer.blogspot.dk/2014/05/the-calculations-10gbits-wirespeed.html
 [2] https://github.com/netoptimizer/network-testing/blob/master/src/syscall_overhead.c

[...] 
> We already added a qdisc bypass option I see this as taking this path
> further. I believe there is room for a continuum here. For basic cases
> use af_packet v1,v2 for mmap rings but using common descriptors use
> af_packet v3 and set QOS_BYPASS. For absolute lowest overhead and
> specific applications that don't need QOS, eBPF use this interface.

Well, after the qdisc bulking changes, when bulking kicks in then the
qdisc path is faster than the qdisc bypass (measured with trafgen).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH net-next] net/mlx4_en: remove NETDEV_TX_BUSY
From: Eric Dumazet @ 2014-10-06 16:30 UTC (permalink / raw)
  To: Amir Vadai
  Cc: David S. Miller, Eric Dumazet, netdev, Yevgeny Petrilin,
	Or Gerlitz, Ido Shamay
In-Reply-To: <1412599904.11091.66.camel@edumazet-glaptop2.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

Drivers should avoid NETDEV_TX_BUSY as much as possible.

They should stop the tx queue before qdisc even tries to push another
packet, to avoid requeues.

For a driver supporting skb->xmit_more, this is likely to be a prereq
anyway, otherwise we could have a tx deadlock : We need to force a
doorbell if TX ring is full.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c |   48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 92a7cf46d9af9260d601a0d65ec5bba4..44004e331e4e760f18cfa69ef99a2b2b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -706,6 +706,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	void *fragptr = NULL;
 	bool bounce = false;
 	bool send_doorbell;
+	bool stop_queue;
 	bool inline_ok;
 	u32 ring_cons;
 
@@ -735,30 +736,6 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (vlan_tx_tag_present(skb))
 		vlan_tag = vlan_tx_tag_get(skb);
 
-	/* Check available TXBBs And 2K spare for prefetch */
-	if (unlikely(((int)(ring->prod - ring_cons)) >
-		     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
-		/* every full Tx ring stops queue */
-		netif_tx_stop_queue(ring->tx_queue);
-		ring->queue_stopped++;
-
-		/* If queue was emptied after the if, and before the
-		 * stop_queue - need to wake the queue, or else it will remain
-		 * stopped forever.
-		 * Need a memory barrier to make sure ring->cons was not
-		 * updated before queue was stopped.
-		 */
-		wmb();
-
-		ring_cons = ACCESS_ONCE(ring->cons);
-		if (unlikely(((int)(ring->prod - ring_cons)) <=
-			     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
-			netif_tx_wake_queue(ring->tx_queue);
-			ring->wake_queue++;
-		} else {
-			return NETDEV_TX_BUSY;
-		}
-	}
 
 	prefetchw(&ring->tx_queue->dql);
 
@@ -929,6 +906,13 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	skb_tx_timestamp(skb);
 
+	/* Check available TXBBs And 2K spare for prefetch */
+	stop_queue = (int)(ring->prod - ring_cons) >
+		      ring->size - HEADROOM - MAX_DESC_TXBBS;
+	if (unlikely(stop_queue)) {
+		netif_tx_stop_queue(ring->tx_queue);
+		ring->queue_stopped++;
+	}
 	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
 
 	real_size = (real_size / 16) & 0x3f;
@@ -973,6 +957,22 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 
+	if (unlikely(stop_queue)) {
+		/* If queue was emptied after the if (stop_queue) , and before
+		 * the netif_tx_stop_queue() - need to wake the queue,
+		 * or else it will remain stopped forever.
+		 * Need a memory barrier to make sure ring->cons was not
+		 * updated before queue was stopped.
+		 */
+		smp_rmb();
+
+		ring_cons = ACCESS_ONCE(ring->cons);
+		if (unlikely(((int)(ring->prod - ring_cons)) <=
+			     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
+			netif_tx_wake_queue(ring->tx_queue);
+			ring->wake_queue++;
+		}
+	}
 	return NETDEV_TX_OK;
 
 tx_drop_unmap:

^ permalink raw reply related

* Re: [PATCH net-next 0/2] sunvnet: Packet processing in non-interrupt context.
From: Sowmini Varadhan @ 2014-10-06 16:04 UTC (permalink / raw)
  To: David Miller; +Cc: raghuram.kothakota, netdev
In-Reply-To: <20141003.120802.1213573830649867131.davem@davemloft.net>

> I think you should be able to get rid of all of the in-driver
> locking in the fast paths.
> 
> NAPI ->poll() is non-reentrant, so all RX processing occurs
> strictly in a serialized environment.
> 
> Once you do TX reclaim in NAPI context, then all you have to do is
> take the generic netdev TX queue lock during the evaluation of whether
> to wakeup the TX queue or not.  Worst case you need to hold the
> TX netdev queue lock across the whole TX reclaim operation.
> 
> The VIO lock really ought to be entirely superfluous in the data
> paths.

A few clarifications, since there are more driver-examples using NAPI for
Rx than for Tx reclaim

so I can move the LDC_EVENT_RESET/LDC_EVENT_UP processing code into the 
napi callback, and that enables the removal of irqsave/restore for
vio.lock from vio_port_up at the least (I do this conditional on
in_softirq() so as to not perturb vdc code at the moment)

But there are still a lot of irqsaves at the ldc layer for the lp lock.
I don't know if these can/should be optimized out.

I looked at tg3 for a template on how to use NAPI in the TX path
The analog of the tg3_poll_work->tg3_tx invocation is probably the
maybe_tx_wakeup call triggered from the Rx side vnet processing,
which, with NAPI happens naturally from softirq context (no need for
extra tasklet). 

Regarding rcu locking of port_list and the hash in struct vnet_port,
the thorn here is that vnet_set_rx_mode may end up allocating a
vnet_mcast_entry as part of __update_mc_list
(there may be a different bug here in that it assumes that the 
first entry is the switch_port, and this is the only switch_port)
I don't know of a simple way to avoid that (a rwlock just for this
function?!).

But we still need to hold the vio lock around the ldc_write 
(and also around dring write) in vnet_start_xmit, right?

--Sowmini

^ permalink raw reply

* Re: [Xen-devel] [PATCHv1] xen-netfront: always keep the Rx ring full of requests
From: David Vrabel @ 2014-10-06 16:00 UTC (permalink / raw)
  To: annie li; +Cc: netdev, xen-devel, Boris Ostrovsky
In-Reply-To: <5432B6D2.9030503@oracle.com>

On 06/10/14 16:35, annie li wrote:
> 
> On 2014/10/2 9:33, David Vrabel wrote:
>> A full Rx ring only requires 1 MiB of memory.  This is not enough
>> memory that it is useful to dynamically scale the number of Rx
>> requests in the ring based on traffic rates.
>>
>> Keeping the ring full of Rx requests handles bursty traffic better
>> than trying to converge on an optimal number of requests to keep
>> filled.
>>
>> On a 4 core host, an iperf -P 64 -t 60 run from dom0 to a 4 VCPU guest
>> improved from 5.1 Gbit/s to 5.6 Gbit/s.  Gains with more bursty
>> traffic are expected to be higher.
> 
> Although removing sysfs is connected with the code change for full Rx
> ring utilization, I assume it is better to split this patch into two to
> make it simpler?

I don't see how splitting the patch would be an improvement.

>>   +    queue->rx.req_prod_pvt = req_prod;
>> +
>> +    /* Not enough requests? Try again later. */
>> +    if (req_prod - queue->rx.rsp_cons < NET_RX_SLOTS_MIN) {
>> +        mod_timer(&queue->rx_refill_timer, jiffies + (HZ/10));
>> +        return;
> 
> If the previous for loop breaks because of failure of
> xennet_alloc_one_rx_buffer, then notify_remote_via_irq is missed here if
> the code returns directly.

This is deliberate -- there's no point notifying the backend if there
aren't enough requests for the next packet.  Since we don't know what
the next packet might be we assume it's the largest possible.

David

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox