Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/2] ipv4: Remember FIB alias list head and table in lookup results.
From: David Miller @ 2011-02-01  0:25 UTC (permalink / raw)
  To: netdev


This will be used later to implement fib_select_default() in a
completely generic manner, instead of the current situation where the
default route is re-looked up in the TRIE/HASH table and then the
available aliases are analyzed.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |    3 +++
 net/ipv4/fib_hash.c      |    2 +-
 net/ipv4/fib_lookup.h    |    2 +-
 net/ipv4/fib_semantics.c |    7 +++++--
 net/ipv4/fib_trie.c      |    8 ++++----
 5 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 2c0508a..f5199b0 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -96,12 +96,15 @@ struct fib_info {
 struct fib_rule;
 #endif
 
+struct fib_table;
 struct fib_result {
 	unsigned char	prefixlen;
 	unsigned char	nh_sel;
 	unsigned char	type;
 	unsigned char	scope;
 	struct fib_info *fi;
+	struct fib_table *table;
+	struct list_head *fa_head;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_rule	*r;
 #endif
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b3acb04..0a88866 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -288,7 +288,7 @@ int fib_table_lookup(struct fib_table *tb,
 				if (f->fn_key != k)
 					continue;
 
-				err = fib_semantic_match(&f->fn_alias,
+				err = fib_semantic_match(tb, &f->fn_alias,
 						 flp, res,
 						 fz->fz_order, fib_flags);
 				if (err <= 0)
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0..d5c40d8 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -25,7 +25,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
 }
 
 /* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
+extern int fib_semantic_match(struct fib_table *tb, struct list_head *head,
 			      const struct flowi *flp,
 			      struct fib_result *res, int prefixlen, int fib_flags);
 extern void fib_release_info(struct fib_info *);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 48e93a5..1bf6fb9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -889,8 +889,9 @@ failure:
 }
 
 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
-		       struct fib_result *res, int prefixlen, int fib_flags)
+int fib_semantic_match(struct fib_table *tb, struct list_head *head,
+		       const struct flowi *flp, struct fib_result *res,
+		       int prefixlen, int fib_flags)
 {
 	struct fib_alias *fa;
 	int nh_sel = 0;
@@ -954,6 +955,8 @@ out_fill_res:
 	res->type = fa->fa_type;
 	res->scope = fa->fa_scope;
 	res->fi = fa->fa_info;
+	res->table = tb;
+	res->fa_head = head;
 	if (!(fib_flags & FIB_LOOKUP_NOREF))
 		atomic_inc(&res->fi->fib_clntref);
 	return 0;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f28034..8cee5c8 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1340,7 +1340,7 @@ err:
 }
 
 /* should be called with rcu_read_lock */
-static int check_leaf(struct trie *t, struct leaf *l,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
 		      t_key key,  const struct flowi *flp,
 		      struct fib_result *res, int fib_flags)
 {
@@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
 		if (l->key != (key & ntohl(mask)))
 			continue;
 
-		err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
+		err = fib_semantic_match(tb, &li->falh, flp, res, plen, fib_flags);
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 		if (err <= 0)
@@ -1398,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 
 	/* Just a leaf? */
 	if (IS_LEAF(n)) {
-		ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
 		goto found;
 	}
 
@@ -1423,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 		}
 
 		if (IS_LEAF(n)) {
-			ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
 			if (ret > 0)
 				goto backtrace;
 			goto found;
-- 
1.7.4


^ permalink raw reply related

* [PATCH 2/2] ipv4: Consolidate all default route selection implementations.
From: David Miller @ 2011-02-01  0:25 UTC (permalink / raw)
  To: netdev


Both fib_trie and fib_hash have a local implementation of
fib_table_select_default().  This is completely unnecessary
code duplication.

Since we now remember the fib_table and the head of the fib
alias list of the default route, we can implement one single
generic version of this routine.

Looking at the fib_hash implementation you may get the impression
that it's possible for there to be multiple top-level routes in
the table for the default route.  The truth is, it isn't: the
insert code will only allow one entry to exist in the zero
prefix hash table, because all keys evaluate to zero and all
keys in a hash table must be unique.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |    6 +---
 net/ipv4/fib_frontend.c  |   15 ---------
 net/ipv4/fib_hash.c      |   72 --------------------------------------------
 net/ipv4/fib_semantics.c |   56 ++++++++++++++++++++++++++++++++++
 net/ipv4/fib_trie.c      |   74 ----------------------------------------------
 net/ipv4/route.c         |    2 +-
 6 files changed, 58 insertions(+), 167 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f5199b0..819d61c 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -158,9 +158,6 @@ extern int fib_table_delete(struct fib_table *, struct fib_config *);
 extern int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
 			  struct netlink_callback *cb);
 extern int fib_table_flush(struct fib_table *table);
-extern void fib_table_select_default(struct fib_table *table,
-				     const struct flowi *flp,
-				     struct fib_result *res);
 extern void fib_free_table(struct fib_table *tb);
 
 
@@ -221,8 +218,7 @@ extern void		ip_fib_init(void);
 extern int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			       struct net_device *dev, __be32 *spec_dst,
 			       u32 *itag, u32 mark);
-extern void fib_select_default(struct net *net, const struct flowi *flp,
-			       struct fib_result *res);
+extern void fib_select_default(struct fib_result *res);
 
 /* Exported by fib_semantics.c */
 extern int ip_fib_check_default(__be32 gw, struct net_device *dev);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd4..930768b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
-void fib_select_default(struct net *net,
-			const struct flowi *flp, struct fib_result *res)
-{
-	struct fib_table *tb;
-	int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
-		return;
-	table = res->r->table;
-#endif
-	tb = fib_get_table(net, table);
-	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-		fib_table_select_default(tb, flp, res);
-}
-
 static void fib_flush(struct net *net)
 {
 	int flushed = 0;
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 0a88866..fadb602 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -302,78 +302,6 @@ out:
 	return err;
 }
 
-void fib_table_select_default(struct fib_table *tb,
-			      const struct flowi *flp, struct fib_result *res)
-{
-	int order, last_idx;
-	struct hlist_node *node;
-	struct fib_node *f;
-	struct fib_info *fi = NULL;
-	struct fib_info *last_resort;
-	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-	struct fn_zone *fz = t->fn_zones[0];
-	struct hlist_head *head;
-
-	if (fz == NULL)
-		return;
-
-	last_idx = -1;
-	last_resort = NULL;
-	order = -1;
-
-	rcu_read_lock();
-	head = rcu_dereference(fz->fz_hash);
-	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-		struct fib_alias *fa;
-
-		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
-			struct fib_info *next_fi = fa->fa_info;
-
-			if (fa->fa_scope != res->scope ||
-			    fa->fa_type != RTN_UNICAST)
-				continue;
-
-			if (next_fi->fib_priority > res->fi->fib_priority)
-				break;
-			if (!next_fi->fib_nh[0].nh_gw ||
-			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
-				continue;
-
-			fib_alias_accessed(fa);
-
-			if (fi == NULL) {
-				if (next_fi != res->fi)
-					break;
-			} else if (!fib_detect_death(fi, order, &last_resort,
-						&last_idx, tb->tb_default)) {
-				fib_result_assign(res, fi);
-				tb->tb_default = order;
-				goto out;
-			}
-			fi = next_fi;
-			order++;
-		}
-	}
-
-	if (order <= 0 || fi == NULL) {
-		tb->tb_default = -1;
-		goto out;
-	}
-
-	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-				tb->tb_default)) {
-		fib_result_assign(res, fi);
-		tb->tb_default = order;
-		goto out;
-	}
-
-	if (last_idx >= 0)
-		fib_result_assign(res, last_resort);
-	tb->tb_default = last_idx;
-out:
-	rcu_read_unlock();
-}
-
 /* Insert node F to FZ. */
 static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
 {
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1bf6fb9..b15857d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1136,6 +1136,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 	return ret;
 }
 
+/* Caller must hold rcu_read_lock(); it is still held on return.  */
+void fib_select_default(struct fib_result *res)
+{
+	struct fib_info *fi = NULL, *last_resort = NULL;
+	struct list_head *fa_head = res->fa_head;
+	struct fib_table *tb = res->table;
+	int order = -1, last_idx = -1;
+	struct fib_alias *fa;
+
+	list_for_each_entry_rcu(fa, fa_head, fa_list) {
+		struct fib_info *next_fi = fa->fa_info;
+
+		if (fa->fa_scope != res->scope ||
+		    fa->fa_type != RTN_UNICAST)
+			continue;
+
+		if (next_fi->fib_priority > res->fi->fib_priority)
+			break;
+		if (!next_fi->fib_nh[0].nh_gw ||
+		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+			continue;
+
+		fib_alias_accessed(fa);
+
+		if (fi == NULL) {
+			if (next_fi != res->fi)
+				break;
+		} else if (!fib_detect_death(fi, order, &last_resort,
+					     &last_idx, tb->tb_default)) {
+			fib_result_assign(res, fi);
+			tb->tb_default = order;
+			goto out;
+		}
+		fi = next_fi;
+		order++;
+	}
+
+	if (order <= 0 || fi == NULL) {
+		tb->tb_default = -1;
+		goto out;
+	}
+
+	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+				tb->tb_default)) {
+		fib_result_assign(res, fi);
+		tb->tb_default = order;
+		goto out;
+	}
+
+	if (last_idx >= 0)
+		fib_result_assign(res, last_resort);
+	tb->tb_default = last_idx;
+out:
+	return;
+}
+
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 /*
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 8cee5c8..16d589c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1802,80 +1802,6 @@ void fib_free_table(struct fib_table *tb)
 	kfree(tb);
 }
 
-void fib_table_select_default(struct fib_table *tb,
-			      const struct flowi *flp,
-			      struct fib_result *res)
-{
-	struct trie *t = (struct trie *) tb->tb_data;
-	int order, last_idx;
-	struct fib_info *fi = NULL;
-	struct fib_info *last_resort;
-	struct fib_alias *fa = NULL;
-	struct list_head *fa_head;
-	struct leaf *l;
-
-	last_idx = -1;
-	last_resort = NULL;
-	order = -1;
-
-	rcu_read_lock();
-
-	l = fib_find_node(t, 0);
-	if (!l)
-		goto out;
-
-	fa_head = get_fa_head(l, 0);
-	if (!fa_head)
-		goto out;
-
-	if (list_empty(fa_head))
-		goto out;
-
-	list_for_each_entry_rcu(fa, fa_head, fa_list) {
-		struct fib_info *next_fi = fa->fa_info;
-
-		if (fa->fa_scope != res->scope ||
-		    fa->fa_type != RTN_UNICAST)
-			continue;
-
-		if (next_fi->fib_priority > res->fi->fib_priority)
-			break;
-		if (!next_fi->fib_nh[0].nh_gw ||
-		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
-			continue;
-
-		fib_alias_accessed(fa);
-
-		if (fi == NULL) {
-			if (next_fi != res->fi)
-				break;
-		} else if (!fib_detect_death(fi, order, &last_resort,
-					     &last_idx, tb->tb_default)) {
-			fib_result_assign(res, fi);
-			tb->tb_default = order;
-			goto out;
-		}
-		fi = next_fi;
-		order++;
-	}
-	if (order <= 0 || fi == NULL) {
-		tb->tb_default = -1;
-		goto out;
-	}
-
-	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-				tb->tb_default)) {
-		fib_result_assign(res, fi);
-		tb->tb_default = order;
-		goto out;
-	}
-	if (last_idx >= 0)
-		fib_result_assign(res, last_resort);
-	tb->tb_default = last_idx;
-out:
-	rcu_read_unlock();
-}
-
 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
 			   struct fib_table *tb,
 			   struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b1e5d3a..242a3de 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2711,7 +2711,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 	else
 #endif
 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
-		fib_select_default(net, &fl, &res);
+		fib_select_default(&res);
 
 	if (!fl.fl4_src)
 		fl.fl4_src = FIB_RES_PREFSRC(res);
-- 
1.7.4


^ permalink raw reply related

* Re: [GIT PULL nf-next-2.6] IPVS build fixes and clean-ups
From: Randy Dunlap @ 2011-02-01  0:50 UTC (permalink / raw)
  To: Simon Horman
  Cc: netdev, linux-next, linux-kernel, lvs-devel, Randy Dunlap,
	Stephen Rothwell, Hans Schillstrom, Patrick McHardy
In-Reply-To: <1296519255-10602-1-git-send-email-horms@verge.net.au>

On Tue,  1 Feb 2011 11:14:11 +1100 Simon Horman wrote:

> Hi,
> 
> This short patch series addresses two linux-next build problems
> raised by Randy Dunlap:
> 
> * net/netfilter/ipvs/ip_vs_core.c:1891: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'unsigned int'
> * ERROR: "unregister_net_sysctl_table" [net/netfilter/ipvs/ip_vs.ko]
>   ERROR: "register_net_sysctl_table" [net/netfilter/ipvs/ip_vs.ko] undefined!
> 
> The remainder of the changeset is cleanups that I noticed along the way.

These 4 patches build successfully for me.
However, I do see these warnings (sorry I missed them earlier):

WARNING: net/netfilter/ipvs/ip_vs.o(.init.text+0x161): Section mismatch in reference from the function init_module() to the function .exit.text:ip_vs_sync_cleanup()
WARNING: net/netfilter/ipvs/ip_vs.o(.init.text+0x161): Section mismatch in reference from the function init_module() to the function .exit.text:ip_vs_sync_cleanup()


Thanks for the patch series.

> The changes are available at
> git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-test-2.6.git master
> 
> They are currently compile-tested only.
> 
>  include/net/ip_vs.h              |    2 --
>  net/netfilter/ipvs/ip_vs_core.c  |    2 +-
>  net/netfilter/ipvs/ip_vs_ctl.c   |   17 +++++++++--------
>  net/netfilter/ipvs/ip_vs_lblc.c  |   20 ++++++++++----------
>  net/netfilter/ipvs/ip_vs_lblcr.c |   20 ++++++++++----------
>  5 files changed, 30 insertions(+), 31 deletions(-)


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply

* Re: Network performance with small packets
From: Sridhar Samudrala @ 2011-02-01  1:30 UTC (permalink / raw)
  To: Steve Dobbelstein; +Cc: Michael S. Tsirkin, David Miller, kvm, mashirle, netdev
In-Reply-To: <OF05820FED.7465B77B-ON8625782A.0001B247-8625782A.00023FA7@us.ibm.com>

On Mon, 2011-01-31 at 18:24 -0600, Steve Dobbelstein wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 01/28/2011 06:16:16 AM:
> 
> > OK, so thinking about it more, maybe the issue is this:
> > tx becomes full. We process one request and interrupt the guest,
> > then it adds one request and the queue is full again.
> >
> > Maybe the following will help it stabilize?
> > By itself it does nothing, but if you set
> > all the parameters to a huge value we will
> > only interrupt when we see an empty ring.
> > Which might be too much: pls try other values
> > in the middle: e.g. make bufs half the ring,
> > or bytes some small value, or packets some
> > small value etc.
> >
> > Warning: completely untested.
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index aac05bc..6769cdc 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -32,6 +32,13 @@
> >   * Using this limit prevents one virtqueue from starving others. */
> >  #define VHOST_NET_WEIGHT 0x80000
> >
> > +int tx_bytes_coalesce = 0;
> > +module_param(tx_bytes_coalesce, int, 0644);
> > +int tx_bufs_coalesce = 0;
> > +module_param(tx_bufs_coalesce, int, 0644);
> > +int tx_packets_coalesce = 0;
> > +module_param(tx_packets_coalesce, int, 0644);
> > +
> >  enum {
> >     VHOST_NET_VQ_RX = 0,
> >     VHOST_NET_VQ_TX = 1,
> > @@ -127,6 +134,9 @@ static void handle_tx(struct vhost_net *net)
> >     int err, wmem;
> >     size_t hdr_size;
> >     struct socket *sock;
> > +   int bytes_coalesced = 0;
> > +   int bufs_coalesced = 0;
> > +   int packets_coalesced = 0;
> >
> >     /* TODO: check that we are running from vhost_worker? */
> >     sock = rcu_dereference_check(vq->private_data, 1);
> > @@ -196,14 +206,26 @@ static void handle_tx(struct vhost_net *net)
> >        if (err != len)
> >           pr_debug("Truncated TX packet: "
> >               " len %d != %zd\n", err, len);
> > -      vhost_add_used_and_signal(&net->dev, vq, head, 0);
> >        total_len += len;
> > +      packets_coalesced += 1;
> > +      bytes_coalesced += len;
> > +      bufs_coalesced += in;
> 
> Should this instead be:
>       bufs_coalesced += out;
> 
> Perusing the code I see that earlier there is a check to see if "in" is not
> zero, and, if so, error out of the loop.  After the check, "in" is not
> touched until it is added to bufs_coalesced, effectively not changing
> bufs_coalesced, meaning bufs_coalesced will never trigger the conditions
> below.

Yes. It definitely should be 'out'. 'in' should be 0 in the tx path.

I tried a simpler version of this patch without any tunables by
delaying the signaling until we come out of the for loop.
It definitely reduced the number of vmexits significantly for the small-message
guest-to-host stream test, and the throughput went up a little.

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9b3ca10..5f9fae9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -197,7 +197,7 @@ static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		vhost_add_used(vq, head, 0);
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
@@ -205,6 +205,8 @@ static void handle_tx(struct vhost_net *net)
 		}
 	}
 
+	if (total_len > 0)
+		vhost_signal(&net->dev, vq);
 	mutex_unlock(&vq->mutex);
 }
 

> 
> Or am I missing something?
> 
> > +      if (unlikely(packets_coalesced > tx_packets_coalesce ||
> > +              bytes_coalesced > tx_bytes_coalesce ||
> > +              bufs_coalesced > tx_bufs_coalesce))
> > +         vhost_add_used_and_signal(&net->dev, vq, head, 0);
> > +      else
> > +         vhost_add_used(vq, head, 0);
> >        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> >           vhost_poll_queue(&vq->poll);
> >           break;
> >        }
> >     }
> >
> > +   if (likely(packets_coalesced > tx_packets_coalesce ||
> > +         bytes_coalesced > tx_bytes_coalesce ||
> > +         bufs_coalesced > tx_bufs_coalesce))
> > +      vhost_signal(&net->dev, vq);
> >     mutex_unlock(&vq->mutex);
> >  }

It is possible that we can miss signaling the guest even after
processing a few pkts, if we don't hit any of these conditions.

> >
> 
> Steve D.
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply related

* [PATCH net-next-2.6] bnx2x, cnic: Consolidate iSCSI/FCoE shared mem logic in bnx2x
From: Michael Chan @ 2011-02-01  0:39 UTC (permalink / raw)
  To: davem; +Cc: vladz, eilong, netdev

From: Vladislav Zolotarov <vladz@broadcom.com>

Move all shared mem code to bnx2x to avoid code duplication.  bnx2x now
performs:

- Read the FCoE and iSCSI max connection information.
- Read the iSCSI and FCoE MACs from NPAR configuration in shmem.
- Block the CNIC for the current function if there is neither FCoE nor
  iSCSI valid configuration by returning NULL from bnx2x_cnic_probe().

Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/bnx2.h             |    2 +
 drivers/net/bnx2x/bnx2x.h      |    5 +-
 drivers/net/bnx2x/bnx2x_hsi.h  |   25 +++++---
 drivers/net/bnx2x/bnx2x_main.c |  112 ++++++++++++++++++++++++++++----
 drivers/net/cnic.c             |  143 +++++-----------------------------------
 drivers/net/cnic_if.h          |    8 ++-
 6 files changed, 146 insertions(+), 149 deletions(-)

diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
index 0132ea9..7a5e88f 100644
--- a/drivers/net/bnx2.h
+++ b/drivers/net/bnx2.h
@@ -6207,6 +6207,8 @@ struct l2_fhdr {
 
 #define BNX2_CP_SCRATCH					0x001a0000
 
+#define BNX2_FW_MAX_ISCSI_CONN				 0x001a0080
+
 
 /*
  *  mcp_reg definition
diff --git a/drivers/net/bnx2x/bnx2x.h b/drivers/net/bnx2x/bnx2x.h
index 04fb72b..ff87ec3 100644
--- a/drivers/net/bnx2x/bnx2x.h
+++ b/drivers/net/bnx2x/bnx2x.h
@@ -976,8 +976,12 @@ struct bnx2x {
 #define MF_FUNC_DIS			0x1000
 #define FCOE_MACS_SET			0x2000
 #define NO_FCOE_FLAG			0x4000
+#define NO_ISCSI_OOO_FLAG		0x8000
+#define NO_ISCSI_FLAG			0x10000
 
 #define NO_FCOE(bp)		((bp)->flags & NO_FCOE_FLAG)
+#define NO_ISCSI(bp)		((bp)->flags & NO_ISCSI_FLAG)
+#define NO_ISCSI_OOO(bp)	((bp)->flags & NO_ISCSI_OOO_FLAG)
 
 	int			pf_num;	/* absolute PF number */
 	int			pfid;	/* per-path PF number */
@@ -1125,7 +1129,6 @@ struct bnx2x {
 	u16			cnic_kwq_pending;
 	u16			cnic_spq_pending;
 	struct mutex		cnic_mutex;
-	u8			iscsi_mac[ETH_ALEN];
 	u8			fip_mac[ETH_ALEN];
 #endif
 
diff --git a/drivers/net/bnx2x/bnx2x_hsi.h b/drivers/net/bnx2x/bnx2x_hsi.h
index 51d69db..be503cc 100644
--- a/drivers/net/bnx2x/bnx2x_hsi.h
+++ b/drivers/net/bnx2x/bnx2x_hsi.h
@@ -11,20 +11,27 @@
 
 #include "bnx2x_fw_defs.h"
 
+#define FW_ENCODE_32BIT_PATTERN		0x1e1e1e1e
+
 struct license_key {
 	u32 reserved[6];
 
-#if defined(__BIG_ENDIAN)
-	u16 max_iscsi_init_conn;
-	u16 max_iscsi_trgt_conn;
-#elif defined(__LITTLE_ENDIAN)
-	u16 max_iscsi_trgt_conn;
-	u16 max_iscsi_init_conn;
-#endif
+	u32 max_iscsi_conn;
+#define BNX2X_MAX_ISCSI_TRGT_CONN_MASK	0xFFFF
+#define BNX2X_MAX_ISCSI_TRGT_CONN_SHIFT	0
+#define BNX2X_MAX_ISCSI_INIT_CONN_MASK	0xFFFF0000
+#define BNX2X_MAX_ISCSI_INIT_CONN_SHIFT	16
 
-	u32 reserved_a[6];
-};
+	u32 reserved_a;
+
+	u32 max_fcoe_conn;
+#define BNX2X_MAX_FCOE_TRGT_CONN_MASK	0xFFFF
+#define BNX2X_MAX_FCOE_TRGT_CONN_SHIFT	0
+#define BNX2X_MAX_FCOE_INIT_CONN_MASK	0xFFFF0000
+#define BNX2X_MAX_FCOE_INIT_CONN_SHIFT	16
 
+	u32 reserved_b[4];
+};
 
 #define PORT_0				0
 #define PORT_1				1
diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index 2215a39..ae8d20a 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -6456,12 +6456,13 @@ static int bnx2x_set_iscsi_eth_mac_addr(struct bnx2x *bp, int set)
 	u32 iscsi_l2_cl_id = BNX2X_ISCSI_ETH_CL_ID +
 		BP_E1HVN(bp) * NONE_ETH_CONTEXT_USE;
 	u32 cl_bit_vec = (1 << iscsi_l2_cl_id);
+	u8 *iscsi_mac = bp->cnic_eth_dev.iscsi_mac;
 
 	/* Send a SET_MAC ramrod */
-	bnx2x_set_mac_addr_gen(bp, set, bp->iscsi_mac, cl_bit_vec,
+	bnx2x_set_mac_addr_gen(bp, set, iscsi_mac, cl_bit_vec,
 			       cam_offset, 0);
 
-	bnx2x_set_mac_in_nig(bp, set, bp->iscsi_mac, LLH_CAM_ISCSI_ETH_LINE);
+	bnx2x_set_mac_in_nig(bp, set, iscsi_mac, LLH_CAM_ISCSI_ETH_LINE);
 
 	return 0;
 }
@@ -8385,11 +8386,47 @@ static void __devinit bnx2x_get_port_hwinfo(struct bnx2x *bp)
 							bp->common.shmem2_base);
 }
 
+#ifdef BCM_CNIC
+static void __devinit bnx2x_get_cnic_info(struct bnx2x *bp)
+{
+	u32 max_iscsi_conn = FW_ENCODE_32BIT_PATTERN ^ SHMEM_RD(bp,
+				drv_lic_key[BP_PORT(bp)].max_iscsi_conn);
+	u32 max_fcoe_conn = FW_ENCODE_32BIT_PATTERN ^ SHMEM_RD(bp,
+				drv_lic_key[BP_PORT(bp)].max_fcoe_conn);
+
+	/* Get the number of maximum allowed iSCSI and FCoE connections */
+	bp->cnic_eth_dev.max_iscsi_conn =
+		(max_iscsi_conn & BNX2X_MAX_ISCSI_INIT_CONN_MASK) >>
+		BNX2X_MAX_ISCSI_INIT_CONN_SHIFT;
+
+	bp->cnic_eth_dev.max_fcoe_conn =
+		(max_fcoe_conn & BNX2X_MAX_FCOE_INIT_CONN_MASK) >>
+		BNX2X_MAX_FCOE_INIT_CONN_SHIFT;
+
+	BNX2X_DEV_INFO("max_iscsi_conn 0x%x max_fcoe_conn 0x%x\n",
+		       bp->cnic_eth_dev.max_iscsi_conn,
+		       bp->cnic_eth_dev.max_fcoe_conn);
+
+	/* If the maximum allowed number of connections is zero,
+	 * disable the corresponding feature.
+	 */
+	if (!bp->cnic_eth_dev.max_iscsi_conn)
+		bp->flags |= NO_ISCSI_OOO_FLAG | NO_ISCSI_FLAG;
+
+	if (!bp->cnic_eth_dev.max_fcoe_conn)
+		bp->flags |= NO_FCOE_FLAG;
+}
+#endif
+
 static void __devinit bnx2x_get_mac_hwinfo(struct bnx2x *bp)
 {
 	u32 val, val2;
 	int func = BP_ABS_FUNC(bp);
 	int port = BP_PORT(bp);
+#ifdef BCM_CNIC
+	u8 *iscsi_mac = bp->cnic_eth_dev.iscsi_mac;
+	u8 *fip_mac = bp->fip_mac;
+#endif
 
 	if (BP_NOMCP(bp)) {
 		BNX2X_ERROR("warning: random MAC workaround active\n");
@@ -8402,7 +8439,9 @@ static void __devinit bnx2x_get_mac_hwinfo(struct bnx2x *bp)
 			bnx2x_set_mac_buf(bp->dev->dev_addr, val, val2);
 
 #ifdef BCM_CNIC
-		/* iSCSI NPAR MAC */
+		/* iSCSI and FCoE NPAR MACs: if either the iSCSI or the
+		 * FCoE MAC is missing, that feature should be disabled.
+		 */
 		if (IS_MF_SI(bp)) {
 			u32 cfg = MF_CFG_RD(bp, func_ext_config[func].func_cfg);
 			if (cfg & MACP_FUNC_CFG_FLAGS_ISCSI_OFFLOAD) {
@@ -8410,8 +8449,39 @@ static void __devinit bnx2x_get_mac_hwinfo(struct bnx2x *bp)
 						     iscsi_mac_addr_upper);
 				val = MF_CFG_RD(bp, func_ext_config[func].
 						    iscsi_mac_addr_lower);
-				bnx2x_set_mac_buf(bp->iscsi_mac, val, val2);
-			}
+				BNX2X_DEV_INFO("Read iSCSI MAC: "
+					       "0x%x:0x%04x\n", val2, val);
+				bnx2x_set_mac_buf(iscsi_mac, val, val2);
+
+				/* Disable iSCSI OOO if MAC configuration is
+				 * invalid.
+				 */
+				if (!is_valid_ether_addr(iscsi_mac)) {
+					bp->flags |= NO_ISCSI_OOO_FLAG |
+						     NO_ISCSI_FLAG;
+					memset(iscsi_mac, 0, ETH_ALEN);
+				}
+			} else
+				bp->flags |= NO_ISCSI_OOO_FLAG | NO_ISCSI_FLAG;
+
+			if (cfg & MACP_FUNC_CFG_FLAGS_FCOE_OFFLOAD) {
+				val2 = MF_CFG_RD(bp, func_ext_config[func].
+						     fcoe_mac_addr_upper);
+				val = MF_CFG_RD(bp, func_ext_config[func].
+						    fcoe_mac_addr_lower);
+				BNX2X_DEV_INFO("Read FCoE MAC to "
+					       "0x%x:0x%04x\n", val2, val);
+				bnx2x_set_mac_buf(fip_mac, val, val2);
+
+				/* Disable FCoE if MAC configuration is
+				 * invalid.
+				 */
+				if (!is_valid_ether_addr(fip_mac)) {
+					bp->flags |= NO_FCOE_FLAG;
+					memset(bp->fip_mac, 0, ETH_ALEN);
+				}
+			} else
+				bp->flags |= NO_FCOE_FLAG;
 		}
 #endif
 	} else {
@@ -8425,7 +8495,7 @@ static void __devinit bnx2x_get_mac_hwinfo(struct bnx2x *bp)
 				    iscsi_mac_upper);
 		val = SHMEM_RD(bp, dev_info.port_hw_config[port].
 				   iscsi_mac_lower);
-		bnx2x_set_mac_buf(bp->iscsi_mac, val, val2);
+		bnx2x_set_mac_buf(iscsi_mac, val, val2);
 #endif
 	}
 
@@ -8433,14 +8503,12 @@ static void __devinit bnx2x_get_mac_hwinfo(struct bnx2x *bp)
 	memcpy(bp->dev->perm_addr, bp->dev->dev_addr, ETH_ALEN);
 
 #ifdef BCM_CNIC
-	/* Inform the upper layers about FCoE MAC */
+	/* Set the FCoE MAC in modes other than MF_SI */
 	if (!CHIP_IS_E1x(bp)) {
 		if (IS_MF_SD(bp))
-			memcpy(bp->fip_mac, bp->dev->dev_addr,
-			       sizeof(bp->fip_mac));
-		else
-			memcpy(bp->fip_mac, bp->iscsi_mac,
-			       sizeof(bp->fip_mac));
+			memcpy(fip_mac, bp->dev->dev_addr, ETH_ALEN);
+		else if (!IS_MF(bp))
+			memcpy(fip_mac, iscsi_mac, ETH_ALEN);
 	}
 #endif
 }
@@ -8603,6 +8671,10 @@ static int __devinit bnx2x_get_hwinfo(struct bnx2x *bp)
 	/* Get MAC addresses */
 	bnx2x_get_mac_hwinfo(bp);
 
+#ifdef BCM_CNIC
+	bnx2x_get_cnic_info(bp);
+#endif
+
 	return rc;
 }
 
@@ -10077,6 +10149,13 @@ struct cnic_eth_dev *bnx2x_cnic_probe(struct net_device *dev)
 	struct bnx2x *bp = netdev_priv(dev);
 	struct cnic_eth_dev *cp = &bp->cnic_eth_dev;
 
+	/* If both iSCSI and FCoE are disabled - return NULL in
+	 * order to indicate CNIC that it should not try to work
+	 * with this device.
+	 */
+	if (NO_ISCSI(bp) && NO_FCOE(bp))
+		return NULL;
+
 	cp->drv_owner = THIS_MODULE;
 	cp->chip_id = CHIP_ID(bp);
 	cp->pdev = bp->pdev;
@@ -10097,6 +10176,15 @@ struct cnic_eth_dev *bnx2x_cnic_probe(struct net_device *dev)
 		BP_E1HVN(bp) * NONE_ETH_CONTEXT_USE;
 	cp->iscsi_l2_cid = BNX2X_ISCSI_ETH_CID;
 
+	if (NO_ISCSI_OOO(bp))
+		cp->drv_state |= CNIC_DRV_STATE_NO_ISCSI_OOO;
+
+	if (NO_ISCSI(bp))
+		cp->drv_state |= CNIC_DRV_STATE_NO_ISCSI;
+
+	if (NO_FCOE(bp))
+		cp->drv_state |= CNIC_DRV_STATE_NO_FCOE;
+
 	DP(BNX2X_MSG_SP, "page_size %d, tbl_offset %d, tbl_lines %d, "
 			 "starting cid %d\n",
 	   cp->ctx_blk_size,
diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c
index c820496..2d2d28f 100644
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -4179,6 +4179,14 @@ static void cnic_enable_bnx2_int(struct cnic_dev *dev)
 		BNX2_PCICFG_INT_ACK_CMD_INDEX_VALID | cp->last_status_idx);
 }
 
+static void cnic_get_bnx2_iscsi_info(struct cnic_dev *dev)
+{
+	u32 max_conn;
+
+	max_conn = cnic_reg_rd_ind(dev, BNX2_FW_MAX_ISCSI_CONN);
+	dev->max_iscsi_conn = max_conn;
+}
+
 static void cnic_disable_bnx2_int_sync(struct cnic_dev *dev)
 {
 	struct cnic_local *cp = dev->cnic_priv;
@@ -4503,6 +4511,8 @@ static int cnic_start_bnx2_hw(struct cnic_dev *dev)
 		return err;
 	}
 
+	cnic_get_bnx2_iscsi_info(dev);
+
 	return 0;
 }
 
@@ -4714,129 +4724,6 @@ static void cnic_init_bnx2x_rx_ring(struct cnic_dev *dev,
 	cp->rx_cons = *cp->rx_cons_ptr;
 }
 
-static int cnic_read_bnx2x_iscsi_mac(struct cnic_dev *dev, u32 upper_addr,
-				     u32 lower_addr)
-{
-	u32 val;
-	u8 mac[6];
-
-	val = CNIC_RD(dev, upper_addr);
-
-	mac[0] = (u8) (val >> 8);
-	mac[1] = (u8) val;
-
-	val = CNIC_RD(dev, lower_addr);
-
-	mac[2] = (u8) (val >> 24);
-	mac[3] = (u8) (val >> 16);
-	mac[4] = (u8) (val >> 8);
-	mac[5] = (u8) val;
-
-	if (is_valid_ether_addr(mac)) {
-		memcpy(dev->mac_addr, mac, 6);
-		return 0;
-	} else {
-		return -EINVAL;
-	}
-}
-
-static void cnic_get_bnx2x_iscsi_info(struct cnic_dev *dev)
-{
-	struct cnic_local *cp = dev->cnic_priv;
-	u32 base, base2, addr, addr1, val;
-	int port = CNIC_PORT(cp);
-
-	dev->max_iscsi_conn = 0;
-	base = CNIC_RD(dev, MISC_REG_SHARED_MEM_ADDR);
-	if (base == 0)
-		return;
-
-	base2 = CNIC_RD(dev, (CNIC_PATH(cp) ? MISC_REG_GENERIC_CR_1 :
-					      MISC_REG_GENERIC_CR_0));
-	addr = BNX2X_SHMEM_ADDR(base,
-		dev_info.port_hw_config[port].iscsi_mac_upper);
-
-	addr1 = BNX2X_SHMEM_ADDR(base,
-		dev_info.port_hw_config[port].iscsi_mac_lower);
-
-	cnic_read_bnx2x_iscsi_mac(dev, addr, addr1);
-
-	addr = BNX2X_SHMEM_ADDR(base, validity_map[port]);
-	val = CNIC_RD(dev, addr);
-
-	if (!(val & SHR_MEM_VALIDITY_LIC_NO_KEY_IN_EFFECT)) {
-		u16 val16;
-
-		addr = BNX2X_SHMEM_ADDR(base,
-				drv_lic_key[port].max_iscsi_init_conn);
-		val16 = CNIC_RD16(dev, addr);
-
-		if (val16)
-			val16 ^= 0x1e1e;
-		dev->max_iscsi_conn = val16;
-	}
-
-	if (BNX2X_CHIP_IS_E2(cp->chip_id))
-		dev->max_fcoe_conn = BNX2X_FCOE_NUM_CONNECTIONS;
-
-	if (BNX2X_CHIP_IS_E1H(cp->chip_id) || BNX2X_CHIP_IS_E2(cp->chip_id)) {
-		int func = CNIC_FUNC(cp);
-		u32 mf_cfg_addr;
-
-		if (BNX2X_SHMEM2_HAS(base2, mf_cfg_addr))
-			mf_cfg_addr = CNIC_RD(dev, BNX2X_SHMEM2_ADDR(base2,
-					      mf_cfg_addr));
-		else
-			mf_cfg_addr = base + BNX2X_SHMEM_MF_BLK_OFFSET;
-
-		if (BNX2X_CHIP_IS_E2(cp->chip_id)) {
-			/* Must determine if the MF is SD vs SI mode */
-			addr = BNX2X_SHMEM_ADDR(base,
-					dev_info.shared_feature_config.config);
-			val = CNIC_RD(dev, addr);
-			if ((val & SHARED_FEAT_CFG_FORCE_SF_MODE_MASK) ==
-			    SHARED_FEAT_CFG_FORCE_SF_MODE_SWITCH_INDEPT) {
-				int rc;
-
-				/* MULTI_FUNCTION_SI mode */
-				addr = BNX2X_MF_CFG_ADDR(mf_cfg_addr,
-					func_ext_config[func].func_cfg);
-				val = CNIC_RD(dev, addr);
-				if (!(val & MACP_FUNC_CFG_FLAGS_ISCSI_OFFLOAD))
-					dev->max_iscsi_conn = 0;
-
-				if (!(val & MACP_FUNC_CFG_FLAGS_FCOE_OFFLOAD))
-					dev->max_fcoe_conn = 0;
-
-				addr = BNX2X_MF_CFG_ADDR(mf_cfg_addr,
-					func_ext_config[func].
-					iscsi_mac_addr_upper);
-				addr1 = BNX2X_MF_CFG_ADDR(mf_cfg_addr,
-					func_ext_config[func].
-					iscsi_mac_addr_lower);
-				rc = cnic_read_bnx2x_iscsi_mac(dev, addr,
-								addr1);
-				if (rc && func > 1)
-					dev->max_iscsi_conn = 0;
-
-				return;
-			}
-		}
-
-		addr = BNX2X_MF_CFG_ADDR(mf_cfg_addr,
-			func_mf_config[func].e1hov_tag);
-
-		val = CNIC_RD(dev, addr);
-		val &= FUNC_MF_CFG_E1HOV_TAG_MASK;
-		if (val != FUNC_MF_CFG_E1HOV_TAG_DEFAULT) {
-			dev->max_fcoe_conn = 0;
-			dev->max_iscsi_conn = 0;
-		}
-	}
-	if (!is_valid_ether_addr(dev->mac_addr))
-		dev->max_iscsi_conn = 0;
-}
-
 static void cnic_init_bnx2x_kcq(struct cnic_dev *dev)
 {
 	struct cnic_local *cp = dev->cnic_priv;
@@ -4918,8 +4805,6 @@ static int cnic_start_bnx2x_hw(struct cnic_dev *dev)
 
 	cnic_init_bnx2x_kcq(dev);
 
-	cnic_get_bnx2x_iscsi_info(dev);
-
 	/* Only 1 EQ */
 	CNIC_WR16(dev, cp->kcq1.io_addr, MAX_KCQ_IDX);
 	CNIC_WR(dev, BAR_CSTRORM_INTMEM +
@@ -5352,6 +5237,14 @@ static struct cnic_dev *init_bnx2x_cnic(struct net_device *dev)
 	cdev->pcidev = pdev;
 	cp->chip_id = ethdev->chip_id;
 
+	if (!(ethdev->drv_state & CNIC_DRV_STATE_NO_ISCSI))
+		cdev->max_iscsi_conn = ethdev->max_iscsi_conn;
+	if (BNX2X_CHIP_IS_E2(cp->chip_id) &&
+	    !(ethdev->drv_state & CNIC_DRV_STATE_NO_FCOE))
+		cdev->max_fcoe_conn = ethdev->max_fcoe_conn;
+
+	memcpy(cdev->mac_addr, ethdev->iscsi_mac, 6);
+
 	cp->cnic_ops = &cnic_bnx2x_ops;
 	cp->start_hw = cnic_start_bnx2x_hw;
 	cp->stop_hw = cnic_stop_bnx2x_hw;
diff --git a/drivers/net/cnic_if.h b/drivers/net/cnic_if.h
index 9f44e0f..e01b49e 100644
--- a/drivers/net/cnic_if.h
+++ b/drivers/net/cnic_if.h
@@ -12,8 +12,8 @@
 #ifndef CNIC_IF_H
 #define CNIC_IF_H
 
-#define CNIC_MODULE_VERSION	"2.2.12"
-#define CNIC_MODULE_RELDATE	"Jan 03, 2011"
+#define CNIC_MODULE_VERSION	"2.2.13"
+#define CNIC_MODULE_RELDATE	"Jan 31, 2011"
 
 #define CNIC_ULP_RDMA		0
 #define CNIC_ULP_ISCSI		1
@@ -159,6 +159,9 @@ struct cnic_eth_dev {
 	u32		drv_state;
 #define CNIC_DRV_STATE_REGD		0x00000001
 #define CNIC_DRV_STATE_USING_MSIX	0x00000002
+#define CNIC_DRV_STATE_NO_ISCSI_OOO	0x00000004
+#define CNIC_DRV_STATE_NO_ISCSI		0x00000008
+#define CNIC_DRV_STATE_NO_FCOE		0x00000010
 	u32		chip_id;
 	u32		max_kwqe_pending;
 	struct pci_dev	*pdev;
@@ -176,6 +179,7 @@ struct cnic_eth_dev {
 	u32		fcoe_init_cid;
 	u16		iscsi_l2_client_id;
 	u16		iscsi_l2_cid;
+	u8		iscsi_mac[ETH_ALEN];
 
 	int		num_irq;
 	struct cnic_irq	irq_arr[MAX_CNIC_VEC];
-- 
1.6.4.GIT



^ permalink raw reply related

* Re: [PATCH] Make INET_LHTABLE_SIZE a compile-time tunable
From: Bill Sommerfeld @ 2011-02-01  2:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, therbert
In-Reply-To: <20110131.140503.179938794.davem@davemloft.net>

On Mon, Jan 31, 2011 at 14:05, David Miller <davem@davemloft.net> wrote:
> From: Bill Sommerfeld <wsommerfeld@google.com>
>> I haven't seen any responses to this patch submission.  Can someone
>> take a look?  Thanks.
> It should be dynamically sized.  Compile time configuration knobs
> generally stick.

I'll work on that.  Consider this patch withdrawn until then.

^ permalink raw reply

* Re: [PATCH v4] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Po-Yu Chuang @ 2011-02-01  3:56 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: netdev, linux-kernel, bhutchings, eric.dumazet, joe, dilinger,
	Po-Yu Chuang
In-Reply-To: <AANLkTimz7gx7BAQG8RBswDCxnTuVtt3u+YoBZk8+RgEz@mail.gmail.com>

Dear Michał

2011/1/25 Po-Yu Chuang <ratbert.chuang@gmail.com>:
> 2011/1/25 Michał Mirosław <mirqus@gmail.com>:
>>
>> Ah, I blindly assumed that you're just appending the buffers to the
>> skb (using skb_fill_page_desc() and friends). Since you have to mark
>> descriptors for the device anyway, it might be faster to allocate new
>> skbs and map those as rx buffers (changing the descriptor's buffer
>> address after every RX) instead of keeping static buffer and copying
>> every time. (For small packets it wastes lot of memory, though - so
>> the right choice depends on the expected workload.)
>
> The reason I chose to use memcpy rx buffer to skb is that I didn't know
> how to deal with multi-segment packets if I preallocated skb for each
> rx descriptor. This function seems to be what I need.
> Let me rework this. Thanks.

After looking at some drivers using skb_fill_page_desc(), I am still
confused.

It seems that this function is mainly for jumbo packets, and the
beginning of the received packet's data should still be in skb->data.

Scheme 1)
If I simply allocate a page for each rx ring entry, I still need to allocate
an skb and copy at least packet header in first page to skb->data. Then
add the page of rest of payload to skb by skb_fill_page_desc().

Scheme 2)
If I simply allocate an skb for each rx ring entry, zero copy can be easily
achieved if the packet contains only one segment (one rx ring entry).
However, if the received packet has more than one segment, I guess I
might need to allocate a bigger skb and copy data of all segments
(skb->data) to the new skb?

The problem is, in most cases, the received packets are one-segment only.
(TBH, I don't know when multi-segment packets will appear.)

If all packets are one-segment, I can just allocate an skb for each rx ring
entry and achieve zero-copy. However, since there might be multi-segment
packets, I need to deal with them.

What should I do?

best regards,
Po-Yu Chuang

^ permalink raw reply

* Re: [GIT PULL nf-next-2.6] IPVS build fixes and clean-ups
From: Simon Horman @ 2011-02-01  4:06 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: netdev, linux-next, linux-kernel, lvs-devel, Stephen Rothwell,
	Hans Schillstrom, Patrick McHardy
In-Reply-To: <20110131165009.66999952.randy.dunlap@oracle.com>

On Mon, Jan 31, 2011 at 04:50:09PM -0800, Randy Dunlap wrote:
> On Tue,  1 Feb 2011 11:14:11 +1100 Simon Horman wrote:
> 
> > Hi,
> > 
> > This short patch series addresses two linux-next build problems
> > raised by Randy Dunlap:
> > 
> > * net/netfilter/ipvs/ip_vs_core.c:1891: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'unsigned int'
> > * ERROR: "unregister_net_sysctl_table" [net/netfilter/ipvs/ip_vs.ko]
> >   ERROR: "register_net_sysctl_table" [net/netfilter/ipvs/ip_vs.ko] undefined!
> > 
> > The remainder of the changset is cleanups that I noticed along the way.
> 
> These 4 patches build successfully for me.
> However, I do see these warnings (sorry I missed them earlier):
> 
> WARNING: net/netfilter/ipvs/ip_vs.o(.init.text+0x161): Section mismatch in reference from the function init_module() to the function .exit.text:ip_vs_sync_cleanup()
> WARNING: net/netfilter/ipvs/ip_vs.o(.init.text+0x161): Section mismatch in reference from the function init_module() to the function .exit.text:ip_vs_sync_cleanup()

Thanks, I'll look into that. I will be travelling for a good portion of the
next day and a bit so I apologise in advance if that delays my next patch.

^ permalink raw reply

* Re: [PATCH] IPv6 configurable default value for the privacy extension flag
From: YOSHIFUJI Hideaki @ 2011-02-01  4:04 UTC (permalink / raw)
  To: j.aube; +Cc: Julien Aubé, netdev, YOSHIFUJI Hideaki
In-Reply-To: <20110131164945.3370cbcd@baileys.at.home>

Hello.

(2011/02/01 0:49), Julien Aubé wrote:

>+	} else {
>+		printk(KERN_WARNING "IPv6 Privacy Extension "
>+	"is disabled by default (invalid value %d)\n", ipv6_default_privacy);
>+	}

I think WARNING is too high.

--yoshfuji

^ permalink raw reply

* RE: [PATCH net-next-2.6 v5 1/1] can: c_can: Added support for Bosch C_CAN controller
From: Bhupesh SHARMA @ 2011-02-01  4:29 UTC (permalink / raw)
  To: Wolfgang Grandegger
  Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Marc Kleine-Budde,
	David Miller
In-Reply-To: <4D3E8118.10904-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>

Hello Wolfgang,

> ...
> >>> +		/* handle error on the bus */
> >>> +		lec_type = c_can_has_and_handle_berr(priv);
> >>> +		if (lec_type && (error_type != C_CAN_NO_ERROR))
> >>> +			work_done += c_can_err(dev, error_type, lec_type);
> >>
> >> State changes are only reported if berr_reporting is enabled and a
> bus
> >> error occured. This needs to be fixed.
> >
> > As I mentioned earlier in a response to a review comment, the Bus
> Error
> > reporting for C_CAN seems different from sja1000 and flexcan
> approaches.
> > Do you think it will be useful to drop CAN_CTRLMODE_BERR_REPORTING
> from
> > priv->can.ctrlmode_supported as done by *pch* driver? Or do you have
> > a better idea..
> 
> You bus error reporting is OK. The problem is that it does not only
> affect bus errors but also state changes. State change messages should
> alway be send independent of priv->can.ctrlmode. It's just a matter of
> moving code to the right location. E.g. the code snippet above inside
> c_can_err() before you check for bus errors.
> 
> >> Feel free to send the output of "candump any,0:0,#FFFFFFFF" when
> >> sending
> >> messages without cable connected and with a bus error provocuted.
> >
> > OK. I will try to cross-compile candump for my arm-v7 architecture
> > and will send the output.
> 

I did some changes to the code to ensure that the state change and lec
handling are handled separately and properly.
Please find the candump any,0:0,#FFFFFFFF output below:

1. With No-Cable connected, I keep getting:
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
  can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME

2. With Tx and Rx shorted to simulate bus-error, I get:
  can0  20000044  [8] 00 20 00 00 00 00 00 00   ERRORFRAME

In case 2, when I enable debug messages I get the correct state change sequence:
entered error warning state
entered error passive state
entered bus off state

Does this result seem fine to you?

Regards,
Bhupesh

^ permalink raw reply

* Re: [PATCH v4] net: add Faraday FTMAC100 10/100 Ethernet driver
From: David Miller @ 2011-02-01  4:35 UTC (permalink / raw)
  To: ratbert.chuang
  Cc: mirqus, netdev, linux-kernel, bhutchings, eric.dumazet, joe,
	dilinger, ratbert
In-Reply-To: <AANLkTikgDsM15DO7u2SP15xx=Y_pg6JdCLrie8ACobJP@mail.gmail.com>

From: Po-Yu Chuang <ratbert.chuang@gmail.com>
Date: Tue, 1 Feb 2011 11:56:16 +0800

> If I simply allocate a page for each rx ring entry, I still need to allocate
> an skb and copy at least packet header in first page to skb->data. Then
> add the page of rest of payload to skb by skb_fill_page_desc().

You should attach the pages, then use __pskb_pull_tail() to bring in the
headers to the linear skb->data area.

See drivers/net/niu.c:niu_process_rx_pkt().

^ permalink raw reply

* Re: [net-2.6 PATCH 1/3] net: dcb: match dcb_app protocol field with 802.1Qaz spec
From: David Miller @ 2011-02-01  4:41 UTC (permalink / raw)
  To: john.r.fastabend; +Cc: netdev
In-Reply-To: <20110131220048.29758.22379.stgit@jf-dev1-dcblab>

From: John Fastabend <john.r.fastabend@intel.com>
Date: Mon, 31 Jan 2011 14:00:49 -0800

> The dcb_app protocol field is a __u32 however the 802.1Qaz
> specification defines it as a 16 bit field. This patch brings
> the structure inline with the spec making it a __u16.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
 ...
> @@ -101,7 +101,7 @@ struct ieee_pfc {
>   */
>  struct dcb_app {
>  	__u8	selector;
> -	__u32	protocol;
> +	__u16	protocol;
>  	__u8	priority;
>  };
>  

If we're going to do this, please fix this wasteful structure
layout.  Put the "protocol" either first, or last, so that the
structure size is 4 bytes, rather than something like 8.

^ permalink raw reply

* Re: [net-2.6 PATCH 2/3] net: dcb: use _safe() version of list iterators
From: David Miller @ 2011-02-01  4:42 UTC (permalink / raw)
  To: john.r.fastabend; +Cc: netdev
In-Reply-To: <20110131220054.29758.20521.stgit@jf-dev1-dcblab>

From: John Fastabend <john.r.fastabend@intel.com>
Date: Mon, 31 Jan 2011 14:00:54 -0800

> Use _safe() version of list iterator macros in dcb_setapp().
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

Why?  It's unnecessary overhead, since we always branch to "out", and
therefore out of the list traversal loop, any time we list_del()
something on the list.

^ permalink raw reply

* Re: [net-2.6 PATCH 3/3] net: dcb: application priority is per net_device
From: David Miller @ 2011-02-01  4:43 UTC (permalink / raw)
  To: john.r.fastabend; +Cc: netdev
In-Reply-To: <20110131220059.29758.17857.stgit@jf-dev1-dcblab>

From: John Fastabend <john.r.fastabend@intel.com>
Date: Mon, 31 Jan 2011 14:00:59 -0800

> The app_data priority may not be the same for all net devices.
> In order for stacks with application notifiers to identify the
> specific net device dcb_app_type should be passed in the ptr.
> 
> This allows handlers to use dev_get_by_name() to pin priority
> to net devices.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

Given your track record with the previous two patches (ie. they
need work or are nonsense) I want someone who understands this
dcbnl stuff to review this third patch before I'm willing to
apply it.

^ permalink raw reply

* Re: [PATCH net-next-2.6] bnx2x, cnic: Consolidate iSCSI/FCoE shared mem logic in bnx2x
From: David Miller @ 2011-02-01  4:44 UTC (permalink / raw)
  To: mchan; +Cc: vladz, eilong, netdev
In-Reply-To: <1296520757-5387-1-git-send-email-mchan@broadcom.com>

From: "Michael Chan" <mchan@broadcom.com>
Date: Mon, 31 Jan 2011 16:39:17 -0800

> From: Vladislav Zolotarov <vladz@broadcom.com>
> 
> Move all shared mem code to bnx2x to avoid code duplication.  bnx2x now
> performs:
> 
> - Read the FCoE and iSCSI max connection information.
> - Read the iSCSI and FCoE MACs from NPAR configuration in shmem.
> - Block the CNIC for the current function if there is neither FCoE nor
>   iSCSI valid configuration by returning NULL from bnx2x_cnic_probe().
> 
> Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
> Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
> Signed-off-by: Michael Chan <mchan@broadcom.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH kernel 2.6.38rc2-git7] axnet_cs: reduce delay time at ei_rx_overrun
From: David Miller @ 2011-02-01  4:53 UTC (permalink / raw)
  To: ken_kawasaki; +Cc: netdev
In-Reply-To: <20110131061616.05b2fa6f.ken_kawasaki@spring.nifty.jp>

From: Ken Kawasaki <ken_kawasaki@spring.nifty.jp>
Date: Mon, 31 Jan 2011 06:16:16 +0900

> 
> axnet_cs:
>     mdelay of 10ms is too long at ei_rx_overrun.
>     It should be reduced to 2ms.
> 
> Signed-off-by: Ken Kawasaki <ken_kawasaki@spring.nifty.jp>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH] net: Fix ipv6 neighbour unregister_sysctl_table warning
From: David Miller @ 2011-02-01  4:55 UTC (permalink / raw)
  To: ebiederm; +Cc: netdev, yoshfuji
In-Reply-To: <m11v3utagp.fsf@fess.ebiederm.org>

From: ebiederm@xmission.com (Eric W. Biederman)
Date: Sun, 30 Jan 2011 12:15:02 -0800

> 
> In my testing of 2.6.37 I was occassionally getting a warning about
> sysctl table entries being unregistered in the wrong order.  Digging
> in it turns out this dates back to the last great sysctl reorg done
> where Al Viro introduced the requirement that sysctl directories
> needed to be created before and destroyed after the files in them.
> 
> It turns out that in that great reorg /proc/sys/net/ipv6/neigh was
> overlooked.  So this patch fixes that oversight and makes an annoying
> warning message go away.
> 
>>------------[ cut here ]------------
>>WARNING: at kernel/sysctl.c:1992 unregister_sysctl_table+0x134/0x164()
>>Pid: 23951, comm: kworker/u:3 Not tainted 2.6.37-350888.2010AroraKernelBeta.fc14.x86_64 #1
>>Call Trace:
 ...
> 
> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

Applied and queued for -stable, thanks a lot Eric.

^ permalink raw reply

* Re: [PATCH] vxge: Fix wrong boolean operator
From: David Miller @ 2011-02-01  4:55 UTC (permalink / raw)
  To: weil
  Cc: jon.mason, ramkrishna.vepa, sivakumar.subramani,
	sreenivasa.honnur, netdev, linux-kernel
In-Reply-To: <1296253817-7668-1-git-send-email-weil@mail.berlios.de>

From: Stefan Weil <weil@mail.berlios.de>
Date: Fri, 28 Jan 2011 23:30:17 +0100

> This error is reported by cppcheck:
> drivers/net/vxge/vxge-config.c:3693: warning: Mutual exclusion over || always evaluates to true. Did you intend to use && instead?
> 
> It looks like cppcheck is correct, so fix this. No test was run.
> 
> Cc: Ramkrishna Vepa <ramkrishna.vepa@exar.com>
> Cc: Sivakumar Subramani <sivakumar.subramani@exar.com>
> Cc: Sreenivasa Honnur <sreenivasa.honnur@exar.com>
> Cc: Jon Mason <jon.mason@exar.com>
> Cc: netdev@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Stefan Weil <weil@mail.berlios.de>

Can the VXGE folks please review this patch?

^ permalink raw reply

* Re: [PATCH] enc28j60: Fix reading of transmit status vector
From: David Miller @ 2011-02-01  4:57 UTC (permalink / raw)
  To: weil; +Cc: eric.dumazet, tj, jpirko, netdev, linux-kernel
In-Reply-To: <1296253519-7580-1-git-send-email-weil@mail.berlios.de>

From: Stefan Weil <weil@mail.berlios.de>
Date: Fri, 28 Jan 2011 23:25:19 +0100

> This error was reported by cppcheck:
> drivers/net/enc28j60.c:815: error: Using sizeof for array given as function argument returns the size of pointer.
> 
> The original code reads 4 or 8 bytes instead of TSV_SIZE (= 100) bytes.
> I just fixed the code, but did not run any tests.
> 
> Signed-off-by: Stefan Weil <weil@mail.berlios.de>

Applied, thanks.

^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-01  5:54 UTC (permalink / raw)
  To: Steve Dobbelstein; +Cc: David Miller, kvm, mashirle, netdev
In-Reply-To: <OF05820FED.7465B77B-ON8625782A.0001B247-8625782A.00023FA7@us.ibm.com>

On Mon, Jan 31, 2011 at 06:24:34PM -0600, Steve Dobbelstein wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 01/28/2011 06:16:16 AM:
> 
> > OK, so thinking about it more, maybe the issue is this:
> > tx becomes full. We process one request and interrupt the guest,
> > then it adds one request and the queue is full again.
> >
> > Maybe the following will help it stabilize?
> > By itself it does nothing, but if you set
> > all the parameters to a huge value we will
> > only interrupt when we see an empty ring.
> > Which might be too much: pls try other values
> > in the middle: e.g. make bufs half the ring,
> > or bytes some small value, or packets some
> > small value etc.
> >
> > Warning: completely untested.
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index aac05bc..6769cdc 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -32,6 +32,13 @@
> >   * Using this limit prevents one virtqueue from starving others. */
> >  #define VHOST_NET_WEIGHT 0x80000
> >
> > +int tx_bytes_coalesce = 0;
> > +module_param(tx_bytes_coalesce, int, 0644);
> > +int tx_bufs_coalesce = 0;
> > +module_param(tx_bufs_coalesce, int, 0644);
> > +int tx_packets_coalesce = 0;
> > +module_param(tx_packets_coalesce, int, 0644);
> > +
> >  enum {
> >     VHOST_NET_VQ_RX = 0,
> >     VHOST_NET_VQ_TX = 1,
> > @@ -127,6 +134,9 @@ static void handle_tx(struct vhost_net *net)
> >     int err, wmem;
> >     size_t hdr_size;
> >     struct socket *sock;
> > +   int bytes_coalesced = 0;
> > +   int bufs_coalesced = 0;
> > +   int packets_coalesced = 0;
> >
> >     /* TODO: check that we are running from vhost_worker? */
> >     sock = rcu_dereference_check(vq->private_data, 1);
> > @@ -196,14 +206,26 @@ static void handle_tx(struct vhost_net *net)
> >        if (err != len)
> >           pr_debug("Truncated TX packet: "
> >               " len %d != %zd\n", err, len);
> > -      vhost_add_used_and_signal(&net->dev, vq, head, 0);
> >        total_len += len;
> > +      packets_coalesced += 1;
> > +      bytes_coalesced += len;
> > +      bufs_coalesced += in;
> 
> Should this instead be:
>       bufs_coalesced += out;

Correct.

> Perusing the code I see that earlier there is a check to see if "in" is not
> zero, and, if so, error out of the loop.  After the check, "in" is not
> touched until it is added to bufs_coalesced, effectively not changing
> bufs_coalesced, meaning bufs_coalesced will never trigger the conditions
> below.
> 
> Or am I missing something?
> 
> > +      if (unlikely(packets_coalesced > tx_packets_coalesce ||
> > +              bytes_coalesced > tx_bytes_coalesce ||
> > +              bufs_coalesced > tx_bufs_coalesce))
> > +         vhost_add_used_and_signal(&net->dev, vq, head, 0);
> > +      else
> > +         vhost_add_used(vq, head, 0);
> >        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> >           vhost_poll_queue(&vq->poll);
> >           break;
> >        }
> >     }
> >
> > +   if (likely(packets_coalesced > tx_packets_coalesce ||
> > +         bytes_coalesced > tx_bytes_coalesce ||
> > +         bufs_coalesced > tx_bufs_coalesce))
> > +      vhost_signal(&net->dev, vq);
> >     mutex_unlock(&vq->mutex);
> >  }
> >
> 
> Steve D.

^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-01  5:56 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Steve Dobbelstein, David Miller, kvm, mashirle, netdev
In-Reply-To: <1296523838.30191.39.camel@sridhar.beaverton.ibm.com>

On Mon, Jan 31, 2011 at 05:30:38PM -0800, Sridhar Samudrala wrote:
> On Mon, 2011-01-31 at 18:24 -0600, Steve Dobbelstein wrote:
> > "Michael S. Tsirkin" <mst@redhat.com> wrote on 01/28/2011 06:16:16 AM:
> > 
> > > OK, so thinking about it more, maybe the issue is this:
> > > tx becomes full. We process one request and interrupt the guest,
> > > then it adds one request and the queue is full again.
> > >
> > > Maybe the following will help it stabilize?
> > > By itself it does nothing, but if you set
> > > all the parameters to a huge value we will
> > > only interrupt when we see an empty ring.
> > > Which might be too much: pls try other values
> > > in the middle: e.g. make bufs half the ring,
> > > or bytes some small value, or packets some
> > > small value etc.
> > >
> > > Warning: completely untested.
> > >
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index aac05bc..6769cdc 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -32,6 +32,13 @@
> > >   * Using this limit prevents one virtqueue from starving others. */
> > >  #define VHOST_NET_WEIGHT 0x80000
> > >
> > > +int tx_bytes_coalesce = 0;
> > > +module_param(tx_bytes_coalesce, int, 0644);
> > > +int tx_bufs_coalesce = 0;
> > > +module_param(tx_bufs_coalesce, int, 0644);
> > > +int tx_packets_coalesce = 0;
> > > +module_param(tx_packets_coalesce, int, 0644);
> > > +
> > >  enum {
> > >     VHOST_NET_VQ_RX = 0,
> > >     VHOST_NET_VQ_TX = 1,
> > > @@ -127,6 +134,9 @@ static void handle_tx(struct vhost_net *net)
> > >     int err, wmem;
> > >     size_t hdr_size;
> > >     struct socket *sock;
> > > +   int bytes_coalesced = 0;
> > > +   int bufs_coalesced = 0;
> > > +   int packets_coalesced = 0;
> > >
> > >     /* TODO: check that we are running from vhost_worker? */
> > >     sock = rcu_dereference_check(vq->private_data, 1);
> > > @@ -196,14 +206,26 @@ static void handle_tx(struct vhost_net *net)
> > >        if (err != len)
> > >           pr_debug("Truncated TX packet: "
> > >               " len %d != %zd\n", err, len);
> > > -      vhost_add_used_and_signal(&net->dev, vq, head, 0);
> > >        total_len += len;
> > > +      packets_coalesced += 1;
> > > +      bytes_coalesced += len;
> > > +      bufs_coalesced += in;
> > 
> > Should this instead be:
> >       bufs_coalesced += out;
> > 
> > Perusing the code I see that earlier there is a check to see if "in" is not
> > zero, and, if so, error out of the loop.  After the check, "in" is not
> > touched until it is added to bufs_coalesced, effectively not changing
> > bufs_coalesced, meaning bufs_coalesced will never trigger the conditions
> > below.
> 
> Yes. It definitely should be 'out'. 'in' should be 0 in the tx path.
> 
> I tried a simpler version of this patch without any tunables by
> delaying the signaling until we come out of the for loop.
> It definitely reduced the number of vmexits significantly for small message
> guest to host stream test and the throughput went up a little.
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 9b3ca10..5f9fae9 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -197,7 +197,7 @@ static void handle_tx(struct vhost_net *net)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		vhost_add_used(vq, head, 0);
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> @@ -205,6 +205,8 @@ static void handle_tx(struct vhost_net *net)
>  		}
>  	}
>  
> +	if (total_len > 0)
> +		vhost_signal(&net->dev, vq);
>  	mutex_unlock(&vq->mutex);
>  }
>  
> 
> > 
> > Or am I missing something?
> > 
> > > +      if (unlikely(packets_coalesced > tx_packets_coalesce ||
> > > +              bytes_coalesced > tx_bytes_coalesce ||
> > > +              bufs_coalesced > tx_bufs_coalesce))
> > > +         vhost_add_used_and_signal(&net->dev, vq, head, 0);
> > > +      else
> > > +         vhost_add_used(vq, head, 0);
> > >        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> > >           vhost_poll_queue(&vq->poll);
> > >           break;
> > >        }
> > >     }
> > >
> > > +   if (likely(packets_coalesced > tx_packets_coalesce ||
> > > +         bytes_coalesced > tx_bytes_coalesce ||
> > > +         bufs_coalesced > tx_bufs_coalesce))
> > > +      vhost_signal(&net->dev, vq);
> > >     mutex_unlock(&vq->mutex);
> > >  }
> 
> It is possible that we can miss signaling the guest even after
> processing a few pkts, if we don't hit any of these conditions.

Yes. It really should be
   if (likely(packets_coalesced && bytes_coalesced && bufs_coalesced))
      vhost_signal(&net->dev, vq);

> > >
> > 
> > Steve D.
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next-2.6 v5 1/1] can: c_can: Added support for Bosch C_CAN controller
From: Wolfgang Grandegger @ 2011-02-01  7:23 UTC (permalink / raw)
  To: Bhupesh SHARMA
  Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Marc Kleine-Budde,
	David Miller
In-Reply-To: <D5ECB3C7A6F99444980976A8C6D896384DEE0832F8-8vAmw3ZAcdzhJTuQ9jeba9BPR1lH4CV8@public.gmane.org>

Hi Bhupesh,

On 02/01/2011 05:29 AM, Bhupesh SHARMA wrote:
> Hello Wolfgang,
> 
>> ...
>>>>> +		/* handle error on the bus */
>>>>> +		lec_type = c_can_has_and_handle_berr(priv);
>>>>> +		if (lec_type && (error_type != C_CAN_NO_ERROR))
>>>>> +			work_done += c_can_err(dev, error_type, lec_type);
>>>>
>>>> State changes are only reported if berr_reporting is enabled and a
>> bus
>>>> error occured. This needs to be fixed.
>>>
>>> As I mentioned earlier in a response to a review comment, the Bus
>> Error
>>> reporting for C_CAN seems different from sja1000 and flexcan
>> approaches.
>>> Do you think it will be useful to drop CAN_CTRLMODE_BERR_REPORTING
>> from
>>> priv->can.ctrlmode_supported as done by *pch* driver? Or do you have
>>> a better idea..
>>
>> You bus error reporting is OK. The problem is that it does not only
>> affect bus errors but also state changes. State change messages should
>> alway be send independent of priv->can.ctrlmode. It's just a matter of
>> moving code to the right location. E.g. the code snippet above inside
>> c_can_err() before you check for bus errors.
>>
>>>> Feel free to send the output of "candump any,0:0,#FFFFFFFF" when
>>>> sending
>>>> messages without cable connected and with a bus error provocuted.
>>>
>>> OK. I will try to cross-compile candump for my arm-v7 architecture
>>> and will send the output.
>>
> 
> I did some changes to the code to ensure that the state change and lec
> handling are handled separately and properly.

Great.

> Please find the candump any,0:0,#FFFFFFFF output below:
> 
> 1. With No-Cable connected, I keep getting:
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME
>   can0  20000004  [8] 00 28 00 00 00 00 00 00   ERRORFRAME

The SJA1000 reports:

---- Error-active -> error-warning -> error-passive
---- cable disconnected
-bash-3.2# ./candump -t d any,0:0,#FFFFFFFF

 (0.000000)  can0  20000004  [8] 00 08 00 00 00 00 60 00   ERROR-WARNING
 (0.002588)  can0  20000004  [8] 00 20 00 00 00 00 80 00   ERROR-PASSIVE

I wonder why you get data[1]=28 immediately. Also it's nice to have the
txerr and rxerr in data[6..7].

> 2. With Tx and Rx shorted to simulate bus-error, I get:
>   can0  20000044  [8] 00 20 00 00 00 00 00 00   ERRORFRAME

Also here, you should see:

---- Error-active -> error-warning -> error-passive -> bus-off

-bash-3.2# ./candump -t d any,0:0,#FFFFFFFF
 (0.000000)  can0  20000004  [8] 00 08 00 00 00 00 88 00   ERROR-WARNING
 (0.001362)  can0  20000004  [8] 00 20 00 00 00 00 88 00   ERROR-PASSIVE
 (0.001560)  can0  20000044  [8] 00 00 00 00 00 00 7F 00   BUS-OFF

> In case 2, when I enable debug messages I get the correct state change sequence:
> entered error warning state
> entered error passive state
> entered bus off state

As the log confirms.

> Does this result seem fine to you?

Not yet. The output with "berr-reporting on" should also be checked, as
should the output when you recover from bus-off manually (using "ip link set
type can restart").

Thanks,

Wolfgang.

^ permalink raw reply

* RE: [PATCH] vxge: Fix wrong boolean operator {nodisc}
From: Ramkrishna Vepa @ 2011-02-01  7:26 UTC (permalink / raw)
  To: Stefan Weil
  Cc: Sivakumar Subramani, Sreenivasa Honnur, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Jon Mason
In-Reply-To: <1296253817-7668-1-git-send-email-weil@mail.berlios.de>

> This error is reported by cppcheck:
> drivers/net/vxge/vxge-config.c:3693: warning: Mutual exclusion over ||
> always evaluates to true. Did you intend to use && instead?
> 
> It looks like cppcheck is correct, so fix this. No test was run.
> 
> Signed-off-by: Stefan Weil <weil@mail.berlios.de>
> ---
>  drivers/net/vxge/vxge-config.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/vxge/vxge-config.c b/drivers/net/vxge/vxge-
> config.c
> index 01c05f5..228d4f7 100644
> --- a/drivers/net/vxge/vxge-config.c
> +++ b/drivers/net/vxge/vxge-config.c
> @@ -3690,7 +3690,7 @@ __vxge_hw_vpath_rts_table_get(struct
> __vxge_hw_vpath_handle *vp,
>  	if (status != VXGE_HW_OK)
>  		goto exit;
> 
> -	if ((rts_table != VXGE_HW_RTS_ACCESS_STEER_CTRL_DATA_STRUCT_SEL_DA)
> ||
> +	if ((rts_table != VXGE_HW_RTS_ACCESS_STEER_CTRL_DATA_STRUCT_SEL_DA)
> &&
>  	    (rts_table !=
>  	     VXGE_HW_RTS_ACS_STEER_CTRL_DATA_STRUCT_SEL_RTH_MULTI_IT))
>  		*data1 = 0;
> --
Fix looks good. Thanks!

Acked-by: Ram Vepa <ram.vepa@exar.com>

^ permalink raw reply

* [patch 4/7] [PATCH] qeth: allow HiperSockets framesize change in suspend
From: frank.blaschka @ 2011-02-01  8:16 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390, Ursula Braun
In-Reply-To: <20110201081647.173934635@de.ibm.com>

[-- Attachment #1: 607-qeth-chparm-change.diff --]
[-- Type: text/plain, Size: 1268 bytes --]

From: Ursula Braun <ursula.braun@de.ibm.com>

For HiperSockets the framesize-definition determines the selected
mtu-size and the size of the allocated qdio buffers.
A framesize-change may occur while a Linux system with a probed
HiperSockets device is in suspend state. This patch enables proper
resuming of a HiperSockets device in this case.

Signed-off-by: Ursula Braun <ursula.braun@de.ibm.com>
Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
---


 drivers/s390/net/qeth_core_main.c |   10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -1887,8 +1887,16 @@ static int qeth_ulp_enable_cb(struct qet
 			QETH_DBF_TEXT_(SETUP, 2, "  rc%d", iob->rc);
 			return 0;
 		}
-		card->info.max_mtu = mtu;
+		if (card->info.initial_mtu && (card->info.initial_mtu != mtu)) {
+			/* frame size has changed */
+			if (card->dev &&
+			    ((card->dev->mtu == card->info.initial_mtu) ||
+			     (card->dev->mtu > mtu)))
+				card->dev->mtu = mtu;
+			qeth_free_qdio_buffers(card);
+		}
 		card->info.initial_mtu = mtu;
+		card->info.max_mtu = mtu;
 		card->qdio.in_buf_size = mtu + 2 * PAGE_SIZE;
 	} else {
 		card->info.initial_mtu = qeth_get_initial_mtu_for_card(card);


^ permalink raw reply

* [patch 0/7] s390: network patches for net-next
From: frank.blaschka @ 2011-02-01  8:16 UTC (permalink / raw)
  To: davem; +Cc: netdev, linux-s390

Hi Dave,

here are some patches for net-next.

shortlog:
Ursula Braun (3)
qeth: show new mac-address if its setting fails
qeth: allow HiperSockets framesize change in suspend
qeth: allow OSA CHPARM change in suspend state

Frank Blaschka (1)
qeth: add more strict MTU checking

Horst Hartmann (1)
net,s390: provide architecture specific NET_SKB_PAD

Stefan Weil (2)
s390: Fix wrong size in memcmp (netiucv)
s390: Fix possibly wrong size in strncmp (smsgiucv)

Thanks,
        Frank


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox