Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net-2.6 2/2] be2net: remove netif_stop_queue being called before register_netdev.
From: David Miller @ 2011-02-01 23:42 UTC (permalink / raw)
  To: ajit.khaparde; +Cc: netdev
In-Reply-To: <20110131232755.GA4691@akhaparde-VBox>

From: Ajit Khaparde <ajit.khaparde@emulex.com>
Date: Mon, 31 Jan 2011 17:27:55 -0600

> It is illegal to call netif_stop_queue before register_netdev.
> 
> Signed-off-by: Ajit Khaparde <ajit.khaparde@emulex.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-2.6 1/2] be2net: fix a crash seen during insmod/rmmod test
From: David Miller @ 2011-02-01 23:41 UTC (permalink / raw)
  To: ajit.khaparde; +Cc: netdev
In-Reply-To: <20110131232704.GA4635@akhaparde-VBox>

From: Ajit Khaparde <ajit.khaparde@emulex.com>
Date: Mon, 31 Jan 2011 17:27:04 -0600

> While running insmod/rmmod in a loop, an unnecessary netif_stop_queue
> causes the system to crash. Remove the netif_stop_queue call
> and netif_start_queue in the link status update path.
> 
> Signed-off-by: Ajit Khaparde <ajit.khaparde@emulex.com>

Applied.

^ permalink raw reply

* [PATCH] ipv4: Rename fib_hash_* locals in fib_semantics.c
From: David Miller @ 2011-02-01 23:38 UTC (permalink / raw)
  To: netdev


To avoid confusion with the recently deleted fib_hash.c
code, use "fib_info_hash_*" instead of plain "fib_hash_*".

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c |   40 ++++++++++++++++++++--------------------
 1 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b15857d..146bd82 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
 static DEFINE_SPINLOCK(fib_info_lock);
 static struct hlist_head *fib_info_hash;
 static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
+static unsigned int fib_info_hash_size;
 static unsigned int fib_info_cnt;
 
 #define DEVINDEX_HASHBITS 8
@@ -223,7 +223,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
 
 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 {
-	unsigned int mask = (fib_hash_size - 1);
+	unsigned int mask = (fib_info_hash_size - 1);
 	unsigned int val = fi->fib_nhs;
 
 	val ^= fi->fib_protocol;
@@ -615,14 +615,14 @@ out:
 
 static inline unsigned int fib_laddr_hashfn(__be32 val)
 {
-	unsigned int mask = (fib_hash_size - 1);
+	unsigned int mask = (fib_info_hash_size - 1);
 
 	return ((__force u32)val ^
 		((__force u32)val >> 7) ^
 		((__force u32)val >> 14)) & mask;
 }
 
-static struct hlist_head *fib_hash_alloc(int bytes)
+static struct hlist_head *fib_info_hash_alloc(int bytes)
 {
 	if (bytes <= PAGE_SIZE)
 		return kzalloc(bytes, GFP_KERNEL);
@@ -632,7 +632,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
 					 get_order(bytes));
 }
 
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
 {
 	if (!hash)
 		return;
@@ -643,18 +643,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
 		free_pages((unsigned long) hash, get_order(bytes));
 }
 
-static void fib_hash_move(struct hlist_head *new_info_hash,
-			  struct hlist_head *new_laddrhash,
-			  unsigned int new_size)
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+			       struct hlist_head *new_laddrhash,
+			       unsigned int new_size)
 {
 	struct hlist_head *old_info_hash, *old_laddrhash;
-	unsigned int old_size = fib_hash_size;
+	unsigned int old_size = fib_info_hash_size;
 	unsigned int i, bytes;
 
 	spin_lock_bh(&fib_info_lock);
 	old_info_hash = fib_info_hash;
 	old_laddrhash = fib_info_laddrhash;
-	fib_hash_size = new_size;
+	fib_info_hash_size = new_size;
 
 	for (i = 0; i < old_size; i++) {
 		struct hlist_head *head = &fib_info_hash[i];
@@ -695,8 +695,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
 	spin_unlock_bh(&fib_info_lock);
 
 	bytes = old_size * sizeof(struct hlist_head *);
-	fib_hash_free(old_info_hash, bytes);
-	fib_hash_free(old_laddrhash, bytes);
+	fib_info_hash_free(old_info_hash, bytes);
+	fib_info_hash_free(old_laddrhash, bytes);
 }
 
 struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -720,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 #endif
 
 	err = -ENOBUFS;
-	if (fib_info_cnt >= fib_hash_size) {
-		unsigned int new_size = fib_hash_size << 1;
+	if (fib_info_cnt >= fib_info_hash_size) {
+		unsigned int new_size = fib_info_hash_size << 1;
 		struct hlist_head *new_info_hash;
 		struct hlist_head *new_laddrhash;
 		unsigned int bytes;
@@ -729,15 +729,15 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 		if (!new_size)
 			new_size = 1;
 		bytes = new_size * sizeof(struct hlist_head *);
-		new_info_hash = fib_hash_alloc(bytes);
-		new_laddrhash = fib_hash_alloc(bytes);
+		new_info_hash = fib_info_hash_alloc(bytes);
+		new_laddrhash = fib_info_hash_alloc(bytes);
 		if (!new_info_hash || !new_laddrhash) {
-			fib_hash_free(new_info_hash, bytes);
-			fib_hash_free(new_laddrhash, bytes);
+			fib_info_hash_free(new_info_hash, bytes);
+			fib_info_hash_free(new_laddrhash, bytes);
 		} else
-			fib_hash_move(new_info_hash, new_laddrhash, new_size);
+			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
 
-		if (!fib_hash_size)
+		if (!fib_info_hash_size)
 			goto failure;
 	}
 
-- 
1.7.4


^ permalink raw reply related

* [PATCH] ipv4: Update some fib_hash centric interface names.
From: David Miller @ 2011-02-01 23:38 UTC (permalink / raw)
  To: netdev


fib_hash_init() --> fib_trie_init()
fib_hash_table() --> fib_trie_table()

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |    6 +++---
 net/ipv4/fib_frontend.c |    8 ++++----
 net/ipv4/fib_trie.c     |    5 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 819d61c..08b46b8 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -228,9 +228,9 @@ extern int fib_sync_up(struct net_device *dev);
 extern __be32  __fib_res_prefsrc(struct fib_result *res);
 extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res);
 
-/* Exported by fib_{hash|trie}.c */
-extern void fib_hash_init(void);
-extern struct fib_table *fib_hash_table(u32 id);
+/* Exported by fib_trie.c */
+extern void fib_trie_init(void);
+extern struct fib_table *fib_trie_table(u32 id);
 
 static inline void fib_combine_itag(u32 *itag, struct fib_result *res)
 {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 930768b..2a49c06 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
 
-	local_table = fib_hash_table(RT_TABLE_LOCAL);
+	local_table = fib_trie_table(RT_TABLE_LOCAL);
 	if (local_table == NULL)
 		return -ENOMEM;
 
-	main_table  = fib_hash_table(RT_TABLE_MAIN);
+	main_table  = fib_trie_table(RT_TABLE_MAIN);
 	if (main_table == NULL)
 		goto fail;
 
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	if (tb)
 		return tb;
 
-	tb = fib_hash_table(id);
+	tb = fib_trie_table(id);
 	if (!tb)
 		return NULL;
 	h = id & (FIB_TABLE_HASHSZ - 1);
@@ -1086,5 +1086,5 @@ void __init ip_fib_init(void)
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-	fib_hash_init();
+	fib_trie_init();
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 16d589c..73cb984 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1916,7 +1916,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 	return skb->len;
 }
 
-void __init fib_hash_init(void)
+void __init fib_trie_init(void)
 {
 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
 					  sizeof(struct fib_alias),
@@ -1929,8 +1929,7 @@ void __init fib_hash_init(void)
 }
 
 
-/* Fix more generic FIB names for init later */
-struct fib_table *fib_hash_table(u32 id)
+struct fib_table *fib_trie_table(u32 id)
 {
 	struct fib_table *tb;
 	struct trie *t;
-- 
1.7.4


^ permalink raw reply related

* Re: [PATCH] ipv4: Remove fib_hash.
From: David Miller @ 2011-02-01 23:35 UTC (permalink / raw)
  To: shemminger; +Cc: netdev
In-Reply-To: <20110201153332.46a38ad8@nehalam>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 1 Feb 2011 15:33:32 -0800

> On Tue, 1 Feb 2011 15:26:03 -0800
> Stephen Hemminger <shemminger@vyatta.com> wrote:
> 
>> On Tue, 01 Feb 2011 15:19:41 -0800 (PST)
>> David Miller <davem@davemloft.net> wrote:
>> 
>> > 
>> > The time has finally come to remove the hash based routing table
>> > implementation in ipv4.
>> > 
>> > FIB Trie is mature, well tested, and I've done an audit of its code
>> > to confirm that it implements insert, delete, and lookup with the same
>> > identical semantics as fib_hash did.
>> > 
>> > If there are any semantic differences found in fib_trie, we should
>> > simply fix them.
>> > 
>> > I've placed the trie statistic config option under advanced router
>> > configuration.
>> > 
>> > Signed-off-by: David S. Miller <davem@davemloft.net>
>> 
>> Vyatta has been shipping FIB TRIE for over 3 years with reported
>> bugs.
> 
> NO reported bugs!!

Understood. :-)

Thanks for reviewing Stephen.

^ permalink raw reply

* Re: [PATCH] ipv4: Remove fib_hash.
From: Stephen Hemminger @ 2011-02-01 23:33 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev
In-Reply-To: <20110201152603.3ce5bd26@nehalam>

On Tue, 1 Feb 2011 15:26:03 -0800
Stephen Hemminger <shemminger@vyatta.com> wrote:

> On Tue, 01 Feb 2011 15:19:41 -0800 (PST)
> David Miller <davem@davemloft.net> wrote:
> 
> > 
> > The time has finally come to remove the hash based routing table
> > implementation in ipv4.
> > 
> > FIB Trie is mature, well tested, and I've done an audit of its code
> > to confirm that it implements insert, delete, and lookup with the same
> > identical semantics as fib_hash did.
> > 
> > If there are any semantic differences found in fib_trie, we should
> > simply fix them.
> > 
> > I've placed the trie statistic config option under advanced router
> > configuration.
> > 
> > Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> Vyatta has been shipping FIB TRIE for over 3 years with reported
> bugs.

NO reported bugs!!

-- 

^ permalink raw reply

* Re: [PATCHv2 dontapply] vhost-net tx tuning
From: Shirley Ma @ 2011-02-01 23:27 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, Steve Dobbelstein, mashirle, kvm, netdev
In-Reply-To: <1296601658.30191.46.camel@sridhar.beaverton.ibm.com>

On Tue, 2011-02-01 at 15:07 -0800, Sridhar Samudrala wrote:
> I think the counters that exceed the limits need to be reset to 0
> here.
> Otherwise we keep signaling for every buffer once we hit this
> condition. 

I will modify the patch to rerun the test to see the difference.

Shirley


^ permalink raw reply

* Re: [PATCH] ipv4: Remove fib_hash.
From: Stephen Hemminger @ 2011-02-01 23:26 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110201.151941.48509941.davem@davemloft.net>

On Tue, 01 Feb 2011 15:19:41 -0800 (PST)
David Miller <davem@davemloft.net> wrote:

> 
> The time has finally come to remove the hash based routing table
> implementation in ipv4.
> 
> FIB Trie is mature, well tested, and I've done an audit of its code
> to confirm that it implements insert, delete, and lookup with the same
> identical semantics as fib_hash did.
> 
> If there are any semantic differences found in fib_trie, we should
> simply fix them.
> 
> I've placed the trie statistic config option under advanced router
> configuration.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>

Vyatta has been shipping FIB TRIE for over 3 years with reported
bugs.

Acked-by: Stephen Hemminger <shemminger@vyatta.com>

-- 

^ permalink raw reply

* [PATCH] ipv4: Remove fib_hash.
From: David Miller @ 2011-02-01 23:19 UTC (permalink / raw)
  To: netdev


The time has finally come to remove the hash based routing table
implementation in ipv4.

FIB Trie is mature, well tested, and I've done an audit of its code
to confirm that it implements insert, delete, and lookup with the same
identical semantics as fib_hash did.

If there are any semantic differences found in fib_trie, we should
simply fix them.

I've placed the trie statistic config option under advanced router
configuration.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig    |   38 +--
 net/ipv4/Makefile   |    4 +-
 net/ipv4/fib_hash.c | 1061 ---------------------------------------------------
 3 files changed, 2 insertions(+), 1101 deletions(-)
 delete mode 100644 net/ipv4/fib_hash.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8949a05..cbb505b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
-choice
-	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
-	depends on IP_ADVANCED_ROUTER
-	default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algorithm.
-	  This improves lookup performance if you have a large
-	  number of routes.
-
-	  LC-trie is a longest matching prefix lookup algorithm which
-	  performs better than FIB_HASH for large routing tables.
-	  But, it consumes more memory and is more complex.
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
-	  June 1999
-
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
-	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
 config IP_FIB_TRIE_STATS
 	bool "FIB TRIE statistics"
-	depends on IP_FIB_TRIE
+	depends on IP_ADVANCED_ROUTER
 	---help---
 	  Keep track of statistics on structure of FIB TRIE table.
 	  Useful for testing and measuring TRIE performance.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22..0dc772d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
-	     fib_frontend.o fib_semantics.o \
+	     fib_frontend.o fib_semantics.o fib_trie.o \
 	     inet_fragment.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index fadb602..0000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1061 +0,0 @@
-/*
- * INET		An implementation of the TCP/IP protocol suite for the LINUX
- *		operating system.  INET is implemented using the  BSD Socket
- *		interface as the means of communication with the user level.
- *
- *		IPv4 FIB: lookup engine and maintenance routines.
- *
- * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		as published by the Free Software Foundation; either version
- *		2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static struct kmem_cache *fn_hash_kmem __read_mostly;
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-
-struct fib_node {
-	struct hlist_node	fn_hash;
-	struct list_head	fn_alias;
-	__be32			fn_key;
-	struct fib_alias        fn_embedded_alias;
-};
-
-#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
-
-struct fn_zone {
-	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
-	struct hlist_head __rcu	*fz_hash;	/* Hash table pointer	*/
-	seqlock_t		fz_lock;
-	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
-
-	u8			fz_order;	/* Zone order (0..32)	*/
-	u8			fz_revorder;	/* 32 - fz_order	*/
-	__be32			fz_mask;	/* inet_make_mask(order) */
-#define FZ_MASK(fz)		((fz)->fz_mask)
-
-	struct hlist_head	fz_embedded_hash[EMBEDDED_HASH_SIZE];
-
-	int			fz_nent;	/* Number of entries	*/
-	int			fz_divisor;	/* Hash size (mask+1)	*/
-};
-
-struct fn_hash {
-	struct fn_zone		*fn_zones[33];
-	struct fn_zone __rcu	*fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
-	u32 h = ntohl(key) >> fz->fz_revorder;
-	h ^= (h>>20);
-	h ^= (h>>10);
-	h ^= (h>>5);
-	h &= fz->fz_hashmask;
-	return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
-	return dst & FZ_MASK(fz);
-}
-
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
-	unsigned long size = divisor * sizeof(struct hlist_head);
-
-	if (size <= PAGE_SIZE)
-		return kzalloc(size, GFP_KERNEL);
-
-	return (struct hlist_head *)
-		__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
-				   struct hlist_head *old_ht,
-				   int old_divisor)
-{
-	int i;
-
-	for (i = 0; i < old_divisor; i++) {
-		struct hlist_node *node, *n;
-		struct fib_node *f;
-
-		hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
-			struct hlist_head *new_head;
-
-			hlist_del_rcu(&f->fn_hash);
-
-			new_head = rcu_dereference_protected(fz->fz_hash, 1) +
-				   fn_hash(f->fn_key, fz);
-			hlist_add_head_rcu(&f->fn_hash, new_head);
-		}
-	}
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
-	unsigned long size = divisor * sizeof(struct hlist_head);
-
-	if (size <= PAGE_SIZE)
-		kfree(hash);
-	else
-		free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
-	struct hlist_head *ht, *old_ht;
-	int old_divisor, new_divisor;
-	u32 new_hashmask;
-
-	new_divisor = old_divisor = fz->fz_divisor;
-
-	switch (old_divisor) {
-	case EMBEDDED_HASH_SIZE:
-		new_divisor *= EMBEDDED_HASH_SIZE;
-		break;
-	case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
-		new_divisor *= (EMBEDDED_HASH_SIZE/2);
-		break;
-	default:
-		if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
-			printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
-			return;
-		}
-		new_divisor = (old_divisor << 1);
-		break;
-	}
-
-	new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
-	printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
-	       fz->fz_order, old_divisor);
-#endif
-
-	ht = fz_hash_alloc(new_divisor);
-
-	if (ht)	{
-		struct fn_zone nfz;
-
-		memcpy(&nfz, fz, sizeof(nfz));
-
-		write_seqlock_bh(&fz->fz_lock);
-		old_ht = rcu_dereference_protected(fz->fz_hash, 1);
-		RCU_INIT_POINTER(nfz.fz_hash, ht);
-		nfz.fz_hashmask = new_hashmask;
-		nfz.fz_divisor = new_divisor;
-		fn_rebuild_zone(&nfz, old_ht, old_divisor);
-		fib_hash_genid++;
-		rcu_assign_pointer(fz->fz_hash, ht);
-		fz->fz_hashmask = new_hashmask;
-		fz->fz_divisor = new_divisor;
-		write_sequnlock_bh(&fz->fz_lock);
-
-		if (old_ht != fz->fz_embedded_hash) {
-			synchronize_rcu();
-			fz_hash_free(old_ht, old_divisor);
-		}
-	}
-}
-
-static void fn_free_node_rcu(struct rcu_head *head)
-{
-	struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
-
-	kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_node(struct fib_node *f)
-{
-	call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
-}
-
-static void fn_free_alias_rcu(struct rcu_head *head)
-{
-	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
-
-	kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
-{
-	fib_release_info(fa->fa_info);
-	if (fa == &f->fn_embedded_alias)
-		fa->fa_info = NULL;
-	else
-		call_rcu(&fa->rcu, fn_free_alias_rcu);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
-	int i;
-	struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
-	if (!fz)
-		return NULL;
-
-	seqlock_init(&fz->fz_lock);
-	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
-	fz->fz_hashmask = fz->fz_divisor - 1;
-	RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
-	fz->fz_order = z;
-	fz->fz_revorder = 32 - z;
-	fz->fz_mask = inet_make_mask(z);
-
-	/* Find the first not empty zone with more specific mask */
-	for (i = z + 1; i <= 32; i++)
-		if (table->fn_zones[i])
-			break;
-	if (i > 32) {
-		/* No more specific masks, we are the first. */
-		rcu_assign_pointer(fz->fz_next,
-				   rtnl_dereference(table->fn_zone_list));
-		rcu_assign_pointer(table->fn_zone_list, fz);
-	} else {
-		rcu_assign_pointer(fz->fz_next,
-				   rtnl_dereference(table->fn_zones[i]->fz_next));
-		rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
-	}
-	table->fn_zones[z] = fz;
-	fib_hash_genid++;
-	return fz;
-}
-
-int fib_table_lookup(struct fib_table *tb,
-		     const struct flowi *flp, struct fib_result *res,
-		     int fib_flags)
-{
-	int err;
-	struct fn_zone *fz;
-	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-
-	rcu_read_lock();
-	for (fz = rcu_dereference(t->fn_zone_list);
-	     fz != NULL;
-	     fz = rcu_dereference(fz->fz_next)) {
-		struct hlist_head *head;
-		struct hlist_node *node;
-		struct fib_node *f;
-		__be32 k;
-		unsigned int seq;
-
-		do {
-			seq = read_seqbegin(&fz->fz_lock);
-			k = fz_key(flp->fl4_dst, fz);
-
-			head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
-			hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-				if (f->fn_key != k)
-					continue;
-
-				err = fib_semantic_match(tb, &f->fn_alias,
-						 flp, res,
-						 fz->fz_order, fib_flags);
-				if (err <= 0)
-					goto out;
-			}
-		} while (read_seqretry(&fz->fz_lock, seq));
-	}
-	err = 1;
-out:
-	rcu_read_unlock();
-	return err;
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
-	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
-
-	hlist_add_head_rcu(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
-	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
-	struct hlist_node *node;
-	struct fib_node *f;
-
-	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-		if (f->fn_key == key)
-			return f;
-	}
-
-	return NULL;
-}
-
-
-static struct fib_alias *fib_fast_alloc(struct fib_node *f)
-{
-	struct fib_alias *fa = &f->fn_embedded_alias;
-
-	if (fa->fa_info != NULL)
-		fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
-	return fa;
-}
-
-/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
-{
-	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
-	struct fib_node *new_f = NULL;
-	struct fib_node *f;
-	struct fib_alias *fa, *new_fa;
-	struct fn_zone *fz;
-	struct fib_info *fi;
-	u8 tos = cfg->fc_tos;
-	__be32 key;
-	int err;
-
-	if (cfg->fc_dst_len > 32)
-		return -EINVAL;
-
-	fz = table->fn_zones[cfg->fc_dst_len];
-	if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
-		return -ENOBUFS;
-
-	key = 0;
-	if (cfg->fc_dst) {
-		if (cfg->fc_dst & ~FZ_MASK(fz))
-			return -EINVAL;
-		key = fz_key(cfg->fc_dst, fz);
-	}
-
-	fi = fib_create_info(cfg);
-	if (IS_ERR(fi))
-		return PTR_ERR(fi);
-
-	if (fz->fz_nent > (fz->fz_divisor<<1) &&
-	    fz->fz_divisor < FZ_MAX_DIVISOR &&
-	    (cfg->fc_dst_len == 32 ||
-	     (1 << cfg->fc_dst_len) > fz->fz_divisor))
-		fn_rehash_zone(fz);
-
-	f = fib_find_node(fz, key);
-
-	if (!f)
-		fa = NULL;
-	else
-		fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
-	/* Now fa, if non-NULL, points to the first fib alias
-	 * with the same keys [prefix,tos,priority], if such key already
-	 * exists or to the node before which we will insert new one.
-	 *
-	 * If fa is NULL, we will need to allocate a new one and
-	 * insert to the head of f.
-	 *
-	 * If f is NULL, no fib node matched the destination key
-	 * and we need to allocate a new one of those as well.
-	 */
-
-	if (fa && fa->fa_tos == tos &&
-	    fa->fa_info->fib_priority == fi->fib_priority) {
-		struct fib_alias *fa_first, *fa_match;
-
-		err = -EEXIST;
-		if (cfg->fc_nlflags & NLM_F_EXCL)
-			goto out;
-
-		/* We have 2 goals:
-		 * 1. Find exact match for type, scope, fib_info to avoid
-		 * duplicate routes
-		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
-		 */
-		fa_match = NULL;
-		fa_first = fa;
-		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
-		list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
-			if (fa->fa_tos != tos)
-				break;
-			if (fa->fa_info->fib_priority != fi->fib_priority)
-				break;
-			if (fa->fa_type == cfg->fc_type &&
-			    fa->fa_scope == cfg->fc_scope &&
-			    fa->fa_info == fi) {
-				fa_match = fa;
-				break;
-			}
-		}
-
-		if (cfg->fc_nlflags & NLM_F_REPLACE) {
-			u8 state;
-
-			fa = fa_first;
-			if (fa_match) {
-				if (fa == fa_match)
-					err = 0;
-				goto out;
-			}
-			err = -ENOBUFS;
-			new_fa = fib_fast_alloc(f);
-			if (new_fa == NULL)
-				goto out;
-
-			new_fa->fa_tos = fa->fa_tos;
-			new_fa->fa_info = fi;
-			new_fa->fa_type = cfg->fc_type;
-			new_fa->fa_scope = cfg->fc_scope;
-			state = fa->fa_state;
-			new_fa->fa_state = state & ~FA_S_ACCESSED;
-			fib_hash_genid++;
-			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
-			fn_free_alias(fa, f);
-			if (state & FA_S_ACCESSED)
-				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-			rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
-				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
-			return 0;
-		}
-
-		/* Error if we find a perfect match which
-		 * uses the same scope, type, and nexthop
-		 * information.
-		 */
-		if (fa_match)
-			goto out;
-
-		if (!(cfg->fc_nlflags & NLM_F_APPEND))
-			fa = fa_first;
-	}
-
-	err = -ENOENT;
-	if (!(cfg->fc_nlflags & NLM_F_CREATE))
-		goto out;
-
-	err = -ENOBUFS;
-
-	if (!f) {
-		new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
-		if (new_f == NULL)
-			goto out;
-
-		INIT_HLIST_NODE(&new_f->fn_hash);
-		INIT_LIST_HEAD(&new_f->fn_alias);
-		new_f->fn_key = key;
-		f = new_f;
-	}
-
-	new_fa = fib_fast_alloc(f);
-	if (new_fa == NULL)
-		goto out;
-
-	new_fa->fa_info = fi;
-	new_fa->fa_tos = tos;
-	new_fa->fa_type = cfg->fc_type;
-	new_fa->fa_scope = cfg->fc_scope;
-	new_fa->fa_state = 0;
-
-	/*
-	 * Insert new entry to the list.
-	 */
-
-	if (new_f)
-		fib_insert_node(fz, new_f);
-	list_add_tail_rcu(&new_fa->fa_list,
-		 (fa ? &fa->fa_list : &f->fn_alias));
-	fib_hash_genid++;
-
-	if (new_f)
-		fz->fz_nent++;
-	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-
-	rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
-		  &cfg->fc_nlinfo, 0);
-	return 0;
-
-out:
-	if (new_f)
-		kmem_cache_free(fn_hash_kmem, new_f);
-	fib_release_info(fi);
-	return err;
-}
-
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
-{
-	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-	struct fib_node *f;
-	struct fib_alias *fa, *fa_to_delete;
-	struct fn_zone *fz;
-	__be32 key;
-
-	if (cfg->fc_dst_len > 32)
-		return -EINVAL;
-
-	if ((fz  = table->fn_zones[cfg->fc_dst_len]) == NULL)
-		return -ESRCH;
-
-	key = 0;
-	if (cfg->fc_dst) {
-		if (cfg->fc_dst & ~FZ_MASK(fz))
-			return -EINVAL;
-		key = fz_key(cfg->fc_dst, fz);
-	}
-
-	f = fib_find_node(fz, key);
-
-	if (!f)
-		fa = NULL;
-	else
-		fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
-	if (!fa)
-		return -ESRCH;
-
-	fa_to_delete = NULL;
-	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
-	list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
-		struct fib_info *fi = fa->fa_info;
-
-		if (fa->fa_tos != cfg->fc_tos)
-			break;
-
-		if ((!cfg->fc_type ||
-		     fa->fa_type == cfg->fc_type) &&
-		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
-		     fa->fa_scope == cfg->fc_scope) &&
-		    (!cfg->fc_protocol ||
-		     fi->fib_protocol == cfg->fc_protocol) &&
-		    fib_nh_match(cfg, fi) == 0) {
-			fa_to_delete = fa;
-			break;
-		}
-	}
-
-	if (fa_to_delete) {
-		int kill_fn;
-
-		fa = fa_to_delete;
-		rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
-			  tb->tb_id, &cfg->fc_nlinfo, 0);
-
-		kill_fn = 0;
-		list_del_rcu(&fa->fa_list);
-		if (list_empty(&f->fn_alias)) {
-			hlist_del_rcu(&f->fn_hash);
-			kill_fn = 1;
-		}
-		fib_hash_genid++;
-
-		if (fa->fa_state & FA_S_ACCESSED)
-			rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-		fn_free_alias(fa, f);
-		if (kill_fn) {
-			fn_free_node(f);
-			fz->fz_nent--;
-		}
-
-		return 0;
-	}
-	return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
-	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
-	struct hlist_node *node, *n;
-	struct fib_node *f;
-	int found = 0;
-
-	hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
-		struct fib_alias *fa, *fa_node;
-		int kill_f;
-
-		kill_f = 0;
-		list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
-			struct fib_info *fi = fa->fa_info;
-
-			if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-				list_del_rcu(&fa->fa_list);
-				if (list_empty(&f->fn_alias)) {
-					hlist_del_rcu(&f->fn_hash);
-					kill_f = 1;
-				}
-				fib_hash_genid++;
-
-				fn_free_alias(fa, f);
-				found++;
-			}
-		}
-		if (kill_f) {
-			fn_free_node(f);
-			fz->fz_nent--;
-		}
-	}
-	return found;
-}
-
-/* caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
-{
-	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
-	struct fn_zone *fz;
-	int found = 0;
-
-	for (fz = rtnl_dereference(table->fn_zone_list);
-	     fz != NULL;
-	     fz = rtnl_dereference(fz->fz_next)) {
-		int i;
-
-		for (i = fz->fz_divisor - 1; i >= 0; i--)
-			found += fn_flush_list(fz, i);
-	}
-	return found;
-}
-
-void fib_free_table(struct fib_table *tb)
-{
-	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
-	struct fn_zone *fz, *next;
-
-	next = table->fn_zone_list;
-	while (next != NULL) {
-		fz = next;
-		next = fz->fz_next;
-
-		if (fz->fz_hash != fz->fz_embedded_hash)
-			fz_hash_free(fz->fz_hash, fz->fz_divisor);
-
-		kfree(fz);
-	}
-
-	kfree(tb);
-}
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
-		     struct fib_table *tb,
-		     struct fn_zone *fz,
-		     struct hlist_head *head)
-{
-	struct hlist_node *node;
-	struct fib_node *f;
-	int i, s_i;
-
-	s_i = cb->args[4];
-	i = 0;
-	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-		struct fib_alias *fa;
-
-		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
-			if (i < s_i)
-				goto next;
-
-			if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
-					  cb->nlh->nlmsg_seq,
-					  RTM_NEWROUTE,
-					  tb->tb_id,
-					  fa->fa_type,
-					  fa->fa_scope,
-					  f->fn_key,
-					  fz->fz_order,
-					  fa->fa_tos,
-					  fa->fa_info,
-					  NLM_F_MULTI) < 0) {
-				cb->args[4] = i;
-				return -1;
-			}
-next:
-			i++;
-		}
-	}
-	cb->args[4] = i;
-	return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
-		   struct fib_table *tb,
-		   struct fn_zone *fz)
-{
-	int h, s_h;
-	struct hlist_head *head = rcu_dereference(fz->fz_hash);
-
-	if (head == NULL)
-		return skb->len;
-	s_h = cb->args[3];
-	for (h = s_h; h < fz->fz_divisor; h++) {
-		if (hlist_empty(head + h))
-			continue;
-		if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
-			cb->args[3] = h;
-			return -1;
-		}
-		memset(&cb->args[4], 0,
-		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
-	}
-	cb->args[3] = h;
-	return skb->len;
-}
-
-int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
-		   struct netlink_callback *cb)
-{
-	int m = 0, s_m;
-	struct fn_zone *fz;
-	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-
-	s_m = cb->args[2];
-	rcu_read_lock();
-	for (fz = rcu_dereference(table->fn_zone_list);
-	     fz != NULL;
-	     fz = rcu_dereference(fz->fz_next), m++) {
-		if (m < s_m)
-			continue;
-		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
-			cb->args[2] = m;
-			rcu_read_unlock();
-			return -1;
-		}
-		memset(&cb->args[3], 0,
-		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
-	}
-	rcu_read_unlock();
-	cb->args[2] = m;
-	return skb->len;
-}
-
-void __init fib_hash_init(void)
-{
-	fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
-					 0, SLAB_PANIC, NULL);
-
-	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
-					  0, SLAB_PANIC, NULL);
-
-}
-
-struct fib_table *fib_hash_table(u32 id)
-{
-	struct fib_table *tb;
-
-	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
-		     GFP_KERNEL);
-	if (tb == NULL)
-		return NULL;
-
-	tb->tb_id = id;
-	tb->tb_default = -1;
-
-	memset(tb->tb_data, 0, sizeof(struct fn_hash));
-	return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
-	struct seq_net_private p;
-	struct fn_zone	*zone;
-	int		bucket;
-	struct hlist_head *hash_head;
-	struct fib_node *fn;
-	struct fib_alias *fa;
-	loff_t pos;
-	unsigned int genid;
-	int valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
-	struct fib_iter_state *iter = seq->private;
-	struct fib_table *main_table;
-	struct fn_hash *table;
-
-	main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
-	table = (struct fn_hash *)main_table->tb_data;
-
-	iter->bucket    = 0;
-	iter->hash_head = NULL;
-	iter->fn        = NULL;
-	iter->fa        = NULL;
-	iter->pos	= 0;
-	iter->genid	= fib_hash_genid;
-	iter->valid	= 1;
-
-	for (iter->zone = rcu_dereference(table->fn_zone_list);
-	     iter->zone != NULL;
-	     iter->zone = rcu_dereference(iter->zone->fz_next)) {
-		int maxslot;
-
-		if (!iter->zone->fz_nent)
-			continue;
-
-		iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-		maxslot = iter->zone->fz_divisor;
-
-		for (iter->bucket = 0; iter->bucket < maxslot;
-		     ++iter->bucket, ++iter->hash_head) {
-			struct hlist_node *node;
-			struct fib_node *fn;
-
-			hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
-				struct fib_alias *fa;
-
-				list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-					iter->fn = fn;
-					iter->fa = fa;
-					goto out;
-				}
-			}
-		}
-	}
-out:
-	return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
-	struct fib_iter_state *iter = seq->private;
-	struct fib_node *fn;
-	struct fib_alias *fa;
-
-	/* Advance FA, if any. */
-	fn = iter->fn;
-	fa = iter->fa;
-	if (fa) {
-		BUG_ON(!fn);
-		list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
-			iter->fa = fa;
-			goto out;
-		}
-	}
-
-	fa = iter->fa = NULL;
-
-	/* Advance FN. */
-	if (fn) {
-		struct hlist_node *node = &fn->fn_hash;
-		hlist_for_each_entry_continue(fn, node, fn_hash) {
-			iter->fn = fn;
-
-			list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-				iter->fa = fa;
-				goto out;
-			}
-		}
-	}
-
-	fn = iter->fn = NULL;
-
-	/* Advance hash chain. */
-	if (!iter->zone)
-		goto out;
-
-	for (;;) {
-		struct hlist_node *node;
-		int maxslot;
-
-		maxslot = iter->zone->fz_divisor;
-
-		while (++iter->bucket < maxslot) {
-			iter->hash_head++;
-
-			hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
-				list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-					iter->fn = fn;
-					iter->fa = fa;
-					goto out;
-				}
-			}
-		}
-
-		iter->zone = rcu_dereference(iter->zone->fz_next);
-
-		if (!iter->zone)
-			goto out;
-
-		iter->bucket = 0;
-		iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-
-		hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
-			list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-				iter->fn = fn;
-				iter->fa = fa;
-				goto out;
-			}
-		}
-	}
-out:
-	iter->pos++;
-	return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct fib_iter_state *iter = seq->private;
-	struct fib_alias *fa;
-
-	if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
-		fa   = iter->fa;
-		pos -= iter->pos;
-	} else
-		fa = fib_get_first(seq);
-
-	if (fa)
-		while (pos && (fa = fib_get_next(seq)))
-			--pos;
-	return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
-{
-	void *v = NULL;
-
-	rcu_read_lock();
-	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
-		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
-	return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	++*pos;
-	return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
-	__releases(RCU)
-{
-	rcu_read_unlock();
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
-	static const unsigned type2flags[RTN_MAX + 1] = {
-		[7] = RTF_REJECT,
-		[8] = RTF_REJECT,
-	};
-	unsigned flags = type2flags[type];
-
-	if (fi && fi->fib_nh->nh_gw)
-		flags |= RTF_GATEWAY;
-	if (mask == htonl(0xFFFFFFFF))
-		flags |= RTF_HOST;
-	flags |= RTF_UP;
-	return flags;
-}
-
-/*
- *	This outputs /proc/net/route.
- *
- *	It always works in backward compatibility mode.
- *	The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
-	struct fib_iter_state *iter;
-	int len;
-	__be32 prefix, mask;
-	unsigned flags;
-	struct fib_node *f;
-	struct fib_alias *fa;
-	struct fib_info *fi;
-
-	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
-			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
-			   "\tWindow\tIRTT");
-		goto out;
-	}
-
-	iter	= seq->private;
-	f	= iter->fn;
-	fa	= iter->fa;
-	fi	= fa->fa_info;
-	prefix	= f->fn_key;
-	mask	= FZ_MASK(iter->zone);
-	flags	= fib_flag_trans(fa->fa_type, mask, fi);
-	if (fi)
-		seq_printf(seq,
-			 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
-			 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
-			 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
-			 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
-			 fi->fib_window,
-			 fi->fib_rtt >> 3, &len);
-	else
-		seq_printf(seq,
-			 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
-			 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
-
-	seq_printf(seq, "%*s\n", 127 - len, "");
-out:
-	return 0;
-}
-
-static const struct seq_operations fib_seq_ops = {
-	.start  = fib_seq_start,
-	.next   = fib_seq_next,
-	.stop   = fib_seq_stop,
-	.show   = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &fib_seq_ops,
-			    sizeof(struct fib_iter_state));
-}
-
-static const struct file_operations fib_seq_fops = {
-	.owner		= THIS_MODULE,
-	.open           = fib_seq_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release	= seq_release_net,
-};
-
-int __net_init fib_proc_init(struct net *net)
-{
-	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
-		return -ENOMEM;
-	return 0;
-}
-
-void __net_exit fib_proc_exit(struct net *net)
-{
-	proc_net_remove(net, "route");
-}
-#endif /* CONFIG_PROC_FS */
-- 
1.7.4


^ permalink raw reply related

* Re: [PATCHv2 dontapply] vhost-net tx tuning
From: Sridhar Samudrala @ 2011-02-01 23:07 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Steve Dobbelstein, mashirle, kvm, netdev
In-Reply-To: <20110201155253.GA22959@redhat.com>

On Tue, 2011-02-01 at 17:52 +0200, Michael S. Tsirkin wrote:
> OK, so thinking about it more, maybe the issue is this:
> tx becomes full. We process one request and interrupt the guest,
> then it adds one request and the queue is full again.
> 
> Maybe the following will help it stabilize?  By default with it we will
> only interrupt when we see an empty ring.
> Which is likely too much: pls try other values
> in the middle: e.g. make bufs half the ring,
> or bytes some small value like half ring * 200, or packets some
> small value etc.
> 
> Set any one parameter to 0 to get current
> behaviour (interrupt immediately when enabled).
> 
> Warning: completely untested.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> ---
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index aac05bc..6769cdc 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -32,6 +32,13 @@
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
> 
> +int tx_bytes_coalesce = 1000000000;
> +module_param(tx_bytes_coalesce, int, 0644);
> +int tx_bufs_coalesce = 1000000000;
> +module_param(tx_bufs_coalesce, int, 0644);
> +int tx_packets_coalesce = 1000000000;
> +module_param(tx_packets_coalesce, int, 0644);
> +
>  enum {
>  	VHOST_NET_VQ_RX = 0,
>  	VHOST_NET_VQ_TX = 1,
> @@ -127,6 +134,9 @@ static void handle_tx(struct vhost_net *net)
>  	int err, wmem;
>  	size_t hdr_size;
>  	struct socket *sock;
> +	int bytes_coalesced = 0;
> +	int bufs_coalesced = 0;
> +	int packets_coalesced = 0;
> 
>  	/* TODO: check that we are running from vhost_worker? */
>  	sock = rcu_dereference_check(vq->private_data, 1);
> @@ -196,14 +206,26 @@ static void handle_tx(struct vhost_net *net)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
>  		total_len += len;
> +		packets_coalesced += 1;
> +		bytes_coalesced += len;
> +		bufs_coalesced += out;
> +		if (unlikely(packets_coalesced > tx_packets_coalesce ||
> +			     bytes_coalesced > tx_bytes_coalesce ||
> +			     bufs_coalesced > tx_bufs_coalesce))
> +			vhost_add_used_and_signal(&net->dev, vq, head, 0);

I think the counters that exceed the limits need to be reset to 0 here.
Otherwise we keep signaling for every buffer once we hit this condition.

Thanks
Sridhar

> +		else
> +			vhost_add_used(vq, head, 0);
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
>  			break;
>  		}
>  	}
> 
> +	if (likely(packets_coalesced &&
> +		   bytes_coalesced &&
> +		   bufs_coalesced))
> +		vhost_signal(&net->dev, vq);
>  	mutex_unlock(&vq->mutex);
>  }
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: Network performance with small packets
From: Shirley Ma @ 2011-02-01 22:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, Steve Dobbelstein, David Miller, kvm, mashirle,
	netdev
In-Reply-To: <20110201215603.GA31348@redhat.com>

On Tue, 2011-02-01 at 23:56 +0200, Michael S. Tsirkin wrote:
> There are flags for bytes, buffers and packets.
> Try playing with any one of them :)
> Just be sure to use v2.
> 
> 
> >I would like to change it to
> > half of the ring size instead for signaling. Is that OK?
> > 
> > Shirley
> > 
> > 
> 
> Sure that is why I made it a parameter so you can experiment. 

The initial test results show that CPU utilization has been reduced
somewhat and BW has increased somewhat with the default parameters: for
1K message size, BW goes from 2.5Gb/s to about 2.8Gb/s and CPU
utilization drops from 4x% to 38% (similar to the results from the patch
I submitted a while ago to reduce signaling on vhost), but this is still
far from the results seen when dropping packets.

I am going to change the code to use 1/2 ring size to wake the netif
queue.

Shirley


^ permalink raw reply

* Re: [PATCH] include/net/genetlink.h: Allow genlmsg_cancel to accept a NULL argument
From: David Miller @ 2011-02-01 22:54 UTC (permalink / raw)
  To: julia; +Cc: netdev, linux-kernel, paul.moore, kernel-janitors
In-Reply-To: <Pine.LNX.4.64.1101281642080.8546@pc-004.diku.dk>

From: Julia Lawall <julia@diku.dk>
Date: Fri, 28 Jan 2011 16:43:40 +0100 (CET)

> nlmsg_cancel can accept NULL as its second argument, so for similarity,
> this patch extends genlmsg_cancel to be able to accept a NULL second
> argument as well.
> 
> Signed-off-by: Julia Lawall <julia@diku.dk>

I did a scan of all of the cases where this interface is used, and
I cannot find a situation where this capability would even be useful.

The use pattern is always:

	hdr = genlmsg_put(skb, ...);
	if (!hdr)
		goto out;

	NLA_PUT_*();
	NLA_PUT_*();
	....

	return genlmsg_end(skb, hdr);

nla_put_failure:
	genlmsg_cancel(skb, hdr);
out:
	return -EWHATEVER;

Always, hdr will be non-NULL.

We have to allocate the header first, then put the netlink
attributes.

Looking over users of nlmsg_cancel(), the situation seems to
match identically.

Therefore, it seems to me that it makes more sense to remove
the NULL check from nlmsg_cancel() than to add the NULL check
to genlmsg_cancel().

Thanks.

^ permalink raw reply

* AVB support (IEEE802.1 audio/video bridging)
From: Eliot Blennerhassett @ 2011-02-01 22:47 UTC (permalink / raw)
  To: netdev

Greetings,

before I go into any details, please tell me if this is the right place
to enquire/discuss if/how linux network stacks can support the various
protocols required by AVB

To quote the introduction from wikipedia [1]
"Audio Video Bridging (AVB) is a common name for the set of standards in
development by the IEEE 802.1 Audio Video Bridging Task Group. The
charter of this organization is to "provide the specifications that will
allow time-synchronized low latency streaming services through IEEE 802
networks"."

regards

-- 
Eliot Blennerhassett
AudioScience Inc.

[1] http://en.wikipedia.org/wiki/Audio_Video_Bridging

^ permalink raw reply

* Re: [PATCH] isdn: icn: Fix potentially wrong string handling
From: David Miller @ 2011-02-01 22:18 UTC (permalink / raw)
  To: weil; +Cc: isdn, tj, rostedt, netdev, linux-kernel
In-Reply-To: <1296419486-5482-1-git-send-email-weil@mail.berlios.de>

From: Stefan Weil <weil@mail.berlios.de>
Date: Sun, 30 Jan 2011 21:31:26 +0100

> This warning was reported by cppcheck:
> drivers/isdn/icn/icn.c:1641: error: Dangerous usage of 'rev' (strncpy doesn't always 0-terminate it)
> 
> If strncpy copied 20 bytes, the destination string rev was not terminated.
> The patch adds one more byte to rev and makes sure that this byte is
> always 0.
> 
> Cc: Karsten Keil <isdn@linux-pingi.de>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: netdev@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Stefan Weil <weil@mail.berlios.de>

Applied, thanks.

^ permalink raw reply

* Re: [patch 3/7] [PATCH] net,s390: provide architecture specific NET_SKB_PAD
From: David Miller @ 2011-02-01 22:17 UTC (permalink / raw)
  To: frank.blaschka; +Cc: netdev, linux-s390, horsth
In-Reply-To: <20110201081723.562745244@de.ibm.com>

From: frank.blaschka@de.ibm.com
Date: Tue, 01 Feb 2011 09:16:50 +0100

> From: Horst Hartmann <horsth@linux.vnet.ibm.com>
> 
> NET_SKB_PAD has been increased from 32 to 64 and later to max(32, L1_CACHE_BYTES). 
> This led to a 25% throughput decrease for streaming workloads accompanied by a                                                               
> 37% CPU cost increase on s390.
> In order to fix this provide an architecture specific NET_SKB_PAD config symbol.
> 
> Signed-off-by: Horst Hartmann <horsth@linux.vnet.ibm.com>
> Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>

Define it in your arch specific header file like it is designed to
be overridden.

Also, even if this Kconfig thing was the right thing to do, you would
need to put a default for it generically in init/Kconfig or
lib/Kconfig or similar.  As this is where the central documentation
for the knob would be placed.

Lastly, you failed in your commit message to describe why you wanted
to use this whacky Kconfig mechanism to override instead of using
a straight CPP define in the s390 headers.

You're modifying generic code, so you better explain what you're doing
and exactly why.

I'm not applying this series until you fix up this change, resubmit
the entire series when you have this stuff fixed up.

Thanks.


^ permalink raw reply

* Re: [PATCH net-2.6] bnx2x: multicasts in NPAR mode
From: David Miller @ 2011-02-01 22:05 UTC (permalink / raw)
  To: vladz; +Cc: netdev, eilong
In-Reply-To: <201102011257.11610.vladz@broadcom.com>

From: "Vlad Zolotarov" <vladz@broadcom.com>
Date: Tue, 1 Feb 2011 12:57:10 +0200

> The chip was erroneously configured to accept all multicast frames
> in a normal (none-promisc) rx mode both on the RSS and on the FCoE L2 rings
> when in an NPAR mode. This caused packet duplication for every received multicast
> frame in this mode.
> 
> Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
> Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Applied, thanks Vlad.

^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-01 21:56 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Sridhar Samudrala, Steve Dobbelstein, David Miller, kvm, mashirle,
	netdev
In-Reply-To: <1296597185.26937.829.camel@localhost.localdomain>

On Tue, Feb 01, 2011 at 01:53:05PM -0800, Shirley Ma wrote:
> On Tue, 2011-02-01 at 23:42 +0200, Michael S. Tsirkin wrote:
> > On Tue, Feb 01, 2011 at 01:32:35PM -0800, Shirley Ma wrote:
> > > On Tue, 2011-02-01 at 23:24 +0200, Michael S. Tsirkin wrote:
> > > > My theory is that the issue is not signalling.
> > > > Rather, our queue fills up, then host handles
> > > > one packet and sends an interrupt, and we
> > > > immediately wake the queue. So the vq
> > > > once it gets full, stays full.
> > > 
> > > From the printk debugging output, it might not be exactly the case.
> > The
> > > ring gets full, run a bit, then gets full, then run a bit, then
> > full...
> > 
> > Yes, but does it get even half empty in between?
> 
> Sometimes, most of them not half of empty in between. But printk slow
> down the traffics, so it's not accurate. I think your patch will improve
> the performance if it signals guest when half of the ring size is
> empty. 
> 
> But you manage signal by using TX bytes,

There are flags for bytes, buffers and packets.
Try playing with any one of them :)
Just be sure to use v2.


>I would like to change it to
> half of the ring size instead for signaling. Is that OK?
> 
> Shirley
> 
> 

Sure that is why I made it a parameter so you can experiment.

-- 
MST

^ permalink raw reply

* Re: Network performance with small packets
From: Shirley Ma @ 2011-02-01 21:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, Steve Dobbelstein, David Miller, kvm, mashirle,
	netdev
In-Reply-To: <20110201214211.GB31105@redhat.com>

On Tue, 2011-02-01 at 23:42 +0200, Michael S. Tsirkin wrote:
> On Tue, Feb 01, 2011 at 01:32:35PM -0800, Shirley Ma wrote:
> > On Tue, 2011-02-01 at 23:24 +0200, Michael S. Tsirkin wrote:
> > > My theory is that the issue is not signalling.
> > > Rather, our queue fills up, then host handles
> > > one packet and sends an interrupt, and we
> > > immediately wake the queue. So the vq
> > > once it gets full, stays full.
> > 
> > From the printk debugging output, it might not be exactly the case.
> The
> > ring gets full, run a bit, then gets full, then run a bit, then
> full...
> 
> Yes, but does it get even half empty in between?

Sometimes, but most of the time it does not get half empty in between.
But printk slows down the traffic, so it's not accurate. I think your
patch will improve the performance if it signals the guest when half of
the ring is empty.

But you manage signal by using TX bytes, I would like to change it to
half of the ring size instead for signaling. Is that OK?

Shirley




^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-01 21:42 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Sridhar Samudrala, Steve Dobbelstein, David Miller, kvm, mashirle,
	netdev
In-Reply-To: <1296595955.26937.822.camel@localhost.localdomain>

On Tue, Feb 01, 2011 at 01:32:35PM -0800, Shirley Ma wrote:
> On Tue, 2011-02-01 at 23:24 +0200, Michael S. Tsirkin wrote:
> > My theory is that the issue is not signalling.
> > Rather, our queue fills up, then host handles
> > one packet and sends an interrupt, and we
> > immediately wake the queue. So the vq
> > once it gets full, stays full.
> 
> From the printk debugging output, it might not be exactly the case. The
> ring gets full, run a bit, then gets full, then run a bit, then full...

Yes, but does it get even half empty in between?

> > If you try my patch with bufs threshold set to e.g.
> > half the vq, what we will do is send interrupt after we have processed
> > half the vq.  So host has half the vq to go, and guest has half the vq
> > to fill.
> > 
> > See?
> 
> I am cleaning up my set up to run your patch ...
> 
> Shirley
> 

^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-01 21:41 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, steved, netdev, kvm, rusty
In-Reply-To: <1296595725.26937.819.camel@localhost.localdomain>

On Tue, Feb 01, 2011 at 01:28:45PM -0800, Shirley Ma wrote:
> On Tue, 2011-02-01 at 23:21 +0200, Michael S. Tsirkin wrote:
> > Confused. We compare capacity to skb frags, no?
> > That's sg I think ...
> 
> Current guest kernel use indirect buffers, num_free returns how many
> available descriptors not skb frags. So it's wrong here.
> 
> Shirley

I see. Good point. In other words when we complete the buffer
it was indirect, but when we add a new one we
can not allocate indirect so we consume.
And then we start the queue and add will fail.
I guess we need some kind of API to figure out
whether the buf we complete was indirect?

Another failure mode is when skb_xmit_done
wakes the queue: it might be too early, there
might not be space for the next packet in the vq yet.

A solution might be to keep some kind of pool
around for indirect, we wanted to do it for block anyway ...

-- 
MST

^ permalink raw reply

* Re: Network performance with small packets
From: Shirley Ma @ 2011-02-01 21:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, Steve Dobbelstein, David Miller, kvm, mashirle,
	netdev
In-Reply-To: <20110201212411.GD30770@redhat.com>

On Tue, 2011-02-01 at 23:24 +0200, Michael S. Tsirkin wrote:
> My theory is that the issue is not signalling.
> Rather, our queue fills up, then host handles
> one packet and sends an interrupt, and we
> immediately wake the queue. So the vq
> once it gets full, stays full.

From the printk debugging output, it might not be exactly the case. The
ring gets full, run a bit, then gets full, then run a bit, then full...

> If you try my patch with bufs threshold set to e.g.
> half the vq, what we will do is send interrupt after we have processed
> half the vq.  So host has half the vq to go, and guest has half the vq
> to fill.
> 
> See?

I am cleaning up my set up to run your patch ...

Shirley



^ permalink raw reply

* Re: Network performance with small packets
From: Shirley Ma @ 2011-02-01 21:28 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, steved, netdev, kvm
In-Reply-To: <20110201212110.GC30770@redhat.com>

On Tue, 2011-02-01 at 23:21 +0200, Michael S. Tsirkin wrote:
> Confused. We compare capacity to skb frags, no?
> That's sg I think ...

Current guest kernel use indirect buffers, num_free returns how many
available descriptors not skb frags. So it's wrong here.

Shirley


^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-01 21:24 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Sridhar Samudrala, Steve Dobbelstein, David Miller, kvm, mashirle,
	netdev
In-Reply-To: <1296594585.26937.817.camel@localhost.localdomain>

On Tue, Feb 01, 2011 at 01:09:45PM -0800, Shirley Ma wrote:
> On Mon, 2011-01-31 at 17:30 -0800, Sridhar Samudrala wrote:
> > Yes. It definitely should be 'out'. 'in' should be 0 in the tx path.
> > 
> > I tried a simpler version of this patch without any tunables by
> > delaying the signaling until we come out of the for loop.
> > It definitely reduced the number of vmexits significantly for small
> > message
> > guest to host stream test and the throughput went up a little.
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 9b3ca10..5f9fae9 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -197,7 +197,7 @@ static void handle_tx(struct vhost_net *net)
> >                 if (err != len)
> >                         pr_debug("Truncated TX packet: "
> >                                  " len %d != %zd\n", err, len);
> > -               vhost_add_used_and_signal(&net->dev, vq, head, 0);
> > +               vhost_add_used(vq, head, 0);
> >                 total_len += len;
> >                 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
> >                         vhost_poll_queue(&vq->poll);
> > @@ -205,6 +205,8 @@ static void handle_tx(struct vhost_net *net)
> >                 }
> >         }
> > 
> > +       if (total_len > 0)
> > +               vhost_signal(&net->dev, vq);
> >         mutex_unlock(&vq->mutex);
> >  }
> 
> Reducing the signaling will reduce the CPU utilization by reducing VM
> exits. 
> 
> The small message BW is a problem we have seen faster guest/slow vhost,
> even I increased VHOST_NET_WEIGHT times, it didn't help that much for
> BW. For large message size, vhost is able to process all packets on
> time. I played around with guest/host codes, I only see huge BW
> improvement by dropping packets on guest side so far.
> 
> Thanks
> Shirley


My theory is that the issue is not signalling.
Rather, our queue fills up, then host handles
one packet and sends an interrupt, and we
immediately wake the queue. So the vq
once it gets full, stays full.

If you try my patch with bufs threshold set to e.g.
half the vq, what we will do is send interrupt after we have processed
half the vq.  So host has half the vq to go, and guest has half the vq
to fill.

See?

-- 
MST

^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-01 21:21 UTC (permalink / raw)
  To: Shirley Ma; +Cc: David Miller, steved, netdev, kvm
In-Reply-To: <1296591908.26937.809.camel@localhost.localdomain>

On Tue, Feb 01, 2011 at 12:25:08PM -0800, Shirley Ma wrote:
> On Tue, 2011-02-01 at 22:17 +0200, Michael S. Tsirkin wrote:
> > On Tue, Feb 01, 2011 at 12:09:03PM -0800, Shirley Ma wrote:
> > > On Tue, 2011-02-01 at 19:23 +0200, Michael S. Tsirkin wrote:
> > > > On Thu, Jan 27, 2011 at 01:30:38PM -0800, Shirley Ma wrote:
> > > > > On Thu, 2011-01-27 at 13:02 -0800, David Miller wrote:
> > > > > > > > Interesting. Could this be a variant of the now famous
> > > > > > bufferbloat then?
> > > > > > 
> > > > > > Sigh, bufferbloat is the new global warming... :-/ 
> > > > > 
> > > > > Yep, some places become colder, some other places become warmer;
> > > > Same as
> > > > > BW results, sometimes faster, sometimes slower. :)
> > > > > 
> > > > > Shirley
> > > > 
> > > > Sent a tuning patch (v2) that might help.
> > > > Could you try it and play with the module parameters please? 
> > > 
> > > Hello Michael,
> > > 
> > > Sure I will play with this patch to see how it could help. 
> > > 
> > > I am looking at guest side as well, I found a couple issues on guest
> > > side:
> > > 
> > > 1. free_old_xmit_skbs() should return the number of skbs instead of
> > the
> > > total of sgs since we are using ring size to stop/start netif queue.
> > > static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
> > > {
> > >         struct sk_buff *skb;
> > >         unsigned int len, tot_sgs = 0;
> > > 
> > >         while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
> > >                 pr_debug("Sent skb %p\n", skb);
> > >                 vi->dev->stats.tx_bytes += skb->len;
> > >                 vi->dev->stats.tx_packets++;
> > >                 tot_sgs += skb_vnet_hdr(skb)->num_sg;
> > >                 dev_kfree_skb_any(skb);
> > >         }
> > >         return tot_sgs; <---- should return numbers of skbs to track
> > > ring usage here, I think;
> > > }
> > > 
> > > Did the old guest use number of buffers to track ring usage before?
> > > 
> > > 2. In start_xmit, I think we should move capacity +=
> > free_old_xmit_skbs
> > > before netif_stop_queue(); so we avoid unnecessary netif queue
> > > stop/start. This condition is heavily hit for small message size.
> > > 
> > > > Also the capacity checking condition should change to something like
> > half
> > > of the vring.num size, instead of comparing 2+MAX_SKB_FRAGS?
> > > 
> > >        if (capacity < 2+MAX_SKB_FRAGS) {
> > >                 netif_stop_queue(dev);
> > >                 if (unlikely(!virtqueue_enable_cb(vi->svq))) {
> > >                         /* More just got used, free them then
> > recheck.
> > > */
> > >                         capacity += free_old_xmit_skbs(vi);
> > >                         if (capacity >= 2+MAX_SKB_FRAGS) {
> > >                                 netif_start_queue(dev);
> > >                                 virtqueue_disable_cb(vi->svq);
> > >                         }
> > >                 }
> > >         }
> > > 
> > > 3. Looks like the xmit callback is only used to wake the queue when
> > the
> > > queue has stopped, right? Should we put a condition check here?
> > > static void skb_xmit_done(struct virtqueue *svq)
> > > {
> > >         struct virtnet_info *vi = svq->vdev->priv;
> > > 
> > >         /* Suppress further interrupts. */
> > >         virtqueue_disable_cb(svq);
> > > 
> > >         /* We were probably waiting for more output buffers. */
> > > --->   if (netif_queue_stopped(vi->dev))
> > >         netif_wake_queue(vi->dev);
> > > }
> > > 
> > > 
> > > Shirley
> > 
> > Well the return value is used to calculate capacity and that counts
> > the # of s/g. No?
> 
> Nope, the current guest kernel uses descriptors not number of sgs.

Confused. We compare capacity to skb frags, no?
That's sg I think ...

> not sure the old guest.
> 
> > From cache utilization POV it might be better to read from the skb and
> > not peek at virtio header though...
> > Pls Cc the lists on any discussions in the future.
> > 
> > -- 
> > MST
> 
> Sorry I missed reply all. :(
> 
> Shirley

^ permalink raw reply

* Re: [PATCH] depca: Fix warnings
From: David Miller @ 2011-02-01 21:19 UTC (permalink / raw)
  To: alan; +Cc: netdev
In-Reply-To: <20110201113214.24608.34659.stgit@bob.linux.org.uk>

From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 01 Feb 2011 11:32:29 +0000

> From: Alan Cox <alan@linux.intel.com>
> 
> Replace the rather weird use of ++ with + 1 as the value is being assigned
> 
> Signed-off-by: Alan Cox <alan@linux.intel.com>

Applied, thanks Alan.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox