netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [6/6]: jenkins hash for neigh
@ 2004-09-24  5:51 David S. Miller
  2004-09-24  8:52 ` Harald Welte
  0 siblings, 1 reply; 43+ messages in thread
From: David S. Miller @ 2004-09-24  5:51 UTC (permalink / raw)
  To: laforge; +Cc: netdev


This makes all the neigh implementations use jenkins.

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/23 18:03:13-07:00 davem@nuts.davemloft.net 
#   [NET]: Convert neigh hashing over to jenkins.
#   
#   Based upon work by Harald Welte <laforge@netfilter.org>
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
# net/ipv6/ndisc.c
#   2004/09/23 18:02:32-07:00 davem@nuts.davemloft.net +7 -7
#   [NET]: Convert neigh hashing over to jenkins.
# 
# net/ipv4/arp.c
#   2004/09/23 18:02:32-07:00 davem@nuts.davemloft.net +3 -9
#   [NET]: Convert neigh hashing over to jenkins.
# 
# net/decnet/dn_neigh.c
#   2004/09/23 18:02:32-07:00 davem@nuts.davemloft.net +2 -7
#   [NET]: Convert neigh hashing over to jenkins.
# 
# net/core/neighbour.c
#   2004/09/23 18:02:32-07:00 davem@nuts.davemloft.net +3 -0
#   [NET]: Convert neigh hashing over to jenkins.
# 
# net/atm/clip.c
#   2004/09/23 18:02:32-07:00 davem@nuts.davemloft.net +2 -9
#   [NET]: Convert neigh hashing over to jenkins.
# 
# include/net/neighbour.h
#   2004/09/23 18:02:32-07:00 davem@nuts.davemloft.net +1 -0
#   [NET]: Convert neigh hashing over to jenkins.
# 
diff -Nru a/include/net/neighbour.h b/include/net/neighbour.h
--- a/include/net/neighbour.h	2004-09-23 22:27:14 -07:00
+++ b/include/net/neighbour.h	2004-09-23 22:27:14 -07:00
@@ -175,6 +175,7 @@
 	struct neigh_statistics	stats;
 	struct neighbour	**hash_buckets;
 	unsigned int		hash_mask;
+	__u32			hash_rnd;
 	struct pneigh_entry	**phash_buckets;
 };
 
diff -Nru a/net/atm/clip.c b/net/atm/clip.c
--- a/net/atm/clip.c	2004-09-23 22:27:14 -07:00
+++ b/net/atm/clip.c	2004-09-23 22:27:14 -07:00
@@ -27,6 +27,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/rcupdate.h>
+#include <linux/jhash.h>
 #include <net/route.h> /* for struct rtable and routing */
 #include <net/icmp.h> /* icmp_send */
 #include <asm/param.h> /* for HZ */
@@ -328,15 +329,7 @@
 
 static u32 clip_hash(const void *pkey, const struct net_device *dev)
 {
-	u32 hash_val;
-
-	hash_val = *(u32*)pkey;
-	hash_val ^= (hash_val>>16);
-	hash_val ^= hash_val>>8;
-	hash_val ^= hash_val>>3;
-	hash_val = (hash_val^dev->ifindex);
-
-	return hash_val;
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, clip_tbl.hash_rnd);
 }
 
 static struct neigh_table clip_tbl = {
diff -Nru a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c	2004-09-23 22:27:14 -07:00
+++ b/net/core/neighbour.c	2004-09-23 22:27:14 -07:00
@@ -29,6 +29,7 @@
 #include <net/dst.h>
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
+#include <linux/random.h>
 
 #define NEIGH_DEBUG 1
 
@@ -1316,6 +1317,8 @@
 		panic("cannot allocate neighbour cache hashes");
 
 	memset(tbl->phash_buckets, 0, phsize);
+
+	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
 
 	tbl->lock	       = RW_LOCK_UNLOCKED;
 	init_timer(&tbl->gc_timer);
diff -Nru a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
--- a/net/decnet/dn_neigh.c	2004-09-23 22:27:14 -07:00
+++ b/net/decnet/dn_neigh.c	2004-09-23 22:27:14 -07:00
@@ -36,6 +36,7 @@
 #include <linux/spinlock.h>
 #include <linux/seq_file.h>
 #include <linux/rcupdate.h>
+#include <linux/jhash.h>
 #include <asm/atomic.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
@@ -122,13 +123,7 @@
 
 static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev)
 {
-	u32 hash_val;
-
-	hash_val = *(dn_address *)pkey;
-	hash_val ^= (hash_val >> 10);
-	hash_val ^= (hash_val >> 3);
-
-	return hash_val;
+	return jhash_2words(*(dn_address *)pkey, 0, dn_neigh_table.hash_rnd);
 }
 
 static int dn_neigh_construct(struct neighbour *neigh)
diff -Nru a/net/ipv4/arp.c b/net/ipv4/arp.c
--- a/net/ipv4/arp.c	2004-09-23 22:27:14 -07:00
+++ b/net/ipv4/arp.c	2004-09-23 22:27:14 -07:00
@@ -71,6 +71,7 @@
  *					arp_xmit so intermediate drivers like
  *					bonding can change the skb before
  *					sending (e.g. insert 8021q tag).
+ *		Harald Welte	:	convert to make use of jenkins hash
  */
 
 #include <linux/module.h>
@@ -97,6 +98,7 @@
 #include <linux/init.h>
 #include <linux/net.h>
 #include <linux/rcupdate.h>
+#include <linux/jhash.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -223,15 +225,7 @@
 
 static u32 arp_hash(const void *pkey, const struct net_device *dev)
 {
-	u32 hash_val;
-
-	hash_val = *(u32*)pkey;
-	hash_val ^= (hash_val>>16);
-	hash_val ^= hash_val>>8;
-	hash_val ^= hash_val>>3;
-	hash_val = (hash_val^dev->ifindex);
-
-	return hash_val;
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
 }
 
 static int arp_constructor(struct neighbour *neigh)
diff -Nru a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
--- a/net/ipv6/ndisc.c	2004-09-23 22:27:14 -07:00
+++ b/net/ipv6/ndisc.c	2004-09-23 22:27:14 -07:00
@@ -66,6 +66,7 @@
 #include <linux/if_arp.h>
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
+#include <linux/jhash.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -270,15 +271,14 @@
 
 static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
 {
-	u32 hash_val;
+	const u32 *p32 = pkey;
+	u32 addr_hash, i;
 
-	hash_val = *(u32*)(pkey + sizeof(struct in6_addr) - 4);
-	hash_val ^= (hash_val>>16);
-	hash_val ^= hash_val>>8;
-	hash_val ^= hash_val>>3;
-	hash_val = (hash_val^dev->ifindex);
+	addr_hash = 0;
+	for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++)
+		addr_hash ^= *p32++;
 
-	return hash_val;
+	return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd);
 }
 
 static int ndisc_constructor(struct neighbour *neigh)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-24  5:51 [6/6]: jenkins hash for neigh David S. Miller
@ 2004-09-24  8:52 ` Harald Welte
  2004-09-24 21:27   ` David S. Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Harald Welte @ 2004-09-24  8:52 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 1727 bytes --]

Hi Dave, I'm just reviewing your patches right now...

On Thu, Sep 23, 2004 at 10:51:58PM -0700, David S. Miller wrote:
> 
> This makes all the neigh implementations use jenkins.

> --- a/net/core/neighbour.c	2004-09-23 22:27:14 -07:00
> +++ b/net/core/neighbour.c	2004-09-23 22:27:14 -07:00
> @@ -1316,6 +1317,8 @@
>  		panic("cannot allocate neighbour cache hashes");
>  
>  	memset(tbl->phash_buckets, 0, phsize);
> +
> +	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
>  
>  	tbl->lock	       = RW_LOCK_UNLOCKED;
>  	init_timer(&tbl->gc_timer);

So this means you put get_random_bytes into the __init function.  I
explicitly didn't want to do that (and added that _initted variable),
since at bootup time we might not have sufficient entropy yet.
This is before userspace has had time to reload the random seed saved
before shutdown, so especially on automatic-booting embedded devices
without any user interaction and never-changing flash layout (and thus
similar interrupt patterns) I think this is quite weak.

If we defer get_random_bytes() until the first neighbour is created,
this gives the system some more time to gather entropy...  

That's the same reasoning we have for making the conntrack hash work
this way.

Also, wouldn't it make sense to use a new random value if we grow the
hash table?  I mean it's cheap, and we make it harder for someone trying
a hash-based attack.  

Anyway, maybe I'm just being too paranoid.

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-24  8:52 ` Harald Welte
@ 2004-09-24 21:27   ` David S. Miller
  2004-09-25  6:44     ` Harald Welte
  0 siblings, 1 reply; 43+ messages in thread
From: David S. Miller @ 2004-09-24 21:27 UTC (permalink / raw)
  To: Harald Welte; +Cc: netdev

On Fri, 24 Sep 2004 10:52:34 +0200
Harald Welte <laforge@gnumonks.org> wrote:

> So this means you put get_random_bytes into the __init function.  I
> explicitly didn't want to do that (and added that _initted variable),
> since at bootup time we might not have sufficient entropy yet.

Good point.

> If we defer get_random_bytes() until the first neighbour is created,
> this gives the system some more time to gather entropy...  
 ...
> Also, wouldn't it make sense to use a new random value if we grow the
> hash table?  I mean it's cheap, and we make it harder for someone trying
> a hash-based attack.  

I've combined all of your thoughts into the patch below.
We set the initial ARP table very small (2 chains), and we
regenerate the random seed every time we grow the hash table.

This should address all of your concerns.

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/24 14:06:45-07:00 davem@nuts.davemloft.net 
#   [NET]: Neighbour hashing tweaks.
#   
#   1) Start with a smaller initial hash table size.
#      This stresses the new code better.
#   2) Generate a new hash_rnd every time we grow
#      the hashes.
#   
#   Based upon commentary from Harald Welte.
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
# net/core/neighbour.c
#   2004/09/24 14:06:02-07:00 davem@nuts.davemloft.net +2 -1
#   [NET]: Neighbour hashing tweaks.
#   
#   1) Start with a smaller initial hash table size.
#      This stresses the new code better.
#   2) Generate a new hash_rnd every time we grow
#      the hashes.
#   
#   Based upon commentary from Harald Welte.
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
diff -Nru a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c	2004-09-24 14:07:07 -07:00
+++ b/net/core/neighbour.c	2004-09-24 14:07:07 -07:00
@@ -331,6 +331,7 @@
 	old_hash = tbl->hash_buckets;
 
 	write_lock_bh(&tbl->lock);
+	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
 	for (i = 0; i < old_entries; i++) {
 		struct neighbour *n, *next;
 
@@ -1306,7 +1307,7 @@
 	if (!tbl->kmem_cachep)
 		panic("cannot create neighbour cache");
 
-	tbl->hash_mask = 0x1f;
+	tbl->hash_mask = 1;
 	tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
 
 	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-24 21:27   ` David S. Miller
@ 2004-09-25  6:44     ` Harald Welte
  2004-09-25  7:56       ` David S. Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Harald Welte @ 2004-09-25  6:44 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 886 bytes --]

On Fri, Sep 24, 2004 at 02:27:02PM -0700, David S. Miller wrote:
> I've combined all of your thoughts into the patch below.
> We set the initial ARP table very small (2 chains), and we
> regenerate the random seed every time we grow the hash table.

Thanks, Dave.

> This should address all of your concerns.

Indeed, it does. This basically means I am very happy with your overall
patchset.  I'll inclue it in my next round of kernel builds and give it
some testing.  Do you still want to push this for 2.6.9?

I think I'll defer the 2.4.x backport until we've sorted out the one
remaining 'starvation due to incomplete' issue.

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  6:44     ` Harald Welte
@ 2004-09-25  7:56       ` David S. Miller
  2004-09-25  8:14         ` YOSHIFUJI Hideaki / 吉藤英明
                           ` (5 more replies)
  0 siblings, 6 replies; 43+ messages in thread
From: David S. Miller @ 2004-09-25  7:56 UTC (permalink / raw)
  To: Harald Welte; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 1566 bytes --]

On Sat, 25 Sep 2004 08:44:06 +0200
Harald Welte <laforge@gnumonks.org> wrote:

> I'll inclue it in my next round of kernel builds and give it
> some testing.

Thanks.

> Do you still want to push this for 2.6.9?
> 
> I think I'll defer the 2.4.x backport until we've sorted out the one
> remaining 'starvation due to incomplete' issue.

Yes, and I'll defer the push to 2.6.9 until we conquer that too.

So let's start! :-)

Attached are 4 patches, the first 3 I'm happy with the last
is a major RFC.  Here's the breakdown:

1) Remove unnecessary next = NULL assignment noticed earlier today.

2) Tim Gardner's change to smooth out periodic neighbour GC.

3) Fix locking and GFP_* bugs.

4) The controversial/RFC patch, dorking with neigh_forced_gc()

So let's discuss #4.  It is the first idea I had to combat the
"problem", but honestly right now I am beginning to think that
the real solution is to simply remove the INCOMPLETE checks
altogether.

Neighbours are a sub-cache of the routing cache.  Therefore when
a neigh entry has a singular refcount, no routing cache entry
points to it.  No routing cache entry, we're not sending packets
to that neighbour any time soon, so there is no reason (especially
during strong pressure) to hold onto such entries.

Agree or disagree?  Regardless, I'd be interested how effective
your stress case is with patch #4 and my new suggestion which
is just to remove the:

			    (n->nud_state != NUD_INCOMPLETE ||
			     time_after(jiffies, n->used + n->parms->retrans_time)

from the neigh_forced_gc() code.

Have at it :-)


[-- Attachment #2: diff1 --]
[-- Type: application/octet-stream, Size: 753 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/24 13:09:50-07:00 kumarkr@us.ibm.com 
#   [NET]: Remove unnecessary local var initialization.
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
# net/core/neighbour.c
#   2004/09/24 13:09:00-07:00 kumarkr@us.ibm.com +0 -1
#   [NET]: Remove unnecessary local var initialization.
# 
diff -Nru a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c	2004-09-25 00:30:13 -07:00
+++ b/net/core/neighbour.c	2004-09-25 00:30:13 -07:00
@@ -334,7 +334,6 @@
 	for (i = 0; i < old_entries; i++) {
 		struct neighbour *n, *next;
 
-		next = NULL;
 		for (n = old_hash[i]; n; n = next) {
 			unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
 

[-- Attachment #3: diff2 --]
[-- Type: application/octet-stream, Size: 3958 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/24 16:05:14-07:00 davem@nuts.davemloft.net 
#   [NET]: Smooth out periodic neighbour GC.
#   
#   Based almost entirely upon work by Tim Gardner
#   (timg@tpi.com) and Harald Welte (laforge@gnumonks.org)
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
# net/core/neighbour.c
#   2004/09/24 16:04:44-07:00 davem@nuts.davemloft.net +38 -32
#   [NET]: Smooth out periodic neighbour GC.
#   
#   Based almost entirely upon work by Tim Gardner
#   (timg@tpi.com) and Harald Welte (laforge@gnumonks.org)
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
# include/net/neighbour.h
#   2004/09/24 16:04:44-07:00 davem@nuts.davemloft.net +1 -0
#   [NET]: Smooth out periodic neighbour GC.
#   
#   Based almost entirely upon work by Tim Gardner
#   (timg@tpi.com) and Harald Welte (laforge@gnumonks.org)
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
diff -Nru a/include/net/neighbour.h b/include/net/neighbour.h
--- a/include/net/neighbour.h	2004-09-25 00:30:27 -07:00
+++ b/include/net/neighbour.h	2004-09-25 00:30:27 -07:00
@@ -176,6 +176,7 @@
 	struct neighbour	**hash_buckets;
 	unsigned int		hash_mask;
 	__u32			hash_rnd;
+	unsigned int		hash_chain_gc;
 	struct pneigh_entry	**phash_buckets;
 };
 
diff -Nru a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c	2004-09-25 00:30:27 -07:00
+++ b/net/core/neighbour.c	2004-09-25 00:30:27 -07:00
@@ -631,9 +631,8 @@
 static void neigh_periodic_timer(unsigned long arg)
 {
 	struct neigh_table *tbl = (struct neigh_table *)arg;
-	unsigned long now = jiffies;
-	int i;
-
+	struct neighbour *n, **np;
+	unsigned long expire, now = jiffies;
 
 	write_lock(&tbl->lock);
 
@@ -649,41 +648,49 @@
 				neigh_rand_reach_time(p->base_reachable_time);
 	}
 
-	for (i = 0; i <= tbl->hash_mask; i++) {
-		struct neighbour *n, **np;
+	np = &tbl->hash_buckets[tbl->hash_chain_gc];
+	tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
 
-		np = &tbl->hash_buckets[i];
-		while ((n = *np) != NULL) {
-			unsigned state;
-
-			write_lock(&n->lock);
-
-			state = n->nud_state;
-			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
-				write_unlock(&n->lock);
-				goto next_elt;
-			}
+	while ((n = *np) != NULL) {
+		unsigned int state;
 
-			if (time_before(n->used, n->confirmed))
-				n->used = n->confirmed;
+		write_lock(&n->lock);
 
-			if (atomic_read(&n->refcnt) == 1 &&
-			    (state == NUD_FAILED ||
-			     time_after(now, n->used + n->parms->gc_staletime))) {
-				*np = n->next;
-				n->dead = 1;
-				write_unlock(&n->lock);
-				neigh_release(n);
-				continue;
-			}
+		state = n->nud_state;
+		if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
 			write_unlock(&n->lock);
+			goto next_elt;
+		}
 
-next_elt:
-			np = &n->next;
+		if (time_before(n->used, n->confirmed))
+			n->used = n->confirmed;
+
+		if (atomic_read(&n->refcnt) == 1 &&
+		    (state == NUD_FAILED ||
+		     time_after(now, n->used + n->parms->gc_staletime))) {
+			*np = n->next;
+			n->dead = 1;
+			write_unlock(&n->lock);
+			neigh_release(n);
+			continue;
 		}
+		write_unlock(&n->lock);
+
+next_elt:
+		np = &n->next;
 	}
 
-	mod_timer(&tbl->gc_timer, now + tbl->gc_interval);
+ 	/* Cycle through all hash buckets every base_reachable_time/2 ticks.
+ 	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
+ 	 * base_reachable_time.
+	 */
+	expire = tbl->parms.base_reachable_time >> 1;
+	expire /= (tbl->hash_mask + 1);
+	if (!expire)
+		expire = 1;
+
+ 	mod_timer(&tbl->gc_timer, now + expire);
+
 	write_unlock(&tbl->lock);
 }
 
@@ -1324,8 +1331,7 @@
 	init_timer(&tbl->gc_timer);
 	tbl->gc_timer.data     = (unsigned long)tbl;
 	tbl->gc_timer.function = neigh_periodic_timer;
-	tbl->gc_timer.expires  = now + tbl->gc_interval +
-				 tbl->parms.reachable_time;
+	tbl->gc_timer.expires  = now + 1;
 	add_timer(&tbl->gc_timer);
 
 	init_timer(&tbl->proxy_timer);

[-- Attachment #4: diff3 --]
[-- Type: application/octet-stream, Size: 3022 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/24 16:22:14-07:00 davem@nuts.davemloft.net 
#   [NET]: Fix some neigh table locking errors.
#   
#   Also, make neigh_hash_alloc() use GFP_ATOMIC.
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
# net/core/neighbour.c
#   2004/09/24 16:21:49-07:00 davem@nuts.davemloft.net +14 -9
#   [NET]: Fix some neigh table locking errors.
#   
#   Also, make neigh_hash_alloc() use GFP_ATOMIC.
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
diff -Nru a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c	2004-09-25 00:30:40 -07:00
+++ b/net/core/neighbour.c	2004-09-25 00:30:40 -07:00
@@ -116,11 +116,11 @@
 	int shrunk = 0;
 	int i;
 
+	write_lock_bh(&tbl->lock);
 	for (i = 0; i <= tbl->hash_mask; i++) {
 		struct neighbour *n, **np;
 
 		np = &tbl->hash_buckets[i];
-		write_lock_bh(&tbl->lock);
 		while ((n = *np) != NULL) {
 			/* Neighbour record may be discarded if:
 			   - nobody refers to it.
@@ -147,10 +147,12 @@
 			write_unlock(&n->lock);
 			np = &n->next;
 		}
-		write_unlock_bh(&tbl->lock);
 	}
 
 	tbl->last_flush = jiffies;
+
+	write_unlock_bh(&tbl->lock);
+
 	return shrunk;
 }
 
@@ -295,10 +297,10 @@
 	struct neighbour **ret;
 
 	if (size <= PAGE_SIZE) {
-		ret = kmalloc(size, GFP_KERNEL);
+		ret = kmalloc(size, GFP_ATOMIC);
 	} else {
 		ret = (struct neighbour **)
-			__get_free_pages(GFP_KERNEL, get_order(size));
+			__get_free_pages(GFP_ATOMIC, get_order(size));
 	}
 	if (ret)
 		memset(ret, 0, size);
@@ -330,7 +332,6 @@
 	new_hash_mask = new_entries - 1;
 	old_hash = tbl->hash_buckets;
 
-	write_lock_bh(&tbl->lock);
 	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
 	for (i = 0; i < old_entries; i++) {
 		struct neighbour *n, *next;
@@ -347,7 +348,6 @@
 	}
 	tbl->hash_buckets = new_hash;
 	tbl->hash_mask = new_hash_mask;
-	write_unlock_bh(&tbl->lock);
 
 	neigh_hash_free(old_hash, old_entries);
 }
@@ -400,8 +400,11 @@
 		goto out;
 	}
 
-	if (tbl->entries > (tbl->hash_mask + 1))
+	if (tbl->entries > (tbl->hash_mask + 1)) {
+		write_lock_bh(&tbl->lock);
 		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
+		write_unlock_bh(&tbl->lock);
+	}
 
 	memcpy(n->primary_key, pkey, key_len);
 	n->dev = dev;
@@ -422,9 +425,10 @@
 
 	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
 
+	write_lock_bh(&tbl->lock);
+
 	hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
 
-	write_lock_bh(&tbl->lock);
 	if (n->parms->dead) {
 		rc = ERR_PTR(-EINVAL);
 		goto out_tbl_unlock;
@@ -514,10 +518,10 @@
 	hash_val ^= hash_val >> 4;
 	hash_val &= PNEIGH_HASHMASK;
 
+	write_lock_bh(&tbl->lock);
 	for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
 	     np = &n->next) {
 		if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
-			write_lock_bh(&tbl->lock);
 			*np = n->next;
 			write_unlock_bh(&tbl->lock);
 			if (tbl->pdestructor)
@@ -526,6 +530,7 @@
 			return 0;
 		}
 	}
+	write_unlock_bh(&tbl->lock);
 	return -ENOENT;
 }
 

[-- Attachment #5: diff4 --]
[-- Type: application/octet-stream, Size: 2379 bytes --]

===== net/core/neighbour.c 1.47 vs edited =====
--- 1.47/net/core/neighbour.c	2004-09-24 16:21:49 -07:00
+++ edited/net/core/neighbour.c	2004-09-24 16:38:58 -07:00
@@ -110,13 +110,13 @@
 	return (net_random() % base) + (base >> 1);
 }
 
-
-static int neigh_forced_gc(struct neigh_table *tbl)
+static int neigh_forced_gc(struct neigh_table *tbl, int goal)
 {
-	int shrunk = 0;
+	int shrunk = 0, num_incomplete = 0, reap_incomplete = 0;
 	int i;
 
 	write_lock_bh(&tbl->lock);
+rescan:
 	for (i = 0; i <= tbl->hash_mask; i++) {
 		struct neighbour *n, **np;
 
@@ -125,30 +125,46 @@
 			/* Neighbour record may be discarded if:
 			   - nobody refers to it.
 			   - it is not permanent
-			   - (NEW and probably wrong)
+			   - On the first pass of this scan,
 			     INCOMPLETE entries are kept at least for
 			     n->parms->retrans_time, otherwise we could
 			     flood network with resolution requests.
-			     It is not clear, what is better table overflow
-			     or flooding.
 			 */
 			write_lock(&n->lock);
-			if (atomic_read(&n->refcnt) == 1 &&
-			    !(n->nud_state & NUD_PERMANENT) &&
-			    (n->nud_state != NUD_INCOMPLETE ||
-			     time_after(jiffies, n->used + n->parms->retrans_time))) {
-				*np	= n->next;
-				n->dead = 1;
-				shrunk	= 1;
-				write_unlock(&n->lock);
-				neigh_release(n);
-				continue;
+			if (atomic_read(&n->refcnt) != 1)
+				goto next_ent;
+
+			if (n->nud_state & NUD_PERMANENT)
+				goto next_ent;
+
+			if (n->nud_state -= NUD_INCOMPLETE &&
+			    reap_incomplete == 0 &&
+			    time_after(jiffies,
+				       n->used + n->parms->retrans_time)) {
+				num_incomplete++;
+				goto next_ent;
 			}
+
+			*np	= n->next;
+			n->dead = 1;
+			shrunk++;
+			write_unlock(&n->lock);
+			neigh_release(n);
+			continue;
+
+		next_ent:
 			write_unlock(&n->lock);
 			np = &n->next;
 		}
 	}
 
+	if (reap_incomplete == 0 &&
+	    shrunk < goal &&
+	    (shrunk + num_incomplete) >= goal) {
+		reap_incomplete = 1;
+		goto rescan;
+	}
+
 	tbl->last_flush = jiffies;
 
 	write_unlock_bh(&tbl->lock);
@@ -261,7 +277,9 @@
 	if (tbl->entries > tbl->gc_thresh3 ||
 	    (tbl->entries > tbl->gc_thresh2 &&
 	     time_after(now, tbl->last_flush + 5 * HZ))) {
-		if (!neigh_forced_gc(tbl) &&
+		int goal = tbl->entries - tbl->gc_thresh3;
+
+		if (!neigh_forced_gc(tbl, goal) &&
 		    tbl->entries > tbl->gc_thresh3)
 			goto out;
 	}

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  7:56       ` David S. Miller
@ 2004-09-25  8:14         ` YOSHIFUJI Hideaki / 吉藤英明
  2004-09-25  8:27           ` YOSHIFUJI Hideaki / 吉藤英明
  2004-09-25  9:09         ` Harald Welte
                           ` (4 subsequent siblings)
  5 siblings, 1 reply; 43+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-09-25  8:14 UTC (permalink / raw)
  To: davem; +Cc: laforge, netdev, yoshfuji

In article <20040925005623.2faf8faf.davem@davemloft.net> (at Sat, 25 Sep 2004 00:56:23 -0700), "David S. Miller" <davem@davemloft.net> says:


> +			if (n->nud_state -= NUD_INCOMPLETE &&

I guess this is typo of:
			if (n->nud_state == NUD_INCOMPLETE &&

Am I wrong?

-- 
Hideaki YOSHIFUJI @ USAGI Project <yoshfuji@linux-ipv6.org>
GPG FP: 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  8:14         ` YOSHIFUJI Hideaki / 吉藤英明
@ 2004-09-25  8:27           ` YOSHIFUJI Hideaki / 吉藤英明
  2004-09-25  8:30             ` David S. Miller
  0 siblings, 1 reply; 43+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-09-25  8:27 UTC (permalink / raw)
  To: davem; +Cc: laforge, netdev, yoshfuji

In article <20040925.171412.75484376.yoshfuji@linux-ipv6.org> (at Sat, 25 Sep 2004 17:14:12 +0900 (JST)), YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org> says:

> In article <20040925005623.2faf8faf.davem@davemloft.net> (at Sat, 25 Sep 2004 00:56:23 -0700), "David S. Miller" <davem@davemloft.net> says:
> 
> 
> > +			if (n->nud_state -= NUD_INCOMPLETE &&
> 
> I guess this is typo of:
> 			if (n->nud_state == NUD_INCOMPLETE &&
> 
> Am I wrong?

Sorry, I forgot to state the source; this lives in diff4.

--yoshfuji

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  8:27           ` YOSHIFUJI Hideaki / 吉藤英明
@ 2004-09-25  8:30             ` David S. Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David S. Miller @ 2004-09-25  8:30 UTC (permalink / raw)
  To: yoshfuji; +Cc: laforge, netdev

[-- Attachment #1: Type: text/plain, Size: 729 bytes --]

On Sat, 25 Sep 2004 17:27:12 +0900 (JST)
YOSHIFUJI Hideaki / ^[$B5HF#1QL@^[(B <yoshfuji@linux-ipv6.org> wrote:

> In article <20040925.171412.75484376.yoshfuji@linux-ipv6.org> (at Sat, 25 Sep 2004 17:14:12 +0900 (JST)), YOSHIFUJI Hideaki / ^[$B5HF#1QL@^[(B <yoshfuji@linux-ipv6.org> says:
> 
> > In article <20040925005623.2faf8faf.davem@davemloft.net> (at Sat, 25 Sep 2004 00:56:23 -0700), "David S. Miller" <davem@davemloft.net> says:
> > 
> > 
> > > +			if (n->nud_state -= NUD_INCOMPLETE &&
> > 
> > I guess this is typo of:
> > 			if (n->nud_state == NUD_INCOMPLETE &&
> > 
> > Am I wrong?
> 
> Sorry, I forgot to state the source; this lives in diff4.

Good catch, you are correct.

Fixed version attached.

[-- Attachment #2: diff4 --]
[-- Type: application/octet-stream, Size: 2379 bytes --]

===== net/core/neighbour.c 1.47 vs edited =====
--- 1.47/net/core/neighbour.c	2004-09-24 16:21:49 -07:00
+++ edited/net/core/neighbour.c	2004-09-25 01:11:47 -07:00
@@ -110,13 +110,13 @@
 	return (net_random() % base) + (base >> 1);
 }
 
-
-static int neigh_forced_gc(struct neigh_table *tbl)
+static int neigh_forced_gc(struct neigh_table *tbl, int goal)
 {
-	int shrunk = 0;
+	int shrunk = 0, num_incomplete = 0, reap_incomplete = 0;
 	int i;
 
 	write_lock_bh(&tbl->lock);
+rescan:
 	for (i = 0; i <= tbl->hash_mask; i++) {
 		struct neighbour *n, **np;
 
@@ -125,30 +125,46 @@
 			/* Neighbour record may be discarded if:
 			   - nobody refers to it.
 			   - it is not permanent
-			   - (NEW and probably wrong)
+			   - On the first pass of this scan,
 			     INCOMPLETE entries are kept at least for
 			     n->parms->retrans_time, otherwise we could
 			     flood network with resolution requests.
-			     It is not clear, what is better table overflow
-			     or flooding.
 			 */
 			write_lock(&n->lock);
-			if (atomic_read(&n->refcnt) == 1 &&
-			    !(n->nud_state & NUD_PERMANENT) &&
-			    (n->nud_state != NUD_INCOMPLETE ||
-			     time_after(jiffies, n->used + n->parms->retrans_time))) {
-				*np	= n->next;
-				n->dead = 1;
-				shrunk	= 1;
-				write_unlock(&n->lock);
-				neigh_release(n);
-				continue;
+			if (atomic_read(&n->refcnt) != 1)
+				goto next_ent;
+
+			if (n->nud_state & NUD_PERMANENT)
+				goto next_ent;
+
+			if (n->nud_state == NUD_INCOMPLETE &&
+			    reap_incomplete == 0 &&
+			    time_after(jiffies,
+				       n->used + n->parms->retrans_time)) {
+				num_incomplete++;
+				goto next_ent;
 			}
+
+			*np	= n->next;
+			n->dead = 1;
+			shrunk++;
+			write_unlock(&n->lock);
+			neigh_release(n);
+			continue;
+
+		next_ent:
 			write_unlock(&n->lock);
 			np = &n->next;
 		}
 	}
 
+	if (reap_incomplete == 0 &&
+	    shrunk < goal &&
+	    (shrunk + num_incomplete) >= goal) {
+		reap_incomplete = 1;
+		goto rescan;
+	}
+
 	tbl->last_flush = jiffies;
 
 	write_unlock_bh(&tbl->lock);
@@ -261,7 +277,9 @@
 	if (tbl->entries > tbl->gc_thresh3 ||
 	    (tbl->entries > tbl->gc_thresh2 &&
 	     time_after(now, tbl->last_flush + 5 * HZ))) {
-		if (!neigh_forced_gc(tbl) &&
+		int goal = tbl->entries - tbl->gc_thresh3;
+
+		if (!neigh_forced_gc(tbl, goal) &&
 		    tbl->entries > tbl->gc_thresh3)
 			goto out;
 	}

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  7:56       ` David S. Miller
  2004-09-25  8:14         ` YOSHIFUJI Hideaki / 吉藤英明
@ 2004-09-25  9:09         ` Harald Welte
  2004-09-25 13:33           ` Steven Whitehouse
                             ` (2 more replies)
  2004-09-26 10:11         ` YOSHIFUJI Hideaki / 吉藤英明
                           ` (3 subsequent siblings)
  5 siblings, 3 replies; 43+ messages in thread
From: Harald Welte @ 2004-09-25  9:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 1602 bytes --]

On Sat, Sep 25, 2004 at 12:56:23AM -0700, David S. Miller wrote:
> > I'll inclue it in my next round of kernel builds and give it
> > some testing.
> 
> Thanks.

Please note that I guess I won't have any results until late Sunday/Monday.

> 4) The controversial/RFC patch, dorking with neigh_forced_gc()
> 
> So let's discuss #4.  It is the first idea I had to combat the
> "problem", but honestly right now I am beginning to think that
> the real solution is to simply remove the INCOMPLETE checks
> altogether.
> 
> Neighbours are a sub-cache of the routing cache.  Therefore when
> a neigh entry has a singular refcount, no routing cache entry
> points to it.  No routing cache entry, we're not sending packets
> to that neighbour any time soon, so there is no reason (especially
> during strong pressure) to hold onto such entries.

I am sure this is valid for IPv4 and IPv6.  How about other users of the
neighbour cache, do they share this assumption?  I have to admit that I
never looked throgh the ATM or 

> Agree or disagree?  Regardless, I'd be interested how effective
> your stress case is with patch #4 and my new suggestion which
> is just to remove the:

I'll do tests with and without INCOMPLETE check. No results until late
Sunday/Monday, as indicated above.

[yes, I noticed your corrected version of diff4]

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  9:09         ` Harald Welte
@ 2004-09-25 13:33           ` Steven Whitehouse
  2004-09-26  0:48             ` David S. Miller
  2004-09-26  3:31           ` David S. Miller
  2004-09-27  9:29           ` Harald Welte
  2 siblings, 1 reply; 43+ messages in thread
From: Steven Whitehouse @ 2004-09-25 13:33 UTC (permalink / raw)
  To: Harald Welte; +Cc: David S. Miller, netdev

Hi,

On Sat, Sep 25, 2004 at 11:09:33AM +0200, Harald Welte wrote:
> On Sat, Sep 25, 2004 at 12:56:23AM -0700, David S. Miller wrote:
> > So let's discuss #4.  It is the first idea I had to combat the
> > "problem", but honestly right now I am beginning to think that
> > the real solution is to simply remove the INCOMPLETE checks
> > altogether.
> > 
> > Neighbours are a sub-cache of the routing cache.  Therefore when
> > a neigh entry has a singular refcount, no routing cache entry
> > points to it.  No routing cache entry, we're not sending packets
> > to that neighbour any time soon, so there is no reason (especially
> > during strong pressure) to hold onto such entries.
> 
> I am sure this is valid for IPv4 and IPv6.  How about other users of the
> neighbour cache, do they share this assumption?  I have to admit that I
> never looked throgh the ATM or 
>
I cannot see this being any problem for DECnet at all.... the entry you
most want to hold on to is the entry for the default router of which
there will be a max of one per interface. This applies only in end node
mode and we hold a ref count to it anyway, so that it should have the
same effect as the routing cache entry holding a ref,

Steve.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25 13:33           ` Steven Whitehouse
@ 2004-09-26  0:48             ` David S. Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David S. Miller @ 2004-09-26  0:48 UTC (permalink / raw)
  To: Steven Whitehouse; +Cc: laforge, netdev

On Sat, 25 Sep 2004 14:33:42 +0100
Steven Whitehouse <steve@chygwyn.com> wrote:

> On Sat, Sep 25, 2004 at 11:09:33AM +0200, Harald Welte wrote:
> > On Sat, Sep 25, 2004 at 12:56:23AM -0700, David S. Miller wrote:
> > > So let's discuss #4.  It is the first idea I had to combat the
> > > "problem", but honestly right now I am beginning to think that
> > > the real solution is to simply remove the INCOMPLETE checks
> > > altogether.
> > > 
> > > Neighbours are a sub-cache of the routing cache.  Therefore when
> > > a neigh entry has a singular refcount, no routing cache entry
> > > points to it.  No routing cache entry, we're not sending packets
> > > to that neighbour any time soon, so there is no reason (especially
> > > during strong pressure) to hold onto such entries.
> > 
> > I am sure this is valid for IPv4 and IPv6.  How about other users of the
> > neighbour cache, do they share this assumption?  I have to admit that I
> > never looked throgh the ATM or 
> >
> I cannot see this being any problem for DECnet at all.... the entry you
> most want to hold on to is the entry for the default router of which
> there will be a max of one per interface. This applies only in end node
> mode and we hold a ref count to it anyway, so that it should have the
> same effect as the routing cache entry holding a ref,

And ATM clip is for ipv4's routing layer too so...

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  9:09         ` Harald Welte
  2004-09-25 13:33           ` Steven Whitehouse
@ 2004-09-26  3:31           ` David S. Miller
  2004-09-26 11:21             ` Thomas Graf
  2004-09-27  9:29           ` Harald Welte
  2 siblings, 1 reply; 43+ messages in thread
From: David S. Miller @ 2004-09-26  3:31 UTC (permalink / raw)
  To: Harald Welte; +Cc: netdev

On Sat, 25 Sep 2004 11:09:33 +0200
Harald Welte <laforge@gnumonks.org> wrote:

> I am sure this is valid for IPv4 and IPv6.  How about other users of the
> neighbour cache, do they share this assumption?  I have to admit that I
> never looked throgh the ATM or 

As Steven Whitehouse mentioned for DecNET and I mentioned for ATM clip,
this is valid.

So consider the following as well.  Any time we send to a neighbour
we will generate a routing cache entry, 'dst', and we will stick it
to skb->dst.  This dst will have neighbour, and the SKB will sit
in the neighbour SKB queue until the neighbour entry is resolved
or it times out.

Therefore I think that test for INCOMPLETE in neigh_force_gc()
is even more rediculious.

Assuming that your test case runs fine with it, what I think we should
do is just forget about my 'diff4' changes to neigh_forced_gc(), and
instead remove that INCOMPLETE test entirely.  Then, neigh_forced_gc()
will simply remove all non-permanent entries with refcount of 1.

Finally, we could think about possibly doing the following:

1) Parameterizing neigh_forced_gc() so that it purges say "N"
   entries each run instead of scanning the entire hash table.
   Perhaps some fraction of tbl->entries

2) Look seriously into making more formal the relationship between
   the routing caches and neighbour cache.

What I mean by #2 is the following.  We know that the worst case
neighbour cache usage for a table is the size of the routing cache.
No routing cache entry, no reference to a corresponding neighbour
entry.  In this sense, all of these gc_thresh* values are superfluous.

What do people think about this?

As a side note, I checked BSD just for the usual shits and grins,
and they make no limits on ARP table growth.  Each routing table
entry may allocate a ARP resolution structure.  It is not exactly
stupid, frankly.

Really, if we got rid of all of this garbage collection crap we could
delete even that annoying message spit out by net/ipv4/route.c when
we run into the problems that made us work on these changes to begin
with.

At the very least, we should have a threshold to run forced GC but
failures to GC purge should not cause neigh_create() failures.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  7:56       ` David S. Miller
  2004-09-25  8:14         ` YOSHIFUJI Hideaki / 吉藤英明
  2004-09-25  9:09         ` Harald Welte
@ 2004-09-26 10:11         ` YOSHIFUJI Hideaki / 吉藤英明
  2004-09-27 11:43         ` Herbert Xu
                           ` (2 subsequent siblings)
  5 siblings, 0 replies; 43+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-09-26 10:11 UTC (permalink / raw)
  To: davem; +Cc: laforge, netdev, yoshfuji

In article <20040925005623.2faf8faf.davem@davemloft.net> (at Sat, 25 Sep 2004 00:56:23 -0700), "David S. Miller" <davem@davemloft.net> says:

> So let's discuss #4.  It is the first idea I had to combat the
> "problem", but honestly right now I am beginning to think that
> the real solution is to simply remove the INCOMPLETE checks
> altogether.
> 
> Neighbours are a sub-cache of the routing cache.  Therefore when
> a neigh entry has a singular refcount, no routing cache entry
> points to it.  No routing cache entry, we're not sending packets
> to that neighbour any time soon, so there is no reason (especially
> during strong pressure) to hold onto such entries.

First, in IPv6, we usually do not create cache entries for now.
(I assume it is rt6_info with RTF_CACHE flag set.)
We create one if we receives
 - redirect
or
 - too big (for pmtu).

Personally, I agree to change this; always to create routing cache.

Second, we should free only STALE entries first for specification
conformity; lifetime of stale entries is not standardized,
but ones of others are.
We will be broken especially for neighbour-cache reachable-time is
longer than routing-cache lifetime) case.

So, I would say
 - stage 1: if num_entries > low_thresh,
             try purge STALE entries (with refcnt == 1).
             if not enough, schedule flushing routing cache.
 - stage 2: if num_entries > high_thresh,
             try to purge all states (except for permanent entries, 
             but including 1 second-old INCOMPLETE entries.)
             if not enough, schedule flushing routing cache.
 - stage 3: if num_entries > ultimate_thresh, we fail.
or something.

So, I cannot agree simiply removing the "INCOMPLETE" test.

-- 
Hideaki YOSHIFUJI @ USAGI Project <yoshfuji@linux-ipv6.org>
GPG FP: 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-26  3:31           ` David S. Miller
@ 2004-09-26 11:21             ` Thomas Graf
  0 siblings, 0 replies; 43+ messages in thread
From: Thomas Graf @ 2004-09-26 11:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: Harald Welte, netdev

* David S. Miller <20040925203116.4e6f9b06.davem@davemloft.net> 2004-09-25 20:31
> 1) Parameterizing neigh_forced_gc() so that it purges say "N"
>    entries each run instead of scanning the entire hash table.
>    Perhaps some fraction of tbl->entries

I'm not familiar with the neighbour code but here's a wild
thought:

Calculate the average number of neigh additions (A) over the last
few minutes. Count the number of additions since the last call
to neigh_forced_gc() (L). Make N depdendant on L, the faster the
list grows the more entries we should check by each run to avoid
the list growing too fast. Calculate a aggresivity factor out
of A and L and remove stale and incomplete entries depending on this
factor. Maybe L needs to be calculate from the last few gc runs.

I don't know if any of this is implemented already or whether
it's possible or not. The basic idea is to figure out unnatural
situations and handle them more aggressively.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  9:09         ` Harald Welte
  2004-09-25 13:33           ` Steven Whitehouse
  2004-09-26  3:31           ` David S. Miller
@ 2004-09-27  9:29           ` Harald Welte
  2004-09-27 18:57             ` David S. Miller
  2 siblings, 1 reply; 43+ messages in thread
From: Harald Welte @ 2004-09-27  9:29 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 1241 bytes --]

On Sat, Sep 25, 2004 at 11:09:33AM +0200, Harald Welte wrote:

> Please note that I guess I won't have any results until late Sunday/Monday.

Ok, there you go.  I've tested a kernel with your 6-patch-set for
cleanup/jenkins/..., and the four patch set, including

> > 4) The controversial/RFC patch, dorking with neigh_forced_gc()

It performed perfectly, I wasn't able to reproduce those overflows
anymore (testing with two /16 networks and about 200kpps incoming
packets to unresolved and non-existant neighbours)

> I'll do tests with and without INCOMPLETE check. No results until late
> Sunday/Monday, as indicated above.

I didn't even bother doing the INCOMPLETE check since Yoshifuji
indicated that this caused problems with IPv6.... and it already works
for me while the INCOMPLETE check is still in place.  If you still want
me to run it, I'll give it a go.

Please also note my next mail, where I'll publish some neighbour cache
statistics similar to rt_cache_stat

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  7:56       ` David S. Miller
                           ` (2 preceding siblings ...)
  2004-09-26 10:11         ` YOSHIFUJI Hideaki / 吉藤英明
@ 2004-09-27 11:43         ` Herbert Xu
  2004-09-27 19:12           ` David S. Miller
  2004-09-27 11:48         ` Herbert Xu
  2004-09-27 11:56         ` Herbert Xu
  5 siblings, 1 reply; 43+ messages in thread
From: Herbert Xu @ 2004-09-27 11:43 UTC (permalink / raw)
  To: David S. Miller; +Cc: laforge, netdev

David S. Miller <davem@davemloft.net> wrote:
> 
> 2) Tim Gardner's change to smooth out periodic neighbour GC.

I think tbl->gc_interval can be killed altogether now.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  7:56       ` David S. Miller
                           ` (3 preceding siblings ...)
  2004-09-27 11:43         ` Herbert Xu
@ 2004-09-27 11:48         ` Herbert Xu
  2004-09-27 18:15           ` David S. Miller
  2004-09-27 11:56         ` Herbert Xu
  5 siblings, 1 reply; 43+ messages in thread
From: Herbert Xu @ 2004-09-27 11:48 UTC (permalink / raw)
  To: David S. Miller; +Cc: laforge, netdev

David S. Miller <davem@davemloft.net> wrote:
> 
> 3) Fix locking and GFP_* bugs.
>
> @@ -400,8 +400,11 @@
>  		goto out;
>  	}
>  
> -	if (tbl->entries > (tbl->hash_mask + 1))
> +	if (tbl->entries > (tbl->hash_mask + 1)) {
> +		write_lock_bh(&tbl->lock);
>  		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
> +		write_unlock_bh(&tbl->lock);
> +	}

The locking should be outside the if block as otherwise you may grow
unnecessarily or grow into the same size :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-25  7:56       ` David S. Miller
                           ` (4 preceding siblings ...)
  2004-09-27 11:48         ` Herbert Xu
@ 2004-09-27 11:56         ` Herbert Xu
  2004-09-27 19:14           ` David S. Miller
  5 siblings, 1 reply; 43+ messages in thread
From: Herbert Xu @ 2004-09-27 11:56 UTC (permalink / raw)
  To: David S. Miller; +Cc: laforge, netdev

David S. Miller <davem@davemloft.net> wrote:
> 
> 4) The controversial/RFC patch, dorking with neigh_forced_gc()
>
> +			if (n->nud_state -= NUD_INCOMPLETE &&
> +			    reap_incomplete == 0 &&
> +			    time_after(jiffies,
> +				       n->used + n->parms->retrans_time)) {
> +				num_incomplete++;
> +				goto next_ent;

That should either be time_before, or you need to swap the arguments.

@@ -261,7 +277,9 @@
 	if (tbl->entries > tbl->gc_thresh3 ||
 	    (tbl->entries > tbl->gc_thresh2 &&
 	     time_after(now, tbl->last_flush + 5 * HZ))) {
-		if (!neigh_forced_gc(tbl) &&
+		int goal = tbl->entries - tbl->gc_thresh3;

This could be negative but I suppose it's harmless...

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-27 11:48         ` Herbert Xu
@ 2004-09-27 18:15           ` David S. Miller
  2004-09-27 21:41             ` Herbert Xu
  2004-10-02  7:50             ` Herbert Xu
  0 siblings, 2 replies; 43+ messages in thread
From: David S. Miller @ 2004-09-27 18:15 UTC (permalink / raw)
  To: Herbert Xu; +Cc: laforge, netdev

On Mon, 27 Sep 2004 21:48:33 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> > -	if (tbl->entries > (tbl->hash_mask + 1))
> > +	if (tbl->entries > (tbl->hash_mask + 1)) {
> > +		write_lock_bh(&tbl->lock);
> >  		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
> > +		write_unlock_bh(&tbl->lock);
> > +	}
> 
> The locking should be outside the if block as otherwise you may grow
> unnecessarily or grow into the same size :)

I'm not going to do that, because then we're grabbing that lock
twice on every neigh create operation and I was trying hard
to avoid that overhead.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-27  9:29           ` Harald Welte
@ 2004-09-27 18:57             ` David S. Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David S. Miller @ 2004-09-27 18:57 UTC (permalink / raw)
  To: Harald Welte; +Cc: netdev

On Mon, 27 Sep 2004 11:29:33 +0200
Harald Welte <laforge@gnumonks.org> wrote:

> > > 4) The controversial/RFC patch, dorking with neigh_forced_gc()
> 
> It performed perfectly, I wasn't able to reproduce those overflows
> anymore (testing with two /16 networks and about 200kpps incoming
> packets to unresolved and non-existant neighbours)
> 
> > I'll do tests with and without INCOMPLETE check. No results until late
> > Sunday/Monday, as indicated above.
> 
> I didn't even bother doing the INCOMPLETE check since Yoshifuji
> indicated that this caused problems with IPv6....

You might as well have.  Effectively, due to a bug that Herbert Xu
spotted, diff4 acts the same as if the INCOMPLETE checks were
removed.  The time_after() check for INCOMPLETE entries in the
diff4 changes ended up being opposite of what it should be.

Yoshifuji said:

> So, I cannot agree simiply removing the "INCOMPLETE" test.

What some standard says is meaningless in these sorts
of situations.  It lacks any sense.  If we are reaching
limits of caches and we need to create a new neighbour entry
in order to communicate with another hosts we must purge
anything we can.  I challenge something like TAHI to even
be able to check out a case like this and for the results to
be meaningful in some way.  They certainly won't be.

So I'm going to remove the INCOMPLETE tests instead of my
original diff4 since:

1) That is effectively what my buggy diff4 was doing anyways.
2) It is the simplest change and keeps us from having to scan
   the hash table twice under high load situations when we need
   the cycles very badly.

Yoshifuji, if we drop neighbour entry, we will just reprobe for
this neighbour if we should try to communicate with it again.
It is not the end of the world.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-27 11:43         ` Herbert Xu
@ 2004-09-27 19:12           ` David S. Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David S. Miller @ 2004-09-27 19:12 UTC (permalink / raw)
  To: Herbert Xu; +Cc: laforge, netdev

On Mon, 27 Sep 2004 21:43:58 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> David S. Miller <davem@davemloft.net> wrote:
> > 
> > 2) Tim Gardner's change to smooth out periodic neighbour GC.
> 
> I think tbl->gc_interval can be killed altogether now.

Let's keep it around so we don't break scripts people
have writing sysctl/procfs values to that thing.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-27 11:56         ` Herbert Xu
@ 2004-09-27 19:14           ` David S. Miller
  2004-09-27 22:26             ` [6/6]: jenkins hash for neigh / Statistics Harald Welte
  0 siblings, 1 reply; 43+ messages in thread
From: David S. Miller @ 2004-09-27 19:14 UTC (permalink / raw)
  To: Herbert Xu; +Cc: laforge, netdev

On Mon, 27 Sep 2004 21:56:10 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> David S. Miller <davem@davemloft.net> wrote:
> > 
> > 4) The controversial/RFC patch, dorking with neigh_forced_gc()
> >
> > +			if (n->nud_state -= NUD_INCOMPLETE &&
> > +			    reap_incomplete == 0 &&
> > +			    time_after(jiffies,
> > +				       n->used + n->parms->retrans_time)) {
> > +				num_incomplete++;
> > +				goto next_ent;
> 
> That should either be time_before, or you need to swap the arguments.

Good catch, and it means that the code basically behaved
as if the NUD_INCOMPLETE tests weren't even there.

So, as mentioned in another email, this is what I'm using
in the end:

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/27 11:46:12-07:00 davem@nuts.davemloft.net 
#   [NET]: Remove INCOMPLETE checks from neigh_forced_gc().
#   
#   Signed-off-by: David S. Miller <davem@davemloft.net>
# 
# net/core/neighbour.c
#   2004/09/27 11:45:40-07:00 davem@nuts.davemloft.net +3 -11
#   [NET]: Remove INCOMPLETE checks from neigh_forced_gc().
# 
diff -Nru a/net/core/neighbour.c b/net/core/neighbour.c
--- a/net/core/neighbour.c	2004-09-27 11:55:57 -07:00
+++ b/net/core/neighbour.c	2004-09-27 11:55:57 -07:00
@@ -123,20 +123,12 @@
 		np = &tbl->hash_buckets[i];
 		while ((n = *np) != NULL) {
 			/* Neighbour record may be discarded if:
-			   - nobody refers to it.
-			   - it is not permanent
-			   - (NEW and probably wrong)
-			     INCOMPLETE entries are kept at least for
-			     n->parms->retrans_time, otherwise we could
-			     flood network with resolution requests.
-			     It is not clear, what is better table overflow
-			     or flooding.
+			 * - nobody refers to it.
+			 * - it is not permanent
 			 */
 			write_lock(&n->lock);
 			if (atomic_read(&n->refcnt) == 1 &&
-			    !(n->nud_state & NUD_PERMANENT) &&
-			    (n->nud_state != NUD_INCOMPLETE ||
-			     time_after(jiffies, n->used + n->parms->retrans_time))) {
+			    !(n->nud_state & NUD_PERMANENT)) {
 				*np	= n->next;
 				n->dead = 1;
 				shrunk	= 1;

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-27 18:15           ` David S. Miller
@ 2004-09-27 21:41             ` Herbert Xu
  2004-09-27 22:00               ` Herbert Xu
  2004-10-02  7:50             ` Herbert Xu
  1 sibling, 1 reply; 43+ messages in thread
From: Herbert Xu @ 2004-09-27 21:41 UTC (permalink / raw)
  To: David S. Miller; +Cc: herbert, laforge, netdev

David S. Miller <davem@davemloft.net> wrote:
> On Mon, 27 Sep 2004 21:48:33 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
>> > -   if (tbl->entries > (tbl->hash_mask + 1))
>> > +   if (tbl->entries > (tbl->hash_mask + 1)) {
>> > +           write_lock_bh(&tbl->lock);
>> >             neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
>> > +           write_unlock_bh(&tbl->lock);
>> > +   }
>> 
>> The locking should be outside the if block as otherwise you may grow
>> unnecessarily or grow into the same size :)
> 
> I'm not going to do that, because then we're grabbing that lock
> twice on every neigh create operation and I was trying hard
> to avoid that overhead.

Please at least do something like this:

size = tbl->hash_mask + 1;
if (tbl->entries > size) {
	write_lock_bh(&tbl->lock);
	neigh_hash_grow(tbl, size << 1);
	write_unlock_bh(&tbl->lock);
}

That'll at least kill the first race.  You can kill the race of
growing into the same size by doing the if check again inside the
locks.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-27 21:41             ` Herbert Xu
@ 2004-09-27 22:00               ` Herbert Xu
  0 siblings, 0 replies; 43+ messages in thread
From: Herbert Xu @ 2004-09-27 22:00 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, herbert, laforge, netdev

Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> Please at least do something like this:
> 
> size = tbl->hash_mask + 1;
> if (tbl->entries > size) {
>        write_lock_bh(&tbl->lock);
>        neigh_hash_grow(tbl, size << 1);
>        write_unlock_bh(&tbl->lock);
> }
> 
> That'll at least kill the first race.  You can kill the race of
> growing into the same size by doing the if check again inside the
> locks.

Never mind.  Unless you do the check again inside the lock this
may actually shrink :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh / Statistics
  2004-09-27 19:14           ` David S. Miller
@ 2004-09-27 22:26             ` Harald Welte
  2004-09-27 23:06               ` David S. Miller
  0 siblings, 1 reply; 43+ messages in thread
From: Harald Welte @ 2004-09-27 22:26 UTC (permalink / raw)
  To: David S. Miller; +Cc: Herbert Xu, netdev

[-- Attachment #1: Type: text/plain, Size: 1601 bytes --]

On Mon, Sep 27, 2004 at 12:14:03PM -0700, David S. Miller wrote:
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > David S. Miller <davem@davemloft.net> wrote:
> > > 
> > > 4) The controversial/RFC patch, dorking with neigh_forced_gc()
> > >
> > > +			if (n->nud_state -= NUD_INCOMPLETE &&
> > > +			    reap_incomplete == 0 &&
> > > +			    time_after(jiffies,
> > > +				       n->used + n->parms->retrans_time)) {
> > > +				num_incomplete++;
> > > +				goto next_ent;
> > 
> > That should either be time_before, or you need to swap the arguments.
> 
> Good catch, and it means that the code basically behaved
> as if the NUD_INCOMPLETE tests weren't even there.

which also explains why my statistics code actually never catched a
forced GC that didn't fulfill it's goal ;)

to get back at the statistics code:

As stated before, I would like to change rt_stat and ct_stat in order to
include a first 'template' line, too.  This way it is easier to write a
generic foo_stat program, that could deal with any of those statistics
files, even with new ones...  but this of course would break existing
rtstat binaries.  I personally  don't care, since it's a little-known
and little-used feature, which to my knowledge is in a lot of
distributions either non-existant [Debian] or incompatible [SuSE]. What
do you think?

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh / Statistics
  2004-09-27 22:26             ` [6/6]: jenkins hash for neigh / Statistics Harald Welte
@ 2004-09-27 23:06               ` David S. Miller
  2004-09-27 23:27                 ` Stephen Hemminger
  0 siblings, 1 reply; 43+ messages in thread
From: David S. Miller @ 2004-09-27 23:06 UTC (permalink / raw)
  To: Harald Welte; +Cc: herbert, netdev, shemminger

On Tue, 28 Sep 2004 00:26:13 +0200
Harald Welte <laforge@gnumonks.org> wrote:

> As stated before, I would like to change rt_stat and ct_stat in order to
> include a first 'template' line, too.  This way it is easier to write a
> generic foo_stat program, that could deal with any of those statistics
> files, even with new ones...  but this of course would break existing
> rtstat binaries.  I personally  don't care, since it's a little-known
> and little-used feature, which to my knowledge is in a lot of
> distributions either non-existant [Debian] or incompatible [SuSE]. What
> do you think?

I agree.  And while we're add it let's get a fixed rtstat into
iproute2 and make sure that binary gets installed by default
so maybe the dists will start shipping it properly.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh / Statistics
  2004-09-27 23:06               ` David S. Miller
@ 2004-09-27 23:27                 ` Stephen Hemminger
  2004-09-28  8:44                   ` Robert Olsson
  0 siblings, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2004-09-27 23:27 UTC (permalink / raw)
  To: David S. Miller; +Cc: Harald Welte, herbert, netdev

On Mon, 2004-09-27 at 16:06 -0700, David S. Miller wrote:
> On Tue, 28 Sep 2004 00:26:13 +0200
> Harald Welte <laforge@gnumonks.org> wrote:
> 
> > As stated before, I would like to change rt_stat and ct_stat in order to
> > include a first 'template' line, too.  This way it is easier to write a
> > generic foo_stat program, that could deal with any of those statistics
> > files, even with new ones...  but this of course would break existing
> > rtstat binaries.  I personally  don't care, since it's a little-known
> > and little-used feature, which to my knowledge is in a lot of
> > distributions either non-existant [Debian] or incompatible [SuSE]. What
> > do you think?
> 
> I agree.  And while we're add it let's get a fixed rtstat into
> iproute2 and make sure that binary gets installed by default
> so maybe the dists will start shipping it properly.

I have the old one in the repository.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh / Statistics
  2004-09-27 23:27                 ` Stephen Hemminger
@ 2004-09-28  8:44                   ` Robert Olsson
  2004-09-28 11:19                     ` [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics) Harald Welte
  2004-09-28 16:27                     ` [6/6]: jenkins hash for neigh / Statistics Stephen Hemminger
  0 siblings, 2 replies; 43+ messages in thread
From: Robert Olsson @ 2004-09-28  8:44 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, Harald Welte, herbert, netdev


Stephen Hemminger writes:

 > > Harald Welte <laforge@gnumonks.org> wrote:
 > > 
 > > > As stated before, I would like to change rt_stat and ct_stat in order to
 > > > include a first 'template' line, too.  This way it is easier to write a
 > > > generic foo_stat program, that could deal with any of those statistics
 > > > files, even with new ones...  but this of course would break existing
 > > > rtstat binaries.  I personally  don't care, since it's a little-known
 > > > and little-used feature, which to my knowledge is in a lot of
 > > > distributions either non-existant [Debian] or incompatible [SuSE]. What
 > > > do you think?
 > > 
 > > I agree.  And while we're add it let's get a fixed rtstat into
 > > iproute2 and make sure that binary gets installed by default
 > > so maybe the dists will start shipping it properly.
 > 
 > I have the old one in the repository.

 I sent you the latest rtstat (Martin sent ctstat) during netfilter workshop. 
 Remember? Having a common foo_stat sounds like a good idea.

 Cheers.
						--ro
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28  8:44                   ` Robert Olsson
@ 2004-09-28 11:19                     ` Harald Welte
  2004-09-28 12:48                       ` jamal
                                         ` (2 more replies)
  2004-09-28 16:27                     ` [6/6]: jenkins hash for neigh / Statistics Stephen Hemminger
  1 sibling, 3 replies; 43+ messages in thread
From: Harald Welte @ 2004-09-28 11:19 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Stephen Hemminger, David S. Miller, herbert, netdev


[-- Attachment #1.1: Type: text/plain, Size: 2170 bytes --]

On Tue, Sep 28, 2004 at 10:44:30AM +0200, Robert Olsson wrote:
> 
> Stephen Hemminger writes:
> 
>  > > Harald Welte <laforge@gnumonks.org> wrote:
>  > > 
>  > > > As stated before, I would like to change rt_stat and ct_stat in order to
>  > > > include a first 'template' line, too.  This way it is easier to write a
>  > > > generic foo_stat program, that could deal with any of those statistics
>  > > > files, even with new ones...  but this of course would break existing
>  > > > rtstat binaries.  I personally  don't care, since it's a little-known
>  > > > and little-used feature, which to my knowledge is in a lot of
>  > > > distributions either non-existant [Debian] or incompatible [SuSE]. What
>  > > > do you think?
>  > > 
>  > > I agree.  And while we're add it let's get a fixed rtstat into
>  > > iproute2 and make sure that binary gets installed by default
>  > > so maybe the dists will start shipping it properly.
>  > 
>  > I have the old one in the repository.
> 
>  I sent you the latest rtstat (Martin sent ctstat) during netfilter workshop. 
>  Remember? Having a common foo_stat sounds like a good idea.

I'm working on this right now.  The plan is to make the parser code
shared between a commandline tool and some daemon that can be run on
production systems to gather long-term statistics.  You can select the
fields you're interested in with keys like "rt_stat:entries".  Maybe
I'll even add stuff like mysql/pgsql-logging like ulogd has, but that
future plans for now.

I've called it lnstat for now (as in linux networking statistics), but
I'm open to better suggestions on the name ;)

Please find the current kernel-part patch attached to this mail,
it adds teplate-headerlines and moves all statistics files to
/proc/net/stat.  It even cleans up the neigh_stat code a bit.
Depends on my latest neighbour statistics patch that davem has already
merged.

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #1.2: laforge-rtstat-ctstat-template.patch --]
[-- Type: text/plain, Size: 7697 bytes --]


This patch moves the following files in /proc:
	/proc/net/rt_cache_stat 	/proc/net/stat/rt_cache
	/proc/net/ip_conntrack_stat	/proc/net/stat/ip_conntrack
	/proc/net/arp_cache_stat	/proc/net/stat/arp_cache
	/proc/net/clip_arp_cache_stat	/proc/net/stat/clib_arp_cache
	/proc/net/dn_neigh_cache_stat	/proc/net/stat/dn_neigh_cache

This allows a generic statistics tool to scan for all available statistics
by doing readdir(2) on /proc/net/stat

It also adds a special first "template" line to rt_cache and ip_conntrack
in order to facilitate compatibility once somebody adds new fields to the
output lines.

WARNING: 
	This breaks existing rtstat.c and ctstat.c userspace programs
	(hopefully for the last time).  rtstat is non-existant or broken in
	major distributions anyway, and ctstat is too new for any distros
	having it picked up.  Therefore, we justify this breakage.

A new unified statistics tool for routing cache, connection tracking and
neighbour cache is under development and will be included with iproute2.

Signed-off-by: Harald Welte <laforge@gnumonks.org>

Index: linux-2.6.9-rc2-bk9-neigh1/net/ipv4/netfilter/ip_conntrack_standalone.c
===================================================================
--- linux-2.6.9-rc2-bk9-neigh1.orig/net/ipv4/netfilter/ip_conntrack_standalone.c	2004-09-28 10:33:07.000000000 +0200
+++ linux-2.6.9-rc2-bk9-neigh1/net/ipv4/netfilter/ip_conntrack_standalone.c	2004-09-28 10:55:17.000000000 +0200
@@ -270,10 +270,13 @@
 {
 	int cpu;
 
-	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
-		*pos = cpu;
+		*pos = cpu+1;
 		return &per_cpu(ip_conntrack_stat, cpu);
 	}
 
@@ -284,10 +287,10 @@
 {
 	int cpu;
 
-	for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
+	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
-		*pos = cpu;
+		*pos = cpu+1;
 		return &per_cpu(ip_conntrack_stat, cpu);
 	}
 
@@ -303,6 +306,11 @@
 	unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
 	struct ip_conntrack_stat *st = v;
 
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete\n");
+		return 0;
+	}
+
 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
 			"%08x %08x %08x %08x %08x  %08x %08x %08x \n",
 		   nr_conntracks,
@@ -729,10 +737,11 @@
 					&exp_file_ops);
 	if (!proc_exp) goto cleanup_proc;
 
-	proc_stat = proc_net_fops_create("ip_conntrack_stat", S_IRUGO,
-					 &ct_cpu_seq_fops);
+	proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
 	if (!proc_stat)
 		goto cleanup_proc_exp;
+
+	proc_stat->proc_fops = &ct_cpu_seq_fops;
 	proc_stat->owner = THIS_MODULE;
 #endif
 
Index: linux-2.6.9-rc2-bk9-neigh1/net/ipv4/route.c
===================================================================
--- linux-2.6.9-rc2-bk9-neigh1.orig/net/ipv4/route.c	2004-09-26 12:11:20.000000000 +0200
+++ linux-2.6.9-rc2-bk9-neigh1/net/ipv4/route.c	2004-09-28 12:12:42.000000000 +0200
@@ -356,10 +356,13 @@
 {
 	int cpu;
 
-	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
-		*pos = cpu;
+		*pos = cpu+1;
 		return per_cpu_ptr(rt_cache_stat, cpu);
 	}
 	return NULL;
@@ -369,10 +372,10 @@
 {
 	int cpu;
 
-	for (cpu = *pos + 1; cpu < NR_CPUS; ++cpu) {
+	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
-		*pos = cpu;
+		*pos = cpu+1;
 		return per_cpu_ptr(rt_cache_stat, cpu);
 	}
 	return NULL;
@@ -387,6 +390,11 @@
 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 {
 	struct rt_cache_stat *st = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries  in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+		return 0;
+	}
 	
 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
@@ -2783,12 +2791,16 @@
 	add_timer(&rt_secret_timer);
 
 #ifdef CONFIG_PROC_FS
+	{
+	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
-	    !proc_net_fops_create("rt_cache_stat", S_IRUGO, &rt_cpu_seq_fops)) {
+	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
+			    		     proc_net_stat))) {
 		free_percpu(rt_cache_stat);
 		return -ENOMEM;
 	}
-
+	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
+	}
 #ifdef CONFIG_NET_CLS_ROUTE
 	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
 #endif
Index: linux-2.6.9-rc2-bk9-neigh1/fs/proc/root.c
===================================================================
--- linux-2.6.9-rc2-bk9-neigh1.orig/fs/proc/root.c	2004-09-13 07:33:11.000000000 +0200
+++ linux-2.6.9-rc2-bk9-neigh1/fs/proc/root.c	2004-09-28 11:41:10.000000000 +0200
@@ -18,7 +18,7 @@
 #include <asm/bitops.h>
 #include <linux/smp_lock.h>
 
-struct proc_dir_entry *proc_net, *proc_bus, *proc_root_fs, *proc_root_driver;
+struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver;
 
 #ifdef CONFIG_SYSCTL
 struct proc_dir_entry *proc_sys_root;
@@ -53,6 +53,8 @@
 	}
 	proc_misc_init();
 	proc_net = proc_mkdir("net", NULL);
+	proc_net_stat = proc_mkdir("net/stat", NULL);
+
 #ifdef CONFIG_SYSVIPC
 	proc_mkdir("sysvipc", NULL);
 #endif
@@ -157,5 +159,6 @@
 EXPORT_SYMBOL(proc_root);
 EXPORT_SYMBOL(proc_root_fs);
 EXPORT_SYMBOL(proc_net);
+EXPORT_SYMBOL(proc_net_stat);
 EXPORT_SYMBOL(proc_bus);
 EXPORT_SYMBOL(proc_root_driver);
Index: linux-2.6.9-rc2-bk9-neigh1/include/linux/proc_fs.h
===================================================================
--- linux-2.6.9-rc2-bk9-neigh1.orig/include/linux/proc_fs.h	2004-09-13 07:33:39.000000000 +0200
+++ linux-2.6.9-rc2-bk9-neigh1/include/linux/proc_fs.h	2004-09-28 10:47:17.000000000 +0200
@@ -79,6 +79,7 @@
 extern struct proc_dir_entry proc_root;
 extern struct proc_dir_entry *proc_root_fs;
 extern struct proc_dir_entry *proc_net;
+extern struct proc_dir_entry *proc_net_stat;
 extern struct proc_dir_entry *proc_bus;
 extern struct proc_dir_entry *proc_root_driver;
 extern struct proc_dir_entry *proc_root_kcore;
Index: linux-2.6.9-rc2-bk9-neigh1/net/core/neighbour.c
===================================================================
--- linux-2.6.9-rc2-bk9-neigh1.orig/net/core/neighbour.c	2004-09-28 00:15:51.000000000 +0200
+++ linux-2.6.9-rc2-bk9-neigh1/net/core/neighbour.c	2004-09-28 10:56:44.000000000 +0200
@@ -1333,22 +1333,11 @@
 		panic("cannot create neighbour cache statistics");
 	
 #ifdef CONFIG_PROC_FS
-#define NC_STAT_SUFFIX "_stat"
-	{
-	char *proc_stat_name;
-	proc_stat_name = kmalloc(strlen(tbl->id) + 
-				 strlen(NC_STAT_SUFFIX) + 1, GFP_KERNEL);
-	if (!proc_stat_name)
-		panic("cannot allocate neighbour cache proc name buffer");
-	strcpy(proc_stat_name, tbl->id);
-	strcat(proc_stat_name, NC_STAT_SUFFIX);
-
-	tbl->pde = create_proc_entry(proc_stat_name, 0, proc_net);
+	tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat);
 	if (!tbl->pde) 
 		panic("cannot create neighbour proc dir entry");
 	tbl->pde->proc_fops = &neigh_stat_seq_fops;
 	tbl->pde->data = tbl;
-	}
 #endif
 
 	tbl->hash_mask = 0x1f;

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 11:19                     ` [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics) Harald Welte
@ 2004-09-28 12:48                       ` jamal
  2004-09-28 13:33                         ` Thomas Graf
  2004-09-28 14:22                         ` Robert Olsson
  2004-09-28 14:55                       ` Harald Welte
  2004-09-28 21:43                       ` David S. Miller
  2 siblings, 2 replies; 43+ messages in thread
From: jamal @ 2004-09-28 12:48 UTC (permalink / raw)
  To: Harald Welte
  Cc: Robert Olsson, Stephen Hemminger, David S. Miller, herbert,
	netdev

[-- Attachment #1: Type: text/plain, Size: 262 bytes --]


Thanks for changing the subject Harald (I wanted to catchup with that
thread at some point). 
Speaking of generic stats; i have a patch netlink ready which may need
some extensions. I did post it  a while back on netdev but didnt get
feedback.

cheers,
jamal



[-- Attachment #2: stats1.patch --]
[-- Type: text/plain, Size: 9342 bytes --]

--- 268rc3/net/core/Makefile	2004/08/09 02:44:08	1.1
+++ 268rc3/net/core/Makefile	2004/08/09 02:46:01
@@ -2,7 +2,7 @@
 # Makefile for the Linux networking core.
 #
 
-obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o
+obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
--- /dev/null	1998-05-05 16:32:27.000000000 -0400
+++ 268rc3/net/core/gen_stats.c	2004-08-09 09:22:21.000000000 -0400
@@ -0,0 +1,105 @@
+/*
+ * net/core/gen_stats.c
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:  Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ * Jamal Hadi Salim adapted from net_sched_api for gen purpose use
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <linux/gen_stats.h>
+
+
+/* 
+ * USAGE:
+ *
+ * declare in mystruct:
+ * struct gen_stats    mystats;
+ *
+ * increment as appropriate,eg :
+ *
+ * mystruct->mystats.packets++;
+ *
+ * update is lockless
+ *
+ * passing to user space:
+ *
+ * in routine my_dump():
+ *
+ *  if (gen_copy_stats(skb, &mystruct->mystats,MYSTAT_V), my_lock)
+ *                         goto rtattr_failure;
+ *
+ *
+ * locks:
+ *
+ * You are responsible for making sure that stats lock is
+ * initialized to something valid 
+ * (typically the table lock -- i.e updates happen only when
+ * you are dumping like here) 
+ * */
+int gen_copy_stats(struct sk_buff *skb, struct gnet_stats *st,int type, spinlock_t *lock)
+{
+	spin_lock_bh(lock);
+	RTA_PUT(skb, type, sizeof(struct gnet_stats), st);
+	spin_unlock_bh(lock);
+	return 0;
+
+rtattr_failure:
+	spin_unlock_bh(lock);
+	return -1;
+}
+
+/* 
+ * USAGE:
+ *
+ * declare your own private formated in mystruct:
+ * struct mypriv_stats    mystats;
+ *
+ * passing to user space:
+ *
+ * in routine my_dump():
+ *
+ *  if (gen_copy_xstats(skb, (void *)&mystruct->mystats,sizeof(struct mypriv_stats), MYPSTAT_V),my_lock)
+ *                         goto rtattr_failure;
+ *
+ *   Lock rules apply the same as in general stats
+ */
+int gen_copy_xstats(struct sk_buff *skb, void *st, int size, int type, spinlock_t *lock)
+{
+	spin_lock_bh(lock);
+	RTA_PUT(skb, type, size, st);
+	spin_unlock_bh(lock);
+	return 0;
+
+rtattr_failure:
+	spin_unlock_bh(lock);
+	return -1;
+}
+
+EXPORT_SYMBOL(gen_copy_stats);
+EXPORT_SYMBOL(gen_copy_xstats);
--- /dev/null	1998-05-05 16:32:27.000000000 -0400
+++ 268rc3/net/core/gen_estimator.c	2004-08-09 09:00:56.000000000 -0400
@@ -0,0 +1,207 @@
+/*
+ * net/sched/estimator.c	Simple rate estimator.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Changes:
+ *              Jamal Hadi Salim - moved it to net/core and reshulfed
+ *              names to make it usable in general net subsystem.
+ *    
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <linux/gen_stats.h>
+
+/*
+   This code is NOT intended to be used for statistics collection,
+   its purpose is to provide a base for statistical multiplexing
+   for controlled load service.
+   If you need only statistics, run a user level daemon which
+   periodically reads byte counters.
+
+   Unfortunately, rate estimation is not a very easy task.
+   F.e. I did not find a simple way to estimate the current peak rate
+   and even failed to formulate the problem 8)8)
+
+   So I preferred not to built an estimator into the scheduler,
+   but run this task separately.
+   Ideally, it should be kernel thread(s), but for now it runs
+   from timers, which puts apparent top bounds on the number of rated
+   flows, has minimal overhead on small, but is enough
+   to handle controlled load service, sets of aggregates.
+
+   We measure rate over A=(1<<interval) seconds and evaluate EWMA:
+
+   avrate = avrate*(1-W) + rate*W
+
+   where W is chosen as negative power of 2: W = 2^(-ewma_log)
+
+   The resulting time constant is:
+
+   T = A/(-ln(1-W))
+
+
+   NOTES.
+
+   * The stored value for avbps is scaled by 2^5, so that maximal
+     rate is ~1Gbit, avpps is scaled by 2^10.
+
+   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
+     for HZ=100 and HZ=1024 8)), maximal interval
+     is (HZ/4)*2^EST_MAX_INTERVAL = 8sec. Shorter intervals
+     are too expensive, longer ones can be implemented
+     at user level painlessly.
+ */
+
+#if (HZ%4) != 0
+#error Bad HZ value.
+#endif
+
+#define EST_MAX_INTERVAL	5
+
+struct gen_estimator
+{
+	struct gen_estimator	*next;
+	struct gnet_stats	*stats;
+	spinlock_t		*stats_lock;
+	unsigned		interval;
+	int			ewma_log;
+	u64			last_bytes;
+	u32			last_packets;
+	u32			avpps;
+	u32			avbps;
+};
+
+struct gen_estimator_head
+{
+	struct timer_list	timer;
+	struct gen_estimator	*list;
+};
+
+static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
+
+/* Estimator array lock */
+static rwlock_t est_lock = RW_LOCK_UNLOCKED;
+
+static void est_timer(unsigned long arg)
+{
+	int idx = (int)arg;
+	struct gen_estimator *e;
+
+	read_lock(&est_lock);
+	for (e = elist[idx].list; e; e = e->next) {
+		struct gnet_stats *st = e->stats;
+		u64 nbytes;
+		u32 npackets;
+		u32 rate;
+
+		spin_lock(e->stats_lock);
+		nbytes = st->bytes;
+		npackets = st->packets;
+		rate = (nbytes - e->last_bytes)<<(7 - idx);
+		e->last_bytes = nbytes;
+		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
+		st->bps = (e->avbps+0xF)>>5;
+
+		rate = (npackets - e->last_packets)<<(12 - idx);
+		e->last_packets = npackets;
+		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
+		e->stats->pps = (e->avpps+0x1FF)>>10;
+		spin_unlock(e->stats_lock);
+	}
+
+	mod_timer(&elist[idx].timer, jiffies + ((HZ/4)<<idx));
+	read_unlock(&est_lock);
+}
+
+int gen_new_estimator(struct gnet_stats *stats, spinlock_t *stats_lock, struct rtattr *opt)
+{
+	struct gen_estimator *est;
+	struct gnet_estimator *parm = RTA_DATA(opt);
+
+	if (RTA_PAYLOAD(opt) < sizeof(*parm))
+		return -EINVAL;
+
+	if (parm->interval < -2 || parm->interval > 3)
+		return -EINVAL;
+
+	est = kmalloc(sizeof(*est), GFP_KERNEL);
+	if (est == NULL)
+		return -ENOBUFS;
+
+	memset(est, 0, sizeof(*est));
+	est->interval = parm->interval + 2;
+	est->stats = stats;
+	est->stats_lock = stats_lock;
+	est->ewma_log = parm->ewma_log;
+	est->last_bytes = stats->bytes;
+	est->avbps = stats->bps<<5;
+	est->last_packets = stats->packets;
+	est->avpps = stats->pps<<10;
+
+	est->next = elist[est->interval].list;
+	if (est->next == NULL) {
+		init_timer(&elist[est->interval].timer);
+		elist[est->interval].timer.data = est->interval;
+		elist[est->interval].timer.expires = jiffies + ((HZ/4)<<est->interval);
+		elist[est->interval].timer.function = est_timer;
+		add_timer(&elist[est->interval].timer);
+	}
+	write_lock_bh(&est_lock);
+	elist[est->interval].list = est;
+	write_unlock_bh(&est_lock);
+	return 0;
+}
+
+void gen_kill_estimator(struct gnet_stats *stats)
+{
+	int idx;
+	struct gen_estimator *est, **pest;
+
+	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
+		int killed = 0;
+		pest = &elist[idx].list;
+		while ((est=*pest) != NULL) {
+			if (est->stats != stats) {
+				pest = &est->next;
+				continue;
+			}
+
+			write_lock_bh(&est_lock);
+			*pest = est->next;
+			write_unlock_bh(&est_lock);
+
+			kfree(est);
+			killed++;
+		}
+		if (killed && elist[idx].list == NULL)
+			del_timer(&elist[idx].timer);
+	}
+}
+
+EXPORT_SYMBOL(gen_kill_estimator);
+EXPORT_SYMBOL(gen_new_estimator);
--- /dev/null	1998-05-05 16:32:27.000000000 -0400
+++ 268rc3/include/linux/gen_stats.h	2004-08-09 09:06:29.000000000 -0400
@@ -0,0 +1,21 @@
+#ifndef __LINUX_GEN_STATS_H
+#define __LINUX_GEN_STATS_H
+
+struct gnet_stats
+{
+	__u64	bytes;			/* Number of seen bytes */
+	__u32	packets;		/* Number of seen packets	*/
+	__u32	drops;			/* Packets dropped */
+	__u32	bps;			/* Current flow byte rate */
+	__u32	pps;			/* Current flow packet rate */
+	__u32   qlen;
+	__u32   backlog;
+};
+
+struct gnet_estimator
+{
+	signed char	interval;
+	unsigned char	ewma_log;
+};
+
+#endif

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 12:48                       ` jamal
@ 2004-09-28 13:33                         ` Thomas Graf
  2004-09-29  2:22                           ` jamal
  2004-09-28 14:22                         ` Robert Olsson
  1 sibling, 1 reply; 43+ messages in thread
From: Thomas Graf @ 2004-09-28 13:33 UTC (permalink / raw)
  To: jamal
  Cc: Harald Welte, Robert Olsson, Stephen Hemminger, David S. Miller,
	herbert, netdev

> Speaking of generic stats; i have a patch netlink ready which may need
> some extensions. I did post it  a while back on netdev but didnt get
> feedback.

The code looks good and I couldn't spot any errors but I'm not
sure if the locking in gen_copy_[x]stats is a good thing.
Shouldn't that be done earlier by the caller? This prevents
corruption but it allows duplicated TLVs in an skb. I suggest
to make the caller have a lock on his data and only allow one
dumper at the same time until the dump is complete, or at least
provide a lockless variant for callers doing the locking on
their own.

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 12:48                       ` jamal
  2004-09-28 13:33                         ` Thomas Graf
@ 2004-09-28 14:22                         ` Robert Olsson
  2004-09-29  2:16                           ` jamal
  1 sibling, 1 reply; 43+ messages in thread
From: Robert Olsson @ 2004-09-28 14:22 UTC (permalink / raw)
  To: hadi
  Cc: Harald Welte, Robert Olsson, Stephen Hemminger, David S. Miller,
	herbert, netdev


jamal writes:
 > 
 > Thanks for changing the subject Harald (I wanted to catchup with that
 > thread at some point). 
 > Speaking of generic stats; i have a patch netlink ready which may need
 > some extensions. I did post it  a while back on netdev but didnt get
 > feedback.

 OK it's EWMA calc in kernel. For another purpose I tried another approach 
 to satisfy some users that wanted Cisco like average stats on our routers. 
 User app/daemon reads stats via netlink and does average calc. looks like:
 
 ifstat2  eth*
             RX --------------------------   TX -------------------------
eth0          389.4 M bit/s       40 k pps     20.7 M bit/s       40 k pps 
eth1            3.1 k bit/s        4   pps    686.5 k bit/s       65   pps 
eth2           19.3 M bit/s       37 k pps    363.5 M bit/s       38 k pps 


As usual most code is honestly stolen from Alexey. :-)

ftp://robur.slu.se/pub/Linux/net-development/ifstat2/

Cheers.
						--ro

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 11:19                     ` [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics) Harald Welte
  2004-09-28 12:48                       ` jamal
@ 2004-09-28 14:55                       ` Harald Welte
  2004-09-28 15:17                         ` Robert Olsson
  2004-09-28 21:43                       ` David S. Miller
  2 siblings, 1 reply; 43+ messages in thread
From: Harald Welte @ 2004-09-28 14:55 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Stephen Hemminger, David S. Miller, herbert, netdev


[-- Attachment #1.1: Type: text/plain, Size: 652 bytes --]

Here goes some initial client code, so you can actually test the new
proc files from kernel.

As stated above, I'll continue working on this until it's somewhat
more feature-complete.

it's a multicall-binary so

rtstat		equals		lnstat -f rt_cache
ctstat		equals		lnstat -f ip_conntrack

I do not suggest inclusion of this current code into iproute2 at this
point, it's still under a lot of flux.

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #1.2: lnstat.c --]
[-- Type: text/x-csrc, Size: 8322 bytes --]

/* lnstat.c:  Unified linux network statistics
 *
 * Copyright (C) 2004 by Harald Welte <laforge@gnumonks.org>
 *
 * Development of this code was funded by Astaro AG, http://www.astaro.com/
 *
 * Based on original concept and ideas from predecessor rtstat.c:
 *
 * Copyright 2001 by Robert Olsson <robert.olsson@its.uu.se>
 *                                 Uppsala University, Sweden
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <limits.h>
#include <time.h>
#include <getopt.h>

#include <sys/time.h>
#include <sys/types.h>


#define LNSTAT_VERSION "0.01 040928"

#define PROC_NET_STAT	"/proc/net/stat"

#define LNSTAT_MAX_FILES			32
#define LNSTAT_MAX_FIELDS_PER_LINE		32
#define LNSTAT_MAX_FIELD_NAME_LEN		32

struct lnstat_file;

struct lnstat_field {
	struct lnstat_file *file;
	unsigned int num;			/* field number in line */
	char name[LNSTAT_MAX_FIELD_NAME_LEN+1];
	unsigned long values[2];		/* two buffers for values */
};

struct lnstat_file {
	struct lnstat_file *next;
	char path[PATH_MAX+1];
	char basename[NAME_MAX+1];
	struct timeval last_read;		/* last time of read */
	struct timeval interval;		/* interval */
	FILE *fp;
	unsigned int num_fields;		/* number of fields */
	struct lnstat_field fields[LNSTAT_MAX_FIELDS_PER_LINE];
};



/* Read (and summarize for SMP) the different stats vars. */

static int scan_lines(struct lnstat_file *lf, int i)
{
	int j, num_lines = 0;

	for (j = 0; j < lf->num_fields; j++)
		lf->fields[j].values[i] = 0;

	while(!feof(lf->fp)) {
		#define FGETS_BUF_SIZE 512
		char buf[FGETS_BUF_SIZE];
		char *ptr = buf;

		num_lines++;

		fgets(buf, FGETS_BUF_SIZE-1, lf->fp); 
		gettimeofday(&lf->last_read, NULL);

		for (j = 0; j < lf->num_fields; j++)
			lf->fields[j].values[i] += strtoul(ptr, &ptr, 16);
	}
	return num_lines;
}

static int time_after(struct timeval *last, 
		      struct timeval *tout, 
		      struct timeval *now)
{
	if (now->tv_sec > last->tv_sec + tout->tv_sec)
		return 1;

	if (now->tv_sec == last->tv_sec + tout->tv_sec) {
		if (now->tv_usec > last->tv_usec + tout->tv_usec)
			return 1;
	}

	return 0;
}

int lnstat_update(struct lnstat_file *lnstat_files,
		  void (*print_cb)(struct lnstat_file *, void *), void *v)
{
	struct lnstat_file *lf;
	char buf[FGETS_BUF_SIZE];
	struct timeval tv;

	/* FIXME: for more precision we need to move this into the loop */
	gettimeofday(&tv, NULL);

	for (lf = lnstat_files; lf; lf = lf->next) {
		if (time_after(&lf->last_read, &lf->interval, &tv)) {
			rewind(lf->fp);
			fgets(buf, FGETS_BUF_SIZE-1, lf->fp);
			scan_lines(lf, 1);

			(print_cb)(lf, v);

			rewind(lf->fp);
			fgets(buf, FGETS_BUF_SIZE-1, lf->fp);
			scan_lines(lf, 0);
		}
	}

	return 0;
}

/* scan first template line and fill in per-field data structures */
static int lnstat_scan_fields(struct lnstat_file *lf)
{
	char buf[FGETS_BUF_SIZE];
	char *tok;
	int i;

	rewind(lf->fp);
	fgets(buf, FGETS_BUF_SIZE-1, lf->fp);

	tok = strtok(buf, " \t\n");
	for (i = 0; i < LNSTAT_MAX_FIELDS_PER_LINE; i++) {
		lf->fields[i].file = lf;
		strncpy(lf->fields[i].name, tok, LNSTAT_MAX_FIELD_NAME_LEN);
		/* has to be null-terminate since we initialize to zero
		 * and field size is NAME_LEN + 1 */
		tok = strtok(NULL, " \t\n");
		if (!tok) {
			lf->num_fields = i+1;
			return 0;
		}
	}
	return 0;
}

/* find out whether string 'name; is in given string array */
static int name_in_array(const int num, const char **arr, const char *name)
{
	int i;
	for (i = 0; i < num; i++) {
		if (!strcmp(arr[i], name))
			return 1;
	}
	return 0;
}

/* allocate lnstat_file and open given file */
static struct lnstat_file *alloc_and_open(const char *path, const char *file)
{
	struct lnstat_file *lf;

	/* allocate */
	lf = malloc(sizeof(*lf));
	if (!lf)
		return NULL;

	/* initialize */
	memset(lf, 0, sizeof(*lf));

	/* de->d_name is guaranteed to be <= NAME_MAX */
	strcpy(lf->basename, file);
	strcpy(lf->path, path);
	strcat(lf->path, "/");
	strcat(lf->path, lf->basename);

	/* open */
	lf->fp = fopen(lf->path, "r");
	if (!lf->fp) {
		free(lf);
		return NULL;
	}

	return lf;
}


/* lnstat_scan_dir - find and parse all available statistics files/fields */
struct lnstat_file *lnstat_scan_dir(const char *path, const int num_req_files,
				    const char **req_files)
{
	DIR *dir;
	struct lnstat_file *lnstat_files = NULL;
	struct dirent *de;

	if (!path)
		path = PROC_NET_STAT;
	
	dir = opendir(path);
	if (!dir)
		return NULL;

	while (de = readdir(dir)) {
		struct lnstat_file *lf;

		if (de->d_type != DT_REG)
			continue;

		if (num_req_files && !name_in_array(num_req_files,
						    req_files, de->d_name))
			continue;

		lf = alloc_and_open(path, de->d_name);
		if (!lf)
			return NULL;

		/* fill in field structure */
		if (lnstat_scan_fields(lf) < 0)
			return NULL;

		/* prepend to global list */
		lf->next = lnstat_files;
		lnstat_files = lf;
	}
	closedir(dir);

	return lnstat_files;
}

int lnstat_dump(FILE *outfd, struct lnstat_file *lnstat_files)
{
	struct lnstat_file *lf;

	for (lf = lnstat_files; lf; lf = lf->next) {
		int i;

		fprintf(outfd, "%s:\n", lf->path);

		for (i = 0; i < lf->num_fields; i++)
			fprintf(outfd, "\t%2u: %s\n", i+1, lf->fields[i].name);

	}
	return 0;
}



static struct option opts[] = {
	{ "version", 0, NULL, 'V' },
	{ "help", 0, NULL, 'h' },
	{ "interval", 1, NULL, 'i' },
	{ "subject", 1, NULL, 's' },
	{ "file", 1, NULL, 'f' },
	{ "key", 1, NULL, 'k' },
};

static int usage(char *name, int exit_code)
{
	fprintf(stderr, "%s Version %s\n", name, LNSTAT_VERSION);
	fprintf(stderr, "Copyright (C) 2004 by Harald Welte "
			"<laforge@gnumonks.org>\n");
	fprintf(stderr, "This program is free software with ABSOLUTELY NO "
			"WARRANTY.\n\n");
	fprintf(stderr, "Parameters:\n");
	fprintf(stderr, "\t-h --help\t\tThis help message\n");
	fprintf(stderr, "\t-i --interval\t\tSet interval\n");
	fprintf(stderr, "\t-s --subject [0-2]\t?\n");	
	fprintf(stderr, "\t-f --file\t\tStatistics file to use\n");	
	fprintf(stderr, "\n");	

	exit(exit_code);
}

static void print_cb(struct lnstat_file *lf, void *v)
{
	int i;
	int *interval = v;

	for (i = 0; i < lf->num_fields; i++) {
		unsigned long val = (lf->fields[i].values[1]
				    -lf->fields[i].values[0])/(*interval);
		fprintf(stdout, "%5lu ", val);
	}
	fputc('\n', stdout);
}


int main(int argc, char **argv)
{
	char *basename;
	int c, i = 1, interval = 2, hdr = 2;
	struct lnstat_file *lnstat_files;

	int num_req_files = 0, num_req_keys = 0;
	char *req_files[LNSTAT_MAX_FILES];
	char *req_keys[LNSTAT_MAX_FILES*LNSTAT_MAX_FIELDS_PER_LINE];
  
	/* backwards compatibility mode for old tools */
	basename = strrchr(argv[0], '/') + 1;
	if (!strcmp(basename, "rtstat")) {
		/* rtstat compatibility mode */
		req_files[0] = "rt_cache";
		num_req_files = 1;
	} else if (!strcmp(basename, "ctstat")) {
		/* ctstat compatibility mode */
		req_files[0] = "ip_conntrack";
		num_req_files = 1;
	}

	while ((c = getopt_long(argc, argv,"Vh?s:i:f:k:", opts, NULL)) != -1)
		switch (c) {
			case '?':
			case 'h':       
				usage(argv[0], 0);
			case 'i':
				sscanf(optarg, "%u", &interval);
				break;
			case 's':
				sscanf(optarg, "%u", &hdr);
				break;
			case 'f':
				req_files[num_req_files++] = strdup(optarg);
				break;
			case 'k':
				req_keys[num_req_keys++] = strdup(optarg);
				break;
			default:
				usage(argv[0], 1);
		}

	if (interval < 1 ) 
		interval=1;

	lnstat_files = lnstat_scan_dir(PROC_NET_STAT, num_req_files,
				       req_files);

	lnstat_dump(stderr, lnstat_files);

	for (; 1; i++) {
#if 0
		if (hdr > 1 && (! (i % 20)))
			print_hdr_line();
#endif
		lnstat_update(lnstat_files, &print_cb, &interval);
		sleep(interval);
	}

	return 1;
}


[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 14:55                       ` Harald Welte
@ 2004-09-28 15:17                         ` Robert Olsson
  2004-09-28 16:24                           ` Harald Welte
  0 siblings, 1 reply; 43+ messages in thread
From: Robert Olsson @ 2004-09-28 15:17 UTC (permalink / raw)
  To: Harald Welte
  Cc: Robert Olsson, Stephen Hemminger, David S. Miller, herbert,
	netdev


Harald Welte writes:
 > Here goes some initial client code, so you can actually test the new
 > proc files from kernel.
 > 
 > As stated above, I'll continue working on this until it's somewhat
 > more feature-complete.

 OK! Seems pretty generic, maybe the stats info for headers could be read 
 from current kernels /proc? to keep kernel stats in sync with this user app?

 Cheers.
						--ro

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 15:17                         ` Robert Olsson
@ 2004-09-28 16:24                           ` Harald Welte
  0 siblings, 0 replies; 43+ messages in thread
From: Harald Welte @ 2004-09-28 16:24 UTC (permalink / raw)
  To: Robert Olsson; +Cc: Stephen Hemminger, David S. Miller, herbert, netdev

[-- Attachment #1: Type: text/plain, Size: 1087 bytes --]

On Tue, Sep 28, 2004 at 05:17:51PM +0200, Robert Olsson wrote:
> 
> Harald Welte writes:
>  > Here goes some initial client code, so you can actually test the new
>  > proc files from kernel.
>  > 
>  > As stated above, I'll continue working on this until it's somewhat
>  > more feature-complete.
> 
>  OK! Seems pretty generic, maybe the stats info for headers could be read 
>  from current kernels /proc? to keep kernel stats in sync with this user app?

that's what it intends to do.  The field names are all stored in
'lnstat_field.name', so it is easy to build that line.  However, it has
to be multiple-line, since the field names from the kernel are sometimes
quite long...

The only difference is that it won't show you "IN: hits" but rather
"in_hits" which I think we can safely ignore.

>  Cheers.
> 						--ro

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh / Statistics
  2004-09-28  8:44                   ` Robert Olsson
  2004-09-28 11:19                     ` [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics) Harald Welte
@ 2004-09-28 16:27                     ` Stephen Hemminger
  2004-09-28 17:06                       ` Harald Welte
  1 sibling, 1 reply; 43+ messages in thread
From: Stephen Hemminger @ 2004-09-28 16:27 UTC (permalink / raw)
  To: Robert Olsson; +Cc: David S. Miller, Harald Welte, herbert, netdev

On Tue, 2004-09-28 at 10:44 +0200, Robert Olsson wrote:
> Stephen Hemminger writes:
> 
>  > > Harald Welte <laforge@gnumonks.org> wrote:
>  > > 
>  > > > As stated before, I would like to change rt_stat and ct_stat in order to
>  > > > include a first 'template' line, too.  This way it is easier to write a
>  > > > generic foo_stat program, that could deal with any of those statistics
>  > > > files, even with new ones...  but this of course would break existing
>  > > > rtstat binaries.  I personally  don't care, since it's a little-known
>  > > > and little-used feature, which to my knowledge is in a lot of
>  > > > distributions either non-existant [Debian] or incompatible [SuSE]. What
>  > > > do you think?
>  > > 
>  > > I agree.  And while we're add it let's get a fixed rtstat into
>  > > iproute2 and make sure that binary gets installed by default
>  > > so maybe the dists will start shipping it properly.
>  > 
>  > I have the old one in the repository.
> 
>  I sent you the latest rtstat (Martin sent ctstat) during netfilter workshop. 
>  Remember? Having a common foo_stat sounds like a good idea.

I have the newer rtstat/ctstat queued for inclusion this week unless
Harald get's his newer one done sooner (hint, hint)

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh / Statistics
  2004-09-28 16:27                     ` [6/6]: jenkins hash for neigh / Statistics Stephen Hemminger
@ 2004-09-28 17:06                       ` Harald Welte
  0 siblings, 0 replies; 43+ messages in thread
From: Harald Welte @ 2004-09-28 17:06 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Robert Olsson, David S. Miller, herbert, netdev

[-- Attachment #1: Type: text/plain, Size: 537 bytes --]

 
> I have the newer rtstat/ctstat queued for inclusion this week unless
> Harald get's his newer one done sooner (hint, hint)

well, the new version currently provides backwards compatibility for the
old commands, but it doesn't support old kernels... I'll see how I can
integrate that.

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 11:19                     ` [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics) Harald Welte
  2004-09-28 12:48                       ` jamal
  2004-09-28 14:55                       ` Harald Welte
@ 2004-09-28 21:43                       ` David S. Miller
  2004-09-29  8:04                         ` Harald Welte
  2 siblings, 1 reply; 43+ messages in thread
From: David S. Miller @ 2004-09-28 21:43 UTC (permalink / raw)
  To: Harald Welte; +Cc: Robert.Olsson, shemminger, herbert, netdev

On Tue, 28 Sep 2004 13:19:06 +0200
Harald Welte <laforge@gnumonks.org> wrote:

> Please find the current kernel-part patch attached to this mail,
> it adds teplate-headerlines and moves all statistics files to
> /proc/net/stat.  It even cleans up the neigh_stat code a bit.
> Depends on my latest neighbour statistics patch that davem has already
> merged.

I like this patch a lot and have applied it.

Thanks Harald.

Harald, we've created a lot of churn over the past week or
two.  Can I expect a set of 2.4.x patches for all the neigh
stuff and this generic statistics thing sometime soon?

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 14:22                         ` Robert Olsson
@ 2004-09-29  2:16                           ` jamal
  0 siblings, 0 replies; 43+ messages in thread
From: jamal @ 2004-09-29  2:16 UTC (permalink / raw)
  To: Robert Olsson
  Cc: Harald Welte, Stephen Hemminger, David S. Miller, herbert, netdev

On Tue, 2004-09-28 at 10:22, Robert Olsson wrote:
>  OK it's EWMA calc in kernel. 
[..]
> As usual most code is honestly stolen from Alexey. :-)

>From the maestro himself ;-> Why reinvent music my friend?
You are probably using the same code. I just ripped it from netsched
and made it available to the general masses.
BTW, if you see any missing tricks there let me know

cheers,
jamal

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 13:33                         ` Thomas Graf
@ 2004-09-29  2:22                           ` jamal
  0 siblings, 0 replies; 43+ messages in thread
From: jamal @ 2004-09-29  2:22 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Harald Welte, Robert Olsson, Stephen Hemminger, David S. Miller,
	herbert, netdev

On Tue, 2004-09-28 at 09:33, Thomas Graf wrote:
> > Speaking of generic stats; i have a patch netlink ready which may need
> > some extensions. I did post it  a while back on netdev but didnt get
> > feedback.
> 
> The code looks good and I couldn't spot any errors but I'm not
> sure if the locking in gen_copy_[x]stats is a good thing.
> Shouldn't that be done earlier by the caller? 

In the netsched code that became a portability issue; Dave fixed it
there, so i just replicated here. If you feel like doing something
clever you are welcome to submit a patch.

> This prevents
> corruption but it allows duplicated TLVs in an skb. I suggest
> to make the caller have a lock on his data and only allow one
> dumper at the same time until the dump is complete, or at least
> provide a lockless variant for callers doing the locking on
> their own.
> 

Reminds me:
gnet_stats needs to have TLVs embedded in it.
bytes,drops, packets are generic enough; others are not.
So if we add a length field then we can add TLVs for things like
QSTATS = { qlen, backlog}  etc.
This means we could then allow for adding a lot of different
stats. A big lesson from current tc_stats.

cheers,
jamal

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics)
  2004-09-28 21:43                       ` David S. Miller
@ 2004-09-29  8:04                         ` Harald Welte
  0 siblings, 0 replies; 43+ messages in thread
From: Harald Welte @ 2004-09-29  8:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: Robert.Olsson, shemminger, herbert, netdev

[-- Attachment #1: Type: text/plain, Size: 640 bytes --]

On Tue, Sep 28, 2004 at 02:43:56PM -0700, David S. Miller wrote:

> I like this patch a lot and have applied it.

thanks :)

> Harald, we've created a lot of churn over the past week or
> two.  Can I expect a set of 2.4.x patches for all the neigh
> stuff and this generic statistics thing sometime soon?

Yes, I'll be working on this today or tomorrow, so you can expect a
patch soon.

-- 
- Harald Welte <laforge@gnumonks.org>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-09-27 18:15           ` David S. Miller
  2004-09-27 21:41             ` Herbert Xu
@ 2004-10-02  7:50             ` Herbert Xu
  2004-10-03 21:55               ` David S. Miller
  1 sibling, 1 reply; 43+ messages in thread
From: Herbert Xu @ 2004-10-02  7:50 UTC (permalink / raw)
  To: David S. Miller; +Cc: laforge, netdev

[-- Attachment #1: Type: text/plain, Size: 1029 bytes --]

On Mon, Sep 27, 2004 at 11:15:20AM -0700, David S. Miller wrote:
> On Mon, 27 Sep 2004 21:48:33 +1000
> Herbert Xu <herbert@gondor.apana.org.au> wrote:
> 
> > > -	if (tbl->entries > (tbl->hash_mask + 1))
> > > +	if (tbl->entries > (tbl->hash_mask + 1)) {
> > > +		write_lock_bh(&tbl->lock);
> > >  		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
> > > +		write_unlock_bh(&tbl->lock);
> > > +	}
> > 
> > The locking should be outside the if block as otherwise you may grow
> > unnecessarily or grow into the same size :)
> 
> I'm not going to do that, because then we're grabbing that lock
> twice on every neigh create operation and I was trying hard
> to avoid that overhead.

Actually, why don't we just move the expansion into the locked section?

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #2: p --]
[-- Type: text/plain, Size: 714 bytes --]

===== net/core/neighbour.c 1.51 vs edited =====
--- 1.51/net/core/neighbour.c	2004-10-02 07:58:21 +10:00
+++ edited/net/core/neighbour.c	2004-10-02 17:47:11 +10:00
@@ -406,12 +406,6 @@
 		goto out;
 	}
 
-	if (tbl->entries > (tbl->hash_mask + 1)) {
-		write_lock_bh(&tbl->lock);
-		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
-		write_unlock_bh(&tbl->lock);
-	}
-
 	memcpy(n->primary_key, pkey, key_len);
 	n->dev = dev;
 	dev_hold(dev);
@@ -432,6 +426,9 @@
 	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
 
 	write_lock_bh(&tbl->lock);
+
+	if (tbl->entries > (tbl->hash_mask + 1)) {
+		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
 
 	hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
 

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [6/6]: jenkins hash for neigh
  2004-10-02  7:50             ` Herbert Xu
@ 2004-10-03 21:55               ` David S. Miller
  0 siblings, 0 replies; 43+ messages in thread
From: David S. Miller @ 2004-10-03 21:55 UTC (permalink / raw)
  To: Herbert Xu; +Cc: laforge, netdev

On Sat, 2 Oct 2004 17:50:51 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:

> On Mon, Sep 27, 2004 at 11:15:20AM -0700, David S. Miller wrote:
> > On Mon, 27 Sep 2004 21:48:33 +1000
> > Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > 
> > > > -	if (tbl->entries > (tbl->hash_mask + 1))
> > > > +	if (tbl->entries > (tbl->hash_mask + 1)) {
> > > > +		write_lock_bh(&tbl->lock);
> > > >  		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
> > > > +		write_unlock_bh(&tbl->lock);
> > > > +	}
> > > 
> > > The locking should be outside the if block as otherwise you may grow
> > > unnecessarily or grow into the same size :)
> > 
> > I'm not going to do that, because then we're grabbing that lock
> > twice on every neigh create operation and I was trying hard
> > to avoid that overhead.
> 
> Actually, why don't we just move the expansion into the locked section?
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Works for me, patch applied.

Thanks Herbert.

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2004-10-03 21:55 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-09-24  5:51 [6/6]: jenkins hash for neigh David S. Miller
2004-09-24  8:52 ` Harald Welte
2004-09-24 21:27   ` David S. Miller
2004-09-25  6:44     ` Harald Welte
2004-09-25  7:56       ` David S. Miller
2004-09-25  8:14         ` YOSHIFUJI Hideaki / 吉藤英明
2004-09-25  8:27           ` YOSHIFUJI Hideaki / 吉藤英明
2004-09-25  8:30             ` David S. Miller
2004-09-25  9:09         ` Harald Welte
2004-09-25 13:33           ` Steven Whitehouse
2004-09-26  0:48             ` David S. Miller
2004-09-26  3:31           ` David S. Miller
2004-09-26 11:21             ` Thomas Graf
2004-09-27  9:29           ` Harald Welte
2004-09-27 18:57             ` David S. Miller
2004-09-26 10:11         ` YOSHIFUJI Hideaki / 吉藤英明
2004-09-27 11:43         ` Herbert Xu
2004-09-27 19:12           ` David S. Miller
2004-09-27 11:48         ` Herbert Xu
2004-09-27 18:15           ` David S. Miller
2004-09-27 21:41             ` Herbert Xu
2004-09-27 22:00               ` Herbert Xu
2004-10-02  7:50             ` Herbert Xu
2004-10-03 21:55               ` David S. Miller
2004-09-27 11:56         ` Herbert Xu
2004-09-27 19:14           ` David S. Miller
2004-09-27 22:26             ` [6/6]: jenkins hash for neigh / Statistics Harald Welte
2004-09-27 23:06               ` David S. Miller
2004-09-27 23:27                 ` Stephen Hemminger
2004-09-28  8:44                   ` Robert Olsson
2004-09-28 11:19                     ` [PATCH 2.6] generic network statistics (was Re: [6/6]: jenkins hash for neigh / Statistics) Harald Welte
2004-09-28 12:48                       ` jamal
2004-09-28 13:33                         ` Thomas Graf
2004-09-29  2:22                           ` jamal
2004-09-28 14:22                         ` Robert Olsson
2004-09-29  2:16                           ` jamal
2004-09-28 14:55                       ` Harald Welte
2004-09-28 15:17                         ` Robert Olsson
2004-09-28 16:24                           ` Harald Welte
2004-09-28 21:43                       ` David S. Miller
2004-09-29  8:04                         ` Harald Welte
2004-09-28 16:27                     ` [6/6]: jenkins hash for neigh / Statistics Stephen Hemminger
2004-09-28 17:06                       ` Harald Welte

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).