* [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 10:16 UTC
To: David S. Miller; +Cc: Linux Netdev List, Andi Kleen, Arnaldo Carvalho de Melo
[-- Attachment #1: Type: text/plain, Size: 1309 bytes --]
As done two years ago on the IP route cache table (commit
22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
hash bucket for the huge TCP/DCCP hash tables.

On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
performance difference. (We hit a different cache line for the rwlock, but
then the bucket cache line has a better sharing factor among CPUs, since we
dirty it less often.)

Using a 'small' table of hashed rwlocks should be more than enough to provide
correct SMP concurrency between different buckets, without using too much
memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
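
As an aside, the same idea in user-space miniature (illustrative names only,
pthreads standing in for kernel rwlocks; none of this is from the patch
itself):

#include <pthread.h>

/* A small, power-of-two array of locks serving a much larger hash table:
 * buckets whose hashes agree in the low bits share one lock, while buckets
 * differing in those bits can still be locked concurrently on SMP.
 */
#define EXAMPLE_LOCK_SZ 256	/* must be a power of two */

static pthread_rwlock_t example_locks[EXAMPLE_LOCK_SZ];

static void example_locks_init(void)
{
	unsigned int i;

	for (i = 0; i < EXAMPLE_LOCK_SZ; i++)
		pthread_rwlock_init(&example_locks[i], NULL);
}

/* Same mapping the patch uses: mask the bucket hash down to a lock index. */
static pthread_rwlock_t *example_lockp(unsigned int hash)
{
	return &example_locks[hash & (EXAMPLE_LOCK_SZ - 1)];
}
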
This patch provides some locking abstraction that may ease future work using
a different model for the TCP/DCCP table.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

 include/net/inet_hashtables.h |   40 ++++++++++++++++++++++++++++----
 net/dccp/proto.c              |   16 ++++++++++--
 net/ipv4/inet_diag.c          |    9 ++++---
 net/ipv4/inet_hashtables.c    |    7 +++--
 net/ipv4/inet_timewait_sock.c |   13 +++++-----
 net/ipv4/tcp.c                |   11 +++++++-
 net/ipv4/tcp_ipv4.c           |   11 ++++----
 net/ipv6/inet6_hashtables.c   |   19 ++++++++-------
 8 files changed, 89 insertions(+), 37 deletions(-)
[-- Attachment #2: tcp_ehash_locks.patch --]
[-- Type: text/plain, Size: 13597 bytes --]
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4427dcd..5cbfbac 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -37,7 +37,6 @@
* I'll experiment with dynamic table growth later.
*/
struct inet_ehash_bucket {
- rwlock_t lock;
struct hlist_head chain;
struct hlist_head twchain;
};
@@ -91,6 +90,28 @@ struct inet_bind_hashbucket {
/* This is for listening sockets, thus all sockets which possess wildcards. */
#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
+#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
+/*
+ * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
+ * The size of this table is a power of two and depends on the number of CPUs.
+ */
+# if defined(CONFIG_DEBUG_LOCK_ALLOC)
+# define EHASH_LOCK_SZ 256
+# elif NR_CPUS >= 32
+# define EHASH_LOCK_SZ 4096
+# elif NR_CPUS >= 16
+# define EHASH_LOCK_SZ 2048
+# elif NR_CPUS >= 8
+# define EHASH_LOCK_SZ 1024
+# elif NR_CPUS >= 4
+# define EHASH_LOCK_SZ 512
+# else
+# define EHASH_LOCK_SZ 256
+# endif
+#else
+# define EHASH_LOCK_SZ 0
+#endif
+
struct inet_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
@@ -100,6 +121,7 @@ struct inet_hashinfo {
* TIME_WAIT sockets use a separate chain (twchain).
*/
struct inet_ehash_bucket *ehash;
+ rwlock_t *ehash_locks;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
@@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
}
+static inline rwlock_t *inet_ehash_lockp(
+ struct inet_hashinfo *hashinfo,
+ unsigned int hash)
+{
+ return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
+}
+
extern struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep,
struct inet_bind_hashbucket *head,
@@ -222,7 +251,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo,
sk->sk_hash = inet_sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
- lock = &head->lock;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
write_lock(lock);
}
__sk_add_node(sk, list);
@@ -253,7 +282,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
inet_listen_wlock(hashinfo);
lock = &hashinfo->lhash_lock;
} else {
- lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
write_lock_bh(lock);
}
@@ -354,9 +383,10 @@ static inline struct sock *
*/
unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
prefetch(head->chain.first);
- read_lock(&head->lock);
+ read_lock(lock);
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
@@ -369,7 +399,7 @@ static inline struct sock *
}
sk = NULL;
out:
- read_unlock(&head->lock);
+ read_unlock(lock);
return sk;
hit:
sock_hold(sk);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d849739..3b5f97a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1072,11 +1072,18 @@ static int __init dccp_init(void)
}
for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
- rwlock_init(&dccp_hashinfo.ehash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
}
-
+ if (EHASH_LOCK_SZ) {
+ dccp_hashinfo.ehash_locks =
+ kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!dccp_hashinfo.ehash_locks)
+ goto out_free_dccp_ehash;
+ for (i = 0; i < EHASH_LOCK_SZ; i++)
+ rwlock_init(&dccp_hashinfo.ehash_locks[i]);
+ }
bhash_order = ehash_order;
do {
@@ -1091,7 +1098,7 @@ static int __init dccp_init(void)
if (!dccp_hashinfo.bhash) {
DCCP_CRIT("Failed to allocate DCCP bind hash table");
- goto out_free_dccp_ehash;
+ goto out_free_dccp_locks;
}
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -1121,6 +1128,9 @@ out_free_dccp_mib:
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
dccp_hashinfo.bhash = NULL;
+out_free_dccp_locks:
+ kfree(dccp_hashinfo.ehash_locks);
+ dccp_hashinfo.ehash_locks = NULL;
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
dccp_hashinfo.ehash = NULL;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index dc429b6..b017073 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -747,13 +747,14 @@ skip_listen_ht:
for (i = s_i; i < hashinfo->ehash_size; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct sock *sk;
struct hlist_node *node;
if (i > s_i)
s_num = 0;
- read_lock_bh(&head->lock);
+ read_lock_bh(lock);
num = 0;
sk_for_each(sk, node, &head->chain) {
struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ skip_listen_ht:
r->id.idiag_dport)
goto next_normal;
if (inet_csk_diag_dump(sk, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(lock);
goto done;
}
next_normal:
@@ -791,14 +792,14 @@ next_normal:
r->id.idiag_dport)
goto next_dying;
if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(lock);
goto done;
}
next_dying:
++num;
}
}
- read_unlock_bh(&head->lock);
+ read_unlock_bh(lock);
}
done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 16eecc7..67704da 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_node *node;
struct inet_timewait_sock *tw;
prefetch(head->chain.first);
- write_lock(&head->lock);
+ write_lock(lock);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ unique:
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ write_unlock(lock);
if (twp) {
*twp = tw;
@@ -255,7 +256,7 @@ unique:
return 0;
not_unique:
- write_unlock(&head->lock);
+ write_unlock(lock);
return -EADDRNOTAVAIL;
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e2..a60b99e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
struct inet_bind_hashbucket *bhead;
struct inet_bind_bucket *tb;
/* Unlink from established hashes. */
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
- write_lock(&ehead->lock);
+ write_lock(lock);
if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
+ write_unlock(lock);
return;
}
__hlist_del(&tw->tw_node);
sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+ write_unlock(lock);
/* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
struct inet_bind_hashbucket *bhead;
/* Step 1: Put TW into bind hash. Original socket stays there too.
Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
spin_unlock(&bhead->lock);
- write_lock(&ehead->lock);
+ write_lock(lock);
/* Step 2: Remove SK from established hash. */
if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
inet_twsk_add_node(tw, &ehead->twchain);
atomic_inc(&tw->tw_refcnt);
- write_unlock(&ehead->lock);
+ write_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c64072b..e7aca8e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2456,11 +2456,18 @@ void __init tcp_init(void)
thash_entries ? 0 : 512 * 1024);
tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
- rwlock_init(&tcp_hashinfo.ehash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
}
-
+ if (EHASH_LOCK_SZ) {
+ tcp_hashinfo.ehash_locks =
+ kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!tcp_hashinfo.ehash_locks)
+ panic("TCP: failed to alloc ehash_locks");
+ for (i = 0; i < EHASH_LOCK_SZ; i++)
+ rwlock_init(&tcp_hashinfo.ehash_locks[i]);
+ }
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eec02b2..cd82c0e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq)
struct sock *sk;
struct hlist_node *node;
struct inet_timewait_sock *tw;
+ rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
- read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_lock_bh(lock);
sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
@@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq)
rc = tw;
goto out;
}
- read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_unlock_bh(lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2094,11 +2095,11 @@ get_tw:
cur = tw;
goto out;
}
- read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
st->state = TCP_SEQ_STATE_ESTABLISHED;
if (++st->bucket < tcp_hashinfo.ehash_size) {
- read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
} else {
cur = NULL;
@@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
break;
}
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d6f1026..adc73ad 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -37,9 +37,8 @@ void __inet6_hash(struct inet_hashinfo *hashinfo,
} else {
unsigned int hash;
sk->sk_hash = hash = inet6_sk_ehashfn(sk);
- hash &= (hashinfo->ehash_size - 1);
- list = &hashinfo->ehash[hash].chain;
- lock = &hashinfo->ehash[hash].lock;
+ list = &inet_ehash_bucket(hashinfo, hash)->chain;
+ lock = inet_ehash_lockp(hashinfo, hash);
write_lock(lock);
}
@@ -70,9 +69,10 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
*/
unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
prefetch(head->chain.first);
- read_lock(&head->lock);
+ read_lock(lock);
sk_for_each(sk, node, &head->chain) {
/* For IPV6 do the cheaper port and family tests first. */
if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -92,12 +92,12 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
goto hit;
}
}
- read_unlock(&head->lock);
+ read_unlock(lock);
return NULL;
hit:
sock_hold(sk);
- read_unlock(&head->lock);
+ read_unlock(lock);
return sk;
}
EXPORT_SYMBOL(__inet6_lookup_established);
@@ -175,12 +175,13 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
const unsigned int hash = inet6_ehashfn(daddr, lport, saddr,
inet->dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_node *node;
struct inet_timewait_sock *tw;
prefetch(head->chain.first);
- write_lock(&head->lock);
+ write_lock(lock);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &head->twchain) {
@@ -216,7 +217,7 @@ unique:
__sk_add_node(sk, &head->chain);
sk->sk_hash = hash;
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ write_unlock(lock);
if (twp != NULL) {
*twp = tw;
@@ -231,7 +232,7 @@ unique:
return 0;
not_unique:
- write_unlock(&head->lock);
+ write_unlock(lock);
return -EADDRNOTAVAIL;
}

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-01 11:03 UTC
To: dada1; +Cc: netdev, ak, acme

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 01 Nov 2007 11:16:20 +0100

> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
> performance difference. (We hit a different cache line for the rwlock, but
> then the bucket cache line has a better sharing factor among CPUs, since we
> dirty it less often.)
>
> Using a 'small' table of hashed rwlocks should be more than enough to provide
> correct SMP concurrency between different buckets, without using too much
> memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
>
> This patch provides some locking abstraction that may ease future work using
> a different model for the TCP/DCCP table.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Nice work Eric.

I've tossed this into my local tree and we'll let this cook
for a few days.  If no problems crop up I will submit it
for 2.6.24 because the memory savings are non-trivial.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Arnaldo Carvalho de Melo @ 2007-11-01 11:20 UTC
To: David Miller; +Cc: dada1, netdev, ak, acme

On Thu, Nov 01, 2007 at 04:03:40AM -0700, David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Thu, 01 Nov 2007 11:16:20 +0100
>
> > As done two years ago on the IP route cache table (commit
> > 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> > hash bucket for the huge TCP/DCCP hash tables.
> >
> > On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
> > performance difference. (We hit a different cache line for the rwlock, but
> > then the bucket cache line has a better sharing factor among CPUs, since we
> > dirty it less often.)
> >
> > Using a 'small' table of hashed rwlocks should be more than enough to provide
> > correct SMP concurrency between different buckets, without using too much
> > memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
> >
> > This patch provides some locking abstraction that may ease future work using
> > a different model for the TCP/DCCP table.
> >
> > Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>
> Nice work Eric.
>
> I've tossed this into my local tree and we'll let this cook
> for a few days.  If no problems crop up I will submit it
> for 2.6.24 because the memory savings are non-trivial.

Agreed, thanks!

Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Ilpo Järvinen @ 2007-11-01 11:15 UTC
To: Eric Dumazet
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

On Thu, 1 Nov 2007, Eric Dumazet wrote:

> @@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
> return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
> }
>
> +static inline rwlock_t *inet_ehash_lockp(
> + struct inet_hashinfo *hashinfo,

...These two fit to 80 columns.

> + unsigned int hash)
> +{
> + return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
> +}
> +
> extern struct inet_bind_bucket *
> inet_bind_bucket_create(struct kmem_cache *cachep,
> struct inet_bind_hashbucket *head,

--
 i.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Jarek Poplawski @ 2007-11-01 16:06 UTC
To: Eric Dumazet
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Hi,

A few doubts below:

Eric Dumazet wrote:
> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
...
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 4427dcd..5cbfbac 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -37,7 +37,6 @@
> * I'll experiment with dynamic table growth later.
> */
> struct inet_ehash_bucket {
> - rwlock_t lock;
> struct hlist_head chain;
> struct hlist_head twchain;
> };
> @@ -91,6 +90,28 @@ struct inet_bind_hashbucket {
> /* This is for listening sockets, thus all sockets which possess wildcards. */
> #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
>
> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)

Probably "|| defined(CONFIG_DEBUG_SPINLOCK)" is needed here.

> +/*
> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
> + * The size of this table is a power of two and depends on the number of CPUs.
> + */
> +# if defined(CONFIG_DEBUG_LOCK_ALLOC)
> +# define EHASH_LOCK_SZ 256
> +# elif NR_CPUS >= 32
> +# define EHASH_LOCK_SZ 4096
> +# elif NR_CPUS >= 16
> +# define EHASH_LOCK_SZ 2048
> +# elif NR_CPUS >= 8
> +# define EHASH_LOCK_SZ 1024
> +# elif NR_CPUS >= 4
> +# define EHASH_LOCK_SZ 512
> +# else
> +# define EHASH_LOCK_SZ 256
> +# endif
> +#else
> +# define EHASH_LOCK_SZ 0
> +#endif
> +

Looks hackish: usually DEBUG code checks the "real" environment, and here
it's a special case. But omitting locks if no SMP or DEBUG is strange.
IMHO, there should be 1 instead of 0.

> struct inet_hashinfo {
> /* This is for sockets with full identity only. Sockets here will
> * always be without wildcards and will have the following invariant:
> @@ -100,6 +121,7 @@ struct inet_hashinfo {
> * TIME_WAIT sockets use a separate chain (twchain).
> */
> struct inet_ehash_bucket *ehash;
> + rwlock_t *ehash_locks;
>
> /* Ok, let's try this, I give up, we do need a local binding
> * TCP hash as well as the others for fast bind/connect.
> @@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
> return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
> }
>
> +static inline rwlock_t *inet_ehash_lockp(
> + struct inet_hashinfo *hashinfo,
> + unsigned int hash)
> +{
> + return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
> +}
> +

Is it OK for EHASH_LOCK_SZ == 0?

...
> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
> index d849739..3b5f97a 100644
> --- a/net/dccp/proto.c
> +++ b/net/dccp/proto.c
> @@ -1072,11 +1072,18 @@ static int __init dccp_init(void)
> }
>
> for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
> - rwlock_init(&dccp_hashinfo.ehash[i].lock);
> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
> }
> -
> + if (EHASH_LOCK_SZ) {

Why not #ifdef then? But, IMHO, rwlock_init() should be done at least
once here. (Similarly later for tcp.)

> + dccp_hashinfo.ehash_locks =
> + kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
> + GFP_KERNEL);
> + if (!dccp_hashinfo.ehash_locks)
> + goto out_free_dccp_ehash;
> + for (i = 0; i < EHASH_LOCK_SZ; i++)
> + rwlock_init(&dccp_hashinfo.ehash_locks[i]);
> + }
> bhash_order = ehash_order;
>
> do {
> @@ -1091,7 +1098,7 @@ static int __init dccp_init(void)
>
> if (!dccp_hashinfo.bhash) {
> DCCP_CRIT("Failed to allocate DCCP bind hash table");
> - goto out_free_dccp_ehash;
> + goto out_free_dccp_locks;
> }
>
> for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
> @@ -1121,6 +1128,9 @@ out_free_dccp_mib:
> out_free_dccp_bhash:
> free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
> dccp_hashinfo.bhash = NULL;
> +out_free_dccp_locks:
> + kfree(dccp_hashinfo.ehash_locks);
> + dccp_hashinfo.ehash_locks = NULL;
> out_free_dccp_ehash:
> free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
> dccp_hashinfo.ehash = NULL;

Isn't such kfree(dccp_hashinfo.ehash_locks) needed in dccp_fini()?

Regards,
Jarek P.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 18:00 UTC
To: Jarek Poplawski
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Jarek Poplawski wrote:
> Hi,
>
> A few doubts below:
>
>>
>> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
>
> Probably "|| defined(CONFIG_DEBUG_SPINLOCK)" is needed here.

Not sure, because DEBUG_SPINLOCK only applies to spinlocks. Here we deal
with rwlocks.

>
>> +/*
>> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
>> + * The size of this table is a power of two and depends on the number of CPUs.
>> + */
>> +# if defined(CONFIG_DEBUG_LOCK_ALLOC)
>> +# define EHASH_LOCK_SZ 256
>> +# elif NR_CPUS >= 32
>> +# define EHASH_LOCK_SZ 4096
>> +# elif NR_CPUS >= 16
>> +# define EHASH_LOCK_SZ 2048
>> +# elif NR_CPUS >= 8
>> +# define EHASH_LOCK_SZ 1024
>> +# elif NR_CPUS >= 4
>> +# define EHASH_LOCK_SZ 512
>> +# else
>> +# define EHASH_LOCK_SZ 256
>> +# endif
>> +#else
>> +# define EHASH_LOCK_SZ 0
>> +#endif
>> +
>
> Looks hackish: usually DEBUG code checks the "real" environment, and here
> it's a special case. But omitting locks if no SMP or DEBUG is strange.
> IMHO, there should be 1 instead of 0.

It is 0 so that no alloc is done. (see your next questions)

>
>> struct inet_hashinfo {
>> /* This is for sockets with full identity only. Sockets here will
>> * always be without wildcards and will have the following invariant:
>> @@ -100,6 +121,7 @@ struct inet_hashinfo {
>> * TIME_WAIT sockets use a separate chain (twchain).
>> */
>> struct inet_ehash_bucket *ehash;
>> + rwlock_t *ehash_locks;
>>
>> /* Ok, let's try this, I give up, we do need a local binding
>> * TCP hash as well as the others for fast bind/connect.
>> @@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
>> return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
>> }
>>
>> +static inline rwlock_t *inet_ehash_lockp(
>> + struct inet_hashinfo *hashinfo,
>> + unsigned int hash)
>> +{
>> + return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
>> +}
>> +
>
> Is it OK for EHASH_LOCK_SZ == 0?

At least, compile tested and booted on UP ;)

>
> ...
>> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
>> index d849739..3b5f97a 100644
>> --- a/net/dccp/proto.c
>> +++ b/net/dccp/proto.c
>> @@ -1072,11 +1072,18 @@ static int __init dccp_init(void)
>> }
>>
>> for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
>> - rwlock_init(&dccp_hashinfo.ehash[i].lock);
>> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
>> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
>> }
>> -
>> + if (EHASH_LOCK_SZ) {
>
> Why not #ifdef then? But, IMHO, rwlock_init() should be done at least
> once here. (Similarly later for tcp.)

Well, #ifdefs are not so nice :)

>
>> + dccp_hashinfo.ehash_locks =
>> + kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
>> + GFP_KERNEL);
>> + if (!dccp_hashinfo.ehash_locks)
>> + goto out_free_dccp_ehash;
>> + for (i = 0; i < EHASH_LOCK_SZ; i++)
>> + rwlock_init(&dccp_hashinfo.ehash_locks[i]);
>> + }
>> bhash_order = ehash_order;
>>
>> do {
>> @@ -1091,7 +1098,7 @@ static int __init dccp_init(void)
>>
>> if (!dccp_hashinfo.bhash) {
>> DCCP_CRIT("Failed to allocate DCCP bind hash table");
>> - goto out_free_dccp_ehash;
>> + goto out_free_dccp_locks;
>> }
>>
>> for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
>> @@ -1121,6 +1128,9 @@ out_free_dccp_mib:
>> out_free_dccp_bhash:
>> free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
>> dccp_hashinfo.bhash = NULL;
>> +out_free_dccp_locks:
>> + kfree(dccp_hashinfo.ehash_locks);
>> + dccp_hashinfo.ehash_locks = NULL;
>> out_free_dccp_ehash:
>> free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
>> dccp_hashinfo.ehash = NULL;
>
> Isn't such kfree(dccp_hashinfo.ehash_locks) needed in dccp_fini()?
>

Probably! Thank you for reviewing!

Eric

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Stephen Hemminger @ 2007-11-01 16:14 UTC
To: Eric Dumazet
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

On Thu, 01 Nov 2007 11:16:20 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
> performance difference. (We hit a different cache line for the rwlock, but
> then the bucket cache line has a better sharing factor among CPUs, since we
> dirty it less often.)
>
> Using a 'small' table of hashed rwlocks should be more than enough to provide
> correct SMP concurrency between different buckets, without using too much
> memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
>
> This patch provides some locking abstraction that may ease future work using
> a different model for the TCP/DCCP table.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>
>  include/net/inet_hashtables.h |   40 ++++++++++++++++++++++++++++----
>  net/dccp/proto.c              |   16 ++++++++++--
>  net/ipv4/inet_diag.c          |    9 ++++---
>  net/ipv4/inet_hashtables.c    |    7 +++--
>  net/ipv4/inet_timewait_sock.c |   13 +++++-----
>  net/ipv4/tcp.c                |   11 +++++++-
>  net/ipv4/tcp_ipv4.c           |   11 ++++----
>  net/ipv6/inet6_hashtables.c   |   19 ++++++++-------
>  8 files changed, 89 insertions(+), 37 deletions(-)
>

Longterm is there any chance of using rcu for this? Seems like
it could be a big win.

--
Stephen Hemminger <shemminger@linux-foundation.org>

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 17:54 UTC
To: Stephen Hemminger
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Stephen Hemminger wrote:
> On Thu, 01 Nov 2007 11:16:20 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>> As done two years ago on the IP route cache table (commit
>> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
>> hash bucket for the huge TCP/DCCP hash tables.
>>
>> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
>> performance difference. (We hit a different cache line for the rwlock, but
>> then the bucket cache line has a better sharing factor among CPUs, since we
>> dirty it less often.)
>>
>> Using a 'small' table of hashed rwlocks should be more than enough to provide
>> correct SMP concurrency between different buckets, without using too much
>> memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
>>
>> This patch provides some locking abstraction that may ease future work using
>> a different model for the TCP/DCCP table.
>>
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>>
>>  include/net/inet_hashtables.h |   40 ++++++++++++++++++++++++++++----
>>  net/dccp/proto.c              |   16 ++++++++++--
>>  net/ipv4/inet_diag.c          |    9 ++++---
>>  net/ipv4/inet_hashtables.c    |    7 +++--
>>  net/ipv4/inet_timewait_sock.c |   13 +++++-----
>>  net/ipv4/tcp.c                |   11 +++++++-
>>  net/ipv4/tcp_ipv4.c           |   11 ++++----
>>  net/ipv6/inet6_hashtables.c   |   19 ++++++++-------
>>  8 files changed, 89 insertions(+), 37 deletions(-)
>>
>
> Longterm is there any chance of using rcu for this? Seems like
> it could be a big win.
>

This was discussed in the past, and I even believe some patch was proposed,
but some guys (including David) complained that RCU is well suited for
'mostly read' structures.

On some web server workloads, the TCP hash table is constantly accessed in
write mode (socket creation, socket move to timewait state, socket
deleted...), and RCU added overhead and poor cache re-use (because sockets
must be placed on an RCU queue before reuse).

On these typical workloads, a hash table without RCU is still the best.

Longterm changes would rather be based on Robert Olsson's suggestion last
year (trie based lookups and a unified IP/TCP cache).

Short term changes would be to be able to resize the TCP hash table (being
small at boot, and being able to grow it if necessary). Its current size on
modern machines is just insane.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Rick Jones @ 2007-11-01 18:48 UTC
To: Eric Dumazet
Cc: Stephen Hemminger, David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Eric Dumazet wrote:
> Stephen Hemminger wrote:
>
>> On Thu, 01 Nov 2007 11:16:20 +0100
>> Eric Dumazet <dada1@cosmosbay.com> wrote:
>>
>>> As done two years ago on the IP route cache table (commit
>>> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one
>>> lock per hash bucket for the huge TCP/DCCP hash tables.

The TCP hashes are looked at with higher frequency than the route cache,
yes?

>>> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for
>>> little performance difference. (We hit a different cache line for the
>>> rwlock, but then the bucket cache line has a better sharing factor
>>> among CPUs, since we dirty it less often.)
>>>
>>> Using a 'small' table of hashed rwlocks should be more than enough to
>>> provide correct SMP concurrency between different buckets, without
>>> using too much memory. Sizing of this table depends on NR_CPUS and
>>> various CONFIG settings.

Something is telling me finding a 64 core system with a suitable
workload to try this could be a good thing. Wish I had one at my disposal.

rick jones

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 19:00 UTC
To: Rick Jones
Cc: Stephen Hemminger, David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Rick Jones wrote:
> Eric Dumazet wrote:
>> Stephen Hemminger wrote:
>>
>>> On Thu, 01 Nov 2007 11:16:20 +0100
>>> Eric Dumazet <dada1@cosmosbay.com> wrote:
>>>
>>>> As done two years ago on the IP route cache table (commit
>>>> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one
>>>> lock per hash bucket for the huge TCP/DCCP hash tables.
>
> The TCP hashes are looked at with higher frequency than the route cache,
> yes?

It depends on the workload, but in general I would say the reverse.

>
>>>> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM,
>>>> for little performance difference. (We hit a different cache line
>>>> for the rwlock, but then the bucket cache line has a better sharing
>>>> factor among CPUs, since we dirty it less often.)
>>>>
>>>> Using a 'small' table of hashed rwlocks should be more than enough
>>>> to provide correct SMP concurrency between different buckets,
>>>> without using too much memory. Sizing of this table depends on
>>>> NR_CPUS and various CONFIG settings.
>
> Something is telling me finding a 64 core system with a suitable
> workload to try this could be a good thing. Wish I had one at my disposal.

If you find one, please give it to me when you've finished
playing^Wworking with it :)

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 19:17 UTC
To: Rick Jones
Cc: Stephen Hemminger, David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

> Rick Jones wrote:
> Something is telling me finding a 64 core system with a suitable
> workload to try this could be a good thing. Wish I had one at my
> disposal.

Maybe on big NUMA machines, we might prefer to spread the rwlock array
over multiple nodes (i.e. using vmalloc() instead of kmalloc()).

After my patch, it is true that all rwlocks are on one memory node. Not
exactly optimal :(

But if it were an issue, the spinlock array used in the IP route cache
would have the same problem, and *someone* would already have
complained...

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-01 21:52 UTC
To: dada1; +Cc: rick.jones2, shemminger, netdev, ak, acme

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 01 Nov 2007 20:17:35 +0100

> But if it were an issue, the spinlock array used in the IP route cache
> would have the same problem, and *someone* would already have
> complained...

I'd say that having a few MB less of rwlock cache lines to touch
offsets the issue even if it does exist.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-01 21:46 UTC
To: dada1; +Cc: shemminger, netdev, ak, acme

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 01 Nov 2007 18:54:24 +0100

> Stephen Hemminger wrote:
> > Longterm is there any chance of using rcu for this? Seems like
> > it could be a big win.
>
> This was discussed in the past, and I even believe some patch was proposed,
> but some guys (including David) complained that RCU is well suited for
> 'mostly read' structures.
>
> On some web server workloads, the TCP hash table is constantly accessed in
> write mode (socket creation, socket move to timewait state, socket
> deleted...), and RCU added overhead and poor cache re-use (because sockets
> must be placed on an RCU queue before reuse).
>
> On these typical workloads, a hash table without RCU is still the best.

Right, and none of the submitted RCU attempts were correct; it's
very hard to get the synchronization right.

> Longterm changes would rather be based on Robert Olsson's suggestion last
> year (trie based lookups and a unified IP/TCP cache).
>
> Short term changes would be to be able to resize the TCP hash table (being
> small at boot, and being able to grow it if necessary). Its current size on
> modern machines is just insane.

Resizing the hash is, unfortunately, very hard to implement as well.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Andi Kleen @ 2007-11-03 23:18 UTC
To: Eric Dumazet; +Cc: David S. Miller, Linux Netdev List,
	Arnaldo Carvalho de Melo

On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote:

Looks good from a quick look. Thanks for doing that work.

Some quick comments:

> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
> +/*
> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
> + * The size of this table is a power of two and depends on the number of CPUs.
> + */

This shouldn't be hard-coded based on NR_CPUS, but be done at runtime
based on num_possible_cpus(). This is better for kernels with a large
NR_CPUS, but which typically run on much smaller systems (like
distribution kernels).

Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
you add that?

And as an unrelated note, have you tried converting the rwlocks
into normal spinlocks? Spinlocks should be somewhat cheaper
because they have less cache protocol overhead, and with
the huge thash tables in Linux the chain walks should be short
anyway, so not doing this in parallel is probably not a big issue.

At some point I also had a crazy idea of using a special locking
scheme that special-cases the common case that a hash chain
has only one member and doesn't take a lock for that at all.

-Andi

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-03 23:23 UTC
To: ak; +Cc: dada1, netdev, acme

From: Andi Kleen <ak@suse.de>
Date: Sun, 4 Nov 2007 00:18:14 +0100

> On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote:
>
> Some quick comments:
>
> > +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
> > +/*
> > + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
> > + * The size of this table is a power of two and depends on the number of CPUs.
> > + */
>
> This shouldn't be hard-coded based on NR_CPUS, but be done at runtime
> based on num_possible_cpus(). This is better for kernels with a large
> NR_CPUS, but which typically run on much smaller systems (like
> distribution kernels).

I think this is a good idea. Eric, could you make this change?

> Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
> you add that?

He explained this in another reply, because ifdefs are ugly.

> And as an unrelated note, have you tried converting the rwlocks
> into normal spinlocks? Spinlocks should be somewhat cheaper
> because they have less cache protocol overhead, and with
> the huge thash tables in Linux the chain walks should be short
> anyway, so not doing this in parallel is probably not a big issue.
> At some point I also had a crazy idea of using a special locking
> scheme that special-cases the common case that a hash chain
> has only one member and doesn't take a lock for that at all.

I agree.

There was movement at one point to get rid of all rwlocks in the
kernel; I personally think they are pointless. Any use that makes
"sense" is a case where the code should be rewritten to decrease the
lock hold time or convert to RCU.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Andi Kleen @ 2007-11-04 0:54 UTC
To: David Miller; +Cc: dada1, netdev, acme

> > Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
> > you add that?
>
> He explained this in another reply, because ifdefs are ugly.

I meant: why have it at all?

> Any use that makes
> "sense" is a case where the code should be rewritten to decrease the
> lock hold time or convert to RCU.

I don't think RCU would really be needed for single-entry buckets (which
are common) if they were special-cased. After all it would be just a
single pointer. Perhaps one could distinguish this case e.g. by setting
the low-order bit of the hash bucket pointer.

In fact, to optimize for this case it might be an interesting experiment
to go towards a closed hash table and allocate the sockets in page-sized
objects and then remap them directly into a virtually contiguous table,
similar to the new vmemmap code (need to try this at some point).

-Andi
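
For concreteness, here is one shape the low-order-bit idea above could take
(a hypothetical sketch, not code from this thread or from any kernel tree;
the names are invented, and the hard part, making transitions between the
'single' and 'chained' states safe against concurrent lookups, is left out):

struct sock;	/* opaque for the sketch */

/*
 * A bucket head encodes two cases in one word:
 *  - low bit set: exactly one socket in the bucket, stored as (sk | 1),
 *    which a lookup could read without taking the bucket lock;
 *  - low bit clear: NULL (empty) or the head of a chain that must be
 *    walked under the bucket lock.
 */
static inline int bucket_is_single(unsigned long head)
{
	return head & 1UL;
}

static inline struct sock *bucket_single_sk(unsigned long head)
{
	return (struct sock *)(head & ~1UL);
}

static inline unsigned long bucket_make_single(struct sock *sk)
{
	return (unsigned long)sk | 1UL;
}

This relies on struct sock being at least 2-byte aligned, so that bit 0 of
a valid pointer is always zero, which any real allocator guarantees.
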
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table 2007-11-03 23:23 ` David Miller 2007-11-04 0:54 ` Andi Kleen @ 2007-11-04 11:31 ` Eric Dumazet 2007-11-04 12:26 ` Andi Kleen ` (2 more replies) 1 sibling, 3 replies; 29+ messages in thread From: Eric Dumazet @ 2007-11-04 11:31 UTC (permalink / raw) To: David Miller; +Cc: ak, netdev, acme, Jarek Poplawski [-- Attachment #1: Type: text/plain, Size: 3657 bytes --] David Miller a écrit : > From: Andi Kleen <ak@suse.de> > Date: Sun, 4 Nov 2007 00:18:14 +0100 > >> On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote: >> >> Some quick comments: >> >>> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING) >>> +/* >>> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks >>> + * The size of this table is a power of two and depends on the number of CPUS. >>> + */ >> This shouldn't be hard coded based on NR_CPUS, but be done on runtime >> based on num_possible_cpus(). This is better for kernels with a large >> NR_CPUS, but which typically run on much smaller systems (like >> distribution kernels) > > I think this is a good idea. Eric, could you make this change? Yes of course, since using a non constant value for masking is cheap. But I suspect distributions kernels enable CONFIG_HOTPLUG_CPU so num_possible_cpus() will be NR_CPUS. > >> Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did >> you add that? > > He explained this in another reply, because ifdefs are ugly. This will vanish if done on runtime anyway. > >> And as a unrelated node have you tried converting the rwlocks >> into normal spinlocks? spinlocks should be somewhat cheaper >> because they have less cache protocol overhead and with >> the huge thash tables in Linux the chain walks should be short >> anyways so not doing this in parallel is probably not a big issue. >> At some point I also had a crazy idea of using a special locking >> scheme that special cases the common case that a hash chain >> has only one member and doesn't take a look for that at all. > > I agree. > > There was movement at one point to get rid of all rwlock's in the > kernel, I personally think they are pointless. Any use that makes > "sense" is a case where the code should be rewritten to decrease the > lock hold time or convert to RCU. > I agree too, rwlocks are more expensive when contention is low, so let do this rwlock->spinlock change on next step (separate patch), because it means changing also lhash_lock. Thanks to Jarek, I added locks cleanup in dccp_fini() [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table As done two years ago on IP route cache table (commit 22c047ccbc68fa8f3fa57f0e8f906479a062c426) , we can avoid using one lock per hash bucket for the huge TCP/DCCP hash tables. On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle performance differences. (we hit a different cache line for the rwlock, but then the bucket cache line have a better sharing factor among cpus, since we dirty it less often). For netstat or ss commands that want a full scan of hash table, we perform fewer memory accesses. Using a 'small' table of hashed rwlocks should be more than enough to provide correct SMP concurrency between different buckets, without using too much memory. Sizing of this table depends on num_possible_cpus() and various CONFIG settings. This patch provides some locking abstraction that may ease a future work using a different model for TCP/DCCP table. 
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com> include/net/inet_hashtables.h | 71 +++++++++++++++++++++++++++++--- net/dccp/proto.c | 9 +++- net/ipv4/inet_diag.c | 9 ++-- net/ipv4/inet_hashtables.c | 7 +-- net/ipv4/inet_timewait_sock.c | 13 +++-- net/ipv4/tcp.c | 4 - net/ipv4/tcp_ipv4.c | 11 ++-- net/ipv6/inet6_hashtables.c | 19 ++++---- 8 files changed, 106 insertions(+), 37 deletions(-) [-- Attachment #2: tcp_ehash_locks.patch --] [-- Type: text/plain, Size: 14142 bytes --] diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4427dcd..8461cda 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -37,7 +37,6 @@ * I'll experiment with dynamic table growth later. */ struct inet_ehash_bucket { - rwlock_t lock; struct hlist_head chain; struct hlist_head twchain; }; @@ -100,6 +99,9 @@ struct inet_hashinfo { * TIME_WAIT sockets use a separate chain (twchain). */ struct inet_ehash_bucket *ehash; + rwlock_t *ehash_locks; + unsigned int ehash_size; + unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. @@ -107,7 +109,7 @@ struct inet_hashinfo { struct inet_bind_hashbucket *bhash; unsigned int bhash_size; - unsigned int ehash_size; + /* Note : 4 bytes padding on 64 bit arches */ /* All sockets in TCP_LISTEN state will be in here. This is the only * table where wildcard'd TCP sockets can exist. Hash function here @@ -134,6 +136,62 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket( return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; } +static inline rwlock_t *inet_ehash_lockp( + struct inet_hashinfo *hashinfo, + unsigned int hash) +{ + return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; +} + +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, size = 256; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + if (nr_pcpus >= 4) + size = 512; + if (nr_pcpus >= 8) + size = 1024; + if (nr_pcpus >= 16) + size = 2048; + if (nr_pcpus >= 32) + size = 4096; + if (sizeof(rwlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(rwlock_t) > PAGE_SIZE) + hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t)); + else +#endif + hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t), + GFP_KERNEL); + if (!hashinfo->ehash_locks) + return ENOMEM; + for (i = 0; i < size; i++) + rwlock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = size - 1; + return 0; +} + +static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) +{ + if (hashinfo->ehash_locks) { +#ifdef CONFIG_NUMA + unsigned int size = (hashinfo->ehash_locks_mask + 1) * + sizeof(rwlock_t); + if (size > PAGE_SIZE) + vfree(hashinfo->ehash_locks); + else +#else + kfree(hashinfo->ehash_locks); +#endif + hashinfo->ehash_locks = NULL; + } +} + extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, @@ -222,7 +280,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo, sk->sk_hash = inet_sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; - lock = &head->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock(lock); } __sk_add_node(sk, list); @@ -253,7 +311,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) 
 		inet_listen_wlock(hashinfo);
 		lock = &hashinfo->lhash_lock;
 	} else {
-		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 		write_lock_bh(lock);
 	}
@@ -354,9 +412,10 @@ static inline struct sock *
 	 */
 	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

 	prefetch(head->chain.first);
-	read_lock(&head->lock);
+	read_lock(lock);
 	sk_for_each(sk, node, &head->chain) {
 		if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
 			goto hit; /* You sunk my battleship! */
@@ -369,7 +428,7 @@ static inline struct sock *
 	}
 	sk = NULL;
out:
-	read_unlock(&head->lock);
+	read_unlock(lock);
 	return sk;
hit:
 	sock_hold(sk);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d849739..7a3bea9 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1072,11 +1072,13 @@ static int __init dccp_init(void)
 	}

 	for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
-		rwlock_init(&dccp_hashinfo.ehash[i].lock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
 		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
 	}

+	if (inet_ehash_locks_alloc(&dccp_hashinfo))
+		goto out_free_dccp_ehash;
+
 	bhash_order = ehash_order;

 	do {
@@ -1091,7 +1093,7 @@ static int __init dccp_init(void)

 	if (!dccp_hashinfo.bhash) {
 		DCCP_CRIT("Failed to allocate DCCP bind hash table");
-		goto out_free_dccp_ehash;
+		goto out_free_dccp_locks;
 	}

 	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -1121,6 +1123,8 @@ out_free_dccp_mib:
 out_free_dccp_bhash:
 	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
 	dccp_hashinfo.bhash = NULL;
+out_free_dccp_locks:
+	inet_ehash_locks_free(&dccp_hashinfo);
 out_free_dccp_ehash:
 	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
 	dccp_hashinfo.ehash = NULL;
@@ -1139,6 +1143,7 @@ static void __exit dccp_fini(void)
 	free_pages((unsigned long)dccp_hashinfo.ehash,
 		   get_order(dccp_hashinfo.ehash_size *
 			     sizeof(struct inet_ehash_bucket)));
+	inet_ehash_locks_free(&dccp_hashinfo);
 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 	dccp_ackvec_exit();
 	dccp_sysctl_exit();
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index dc429b6..b017073 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -747,13 +747,14 @@ skip_listen_ht:

 	for (i = s_i; i < hashinfo->ehash_size; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct sock *sk;
 		struct hlist_node *node;

 		if (i > s_i)
 			s_num = 0;

-		read_lock_bh(&head->lock);
+		read_lock_bh(lock);
 		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ skip_listen_ht:
 			    r->id.idiag_dport)
 				goto next_normal;
 			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-				read_unlock_bh(&head->lock);
+				read_unlock_bh(lock);
 				goto done;
 			}
next_normal:
@@ -791,14 +792,14 @@ next_normal:
 				    r->id.idiag_dport)
 					goto next_dying;
 				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-					read_unlock_bh(&head->lock);
+					read_unlock_bh(lock);
 					goto done;
 				}
next_dying:
 				++num;
 			}
 		}
-		read_unlock_bh(&head->lock);
+		read_unlock_bh(lock);
 	}
done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 16eecc7..67704da 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
 	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_node *node;
 	struct inet_timewait_sock *tw;

 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(lock);

 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ unique:
 	BUG_TRAP(sk_unhashed(sk));
 	__sk_add_node(sk, &head->chain);
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	write_unlock(lock);

 	if (twp) {
 		*twp = tw;
@@ -255,7 +256,7 @@ unique:
 	return 0;

not_unique:
-	write_unlock(&head->lock);
+	write_unlock(lock);
 	return -EADDRNOTAVAIL;
 }
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e2..a60b99e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	struct inet_bind_hashbucket *bhead;
 	struct inet_bind_bucket *tb;
 	/* Unlink from established hashes. */
-	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);

-	write_lock(&ehead->lock);
+	write_lock(lock);
 	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(&ehead->lock);
+		write_unlock(lock);
 		return;
 	}
 	__hlist_del(&tw->tw_node);
 	sk_node_init(&tw->tw_node);
-	write_unlock(&ehead->lock);
+	write_unlock(lock);

 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	const struct inet_sock *inet = inet_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	struct inet_bind_hashbucket *bhead;
 	/* Step 1: Put TW into bind hash. Original socket stays there too.
 	   Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
 	spin_unlock(&bhead->lock);

-	write_lock(&ehead->lock);
+	write_lock(lock);

 	/* Step 2: Remove SK from established hash. */
 	if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_node(tw, &ehead->twchain);
 	atomic_inc(&tw->tw_refcnt);

-	write_unlock(&ehead->lock);
+	write_unlock(lock);
 }

 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c64072b..8e65182 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2456,11 +2456,11 @@ void __init tcp_init(void)
 					thash_entries ? 0 : 512 * 1024);
 	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
 	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
-		rwlock_init(&tcp_hashinfo.ehash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
 	}
-
+	if (inet_ehash_locks_alloc(&tcp_hashinfo))
+		panic("TCP: failed to alloc ehash_locks");
 	tcp_hashinfo.bhash =
 		alloc_large_system_hash("TCP bind",
 					sizeof(struct inet_bind_hashbucket),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eec02b2..cd82c0e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq)
 		struct sock *sk;
 		struct hlist_node *node;
 		struct inet_timewait_sock *tw;
+		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

-		read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_lock_bh(lock);
 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
 				continue;
@@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq)
 			rc = tw;
 			goto out;
 		}
-		read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_unlock_bh(lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
out:
@@ -2094,11 +2095,11 @@ get_tw:
 			cur = tw;
 			goto out;
 		}
-		read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		st->state = TCP_SEQ_STATE_ESTABLISHED;

 		if (++st->bucket < tcp_hashinfo.ehash_size) {
-			read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
 		} else {
 			cur = NULL;
@@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
-			read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		break;
 	}
 }
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d6f1026..adc73ad 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -37,9 +37,8 @@ void __inet6_hash(struct inet_hashinfo *hashinfo,
 	} else {
 		unsigned int hash;
 		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
-		hash &= (hashinfo->ehash_size - 1);
-		list = &hashinfo->ehash[hash].chain;
-		lock = &hashinfo->ehash[hash].lock;
+		list = &inet_ehash_bucket(hashinfo, hash)->chain;
+		lock = inet_ehash_lockp(hashinfo, hash);
 		write_lock(lock);
 	}
@@ -70,9 +69,10 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
 	 */
 	unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

 	prefetch(head->chain.first);
-	read_lock(&head->lock);
+	read_lock(lock);
 	sk_for_each(sk, node, &head->chain) {
 		/* For IPV6 do the cheaper port and family tests first. */
 		if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -92,12 +92,12 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
 			goto hit;
 		}
 	}
-	read_unlock(&head->lock);
+	read_unlock(lock);
 	return NULL;

hit:
 	sock_hold(sk);
-	read_unlock(&head->lock);
+	read_unlock(lock);
 	return sk;
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
@@ -175,12 +175,13 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	const unsigned int hash = inet6_ehashfn(daddr, lport, saddr,
 						inet->dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_node *node;
 	struct inet_timewait_sock *tw;

 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(lock);

 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &head->twchain) {
@@ -216,7 +217,7 @@ unique:
 	__sk_add_node(sk, &head->chain);
 	sk->sk_hash = hash;
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	write_unlock(lock);

 	if (twp != NULL) {
 		*twp = tw;
@@ -231,7 +232,7 @@ unique:
 	return 0;

not_unique:
-	write_unlock(&head->lock);
+	write_unlock(lock);
 	return -EADDRNOTAVAIL;
 }

^ permalink raw reply related	[flat|nested] 29+ messages in thread
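The change above is a straightforward application of lock striping: a very large bucket table shares a much smaller, power-of-two array of locks, and the same hash value that selects a bucket also selects the lock guarding it. Below is a self-contained user-space sketch of the idiom, using pthreads rwlocks as a stand-in for the kernel's rwlock_t; the names and sizes are illustrative only, not taken from the patch.

#include <pthread.h>

#define NBUCKETS (1 << 20)	/* huge bucket table, as for the TCP ehash */
#define NLOCKS 256		/* small lock stripe; must be a power of two */

struct bucket {
	void *chain;		/* head of this bucket's collision chain */
};

static struct bucket table[NBUCKETS];
static pthread_rwlock_t locks[NLOCKS];

/* Run once at startup, analogous to inet_ehash_locks_alloc(). */
static void locks_init(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_rwlock_init(&locks[i], NULL);
}

/* The same hash picks both; many buckets map onto one shared lock. */
static struct bucket *bucket_of(unsigned int hash)
{
	return &table[hash & (NBUCKETS - 1)];
}

static pthread_rwlock_t *lock_of(unsigned int hash)
{
	return &locks[hash & (NLOCKS - 1)];
}

/* Reader side, shaped like __inet_lookup_established(). */
static void *lookup(unsigned int hash)
{
	pthread_rwlock_t *lock = lock_of(hash);
	void *item;

	pthread_rwlock_rdlock(lock);
	item = bucket_of(hash)->chain;	/* a real table would walk the chain */
	pthread_rwlock_unlock(lock);
	return item;
}

Correctness only requires that a given bucket always maps to the same lock, which holds because both masks are applied to the same hash value; the size of the stripe merely bounds how much writer parallelism is possible.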
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 11:31 ` Eric Dumazet
@ 2007-11-04 12:26 ` Andi Kleen
2007-11-04 13:05 ` Eric Dumazet
2007-11-04 21:56 ` David Miller
2007-11-04 17:58 ` Jarek Poplawski
2007-11-07 10:41 ` David Miller
2 siblings, 2 replies; 29+ messages in thread
From: Andi Kleen @ 2007-11-04 12:26 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, acme, Jarek Poplawski

> But I suspect distribution kernels enable CONFIG_HOTPLUG_CPU so
> num_possible_cpus() will be NR_CPUS.

Nope, on x86 num_possible_cpus() is derived from BIOS tables these days.

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 12:26 ` Andi Kleen
@ 2007-11-04 13:05 ` Eric Dumazet
2007-11-04 21:56 ` David Miller
1 sibling, 0 replies; 29+ messages in thread
From: Eric Dumazet @ 2007-11-04 13:05 UTC (permalink / raw)
To: Andi Kleen; +Cc: David Miller, netdev, acme, Jarek Poplawski

Andi Kleen a écrit :
>> But I suspect distribution kernels enable CONFIG_HOTPLUG_CPU so
>> num_possible_cpus() will be NR_CPUS.
>
> Nope, on x86 num_possible_cpus() is derived from BIOS tables these days.

Good to know, thank you, Andi, for this clarification.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 12:26 ` Andi Kleen
2007-11-04 13:05 ` Eric Dumazet
@ 2007-11-04 21:56 ` David Miller
2007-11-04 23:01 ` Andi Kleen
1 sibling, 1 reply; 29+ messages in thread
From: David Miller @ 2007-11-04 21:56 UTC (permalink / raw)
To: ak; +Cc: dada1, netdev, acme, jarkao2

From: Andi Kleen <ak@suse.de>
Date: Sun, 4 Nov 2007 13:26:38 +0100

> > But I suspect distribution kernels enable CONFIG_HOTPLUG_CPU so
> > num_possible_cpus() will be NR_CPUS.
>
> Nope, on x86 num_possible_cpus() is derived from BIOS tables these days.

And similarly on SPARC64 it will be set based upon the physical
capabilities of the system.

This makes a huge difference, as we have to set NR_CPUS to 4096
in order to handle the cpu numbering of some UltraSPARC-IV machines.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 21:56 ` David Miller
@ 2007-11-04 23:01 ` Andi Kleen
2007-11-05 4:24 ` David Miller
0 siblings, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2007-11-04 23:01 UTC (permalink / raw)
To: David Miller; +Cc: dada1, netdev, acme, jarkao2

On Sunday 04 November 2007 22:56:21 David Miller wrote:
> From: Andi Kleen <ak@suse.de>
>
> This makes a huge difference, as we have to set NR_CPUS to 4096
> in order to handle the cpu numbering of some UltraSPARC-IV
> machines.

Really? Hopefully you have a large enough stack then. There are various
users who put char str[NR_CPUS] on the stack, and a few other data
structures also get incredibly big with NR_CPUS arrays.

If it's for sparse cpu ids -- x86 handles those with a translation
array.

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread
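(For context: the translation array Andi mentions is, in outline, a small map from sparse hardware CPU identifiers to dense logical indices, so per-cpu data can be sized by the CPUs actually present rather than by the largest possible hardware id. A rough sketch of the idea follows, with hypothetical names; this is not the actual x86 code.)

#define MAX_HW_ID 4096		/* largest hardware id, e.g. an APIC id */
#define MAX_PRESENT_CPUS 64	/* CPUs actually found at boot */

static int hw_to_logical[MAX_HW_ID];		/* sparse id -> dense index */
static long per_cpu_counter[MAX_PRESENT_CPUS];	/* sized by present CPUs */
static int nr_present;

/* Boot-time enumeration registers each detected CPU once. */
static int register_cpu(int hw_id)
{
	hw_to_logical[hw_id] = nr_present;
	return nr_present++;
}

/* Later lookups index per-cpu data through the translation, not by hw_id. */
static long *counter_for(int hw_id)
{
	return &per_cpu_counter[hw_to_logical[hw_id]];
}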
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 23:01 ` Andi Kleen
@ 2007-11-05 4:24 ` David Miller
2007-11-05 4:35 ` David Miller
0 siblings, 1 reply; 29+ messages in thread
From: David Miller @ 2007-11-05 4:24 UTC (permalink / raw)
To: ak; +Cc: dada1, netdev, acme, jarkao2

From: Andi Kleen <ak@suse.de>
Date: Mon, 5 Nov 2007 00:01:03 +0100

> On Sunday 04 November 2007 22:56:21 David Miller wrote:
> > From: Andi Kleen <ak@suse.de>
> >
> > This makes a huge difference, as we have to set NR_CPUS to 4096
> > in order to handle the cpu numbering of some UltraSPARC-IV
> > machines.
>
> Really? Hopefully you have a large enough stack then. There
> are various users who put char str[NR_CPUS] on the stack,
> and a few other data structures also get incredibly big with
> NR_CPUS arrays.

For the stack case there is one debugging case, and that's for
sprintf'ing cpusets. That could be easily eliminated.

> If it's for sparse cpu ids -- x86 handles those with a
> translation array.

I would rather not do this; so much assembler code indexes straight
into the per-cpu arrays.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-05 4:24 ` David Miller
@ 2007-11-05 4:35 ` David Miller
0 siblings, 0 replies; 29+ messages in thread
From: David Miller @ 2007-11-05 4:35 UTC (permalink / raw)
To: ak; +Cc: dada1, netdev, acme, jarkao2

From: David Miller <davem@davemloft.net>
Date: Sun, 04 Nov 2007 20:24:54 -0800 (PST)

> From: Andi Kleen <ak@suse.de>
> Date: Mon, 5 Nov 2007 00:01:03 +0100
>
> > If it's for sparse cpu ids -- x86 handles those with a
> > translation array.
>
> I would rather not do this; so much assembler code indexes straight
> into the per-cpu arrays.

Also, at current rates, I'll need to be able to support 4096 cpus for
real not very long from now :-)

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 11:31 ` Eric Dumazet
2007-11-04 12:26 ` Andi Kleen
@ 2007-11-04 17:58 ` Jarek Poplawski
2007-11-04 18:15 ` Jarek Poplawski
2007-11-07 10:41 ` David Miller
2 siblings, 1 reply; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-04 17:58 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, ak, netdev, acme

Eric Dumazet wrote, On 11/04/2007 12:31 PM:

> David Miller a écrit :
>> From: Andi Kleen <ak@suse.de>
>> Date: Sun, 4 Nov 2007 00:18:14 +0100
>>
>>> On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote:

...

>>> Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
>>> you add that?
>> He explained this in another reply, because ifdefs are ugly.

But I hope he was only joking, wasn't he? Let's make it clear: ifdefs
are in K&R, so they are very nice! Just like all C! (K, &, and R as
well.) You know, I can even imagine there are people who keep K&R by
their beds instead of some other book, so they could be serious about
such things. (But don't worry, it's not me - happily, I'm not serious!)

This patch looks OK now, but a bit of grumbling shouldn't harm?:

...

> [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
>
> As done two years ago on IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle

- litle
+ little

...

> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
> +{
> +	unsigned int i, size = 256;
> +#if defined(CONFIG_PROVE_LOCKING)
> +	unsigned int nr_pcpus = 2;
> +#else
> +	unsigned int nr_pcpus = num_possible_cpus();
> +#endif
> +	if (nr_pcpus >= 4)
> +		size = 512;
> +	if (nr_pcpus >= 8)
> +		size = 1024;
> +	if (nr_pcpus >= 16)
> +		size = 2048;
> +	if (nr_pcpus >= 32)
> +		size = 4096;

It seems, maybe in the future, this could look a bit nicer with some
log type shifting.

> +	if (sizeof(rwlock_t) != 0) {
> +#ifdef CONFIG_NUMA
> +		if (size * sizeof(rwlock_t) > PAGE_SIZE)
> +			hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t));
> +		else
> +#endif
> +			hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t),
> +							GFP_KERNEL);
> +		if (!hashinfo->ehash_locks)
> +			return ENOMEM;

Probably doesn't matter now, but maybe more common?:

			return -ENOMEM;

> +		for (i = 0; i < size; i++)
> +			rwlock_init(&hashinfo->ehash_locks[i]);

This looks better now, but is still doubtful to me: even if it's safe
with the current rwlock implementation, can't we imagine some new
debugging or statistical code being added, which would be called from
rwlock_init() without using the rwlock_t structure? IMHO, if
read_lock() etc. are called in such a case, rwlock_init() should be
done as well.

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
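(For what it's worth, the "log type shifting" Jarek alludes to could look like the sketch below, which computes exactly the same 256/512/1024/2048/4096 progression as the cascade of tests quoted above for every possible nr_pcpus value. This is only an illustration of the suggestion, not code from the thread.)

static unsigned int ehash_locks_size(unsigned int nr_pcpus)
{
	unsigned int size = 256;

	if (nr_pcpus > 32)
		nr_pcpus = 32;	/* same cap as the explicit cascade */

	/* one doubling of the lock table per doubling of CPUs past 2 */
	while (nr_pcpus >= 4) {
		size <<= 1;
		nr_pcpus >>= 1;
	}
	return size;
}

For any nr_pcpus the loop returns what the if chain returns, since both effectively round the CPU count down to a power of two before scaling the table size.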
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 17:58 ` Jarek Poplawski
@ 2007-11-04 18:15 ` Jarek Poplawski
2007-11-04 21:23 ` Eric Dumazet
0 siblings, 1 reply; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-04 18:15 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Eric Dumazet, David Miller, ak, netdev, acme

Jarek Poplawski wrote, On 11/04/2007 06:58 PM:

> Eric Dumazet wrote, On 11/04/2007 12:31 PM:

...

>> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
>> +{

...

>> +	if (sizeof(rwlock_t) != 0) {

...

>> +		for (i = 0; i < size; i++)
>> +			rwlock_init(&hashinfo->ehash_locks[i]);
>
> This looks better now, but is still doubtful to me: even if it's safe
> with the current rwlock implementation, can't we imagine some new
> debugging or statistical code being added, which would be called from
> rwlock_init() without using the rwlock_t structure? IMHO, if
> read_lock() etc. are called in such a case, rwlock_init() should be
> done as well.

Of course, I mean: if sizeof(rwlock_t) == 0.

Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 18:15 ` Jarek Poplawski
@ 2007-11-04 21:23 ` Eric Dumazet
2007-11-04 23:08 ` Jarek Poplawski
0 siblings, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2007-11-04 21:23 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: David Miller, ak, netdev, acme

Jarek Poplawski a écrit :
> Jarek Poplawski wrote, On 11/04/2007 06:58 PM:
>
>> Eric Dumazet wrote, On 11/04/2007 12:31 PM:
>
> ...
>
>>> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
>>> +{
>
> ...
>
>>> +	if (sizeof(rwlock_t) != 0) {
>
> ...
>
>>> +		for (i = 0; i < size; i++)
>>> +			rwlock_init(&hashinfo->ehash_locks[i]);
>>
>> This looks better now, but is still doubtful to me: even if it's safe
>> with the current rwlock implementation, can't we imagine some new
>> debugging or statistical code being added, which would be called from
>> rwlock_init() without using the rwlock_t structure? IMHO, if
>> read_lock() etc. are called in such a case, rwlock_init() should be
>> done as well.
>
> Of course, I mean: if sizeof(rwlock_t) == 0.

Given these two choices:

#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
	kmalloc(sizeof(rwlock_t) * size);
#endif

and

	if (sizeof(rwlock_t) != 0) {
		kmalloc(sizeof(rwlock_t) * size);
	}

I prefer the second one: it is less error prone, and there is no need
to remember how the gazillions of CONFIG_something options we have are
spelled.

^ permalink raw reply	[flat|nested] 29+ messages in thread
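(The trick behind the second form is worth spelling out: with CONFIG_SMP and lock debugging both disabled, rwlock_t has no members, and gcc, as a C extension the kernel relies on, gives such empty structures size zero. The test is then a compile-time constant false and the whole allocation branch is discarded. A contrived user-space illustration of the mechanism, assuming the gcc empty-struct extension:)

#include <stdio.h>

struct up_lock { };			/* like UP rwlock_t: sizeof is 0 under gcc */
struct smp_lock { volatile int cnt; };	/* like a real SMP rwlock */

int main(void)
{
	/* sizeof is a constant expression, so each branch is resolved at
	 * compile time and the dead one generates no object code at all. */
	if (sizeof(struct up_lock) != 0)
		printf("UP build: would allocate a lock table\n");	/* elided */

	if (sizeof(struct smp_lock) != 0)
		printf("SMP build: %zu bytes per lock\n",
		       sizeof(struct smp_lock));
	return 0;
}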
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 21:23 ` Eric Dumazet
@ 2007-11-04 23:08 ` Jarek Poplawski
0 siblings, 0 replies; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-04 23:08 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, ak, netdev, acme

Eric Dumazet wrote, On 11/04/2007 10:23 PM:

> Jarek Poplawski a écrit :
>> Jarek Poplawski wrote, On 11/04/2007 06:58 PM:
>>
>>> Eric Dumazet wrote, On 11/04/2007 12:31 PM:
>>
>> ...
>>
>>>> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
>>>> +{
>>
>> ...
>>
>>>> +	if (sizeof(rwlock_t) != 0) {
>>
>> ...
>>
>>>> +		for (i = 0; i < size; i++)
>>>> +			rwlock_init(&hashinfo->ehash_locks[i]);
>>> This looks better now, but is still doubtful to me: even if it's safe
>>> with the current rwlock implementation, can't we imagine some new
>>> debugging or statistical code being added, which would be called from
>>> rwlock_init() without using the rwlock_t structure? IMHO, if
>>> read_lock() etc. are called in such a case, rwlock_init() should be
>>> done as well.
>>
>> Of course, I mean: if sizeof(rwlock_t) == 0.
>
> Given these two choices:
>
> #if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
> 	kmalloc(sizeof(rwlock_t) * size);
> #endif
>
> and
>
> 	if (sizeof(rwlock_t) != 0) {
> 		kmalloc(sizeof(rwlock_t) * size);
> 	}
>
> I prefer the second one: it is less error prone, and there is no need
> to remember how the gazillions of CONFIG_something options we have are
> spelled.

I've written that it's better, too. But this could still be improved
(someday), I hope.

Thanks,
Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 11:31 ` Eric Dumazet
2007-11-04 12:26 ` Andi Kleen
2007-11-04 17:58 ` Jarek Poplawski
@ 2007-11-07 10:41 ` David Miller
2007-11-07 12:13 ` Jarek Poplawski
2 siblings, 1 reply; 29+ messages in thread
From: David Miller @ 2007-11-07 10:41 UTC (permalink / raw)
To: dada1; +Cc: ak, netdev, acme, jarkao2

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 04 Nov 2007 12:31:28 +0100

> [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
>
> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for
> little performance difference. (We hit a different cache line for the
> rwlock, but then the bucket cache line has a better sharing factor among
> cpus, since we dirty it less often.) For netstat or ss commands that want
> a full scan of the hash table, we perform fewer memory accesses.
>
> Using a 'small' table of hashed rwlocks should be more than enough to
> provide correct SMP concurrency between different buckets, without using
> too much memory. Sizing of this table depends on num_possible_cpus() and
> various CONFIG settings.
>
> This patch provides some locking abstraction that may ease future work
> using a different model for the TCP/DCCP table.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>

I'm going to push this current version to Linus; the space savings
really justify it, and if we want to refine things further, we can do
that with follow-on work rather than blocking this patch.

Thanks Eric!

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-07 10:41 ` David Miller
@ 2007-11-07 12:13 ` Jarek Poplawski
0 siblings, 0 replies; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-07 12:13 UTC (permalink / raw)
To: David Miller; +Cc: dada1, ak, netdev, acme

On Wed, Nov 07, 2007 at 02:41:14AM -0800, David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Sun, 04 Nov 2007 12:31:28 +0100
>
> > [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
> >
> > As done two years ago on the IP route cache table (commit
> > 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> > hash bucket for the huge TCP/DCCP hash tables.
> >
> > On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for
> > little performance difference. (We hit a different cache line for the
> > rwlock, but then the bucket cache line has a better sharing factor among
> > cpus, since we dirty it less often.) For netstat or ss commands that want
> > a full scan of the hash table, we perform fewer memory accesses.
> >
> > Using a 'small' table of hashed rwlocks should be more than enough to
> > provide correct SMP concurrency between different buckets, without using
> > too much memory. Sizing of this table depends on num_possible_cpus() and
> > various CONFIG settings.
> >
> > This patch provides some locking abstraction that may ease future work
> > using a different model for the TCP/DCCP table.
> >
> > Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> > Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
>
> I'm going to push this current version to Linus; the space savings
> really justify it, and if we want to refine things further, we can do
> that with follow-on work rather than blocking this patch.
>
> Thanks Eric!

I hope my remarks didn't block anything?! I've written that it's OK.
So, I'm not sure it's useful or expected, but anyway:

Acked-by: Jarek Poplawski <jarkao2@o2.pl>

Thanks,
Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
end of thread, other threads:[~2007-11-07 12:09 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-11-01 10:16 [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table Eric Dumazet
2007-11-01 11:03 ` David Miller
2007-11-01 11:20 ` Arnaldo Carvalho de Melo
2007-11-01 11:15 ` Ilpo Järvinen
2007-11-01 16:06 ` Jarek Poplawski
2007-11-01 18:00 ` Eric Dumazet
2007-11-01 16:14 ` Stephen Hemminger
2007-11-01 17:54 ` Eric Dumazet
2007-11-01 18:48 ` Rick Jones
2007-11-01 19:00 ` Eric Dumazet
2007-11-01 19:17 ` Eric Dumazet
2007-11-01 21:52 ` David Miller
2007-11-01 21:46 ` David Miller
2007-11-03 23:18 ` Andi Kleen
2007-11-03 23:23 ` David Miller
2007-11-04  0:54 ` Andi Kleen
2007-11-04 11:31 ` Eric Dumazet
2007-11-04 12:26 ` Andi Kleen
2007-11-04 13:05 ` Eric Dumazet
2007-11-04 21:56 ` David Miller
2007-11-04 23:01 ` Andi Kleen
2007-11-05  4:24 ` David Miller
2007-11-05  4:35 ` David Miller
2007-11-04 17:58 ` Jarek Poplawski
2007-11-04 18:15 ` Jarek Poplawski
2007-11-04 21:23 ` Eric Dumazet
2007-11-04 23:08 ` Jarek Poplawski
2007-11-07 10:41 ` David Miller
2007-11-07 12:13 ` Jarek Poplawski
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).