* [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 10:16 UTC
To: David S. Miller; +Cc: Linux Netdev List, Andi Kleen, Arnaldo Carvalho de Melo
[-- Attachment #1: Type: text/plain, Size: 1309 bytes --]
As done two years ago on the IP route cache table (commit
22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
hash bucket for the huge TCP/DCCP hash tables.

On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
performance difference. (We hit a different cache line for the rwlock, but
then the bucket cache line has a better sharing factor among CPUs, since we
dirty it less often.)

Using a 'small' table of hashed rwlocks should be more than enough to provide
correct SMP concurrency between different buckets, without using too much
memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
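
As an aside, the same idea in user-space miniature (illustrative names only,
pthreads standing in for kernel rwlocks; none of this is from the patch
itself):

#include <pthread.h>

/* A small, power-of-two array of locks serving a much larger hash table:
 * buckets whose hashes agree in the low bits share one lock, while buckets
 * differing in those bits can still be locked concurrently on SMP.
 */
#define EXAMPLE_LOCK_SZ 256	/* must be a power of two */

static pthread_rwlock_t example_locks[EXAMPLE_LOCK_SZ];

static void example_locks_init(void)
{
	unsigned int i;

	for (i = 0; i < EXAMPLE_LOCK_SZ; i++)
		pthread_rwlock_init(&example_locks[i], NULL);
}

/* Same mapping the patch uses: mask the bucket hash down to a lock index. */
static pthread_rwlock_t *example_lockp(unsigned int hash)
{
	return &example_locks[hash & (EXAMPLE_LOCK_SZ - 1)];
}
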
This patch provides some locking abstraction that may ease future work using
a different model for the TCP/DCCP table.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

 include/net/inet_hashtables.h |   40 ++++++++++++++++++++++++++++----
 net/dccp/proto.c              |   16 ++++++++++--
 net/ipv4/inet_diag.c          |    9 ++++---
 net/ipv4/inet_hashtables.c    |    7 +++--
 net/ipv4/inet_timewait_sock.c |   13 +++++-----
 net/ipv4/tcp.c                |   11 +++++++-
 net/ipv4/tcp_ipv4.c           |   11 ++++----
 net/ipv6/inet6_hashtables.c   |   19 ++++++++-------
 8 files changed, 89 insertions(+), 37 deletions(-)
[-- Attachment #2: tcp_ehash_locks.patch --]
[-- Type: text/plain, Size: 13597 bytes --]
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4427dcd..5cbfbac 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -37,7 +37,6 @@
* I'll experiment with dynamic table growth later.
*/
struct inet_ehash_bucket {
- rwlock_t lock;
struct hlist_head chain;
struct hlist_head twchain;
};
@@ -91,6 +90,28 @@ struct inet_bind_hashbucket {
/* This is for listening sockets, thus all sockets which possess wildcards. */
#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
+#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
+/*
+ * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
+ * The size of this table is a power of two and depends on the number of CPUs.
+ */
+# if defined(CONFIG_DEBUG_LOCK_ALLOC)
+# define EHASH_LOCK_SZ 256
+# elif NR_CPUS >= 32
+# define EHASH_LOCK_SZ 4096
+# elif NR_CPUS >= 16
+# define EHASH_LOCK_SZ 2048
+# elif NR_CPUS >= 8
+# define EHASH_LOCK_SZ 1024
+# elif NR_CPUS >= 4
+# define EHASH_LOCK_SZ 512
+# else
+# define EHASH_LOCK_SZ 256
+# endif
+#else
+# define EHASH_LOCK_SZ 0
+#endif
+
struct inet_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
@@ -100,6 +121,7 @@ struct inet_hashinfo {
* TIME_WAIT sockets use a separate chain (twchain).
*/
struct inet_ehash_bucket *ehash;
+ rwlock_t *ehash_locks;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
@@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
}
+static inline rwlock_t *inet_ehash_lockp(
+ struct inet_hashinfo *hashinfo,
+ unsigned int hash)
+{
+ return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
+}
+
extern struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep,
struct inet_bind_hashbucket *head,
@@ -222,7 +251,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo,
sk->sk_hash = inet_sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
- lock = &head->lock;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
write_lock(lock);
}
__sk_add_node(sk, list);
@@ -253,7 +282,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
inet_listen_wlock(hashinfo);
lock = &hashinfo->lhash_lock;
} else {
- lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+ lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
write_lock_bh(lock);
}
@@ -354,9 +383,10 @@ static inline struct sock *
*/
unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
prefetch(head->chain.first);
- read_lock(&head->lock);
+ read_lock(lock);
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
@@ -369,7 +399,7 @@ static inline struct sock *
}
sk = NULL;
out:
- read_unlock(&head->lock);
+ read_unlock(lock);
return sk;
hit:
sock_hold(sk);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d849739..3b5f97a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1072,11 +1072,18 @@ static int __init dccp_init(void)
}
for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
- rwlock_init(&dccp_hashinfo.ehash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
}
-
+ if (EHASH_LOCK_SZ) {
+ dccp_hashinfo.ehash_locks =
+ kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!dccp_hashinfo.ehash_locks)
+ goto out_free_dccp_ehash;
+ for (i = 0; i < EHASH_LOCK_SZ; i++)
+ rwlock_init(&dccp_hashinfo.ehash_locks[i]);
+ }
bhash_order = ehash_order;
do {
@@ -1091,7 +1098,7 @@ static int __init dccp_init(void)
if (!dccp_hashinfo.bhash) {
DCCP_CRIT("Failed to allocate DCCP bind hash table");
- goto out_free_dccp_ehash;
+ goto out_free_dccp_locks;
}
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -1121,6 +1128,9 @@ out_free_dccp_mib:
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
dccp_hashinfo.bhash = NULL;
+out_free_dccp_locks:
+ kfree(dccp_hashinfo.ehash_locks);
+ dccp_hashinfo.ehash_locks = NULL;
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
dccp_hashinfo.ehash = NULL;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index dc429b6..b017073 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -747,13 +747,14 @@ skip_listen_ht:
for (i = s_i; i < hashinfo->ehash_size; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct sock *sk;
struct hlist_node *node;
if (i > s_i)
s_num = 0;
- read_lock_bh(&head->lock);
+ read_lock_bh(lock);
num = 0;
sk_for_each(sk, node, &head->chain) {
struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ skip_listen_ht:
r->id.idiag_dport)
goto next_normal;
if (inet_csk_diag_dump(sk, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(lock);
goto done;
}
next_normal:
@@ -791,14 +792,14 @@ next_normal:
r->id.idiag_dport)
goto next_dying;
if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
- read_unlock_bh(&head->lock);
+ read_unlock_bh(lock);
goto done;
}
next_dying:
++num;
}
}
- read_unlock_bh(&head->lock);
+ read_unlock_bh(lock);
}
done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 16eecc7..67704da 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_node *node;
struct inet_timewait_sock *tw;
prefetch(head->chain.first);
- write_lock(&head->lock);
+ write_lock(lock);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ unique:
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ write_unlock(lock);
if (twp) {
*twp = tw;
@@ -255,7 +256,7 @@ unique:
return 0;
not_unique:
- write_unlock(&head->lock);
+ write_unlock(lock);
return -EADDRNOTAVAIL;
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e2..a60b99e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
struct inet_bind_hashbucket *bhead;
struct inet_bind_bucket *tb;
/* Unlink from established hashes. */
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
- write_lock(&ehead->lock);
+ write_lock(lock);
if (hlist_unhashed(&tw->tw_node)) {
- write_unlock(&ehead->lock);
+ write_unlock(lock);
return;
}
__hlist_del(&tw->tw_node);
sk_node_init(&tw->tw_node);
- write_unlock(&ehead->lock);
+ write_unlock(lock);
/* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
struct inet_bind_hashbucket *bhead;
/* Step 1: Put TW into bind hash. Original socket stays there too.
Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
spin_unlock(&bhead->lock);
- write_lock(&ehead->lock);
+ write_lock(lock);
/* Step 2: Remove SK from established hash. */
if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
inet_twsk_add_node(tw, &ehead->twchain);
atomic_inc(&tw->tw_refcnt);
- write_unlock(&ehead->lock);
+ write_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c64072b..e7aca8e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2456,11 +2456,18 @@ void __init tcp_init(void)
thash_entries ? 0 : 512 * 1024);
tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
- rwlock_init(&tcp_hashinfo.ehash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
}
-
+ if (EHASH_LOCK_SZ) {
+ tcp_hashinfo.ehash_locks =
+ kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
+ GFP_KERNEL);
+ if (!tcp_hashinfo.ehash_locks)
+ panic("TCP: failed to alloc ehash_locks");
+ for (i = 0; i < EHASH_LOCK_SZ; i++)
+ rwlock_init(&tcp_hashinfo.ehash_locks[i]);
+ }
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eec02b2..cd82c0e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq)
struct sock *sk;
struct hlist_node *node;
struct inet_timewait_sock *tw;
+ rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
- read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_lock_bh(lock);
sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if (sk->sk_family != st->family) {
continue;
@@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq)
rc = tw;
goto out;
}
- read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_unlock_bh(lock);
st->state = TCP_SEQ_STATE_ESTABLISHED;
}
out:
@@ -2094,11 +2095,11 @@ get_tw:
cur = tw;
goto out;
}
- read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
st->state = TCP_SEQ_STATE_ESTABLISHED;
if (++st->bucket < tcp_hashinfo.ehash_size) {
- read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
} else {
cur = NULL;
@@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+ read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
break;
}
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d6f1026..adc73ad 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -37,9 +37,8 @@ void __inet6_hash(struct inet_hashinfo *hashinfo,
} else {
unsigned int hash;
sk->sk_hash = hash = inet6_sk_ehashfn(sk);
- hash &= (hashinfo->ehash_size - 1);
- list = &hashinfo->ehash[hash].chain;
- lock = &hashinfo->ehash[hash].lock;
+ list = &inet_ehash_bucket(hashinfo, hash)->chain;
+ lock = inet_ehash_lockp(hashinfo, hash);
write_lock(lock);
}
@@ -70,9 +69,10 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
*/
unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
prefetch(head->chain.first);
- read_lock(&head->lock);
+ read_lock(lock);
sk_for_each(sk, node, &head->chain) {
/* For IPV6 do the cheaper port and family tests first. */
if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -92,12 +92,12 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
goto hit;
}
}
- read_unlock(&head->lock);
+ read_unlock(lock);
return NULL;
hit:
sock_hold(sk);
- read_unlock(&head->lock);
+ read_unlock(lock);
return sk;
}
EXPORT_SYMBOL(__inet6_lookup_established);
@@ -175,12 +175,13 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
const unsigned int hash = inet6_ehashfn(daddr, lport, saddr,
inet->dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_node *node;
struct inet_timewait_sock *tw;
prefetch(head->chain.first);
- write_lock(&head->lock);
+ write_lock(lock);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &head->twchain) {
@@ -216,7 +217,7 @@ unique:
__sk_add_node(sk, &head->chain);
sk->sk_hash = hash;
sock_prot_inc_use(sk->sk_prot);
- write_unlock(&head->lock);
+ write_unlock(lock);
if (twp != NULL) {
*twp = tw;
@@ -231,7 +232,7 @@ unique:
return 0;
not_unique:
- write_unlock(&head->lock);
+ write_unlock(lock);
return -EADDRNOTAVAIL;
}

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-01 11:03 UTC
To: dada1; +Cc: netdev, ak, acme

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 01 Nov 2007 11:16:20 +0100

> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
> performance difference. (We hit a different cache line for the rwlock, but
> then the bucket cache line has a better sharing factor among CPUs, since we
> dirty it less often.)
>
> Using a 'small' table of hashed rwlocks should be more than enough to provide
> correct SMP concurrency between different buckets, without using too much
> memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
>
> This patch provides some locking abstraction that may ease future work using
> a different model for the TCP/DCCP table.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Nice work Eric.

I've tossed this into my local tree and we'll let this cook
for a few days.  If no problems crop up I will submit it
for 2.6.24 because the memory savings are non-trivial.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Arnaldo Carvalho de Melo @ 2007-11-01 11:20 UTC
To: David Miller; +Cc: dada1, netdev, ak, acme

On Thu, Nov 01, 2007 at 04:03:40AM -0700, David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Thu, 01 Nov 2007 11:16:20 +0100
>
> > As done two years ago on the IP route cache table (commit
> > 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> > hash bucket for the huge TCP/DCCP hash tables.
> >
> > On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
> > performance difference. (We hit a different cache line for the rwlock, but
> > then the bucket cache line has a better sharing factor among CPUs, since we
> > dirty it less often.)
> >
> > Using a 'small' table of hashed rwlocks should be more than enough to provide
> > correct SMP concurrency between different buckets, without using too much
> > memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
> >
> > This patch provides some locking abstraction that may ease future work using
> > a different model for the TCP/DCCP table.
> >
> > Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>
> Nice work Eric.
>
> I've tossed this into my local tree and we'll let this cook
> for a few days.  If no problems crop up I will submit it
> for 2.6.24 because the memory savings are non-trivial.

Agreed, thanks!

Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Ilpo Järvinen @ 2007-11-01 11:15 UTC
To: Eric Dumazet
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

On Thu, 1 Nov 2007, Eric Dumazet wrote:

> @@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
> return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
> }
>
> +static inline rwlock_t *inet_ehash_lockp(
> + struct inet_hashinfo *hashinfo,

...These two fit to 80 columns.

> + unsigned int hash)
> +{
> + return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
> +}
> +
> extern struct inet_bind_bucket *
> inet_bind_bucket_create(struct kmem_cache *cachep,
> struct inet_bind_hashbucket *head,

--
 i.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Jarek Poplawski @ 2007-11-01 16:06 UTC
To: Eric Dumazet
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Hi,

A few doubts below:

Eric Dumazet wrote:
> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
...
> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index 4427dcd..5cbfbac 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -37,7 +37,6 @@
> * I'll experiment with dynamic table growth later.
> */
> struct inet_ehash_bucket {
> - rwlock_t lock;
> struct hlist_head chain;
> struct hlist_head twchain;
> };
> @@ -91,6 +90,28 @@ struct inet_bind_hashbucket {
> /* This is for listening sockets, thus all sockets which possess wildcards. */
> #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
>
> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)

Probably "|| defined(CONFIG_DEBUG_SPINLOCK)" is needed here.

> +/*
> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
> + * The size of this table is a power of two and depends on the number of CPUs.
> + */
> +# if defined(CONFIG_DEBUG_LOCK_ALLOC)
> +# define EHASH_LOCK_SZ 256
> +# elif NR_CPUS >= 32
> +# define EHASH_LOCK_SZ 4096
> +# elif NR_CPUS >= 16
> +# define EHASH_LOCK_SZ 2048
> +# elif NR_CPUS >= 8
> +# define EHASH_LOCK_SZ 1024
> +# elif NR_CPUS >= 4
> +# define EHASH_LOCK_SZ 512
> +# else
> +# define EHASH_LOCK_SZ 256
> +# endif
> +#else
> +# define EHASH_LOCK_SZ 0
> +#endif
> +

Looks hackish: usually DEBUG code checks the "real" environment, and here
it's a special case. But omitting locks if no SMP or DEBUG is strange.
IMHO, there should be 1 instead of 0.

> struct inet_hashinfo {
> /* This is for sockets with full identity only. Sockets here will
> * always be without wildcards and will have the following invariant:
> @@ -100,6 +121,7 @@ struct inet_hashinfo {
> * TIME_WAIT sockets use a separate chain (twchain).
> */
> struct inet_ehash_bucket *ehash;
> + rwlock_t *ehash_locks;
>
> /* Ok, let's try this, I give up, we do need a local binding
> * TCP hash as well as the others for fast bind/connect.
> @@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
> return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
> }
>
> +static inline rwlock_t *inet_ehash_lockp(
> + struct inet_hashinfo *hashinfo,
> + unsigned int hash)
> +{
> + return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
> +}
> +

Is it OK for EHASH_LOCK_SZ == 0?

...
> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
> index d849739..3b5f97a 100644
> --- a/net/dccp/proto.c
> +++ b/net/dccp/proto.c
> @@ -1072,11 +1072,18 @@ static int __init dccp_init(void)
> }
>
> for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
> - rwlock_init(&dccp_hashinfo.ehash[i].lock);
> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
> }
> -
> + if (EHASH_LOCK_SZ) {

Why not #ifdef then? But, IMHO, rwlock_init() should be done at least
once here. (Similarly later for tcp.)

> + dccp_hashinfo.ehash_locks =
> + kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
> + GFP_KERNEL);
> + if (!dccp_hashinfo.ehash_locks)
> + goto out_free_dccp_ehash;
> + for (i = 0; i < EHASH_LOCK_SZ; i++)
> + rwlock_init(&dccp_hashinfo.ehash_locks[i]);
> + }
> bhash_order = ehash_order;
>
> do {
> @@ -1091,7 +1098,7 @@ static int __init dccp_init(void)
>
> if (!dccp_hashinfo.bhash) {
> DCCP_CRIT("Failed to allocate DCCP bind hash table");
> - goto out_free_dccp_ehash;
> + goto out_free_dccp_locks;
> }
>
> for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
> @@ -1121,6 +1128,9 @@ out_free_dccp_mib:
> out_free_dccp_bhash:
> free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
> dccp_hashinfo.bhash = NULL;
> +out_free_dccp_locks:
> + kfree(dccp_hashinfo.ehash_locks);
> + dccp_hashinfo.ehash_locks = NULL;
> out_free_dccp_ehash:
> free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
> dccp_hashinfo.ehash = NULL;

Isn't such kfree(dccp_hashinfo.ehash_locks) needed in dccp_fini()?

Regards,
Jarek P.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 18:00 UTC
To: Jarek Poplawski
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Jarek Poplawski wrote:
> Hi,
>
> A few doubts below:
>
>>
>> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
>
> Probably "|| defined(CONFIG_DEBUG_SPINLOCK)" is needed here.

Not sure, because DEBUG_SPINLOCK only applies to spinlocks. Here we deal
with rwlocks.

>
>> +/*
>> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
>> + * The size of this table is a power of two and depends on the number of CPUs.
>> + */
>> +# if defined(CONFIG_DEBUG_LOCK_ALLOC)
>> +# define EHASH_LOCK_SZ 256
>> +# elif NR_CPUS >= 32
>> +# define EHASH_LOCK_SZ 4096
>> +# elif NR_CPUS >= 16
>> +# define EHASH_LOCK_SZ 2048
>> +# elif NR_CPUS >= 8
>> +# define EHASH_LOCK_SZ 1024
>> +# elif NR_CPUS >= 4
>> +# define EHASH_LOCK_SZ 512
>> +# else
>> +# define EHASH_LOCK_SZ 256
>> +# endif
>> +#else
>> +# define EHASH_LOCK_SZ 0
>> +#endif
>> +
>
> Looks hackish: usually DEBUG code checks the "real" environment, and here
> it's a special case. But omitting locks if no SMP or DEBUG is strange.
> IMHO, there should be 1 instead of 0.

It is 0 so that no alloc is done. (see your next questions)

>
>> struct inet_hashinfo {
>> /* This is for sockets with full identity only. Sockets here will
>> * always be without wildcards and will have the following invariant:
>> @@ -100,6 +121,7 @@ struct inet_hashinfo {
>> * TIME_WAIT sockets use a separate chain (twchain).
>> */
>> struct inet_ehash_bucket *ehash;
>> + rwlock_t *ehash_locks;
>>
>> /* Ok, let's try this, I give up, we do need a local binding
>> * TCP hash as well as the others for fast bind/connect.
>> @@ -134,6 +156,13 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket(
>> return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
>> }
>>
>> +static inline rwlock_t *inet_ehash_lockp(
>> + struct inet_hashinfo *hashinfo,
>> + unsigned int hash)
>> +{
>> + return &hashinfo->ehash_locks[hash & (EHASH_LOCK_SZ - 1)];
>> +}
>> +
>
> Is it OK for EHASH_LOCK_SZ == 0?

At least, compile tested and booted on UP ;)

>
> ...
>> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
>> index d849739..3b5f97a 100644
>> --- a/net/dccp/proto.c
>> +++ b/net/dccp/proto.c
>> @@ -1072,11 +1072,18 @@ static int __init dccp_init(void)
>> }
>>
>> for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
>> - rwlock_init(&dccp_hashinfo.ehash[i].lock);
>> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
>> INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
>> }
>> -
>> + if (EHASH_LOCK_SZ) {
>
> Why not #ifdef then? But, IMHO, rwlock_init() should be done at least
> once here. (Similarly later for tcp.)

Well, #ifdefs are not so nice :)

>
>> + dccp_hashinfo.ehash_locks =
>> + kmalloc(EHASH_LOCK_SZ * sizeof(rwlock_t),
>> + GFP_KERNEL);
>> + if (!dccp_hashinfo.ehash_locks)
>> + goto out_free_dccp_ehash;
>> + for (i = 0; i < EHASH_LOCK_SZ; i++)
>> + rwlock_init(&dccp_hashinfo.ehash_locks[i]);
>> + }
>> bhash_order = ehash_order;
>>
>> do {
>> @@ -1091,7 +1098,7 @@ static int __init dccp_init(void)
>>
>> if (!dccp_hashinfo.bhash) {
>> DCCP_CRIT("Failed to allocate DCCP bind hash table");
>> - goto out_free_dccp_ehash;
>> + goto out_free_dccp_locks;
>> }
>>
>> for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
>> @@ -1121,6 +1128,9 @@ out_free_dccp_mib:
>> out_free_dccp_bhash:
>> free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
>> dccp_hashinfo.bhash = NULL;
>> +out_free_dccp_locks:
>> + kfree(dccp_hashinfo.ehash_locks);
>> + dccp_hashinfo.ehash_locks = NULL;
>> out_free_dccp_ehash:
>> free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
>> dccp_hashinfo.ehash = NULL;
>
> Isn't such kfree(dccp_hashinfo.ehash_locks) needed in dccp_fini()?
>

Probably! Thank you for reviewing!

Eric

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Stephen Hemminger @ 2007-11-01 16:14 UTC
To: Eric Dumazet
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

On Thu, 01 Nov 2007 11:16:20 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
> performance difference. (We hit a different cache line for the rwlock, but
> then the bucket cache line has a better sharing factor among CPUs, since we
> dirty it less often.)
>
> Using a 'small' table of hashed rwlocks should be more than enough to provide
> correct SMP concurrency between different buckets, without using too much
> memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
>
> This patch provides some locking abstraction that may ease future work using
> a different model for the TCP/DCCP table.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>
>  include/net/inet_hashtables.h |   40 ++++++++++++++++++++++++++++----
>  net/dccp/proto.c              |   16 ++++++++++--
>  net/ipv4/inet_diag.c          |    9 ++++---
>  net/ipv4/inet_hashtables.c    |    7 +++--
>  net/ipv4/inet_timewait_sock.c |   13 +++++-----
>  net/ipv4/tcp.c                |   11 +++++++-
>  net/ipv4/tcp_ipv4.c           |   11 ++++----
>  net/ipv6/inet6_hashtables.c   |   19 ++++++++-------
>  8 files changed, 89 insertions(+), 37 deletions(-)
>

Longterm is there any chance of using rcu for this? Seems like
it could be a big win.

--
Stephen Hemminger <shemminger@linux-foundation.org>

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 17:54 UTC
To: Stephen Hemminger
Cc: David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Stephen Hemminger wrote:
> On Thu, 01 Nov 2007 11:16:20 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>> As done two years ago on the IP route cache table (commit
>> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
>> hash bucket for the huge TCP/DCCP hash tables.
>>
>> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for little
>> performance difference. (We hit a different cache line for the rwlock, but
>> then the bucket cache line has a better sharing factor among CPUs, since we
>> dirty it less often.)
>>
>> Using a 'small' table of hashed rwlocks should be more than enough to provide
>> correct SMP concurrency between different buckets, without using too much
>> memory. Sizing of this table depends on NR_CPUS and various CONFIG settings.
>>
>> This patch provides some locking abstraction that may ease future work using
>> a different model for the TCP/DCCP table.
>>
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>>
>>  include/net/inet_hashtables.h |   40 ++++++++++++++++++++++++++++----
>>  net/dccp/proto.c              |   16 ++++++++++--
>>  net/ipv4/inet_diag.c          |    9 ++++---
>>  net/ipv4/inet_hashtables.c    |    7 +++--
>>  net/ipv4/inet_timewait_sock.c |   13 +++++-----
>>  net/ipv4/tcp.c                |   11 +++++++-
>>  net/ipv4/tcp_ipv4.c           |   11 ++++----
>>  net/ipv6/inet6_hashtables.c   |   19 ++++++++-------
>>  8 files changed, 89 insertions(+), 37 deletions(-)
>>
>
> Longterm is there any chance of using rcu for this? Seems like
> it could be a big win.
>

This was discussed in the past, and I even believe some patch was proposed,
but some guys (including David) complained that RCU is well suited for
'mostly read' structures.

On some web server workloads, the TCP hash table is constantly accessed in
write mode (socket creation, socket move to timewait state, socket
deleted...), and RCU added overhead and poor cache re-use (because sockets
must be placed on an RCU queue before reuse).

On these typical workloads, a hash table without RCU is still the best.

Longterm changes would rather be based on Robert Olsson's suggestion last
year (trie based lookups and a unified IP/TCP cache).

Short term changes would be to be able to resize the TCP hash table (being
small at boot, and being able to grow it if necessary). Its current size on
modern machines is just insane.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Rick Jones @ 2007-11-01 18:48 UTC
To: Eric Dumazet
Cc: Stephen Hemminger, David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Eric Dumazet wrote:
> Stephen Hemminger wrote:
>
>> On Thu, 01 Nov 2007 11:16:20 +0100
>> Eric Dumazet <dada1@cosmosbay.com> wrote:
>>
>>> As done two years ago on the IP route cache table (commit
>>> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one
>>> lock per hash bucket for the huge TCP/DCCP hash tables.

The TCP hashes are looked at with higher frequency than the route cache,
yes?

>>> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM, for
>>> little performance difference. (We hit a different cache line for the
>>> rwlock, but then the bucket cache line has a better sharing factor
>>> among CPUs, since we dirty it less often.)
>>>
>>> Using a 'small' table of hashed rwlocks should be more than enough to
>>> provide correct SMP concurrency between different buckets, without
>>> using too much memory. Sizing of this table depends on NR_CPUS and
>>> various CONFIG settings.

Something is telling me finding a 64 core system with a suitable
workload to try this could be a good thing. Wish I had one at my disposal.

rick jones

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 19:00 UTC
To: Rick Jones
Cc: Stephen Hemminger, David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

Rick Jones wrote:
> Eric Dumazet wrote:
>> Stephen Hemminger wrote:
>>
>>> On Thu, 01 Nov 2007 11:16:20 +0100
>>> Eric Dumazet <dada1@cosmosbay.com> wrote:
>>>
>>>> As done two years ago on the IP route cache table (commit
>>>> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one
>>>> lock per hash bucket for the huge TCP/DCCP hash tables.
>
> The TCP hashes are looked at with higher frequency than the route cache,
> yes?

It depends on the workload, but in general I would say the reverse.

>
>>>> On a typical x86_64 platform, this saves about 2MB or 4MB of RAM,
>>>> for little performance difference. (We hit a different cache line
>>>> for the rwlock, but then the bucket cache line has a better sharing
>>>> factor among CPUs, since we dirty it less often.)
>>>>
>>>> Using a 'small' table of hashed rwlocks should be more than enough
>>>> to provide correct SMP concurrency between different buckets,
>>>> without using too much memory. Sizing of this table depends on
>>>> NR_CPUS and various CONFIG settings.
>
> Something is telling me finding a 64 core system with a suitable
> workload to try this could be a good thing. Wish I had one at my disposal.

If you find one, please give it to me when you've finished
playing^Wworking with it :)

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Eric Dumazet @ 2007-11-01 19:17 UTC
To: Rick Jones
Cc: Stephen Hemminger, David S. Miller, Linux Netdev List, Andi Kleen,
	Arnaldo Carvalho de Melo

> Rick Jones wrote:
> Something is telling me finding a 64 core system with a suitable
> workload to try this could be a good thing. Wish I had one at my
> disposal.

Maybe on big NUMA machines, we might prefer to spread the rwlock array
over multiple nodes (i.e. using vmalloc() instead of kmalloc()).

After my patch, it is true that all rwlocks are on one memory node. Not
exactly optimal :(

But if it were an issue, the spinlock array used in the IP route cache
would have the same problem, and *someone* would already have
complained...

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-01 21:52 UTC
To: dada1; +Cc: rick.jones2, shemminger, netdev, ak, acme

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 01 Nov 2007 20:17:35 +0100

> But if it were an issue, the spinlock array used in the IP route cache
> would have the same problem, and *someone* would already have
> complained...

I'd say that having a few MB less of rwlock cache lines to touch
offsets the issue even if it does exist.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-01 21:46 UTC
To: dada1; +Cc: shemminger, netdev, ak, acme

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 01 Nov 2007 18:54:24 +0100

> Stephen Hemminger wrote:
> > Longterm is there any chance of using rcu for this? Seems like
> > it could be a big win.
>
> This was discussed in the past, and I even believe some patch was proposed,
> but some guys (including David) complained that RCU is well suited for
> 'mostly read' structures.
>
> On some web server workloads, the TCP hash table is constantly accessed in
> write mode (socket creation, socket move to timewait state, socket
> deleted...), and RCU added overhead and poor cache re-use (because sockets
> must be placed on an RCU queue before reuse).
>
> On these typical workloads, a hash table without RCU is still the best.

Right, and none of the submitted RCU attempts were correct; it's
very hard to get the synchronization right.

> Longterm changes would rather be based on Robert Olsson's suggestion last
> year (trie based lookups and a unified IP/TCP cache).
>
> Short term changes would be to be able to resize the TCP hash table (being
> small at boot, and being able to grow it if necessary). Its current size on
> modern machines is just insane.

Resizing the hash is, unfortunately, very hard to implement as well.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Andi Kleen @ 2007-11-03 23:18 UTC
To: Eric Dumazet; +Cc: David S. Miller, Linux Netdev List,
	Arnaldo Carvalho de Melo

On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote:

Looks good from a quick look. Thanks for doing that work.

Some quick comments:

> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
> +/*
> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
> + * The size of this table is a power of two and depends on the number of CPUs.
> + */

This shouldn't be hard-coded based on NR_CPUS, but be done at runtime
based on num_possible_cpus(). This is better for kernels with a large
NR_CPUS, but which typically run on much smaller systems (like
distribution kernels).

Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
you add that?

And as an unrelated note, have you tried converting the rwlocks
into normal spinlocks? Spinlocks should be somewhat cheaper
because they have less cache protocol overhead, and with
the huge thash tables in Linux the chain walks should be short
anyway, so not doing this in parallel is probably not a big issue.

At some point I also had a crazy idea of using a special locking
scheme that special-cases the common case that a hash chain
has only one member and doesn't take a lock for that at all.

-Andi

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: David Miller @ 2007-11-03 23:23 UTC
To: ak; +Cc: dada1, netdev, acme

From: Andi Kleen <ak@suse.de>
Date: Sun, 4 Nov 2007 00:18:14 +0100

> On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote:
>
> Some quick comments:
>
> > +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
> > +/*
> > + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks.
> > + * The size of this table is a power of two and depends on the number of CPUs.
> > + */
>
> This shouldn't be hard-coded based on NR_CPUS, but be done at runtime
> based on num_possible_cpus(). This is better for kernels with a large
> NR_CPUS, but which typically run on much smaller systems (like
> distribution kernels).

I think this is a good idea. Eric, could you make this change?

> Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
> you add that?

He explained this in another reply, because ifdefs are ugly.

> And as an unrelated note, have you tried converting the rwlocks
> into normal spinlocks? Spinlocks should be somewhat cheaper
> because they have less cache protocol overhead, and with
> the huge thash tables in Linux the chain walks should be short
> anyway, so not doing this in parallel is probably not a big issue.
> At some point I also had a crazy idea of using a special locking
> scheme that special-cases the common case that a hash chain
> has only one member and doesn't take a lock for that at all.

I agree.

There was movement at one point to get rid of all rwlocks in the
kernel; I personally think they are pointless. Any use that makes
"sense" is a case where the code should be rewritten to decrease the
lock hold time or convert to RCU.

* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
From: Andi Kleen @ 2007-11-04 0:54 UTC
To: David Miller; +Cc: dada1, netdev, acme

> > Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
> > you add that?
>
> He explained this in another reply, because ifdefs are ugly.

I meant: why have it at all?

> Any use that makes
> "sense" is a case where the code should be rewritten to decrease the
> lock hold time or convert to RCU.

I don't think RCU would really be needed for single-entry buckets (which
are common) if they were special-cased. After all it would be just a
single pointer. Perhaps one could distinguish this case e.g. by setting
the low-order bit of the hash bucket pointer.

In fact, to optimize for this case it might be an interesting experiment
to go towards a closed hash table and allocate the sockets in page-sized
objects and then remap them directly into a virtually contiguous table,
similar to the new vmemmap code (need to try this at some point).

-Andi
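
For concreteness, here is one shape the low-order-bit idea above could take
(a hypothetical sketch, not code from this thread or from any kernel tree;
the names are invented, and the hard part, making transitions between the
'single' and 'chained' states safe against concurrent lookups, is left out):

struct sock;	/* opaque for the sketch */

/*
 * A bucket head encodes two cases in one word:
 *  - low bit set: exactly one socket in the bucket, stored as (sk | 1),
 *    which a lookup could read without taking the bucket lock;
 *  - low bit clear: NULL (empty) or the head of a chain that must be
 *    walked under the bucket lock.
 */
static inline int bucket_is_single(unsigned long head)
{
	return head & 1UL;
}

static inline struct sock *bucket_single_sk(unsigned long head)
{
	return (struct sock *)(head & ~1UL);
}

static inline unsigned long bucket_make_single(struct sock *sk)
{
	return (unsigned long)sk | 1UL;
}

This relies on struct sock being at least 2-byte aligned, so that bit 0 of
a valid pointer is always zero, which any real allocator guarantees.
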
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table 2007-11-03 23:23 ` David Miller 2007-11-04 0:54 ` Andi Kleen @ 2007-11-04 11:31 ` Eric Dumazet 2007-11-04 12:26 ` Andi Kleen ` (2 more replies) 1 sibling, 3 replies; 29+ messages in thread From: Eric Dumazet @ 2007-11-04 11:31 UTC (permalink / raw) To: David Miller; +Cc: ak, netdev, acme, Jarek Poplawski [-- Attachment #1: Type: text/plain, Size: 3657 bytes --] David Miller a écrit : > From: Andi Kleen <ak@suse.de> > Date: Sun, 4 Nov 2007 00:18:14 +0100 > >> On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote: >> >> Some quick comments: >> >>> +#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING) >>> +/* >>> + * Instead of using one rwlock for each inet_ehash_bucket, we use a table of locks >>> + * The size of this table is a power of two and depends on the number of CPUS. >>> + */ >> This shouldn't be hard coded based on NR_CPUS, but be done on runtime >> based on num_possible_cpus(). This is better for kernels with a large >> NR_CPUS, but which typically run on much smaller systems (like >> distribution kernels) > > I think this is a good idea. Eric, could you make this change? Yes of course, since using a non constant value for masking is cheap. But I suspect distributions kernels enable CONFIG_HOTPLUG_CPU so num_possible_cpus() will be NR_CPUS. > >> Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did >> you add that? > > He explained this in another reply, because ifdefs are ugly. This will vanish if done on runtime anyway. > >> And as a unrelated node have you tried converting the rwlocks >> into normal spinlocks? spinlocks should be somewhat cheaper >> because they have less cache protocol overhead and with >> the huge thash tables in Linux the chain walks should be short >> anyways so not doing this in parallel is probably not a big issue. >> At some point I also had a crazy idea of using a special locking >> scheme that special cases the common case that a hash chain >> has only one member and doesn't take a look for that at all. > > I agree. > > There was movement at one point to get rid of all rwlock's in the > kernel, I personally think they are pointless. Any use that makes > "sense" is a case where the code should be rewritten to decrease the > lock hold time or convert to RCU. > I agree too, rwlocks are more expensive when contention is low, so let do this rwlock->spinlock change on next step (separate patch), because it means changing also lhash_lock. Thanks to Jarek, I added locks cleanup in dccp_fini() [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table As done two years ago on IP route cache table (commit 22c047ccbc68fa8f3fa57f0e8f906479a062c426) , we can avoid using one lock per hash bucket for the huge TCP/DCCP hash tables. On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle performance differences. (we hit a different cache line for the rwlock, but then the bucket cache line have a better sharing factor among cpus, since we dirty it less often). For netstat or ss commands that want a full scan of hash table, we perform fewer memory accesses. Using a 'small' table of hashed rwlocks should be more than enough to provide correct SMP concurrency between different buckets, without using too much memory. Sizing of this table depends on num_possible_cpus() and various CONFIG settings. This patch provides some locking abstraction that may ease a future work using a different model for TCP/DCCP table. 
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com> include/net/inet_hashtables.h | 71 +++++++++++++++++++++++++++++--- net/dccp/proto.c | 9 +++- net/ipv4/inet_diag.c | 9 ++-- net/ipv4/inet_hashtables.c | 7 +-- net/ipv4/inet_timewait_sock.c | 13 +++-- net/ipv4/tcp.c | 4 - net/ipv4/tcp_ipv4.c | 11 ++-- net/ipv6/inet6_hashtables.c | 19 ++++---- 8 files changed, 106 insertions(+), 37 deletions(-) [-- Attachment #2: tcp_ehash_locks.patch --] [-- Type: text/plain, Size: 14142 bytes --] diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4427dcd..8461cda 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -37,7 +37,6 @@ * I'll experiment with dynamic table growth later. */ struct inet_ehash_bucket { - rwlock_t lock; struct hlist_head chain; struct hlist_head twchain; }; @@ -100,6 +99,9 @@ struct inet_hashinfo { * TIME_WAIT sockets use a separate chain (twchain). */ struct inet_ehash_bucket *ehash; + rwlock_t *ehash_locks; + unsigned int ehash_size; + unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. @@ -107,7 +109,7 @@ struct inet_hashinfo { struct inet_bind_hashbucket *bhash; unsigned int bhash_size; - unsigned int ehash_size; + /* Note : 4 bytes padding on 64 bit arches */ /* All sockets in TCP_LISTEN state will be in here. This is the only * table where wildcard'd TCP sockets can exist. Hash function here @@ -134,6 +136,62 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket( return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; } +static inline rwlock_t *inet_ehash_lockp( + struct inet_hashinfo *hashinfo, + unsigned int hash) +{ + return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; +} + +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) +{ + unsigned int i, size = 256; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + if (nr_pcpus >= 4) + size = 512; + if (nr_pcpus >= 8) + size = 1024; + if (nr_pcpus >= 16) + size = 2048; + if (nr_pcpus >= 32) + size = 4096; + if (sizeof(rwlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(rwlock_t) > PAGE_SIZE) + hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t)); + else +#endif + hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t), + GFP_KERNEL); + if (!hashinfo->ehash_locks) + return ENOMEM; + for (i = 0; i < size; i++) + rwlock_init(&hashinfo->ehash_locks[i]); + } + hashinfo->ehash_locks_mask = size - 1; + return 0; +} + +static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) +{ + if (hashinfo->ehash_locks) { +#ifdef CONFIG_NUMA + unsigned int size = (hashinfo->ehash_locks_mask + 1) * + sizeof(rwlock_t); + if (size > PAGE_SIZE) + vfree(hashinfo->ehash_locks); + else +#else + kfree(hashinfo->ehash_locks); +#endif + hashinfo->ehash_locks = NULL; + } +} + extern struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_hashbucket *head, @@ -222,7 +280,7 @@ static inline void __inet_hash(struct inet_hashinfo *hashinfo, sk->sk_hash = inet_sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; - lock = &head->lock; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); write_lock(lock); } __sk_add_node(sk, list); @@ -253,7 +311,7 @@ static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) 
 		inet_listen_wlock(hashinfo);
 		lock = &hashinfo->lhash_lock;
 	} else {
-		lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 		write_lock_bh(lock);
 	}
@@ -354,9 +412,10 @@ static inline struct sock *
 	 */
 	unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

 	prefetch(head->chain.first);
-	read_lock(&head->lock);
+	read_lock(lock);
 	sk_for_each(sk, node, &head->chain) {
 		if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
 			goto hit; /* You sunk my battleship! */
@@ -369,7 +428,7 @@ static inline struct sock *
 	}
 	sk = NULL;
out:
-	read_unlock(&head->lock);
+	read_unlock(lock);
 	return sk;
hit:
 	sock_hold(sk);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d849739..7a3bea9 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1072,11 +1072,13 @@ static int __init dccp_init(void)
 	}

 	for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
-		rwlock_init(&dccp_hashinfo.ehash[i].lock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
 		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
 	}

+	if (inet_ehash_locks_alloc(&dccp_hashinfo))
+		goto out_free_dccp_ehash;
+
 	bhash_order = ehash_order;

 	do {
@@ -1091,7 +1093,7 @@ static int __init dccp_init(void)

 	if (!dccp_hashinfo.bhash) {
 		DCCP_CRIT("Failed to allocate DCCP bind hash table");
-		goto out_free_dccp_ehash;
+		goto out_free_dccp_locks;
 	}

 	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -1121,6 +1123,8 @@ out_free_dccp_mib:
 out_free_dccp_bhash:
 	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
 	dccp_hashinfo.bhash = NULL;
+out_free_dccp_locks:
+	inet_ehash_locks_free(&dccp_hashinfo);
 out_free_dccp_ehash:
 	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
 	dccp_hashinfo.ehash = NULL;
@@ -1139,6 +1143,7 @@ static void __exit dccp_fini(void)
 	free_pages((unsigned long)dccp_hashinfo.ehash,
 		   get_order(dccp_hashinfo.ehash_size *
 			     sizeof(struct inet_ehash_bucket)));
+	inet_ehash_locks_free(&dccp_hashinfo);
 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 	dccp_ackvec_exit();
 	dccp_sysctl_exit();
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index dc429b6..b017073 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -747,13 +747,14 @@ skip_listen_ht:

 	for (i = s_i; i < hashinfo->ehash_size; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct sock *sk;
 		struct hlist_node *node;

 		if (i > s_i)
 			s_num = 0;

-		read_lock_bh(&head->lock);
+		read_lock_bh(lock);
 		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ skip_listen_ht:
 			    r->id.idiag_dport)
 				goto next_normal;
 			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-				read_unlock_bh(&head->lock);
+				read_unlock_bh(lock);
 				goto done;
 			}
next_normal:
@@ -791,14 +792,14 @@ next_normal:
 				    r->id.idiag_dport)
 					goto next_dying;
 				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-					read_unlock_bh(&head->lock);
+					read_unlock_bh(lock);
 					goto done;
 				}
next_dying:
 				++num;
 			}
 		}
-		read_unlock_bh(&head->lock);
+		read_unlock_bh(lock);
 	}
done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 16eecc7..67704da 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
 	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_node *node;
 	struct inet_timewait_sock *tw;

 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(lock);

 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ unique:
 	BUG_TRAP(sk_unhashed(sk));
 	__sk_add_node(sk, &head->chain);
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	write_unlock(lock);

 	if (twp) {
 		*twp = tw;
@@ -255,7 +256,7 @@ unique:
 	return 0;

not_unique:
-	write_unlock(&head->lock);
+	write_unlock(lock);
 	return -EADDRNOTAVAIL;
 }
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e2..a60b99e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	struct inet_bind_hashbucket *bhead;
 	struct inet_bind_bucket *tb;
 	/* Unlink from established hashes. */
-	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);

-	write_lock(&ehead->lock);
+	write_lock(lock);
 	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(&ehead->lock);
+		write_unlock(lock);
 		return;
 	}
 	__hlist_del(&tw->tw_node);
 	sk_node_init(&tw->tw_node);
-	write_unlock(&ehead->lock);
+	write_unlock(lock);

 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	const struct inet_sock *inet = inet_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	struct inet_bind_hashbucket *bhead;
 	/* Step 1: Put TW into bind hash. Original socket stays there too.
 	   Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
 	spin_unlock(&bhead->lock);

-	write_lock(&ehead->lock);
+	write_lock(lock);

 	/* Step 2: Remove SK from established hash. */
 	if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_node(tw, &ehead->twchain);
 	atomic_inc(&tw->tw_refcnt);

-	write_unlock(&ehead->lock);
+	write_unlock(lock);
 }

 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c64072b..8e65182 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2456,11 +2456,11 @@ void __init tcp_init(void)
 					thash_entries ? 0 : 512 * 1024);
 	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
 	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
-		rwlock_init(&tcp_hashinfo.ehash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
 	}
-
+	if (inet_ehash_locks_alloc(&tcp_hashinfo))
+		panic("TCP: failed to alloc ehash_locks");
 	tcp_hashinfo.bhash =
 		alloc_large_system_hash("TCP bind",
 					sizeof(struct inet_bind_hashbucket),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index eec02b2..cd82c0e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq)
 		struct sock *sk;
 		struct hlist_node *node;
 		struct inet_timewait_sock *tw;
+		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

-		read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_lock_bh(lock);
 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
 				continue;
@@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq)
 			rc = tw;
 			goto out;
 		}
-		read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_unlock_bh(lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
out:
@@ -2094,11 +2095,11 @@ get_tw:
 			cur = tw;
 			goto out;
 		}
-		read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		st->state = TCP_SEQ_STATE_ESTABLISHED;

 		if (++st->bucket < tcp_hashinfo.ehash_size) {
-			read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+			read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
 		} else {
 			cur = NULL;
@@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
-			read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		break;
 	}
 }
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d6f1026..adc73ad 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -37,9 +37,8 @@ void __inet6_hash(struct inet_hashinfo *hashinfo,
 	} else {
 		unsigned int hash;
 		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
-		hash &= (hashinfo->ehash_size - 1);
-		list = &hashinfo->ehash[hash].chain;
-		lock = &hashinfo->ehash[hash].lock;
+		list = &inet_ehash_bucket(hashinfo, hash)->chain;
+		lock = inet_ehash_lockp(hashinfo, hash);
 		write_lock(lock);
 	}
@@ -70,9 +69,10 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
 	 */
 	unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);

 	prefetch(head->chain.first);
-	read_lock(&head->lock);
+	read_lock(lock);
 	sk_for_each(sk, node, &head->chain) {
 		/* For IPV6 do the cheaper port and family tests first. */
 		if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -92,12 +92,12 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
 			goto hit;
 		}
 	}
-	read_unlock(&head->lock);
+	read_unlock(lock);
 	return NULL;

hit:
 	sock_hold(sk);
-	read_unlock(&head->lock);
+	read_unlock(lock);
 	return sk;
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
@@ -175,12 +175,13 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	const unsigned int hash = inet6_ehashfn(daddr, lport, saddr,
 						inet->dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_node *node;
 	struct inet_timewait_sock *tw;

 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(lock);

 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &head->twchain) {
@@ -216,7 +217,7 @@ unique:
 	__sk_add_node(sk, &head->chain);
 	sk->sk_hash = hash;
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	write_unlock(lock);

 	if (twp != NULL) {
 		*twp = tw;
@@ -231,7 +232,7 @@ unique:
 	return 0;

not_unique:
-	write_unlock(&head->lock);
+	write_unlock(lock);
 	return -EADDRNOTAVAIL;
 }

^ permalink raw reply related	[flat|nested] 29+ messages in thread
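The change above is a straightforward application of lock striping: a very large bucket table shares a much smaller, power-of-two array of locks, and the same hash value that selects a bucket also selects the lock guarding it. Below is a self-contained user-space sketch of the idiom, using pthreads rwlocks as a stand-in for the kernel's rwlock_t; the names and sizes are illustrative only, not taken from the patch.

#include <pthread.h>

#define NBUCKETS (1 << 20)	/* huge bucket table, as for the TCP ehash */
#define NLOCKS 256		/* small lock stripe; must be a power of two */

struct bucket {
	void *chain;		/* head of this bucket's collision chain */
};

static struct bucket table[NBUCKETS];
static pthread_rwlock_t locks[NLOCKS];

/* Run once at startup, analogous to inet_ehash_locks_alloc(). */
static void locks_init(void)
{
	int i;

	for (i = 0; i < NLOCKS; i++)
		pthread_rwlock_init(&locks[i], NULL);
}

/* The same hash picks both; many buckets map onto one shared lock. */
static struct bucket *bucket_of(unsigned int hash)
{
	return &table[hash & (NBUCKETS - 1)];
}

static pthread_rwlock_t *lock_of(unsigned int hash)
{
	return &locks[hash & (NLOCKS - 1)];
}

/* Reader side, shaped like __inet_lookup_established(). */
static void *lookup(unsigned int hash)
{
	pthread_rwlock_t *lock = lock_of(hash);
	void *item;

	pthread_rwlock_rdlock(lock);
	item = bucket_of(hash)->chain;	/* a real table would walk the chain */
	pthread_rwlock_unlock(lock);
	return item;
}

Correctness only requires that a given bucket always maps to the same lock, which holds because both masks are applied to the same hash value; the size of the stripe merely bounds how much writer parallelism is possible.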
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 11:31 ` Eric Dumazet
@ 2007-11-04 12:26 ` Andi Kleen
2007-11-04 13:05 ` Eric Dumazet
2007-11-04 21:56 ` David Miller
2007-11-04 17:58 ` Jarek Poplawski
2007-11-07 10:41 ` David Miller
2 siblings, 2 replies; 29+ messages in thread
From: Andi Kleen @ 2007-11-04 12:26 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev, acme, Jarek Poplawski

> But I suspect distribution kernels enable CONFIG_HOTPLUG_CPU so
> num_possible_cpus() will be NR_CPUS.

Nope, on x86 num_possible_cpus() is derived from BIOS tables these days.

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 12:26 ` Andi Kleen
@ 2007-11-04 13:05 ` Eric Dumazet
2007-11-04 21:56 ` David Miller
1 sibling, 0 replies; 29+ messages in thread
From: Eric Dumazet @ 2007-11-04 13:05 UTC (permalink / raw)
To: Andi Kleen; +Cc: David Miller, netdev, acme, Jarek Poplawski

Andi Kleen a écrit :
>> But I suspect distribution kernels enable CONFIG_HOTPLUG_CPU so
>> num_possible_cpus() will be NR_CPUS.
>
> Nope, on x86 num_possible_cpus() is derived from BIOS tables these days.

Good to know, thank you, Andi, for this clarification.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 12:26 ` Andi Kleen
2007-11-04 13:05 ` Eric Dumazet
@ 2007-11-04 21:56 ` David Miller
2007-11-04 23:01 ` Andi Kleen
1 sibling, 1 reply; 29+ messages in thread
From: David Miller @ 2007-11-04 21:56 UTC (permalink / raw)
To: ak; +Cc: dada1, netdev, acme, jarkao2

From: Andi Kleen <ak@suse.de>
Date: Sun, 4 Nov 2007 13:26:38 +0100

> > But I suspect distribution kernels enable CONFIG_HOTPLUG_CPU so
> > num_possible_cpus() will be NR_CPUS.
>
> Nope, on x86 num_possible_cpus() is derived from BIOS tables these days.

And similarly on SPARC64 it will be set based upon the physical
capabilities of the system.

This makes a huge difference, as we have to set NR_CPUS to 4096
in order to handle the cpu numbering of some UltraSPARC-IV machines.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 21:56 ` David Miller
@ 2007-11-04 23:01 ` Andi Kleen
2007-11-05 4:24 ` David Miller
0 siblings, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2007-11-04 23:01 UTC (permalink / raw)
To: David Miller; +Cc: dada1, netdev, acme, jarkao2

On Sunday 04 November 2007 22:56:21 David Miller wrote:
> From: Andi Kleen <ak@suse.de>
>
> This makes a huge difference, as we have to set NR_CPUS to 4096
> in order to handle the cpu numbering of some UltraSPARC-IV
> machines.

Really? Hopefully you have a large enough stack then. There are various
users who put char str[NR_CPUS] on the stack, and a few other data
structures also get incredibly big with NR_CPUS arrays.

If it's for sparse cpu ids -- x86 handles those with a translation
array.

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread
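(For context: the translation array Andi mentions is, in outline, a small map from sparse hardware CPU identifiers to dense logical indices, so per-cpu data can be sized by the CPUs actually present rather than by the largest possible hardware id. A rough sketch of the idea follows, with hypothetical names; this is not the actual x86 code.)

#define MAX_HW_ID 4096		/* largest hardware id, e.g. an APIC id */
#define MAX_PRESENT_CPUS 64	/* CPUs actually found at boot */

static int hw_to_logical[MAX_HW_ID];		/* sparse id -> dense index */
static long per_cpu_counter[MAX_PRESENT_CPUS];	/* sized by present CPUs */
static int nr_present;

/* Boot-time enumeration registers each detected CPU once. */
static int register_cpu(int hw_id)
{
	hw_to_logical[hw_id] = nr_present;
	return nr_present++;
}

/* Later lookups index per-cpu data through the translation, not by hw_id. */
static long *counter_for(int hw_id)
{
	return &per_cpu_counter[hw_to_logical[hw_id]];
}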
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 23:01 ` Andi Kleen
@ 2007-11-05 4:24 ` David Miller
2007-11-05 4:35 ` David Miller
0 siblings, 1 reply; 29+ messages in thread
From: David Miller @ 2007-11-05 4:24 UTC (permalink / raw)
To: ak; +Cc: dada1, netdev, acme, jarkao2

From: Andi Kleen <ak@suse.de>
Date: Mon, 5 Nov 2007 00:01:03 +0100

> On Sunday 04 November 2007 22:56:21 David Miller wrote:
> > From: Andi Kleen <ak@suse.de>
> >
> > This makes a huge difference, as we have to set NR_CPUS to 4096
> > in order to handle the cpu numbering of some UltraSPARC-IV
> > machines.
>
> Really? Hopefully you have a large enough stack then. There
> are various users who put char str[NR_CPUS] on the stack,
> and a few other data structures also get incredibly big with
> NR_CPUS arrays.

For the stack case there is one debugging case, and that's for
sprintf'ing cpusets. That could be easily eliminated.

> If it's for sparse cpu ids -- x86 handles those with a
> translation array.

I would rather not do this; so much assembler code indexes straight
into the per-cpu arrays.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-05 4:24 ` David Miller
@ 2007-11-05 4:35 ` David Miller
0 siblings, 0 replies; 29+ messages in thread
From: David Miller @ 2007-11-05 4:35 UTC (permalink / raw)
To: ak; +Cc: dada1, netdev, acme, jarkao2

From: David Miller <davem@davemloft.net>
Date: Sun, 04 Nov 2007 20:24:54 -0800 (PST)

> From: Andi Kleen <ak@suse.de>
> Date: Mon, 5 Nov 2007 00:01:03 +0100
>
> > If it's for sparse cpu ids -- x86 handles those with a
> > translation array.
>
> I would rather not do this; so much assembler code indexes straight
> into the per-cpu arrays.

Also, at current rates, I'll need to be able to support 4096 cpus for
real not very long from now :-)

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 11:31 ` Eric Dumazet
2007-11-04 12:26 ` Andi Kleen
@ 2007-11-04 17:58 ` Jarek Poplawski
2007-11-04 18:15 ` Jarek Poplawski
2007-11-07 10:41 ` David Miller
2 siblings, 1 reply; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-04 17:58 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, ak, netdev, acme

Eric Dumazet wrote, On 11/04/2007 12:31 PM:

> David Miller a écrit :
>> From: Andi Kleen <ak@suse.de>
>> Date: Sun, 4 Nov 2007 00:18:14 +0100
>>
>>> On Thursday 01 November 2007 11:16:20 Eric Dumazet wrote:

...

>>> Also the EHASH_LOCK_SZ == 0 special case is a little strange. Why did
>>> you add that?
>> He explained this in another reply, because ifdefs are ugly.

But I hope he was only joking, wasn't he? Let's make it clear: ifdefs
are in K&R, so they are very nice! Just like all C! (K, &, and R as
well.) You know, I can even imagine there are people who keep K&R by
their beds instead of some other book, so they could be serious about
such things. (But don't worry, it's not me - happily, I'm not serious!)

This patch looks OK now, but a bit of grumbling shouldn't harm?:

...

> [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
>
> As done two years ago on IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle

- litle
+ little

...

> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
> +{
> +	unsigned int i, size = 256;
> +#if defined(CONFIG_PROVE_LOCKING)
> +	unsigned int nr_pcpus = 2;
> +#else
> +	unsigned int nr_pcpus = num_possible_cpus();
> +#endif
> +	if (nr_pcpus >= 4)
> +		size = 512;
> +	if (nr_pcpus >= 8)
> +		size = 1024;
> +	if (nr_pcpus >= 16)
> +		size = 2048;
> +	if (nr_pcpus >= 32)
> +		size = 4096;

It seems, maybe in the future, this could look a bit nicer with some
log type shifting.

> +	if (sizeof(rwlock_t) != 0) {
> +#ifdef CONFIG_NUMA
> +		if (size * sizeof(rwlock_t) > PAGE_SIZE)
> +			hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t));
> +		else
> +#endif
> +			hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t),
> +							GFP_KERNEL);
> +		if (!hashinfo->ehash_locks)
> +			return ENOMEM;

Probably doesn't matter now, but maybe more common?:

			return -ENOMEM;

> +		for (i = 0; i < size; i++)
> +			rwlock_init(&hashinfo->ehash_locks[i]);

This looks better now, but is still doubtful to me: even if it's safe
with the current rwlock implementation, can't we imagine some new
debugging or statistical code being added, which would be called from
rwlock_init() without using the rwlock_t structure? IMHO, if
read_lock() etc. are called in such a case, rwlock_init() should be
done as well.

Regards,
Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
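(For what it's worth, the "log type shifting" Jarek alludes to could look like the sketch below, which computes exactly the same 256/512/1024/2048/4096 progression as the cascade of tests quoted above for every possible nr_pcpus value. This is only an illustration of the suggestion, not code from the thread.)

static unsigned int ehash_locks_size(unsigned int nr_pcpus)
{
	unsigned int size = 256;

	if (nr_pcpus > 32)
		nr_pcpus = 32;	/* same cap as the explicit cascade */

	/* one doubling of the lock table per doubling of CPUs past 2 */
	while (nr_pcpus >= 4) {
		size <<= 1;
		nr_pcpus >>= 1;
	}
	return size;
}

For any nr_pcpus the loop returns what the if chain returns, since both effectively round the CPU count down to a power of two before scaling the table size.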
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 17:58 ` Jarek Poplawski
@ 2007-11-04 18:15 ` Jarek Poplawski
2007-11-04 21:23 ` Eric Dumazet
0 siblings, 1 reply; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-04 18:15 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Eric Dumazet, David Miller, ak, netdev, acme

Jarek Poplawski wrote, On 11/04/2007 06:58 PM:

> Eric Dumazet wrote, On 11/04/2007 12:31 PM:

...

>> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
>> +{

...

>> +	if (sizeof(rwlock_t) != 0) {

...

>> +		for (i = 0; i < size; i++)
>> +			rwlock_init(&hashinfo->ehash_locks[i]);
>
> This looks better now, but is still doubtful to me: even if it's safe
> with the current rwlock implementation, can't we imagine some new
> debugging or statistical code being added, which would be called from
> rwlock_init() without using the rwlock_t structure? IMHO, if
> read_lock() etc. are called in such a case, rwlock_init() should be
> done as well.

Of course, I mean: if sizeof(rwlock_t) == 0.

Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 18:15 ` Jarek Poplawski
@ 2007-11-04 21:23 ` Eric Dumazet
2007-11-04 23:08 ` Jarek Poplawski
0 siblings, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2007-11-04 21:23 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: David Miller, ak, netdev, acme

Jarek Poplawski a écrit :
> Jarek Poplawski wrote, On 11/04/2007 06:58 PM:
>
>> Eric Dumazet wrote, On 11/04/2007 12:31 PM:
>
> ...
>
>>> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
>>> +{
>
> ...
>
>>> +	if (sizeof(rwlock_t) != 0) {
>
> ...
>
>>> +		for (i = 0; i < size; i++)
>>> +			rwlock_init(&hashinfo->ehash_locks[i]);
>>
>> This looks better now, but is still doubtful to me: even if it's safe
>> with the current rwlock implementation, can't we imagine some new
>> debugging or statistical code being added, which would be called from
>> rwlock_init() without using the rwlock_t structure? IMHO, if
>> read_lock() etc. are called in such a case, rwlock_init() should be
>> done as well.
>
> Of course, I mean: if sizeof(rwlock_t) == 0.

Given these two choices:

#if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
	kmalloc(sizeof(rwlock_t) * size);
#endif

and

	if (sizeof(rwlock_t) != 0) {
		kmalloc(sizeof(rwlock_t) * size);
	}

I prefer the second one: it is less error prone, and there is no need
to remember how the gazillions of CONFIG_something options we have are
spelled.

^ permalink raw reply	[flat|nested] 29+ messages in thread
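(The trick behind the second form is worth spelling out: with CONFIG_SMP and lock debugging both disabled, rwlock_t has no members, and gcc, as a C extension the kernel relies on, gives such empty structures size zero. The test is then a compile-time constant false and the whole allocation branch is discarded. A contrived user-space illustration of the mechanism, assuming the gcc empty-struct extension:)

#include <stdio.h>

struct up_lock { };			/* like UP rwlock_t: sizeof is 0 under gcc */
struct smp_lock { volatile int cnt; };	/* like a real SMP rwlock */

int main(void)
{
	/* sizeof is a constant expression, so each branch is resolved at
	 * compile time and the dead one generates no object code at all. */
	if (sizeof(struct up_lock) != 0)
		printf("UP build: would allocate a lock table\n");	/* elided */

	if (sizeof(struct smp_lock) != 0)
		printf("SMP build: %zu bytes per lock\n",
		       sizeof(struct smp_lock));
	return 0;
}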
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 21:23 ` Eric Dumazet
@ 2007-11-04 23:08 ` Jarek Poplawski
0 siblings, 0 replies; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-04 23:08 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, ak, netdev, acme

Eric Dumazet wrote, On 11/04/2007 10:23 PM:

> Jarek Poplawski a écrit :
>> Jarek Poplawski wrote, On 11/04/2007 06:58 PM:
>>
>>> Eric Dumazet wrote, On 11/04/2007 12:31 PM:
>>
>> ...
>>
>>>> +static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
>>>> +{
>>
>> ...
>>
>>>> +	if (sizeof(rwlock_t) != 0) {
>>
>> ...
>>
>>>> +		for (i = 0; i < size; i++)
>>>> +			rwlock_init(&hashinfo->ehash_locks[i]);
>>> This looks better now, but is still doubtful to me: even if it's safe
>>> with the current rwlock implementation, can't we imagine some new
>>> debugging or statistical code being added, which would be called from
>>> rwlock_init() without using the rwlock_t structure? IMHO, if
>>> read_lock() etc. are called in such a case, rwlock_init() should be
>>> done as well.
>>
>> Of course, I mean: if sizeof(rwlock_t) == 0.
>
> Given these two choices:
>
> #if defined(CONFIG_SMP) || defined(CONFIG_PROVE_LOCKING)
> 	kmalloc(sizeof(rwlock_t) * size);
> #endif
>
> and
>
> 	if (sizeof(rwlock_t) != 0) {
> 		kmalloc(sizeof(rwlock_t) * size);
> 	}
>
> I prefer the second one: it is less error prone, and there is no need
> to remember how the gazillions of CONFIG_something options we have are
> spelled.

I've written that it's better, too. But this could still be improved
(someday), I hope.

Thanks,
Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-04 11:31 ` Eric Dumazet
2007-11-04 12:26 ` Andi Kleen
2007-11-04 17:58 ` Jarek Poplawski
@ 2007-11-07 10:41 ` David Miller
2007-11-07 12:13 ` Jarek Poplawski
2 siblings, 1 reply; 29+ messages in thread
From: David Miller @ 2007-11-07 10:41 UTC (permalink / raw)
To: dada1; +Cc: ak, netdev, acme, jarkao2

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 04 Nov 2007 12:31:28 +0100

> [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
>
> As done two years ago on the IP route cache table (commit
> 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> hash bucket for the huge TCP/DCCP hash tables.
>
> On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for
> little performance difference. (We hit a different cache line for the
> rwlock, but then the bucket cache line has a better sharing factor among
> cpus, since we dirty it less often.) For netstat or ss commands that want
> a full scan of the hash table, we perform fewer memory accesses.
>
> Using a 'small' table of hashed rwlocks should be more than enough to
> provide correct SMP concurrency between different buckets, without using
> too much memory. Sizing of this table depends on num_possible_cpus() and
> various CONFIG settings.
>
> This patch provides some locking abstraction that may ease future work
> using a different model for the TCP/DCCP table.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>

I'm going to push this current version to Linus; the space savings
really justify it, and if we want to refine things further, we can do
that with follow-on work rather than blocking this patch.

Thanks Eric!

^ permalink raw reply	[flat|nested] 29+ messages in thread
* Re: [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
2007-11-07 10:41 ` David Miller
@ 2007-11-07 12:13 ` Jarek Poplawski
0 siblings, 0 replies; 29+ messages in thread
From: Jarek Poplawski @ 2007-11-07 12:13 UTC (permalink / raw)
To: David Miller; +Cc: dada1, ak, netdev, acme

On Wed, Nov 07, 2007 at 02:41:14AM -0800, David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Sun, 04 Nov 2007 12:31:28 +0100
>
> > [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table
> >
> > As done two years ago on the IP route cache table (commit
> > 22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one lock per
> > hash bucket for the huge TCP/DCCP hash tables.
> >
> > On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for
> > little performance difference. (We hit a different cache line for the
> > rwlock, but then the bucket cache line has a better sharing factor among
> > cpus, since we dirty it less often.) For netstat or ss commands that want
> > a full scan of the hash table, we perform fewer memory accesses.
> >
> > Using a 'small' table of hashed rwlocks should be more than enough to
> > provide correct SMP concurrency between different buckets, without using
> > too much memory. Sizing of this table depends on num_possible_cpus() and
> > various CONFIG settings.
> >
> > This patch provides some locking abstraction that may ease future work
> > using a different model for the TCP/DCCP table.
> >
> > Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> > Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
>
> I'm going to push this current version to Linus; the space savings
> really justify it, and if we want to refine things further, we can do
> that with follow-on work rather than blocking this patch.
>
> Thanks Eric!

I hope my remarks didn't block anything?! I've written that it's OK.
So, I'm not sure it's useful or expected, but anyway:

Acked-by: Jarek Poplawski <jarkao2@o2.pl>

Thanks,
Jarek P.

^ permalink raw reply	[flat|nested] 29+ messages in thread
end of thread, other threads:[~2007-11-07 12:09 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-11-01 10:16 [PATCH] INET : removes per bucket rwlock in tcp/dccp ehash table Eric Dumazet
2007-11-01 11:03 ` David Miller
2007-11-01 11:20 ` Arnaldo Carvalho de Melo
2007-11-01 11:15 ` Ilpo Järvinen
2007-11-01 16:06 ` Jarek Poplawski
2007-11-01 18:00 ` Eric Dumazet
2007-11-01 16:14 ` Stephen Hemminger
2007-11-01 17:54 ` Eric Dumazet
2007-11-01 18:48 ` Rick Jones
2007-11-01 19:00 ` Eric Dumazet
2007-11-01 19:17 ` Eric Dumazet
2007-11-01 21:52 ` David Miller
2007-11-01 21:46 ` David Miller
2007-11-03 23:18 ` Andi Kleen
2007-11-03 23:23 ` David Miller
2007-11-04  0:54 ` Andi Kleen
2007-11-04 11:31 ` Eric Dumazet
2007-11-04 12:26 ` Andi Kleen
2007-11-04 13:05 ` Eric Dumazet
2007-11-04 21:56 ` David Miller
2007-11-04 23:01 ` Andi Kleen
2007-11-05  4:24 ` David Miller
2007-11-05  4:35 ` David Miller
2007-11-04 17:58 ` Jarek Poplawski
2007-11-04 18:15 ` Jarek Poplawski
2007-11-04 21:23 ` Eric Dumazet
2007-11-04 23:08 ` Jarek Poplawski
2007-11-07 10:41 ` David Miller
2007-11-07 12:13 ` Jarek Poplawski
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).