* [PATCH 3/3] Convert the UDP hash lock to RCU
@ 2008-10-06 18:50 Corey Minyard
  2008-10-06 21:22 ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-06 18:50 UTC (permalink / raw)
  To: Linux Kernel, netdev; +Cc: shemminger, paulmck

Change the UDP hash lock from an rwlock to RCU.

Signed-off-by: Corey Minyard <cminyard@mvista.com>
---
 include/net/udp.h |    9 +++++----
 net/ipv4/udp.c    |   47 +++++++++++++++++++++++++++--------------------
 net/ipv6/udp.c    |   17 +++++++++--------
 3 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index addcdc6..35aa104 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -51,7 +51,7 @@ struct udp_skb_cb {
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
 extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-extern rwlock_t udp_hash_lock;
+extern spinlock_t udp_hash_wlock;
 
 
 /* Note: this must match 'valbool' in sock_setsockopt */
@@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
 
 static inline void udp_lib_unhash(struct sock *sk)
 {
-	write_lock_bh(&udp_hash_lock);
-	if (sk_del_node_init(sk)) {
+	spin_lock_bh(&udp_hash_wlock);
+	if (sk_del_node_init_rcu(sk)) {
 		inet_sk(sk)->num = 0;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
-	write_unlock_bh(&udp_hash_lock);
+	spin_unlock_bh(&udp_hash_wlock);
+	synchronize_rcu();
 }
 
 static inline void udp_lib_close(struct sock *sk, long timeout)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 57e26fa..1b65cb6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -112,7 +112,8 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
 EXPORT_SYMBOL(udp_stats_in6);
 
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-DEFINE_RWLOCK(udp_hash_lock);
+DEFINE_SPINLOCK(udp_hash_wlock);
+EXPORT_SYMBOL(udp_hash_wlock);
 
 int sysctl_udp_mem[3] __read_mostly;
 int sysctl_udp_rmem_min __read_mostly;
@@ -155,7 +156,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	int    error = 1;
 	struct net *net = sock_net(sk);
 
-	write_lock_bh(&udp_hash_lock);
+	spin_lock_bh(&udp_hash_wlock);
 
 	if (!snum) {
 		int i, low, high, remaining;
@@ -225,12 +226,12 @@ gotit:
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
 		head = &udptable[udp_hashfn(net, snum)];
-		sk_add_node(sk, head);
+		sk_add_node_rcu(sk, head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	error = 0;
 fail:
-	write_unlock_bh(&udp_hash_lock);
+	spin_unlock_bh(&udp_hash_wlock);
 	return error;
 }
 
@@ -260,8 +261,8 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	unsigned short hnum = ntohs(dport);
 	int badness = -1;
 
-	read_lock(&udp_hash_lock);
-	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
+	rcu_read_lock();
+	sk_for_each_rcu(sk, node, &udptable[udp_hashfn(net, hnum)]) {
 		struct inet_sock *inet = inet_sk(sk);
 
 		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
@@ -296,9 +297,17 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 			}
 		}
 	}
+	/*
+	 * Note that this is safe, even with an RCU lock.
+	 * udp_lib_unhash() is the removal function, it calls
+	 * synchronize_rcu() and the socket counter cannot go to
+	 * zero until it returns.  So if we increment it inside the
+	 * RCU read lock, it should never go to zero and then be
+	 * incremented again.
+	 */
 	if (result)
 		sock_hold(result);
-	read_unlock(&udp_hash_lock);
+	rcu_read_unlock();
 	return result;
 }
 
@@ -311,7 +320,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
 	struct sock *s = sk;
 	unsigned short hnum = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_for_each_from_rcu(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (s->sk_hash != hnum					||
@@ -1094,8 +1103,8 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	struct sock *sk;
 	int dif;
 
-	read_lock(&udp_hash_lock);
-	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+	rcu_read_lock();
+	sk = sk_head_rcu(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (sk) {
@@ -1104,8 +1113,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		do {
 			struct sk_buff *skb1 = skb;
 
-			sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
-						   uh->source, saddr, dif);
+			sknext = udp_v4_mcast_next(sk_next_rcu(sk), uh->dest,
+						   daddr, uh->source, saddr,
+						   dif);
 			if (sknext)
 				skb1 = skb_clone(skb, GFP_ATOMIC);
 
@@ -1120,7 +1130,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		} while (sknext);
 	} else
 		kfree_skb(skb);
-	read_unlock(&udp_hash_lock);
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -1543,13 +1553,13 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	struct net *net = seq_file_net(seq);
 
 	do {
-		sk = sk_next(sk);
+		sk = sk_next_rcu(sk);
 try_again:
 		;
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
-		sk = sk_head(state->hashtable + state->bucket);
+		sk = sk_head_rcu(state->hashtable + state->bucket);
 		goto try_again;
 	}
 	return sk;
@@ -1566,9 +1576,8 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(udp_hash_lock)
 {
-	read_lock(&udp_hash_lock);
+	rcu_read_lock();
 	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
 
@@ -1586,9 +1595,8 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void udp_seq_stop(struct seq_file *seq, void *v)
-	__releases(udp_hash_lock)
 {
-	read_unlock(&udp_hash_lock);
+	rcu_read_unlock();
 }
 
 static int udp_seq_open(struct inode *inode, struct file *file)
@@ -1732,7 +1740,6 @@ void __init udp_init(void)
 
 EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
-EXPORT_SYMBOL(udp_hash_lock);
 EXPORT_SYMBOL(udp_ioctl);
 EXPORT_SYMBOL(udp_prot);
 EXPORT_SYMBOL(udp_sendmsg);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a6aecf7..b807de7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -64,8 +64,8 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 	unsigned short hnum = ntohs(dport);
 	int badness = -1;
 
-	read_lock(&udp_hash_lock);
-	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
+	rcu_read_lock();
+	sk_for_each_rcu(sk, node, &udptable[udp_hashfn(net, hnum)]) {
 		struct inet_sock *inet = inet_sk(sk);
 
 		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
@@ -101,9 +101,10 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 			}
 		}
 	}
+	/* See comment in __udp4_lib_lookup on why this is safe. */
 	if (result)
 		sock_hold(result);
-	read_unlock(&udp_hash_lock);
+	rcu_read_unlock();
 	return result;
 }
 
@@ -322,7 +323,7 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
 	struct sock *s = sk;
 	unsigned short num = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_for_each_from_rcu(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (sock_net(s) != sock_net(sk))
@@ -365,8 +366,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	const struct udphdr *uh = udp_hdr(skb);
 	int dif;
 
-	read_lock(&udp_hash_lock);
-	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+	rcu_read_lock();
+	sk = sk_head_rcu(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
 	dif = inet6_iif(skb);
 	sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (!sk) {
@@ -375,7 +376,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	}
 
 	sk2 = sk;
-	while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr,
+	while ((sk2 = udp_v6_mcast_next(sk_next_rcu(sk2), uh->dest, daddr,
 					uh->source, saddr, dif))) {
 		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
 		if (buff) {
@@ -394,7 +395,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
 out:
-	read_unlock(&udp_hash_lock);
+	rcu_read_unlock();
 	return 0;
 }
 
-- 
1.5.6.5



* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 18:50 [PATCH 3/3] Convert the UDP hash lock to RCU Corey Minyard
@ 2008-10-06 21:22 ` Eric Dumazet
  2008-10-06 21:40   ` David Miller
                     ` (2 more replies)
  0 siblings, 3 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-06 21:22 UTC (permalink / raw)
  To: minyard; +Cc: Linux Kernel, netdev, shemminger, paulmck

Corey Minyard wrote:
> Change the UDP hash lock from an rwlock to RCU.
> 
> Signed-off-by: Corey Minyard <cminyard@mvista.com>
> ---
>  include/net/udp.h |    9 +++++----
>  net/ipv4/udp.c    |   47 +++++++++++++++++++++++++++--------------------
>  net/ipv6/udp.c    |   17 +++++++++--------
>  3 files changed, 41 insertions(+), 32 deletions(-)
> 
> diff --git a/include/net/udp.h b/include/net/udp.h
> index addcdc6..35aa104 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -51,7 +51,7 @@ struct udp_skb_cb {
>  #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
>  
>  extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
> -extern rwlock_t udp_hash_lock;
> +extern spinlock_t udp_hash_wlock;
>  
>  
>  /* Note: this must match 'valbool' in sock_setsockopt */
> @@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
>  
>  static inline void udp_lib_unhash(struct sock *sk)
>  {
> -	write_lock_bh(&udp_hash_lock);
> -	if (sk_del_node_init(sk)) {
> +	spin_lock_bh(&udp_hash_wlock);
> +	if (sk_del_node_init_rcu(sk)) {
>  		inet_sk(sk)->num = 0;
>  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>  	}
> -	write_unlock_bh(&udp_hash_lock);
> +	spin_unlock_bh(&udp_hash_wlock);
> +	synchronize_rcu();

The central UDP rwlock can hurt performance because of cache line ping-pong,
so your patch really makes sense.

I am wondering what impact this synchronize_rcu() can have on single-threaded
VOIP applications using lots of UDP sockets. What is the maximum delay of
this function?

For "struct file" freeing, we chose call_rcu() instead of synchronize_rcu()

Maybe we could add a generic rcu head to struct sock, and use call_rcu() in
sk_prot_free() for sockets needing RCU (UDP after your patch is applied, maybe
TCP in future patches, though I believe previous work on the subject concluded
that RCU was not giving good results for short-lived HTTP sessions)?
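
A minimal sketch of that call_rcu() variant (the sk_rcu member and the free
path shown here are assumptions for illustration, not existing code; the
security and module bookkeeping done by the real free path is omitted):

static void sk_free_rcu(struct rcu_head *head)
{
	/* 'sk_rcu' would be a new rcu_head member added to struct sock */
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct kmem_cache *slab = sk->sk_prot_creator->slab;

	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
}

static void sk_prot_free_rcu(struct sock *sk)
{
	/* defer the actual free until after a grace period, so lockless
	 * readers walking the hash chain never touch freed memory;
	 * unlike synchronize_rcu(), the caller does not block */
	call_rcu(&sk->sk_rcu, sk_free_rcu);
}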

Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() 
done in sk_prot_free() can defer freeing to RCU...


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 21:22 ` Eric Dumazet
@ 2008-10-06 21:40   ` David Miller
  2008-10-06 23:08     ` Corey Minyard
                       ` (2 more replies)
  2008-10-06 22:07   ` Corey Minyard
  2008-10-07  8:17   ` Peter Zijlstra
  2 siblings, 3 replies; 134+ messages in thread
From: David Miller @ 2008-10-06 21:40 UTC (permalink / raw)
  To: dada1; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 06 Oct 2008 23:22:31 +0200

> Me wondering what impact this synchronize_rcu() can have on mono-threaded
> VOIP applications using lot of UDP sockets. What is the maximum delay of
> this function ?

The cost is enormous, we really can't use it here.

I have a patch that did top-level socket destruction using RCU,
and that didn't use synchronize_rcu(), and that killed connection
rates by up to 20%.

I can only imagine what the cost would be if I had to add such a call
in there.

Really, I can't consider these changes seriously, as-is.


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 21:22 ` Eric Dumazet
  2008-10-06 21:40   ` David Miller
@ 2008-10-06 22:07   ` Corey Minyard
  2008-10-07  8:17   ` Peter Zijlstra
  2 siblings, 0 replies; 134+ messages in thread
From: Corey Minyard @ 2008-10-06 22:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Linux Kernel, netdev, shemminger, paulmck

Eric Dumazet wrote:
> Corey Minyard a écrit :
>> Change the UDP hash lock from an rwlock to RCU.
>>
>> Signed-off-by: Corey Minyard <cminyard@mvista.com>
>> ---
>>  include/net/udp.h |    9 +++++----
>>  net/ipv4/udp.c    |   47 
>> +++++++++++++++++++++++++++--------------------
>>  net/ipv6/udp.c    |   17 +++++++++--------
>>  3 files changed, 41 insertions(+), 32 deletions(-)
>>
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index addcdc6..35aa104 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -51,7 +51,7 @@ struct udp_skb_cb {
>>  #define UDP_SKB_CB(__skb)    ((struct udp_skb_cb *)((__skb)->cb))
>>  
>>  extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
>> -extern rwlock_t udp_hash_lock;
>> +extern spinlock_t udp_hash_wlock;
>>  
>>  
>>  /* Note: this must match 'valbool' in sock_setsockopt */
>> @@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
>>  
>>  static inline void udp_lib_unhash(struct sock *sk)
>>  {
>> -    write_lock_bh(&udp_hash_lock);
>> -    if (sk_del_node_init(sk)) {
>> +    spin_lock_bh(&udp_hash_wlock);
>> +    if (sk_del_node_init_rcu(sk)) {
>>          inet_sk(sk)->num = 0;
>>          sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>>      }
>> -    write_unlock_bh(&udp_hash_lock);
>> +    spin_unlock_bh(&udp_hash_wlock);
>> +    synchronize_rcu();
>
> UDP central rwlock can hurt performance, because of cache line ping pong,
> so your patch really makes sense.
>
> Me wondering what impact this synchronize_rcu() can have on mono-threaded
> VOIP applications using lot of UDP sockets. What is the maximum delay of
> this function ?
It delays until all currently executing RCU read-side critical sections have 
completed (new ones don't count, just currently executing ones).  I'm not 
sure how long this delay is, but I would expect it to be fairly small.  This 
function is only called when a socket is closed, too, so it's not a 
high-runner.  Paul would certainly know better than me.

>
> For "struct file" freeing, we chose call_rcu() instead of 
> synchronize_rcu()
I'd prefer that, too, but that would mean adding another member to the 
socket structure.

>
> Maybe we could add a generic rcu head to struct sock, and use 
> call_rcu() in
> sk_prot_free() for sockets needing RCU (udp after your patch is 
> applied, maybe
> tcp on future patches, while I believe previous work on the subject 
> concluded
> RCU was not giving good results for short lived http sessions) ?
RCU probably wouldn't be a good choice for short-lived http sessions, 
since you will only get a couple of messages that would matter.  I'm not 
against adding an item to struct sock, but this is not a common thing 
and struct sock was already big and ugly.

>
> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done 
> in sk_prot_free() can defer freeing to RCU...
That's an interesting thought; I didn't know that capability was there.  
I can look at that.  With this, the short-lived TCP sessions might not 
matter, though that's a different issue.

-corey


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 21:40   ` David Miller
@ 2008-10-06 23:08     ` Corey Minyard
  2008-10-07  8:37       ` Evgeniy Polyakov
  2008-10-07  5:24     ` Eric Dumazet
  2008-10-07  8:31     ` Peter Zijlstra
  2 siblings, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-06 23:08 UTC (permalink / raw)
  To: David Miller; +Cc: dada1, linux-kernel, netdev, shemminger, paulmck

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
>
>   
>> Me wondering what impact this synchronize_rcu() can have on mono-threaded
>> VOIP applications using lot of UDP sockets. What is the maximum delay of
>> this function ?
>>     
>
> The cost is enormous, we really can't use it here.
>
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.
>
> I can only imagine what the cost would be if I had to add such a call
> in there.
>
> Really, I can't consider these changes seriously, as-is.
>
>   
Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?

-corey


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 21:40   ` David Miller
  2008-10-06 23:08     ` Corey Minyard
@ 2008-10-07  5:24     ` Eric Dumazet
  2008-10-07  8:54       ` Benny Amorsen
  2008-10-07 18:26       ` David Miller
  2008-10-07  8:31     ` Peter Zijlstra
  2 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-07  5:24 UTC (permalink / raw)
  To: David Miller; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
> 
>> Me wondering what impact this synchronize_rcu() can have on mono-threaded
>> VOIP applications using lot of UDP sockets. What is the maximum delay of
>> this function ?
> 
> The cost is enormous, we really can't use it here.
> 
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.
> 
> I can only imagine what the cost would be if I had to add such a call
> in there.
> 
> Really, I can't consider these changes seriously, as-is.

Yes, I suppose you are right about TCP sessions; those should stay as they are.

Then if we use RCU freeing via call_rcu() only for UDP sockets, we would get rid of
taking this rwlock each time we handle an incoming datagram, and introduce
no extra cost for other sockets.

Most UDP sockets are set up for long periods (RTP traffic), or if an application really
wants to {open / send or receive one UDP frame / close} many sockets, it already hits
the RCU handling of its file structures and should not be slowed down that much.

By 'long period' I mean thousands of packets sent/received by each RTP session, whether
voice (50 packets/second) or, even worse, video...






* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 21:22 ` Eric Dumazet
  2008-10-06 21:40   ` David Miller
  2008-10-06 22:07   ` Corey Minyard
@ 2008-10-07  8:17   ` Peter Zijlstra
  2008-10-07  9:24     ` Eric Dumazet
  2 siblings, 1 reply; 134+ messages in thread
From: Peter Zijlstra @ 2008-10-07  8:17 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: minyard, Linux Kernel, netdev, shemminger, paulmck

On Mon, 2008-10-06 at 23:22 +0200, Eric Dumazet wrote:

> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() 
> done in sk_prot_free() can defer freeing to RCU...

Be careful! SLAB_DESTROY_BY_RCU just means the slab page gets
RCU-freed; this means that slab object pointers stay pointing to valid
memory, but it does _NOT_ mean those slab objects themselves remain
valid.

The slab allocator is free to re-use those objects at any time,
irrespective of the RCU grace period. Therefore you will have to be able
to validate that the object you point to is indeed the object you
expect, otherwise strange and wonderful things will happen.
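
Concretely, the flag itself would go in at cache creation time, roughly
like this (a sketch only, not the actual proto_register() code):

	/* Only the underlying slab *pages* are RCU-deferred; individual
	 * objects may be recycled immediately, which is why every lookup
	 * has to revalidate whatever object it finds. */
	prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
				       SLAB_HWCACHE_ALIGN |
				       SLAB_DESTROY_BY_RCU,
				       NULL);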


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 21:40   ` David Miller
  2008-10-06 23:08     ` Corey Minyard
  2008-10-07  5:24     ` Eric Dumazet
@ 2008-10-07  8:31     ` Peter Zijlstra
  2008-10-07 14:36       ` Paul E. McKenney
  2008-10-07 18:29       ` David Miller
  2 siblings, 2 replies; 134+ messages in thread
From: Peter Zijlstra @ 2008-10-07  8:31 UTC (permalink / raw)
  To: David Miller; +Cc: dada1, minyard, linux-kernel, netdev, shemminger, paulmck

On Mon, 2008-10-06 at 14:40 -0700, David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
> 
> > Me wondering what impact this synchronize_rcu() can have on mono-threaded
> > VOIP applications using lot of UDP sockets. What is the maximum delay of
> > this function ?
> 
> The cost is enormous, we really can't use it here.
> 
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.

Did you ever figure out why you lost that 20%?

> I can only imagine what the cost would be if I had to add such a call
> in there.

Yeah, sync_rcu() is ridiculously expensive, at best 3 jiffies IIRC.



* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-06 23:08     ` Corey Minyard
@ 2008-10-07  8:37       ` Evgeniy Polyakov
  2008-10-07 14:16         ` Christoph Lameter
  0 siblings, 1 reply; 134+ messages in thread
From: Evgeniy Polyakov @ 2008-10-07  8:37 UTC (permalink / raw)
  To: Corey Minyard
  Cc: David Miller, dada1, linux-kernel, netdev, shemminger, paulmck

On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@acm.org) wrote:
> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?

I tested skb destruction via the RCU path, and got 2.5 times worse numbers
with a small-packets bulk-transfer workload.

For more details
http://tservice.net.ru/~s0mbre/blog/devel/networking/2006_12_05_1.html

-- 
	Evgeniy Polyakov


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  5:24     ` Eric Dumazet
@ 2008-10-07  8:54       ` Benny Amorsen
  2008-10-07 12:59         ` Eric Dumazet
  2008-10-07 18:26       ` David Miller
  1 sibling, 1 reply; 134+ messages in thread
From: Benny Amorsen @ 2008-10-07  8:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, minyard, linux-kernel, netdev, shemminger, paulmck

Eric Dumazet <dada1@cosmosbay.com> writes:

> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> RCU handling of its file structures and should not be slowed down that much.
>
> By 'long period' I mean thousand of packets sent/received by each RTP session, being
> voice (50 packets/second) or even worse video...

Does DNS with port randomization need short-lived sockets?


/Benny



* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  8:17   ` Peter Zijlstra
@ 2008-10-07  9:24     ` Eric Dumazet
  2008-10-07 14:15       ` Christoph Lameter
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-07  9:24 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: minyard, Linux Kernel, netdev, shemminger, paulmck

Peter Zijlstra wrote:
> On Mon, 2008-10-06 at 23:22 +0200, Eric Dumazet wrote:
> 
>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() 
>> done in sk_prot_free() can defer freeing to RCU...
> 
> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
> RCU-freed, this means that slab object pointers stay pointing to valid
> memory, but it does _NOT_ mean those slab objects themselves remain
> valid.
> 
> The slab allocator is free to re-use those objects at any time -
> irrespective of the rcu-grace period. Therefore you will have to be able
> to validate that the object you point to is indeed the object you
> expect, otherwise strange and wonderful things will happen.
> 
Thanks for this clarification. I guess we really need an rcu head then :)





* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  8:54       ` Benny Amorsen
@ 2008-10-07 12:59         ` Eric Dumazet
  2008-10-07 14:07           ` Stephen Hemminger
  2008-10-07 16:43           ` [PATCH 3/3] " Corey Minyard
  0 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-07 12:59 UTC (permalink / raw)
  To: Benny Amorsen
  Cc: David Miller, minyard, linux-kernel, netdev, shemminger, paulmck

Benny Amorsen wrote:
> Eric Dumazet <dada1@cosmosbay.com> writes:
> 
>> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
>> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
>> RCU handling of its file structures and should not be slowed down that much.
>>

I should have said 'Many' instead of 'Most' :)

>> By 'long period' I mean thousand of packets sent/received by each RTP session, being
>> voice (50 packets/second) or even worse video...
> 
> Does DNS with port randomization need short lived sockets?
> 

Yes, very true, but the current allocation of a random port can be very expensive,
since we scan the whole UDP hash table to select the smallest hash chain.

We stop the scan if we find an empty slot, but on machines with, say, more than 200
bound UDP sockets, there are probably no empty slots (UDP_HTABLE_SIZE is 128).

The bind(NULL port) algorithm is then O(N), N being the number of bound UDP sockets.

So heavy DNS servers/proxies probably use a pool/range of pre-allocated sockets
to avoid the cost of allocating/freeing them? If they don't care about that cost,
the extra call_rcu() will go unnoticed.

For pathological (yet very common :) ) cases like a single DNS query/answer, RCU
would mean:

Pros:
- one fewer rwlock hit when receiving the answer (if any)
Cons:
- one call_rcu() to delay socket freeing/reuse until after the RCU grace period.

So it might be a little bit more expensive than without RCU.

I agree I am more interested in optimizing the UDP stack for heavy users like RTP 
servers/proxies handling xxx.000 packets/second than for DNS users/servers.
Shame on me :)

(Two weeks ago, Corey mentioned a 10x increase in UDP throughput on a 16-way machine;
that sounds promising.)







* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 12:59         ` Eric Dumazet
@ 2008-10-07 14:07           ` Stephen Hemminger
  2008-10-07 20:55             ` David Miller
  2008-10-07 16:43           ` [PATCH 3/3] " Corey Minyard
  1 sibling, 1 reply; 134+ messages in thread
From: Stephen Hemminger @ 2008-10-07 14:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Benny Amorsen, David Miller, minyard, linux-kernel, netdev,
	paulmck

On Tue, 07 Oct 2008 14:59:20 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Benny Amorsen wrote:
> > Eric Dumazet <dada1@cosmosbay.com> writes:
> > 
> >> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> >> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> >> RCU handling of its file structures and should not be slowed down that much.
> >>
> 
> I should have say 'Many' instead of 'Most' :)
> 
> >> By 'long period' I mean thousand of packets sent/received by each RTP session, being
> >> voice (50 packets/second) or even worse video...
> > 
> > Does DNS with port randomization need short lived sockets?
> > 
> 
> Yes very true, but current allocation of a random port can be very expensive, 
> since we scan all the UDP hash table to select the smaller hash chain.
> 
> We stop the scan if we find an empty slot, but on machines with say more than 200
> bound UDP sockets, they are probably no empty slots. (UDP_HTABLE_SIZE is 128)
> 
> bind(NULL port) algo is then O(N), N being number of bound UDP sockets.
> 
> So heavy DNS servers/proxies probably use a pool/range of pre-allocated sockets
> to avoid costs of allocating/freeing them ? If they dont care about that cost,
> the extra call_rcu() will be unnoticed.
> 
> For pathological (yet very common :) ) cases like single DNS query/answer, RCU
> would mean :
> 
> Pros :
> - one few rwlock hit when receiving the answer (if any)
> Cons :
> - one call_rcu() to delay socket freeing/reuse after RCU period.
> 
> So it might be a litle bit more expensive than without RCU
> 
> I agree I am more interested in optimizing UDP stack for heavy users like RTP 
> servers/proxies handling xxx.000 packets/second than DNS users/servers.
> Shame on me :)
> 
> (2 weeks ago, Corey mentioned a 10x increase on UDP throughput on a 16-way machine,
> that sounds promising)

The idea of keeping chains short is the problem. That code should just be pulled because
it doesn't help that much, and it also creates bias in the port randomization.


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  9:24     ` Eric Dumazet
@ 2008-10-07 14:15       ` Christoph Lameter
  2008-10-07 14:38         ` Paul E. McKenney
  2008-10-07 14:50         ` Eric Dumazet
  0 siblings, 2 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, minyard, Linux Kernel, netdev, shemminger,
	paulmck

Eric Dumazet wrote:
>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
>>> in sk_prot_free() can defer freeing to RCU...
>>
>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
>> RCU-freed, this means that slab object pointers stay pointing to valid
>> memory, but it does _NOT_ mean those slab objects themselves remain
>> valid.
>>
>> The slab allocator is free to re-use those objects at any time -
>> irrespective of the rcu-grace period. Therefore you will have to be able
>> to validate that the object you point to is indeed the object you
>> expect, otherwise strange and wonderful things will happen.
>>
> Thanks for this clarification. I guess we really need a rcu head then :)

No, you just need to make sure that the object you located is still active
(e.g. refcount > 0) and that it is really a match (hash pointers may be
updated asynchronously and therefore point to an object that has been reused
for something else).

Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
cache-hot advantages of the objects. Regular RCU freeing will let the object
sit idle for a tick or so, which will result in the cache line cooling down.




* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  8:37       ` Evgeniy Polyakov
@ 2008-10-07 14:16         ` Christoph Lameter
  2008-10-07 14:29           ` Evgeniy Polyakov
  2008-10-07 14:33           ` Paul E. McKenney
  0 siblings, 2 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:16 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Corey Minyard, David Miller, dada1, linux-kernel, netdev,
	shemminger, paulmck

Evgeniy Polyakov wrote:
> On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@acm.org) wrote:
>> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?
> 
> I tested skb destruction via RCU path, and got 2.5 times worse numbers
> with small-packets-bulk-transfer workload.

Was this with regular RCU freeing? That will cool down the cache line before
the free. You need SLAB_DESTROY_BY_RCU to keep the objects cache-hot.


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:16         ` Christoph Lameter
@ 2008-10-07 14:29           ` Evgeniy Polyakov
  2008-10-07 14:38             ` Christoph Lameter
  2008-10-07 14:33           ` Paul E. McKenney
  1 sibling, 1 reply; 134+ messages in thread
From: Evgeniy Polyakov @ 2008-10-07 14:29 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Corey Minyard, David Miller, dada1, linux-kernel, netdev,
	shemminger, paulmck

On Tue, Oct 07, 2008 at 09:16:13AM -0500, Christoph Lameter (cl@linux-foundation.org) wrote:
> > I tested skb destruction via RCU path, and got 2.5 times worse numbers
> > with small-packets-bulk-transfer workload.
> 
> Was this with regular RCU freeing? This will cool down the cacheline before
> frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.

I believe there was no SLAB_DESTROY_BY_RCU 2 years ago :)

It was pure call_rcu(&skb->rcu, real_skb_freeing), where
real_skb_freeing() just did the usual kfree().

-- 
	Evgeniy Polyakov


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:16         ` Christoph Lameter
  2008-10-07 14:29           ` Evgeniy Polyakov
@ 2008-10-07 14:33           ` Paul E. McKenney
  2008-10-07 14:45             ` Christoph Lameter
  1 sibling, 1 reply; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-07 14:33 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Evgeniy Polyakov, Corey Minyard, David Miller, dada1,
	linux-kernel, netdev, shemminger

On Tue, Oct 07, 2008 at 09:16:13AM -0500, Christoph Lameter wrote:
> Evgeniy Polyakov wrote:
> > On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@acm.org) wrote:
> >> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?
> > 
> > I tested skb destruction via RCU path, and got 2.5 times worse numbers
> > with small-packets-bulk-transfer workload.
> 
> Was this with regular RCU freeing? This will cool down the cacheline before
> frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.

Indeed!

But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
and reallocated while a reader holds a reference.  The only guarantee is
that the -type- of the data structure will not change while a reader holds
a reference.  With something like UDP, this might well be sufficient.

Just be careful!  ;-)

							Thanx, Paul


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  8:31     ` Peter Zijlstra
@ 2008-10-07 14:36       ` Paul E. McKenney
  2008-10-07 18:29       ` David Miller
  1 sibling, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-07 14:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Miller, dada1, minyard, linux-kernel, netdev, shemminger

On Tue, Oct 07, 2008 at 10:31:30AM +0200, Peter Zijlstra wrote:
> On Mon, 2008-10-06 at 14:40 -0700, David Miller wrote:
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Mon, 06 Oct 2008 23:22:31 +0200
> > 
> > > Me wondering what impact this synchronize_rcu() can have on mono-threaded
> > > VOIP applications using lot of UDP sockets. What is the maximum delay of
> > > this function ?
> > 
> > The cost is enormous, we really can't use it here.
> > 
> > I have a patch that did top-level socket destruction using RCU,
> > and that didn't use synchronize_rcu(), and that killed connection
> > rates by up to %20.
> 
> Did you ever figure out why you lost those 20% ?
> 
> > I can only imagine what the cost would be if I had to add such a call
> > in there.
> 
> Yeah, sync_rcu() is rediculously expensive, at best 3 jiffies IIRC.

I could make it -much- faster, but at the expense of -serious- CPU
overhead.  Still, might be useful during boot time (when the system
can't do anything useful anyway) to accelerate getting data structures
initialized.

							Thanx, Paul


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:15       ` Christoph Lameter
@ 2008-10-07 14:38         ` Paul E. McKenney
  2008-10-07 14:50         ` Eric Dumazet
  1 sibling, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-07 14:38 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, Peter Zijlstra, minyard, Linux Kernel, netdev,
	shemminger

On Tue, Oct 07, 2008 at 09:15:00AM -0500, Christoph Lameter wrote:
> Eric Dumazet wrote:
> >>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> >>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
> >>> in sk_prot_free() can defer freeing to RCU...
> >>
> >> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
> >> RCU-freed, this means that slab object pointers stay pointing to valid
> >> memory, but it does _NOT_ mean those slab objects themselves remain
> >> valid.
> >>
> >> The slab allocator is free to re-use those objects at any time -
> >> irrespective of the rcu-grace period. Therefore you will have to be able
> >> to validate that the object you point to is indeed the object you
> >> expect, otherwise strange and wonderful things will happen.
> >>
> > Thanks for this clarification. I guess we really need a rcu head then :)
> 
> No you just need to make sure that the object you located is still active
> (f.e. refcount > 0) and that it is really a match (hash pointers may be
> updated asynchronously and therefore point to the object that has been reused
> for something else).

In some cases, you might be able to not care, but yes, most of the time,
you will need to validate the object.

> Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
> cache hot advantages of the objects. Regular RCU freeing will let the object
> expire for a tick or so which will result in the cacheline cooling down.

And SLAB_DESTROY_BY_RCU guarantees that the type of the object will
remain the same during any given RCU read-side critical section.

							Thanx, Paul


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:29           ` Evgeniy Polyakov
@ 2008-10-07 14:38             ` Christoph Lameter
  0 siblings, 0 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:38 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Corey Minyard, David Miller, dada1, linux-kernel, netdev,
	shemminger, paulmck, Hugh Dickins

Evgeniy Polyakov wrote:
> On Tue, Oct 07, 2008 at 09:16:13AM -0500, Christoph Lameter (cl@linux-foundation.org) wrote:
>>> I tested skb destruction via RCU path, and got 2.5 times worse numbers
>>> with small-packets-bulk-transfer workload.
>> Was this with regular RCU freeing? This will cool down the cacheline before
>> frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.
> 
> I believe there were no SLAB_DESTROY_BY_RCU 2 yars ago :)

It's been a while. Hugh created it.

> It was pure call_rcu(&skb->rcu, real_skb_freeing), where
> real_skb_freeing() just did usual kfree().

Right. That results in cache-line cooldown. You'd want to recycle the objects while
they are cache-hot on a per-CPU basis. That is screwed up by the delayed
regular RCU processing. We have seen multiple regressions due to cache-line
cooldown.

The only choice in cache-hot-sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU.


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:33           ` Paul E. McKenney
@ 2008-10-07 14:45             ` Christoph Lameter
  2008-10-07 15:07               ` Eric Dumazet
  2008-10-07 15:07               ` Paul E. McKenney
  0 siblings, 2 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:45 UTC (permalink / raw)
  To: paulmck
  Cc: Evgeniy Polyakov, Corey Minyard, David Miller, dada1,
	linux-kernel, netdev, shemminger

Paul E. McKenney wrote:

> But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
> and reallocated while a reader holds a reference.  The only guarantee is
> that the -type- of the data structure will not change while a reader holds
> a reference.  With something like UDP, this might well be sufficient.

Right, so after the hash lookup operation you are not assured that the object
has not been freed or even reallocated for a different purpose. So after
finding the pointer to the object, two things need to happen (under the RCU read lock):

1. Verify that the object is still in use
2. Verify that the object still matches the hash

If not, the operation needs to be redone because we have a stale hash pointer.
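
A rough sketch of what that revalidation could look like for the UDP
lookup (the chain-walk helper and the exact key check are illustrative
assumptions, not final code):

static struct sock *udp_lookup_revalidate(struct net *net, __be32 daddr,
					  unsigned short hnum)
{
	struct sock *sk;

	rcu_read_lock();
begin:
	/* __udp_chain_lookup() stands in for a walk of the hash chain
	 * with sk_for_each_rcu() that returns the best match */
	sk = __udp_chain_lookup(net, daddr, hnum);
	if (sk) {
		/* 1. still in use?  a zero refcount means the socket is
		 *    already being freed, so this match is stale */
		if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
			goto begin;
		/* 2. still a match?  the object may have been recycled
		 *    for a different binding meanwhile, so recheck the
		 *    key(s) after taking the reference */
		if (unlikely(sk->sk_hash != hnum)) {
			sock_put(sk);
			goto begin;
		}
	}
	rcu_read_unlock();
	return sk;
}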


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:15       ` Christoph Lameter
  2008-10-07 14:38         ` Paul E. McKenney
@ 2008-10-07 14:50         ` Eric Dumazet
  2008-10-07 15:05           ` Paul E. McKenney
                             ` (2 more replies)
  1 sibling, 3 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-07 14:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Peter Zijlstra, minyard, Linux Kernel, netdev, shemminger,
	paulmck

Christoph Lameter wrote:
> Eric Dumazet wrote:
>>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
>>>> in sk_prot_free() can defer freeing to RCU...
>>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
>>> RCU-freed, this means that slab object pointers stay pointing to valid
>>> memory, but it does _NOT_ mean those slab objects themselves remain
>>> valid.
>>>
>>> The slab allocator is free to re-use those objects at any time -
>>> irrespective of the rcu-grace period. Therefore you will have to be able
>>> to validate that the object you point to is indeed the object you
>>> expect, otherwise strange and wonderful things will happen.
>>>
>> Thanks for this clarification. I guess we really need a rcu head then :)
> 
> No you just need to make sure that the object you located is still active
> (f.e. refcount > 0) and that it is really a match (hash pointers may be
> updated asynchronously and therefore point to the object that has been reused
> for something else).
> 
> Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
> cache hot advantages of the objects. Regular RCU freeing will let the object
> expire for a tick or so which will result in the cacheline cooling down.

It seems really worthwhile to master this SLAB_DESTROY_BY_RCU thing (I see almost no use
of it in the current kernel).

1) Hmm, do you know why "struct file" objects don't use SLAB_DESTROY_BY_RCU then,
since we noticed a performance regression for several workloads at the RCUification
of file structures?

2) What prevents an object from being *freed* (and deleted from a hash chain), then
re-allocated and inserted into another chain (different keys)? (final refcount=1)

If the lookup detects a key mismatch, how will it continue to the next item,
since the 'next' pointer will have been reused for the new chain insertion...

Me confused...





* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:50         ` Eric Dumazet
@ 2008-10-07 15:05           ` Paul E. McKenney
  2008-10-07 15:09           ` Peter Zijlstra
  2008-10-07 15:23           ` Christoph Lameter
  2 siblings, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-07 15:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Christoph Lameter, Peter Zijlstra, minyard, Linux Kernel, netdev,
	shemminger

On Tue, Oct 07, 2008 at 04:50:47PM +0200, Eric Dumazet wrote:
> Christoph Lameter wrote:
>> Eric Dumazet wrote:
>>>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>>>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
>>>>> in sk_prot_free() can defer freeing to RCU...
>>>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
>>>> RCU-freed, this means that slab object pointers stay pointing to valid
>>>> memory, but it does _NOT_ mean those slab objects themselves remain
>>>> valid.
>>>>
>>>> The slab allocator is free to re-use those objects at any time -
>>>> irrespective of the rcu-grace period. Therefore you will have to be able
>>>> to validate that the object you point to is indeed the object you
>>>> expect, otherwise strange and wonderful things will happen.
>>>>
>>> Thanks for this clarification. I guess we really need a rcu head then :)
>> No you just need to make sure that the object you located is still active
>> (f.e. refcount > 0) and that it is really a match (hash pointers may be
>> updated asynchronously and therefore point to the object that has been 
>> reused
>> for something else).
>> Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves 
>> the
>> cache hot advantages of the objects. Regular RCU freeing will let the 
>> object
>> expire for a tick or so which will result in the cacheline cooling down.
>
> Seems really good to master this SLAB_DESTROY_BY_RCU thing (I see almost no 
> use of it in current kernel)

It is not the easiest thing to use...

> 1) Hum, do you know why "struct file" objects dont use SLAB_DESTROY_BY_RCU 
> then, since we noticed a performance regression for several workloads at 
> RCUification of file structures ?
>
> 2) What prevents an object to be *freed* (and deleted from a hash chain), 
> then re-allocated and inserted to another chain (different keys) ? (final 
> refcount=1)

Nothing prevents this from happening.  You either have to have some sort
of validation step based on object identity (e.g., a generation number
that is incremented on each allocation), or have an algorithm that
doesn't care if searches sometimes spuriously fail to find something
that really is in the list.

Which is one of the reasons that its use is rare.  But perhaps more
experience with it will show more/better ways to use it.

> If the lookup detects a key mismatch, how will it continue to the next 
> item, since 'next' pointer will have been reused for the new chain
> insertion...
>
> Me confused...

One way to approach this is to have a generation number that is
incremented each time the object is recycled along with a pointer to the
list header.  The list header contains the most recent generation number
of any element in the list.  Then if either the generation number of
a given element is greater than that of the header when you started the
search, or the element's pointer no longer references the list header
you started your search from, restart the search.  Read-side memory
barriers may also be required in some cases.
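
As a rough sketch of that scheme (all names made up for illustration; the
update side and the memory barriers are omitted):

struct gen_bucket {
	struct hlist_head	chain;
	unsigned long		gen;		/* newest generation inserted here */
};

struct gen_obj {
	struct hlist_node	node;
	struct gen_bucket	*bucket;	/* bucket we were inserted on */
	unsigned long		gen;		/* bumped each time the object
						 * is recycled */
	unsigned long		key;
};

/* caller holds rcu_read_lock() */
static struct gen_obj *gen_lookup(struct gen_bucket *b, unsigned long key)
{
	struct gen_obj *obj;
	struct hlist_node *pos;
	unsigned long start_gen;

restart:
	start_gen = b->gen;			/* snapshot before walking */
	hlist_for_each_entry_rcu(obj, pos, &b->chain, node) {
		if (obj->key != key)
			continue;
		/* Reused since we started (newer generation), or moved to
		 * a different bucket entirely?  The walk is suspect, so
		 * start over from the (stable) bucket head. */
		if (obj->gen > start_gen || obj->bucket != b)
			goto restart;
		return obj;
	}
	return NULL;
}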

It may be possible to simplify this in some special cases.

There are probably better ways to approach this, but that is one way.

						Thanx, Paul


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:45             ` Christoph Lameter
@ 2008-10-07 15:07               ` Eric Dumazet
  2008-10-07 15:07               ` Paul E. McKenney
  1 sibling, 0 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-07 15:07 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: paulmck, Evgeniy Polyakov, Corey Minyard, David Miller,
	linux-kernel, netdev, shemminger

Christoph Lameter wrote:
> Paul E. McKenney wrote:
> 
>> But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
>> and reallocated while a reader holds a reference.  The only guarantee is
>> that the -type- of the data structure will not change while a reader holds
>> a reference.  With something like UDP, this might well be sufficient.
> 
> Right so after the hash lookup operation you are not assured that the object
> has not been freed or even reallocated for a different purpose. So after
> finding the pointer to the object two things need to happen (under rcu_lock):
> 
> 1. Verify that the object is still in use
> 2. Verify that the object is matching the hash
> 
> If not then the operation needs to be redone because we have a stale hash pointer.

OK... so restart the full lookup at the beginning of the hash chain if we detect
that point 1 or 2 is invalid.

Not that expensive, since everything should be cache-hot :)

One has to take care to group all the components (the keys used to compute the hash,
and the *next* pointer) in one cache line to minimize cache misses,
since we now need to access them all to compute/check the hash value.

Now if a freed object is re-inserted with the same hash value, on the
same hash chain, we'll also restart the lookup, but that is harmless.
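
Something like this layout, as a sketch (field names are illustrative,
not the real struct sock layout):

/* Keep the chain linkage and the lookup keys together so the
 * compute/check step of a revalidating reader touches a single cache
 * line per candidate entry. */
struct udp_hot_fields {
	struct hlist_node	node;		/* the 'next' pointer     */
	__be32			rcv_saddr;	/* keys checked on lookup */
	__be32			daddr;
	__be16			sport;
	__be16			dport;
	unsigned short		hash;
} ____cacheline_aligned_in_smp;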






* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:45             ` Christoph Lameter
  2008-10-07 15:07               ` Eric Dumazet
@ 2008-10-07 15:07               ` Paul E. McKenney
  1 sibling, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-07 15:07 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Evgeniy Polyakov, Corey Minyard, David Miller, dada1,
	linux-kernel, netdev, shemminger

On Tue, Oct 07, 2008 at 09:45:43AM -0500, Christoph Lameter wrote:
> Paul E. McKenney wrote:
> 
> > But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
> > and reallocated while a reader holds a reference.  The only guarantee is
> > that the -type- of the data structure will not change while a reader holds
> > a reference.  With something like UDP, this might well be sufficient.
> 
> Right so after the hash lookup operation you are not assured that the object
> has not been freed or even reallocated for a different purpose. So after
> finding the pointer to the object two things need to happen (under rcu_lock):
> 
> 1. Verify that the object is still in use
> 2. Verify that the object is matching the hash
> 
> If not then the operation needs to be redone because we have a stale hash
> pointer.

There is also the possibility that the element will be reused, but placed
in the same list that it resided in last time.  A reader referencing
that item during this process might then be relocated in the list, which
could either cause the reader to skip elements in the list (if the new
element is relocated farther down the list) or endlessly loop through
the list (if new elements were relocated closer to the list head and this
free-reallocate process repeated and the reader was insanely unlucky).

							Thanx, Paul


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:50         ` Eric Dumazet
  2008-10-07 15:05           ` Paul E. McKenney
@ 2008-10-07 15:09           ` Peter Zijlstra
  2008-10-07 15:23           ` Christoph Lameter
  2 siblings, 0 replies; 134+ messages in thread
From: Peter Zijlstra @ 2008-10-07 15:09 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Christoph Lameter, minyard, Linux Kernel, netdev, shemminger,
	paulmck, Hugh Dickins, Nick Piggin

On Tue, 2008-10-07 at 16:50 +0200, Eric Dumazet wrote:
> Christoph Lameter wrote:
> > Eric Dumazet wrote:
> >>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> >>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
> >>>> in sk_prot_free() can defer freeing to RCU...
> >>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
> >>> RCU-freed, this means that slab object pointers stay pointing to valid
> >>> memory, but it does _NOT_ mean those slab objects themselves remain
> >>> valid.
> >>>
> >>> The slab allocator is free to re-use those objects at any time -
> >>> irrespective of the rcu-grace period. Therefore you will have to be able
> >>> to validate that the object you point to is indeed the object you
> >>> expect, otherwise strange and wonderful things will happen.
> >>>
> >> Thanks for this clarification. I guess we really need a rcu head then :)
> > 
> > No you just need to make sure that the object you located is still active
> > (f.e. refcount > 0) and that it is really a match (hash pointers may be
> > updated asynchronously and therefore point to the object that has been reused
> > for something else).
> > 
> > Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
> > cache hot advantages of the objects. Regular RCU freeing will let the object
> > expire for a tick or so which will result in the cacheline cooling down.
> 
> Seems really good to master this SLAB_DESTROY_BY_RCU thing (I see almost no use
> of it in current kernel)

There is (AFAIK) only 1 user, the anon_vma stuff.

> 1) Hum, do you know why "struct file" objects dont use SLAB_DESTROY_BY_RCU then,
> since we noticed a performance regression for several workloads at RCUification
> of file structures ?
> 
> 2) What prevents an object to be *freed* (and deleted from a hash chain), then
> re-allocated and inserted to another chain (different keys) ? (final refcount=1)
> 
> If the lookup detects a key mismatch, how will it continue to the next item,
> since 'next' pointer will have been reused for the new chain insertion...

Right, you can't have lists with items like that. You can only do
matching lookups. What you do is:

find_get_obj(key)
{
 rcu_read_lock()
again:
 obj = lookup(key);
 if (!obj)
  goto out;

 /* 
  * if we can't get a ref, the item got freed concurrently
  * try again
  */
 if (!get_ref_unless_zero(obj))
  goto again;

 /*
  * if we did get a ref, but its not the object we expected
  * try again
  */
 if (obj->key != key) {
   put_ref(obj);
   goto again;
 }
out:
 rcu_read_unlock();
 return obj;
}

Which is basically what we do with the lockless pagecache, where we
don't need the RCU because the page-frames are never freed.


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:50         ` Eric Dumazet
  2008-10-07 15:05           ` Paul E. McKenney
  2008-10-07 15:09           ` Peter Zijlstra
@ 2008-10-07 15:23           ` Christoph Lameter
  2 siblings, 0 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-10-07 15:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, minyard, Linux Kernel, netdev, shemminger,
	paulmck

Eric Dumazet wrote:

> 
> 1) Hum, do you know why "struct file" objects dont use
> SLAB_DESTROY_BY_RCU then,
> since we noticed a performance regression for several workloads at
> RCUification
> of file structures ?

Because my patches that fix the issue were not accepted:
http://lkml.org/lkml/2006/6/16/144


> 2) What prevents an object to be *freed* (and deleted from a hash
> chain), then
> re-allocated and inserted to another chain (different keys) ? (final
> refcount=1)

Nothing.

> If the lookup detects a key mismatch, how will it continue to the next
> item,
> since 'next' pointer will have been reused for the new chain insertion...
> 
> Me confused...

If there is a mismatch then you have to do another hash lookup. Do an rcu
unlock and start over.





* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 12:59         ` Eric Dumazet
  2008-10-07 14:07           ` Stephen Hemminger
@ 2008-10-07 16:43           ` Corey Minyard
  1 sibling, 0 replies; 134+ messages in thread
From: Corey Minyard @ 2008-10-07 16:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Benny Amorsen, David Miller, linux-kernel, netdev, shemminger,
	paulmck

Eric Dumazet wrote:
>
> (2 weeks ago, Corey mentioned a 10x increase on UDP throughput on a 
> 16-way machine,
> that sounds promising)
Just to be clear, that was 10x with preempt RT, which converts rwlocks 
into PI mutexes.  So 16 processors going for the same lock is pretty ugly.

Under heavy loads there is also a writer starvation problem, I believe 
in non-RT.  You can't actually create or destroy a UDP socket when the 
load is high because there's always a reader holding the lock.  RCU also 
solves that problem.

-corey



* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  5:24     ` Eric Dumazet
  2008-10-07  8:54       ` Benny Amorsen
@ 2008-10-07 18:26       ` David Miller
  2008-10-08  8:35         ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: David Miller @ 2008-10-07 18:26 UTC (permalink / raw)
  To: dada1; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 07 Oct 2008 07:24:45 +0200

> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> RCU handling of its file structures and should not be slowed down that much.

As stated, I added RCU destruction generically for socket objects, and it
showed up clearly.

So "not be slowed down that much" has been disproven, at least to me,
already :-)


* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07  8:31     ` Peter Zijlstra
  2008-10-07 14:36       ` Paul E. McKenney
@ 2008-10-07 18:29       ` David Miller
  1 sibling, 0 replies; 134+ messages in thread
From: David Miller @ 2008-10-07 18:29 UTC (permalink / raw)
  To: a.p.zijlstra; +Cc: dada1, minyard, linux-kernel, netdev, shemminger, paulmck

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 07 Oct 2008 10:31:30 +0200

> On Mon, 2008-10-06 at 14:40 -0700, David Miller wrote:
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Mon, 06 Oct 2008 23:22:31 +0200
> > 
> > > Me wondering what impact this synchronize_rcu() can have on mono-threaded
> > > VOIP applications using lots of UDP sockets. What is the maximum delay of
> > > this function?
> > 
> > The cost is enormous, we really can't use it here.
> > 
> > I have a patch that did top-level socket destruction using RCU,
> > and that didn't use synchronize_rcu(), and that killed connection
> > rates by up to 20%.
> 
> Did you ever figure out why you lost those 20% ?

Probably the RCU delay on a 128 cpu machine :-)

Also I bet batching the socket destruction eliminates all of
the cached local state we have in the cpu at the actual socket
destruction time.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 14:07           ` Stephen Hemminger
@ 2008-10-07 20:55             ` David Miller
  2008-10-07 21:20               ` Stephen Hemminger
  2008-10-08 13:55               ` Eric Dumazet
  0 siblings, 2 replies; 134+ messages in thread
From: David Miller @ 2008-10-07 20:55 UTC (permalink / raw)
  To: shemminger; +Cc: dada1, benny+usenet, minyard, linux-kernel, netdev, paulmck

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 7 Oct 2008 16:07:29 +0200

> The idea of keeping chains short is the problem. That code should
> just be pulled because it doesn't help that much, and also creates
> bias on the port randomization.

I have that patch from Vitaly Mayatskikh which does exactly this.

I keep looking at it, but I can't bring myself to apply it since
I'm not completely convinced.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 20:55             ` David Miller
@ 2008-10-07 21:20               ` Stephen Hemminger
  2008-10-08 13:55               ` Eric Dumazet
  1 sibling, 0 replies; 134+ messages in thread
From: Stephen Hemminger @ 2008-10-07 21:20 UTC (permalink / raw)
  To: David Miller; +Cc: dada1, benny+usenet, minyard, linux-kernel, netdev, paulmck

On Tue, 07 Oct 2008 13:55:48 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 7 Oct 2008 16:07:29 +0200
> 
> > The idea of keeping chains short is the problem. That code should
> > just be pulled because it doesn't help that much, and also creates
> > bias on the port randomization.
> 
> I have that patch from Vitaly Mayatskikh which does exactly this.
> 
> I keep looking at it, but I can't bring myself to apply it since
> I'm not completely convinced.

Someone with a busy server should run it and measure the delta in hash chain lengths.
I would, but I don't run anything that has more than a few UDP sockets open.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 18:26       ` David Miller
@ 2008-10-08  8:35         ` Eric Dumazet
  2008-10-08 16:38           ` David Miller
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-08  8:35 UTC (permalink / raw)
  To: David Miller; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Tue, 07 Oct 2008 07:24:45 +0200
> 
>> Most UDP sockets are set up for long periods (RTP traffic), or if an application really
>> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
>> RCU handling of its file structures and should not be slowed down that much.
> 
> As stated, I added RCU destruction generically for socket objects, and it
> showed up clearly.
> 
> So "not be slowed down that much" has been disproven, at least to me,
> already :-)
> 
> 
RCU in a hash table is OK for managing read-mostly data, since it is
only during read access that you avoid dirtying an rwlock...

If we have a workload that inserts/deletes sockets like crazy but
receives few frames (which hit the hash table in a read-only way),
then you defeat the purpose of RCU, and pay the price of throwing
hot data into the rcu queue, where it will become cold before reuse...

BTW, is there any chance your results were obtained before October 2005?

At that time, RCU was able to queue an unlimited number of events;
a single loop doing close(open("/dev/null",0)) could exhaust RAM...

Refs: commit 5ee832dbc6770135ec8d63296af0a4374557bb79
and many others...

Anyway, we can probably code something for UDP without the call_rcu()
cache blower, if time permits :)




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-07 20:55             ` David Miller
  2008-10-07 21:20               ` Stephen Hemminger
@ 2008-10-08 13:55               ` Eric Dumazet
  2008-10-08 18:45                 ` David Miller
  2008-10-28 20:37                 ` [PATCH 0/2] udp: Convert the UDP hash lock to RCU Eric Dumazet
  1 sibling, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-08 13:55 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, linux-kernel, netdev, paulmck

[-- Attachment #1: Type: text/plain, Size: 1456 bytes --]

David Miller wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 7 Oct 2008 16:07:29 +0200
> 
>> The idea of keeping chains short is the problem. That code should
>> just be pulled because it doesn't help that much, and also creates
>> bias on the port randomization.
> 
> I have that patch from Vitaly Mayatskikh which does exactly this.
> 
> I keep looking at it, but I can't bring myself to apply it since
> I'm not completely convinced.

Vitaly's patch might be appropriate if only a few UDP ports are opened.

We could zap the code to search short chains and extend Vitaly's
idea with the following patch:

[PATCH] udp: Improve port randomization

Current UDP port allocation is suboptimal.
We select the shortest chain and then choose a port (out of the
512 ports that hash into that chain).

First, it can lead to not-so-random ports and gives attackers
more opportunities to break the system.

Second, it can consume a lot of CPU scanning the whole table
in order to find the shortest chain.

Third, in some pathological cases we can fail to find
a free port even if there are plenty of them.

This patch zaps the search for a short chain and only
uses one random seed. The problem of getting long chains
should be addressed in another way, since we can
obtain long chains even with non-random ports.

Based on a report and patch from Vitaly Mayatskikh

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>



[-- Attachment #2: udp_random.patch --]
[-- Type: text/plain, Size: 1773 bytes --]

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 85f8e8e..67d8430 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -155,55 +155,23 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	write_lock_bh(&udp_hash_lock);
 
 	if (!snum) {
-		int i, low, high, remaining;
-		unsigned rover, best, best_size_so_far;
+		int low, high, remaining;
+		unsigned rand;
+		unsigned short first;
 
 		inet_get_local_port_range(&low, &high);
 		remaining = (high - low) + 1;
 
-		best_size_so_far = UINT_MAX;
-		best = rover = net_random() % remaining + low;
-
-		/* 1st pass: look for empty (or shortest) hash chain */
-		for (i = 0; i < UDP_HTABLE_SIZE; i++) {
-			int size = 0;
-
-			head = &udptable[udp_hashfn(net, rover)];
-			if (hlist_empty(head))
-				goto gotit;
-
-			sk_for_each(sk2, node, head) {
-				if (++size >= best_size_so_far)
-					goto next;
-			}
-			best_size_so_far = size;
-			best = rover;
-		next:
-			/* fold back if end of range */
-			if (++rover > high)
-				rover = low + ((rover - low)
-					       & (UDP_HTABLE_SIZE - 1));
-
-
-		}
-
-		/* 2nd pass: find hole in shortest hash chain */
-		rover = best;
-		for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
-			if (! __udp_lib_lport_inuse(net, rover, udptable))
-				goto gotit;
-			rover += UDP_HTABLE_SIZE;
-			if (rover > high)
-				rover = low + ((rover - low)
-					       & (UDP_HTABLE_SIZE - 1));
+		rand = net_random();
+		snum = first = rand % remaining + low;
+		rand |= 1;
+		while (__udp_lib_lport_inuse(net, snum, udptable)) {
+			do {
+				snum = snum + rand;
+			} while (snum < low || snum > high);
+			if (snum == first)
+				goto fail;
 		}
-
-
-		/* All ports in use! */
-		goto fail;
-
-gotit:
-		snum = rover;
 	} else {
 		head = &udptable[udp_hashfn(net, snum)];
 

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-08  8:35         ` Eric Dumazet
@ 2008-10-08 16:38           ` David Miller
  0 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-10-08 16:38 UTC (permalink / raw)
  To: dada1; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 08 Oct 2008 10:35:07 +0200

> BTW, is there any chance your results were obtained before October 2005?

No, the timestamp on the saved patch file I have is June 16, 2008 :-)

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
  2008-10-08 13:55               ` Eric Dumazet
@ 2008-10-08 18:45                 ` David Miller
  2008-10-28 20:37                   ` [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks Eric Dumazet
  2008-10-28 20:42                   ` [PATCH 2/2] udp: RCU handling for Unicast packets Eric Dumazet
  2008-10-28 20:37                 ` [PATCH 0/2] udp: Convert the UDP hash lock to RCU Eric Dumazet
  1 sibling, 2 replies; 134+ messages in thread
From: David Miller @ 2008-10-08 18:45 UTC (permalink / raw)
  To: dada1; +Cc: shemminger, benny+usenet, minyard, linux-kernel, netdev, paulmck

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 08 Oct 2008 15:55:36 +0200

> David Miller wrote:
> > From: Stephen Hemminger <shemminger@vyatta.com>
> > Date: Tue, 7 Oct 2008 16:07:29 +0200
> > 
> >> The idea of keeping chains short is the problem. That code should
> >> just be pulled because it doesn't help that much, and also creates
> >> bias on the port randomization.
> > I have that patch from Vitaly Mayatskikh which does exactly this.
> > I keep looking at it, but I can't bring myself to apply it since
> > I'm not completely convinced.
> 
> Vitaly's patch might be appropriate if only a few UDP ports are opened.
> 
> We could zap the code to search short chains and extend Vitaly's
> idea with the following patch:

I really like this, and I've applied it to net-next-2.6.

I think the "increment until back in range" do/while loop can
be improved a bit.  It can spin for more than 60,000 iterations
in some edge case scenarios as-is :-)

Ugh, there's also that expensive divide in there for the modulus.
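
For anyone curious about the magnitude, here is a small stand-alone
user-space sketch (an illustration, not code from the patch) of the port
walk above: because the stride is forced odd, the 16-bit snum cycles through
all 65536 values before returning to 'first', and with a deliberately tiny
local port range almost every step lands outside [low, high]:

#include <stdio.h>

int main(void)
{
	unsigned short low = 32768, high = 32770;	/* deliberately tiny range */
	unsigned int remaining = high - low + 1;
	unsigned int rand = 12345;			/* stand-in for net_random() */
	unsigned short snum, first;
	unsigned long inner = 0, visited = 0;

	snum = first = rand % remaining + low;
	rand |= 1;					/* odd stride => full 2^16 cycle */
	do {
		visited++;
		do {
			snum = snum + rand;		/* wraps modulo 65536 */
			inner++;
		} while (snum < low || snum > high);	/* the loop discussed above */
	} while (snum != first);

	printf("%lu in-range ports visited, %lu total increments\n",
	       visited, inner);
	return 0;
}

With these numbers the inner loop runs 65536 times to visit only 3 usable
ports, which is the edge case being worried about here; with the default
32768-61000 range the overhead is negligible.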

^ permalink raw reply	[flat|nested] 134+ messages in thread

* [PATCH 0/2] udp: Convert the UDP hash lock to RCU
  2008-10-08 13:55               ` Eric Dumazet
  2008-10-08 18:45                 ` David Miller
@ 2008-10-28 20:37                 ` Eric Dumazet
  2008-10-28 21:28                   ` Stephen Hemminger
  1 sibling, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-28 20:37 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, Peter Zijlstra, Evgeniy Polyakov

UDP sockets are hashed in a 128 slots hash table.

This hash table is protected by *one* rwlock.

This rwlock is readlocked each time an incoming UDP message is handled.

This rwlock is writelocked each time a socket must be inserted in
hash table (bind time), or deleted from this table (unbind time)

This is not scalable on SMP machines :

1) Even in read mode, lock() and unlock() are atomic operations and
must dirty a contended cache line, shared by all cpus.

2) A writer might be starved if many readers are 'in flight'. This can
happen on a machine with some NIC receiving many UDP messages. User
process can be delayed a long time at socket creation/dismantle time.


What Corey and I propose is to use RCU to protect this hash table.

Goals are :

1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes happen in the fast path. (Using an array of rwlocks, one per
slot for example, is not an option in this regard.)

Note: Multicasts and broadcasts will still need to take a lock,
because doing a full lockless lookup in this case is difficult.

2) No expensive operations in the socket bind/unhash phases :
  - No expensive synchronize_rcu() calls.

  - No rcu_head added to the socket structure, which would increase
  memory needs and, more importantly, force us to use call_rcu() calls,
  which have the bad property of making socket structures cold.
  (The rcu grace period between a socket being freed and its potential
   reuse leaves the socket cold in the CPU cache.)
  David did a previous patch using call_rcu() and noticed a 20%
  impact on TCP connection rates.

  Quoting Christoph Lameter:
  "Right. That results in cacheline cooldown. You'd want to recycle
   the object as they are cache hot on a per cpu basis. That is screwed
   up by the delayed regular rcu processing. We have seen multiple
   regressions due to cacheline cooldown.
   The only choice in cacheline hot sensitive areas is to deal with the
   complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."

  - Because UDP sockets are allocated from a dedicated kmem_cache,
  use of SLAB_DESTROY_BY_RCU can help here.

Theory of operation :
---------------------

As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be paid by readers and writers.

Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, and inserted in a different chain, or in the worst case in the
same chain, while readers are doing lookups at the same time.

In order to avoid loops, a reader must check that each socket found in a
chain really belongs to the chain the reader was traversing. If it finds
a mismatch, the lookup must start again at the beginning. This *restart*
loop is the reason we had to keep the rdlock for the multicast case,
because we don't want to send the same message several times to the same
socket.

We use RCU only for the fast path. Thus, /proc/net/udp still takes rdlocks.


The work is split into two patches.

[PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks

Introduces 'struct udp_table' and 'struct udp_hslot',
with one rwlock per chain instead of a global one.
Some cleanups were done to ease review of the next patch.

[PATCH 2/2] udp: RCU handling for Unicast packets.


Tests done on a dual quad-core machine (8 cpus) with IPv4 only were
pretty good, since some microbenchmarks ran ten times faster.

Many thanks to all contributors (David Miller, Christoph Lameter,
Peter Zijlstra, Stephen Hemminger, Paul E. McKenney, Evgeniy Polyakov)
for their review/comments on Corey's initial work.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks
  2008-10-08 18:45                 ` David Miller
@ 2008-10-28 20:37                   ` Eric Dumazet
  2008-10-28 21:23                     ` Christian Bell
  2008-10-28 21:28                     ` Evgeniy Polyakov
  2008-10-28 20:42                   ` [PATCH 2/2] udp: RCU handling for Unicast packets Eric Dumazet
  1 sibling, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-28 20:37 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, Peter Zijlstra, Evgeniy Polyakov

[-- Attachment #1: Type: text/plain, Size: 1468 bytes --]

UDP sockets are hashed in a 128 slots hash table.

This hash table is protected by *one* rwlock.

This rwlock is readlocked each time an incoming UDP message is handled.

This rwlock is writelocked each time a socket must be inserted in
hash table (bind time), or deleted from this table (close time)

This is not scalable on SMP machines :

1) Even in read mode, lock() and unlock() are atomic operations and
  must dirty a contended cache line, shared by all cpus.

2) A writer might be starved if many readers are 'in flight'. This can
  happen on a machine with some NIC receiving many UDP messages. User
  process can be delayed a long time at socket creation/dismantle time.

This patch prepares the RCU migration by introducing 'struct udp_table'
and 'struct udp_hslot', and using one rwlock per chain to reduce
contention on the central rwlock.

Introducing one rwlock per chain reduces latencies, for port
randomization on heavily loaded UDP servers.

Some cleanups were done to ease review of following patch.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/net/sock.h    |    2
 include/net/udp.h     |   25 ++--
 include/net/udplite.h |    2
 net/ipv4/udp.c        |  208 +++++++++++++++++++++++-----------------
 net/ipv4/udp_impl.h   |    4
 net/ipv4/udplite.c    |   13 +-
 net/ipv6/udp.c        |  112 +++++++++++----------
 net/ipv6/udp_impl.h   |    4
 net/ipv6/udplite.c    |    8 -
 9 files changed, 214 insertions(+), 164 deletions(-)





[-- Attachment #2: PATCH_UDP.1 --]
[-- Type: text/plain, Size: 24776 bytes --]

diff --git a/include/net/sock.h b/include/net/sock.h
index ada50c0..4d630d5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -597,7 +597,7 @@ struct proto {
 
 	union {
 		struct inet_hashinfo	*hashinfo;
-		struct hlist_head	*udp_hash;
+		struct udp_table	*udp_table;
 		struct raw_hashinfo	*raw_hash;
 	} h;
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 1e20509..708f5ad 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -50,8 +50,15 @@ struct udp_skb_cb {
 };
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
-extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-extern rwlock_t udp_hash_lock;
+struct udp_hslot {
+	struct hlist_head	head;
+	rwlock_t		lock;
+};
+struct udp_table {
+	struct udp_hslot	hash[UDP_HTABLE_SIZE];
+};
+extern struct udp_table udp_table;
+extern void udp_table_init(struct udp_table *);
 
 
 /* Note: this must match 'valbool' in sock_setsockopt */
@@ -110,15 +117,7 @@ static inline void udp_lib_hash(struct sock *sk)
 	BUG();
 }
 
-static inline void udp_lib_unhash(struct sock *sk)
-{
-	write_lock_bh(&udp_hash_lock);
-	if (sk_del_node_init(sk)) {
-		inet_sk(sk)->num = 0;
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-	}
-	write_unlock_bh(&udp_hash_lock);
-}
+extern void udp_lib_unhash(struct sock *sk);
 
 static inline void udp_lib_close(struct sock *sk, long timeout)
 {
@@ -187,7 +186,7 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 struct udp_seq_afinfo {
 	char			*name;
 	sa_family_t		family;
-	struct hlist_head	*hashtable;
+	struct udp_table	*udp_table;
 	struct file_operations	seq_fops;
 	struct seq_operations	seq_ops;
 };
@@ -196,7 +195,7 @@ struct udp_iter_state {
 	struct seq_net_private  p;
 	sa_family_t		family;
 	int			bucket;
-	struct hlist_head	*hashtable;
+	struct udp_table	*udp_table;
 };
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/net/udplite.h b/include/net/udplite.h
index b76b2e3..afdffe6 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -11,7 +11,7 @@
 #define UDPLITE_RECV_CSCOV   11 /* receiver partial coverage (threshold ) */
 
 extern struct proto 		udplite_prot;
-extern struct hlist_head 	udplite_hash[UDP_HTABLE_SIZE];
+extern struct udp_table		udplite_table;
 
 /*
  *	Checksum computation is all in software, hence simpler getfrag.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2095abc..168276a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -104,12 +104,8 @@
 #include <net/xfrm.h>
 #include "udp_impl.h"
 
-/*
- *	Snmp MIB for the UDP layer
- */
-
-struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-DEFINE_RWLOCK(udp_hash_lock);
+struct udp_table udp_table;
+EXPORT_SYMBOL(udp_table);
 
 int sysctl_udp_mem[3] __read_mostly;
 int sysctl_udp_rmem_min __read_mostly;
@@ -123,7 +119,7 @@ atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
-			       const struct hlist_head udptable[],
+			       const struct udp_hslot *hslot,
 			       struct sock *sk,
 			       int (*saddr_comp)(const struct sock *sk1,
 						 const struct sock *sk2))
@@ -131,7 +127,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	struct sock *sk2;
 	struct hlist_node *node;
 
-	sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)])
+	sk_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
 		    sk2->sk_hash == num				&&
@@ -154,12 +150,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		       int (*saddr_comp)(const struct sock *sk1,
 					 const struct sock *sk2 )    )
 {
-	struct hlist_head *udptable = sk->sk_prot->h.udp_hash;
+	struct udp_hslot *hslot;
+	struct udp_table *udptable = sk->sk_prot->h.udp_table;
 	int    error = 1;
 	struct net *net = sock_net(sk);
 
-	write_lock_bh(&udp_hash_lock);
-
 	if (!snum) {
 		int low, high, remaining;
 		unsigned rand;
@@ -171,26 +166,33 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		rand = net_random();
 		snum = first = rand % remaining + low;
 		rand |= 1;
-		while (udp_lib_lport_inuse(net, snum, udptable, sk,
-					   saddr_comp)) {
+		for (;;) {
+			hslot = &udptable->hash[udp_hashfn(net, snum)];
+			write_lock_bh(&hslot->lock);
+			if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
+				break;
+			write_unlock_bh(&hslot->lock);
 			do {
 				snum = snum + rand;
 			} while (snum < low || snum > high);
 			if (snum == first)
 				goto fail;
 		}
-	} else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp))
-		goto fail;
-
+	} else {
+		hslot = &udptable->hash[udp_hashfn(net, snum)];
+		write_lock_bh(&hslot->lock);
+		if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
+			goto fail;
+	}
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		sk_add_node(sk, &udptable[udp_hashfn(net, snum)]);
+		sk_add_node(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
+	write_unlock_bh(&hslot->lock);
 	error = 0;
 fail:
-	write_unlock_bh(&udp_hash_lock);
 	return error;
 }
 
@@ -208,63 +210,73 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
 	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
 }
 
+static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
+			 unsigned short hnum,
+			 __be16 sport, __be32 daddr, __be16 dport, int dif)
+{
+	int score = -1;
+
+	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+			!ipv6_only_sock(sk)) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		score = (sk->sk_family == PF_INET ? 1 : 0);
+		if (inet->rcv_saddr) {
+			if (inet->rcv_saddr != daddr)
+				return -1;
+			score += 2;
+		}
+		if (inet->daddr) {
+			if (inet->daddr != saddr)
+				return -1;
+			score += 2;
+		}
+		if (inet->dport) {
+			if (inet->dport != sport)
+				return -1;
+			score += 2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score += 2;
+		}
+	}
+	return score;
+}
+
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
 static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		__be16 sport, __be32 daddr, __be16 dport,
-		int dif, struct hlist_head udptable[])
+		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result = NULL;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
-	int badness = -1;
-
-	read_lock(&udp_hash_lock);
-	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
-				!ipv6_only_sock(sk)) {
-			int score = (sk->sk_family == PF_INET ? 1 : 0);
-			if (inet->rcv_saddr) {
-				if (inet->rcv_saddr != daddr)
-					continue;
-				score+=2;
-			}
-			if (inet->daddr) {
-				if (inet->daddr != saddr)
-					continue;
-				score+=2;
-			}
-			if (inet->dport) {
-				if (inet->dport != sport)
-					continue;
-				score+=2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score+=2;
-			}
-			if (score == 9) {
-				result = sk;
-				break;
-			} else if (score > badness) {
-				result = sk;
-				badness = score;
-			}
-		}
+	unsigned int hash = udp_hashfn(net, hnum);
+	struct udp_hslot *hslot = &udptable->hash[hash];
+	int score, badness = -1;
+
+	read_lock(&hslot->lock);
+	sk_for_each(sk, node, &hslot->head) {
+		score = compute_score(sk, net, saddr, hnum, sport,
+				      daddr, dport, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+ 		}
 	}
 	if (result)
 		sock_hold(result);
-	read_unlock(&udp_hash_lock);
+	read_unlock(&hslot->lock);
 	return result;
 }
 
 static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 						 __be16 sport, __be16 dport,
-						 struct hlist_head udptable[])
+						 struct udp_table *udptable)
 {
 	struct sock *sk;
 	const struct iphdr *iph = ip_hdr(skb);
@@ -280,7 +292,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 			     __be32 daddr, __be16 dport, int dif)
 {
-	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash);
+	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 
@@ -323,7 +335,7 @@ found:
  * to find the appropriate port.
  */
 
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
+void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 {
 	struct inet_sock *inet;
 	struct iphdr *iph = (struct iphdr*)skb->data;
@@ -392,7 +404,7 @@ out:
 
 void udp_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, udp_hash);
+	__udp4_lib_err(skb, info, &udp_table);
 }
 
 /*
@@ -933,6 +945,21 @@ int udp_disconnect(struct sock *sk, int flags)
 	return 0;
 }
 
+void udp_lib_unhash(struct sock *sk)
+{
+	struct udp_table *udptable = sk->sk_prot->h.udp_table;
+	unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
+	struct udp_hslot *hslot = &udptable->hash[hash];
+
+	write_lock(&hslot->lock);
+	if (sk_del_node_init(sk)) {
+		inet_sk(sk)->num = 0;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	}
+	write_unlock(&hslot->lock);
+}
+EXPORT_SYMBOL(udp_lib_unhash);
+
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int is_udplite = IS_UDPLITE(sk);
@@ -1071,13 +1098,14 @@ drop:
 static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udphdr  *uh,
 				    __be32 saddr, __be32 daddr,
-				    struct hlist_head udptable[])
+				    struct udp_table *udptable)
 {
 	struct sock *sk;
+	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
 	int dif;
 
-	read_lock(&udp_hash_lock);
-	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+	read_lock(&hslot->lock);
+	sk = sk_head(&hslot->head);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (sk) {
@@ -1102,7 +1130,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		} while (sknext);
 	} else
 		kfree_skb(skb);
-	read_unlock(&udp_hash_lock);
+	read_unlock(&hslot->lock);
 	return 0;
 }
 
@@ -1148,7 +1176,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
  *	All we need to do is get the socket, and then do a checksum.
  */
 
-int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
+int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		   int proto)
 {
 	struct sock *sk;
@@ -1246,7 +1274,7 @@ drop:
 
 int udp_rcv(struct sk_buff *skb)
 {
-	return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP);
+	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
 }
 
 void udp_destroy_sock(struct sock *sk)
@@ -1488,7 +1516,7 @@ struct proto udp_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
-	.h.udp_hash	   = udp_hash,
+	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
@@ -1498,20 +1526,23 @@ struct proto udp_prot = {
 /* ------------------------------------------------------------------------ */
 #ifdef CONFIG_PROC_FS
 
-static struct sock *udp_get_first(struct seq_file *seq)
+static struct sock *udp_get_first(struct seq_file *seq, int start)
 {
 	struct sock *sk;
 	struct udp_iter_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
 
-	for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
 		struct hlist_node *node;
-		sk_for_each(sk, node, state->hashtable + state->bucket) {
+		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+		read_lock(&hslot->lock);
+		sk_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
 				goto found;
 		}
+		read_unlock(&hslot->lock);
 	}
 	sk = NULL;
 found:
@@ -1525,20 +1556,18 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 
 	do {
 		sk = sk_next(sk);
-try_again:
-		;
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
-	if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
-		sk = sk_head(state->hashtable + state->bucket);
-		goto try_again;
+	if (!sk) {
+		read_unlock(&state->udp_table->hash[state->bucket].lock);
+		return udp_get_first(seq, state->bucket + 1);
 	}
 	return sk;
 }
 
 static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 {
-	struct sock *sk = udp_get_first(seq);
+	struct sock *sk = udp_get_first(seq, 0);
 
 	if (sk)
 		while (pos && (sk = udp_get_next(seq, sk)) != NULL)
@@ -1547,9 +1576,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(udp_hash_lock)
 {
-	read_lock(&udp_hash_lock);
 	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
 
@@ -1567,9 +1594,11 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void udp_seq_stop(struct seq_file *seq, void *v)
-	__releases(udp_hash_lock)
 {
-	read_unlock(&udp_hash_lock);
+	struct udp_iter_state *state = seq->private;
+
+	if (state->bucket < UDP_HTABLE_SIZE)
+		read_unlock(&state->udp_table->hash[state->bucket].lock);
 }
 
 static int udp_seq_open(struct inode *inode, struct file *file)
@@ -1585,7 +1614,7 @@ static int udp_seq_open(struct inode *inode, struct file *file)
 
 	s = ((struct seq_file *)file->private_data)->private;
 	s->family		= afinfo->family;
-	s->hashtable		= afinfo->hashtable;
+	s->udp_table		= afinfo->udp_table;
 	return err;
 }
 
@@ -1657,7 +1686,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
 static struct udp_seq_afinfo udp4_seq_afinfo = {
 	.name		= "udp",
 	.family		= AF_INET,
-	.hashtable	= udp_hash,
+	.udp_table	= &udp_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},
@@ -1692,10 +1721,21 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */
 
+void __init udp_table_init(struct udp_table *table)
+{
+	int i;
+	
+	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+		INIT_HLIST_HEAD(&table->hash[i].head);
+		rwlock_init(&table->hash[i].lock);
+	}
+}
+
 void __init udp_init(void)
 {
 	unsigned long limit;
 
+	udp_table_init(&udp_table);
 	/* Set the pressure threshold up by the same strategy of TCP. It is a
 	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
 	 * toward zero with the amount of memory, with a floor of 128 pages.
@@ -1712,8 +1752,6 @@ void __init udp_init(void)
 }
 
 EXPORT_SYMBOL(udp_disconnect);
-EXPORT_SYMBOL(udp_hash);
-EXPORT_SYMBOL(udp_hash_lock);
 EXPORT_SYMBOL(udp_ioctl);
 EXPORT_SYMBOL(udp_prot);
 EXPORT_SYMBOL(udp_sendmsg);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 2e9bad2..9f4a616 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,8 +5,8 @@
 #include <net/protocol.h>
 #include <net/inet_common.h>
 
-extern int  	__udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int );
-extern void 	__udp4_lib_err(struct sk_buff *, u32, struct hlist_head []);
+extern int  	__udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
+extern void 	__udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
 
 extern int	udp_v4_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 3c80796..d8ea8e5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,16 +12,17 @@
  */
 #include "udp_impl.h"
 
-struct hlist_head 	udplite_hash[UDP_HTABLE_SIZE];
+struct udp_table 	udplite_table;
+EXPORT_SYMBOL(udplite_table);
 
 static int udplite_rcv(struct sk_buff *skb)
 {
-	return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
+	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
 static void udplite_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, udplite_hash);
+	__udp4_lib_err(skb, info, &udplite_table);
 }
 
 static	struct net_protocol udplite_protocol = {
@@ -50,7 +51,7 @@ struct proto 	udplite_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
 	.obj_size	   = sizeof(struct udp_sock),
-	.h.udp_hash	   = udplite_hash,
+	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
@@ -71,7 +72,7 @@ static struct inet_protosw udplite4_protosw = {
 static struct udp_seq_afinfo udplite4_seq_afinfo = {
 	.name		= "udplite",
 	.family		= AF_INET,
-	.hashtable	= udplite_hash,
+	.udp_table 	= &udplite_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},
@@ -108,6 +109,7 @@ static inline int udplite4_proc_init(void)
 
 void __init udplite4_register(void)
 {
+	udp_table_init(&udplite_table);
 	if (proto_register(&udplite_prot, 1))
 		goto out_register_err;
 
@@ -126,5 +128,4 @@ out_register_err:
 	printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
 }
 
-EXPORT_SYMBOL(udplite_hash);
 EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e51da8c..0b76566 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -54,62 +54,73 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal);
 }
 
+static inline int compute_score(struct sock *sk, struct net *net,
+				unsigned short hnum,
+				struct in6_addr *saddr, __be16 sport,
+				struct in6_addr *daddr, __be16 dport,
+				int dif)
+{
+	int score = -1;
+
+	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+			sk->sk_family == PF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+		struct inet_sock *inet = inet_sk(sk);
+
+		score = 0;
+		if (inet->dport) {
+			if (inet->dport != sport)
+				return -1;
+			score++;
+		}
+		if (!ipv6_addr_any(&np->rcv_saddr)) {
+			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+				return -1;
+			score++;
+		}
+		if (!ipv6_addr_any(&np->daddr)) {
+			if (!ipv6_addr_equal(&np->daddr, saddr))
+				return -1;
+			score++;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score++;
+		}
+	}
+	return score;
+}
+
 static struct sock *__udp6_lib_lookup(struct net *net,
 				      struct in6_addr *saddr, __be16 sport,
 				      struct in6_addr *daddr, __be16 dport,
-				      int dif, struct hlist_head udptable[])
+				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result = NULL;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
-	int badness = -1;
-
-	read_lock(&udp_hash_lock);
-	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
-				sk->sk_family == PF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
-			int score = 0;
-			if (inet->dport) {
-				if (inet->dport != sport)
-					continue;
-				score++;
-			}
-			if (!ipv6_addr_any(&np->rcv_saddr)) {
-				if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
-					continue;
-				score++;
-			}
-			if (!ipv6_addr_any(&np->daddr)) {
-				if (!ipv6_addr_equal(&np->daddr, saddr))
-					continue;
-				score++;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score++;
-			}
-			if (score == 4) {
-				result = sk;
-				break;
-			} else if (score > badness) {
-				result = sk;
-				badness = score;
-			}
+	unsigned int hash = udp_hashfn(net, hnum);
+	struct udp_hslot *hslot = &udptable->hash[hash];
+	int score, badness = -1;
+
+	read_lock(&hslot->lock);
+	sk_for_each(sk, node, &hslot->head) {
+		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
 		}
 	}
 	if (result)
 		sock_hold(result);
-	read_unlock(&udp_hash_lock);
+	read_unlock(&hslot->lock);
 	return result;
 }
 
 static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 					  __be16 sport, __be16 dport,
-					  struct hlist_head udptable[])
+					  struct udp_table *udptable)
 {
 	struct sock *sk;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -239,7 +250,7 @@ csum_copy_err:
 
 void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		    int type, int code, int offset, __be32 info,
-		    struct hlist_head udptable[]                    )
+		    struct udp_table *udptable)
 {
 	struct ipv6_pinfo *np;
 	struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
@@ -275,7 +286,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 				 struct inet6_skb_parm *opt, int type,
 				 int code, int offset, __be32 info     )
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, udp_hash);
+	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
 int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
@@ -374,14 +385,15 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
  */
 static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		struct in6_addr *saddr, struct in6_addr *daddr,
-		struct hlist_head udptable[])
+		struct udp_table *udptable)
 {
 	struct sock *sk, *sk2;
 	const struct udphdr *uh = udp_hdr(skb);
+	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
 	int dif;
 
-	read_lock(&udp_hash_lock);
-	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+	read_lock(&hslot->lock);
+	sk = sk_head(&hslot->head);
 	dif = inet6_iif(skb);
 	sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (!sk) {
@@ -409,7 +421,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
 out:
-	read_unlock(&udp_hash_lock);
+	read_unlock(&hslot->lock);
 	return 0;
 }
 
@@ -447,7 +459,7 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh,
 	return 0;
 }
 
-int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
+int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		   int proto)
 {
 	struct sock *sk;
@@ -544,7 +556,7 @@ discard:
 
 static __inline__ int udpv6_rcv(struct sk_buff *skb)
 {
-	return __udp6_lib_rcv(skb, udp_hash, IPPROTO_UDP);
+	return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
 }
 
 /*
@@ -1008,7 +1020,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
 static struct udp_seq_afinfo udp6_seq_afinfo = {
 	.name		= "udp6",
 	.family		= AF_INET6,
-	.hashtable	= udp_hash,
+	.udp_table	= &udp_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},
@@ -1050,7 +1062,7 @@ struct proto udpv6_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp6_sock),
-	.h.udp_hash	   = udp_hash,
+	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 92dd7da..2377920 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -7,9 +7,9 @@
 #include <net/inet_common.h>
 #include <net/transp_v6.h>
 
-extern int  	__udp6_lib_rcv(struct sk_buff *, struct hlist_head [], int );
+extern int  	__udp6_lib_rcv(struct sk_buff *, struct udp_table *, int );
 extern void 	__udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
-			       int , int , int , __be32 , struct hlist_head []);
+			       int , int , int , __be32 , struct udp_table *);
 
 extern int	udp_v6_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 3cd1a1a..f67bbff 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -15,14 +15,14 @@
 
 static int udplitev6_rcv(struct sk_buff *skb)
 {
-	return __udp6_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
+	return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
 static void udplitev6_err(struct sk_buff *skb,
 			  struct inet6_skb_parm *opt,
 			  int type, int code, int offset, __be32 info)
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash);
+	__udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
 }
 
 static struct inet6_protocol udplitev6_protocol = {
@@ -49,7 +49,7 @@ struct proto udplitev6_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v6_get_port,
 	.obj_size	   = sizeof(struct udp6_sock),
-	.h.udp_hash	   = udplite_hash,
+	.h.udp_table   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
@@ -95,7 +95,7 @@ void udplitev6_exit(void)
 static struct udp_seq_afinfo udplite6_seq_afinfo = {
 	.name		= "udplite6",
 	.family		= AF_INET6,
-	.hashtable	= udplite_hash,
+	.udp_table	= &udplite_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-08 18:45                 ` David Miller
  2008-10-28 20:37                   ` [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks Eric Dumazet
@ 2008-10-28 20:42                   ` Eric Dumazet
  2008-10-28 22:45                     ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-28 20:42 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, Peter Zijlstra, Evgeniy Polyakov

[-- Attachment #1: Type: text/plain, Size: 2607 bytes --]

RCUification of UDP hash tables

Goals are :

1) Optimizing handling of incoming Unicast UDP frames, so that no memory
  writes happen in the fast path. (Using an array of rwlocks, one per
  slot for example, is not an option in this regard.)

  Note: Multicasts and broadcasts will still need to take a lock,
  because doing a full lockless lookup in this case is difficult.

2) No expensive operations in the socket bind/unhash phases :
   - No expensive synchronize_rcu() calls.

   - No rcu_head added to the socket structure, which would increase
   memory needs and, more importantly, force us to use call_rcu() calls,
   which have the bad property of making socket structures cold.
   (The rcu grace period between a socket being freed and its potential
    reuse leaves the socket cold in the CPU cache.)
   David did a previous patch using call_rcu() and noticed a 20%
   impact on TCP connection rates.
   Quoting Christoph Lameter:
    "Right. That results in cacheline cooldown. You'd want to recycle
     the object as they are cache hot on a per cpu basis. That is screwed
     up by the delayed regular rcu processing. We have seen multiple
     regressions due to cacheline cooldown.
     The only choice in cacheline hot sensitive areas is to deal with the
     complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."

   - Because UDP sockets are allocated from a dedicated kmem_cache,
   use of SLAB_DESTROY_BY_RCU can help here.

Theory of operation :
---------------------

As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be paid by readers and writers.

Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, and inserted in a different chain, or in the worst case in the
same chain, while readers are doing lookups at the same time.

In order to avoid loops, a reader must check that each socket found in a
chain really belongs to the chain the reader was traversing. If it finds
a mismatch, the lookup must start again at the beginning. This *restart*
loop is the reason we had to keep the rdlock for the multicast case,
because we don't want to send the same message several times to the same
socket.

We use RCU only for the fast path. Thus, /proc/net/udp still takes rdlocks.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/net/sock.h |   37 ++++++++++++++++++++++++++++++++++++-
 net/core/sock.c    |    3 ++-
 net/ipv4/udp.c     |   35 ++++++++++++++++++++++++++---------
 net/ipv4/udplite.c |    1 +
 net/ipv6/udp.c     |   25 ++++++++++++++++++-------
 net/ipv6/udplite.c |    1 +
 6 files changed, 84 insertions(+), 18 deletions(-)

[-- Attachment #2: PATCH_UDP.2 --]
[-- Type: text/plain, Size: 7211 bytes --]

diff --git a/include/net/sock.h b/include/net/sock.h
index 4d630d5..5c1a781 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -361,6 +361,27 @@ static __inline__ int sk_del_node_init(struct sock *sk)
 	return rc;
 }
 
+static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
+{
+	if (sk_hashed(sk)) {
+		hlist_del_init_rcu(&sk->sk_node);
+		return 1;
+	}
+	return 0;
+}
+
+static __inline__ int sk_del_node_init_rcu(struct sock *sk)
+{
+	int rc = __sk_del_node_init_rcu(sk);
+
+	if (rc) {
+		/* paranoid for a while -acme */
+		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
+		__sock_put(sk);
+	}
+	return rc;
+}
+
 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
 {
 	hlist_add_head(&sk->sk_node, list);
@@ -372,6 +393,17 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
 	__sk_add_node(sk, list);
 }
 
+static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+{
+	hlist_add_head_rcu(&sk->sk_node, list);
+}
+
+static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+{
+	sock_hold(sk);
+	__sk_add_node_rcu(sk, list);
+}
+
 static __inline__ void __sk_del_bind_node(struct sock *sk)
 {
 	__hlist_del(&sk->sk_bind_node);
@@ -385,6 +417,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
+#define sk_for_each_rcu(__sk, node, list) \
+	hlist_for_each_entry_rcu(__sk, node, list, sk_node)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
@@ -587,8 +621,9 @@ struct proto {
 	int			*sysctl_rmem;
 	int			max_header;
 
-	struct kmem_cache		*slab;
+	struct kmem_cache	*slab;
 	unsigned int		obj_size;
+	int			slab_flags;
 
 	atomic_t		*orphan_count;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 5e2a313..ded1eb5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2042,7 +2042,8 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-					       SLAB_HWCACHE_ALIGN, NULL);
+					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					NULL);
 
 		if (prot->slab == NULL) {
 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 168276a..f0a573f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -187,7 +187,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		sk_add_node(sk, &hslot->head);
+		sk_add_node_rcu(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	write_unlock_bh(&hslot->lock);
@@ -252,15 +252,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		__be16 sport, __be32 daddr, __be16 dport,
 		int dif, struct udp_table *udptable)
 {
-	struct sock *sk, *result = NULL;
+	struct sock *sk, *result;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
-	int score, badness = -1;
+	int score, badness;
 
-	read_lock(&hslot->lock);
-	sk_for_each(sk, node, &hslot->head) {
+	rcu_read_lock();
+begin:
+	result = NULL;
+	badness = -1;
+	sk_for_each_rcu(sk, node, &hslot->head) {
+		/*
+		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
+		 * We must check this item was not moved to another chain
+		 */
+		if (udp_hashfn(net, sk->sk_hash) != hash)
+			goto begin;
 		score = compute_score(sk, net, saddr, hnum, sport,
 				      daddr, dport, dif);
 		if (score > badness) {
@@ -268,9 +277,16 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 			badness = score;
  		}
 	}
-	if (result)
-		sock_hold(result);
-	read_unlock(&hslot->lock);
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+				  daddr, dport, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	rcu_read_unlock();
 	return result;
 }
 
@@ -952,7 +968,7 @@ void udp_lib_unhash(struct sock *sk)
 	struct udp_hslot *hslot = &udptable->hash[hash];
 
 	write_lock(&hslot->lock);
-	if (sk_del_node_init(sk)) {
+	if (sk_del_node_init_rcu(sk)) {
 		inet_sk(sk)->num = 0;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
@@ -1516,6 +1532,7 @@ struct proto udp_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index d8ea8e5..c784891 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -51,6 +51,7 @@ struct proto 	udplite_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
 	.obj_size	   = sizeof(struct udp_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0b76566..f7671af 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -97,24 +97,34 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      struct in6_addr *daddr, __be16 dport,
 				      int dif, struct udp_table *udptable)
 {
-	struct sock *sk, *result = NULL;
+	struct sock *sk, *result;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
-	int score, badness = -1;
+	int score, badness;
 
-	read_lock(&hslot->lock);
-	sk_for_each(sk, node, &hslot->head) {
+	rcu_read_lock();
+begin:
+	result = NULL;
+	badness = -1;
+	sk_for_each_rcu(sk, node, &hslot->head) {
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}
-	if (result)
-		sock_hold(result);
-	read_unlock(&hslot->lock);
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
+					daddr, dport, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+ 		}
+	}
+	rcu_read_unlock();
 	return result;
 }
 
@@ -1062,6 +1072,7 @@ struct proto udpv6_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp6_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index f67bbff..3e1663f 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -49,6 +49,7 @@ struct proto udplitev6_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v6_get_port,
 	.obj_size	   = sizeof(struct udp6_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks
  2008-10-28 20:37                   ` [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks Eric Dumazet
@ 2008-10-28 21:23                     ` Christian Bell
  2008-10-28 21:31                       ` Evgeniy Polyakov
  2008-10-28 21:48                       ` Eric Dumazet
  2008-10-28 21:28                     ` Evgeniy Polyakov
  1 sibling, 2 replies; 134+ messages in thread
From: Christian Bell @ 2008-10-28 21:23 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev


On Oct 28, 2008, at 1:37 PM, Eric Dumazet wrote:

> -extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
> -extern rwlock_t udp_hash_lock;
> +struct udp_hslot {
> +	struct hlist_head	head;
> +	rwlock_t		lock;
> +};

This structure should be aligned up to a cacheline to reduce false
sharing between adjacent hslots.
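
For reference, the alignment suggested here would look roughly like the
sketch below, using the kernel's usual ____cacheline_aligned_in_smp
annotation; whether to pad every slot to a full cache line is a
size/locality trade-off, since it grows the table by a few KB:

struct udp_hslot {
	struct hlist_head	head;
	rwlock_t		lock;
} ____cacheline_aligned_in_smp;	/* one slot per cache line, no false sharing */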

> +	} else {
> +		hslot = &udptable->hash[udp_hashfn(net, snum)];
> +		write_lock_bh(&hslot->lock);
> +		if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
> +			goto fail;

The fail: label below should still unlock_bh when the above condition  
fails.

>
> +	}
> 	inet_sk(sk)->num = snum;
> 	sk->sk_hash = snum;
> 	if (sk_unhashed(sk)) {
> -		sk_add_node(sk, &udptable[udp_hashfn(net, snum)]);
> +		sk_add_node(sk, &hslot->head);
> 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> 	}
> +	write_unlock_bh(&hslot->lock);
> 	error = 0;
> fail:
> -	write_unlock_bh(&udp_hash_lock);
> 	return error;
> }

cheers,

	. . christian

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks
  2008-10-28 20:37                   ` [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks Eric Dumazet
  2008-10-28 21:23                     ` Christian Bell
@ 2008-10-28 21:28                     ` Evgeniy Polyakov
  1 sibling, 0 replies; 134+ messages in thread
From: Evgeniy Polyakov @ 2008-10-28 21:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, shemminger, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, Peter Zijlstra

Hi.

On Tue, Oct 28, 2008 at 09:37:35PM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
> Introducing one rwlock per chain reduces latencies, for port
> randomization on heavily loaded UDP servers.
> 
> Some cleanups were done to ease review of following patch.
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Ugh-ogh, that's a very cool change!
So far I have reviewed only this one, and things look very promising.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 0/2] udp: Convert the UDP hash lock to RCU
  2008-10-28 20:37                 ` [PATCH 0/2] udp: Convert the UDP hash lock to RCU Eric Dumazet
@ 2008-10-28 21:28                   ` Stephen Hemminger
  2008-10-28 21:50                     ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Stephen Hemminger @ 2008-10-28 21:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, Peter Zijlstra, Evgeniy Polyakov

On Tue, 28 Oct 2008 21:37:15 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> UDP sockets are hashed in a 128 slots hash table.
> 
> This hash table is protected by *one* rwlock.
> 
> This rwlock is readlocked each time an incoming UDP message is handled.
> 
> This rwlock is writelocked each time a socket must be inserted in
> hash table (bind time), or deleted from this table (unbind time)
> 
> This is not scalable on SMP machines :
> 
> 1) Even in read mode, lock() and unlock() are atomic operations and
> must dirty a contended cache line, shared by all cpus.
> 
> 2) A writer might be starved if many readers are 'in flight'. This can
> happen on a machine with some NIC receiving many UDP messages. User
> process can be delayed a long time at socket creation/dismantle time.
> 
> 
> What Corey and I propose is to use RCU to protect this hash table.
> 
> Goals are :
> 
> 1) Optimizing handling of incoming Unicast UDP frames, so that no memory
> writes should happen in the fast path. Using an array of rwlocks (one per
> slot for example is not an option in this regard)
> 
> Note: Multicasts and broadcasts still will need to take a lock,
> because doing a full lockless lookup in this case is difficult.
> 
> 2) No expensive operations in the socket bind/unhash phases :
>   - No expensive synchronize_rcu() calls.
> 
>   - No added rcu_head in socket structure, increasing memory needs,
>   but more important, forcing us to use call_rcu() calls,
>   that have the bad property of making sockets structure cold.
>   (rcu grace period between socket freeing and its potential reuse
>    make this socket being cold in CPU cache).
>   David did a previous patch using call_rcu() and noticed a 20%
>   impact on TCP connection rates.
> 
>   Quoting Cristopher Lameter :
>   "Right. That results in cacheline cooldown. You'd want to recycle
>    the object as they are cache hot on a per cpu basis. That is screwed
>    up by the delayed regular rcu processing. We have seen multiple
>    regressions due to cacheline cooldown.
>    The only choice in cacheline hot sensitive areas is to deal with the
>    complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
> 
>   - Because udp sockets are allocated from dedicated kmem_cache,
>   use of SLAB_DESTROY_BY_RCU can help here.
> 
> Theory of operation :
> ---------------------
> 
> As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
> special attention must be taken by readers and writers.
> 
> Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
> reused, inserted in a different chain or in worst case in the same chain
> while readers could do lookups in the same time.
> 
> In order to avoid loops, a reader must check each socket found in a chain
> really belongs to the chain the reader was traversing. If it finds a
> mismatch, lookup must start again at the begining. This *restart* loop
> is the reason we had to use rdlock for the multicast case, because
> we dont want to send same message several times to the same socket.
> 
> We use RCU only for fast path. Thus, /proc/net/udp still take rdlocks.

We should just make it a spin_lock later and speed up udp socket creation.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks
  2008-10-28 21:23                     ` Christian Bell
@ 2008-10-28 21:31                       ` Evgeniy Polyakov
  2008-10-28 21:48                       ` Eric Dumazet
  1 sibling, 0 replies; 134+ messages in thread
From: Evgeniy Polyakov @ 2008-10-28 21:31 UTC (permalink / raw)
  To: Christian Bell; +Cc: Eric Dumazet, netdev

On Tue, Oct 28, 2008 at 02:23:55PM -0700, Christian Bell (christian@myri.com) wrote:
> >+	} else {
> >+		hslot = &udptable->hash[udp_hashfn(net, snum)];
> >+		write_lock_bh(&hslot->lock);
> >+		if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
> >+			goto fail;
> 
> The fail: label below should still unlock_bh when the above condition  
> fails.

This should actually be a new label, since the existing one is also used for
the non-locked failure path.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks
  2008-10-28 21:23                     ` Christian Bell
  2008-10-28 21:31                       ` Evgeniy Polyakov
@ 2008-10-28 21:48                       ` Eric Dumazet
  1 sibling, 0 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-28 21:48 UTC (permalink / raw)
  To: Christian Bell; +Cc: netdev

Christian Bell a écrit :
> 
> On Oct 28, 2008, at 1:37 PM, Eric Dumazet wrote:
> 
>> -extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
>> -extern rwlock_t udp_hash_lock;
>> +struct udp_hslot {
>> +    struct hlist_head    head;
>> +    rwlock_t        lock;
>> +};
> 
> This structure should be aligned up to cacheline to reduce false sharing 
> of more than one hslot.


Yes, I thought about that. But : a full cache line is a waste of memory, and
choosing a power of two alignment is not easy because of 32bit/64bit arches,
and the fact that sizeof(rwlock_t) can be > 4 if DEBUG options are enabled.
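
For illustration, a sketch of the two options (the struct names here are
hypothetical and not part of any posted patch; the updated patch later in
this thread settles on the second form, with spinlock_t instead of rwlock_t):

	/* Full cache line per slot: no false sharing at all, but for
	 * UDP_HTABLE_SIZE (128) slots this wastes a lot of memory.
	 */
	struct udp_hslot_padded {
		struct hlist_head	head;
		rwlock_t		lock;
	} ____cacheline_aligned_in_smp;

	/* Compromise: a small power-of-two alignment (16 bytes on 64bit,
	 * 8 bytes on 32bit), so several slots share a cache line while
	 * the table stays compact.
	 */
	struct udp_hslot_compact {
		struct hlist_head	head;
		rwlock_t		lock;
	} __attribute__((aligned(2 * sizeof(long))));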

> 
>> +    } else {
>> +        hslot = &udptable->hash[udp_hashfn(net, snum)];
>> +        write_lock_bh(&hslot->lock);
>> +        if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
>> +            goto fail;
> 
> The fail: label below should still unlock_bh when the above condition 
> fails.
> 
>>
>> +    }
>>     inet_sk(sk)->num = snum;
>>     sk->sk_hash = snum;
>>     if (sk_unhashed(sk)) {
>> -        sk_add_node(sk, &udptable[udp_hashfn(net, snum)]);
>> +        sk_add_node(sk, &hslot->head);
>>         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>>     }
>> +    write_unlock_bh(&hslot->lock);
>>     error = 0;
>> fail:
>> -    write_unlock_bh(&udp_hash_lock);
>>     return error;
>> }

Good spotting, the write_unlock_bh(&hslot->lock) must be moved after the "fail:" label.
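
For reference, this is the shape the error paths take in the updated
(spinlock based) patch posted later in this thread; a separate fail_unlock
label is needed because the randomized-port loop can reach fail without
holding any chain lock:

	} else {
		hslot = &udptable->hash[udp_hashfn(net, snum)];
		spin_lock_bh(&hslot->lock);
		if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
			goto fail_unlock;	/* lock is held, drop it on exit */
	}
	inet_sk(sk)->num = snum;
	sk->sk_hash = snum;
	if (sk_unhashed(sk)) {
		sk_add_node(sk, &hslot->head);
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	}
	error = 0;
fail_unlock:
	spin_unlock_bh(&hslot->lock);
fail:			/* reached with the lock already released */
	return error;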

Thanks a lot



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 0/2] udp: Convert the UDP hash lock to RCU
  2008-10-28 21:28                   ` Stephen Hemminger
@ 2008-10-28 21:50                     ` Eric Dumazet
  0 siblings, 0 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-28 21:50 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, Peter Zijlstra, Evgeniy Polyakov

Stephen Hemminger a écrit :
> On Tue, 28 Oct 2008 21:37:15 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>> We use RCU only for fast path. Thus, /proc/net/udp still take rdlocks.
> 
> We should just make it a spin_lock later and speed up udp socket creation.

Indeed


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-28 20:42                   ` [PATCH 2/2] udp: RCU handling for Unicast packets Eric Dumazet
@ 2008-10-28 22:45                     ` Eric Dumazet
  2008-10-29  5:05                       ` David Miller
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-28 22:45 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, Peter Zijlstra, Evgeniy Polyakov

Eric Dumazet a écrit :
> RCUification of UDP hash tables
> 
> Goals are :
> 
> 1) Optimizing handling of incoming Unicast UDP frames, so that no memory
>  writes should happen in the fast path. Using an array of rwlocks (one per
>  slot for example is not an option in this regard)
> 
>  Note: Multicasts and broadcasts still will need to take a lock,
>  because doing a full lockless lookup in this case is difficult.
> 
> 2) No expensive operations in the socket bind/unhash phases :
>   - No expensive synchronize_rcu() calls.
> 
>   - No added rcu_head in socket structure, increasing memory needs,
>   but more important, forcing us to use call_rcu() calls,
>   that have the bad property of making sockets structure cold.
>   (rcu grace period between socket freeing and its potential reuse
>    make this socket being cold in CPU cache).
>   David did a previous patch using call_rcu() and noticed a 20%
>   impact on TCP connection rates.
>   Quoting Cristopher Lameter :
>    "Right. That results in cacheline cooldown. You'd want to recycle
>     the object as they are cache hot on a per cpu basis. That is screwed
>     up by the delayed regular rcu processing. We have seen multiple
>     regressions due to cacheline cooldown.
>     The only choice in cacheline hot sensitive areas is to deal with the
>     complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
> 
>   - Because udp sockets are allocated from dedicated kmem_cache,
>   use of SLAB_DESTROY_BY_RCU can help here.
> 
> Theory of operation :
> ---------------------
> 
> As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
> special attention must be taken by readers and writers.
> 
> Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
> reused, inserted in a different chain or in worst case in the same chain
> while readers could do lookups in the same time.
> 
> In order to avoid loops, a reader must check each socket found in a chain
> really belongs to the chain the reader was traversing. If it finds a
> mismatch, lookup must start again at the begining. This *restart* loop
> is the reason we had to use rdlock for the multicast case, because
> we dont want to send same message several times to the same socket.
> 
> We use RCU only for fast path. Thus, /proc/net/udp still take rdlocks.
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
> include/net/sock.h |   37 ++++++++++++++++++++++++++++++++++++-
> net/core/sock.c    |    3 ++-
> net/ipv4/udp.c     |   35 ++++++++++++++++++++++++++---------
> net/ipv4/udplite.c |    1 +
> net/ipv6/udp.c     |   25 ++++++++++++++++++-------
> net/ipv6/udplite.c |    1 +
> 6 files changed, 84 insertions(+), 18 deletions(-)
> 

On the ipv6 side, I forgot to add a check before compute_score(), like I did on the ipv4 side


+	rcu_read_lock();
+begin:
+	result = NULL;
+	badness = -1;
+	sk_for_each_rcu(sk, node, &hslot->head) {

< BEGIN HERE missing part --->
		/*
		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
		 * We must check this item was not moved to another chain
		 */
		if (udp_hashfn(net, sk->sk_hash) != hash)
			goto begin;

< END missing part --->
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}
-	if (result)
-		sock_hold(result);
-	read_unlock(&hslot->lock);
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
+					daddr, dport, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+ 		}
+	}


I will submit a new patch series tomorrow, with :

Patch 1 : spinlocks instead of rwlocks, and bug spotted by Christian Bell

Patch 2 : split into two parts (2 & 3), one for IPV4, one for IPV6

Thanks


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-28 22:45                     ` Eric Dumazet
@ 2008-10-29  5:05                       ` David Miller
  2008-10-29  8:23                         ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: David Miller @ 2008-10-29  5:05 UTC (permalink / raw)
  To: dada1
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck, clameter,
	a.p.zijlstra, johnpol

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 28 Oct 2008 23:45:15 +0100

> I will submit a new patch serie tomorrow, with :
> 
> Patch 1 : spinlocks instead of rwlocks, and bug spotted by Christian Bell
> 
> Patch 2 : splited on two parts (2 & 3) , one for IPV4, one for IPV6, 

I very much look forward to this :-)

I like these changes and can't wait to add them to net-next-2.6

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29  5:05                       ` David Miller
@ 2008-10-29  8:23                         ` Eric Dumazet
  2008-10-29  8:56                           ` David Miller
  2008-10-29  9:04                           ` Eric Dumazet
  0 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29  8:23 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 2269 bytes --]

David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Tue, 28 Oct 2008 23:45:15 +0100
> 
>> I will submit a new patch serie tomorrow, with :
>>
>> Patch 1 : spinlocks instead of rwlocks, and bug spotted by Christian Bell
>>
>> Patch 2 : splited on two parts (2 & 3) , one for IPV4, one for IPV6, 
> 
> I very much look forward to this :-)
> 
> I like these changes and can't wait to add them to net-next-2.6

Thanks David, please find the first updated patch (patch 1) below.

Thanks to Christian Bell and Stephen for their useful review.


[PATCH] udp: introduce struct udp_table and multiple spinlocks

UDP sockets are hashed in a 128 slots hash table.

This hash table is protected by *one* rwlock.

This rwlock is readlocked each time an incoming UDP message is handled.

This rwlock is writelocked each time a socket must be inserted in
hash table (bind time), or deleted from this table (close time)

This is not scalable on SMP machines :

1) Even in read mode, lock() and unlock() are atomic operations and
 must dirty a contended cache line, shared by all cpus.

2) A writer might be starved if many readers are 'in flight'. This can
 happen on a machine with some NIC receiving many UDP messages. User
 process can be delayed a long time at socket creation/dismantle time.

This patch prepares the RCU migration by introducing 'struct udp_table'
and 'struct udp_hslot', and by using one spinlock per chain to reduce
contention on the central rwlock.

Introducing one spinlock per chain reduces latencies for port
randomization on heavily loaded UDP servers. This also speeds up
binding to specific ports.

udp_lib_unhash() was uninlined, as it became too big.

Some cleanups were done to ease review of following patch
(RCUification of UDP Unicast lookups)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/net/sock.h    |    2
 include/net/udp.h     |   25 ++--
 include/net/udplite.h |    2
 net/ipv4/udp.c        |  209 +++++++++++++++++++++++-----------------
 net/ipv4/udp_impl.h   |    4
 net/ipv4/udplite.c    |   13 +-
 net/ipv6/udp.c        |  112 +++++++++++----------
 net/ipv6/udp_impl.h   |    4
 net/ipv6/udplite.c    |    8 -
 9 files changed, 215 insertions(+), 164 deletions(-)

[-- Attachment #2: patch_udp_cleanup1.patch --]
[-- Type: text/plain, Size: 24849 bytes --]

diff --git a/include/net/sock.h b/include/net/sock.h
index d6b750a..d200dfb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -599,7 +599,7 @@ struct proto {
 
 	union {
 		struct inet_hashinfo	*hashinfo;
-		struct hlist_head	*udp_hash;
+		struct udp_table	*udp_table;
 		struct raw_hashinfo	*raw_hash;
 	} h;
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 1e20509..df2bfe5 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -50,8 +50,15 @@ struct udp_skb_cb {
 };
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
-extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-extern rwlock_t udp_hash_lock;
+struct udp_hslot {
+	struct hlist_head	head;
+	spinlock_t		lock;
+} __attribute__((aligned(2 * sizeof(long))));
+struct udp_table {
+	struct udp_hslot	hash[UDP_HTABLE_SIZE];
+};
+extern struct udp_table udp_table;
+extern void udp_table_init(struct udp_table *);
 
 
 /* Note: this must match 'valbool' in sock_setsockopt */
@@ -110,15 +117,7 @@ static inline void udp_lib_hash(struct sock *sk)
 	BUG();
 }
 
-static inline void udp_lib_unhash(struct sock *sk)
-{
-	write_lock_bh(&udp_hash_lock);
-	if (sk_del_node_init(sk)) {
-		inet_sk(sk)->num = 0;
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-	}
-	write_unlock_bh(&udp_hash_lock);
-}
+extern void udp_lib_unhash(struct sock *sk);
 
 static inline void udp_lib_close(struct sock *sk, long timeout)
 {
@@ -187,7 +186,7 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 struct udp_seq_afinfo {
 	char			*name;
 	sa_family_t		family;
-	struct hlist_head	*hashtable;
+	struct udp_table	*udp_table;
 	struct file_operations	seq_fops;
 	struct seq_operations	seq_ops;
 };
@@ -196,7 +195,7 @@ struct udp_iter_state {
 	struct seq_net_private  p;
 	sa_family_t		family;
 	int			bucket;
-	struct hlist_head	*hashtable;
+	struct udp_table	*udp_table;
 };
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/net/udplite.h b/include/net/udplite.h
index b76b2e3..afdffe6 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -11,7 +11,7 @@
 #define UDPLITE_RECV_CSCOV   11 /* receiver partial coverage (threshold ) */
 
 extern struct proto 		udplite_prot;
-extern struct hlist_head 	udplite_hash[UDP_HTABLE_SIZE];
+extern struct udp_table		udplite_table;
 
 /*
  *	Checksum computation is all in software, hence simpler getfrag.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2095abc..b91cd0a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -104,12 +104,8 @@
 #include <net/xfrm.h>
 #include "udp_impl.h"
 
-/*
- *	Snmp MIB for the UDP layer
- */
-
-struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-DEFINE_RWLOCK(udp_hash_lock);
+struct udp_table udp_table;
+EXPORT_SYMBOL(udp_table);
 
 int sysctl_udp_mem[3] __read_mostly;
 int sysctl_udp_rmem_min __read_mostly;
@@ -123,7 +119,7 @@ atomic_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
 static int udp_lib_lport_inuse(struct net *net, __u16 num,
-			       const struct hlist_head udptable[],
+			       const struct udp_hslot *hslot,
 			       struct sock *sk,
 			       int (*saddr_comp)(const struct sock *sk1,
 						 const struct sock *sk2))
@@ -131,7 +127,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	struct sock *sk2;
 	struct hlist_node *node;
 
-	sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)])
+	sk_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
 		    sk2->sk_hash == num				&&
@@ -154,12 +150,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		       int (*saddr_comp)(const struct sock *sk1,
 					 const struct sock *sk2 )    )
 {
-	struct hlist_head *udptable = sk->sk_prot->h.udp_hash;
+	struct udp_hslot *hslot;
+	struct udp_table *udptable = sk->sk_prot->h.udp_table;
 	int    error = 1;
 	struct net *net = sock_net(sk);
 
-	write_lock_bh(&udp_hash_lock);
-
 	if (!snum) {
 		int low, high, remaining;
 		unsigned rand;
@@ -171,26 +166,34 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 		rand = net_random();
 		snum = first = rand % remaining + low;
 		rand |= 1;
-		while (udp_lib_lport_inuse(net, snum, udptable, sk,
-					   saddr_comp)) {
+		for (;;) {
+			hslot = &udptable->hash[udp_hashfn(net, snum)];
+			spin_lock_bh(&hslot->lock);
+			if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
+				break;
+			spin_unlock_bh(&hslot->lock);
 			do {
 				snum = snum + rand;
 			} while (snum < low || snum > high);
 			if (snum == first)
 				goto fail;
 		}
-	} else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp))
-		goto fail;
-
+	} else {
+		hslot = &udptable->hash[udp_hashfn(net, snum)];
+		spin_lock_bh(&hslot->lock);
+		if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
+			goto fail_unlock;
+	}
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		sk_add_node(sk, &udptable[udp_hashfn(net, snum)]);
+		sk_add_node(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	error = 0;
+fail_unlock:
+	spin_unlock_bh(&hslot->lock);
 fail:
-	write_unlock_bh(&udp_hash_lock);
 	return error;
 }
 
@@ -208,63 +211,73 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
 	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
 }
 
+static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
+			 unsigned short hnum,
+			 __be16 sport, __be32 daddr, __be16 dport, int dif)
+{
+	int score = -1;
+
+	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+			!ipv6_only_sock(sk)) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		score = (sk->sk_family == PF_INET ? 1 : 0);
+		if (inet->rcv_saddr) {
+			if (inet->rcv_saddr != daddr)
+				return -1;
+			score += 2;
+		}
+		if (inet->daddr) {
+			if (inet->daddr != saddr)
+				return -1;
+			score += 2;
+		}
+		if (inet->dport) {
+			if (inet->dport != sport)
+				return -1;
+			score += 2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score += 2;
+		}
+	}
+	return score;
+}
+
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
 static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		__be16 sport, __be32 daddr, __be16 dport,
-		int dif, struct hlist_head udptable[])
+		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result = NULL;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
-	int badness = -1;
-
-	read_lock(&udp_hash_lock);
-	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
-				!ipv6_only_sock(sk)) {
-			int score = (sk->sk_family == PF_INET ? 1 : 0);
-			if (inet->rcv_saddr) {
-				if (inet->rcv_saddr != daddr)
-					continue;
-				score+=2;
-			}
-			if (inet->daddr) {
-				if (inet->daddr != saddr)
-					continue;
-				score+=2;
-			}
-			if (inet->dport) {
-				if (inet->dport != sport)
-					continue;
-				score+=2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score+=2;
-			}
-			if (score == 9) {
-				result = sk;
-				break;
-			} else if (score > badness) {
-				result = sk;
-				badness = score;
-			}
-		}
+	unsigned int hash = udp_hashfn(net, hnum);
+	struct udp_hslot *hslot = &udptable->hash[hash];
+	int score, badness = -1;
+
+	spin_lock(&hslot->lock);
+	sk_for_each(sk, node, &hslot->head) {
+		score = compute_score(sk, net, saddr, hnum, sport,
+				      daddr, dport, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
+ 		}
 	}
 	if (result)
 		sock_hold(result);
-	read_unlock(&udp_hash_lock);
+	spin_unlock(&hslot->lock);
 	return result;
 }
 
 static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 						 __be16 sport, __be16 dport,
-						 struct hlist_head udptable[])
+						 struct udp_table *udptable)
 {
 	struct sock *sk;
 	const struct iphdr *iph = ip_hdr(skb);
@@ -280,7 +293,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 			     __be32 daddr, __be16 dport, int dif)
 {
-	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash);
+	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 
@@ -323,7 +336,7 @@ found:
  * to find the appropriate port.
  */
 
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
+void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 {
 	struct inet_sock *inet;
 	struct iphdr *iph = (struct iphdr*)skb->data;
@@ -392,7 +405,7 @@ out:
 
 void udp_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, udp_hash);
+	__udp4_lib_err(skb, info, &udp_table);
 }
 
 /*
@@ -933,6 +946,21 @@ int udp_disconnect(struct sock *sk, int flags)
 	return 0;
 }
 
+void udp_lib_unhash(struct sock *sk)
+{
+	struct udp_table *udptable = sk->sk_prot->h.udp_table;
+	unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
+	struct udp_hslot *hslot = &udptable->hash[hash];
+
+	spin_lock(&hslot->lock);
+	if (sk_del_node_init(sk)) {
+		inet_sk(sk)->num = 0;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+	}
+	spin_unlock(&hslot->lock);
+}
+EXPORT_SYMBOL(udp_lib_unhash);
+
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int is_udplite = IS_UDPLITE(sk);
@@ -1071,13 +1099,14 @@ drop:
 static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udphdr  *uh,
 				    __be32 saddr, __be32 daddr,
-				    struct hlist_head udptable[])
+				    struct udp_table *udptable)
 {
 	struct sock *sk;
+	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
 	int dif;
 
-	read_lock(&udp_hash_lock);
-	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+	spin_lock(&hslot->lock);
+	sk = sk_head(&hslot->head);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (sk) {
@@ -1102,7 +1131,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		} while (sknext);
 	} else
 		kfree_skb(skb);
-	read_unlock(&udp_hash_lock);
+	spin_unlock(&hslot->lock);
 	return 0;
 }
 
@@ -1148,7 +1177,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
  *	All we need to do is get the socket, and then do a checksum.
  */
 
-int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
+int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		   int proto)
 {
 	struct sock *sk;
@@ -1246,7 +1275,7 @@ drop:
 
 int udp_rcv(struct sk_buff *skb)
 {
-	return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP);
+	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
 }
 
 void udp_destroy_sock(struct sock *sk)
@@ -1488,7 +1517,7 @@ struct proto udp_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
-	.h.udp_hash	   = udp_hash,
+	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
@@ -1498,20 +1527,23 @@ struct proto udp_prot = {
 /* ------------------------------------------------------------------------ */
 #ifdef CONFIG_PROC_FS
 
-static struct sock *udp_get_first(struct seq_file *seq)
+static struct sock *udp_get_first(struct seq_file *seq, int start)
 {
 	struct sock *sk;
 	struct udp_iter_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
 
-	for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
 		struct hlist_node *node;
-		sk_for_each(sk, node, state->hashtable + state->bucket) {
+		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+		spin_lock_bh(&hslot->lock);
+		sk_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
 				goto found;
 		}
+		spin_unlock_bh(&hslot->lock);
 	}
 	sk = NULL;
 found:
@@ -1525,20 +1557,18 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 
 	do {
 		sk = sk_next(sk);
-try_again:
-		;
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
-	if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
-		sk = sk_head(state->hashtable + state->bucket);
-		goto try_again;
+	if (!sk) {
+		spin_unlock(&state->udp_table->hash[state->bucket].lock);
+		return udp_get_first(seq, state->bucket + 1);
 	}
 	return sk;
 }
 
 static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 {
-	struct sock *sk = udp_get_first(seq);
+	struct sock *sk = udp_get_first(seq, 0);
 
 	if (sk)
 		while (pos && (sk = udp_get_next(seq, sk)) != NULL)
@@ -1547,9 +1577,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(udp_hash_lock)
 {
-	read_lock(&udp_hash_lock);
 	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
 
@@ -1567,9 +1595,11 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void udp_seq_stop(struct seq_file *seq, void *v)
-	__releases(udp_hash_lock)
 {
-	read_unlock(&udp_hash_lock);
+	struct udp_iter_state *state = seq->private;
+
+	if (state->bucket < UDP_HTABLE_SIZE)
+		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 }
 
 static int udp_seq_open(struct inode *inode, struct file *file)
@@ -1585,7 +1615,7 @@ static int udp_seq_open(struct inode *inode, struct file *file)
 
 	s = ((struct seq_file *)file->private_data)->private;
 	s->family		= afinfo->family;
-	s->hashtable		= afinfo->hashtable;
+	s->udp_table		= afinfo->udp_table;
 	return err;
 }
 
@@ -1657,7 +1687,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
 static struct udp_seq_afinfo udp4_seq_afinfo = {
 	.name		= "udp",
 	.family		= AF_INET,
-	.hashtable	= udp_hash,
+	.udp_table	= &udp_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},
@@ -1692,10 +1722,21 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */
 
+void __init udp_table_init(struct udp_table *table)
+{
+	int i;
+	
+	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+		INIT_HLIST_HEAD(&table->hash[i].head);
+		spin_lock_init(&table->hash[i].lock);
+	}
+}
+
 void __init udp_init(void)
 {
 	unsigned long limit;
 
+	udp_table_init(&udp_table);
 	/* Set the pressure threshold up by the same strategy of TCP. It is a
 	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
 	 * toward zero with the amount of memory, with a floor of 128 pages.
@@ -1712,8 +1753,6 @@ void __init udp_init(void)
 }
 
 EXPORT_SYMBOL(udp_disconnect);
-EXPORT_SYMBOL(udp_hash);
-EXPORT_SYMBOL(udp_hash_lock);
 EXPORT_SYMBOL(udp_ioctl);
 EXPORT_SYMBOL(udp_prot);
 EXPORT_SYMBOL(udp_sendmsg);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 2e9bad2..9f4a616 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -5,8 +5,8 @@
 #include <net/protocol.h>
 #include <net/inet_common.h>
 
-extern int  	__udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int );
-extern void 	__udp4_lib_err(struct sk_buff *, u32, struct hlist_head []);
+extern int  	__udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
+extern void 	__udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
 
 extern int	udp_v4_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 3c80796..d8ea8e5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,16 +12,17 @@
  */
 #include "udp_impl.h"
 
-struct hlist_head 	udplite_hash[UDP_HTABLE_SIZE];
+struct udp_table 	udplite_table;
+EXPORT_SYMBOL(udplite_table);
 
 static int udplite_rcv(struct sk_buff *skb)
 {
-	return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
+	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
 static void udplite_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, udplite_hash);
+	__udp4_lib_err(skb, info, &udplite_table);
 }
 
 static	struct net_protocol udplite_protocol = {
@@ -50,7 +51,7 @@ struct proto 	udplite_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
 	.obj_size	   = sizeof(struct udp_sock),
-	.h.udp_hash	   = udplite_hash,
+	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
@@ -71,7 +72,7 @@ static struct inet_protosw udplite4_protosw = {
 static struct udp_seq_afinfo udplite4_seq_afinfo = {
 	.name		= "udplite",
 	.family		= AF_INET,
-	.hashtable	= udplite_hash,
+	.udp_table 	= &udplite_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},
@@ -108,6 +109,7 @@ static inline int udplite4_proc_init(void)
 
 void __init udplite4_register(void)
 {
+	udp_table_init(&udplite_table);
 	if (proto_register(&udplite_prot, 1))
 		goto out_register_err;
 
@@ -126,5 +128,4 @@ out_register_err:
 	printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
 }
 
-EXPORT_SYMBOL(udplite_hash);
 EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e51da8c..ccee724 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -54,62 +54,73 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal);
 }
 
+static inline int compute_score(struct sock *sk, struct net *net,
+				unsigned short hnum,
+				struct in6_addr *saddr, __be16 sport,
+				struct in6_addr *daddr, __be16 dport,
+				int dif)
+{
+	int score = -1;
+
+	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+			sk->sk_family == PF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+		struct inet_sock *inet = inet_sk(sk);
+
+		score = 0;
+		if (inet->dport) {
+			if (inet->dport != sport)
+				return -1;
+			score++;
+		}
+		if (!ipv6_addr_any(&np->rcv_saddr)) {
+			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+				return -1;
+			score++;
+		}
+		if (!ipv6_addr_any(&np->daddr)) {
+			if (!ipv6_addr_equal(&np->daddr, saddr))
+				return -1;
+			score++;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score++;
+		}
+	}
+	return score;
+}
+
 static struct sock *__udp6_lib_lookup(struct net *net,
 				      struct in6_addr *saddr, __be16 sport,
 				      struct in6_addr *daddr, __be16 dport,
-				      int dif, struct hlist_head udptable[])
+				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result = NULL;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
-	int badness = -1;
-
-	read_lock(&udp_hash_lock);
-	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
-				sk->sk_family == PF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
-			int score = 0;
-			if (inet->dport) {
-				if (inet->dport != sport)
-					continue;
-				score++;
-			}
-			if (!ipv6_addr_any(&np->rcv_saddr)) {
-				if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
-					continue;
-				score++;
-			}
-			if (!ipv6_addr_any(&np->daddr)) {
-				if (!ipv6_addr_equal(&np->daddr, saddr))
-					continue;
-				score++;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score++;
-			}
-			if (score == 4) {
-				result = sk;
-				break;
-			} else if (score > badness) {
-				result = sk;
-				badness = score;
-			}
+	unsigned int hash = udp_hashfn(net, hnum);
+	struct udp_hslot *hslot = &udptable->hash[hash];
+	int score, badness = -1;
+
+	spin_lock(&hslot->lock);
+	sk_for_each(sk, node, &hslot->head) {
+		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+		if (score > badness) {
+			result = sk;
+			badness = score;
 		}
 	}
 	if (result)
 		sock_hold(result);
-	read_unlock(&udp_hash_lock);
+	spin_unlock(&hslot->lock);
 	return result;
 }
 
 static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 					  __be16 sport, __be16 dport,
-					  struct hlist_head udptable[])
+					  struct udp_table *udptable)
 {
 	struct sock *sk;
 	struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -239,7 +250,7 @@ csum_copy_err:
 
 void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		    int type, int code, int offset, __be32 info,
-		    struct hlist_head udptable[]                    )
+		    struct udp_table *udptable)
 {
 	struct ipv6_pinfo *np;
 	struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
@@ -275,7 +286,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 				 struct inet6_skb_parm *opt, int type,
 				 int code, int offset, __be32 info     )
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, udp_hash);
+	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
 int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
@@ -374,14 +385,15 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
  */
 static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		struct in6_addr *saddr, struct in6_addr *daddr,
-		struct hlist_head udptable[])
+		struct udp_table *udptable)
 {
 	struct sock *sk, *sk2;
 	const struct udphdr *uh = udp_hdr(skb);
+	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
 	int dif;
 
-	read_lock(&udp_hash_lock);
-	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+	spin_lock(&hslot->lock);
+	sk = sk_head(&hslot->head);
 	dif = inet6_iif(skb);
 	sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (!sk) {
@@ -409,7 +421,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
 out:
-	read_unlock(&udp_hash_lock);
+	spin_unlock(&hslot->lock);
 	return 0;
 }
 
@@ -447,7 +459,7 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh,
 	return 0;
 }
 
-int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
+int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 		   int proto)
 {
 	struct sock *sk;
@@ -544,7 +556,7 @@ discard:
 
 static __inline__ int udpv6_rcv(struct sk_buff *skb)
 {
-	return __udp6_lib_rcv(skb, udp_hash, IPPROTO_UDP);
+	return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
 }
 
 /*
@@ -1008,7 +1020,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
 static struct udp_seq_afinfo udp6_seq_afinfo = {
 	.name		= "udp6",
 	.family		= AF_INET6,
-	.hashtable	= udp_hash,
+	.udp_table	= &udp_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},
@@ -1050,7 +1062,7 @@ struct proto udpv6_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp6_sock),
-	.h.udp_hash	   = udp_hash,
+	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 92dd7da..2377920 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -7,9 +7,9 @@
 #include <net/inet_common.h>
 #include <net/transp_v6.h>
 
-extern int  	__udp6_lib_rcv(struct sk_buff *, struct hlist_head [], int );
+extern int  	__udp6_lib_rcv(struct sk_buff *, struct udp_table *, int );
 extern void 	__udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
-			       int , int , int , __be32 , struct hlist_head []);
+			       int , int , int , __be32 , struct udp_table *);
 
 extern int	udp_v6_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 3cd1a1a..f1e892a 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -15,14 +15,14 @@
 
 static int udplitev6_rcv(struct sk_buff *skb)
 {
-	return __udp6_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
+	return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
 static void udplitev6_err(struct sk_buff *skb,
 			  struct inet6_skb_parm *opt,
 			  int type, int code, int offset, __be32 info)
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash);
+	__udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
 }
 
 static struct inet6_protocol udplitev6_protocol = {
@@ -49,7 +49,7 @@ struct proto udplitev6_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v6_get_port,
 	.obj_size	   = sizeof(struct udp6_sock),
-	.h.udp_hash	   = udplite_hash,
+	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
@@ -95,7 +95,7 @@ void udplitev6_exit(void)
 static struct udp_seq_afinfo udplite6_seq_afinfo = {
 	.name		= "udplite6",
 	.family		= AF_INET6,
-	.hashtable	= udplite_hash,
+	.udp_table	= &udplite_table,
 	.seq_fops	= {
 		.owner	=	THIS_MODULE,
 	},

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29  8:23                         ` Eric Dumazet
@ 2008-10-29  8:56                           ` David Miller
  2008-10-29 10:19                             ` Eric Dumazet
  2008-10-29  9:04                           ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: David Miller @ 2008-10-29  8:56 UTC (permalink / raw)
  To: dada1
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck, cl,
	a.p.zijlstra, johnpol, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 29 Oct 2008 09:23:03 +0100

> David Miller a écrit :
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Tue, 28 Oct 2008 23:45:15 +0100
> > 
> >> I will submit a new patch serie tomorrow, with :
> >>
> >> Patch 1 : spinlocks instead of rwlocks, and bug spotted by Christian Bell
> >>
> >> Patch 2 : splited on two parts (2 & 3) , one for IPV4, one for IPV6, 
> > I very much look forward to this :-)
> > I like these changes and can't wait to add them to net-next-2.6
> 
> Thanks David, please find first updated patch 1

Applied, please (re-)send the current version of patch 2 as well.

Thanks.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29  8:23                         ` Eric Dumazet
  2008-10-29  8:56                           ` David Miller
@ 2008-10-29  9:04                           ` Eric Dumazet
  2008-10-29  9:17                             ` David Miller
  2008-10-29 13:17                             ` Corey Minyard
  1 sibling, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29  9:04 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 3262 bytes --]

Eric Dumazet a écrit :
> David Miller a écrit :
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Tue, 28 Oct 2008 23:45:15 +0100
>>
>>> I will submit a new patch serie tomorrow, with :
>>>
>>> Patch 1 : spinlocks instead of rwlocks, and bug spotted by Christian 
>>> Bell
>>>
>>> Patch 2 : splited on two parts (2 & 3) , one for IPV4, one for IPV6, 
>>
>> I very much look forward to this :-)
>>
>> I like these changes and can't wait to add them to net-next-2.6
> 

Please find the updated patch 2 below.

The missing check in __udp6_lib_lookup() was added, and the patch is now
based on the spinlock version ([PATCH] udp: introduce struct udp_table and multiple spinlocks)

Thank you

[PATCH] udp: RCU handling for Unicast packets.

Goals are :

1) Optimizing handling of incoming Unicast UDP frames, so that no memory
 writes should happen in the fast path.

 Note: Multicasts and broadcasts still will need to take a lock,
 because doing a full lockless lookup in this case is difficult.

2) No expensive operations in the socket bind/unhash phases :
  - No expensive synchronize_rcu() calls.

  - No added rcu_head in the socket structure, increasing memory needs,
  but more importantly, forcing us to use call_rcu() calls,
  which have the bad property of making the socket structure cold.
  (The rcu grace period between socket freeing and its potential reuse
   makes the socket cold in the CPU cache.)
  David did a previous patch using call_rcu() and noticed a 20%
  impact on TCP connection rates.
  Quoting Christoph Lameter :
   "Right. That results in cacheline cooldown. You'd want to recycle
    the object as they are cache hot on a per cpu basis. That is screwed
    up by the delayed regular rcu processing. We have seen multiple
    regressions due to cacheline cooldown.
    The only choice in cacheline hot sensitive areas is to deal with the
    complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."

  - Because udp sockets are allocated from dedicated kmem_cache,
  use of SLAB_DESTROY_BY_RCU can help here.

Theory of operation :
---------------------

As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.

Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.

In order to avoid loops, a reader must check that each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, the lookup must start again at the beginning. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we don't want to send the same message several times to the same socket.

We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/net/sock.h |   37 ++++++++++++++++++++++++++++++++++++-
 net/core/sock.c    |    3 ++-
 net/ipv4/udp.c     |   35 ++++++++++++++++++++++++++---------
 net/ipv4/udplite.c |    1 +
 net/ipv6/udp.c     |   31 ++++++++++++++++++++++++-------
 net/ipv6/udplite.c |    1 +
 6 files changed, 90 insertions(+), 18 deletions(-)

[-- Attachment #2: patch_udp_2.patch --]
[-- Type: text/plain, Size: 7381 bytes --]

diff --git a/include/net/sock.h b/include/net/sock.h
index d200dfb..0bea25d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -363,6 +363,27 @@ static __inline__ int sk_del_node_init(struct sock *sk)
 	return rc;
 }
 
+static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
+{
+	if (sk_hashed(sk)) {
+		hlist_del_init_rcu(&sk->sk_node);
+		return 1;
+	}
+	return 0;
+}
+
+static __inline__ int sk_del_node_init_rcu(struct sock *sk)
+{
+	int rc = __sk_del_node_init_rcu(sk);
+
+	if (rc) {
+		/* paranoid for a while -acme */
+		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
+		__sock_put(sk);
+	}
+	return rc;
+}
+
 static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
 {
 	hlist_add_head(&sk->sk_node, list);
@@ -374,6 +395,17 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
 	__sk_add_node(sk, list);
 }
 
+static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+{
+	hlist_add_head_rcu(&sk->sk_node, list);
+}
+
+static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+{
+	sock_hold(sk);
+	__sk_add_node_rcu(sk, list);
+}
+
 static __inline__ void __sk_del_bind_node(struct sock *sk)
 {
 	__hlist_del(&sk->sk_bind_node);
@@ -387,6 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
+#define sk_for_each_rcu(__sk, node, list) \
+	hlist_for_each_entry_rcu(__sk, node, list, sk_node)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
@@ -589,8 +623,9 @@ struct proto {
 	int			*sysctl_rmem;
 	int			max_header;
 
-	struct kmem_cache		*slab;
+	struct kmem_cache	*slab;
 	unsigned int		obj_size;
+	int			slab_flags;
 
 	atomic_t		*orphan_count;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 5e2a313..ded1eb5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2042,7 +2042,8 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-					       SLAB_HWCACHE_ALIGN, NULL);
+					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					NULL);
 
 		if (prot->slab == NULL) {
 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b91cd0a..5c1cbe1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -187,7 +187,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		sk_add_node(sk, &hslot->head);
+		sk_add_node_rcu(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	error = 0;
@@ -253,15 +253,24 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		__be16 sport, __be32 daddr, __be16 dport,
 		int dif, struct udp_table *udptable)
 {
-	struct sock *sk, *result = NULL;
+	struct sock *sk, *result;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
-	int score, badness = -1;
+	int score, badness;
 
-	spin_lock(&hslot->lock);
-	sk_for_each(sk, node, &hslot->head) {
+	rcu_read_lock();
+begin:
+	result = NULL;
+	badness = -1;
+	sk_for_each_rcu(sk, node, &hslot->head) {
+		/*
+		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
+		 * We must check this item was not moved to another chain
+		 */
+		if (udp_hashfn(net, sk->sk_hash) != hash)
+			goto begin;
 		score = compute_score(sk, net, saddr, hnum, sport,
 				      daddr, dport, dif);
 		if (score > badness) {
@@ -269,9 +278,16 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 			badness = score;
  		}
 	}
-	if (result)
-		sock_hold(result);
-	spin_unlock(&hslot->lock);
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
+				  daddr, dport, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	rcu_read_unlock();
 	return result;
 }
 
@@ -953,7 +969,7 @@ void udp_lib_unhash(struct sock *sk)
 	struct udp_hslot *hslot = &udptable->hash[hash];
 
 	spin_lock(&hslot->lock);
-	if (sk_del_node_init(sk)) {
+	if (sk_del_node_init_rcu(sk)) {
 		inet_sk(sk)->num = 0;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
@@ -1517,6 +1533,7 @@ struct proto udp_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index d8ea8e5..c784891 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -51,6 +51,7 @@ struct proto 	udplite_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
 	.obj_size	   = sizeof(struct udp_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ccee724..df78ddc 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -97,24 +97,40 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      struct in6_addr *daddr, __be16 dport,
 				      int dif, struct udp_table *udptable)
 {
-	struct sock *sk, *result = NULL;
+	struct sock *sk, *result;
 	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
-	int score, badness = -1;
+	int score, badness;
 
-	spin_lock(&hslot->lock);
-	sk_for_each(sk, node, &hslot->head) {
+	rcu_read_lock();
+begin:
+	result = NULL;
+	badness = -1;
+	sk_for_each_rcu(sk, node, &hslot->head) {
+		/*
+		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
+		 * We must check this item was not moved to another chain
+		 */
+		if (udp_hashfn(net, sk->sk_hash) != hash)
+			goto begin;
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}
-	if (result)
-		sock_hold(result);
-	spin_unlock(&hslot->lock);
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
+					daddr, dport, dif) < badness)) {
+			sock_put(result);
+			goto begin;
+ 		}
+	}
+	rcu_read_unlock();
 	return result;
 }
 
@@ -1062,6 +1078,7 @@ struct proto udpv6_prot = {
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp6_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table	   = &udp_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index f1e892a..05ab176 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -49,6 +49,7 @@ struct proto udplitev6_prot = {
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v6_get_port,
 	.obj_size	   = sizeof(struct udp6_sock),
+ 	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udpv6_setsockopt,

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29  9:04                           ` Eric Dumazet
@ 2008-10-29  9:17                             ` David Miller
  2008-10-29 13:17                             ` Corey Minyard
  1 sibling, 0 replies; 134+ messages in thread
From: David Miller @ 2008-10-29  9:17 UTC (permalink / raw)
  To: dada1
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck, cl,
	a.p.zijlstra, johnpol, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 29 Oct 2008 10:04:24 +0100

> Eric Dumazet a écrit :
> > David Miller a écrit :
> >> From: Eric Dumazet <dada1@cosmosbay.com>
> >> Date: Tue, 28 Oct 2008 23:45:15 +0100
> >>
> >>> I will submit a new patch serie tomorrow, with :
> >>>
> >>> Patch 1 : spinlocks instead of rwlocks, and bug spotted by Christian Bell
> >>>
> >>> Patch 2 : splited on two parts (2 & 3) , one for IPV4, one for IPV6, 
> >>
> >> I very much look forward to this :-)
> >>
> >> I like these changes and can't wait to add them to net-next-2.6
> > 
> 
> Please find updated patch 2
> 
> Missing check in __udp6_lib_lookup() was added,
> and based on spinlock version ([PATCH] udp: introduce struct udp_table and multiple spinlocks)

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29  8:56                           ` David Miller
@ 2008-10-29 10:19                             ` Eric Dumazet
  2008-10-29 18:19                               ` David Miller
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 10:19 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck, cl,
	a.p.zijlstra, christian

[-- Attachment #1: Type: text/plain, Size: 862 bytes --]

David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Wed, 29 Oct 2008 09:23:03 +0100
> 
>> David Miller a écrit :
>>> From: Eric Dumazet <dada1@cosmosbay.com>
>>> Date: Tue, 28 Oct 2008 23:45:15 +0100
>>>
>>>> I will submit a new patch serie tomorrow, with :
>>>>
>>>> Patch 1 : spinlocks instead of rwlocks, and bug spotted by Christian Bell
>>>>
>>>> Patch 2 : splited on two parts (2 & 3) , one for IPV4, one for IPV6, 
>>> I very much look forward to this :-)
>>> I like these changes and can't wait to add them to net-next-2.6
>> Thanks David, please find first updated patch 1
> 
> Applied, please (re-)send the current version of patch 2 as well.

I found a fatal bug in /proc/net/udp handling, sorry.

[PATCH] udp: udp_get_next() should use spin_unlock_bh()

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: udp_3.patch --]
[-- Type: text/plain, Size: 499 bytes --]

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5ba0340..ced8203 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1579,7 +1579,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
-		spin_unlock(&state->udp_table->hash[state->bucket].lock);
+		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
 		return udp_get_first(seq, state->bucket + 1);
 	}
 	return sk;
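
The mismatch is fatal because udp_get_first() takes each chain lock with
spin_lock_bh(), which also disables bottom halves, so releasing it with a
plain spin_unlock() leaves softirqs disabled on that CPU. Condensed from
the code above, the pair has to look like:

	/* udp_get_first() takes the chain lock with BHs disabled: */
	spin_lock_bh(&hslot->lock);

	/* so when udp_get_next() leaves a bucket to move on to the next
	 * one, the release must be the _bh variant as well, or softirqs
	 * stay disabled after /proc/net/udp processing returns:
	 */
	spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);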

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29  9:04                           ` Eric Dumazet
  2008-10-29  9:17                             ` David Miller
@ 2008-10-29 13:17                             ` Corey Minyard
  2008-10-29 14:36                               ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-29 13:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, shemminger, benny+usenet, netdev, paulmck,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

I believe there is a race in this patch:

+	sk_for_each_rcu(sk, node, &hslot->head) {
+		/*
+		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
+		 * We must check this item was not moved to another chain
+		 */
+		if (udp_hashfn(net, sk->sk_hash) != hash)
+			goto begin;
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}

If the socket is moved from one list to another list in-between the time 
the hash is calculated and the next field is accessed, and the socket 
has moved to the end of the new list, the traversal will not complete 
properly on the list it should have, since the socket will be on the end 
of the new list and there's not a way to tell it's on a new list and 
restart the list traversal.  I think that this can be solved by 
pre-fetching the "next" field (with proper barriers) before checking the 
hash.

It might also be nice to have a way to avoid recomputing the score the 
second time, perhaps using a sequence number of some type.

-corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 13:17                             ` Corey Minyard
@ 2008-10-29 14:36                               ` Eric Dumazet
  2008-10-29 15:34                                 ` Corey Minyard
                                                   ` (2 more replies)
  0 siblings, 3 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 14:36 UTC (permalink / raw)
  To: Corey Minyard
  Cc: David Miller, shemminger, benny+usenet, netdev, paulmck,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 2684 bytes --]

Corey Minyard a écrit :
> I believe there is a race in this patch:
> 
> +    sk_for_each_rcu(sk, node, &hslot->head) {
> +        /*
> +         * lockless reader, and SLAB_DESTROY_BY_RCU items:
> +         * We must check this item was not moved to another chain
> +         */
> +        if (udp_hashfn(net, sk->sk_hash) != hash)
> +            goto begin;
>         score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, 
> dif);
>         if (score > badness) {
>             result = sk;
>             badness = score;
>         }
>     }
> 
> If the socket is moved from one list to another list in-between the time 
> the hash is calculated and the next field is accessed, and the socket 
> has moved to the end of the new list, the traversal will not complete 
> properly on the list it should have, since the socket will be on the end 
> of the new list and there's not a way to tell it's on a new list and 
> restart the list traversal.  I think that this can be solved by 
> pre-fetching the "next" field (with proper barriers) before checking the 
> hash.
> 

You are absolutely right. As we *need* the next pointer anyway for the prefetch(),
we can store it so that its value is kept.

> It also might be nice to have a way to avoid recomputing the score the 
> second time, perhaps using a sequence number of some type.

Well, computing the score is really cheap; everything is in cache, provided
the chain is not really huge and the high-score item is near the beginning.

Adding yet another field to the sock structure should be avoided if possible.

Thanks

[PATCH] udp: introduce sk_for_each_rcu_safenext()

Corey Minyard found a race added in commit 271b72c7fa82c2c7a795bc16896149933110672d
(udp: RCU handling for Unicast packets.)

 "If the socket is moved from one list to another list in-between the time 
  the hash is calculated and the next field is accessed, and the socket 
  has moved to the end of the new list, the traversal will not complete 
  properly on the list it should have, since the socket will be on the end 
  of the new list and there's not a way to tell it's on a new list and 
  restart the list traversal.  I think that this can be solved by 
  pre-fetching the "next" field (with proper barriers) before checking the 
  hash."

This patch corrects this problem, introducing a new sk_for_each_rcu_safenext()
macro.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/rculist.h |   17 +++++++++++++++++
 include/net/sock.h      |    4 ++--
 net/ipv4/udp.c          |    4 ++--
 net/ipv6/udp.c          |    4 ++--
 4 files changed, 23 insertions(+), 6 deletions(-)


[-- Attachment #2: corey.patch --]
[-- Type: text/plain, Size: 3439 bytes --]

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e649bd3..3ba2998 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -383,5 +383,22 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference(pos->next))
 
+/**
+ * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ * @next:       the &struct hlist_node to use as a next cursor
+ *
+ * Special version of hlist_for_each_entry_rcu that makes sure
+ * each next pointer is fetched before each iteration.
+ */
+#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
+	for (pos = rcu_dereference((head)->first);			 \
+		pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&	\
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
+		pos = rcu_dereference(next))
+
 #endif	/* __KERNEL__ */
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 0bea25d..a4f6d3f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,8 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
-#define sk_for_each_rcu(__sk, node, list) \
-	hlist_for_each_entry_rcu(__sk, node, list, sk_node)
+#define sk_for_each_rcu_safenext(__sk, node, list, next) \
+	hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5ba0340..c3ecec8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -256,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node;
+	struct hlist_node *node, *next;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -266,7 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1d9790e..32d914d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node;
+	struct hlist_node *node, *next;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -108,7 +108,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 14:36                               ` Eric Dumazet
@ 2008-10-29 15:34                                 ` Corey Minyard
  2008-10-29 16:09                                   ` Eric Dumazet
  2008-10-29 18:20                                 ` David Miller
  2008-10-30 11:12                                 ` Peter Zijlstra
  2 siblings, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-29 15:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, shemminger, benny+usenet, netdev, paulmck,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Eric Dumazet wrote:
> Corey Minyard found a race added in commit 
> 271b72c7fa82c2c7a795bc16896149933110672d
> (udp: RCU handling for Unicast packets.)
>
> "If the socket is moved from one list to another list in-between the 
> time  the hash is calculated and the next field is accessed, and the 
> socket  has moved to the end of the new list, the traversal will not 
> complete  properly on the list it should have, since the socket will 
> be on the end  of the new list and there's not a way to tell it's on a 
> new list and  restart the list traversal.  I think that this can be 
> solved by  pre-fetching the "next" field (with proper barriers) before 
> checking the  hash."
>
> This patch corrects this problem, introducing a new 
> sk_for_each_rcu_safenext()
> macro.
You also need the appropriate smp_wmb() in udp_lib_get_port() after 
sk_hash is set, I think, so the next field is guaranteed to be changed 
after the hash value is changed.
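
Roughly this, on the writer side (an illustrative fragment only; the helper
name below is invented and stands in for the real RCU list insertion):

	sk->sk_hash = newhash;		/* publish the new hash first        */
	smp_wmb();			/* order: hash before the list link  */
	add_sk_to_new_chain(sk);	/* hypothetical stand-in for linking
					 * the socket onto its new chain     */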


-corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 15:34                                 ` Corey Minyard
@ 2008-10-29 16:09                                   ` Eric Dumazet
  2008-10-29 16:37                                     ` Paul E. McKenney
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 16:09 UTC (permalink / raw)
  To: Corey Minyard
  Cc: David Miller, shemminger, benny+usenet, netdev, paulmck,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Corey Minyard a écrit :
> Eric Dumazet wrote:
>> Corey Minyard found a race added in commit 
>> 271b72c7fa82c2c7a795bc16896149933110672d
>> (udp: RCU handling for Unicast packets.)
>>
>> "If the socket is moved from one list to another list in-between the 
>> time  the hash is calculated and the next field is accessed, and the 
>> socket  has moved to the end of the new list, the traversal will not 
>> complete  properly on the list it should have, since the socket will 
>> be on the end  of the new list and there's not a way to tell it's on a 
>> new list and  restart the list traversal.  I think that this can be 
>> solved by  pre-fetching the "next" field (with proper barriers) before 
>> checking the  hash."
>>
>> This patch corrects this problem, introducing a new 
>> sk_for_each_rcu_safenext()
>> macro.
> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
> sk_hash is set, I think, so the next field is guaranteed to be changed 
> after the hash value is changed.
> 
> 

Not sure about this one Corey.

If a reader catches the previous value of item->sk_hash, two cases are to be taken into account:

1) its udp_hashfn(net, sk->sk_hash) is != hash  
  -> goto begin : Reader will redo its scan

2) its udp_hashfn(net, sk->sk_hash) is == hash
  -> next pointer is good enough : it points to next item in same hash chain.
     No need to rescan the chain at this point.
     Yes we could miss the fact that a new port was bound and this UDP message could be lost.


If we force an smp_wmb(), the reader would fetch a pointer to the beginning of the list.


1) its udp_hashfn(net, sk->sk_hash) is != hash  
  -> goto begin : Reader will redo its scan : next pointer value had no meaning

2) its udp_hashfn(net, sk->sk_hash) is == hash
  -> the next pointer "forces the reader" to rescan the chain, but won't find new items.

In any case, we cannot lose a UDP message sent to a stable port (previously bound).


Thanks


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 16:09                                   ` Eric Dumazet
@ 2008-10-29 16:37                                     ` Paul E. McKenney
  2008-10-29 17:22                                       ` Corey Minyard
  2008-10-29 17:32                                       ` Eric Dumazet
  0 siblings, 2 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-29 16:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
> Corey Minyard a écrit :
>> Eric Dumazet wrote:
>>> Corey Minyard found a race added in commit 
>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>> (udp: RCU handling for Unicast packets.)
>>>
>>> "If the socket is moved from one list to another list in-between the time 
>>>  the hash is calculated and the next field is accessed, and the socket  
>>> has moved to the end of the new list, the traversal will not complete  
>>> properly on the list it should have, since the socket will be on the end  
>>> of the new list and there's not a way to tell it's on a new list and  
>>> restart the list traversal.  I think that this can be solved by  
>>> pre-fetching the "next" field (with proper barriers) before checking the  
>>> hash."
>>>
>>> This patch corrects this problem, introducing a new 
>>> sk_for_each_rcu_safenext()
>>> macro.
>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>> sk_hash is set, I think, so the next field is guaranteed to be changed 
>> after the hash value is changed.
>
> Not sure about this one Corey.
>
> If a reader catches previous value of item->sk_hash, two cases are to be 
> taken into :
>
> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
> will redo its scan
>
> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>  -> next pointer is good enough : it points to next item in same hash 
> chain.
>     No need to rescan the chain at this point.
>     Yes we could miss the fact that a new port was bound and this UDP 
> message could be lost.

3) its udp_hashfn(net, sk->sk_hash) is == hash, but only because it was
removed, freed, reallocated, and then readded with the same hash value,
possibly carrying the reader to a new position in the same list.

You might well cover this (will examine your code in detail on my plane
flight starting about 20 hours from now), but thought I should point it
out.  ;-)

						Thanx, Paul

> If we force a smp_wmb(), reader would fetch pointer to begin of list.
>
>
> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
> will redo its scan : next pointer value had no meaning
>
> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>  ->next pointer "force reader" to rescan the chain, but wont find new 
> items.
>
> In any case, we cannot lost an UDP message sent to a stable port 
> (previously bound)
>
>
> Thanks
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 16:37                                     ` Paul E. McKenney
@ 2008-10-29 17:22                                       ` Corey Minyard
  2008-10-29 17:45                                         ` Eric Dumazet
  2008-10-29 17:32                                       ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-29 17:22 UTC (permalink / raw)
  To: paulmck
  Cc: Eric Dumazet, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Paul E. McKenney wrote:
> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>   
>> Corey Minyard a écrit :
>>     
>>> Eric Dumazet wrote:
>>>       
>>>> Corey Minyard found a race added in commit 
>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>> (udp: RCU handling for Unicast packets.)
>>>>
>>>> "If the socket is moved from one list to another list in-between the time 
>>>>  the hash is calculated and the next field is accessed, and the socket  
>>>> has moved to the end of the new list, the traversal will not complete  
>>>> properly on the list it should have, since the socket will be on the end  
>>>> of the new list and there's not a way to tell it's on a new list and  
>>>> restart the list traversal.  I think that this can be solved by  
>>>> pre-fetching the "next" field (with proper barriers) before checking the  
>>>> hash."
>>>>
>>>> This patch corrects this problem, introducing a new 
>>>> sk_for_each_rcu_safenext()
>>>> macro.
>>>>         
>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>> sk_hash is set, I think, so the next field is guaranteed to be changed 
>>> after the hash value is changed.
>>>       
>> Not sure about this one Corey.
>>
>> If a reader catches previous value of item->sk_hash, two cases are to be 
>> taken into :
>>
>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
>> will redo its scan
>>
>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>  -> next pointer is good enough : it points to next item in same hash 
>> chain.
>>     No need to rescan the chain at this point.
>>     Yes we could miss the fact that a new port was bound and this UDP 
>> message could be lost.
>>     
>
> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
> removed, freed, reallocated, and then readded with the same hash value,
> possibly carrying the reader to a new position in the same list.
>   
If I understand this, without the smp_wmb(), it is possible that the 
next field can be written to main memory before the hash value is 
written.  If that happens, the following can occur:

  CPU1                    CPU2
  next is set to NULL (end of new list)
                          fetch next
                          calculate hash and compare to sk_hash
  sk_hash is set to new value

So I think in the above cases, your case #2 is not necessarily valid 
without the barrier.

And another possible issue: if sk_hash is written before next, and CPU1 
is interrupted before CPU2, CPU2 will continually spin on the list until 
CPU1 comes back and moves it to the new list.  Not sure if that is an 
issue.

-corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 16:37                                     ` Paul E. McKenney
  2008-10-29 17:22                                       ` Corey Minyard
@ 2008-10-29 17:32                                       ` Eric Dumazet
  2008-10-29 18:11                                         ` Paul E. McKenney
  1 sibling, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 17:32 UTC (permalink / raw)
  To: paulmck
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Paul E. McKenney a écrit :
> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>> Corey Minyard a écrit :
>>> Eric Dumazet wrote:
>>>> Corey Minyard found a race added in commit 
>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>> (udp: RCU handling for Unicast packets.)
>>>>
>>>> "If the socket is moved from one list to another list in-between the time 
>>>>  the hash is calculated and the next field is accessed, and the socket  
>>>> has moved to the end of the new list, the traversal will not complete  
>>>> properly on the list it should have, since the socket will be on the end  
>>>> of the new list and there's not a way to tell it's on a new list and  
>>>> restart the list traversal.  I think that this can be solved by  
>>>> pre-fetching the "next" field (with proper barriers) before checking the  
>>>> hash."
>>>>
>>>> This patch corrects this problem, introducing a new 
>>>> sk_for_each_rcu_safenext()
>>>> macro.
>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>> sk_hash is set, I think, so the next field is guaranteed to be changed 
>>> after the hash value is changed.
>> Not sure about this one Corey.
>>
>> If a reader catches previous value of item->sk_hash, two cases are to be 
>> taken into :
>>
>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
>> will redo its scan
>>
>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>  -> next pointer is good enough : it points to next item in same hash 
>> chain.
>>     No need to rescan the chain at this point.
>>     Yes we could miss the fact that a new port was bound and this UDP 
>> message could be lost.
> 
> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
> removed, freed, reallocated, and then readded with the same hash value,
> possibly carrying the reader to a new position in the same list.

Yes, but the 'new position' is 'before any not-yet-examined objects', since
we insert objects only at the chain head.

> 
> You might well cover this (will examine your code in detail on my plane
> flight starting about 20 hours from now), but thought I should point it
> out.  ;-)
> 
> 	

Yes, I'll double check too, this seems tricky :)

About the SLAB_DESTROY_BY_RCU effect: we now have two different kmem_caches, one for "UDP-Lite"
and one for "UDP".

This is expected, but we could avoid that and alias these caches, since
these objects have the same *type*. (The fields used for the RCU lookups,
deletes, and inserts are the same.)

Maybe a hack in net/ipv4/udplite.c before calling proto_register(), to
copy the kmem_cache from UDP.
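
Something like this, perhaps (a rough sketch only, assuming udp_prot is
registered first and that sharing the slab is otherwise safe):

	/* in net/ipv4/udplite.c init: reuse UDP's kmem_cache instead of
	 * letting proto_register() allocate a second one */
	rc = proto_register(&udplite_prot, 0);	/* 0 => do not alloc a slab */
	if (rc == 0)
		udplite_prot.slab = udp_prot.slab;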


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 17:22                                       ` Corey Minyard
@ 2008-10-29 17:45                                         ` Eric Dumazet
  2008-10-29 18:28                                           ` Corey Minyard
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 17:45 UTC (permalink / raw)
  To: Corey Minyard
  Cc: paulmck, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Corey Minyard a écrit :
> Paul E. McKenney wrote:
>> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>>  
>>> Corey Minyard a écrit :
>>>    
>>>> Eric Dumazet wrote:
>>>>      
>>>>> Corey Minyard found a race added in commit 
>>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>>> (udp: RCU handling for Unicast packets.)
>>>>>
>>>>> "If the socket is moved from one list to another list in-between 
>>>>> the time  the hash is calculated and the next field is accessed, 
>>>>> and the socket  has moved to the end of the new list, the traversal 
>>>>> will not complete  properly on the list it should have, since the 
>>>>> socket will be on the end  of the new list and there's not a way to 
>>>>> tell it's on a new list and  restart the list traversal.  I think 
>>>>> that this can be solved by  pre-fetching the "next" field (with 
>>>>> proper barriers) before checking the  hash."
>>>>>
>>>>> This patch corrects this problem, introducing a new 
>>>>> sk_for_each_rcu_safenext()
>>>>> macro.
>>>>>         
>>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>>> sk_hash is set, I think, so the next field is guaranteed to be 
>>>> changed after the hash value is changed.
>>>>       
>>> Not sure about this one Corey.
>>>
>>> If a reader catches previous value of item->sk_hash, two cases are to 
>>> be taken into :
>>>
>>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : 
>>> Reader will redo its scan
>>>
>>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>>  -> next pointer is good enough : it points to next item in same hash 
>>> chain.
>>>     No need to rescan the chain at this point.
>>>     Yes we could miss the fact that a new port was bound and this UDP 
>>> message could be lost.
>>>     
>>
>> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
>> removed, freed, reallocated, and then readded with the same hash value,
>> possibly carrying the reader to a new position in the same list.
>>   
> If I understand this, without the smp_wmb(), it is possible that the 
> next field can be written to main memory before the hash value is 
> written.  If that happens, the following can occur:
> 
>  CPU1                    CPU2
>  next is set to NULL (end of new list)

Well, if this item is injected into the same chain, next won't be set to NULL.

That would mean previous writers deleted all items from the chain.

In this case, readers can see NULL; it is not a problem at all.
The list is/was empty.
An application cannot complain that a packet is not
handled if its bind() syscall has not yet completed :)

If the item is injected onto another chain, we will detect the hash mismatch and redo a full scan.

>                          fetch next
>                          calculate hash and compare to sk_hash
>  sk_hash is set to new value
> 
> So I think in the above cases, your case #2 is not necessarily valid 
> without the barrier.
> 
> And another possible issue.  If sk_hash is written before next, and CPU1 
> is interrupted before CPU2, CPU2 will continually spin on the list until 
> CPU1 comes back and moves it to the new list.  Note sure if that is an 
> issue.

Probably not. Previously, readers were spinning on read_lock() when 
a writer was inside its critical section (write_lock()/write_unlock()).
So instead of spinning inside read_lock(), issuing stupid memory 
transactions, the readers can now spin reading the hash chain and populate
the cpu cache :)




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 17:32                                       ` Eric Dumazet
@ 2008-10-29 18:11                                         ` Paul E. McKenney
  2008-10-29 18:29                                           ` David Miller
  2008-10-29 18:36                                           ` Eric Dumazet
  0 siblings, 2 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-29 18:11 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

On Wed, Oct 29, 2008 at 06:32:29PM +0100, Eric Dumazet wrote:
> Paul E. McKenney a écrit :
>> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>>> Corey Minyard a écrit :
>>>> Eric Dumazet wrote:
>>>>> Corey Minyard found a race added in commit 
>>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>>> (udp: RCU handling for Unicast packets.)
>>>>>
>>>>> "If the socket is moved from one list to another list in-between the 
>>>>> time  the hash is calculated and the next field is accessed, and the 
>>>>> socket  has moved to the end of the new list, the traversal will not 
>>>>> complete  properly on the list it should have, since the socket will be 
>>>>> on the end  of the new list and there's not a way to tell it's on a new 
>>>>> list and  restart the list traversal.  I think that this can be solved 
>>>>> by  pre-fetching the "next" field (with proper barriers) before 
>>>>> checking the  hash."
>>>>>
>>>>> This patch corrects this problem, introducing a new 
>>>>> sk_for_each_rcu_safenext()
>>>>> macro.
>>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>>> sk_hash is set, I think, so the next field is guaranteed to be changed 
>>>> after the hash value is changed.
>>> Not sure about this one Corey.
>>>
>>> If a reader catches previous value of item->sk_hash, two cases are to be 
>>> taken into :
>>>
>>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
>>> will redo its scan
>>>
>>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>>  -> next pointer is good enough : it points to next item in same hash 
>>> chain.
>>>     No need to rescan the chain at this point.
>>>     Yes we could miss the fact that a new port was bound and this UDP 
>>> message could be lost.
>> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
>> removed, freed, reallocated, and then readded with the same hash value,
>> possibly carrying the reader to a new position in the same list.
>
> yes, but 'new position' is 'before any not yet examined objects', since
> we insert objects only at chain head.

OK.  However, this reasoning assumes that a socket with a given
udp_hashfn() value will appear on one and only one list.  There are no
side lists for sockets in other states?  (listen, &c)

>> You might well cover this (will examine your code in detail on my plane
>> flight starting about 20 hours from now), but thought I should point it
>> out.  ;-)
>
> Yes, I'll double check too, this seems tricky :)

;-)

> About SLAB_DESTROY_BY_RCU effect, we now have two different kmem_cache for 
> "UDP-Lite"
> and "UDP".
>
> This is expected, but we could avoid that and alias these caches, since
> these objects have the same *type* . (The fields used for the RCU lookups,
> deletes and inserts are the same)
>
> Maybe a hack in net/ipv4/udplite.c before calling proto_register(), to
> copy the kmem_cache from UDP.

As long as this preserves the aforementioned assumption that a socket
with a given hash can appear on one and only one list.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 10:19                             ` Eric Dumazet
@ 2008-10-29 18:19                               ` David Miller
  0 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-10-29 18:19 UTC (permalink / raw)
  To: dada1
  Cc: shemminger, benny+usenet, minyard, netdev, paulmck, cl,
	a.p.zijlstra, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 29 Oct 2008 11:19:50 +0100

> [PATCH] udp: udp_get_next() should use spin_unlock_bh()
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 14:36                               ` Eric Dumazet
  2008-10-29 15:34                                 ` Corey Minyard
@ 2008-10-29 18:20                                 ` David Miller
  2008-10-30 11:12                                 ` Peter Zijlstra
  2 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-10-29 18:20 UTC (permalink / raw)
  To: dada1
  Cc: minyard, shemminger, benny+usenet, netdev, paulmck, cl,
	a.p.zijlstra, johnpol, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 29 Oct 2008 15:36:34 +0100

> [PATCH] udp: introduce sk_for_each_rcu_safenext()

Also applied, thanks Eric.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 17:45                                         ` Eric Dumazet
@ 2008-10-29 18:28                                           ` Corey Minyard
  2008-10-29 18:52                                             ` Paul E. McKenney
  0 siblings, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-29 18:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Eric Dumazet wrote:
> Corey Minyard a écrit :
>> Paul E. McKenney wrote:
>>> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>>>  
>>>> Corey Minyard a écrit :
>>>>   
>>>>> Eric Dumazet wrote:
>>>>>     
>>>>>> Corey Minyard found a race added in commit 
>>>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>>>> (udp: RCU handling for Unicast packets.)
>>>>>>
>>>>>> "If the socket is moved from one list to another list in-between 
>>>>>> the time  the hash is calculated and the next field is accessed, 
>>>>>> and the socket  has moved to the end of the new list, the 
>>>>>> traversal will not complete  properly on the list it should have, 
>>>>>> since the socket will be on the end  of the new list and there's 
>>>>>> not a way to tell it's on a new list and  restart the list 
>>>>>> traversal.  I think that this can be solved by  pre-fetching the 
>>>>>> "next" field (with proper barriers) before checking the  hash."
>>>>>>
>>>>>> This patch corrects this problem, introducing a new 
>>>>>> sk_for_each_rcu_safenext()
>>>>>> macro.
>>>>>>         
>>>>> You also need the appropriate smp_wmb() in udp_lib_get_port() 
>>>>> after sk_hash is set, I think, so the next field is guaranteed to 
>>>>> be changed after the hash value is changed.
>>>>>       
>>>> Not sure about this one Corey.
>>>>
>>>> If a reader catches previous value of item->sk_hash, two cases are 
>>>> to be taken into :
>>>>
>>>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : 
>>>> Reader will redo its scan
>>>>
>>>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>>>  -> next pointer is good enough : it points to next item in same 
>>>> hash chain.
>>>>     No need to rescan the chain at this point.
>>>>     Yes we could miss the fact that a new port was bound and this 
>>>> UDP message could be lost.
>>>>     
>>>
>>> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
>>> removed, freed, reallocated, and then readded with the same hash value,
>>> possibly carrying the reader to a new position in the same list.
>>>   
>> If I understand this, without the smp_wmb(), it is possible that the 
>> next field can be written to main memory before the hash value is 
>> written.  If that happens, the following can occur:
>>
>>  CPU1                    CPU2
>>  next is set to NULL (end of new list)
>
> Well, if this item is injected to the same chain, next wont be set to 
> NULL.
>
> That would mean previous writers deleted all items from the chain.
I put my comment in the wrong place, I wasn't talking about being 
injected into the same chain.

>
> In this case, readers can see NULL, it is not a problem at all.
> List is/was empty.
> An application cannot complain a packet is not
> handled if its bind() syscall is not yet completed :)
>
> If item is injected on another chain, we will detect hash mismatch and 
> redo full scan.
If the item is injected onto the end of another chain, the next field 
will be NULL and you won't detect a hash mismatch.  It's basically the 
same issue as the previous race, though a lot more subtle and unlikely.  
If you get (from the previous socket) an old value of "sk_hash" and 
(from the new socket) a new value of "next" that is NULL, you will 
terminate the loop when you should have restarted it.  I'm pretty sure 
that can occur without the write barrier.

>
>>                          fetch next
>>                          calculate hash and compare to sk_hash
>>  sk_hash is set to new value
>>
>> So I think in the above cases, your case #2 is not necessarily valid 
>> without the barrier.
>>
>> And another possible issue.  If sk_hash is written before next, and 
>> CPU1 is interrupted before CPU2, CPU2 will continually spin on the 
>> list until CPU1 comes back and moves it to the new list.  Note sure 
>> if that is an issue.
>
> Probably not. Previously, readers were spining on read_lock(), when a 
> writer was inside its critical section (write_lock()/write_unlock()).
> So instead of spining inside read_unlock(), issuing stupid memory 
> transactions, the readers can now spin reading hash chain and populate
> cpu cache :)
Yes, I thought about that and thought I would point it out, but I agree, 
what you have is certainly better than spinning on a lock :).


-corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 18:11                                         ` Paul E. McKenney
@ 2008-10-29 18:29                                           ` David Miller
  2008-10-29 18:38                                             ` Paul E. McKenney
  2008-10-29 18:36                                           ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: David Miller @ 2008-10-29 18:29 UTC (permalink / raw)
  To: paulmck
  Cc: dada1, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 29 Oct 2008 11:11:14 -0700

> OK.  However, this reasoning assumes that a socket with a given
> udp_hashfn() value will appear on one and only one list.  There are no
> side lists for sockets in other states?  (listen, &c)

Nope, with UDP things are very simple, just one hash table.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 18:11                                         ` Paul E. McKenney
  2008-10-29 18:29                                           ` David Miller
@ 2008-10-29 18:36                                           ` Eric Dumazet
  1 sibling, 0 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 18:36 UTC (permalink / raw)
  To: paulmck
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Paul E. McKenney a écrit :
> On Wed, Oct 29, 2008 at 06:32:29PM +0100, Eric Dumazet wrote:
>> Paul E. McKenney a écrit :
>>> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>>>> Corey Minyard a écrit :
>>>>> Eric Dumazet wrote:
>>>>>> Corey Minyard found a race added in commit 
>>>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>>>> (udp: RCU handling for Unicast packets.)
>>>>>>
>>>>>> "If the socket is moved from one list to another list in-between the 
>>>>>> time  the hash is calculated and the next field is accessed, and the 
>>>>>> socket  has moved to the end of the new list, the traversal will not 
>>>>>> complete  properly on the list it should have, since the socket will be 
>>>>>> on the end  of the new list and there's not a way to tell it's on a new 
>>>>>> list and  restart the list traversal.  I think that this can be solved 
>>>>>> by  pre-fetching the "next" field (with proper barriers) before 
>>>>>> checking the  hash."
>>>>>>
>>>>>> This patch corrects this problem, introducing a new 
>>>>>> sk_for_each_rcu_safenext()
>>>>>> macro.
>>>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>>>> sk_hash is set, I think, so the next field is guaranteed to be changed 
>>>>> after the hash value is changed.
>>>> Not sure about this one Corey.
>>>>
>>>> If a reader catches previous value of item->sk_hash, two cases are to be 
>>>> taken into :
>>>>
>>>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
>>>> will redo its scan
>>>>
>>>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>>>  -> next pointer is good enough : it points to next item in same hash 
>>>> chain.
>>>>     No need to rescan the chain at this point.
>>>>     Yes we could miss the fact that a new port was bound and this UDP 
>>>> message could be lost.
>>> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
>>> removed, freed, reallocated, and then readded with the same hash value,
>>> possibly carrying the reader to a new position in the same list.
>> yes, but 'new position' is 'before any not yet examined objects', since
>> we insert objects only at chain head.
> 
> OK.  However, this reasoning assumes that a socket with a given
> udp_hashfn() value will appear on one and only one list.  There are no
> side lists for sockets in other states?  (listen, &c)
> 
>>> You might well cover this (will examine your code in detail on my plane
>>> flight starting about 20 hours from now), but thought I should point it
>>> out.  ;-)
>> Yes, I'll double check too, this seems tricky :)
> 
> ;-)
> 
>> About SLAB_DESTROY_BY_RCU effect, we now have two different kmem_cache for 
>> "UDP-Lite"
>> and "UDP".
>>
>> This is expected, but we could avoid that and alias these caches, since
>> these objects have the same *type* . (The fields used for the RCU lookups,
>> deletes and inserts are the same)
>>
>> Maybe a hack in net/ipv4/udplite.c before calling proto_register(), to
>> copy the kmem_cache from UDP.
> 
> As long as this preserves the aforementioned assumption that a socket
> with a given hash can appear on one and only one list.  ;-)
> 

Ouch, thanks Paul, that is indeed the point, well sort of.

If a UDP socket is freed, re-allocated as a UDP-Lite socket, and inserted on
the udplite_table, then we would have a problem with the current implementation.

A reader could be directed to the chain of the other hash table, without
noticing it should restart its lookup...

It is not worth adding a check to detect such a scenario; we can live with two different
kmem_caches after all, they are not that expensive.



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 18:29                                           ` David Miller
@ 2008-10-29 18:38                                             ` Paul E. McKenney
  0 siblings, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-29 18:38 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

On Wed, Oct 29, 2008 at 11:29:56AM -0700, David Miller wrote:
> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Date: Wed, 29 Oct 2008 11:11:14 -0700
> 
> > OK.  However, this reasoning assumes that a socket with a given
> > udp_hashfn() value will appear on one and only one list.  There are no
> > side lists for sockets in other states?  (listen, &c)
> 
> Nope, with UDP things are very simple, just one hash table.

Cool!  That does make things easier.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 18:28                                           ` Corey Minyard
@ 2008-10-29 18:52                                             ` Paul E. McKenney
  2008-10-29 20:00                                               ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-29 18:52 UTC (permalink / raw)
  To: Corey Minyard
  Cc: Eric Dumazet, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

On Wed, Oct 29, 2008 at 01:28:15PM -0500, Corey Minyard wrote:
> Eric Dumazet wrote:
>> Corey Minyard a écrit :
>>> Paul E. McKenney wrote:
>>>> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>>>>  
>>>>> Corey Minyard a écrit :
>>>>>   
>>>>>> Eric Dumazet wrote:
>>>>>>     
>>>>>>> Corey Minyard found a race added in commit 
>>>>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>>>>> (udp: RCU handling for Unicast packets.)
>>>>>>>
>>>>>>> "If the socket is moved from one list to another list in-between the 
>>>>>>> time  the hash is calculated and the next field is accessed, and the 
>>>>>>> socket  has moved to the end of the new list, the traversal will not 
>>>>>>> complete  properly on the list it should have, since the socket will 
>>>>>>> be on the end  of the new list and there's not a way to tell it's on 
>>>>>>> a new list and  restart the list traversal.  I think that this can be 
>>>>>>> solved by  pre-fetching the "next" field (with proper barriers) 
>>>>>>> before checking the  hash."
>>>>>>>
>>>>>>> This patch corrects this problem, introducing a new 
>>>>>>> sk_for_each_rcu_safenext()
>>>>>>> macro.
>>>>>>>         
>>>>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>>>>> sk_hash is set, I think, so the next field is guaranteed to be changed 
>>>>>> after the hash value is changed.
>>>>>>       
>>>>> Not sure about this one Corey.
>>>>>
>>>>> If a reader catches previous value of item->sk_hash, two cases are to 
>>>>> be taken into :
>>>>>
>>>>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
>>>>> will redo its scan
>>>>>
>>>>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>>>>  -> next pointer is good enough : it points to next item in same hash 
>>>>> chain.
>>>>>     No need to rescan the chain at this point.
>>>>>     Yes we could miss the fact that a new port was bound and this UDP 
>>>>> message could be lost.
>>>>>     
>>>>
>>>> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
>>>> removed, freed, reallocated, and then readded with the same hash value,
>>>> possibly carrying the reader to a new position in the same list.
>>>>   
>>> If I understand this, without the smp_wmb(), it is possible that the next 
>>> field can be written to main memory before the hash value is written.  If 
>>> that happens, the following can occur:
>>>
>>>  CPU1                    CPU2
>>>  next is set to NULL (end of new list)
>>
>> Well, if this item is injected to the same chain, next wont be set to 
>> NULL.
>>
>> That would mean previous writers deleted all items from the chain.
> I put my comment in the wrong place, I wasn't talking about being injected 
> into the same chain.
>
>>
>> In this case, readers can see NULL, it is not a problem at all.
>> List is/was empty.
>> An application cannot complain a packet is not
>> handled if its bind() syscall is not yet completed :)
>>
>> If item is injected on another chain, we will detect hash mismatch and 
>> redo full scan.
> If the item is injected onto the end of another chain, the next field will 
> be NULL and you won't detect a hash mismatch.  It's basically the same 
> issue as the previous race, though a lot more subtle and unlikely.  If you 
> get (from the previous socket) an old value of "sk_hash" and (from the new 
> socket) a new value of "next" that is NULL, you will terminate the loop 
> when you should have restarted it.  I'm pretty sure that can occur without 
> the write barrier.

One way of dealing with this is to keep a tail pointer.  Then, if the
element containing the NULL pointer doesn't match the tail pointer seen
at the start of the search, or if the tail pointer has changed,
restart the search.  Memory barriers will be needed.  ;-)
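
(Purely a sketch of that idea, with made-up field names and the barriers
only hinted at: the slot grows a ->tail pointer, the reader snapshots it
when the scan starts, and a scan that does not end on that element, or that
sees the tail change, is restarted.)

begin:
	tail = rcu_dereference(hslot->tail);	/* tail seen at scan start */
	last = NULL;
	for (node = rcu_dereference(hslot->head.first); node;
	     node = rcu_dereference(node->next)) {
		last = node;
		/* ... usual lookup work ... */
	}
	if (last != tail || tail != rcu_dereference(hslot->tail))
		goto begin;			/* chain changed, rescan   */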

							Thanx, Paul

>>>                          fetch next
>>>                          calculate hash and compare to sk_hash
>>>  sk_hash is set to new value
>>>
>>> So I think in the above cases, your case #2 is not necessarily valid 
>>> without the barrier.
>>>
>>> And another possible issue.  If sk_hash is written before next, and CPU1 
>>> is interrupted before CPU2, CPU2 will continually spin on the list until 
>>> CPU1 comes back and moves it to the new list.  Note sure if that is an 
>>> issue.
>>
>> Probably not. Previously, readers were spining on read_lock(), when a 
>> writer was inside its critical section (write_lock()/write_unlock()).
>> So instead of spining inside read_unlock(), issuing stupid memory 
>> transactions, the readers can now spin reading hash chain and populate
>> cpu cache :)
> Yes, I thought about that and thought I would point it out, but I agree, 
> what you have is certainly better than spinning on a lock :).
>
>
> -corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 18:52                                             ` Paul E. McKenney
@ 2008-10-29 20:00                                               ` Eric Dumazet
  2008-10-29 20:17                                                 ` Paul E. McKenney
  2008-10-30 11:04                                                 ` [PATCH 2/2] udp: RCU handling for Unicast packets Peter Zijlstra
  0 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 20:00 UTC (permalink / raw)
  To: paulmck
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Paul E. McKenney a écrit :
> On Wed, Oct 29, 2008 at 01:28:15PM -0500, Corey Minyard wrote:
>> Eric Dumazet wrote:
>>> Corey Minyard a écrit :
>>>> Paul E. McKenney wrote:
>>>>> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>>>>>  
>>>>>> Corey Minyard a écrit :
>>>>>>   
>>>>>>> Eric Dumazet wrote:
>>>>>>>     
>>>>>>>> Corey Minyard found a race added in commit 
>>>>>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>>>>>> (udp: RCU handling for Unicast packets.)
>>>>>>>>
>>>>>>>> "If the socket is moved from one list to another list in-between the 
>>>>>>>> time  the hash is calculated and the next field is accessed, and the 
>>>>>>>> socket  has moved to the end of the new list, the traversal will not 
>>>>>>>> complete  properly on the list it should have, since the socket will 
>>>>>>>> be on the end  of the new list and there's not a way to tell it's on 
>>>>>>>> a new list and  restart the list traversal.  I think that this can be 
>>>>>>>> solved by  pre-fetching the "next" field (with proper barriers) 
>>>>>>>> before checking the  hash."
>>>>>>>>
>>>>>>>> This patch corrects this problem, introducing a new 
>>>>>>>> sk_for_each_rcu_safenext()
>>>>>>>> macro.
>>>>>>>>         
>>>>>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>>>>>> sk_hash is set, I think, so the next field is guaranteed to be changed 
>>>>>>> after the hash value is changed.
>>>>>>>       
>>>>>> Not sure about this one Corey.
>>>>>>
>>>>>> If a reader catches previous value of item->sk_hash, two cases are to 
>>>>>> be taken into :
>>>>>>
>>>>>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : Reader 
>>>>>> will redo its scan
>>>>>>
>>>>>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>>>>>  -> next pointer is good enough : it points to next item in same hash 
>>>>>> chain.
>>>>>>     No need to rescan the chain at this point.
>>>>>>     Yes we could miss the fact that a new port was bound and this UDP 
>>>>>> message could be lost.
>>>>>>     
>>>>> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
>>>>> removed, freed, reallocated, and then readded with the same hash value,
>>>>> possibly carrying the reader to a new position in the same list.
>>>>>   
>>>> If I understand this, without the smp_wmb(), it is possible that the next 
>>>> field can be written to main memory before the hash value is written.  If 
>>>> that happens, the following can occur:
>>>>
>>>>  CPU1                    CPU2
>>>>  next is set to NULL (end of new list)
>>> Well, if this item is injected to the same chain, next wont be set to 
>>> NULL.
>>>
>>> That would mean previous writers deleted all items from the chain.
>> I put my comment in the wrong place, I wasn't talking about being injected 
>> into the same chain.
>>
>>> In this case, readers can see NULL, it is not a problem at all.
>>> List is/was empty.
>>> An application cannot complain a packet is not
>>> handled if its bind() syscall is not yet completed :)
>>>
>>> If item is injected on another chain, we will detect hash mismatch and 
>>> redo full scan.
>> If the item is injected onto the end of another chain, the next field will 
>> be NULL and you won't detect a hash mismatch.  It's basically the same 
>> issue as the previous race, though a lot more subtle and unlikely.  If you 
>> get (from the previous socket) an old value of "sk_hash" and (from the new 
>> socket) a new value of "next" that is NULL, you will terminate the loop 
>> when you should have restarted it.  I'm pretty sure that can occur without 
>> the write barrier.
> 
> One way of dealing with this is to keep a tail pointer.  Then, if the
> element containing the NULL pointer doesn't match the tail pointer seen
> at the start of the search, or if the tail pointer has changed,
> restart the search.  Memory barriers will be needed.  ;-)
> 

Hum... Another way of handling all those cases and avoiding memory barriers
would be to have different "NULL" pointers.

Each hash chain should have a unique "NULL" pointer (in the case of UDP, it
can be one of the 128 values [ (void*)0 .. (void *)127 ]).

Then, when performing a lookup, a reader should check that the "NULL" pointer
it gets at the end of its lookup is the "hash" value of its chain.

If not -> restart the loop, aka "goto begin;" :)

We could avoid memory barriers then.

In the two cases Corey mentioned, this trick could let us avoid memory barriers.
(The existing one in sk_add_node_rcu(sk, &hslot->head) should be enough.)

What do you think ?
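
As a concrete sketch of the idea (illustrative only; the helper names are
invented here, and chains are assumed to be terminated with
UDP_CHAIN_END(hash) instead of a plain NULL):

#define UDP_CHAIN_END(hash)	((struct hlist_node *)(unsigned long)(hash))
#define is_chain_end(ptr)	((unsigned long)(ptr) < UDP_HTABLE_SIZE)

begin:
	result = NULL;
	badness = -1;
	for (node = rcu_dereference(hslot->head.first); !is_chain_end(node);
	     node = rcu_dereference(node->next)) {
		sk = hlist_entry(node, struct sock, sk_node);
		/* ... hash check and compute_score() as today ... */
	}
	if (node != UDP_CHAIN_END(hash))
		goto begin;	/* we ended on another chain: restart */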



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 20:00                                               ` Eric Dumazet
@ 2008-10-29 20:17                                                 ` Paul E. McKenney
  2008-10-29 21:29                                                   ` Corey Minyard
  2008-10-29 22:08                                                   ` Eric Dumazet
  2008-10-30 11:04                                                 ` [PATCH 2/2] udp: RCU handling for Unicast packets Peter Zijlstra
  1 sibling, 2 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-29 20:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

On Wed, Oct 29, 2008 at 09:00:13PM +0100, Eric Dumazet wrote:
> Paul E. McKenney a écrit :
>> On Wed, Oct 29, 2008 at 01:28:15PM -0500, Corey Minyard wrote:
>>> Eric Dumazet wrote:
>>>> Corey Minyard a écrit :
>>>>> Paul E. McKenney wrote:
>>>>>> On Wed, Oct 29, 2008 at 05:09:53PM +0100, Eric Dumazet wrote:
>>>>>>  
>>>>>>> Corey Minyard a écrit :
>>>>>>>   
>>>>>>>> Eric Dumazet wrote:
>>>>>>>>     
>>>>>>>>> Corey Minyard found a race added in commit 
>>>>>>>>> 271b72c7fa82c2c7a795bc16896149933110672d
>>>>>>>>> (udp: RCU handling for Unicast packets.)
>>>>>>>>>
>>>>>>>>> "If the socket is moved from one list to another list in-between 
>>>>>>>>> the time  the hash is calculated and the next field is accessed, 
>>>>>>>>> and the socket  has moved to the end of the new list, the traversal 
>>>>>>>>> will not complete  properly on the list it should have, since the 
>>>>>>>>> socket will be on the end  of the new list and there's not a way to 
>>>>>>>>> tell it's on a new list and  restart the list traversal.  I think 
>>>>>>>>> that this can be solved by  pre-fetching the "next" field (with 
>>>>>>>>> proper barriers) before checking the  hash."
>>>>>>>>>
>>>>>>>>> This patch corrects this problem, introducing a new 
>>>>>>>>> sk_for_each_rcu_safenext()
>>>>>>>>> macro.
>>>>>>>>>         
>>>>>>>> You also need the appropriate smp_wmb() in udp_lib_get_port() after 
>>>>>>>> sk_hash is set, I think, so the next field is guaranteed to be 
>>>>>>>> changed after the hash value is changed.
>>>>>>>>       
>>>>>>> Not sure about this one Corey.
>>>>>>>
>>>>>>> If a reader catches previous value of item->sk_hash, two cases are to 
>>>>>>> be taken into :
>>>>>>>
>>>>>>> 1) its udp_hashfn(net, sk->sk_hash) is != hash   -> goto begin : 
>>>>>>> Reader will redo its scan
>>>>>>>
>>>>>>> 2) its udp_hashfn(net, sk->sk_hash) is == hash
>>>>>>>  -> next pointer is good enough : it points to next item in same hash 
>>>>>>> chain.
>>>>>>>     No need to rescan the chain at this point.
>>>>>>>     Yes we could miss the fact that a new port was bound and this UDP 
>>>>>>> message could be lost.
>>>>>>>     
>>>>>> 3) its udp_hashfn(net, sk-sk_hash) is == hash, but only because it was
>>>>>> removed, freed, reallocated, and then readded with the same hash 
>>>>>> value,
>>>>>> possibly carrying the reader to a new position in the same list.
>>>>>>   
>>>>> If I understand this, without the smp_wmb(), it is possible that the 
>>>>> next field can be written to main memory before the hash value is 
>>>>> written.  If that happens, the following can occur:
>>>>>
>>>>>  CPU1                    CPU2
>>>>>  next is set to NULL (end of new list)
>>>> Well, if this item is injected to the same chain, next wont be set to 
>>>> NULL.
>>>>
>>>> That would mean previous writers deleted all items from the chain.
>>> I put my comment in the wrong place, I wasn't talking about being 
>>> injected into the same chain.
>>>
>>>> In this case, readers can see NULL, it is not a problem at all.
>>>> List is/was empty.
>>>> An application cannot complain a packet is not
>>>> handled if its bind() syscall is not yet completed :)
>>>>
>>>> If item is injected on another chain, we will detect hash mismatch and 
>>>> redo full scan.
>>> If the item is injected onto the end of another chain, the next field 
>>> will be NULL and you won't detect a hash mismatch.  It's basically the 
>>> same issue as the previous race, though a lot more subtle and unlikely.  
>>> If you get (from the previous socket) an old value of "sk_hash" and (from 
>>> the new socket) a new value of "next" that is NULL, you will terminate 
>>> the loop when you should have restarted it.  I'm pretty sure that can 
>>> occur without the write barrier.
>> One way of dealing with this is to keep a tail pointer.  Then, if the
>> element containing the NULL pointer doesn't match the tail pointer seen
>> at the start of the search, or if the tail pointer has changed,
>> restart the search.  Memory barriers will be needed.  ;-)
>
> Hum... Another way of handling all those cases and avoid memory barriers
> would be to have different "NULL" pointers.
>
> Each hash chain should have a unique "NULL" pointer (in the case of UDP, it
> can be the 128 values : [ (void*)0 .. (void *)127 ]
>
> Then, when performing a lookup, a reader should check the "NULL" pointer
> it get at the end of its lookup has is the "hash" value of its chain.
>
> If not -> restart the loop, aka "goto begin;" :)
>
> We could avoid memory barriers then.
>
> In the two cases Corey mentioned, this trick could let us avoid memory 
> barriers.
> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>
> What do you think ?

Kinky!!!  ;-)

Then the rcu_dereference() would be supplying the needed memory barriers.

Hmmm...  I guess that the only confusion would be if the element got
removed and then added to the same list.  But then if its pointer was
pseudo-NULL, then that would mean that all subsequent elements had been
removed, and all preceding ones added after the scan started.

Which might well be harmless, but I must defer to you on this one at
the moment.

If you need a larger hash table, another approach would be to set the
pointer's low-order bit, allowing the upper bits to be a full-sized
index -- or even a pointer to the list header.  Just make very sure
to clear the pointer when freeing, or an element on the freelist
could end up looking like a legitimate end of list...  Which again
might well be safe, but why inflict this on oneself?

							Thanx, Paul
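
To make the scheme under discussion concrete, a reader-side lookup over such a
chain could look roughly like the sketch below. This is only an illustration:
it borrows hslot, udp_hashfn(), compute_score() and the surrounding variables
(net, hash, hnum, saddr, ...) from the __udp4_lib_lookup() code quoted in this
thread, and it uses the nulls helpers (sk_nulls_for_each_rcu, get_nulls_value)
that only appear in Eric's later patches.

	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	int score, badness;

	rcu_read_lock();
begin:
	result = NULL;
	badness = -1;
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		/* SLAB_DESTROY_BY_RCU: the item may have moved to another chain */
		if (udp_hashfn(net, sk->sk_hash) != hash)
			goto begin;
		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
		if (score > badness) {
			result = sk;
			badness = score;
		}
	}
	/*
	 * Each chain ends with its own pseudo-NULL, encoding the chain index.
	 * Ending on a foreign pseudo-NULL means the walk drifted onto another
	 * chain, so the whole scan must be restarted.
	 */
	if (get_nulls_value(node) != hash)
		goto begin;
	if (result && unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
		result = NULL;
	rcu_read_unlock();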

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 20:17                                                 ` Paul E. McKenney
@ 2008-10-29 21:29                                                   ` Corey Minyard
  2008-10-29 21:57                                                     ` Eric Dumazet
  2008-10-29 21:58                                                     ` Paul E. McKenney
  2008-10-29 22:08                                                   ` Eric Dumazet
  1 sibling, 2 replies; 134+ messages in thread
From: Corey Minyard @ 2008-10-29 21:29 UTC (permalink / raw)
  To: paulmck
  Cc: Eric Dumazet, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Paul E. McKenney wrote:
> O
..snip
>> Hum... Another way of handling all those cases and avoid memory barriers
>> would be to have different "NULL" pointers.
>>
>> Each hash chain should have a unique "NULL" pointer (in the case of UDP, it
>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>
>> Then, when performing a lookup, a reader should check that the "NULL" pointer
>> it gets at the end of its lookup is the "hash" value of its chain.
>>
>> If not -> restart the loop, aka "goto begin;" :)
>>
>> We could avoid memory barriers then.
>>
>> In the two cases Corey mentioned, this trick could let us avoid memory 
>> barriers.
>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>
>> What do you think ?
>>     
>
> Kinky!!!  ;-)
>   
My thought exactly ;-).

> Then the rcu_dereference() would be supplying the needed memory barriers.
>
> Hmmm...  I guess that the only confusion would be if the element got
> removed and then added to the same list.  But then if its pointer was
> pseudo-NULL, then that would mean that all subsequent elements had been
> removed, and all preceding ones added after the scan started.
>
> Which might well be harmless, but I must defer to you on this one at
> the moment.
>   
I believe that is harmless, as re-scanning the same data should be fine.

> If you need a larger hash table, another approach would be to set the
> pointer's low-order bit, allowing the upper bits to be a full-sized
> index -- or even a pointer to the list header.  Just make very sure
> to clear the pointer when freeing, or an element on the freelist
> could end up looking like a legitimate end of list...  Which again
> might well be safe, but why inflict this on oneself?
>   
Kind of my thought, too.  That's a lot of work to avoid a single 
smp_wmb() on the socket creation path.  Plus this could be extra confusing.

-corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 21:29                                                   ` Corey Minyard
@ 2008-10-29 21:57                                                     ` Eric Dumazet
  2008-10-29 21:58                                                     ` Paul E. McKenney
  1 sibling, 0 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 21:57 UTC (permalink / raw)
  To: Corey Minyard
  Cc: paulmck, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Corey Minyard a écrit :
> Paul E. McKenney wrote:
>> O
> ..snip
>>> Hum... Another way of handling all those cases and avoid memory barriers
>>> would be to have different "NULL" pointers.
>>>
>>> Each hash chain should have a unique "NULL" pointer (in the case of 
>>> UDP, it
>>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>>
>>> Then, when performing a lookup, a reader should check that the "NULL" pointer
>>> it gets at the end of its lookup is the "hash" value of its chain.
>>>
>>> If not -> restart the loop, aka "goto begin;" :)
>>>
>>> We could avoid memory barriers then.
>>>
>>> In the two cases Corey mentioned, this trick could let us avoid 
>>> memory barriers.
>>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>>
>>> What do you think ?
>>>     
>>
>> Kinky!!!  ;-)
>>   
> My thought exactly ;-).
> 
>> Then the rcu_dereference() would be supplying the needed memory barriers.
>>
>> Hmmm...  I guess that the only confusion would be if the element got
>> removed and then added to the same list.  But then if its pointer was
>> pseudo-NULL, then that would mean that all subsequent elements had been
>> removed, and all preceding ones added after the scan started.
>>
>> Which might well be harmless, but I must defer to you on this one at
>> the moment.
>>   
> I believe that is harmless, as re-scanning the same data should be fine.
> 
>> If you need a larger hash table, another approach would be to set the
>> pointer's low-order bit, allowing the upper bits to be a full-sized
>> index -- or even a pointer to the list header.  Just make very sure
>> to clear the pointer when freeing, or an element on the freelist
>> could end up looking like a legitimate end of list...  Which again
>> might well be safe, but why inflict this on oneself?
>>   
> Kind of my thought, too.  That's a lot of work to avoid a single 
> smp_wmb() on the socket creation path.  Plus this could be extra confusing.
> 

Sure, this smp_wmb() seems harmless (but remember, this infrastructure will
next be deployed for TCP sockets as well ;) )

But saving an smp_rmb() per item at lookup time is a clear win, no?




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 21:29                                                   ` Corey Minyard
  2008-10-29 21:57                                                     ` Eric Dumazet
@ 2008-10-29 21:58                                                     ` Paul E. McKenney
  1 sibling, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-29 21:58 UTC (permalink / raw)
  To: Corey Minyard
  Cc: Eric Dumazet, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

On Wed, Oct 29, 2008 at 04:29:19PM -0500, Corey Minyard wrote:
> Paul E. McKenney wrote:
>> O
> ..snip
>>> Hum... Another way of handling all those cases and avoid memory barriers
>>> would be to have different "NULL" pointers.
>>>
>>> Each hash chain should have a unique "NULL" pointer (in the case of UDP, 
>>> it
>>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>>
>>> Then, when performing a lookup, a reader should check that the "NULL" pointer
>>> it gets at the end of its lookup is the "hash" value of its chain.
>>>
>>> If not -> restart the loop, aka "goto begin;" :)
>>>
>>> We could avoid memory barriers then.
>>>
>>> In the two cases Corey mentioned, this trick could let us avoid memory 
>>> barriers.
>>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>>
>>> What do you think ?
>>>     
>>
>> Kinky!!!  ;-)
>>   
> My thought exactly ;-).
>
>> Then the rcu_dereference() would be supplying the needed memory barriers.
>>
>> Hmmm...  I guess that the only confusion would be if the element got
>> removed and then added to the same list.  But then if its pointer was
>> pseudo-NULL, then that would mean that all subsequent elements had been
>> removed, and all preceding ones added after the scan started.
>>
>> Which might well be harmless, but I must defer to you on this one at
>> the moment.
>>   
> I believe that is harmless, as re-scanning the same data should be fine.
>
>> If you need a larger hash table, another approach would be to set the
>> pointer's low-order bit, allowing the upper bits to be a full-sized
>> index -- or even a pointer to the list header.  Just make very sure
>> to clear the pointer when freeing, or an element on the freelist
>> could end up looking like a legitimate end of list...  Which again
>> might well be safe, but why inflict this on oneself?
>>   
> Kind of my thought, too.  That's a lot of work to avoid a single smp_wmb() 
> on the socket creation path.  Plus this could be extra confusing.

Just to be clear, I was fulminating against any potential failure to
clear the pseudo-NULL pointer, not against the pseudo-pointer itself.
This sort of trick is already used in some of the RCU-protected trees
(for example, FIB tree, IIRC), so I would look a bit funny fulminating
too hard against it.  ;-)

The only other high-level approach I have come up with thus far is to
maintain separate hash tables for the long-lived UDP sockets (protected
by RCU) and for the short-lived UDP sockets (protected by locking).
Given the usual bimodal traffic pattern, most of the sockets are short
lived, but most of the data is transmitted over long-lived sockets.  If a
socket receives more than N packets (10? 50? 100?), it is moved from the
short-lived table to the long-lived table.  Sockets on the short-lived
table may be freed directly, while sockets on the long-lived table must
be RCU freed -- but this added overhead should be in the noise for a
long-lived connection.  Lookups hit the RCU-protected table, then the lock
protected table, then the RCU-protected table again, but still holding
the lock.  (Clearly, only search until you find the desired socket.)

However, I am not certain that this short-term/long-term approach is
better than the approach that Eric is proposing.  It might in fact be
worse.  But I throw it out anyway on the off-chance that it is helpful
as a comparison or as a solution to some future problem.
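
For comparison, the lookup order in that two-table scheme would be roughly the
sketch below. Everything in it is hypothetical (the helper names and the
shortlived_lock do not exist anywhere); reference counting and the promotion
path are omitted, and the lock is assumed to also serialize promotions between
the two tables.

	struct sock *sk;

	/* 1) RCU-protected table of long-lived sockets, no lock taken */
	rcu_read_lock();
	sk = longlived_table_lookup(net, saddr, sport, daddr, dport, dif);
	rcu_read_unlock();
	if (sk)
		return sk;

	/* 2) lock-protected table of short-lived sockets */
	spin_lock(&shortlived_lock);
	sk = shortlived_table_lookup(net, saddr, sport, daddr, dport, dif);
	if (!sk)
		/* 3) recheck the RCU table, still holding the lock, in case
		 * the socket was promoted between steps 1 and 2 */
		sk = longlived_table_lookup(net, saddr, sport, daddr, dport, dif);
	spin_unlock(&shortlived_lock);
	return sk;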

							Thanx, Paul

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 20:17                                                 ` Paul E. McKenney
  2008-10-29 21:29                                                   ` Corey Minyard
@ 2008-10-29 22:08                                                   ` Eric Dumazet
  2008-10-30  3:22                                                     ` Corey Minyard
                                                                       ` (2 more replies)
  1 sibling, 3 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-29 22:08 UTC (permalink / raw)
  To: paulmck
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 3034 bytes --]

Paul E. McKenney a écrit :
> On Wed, Oct 29, 2008 at 09:00:13PM +0100, Eric Dumazet wrote:
>> Hum... Another way of handling all those cases and avoid memory barriers
>> would be to have different "NULL" pointers.
>>
>> Each hash chain should have a unique "NULL" pointer (in the case of UDP, it
>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>
>> Then, when performing a lookup, a reader should check that the "NULL" pointer
>> it gets at the end of its lookup is the "hash" value of its chain.
>>
>> If not -> restart the loop, aka "goto begin;" :)
>>
>> We could avoid memory barriers then.
>>
>> In the two cases Corey mentioned, this trick could let us avoid memory 
>> barriers.
>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>
>> What do you think ?
> 
> Kinky!!!  ;-)
> 
> Then the rcu_dereference() would be supplying the needed memory barriers.
> 
> Hmmm...  I guess that the only confusion would be if the element got
> removed and then added to the same list.  But then if its pointer was
> pseudo-NULL, then that would mean that all subsequent elements had been
> removed, and all preceding ones added after the scan started.
> 
> Which might well be harmless, but I must defer to you on this one at
> the moment.
> 
> If you need a larger hash table, another approach would be to set the
> pointer's low-order bit, allowing the upper bits to be a full-sized
> index -- or even a pointer to the list header.  Just make very sure
> to clear the pointer when freeing, or an element on the freelist
> could end up looking like a legitimate end of list...  Which again
> might well be safe, but why inflict this on oneself?

Well, for the UDP case the hash table size will be <= 65536 anyway, so we can
assume no dynamic kernel memory lies in the range [0 .. 65535].

Here is a patch (untested yet, it's really time for me to get some sleep ;) )

[PATCH] udp: Introduce special NULL pointers for hlist termination

In order to safely detect changes in chains, we would like to have different
'NULL' pointers. Each chain in the hash table is terminated by a unique 'NULL'
value, so that lockless readers can detect that their lookup strayed from
its starting chain.

We define 'NULL' values as ((unsigned long)value < UDP_HTABLE_SIZE).

This saves the memory barriers (a read barrier to fetch 'next' pointers
*before* checking key values) that were added in commit
96631ed16c514cf8b28fab991a076985ce378c26 (udp: introduce
sk_for_each_rcu_safenext()).

This also removes the need for the missing write memory barrier spotted by
Corey Minyard (in udp_lib_get_port(), between the sk_hash update and the
->next update).

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
include/linux/list.h    |   32 ++++++++++++++++++++++++++++++++
include/linux/rculist.h |   15 ++++++++-------
include/net/sock.h      |    9 +++++++--
net/ipv4/udp.c          |   19 +++++++++++++------
net/ipv6/udp.c          |   16 ++++++++++++----
5 files changed, 72 insertions(+), 19 deletions(-)



[-- Attachment #2: nulls.patch --]
[-- Type: text/plain, Size: 9060 bytes --]

diff --git a/include/linux/list.h b/include/linux/list.h
index 969f6e9..a3d5dd1 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -654,6 +654,22 @@ static inline void hlist_move_list(struct hlist_head *old,
 	     pos && ({ prefetch(pos->next); 1;}) &&			 \
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
 	     pos = pos->next)
+/**
+ * hlist_for_each_entry_nulls	- iterate over list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ * @nullval: the iteration should stop if a pointer is < nullval
+ *
+ * Special version of hlist_for_each_entry where the end pointer
+ * can be NULL but also any value < nullval.
+ */
+#define hlist_for_each_entry_nulls(tpos, pos, head, member, nullval)	 \
+	for (pos = (head)->first;					 \
+	     ((unsigned long)pos >= nullval) && ({ prefetch(pos->next); 1;}) && \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
 
 /**
  * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
@@ -679,6 +695,22 @@ static inline void hlist_move_list(struct hlist_head *old,
 	     pos = pos->next)
 
 /**
+ * hlist_for_each_entry_from_nulls - iterate over a hlist continuing from current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ * @nullval: the iteration should stop if a pointer is < nullval
+ *
+ * Special version of hlist_for_each_entry_from where the end pointer
+ * can be NULL but also any value < nullval.
+ */
+#define hlist_for_each_entry_from_nulls(tpos, pos, member, nullval)	\
+	for (; ((unsigned long)pos >= nullval) && \
+		({ prefetch(pos->next); 1;})   && \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
  * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct hlist_node to use as a loop cursor.
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 3ba2998..6f78e2b 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -384,21 +384,22 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		pos = rcu_dereference(pos->next))
 
 /**
- * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
+ * hlist_for_each_entry_rcu_nulls - iterate over rcu list of given type
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct hlist_node to use as a loop cursor.
  * @head:	the head for your list.
  * @member:	the name of the hlist_node within the struct.
- * @next:       the &struct hlist_node to use as a next cursor
+ * @nullval:       the iteration should stop if a pointer is < nullval
  *
- * Special version of hlist_for_each_entry_rcu that make sure
- * each next pointer is fetched before each iteration.
+ * Special version of hlist_for_each_entry_rcu where the end pointer
+ * can be NULL but also any value < nullval.
  */
-#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
+#define hlist_for_each_entry_rcu_nulls(tpos, pos, head, member, nullval) \
 	for (pos = rcu_dereference((head)->first);			 \
-		pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&	\
+		((unsigned long)pos >= nullval) && 			\
+		({ prefetch(pos->next); 1; }) &&			\
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
-		pos = rcu_dereference(next))
+		pos = rcu_dereference(pos->next))
 
 #endif	/* __KERNEL__ */
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index a4f6d3f..efe4def 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,11 +419,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
-#define sk_for_each_rcu_safenext(__sk, node, list, next) \
-	hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
+#define sk_for_each_nulls(__sk, node, list, nullval) \
+	hlist_for_each_entry_nulls(__sk, node, list, sk_node, nullval)
+#define sk_for_each_rcu_nulls(__sk, node, list, nullval) \
+	hlist_for_each_entry_rcu_nulls(__sk, node, list, sk_node, nullval)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
+#define sk_for_each_from_nulls(__sk, node, nullval) \
+	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
+		hlist_for_each_entry_from_nulls(__sk, node, sk_node, nullval)
 #define sk_for_each_continue(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_continue(__sk, node, sk_node)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ecec8..a61fe89 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -129,7 +129,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 	struct sock *sk2;
 	struct hlist_node *node;
 
-	sk_for_each(sk2, node, &hslot->head)
+	sk_for_each_nulls(sk2, node, &hslot->head, UDP_HTABLE_SIZE)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
 		    sk2->sk_hash == num				&&
@@ -256,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -266,7 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
+	sk_for_each_rcu_nulls(sk, node, &hslot->head, UDP_HTABLE_SIZE) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain
@@ -280,6 +280,13 @@ begin:
 			badness = score;
 		}
 	}
+	/*
+	 * if the 'NULL' pointer we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 */
+	if ((unsigned long)node != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -324,7 +331,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
 	struct sock *s = sk;
 	unsigned short hnum = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_for_each_from_nulls(s, node, UDP_HTABLE_SIZE) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (s->sk_hash != hnum					||
@@ -1556,7 +1563,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 		struct hlist_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
 		spin_lock_bh(&hslot->lock);
-		sk_for_each(sk, node, &hslot->head) {
+		sk_for_each_nulls(sk, node, &hslot->head, UDP_HTABLE_SIZE) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
@@ -1746,7 +1753,7 @@ void __init udp_table_init(struct udp_table *table)
 	int i;
 
 	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
-		INIT_HLIST_HEAD(&table->hash[i].head);
+		table->hash[i].head.first = (struct hlist_node *)i;
 		spin_lock_init(&table->hash[i].lock);
 	}
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 32d914d..13635ef 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -108,19 +108,27 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
+	sk_for_each_rcu_nulls(sk, node, &hslot->head, UDP_HTABLE_SIZE) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain
 		 */
 		if (udp_hashfn(net, sk->sk_hash) != hash)
 			goto begin;
-		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+		score = compute_score(sk, net, hnum, saddr, sport,
+				      daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}
+	/*
+	 * if the 'NULL' pointer we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 */
+	if ((unsigned long)node != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -364,7 +372,7 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
 	struct sock *s = sk;
 	unsigned short num = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_for_each_from_nulls(s, node, UDP_HTABLE_SIZE) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (sock_net(s) != sock_net(sk))

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 22:08                                                   ` Eric Dumazet
@ 2008-10-30  3:22                                                     ` Corey Minyard
  2008-10-30  5:50                                                       ` Eric Dumazet
  2008-10-30  5:40                                                     ` David Miller
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
  2 siblings, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-30  3:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Eric Dumazet wrote:
> Paul E. McKenney a écrit :
>> On Wed, Oct 29, 2008 at 09:00:13PM +0100, Eric Dumazet wrote:
>>> Hum... Another way of handling all those cases and avoid memory 
>>> barriers
>>> would be to have different "NULL" pointers.
>>>
>>> Each hash chain should have a unique "NULL" pointer (in the case of 
>>> UDP, it
>>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>>
>>> Then, when performing a lookup, a reader should check that the "NULL" pointer
>>> it gets at the end of its lookup is the "hash" value of its chain.
>>>
>>> If not -> restart the loop, aka "goto begin;" :)
>>>
>>> We could avoid memory barriers then.
>>>
>>> In the two cases Corey mentioned, this trick could let us avoid 
>>> memory barriers.
>>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>>
>>> What do you think ?
>>
>> Kinky!!!  ;-)
>>
>> Then the rcu_dereference() would be supplying the needed memory 
>> barriers.
>>
>> Hmmm...  I guess that the only confusion would be if the element got
>> removed and then added to the same list.  But then if its pointer was
>> pseudo-NULL, then that would mean that all subsequent elements had been
>> removed, and all preceding ones added after the scan started.
>>
>> Which might well be harmless, but I must defer to you on this one at
>> the moment.
>>
>> If you need a larger hash table, another approach would be to set the
>> pointer's low-order bit, allowing the upper bits to be a full-sized
>> index -- or even a pointer to the list header.  Just make very sure
>> to clear the pointer when freeing, or an element on the freelist
>> could end up looking like a legitimate end of list...  Which again
>> might well be safe, but why inflict this on oneself?
>
> Well, for UDP case, hash table will be <= 65536 anyway, we can assume
> no dynamic kernel memory is in the range [0 .. 65535]
>
> Here is a patch (untested yet, its really time for a sleep for me ;) )
>
> [PATCH] udp: Introduce special NULL pointers for hlist termination
>
> In order to safely detect changes in chains, we would like to have 
> different
> 'NULL' pointers. Each chain in hash table is terminated by an unique 
> 'NULL'
> value, so that the lockless readers can detect their lookups evaded from
> their starting chain.
>
> We define 'NULL' values as ((unsigned long)values < UDP_HTABLE_SIZE)
>
> This saves memory barriers (a read barrier to fetch 'next' pointers
> *before* checking key values) we added in commit 
> 96631ed16c514cf8b28fab991a076985ce378c26 (udp: introduce 
> sk_for_each_rcu_safenext())
> This also saves a missing memory barrier spotted by Corey Minyard (a 
> write one in udp_lib_get_port(), between sk_hash update and ->next 
> update)
I think you are right, this will certainly perform a lot better without 
the read barrier in the list traversal.  I haven't seen any problems 
with this approach, though it's unusual enough to perhaps warrant some 
extra comments in the code.

You do need to modify udp_lib_unhash(), as sk_del_node_init_rcu() will 
do a NULL check on the ->next value, so you will need a special version 
of that as well.
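
In other words, the unlink primitive has to treat any pseudo-NULL value, not
just 0, as "end of chain". A nulls-aware variant would look roughly like this
sketch (it is essentially the shape __hlist_nulls_del() takes in the follow-up
patch later in this thread):

	static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
	{
		struct hlist_nulls_node *next = n->next;
		struct hlist_nulls_node **pprev = n->pprev;

		*pprev = next;
		/* only a real node has a pprev to fix up, never a pseudo-NULL */
		if (!is_a_nulls(next))
			next->pprev = pprev;
	}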

-corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 22:08                                                   ` Eric Dumazet
  2008-10-30  3:22                                                     ` Corey Minyard
@ 2008-10-30  5:40                                                     ` David Miller
  2008-10-30  5:51                                                       ` Eric Dumazet
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
  2 siblings, 1 reply; 134+ messages in thread
From: David Miller @ 2008-10-30  5:40 UTC (permalink / raw)
  To: dada1
  Cc: paulmck, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 29 Oct 2008 23:08:30 +0100

> @@ -1746,7 +1753,7 @@ void __init udp_table_init(struct udp_table *table)
>  	int i;
>  
>  	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
> -		INIT_HLIST_HEAD(&table->hash[i].head);
> +		table->hash[i].head.first = (struct hlist_node *)i;

Please hide this behind some list.h interface macro, even something
as simple as INIT_HLIST_HEAD_NULLS(X, index) would suffice.

And as Corey said, the code needs more comments for something as
clever as this! :-)

Thanks.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30  3:22                                                     ` Corey Minyard
@ 2008-10-30  5:50                                                       ` Eric Dumazet
  2008-11-02  4:19                                                         ` David Miller
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-30  5:50 UTC (permalink / raw)
  To: Corey Minyard
  Cc: paulmck, David Miller, shemminger, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 4369 bytes --]

Corey Minyard a écrit :
> Eric Dumazet wrote:
>> Paul E. McKenney a écrit :
>>> On Wed, Oct 29, 2008 at 09:00:13PM +0100, Eric Dumazet wrote:
>>>> Hum... Another way of handling all those cases and avoid memory 
>>>> barriers
>>>> would be to have different "NULL" pointers.
>>>>
>>>> Each hash chain should have a unique "NULL" pointer (in the case of 
>>>> UDP, it
>>>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>>>
>>>> Then, when performing a lookup, a reader should check that the "NULL" pointer
>>>> it gets at the end of its lookup is the "hash" value of its chain.
>>>>
>>>> If not -> restart the loop, aka "goto begin;" :)
>>>>
>>>> We could avoid memory barriers then.
>>>>
>>>> In the two cases Corey mentioned, this trick could let us avoid 
>>>> memory barriers.
>>>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>>>
>>>> What do you think ?
>>>
>>> Kinky!!!  ;-)
>>>
>>> Then the rcu_dereference() would be supplying the needed memory 
>>> barriers.
>>>
>>> Hmmm...  I guess that the only confusion would be if the element got
>>> removed and then added to the same list.  But then if its pointer was
>>> pseudo-NULL, then that would mean that all subsequent elements had been
>>> removed, and all preceding ones added after the scan started.
>>>
>>> Which might well be harmless, but I must defer to you on this one at
>>> the moment.
>>>
>>> If you need a larger hash table, another approach would be to set the
>>> pointer's low-order bit, allowing the upper bits to be a full-sized
>>> index -- or even a pointer to the list header.  Just make very sure
>>> to clear the pointer when freeing, or an element on the freelist
>>> could end up looking like a legitimate end of list...  Which again
>>> might well be safe, but why inflict this on oneself?
>>
>> Well, for UDP case, hash table will be <= 65536 anyway, we can assume
>> no dynamic kernel memory is in the range [0 .. 65535]
>>
>> Here is a patch (untested yet, its really time for a sleep for me ;) )
>>
>> [PATCH] udp: Introduce special NULL pointers for hlist termination
>>
>> In order to safely detect changes in chains, we would like to have 
>> different
>> 'NULL' pointers. Each chain in hash table is terminated by an unique 
>> 'NULL'
>> value, so that the lockless readers can detect their lookups evaded from
>> their starting chain.
>>
>> We define 'NULL' values as ((unsigned long)values < UDP_HTABLE_SIZE)
>>
>> This saves memory barriers (a read barrier to fetch 'next' pointers
>> *before* checking key values) we added in commit 
>> 96631ed16c514cf8b28fab991a076985ce378c26 (udp: introduce 
>> sk_for_each_rcu_safenext())
>> This also saves a missing memory barrier spotted by Corey Minyard (a 
>> write one in udp_lib_get_port(), between sk_hash update and ->next 
>> update)
> I think you are right, this will certainly perform a lot better without 
> the read barrier in the list traversal.  I haven't seen any problems 
> with this approach, though it's unusual enough to perhaps warrant some 
> extra comments in the code.
> 
> You do need to modify udp_lib_unhash(), as sk_del_node_init_rcu() will 
> do a NULL check on the ->next value, so you will need a special version 
> of that as well.
> 

Yes, we need many new macros, like sk_next_nulls(), sk_head_nulls(), ...

I have a working patch now, but not yet presentable for lkml :)

This patch needs to touch files outside the netdev scope, so it will need
to be in really good shape, with documentation.

(Probably a new file : include/linux/list_nulls.h ?)

Maybe in the meantime, we can commit a temporary patch doing the smp_wmb()
you suggested ?

Thanks

[PATCH] udp: add a missing smp_wmb() in udp_lib_get_port()

Corey Minyard spotted a missing memory barrier in udp_lib_get_port()

We need to make sure a reader cannot see the new 'sk->sk_next' value
together with the previous value of 'sk->sk_hash'. Otherwise, an item could
be deleted from one chain and inserted into another chain; if the new chain
was empty before the move, the 'next' pointer is NULL, and a lockless reader
cannot detect that it missed the remaining items of the original chain.

This patch is temporary, since we expect an upcoming patch
to introduce another way of handling the problem.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: missing_barrier.patch --]
[-- Type: text/plain, Size: 521 bytes --]

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ecec8..5e605ac 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -189,6 +189,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
+		/*
+		 * We need the previous write to sk->sk_hash to be committed
+		 * before the write to sk->next done in the following add_node() variant
+		 */
+		smp_wmb();
 		sk_add_node_rcu(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
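
The write barrier above pairs with the read barrier already present in the
sk_for_each_rcu_safenext() iterator (the smp_rmb() between fetching ->next and
looking at the keys). Schematically, condensing the code quoted elsewhere in
this thread:

	/* writer: udp_lib_get_port(), possibly moving sk to another chain */
	sk->sk_hash = snum;			/* store A: new hash value */
	smp_wmb();
	sk_add_node_rcu(sk, &hslot->head);	/* store B: links ->next   */

	/* reader: lookup loop, already holding a pointer to sk */
	next = pos->next;			/* load B'                 */
	smp_rmb();
	if (udp_hashfn(net, sk->sk_hash) != hash)	/* load A'         */
		goto begin;

	/*
	 * If the reader observes the new ->next (B), the barrier pairing
	 * guarantees it also observes the new sk_hash (A), so the hash
	 * mismatch is detected and the scan restarts instead of silently
	 * terminating on the wrong chain.
	 */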

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30  5:40                                                     ` David Miller
@ 2008-10-30  5:51                                                       ` Eric Dumazet
  2008-10-30  7:04                                                         ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-30  5:51 UTC (permalink / raw)
  To: David Miller
  Cc: paulmck, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Wed, 29 Oct 2008 23:08:30 +0100
> 
>> @@ -1746,7 +1753,7 @@ void __init udp_table_init(struct udp_table *table)
>>  	int i;
>>  
>>  	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
>> -		INIT_HLIST_HEAD(&table->hash[i].head);
>> +		table->hash[i].head.first = (struct hlist_node *)i;
> 
> Please hide this behind some list.h interface macro, even something
> as simple as INIT_HLIST_HEAD_NULLS(X, index) would suffice.
> 
> And as Corey said, the code needs more comments for something as
> clever as this! :-)
> 

Yes I agree 100%, please give me one day to prepare a real patch,
or else akpm will kill us :)




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30  5:51                                                       ` Eric Dumazet
@ 2008-10-30  7:04                                                         ` Eric Dumazet
  2008-10-30  7:05                                                           ` David Miller
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-30  7:04 UTC (permalink / raw)
  To: David Miller
  Cc: paulmck, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

Eric Dumazet a écrit :
> David Miller a écrit :
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Wed, 29 Oct 2008 23:08:30 +0100
>>
>>> @@ -1746,7 +1753,7 @@ void __init udp_table_init(struct udp_table 
>>> *table)
>>>      int i;
>>>  
>>>      for (i = 0; i < UDP_HTABLE_SIZE; i++) {
>>> -        INIT_HLIST_HEAD(&table->hash[i].head);
>>> +        table->hash[i].head.first = (struct hlist_node *)i;
>>
>> Please hide this behind some list.h interface macro, even something
>> as simple as INIT_HLIST_HEAD_NULLS(X, index) would suffice.
>>
>> And as Corey said, the code needs more comments for something as
>> clever as this! :-)
>>
> 
> Yes I agree 100%, please give me one day to prepare a real patch,
> or else akpm will kill us :)
> 

If we design something that could be reused, say for TCP sockets, we need
to be able to handle a very large number of 'NULL' pointers, say up to 64*1024*1024.

So let's use the low-order bit: set to one for "NULL" pointers, and 0 for regular pointers.

This gives us 31 bits (or 63 bits) to store any valuable info :)

Also, all ...._nulls() macros would not need to know the max value (UDP_HTABLE_SIZE
in the UDP case), since all they have to do is test ((unsigned long)pos & 1).

At iterator exit, pos would contain the encoded 'NULL' value; the 'index' is
recovered as (pos >> 1), hiding this implementation detail.
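
In code, that encoding amounts to the following sketch (these mirror the helpers
that the patch posted later in this thread defines as INIT_HLIST_NULLS_HEAD(),
is_a_nulls() and get_nulls_value()):

	/* an end-of-list marker: value in the upper bits, low bit set */
	head->first = (struct hlist_nulls_node *)(1UL | (nulls << 1));

	/* telling the two kinds of pointer apart */
	if ((unsigned long)ptr & 1)			/* is_a_nulls(ptr)      */
		value = (unsigned long)ptr >> 1;	/* get_nulls_value(ptr) */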



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30  7:04                                                         ` Eric Dumazet
@ 2008-10-30  7:05                                                           ` David Miller
  0 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-10-30  7:05 UTC (permalink / raw)
  To: dada1
  Cc: paulmck, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 30 Oct 2008 08:04:42 +0100

> If we design something that could be reused, say for TCP sockets, we need
> to be able to handle very large number of 'NULL' pointers, say, up to 64*1024*1024
> 
> So lets use the low order bit, set to one for "NULL" pointers, and 0 for regular pointers.
> 
> This gives us 31 bits (or 63 bits) to store any valuable info :)
> 
> and all ...._nulls() macros would not need to know the max value (UDP_HTABLE_SIZE in UDP case),
> since all they have to do is a test ((unsigned long)pos & 1)
> 
> At iterator exit, pos would contain the 'index' value, (pos >> 1), to hide this
> implementation detail.

This sounds fine to me.  Quite an improvement in fact :)


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 20:00                                               ` Eric Dumazet
  2008-10-29 20:17                                                 ` Paul E. McKenney
@ 2008-10-30 11:04                                                 ` Peter Zijlstra
  2008-10-30 11:30                                                   ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: Peter Zijlstra @ 2008-10-30 11:04 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, Corey Minyard, David Miller, shemminger, benny+usenet,
	netdev, Christoph Lameter, johnpol, Christian Bell

On Wed, 2008-10-29 at 21:00 +0100, Eric Dumazet wrote:

> 
> Hum... Another way of handling all those cases and avoid memory barriers
> would be to have different "NULL" pointers.
> 
> Each hash chain should have a unique "NULL" pointer (in the case of UDP, it
> can be the 128 values : [ (void*)0 .. (void *)127 ]

Why not use the bucket pointer as terminating condition?

Because all you really need is a pointer that is specific per bucket,
and not a valid element, right?



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-29 14:36                               ` Eric Dumazet
  2008-10-29 15:34                                 ` Corey Minyard
  2008-10-29 18:20                                 ` David Miller
@ 2008-10-30 11:12                                 ` Peter Zijlstra
  2008-10-30 11:29                                   ` Eric Dumazet
  2 siblings, 1 reply; 134+ messages in thread
From: Peter Zijlstra @ 2008-10-30 11:12 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	paulmck, Christoph Lameter, johnpol, Christian Bell

On Wed, 2008-10-29 at 15:36 +0100, Eric Dumazet wrote:
> +/**
> + * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
> + * @tpos:      the type * to use as a loop cursor.
> + * @pos:       the &struct hlist_node to use as a loop cursor.
> + * @head:      the head for your list.
> + * @member:    the name of the hlist_node within the struct.
> + * @next:       the &struct hlist_node to use as a next cursor
> + *
> + * Special version of hlist_for_each_entry_rcu that make sure
> + * each next pointer is fetched before each iteration.
> + */
> +#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
> +       for (pos = rcu_dereference((head)->first);                       \
> +               pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \
> +               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
> +               pos = rcu_dereference(next))
> +

why _safenext and not _safe like hlist_for_each_entry_safe() which also
keeps a next pointer?

Also note the difference in argument order between these two.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30 11:12                                 ` Peter Zijlstra
@ 2008-10-30 11:29                                   ` Eric Dumazet
  0 siblings, 0 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-30 11:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Corey Minyard, David Miller, shemminger, benny+usenet, netdev,
	paulmck, Christoph Lameter, johnpol, Christian Bell

Peter Zijlstra a écrit :
> On Wed, 2008-10-29 at 15:36 +0100, Eric Dumazet wrote:
>> +/**
>> + * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
>> + * @tpos:      the type * to use as a loop cursor.
>> + * @pos:       the &struct hlist_node to use as a loop cursor.
>> + * @head:      the head for your list.
>> + * @member:    the name of the hlist_node within the struct.
>> + * @next:       the &struct hlist_node to use as a next cursor
>> + *
>> + * Special version of hlist_for_each_entry_rcu that make sure
>> + * each next pointer is fetched before each iteration.
>> + */
>> +#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
>> +       for (pos = rcu_dereference((head)->first);                       \
>> +               pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && \
>> +               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
>> +               pos = rcu_dereference(next))
>> +
> 
> why _safenext and not _safe like hlist_for_each_entry_safe() which also
> keeps a next pointer?
> 
> Also note the difference in argument order between these two.
> 

Yes, this one is going to vanish soon.



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30 11:04                                                 ` [PATCH 2/2] udp: RCU handling for Unicast packets Peter Zijlstra
@ 2008-10-30 11:30                                                   ` Eric Dumazet
  2008-10-30 18:25                                                     ` Paul E. McKenney
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-30 11:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Corey Minyard, David Miller, shemminger, benny+usenet,
	netdev, Christoph Lameter, johnpol, Christian Bell

Peter Zijlstra a écrit :
> On Wed, 2008-10-29 at 21:00 +0100, Eric Dumazet wrote:
> 
>> Hum... Another way of handling all those cases and avoid memory barriers
>> would be to have different "NULL" pointers.
>>
>> Each hash chain should have a unique "NULL" pointer (in the case of UDP, it
>> can be the 128 values : [ (void*)0 .. (void *)127 ]
> 
> Why not use the bucket pointer as terminating condition?
> 
> Because all you really need is a pointer that is specific per bucket,
> and not a valid element, right?

Yes, but it forces the compiler to keep the bucket pointer around.

Chances are this value would be stored on the stack.

The next patch will use the least significant bit to distinguish a valid
pointer from a "NULL" pointer.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-29 22:08                                                   ` Eric Dumazet
  2008-10-30  3:22                                                     ` Corey Minyard
  2008-10-30  5:40                                                     ` David Miller
@ 2008-10-30 15:40                                                     ` Eric Dumazet
  2008-10-30 15:51                                                       ` Stephen Hemminger
                                                                         ` (6 more replies)
  2 siblings, 7 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-10-30 15:40 UTC (permalink / raw)
  To: paulmck, Corey Minyard, David Miller
  Cc: shemminger, benny+usenet, netdev, Christoph Lameter, a.p.zijlstra,
	johnpol, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 3313 bytes --]

Eric Dumazet a écrit :
> Paul E. McKenney a écrit :
>> On Wed, Oct 29, 2008 at 09:00:13PM +0100, Eric Dumazet wrote:
>>> Hum... Another way of handling all those cases and avoid memory barriers
>>> would be to have different "NULL" pointers.
>>>
>>> Each hash chain should have a unique "NULL" pointer (in the case of 
>>> UDP, it
>>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>>
>>> Then, when performing a lookup, a reader should check that the "NULL" pointer
>>> it gets at the end of its lookup is the "hash" value of its chain.
>>>
>>> If not -> restart the loop, aka "goto begin;" :)
>>>
>>> We could avoid memory barriers then.
>>>
>>> In the two cases Corey mentioned, this trick could let us avoid 
>>> memory barriers.
>>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>>
>>> What do you think ?
>>
>> Kinky!!!  ;-)
>>
>> Then the rcu_dereference() would be supplying the needed memory barriers.
>>
>> Hmmm...  I guess that the only confusion would be if the element got
>> removed and then added to the same list.  But then if its pointer was
>> pseudo-NULL, then that would mean that all subsequent elements had been
>> removed, and all preceding ones added after the scan started.
>>
>> Which might well be harmless, but I must defer to you on this one at
>> the moment.
>>
>> If you need a larger hash table, another approach would be to set the
>> pointer's low-order bit, allowing the upper bits to be a full-sized
>> index -- or even a pointer to the list header.  Just make very sure
>> to clear the pointer when freeing, or an element on the freelist
>> could end up looking like a legitimate end of list...  Which again
>> might well be safe, but why inflict this on oneself?
> 

Ok, here is an updated and tested patch.

Thanks everybody

[PATCH] udp: Introduce special NULL pointers for hlist termination

In order to safely detect changes in chains, we would like to have different
'NULL' pointers. Each chain in the hash table is terminated by a unique 'NULL'
value, so that lockless readers can detect that their lookup strayed from
its starting chain.

We introduce a new type of hlist implementation, named hlist_nulls, where
we use the least significant bit of the 'ptr' to tell whether it is a "NULL"
value or a pointer to an object. We expect to use this new hlist variant for
TCP as well.

For the UDP/UDP-Lite hash table, we use 128 different "NULL" values
(UDP_HTABLE_SIZE=128).

Using hlist_nulls saves the memory barriers (a read barrier to fetch 'next'
pointers *before* checking key values) that were added in commit
96631ed16c514cf8b28fab991a076985ce378c26
(udp: introduce sk_for_each_rcu_safenext())

This also saves a write memory barrier in udp_lib_get_port() (between the
sk->sk_hash update and the sk->next update).

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/list_nulls.h    |   97 ++++++++++++++++++++++++++++++++
 include/linux/rculist.h       |   17 -----
 include/linux/rculist_nulls.h |   55 ++++++++++++++++++
 include/net/sock.h            |   50 ++++++++++++----
 include/net/udp.h             |    2
 net/ipv4/udp.c                |   40 ++++++-------
 net/ipv6/udp.c                |   22 ++++---
 7 files changed, 228 insertions(+), 55 deletions(-)
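
For orientation before the patch itself, the writer side ends up using the new
helpers roughly as follows (condensed sketch, not the literal hunks; table and
hslot are the udp_table structures discussed earlier in this thread):

	/* table init: each chain gets its own pseudo-NULL, its index */
	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
		spin_lock_init(&table->hash[i].lock);
	}

	/* bind: no smp_wmb() needed between the hash and the link any more,
	 * a reader that drifts off its chain restarts via the pseudo-NULL check */
	spin_lock_bh(&hslot->lock);
	sk->sk_hash = snum;
	sk_nulls_add_node_rcu(sk, &hslot->head);
	spin_unlock_bh(&hslot->lock);

	/* unhash: nulls-aware removal, concurrent RCU readers stay safe */
	spin_lock_bh(&hslot->lock);
	sk_nulls_del_node_init_rcu(sk);
	spin_unlock_bh(&hslot->lock);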


[-- Attachment #2: PATCH_NULLS.patch --]
[-- Type: text/plain, Size: 17979 bytes --]

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index e69de29..6adaa75 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -0,0 +1,97 @@
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+#include <linux/list.h>
+
+/*
+ * Special versions of lists, where a NULL pointer may have different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * The least significant bit of 'ptr' is used to encode the 'NULL' value.
+ * Set to 1 : This is a NULL value (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ *
+ * Used for UDP sockets.
+ */
+
+struct hlist_nulls_head {
+	struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+	struct hlist_nulls_node *next, **pprev;
+};
+
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+	((ptr)->first = (struct hlist_nulls_node *) (1UL | ((nulls) << 1)))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+/**
+ * is_a_nulls - Test if a ptr to struct hlist_nulls_node is a nulls
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(struct hlist_nulls_node *ptr)
+{
+	return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Returns the null associated with a ptr
+ * @ptr: ptr to struct hlist_nulls_node
+ *
+ * Caller must check is_a_nulls(ptr) is true before calling this.
+ */
+static inline unsigned long get_nulls_value(struct hlist_nulls_node *ptr)
+{
+	return ((unsigned long)ptr) >> 1;
+}
+
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+	return !h->pprev;
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+	return is_a_nulls(h->first);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+	struct hlist_nulls_node *next = n->next;
+	struct hlist_nulls_node **pprev = n->pprev;
+	*pprev = next;
+	if (!is_a_nulls(next))
+		next->pprev = pprev;
+}
+
+/**
+ * hlist_nulls_for_each_entry	- iterate over list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member)	 \
+	for (pos = (head)->first;				 \
+	     (!is_a_nulls(pos)) && ({ prefetch(pos->next); 1;}) && \
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member)	\
+	for (; (!is_a_nulls(pos)) && 				\
+		({ prefetch(pos->next); 1;})   &&		\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+#endif
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 3ba2998..e649bd3 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -383,22 +383,5 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference(pos->next))
 
-/**
- * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
- * @tpos:	the type * to use as a loop cursor.
- * @pos:	the &struct hlist_node to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the hlist_node within the struct.
- * @next:       the &struct hlist_node to use as a next cursor
- *
- * Special version of hlist_for_each_entry_rcu that make sure
- * each next pointer is fetched before each iteration.
- */
-#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
-	for (pos = rcu_dereference((head)->first);			 \
-		pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&	\
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
-		pos = rcu_dereference(next))
-
 #endif	/* __KERNEL__ */
 #endif
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index e69de29..f16b455 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -0,0 +1,55 @@
+#ifndef _LINUX_RCULIST_NULLS_H
+#define _LINUX_RCULIST_NULLS_H
+
+#ifdef __KERNEL__
+
+/*
+ * RCU-protected list version, based on 'hlist_nulls' variant
+ *
+ * Used for UDP sockets.
+ */
+#include <linux/list_nulls.h>
+#include <linux/rcupdate.h>
+
+static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
+{
+	if (!hlist_nulls_unhashed(n)) {
+		__hlist_nulls_del(n);
+		n->pprev = NULL;
+	}
+}
+
+static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
+{
+	__hlist_nulls_del(n);
+	n->pprev = LIST_POISON2;
+}
+
+static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
+					struct hlist_nulls_head *h)
+{
+	struct hlist_nulls_node *first = h->first;
+
+	n->next = first;
+	n->pprev = &h->first;
+	rcu_assign_pointer(h->first, n);
+	if (!is_a_nulls(first))
+		first->pprev = &n->next;
+}
+/**
+ * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
+	for (pos = rcu_dereference((head)->first);			 \
+		(!is_a_nulls(pos)) && 			\
+		({ prefetch(pos->next); 1; }) &&			\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
+		pos = rcu_dereference(pos->next))
+
+#endif  /* __KERNEL__ */
+#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index a4f6d3f..ece2235 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -42,6 +42,7 @@
 
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/list_nulls.h>
 #include <linux/timer.h>
 #include <linux/cache.h>
 #include <linux/module.h>
@@ -52,6 +53,7 @@
 #include <linux/security.h>
 
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 
 #include <asm/atomic.h>
 #include <net/dst.h>
@@ -106,6 +108,7 @@ struct net;
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_node: main hash linkage for various protocol lookup tables
+ *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_refcnt: reference count
  *	@skc_hash: hash value used with various protocol lookup tables
@@ -120,7 +123,10 @@ struct sock_common {
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse;
 	int			skc_bound_dev_if;
-	struct hlist_node	skc_node;
+	union {
+		struct hlist_node	skc_node;
+		struct hlist_nulls_node skc_nulls_node;
+	};
 	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
 	unsigned int		skc_hash;
@@ -206,6 +212,7 @@ struct sock {
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_node			__sk_common.skc_node
+#define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_hash			__sk_common.skc_hash
@@ -296,12 +303,28 @@ static inline struct sock *sk_head(const struct hlist_head *head)
 	return hlist_empty(head) ? NULL : __sk_head(head);
 }
 
+static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
+{
+	return hlist_nulls_entry(head->first, struct sock, sk_nulls_node);
+}
+
+static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
+{
+	return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head);
+}
+
 static inline struct sock *sk_next(const struct sock *sk)
 {
 	return sk->sk_node.next ?
 		hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
 }
 
+static inline struct sock *sk_nulls_next(const struct sock *sk)
+{
+	return (!is_a_nulls(sk->sk_nulls_node.next)) ?
+		hlist_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL;
+}
+
 static inline int sk_unhashed(const struct sock *sk)
 {
 	return hlist_unhashed(&sk->sk_node);
@@ -363,18 +386,18 @@ static __inline__ int sk_del_node_init(struct sock *sk)
 	return rc;
 }
 
-static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
+static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
-		hlist_del_init_rcu(&sk->sk_node);
+		hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
 		return 1;
 	}
 	return 0;
 }
 
-static __inline__ int sk_del_node_init_rcu(struct sock *sk)
+static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk)
 {
-	int rc = __sk_del_node_init_rcu(sk);
+	int rc = __sk_nulls_del_node_init_rcu(sk);
 
 	if (rc) {
 		/* paranoid for a while -acme */
@@ -395,15 +418,15 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
 	__sk_add_node(sk, list);
 }
 
-static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
-	hlist_add_head_rcu(&sk->sk_node, list);
+	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
 }
 
-static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
 	sock_hold(sk);
-	__sk_add_node_rcu(sk, list);
+	__sk_nulls_add_node_rcu(sk, list);
 }
 
 static __inline__ void __sk_del_bind_node(struct sock *sk)
@@ -419,11 +442,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
-#define sk_for_each_rcu_safenext(__sk, node, list, next) \
-	hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
+#define sk_nulls_for_each(__sk, node, list) \
+	hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
+#define sk_nulls_for_each_rcu(__sk, node, list) \
+	hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
+#define sk_nulls_for_each_from(__sk, node) \
+	if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
+		hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
 #define sk_for_each_continue(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_continue(__sk, node, sk_node)
diff --git a/include/net/udp.h b/include/net/udp.h
index df2bfe5..90e6ce5 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -51,7 +51,7 @@ struct udp_skb_cb {
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
 struct udp_hslot {
-	struct hlist_head	head;
+	struct hlist_nulls_head	head;
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));
 struct udp_table {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1789b35..0f7ed53 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 						 const struct sock *sk2))
 {
 	struct sock *sk2;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 
-	sk_for_each(sk2, node, &hslot->head)
+	sk_nulls_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
 		    sk2->sk_hash == num				&&
@@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		/*
-		 * We need that previous write to sk->sk_hash committed
-		 * before write to sk->next done in following add_node() variant
-		 */
-		smp_wmb();
-		sk_add_node_rcu(sk, &hslot->head);
+		sk_nulls_add_node_rcu(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	error = 0;
@@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -271,7 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain
@@ -285,6 +280,13 @@ begin:
 			badness = score;
 		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 */
+	if (get_nulls_value(node) != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -325,11 +327,11 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
 					     __be16 rmt_port, __be32 rmt_addr,
 					     int dif)
 {
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct sock *s = sk;
 	unsigned short hnum = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (s->sk_hash != hnum					||
@@ -976,7 +978,7 @@ void udp_lib_unhash(struct sock *sk)
 	struct udp_hslot *hslot = &udptable->hash[hash];
 
 	spin_lock_bh(&hslot->lock);
-	if (sk_del_node_init_rcu(sk)) {
+	if (sk_nulls_del_node_init_rcu(sk)) {
 		inet_sk(sk)->num = 0;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
@@ -1129,7 +1131,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	int dif;
 
 	spin_lock(&hslot->lock);
-	sk = sk_head(&hslot->head);
+	sk = sk_nulls_head(&hslot->head);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (sk) {
@@ -1138,7 +1140,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		do {
 			struct sk_buff *skb1 = skb;
 
-			sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
+			sknext = udp_v4_mcast_next(sk_nulls_next(sk), uh->dest, daddr,
 						   uh->source, saddr, dif);
 			if (sknext)
 				skb1 = skb_clone(skb, GFP_ATOMIC);
@@ -1558,10 +1560,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 	struct net *net = seq_file_net(seq);
 
 	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
 		spin_lock_bh(&hslot->lock);
-		sk_for_each(sk, node, &hslot->head) {
+		sk_nulls_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
@@ -1580,7 +1582,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	struct net *net = seq_file_net(seq);
 
 	do {
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
@@ -1751,7 +1753,7 @@ void __init udp_table_init(struct udp_table *table)
 	int i;
 
 	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
-		INIT_HLIST_HEAD(&table->hash[i].head);
+		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
 		spin_lock_init(&table->hash[i].lock);
 	}
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 32d914d..581fcc1 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -108,19 +108,27 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain
 		 */
 		if (udp_hashfn(net, sk->sk_hash) != hash)
 			goto begin;
-		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
+		score = compute_score(sk, net, hnum, saddr, sport,
+				      daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 */
+	if (get_nulls_value(node) != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -360,11 +368,11 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
 				      __be16 rmt_port, struct in6_addr *rmt_addr,
 				      int dif)
 {
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct sock *s = sk;
 	unsigned short num = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (sock_net(s) != sock_net(sk))
@@ -409,7 +417,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	int dif;
 
 	spin_lock(&hslot->lock);
-	sk = sk_head(&hslot->head);
+	sk = sk_nulls_head(&hslot->head);
 	dif = inet6_iif(skb);
 	sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (!sk) {
@@ -418,7 +426,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	}
 
 	sk2 = sk;
-	while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr,
+	while ((sk2 = udp_v6_mcast_next(sk_nulls_next(sk2), uh->dest, daddr,
 					uh->source, saddr, dif))) {
 		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
 		if (buff) {

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
@ 2008-10-30 15:51                                                       ` Stephen Hemminger
  2008-10-30 16:28                                                         ` Corey Minyard
  2008-10-30 17:12                                                         ` Eric Dumazet
  2008-10-30 16:01                                                       ` Peter Zijlstra
                                                                         ` (5 subsequent siblings)
  6 siblings, 2 replies; 134+ messages in thread
From: Stephen Hemminger @ 2008-10-30 15:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, Corey Minyard, David Miller, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

On Thu, 30 Oct 2008 16:40:01 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Eric Dumazet wrote:
> > Paul E. McKenney wrote:
> >> On Wed, Oct 29, 2008 at 09:00:13PM +0100, Eric Dumazet wrote:
> >>> Hum... Another way of handling all those cases and avoid memory barriers
> >>> would be to have different "NULL" pointers.
> >>>
> >>> Each hash chain should have a unique "NULL" pointer (in the case of 
> >>> UDP, it
> >>> can be the 128 values : [ (void*)0 .. (void *)127 ]
> >>>
> >>> Then, when performing a lookup, a reader should check the "NULL" pointer
> >>> it gets at the end of its lookup is the "hash" value of its chain.
> >>>
> >>> If not -> restart the loop, aka "goto begin;" :)
> >>>
> >>> We could avoid memory barriers then.
> >>>
> >>> In the two cases Corey mentioned, this trick could let us avoid 
> >>> memory barriers.
> >>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
> >>>
> >>> What do you think ?
> >>
> >> Kinky!!!  ;-)
> >>
> >> Then the rcu_dereference() would be supplying the needed memory barriers.
> >>
> >> Hmmm...  I guess that the only confusion would be if the element got
> >> removed and then added to the same list.  But then if its pointer was
> >> pseudo-NULL, then that would mean that all subsequent elements had been
> >> removed, and all preceding ones added after the scan started.
> >>
> >> Which might well be harmless, but I must defer to you on this one at
> >> the moment.
> >>
> >> If you need a larger hash table, another approach would be to set the
> >> pointer's low-order bit, allowing the upper bits to be a full-sized
> >> index -- or even a pointer to the list header.  Just make very sure
> >> to clear the pointer when freeing, or an element on the freelist
> >> could end up looking like a legitimate end of list...  Which again
> >> might well be safe, but why inflict this on oneself?
> > 
> 
> Ok, here is an updated and tested patch.
> 
> Thanks everybody
> 
> [PATCH] udp: Introduce special NULL pointers for hlist termination
> 
> In order to safely detect changes in chains, we would like to have different
> 'NULL' pointers. Each chain in the hash table is terminated by a unique 'NULL'
> value, so that lockless readers can detect that their lookups strayed from
> their starting chain.
> 
> We introduce a new type of hlist implementation, named hlist_nulls, where
> we use the least significant bit of the 'ptr' to tell whether it is a "NULL" value
> or a pointer to an object. We expect to use this new hlist variant for TCP
> as well.
> 
> For UDP/UDP-Lite hash table, we use 128 different "NULL" values,
> (UDP_HTABLE_SIZE=128)
> 
> Using hlist_nulls saves memory barriers (a read barrier to fetch 'next'
> pointers *before* checking key values) we added in commit 
> 96631ed16c514cf8b28fab991a076985ce378c26
> (udp: introduce sk_for_each_rcu_safenext())
> 
> This also saves a write memory barrier in udp_lib_get_port(), between
> the sk->sk_hash update and the sk->next update.
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---

IMHO this goes over the edge into a tricky hack. Is it really worth it?
Is there a better, simpler way?

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
  2008-10-30 15:51                                                       ` Stephen Hemminger
@ 2008-10-30 16:01                                                       ` Peter Zijlstra
  2008-10-31  0:14                                                       ` Keith Owens
                                                                         ` (4 subsequent siblings)
  6 siblings, 0 replies; 134+ messages in thread
From: Peter Zijlstra @ 2008-10-30 16:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, Corey Minyard, David Miller, shemminger, benny+usenet,
	netdev, Christoph Lameter, johnpol, Christian Bell

On Thu, 2008-10-30 at 16:40 +0100, Eric Dumazet wrote:
> 
> [PATCH] udp: Introduce special NULL pointers for hlist termination
> 
> In order to safely detect changes in chains, we would like to have different
> 'NULL' pointers. Each chain in the hash table is terminated by a unique 'NULL'
> value, so that lockless readers can detect that their lookups strayed from
> their starting chain.
> 
> We introduce a new type of hlist implementation, named hlist_nulls, where
> we use the least significant bit of the 'ptr' to tell whether it is a "NULL" value
> or a pointer to an object. We expect to use this new hlist variant for TCP
> as well.
> 
> For UDP/UDP-Lite hash table, we use 128 different "NULL" values,
> (UDP_HTABLE_SIZE=128)
> 
> Using hlist_nulls saves memory barriers (a read barrier to fetch 'next'
> pointers *before* checking key values) we added in commit 
> 96631ed16c514cf8b28fab991a076985ce378c26
> (udp: introduce sk_for_each_rcu_safenext())
> 
> This also saves a write memory barrier in udp_lib_get_port(), between
> the sk->sk_hash update and the sk->next update.
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
>  include/linux/list_nulls.h    |   97 ++++++++++++++++++++++++++++++++
>  include/linux/rculist.h       |   17 -----
>  include/linux/rculist_nulls.h |   55 ++++++++++++++++++
>  include/net/sock.h            |   50 ++++++++++++----
>  include/net/udp.h             |    2
>  net/ipv4/udp.c                |   40 ++++++-------
>  net/ipv6/udp.c                |   22 ++++---
>  7 files changed, 228 insertions(+), 55 deletions(-)

If we're going to do this, it'd be good to have the list_nulls stuff in
its own patch, as it is clearly not UDP specific.

Also, I think it would be very good to have some extensive comments in
the list_nulls files describing their use in clear and concise language,
because the above changelog doesn't even begin to explain things for
those not following this thread.
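
For readers not following the whole thread, the encoding the patch relies on
amounts to roughly the following (a sketch only; the actual list_nulls.h
helpers may differ in detail):

/*
 * A "nulls" end-of-chain marker is an odd value: (value << 1) | 1.
 * Real nodes are at least 2-byte aligned, so bit 0 is never set on a
 * genuine pointer and can be used to tag the marker.
 */
#define NULLS_MARKER(value) \
	((struct hlist_nulls_node *)(1UL | (((unsigned long)(value)) << 1)))

#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
	((ptr)->first = NULLS_MARKER(nulls))

static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
	return ((unsigned long)ptr & 1);	/* odd => end-of-chain marker */
}

static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
	return ((unsigned long)ptr) >> 1;	/* recover the per-chain value */
}

With that, INIT_HLIST_NULLS_HEAD(&hslot->head, i) leaves chain i terminated by
the marker for value i, which is exactly what get_nulls_value() hands back at
the end of a lookup.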




^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-30 15:51                                                       ` Stephen Hemminger
@ 2008-10-30 16:28                                                         ` Corey Minyard
  2008-10-31 14:37                                                           ` Eric Dumazet
  2008-10-30 17:12                                                         ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: Corey Minyard @ 2008-10-30 16:28 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, paulmck, David Miller, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Stephen Hemminger wrote:
> On Thu, 30 Oct 2008 16:40:01 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>   
>> Eric Dumazet wrote:
>>     
>>> Paul E. McKenney wrote:
>>>       
>>>> On Wed, Oct 29, 2008 at 09:00:13PM +0100, Eric Dumazet wrote:
>>>>         
>>>>> Hum... Another way of handling all those cases and avoid memory barriers
>>>>> would be to have different "NULL" pointers.
>>>>>
>>>>> Each hash chain should have a unique "NULL" pointer (in the case of 
>>>>> UDP, it
>>>>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>>>>>
>>>>> Then, when performing a lookup, a reader should check the "NULL" pointer
>>>>> it gets at the end of its lookup is the "hash" value of its chain.
>>>>>
>>>>> If not -> restart the loop, aka "goto begin;" :)
>>>>>
>>>>> We could avoid memory barriers then.
>>>>>
>>>>> In the two cases Corey mentioned, this trick could let us avoid 
>>>>> memory barriers.
>>>>> (existing one in sk_add_node_rcu(sk, &hslot->head); should be enough)
>>>>>
>>>>> What do you think ?
>>>>>           
>>>> Kinky!!!  ;-)
>>>>
>>>> Then the rcu_dereference() would be supplying the needed memory barriers.
>>>>
>>>> Hmmm...  I guess that the only confusion would be if the element got
>>>> removed and then added to the same list.  But then if its pointer was
>>>> pseudo-NULL, then that would mean that all subsequent elements had been
>>>> removed, and all preceding ones added after the scan started.
>>>>
>>>> Which might well be harmless, but I must defer to you on this one at
>>>> the moment.
>>>>
>>>> If you need a larger hash table, another approach would be to set the
>>>> pointer's low-order bit, allowing the upper bits to be a full-sized
>>>> index -- or even a pointer to the list header.  Just make very sure
>>>> to clear the pointer when freeing, or an element on the freelist
>>>> could end up looking like a legitimate end of list...  Which again
>>>> might well be safe, but why inflict this on oneself?
>>>>         
>> Ok, here is an updated and tested patch.
>>
>> Thanks everybody
>>
>> [PATCH] udp: Introduce special NULL pointers for hlist termination
>>
>> In order to safely detect changes in chains, we would like to have different
>> 'NULL' pointers. Each chain in the hash table is terminated by a unique 'NULL'
>> value, so that lockless readers can detect that their lookups strayed from
>> their starting chain.
>>
>> We introduce a new type of hlist implementation, named hlist_nulls, where
>> we use the least significant bit of the 'ptr' to tell whether it is a "NULL" value
>> or a pointer to an object. We expect to use this new hlist variant for TCP
>> as well.
>>
>> For UDP/UDP-Lite hash table, we use 128 different "NULL" values,
>> (UDP_HTABLE_SIZE=128)
>>
>> Using hlist_nulls saves memory barriers (a read barrier to fetch 'next'
>> pointers *before* checking key values) we added in commit 
>> 96631ed16c514cf8b28fab991a076985ce378c26
>> (udp: introduce sk_for_each_rcu_safenext())
>>
>> This also saves a write memory barrier in udp_lib_get_port(), between
>> the sk->sk_hash update and the sk->next update.
>>
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>> ---
>>     
>
> IMHO this goes over the edge into a tricky hack. Is it really worth it?
> Is there a better, simpler way?
>   
The only thing I've thought of is to do a single smp_rmb() after the
loop scanning the list and check the sk_hash value again.  That's better
than a read barrier for every list element, but still not as good as
this nulls list from a performance point of view.
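
Roughly, that would replace the current loop in __udp4_lib_lookup() with
something like the fragment below (a sketch of the idea only, not code from
any posted patch; the plain RCU iterator is used here just for illustration):

	hlist_for_each_entry_rcu(sk, node, &hslot->head, sk_node) {
		if (udp_hashfn(net, sk->sk_hash) != hash)
			goto begin;
		score = compute_score(sk, net, saddr, hnum, sport,
				      daddr, dport, dif);
		if (score > badness) {
			result = sk;
			badness = score;
		}
	}
	/*
	 * One read barrier for the whole walk instead of one per element,
	 * then check sk_hash once more: if the candidate was freed and
	 * reused on another chain while we walked, restart the lookup.
	 */
	smp_rmb();
	if (result && udp_hashfn(net, result->sk_hash) != hash)
		goto begin;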

IMHO, this is a tricky hack, but if it is well abstracted and documented
I think it's ok.  I'd guess something like this will be used more often
as we get larger numbers of processors on systems.

It is annoying that it doesn't help the performance for multicast.  
However, I think the current patch will solve the DOS issue for 
multicast, since it switches to a normal spinlock and has a per-list lock.

-corey

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-30 15:51                                                       ` Stephen Hemminger
  2008-10-30 16:28                                                         ` Corey Minyard
@ 2008-10-30 17:12                                                         ` Eric Dumazet
  2008-10-31  7:51                                                           ` David Miller
  1 sibling, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-30 17:12 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, Corey Minyard, David Miller, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Stephen Hemminger wrote:
> On Thu, 30 Oct 2008 16:40:01 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:

>>
>> [PATCH] udp: Introduce special NULL pointers for hlist termination
>>
>> In order to safely detect changes in chains, we would like to have different
>> 'NULL' pointers. Each chain in the hash table is terminated by a unique 'NULL'
>> value, so that lockless readers can detect that their lookups strayed from
>> their starting chain.
>>
>> We introduce a new type of hlist implementation, named hlist_nulls, where
>> we use the least significant bit of the 'ptr' to tell whether it is a "NULL" value
>> or a pointer to an object. We expect to use this new hlist variant for TCP
>> as well.
>>
>> For UDP/UDP-Lite hash table, we use 128 different "NULL" values,
>> (UDP_HTABLE_SIZE=128)
>>
>> Using hlist_nulls saves memory barriers (a read barrier to fetch 'next'
>> pointers *before* checking key values) we added in commit 
>> 96631ed16c514cf8b28fab991a076985ce378c26
>> (udp: introduce sk_for_each_rcu_safenext())
>>
>> This also saves a write memory barrier in udp_lib_get_port(), between
>> the sk->sk_hash update and the sk->next update.
>>
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>> ---
> 
> IMHO this goes over the edge into a tricky hack. Is it really worth it?
> Is there a better, simpler way?

rwlocks, spinlocks, seqlocks :)

More seriously, Stephen: if the infrastructure is clean and well tested on a relatively
simple case (UDP), it can then be deployed on a much more interesting protocol: TCP.

The moment we switch to RCU, we have to accept the pain of really understanding what
we did. Details are scary, yes.
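
For the record, the scary part on the reader side boils down to the sketch
below (an illustration with the reasoning spelled out in comments; the real
code is in the patch quoted above, this only renames the function):

static struct sock *lookup_sketch(struct net *net, __be32 saddr, __be16 sport,
				  __be32 daddr, __be16 dport, int dif,
				  struct udp_table *udptable)
{
	struct sock *sk, *result;
	struct hlist_node *node;
	unsigned short hnum = ntohs(dport);
	unsigned int hash = udp_hashfn(net, hnum);
	struct udp_hslot *hslot = &udptable->hash[hash];
	int score, badness;

	rcu_read_lock();
begin:
	result = NULL;
	badness = -1;
	sk_for_each_rcu_nulls(sk, node, &hslot->head, UDP_HTABLE_SIZE) {
		/* SLAB_DESTROY_BY_RCU: sk may have been freed and reused
		 * on another chain while we walk, so re-check its hash. */
		if (udp_hashfn(net, sk->sk_hash) != hash)
			goto begin;
		score = compute_score(sk, net, saddr, hnum, sport,
				      daddr, dport, dif);
		if (score > badness) {
			result = sk;
			badness = score;
		}
	}
	/* The pseudo-NULL we stopped on names the chain we actually finished
	 * walking; if it is not ours, we drifted through a recycled socket
	 * and must restart. */
	if ((unsigned long)node != hash)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;	/* lost the race with the last sock_put() */
		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
						daddr, dport, dif) < badness)) {
			sock_put(result);	/* recycled under us: retry */
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}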



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30 11:30                                                   ` Eric Dumazet
@ 2008-10-30 18:25                                                     ` Paul E. McKenney
  2008-10-31 16:40                                                       ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Paul E. McKenney @ 2008-10-30 18:25 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, Corey Minyard, David Miller, shemminger,
	benny+usenet, netdev, Christoph Lameter, johnpol, Christian Bell

On Thu, Oct 30, 2008 at 12:30:20PM +0100, Eric Dumazet wrote:
> Peter Zijlstra wrote:
>> On Wed, 2008-10-29 at 21:00 +0100, Eric Dumazet wrote:
>>> Hum... Another way of handling all those cases and avoid memory barriers
>>> would be to have different "NULL" pointers.
>>>
>>> Each hash chain should have a unique "NULL" pointer (in the case of UDP, 
>>> it
>>> can be the 128 values : [ (void*)0 .. (void *)127 ]
>> Why not use the bucket pointer as terminating condition?
>> Because all you really need is a pointer that is specific per bucket,
>> and not a valid element, right?
>
> Yes, but it forces the compiler to keep the bucket pointer around.
>
> Chances are this value will be stored on the stack.
>
> The next patch will use the least significant bit to distinguish a valid
> pointer from a "NULL" pointer.

As you might guess, I do like that idea.  ;-)

I am on my plane trip, so I am reviewing what I believe to be your current
combined patchset, consisting of six patches transmitted in this thread:

Message-ID: <49081D67.3050502@cosmosbay.com>
Message-ID: <49082718.2030201@cosmosbay.com>
Message-ID: <490874F2.2060306@cosmosbay.com>
Message-ID: <4908DEDE.5030706@cosmosbay.com>
Message-ID: <49094B0F.2090208@cosmosbay.com>
Message-ID: <490838C6.4060304@cosmosbay.com>

This probably won't be your latest and greatest by the time you receive
this, but it appears to be the latest and greatest that I have.  ;-)

A few comments, search for blank lines.

							Thanx, Paul

> diff --git a/include/linux/list.h b/include/linux/list.h
> index 969f6e9..a3d5dd1 100644
> --- a/include/linux/list.h
> +++ b/include/linux/list.h
> @@ -654,6 +654,22 @@ static inline void hlist_move_list(struct hlist_head *old,
>  	     pos && ({ prefetch(pos->next); 1;}) &&			 \
>  		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
>  	     pos = pos->next)
> +/**
> + * hlist_for_each_entry_nulls	- iterate over list of given type
> + * @tpos:	the type * to use as a loop cursor.
> + * @pos:	the &struct hlist_node to use as a loop cursor.
> + * @head:	the head for your list.
> + * @member:	the name of the hlist_node within the struct.
> + * @nullval: the iteration should stop if a pointer is < nullval
> + *
> + * Special version of hlist_for_each_entry where the end pointer
> + * can be NULL but also any value < nullval.
> + */
> +#define hlist_for_each_entry_nulls(tpos, pos, head, member, nullval)	 \
> +	for (pos = (head)->first;					 \
> +	     ((unsigned long)pos >= nullval) && ({ prefetch(pos->next); 1;}) && \
> +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
> +	     pos = pos->next)

It might be good to take a predicate macro/function instead of nullval,
in order to allow this primitive to be used for a number of different
pseudo-NULL-pointer schemes, and ditto for the similar macros you define
below.  Might be too early to know exactly how such a primitive should
look, though.

So just a random thought at this point.

In any case, this macro cannot be used in read-side critical sections.

Used by sk_for_each_nulls(), which is called by udp_lib_lport_inuse()
and udp_get_first().

udp_lib_lport_inuse() is called by udp_lib_get_port(), which holds the
hslot->lock, so should be OK.

In udp_get_first(), this same lock is held, so again should be OK.
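
To make the predicate idea above concrete, such a primitive might look
something like this (purely hypothetical, nothing of the sort has been posted
in this thread):

/*
 * Hypothetical variant that takes an "is this the end of the chain?"
 * predicate instead of a numeric nullval, so that different
 * pseudo-NULL encodings could share a single iterator.
 */
#define hlist_for_each_entry_pred(tpos, pos, head, member, chain_end)	  \
	for (pos = (head)->first;					  \
	     !chain_end(pos) && ({ prefetch(pos->next); 1; }) &&	  \
		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
	     pos = pos->next)

/* The current scheme would then simply pass: */
#define udp_chain_end(pos)	((unsigned long)(pos) < UDP_HTABLE_SIZE)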

>  /**
>   * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
> @@ -679,6 +695,22 @@ static inline void hlist_move_list(struct hlist_head *old,
>  	     pos = pos->next)
>  
>  /**
> + * hlist_for_each_entry_from_nulls - iterate over a hlist continuing from current point
> + * @tpos:	the type * to use as a loop cursor.
> + * @pos:	the &struct hlist_node to use as a loop cursor.
> + * @member:	the name of the hlist_node within the struct.
> + * @nullval: the iteration should stop if a pointer is < nullval
> + *
> + * Special version of hlist_for_each_entry_from where the end pointer
> + * can be NULL but also any value < nullval.
> + */
> +#define hlist_for_each_entry_from_nulls(tpos, pos, member, nullval)	\
> +	for (; ((unsigned long)pos >= nullval) && \
> +		({ prefetch(pos->next); 1;})   && \
> +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
> +	     pos = pos->next)

This also must be called with the update-side lock held.  It is
wrapped by sk_for_each_from_nulls(), which in turn is called from
udp_v4_mcast_next() and udp_v6_mcast_next().  According to messages
earlier in this thread, that is the case.

udp_v4_mcast_next() is called from __udp4_lib_mcast_deliver(), which does
hold hslot->lock, so OK.  Ditto for the calls from udp_v6_mcast_next().

Interestingly enough, these two functions use sk_head(), which calls
__sk_head(), neither of which does rcu_dereference().  Which is another reason
that this cannot be called from an RCU read-side critical section.
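
(If one of these call sites ever did need to run under rcu_read_lock(), a
dereferencing variant would be needed, hypothetically along these lines -- it
is not part of any posted patch:)

/*
 * Hypothetical RCU-aware counterpart of sk_head() for the pseudo-NULL
 * scheme: dereference the first pointer and treat any value below
 * nullval as "empty chain".
 */
static inline struct sock *sk_head_rcu_nulls(const struct hlist_head *head,
					     unsigned long nullval)
{
	struct hlist_node *first = rcu_dereference(head->first);

	return (unsigned long)first >= nullval ?
		hlist_entry(first, struct sock, sk_node) : NULL;
}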

> +/**
>   * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
>   * @tpos:	the type * to use as a loop cursor.
>   * @pos:	the &struct hlist_node to use as a loop cursor.
> diff --git a/include/linux/rculist.h b/include/linux/rculist.h
> index e649bd3..6f78e2b 100644
> --- a/include/linux/rculist.h
> +++ b/include/linux/rculist.h
> @@ -383,5 +383,23 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
>  		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
>  		pos = rcu_dereference(pos->next))
>  
> +/**
> + * hlist_for_each_entry_rcu_nulls - iterate over rcu list of given type
> + * @tpos:	the type * to use as a loop cursor.
> + * @pos:	the &struct hlist_node to use as a loop cursor.
> + * @head:	the head for your list.
> + * @member:	the name of the hlist_node within the struct.
> + * @nullval:       the iteration should stop if a pointer is < nullval
> + *
> + * Special version of hlist_for_each_entry_rcu where the end pointer
> + * can be NULL but also any value < nullval.
> + */
> +#define hlist_for_each_entry_rcu_nulls(tpos, pos, head, member, nullval) \
> +	for (pos = rcu_dereference((head)->first);			 \
> +		((unsigned long)pos >= nullval) && 			\
> +		({ prefetch(pos->next); 1; }) &&			\
> +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
> +		pos = rcu_dereference(pos->next))

Looks good.  It should be possible to get rid of the "pos" argument in
the upcoming version, as all of the offsets should be even, so the
bottom bit would come through unchanged.  In fact, it should be possible
in this variant as well:

	((unsigned long)pos - offsetof(typeof(*tpos), member) >= nullval)

Or am I missing something?

>  #endif	/* __KERNEL__ */
>  #endif
> diff --git a/include/net/sock.h b/include/net/sock.h
> index ada50c0..6e8545b 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -361,6 +361,27 @@ static __inline__ int sk_del_node_init(struct sock *sk)
>  	return rc;
>  }
>  
> +static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
> +{
> +	if (sk_hashed(sk)) {
> +		hlist_del_init_rcu(&sk->sk_node);

Won't your pseudo-NULL pointers mess up __hlist_del(), which is called
from hlist_del_init_rcu(), and which expects real NULL pointers?

Seems like you need _nulls variants of these list primitives as well.
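
A nulls-aware low-level delete would presumably skip the pprev fix-up when
->next is an end-of-chain marker rather than a real node, along these lines
(a sketch consistent with the hlist_nulls helpers shown earlier in this
archive; __hlist_nulls_del() itself is not quoted there):

static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
	struct hlist_nulls_node *next = n->next;
	struct hlist_nulls_node **pprev = n->pprev;

	*pprev = next;
	if (!is_a_nulls(next))		/* a nulls marker has no pprev to fix up */
		next->pprev = pprev;
}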

> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +static __inline__ int sk_del_node_init_rcu(struct sock *sk)
> +{
> +	int rc = __sk_del_node_init_rcu(sk);
> +
> +	if (rc) {
> +		/* paranoid for a while -acme */
> +		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
> +		__sock_put(sk);
> +	}
> +	return rc;
> +}
> +
>  static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list)
>  {
>  	hlist_add_head(&sk->sk_node, list);
> @@ -372,6 +393,17 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
>  	__sk_add_node(sk, list);
>  }
>  
> +static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
> +{
> +	hlist_add_head_rcu(&sk->sk_node, list);

Same here -- hlist_add_head_rcu() expects real NULL pointers.

Or are these different lists?  I believe that these are the same lists,
given that the ->sk_node field is used in both cases.

So I believe that you need parallel _nulls primitives for this one also.

> +}
> +
> +static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
> +{
> +	sock_hold(sk);
> +	__sk_add_node_rcu(sk, list);
> +}
> +
>  static __inline__ void __sk_del_bind_node(struct sock *sk)
>  {
>  	__hlist_del(&sk->sk_bind_node);
> @@ -385,9 +417,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
>  
>  #define sk_for_each(__sk, node, list) \
>  	hlist_for_each_entry(__sk, node, list, sk_node)
> +#define sk_for_each_nulls(__sk, node, list, nullval) \
> +	hlist_for_each_entry_nulls(__sk, node, list, sk_node, nullval)
> +#define sk_for_each_rcu_nulls(__sk, node, list, nullval) \
> +	hlist_for_each_entry_rcu_nulls(__sk, node, list, sk_node, nullval)
>  #define sk_for_each_from(__sk, node) \
>  	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
>  		hlist_for_each_entry_from(__sk, node, sk_node)
> +#define sk_for_each_from_nulls(__sk, node, nullval) \
> +	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
> +		hlist_for_each_entry_from_nulls(__sk, node, sk_node, nullval)
>  #define sk_for_each_continue(__sk, node) \
>  	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
>  		hlist_for_each_entry_continue(__sk, node, sk_node)
> @@ -587,8 +626,9 @@ struct proto {
>  	int			*sysctl_rmem;
>  	int			max_header;
>  
> -	struct kmem_cache		*slab;
> +	struct kmem_cache	*slab;
>  	unsigned int		obj_size;
> +	int			slab_flags;
>  
>  	atomic_t		*orphan_count;
>  
> @@ -597,7 +637,7 @@ struct proto {
>  
>  	union {
>  		struct inet_hashinfo	*hashinfo;
> -		struct hlist_head	*udp_hash;
> +		struct udp_table	*udp_table;
>  		struct raw_hashinfo	*raw_hash;
>  	} h;
>  
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 1e20509..df2bfe5 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -50,8 +50,15 @@ struct udp_skb_cb {
>  };
>  #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
>  
> -extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
> -extern rwlock_t udp_hash_lock;
> +struct udp_hslot {
> +	struct hlist_head	head;
> +	spinlock_t		lock;
> +} __attribute__((aligned(2 * sizeof(long))));
> +struct udp_table {
> +	struct udp_hslot	hash[UDP_HTABLE_SIZE];
> +};
> +extern struct udp_table udp_table;
> +extern void udp_table_init(struct udp_table *);
>  
>  
>  /* Note: this must match 'valbool' in sock_setsockopt */
> @@ -110,15 +117,7 @@ static inline void udp_lib_hash(struct sock *sk)
>  	BUG();
>  }
>  
> -static inline void udp_lib_unhash(struct sock *sk)
> -{
> -	write_lock_bh(&udp_hash_lock);
> -	if (sk_del_node_init(sk)) {
> -		inet_sk(sk)->num = 0;
> -		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> -	}
> -	write_unlock_bh(&udp_hash_lock);
> -}
> +extern void udp_lib_unhash(struct sock *sk);
>  
>  static inline void udp_lib_close(struct sock *sk, long timeout)
>  {
> @@ -187,7 +186,7 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
>  struct udp_seq_afinfo {
>  	char			*name;
>  	sa_family_t		family;
> -	struct hlist_head	*hashtable;
> +	struct udp_table	*udp_table;
>  	struct file_operations	seq_fops;
>  	struct seq_operations	seq_ops;
>  };
> @@ -196,7 +195,7 @@ struct udp_iter_state {
>  	struct seq_net_private  p;
>  	sa_family_t		family;
>  	int			bucket;
> -	struct hlist_head	*hashtable;
> +	struct udp_table	*udp_table;
>  };
>  
>  #ifdef CONFIG_PROC_FS
> diff --git a/include/net/udplite.h b/include/net/udplite.h
> index b76b2e3..afdffe6 100644
> --- a/include/net/udplite.h
> +++ b/include/net/udplite.h
> @@ -11,7 +11,7 @@
>  #define UDPLITE_RECV_CSCOV   11 /* receiver partial coverage (threshold ) */
>  
>  extern struct proto 		udplite_prot;
> -extern struct hlist_head 	udplite_hash[UDP_HTABLE_SIZE];
> +extern struct udp_table		udplite_table;
>  
>  /*
>   *	Checksum computation is all in software, hence simpler getfrag.
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 5e2a313..ded1eb5 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2042,7 +2042,8 @@ int proto_register(struct proto *prot, int alloc_slab)
>  
>  	if (alloc_slab) {
>  		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
> -					       SLAB_HWCACHE_ALIGN, NULL);
> +					SLAB_HWCACHE_ALIGN | prot->slab_flags,
> +					NULL);
>  
>  		if (prot->slab == NULL) {
>  			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 2095abc..915d92d 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -104,12 +104,8 @@
>  #include <net/xfrm.h>
>  #include "udp_impl.h"
>  
> -/*
> - *	Snmp MIB for the UDP layer
> - */
> -
> -struct hlist_head udp_hash[UDP_HTABLE_SIZE];
> -DEFINE_RWLOCK(udp_hash_lock);
> +struct udp_table udp_table;
> +EXPORT_SYMBOL(udp_table);
>  
>  int sysctl_udp_mem[3] __read_mostly;
>  int sysctl_udp_rmem_min __read_mostly;
> @@ -123,7 +119,7 @@ atomic_t udp_memory_allocated;
>  EXPORT_SYMBOL(udp_memory_allocated);
>  
>  static int udp_lib_lport_inuse(struct net *net, __u16 num,
> -			       const struct hlist_head udptable[],
> +			       const struct udp_hslot *hslot,
>  			       struct sock *sk,
>  			       int (*saddr_comp)(const struct sock *sk1,
>  						 const struct sock *sk2))
> @@ -131,7 +127,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
>  	struct sock *sk2;
>  	struct hlist_node *node;
>  
> -	sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)])
> +	sk_for_each_nulls(sk2, node, &hslot->head, UDP_HTABLE_SIZE)
>  		if (net_eq(sock_net(sk2), net)			&&
>  		    sk2 != sk					&&
>  		    sk2->sk_hash == num				&&
> @@ -154,12 +150,11 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
>  		       int (*saddr_comp)(const struct sock *sk1,
>  					 const struct sock *sk2 )    )
>  {
> -	struct hlist_head *udptable = sk->sk_prot->h.udp_hash;
> +	struct udp_hslot *hslot;
> +	struct udp_table *udptable = sk->sk_prot->h.udp_table;
>  	int    error = 1;
>  	struct net *net = sock_net(sk);
>  
> -	write_lock_bh(&udp_hash_lock);
> -
>  	if (!snum) {
>  		int low, high, remaining;
>  		unsigned rand;
> @@ -171,26 +166,39 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,

inet_get_local_port_range() used to be under the write_lock_bh(),
and no longer is.  So, how do we now protect against concurrent port
range changes?

>  		rand = net_random();
>  		snum = first = rand % remaining + low;
>  		rand |= 1;
> -		while (udp_lib_lport_inuse(net, snum, udptable, sk,
> -					   saddr_comp)) {
> +		for (;;) {
> +			hslot = &udptable->hash[udp_hashfn(net, snum)];
> +			spin_lock_bh(&hslot->lock);
> +			if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
> +				break;
> +			spin_unlock_bh(&hslot->lock);
>  			do {
>  				snum = snum + rand;
>  			} while (snum < low || snum > high);

The above -really- confuses me, but that is not part of this patch.  If we are
out of range, keep going?  Well, I guess that since it is a short, we
cannot go very far...

>  			if (snum == first)
>  				goto fail;

And I don't understand how we are guaranteed to have scanned all the
possible ports upon failure, but I am happy to leave that to you guys.

>  		}
> -	} else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp))
> -		goto fail;
> -
> +	} else {
> +		hslot = &udptable->hash[udp_hashfn(net, snum)];
> +		spin_lock_bh(&hslot->lock);
> +		if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
> +			goto fail_unlock;
> +	}
>  	inet_sk(sk)->num = snum;
>  	sk->sk_hash = snum;
>  	if (sk_unhashed(sk)) {
> -		sk_add_node(sk, &udptable[udp_hashfn(net, snum)]);
> +		/*
> +		 * We need that previous write to sk->sk_hash committed
> +		 * before write to sk->next done in following add_node() variant
> +		 */
> +		smp_wmb();
> +		sk_add_node_rcu(sk, &hslot->head);
>  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  	}
>  	error = 0;
> +fail_unlock:
> +	spin_unlock_bh(&hslot->lock);
>  fail:
> -	write_unlock_bh(&udp_hash_lock);
>  	return error;
>  }
>  
> @@ -208,63 +216,96 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
>  	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
>  }
>  
> +static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
> +			 unsigned short hnum,
> +			 __be16 sport, __be32 daddr, __be16 dport, int dif)
> +{
> +	int score = -1;
> +
> +	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
> +			!ipv6_only_sock(sk)) {
> +		struct inet_sock *inet = inet_sk(sk);
> +
> +		score = (sk->sk_family == PF_INET ? 1 : 0);
> +		if (inet->rcv_saddr) {
> +			if (inet->rcv_saddr != daddr)
> +				return -1;
> +			score += 2;
> +		}
> +		if (inet->daddr) {
> +			if (inet->daddr != saddr)
> +				return -1;
> +			score += 2;
> +		}
> +		if (inet->dport) {
> +			if (inet->dport != sport)
> +				return -1;
> +			score += 2;
> +		}
> +		if (sk->sk_bound_dev_if) {
> +			if (sk->sk_bound_dev_if != dif)
> +				return -1;
> +			score += 2;
> +		}
> +	}
> +	return score;
> +}
> +
>  /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
>   * harder than this. -DaveM
>   */
>  static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
>  		__be16 sport, __be32 daddr, __be16 dport,
> -		int dif, struct hlist_head udptable[])
> +		int dif, struct udp_table *udptable)
>  {
> -	struct sock *sk, *result = NULL;
> +	struct sock *sk, *result;
>  	struct hlist_node *node;
>  	unsigned short hnum = ntohs(dport);
> -	int badness = -1;
> -
> -	read_lock(&udp_hash_lock);
> -	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
> -		struct inet_sock *inet = inet_sk(sk);
> -
> -		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
> -				!ipv6_only_sock(sk)) {
> -			int score = (sk->sk_family == PF_INET ? 1 : 0);
> -			if (inet->rcv_saddr) {
> -				if (inet->rcv_saddr != daddr)
> -					continue;
> -				score+=2;
> -			}
> -			if (inet->daddr) {
> -				if (inet->daddr != saddr)
> -					continue;
> -				score+=2;
> -			}
> -			if (inet->dport) {
> -				if (inet->dport != sport)
> -					continue;
> -				score+=2;
> -			}
> -			if (sk->sk_bound_dev_if) {
> -				if (sk->sk_bound_dev_if != dif)
> -					continue;
> -				score+=2;
> -			}
> -			if (score == 9) {
> -				result = sk;
> -				break;
> -			} else if (score > badness) {
> -				result = sk;
> -				badness = score;
> -			}
> +	unsigned int hash = udp_hashfn(net, hnum);
> +	struct udp_hslot *hslot = &udptable->hash[hash];
> +	int score, badness;
> +
> +	rcu_read_lock();
> +begin:
> +	result = NULL;
> +	badness = -1;
> +	sk_for_each_rcu_nulls(sk, node, &hslot->head, UDP_HTABLE_SIZE) {
> +		/*
> +		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
> +		 * We must check this item was not moved to another chain
> +		 */
> +		if (udp_hashfn(net, sk->sk_hash) != hash)
> +			goto begin;
> +		score = compute_score(sk, net, saddr, hnum, sport,
> +				      daddr, dport, dif);
> +		if (score > badness) {
> +			result = sk;
> +			badness = score;
> + 		}
> +	}
> +	/*
> +	 * if the 'NULL' pointer we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 */
> +	if ((unsigned long)node != hash)
> +		goto begin;
> +
> +	if (result) {
> +		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
> +			result = NULL;
> +		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
> +				  daddr, dport, dif) < badness)) {
> +			sock_put(result);
> +			goto begin;
>  		}
>  	}
> -	if (result)
> -		sock_hold(result);
> -	read_unlock(&udp_hash_lock);
> +	rcu_read_unlock();
>  	return result;
>  }
>  
>  static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
>  						 __be16 sport, __be16 dport,
> -						 struct hlist_head udptable[])
> +						 struct udp_table *udptable)
>  {
>  	struct sock *sk;
>  	const struct iphdr *iph = ip_hdr(skb);
> @@ -280,7 +321,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
>  struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
>  			     __be32 daddr, __be16 dport, int dif)
>  {
> -	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash);
> +	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
>  }
>  EXPORT_SYMBOL_GPL(udp4_lib_lookup);
>  
> @@ -293,7 +334,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
>  	struct sock *s = sk;
>  	unsigned short hnum = ntohs(loc_port);
>  
> -	sk_for_each_from(s, node) {
> +	sk_for_each_from_nulls(s, node, UDP_HTABLE_SIZE) {
>  		struct inet_sock *inet = inet_sk(s);
>  
>  		if (s->sk_hash != hnum					||
> @@ -323,7 +364,7 @@ found:
>   * to find the appropriate port.
>   */
>  
> -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[])
> +void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
>  {
>  	struct inet_sock *inet;
>  	struct iphdr *iph = (struct iphdr*)skb->data;
> @@ -392,7 +433,7 @@ out:
>  
>  void udp_err(struct sk_buff *skb, u32 info)
>  {
> -	__udp4_lib_err(skb, info, udp_hash);
> +	__udp4_lib_err(skb, info, &udp_table);
>  }
>  
>  /*
> @@ -933,6 +974,21 @@ int udp_disconnect(struct sock *sk, int flags)
>  	return 0;
>  }
>  
> +void udp_lib_unhash(struct sock *sk)
> +{
> +	struct udp_table *udptable = sk->sk_prot->h.udp_table;
> +	unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
> +	struct udp_hslot *hslot = &udptable->hash[hash];
> +
> +	spin_lock(&hslot->lock);
> +	if (sk_del_node_init_rcu(sk)) {
> +		inet_sk(sk)->num = 0;
> +		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> +	}
> +	spin_unlock(&hslot->lock);
> +}
> +EXPORT_SYMBOL(udp_lib_unhash);
> +
>  static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>  {
>  	int is_udplite = IS_UDPLITE(sk);
> @@ -1071,13 +1127,14 @@ drop:
>  static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  				    struct udphdr  *uh,
>  				    __be32 saddr, __be32 daddr,
> -				    struct hlist_head udptable[])
> +				    struct udp_table *udptable)
>  {
>  	struct sock *sk;
> +	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
>  	int dif;
>  
> -	read_lock(&udp_hash_lock);
> -	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
> +	spin_lock(&hslot->lock);
> +	sk = sk_head(&hslot->head);
>  	dif = skb->dev->ifindex;
>  	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
>  	if (sk) {
> @@ -1102,7 +1159,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  		} while (sknext);
>  	} else
>  		kfree_skb(skb);
> -	read_unlock(&udp_hash_lock);
> +	spin_unlock(&hslot->lock);
>  	return 0;
>  }
>  
> @@ -1148,7 +1205,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
>   *	All we need to do is get the socket, and then do a checksum.
>   */
>  
> -int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
> +int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
>  		   int proto)
>  {
>  	struct sock *sk;
> @@ -1246,7 +1303,7 @@ drop:
>  
>  int udp_rcv(struct sk_buff *skb)
>  {
> -	return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP);
> +	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
>  }
>  
>  void udp_destroy_sock(struct sock *sk)
> @@ -1488,7 +1545,8 @@ struct proto udp_prot = {
>  	.sysctl_wmem	   = &sysctl_udp_wmem_min,
>  	.sysctl_rmem	   = &sysctl_udp_rmem_min,
>  	.obj_size	   = sizeof(struct udp_sock),
> -	.h.udp_hash	   = udp_hash,
> +	.slab_flags	   = SLAB_DESTROY_BY_RCU,
> +	.h.udp_table	   = &udp_table,
>  #ifdef CONFIG_COMPAT
>  	.compat_setsockopt = compat_udp_setsockopt,
>  	.compat_getsockopt = compat_udp_getsockopt,
> @@ -1498,20 +1556,23 @@ struct proto udp_prot = {
>  /* ------------------------------------------------------------------------ */
>  #ifdef CONFIG_PROC_FS
>  
> -static struct sock *udp_get_first(struct seq_file *seq)
> +static struct sock *udp_get_first(struct seq_file *seq, int start)
>  {
>  	struct sock *sk;
>  	struct udp_iter_state *state = seq->private;
>  	struct net *net = seq_file_net(seq);
>  
> -	for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
> +	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
>  		struct hlist_node *node;
> -		sk_for_each(sk, node, state->hashtable + state->bucket) {
> +		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
> +		spin_lock_bh(&hslot->lock);
> +		sk_for_each_nulls(sk, node, &hslot->head, UDP_HTABLE_SIZE) {
>  			if (!net_eq(sock_net(sk), net))
>  				continue;
>  			if (sk->sk_family == state->family)
>  				goto found;
>  		}
> +		spin_unlock_bh(&hslot->lock);
>  	}
>  	sk = NULL;
>  found:
> @@ -1525,20 +1586,18 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
>  
>  	do {
>  		sk = sk_next(sk);
> -try_again:
> -		;
>  	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
>  
> -	if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
> -		sk = sk_head(state->hashtable + state->bucket);
> -		goto try_again;
> +	if (!sk) {
> +		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
> +		return udp_get_first(seq, state->bucket + 1);
>  	}
>  	return sk;
>  }
>  
>  static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
>  {
> -	struct sock *sk = udp_get_first(seq);
> +	struct sock *sk = udp_get_first(seq, 0);
>  
>  	if (sk)
>  		while (pos && (sk = udp_get_next(seq, sk)) != NULL)
> @@ -1547,9 +1606,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
>  }
>  
>  static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
> -	__acquires(udp_hash_lock)
>  {
> -	read_lock(&udp_hash_lock);
>  	return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
>  }
>  
> @@ -1567,9 +1624,11 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>  }
>  
>  static void udp_seq_stop(struct seq_file *seq, void *v)
> -	__releases(udp_hash_lock)
>  {
> -	read_unlock(&udp_hash_lock);
> +	struct udp_iter_state *state = seq->private;
> +
> +	if (state->bucket < UDP_HTABLE_SIZE)
> +		spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
>  }
>  
>  static int udp_seq_open(struct inode *inode, struct file *file)
> @@ -1585,7 +1644,7 @@ static int udp_seq_open(struct inode *inode, struct file *file)
>  
>  	s = ((struct seq_file *)file->private_data)->private;
>  	s->family		= afinfo->family;
> -	s->hashtable		= afinfo->hashtable;
> +	s->udp_table		= afinfo->udp_table;
>  	return err;
>  }
>  
> @@ -1657,7 +1716,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
>  static struct udp_seq_afinfo udp4_seq_afinfo = {
>  	.name		= "udp",
>  	.family		= AF_INET,
> -	.hashtable	= udp_hash,
> +	.udp_table	= &udp_table,
>  	.seq_fops	= {
>  		.owner	=	THIS_MODULE,
>  	},
> @@ -1692,10 +1751,21 @@ void udp4_proc_exit(void)
>  }
>  #endif /* CONFIG_PROC_FS */
>  
> +void __init udp_table_init(struct udp_table *table)
> +{
> +	int i;
> +	
> +	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
> +		table->hash[i].head.first = (struct hlist_node *)i;
> +		spin_lock_init(&table->hash[i].lock);
> +	}
> +}
> +
>  void __init udp_init(void)
>  {
>  	unsigned long limit;
>  
> +	udp_table_init(&udp_table);
>  	/* Set the pressure threshold up by the same strategy of TCP. It is a
>  	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
>  	 * toward zero with the amount of memory, with a floor of 128 pages.
> @@ -1712,8 +1782,6 @@ void __init udp_init(void)
>  }
>  
>  EXPORT_SYMBOL(udp_disconnect);
> -EXPORT_SYMBOL(udp_hash);
> -EXPORT_SYMBOL(udp_hash_lock);
>  EXPORT_SYMBOL(udp_ioctl);
>  EXPORT_SYMBOL(udp_prot);
>  EXPORT_SYMBOL(udp_sendmsg);
> diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
> index 2e9bad2..9f4a616 100644
> --- a/net/ipv4/udp_impl.h
> +++ b/net/ipv4/udp_impl.h
> @@ -5,8 +5,8 @@
>  #include <net/protocol.h>
>  #include <net/inet_common.h>
>  
> -extern int  	__udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int );
> -extern void 	__udp4_lib_err(struct sk_buff *, u32, struct hlist_head []);
> +extern int  	__udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
> +extern void 	__udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
>  
>  extern int	udp_v4_get_port(struct sock *sk, unsigned short snum);
>  
> diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
> index 3c80796..c784891 100644
> --- a/net/ipv4/udplite.c
> +++ b/net/ipv4/udplite.c
> @@ -12,16 +12,17 @@
>   */
>  #include "udp_impl.h"
>  
> -struct hlist_head 	udplite_hash[UDP_HTABLE_SIZE];
> +struct udp_table 	udplite_table;
> +EXPORT_SYMBOL(udplite_table);
>  
>  static int udplite_rcv(struct sk_buff *skb)
>  {
> -	return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
> +	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
>  }
>  
>  static void udplite_err(struct sk_buff *skb, u32 info)
>  {
> -	__udp4_lib_err(skb, info, udplite_hash);
> +	__udp4_lib_err(skb, info, &udplite_table);
>  }
>  
>  static	struct net_protocol udplite_protocol = {
> @@ -50,7 +51,8 @@ struct proto 	udplite_prot = {
>  	.unhash		   = udp_lib_unhash,
>  	.get_port	   = udp_v4_get_port,
>  	.obj_size	   = sizeof(struct udp_sock),
> -	.h.udp_hash	   = udplite_hash,
> +	.slab_flags	   = SLAB_DESTROY_BY_RCU,
> +	.h.udp_table	   = &udplite_table,
>  #ifdef CONFIG_COMPAT
>  	.compat_setsockopt = compat_udp_setsockopt,
>  	.compat_getsockopt = compat_udp_getsockopt,
> @@ -71,7 +73,7 @@ static struct inet_protosw udplite4_protosw = {
>  static struct udp_seq_afinfo udplite4_seq_afinfo = {
>  	.name		= "udplite",
>  	.family		= AF_INET,
> -	.hashtable	= udplite_hash,
> +	.udp_table 	= &udplite_table,
>  	.seq_fops	= {
>  		.owner	=	THIS_MODULE,
>  	},
> @@ -108,6 +110,7 @@ static inline int udplite4_proc_init(void)
>  
>  void __init udplite4_register(void)
>  {
> +	udp_table_init(&udplite_table);
>  	if (proto_register(&udplite_prot, 1))
>  		goto out_register_err;
>  
> @@ -126,5 +129,4 @@ out_register_err:
>  	printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
>  }
>  
> -EXPORT_SYMBOL(udplite_hash);
>  EXPORT_SYMBOL(udplite_prot);
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index e51da8c..8c3671d 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -54,62 +54,97 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
>  	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal);
>  }
>  
> +static inline int compute_score(struct sock *sk, struct net *net,
> +				unsigned short hnum,
> +				struct in6_addr *saddr, __be16 sport,
> +				struct in6_addr *daddr, __be16 dport,
> +				int dif)
> +{
> +	int score = -1;
> +
> +	if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
> +			sk->sk_family == PF_INET6) {
> +		struct ipv6_pinfo *np = inet6_sk(sk);
> +		struct inet_sock *inet = inet_sk(sk);
> +
> +		score = 0;
> +		if (inet->dport) {
> +			if (inet->dport != sport)
> +				return -1;
> +			score++;
> +		}
> +		if (!ipv6_addr_any(&np->rcv_saddr)) {
> +			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
> +				return -1;
> +			score++;
> +		}
> +		if (!ipv6_addr_any(&np->daddr)) {
> +			if (!ipv6_addr_equal(&np->daddr, saddr))
> +				return -1;
> +			score++;
> +		}
> +		if (sk->sk_bound_dev_if) {
> +			if (sk->sk_bound_dev_if != dif)
> +				return -1;
> +			score++;
> +		}
> +	}
> +	return score;
> +}
> +
>  static struct sock *__udp6_lib_lookup(struct net *net,
>  				      struct in6_addr *saddr, __be16 sport,
>  				      struct in6_addr *daddr, __be16 dport,
> -				      int dif, struct hlist_head udptable[])
> +				      int dif, struct udp_table *udptable)
>  {
> -	struct sock *sk, *result = NULL;
> +	struct sock *sk, *result;
>  	struct hlist_node *node;
>  	unsigned short hnum = ntohs(dport);
> -	int badness = -1;
> -
> -	read_lock(&udp_hash_lock);
> -	sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
> -		struct inet_sock *inet = inet_sk(sk);
> -
> -		if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
> -				sk->sk_family == PF_INET6) {
> -			struct ipv6_pinfo *np = inet6_sk(sk);
> -			int score = 0;
> -			if (inet->dport) {
> -				if (inet->dport != sport)
> -					continue;
> -				score++;
> -			}
> -			if (!ipv6_addr_any(&np->rcv_saddr)) {
> -				if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
> -					continue;
> -				score++;
> -			}
> -			if (!ipv6_addr_any(&np->daddr)) {
> -				if (!ipv6_addr_equal(&np->daddr, saddr))
> -					continue;
> -				score++;
> -			}
> -			if (sk->sk_bound_dev_if) {
> -				if (sk->sk_bound_dev_if != dif)
> -					continue;
> -				score++;
> -			}
> -			if (score == 4) {
> -				result = sk;
> -				break;
> -			} else if (score > badness) {
> -				result = sk;
> -				badness = score;
> -			}
> +	unsigned int hash = udp_hashfn(net, hnum);
> +	struct udp_hslot *hslot = &udptable->hash[hash];
> +	int score, badness;
> +
> +	rcu_read_lock();
> +begin:
> +	result = NULL;
> +	badness = -1;
> +	sk_for_each_rcu_nulls(sk, node, &hslot->head, UDP_HTABLE_SIZE) {
> +		/*
> +		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
> +		 * We must check this item was not moved to another chain
> +		 */
> +		if (udp_hashfn(net, sk->sk_hash) != hash)
> +			goto begin;
> +		score = compute_score(sk, net, hnum, saddr, sport,
> +				      daddr, dport, dif);
> +		if (score > badness) {
> +			result = sk;
> +			badness = score;
>  		}
>  	}
> -	if (result)
> -		sock_hold(result);
> -	read_unlock(&udp_hash_lock);
> +	/*
> +	 * if the 'NULL' pointer we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 */
> +	if ((unsigned long)node != hash)
> +		goto begin;
> +
> +	if (result) {
> +		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
> +			result = NULL;
> +		else if (unlikely(compute_score(result, net, hnum, saddr, sport,
> +					daddr, dport, dif) < badness)) {
> +			sock_put(result);
> +			goto begin;
> + 		}
> +	}
> +	rcu_read_unlock();
>  	return result;
>  }
>  
>  static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
>  					  __be16 sport, __be16 dport,
> -					  struct hlist_head udptable[])
> +					  struct udp_table *udptable)
>  {
>  	struct sock *sk;
>  	struct ipv6hdr *iph = ipv6_hdr(skb);
> @@ -239,7 +274,7 @@ csum_copy_err:
>  
>  void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
>  		    int type, int code, int offset, __be32 info,
> -		    struct hlist_head udptable[]                    )
> +		    struct udp_table *udptable)
>  {
>  	struct ipv6_pinfo *np;
>  	struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
> @@ -275,7 +310,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
>  				 struct inet6_skb_parm *opt, int type,
>  				 int code, int offset, __be32 info     )
>  {
> -	__udp6_lib_err(skb, opt, type, code, offset, info, udp_hash);
> +	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
>  }
>  
>  int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
> @@ -337,7 +372,7 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
>  	struct sock *s = sk;
>  	unsigned short num = ntohs(loc_port);
>  
> -	sk_for_each_from(s, node) {
> +	sk_for_each_from_nulls(s, node, UDP_HTABLE_SIZE) {
>  		struct inet_sock *inet = inet_sk(s);
>  
>  		if (sock_net(s) != sock_net(sk))
> @@ -374,14 +409,15 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
>   */
>  static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  		struct in6_addr *saddr, struct in6_addr *daddr,
> -		struct hlist_head udptable[])
> +		struct udp_table *udptable)
>  {
>  	struct sock *sk, *sk2;
>  	const struct udphdr *uh = udp_hdr(skb);
> +	struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
>  	int dif;
>  
> -	read_lock(&udp_hash_lock);
> -	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
> +	spin_lock(&hslot->lock);
> +	sk = sk_head(&hslot->head);
>  	dif = inet6_iif(skb);
>  	sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
>  	if (!sk) {
> @@ -409,7 +445,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  		sk_add_backlog(sk, skb);
>  	bh_unlock_sock(sk);
>  out:
> -	read_unlock(&udp_hash_lock);
> +	spin_unlock(&hslot->lock);
>  	return 0;
>  }
>  
> @@ -447,7 +483,7 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh,
>  	return 0;
>  }
>  
> -int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
> +int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
>  		   int proto)
>  {
>  	struct sock *sk;
> @@ -544,7 +580,7 @@ discard:
>  
>  static __inline__ int udpv6_rcv(struct sk_buff *skb)
>  {
> -	return __udp6_lib_rcv(skb, udp_hash, IPPROTO_UDP);
> +	return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
>  }
>  
>  /*
> @@ -1008,7 +1044,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
>  static struct udp_seq_afinfo udp6_seq_afinfo = {
>  	.name		= "udp6",
>  	.family		= AF_INET6,
> -	.hashtable	= udp_hash,
> +	.udp_table	= &udp_table,
>  	.seq_fops	= {
>  		.owner	=	THIS_MODULE,
>  	},
> @@ -1050,7 +1086,8 @@ struct proto udpv6_prot = {
>  	.sysctl_wmem	   = &sysctl_udp_wmem_min,
>  	.sysctl_rmem	   = &sysctl_udp_rmem_min,
>  	.obj_size	   = sizeof(struct udp6_sock),
> -	.h.udp_hash	   = udp_hash,
> +	.slab_flags	   = SLAB_DESTROY_BY_RCU,
> +	.h.udp_table	   = &udp_table,
>  #ifdef CONFIG_COMPAT
>  	.compat_setsockopt = compat_udpv6_setsockopt,
>  	.compat_getsockopt = compat_udpv6_getsockopt,
> diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
> index 92dd7da..2377920 100644
> --- a/net/ipv6/udp_impl.h
> +++ b/net/ipv6/udp_impl.h
> @@ -7,9 +7,9 @@
>  #include <net/inet_common.h>
>  #include <net/transp_v6.h>
>  
> -extern int  	__udp6_lib_rcv(struct sk_buff *, struct hlist_head [], int );
> +extern int  	__udp6_lib_rcv(struct sk_buff *, struct udp_table *, int );
>  extern void 	__udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
> -			       int , int , int , __be32 , struct hlist_head []);
> +			       int , int , int , __be32 , struct udp_table *);
>  
>  extern int	udp_v6_get_port(struct sock *sk, unsigned short snum);
>  
> diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
> index 3cd1a1a..05ab176 100644
> --- a/net/ipv6/udplite.c
> +++ b/net/ipv6/udplite.c
> @@ -15,14 +15,14 @@
>  
>  static int udplitev6_rcv(struct sk_buff *skb)
>  {
> -	return __udp6_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
> +	return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
>  }
>  
>  static void udplitev6_err(struct sk_buff *skb,
>  			  struct inet6_skb_parm *opt,
>  			  int type, int code, int offset, __be32 info)
>  {
> -	__udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash);
> +	__udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
>  }
>  
>  static struct inet6_protocol udplitev6_protocol = {
> @@ -49,7 +49,8 @@ struct proto udplitev6_prot = {
>  	.unhash		   = udp_lib_unhash,
>  	.get_port	   = udp_v6_get_port,
>  	.obj_size	   = sizeof(struct udp6_sock),
> -	.h.udp_hash	   = udplite_hash,
> + 	.slab_flags	   = SLAB_DESTROY_BY_RCU,
> +	.h.udp_table	   = &udplite_table,
>  #ifdef CONFIG_COMPAT
>  	.compat_setsockopt = compat_udpv6_setsockopt,
>  	.compat_getsockopt = compat_udpv6_getsockopt,
> @@ -95,7 +96,7 @@ void udplitev6_exit(void)
>  static struct udp_seq_afinfo udplite6_seq_afinfo = {
>  	.name		= "udplite6",
>  	.family		= AF_INET6,
> -	.hashtable	= udplite_hash,
> +	.udp_table	= &udplite_table,
>  	.seq_fops	= {
>  		.owner	=	THIS_MODULE,
>  	},


* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
  2008-10-30 15:51                                                       ` Stephen Hemminger
  2008-10-30 16:01                                                       ` Peter Zijlstra
@ 2008-10-31  0:14                                                       ` Keith Owens
  2008-11-13 13:13                                                       ` [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol Eric Dumazet
                                                                         ` (3 subsequent siblings)
  6 siblings, 0 replies; 134+ messages in thread
From: Keith Owens @ 2008-10-31  0:14 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, Corey Minyard, David Miller, shemminger, benny+usenet,
	netdev, Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell

Eric Dumazet (on Thu, 30 Oct 2008 16:40:01 +0100) wrote:
>+/**
>+ * ptr_is_a_nulls - Test if a ptr to struct hlist_nulls_node is a nulls

Typo: comment says 'ptr_is_a_nulls' but function is called 'is_a_nulls'.

>+ * @ptr: ptr to be tested
>+ *
>+ */
>+static inline int is_a_nulls(struct hlist_nulls_node *ptr)
>+{
>+	return ((unsigned long)ptr & 1);
>+}



* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-30 17:12                                                         ` Eric Dumazet
@ 2008-10-31  7:51                                                           ` David Miller
  0 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-10-31  7:51 UTC (permalink / raw)
  To: dada1
  Cc: shemminger, paulmck, minyard, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 30 Oct 2008 18:12:03 +0100

> More seriously Stephen, if the infrastructure is clean, and well
> tested on a relative simple case (UDP), it can then be deployed on a
> much more interesting protocol : TCP

I'm really looking forward to someone finally tackling that problem :)

> The moment we switch to RCU, we have to accept the pain of really
> understand what we did. Details are scary yes.

Agreed.


* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-30 16:28                                                         ` Corey Minyard
@ 2008-10-31 14:37                                                           ` Eric Dumazet
  2008-10-31 14:55                                                             ` Pavel Emelyanov
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-31 14:37 UTC (permalink / raw)
  To: Corey Minyard, David Miller
  Cc: Stephen Hemminger, paulmck, benny+usenet, netdev,
	Christoph Lameter, a.p.zijlstra, johnpol, Christian Bell,
	Pavel Emelyanov

[-- Attachment #1: Type: text/plain, Size: 1297 bytes --]

Corey Minyard wrote:
  
> 
> It is annoying that it doesn't help the performance for multicast.  
> However, I think the current patch will solve the DOS issue for 
> multicast, since it switches to a normal spinlock and has a per-list lock.

About multicast, it should be possible to do something about it, if it turns
out to be an issue.

That is, do a lockless lookup and accumulate the matching socket pointers in a
local table (incrementing each refcount if it is not zero, checking the key,
and adding the socket to a small on-stack array).

If the lookup must be restarted, forget all accumulated sockets (sock_put()
each pointer) and goto begin;

Then send the (cloned) packet to all accumulated sockets, and sock_put() them
to release the refcounts.
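
To make that idea concrete, here is a rough sketch (this is not from any posted
patch; it assumes the per-slot 'udp_hslot' and the nulls helpers introduced
later in this thread, and MAX_MCAST_SOCKS / mcast_match() are hypothetical
names for the table bound and the key check):

static int udp_mcast_deliver_rcu(struct net *net, struct sk_buff *skb,
				 struct udp_hslot *hslot, unsigned int slot)
{
	struct sock *stack[MAX_MCAST_SOCKS];
	struct hlist_nulls_node *node;
	struct sock *sk;
	int count, i;

	rcu_read_lock();
begin:
	count = 0;
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
		if (!mcast_match(net, sk, skb))	/* hypothetical key check */
			continue;
		/* take a reference only if the socket is not already dying */
		if (!atomic_inc_not_zero(&sk->sk_refcnt))
			continue;
		if (count < MAX_MCAST_SOCKS)
			stack[count++] = sk;
		else
			sock_put(sk);	/* table full, drop the extra match */
	}
	/* wrong nulls value: an item moved to another chain, restart */
	if (get_nulls_value(node) != slot) {
		while (count)
			sock_put(stack[--count]);
		goto begin;
	}
	rcu_read_unlock();

	/* deliver a clone to each accumulated socket, then drop the refs */
	for (i = 0; i < count; i++) {
		struct sk_buff *skb1 = skb_clone(skb, GFP_ATOMIC);

		if (skb1)
			udp_queue_rcv_skb(stack[i], skb1);
		sock_put(stack[i]);
	}
	kfree_skb(skb);		/* the original skb served only as a template */
	return 0;
}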



Well, looking at the current implementation, I found that udp_v4_mcast_next()
doesn't take the 'struct net *net' into account, so we have a bug here...

udp_v6_mcast_next() is buggy too (or at least its caller is).

David, please find a patch against net-2.6

Thanks

[PATCH] udp: multicast packets need to check namespace

Current UDP multicast delivery is not namespace aware.


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 net/ipv4/udp.c |   14 ++++++++------
 net/ipv6/udp.c |    8 ++++----
 2 files changed, 12 insertions(+), 10 deletions(-)

[-- Attachment #2: udp_multi.patch --]
[-- Type: text/plain, Size: 3014 bytes --]

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2095abc..76e3cc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -284,7 +284,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 
-static inline struct sock *udp_v4_mcast_next(struct sock *sk,
+static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 					     __be16 loc_port, __be32 loc_addr,
 					     __be16 rmt_port, __be32 rmt_addr,
 					     int dif)
@@ -295,8 +295,9 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
 
 	sk_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
-
-		if (s->sk_hash != hnum					||
+	
+		if (!net_eq(sock_net(s), net)				||
+		    s->sk_hash != hnum					||
 		    (inet->daddr && inet->daddr != rmt_addr)		||
 		    (inet->dport != rmt_port && inet->dport)		||
 		    (inet->rcv_saddr && inet->rcv_saddr != loc_addr)	||
@@ -1079,15 +1080,16 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	read_lock(&udp_hash_lock);
 	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
 	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (sk) {
 		struct sock *sknext = NULL;
 
 		do {
 			struct sk_buff *skb1 = skb;
 
-			sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
-						   uh->source, saddr, dif);
+			sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest,
+						   daddr, uh->source, saddr,
+						   dif);
 			if (sknext)
 				skb1 = skb_clone(skb, GFP_ATOMIC);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e51da8c..71e259e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -328,7 +328,7 @@ drop:
 	return -1;
 }
 
-static struct sock *udp_v6_mcast_next(struct sock *sk,
+static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
 				      __be16 loc_port, struct in6_addr *loc_addr,
 				      __be16 rmt_port, struct in6_addr *rmt_addr,
 				      int dif)
@@ -340,7 +340,7 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
 	sk_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
-		if (sock_net(s) != sock_net(sk))
+		if (!net_eq(sock_net(s), net))
 			continue;
 
 		if (s->sk_hash == num && s->sk_family == PF_INET6) {
@@ -383,14 +383,14 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	read_lock(&udp_hash_lock);
 	sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
 	dif = inet6_iif(skb);
-	sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (!sk) {
 		kfree_skb(skb);
 		goto out;
 	}
 
 	sk2 = sk;
-	while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr,
+	while ((sk2 = udp_v6_mcast_next(net, sk_next(sk2), uh->dest, daddr,
 					uh->source, saddr, dif))) {
 		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
 		if (buff) {


* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-31 14:37                                                           ` Eric Dumazet
@ 2008-10-31 14:55                                                             ` Pavel Emelyanov
  2008-11-02  4:22                                                               ` David Miller
  0 siblings, 1 reply; 134+ messages in thread
From: Pavel Emelyanov @ 2008-10-31 14:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, Stephen Hemminger, paulmck,
	benny+usenet, netdev, Christoph Lameter, a.p.zijlstra, johnpol,
	Christian Bell

> Well, looking at the current implementation, I found that udp_v4_mcast_next()
> doesn't take the 'struct net *net' into account, so we have a bug here...
> 
> udp_v6_mcast_next() is buggy too (or at least its caller is).
> 
> David, please find a patch against net-2.6
> 
> Thanks
> 
> [PATCH] udp: multicast packets need to check namespace
> 
> Current UDP multicast delivery is not namespace aware.
> 
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Acked-by: Pavel Emelyanov <xemul@openvz.org>

Thanks!

> ---
>  net/ipv4/udp.c |   14 ++++++++------
>  net/ipv6/udp.c |    8 ++++----
>  2 files changed, 12 insertions(+), 10 deletions(-)
> 



* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30 18:25                                                     ` Paul E. McKenney
@ 2008-10-31 16:40                                                       ` Eric Dumazet
  2008-11-01  3:10                                                         ` Paul E. McKenney
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-10-31 16:40 UTC (permalink / raw)
  To: paulmck
  Cc: Peter Zijlstra, Corey Minyard, David Miller, shemminger,
	benny+usenet, netdev, Christoph Lameter, johnpol, Christian Bell

Paul E. McKenney wrote:
> On Thu, Oct 30, 2008 at 12:30:20PM +0100, Eric Dumazet wrote:
>> -		while (udp_lib_lport_inuse(net, snum, udptable, sk,
>> -					   saddr_comp)) {
>> +		for (;;) {
>> +			hslot = &udptable->hash[udp_hashfn(net, snum)];
>> +			spin_lock_bh(&hslot->lock);
>> +			if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
>> +				break;
>> +			spin_unlock_bh(&hslot->lock);
>>  			do {
>>  				snum = snum + rand;
>>  			} while (snum < low || snum > high);
> 
> The above -really- confuses me, but not part of this patch.  If we are
> out of range, keep going?  Well, I guess that since it is a short, we
> cannot go very far...
> 
>>  			if (snum == first)
>>  				goto fail;
> 
> And I don't understand how we are guaranteed to have scanned all the
> possible ports upon failure, but happy to leave that to you guys.

Well, we have 65536 (= 2^16) possible port values, and while 'rand' is random,
it has the interesting property/bias of being odd.

We know (thanks to modular arithmetic / congruence relations) that we will hit
all 65536 values exactly once, after exactly 65536 iterations.
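
For anyone who wants to convince themselves, here is a quick user-space check
of that claim (plain C, not kernel code): stepping a 16-bit value by any odd
increment is a permutation of all 65536 ports, because an odd number is
coprime with 2^16.

#include <stdio.h>

int main(void)
{
	unsigned short snum = 777;	/* arbitrary starting port */
	unsigned short step = 32769;	/* any odd 'rand' increment works */
	static unsigned char seen[65536];
	unsigned int i, distinct = 0;

	for (i = 0; i < 65536; i++) {
		if (!seen[snum]++)
			distinct++;
		snum += step;		/* wraps modulo 2^16 */
	}
	printf("distinct values visited: %u\n", distinct);	/* prints 65536 */
	return 0;
}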



* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-31 16:40                                                       ` Eric Dumazet
@ 2008-11-01  3:10                                                         ` Paul E. McKenney
  0 siblings, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-01  3:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, Corey Minyard, David Miller, shemminger,
	benny+usenet, netdev, Christoph Lameter, johnpol, Christian Bell

On Fri, Oct 31, 2008 at 05:40:46PM +0100, Eric Dumazet wrote:
> Paul E. McKenney wrote:
>> On Thu, Oct 30, 2008 at 12:30:20PM +0100, Eric Dumazet wrote:
>>> -		while (udp_lib_lport_inuse(net, snum, udptable, sk,
>>> -					   saddr_comp)) {
>>> +		for (;;) {
>>> +			hslot = &udptable->hash[udp_hashfn(net, snum)];
>>> +			spin_lock_bh(&hslot->lock);
>>> +			if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp))
>>> +				break;
>>> +			spin_unlock_bh(&hslot->lock);
>>>  			do {
>>>  				snum = snum + rand;
>>>  			} while (snum < low || snum > high);
>> The above -really- confuses me, but not part of this patch.  If we are
>> out of range, keep going?  Well, I guess that since it is a short, we
>> cannot go very far...
>>>  			if (snum == first)
>>>  				goto fail;
>> And I don't understand how we are guaranteed to have scanned all the
>> possible ports upon failure, but happy to leave that to you guys.
>
> Well, we have 65536 (= 2^16) possible port values, and while 'rand' is
> random,
> it has the interesting property/bias of being odd.
>
> We know (thanks to modular arithmetic / congruence relations) that we will hit
> all 65536 values exactly once, after exactly 65536 iterations.

Ah, got it!  Thank you for the explanation!

I was fixating on the low..high interval.  ;-)

							Thanx, Paul


* Re: [PATCH 2/2] udp: RCU handling for Unicast packets.
  2008-10-30  5:50                                                       ` Eric Dumazet
@ 2008-11-02  4:19                                                         ` David Miller
  0 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-11-02  4:19 UTC (permalink / raw)
  To: dada1
  Cc: minyard, paulmck, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 30 Oct 2008 06:50:07 +0100

> [PATCH] udp: add a missing smp_wmb() in udp_lib_get_port()
> 
> Corey Minyard spotted a missing memory barrier in udp_lib_get_port()
> 
> We need to make sure a reader cannot see the new 'sk->sk_next' value
> together with the previous value of 'sk->sk_hash'. Otherwise, an item could
> be deleted from one chain and inserted into another chain. If the new chain
> was empty before the move, its 'next' pointer is NULL, and a lockless reader
> cannot detect that it missed the items that followed the socket in the
> original chain.
> 
> This patch is temporary, since we expect an upcoming patch
> to introduce another way of handling the problem.
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

I've applied this to net-next-2.6


* Re: [PATCH] udp: Introduce special NULL pointers for hlist termination
  2008-10-31 14:55                                                             ` Pavel Emelyanov
@ 2008-11-02  4:22                                                               ` David Miller
  0 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-11-02  4:22 UTC (permalink / raw)
  To: xemul
  Cc: dada1, minyard, shemminger, paulmck, benny+usenet, netdev, cl,
	a.p.zijlstra, johnpol, christian

From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 31 Oct 2008 17:55:50 +0300

> > Well, looking at current implementation, I found that udp_v4_mcast_next()
> > doesnt take into account the 'struct net *net', so we have a bug here...
> > 
> > udp_v6_mcast_next() is buggy too (or at least its caller is)
> > 
> > David, please find a patch against net-2.6
> > 
> > Thanks
> > 
> > [PATCH] udp: multicast packets need to check namespace
> > 
> > Current UDP multicast delivery is not namespace aware.
> > 
> > 
> > Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> 
> Acked-by: Pavel Emelyanov <xemul@openvz.org>

Applied, thanks everyone.


* [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
                                                                         ` (2 preceding siblings ...)
  2008-10-31  0:14                                                       ` Keith Owens
@ 2008-11-13 13:13                                                       ` Eric Dumazet
  2008-11-13 17:20                                                         ` Andi Kleen
  2008-11-17  3:41                                                         ` David Miller
  2008-11-13 13:14                                                       ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Eric Dumazet
                                                                         ` (2 subsequent siblings)
  6 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-11-13 13:13 UTC (permalink / raw)
  To: David Miller
  Cc: Paul E. McKenney, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

Hi all

Here is a series of three patches (based on net-next-2.6), continuing the RCU
work on the UDP/TCP/DCCP stacks.

Many thanks for all the useful reviews and comments, especially from Paul and Corey.

1) Introduce hlist_nulls variant of hlist

   hlist uses a NULL value to terminate a chain.
   The hlist_nulls variant uses a pointer with its low-order bit set to 1 as an
   end marker. This allows many different end markers to be stored, so that
   some lockless RCU algorithms (used in the TCP/UDP stack, for example) can
   avoid some memory barriers in their fast paths.

2) Use hlist_nulls in UDP RCU code

   This is a straightforward patch, using the hlist_nulls infrastructure.
   The RCU conversion of UDP was already done two weeks ago, so hlist_nulls
   lets us avoid some memory barriers, both at lookup time and at delete
   time. The patch is large because it adds new macros to
   include/net/sock.h. These macros will be used by TCP & DCCP too.

3) Convert TCP & DCCP hash tables to use RCU & hlist_nulls

   RCU was added to UDP lookups, using a fast infrastructure:
   - socket kmem_caches use SLAB_DESTROY_BY_RCU and don't pay the
     price of call_rcu() at freeing time.
   - hlist_nulls lets us use only a few memory barriers.

   This patch uses the same infrastructure for TCP/DCCP established
   and timewait sockets.

   Thanks to SLAB_DESTROY_BY_RCU, there is no slowdown for applications
   using short-lived TCP connections. A follow-up patch, converting
   rwlocks to spinlocks, will even speed up this case.

   __inet_lookup_established() is pretty fast now that we don't have to
   dirty a contended cache line (read_lock/read_unlock).

   Only the established and timewait hash tables are converted to RCU
   (the bind table and listen table still use traditional locking).


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


* [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
                                                                         ` (3 preceding siblings ...)
  2008-11-13 13:13                                                       ` [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol Eric Dumazet
@ 2008-11-13 13:14                                                       ` Eric Dumazet
  2008-11-13 13:29                                                         ` Peter Zijlstra
                                                                           ` (2 more replies)
  2008-11-13 13:15                                                       ` [PATCH 2/3] udp: Use hlist_nulls in UDP RCU code Eric Dumazet
  2008-11-13 13:15                                                       ` [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls Eric Dumazet
  6 siblings, 3 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-11-13 13:14 UTC (permalink / raw)
  To: Corey Minyard
  Cc: Paul E. McKenney, David Miller, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Evgeniy Polyakov,
	Peter Zijlstra, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 2524 bytes --]

hlist uses a NULL value to terminate a chain.

The hlist_nulls variant uses a pointer with its low-order bit set to 1 as an
end-of-list marker.

This allows many different end markers to be stored, so that some lockless RCU
algorithms (used in the TCP/UDP stack, for example) can avoid some memory
barriers in their fast paths.
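
For readers new to the trick, a tiny user-space illustration of the encoding
(not part of the patch; it simply mirrors the is_a_nulls() / get_nulls_value()
helpers added below):

#include <stdio.h>

struct node { struct node *next; };

/* an end-of-list marker is (value << 1) | 1: bit 0 tells it apart
 * from a real, word-aligned pointer */
static int is_a_nulls(const struct node *ptr)
{
	return (unsigned long)ptr & 1;
}

static unsigned long get_nulls_value(const struct node *ptr)
{
	return (unsigned long)ptr >> 1;
}

int main(void)
{
	/* the chain hashed to bucket 42 ends with the marker encoding 42 */
	struct node *end = (struct node *)((42UL << 1) | 1UL);
	struct node first = { .next = end };

	printf("end marker? %d, nulls value %lu\n",
	       is_a_nulls(first.next), get_nulls_value(first.next));
	return 0;
}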

Two new files are added:

include/linux/list_nulls.h
  - mimics the hlist part of include/linux/list.h, adapted to the hlist_nulls variant

include/linux/rculist_nulls.h
  - mimics the hlist part of include/linux/rculist.h, adapted to the hlist_nulls variant

   Only four helpers are declared for the moment:

     hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
     hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()

The prefetch() calls were removed, since the end of a list is no longer a NULL
value; prefetching a 'nulls' marker could trigger useless (and possibly
dangerous) memory transactions.

Example of use (extracted from __udp4_lib_lookup())

	struct sock *sk, *result;
        struct hlist_nulls_node *node;
        unsigned short hnum = ntohs(dport);
        unsigned int hash = udp_hashfn(net, hnum);
        struct udp_hslot *hslot = &udptable->hash[hash];
        int score, badness;

        rcu_read_lock();
begin:
        result = NULL;
        badness = -1;
        sk_nulls_for_each_rcu(sk, node, &hslot->head) {
                score = compute_score(sk, net, saddr, hnum, sport,
                                      daddr, dport, dif);
                if (score > badness) {
                        result = sk;
                        badness = score;
                }
        }
        /*
         * if the nulls value we got at the end of this lookup is
         * not the expected one, we must restart lookup.
         * We probably met an item that was moved to another chain.
         */
        if (get_nulls_value(node) != hash)
                goto begin;

        if (result) {
                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
                        result = NULL;
                else if (unlikely(compute_score(result, net, saddr, hnum, sport,
                                  daddr, dport, dif) < badness)) {
                        sock_put(result);
                        goto begin;
                }
        }
        rcu_read_unlock();
        return result;

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/list_nulls.h    |   94 +++++++++++++++++++++++++++
 include/linux/rculist_nulls.h |  110 ++++++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)

[-- Attachment #2: nulls.patch --]
[-- Type: text/plain, Size: 7213 bytes --]

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
new file mode 100644
index 0000000..856dee8
--- /dev/null
+++ b/include/linux/list_nulls.h
@@ -0,0 +1,94 @@
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the least significant bit of 'ptr' :
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
+
+struct hlist_nulls_head {
+	struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+	struct hlist_nulls_node *next, **pprev;
+};
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+	((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+/**
+ * is_a_nulls - Test if a ptr is a nulls marker
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+	return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+	return ((unsigned long)ptr) >> 1;
+}
+
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+	return !h->pprev;
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+	return is_a_nulls(h->first);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+	struct hlist_nulls_node *next = n->next;
+	struct hlist_nulls_node **pprev = n->pprev;
+	*pprev = next;
+	if (!is_a_nulls(next))
+		next->pprev = pprev;
+}
+
+/**
+ * hlist_nulls_for_each_entry	- iterate over list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member)		       \
+	for (pos = (head)->first;					       \
+	     (!is_a_nulls(pos)) &&					       \
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member)	\
+	for (; (!is_a_nulls(pos)) && 				\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+#endif
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
new file mode 100644
index 0000000..b185ac4
--- /dev/null
+++ b/include/linux/rculist_nulls.h
@@ -0,0 +1,110 @@
+#ifndef _LINUX_RCULIST_NULLS_H
+#define _LINUX_RCULIST_NULLS_H
+
+#ifdef __KERNEL__
+
+/*
+ * RCU-protected list version
+ */
+#include <linux/list_nulls.h>
+#include <linux/rcupdate.h>
+
+/**
+ * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: hlist_nulls_unhashed() on the node returns true after this. It is
+ * useful for RCU based read lockfree traversal if the writer side
+ * must know if the list entry is still hashed or already unhashed.
+ *
+ * In particular, it means that we can not poison the forward pointers
+ * that may still be used for walking the hash list and we can only
+ * zero the pprev pointer so list_unhashed() will return true after
+ * this.
+ *
+ * The caller must take whatever precautions are necessary (such as
+ * holding appropriate locks) to avoid racing with another
+ * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
+ * hlist_nulls_del_rcu(), running on this same list.  However, it is
+ * perfectly legal to run concurrently with the _rcu list-traversal
+ * primitives, such as hlist_nulls_for_each_entry_rcu().
+ */
+static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
+{
+	if (!hlist_nulls_unhashed(n)) {
+		__hlist_nulls_del(n);
+		n->pprev = NULL;
+	}
+}
+
+/**
+ * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: hlist_nulls_unhashed() on entry does not return true after this,
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the hash list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry().
+ */
+static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
+{
+	__hlist_nulls_del(n);
+	n->pprev = LIST_POISON2;
+}
+
+/**
+ * hlist_nulls_add_head_rcu
+ * @n: the element to add to the hash list.
+ * @h: the list to add to.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist_nulls,
+ * while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.  Regardless of the type of CPU, the
+ * list-traversal primitive must be guarded by rcu_read_lock().
+ */
+static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
+					struct hlist_nulls_head *h)
+{
+	struct hlist_nulls_node *first = h->first;
+
+	n->next = first;
+	n->pprev = &h->first;
+	rcu_assign_pointer(h->first, n);
+	if (!is_a_nulls(first))
+		first->pprev = &n->next;
+}
+/**
+ * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_nulls_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
+	for (pos = rcu_dereference((head)->first);			 \
+		(!is_a_nulls(pos)) && 			\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
+		pos = rcu_dereference(pos->next))
+
+#endif
+#endif


* [PATCH 2/3] udp: Use hlist_nulls in UDP RCU code
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
                                                                         ` (4 preceding siblings ...)
  2008-11-13 13:14                                                       ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Eric Dumazet
@ 2008-11-13 13:15                                                       ` Eric Dumazet
  2008-11-19 17:29                                                         ` Paul E. McKenney
  2008-11-13 13:15                                                       ` [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls Eric Dumazet
  6 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-11-13 13:15 UTC (permalink / raw)
  To: David Miller
  Cc: Paul E. McKenney, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 707 bytes --]

This is a straightforward patch, using the hlist_nulls infrastructure.

The RCU conversion of UDP was already done two weeks ago.

Using hlist_nulls lets us avoid some memory barriers, both
at lookup time and at delete time.

The patch is large because it adds new macros to include/net/sock.h.
These macros will be used by TCP & DCCP in the next patch.


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/rculist.h |   17 -----------
 include/net/sock.h      |   57 ++++++++++++++++++++++++++++++--------
 include/net/udp.h       |    2 -
 net/ipv4/udp.c          |   47 ++++++++++++++-----------------
 net/ipv6/udp.c          |   26 +++++++++--------
 5 files changed, 83 insertions(+), 66 deletions(-)


[-- Attachment #2: UDP.patch --]
[-- Type: text/plain, Size: 13933 bytes --]

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 3ba2998..e649bd3 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -383,22 +383,5 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference(pos->next))
 
-/**
- * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
- * @tpos:	the type * to use as a loop cursor.
- * @pos:	the &struct hlist_node to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the hlist_node within the struct.
- * @next:       the &struct hlist_node to use as a next cursor
- *
- * Special version of hlist_for_each_entry_rcu that make sure
- * each next pointer is fetched before each iteration.
- */
-#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
-	for (pos = rcu_dereference((head)->first);			 \
-		pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&	\
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
-		pos = rcu_dereference(next))
-
 #endif	/* __KERNEL__ */
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 8b2b821..0a63894 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -42,6 +42,7 @@
 
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/list_nulls.h>
 #include <linux/timer.h>
 #include <linux/cache.h>
 #include <linux/module.h>
@@ -52,6 +53,7 @@
 #include <linux/security.h>
 
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 
 #include <asm/atomic.h>
 #include <net/dst.h>
@@ -106,6 +108,7 @@ struct net;
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_node: main hash linkage for various protocol lookup tables
+ *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_refcnt: reference count
  *	@skc_hash: hash value used with various protocol lookup tables
@@ -120,7 +123,10 @@ struct sock_common {
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse;
 	int			skc_bound_dev_if;
-	struct hlist_node	skc_node;
+	union {
+		struct hlist_node	skc_node;
+		struct hlist_nulls_node skc_nulls_node;
+	};
 	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
 	unsigned int		skc_hash;
@@ -206,6 +212,7 @@ struct sock {
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_node			__sk_common.skc_node
+#define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_hash			__sk_common.skc_hash
@@ -300,12 +307,30 @@ static inline struct sock *sk_head(const struct hlist_head *head)
 	return hlist_empty(head) ? NULL : __sk_head(head);
 }
 
+static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
+{
+	return hlist_nulls_entry(head->first, struct sock, sk_nulls_node);
+}
+
+static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
+{
+	return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head);
+}
+
 static inline struct sock *sk_next(const struct sock *sk)
 {
 	return sk->sk_node.next ?
 		hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
 }
 
+static inline struct sock *sk_nulls_next(const struct sock *sk)
+{
+	return (!is_a_nulls(sk->sk_nulls_node.next)) ?
+		hlist_nulls_entry(sk->sk_nulls_node.next,
+				  struct sock, sk_nulls_node) :
+		NULL;
+}
+
 static inline int sk_unhashed(const struct sock *sk)
 {
 	return hlist_unhashed(&sk->sk_node);
@@ -321,6 +346,11 @@ static __inline__ void sk_node_init(struct hlist_node *node)
 	node->pprev = NULL;
 }
 
+static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node)
+{
+	node->pprev = NULL;
+}
+
 static __inline__ void __sk_del_node(struct sock *sk)
 {
 	__hlist_del(&sk->sk_node);
@@ -367,18 +397,18 @@ static __inline__ int sk_del_node_init(struct sock *sk)
 	return rc;
 }
 
-static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
+static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
-		hlist_del_init_rcu(&sk->sk_node);
+		hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
 		return 1;
 	}
 	return 0;
 }
 
-static __inline__ int sk_del_node_init_rcu(struct sock *sk)
+static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk)
 {
-	int rc = __sk_del_node_init_rcu(sk);
+	int rc = __sk_nulls_del_node_init_rcu(sk);
 
 	if (rc) {
 		/* paranoid for a while -acme */
@@ -399,15 +429,15 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
 	__sk_add_node(sk, list);
 }
 
-static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
-	hlist_add_head_rcu(&sk->sk_node, list);
+	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
 }
 
-static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
 	sock_hold(sk);
-	__sk_add_node_rcu(sk, list);
+	__sk_nulls_add_node_rcu(sk, list);
 }
 
 static __inline__ void __sk_del_bind_node(struct sock *sk)
@@ -423,11 +453,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
-#define sk_for_each_rcu_safenext(__sk, node, list, next) \
-	hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
+#define sk_nulls_for_each(__sk, node, list) \
+	hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
+#define sk_nulls_for_each_rcu(__sk, node, list) \
+	hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
+#define sk_nulls_for_each_from(__sk, node) \
+	if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
+		hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
 #define sk_for_each_continue(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_continue(__sk, node, sk_node)
diff --git a/include/net/udp.h b/include/net/udp.h
index df2bfe5..90e6ce5 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -51,7 +51,7 @@ struct udp_skb_cb {
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
 struct udp_hslot {
-	struct hlist_head	head;
+	struct hlist_nulls_head	head;
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));
 struct udp_table {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 54badc9..fea2d87 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 						 const struct sock *sk2))
 {
 	struct sock *sk2;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 
-	sk_for_each(sk2, node, &hslot->head)
+	sk_nulls_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
 		    sk2->sk_hash == num				&&
@@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		/*
-		 * We need that previous write to sk->sk_hash committed
-		 * before write to sk->next done in following add_node() variant
-		 */
-		smp_wmb();
-		sk_add_node_rcu(sk, &hslot->head);
+		sk_nulls_add_node_rcu(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	error = 0;
@@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
-		/*
-		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
-		 * We must check this item was not moved to another chain
-		 */
-		if (udp_hashfn(net, sk->sk_hash) != hash)
-			goto begin;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
 		score = compute_score(sk, net, saddr, hnum, sport,
 				      daddr, dport, dif);
 		if (score > badness) {
@@ -285,6 +274,14 @@ begin:
 			badness = score;
 		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -325,11 +322,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 					     __be16 rmt_port, __be32 rmt_addr,
 					     int dif)
 {
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct sock *s = sk;
 	unsigned short hnum = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (!net_eq(sock_net(s), net)				||
@@ -977,7 +974,7 @@ void udp_lib_unhash(struct sock *sk)
 	struct udp_hslot *hslot = &udptable->hash[hash];
 
 	spin_lock_bh(&hslot->lock);
-	if (sk_del_node_init_rcu(sk)) {
+	if (sk_nulls_del_node_init_rcu(sk)) {
 		inet_sk(sk)->num = 0;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
@@ -1130,7 +1127,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	int dif;
 
 	spin_lock(&hslot->lock);
-	sk = sk_head(&hslot->head);
+	sk = sk_nulls_head(&hslot->head);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (sk) {
@@ -1139,7 +1136,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		do {
 			struct sk_buff *skb1 = skb;
 
-			sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest,
+			sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
 						   daddr, uh->source, saddr,
 						   dif);
 			if (sknext)
@@ -1560,10 +1557,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 	struct net *net = seq_file_net(seq);
 
 	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
 		spin_lock_bh(&hslot->lock);
-		sk_for_each(sk, node, &hslot->head) {
+		sk_nulls_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
@@ -1582,7 +1579,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	struct net *net = seq_file_net(seq);
 
 	do {
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
@@ -1753,7 +1750,7 @@ void __init udp_table_init(struct udp_table *table)
 	int i;
 
 	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
-		INIT_HLIST_HEAD(&table->hash[i].head);
+		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
 		spin_lock_init(&table->hash[i].lock);
 	}
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8dafa36..fd2d9ad 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -108,19 +108,21 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
-		/*
-		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
-		 * We must check this item was not moved to another chain
-		 */
-		if (udp_hashfn(net, sk->sk_hash) != hash)
-			goto begin;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -374,11 +376,11 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
 				      __be16 rmt_port, struct in6_addr *rmt_addr,
 				      int dif)
 {
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct sock *s = sk;
 	unsigned short num = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (!net_eq(sock_net(s), net))
@@ -423,7 +425,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	int dif;
 
 	spin_lock(&hslot->lock);
-	sk = sk_head(&hslot->head);
+	sk = sk_nulls_head(&hslot->head);
 	dif = inet6_iif(skb);
 	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (!sk) {
@@ -432,7 +434,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	}
 
 	sk2 = sk;
-	while ((sk2 = udp_v6_mcast_next(net, sk_next(sk2), uh->dest, daddr,
+	while ((sk2 = udp_v6_mcast_next(net, sk_nulls_next(sk2), uh->dest, daddr,
 					uh->source, saddr, dif))) {
 		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
 		if (buff) {


* [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
  2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
                                                                         ` (5 preceding siblings ...)
  2008-11-13 13:15                                                       ` [PATCH 2/3] udp: Use hlist_nulls in UDP RCU code Eric Dumazet
@ 2008-11-13 13:15                                                       ` Eric Dumazet
  2008-11-13 13:34                                                         ` Peter Zijlstra
                                                                           ` (2 more replies)
  6 siblings, 3 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-11-13 13:15 UTC (permalink / raw)
  To: David Miller
  Cc: Paul E. McKenney, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

[-- Attachment #1: Type: text/plain, Size: 1449 bytes --]

RCU was added to UDP lookups, using a fast infrastructure:
- socket kmem_caches use SLAB_DESTROY_BY_RCU and don't pay the
  price of call_rcu() at freeing time.
- hlist_nulls lets us use only a few memory barriers.

This patch uses the same infrastructure for TCP/DCCP established
and timewait sockets.

Thanks to SLAB_DESTROY_BY_RCU, there is no slowdown for applications
using short-lived TCP connections. A follow-up patch, converting
rwlocks to spinlocks, will even speed up this case.

__inet_lookup_established() is pretty fast now that we don't have to
dirty a contended cache line (read_lock/read_unlock).

Only the established and timewait hash tables are converted to RCU
(the bind table and listen table still use traditional locking).
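
The reason the lookup stays correct without call_rcu() is that
SLAB_DESTROY_BY_RCU only guarantees the slab pages survive a grace period; an
individual socket can be freed and reused for a different connection at any
time. A minimal sketch of the resulting reader-side rule (not taken from the
patch and with declarations omitted; keys_match() is a stand-in for the real
key comparison, compare with __inet_lookup_established() below):

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (!keys_match(sk, net, hash, saddr, sport, daddr, dport))
			continue;
		/* 1. the socket may already be on its way to being freed:
		 *    only take a reference if the refcount is not zero */
		if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
			continue;
		/* 2. the object may have been recycled between the match and
		 *    the refcount bump: re-check the keys under the reference */
		if (unlikely(!keys_match(sk, net, hash, saddr, sport,
					 daddr, dport))) {
			sock_put(sk);
			goto begin;
		}
		goto found;
	}
	/* 3. a wrong nulls value means we walked onto another chain: restart */
	if (get_nulls_value(node) != slot)
		goto begin;
	sk = NULL;
found:
	rcu_read_unlock();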

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/net/inet_hashtables.h    |    4 -
 include/net/inet_timewait_sock.h |   10 +--
 net/core/sock.c                  |    4 +
 net/dccp/ipv4.c                  |    1
 net/dccp/ipv6.c                  |    1
 net/dccp/proto.c                 |    4 -
 net/ipv4/inet_diag.c             |    6 +-
 net/ipv4/inet_hashtables.c       |   78 ++++++++++++++++++++---------
 net/ipv4/inet_timewait_sock.c    |   26 +++++----
 net/ipv4/tcp.c                   |    4 -
 net/ipv4/tcp_ipv4.c              |   25 ++++-----
 net/ipv6/inet6_hashtables.c      |   70 +++++++++++++++++---------
 net/ipv6/tcp_ipv6.c              |    1
 13 files changed, 150 insertions(+), 84 deletions(-)

[-- Attachment #2: TCP_RCU.patch --]
[-- Type: text/plain, Size: 20050 bytes --]

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index cb31fbf..4818960 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -41,8 +41,8 @@
  * I'll experiment with dynamic table growth later.
  */
 struct inet_ehash_bucket {
-	struct hlist_head chain;
-	struct hlist_head twchain;
+	struct hlist_nulls_head chain;
+	struct hlist_nulls_head twchain;
 };
 
 /* There are a few simple rules, which allow for local port reuse by
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 80e4977..4b8ece2 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -110,7 +110,7 @@ struct inet_timewait_sock {
 #define tw_state		__tw_common.skc_state
 #define tw_reuse		__tw_common.skc_reuse
 #define tw_bound_dev_if		__tw_common.skc_bound_dev_if
-#define tw_node			__tw_common.skc_node
+#define tw_node			__tw_common.skc_nulls_node
 #define tw_bind_node		__tw_common.skc_bind_node
 #define tw_refcnt		__tw_common.skc_refcnt
 #define tw_hash			__tw_common.skc_hash
@@ -137,10 +137,10 @@ struct inet_timewait_sock {
 	struct hlist_node	tw_death_node;
 };
 
-static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
-				      struct hlist_head *list)
+static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
+				      struct hlist_nulls_head *list)
 {
-	hlist_add_head(&tw->tw_node, list);
+	hlist_nulls_add_head_rcu(&tw->tw_node, list);
 }
 
 static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
@@ -175,7 +175,7 @@ static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
 }
 
 #define inet_twsk_for_each(tw, node, head) \
-	hlist_for_each_entry(tw, node, head, tw_node)
+	hlist_nulls_for_each_entry(tw, node, head, tw_node)
 
 #define inet_twsk_for_each_inmate(tw, node, jail) \
 	hlist_for_each_entry(tw, node, jail, tw_death_node)
diff --git a/net/core/sock.c b/net/core/sock.c
index ded1eb5..38de9c3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2082,7 +2082,9 @@ int proto_register(struct proto *prot, int alloc_slab)
 			prot->twsk_prot->twsk_slab =
 				kmem_cache_create(timewait_sock_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
-						  0, SLAB_HWCACHE_ALIGN,
+						  0,
+						  SLAB_HWCACHE_ALIGN |
+							prot->slab_flags,
 						  NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
 				goto out_free_timewait_sock_slab_name;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 528baa2..d1dd952 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = {
 	.orphan_count		= &dccp_orphan_count,
 	.max_header		= MAX_DCCP_HEADER,
 	.obj_size		= sizeof(struct dccp_sock),
+	.slab_flags		= SLAB_DESTROY_BY_RCU,
 	.rsk_prot		= &dccp_request_sock_ops,
 	.twsk_prot		= &dccp_timewait_sock_ops,
 	.h.hashinfo		= &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4aa1148..f033e84 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1140,6 +1140,7 @@ static struct proto dccp_v6_prot = {
 	.orphan_count	   = &dccp_orphan_count,
 	.max_header	   = MAX_DCCP_HEADER,
 	.obj_size	   = sizeof(struct dccp6_sock),
+	.slab_flags	   = SLAB_DESTROY_BY_RCU,
 	.rsk_prot	   = &dccp6_request_sock_ops,
 	.twsk_prot	   = &dccp6_timewait_sock_ops,
 	.h.hashinfo	   = &dccp_hashinfo,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 46cb349..1117d4d 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1090,8 +1090,8 @@ static int __init dccp_init(void)
 	}
 
 	for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
-		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
-		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
+		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
+		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
 	}
 
 	if (inet_ehash_locks_alloc(&dccp_hashinfo))
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 564230d..41b3672 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -778,18 +778,19 @@ skip_listen_ht:
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct sock *sk;
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
 
 		num = 0;
 
-		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+		if (hlist_nulls_empty(&head->chain) &&
+			hlist_nulls_empty(&head->twchain))
 			continue;
 
 		if (i > s_i)
 			s_num = 0;
 
 		read_lock_bh(lock);
-		sk_for_each(sk, node, &head->chain) {
+		sk_nulls_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
 			if (num < s_num)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index be41ebb..fd269cf 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -223,35 +223,65 @@ struct sock * __inet_lookup_established(struct net *net,
 	INET_ADDR_COOKIE(acookie, saddr, daddr)
 	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
 	struct sock *sk;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	/* Optimize here for direct hit, only listening connections can
 	 * have wildcards anyways.
 	 */
 	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+	unsigned int slot = hash & (hashinfo->ehash_size - 1);
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-	prefetch(head->chain.first);
-	read_lock(lock);
-	sk_for_each(sk, node, &head->chain) {
+	rcu_read_lock();
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 		if (INET_MATCH(sk, net, hash, acookie,
-					saddr, daddr, ports, dif))
-			goto hit; /* You sunk my battleship! */
+					saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+				goto begintw;
+			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
+				saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begin;
+			}
+			goto out;
+		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begin;
 
+begintw:
 	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_for_each(sk, node, &head->twchain) {
+	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
 		if (INET_TW_MATCH(sk, net, hash, acookie,
-					saddr, daddr, ports, dif))
-			goto hit;
+					saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+				sk = NULL;
+				goto out;
+			}
+			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
+				 saddr, daddr, ports, dif))) {
+				sock_put(sk);
+				goto begintw;
+			}
+			goto out;
+		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != slot)
+		goto begintw;
 	sk = NULL;
 out:
-	read_unlock(lock);
+	rcu_read_unlock();
 	return sk;
-hit:
-	sock_hold(sk);
-	goto out;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
@@ -272,14 +302,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
 	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	struct inet_timewait_sock *tw;
 
 	prefetch(head->chain.first);
 	write_lock(lock);
 
 	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &head->twchain) {
+	sk_nulls_for_each(sk2, node, &head->twchain) {
 		tw = inet_twsk(sk2);
 
 		if (INET_TW_MATCH(sk2, net, hash, acookie,
@@ -293,7 +323,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	tw = NULL;
 
 	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
+	sk_nulls_for_each(sk2, node, &head->chain) {
 		if (INET_MATCH(sk2, net, hash, acookie,
 					saddr, daddr, ports, dif))
 			goto not_unique;
@@ -306,7 +336,7 @@ unique:
 	inet->sport = htons(lport);
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
+	__sk_nulls_add_node_rcu(sk, &head->chain);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock(lock);
 
@@ -338,7 +368,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 void __inet_hash_nolisten(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct hlist_head *list;
+	struct hlist_nulls_head *list;
 	rwlock_t *lock;
 	struct inet_ehash_bucket *head;
 
@@ -350,7 +380,7 @@ void __inet_hash_nolisten(struct sock *sk)
 	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
 	write_lock(lock);
-	__sk_add_node(sk, list);
+	__sk_nulls_add_node_rcu(sk, list);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock(lock);
 }
@@ -400,13 +430,15 @@ void inet_unhash(struct sock *sk)
 		local_bh_disable();
 		inet_listen_wlock(hashinfo);
 		lock = &hashinfo->lhash_lock;
+		if (__sk_del_node_init(sk))
+			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	} else {
 		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 		write_lock_bh(lock);
+		if (__sk_nulls_del_node_init_rcu(sk))
+			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
 
-	if (__sk_del_node_init(sk))
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	write_unlock_bh(lock);
 out:
 	if (sk->sk_state == TCP_LISTEN)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 1c5fd38..6068995 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -23,12 +23,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 
 	write_lock(lock);
-	if (hlist_unhashed(&tw->tw_node)) {
+	if (hlist_nulls_unhashed(&tw->tw_node)) {
 		write_unlock(lock);
 		return;
 	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
+	hlist_nulls_del_rcu(&tw->tw_node);
+	sk_nulls_node_init(&tw->tw_node);
 	write_unlock(lock);
 
 	/* Disassociate with bind bucket. */
@@ -92,13 +92,17 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
 	write_lock(lock);
 
-	/* Step 2: Remove SK from established hash. */
-	if (__sk_del_node_init(sk))
-		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-
-	/* Step 3: Hash TW into TIMEWAIT chain. */
-	inet_twsk_add_node(tw, &ehead->twchain);
+	/*
+	 * Step 2: Hash TW into TIMEWAIT chain.
+	 * Should be done before removing sk from established chain
+	 * because readers are lockless and search established first.
+	 */
 	atomic_inc(&tw->tw_refcnt);
+	inet_twsk_add_node_rcu(tw, &ehead->twchain);
+
+	/* Step 3: Remove SK from established hash. */
+	if (__sk_nulls_del_node_init_rcu(sk))
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 
 	write_unlock(lock);
 }
@@ -416,7 +420,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
 {
 	struct inet_timewait_sock *tw;
 	struct sock *sk;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	int h;
 
 	local_bh_disable();
@@ -426,7 +430,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
 		rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
 restart:
 		write_lock(lock);
-		sk_for_each(sk, node, &head->twchain) {
+		sk_nulls_for_each(sk, node, &head->twchain) {
 
 			tw = inet_twsk(sk);
 			if (!net_eq(twsk_net(tw), net) ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f60a591..044224a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2707,8 +2707,8 @@ void __init tcp_init(void)
 					thash_entries ? 0 : 512 * 1024);
 	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
 	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
-		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
-		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
+		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
+		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
 	}
 	if (inet_ehash_locks_alloc(&tcp_hashinfo))
 		panic("TCP: failed to alloc ehash_locks");
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d49233f..b2e3ab2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1857,16 +1857,16 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
-static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
 {
-	return hlist_empty(head) ? NULL :
+	return hlist_nulls_empty(head) ? NULL :
 		list_entry(head->first, struct inet_timewait_sock, tw_node);
 }
 
 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 {
-	return tw->tw_node.next ?
-		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+	return !is_a_nulls(tw->tw_node.next) ?
+		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
 }
 
 static void *listening_get_next(struct seq_file *seq, void *cur)
@@ -1954,8 +1954,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 
 static inline int empty_bucket(struct tcp_iter_state *st)
 {
-	return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
-		hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
+	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
+		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
 static void *established_get_first(struct seq_file *seq)
@@ -1966,7 +1966,7 @@ static void *established_get_first(struct seq_file *seq)
 
 	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
 		struct sock *sk;
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
 		struct inet_timewait_sock *tw;
 		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
 
@@ -1975,7 +1975,7 @@ static void *established_get_first(struct seq_file *seq)
 			continue;
 
 		read_lock_bh(lock);
-		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
+		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family ||
 			    !net_eq(sock_net(sk), net)) {
 				continue;
@@ -2004,7 +2004,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 {
 	struct sock *sk = cur;
 	struct inet_timewait_sock *tw;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 
@@ -2032,11 +2032,11 @@ get_tw:
 			return NULL;
 
 		read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
-		sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
+		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
 	} else
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
 
-	sk_for_each_from(sk, node) {
+	sk_nulls_for_each_from(sk, node) {
 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
 			goto found;
 	}
@@ -2375,6 +2375,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
+	.slab_flags		= SLAB_DESTROY_BY_RCU,
 	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 1646a56..c1b4d40 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -25,24 +25,28 @@
 void __inet6_hash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-	struct hlist_head *list;
 	rwlock_t *lock;
 
 	WARN_ON(!sk_unhashed(sk));
 
 	if (sk->sk_state == TCP_LISTEN) {
+		struct hlist_head *list;
+
 		list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 		lock = &hashinfo->lhash_lock;
 		inet_listen_wlock(hashinfo);
+		__sk_add_node(sk, list);
 	} else {
 		unsigned int hash;
+		struct hlist_nulls_head *list;
+
 		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
 		list = &inet_ehash_bucket(hashinfo, hash)->chain;
 		lock = inet_ehash_lockp(hashinfo, hash);
 		write_lock(lock);
+		__sk_nulls_add_node_rcu(sk, list);
 	}
 
-	__sk_add_node(sk, list);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock(lock);
 }
@@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net,
 					   const int dif)
 {
 	struct sock *sk;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
 	/* Optimize here for direct hit, only listening connections can
 	 * have wildcards anyways.
 	 */
 	unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
-	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
+	unsigned int slot = hash & (hashinfo->ehash_size - 1);
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
 
-	prefetch(head->chain.first);
-	read_lock(lock);
-	sk_for_each(sk, node, &head->chain) {
+
+	rcu_read_lock();
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 		/* For IPV6 do the cheaper port and family tests first. */
-		if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif))
-			goto hit; /* You sunk my battleship! */
+		if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
+				goto begintw;
+			if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+				sock_put(sk);
+				goto begin;
+			}
+		goto out;
+		}
 	}
+	if (get_nulls_value(node) != slot)
+		goto begin;
+
+begintw:
 	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_for_each(sk, node, &head->twchain) {
-		if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif))
-			goto hit;
+	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
+		if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
+				sk = NULL;
+				goto out;
+			}
+			if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
+				sock_put(sk);
+				goto begintw;
+			}
+			goto out;
+		}
 	}
-	read_unlock(lock);
-	return NULL;
-
-hit:
-	sock_hold(sk);
-	read_unlock(lock);
+	if (get_nulls_value(node) != slot)
+		goto begintw;
+	sk = NULL;
+out:
+	rcu_read_unlock();
 	return sk;
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
@@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
 	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
-	const struct hlist_node *node;
+	const struct hlist_nulls_node *node;
 	struct inet_timewait_sock *tw;
 
 	prefetch(head->chain.first);
 	write_lock(lock);
 
 	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &head->twchain) {
+	sk_nulls_for_each(sk2, node, &head->twchain) {
 		tw = inet_twsk(sk2);
 
 		if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) {
@@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	tw = NULL;
 
 	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
+	sk_nulls_for_each(sk2, node, &head->chain) {
 		if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif))
 			goto not_unique;
 	}
@@ -203,7 +227,7 @@ unique:
 	inet->num = lport;
 	inet->sport = htons(lport);
 	WARN_ON(!sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
+	__sk_nulls_add_node_rcu(sk, &head->chain);
 	sk->sk_hash = hash;
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	write_unlock(lock);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9842764..b357870 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
+	.slab_flags		= SLAB_DESTROY_BY_RCU,
 	.twsk_prot		= &tcp6_timewait_sock_ops,
 	.rsk_prot		= &tcp6_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-13 13:14                                                       ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Eric Dumazet
@ 2008-11-13 13:29                                                         ` Peter Zijlstra
  2008-11-13 13:44                                                           ` Eric Dumazet
  2008-11-14 15:16                                                         ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Peter Zijlstra
  2008-11-19 17:01                                                         ` Paul E. McKenney
  2 siblings, 1 reply; 134+ messages in thread
From: Peter Zijlstra @ 2008-11-13 13:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, Paul E. McKenney, David Miller, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

On Thu, 2008-11-13 at 14:14 +0100, Eric Dumazet wrote:
> hlist uses NULL value to finish a chain.
> 
> hlist_nulls variant use the low order bit set to 1 to signal an end-of-list marker.
> 
> This allows to store many different end markers, so that some RCU lockless
> algos (used in TCP/UDP stack for example) can save some memory barriers in
> fast paths.
> 
> Two new files are added :
> 
> include/linux/list_nulls.h
>   - mimics hlist part of include/linux/list.h, derived to hlist_nulls variant

How is the !rcu variant useful?

> include/linux/rculist_nulls.h
>   - mimics hlist part of include/linux/rculist.h, derived to hlist_nulls variant
> 
>    Only four helpers are declared for the moment :
> 
>      hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
>      hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()
> 
> prefetches() were removed, since an end of list is not anymore NULL value.
> prefetches() could trigger useless (and possibly dangerous) memory transactions.
> 
> Example of use (extracted from __udp4_lib_lookup())
> 
> 	struct sock *sk, *result;
>         struct hlist_nulls_node *node;
>         unsigned short hnum = ntohs(dport);
>         unsigned int hash = udp_hashfn(net, hnum);
>         struct udp_hslot *hslot = &udptable->hash[hash];
>         int score, badness;
> 
>         rcu_read_lock();
> begin:
>         result = NULL;
>         badness = -1;
>         sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>                 score = compute_score(sk, net, saddr, hnum, sport,
>                                       daddr, dport, dif);
>                 if (score > badness) {
>                         result = sk;
>                         badness = score;
>                 }
>         }
>         /*
>          * if the nulls value we got at the end of this lookup is
>          * not the expected one, we must restart lookup.
>          * We probably met an item that was moved to another chain.
>          */
>         if (get_nulls_value(node) != hash)
>                 goto begin;

So by not using some memory barriers (would be nice to have it
illustrated which ones), we can race and end up on the wrong chain, in
case that happens we detect this by using this per-chain terminator and
try again.

It would be really good to have it explained in the rculist_nulls.h
comments what memory barriers are missing, what races they open, and how
this special terminator trick closes that race.

I'm sure most of us understand it now, but will we still in a few
months? - how about new people?

Other than that, very cool stuff! :-)


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
  2008-11-13 13:15                                                       ` [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls Eric Dumazet
@ 2008-11-13 13:34                                                         ` Peter Zijlstra
  2008-11-13 13:51                                                           ` Eric Dumazet
  2008-11-19 17:53                                                         ` Paul E. McKenney
  2008-11-23  9:33                                                         ` [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU Eric Dumazet
  2 siblings, 1 reply; 134+ messages in thread
From: Peter Zijlstra @ 2008-11-13 13:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Paul E. McKenney, Corey Minyard, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

On Thu, 2008-11-13 at 14:15 +0100, Eric Dumazet wrote:
> +begin:
> +       sk_nulls_for_each_rcu(sk, node, &head->chain) {
>                 if (INET_MATCH(sk, net, hash, acookie,
> +                                       saddr, daddr, ports, dif)) {
> +                       if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
> +                               goto begintw;
> +                       if (unlikely(!INET_MATCH(sk, net, hash, acookie,
> +                               saddr, daddr, ports, dif))) {
> +                               sock_put(sk);
> +                               goto begin;
> +                       }

This is the validation step that verifies the race opened by using
SLAB_DESTROY_BY_RCU, right?

Does it make sense to add a little comment to these validation steps to
keep people on their toes and aware of the trickery?

> +                       goto out;
> +               }
>         }
> +       /*
> +        * if the nulls value we got at the end of this lookup is
> +        * not the expected one, we must restart lookup.
> +        * We probably met an item that was moved to another chain.
> +        */
> +       if (get_nulls_value(node) != slot)
> +               goto begin;
>  


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-13 13:29                                                         ` Peter Zijlstra
@ 2008-11-13 13:44                                                           ` Eric Dumazet
  2008-11-13 16:02                                                             ` [PATCH 4/3] rcu: documents rculist_nulls Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-11-13 13:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Corey Minyard, Paul E. McKenney, David Miller, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

Peter Zijlstra wrote:
> On Thu, 2008-11-13 at 14:14 +0100, Eric Dumazet wrote:
>> hlist uses NULL value to finish a chain.
>>
>> hlist_nulls variant use the low order bit set to 1 to signal an end-of-list marker.
>>
>> This allows to store many different end markers, so that some RCU lockless
>> algos (used in TCP/UDP stack for example) can save some memory barriers in
>> fast paths.
>>
>> Two new files are added :
>>
>> include/linux/list_nulls.h
>>   - mimics hlist part of include/linux/list.h, derived to hlist_nulls variant
> 
> How is the !rcu variant useful?

For example, if a process holds a lock, it doesn't need the RCU version.

/proc/net/tcp comes to mind
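
Something like this for the locked case (just a sketch: it reuses the
sk_nulls_for_each() iterator and assumes a per-bucket 'hslot' with a head
and a spinlock, as in the UDP patches; dump_one_sock() is a made-up
stand-in for whatever the dumper does):

        struct sock *sk;
        struct hlist_nulls_node *node;

        spin_lock_bh(&hslot->lock);
        sk_nulls_for_each(sk, node, &hslot->head) {
                /* chain lock held: no refcount games, no 'nulls' re-check */
                dump_one_sock(seq, sk);
        }
        spin_unlock_bh(&hslot->lock);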


> 
>> include/linux/rculist_nulls.h
>>   - mimics hlist part of include/linux/rculist.h, derived to hlist_nulls variant
>>
>>    Only four helpers are declared for the moment :
>>
>>      hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
>>      hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()
>>
>> prefetches() were removed, since an end of list is not anymore NULL value.
>> prefetches() could trigger useless (and possibly dangerous) memory transactions.
>>

> 
> So by not using some memory barriers (would be nice to have it
> illustrated which ones), we can race and end up on the wrong chain, in
> case that happens we detect this by using this per-chain terminator and
> try again.
> 
> It would be really good to have it explained in the rculist_nulls.h
> comments what memory barriers are missing, what races they open, and how
> this special terminator trick closes that race.

OK, maybe I should add a Documentation/RCU/rculist_nulls.txt file with
appropriate examples and documentation.

(Say the lookup/insert algorithms, with standard hlist and memory barriers,
and with hlist_nulls without those two memory barriers.

(These two memory barriers can be found in commits:

c37ccc0d4e2a4ee52f1a40cff1be0049f2104bba :

udp: add a missing smp_wmb() in udp_lib_get_port()

Corey Minyard spotted a missing memory barrier in udp_lib_get_port()

We need to make sure a reader cannot read the new 'sk->sk_next' value
and previous value of 'sk->sk_hash'. Or else, an item could be deleted
from a chain, and inserted into another chain. If new chain was empty
before the move, 'next' pointer is NULL, and lockless reader can
not detect it missed following items in original chain.

This patch is temporary, since we expect an upcoming patch
to introduce another way of handling the problem.
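
In code, that barrier sits between the sk_hash update and the (re)insertion;
schematically (this is a sketch of the idea, not the exact hunk from
udp_lib_get_port()):

        sk->sk_hash = snum;
        /*
         * Make sure readers see the new sk_hash before they can reach the
         * socket on its (possibly new) chain; otherwise a lockless reader
         * following a stale ->next could end up scanning the wrong chain
         * without noticing.
         */
        smp_wmb();
        sk_add_node_rcu(sk, head);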


And commit 96631ed16c514cf8b28fab991a076985ce378c26 :

udp: introduce sk_for_each_rcu_safenext()

Corey Minyard found a race added in commit 271b72c7fa82c2c7a795bc16896149933110672d
(udp: RCU handling for Unicast packets.)

 "If the socket is moved from one list to another list in-between the
 time the hash is calculated and the next field is accessed, and the
 socket has moved to the end of the new list, the traversal will not
 complete properly on the list it should have, since the socket will
 be on the end of the new list and there's not a way to tell it's on a
 new list and restart the list traversal.  I think that this can be
 solved by pre-fetching the "next" field (with proper barriers) before
 checking the hash."

This patch corrects this problem, introducing a new
sk_for_each_rcu_safenext() macro.


> 
> I'm sure most of us understand it now, but will we still in a few
> months? - how about new people?
> 
> Other than that, very cool stuff! :-)

Thanks Peter ;)



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
  2008-11-13 13:34                                                         ` Peter Zijlstra
@ 2008-11-13 13:51                                                           ` Eric Dumazet
  2008-11-13 14:08                                                             ` Christoph Lameter
  2008-11-13 14:22                                                             ` Peter Zijlstra
  0 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-11-13 13:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Miller, Paul E. McKenney, Corey Minyard, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

Peter Zijlstra wrote:
> On Thu, 2008-11-13 at 14:15 +0100, Eric Dumazet wrote:
>> +begin:
>> +       sk_nulls_for_each_rcu(sk, node, &head->chain) {
>>                 if (INET_MATCH(sk, net, hash, acookie,
>> +                                       saddr, daddr, ports, dif)) {
>> +                       if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
>> +                               goto begintw;
>> +                       if (unlikely(!INET_MATCH(sk, net, hash, acookie,
>> +                               saddr, daddr, ports, dif))) {
>> +                               sock_put(sk);
>> +                               goto begin;
>> +                       }
> 
> This is the validation step that verifies the race opened by using
> SLAB_DESTROY_BY_RCU, right?

The atomic_inc_not_zero() is not related to SLAB_DESTROY_BY_RCU but to the
classic RCU lookup: a writer can delete the item right before we try to use it.

The next step is necessary in case the deleted item was re-allocated and
inserted into a hash chain (this one or another, it doesn't matter). In that
case the previous atomic_inc_not_zero() test will succeed, so we must check
again that the item we selected (and refcounted) is the one we were searching for.
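
Roughly, the reader side becomes (a condensed sketch with a generic
object/key and made-up lockless_lookup()/put_ref() helpers, not the exact
INET_MATCH code from the patch):

        rcu_read_lock();
begin:
        obj = lockless_lookup(key);     /* RCU walk of the hash chain */
        if (obj) {
                /* 1) a writer may be freeing it right now: refcnt can be 0 */
                if (!atomic_inc_not_zero(&obj->refcnt))
                        goto begin;
                /*
                 * 2) with SLAB_DESTROY_BY_RCU the object may have been freed
                 * and re-allocated for another flow before we took the
                 * reference, so re-check the key now that the object is pinned.
                 */
                if (obj->key != key) {
                        put_ref(obj);
                        goto begin;
                }
        }
        rcu_read_unlock();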

So yes, this bit should be documented, since SLAB_DESTROY_BY_RCU is
not really used in the Linux kernel at this moment.


> 
> Does it make sense to add a little comment to these validation steps to
> keep people on their toes and aware of the trickery?

Yes, you are right.

Thanks



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
  2008-11-13 13:51                                                           ` Eric Dumazet
@ 2008-11-13 14:08                                                             ` Christoph Lameter
  2008-11-13 14:22                                                             ` Peter Zijlstra
  1 sibling, 0 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-11-13 14:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, David Miller, Paul E. McKenney, Corey Minyard,
	Stephen Hemminger, benny+usenet, Linux Netdev List,
	Evgeniy Polyakov, Christian Bell

On Thu, 13 Nov 2008, Eric Dumazet wrote:

> So yes, this bit should be documented, since SLAB_DESTROY_BY_RCU is
> not really used in the Linux kernel at this moment.

It is used for the anonymous VMAs. That is the purpose Hugh introduced
it for, since he saw a regression when using straight RCU freeing. See
mm/rmap.c. SLAB_DESTROY_BY_RCU is a pretty strange way of using RCU and
slab, so it should always be documented in detail.
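
(For reference, that usage boils down to creating the cache with the flag,
roughly like this, quoting from memory rather than copying mm/rmap.c:

        anon_vma_cachep = kmem_cache_create("anon_vma",
                                            sizeof(struct anon_vma), 0,
                                            SLAB_DESTROY_BY_RCU | SLAB_PANIC,
                                            anon_vma_ctor);

and then revalidating page->mapping under rcu_read_lock() before trusting
anything inside the anon_vma.)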


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
  2008-11-13 13:51                                                           ` Eric Dumazet
  2008-11-13 14:08                                                             ` Christoph Lameter
@ 2008-11-13 14:22                                                             ` Peter Zijlstra
  2008-11-13 14:27                                                               ` Christoph Lameter
  1 sibling, 1 reply; 134+ messages in thread
From: Peter Zijlstra @ 2008-11-13 14:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Paul E. McKenney, Corey Minyard, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

On Thu, 2008-11-13 at 14:51 +0100, Eric Dumazet wrote:
> Peter Zijlstra wrote:
> > On Thu, 2008-11-13 at 14:15 +0100, Eric Dumazet wrote:
> >> +begin:
> >> +       sk_nulls_for_each_rcu(sk, node, &head->chain) {
> >>                 if (INET_MATCH(sk, net, hash, acookie,
> >> +                                       saddr, daddr, ports, dif)) {
> >> +                       if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
> >> +                               goto begintw;
> >> +                       if (unlikely(!INET_MATCH(sk, net, hash, acookie,
> >> +                               saddr, daddr, ports, dif))) {
> >> +                               sock_put(sk);
> >> +                               goto begin;
> >> +                       }
> > 
> > This is the validation step that verifies the race opened by using
> > SLAB_DESTROY_BY_RCU, right?
> 
> The atomic_inc_not_zero() is not related to SLAB_DESTROY_BY_RCU but to the
> classic RCU lookup: a writer can delete the item right before we try to use it.

Yeah, it's to stabilize the current situation.

> The next step is necessary in case the deleted item was re-allocated and
> inserted into a hash chain (this one or another, it doesn't matter). In that
> case the previous atomic_inc_not_zero() test will succeed, so we must check
> again that the item we selected (and refcounted) is the one we were searching for.
> 
> So yes, this bit should be documented, since SLAB_DESTROY_BY_RCU is
> not really used in the Linux kernel at this moment.

We have one user, anon_vma, and one thing that is very nearly identical:
the lockless pagecache.

See page_cache_get_speculative() and find_get_page(). The pagecache gets
away with this due to the simple fact that the page frames are never
freed.
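
The shape is the same as the socket lookup here; from memory (this is a
rough sketch, not a verbatim copy of find_get_page()), the pagecache side
looks something like:

        struct page *page;

        rcu_read_lock();
repeat:
        page = radix_tree_lookup(&mapping->page_tree, index);
        if (page) {
                if (!page_cache_get_speculative(page))
                        goto repeat;
                /* the struct page may have been reused for another offset */
                if (unlikely(page != radix_tree_lookup(&mapping->page_tree,
                                                       index))) {
                        page_cache_release(page);
                        goto repeat;
                }
        }
        rcu_read_unlock();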

Hmm, I once wrote a comment to go with SLAB_DESTROY_BY_RCU, which seems
to have gotten lost... /me goes dig.

Found it:  http://lkml.org/lkml/2008/4/2/143

I guess I'd better re-submit that..


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
  2008-11-13 14:22                                                             ` Peter Zijlstra
@ 2008-11-13 14:27                                                               ` Christoph Lameter
  0 siblings, 0 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-11-13 14:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Eric Dumazet, David Miller, Paul E. McKenney, Corey Minyard,
	Stephen Hemminger, benny+usenet, Linux Netdev List,
	Evgeniy Polyakov, Christian Bell

On Thu, 13 Nov 2008, Peter Zijlstra wrote:

> Found it:  http://lkml.org/lkml/2008/4/2/143
>
> I guess I'd better re-submit that..

Looks good. CC Pekka and he will merge it.



^ permalink raw reply	[flat|nested] 134+ messages in thread

* [PATCH 4/3] rcu: documents rculist_nulls
  2008-11-13 13:44                                                           ` Eric Dumazet
@ 2008-11-13 16:02                                                             ` Eric Dumazet
  2008-11-14 15:16                                                               ` Peter Zijlstra
  2008-11-19 17:07                                                               ` Paul E. McKenney
  0 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-11-13 16:02 UTC (permalink / raw)
  To: Peter Zijlstra, David Miller
  Cc: Corey Minyard, Paul E. McKenney, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Evgeniy Polyakov,
	Christian Bell

[-- Attachment #1: Type: text/plain, Size: 1088 bytes --]

Eric Dumazet wrote:
> Peter Zijlstra wrote:
>> So by not using some memory barriers (would be nice to have it
>> illustrated which ones), we can race and end up on the wrong chain, in
>> case that happens we detect this by using this per-chain terminator and
>> try again.
>>
>> It would be really good to have it explained in the rculist_nulls.h
>> comments what memory barriers are missing, what races they open, and how
>> this special terminator trick closes that race.
> 
> OK, maybe I should add a Documentation/RCU/rculist_nulls.txt file with
> appropriate examples and documentation.
> 
> (Say the lookup/insert algorithms, with standard hlist and memory barriers,
> and with hlist_nulls without those two memory barriers.
> 

[PATCH 4/3] rcu: documents rculist_nulls

Adds Documentation/RCU/rculist_nulls.txt file to describe how 'nulls'
end-of-list can help in some RCU algos.


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 Documentation/RCU/rculist_nulls.txt |  167 ++++++++++++++++++++++++++
 1 files changed, 167 insertions(+)

[-- Attachment #2: rculist_nulls_doc.patch --]
[-- Type: text/plain, Size: 5489 bytes --]

diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
new file mode 100644
index 0000000..5db5549
--- /dev/null
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -0,0 +1,167 @@
+Using hlist_nulls to protect read-mostly linked lists and
+objects using SLAB_DESTROY_BY_RCU allocations.
+
+Please read the basics in Documentation/RCU/listRCU.txt
+
+Using special markers (called 'nulls') is a convenient way
+to solve the following problem:
+
+A typical RCU linked list managing objects which are
+allocated from a SLAB_DESTROY_BY_RCU kmem_cache can
+use the following algos:
+
+1) Lookup algo
+--------------
+rcu_read_lock()
+begin:
+obj = lockless_lookup(key);
+if (obj) {
+  if (!try_get_ref(obj)) // might fail for free objects
+    goto begin;
+  /*
+   * Because a writer could delete the object, and another writer could
+   * reuse it before the end of the RCU grace period, we
+   * must check the key after getting the reference on the object.
+   */
+  if (obj->key != key) { // not the object we expected
+     put_ref(obj);
+     goto begin;
+   }
+}
+rcu_read_unlock();
+
+Beware that lockless_lookup(key) cannot use the traditional hlist_for_each_entry_rcu(),
+but must use a version with an additional memory barrier (smp_rmb()):
+
+lockless_lookup(key)
+{
+   struct hlist_node *node, *next;
+   for (pos = rcu_dereference((head)->first);
+          pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
+          ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); 
+          pos = rcu_dereference(next))
+      if (obj->key == key)
+         return obj;
+   return NULL;
+
+And note that the traditional hlist_for_each_entry_rcu() misses this smp_rmb():
+
+   struct hlist_node *node;
+   for (pos = rcu_dereference((head)->first);
+		pos && ({ prefetch(pos->next); 1; }) &&
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
+		pos = rcu_dereference(pos->next))
+      if (obj->key == key)
+         return obj;
+   return NULL;
+}
+
+Quoting Corey Minyard :
+
+"If the object is moved from one list to another list in-between the
+ time the hash is calculated and the next field is accessed, and the
+ object has moved to the end of a new list, the traversal will not
+ complete properly on the list it should have, since the object will
+ be on the end of the new list and there's not a way to tell it's on a
+ new list and restart the list traversal.  I think that this can be
+ solved by pre-fetching the "next" field (with proper barriers) before
+ checking the key."
+
+2) Insert algo :
+----------------
+
+We need to make sure a reader cannot read the new 'obj->obj_next' value
+and the previous value of 'obj->key'. Otherwise, an item could be deleted
+from a chain and inserted into another chain. If the new chain was empty
+before the move, the 'next' pointer is NULL, and a lockless reader cannot
+detect that it missed the following items in the original chain.
+
+/*
+ * Please note that new inserts are done at the head of list,
+ * not in the middle or end.
+ */
+obj = kmem_cache_alloc(...);
+lock_chain(); // typically a spin_lock()
+obj->key = key;
+atomic_inc(&obj->refcnt);
+/*
+ * we need to make sure obj->key is updated before obj->next
+ */
+smp_wmb();
+hlist_add_head_rcu(&obj->obj_node, list);
+unlock_chain(); // typically a spin_unlock()
+
+
+3) Remove algo
+--------------
+Nothing special here, we can use a standard RCU hlist deletion.
+But beware that, thanks to SLAB_DESTROY_BY_RCU, a deleted object can be reused
+very quickly (before the end of the RCU grace period).
+
+if (put_last_reference_on(obj)) {
+   lock_chain(); // typically a spin_lock()
+   hlist_del_init_rcu(&obj->obj_node);
+   unlock_chain(); // typically a spin_unlock()
+   kmem_cache_free(cachep, obj);
+}
+
+
+
+--------------------------------------------------------------------------
+With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
+and extra smp_wmb() in insert function.
+
+For example, if we choose to store the slot number as the 'nulls'
+end-of-list marker for each slot of the hash table, we can detect
+a race (some writer deleted and/or moved an object
+to another chain) by checking the final 'nulls' value when
+the lookup reaches the end of a chain. If the final 'nulls' value
+is not the slot number, then we must restart the lookup at
+the beginning. If the object was moved to the same chain,
+then the reader doesn't care: it might eventually
+scan the list again without harm.
+
+
+1) lookup algo
+
+ head = &table[slot];
+ rcu_read_lock();
+begin:
+ hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
+   if (obj->key == key) {
+      if (!try_get_ref(obj)) // might fail for free objects
+         goto begin;
+      if (obj->key != key) { // not the object we expected
+         put_ref(obj);
+         goto begin;
+      }
+  goto out;
+ }
+/*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+   goto begin;
+ obj = NULL;
+
+out:
+ rcu_read_unlock();
+
+2) Insert function :
+--------------------
+
+/*
+ * Please note that new inserts are done at the head of list,
+ * not in the middle or end.
+ */
+obj = kmem_cache_alloc(cachep);
+lock_chain(); // typically a spin_lock()
+obj->key = key;
+atomic_set(&obj->refcnt, 1);
+/*
+ * insert obj in RCU way (readers might be traversing chain)
+ */
+hlist_nulls_add_head_rcu(&obj->obj_node, list);
+unlock_chain(); // typically a spin_unlock()

^ permalink raw reply related	[flat|nested] 134+ messages in thread

* Re: [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol
  2008-11-13 13:13                                                       ` [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol Eric Dumazet
@ 2008-11-13 17:20                                                         ` Andi Kleen
  2008-11-17  3:41                                                         ` David Miller
  1 sibling, 0 replies; 134+ messages in thread
From: Andi Kleen @ 2008-11-13 17:20 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Paul E. McKenney, Corey Minyard, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Peter Zijlstra, Evgeniy Polyakov, Christian Bell

Eric Dumazet <dada1@cosmosbay.com> writes:


> 1) Introduce hlist_nulls variant of hlist
>
>   hlist uses NULL value to finish a chain.
>   hlist_nulls variant use the low order bit set to 1 to signal an end marker.
>   This allows to store many different end markers, so that some RCU lockless
>   algos (used in TCP/UDP stack for example) can save some memory barriers in
>   fast paths.

Do you have any numbers that demonstrate the read memory barriers being
a performance problem? At least on x86 they should be very cheap
because they're normally nops.

-Andi

-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-13 13:14                                                       ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Eric Dumazet
  2008-11-13 13:29                                                         ` Peter Zijlstra
@ 2008-11-14 15:16                                                         ` Peter Zijlstra
  2008-11-19 17:01                                                         ` Paul E. McKenney
  2 siblings, 0 replies; 134+ messages in thread
From: Peter Zijlstra @ 2008-11-14 15:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, Paul E. McKenney, David Miller, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

On Thu, 2008-11-13 at 14:14 +0100, Eric Dumazet wrote:
> hlist uses NULL value to finish a chain.
> 
> hlist_nulls variant use the low order bit set to 1 to signal an end-of-list marker.
> 
> This allows to store many different end markers, so that some RCU lockless
> algos (used in TCP/UDP stack for example) can save some memory barriers in
> fast paths.
> 
> Two new files are added :
> 
> include/linux/list_nulls.h
>   - mimics hlist part of include/linux/list.h, derived to hlist_nulls variant
> 
> include/linux/rculist_nulls.h
>   - mimics hlist part of include/linux/rculist.h, derived to hlist_nulls variant
> 
>    Only four helpers are declared for the moment :
> 
>      hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
>      hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()
> 
> prefetches() were removed, since an end of list is not anymore NULL value.
> prefetches() could trigger useless (and possibly dangerous) memory transactions.
> 
> Example of use (extracted from __udp4_lib_lookup())
> 
>         struct sock *sk, *result;
>         struct hlist_nulls_node *node;
>         unsigned short hnum = ntohs(dport);
>         unsigned int hash = udp_hashfn(net, hnum);
>         struct udp_hslot *hslot = &udptable->hash[hash];
>         int score, badness;
> 
>         rcu_read_lock();
> begin:
>         result = NULL;
>         badness = -1;
>         sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>                 score = compute_score(sk, net, saddr, hnum, sport,
>                                       daddr, dport, dif);
>                 if (score > badness) {
>                         result = sk;
>                         badness = score;
>                 }
>         }
>         /*
>          * if the nulls value we got at the end of this lookup is
>          * not the expected one, we must restart lookup.
>          * We probably met an item that was moved to another chain.
>          */
>         if (get_nulls_value(node) != hash)
>                 goto begin;
> 
>         if (result) {
>                 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
>                         result = NULL;
>                 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
>                                   daddr, dport, dif) < badness)) {
>                         sock_put(result);
>                         goto begin;
>                 }
>         }
>         rcu_read_unlock();
>         return result;
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 4/3] rcu: documents rculist_nulls
  2008-11-13 16:02                                                             ` [PATCH 4/3] rcu: documents rculist_nulls Eric Dumazet
@ 2008-11-14 15:16                                                               ` Peter Zijlstra
  2008-11-17  3:36                                                                 ` David Miller
  2008-11-19 17:07                                                               ` Paul E. McKenney
  1 sibling, 1 reply; 134+ messages in thread
From: Peter Zijlstra @ 2008-11-14 15:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Corey Minyard, Paul E. McKenney, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

On Thu, 2008-11-13 at 17:02 +0100, Eric Dumazet wrote:
> 
Eric Dumazet wrote:
> > Peter Zijlstra wrote:
> >> So by not using some memory barriers (would be nice to have it
> >> illustrated which ones), we can race and end up on the wrong chain, in
> >> case that happens we detect this by using this per-chain terminator and
> >> try again.
> >>
> >> It would be really good to have it explained in the rculist_nulls.h
> >> comments what memory barriers are missing, what races they open, and how
> >> this special terminator trick closes that race.
> > 
> > OK, maybe I should add a Documentation/RCU/rculist_nulls.txt file with
> > appropriate examples and documentation.
> > 
> > (Say the lookup/insert algorithms, with standard hlist and memory barriers,
> > and with hlist_nulls without those two memory barriers.
> > 
> 
> [PATCH 4/3] rcu: documents rculist_nulls
> 
> Adds Documentation/RCU/rculist_nulls.txt file to describe how 'nulls'
> end-of-list can help in some RCU algos.
> 
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 4/3] rcu: documents rculist_nulls
  2008-11-14 15:16                                                               ` Peter Zijlstra
@ 2008-11-17  3:36                                                                 ` David Miller
  0 siblings, 0 replies; 134+ messages in thread
From: David Miller @ 2008-11-17  3:36 UTC (permalink / raw)
  To: a.p.zijlstra
  Cc: dada1, minyard, paulmck, shemminger, benny+usenet, netdev, cl,
	zbr, christian

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 14 Nov 2008 16:16:38 +0100

> On Thu, 2008-11-13 at 17:02 +0100, Eric Dumazet wrote:
> > 
> Eric Dumazet wrote:
> > > Peter Zijlstra wrote:
> > >> So by not using some memory barriers (would be nice to have it
> > >> illustrated which ones), we can race and end up on the wrong chain, in
> > >> case that happens we detect this by using this per-chain terminator and
> > >> try again.
> > >>
> > >> It would be really good to have it explained in the rculist_nulls.h
> > >> comments what memory barriers are missing, what races they open, and how
> > >> this special terminator trick closes that race.
> > > 
> > > OK, maybe I should add a Documentation/RCU/rculist_nulls.txt file with
> > > appropriate examples and documentation.
> > > 
> > > (Say the lookup/insert algorithms, with standard hlist and memory barriers,
> > > and with hlist_nulls without those two memory barriers.
> > > 
> > 
> > [PATCH 4/3] rcu: documents rculist_nulls
> > 
> > Adds Documentation/RCU/rculist_nulls.txt file to describe how 'nulls'
> > end-of-list can help in some RCU algos.
> > 
> > 
> > Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> 
> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

Since there seems to be consensus for these changes I'm going to merge this
stuff into net-next-2.6 so that I can add in the users that Eric has written.

Thanks everyone for the review and feedback.

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol
  2008-11-13 13:13                                                       ` [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol Eric Dumazet
  2008-11-13 17:20                                                         ` Andi Kleen
@ 2008-11-17  3:41                                                         ` David Miller
  2008-11-19 19:52                                                           ` Christoph Lameter
  1 sibling, 1 reply; 134+ messages in thread
From: David Miller @ 2008-11-17  3:41 UTC (permalink / raw)
  To: dada1
  Cc: paulmck, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, zbr, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 13 Nov 2008 14:13:20 +0100

> Here is a series of three patches (based on net-next-2.6), to continue work
> with RCU on UDP/TCP/DCCP stacks

All 4 patches applied to net-next-2.6, thanks a lot Eric!

These patches are incredibly cool!


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-13 13:14                                                       ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Eric Dumazet
  2008-11-13 13:29                                                         ` Peter Zijlstra
  2008-11-14 15:16                                                         ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Peter Zijlstra
@ 2008-11-19 17:01                                                         ` Paul E. McKenney
  2008-11-19 17:53                                                           ` Eric Dumazet
  2 siblings, 1 reply; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-19 17:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Evgeniy Polyakov,
	Peter Zijlstra, Christian Bell

On Thu, Nov 13, 2008 at 02:14:18PM +0100, Eric Dumazet wrote:
> hlist uses NULL value to finish a chain.
>
> hlist_nulls variant use the low order bit set to 1 to signal an end-of-list 
> marker.
>
> This allows to store many different end markers, so that some RCU lockless
> algos (used in TCP/UDP stack for example) can save some memory barriers in
> fast paths.
>
> Two new files are added :
>
> include/linux/list_nulls.h
>  - mimics hlist part of include/linux/list.h, derived to hlist_nulls 
> variant
>
> include/linux/rculist_nulls.h
>  - mimics hlist part of include/linux/rculist.h, derived to hlist_nulls 
> variant
>
>   Only four helpers are declared for the moment :
>
>     hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
>     hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()
>
> prefetches() were removed, since an end of list is not anymore NULL value.
> prefetches() could trigger useless (and possibly dangerous) memory 
> transactions.
>
> Example of use (extracted from __udp4_lib_lookup())
>
> 	struct sock *sk, *result;
>        struct hlist_nulls_node *node;
>        unsigned short hnum = ntohs(dport);
>        unsigned int hash = udp_hashfn(net, hnum);
>        struct udp_hslot *hslot = &udptable->hash[hash];
>        int score, badness;
>
>        rcu_read_lock();
> begin:
>        result = NULL;
>        badness = -1;
>        sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>                score = compute_score(sk, net, saddr, hnum, sport,
>                                      daddr, dport, dif);
>                if (score > badness) {
>                        result = sk;
>                        badness = score;
>                }
>        }
>        /*
>         * if the nulls value we got at the end of this lookup is
>         * not the expected one, we must restart lookup.
>         * We probably met an item that was moved to another chain.
>         */
>        if (get_nulls_value(node) != hash)
>                goto begin;
>
>        if (result) {
>                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
>                        result = NULL;
>                else if (unlikely(compute_score(result, net, saddr, hnum, 
> sport,
>                                  daddr, dport, dif) < badness)) {
>                        sock_put(result);
>                        goto begin;
>                }
>        }
>        rcu_read_unlock();
>        return result;

Looks good, but a few questions and suggestions interspersed below.

							Thanx, Paul

> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
> include/linux/list_nulls.h    |   94 +++++++++++++++++++++++++++
> include/linux/rculist_nulls.h |  110 ++++++++++++++++++++++++++++++++
> 2 files changed, 204 insertions(+)

> diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
> new file mode 100644
> index 0000000..856dee8
> --- /dev/null
> +++ b/include/linux/list_nulls.h
> @@ -0,0 +1,94 @@
> +#ifndef _LINUX_LIST_NULLS_H
> +#define _LINUX_LIST_NULLS_H
> +
> +/*
> + * Special version of lists, where end of list is not a NULL pointer,
> + * but a 'nulls' marker, which can have many different values.
> + * (up to 2^31 different values guaranteed on all platforms)
> + *
> + * In the standard hlist, termination of a list is the NULL pointer.
> + * In this special 'nulls' variant, we use the fact that objects stored in
> + * a list are aligned on a word (4 or 8 bytes alignment).
> + * We therefore use the least significant bit of 'ptr' :
> + * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
> + * Set to 0 : This is a pointer to some object (ptr)
> + */
> +
> +struct hlist_nulls_head {
> +	struct hlist_nulls_node *first;
> +};
> +
> +struct hlist_nulls_node {
> +	struct hlist_nulls_node *next, **pprev;
> +};
> +#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
> +	((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))
> +
> +#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
> +/**
> + * is_a_nulls - Test if a ptr is a nulls marker
> + * @ptr: ptr to be tested
> + *
> + */
> +static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
> +{
> +	return ((unsigned long)ptr & 1);
> +}
> +
> +/**
> + * get_nulls_value - Get the 'nulls' value of the end of chain
> + * @ptr: end of chain
> + *
> + * Should be called only if is_a_nulls(ptr);
> + */
> +static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
> +{
> +	return ((unsigned long)ptr) >> 1;
> +}
> +
> +static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
> +{
> +	return !h->pprev;
> +}
> +
> +static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
> +{
> +	return is_a_nulls(h->first);
> +}
> +
> +static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
> +{
> +	struct hlist_nulls_node *next = n->next;
> +	struct hlist_nulls_node **pprev = n->pprev;
> +	*pprev = next;
> +	if (!is_a_nulls(next))
> +		next->pprev = pprev;
> +}
> +
> +/**
> + * hlist_nulls_for_each_entry	- iterate over list of given type
> + * @tpos:	the type * to use as a loop cursor.
> + * @pos:	the &struct hlist_node to use as a loop cursor.
> + * @head:	the head for your list.
> + * @member:	the name of the hlist_node within the struct.
> + *
> + */
> +#define hlist_nulls_for_each_entry(tpos, pos, head, member)		       \
> +	for (pos = (head)->first;					       \
> +	     (!is_a_nulls(pos)) &&					       \
> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
> +	     pos = pos->next)
> +
> +/**
> + * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
> + * @tpos:	the type * to use as a loop cursor.
> + * @pos:	the &struct hlist_node to use as a loop cursor.

And @pos is the starting point, correct?  Suggest something like:

	@pos:	the &struct hlist_node serving as starting point and cursor

> + * @member:	the name of the hlist_node within the struct.
> + *
> + */
> +#define hlist_nulls_for_each_entry_from(tpos, pos, member)	\
> +	for (; (!is_a_nulls(pos)) && 				\
> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
> +	     pos = pos->next)
> +
> +#endif
> diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
> new file mode 100644
> index 0000000..b185ac4
> --- /dev/null
> +++ b/include/linux/rculist_nulls.h
> @@ -0,0 +1,110 @@
> +#ifndef _LINUX_RCULIST_NULLS_H
> +#define _LINUX_RCULIST_NULLS_H
> +
> +#ifdef __KERNEL__
> +
> +/*
> + * RCU-protected list version
> + */
> +#include <linux/list_nulls.h>
> +#include <linux/rcupdate.h>
> +
> +/**
> + * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
> + * @n: the element to delete from the hash list.
> + *
> + * Note: hlist_nulls_unhashed() on the node returns true after this. It is
> + * useful for RCU based read lockfree traversal if the writer side
> + * must know if the list entry is still hashed or already unhashed.
> + *
> + * In particular, it means that we can not poison the forward pointers
> + * that may still be used for walking the hash list and we can only
> + * zero the pprev pointer so list_unhashed() will return true after
> + * this.
> + *
> + * The caller must take whatever precautions are necessary (such as
> + * holding appropriate locks) to avoid racing with another
> + * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
> + * hlist_nulls_del_rcu(), running on this same list.  However, it is
> + * perfectly legal to run concurrently with the _rcu list-traversal
> + * primitives, such as hlist_nulls_for_each_entry_rcu().
> + */
> +static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
> +{
> +	if (!hlist_nulls_unhashed(n)) {
> +		__hlist_nulls_del(n);
> +		n->pprev = NULL;
> +	}
> +}

The point here is to allow an RCU reader to grab the update-side lock
while holding a reference to an hlist_nulls_node, and then be able to
blindly call hlist_nulls_del_init_rcu() without having to do any complex
check to see if the element has already been deleted?

But this only works if each free operation waits for a grace period.
If using SLAB_DESTROY_BY_RCU, the would-be deleter still needs to
revalidate after grabbing the update-side lock, right?  Hmmm...
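
A minimal sketch of the revalidation step in question (assuming a
SLAB_DESTROY_BY_RCU object with 'obj->key' and 'obj->obj_node' fields, where
the key selects the chain; lock_chain()/unlock_chain() are illustrative
placeholders, not existing kernel APIs):

	/*
	 * The reader found 'obj' under rcu_read_lock() and now wants to
	 * unlink it.  With SLAB_DESTROY_BY_RCU the slab may recycle the
	 * object before a grace period elapses, so its identity must be
	 * re-checked under the update-side lock; the key check also
	 * confirms the object still belongs to the chain we locked.
	 */
	lock_chain();			/* update-side (per-chain) lock */
	if (!hlist_nulls_unhashed(&obj->obj_node) && obj->key == key)
		hlist_nulls_del_init_rcu(&obj->obj_node);
	else
		obj = NULL;		/* lost the race: already deleted or reused */
	unlock_chain();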

> +
> +/**
> + * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
> + * @n: the element to delete from the hash list.
> + *
> + * Note: hlist_nulls_unhashed() on entry does not return true after this,
> + * the entry is in an undefined state. It is useful for RCU based
> + * lockfree traversal.
> + *
> + * In particular, it means that we can not poison the forward
> + * pointers that may still be used for walking the hash list.
> + *
> + * The caller must take whatever precautions are necessary
> + * (such as holding appropriate locks) to avoid racing
> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
> + * or hlist_nulls_del_rcu(), running on this same list.
> + * However, it is perfectly legal to run concurrently with
> + * the _rcu list-traversal primitives, such as
> + * hlist_nulls_for_each_entry().
> + */
> +static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
> +{
> +	__hlist_nulls_del(n);
> +	n->pprev = LIST_POISON2;
> +}
> +
> +/**
> + * hlist_nulls_add_head_rcu
> + * @n: the element to add to the hash list.
> + * @h: the list to add to.
> + *
> + * Description:
> + * Adds the specified element to the specified hlist_nulls,
> + * while permitting racing traversals.
> + *
> + * The caller must take whatever precautions are necessary
> + * (such as holding appropriate locks) to avoid racing
> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
> + * or hlist_nulls_del_rcu(), running on this same list.
> + * However, it is perfectly legal to run concurrently with
> + * the _rcu list-traversal primitives, such as
> + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
> + * problems on Alpha CPUs.  Regardless of the type of CPU, the
> + * list-traversal primitive must be guarded by rcu_read_lock().
> + */
> +static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
> +					struct hlist_nulls_head *h)
> +{
> +	struct hlist_nulls_node *first = h->first;
> +
> +	n->next = first;
> +	n->pprev = &h->first;
> +	rcu_assign_pointer(h->first, n);
> +	if (!is_a_nulls(first))
> +		first->pprev = &n->next;
> +}
> +/**
> + * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
> + * @tpos:	the type * to use as a loop cursor.
> + * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
> + * @head:	the head for your list.
> + * @member:	the name of the hlist_nulls_node within the struct.
> + *
> + */
> +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
> +	for (pos = rcu_dereference((head)->first);			 \
> +		(!is_a_nulls(pos)) && 			\
> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
> +		pos = rcu_dereference(pos->next))
> +
> +#endif
> +#endif

Any chance of using a trick like Oleg used to get rid of the "pos"
argument?  http://lkml.org/lkml/2008/3/12/47

The hlist_nulls_node must always be at an even address, correct?
Couldn't this fact be used to allow testing for is_a_nulls() on tpos
rather than on pos?  Or is there a better way to approach this?

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 4/3] rcu: documents rculist_nulls
  2008-11-13 16:02                                                             ` [PATCH 4/3] rcu: documents rculist_nulls Eric Dumazet
  2008-11-14 15:16                                                               ` Peter Zijlstra
@ 2008-11-19 17:07                                                               ` Paul E. McKenney
  1 sibling, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-19 17:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, David Miller, Corey Minyard, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Christian Bell

On Thu, Nov 13, 2008 at 05:02:48PM +0100, Eric Dumazet wrote:
> Eric Dumazet a écrit :
>> Peter Zijlstra a écrit :
>>> So by not using some memory barriers (would be nice to have it
>>> illustrated which ones), we can race and end up on the wrong chain, in
>>> case that happens we detect this by using this per-chain terminator and
>>> try again.
>>>
>>> It would be really good to have it explained in the rculist_nulls.h
>>> comments what memory barriers are missing, what races they open, and how
>>> the this special terminator trick closes that race.
>> OK, maybe I should add a Documentation/RCU/rculist_nulls.txt file with
>> appropriate examples and documentation.
>> (Say the lookup/insert algorithms, with standard hlist and memory 
>> barriers,
>> and with hlist_nulls without those two memory barriers.
>
> [PATCH 4/3] rcu: documents rculist_nulls

Very good -- only one small suggestion below.

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Adds Documentation/RCU/rculist_nulls.txt file to describe how 'nulls'
> end-of-list can help in some RCU algos.
>
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
> Documentation/RCU/rculist_nulls.txt |  167 ++++++++++++++++++++++++++
> 1 files changed, 167 insertions(+)

> diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
> new file mode 100644
> index 0000000..5db5549
> --- /dev/null
> +++ b/Documentation/RCU/rculist_nulls.txt
> @@ -0,0 +1,167 @@
> +Using hlist_nulls to protect read-mostly linked lists and
> +objects using SLAB_DESTROY_BY_RCU allocations.
> +
> +Please read the basics in Documentation/RCU/listRCU.txt
> +
> +Using special markers (called 'nulls') is a convenient way
> +to solve the following problem :
> +
> +A typical RCU linked list managing objects which are
> +allocated with a SLAB_DESTROY_BY_RCU kmem_cache can
> +use the following algos :
> +
> +1) Lookup algo
> +--------------
> +rcu_read_lock()
> +begin:
> +obj = lockless_lookup(key);
> +if (obj) {
> +  if (!try_get_ref(obj)) // might fail for free objects
> +    goto begin;
> +  /*
> +   * Because a writer could delete the object, and another writer
> +   * could reuse it before an RCU grace period has elapsed, we
> +   * must check the key after getting the reference on the object.
> +   */
> +  if (obj->key != key) { // not the object we expected

In some cases, a "generation number" will be needed.  For example, there
are algorithms that must detect when an object has been removed and then
re-inserted with the same key.  One increments the generation number
on each free and sometimes also on each allocation.
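
A rough sketch of that variant of the lookup check, assuming the object
carries an 'obj->gen' counter that the writer bumps on each free (the field
name and the snapshot/re-check pattern are illustrative additions, not part
of this patch; memory-ordering details are elided):

  gen = obj->gen;                      // snapshot the generation first
  if (!try_get_ref(obj))               // might fail for free objects
    goto begin;
  if (obj->key != key || obj->gen != gen) {
    // freed (and possibly re-inserted with the same key) in the meantime
    put_ref(obj);
    goto begin;
  }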

> +     put_ref(obj);
> +     goto begin;
> +   }
> +}
> +rcu_read_unlock();
> +
> +Beware that lockless_lookup(key) cannot use the traditional hlist_for_each_entry_rcu();
> +it needs a version with an additional memory barrier (smp_rmb()) :
> +
> +lockless_lookup(key)
> +{
> +   struct hlist_node *node, *next;
> +   for (pos = rcu_dereference((head)->first);
> +          pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
> +          ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); 
> +          pos = rcu_dereference(next))
> +      if (obj->key == key)
> +         return obj;
> +   return NULL;
> +
> +And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb() :
> +
> +   struct hlist_node *node;
> +   for (pos = rcu_dereference((head)->first);
> +		pos && ({ prefetch(pos->next); 1; }) &&
> +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
> +		pos = rcu_dereference(pos->next))
> +      if (obj->key == key)
> +         return obj;
> +   return NULL;
> +}
> +
> +Quoting Corey Minyard :
> +
> +"If the object is moved from one list to another list in-between the
> + time the hash is calculated and the next field is accessed, and the
> + object has moved to the end of a new list, the traversal will not
> + complete properly on the list it should have, since the object will
> + be on the end of the new list and there's not a way to tell it's on a
> + new list and restart the list traversal.  I think that this can be
> + solved by pre-fetching the "next" field (with proper barriers) before
> + checking the key."
> +
> +2) Insert algo :
> +----------------
> +
> +We need to make sure a reader cannot read the new 'obj->obj_next' value
> +together with the previous value of 'obj->key'. Otherwise, an item could
> +be deleted from a chain and inserted into another chain. If the new chain
> +was empty before the move, its 'next' pointer is NULL, and the lockless
> +reader cannot detect that it missed the following items in the original chain.
> +
> +/*
> + * Please note that new inserts are done at the head of list,
> + * not in the middle or end.
> + */
> +obj = kmem_cache_alloc(...);
> +lock_chain(); // typically a spin_lock()
> +obj->key = key;
> +atomic_inc(&obj->refcnt);
> +/*
> + * we need to make sure obj->key is updated before obj->next
> + */
> +smp_wmb();
> +hlist_add_head_rcu(&obj->obj_node, list);
> +unlock_chain(); // typically a spin_unlock()
> +
> +
> +3) Remove algo
> +--------------
> +Nothing special here, we can use a standard RCU hlist deletion.
> +But because of SLAB_DESTROY_BY_RCU, beware that a deleted object can be
> +reused very quickly (before the end of the RCU grace period).
> +
> +if (put_last_reference_on(obj)) {
> +   lock_chain(); // typically a spin_lock()
> +   hlist_del_init_rcu(&obj->obj_node);
> +   unlock_chain(); // typically a spin_unlock()
> +   kmem_cache_free(cachep, obj);
> +}
> +
> +
> +
> +--------------------------------------------------------------------------
> +With hlist_nulls we can avoid the extra smp_rmb() in lockless_lookup()
> +and the extra smp_wmb() in the insert function.
> +
> +For example, if we choose to store the slot number as the 'nulls'
> +end-of-list marker for each slot of the hash table, we can detect
> +a race (some writer did a delete and/or a move of an object
> +to another chain) by checking the final 'nulls' value when
> +the lookup reaches the end of a chain. If the final 'nulls' value
> +is not the slot number, then we must restart the lookup at
> +the beginning. If the object was moved to the same chain,
> +then the reader doesn't care : it might eventually
> +scan the list again without harm.
> +
> +
> +1) lookup algo
> +
> + head = &table[slot];
> + rcu_read_lock();
> +begin:
> + hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
> +   if (obj->key == key) {
> +      if (!try_get_ref(obj)) // might fail for free objects
> +         goto begin;
> +      if (obj->key != key) { // not the object we expected
> +         put_ref(obj);
> +         goto begin;
> +      }
> +      goto out;
> +   }
> + }
> +/*
> + * if the nulls value we got at the end of this lookup is
> + * not the expected one, we must restart lookup.
> + * We probably met an item that was moved to another chain.
> + */
> + if (get_nulls_value(node) != slot)
> +   goto begin;
> + obj = NULL;
> +
> +out:
> + rcu_read_unlock();
> +
> +2) Insert function :
> +--------------------
> +
> +/*
> + * Please note that new inserts are done at the head of list,
> + * not in the middle or end.
> + */
> +obj = kmem_cache_alloc(cachep);
> +lock_chain(); // typically a spin_lock()
> +obj->key = key;
> +atomic_set(&obj->refcnt, 1);
> +/*
> + * insert obj in RCU way (readers might be traversing chain)
> + */
> +hlist_nulls_add_head_rcu(&obj->obj_node, list);
> +unlock_chain(); // typically a spin_unlock()


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/3] udp: Use hlist_nulls in UDP RCU code
  2008-11-13 13:15                                                       ` [PATCH 2/3] udp: Use hlist_nulls in UDP RCU code Eric Dumazet
@ 2008-11-19 17:29                                                         ` Paul E. McKenney
  2008-11-19 17:53                                                           ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-19 17:29 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

On Thu, Nov 13, 2008 at 02:15:19PM +0100, Eric Dumazet wrote:
> This is a straightforward patch, using hlist_nulls infrastructure.
>
> RCUification already done on UDP two weeks ago.
>
> Using hlist_nulls permits us to avoid some memory barriers, both
> at lookup time and delete time.
>
> Patch is large because it adds new macros to include/net/sock.h.
> These macros will be used by TCP & DCCP in next patch.

Looks good, one question below about the lockless searches.  If the
answer is that the search must complete undisturbed by deletions and
additions, then:

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
> include/linux/rculist.h |   17 -----------
> include/net/sock.h      |   57 ++++++++++++++++++++++++++++++--------
> include/net/udp.h       |    2 -
> net/ipv4/udp.c          |   47 ++++++++++++++-----------------
> net/ipv6/udp.c          |   26 +++++++++--------
> 5 files changed, 83 insertions(+), 66 deletions(-)
>

> diff --git a/include/linux/rculist.h b/include/linux/rculist.h
> index 3ba2998..e649bd3 100644
> --- a/include/linux/rculist.h
> +++ b/include/linux/rculist.h
> @@ -383,22 +383,5 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
>  		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
>  		pos = rcu_dereference(pos->next))
> 
> -/**
> - * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
> - * @tpos:	the type * to use as a loop cursor.
> - * @pos:	the &struct hlist_node to use as a loop cursor.
> - * @head:	the head for your list.
> - * @member:	the name of the hlist_node within the struct.
> - * @next:       the &struct hlist_node to use as a next cursor
> - *
> - * Special version of hlist_for_each_entry_rcu that make sure
> - * each next pointer is fetched before each iteration.
> - */
> -#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
> -	for (pos = rcu_dereference((head)->first);			 \
> -		pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&	\
> -		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
> -		pos = rcu_dereference(next))
> -

Bonus points for getting rid of a list primitive!!!  ;-)

>  #endif	/* __KERNEL__ */
>  #endif
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 8b2b821..0a63894 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -42,6 +42,7 @@
> 
>  #include <linux/kernel.h>
>  #include <linux/list.h>
> +#include <linux/list_nulls.h>
>  #include <linux/timer.h>
>  #include <linux/cache.h>
>  #include <linux/module.h>
> @@ -52,6 +53,7 @@
>  #include <linux/security.h>
> 
>  #include <linux/filter.h>
> +#include <linux/rculist_nulls.h>
> 
>  #include <asm/atomic.h>
>  #include <net/dst.h>
> @@ -106,6 +108,7 @@ struct net;
>   *	@skc_reuse: %SO_REUSEADDR setting
>   *	@skc_bound_dev_if: bound device index if != 0
>   *	@skc_node: main hash linkage for various protocol lookup tables
> + *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
>   *	@skc_bind_node: bind hash linkage for various protocol lookup tables
>   *	@skc_refcnt: reference count
>   *	@skc_hash: hash value used with various protocol lookup tables
> @@ -120,7 +123,10 @@ struct sock_common {
>  	volatile unsigned char	skc_state;
>  	unsigned char		skc_reuse;
>  	int			skc_bound_dev_if;
> -	struct hlist_node	skc_node;
> +	union {
> +		struct hlist_node	skc_node;
> +		struct hlist_nulls_node skc_nulls_node;
> +	};
>  	struct hlist_node	skc_bind_node;
>  	atomic_t		skc_refcnt;
>  	unsigned int		skc_hash;
> @@ -206,6 +212,7 @@ struct sock {
>  #define sk_reuse		__sk_common.skc_reuse
>  #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
>  #define sk_node			__sk_common.skc_node
> +#define sk_nulls_node		__sk_common.skc_nulls_node
>  #define sk_bind_node		__sk_common.skc_bind_node
>  #define sk_refcnt		__sk_common.skc_refcnt
>  #define sk_hash			__sk_common.skc_hash
> @@ -300,12 +307,30 @@ static inline struct sock *sk_head(const struct hlist_head *head)
>  	return hlist_empty(head) ? NULL : __sk_head(head);
>  }
> 
> +static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
> +{
> +	return hlist_nulls_entry(head->first, struct sock, sk_nulls_node);
> +}
> +
> +static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
> +{
> +	return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head);
> +}
> +
>  static inline struct sock *sk_next(const struct sock *sk)
>  {
>  	return sk->sk_node.next ?
>  		hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
>  }
> 
> +static inline struct sock *sk_nulls_next(const struct sock *sk)
> +{
> +	return (!is_a_nulls(sk->sk_nulls_node.next)) ?
> +		hlist_nulls_entry(sk->sk_nulls_node.next,
> +				  struct sock, sk_nulls_node) :
> +		NULL;
> +}
> +
>  static inline int sk_unhashed(const struct sock *sk)
>  {
>  	return hlist_unhashed(&sk->sk_node);
> @@ -321,6 +346,11 @@ static __inline__ void sk_node_init(struct hlist_node *node)
>  	node->pprev = NULL;
>  }
> 
> +static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node)
> +{
> +	node->pprev = NULL;
> +}
> +
>  static __inline__ void __sk_del_node(struct sock *sk)
>  {
>  	__hlist_del(&sk->sk_node);
> @@ -367,18 +397,18 @@ static __inline__ int sk_del_node_init(struct sock *sk)
>  	return rc;
>  }
> 
> -static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
> +static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk)
>  {
>  	if (sk_hashed(sk)) {
> -		hlist_del_init_rcu(&sk->sk_node);
> +		hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
>  		return 1;
>  	}
>  	return 0;
>  }
> 
> -static __inline__ int sk_del_node_init_rcu(struct sock *sk)
> +static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk)
>  {
> -	int rc = __sk_del_node_init_rcu(sk);
> +	int rc = __sk_nulls_del_node_init_rcu(sk);
> 
>  	if (rc) {
>  		/* paranoid for a while -acme */
> @@ -399,15 +429,15 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
>  	__sk_add_node(sk, list);
>  }
> 
> -static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
> +static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
>  {
> -	hlist_add_head_rcu(&sk->sk_node, list);
> +	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
>  }
> 
> -static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
> +static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
>  {
>  	sock_hold(sk);
> -	__sk_add_node_rcu(sk, list);
> +	__sk_nulls_add_node_rcu(sk, list);
>  }
> 
>  static __inline__ void __sk_del_bind_node(struct sock *sk)
> @@ -423,11 +453,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
> 
>  #define sk_for_each(__sk, node, list) \
>  	hlist_for_each_entry(__sk, node, list, sk_node)
> -#define sk_for_each_rcu_safenext(__sk, node, list, next) \
> -	hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
> +#define sk_nulls_for_each(__sk, node, list) \
> +	hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
> +#define sk_nulls_for_each_rcu(__sk, node, list) \
> +	hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
>  #define sk_for_each_from(__sk, node) \
>  	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
>  		hlist_for_each_entry_from(__sk, node, sk_node)
> +#define sk_nulls_for_each_from(__sk, node) \
> +	if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
> +		hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
>  #define sk_for_each_continue(__sk, node) \
>  	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
>  		hlist_for_each_entry_continue(__sk, node, sk_node)
> diff --git a/include/net/udp.h b/include/net/udp.h
> index df2bfe5..90e6ce5 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -51,7 +51,7 @@ struct udp_skb_cb {
>  #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
> 
>  struct udp_hslot {
> -	struct hlist_head	head;
> +	struct hlist_nulls_head	head;
>  	spinlock_t		lock;
>  } __attribute__((aligned(2 * sizeof(long))));
>  struct udp_table {
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 54badc9..fea2d87 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
>  						 const struct sock *sk2))
>  {
>  	struct sock *sk2;
> -	struct hlist_node *node;
> +	struct hlist_nulls_node *node;
> 
> -	sk_for_each(sk2, node, &hslot->head)
> +	sk_nulls_for_each(sk2, node, &hslot->head)
>  		if (net_eq(sock_net(sk2), net)			&&
>  		    sk2 != sk					&&
>  		    sk2->sk_hash == num				&&
> @@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
>  	inet_sk(sk)->num = snum;
>  	sk->sk_hash = snum;
>  	if (sk_unhashed(sk)) {
> -		/*
> -		 * We need that previous write to sk->sk_hash committed
> -		 * before write to sk->next done in following add_node() variant
> -		 */
> -		smp_wmb();
> -		sk_add_node_rcu(sk, &hslot->head);
> +		sk_nulls_add_node_rcu(sk, &hslot->head);
>  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  	}
>  	error = 0;
> @@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
>  		int dif, struct udp_table *udptable)
>  {
>  	struct sock *sk, *result;
> -	struct hlist_node *node, *next;
> +	struct hlist_nulls_node *node;
>  	unsigned short hnum = ntohs(dport);
>  	unsigned int hash = udp_hashfn(net, hnum);
>  	struct udp_hslot *hslot = &udptable->hash[hash];
> @@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
>  begin:
>  	result = NULL;
>  	badness = -1;
> -	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
> -		/*
> -		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
> -		 * We must check this item was not moved to another chain
> -		 */
> -		if (udp_hashfn(net, sk->sk_hash) != hash)
> -			goto begin;
> +	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>  		score = compute_score(sk, net, saddr, hnum, sport,
>  				      daddr, dport, dif);
>  		if (score > badness) {
> @@ -285,6 +274,14 @@ begin:
>  			badness = score;
>  		}
>  	}
> +	/*
> +	 * if the nulls value we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 * We probably met an item that was moved to another chain.
> +	 */
> +	if (get_nulls_value(node) != hash)
> +		goto begin;
> +

Shouldn't this check go -after- the check for "result"?  Or is this a
case where the readers absolutely must have traversed a chain without
modification to be guaranteed of finding the correct result?

>  	if (result) {
>  		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
>  			result = NULL;
> @@ -325,11 +322,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
>  					     __be16 rmt_port, __be32 rmt_addr,
>  					     int dif)
>  {
> -	struct hlist_node *node;
> +	struct hlist_nulls_node *node;
>  	struct sock *s = sk;
>  	unsigned short hnum = ntohs(loc_port);
> 
> -	sk_for_each_from(s, node) {
> +	sk_nulls_for_each_from(s, node) {
>  		struct inet_sock *inet = inet_sk(s);
> 
>  		if (!net_eq(sock_net(s), net)				||
> @@ -977,7 +974,7 @@ void udp_lib_unhash(struct sock *sk)
>  	struct udp_hslot *hslot = &udptable->hash[hash];
> 
>  	spin_lock_bh(&hslot->lock);
> -	if (sk_del_node_init_rcu(sk)) {
> +	if (sk_nulls_del_node_init_rcu(sk)) {
>  		inet_sk(sk)->num = 0;
>  		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>  	}
> @@ -1130,7 +1127,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  	int dif;
> 
>  	spin_lock(&hslot->lock);
> -	sk = sk_head(&hslot->head);
> +	sk = sk_nulls_head(&hslot->head);
>  	dif = skb->dev->ifindex;
>  	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
>  	if (sk) {
> @@ -1139,7 +1136,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  		do {
>  			struct sk_buff *skb1 = skb;
> 
> -			sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest,
> +			sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
>  						   daddr, uh->source, saddr,
>  						   dif);
>  			if (sknext)
> @@ -1560,10 +1557,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
>  	struct net *net = seq_file_net(seq);
> 
>  	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
> -		struct hlist_node *node;
> +		struct hlist_nulls_node *node;
>  		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
>  		spin_lock_bh(&hslot->lock);
> -		sk_for_each(sk, node, &hslot->head) {
> +		sk_nulls_for_each(sk, node, &hslot->head) {
>  			if (!net_eq(sock_net(sk), net))
>  				continue;
>  			if (sk->sk_family == state->family)
> @@ -1582,7 +1579,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
>  	struct net *net = seq_file_net(seq);
> 
>  	do {
> -		sk = sk_next(sk);
> +		sk = sk_nulls_next(sk);
>  	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
> 
>  	if (!sk) {
> @@ -1753,7 +1750,7 @@ void __init udp_table_init(struct udp_table *table)
>  	int i;
> 
>  	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
> -		INIT_HLIST_HEAD(&table->hash[i].head);
> +		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
>  		spin_lock_init(&table->hash[i].lock);
>  	}
>  }
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 8dafa36..fd2d9ad 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
>  				      int dif, struct udp_table *udptable)
>  {
>  	struct sock *sk, *result;
> -	struct hlist_node *node, *next;
> +	struct hlist_nulls_node *node;
>  	unsigned short hnum = ntohs(dport);
>  	unsigned int hash = udp_hashfn(net, hnum);
>  	struct udp_hslot *hslot = &udptable->hash[hash];
> @@ -108,19 +108,21 @@ static struct sock *__udp6_lib_lookup(struct net *net,
>  begin:
>  	result = NULL;
>  	badness = -1;
> -	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
> -		/*
> -		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
> -		 * We must check this item was not moved to another chain
> -		 */
> -		if (udp_hashfn(net, sk->sk_hash) != hash)
> -			goto begin;
> +	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>  		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
>  		if (score > badness) {
>  			result = sk;
>  			badness = score;
>  		}
>  	}
> +	/*
> +	 * if the nulls value we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 * We probably met an item that was moved to another chain.
> +	 */
> +	if (get_nulls_value(node) != hash)
> +		goto begin;
> +

Same question as before...

>  	if (result) {
>  		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
>  			result = NULL;
> @@ -374,11 +376,11 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
>  				      __be16 rmt_port, struct in6_addr *rmt_addr,
>  				      int dif)
>  {
> -	struct hlist_node *node;
> +	struct hlist_nulls_node *node;
>  	struct sock *s = sk;
>  	unsigned short num = ntohs(loc_port);
> 
> -	sk_for_each_from(s, node) {
> +	sk_nulls_for_each_from(s, node) {
>  		struct inet_sock *inet = inet_sk(s);
> 
>  		if (!net_eq(sock_net(s), net))
> @@ -423,7 +425,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  	int dif;
> 
>  	spin_lock(&hslot->lock);
> -	sk = sk_head(&hslot->head);
> +	sk = sk_nulls_head(&hslot->head);
>  	dif = inet6_iif(skb);
>  	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
>  	if (!sk) {
> @@ -432,7 +434,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
>  	}
> 
>  	sk2 = sk;
> -	while ((sk2 = udp_v6_mcast_next(net, sk_next(sk2), uh->dest, daddr,
> +	while ((sk2 = udp_v6_mcast_next(net, sk_nulls_next(sk2), uh->dest, daddr,
>  					uh->source, saddr, dif))) {
>  		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
>  		if (buff) {


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 2/3] udp: Use hlist_nulls in UDP RCU code
  2008-11-19 17:29                                                         ` Paul E. McKenney
@ 2008-11-19 17:53                                                           ` Eric Dumazet
  0 siblings, 0 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-11-19 17:53 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

Paul E. McKenney a écrit :
> On Thu, Nov 13, 2008 at 02:15:19PM +0100, Eric Dumazet wrote:
>> This is a straightforward patch, using hlist_nulls infrastructure.
>>
>> RCUification already done on UDP two weeks ago.
>>
>> Using hlist_nulls permits us to avoid some memory barriers, both
>> at lookup time and delete time.
>>
>> Patch is large because it adds new macros to include/net/sock.h.
>> These macros will be used by TCP & DCCP in next patch.
> 
> Looks good, one question below about the lockless searches.  If the
> answer is that the search must complete undisturbed by deletions and
> additions, then:
> 
> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>> ---
>> include/linux/rculist.h |   17 -----------
>> include/net/sock.h      |   57 ++++++++++++++++++++++++++++++--------
>> include/net/udp.h       |    2 -
>> net/ipv4/udp.c          |   47 ++++++++++++++-----------------
>> net/ipv6/udp.c          |   26 +++++++++--------
>> 5 files changed, 83 insertions(+), 66 deletions(-)
>>
> 
...
>>  	result = NULL;
>>  	badness = -1;
>> -	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
>> -		/*
>> -		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
>> -		 * We must check this item was not moved to another chain
>> -		 */
>> -		if (udp_hashfn(net, sk->sk_hash) != hash)
>> -			goto begin;
>> +	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>>  		score = compute_score(sk, net, saddr, hnum, sport,
>>  				      daddr, dport, dif);
>>  		if (score > badness) {
>> @@ -285,6 +274,14 @@ begin:
>>  			badness = score;
>>  		}
>>  	}
>> +	/*
>> +	 * if the nulls value we got at the end of this lookup is
>> +	 * not the expected one, we must restart lookup.
>> +	 * We probably met an item that was moved to another chain.
>> +	 */
>> +	if (get_nulls_value(node) != hash)
>> +		goto begin;
>> +
> 
> Shouldn't this check go -after- the check for "result"?  Or is this a
> case where the readers absolutely must have traversed a chain without
> modification to be guaranteed of finding the correct result?

Very good question

Yes, we really have to look at all the sockets to find the one with the highest score,
not settle for one with a lower score while the highest-scoring one goes unexamined
because we never reached it.

So we really must check that we finished our lookup on the right chain end, not on an
alien one.

(The previous UDP code had a shortcut for when we found a socket with the maximal
possible score; I deleted this test as it had basically a 0.0001 % chance of being hit.)

Thanks a lot for your patient review Paul.


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-19 17:01                                                         ` Paul E. McKenney
@ 2008-11-19 17:53                                                           ` Eric Dumazet
  2008-11-19 18:46                                                             ` Paul E. McKenney
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-11-19 17:53 UTC (permalink / raw)
  To: paulmck
  Cc: Corey Minyard, David Miller, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Evgeniy Polyakov,
	Peter Zijlstra, Christian Bell

Paul E. McKenney a écrit :
> On Thu, Nov 13, 2008 at 02:14:18PM +0100, Eric Dumazet wrote:
>> hlist uses NULL value to finish a chain.
>>
>> The hlist_nulls variant uses the low order bit set to 1 to signal an
>> end-of-list marker.
>>
>> This allows storing many different end markers, so that some lockless RCU
>> algos (used in the TCP/UDP stack, for example) can save some memory barriers
>> in fast paths.
>>
>> Two new files are added :
>>
>> include/linux/list_nulls.h
>>  - mimics hlist part of include/linux/list.h, derived to hlist_nulls 
>> variant
>>
>> include/linux/rculist_nulls.h
>>  - mimics hlist part of include/linux/rculist.h, derived to hlist_nulls 
>> variant
>>
>>   Only four helpers are declared for the moment :
>>
>>     hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
>>     hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()
>>
>> prefetch()es were removed, since the end of a list is no longer a NULL value.
>> prefetch()es could trigger useless (and possibly dangerous) memory
>> transactions.
>>
>> Example of use (extracted from __udp4_lib_lookup())
>>
>> 	struct sock *sk, *result;
>>        struct hlist_nulls_node *node;
>>        unsigned short hnum = ntohs(dport);
>>        unsigned int hash = udp_hashfn(net, hnum);
>>        struct udp_hslot *hslot = &udptable->hash[hash];
>>        int score, badness;
>>
>>        rcu_read_lock();
>> begin:
>>        result = NULL;
>>        badness = -1;
>>        sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>>                score = compute_score(sk, net, saddr, hnum, sport,
>>                                      daddr, dport, dif);
>>                if (score > badness) {
>>                        result = sk;
>>                        badness = score;
>>                }
>>        }
>>        /*
>>         * if the nulls value we got at the end of this lookup is
>>         * not the expected one, we must restart lookup.
>>         * We probably met an item that was moved to another chain.
>>         */
>>        if (get_nulls_value(node) != hash)
>>                goto begin;
>>
>>        if (result) {
>>                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
>>                        result = NULL;
>>                else if (unlikely(compute_score(result, net, saddr, hnum, sport,
>>                                  daddr, dport, dif) < badness)) {
>>                        sock_put(result);
>>                        goto begin;
>>                }
>>        }
>>        rcu_read_unlock();
>>        return result;
> 
> Looks good, but a few questions and suggestions interspersed below.
> 
> 							Thanx, Paul
> 
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>> ---
>> include/linux/list_nulls.h    |   94 +++++++++++++++++++++++++++
>> include/linux/rculist_nulls.h |  110 ++++++++++++++++++++++++++++++++
>> 2 files changed, 204 insertions(+)
> 
>> diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
>> new file mode 100644
>> index 0000000..856dee8
>> --- /dev/null
>> +++ b/include/linux/list_nulls.h
>> @@ -0,0 +1,94 @@
>> +#ifndef _LINUX_LIST_NULLS_H
>> +#define _LINUX_LIST_NULLS_H
>> +
>> +/*
>> + * Special version of lists, where end of list is not a NULL pointer,
>> + * but a 'nulls' marker, which can have many different values.
>> + * (up to 2^31 different values guaranteed on all platforms)
>> + *
>> + * In the standard hlist, termination of a list is the NULL pointer.
>> + * In this special 'nulls' variant, we use the fact that objects stored in
>> + * a list are aligned on a word (4 or 8 bytes alignment).
>> + * We therefore use the least significant bit of 'ptr' :
>> + * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
>> + * Set to 0 : This is a pointer to some object (ptr)
>> + */
>> +
>> +struct hlist_nulls_head {
>> +	struct hlist_nulls_node *first;
>> +};
>> +
>> +struct hlist_nulls_node {
>> +	struct hlist_nulls_node *next, **pprev;
>> +};
>> +#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
>> +	((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))
>> +
>> +#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
>> +/**
>> + * is_a_nulls - Test if a ptr is a nulls marker
>> + * @ptr: ptr to be tested
>> + *
>> + */
>> +static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
>> +{
>> +	return ((unsigned long)ptr & 1);
>> +}
>> +
>> +/**
>> + * get_nulls_value - Get the 'nulls' value of the end of chain
>> + * @ptr: end of chain
>> + *
>> + * Should be called only if is_a_nulls(ptr);
>> + */
>> +static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
>> +{
>> +	return ((unsigned long)ptr) >> 1;
>> +}
>> +
>> +static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
>> +{
>> +	return !h->pprev;
>> +}
>> +
>> +static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
>> +{
>> +	return is_a_nulls(h->first);
>> +}
>> +
>> +static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
>> +{
>> +	struct hlist_nulls_node *next = n->next;
>> +	struct hlist_nulls_node **pprev = n->pprev;
>> +	*pprev = next;
>> +	if (!is_a_nulls(next))
>> +		next->pprev = pprev;
>> +}
>> +
>> +/**
>> + * hlist_nulls_for_each_entry	- iterate over list of given type
>> + * @tpos:	the type * to use as a loop cursor.
>> + * @pos:	the &struct hlist_node to use as a loop cursor.
>> + * @head:	the head for your list.
>> + * @member:	the name of the hlist_node within the struct.
>> + *
>> + */
>> +#define hlist_nulls_for_each_entry(tpos, pos, head, member)		       \
>> +	for (pos = (head)->first;					       \
>> +	     (!is_a_nulls(pos)) &&					       \
>> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
>> +	     pos = pos->next)
>> +
>> +/**
>> + * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
>> + * @tpos:	the type * to use as a loop cursor.
>> + * @pos:	the &struct hlist_node to use as a loop cursor.
> 
> And @pos is the starting point, correct?  Suggest something like:
> 
> 	@pos:	the &struct hlist_node serving as starting point and cursor

Yes, the comment was copied from the hlist_for_each_entry_from() comment; that one
needs updating too.

> 
>> + * @member:	the name of the hlist_node within the struct.
>> + *
>> + */
>> +#define hlist_nulls_for_each_entry_from(tpos, pos, member)	\
>> +	for (; (!is_a_nulls(pos)) && 				\
>> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
>> +	     pos = pos->next)
>> +
>> +#endif
>> diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
>> new file mode 100644
>> index 0000000..b185ac4
>> --- /dev/null
>> +++ b/include/linux/rculist_nulls.h
>> @@ -0,0 +1,110 @@
>> +#ifndef _LINUX_RCULIST_NULLS_H
>> +#define _LINUX_RCULIST_NULLS_H
>> +
>> +#ifdef __KERNEL__
>> +
>> +/*
>> + * RCU-protected list version
>> + */
>> +#include <linux/list_nulls.h>
>> +#include <linux/rcupdate.h>
>> +
>> +/**
>> + * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
>> + * @n: the element to delete from the hash list.
>> + *
>> + * Note: hlist_nulls_unhashed() on the node returns true after this. It is
>> + * useful for RCU based read lockfree traversal if the writer side
>> + * must know if the list entry is still hashed or already unhashed.
>> + *
>> + * In particular, it means that we can not poison the forward pointers
>> + * that may still be used for walking the hash list and we can only
>> + * zero the pprev pointer so list_unhashed() will return true after
>> + * this.
>> + *
>> + * The caller must take whatever precautions are necessary (such as
>> + * holding appropriate locks) to avoid racing with another
>> + * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
>> + * hlist_nulls_del_rcu(), running on this same list.  However, it is
>> + * perfectly legal to run concurrently with the _rcu list-traversal
>> + * primitives, such as hlist_nulls_for_each_entry_rcu().
>> + */
>> +static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
>> +{
>> +	if (!hlist_nulls_unhashed(n)) {
>> +		__hlist_nulls_del(n);
>> +		n->pprev = NULL;
>> +	}
>> +}
> 
> The point here is to allow an RCU reader to grab the update-side lock
> while holding a reference to an hlist_nulls_node, and then be able to
> blindly call hlist_nulls_del_init_rcu() without having to do any complex
> check to see if the element has already been deleted?
> 
> But this only works if each free operation waits for a grace period.
> If using SLAB_DESTROY_BY_RCU, the would-be deleter still needs to
> revalidate after grabbing the update-side lock, right?  Hmmm...
> 

<start a brain refresh cycle>
  <read again your questions>
    Tilt... 


hlist_nulls_del_init_rcu() is only used by a writer, exactly
like hlist_del_init_rcu().
I see nothing special about SLAB_DESTROY_BY_RCU here.

static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        if (!hlist_unhashed(n)) {
                __hlist_del(n);
                n->pprev = NULL;
        }
}



>> +
>> +/**
>> + * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
>> + * @n: the element to delete from the hash list.
>> + *
>> + * Note: hlist_nulls_unhashed() on entry does not return true after this,
>> + * the entry is in an undefined state. It is useful for RCU based
>> + * lockfree traversal.
>> + *
>> + * In particular, it means that we can not poison the forward
>> + * pointers that may still be used for walking the hash list.
>> + *
>> + * The caller must take whatever precautions are necessary
>> + * (such as holding appropriate locks) to avoid racing
>> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
>> + * or hlist_nulls_del_rcu(), running on this same list.
>> + * However, it is perfectly legal to run concurrently with
>> + * the _rcu list-traversal primitives, such as
>> + * hlist_nulls_for_each_entry().
>> + */
>> +static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
>> +{
>> +	__hlist_nulls_del(n);
>> +	n->pprev = LIST_POISON2;
>> +}
>> +
>> +/**
>> + * hlist_nulls_add_head_rcu
>> + * @n: the element to add to the hash list.
>> + * @h: the list to add to.
>> + *
>> + * Description:
>> + * Adds the specified element to the specified hlist_nulls,
>> + * while permitting racing traversals.
>> + *
>> + * The caller must take whatever precautions are necessary
>> + * (such as holding appropriate locks) to avoid racing
>> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
>> + * or hlist_nulls_del_rcu(), running on this same list.
>> + * However, it is perfectly legal to run concurrently with
>> + * the _rcu list-traversal primitives, such as
>> + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
>> + * problems on Alpha CPUs.  Regardless of the type of CPU, the
>> + * list-traversal primitive must be guarded by rcu_read_lock().
>> + */
>> +static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
>> +					struct hlist_nulls_head *h)
>> +{
>> +	struct hlist_nulls_node *first = h->first;
>> +
>> +	n->next = first;
>> +	n->pprev = &h->first;
>> +	rcu_assign_pointer(h->first, n);
>> +	if (!is_a_nulls(first))
>> +		first->pprev = &n->next;
>> +}
>> +/**
>> + * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
>> + * @tpos:	the type * to use as a loop cursor.
>> + * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
>> + * @head:	the head for your list.
>> + * @member:	the name of the hlist_nulls_node within the struct.
>> + *
>> + */
>> +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
>> +	for (pos = rcu_dereference((head)->first);			 \
>> +		(!is_a_nulls(pos)) && 			\
>> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
>> +		pos = rcu_dereference(pos->next))
>> +
>> +#endif
>> +#endif
> 
> Any chance of using a trick like Oleg used to get rid of the "pos"
> argument?  http://lkml.org/lkml/2008/3/12/47
> 
> The hlist_nulls_node must always be at an even address, correct?
> Couldn't this fact be used to allow testing for is_a_nulls() on tpos
> rather than on pos?  Or is there a better way to approach this?

#define sk_nulls_for_each_rcu(__sk, node, list) \
	hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)

1) __sk is the pointer to the found item, if any is found in the loop.

2) node will contain the end-of-chain value in case we find nothing in the loop,
   because we need to check it after the loop:

if (get_nulls_value(node) != hash)
	 goto begin;

I don't know; it seems quite complex to try to use only three args.

This algo is not very easy to read as-is already...



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
  2008-11-13 13:15                                                       ` [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls Eric Dumazet
  2008-11-13 13:34                                                         ` Peter Zijlstra
@ 2008-11-19 17:53                                                         ` Paul E. McKenney
  2008-11-23  9:33                                                         ` [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU Eric Dumazet
  2 siblings, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-19 17:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

On Thu, Nov 13, 2008 at 02:15:31PM +0100, Eric Dumazet wrote:
> RCU was added to UDP lookups, using a fast infrastructure :
> - the sockets kmem_cache uses SLAB_DESTROY_BY_RCU and doesn't pay the
>  price of call_rcu() at freeing time.
> - hlist_nulls permits the use of fewer memory barriers.
>
> This patch uses same infrastructure for TCP/DCCP established
> and timewait sockets.
>
> Thanks to SLAB_DESTROY_BY_RCU, there is no slowdown for applications
> using short-lived TCP connections. A follow-up patch, converting
> rwlocks to spinlocks, will even speed up this case.
>
> __inet_lookup_established() is pretty fast now that we don't have to
> dirty a contended cache line (read_lock/read_unlock).
>
> Only the established and timewait hash tables are converted to RCU
> (the bind table and listen table are still using traditional locking).

Looks good to me!

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
> include/net/inet_hashtables.h    |    4 -
> include/net/inet_timewait_sock.h |   10 +--
> net/core/sock.c                  |    4 +
> net/dccp/ipv4.c                  |    1
> net/dccp/ipv6.c                  |    1
> net/dccp/proto.c                 |    4 -
> net/ipv4/inet_diag.c             |    6 +-
> net/ipv4/inet_hashtables.c       |   78 ++++++++++++++++++++---------
> net/ipv4/inet_timewait_sock.c    |   26 +++++----
> net/ipv4/tcp.c                   |    4 -
> net/ipv4/tcp_ipv4.c              |   25 ++++-----
> net/ipv6/inet6_hashtables.c      |   70 +++++++++++++++++---------
> net/ipv6/tcp_ipv6.c              |    1
> 13 files changed, 150 insertions(+), 84 deletions(-)

> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index cb31fbf..4818960 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -41,8 +41,8 @@
>   * I'll experiment with dynamic table growth later.
>   */
>  struct inet_ehash_bucket {
> -	struct hlist_head chain;
> -	struct hlist_head twchain;
> +	struct hlist_nulls_head chain;
> +	struct hlist_nulls_head twchain;
>  };
> 
>  /* There are a few simple rules, which allow for local port reuse by
> diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
> index 80e4977..4b8ece2 100644
> --- a/include/net/inet_timewait_sock.h
> +++ b/include/net/inet_timewait_sock.h
> @@ -110,7 +110,7 @@ struct inet_timewait_sock {
>  #define tw_state		__tw_common.skc_state
>  #define tw_reuse		__tw_common.skc_reuse
>  #define tw_bound_dev_if		__tw_common.skc_bound_dev_if
> -#define tw_node			__tw_common.skc_node
> +#define tw_node			__tw_common.skc_nulls_node
>  #define tw_bind_node		__tw_common.skc_bind_node
>  #define tw_refcnt		__tw_common.skc_refcnt
>  #define tw_hash			__tw_common.skc_hash
> @@ -137,10 +137,10 @@ struct inet_timewait_sock {
>  	struct hlist_node	tw_death_node;
>  };
> 
> -static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
> -				      struct hlist_head *list)
> +static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
> +				      struct hlist_nulls_head *list)
>  {
> -	hlist_add_head(&tw->tw_node, list);
> +	hlist_nulls_add_head_rcu(&tw->tw_node, list);
>  }
> 
>  static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
> @@ -175,7 +175,7 @@ static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
>  }
> 
>  #define inet_twsk_for_each(tw, node, head) \
> -	hlist_for_each_entry(tw, node, head, tw_node)
> +	hlist_nulls_for_each_entry(tw, node, head, tw_node)
> 
>  #define inet_twsk_for_each_inmate(tw, node, jail) \
>  	hlist_for_each_entry(tw, node, jail, tw_death_node)
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ded1eb5..38de9c3 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2082,7 +2082,9 @@ int proto_register(struct proto *prot, int alloc_slab)
>  			prot->twsk_prot->twsk_slab =
>  				kmem_cache_create(timewait_sock_slab_name,
>  						  prot->twsk_prot->twsk_obj_size,
> -						  0, SLAB_HWCACHE_ALIGN,
> +						  0,
> +						  SLAB_HWCACHE_ALIGN |
> +							prot->slab_flags,
>  						  NULL);
>  			if (prot->twsk_prot->twsk_slab == NULL)
>  				goto out_free_timewait_sock_slab_name;
> diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
> index 528baa2..d1dd952 100644
> --- a/net/dccp/ipv4.c
> +++ b/net/dccp/ipv4.c
> @@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = {
>  	.orphan_count		= &dccp_orphan_count,
>  	.max_header		= MAX_DCCP_HEADER,
>  	.obj_size		= sizeof(struct dccp_sock),
> +	.slab_flags		= SLAB_DESTROY_BY_RCU,
>  	.rsk_prot		= &dccp_request_sock_ops,
>  	.twsk_prot		= &dccp_timewait_sock_ops,
>  	.h.hashinfo		= &dccp_hashinfo,
> diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
> index 4aa1148..f033e84 100644
> --- a/net/dccp/ipv6.c
> +++ b/net/dccp/ipv6.c
> @@ -1140,6 +1140,7 @@ static struct proto dccp_v6_prot = {
>  	.orphan_count	   = &dccp_orphan_count,
>  	.max_header	   = MAX_DCCP_HEADER,
>  	.obj_size	   = sizeof(struct dccp6_sock),
> +	.slab_flags	   = SLAB_DESTROY_BY_RCU,
>  	.rsk_prot	   = &dccp6_request_sock_ops,
>  	.twsk_prot	   = &dccp6_timewait_sock_ops,
>  	.h.hashinfo	   = &dccp_hashinfo,
> diff --git a/net/dccp/proto.c b/net/dccp/proto.c
> index 46cb349..1117d4d 100644
> --- a/net/dccp/proto.c
> +++ b/net/dccp/proto.c
> @@ -1090,8 +1090,8 @@ static int __init dccp_init(void)
>  	}
> 
>  	for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
> -		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
> -		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
> +		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
> +		INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
>  	}
> 
>  	if (inet_ehash_locks_alloc(&dccp_hashinfo))
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index 564230d..41b3672 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -778,18 +778,19 @@ skip_listen_ht:
>  		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
>  		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
>  		struct sock *sk;
> -		struct hlist_node *node;
> +		struct hlist_nulls_node *node;
> 
>  		num = 0;
> 
> -		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
> +		if (hlist_nulls_empty(&head->chain) &&
> +			hlist_nulls_empty(&head->twchain))
>  			continue;
> 
>  		if (i > s_i)
>  			s_num = 0;
> 
>  		read_lock_bh(lock);
> -		sk_for_each(sk, node, &head->chain) {
> +		sk_nulls_for_each(sk, node, &head->chain) {
>  			struct inet_sock *inet = inet_sk(sk);
> 
>  			if (num < s_num)
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index be41ebb..fd269cf 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -223,35 +223,65 @@ struct sock * __inet_lookup_established(struct net *net,
>  	INET_ADDR_COOKIE(acookie, saddr, daddr)
>  	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
>  	struct sock *sk;
> -	const struct hlist_node *node;
> +	const struct hlist_nulls_node *node;
>  	/* Optimize here for direct hit, only listening connections can
>  	 * have wildcards anyways.
>  	 */
>  	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
> -	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
> -	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
> +	unsigned int slot = hash & (hashinfo->ehash_size - 1);
> +	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
> 
> -	prefetch(head->chain.first);
> -	read_lock(lock);
> -	sk_for_each(sk, node, &head->chain) {
> +	rcu_read_lock();
> +begin:
> +	sk_nulls_for_each_rcu(sk, node, &head->chain) {
>  		if (INET_MATCH(sk, net, hash, acookie,
> -					saddr, daddr, ports, dif))
> -			goto hit; /* You sunk my battleship! */
> +					saddr, daddr, ports, dif)) {
> +			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
> +				goto begintw;
> +			if (unlikely(!INET_MATCH(sk, net, hash, acookie,
> +				saddr, daddr, ports, dif))) {
> +				sock_put(sk);
> +				goto begin;
> +			}
> +			goto out;
> +		}
>  	}
> +	/*
> +	 * if the nulls value we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 * We probably met an item that was moved to another chain.
> +	 */
> +	if (get_nulls_value(node) != slot)
> +		goto begin;
> 
> +begintw:
>  	/* Must check for a TIME_WAIT'er before going to listener hash. */
> -	sk_for_each(sk, node, &head->twchain) {
> +	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
>  		if (INET_TW_MATCH(sk, net, hash, acookie,
> -					saddr, daddr, ports, dif))
> -			goto hit;
> +					saddr, daddr, ports, dif)) {
> +			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
> +				sk = NULL;
> +				goto out;
> +			}
> +			if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
> +				 saddr, daddr, ports, dif))) {
> +				sock_put(sk);
> +				goto begintw;
> +			}
> +			goto out;
> +		}
>  	}
> +	/*
> +	 * if the nulls value we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 * We probably met an item that was moved to another chain.
> +	 */
> +	if (get_nulls_value(node) != slot)
> +		goto begintw;
>  	sk = NULL;
>  out:
> -	read_unlock(lock);
> +	rcu_read_unlock();
>  	return sk;
> -hit:
> -	sock_hold(sk);
> -	goto out;
>  }
>  EXPORT_SYMBOL_GPL(__inet_lookup_established);
> 
> @@ -272,14 +302,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
>  	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
>  	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
>  	struct sock *sk2;
> -	const struct hlist_node *node;
> +	const struct hlist_nulls_node *node;
>  	struct inet_timewait_sock *tw;
> 
>  	prefetch(head->chain.first);
>  	write_lock(lock);
> 
>  	/* Check TIME-WAIT sockets first. */
> -	sk_for_each(sk2, node, &head->twchain) {
> +	sk_nulls_for_each(sk2, node, &head->twchain) {
>  		tw = inet_twsk(sk2);
> 
>  		if (INET_TW_MATCH(sk2, net, hash, acookie,
> @@ -293,7 +323,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
>  	tw = NULL;
> 
>  	/* And established part... */
> -	sk_for_each(sk2, node, &head->chain) {
> +	sk_nulls_for_each(sk2, node, &head->chain) {
>  		if (INET_MATCH(sk2, net, hash, acookie,
>  					saddr, daddr, ports, dif))
>  			goto not_unique;
> @@ -306,7 +336,7 @@ unique:
>  	inet->sport = htons(lport);
>  	sk->sk_hash = hash;
>  	WARN_ON(!sk_unhashed(sk));
> -	__sk_add_node(sk, &head->chain);
> +	__sk_nulls_add_node_rcu(sk, &head->chain);
>  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  	write_unlock(lock);
> 
> @@ -338,7 +368,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
>  void __inet_hash_nolisten(struct sock *sk)
>  {
>  	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
> -	struct hlist_head *list;
> +	struct hlist_nulls_head *list;
>  	rwlock_t *lock;
>  	struct inet_ehash_bucket *head;
> 
> @@ -350,7 +380,7 @@ void __inet_hash_nolisten(struct sock *sk)
>  	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
> 
>  	write_lock(lock);
> -	__sk_add_node(sk, list);
> +	__sk_nulls_add_node_rcu(sk, list);
>  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  	write_unlock(lock);
>  }
> @@ -400,13 +430,15 @@ void inet_unhash(struct sock *sk)
>  		local_bh_disable();
>  		inet_listen_wlock(hashinfo);
>  		lock = &hashinfo->lhash_lock;
> +		if (__sk_del_node_init(sk))
> +			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>  	} else {
>  		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
>  		write_lock_bh(lock);
> +		if (__sk_nulls_del_node_init_rcu(sk))
> +			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>  	}
> 
> -	if (__sk_del_node_init(sk))
> -		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>  	write_unlock_bh(lock);
>  out:
>  	if (sk->sk_state == TCP_LISTEN)
> diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
> index 1c5fd38..6068995 100644
> --- a/net/ipv4/inet_timewait_sock.c
> +++ b/net/ipv4/inet_timewait_sock.c
> @@ -23,12 +23,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
>  	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
> 
>  	write_lock(lock);
> -	if (hlist_unhashed(&tw->tw_node)) {
> +	if (hlist_nulls_unhashed(&tw->tw_node)) {
>  		write_unlock(lock);
>  		return;
>  	}
> -	__hlist_del(&tw->tw_node);
> -	sk_node_init(&tw->tw_node);
> +	hlist_nulls_del_rcu(&tw->tw_node);
> +	sk_nulls_node_init(&tw->tw_node);
>  	write_unlock(lock);
> 
>  	/* Disassociate with bind bucket. */
> @@ -92,13 +92,17 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
> 
>  	write_lock(lock);
> 
> -	/* Step 2: Remove SK from established hash. */
> -	if (__sk_del_node_init(sk))
> -		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> -
> -	/* Step 3: Hash TW into TIMEWAIT chain. */
> -	inet_twsk_add_node(tw, &ehead->twchain);
> +	/*
> +	 * Step 2: Hash TW into TIMEWAIT chain.
> +	 * Should be done before removing sk from established chain
> +	 * because readers are lockless and search established first.
> +	 */
>  	atomic_inc(&tw->tw_refcnt);
> +	inet_twsk_add_node_rcu(tw, &ehead->twchain);
> +
> +	/* Step 3: Remove SK from established hash. */
> +	if (__sk_nulls_del_node_init_rcu(sk))
> +		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> 
>  	write_unlock(lock);
>  }
> @@ -416,7 +420,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
>  {
>  	struct inet_timewait_sock *tw;
>  	struct sock *sk;
> -	struct hlist_node *node;
> +	struct hlist_nulls_node *node;
>  	int h;
> 
>  	local_bh_disable();
> @@ -426,7 +430,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
>  		rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
>  restart:
>  		write_lock(lock);
> -		sk_for_each(sk, node, &head->twchain) {
> +		sk_nulls_for_each(sk, node, &head->twchain) {
> 
>  			tw = inet_twsk(sk);
>  			if (!net_eq(twsk_net(tw), net) ||
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index f60a591..044224a 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2707,8 +2707,8 @@ void __init tcp_init(void)
>  					thash_entries ? 0 : 512 * 1024);
>  	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
>  	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
> -		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
> -		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
> +		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
> +		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
>  	}
>  	if (inet_ehash_locks_alloc(&tcp_hashinfo))
>  		panic("TCP: failed to alloc ehash_locks");
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index d49233f..b2e3ab2 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1857,16 +1857,16 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
>  #ifdef CONFIG_PROC_FS
>  /* Proc filesystem TCP sock list dumping. */
> 
> -static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
> +static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
>  {
> -	return hlist_empty(head) ? NULL :
> +	return hlist_nulls_empty(head) ? NULL :
>  		list_entry(head->first, struct inet_timewait_sock, tw_node);
>  }
> 
>  static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
>  {
> -	return tw->tw_node.next ?
> -		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
> +	return !is_a_nulls(tw->tw_node.next) ?
> +		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
>  }
> 
>  static void *listening_get_next(struct seq_file *seq, void *cur)
> @@ -1954,8 +1954,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
> 
>  static inline int empty_bucket(struct tcp_iter_state *st)
>  {
> -	return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
> -		hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
> +	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
> +		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
>  }
> 
>  static void *established_get_first(struct seq_file *seq)
> @@ -1966,7 +1966,7 @@ static void *established_get_first(struct seq_file *seq)
> 
>  	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
>  		struct sock *sk;
> -		struct hlist_node *node;
> +		struct hlist_nulls_node *node;
>  		struct inet_timewait_sock *tw;
>  		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
> 
> @@ -1975,7 +1975,7 @@ static void *established_get_first(struct seq_file *seq)
>  			continue;
> 
>  		read_lock_bh(lock);
> -		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
> +		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
>  			if (sk->sk_family != st->family ||
>  			    !net_eq(sock_net(sk), net)) {
>  				continue;
> @@ -2004,7 +2004,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
>  {
>  	struct sock *sk = cur;
>  	struct inet_timewait_sock *tw;
> -	struct hlist_node *node;
> +	struct hlist_nulls_node *node;
>  	struct tcp_iter_state *st = seq->private;
>  	struct net *net = seq_file_net(seq);
> 
> @@ -2032,11 +2032,11 @@ get_tw:
>  			return NULL;
> 
>  		read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
> -		sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
> +		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
>  	} else
> -		sk = sk_next(sk);
> +		sk = sk_nulls_next(sk);
> 
> -	sk_for_each_from(sk, node) {
> +	sk_nulls_for_each_from(sk, node) {
>  		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
>  			goto found;
>  	}
> @@ -2375,6 +2375,7 @@ struct proto tcp_prot = {
>  	.sysctl_rmem		= sysctl_tcp_rmem,
>  	.max_header		= MAX_TCP_HEADER,
>  	.obj_size		= sizeof(struct tcp_sock),
> +	.slab_flags		= SLAB_DESTROY_BY_RCU,
>  	.twsk_prot		= &tcp_timewait_sock_ops,
>  	.rsk_prot		= &tcp_request_sock_ops,
>  	.h.hashinfo		= &tcp_hashinfo,
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 1646a56..c1b4d40 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -25,24 +25,28 @@
>  void __inet6_hash(struct sock *sk)
>  {
>  	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
> -	struct hlist_head *list;
>  	rwlock_t *lock;
> 
>  	WARN_ON(!sk_unhashed(sk));
> 
>  	if (sk->sk_state == TCP_LISTEN) {
> +		struct hlist_head *list;
> +
>  		list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
>  		lock = &hashinfo->lhash_lock;
>  		inet_listen_wlock(hashinfo);
> +		__sk_add_node(sk, list);
>  	} else {
>  		unsigned int hash;
> +		struct hlist_nulls_head *list;
> +
>  		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
>  		list = &inet_ehash_bucket(hashinfo, hash)->chain;
>  		lock = inet_ehash_lockp(hashinfo, hash);
>  		write_lock(lock);
> +		__sk_nulls_add_node_rcu(sk, list);
>  	}
> 
> -	__sk_add_node(sk, list);
>  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  	write_unlock(lock);
>  }
> @@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net,
>  					   const int dif)
>  {
>  	struct sock *sk;
> -	const struct hlist_node *node;
> +	const struct hlist_nulls_node *node;
>  	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
>  	/* Optimize here for direct hit, only listening connections can
>  	 * have wildcards anyways.
>  	 */
>  	unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
> -	struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
> -	rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
> +	unsigned int slot = hash & (hashinfo->ehash_size - 1);
> +	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
> 
> -	prefetch(head->chain.first);
> -	read_lock(lock);
> -	sk_for_each(sk, node, &head->chain) {
> +
> +	rcu_read_lock();
> +begin:
> +	sk_nulls_for_each_rcu(sk, node, &head->chain) {
>  		/* For IPV6 do the cheaper port and family tests first. */
> -		if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif))
> -			goto hit; /* You sunk my battleship! */
> +		if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
> +			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
> +				goto begintw;
> +			if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
> +				sock_put(sk);
> +				goto begin;
> +			}
> +		goto out;
> +		}
>  	}
> +	if (get_nulls_value(node) != slot)
> +		goto begin;
> +
> +begintw:
>  	/* Must check for a TIME_WAIT'er before going to listener hash. */
> -	sk_for_each(sk, node, &head->twchain) {
> -		if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif))
> -			goto hit;
> +	sk_nulls_for_each_rcu(sk, node, &head->twchain) {
> +		if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
> +			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
> +				sk = NULL;
> +				goto out;
> +			}
> +			if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
> +				sock_put(sk);
> +				goto begintw;
> +			}
> +			goto out;
> +		}
>  	}
> -	read_unlock(lock);
> -	return NULL;
> -
> -hit:
> -	sock_hold(sk);
> -	read_unlock(lock);
> +	if (get_nulls_value(node) != slot)
> +		goto begintw;
> +	sk = NULL;
> +out:
> +	rcu_read_unlock();
>  	return sk;
>  }
>  EXPORT_SYMBOL(__inet6_lookup_established);
> @@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
>  	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
>  	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
>  	struct sock *sk2;
> -	const struct hlist_node *node;
> +	const struct hlist_nulls_node *node;
>  	struct inet_timewait_sock *tw;
> 
>  	prefetch(head->chain.first);
>  	write_lock(lock);
> 
>  	/* Check TIME-WAIT sockets first. */
> -	sk_for_each(sk2, node, &head->twchain) {
> +	sk_nulls_for_each(sk2, node, &head->twchain) {
>  		tw = inet_twsk(sk2);
> 
>  		if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) {
> @@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
>  	tw = NULL;
> 
>  	/* And established part... */
> -	sk_for_each(sk2, node, &head->chain) {
> +	sk_nulls_for_each(sk2, node, &head->chain) {
>  		if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif))
>  			goto not_unique;
>  	}
> @@ -203,7 +227,7 @@ unique:
>  	inet->num = lport;
>  	inet->sport = htons(lport);
>  	WARN_ON(!sk_unhashed(sk));
> -	__sk_add_node(sk, &head->chain);
> +	__sk_nulls_add_node_rcu(sk, &head->chain);
>  	sk->sk_hash = hash;
>  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  	write_unlock(lock);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 9842764..b357870 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = {
>  	.sysctl_rmem		= sysctl_tcp_rmem,
>  	.max_header		= MAX_TCP_HEADER,
>  	.obj_size		= sizeof(struct tcp6_sock),
> +	.slab_flags		= SLAB_DESTROY_BY_RCU,
>  	.twsk_prot		= &tcp6_timewait_sock_ops,
>  	.rsk_prot		= &tcp6_request_sock_ops,
>  	.h.hashinfo		= &tcp_hashinfo,



* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-19 17:53                                                           ` Eric Dumazet
@ 2008-11-19 18:46                                                             ` Paul E. McKenney
  2008-11-19 18:53                                                               ` Arnaldo Carvalho de Melo
  2008-11-19 20:39                                                               ` Eric Dumazet
  0 siblings, 2 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-19 18:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Evgeniy Polyakov,
	Peter Zijlstra, Christian Bell

On Wed, Nov 19, 2008 at 06:53:20PM +0100, Eric Dumazet wrote:
> Paul E. McKenney wrote:
>> On Thu, Nov 13, 2008 at 02:14:18PM +0100, Eric Dumazet wrote:
>>> hlist uses a NULL value to terminate a chain.
>>>
>>> The hlist_nulls variant uses the low-order bit set to 1 to signal an
>>> end-of-list marker.
>>>
>>> This allows storing many different end markers, so that some RCU
>>> lockless algorithms (used in the TCP/UDP stack, for example) can save
>>> some memory barriers in fast paths.
>>>
>>> Two new files are added:
>>>
>>> include/linux/list_nulls.h
>>>  - mimics the hlist part of include/linux/list.h, adapted to the
>>>    hlist_nulls variant
>>>
>>> include/linux/rculist_nulls.h
>>>  - mimics the hlist part of include/linux/rculist.h, adapted to the
>>>    hlist_nulls variant
>>>
>>>   Only four helpers are declared for the moment:
>>>
>>>     hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
>>>     hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()
>>>
>>> prefetches() were removed, since the end of a list is no longer a NULL
>>> value.  prefetches() could trigger useless (and possibly dangerous)
>>> memory transactions.
>>>
>>> Example of use (extracted from __udp4_lib_lookup())
>>>
>>> 	struct sock *sk, *result;
>>>        struct hlist_nulls_node *node;
>>>        unsigned short hnum = ntohs(dport);
>>>        unsigned int hash = udp_hashfn(net, hnum);
>>>        struct udp_hslot *hslot = &udptable->hash[hash];
>>>        int score, badness;
>>>
>>>        rcu_read_lock();
>>> begin:
>>>        result = NULL;
>>>        badness = -1;
>>>        sk_nulls_for_each_rcu(sk, node, &hslot->head) {
>>>                score = compute_score(sk, net, saddr, hnum, sport,
>>>                                      daddr, dport, dif);
>>>                if (score > badness) {
>>>                        result = sk;
>>>                        badness = score;
>>>                }
>>>        }
>>>        /*
>>>         * if the nulls value we got at the end of this lookup is
>>>         * not the expected one, we must restart lookup.
>>>         * We probably met an item that was moved to another chain.
>>>         */
>>>        if (get_nulls_value(node) != hash)
>>>                goto begin;
>>>
>>>        if (result) {
>>>                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
>>>                        result = NULL;
>>>                else if (unlikely(compute_score(result, net, saddr, hnum, sport,
>>>                                  daddr, dport, dif) < badness)) {
>>>                        sock_put(result);
>>>                        goto begin;
>>>                }
>>>        }
>>>        rcu_read_unlock();
>>>        return result;
>> Looks good, but a few questions and suggestions interspersed below.
>> 							Thanx, Paul
>>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>>> ---
>>> include/linux/list_nulls.h    |   94 +++++++++++++++++++++++++++
>>> include/linux/rculist_nulls.h |  110 ++++++++++++++++++++++++++++++++
>>> 2 files changed, 204 insertions(+)
>>> diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
>>> new file mode 100644
>>> index 0000000..856dee8
>>> --- /dev/null
>>> +++ b/include/linux/list_nulls.h
>>> @@ -0,0 +1,94 @@
>>> +#ifndef _LINUX_LIST_NULLS_H
>>> +#define _LINUX_LIST_NULLS_H
>>> +
>>> +/*
>>> + * Special version of lists, where end of list is not a NULL pointer,
>>> + * but a 'nulls' marker, which can have many different values.
>>> + * (up to 2^31 different values guaranteed on all platforms)
>>> + *
>>> + * In the standard hlist, termination of a list is the NULL pointer.
>>> + * In this special 'nulls' variant, we use the fact that objects stored in
>>> + * a list are aligned on a word (4 or 8 bytes alignment).
>>> + * We therefore use the last significant bit of 'ptr' :
>>> + * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
>>> + * Set to 0 : This is a pointer to some object (ptr)
>>> + */
>>> +
>>> +struct hlist_nulls_head {
>>> +	struct hlist_nulls_node *first;
>>> +};
>>> +
>>> +struct hlist_nulls_node {
>>> +	struct hlist_nulls_node *next, **pprev;
>>> +};
>>> +#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
>>> +	((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))
>>> +
>>> +#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
>>> +/**
>>> + * ptr_is_a_nulls - Test if a ptr is a nulls
>>> + * @ptr: ptr to be tested
>>> + *
>>> + */
>>> +static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
>>> +{
>>> +	return ((unsigned long)ptr & 1);
>>> +}
>>> +
>>> +/**
>>> + * get_nulls_value - Get the 'nulls' value of the end of chain
>>> + * @ptr: end of chain
>>> + *
>>> + * Should be called only if is_a_nulls(ptr);
>>> + */
>>> +static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
>>> +{
>>> +	return ((unsigned long)ptr) >> 1;
>>> +}
>>> +
>>> +static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
>>> +{
>>> +	return !h->pprev;
>>> +}
>>> +
>>> +static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
>>> +{
>>> +	return is_a_nulls(h->first);
>>> +}
>>> +
>>> +static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
>>> +{
>>> +	struct hlist_nulls_node *next = n->next;
>>> +	struct hlist_nulls_node **pprev = n->pprev;
>>> +	*pprev = next;
>>> +	if (!is_a_nulls(next))
>>> +		next->pprev = pprev;
>>> +}
>>> +
>>> +/**
>>> + * hlist_nulls_for_each_entry	- iterate over list of given type
>>> + * @tpos:	the type * to use as a loop cursor.
>>> + * @pos:	the &struct hlist_node to use as a loop cursor.
>>> + * @head:	the head for your list.
>>> + * @member:	the name of the hlist_node within the struct.
>>> + *
>>> + */
>>> +#define hlist_nulls_for_each_entry(tpos, pos, head, member)		       \
>>> +	for (pos = (head)->first;					       \
>>> +	     (!is_a_nulls(pos)) &&					       \
>>> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
>>> +	     pos = pos->next)
>>> +
>>> +/**
>>> + * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
>>> + * @tpos:	the type * to use as a loop cursor.
>>> + * @pos:	the &struct hlist_node to use as a loop cursor.
>> And @pos is the starting point, correct?  Suggest something like:
>> 	@pos:	the &struct hlist_node serving as starting point and cursor
>
> Yes, comment was copied from hlist_for_each_entry_from() comment, this one
> needs update too.
>
>>> + * @member:	the name of the hlist_node within the struct.
>>> + *
>>> + */
>>> +#define hlist_nulls_for_each_entry_from(tpos, pos, member)	\
>>> +	for (; (!is_a_nulls(pos)) && 				\
>>> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
>>> +	     pos = pos->next)
>>> +
>>> +#endif
>>> diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
>>> new file mode 100644
>>> index 0000000..b185ac4
>>> --- /dev/null
>>> +++ b/include/linux/rculist_nulls.h
>>> @@ -0,0 +1,110 @@
>>> +#ifndef _LINUX_RCULIST_NULLS_H
>>> +#define _LINUX_RCULIST_NULLS_H
>>> +
>>> +#ifdef __KERNEL__
>>> +
>>> +/*
>>> + * RCU-protected list version
>>> + */
>>> +#include <linux/list_nulls.h>
>>> +#include <linux/rcupdate.h>
>>> +
>>> +/**
>>> + * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
>>> + * @n: the element to delete from the hash list.
>>> + *
>>> + * Note: hlist_nulls_unhashed() on the node return true after this. It is
>>> + * useful for RCU based read lockfree traversal if the writer side
>>> + * must know if the list entry is still hashed or already unhashed.
>>> + *
>>> + * In particular, it means that we can not poison the forward pointers
>>> + * that may still be used for walking the hash list and we can only
>>> + * zero the pprev pointer so list_unhashed() will return true after
>>> + * this.
>>> + *
>>> + * The caller must take whatever precautions are necessary (such as
>>> + * holding appropriate locks) to avoid racing with another
>>> + * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
>>> + * hlist_nulls_del_rcu(), running on this same list.  However, it is
>>> + * perfectly legal to run concurrently with the _rcu list-traversal
>>> + * primitives, such as hlist_nulls_for_each_entry_rcu().
>>> + */
>>> +static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
>>> +{
>>> +	if (!hlist_nulls_unhashed(n)) {
>>> +		__hlist_nulls_del(n);
>>> +		n->pprev = NULL;
>>> +	}
>>> +}
>> The point here is to allow an RCU reader to grab the update-side lock
>> while holding a reference to an hlist_nulls_node, and then be able to
>> blindly call hlist_nulls_del_init_rcu() without having to do any complex
>> check to see if the element has already been deleted?
>> But this only works if each free operation waits for a grace period.
>> If using SLAB_DESTROY_BY_RCU, the would-be deleter still needs to
>> revalidate after grabbing the update-side lock, right?  Hmmm...
>
> <start a brain refresh cycle>
>  <read again your questions>
>    Tilt... 
>
> hlist_nulls_del_init_rcu() is only used by a writer, exactly
> like hlist_del_init_rcu().
> I see nothing special about SLAB_DESTROY_BY_RCU here.
>
> static inline void hlist_del_init_rcu(struct hlist_node *n)
> {
>        if (!hlist_unhashed(n)) {
>                __hlist_del(n);
>                n->pprev = NULL;
>        }
> }

Not a problem, as you don't use it the way I was thinking.

For whatever it is worth, here is a more complete use case, on the
off-chance that it becomes useful some time:

	retry:
	rcu_read_lock();
	hlist_nulls_for_each_entry_rcu(tpos, pos, head, hn_node) {
		if (!(curgen = still_valid(tpos)))
			goto retry;
		if (needs_deletion(tpos)) {
			spin_lock(&update_side_lock);
			if (still_valid(tpos) == curgen)
				hlist_nulls_del_init_rcu(pos);
			spin_unlock(&update_side_lock);
		}
	}
	rcu_read_unlock();

This approach requires that the key and a generation number be encoded
into a single word, and that the generation number be changed on each
allocation and on each free.
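
To make that a bit more concrete, one possible shape for such an
encoding might be the following (purely illustrative: the structure,
mask, and helper names are hypothetical, it builds on the
hlist_nulls_node type from patch 1/3, and unlike the skeleton above
this still_valid() takes the lookup key explicitly):

	/* Illustrative sketch only; names and field layout are hypothetical. */
	#define GEN_SHIFT	16		/* low bits: key, high bits: generation */
	#define KEY_MASK	((1UL << GEN_SHIFT) - 1)

	struct item {
		struct hlist_nulls_node hn_node;
		unsigned long key_gen;		/* key | (generation << GEN_SHIFT) */
	};

	/*
	 * Returns 0 if this is not (or no longer) the object we are looking
	 * for, else the current <key, generation> word to compare against
	 * later under the update-side lock.
	 * (Assumes a zero key/generation pair never occurs.)
	 */
	static unsigned long still_valid(struct item *p, unsigned long key)
	{
		unsigned long kg = p->key_gen;	/* single-word read */

		return (kg & KEY_MASK) == key ? kg : 0;
	}

	/* Called on each allocation and each free, under update_side_lock. */
	static void bump_generation(struct item *p)
	{
		p->key_gen += 1UL << GEN_SHIFT;
	}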

>>> +
>>> +/**
>>> + * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
>>> + * @n: the element to delete from the hash list.
>>> + *
>>> + * Note: hlist_nulls_unhashed() on entry does not return true after this,
>>> + * the entry is in an undefined state. It is useful for RCU based
>>> + * lockfree traversal.
>>> + *
>>> + * In particular, it means that we can not poison the forward
>>> + * pointers that may still be used for walking the hash list.
>>> + *
>>> + * The caller must take whatever precautions are necessary
>>> + * (such as holding appropriate locks) to avoid racing
>>> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
>>> + * or hlist_nulls_del_rcu(), running on this same list.
>>> + * However, it is perfectly legal to run concurrently with
>>> + * the _rcu list-traversal primitives, such as
>>> + * hlist_nulls_for_each_entry().
>>> + */
>>> +static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
>>> +{
>>> +	__hlist_nulls_del(n);
>>> +	n->pprev = LIST_POISON2;
>>> +}
>>> +
>>> +/**
>>> + * hlist_nulls_add_head_rcu
>>> + * @n: the element to add to the hash list.
>>> + * @h: the list to add to.
>>> + *
>>> + * Description:
>>> + * Adds the specified element to the specified hlist_nulls,
>>> + * while permitting racing traversals.
>>> + *
>>> + * The caller must take whatever precautions are necessary
>>> + * (such as holding appropriate locks) to avoid racing
>>> + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
>>> + * or hlist_nulls_del_rcu(), running on this same list.
>>> + * However, it is perfectly legal to run concurrently with
>>> + * the _rcu list-traversal primitives, such as
>>> + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
>>> + * problems on Alpha CPUs.  Regardless of the type of CPU, the
>>> + * list-traversal primitive must be guarded by rcu_read_lock().
>>> + */
>>> +static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
>>> +					struct hlist_nulls_head *h)
>>> +{
>>> +	struct hlist_nulls_node *first = h->first;
>>> +
>>> +	n->next = first;
>>> +	n->pprev = &h->first;
>>> +	rcu_assign_pointer(h->first, n);
>>> +	if (!is_a_nulls(first))
>>> +		first->pprev = &n->next;
>>> +}
>>> +/**
>>> + * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
>>> + * @tpos:	the type * to use as a loop cursor.
>>> + * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
>>> + * @head:	the head for your list.
>>> + * @member:	the name of the hlist_nulls_node within the struct.
>>> + *
>>> + */
>>> +#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
>>> +	for (pos = rcu_dereference((head)->first);			 \
>>> +		(!is_a_nulls(pos)) && 			\
>>> +		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
>>> +		pos = rcu_dereference(pos->next))
>>> +
>>> +#endif
>>> +#endif
>> Any chance of using a trick like Oleg used to get rid of the "pos"
>> argument?  http://lkml.org/lkml/2008/3/12/47
>> The hlist_nulls_node must always be at an even address, correct?
>> Couldn't this fact be used to allow testing for is_a_nulls() on tpos
>> rather than on pos?  Or is there a better way to approach this?
>
> #define sk_nulls_for_each_rcu(__sk, node, list) \
> 	hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
>
> 1) __sk is the pointer to the found item, if any is found in the loop
>
> 2) node will contain the end value of the chain in case we find nothing in
>    the loop, because we need to check it after the loop.
>
> if (get_nulls_value(node) != hash)
> 	 goto begin;
>
> I don't know, it seems quite complex to try to use only three args?
>
> This algo is not very easy to read as it is already ...

One way around #2 would be for get_nulls_value() to undo the
hlist_nulls_entry().  Not sure whether it is worth it, but a
thought.

							Thanx, Paul


* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-19 18:46                                                             ` Paul E. McKenney
@ 2008-11-19 18:53                                                               ` Arnaldo Carvalho de Melo
  2008-11-19 21:17                                                                 ` Paul E. McKenney
  2008-11-19 20:39                                                               ` Eric Dumazet
  1 sibling, 1 reply; 134+ messages in thread
From: Arnaldo Carvalho de Melo @ 2008-11-19 18:53 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Eric Dumazet, Corey Minyard, David Miller, Stephen Hemminger,
	benny+usenet, Linux Netdev List, Christoph Lameter,
	Evgeniy Polyakov, Peter Zijlstra, Christian Bell

On Wed, Nov 19, 2008 at 10:46:24AM -0800, Paul E. McKenney wrote:
> For whatever it is worth, here is a more complete use case, on the
> off-chance that it becomes useful some time:
> 
> 	retry:
> 	rcu_read_lock();

	retry: /* should be here, huh? */

> 	hlist_nulls_for_each_entry_rcu(tpos, pos, head, hn_node) {
> 		if (!(curgen = still_valid(tpos)))
> 			goto retry;
> 		if (needs_deletion(tpos)) {
> 			spin_lock(&update_side_lock);
> 			if (still_valid(tpos) == curgen)
> 				hlist_nulls_del_init_rcu(pos);
> 			spin_unlock(&update_side_lock);
> 		}
> 	}
> 	rcu_read_unlock();

- Arnaldo


* Re: [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol
  2008-11-17  3:41                                                         ` David Miller
@ 2008-11-19 19:52                                                           ` Christoph Lameter
  0 siblings, 0 replies; 134+ messages in thread
From: Christoph Lameter @ 2008-11-19 19:52 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, paulmck, minyard, shemminger, benny+usenet, netdev,
	a.p.zijlstra, zbr, christian

On Sun, 16 Nov 2008, David Miller wrote:

> These patches are incredibly cool!

How much does this give us on the aim9 udp test?


* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-19 18:46                                                             ` Paul E. McKenney
  2008-11-19 18:53                                                               ` Arnaldo Carvalho de Melo
@ 2008-11-19 20:39                                                               ` Eric Dumazet
  2008-11-19 21:21                                                                 ` Paul E. McKenney
  1 sibling, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-11-19 20:39 UTC (permalink / raw)
  To: paulmck
  Cc: Corey Minyard, David Miller, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Evgeniy Polyakov,
	Peter Zijlstra, Christian Bell

Paul E. McKenney wrote:
> On Wed, Nov 19, 2008 at 06:53:20PM +0100, Eric Dumazet wrote:
>> Paul E. McKenney wrote:
>>>> +
>>>> +/**
>>>> + * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
>>>> + * @n: the element to delete from the hash list.
>>>> + *
>>>> + * Note: hlist_nulls_unhashed() on the node return true after this. It is
>>>> + * useful for RCU based read lockfree traversal if the writer side
>>>> + * must know if the list entry is still hashed or already unhashed.
>>>> + *
>>>> + * In particular, it means that we can not poison the forward pointers
>>>> + * that may still be used for walking the hash list and we can only
>>>> + * zero the pprev pointer so list_unhashed() will return true after
>>>> + * this.
>>>> + *
>>>> + * The caller must take whatever precautions are necessary (such as
>>>> + * holding appropriate locks) to avoid racing with another
>>>> + * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
>>>> + * hlist_nulls_del_rcu(), running on this same list.  However, it is
>>>> + * perfectly legal to run concurrently with the _rcu list-traversal
>>>> + * primitives, such as hlist_nulls_for_each_entry_rcu().
>>>> + */
>>>> +static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
>>>> +{
>>>> +	if (!hlist_nulls_unhashed(n)) {
>>>> +		__hlist_nulls_del(n);
>>>> +		n->pprev = NULL;
>>>> +	}
>>>> +}
>>> The point here is to allow an RCU reader to grab the update-side lock
>>> while holding a reference to an hlist_nulls_node, and then be able to
>>> blindly call hlist_nulls_del_init_rcu() without having to do any complex
>>> check to see if the element has already been deleted?
>>> But this only works if each free operation waits for a grace period.
>>> If using SLAB_DESTROY_BY_RCU, the would-be deleter still needs to
>>> revalidate after grabbing the update-side lock, right?  Hmmm...
>> <start a brain refresh cycle>
>>  <read again your questions>
>>    Tilt... 
>>
>> hlist_nulls_del_init_rcu() is only used by a writer, exactly
>> like hlist_del_init_rcu().
>> I see nothing special about SLAB_DESTROY_BY_RCU here.
>>
>> static inline void hlist_del_init_rcu(struct hlist_node *n)
>> {
>>        if (!hlist_unhashed(n)) {
>>                __hlist_del(n);
>>                n->pprev = NULL;
>>        }
>> }
> 
> Not a problem, as you don't use it the way I was thinking.
> 
> For whatever it is worth, here is a more complete use case, on the
> off-chance that it becomes useful some time:
> 
> 	retry:
> 	rcu_read_lock();
> 	hlist_nulls_for_each_entry_rcu(tpos, pos, head, hn_node) {
> 		if (!(curgen = still_valid(tpos)))
> 			goto retry;
> 		if (needs_deletion(tpos)) {
> 			spin_lock(&update_side_lock);
> 			if (still_valid(tpos) == curgen)
> 				hlist_nulls_del_init_rcu(pos);
> 			spin_unlock(&update_side_lock);
> 		}
> 	}
> 	rcu_read_unlock();
> 
> This approach requires that the key and a generation number be encoded
> into a single word, and that the generation number be changed on each
> allocation and on each free.

Hum, we should add this template in Documentation/RCU, I guess.

Thanks




* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-19 18:53                                                               ` Arnaldo Carvalho de Melo
@ 2008-11-19 21:17                                                                 ` Paul E. McKenney
  0 siblings, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-19 21:17 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Eric Dumazet, Corey Minyard,
	David Miller, Stephen Hemminger

On Wed, Nov 19, 2008 at 04:53:47PM -0200, Arnaldo Carvalho de Melo wrote:
> On Wed, Nov 19, 2008 at 10:46:24AM -0800, Paul E. McKenney wrote:
> > For whatever it is worth, here is a more complete use case, on the
> > off-chance that it becomes useful some time:
> > 
> > 	retry:
> > 	rcu_read_lock();
> 
> 	retry: /* should be here, huh? */

Indeed!  Either that or have an rcu_read_unlock() before the
goto retry.
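
With the label moved as suggested, the skeleton would read (again
purely illustrative, using the same hypothetical helpers as above):

	rcu_read_lock();
	retry:
	hlist_nulls_for_each_entry_rcu(tpos, pos, head, hn_node) {
		if (!(curgen = still_valid(tpos)))
			goto retry;	/* restart, still inside the read-side critical section */
		if (needs_deletion(tpos)) {
			spin_lock(&update_side_lock);
			if (still_valid(tpos) == curgen)
				hlist_nulls_del_init_rcu(pos);
			spin_unlock(&update_side_lock);
		}
	}
	rcu_read_unlock();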

Good catch!

							Thanx, Paul

> > 	hlist_nulls_for_each_entry_rcu(tpos, pos, head, hn_node) {
> > 		if (!(curgen = still_valid(tpos)))
> > 			goto retry;
> > 		if (needs_deletion(tpos)) {
> > 			spin_lock(&update_side_lock);
> > 			if (still_valid(tpos) == curgen)
> > 				hlist_nulls_del_init_rcu(pos);
> > 			spin_unlock(&update_side_lock);
> > 		}
> > 	}
> > 	rcu_read_unlock();
> 
> - Arnaldo


* Re: [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist
  2008-11-19 20:39                                                               ` Eric Dumazet
@ 2008-11-19 21:21                                                                 ` Paul E. McKenney
  0 siblings, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-19 21:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Corey Minyard, David Miller, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Evgeniy Polyakov,
	Peter Zijlstra, Christian Bell

On Wed, Nov 19, 2008 at 09:39:57PM +0100, Eric Dumazet wrote:
> Paul E. McKenney wrote:
>> On Wed, Nov 19, 2008 at 06:53:20PM +0100, Eric Dumazet wrote:
>>> Paul E. McKenney wrote:
>>>>> +
>>>>> +/**
>>>>> + * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
>>>>> + * @n: the element to delete from the hash list.
>>>>> + *
>>>>> + * Note: hlist_nulls_unhashed() on the node return true after this. It is
>>>>> + * useful for RCU based read lockfree traversal if the writer side
>>>>> + * must know if the list entry is still hashed or already unhashed.
>>>>> + *
>>>>> + * In particular, it means that we can not poison the forward pointers
>>>>> + * that may still be used for walking the hash list and we can only
>>>>> + * zero the pprev pointer so list_unhashed() will return true after
>>>>> + * this.
>>>>> + *
>>>>> + * The caller must take whatever precautions are necessary (such as
>>>>> + * holding appropriate locks) to avoid racing with another
>>>>> + * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
>>>>> + * hlist_nulls_del_rcu(), running on this same list.  However, it is
>>>>> + * perfectly legal to run concurrently with the _rcu list-traversal
>>>>> + * primitives, such as hlist_nulls_for_each_entry_rcu().
>>>>> + */
>>>>> +static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
>>>>> +{
>>>>> +	if (!hlist_nulls_unhashed(n)) {
>>>>> +		__hlist_nulls_del(n);
>>>>> +		n->pprev = NULL;
>>>>> +	}
>>>>> +}
>>>> The point here is to allow an RCU reader to grab the update-side lock
>>>> while holding a reference to an hlist_nulls_node, and then be able to
>>>> blindly call hlist_nulls_del_init_rcu() without having to do any complex
>>>> check to see if the element has already been deleted?
>>>> But this only works if each free operation waits for a grace period.
>>>> If using SLAB_DESTROY_BY_RCU, the would-be deleter still needs to
>>>> revalidate after grabbing the update-side lock, right?  Hmmm...
>>> <start a brain refresh cycle>
>>>  <read again your questions>
>>>    Tilt... 
>>> hlist_nulls_del_init_rcu() is only used by a writer, exactly
>>> like hlist_del_init_rcu().
>>> I see nothing special about SLAB_DESTROY_BY_RCU here.
>>>
>>> static inline void hlist_del_init_rcu(struct hlist_node *n)
>>> {
>>>        if (!hlist_unhashed(n)) {
>>>                __hlist_del(n);
>>>                n->pprev = NULL;
>>>        }
>>> }
>> Not a problem, as you don't use it the way I was thinking.
>> For whatever it is worth, here is a more complete use case, on the
>> off-chance that it becomes useful some time:
>> 	retry:
>> 	rcu_read_lock();
>> 	hlist_nulls_for_each_entry_rcu(tpos, pos, head, hn_node) {
>> 		if (!(curgen = still_valid(tpos)))
>> 			goto retry;
>> 		if (needs_deletion(tpos)) {
>> 			spin_lock(&update_side_lock);
>> 			if (still_valid(tpos) == curgen)
>> 				hlist_nulls_del_init_rcu(pos);
>> 			spin_unlock(&update_side_lock);
>> 		}
>> 	}
>> 	rcu_read_unlock();
>> This approach requires that the key and a generation number be encoded
>> into a single word, and that the generation number be changed on each
>> allocation and on each free.
>
> Hum, we should add this template in Documentation/RCU, I guess.

With Arnaldo's change -- probably should prototype and test to find
the other inevitable bugs.  :-/

							Thanx, Paul


* [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
  2008-11-13 13:15                                                       ` [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls Eric Dumazet
  2008-11-13 13:34                                                         ` Peter Zijlstra
  2008-11-19 17:53                                                         ` Paul E. McKenney
@ 2008-11-23  9:33                                                         ` Eric Dumazet
  2008-11-23 15:59                                                           ` Paul E. McKenney
  2008-11-24  1:23                                                           ` David Miller
  2 siblings, 2 replies; 134+ messages in thread
From: Eric Dumazet @ 2008-11-23  9:33 UTC (permalink / raw)
  To: David Miller
  Cc: Paul E. McKenney, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell


Hi David

Please find patch to convert TCP/DCCP listening hash tables
to RCU.

A followup patch will cleanup all sk_node fields and macros
that are not used anymore.

Thanks

[PATCH] net: Convert TCP/DCCP listening hash tables to use RCU

This is the last step to be able to perform full RCU lookups
in __inet_lookup() : After established/timewait tables, we
add RCU lookups to listening hash table.

The only trick here is that a socket of a given type (TCP ipv4,
TCP ipv6, ...) can now be in flight between two different tables
(established and listening) during an RCU grace period, so we
must use different 'nulls' end-of-chain values for the two tables.

We define a large value :

#define LISTENING_NULLS_BASE (1U << 29)

So that slots in the listening table are guaranteed to have different
end-of-chain values from slots in the established table. A reader can
still detect that it finished its lookup in the right chain.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/net/inet_hashtables.h |    9 +
 net/ipv4/inet_diag.c          |    4
 net/ipv4/inet_hashtables.c    |  148 ++++++++++++++++----------------
 net/ipv4/tcp_ipv4.c           |    8 -
 net/ipv6/inet6_hashtables.c   |   94 ++++++++++++--------
 5 files changed, 147 insertions(+), 116 deletions(-)


diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ec7ee2e..df90118 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -99,9 +99,16 @@ struct inet_bind_hashbucket {
 	struct hlist_head	chain;
 };
 
+/*
+ * Sockets can be hashed in established or listening table
+ * We must use different 'nulls' end-of-chain value for listening
+ * hash table, or we might find a socket that was closed and
+ * reallocated/inserted into established hash table
+ */
+#define LISTENING_NULLS_BASE (1U << 29)
 struct inet_listen_hashbucket {
 	spinlock_t		lock;
-	struct hlist_head	head;
+	struct hlist_nulls_head	head;
 };
 
 /* This is for listening sockets, thus all sockets which possess wildcards. */
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 998a78f..588a779 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -720,13 +720,13 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 			struct sock *sk;
-			struct hlist_node *node;
+			struct hlist_nulls_node *node;
 			struct inet_listen_hashbucket *ilb;
 
 			num = 0;
 			ilb = &hashinfo->listening_hash[i];
 			spin_lock_bh(&ilb->lock);
-			sk_for_each(sk, node, &ilb->head) {
+			sk_nulls_for_each(sk, node, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
 				if (num < s_num) {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 4c273a9..11fcb87 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -110,78 +110,79 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
 
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
+static inline int compute_score(struct sock *sk, struct net *net,
+				const unsigned short hnum, const __be32 daddr,
+				const int dif)
+{
+	int score = -1;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+			!ipv6_only_sock(sk)) {
+		__be32 rcv_saddr = inet->rcv_saddr;
+		score = sk->sk_family == PF_INET ? 1 : 0;
+		if (rcv_saddr) {
+			if (rcv_saddr != daddr)
+				return -1;
+			score += 2;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score += 2;
+		}
+	}
+	return score;
+}
+
 /*
  * Don't inline this cruft. Here are some nice properties to exploit here. The
  * BSD API does not allow a listening sock to specify the remote port nor the
  * remote address for the connection. So always assume those are both
  * wildcarded during the search since they can never be otherwise.
  */
-static struct sock *inet_lookup_listener_slow(struct net *net,
-					      const struct hlist_head *head,
-					      const __be32 daddr,
-					      const unsigned short hnum,
-					      const int dif)
-{
-	struct sock *result = NULL, *sk;
-	const struct hlist_node *node;
-	int hiscore = -1;
-
-	sk_for_each(sk, node, head) {
-		const struct inet_sock *inet = inet_sk(sk);
-
-		if (net_eq(sock_net(sk), net) && inet->num == hnum &&
-				!ipv6_only_sock(sk)) {
-			const __be32 rcv_saddr = inet->rcv_saddr;
-			int score = sk->sk_family == PF_INET ? 1 : 0;
-
-			if (rcv_saddr) {
-				if (rcv_saddr != daddr)
-					continue;
-				score += 2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score += 2;
-			}
-			if (score == 5)
-				return sk;
-			if (score > hiscore) {
-				hiscore	= score;
-				result	= sk;
-			}
-		}
-	}
-	return result;
-}
 
-/* Optimize the common listener case. */
+
 struct sock *__inet_lookup_listener(struct net *net,
 				    struct inet_hashinfo *hashinfo,
 				    const __be32 daddr, const unsigned short hnum,
 				    const int dif)
 {
-	struct sock *sk = NULL;
-	struct inet_listen_hashbucket *ilb;
+	struct sock *sk, *result;
+	struct hlist_nulls_node *node;
+	unsigned int hash = inet_lhashfn(net, hnum);
+	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+	int score, hiscore;
 
-	ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-	spin_lock(&ilb->lock);
-	if (!hlist_empty(&ilb->head)) {
-		const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));
-
-		if (inet->num == hnum && !sk->sk_node.next &&
-		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-		    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
-			goto sherry_cache;
-		sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
+	rcu_read_lock();
+begin:
+	result = NULL;
+	hiscore = -1;
+	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+		score = compute_score(sk, net, hnum, daddr, dif);
+		if (score > hiscore) {
+			result = sk;
+			hiscore = score;
+		}
 	}
-	if (sk) {
-sherry_cache:
-		sock_hold(sk);
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+		goto begin;
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, daddr,
+				  dif) < hiscore)) {
+			sock_put(result);
+			goto begin;
+		}
 	}
-	spin_unlock(&ilb->lock);
-	return sk;
+	rcu_read_unlock();
+	return result;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
 
@@ -370,7 +371,7 @@ static void __inet_hash(struct sock *sk)
 	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
 	spin_lock(&ilb->lock);
-	__sk_add_node(sk, &ilb->head);
+	__sk_nulls_add_node_rcu(sk, &ilb->head);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	spin_unlock(&ilb->lock);
 }
@@ -388,26 +389,22 @@ EXPORT_SYMBOL_GPL(inet_hash);
 void inet_unhash(struct sock *sk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	spinlock_t *lock;
+	int done;
 
 	if (sk_unhashed(sk))
 		return;
 
-	if (sk->sk_state == TCP_LISTEN) {
-		struct inet_listen_hashbucket *ilb;
+	if (sk->sk_state == TCP_LISTEN)
+		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+	else
+		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
-		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-		spin_lock_bh(&ilb->lock);
-		if (__sk_del_node_init(sk))
-			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-		spin_unlock_bh(&ilb->lock);
-	} else {
-		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-
-		spin_lock_bh(lock);
-		if (__sk_nulls_del_node_init_rcu(sk))
-			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-		spin_unlock_bh(lock);
-	}
+	spin_lock_bh(lock);
+	done =__sk_nulls_del_node_init_rcu(sk);
+	spin_unlock_bh(lock);
+	if (done)
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
@@ -526,8 +523,11 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 {
 	int i;
 
-	for (i = 0; i < INET_LHTABLE_SIZE; i++)
+	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
 		spin_lock_init(&h->listening_hash[i].lock);
+		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+				      i + LISTENING_NULLS_BASE);
+		}
 }
 
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a81caa1..cab2458 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1868,7 +1868,7 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
 	struct inet_connection_sock *icsk;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct sock *sk = cur;
 	struct inet_listen_hashbucket *ilb;
 	struct tcp_iter_state *st = seq->private;
@@ -1878,7 +1878,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 		st->bucket = 0;
 		ilb = &tcp_hashinfo.listening_hash[0];
 		spin_lock_bh(&ilb->lock);
-		sk = sk_head(&ilb->head);
+		sk = sk_nulls_head(&ilb->head);
 		goto get_sk;
 	}
 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
@@ -1914,7 +1914,7 @@ get_req:
 		sk = sk_next(sk);
 	}
 get_sk:
-	sk_for_each_from(sk, node) {
+	sk_nulls_for_each_from(sk, node) {
 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
 			cur = sk;
 			goto out;
@@ -1935,7 +1935,7 @@ start_req:
 	if (++st->bucket < INET_LHTABLE_SIZE) {
 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
 		spin_lock_bh(&ilb->lock);
-		sk = sk_head(&ilb->head);
+		sk = sk_nulls_head(&ilb->head);
 		goto get_sk;
 	}
 	cur = NULL;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index e0fd681..8fe267f 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -33,7 +33,7 @@ void __inet6_hash(struct sock *sk)
 
 		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 		spin_lock(&ilb->lock);
-		__sk_add_node(sk, &ilb->head);
+		__sk_nulls_add_node_rcu(sk, &ilb->head);
 		spin_unlock(&ilb->lock);
 	} else {
 		unsigned int hash;
@@ -118,47 +118,71 @@ out:
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
 
+static int inline compute_score(struct sock *sk, struct net *net,
+				const unsigned short hnum,
+				const struct in6_addr *daddr,
+				const int dif)
+{
+	int score = -1;
+
+	if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
+	    sk->sk_family == PF_INET6) {
+		const struct ipv6_pinfo *np = inet6_sk(sk);
+
+		score = 1;
+		if (!ipv6_addr_any(&np->rcv_saddr)) {
+			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+				return -1;
+			score++;
+		}
+		if (sk->sk_bound_dev_if) {
+			if (sk->sk_bound_dev_if != dif)
+				return -1;
+			score++;
+		}
+	}
+	return score;
+}
+
 struct sock *inet6_lookup_listener(struct net *net,
 		struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
 		const unsigned short hnum, const int dif)
 {
 	struct sock *sk;
-	const struct hlist_node *node;
-	struct sock *result = NULL;
-	int score, hiscore = 0;
-	struct inet_listen_hashbucket *ilb;
-
-	ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-	spin_lock(&ilb->lock);
-	sk_for_each(sk, node, &ilb->head) {
-		if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
-				sk->sk_family == PF_INET6) {
-			const struct ipv6_pinfo *np = inet6_sk(sk);
-
-			score = 1;
-			if (!ipv6_addr_any(&np->rcv_saddr)) {
-				if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
-					continue;
-				score++;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score++;
-			}
-			if (score == 3) {
-				result = sk;
-				break;
-			}
-			if (score > hiscore) {
-				hiscore = score;
-				result = sk;
-			}
+	const struct hlist_nulls_node *node;
+	struct sock *result;
+	int score, hiscore;
+	unsigned int hash = inet_lhashfn(net, hnum);
+	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+
+	rcu_read_lock();
+begin:
+	result = NULL;
+	hiscore = -1;
+	sk_nulls_for_each(sk, node, &ilb->head) {
+		score = compute_score(sk, net, hnum, daddr, dif);
+		if (score > hiscore) {
+			hiscore = score;
+			result = sk;
 		}
 	}
-	if (result)
-		sock_hold(result);
-	spin_unlock(&ilb->lock);
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+		goto begin;
+	if (result) {
+		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+			result = NULL;
+		else if (unlikely(compute_score(result, net, hnum, daddr,
+				  dif) < hiscore)) {
+			sock_put(result);
+			goto begin;
+		}
+	}
+	rcu_read_unlock();
 	return result;
 }
 


* Re: [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
  2008-11-23  9:33                                                         ` [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU Eric Dumazet
@ 2008-11-23 15:59                                                           ` Paul E. McKenney
  2008-11-23 18:42                                                             ` Eric Dumazet
  2008-11-24  1:23                                                           ` David Miller
  1 sibling, 1 reply; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-23 15:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

On Sun, Nov 23, 2008 at 10:33:28AM +0100, Eric Dumazet wrote:
> Hi David
>
> Please find patch to convert TCP/DCCP listening hash tables
> to RCU.
>
> A followup patch will cleanup all sk_node fields and macros
> that are not used anymore.
>
> Thanks
>
> [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
>
> This is the last step to be able to perform full RCU lookups
> in __inet_lookup() : After established/timewait tables, we
> add RCU lookups to listening hash table.
>
> The only trick here is that a socket of a given type (TCP ipv4,
> TCP ipv6, ...) can now move between two different tables
> (established and listening) during a RCU grace period, so we
> must use different 'nulls' end-of-chain values for two tables.
>
> We define a large value :
>
> #define LISTENING_NULLS_BASE (1U << 29)

I do like this use of the full set of upper bits!  However, wouldn't it
be a good idea to use a larger base value for 64-bit systems, perhaps
using CONFIG_64BIT to choose?  500M entries might not seem like that
many in a few years time...

						Thanx, Paul

> So that slots in listening table are guaranteed to have different
> end-of-chain values than slots in established table. A reader can
> still detect it finished its lookup in the right chain.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
> include/net/inet_hashtables.h |    9 +
> net/ipv4/inet_diag.c          |    4
> net/ipv4/inet_hashtables.c    |  148 ++++++++++++++++----------------
> net/ipv4/tcp_ipv4.c           |    8 -
> net/ipv6/inet6_hashtables.c   |   94 ++++++++++++--------
> 5 files changed, 147 insertions(+), 116 deletions(-)

> diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
> index ec7ee2e..df90118 100644
> --- a/include/net/inet_hashtables.h
> +++ b/include/net/inet_hashtables.h
> @@ -99,9 +99,16 @@ struct inet_bind_hashbucket {
>  	struct hlist_head	chain;
>  };
> 
> +/*
> + * Sockets can be hashed in established or listening table
> + * We must use different 'nulls' end-of-chain value for listening
> + * hash table, or we might find a socket that was closed and
> + * reallocated/inserted into established hash table
> + */
> +#define LISTENING_NULLS_BASE (1U << 29)
>  struct inet_listen_hashbucket {
>  	spinlock_t		lock;
> -	struct hlist_head	head;
> +	struct hlist_nulls_head	head;
>  };
> 
>  /* This is for listening sockets, thus all sockets which possess wildcards. */
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index 998a78f..588a779 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -720,13 +720,13 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
> 
>  		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
>  			struct sock *sk;
> -			struct hlist_node *node;
> +			struct hlist_nulls_node *node;
>  			struct inet_listen_hashbucket *ilb;
> 
>  			num = 0;
>  			ilb = &hashinfo->listening_hash[i];
>  			spin_lock_bh(&ilb->lock);
> -			sk_for_each(sk, node, &ilb->head) {
> +			sk_nulls_for_each(sk, node, &ilb->head) {
>  				struct inet_sock *inet = inet_sk(sk);
> 
>  				if (num < s_num) {
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 4c273a9..11fcb87 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -110,78 +110,79 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
> 
>  EXPORT_SYMBOL_GPL(__inet_inherit_port);
> 
> +static inline int compute_score(struct sock *sk, struct net *net,
> +				const unsigned short hnum, const __be32 daddr,
> +				const int dif)
> +{
> +	int score = -1;
> +	struct inet_sock *inet = inet_sk(sk);
> +
> +	if (net_eq(sock_net(sk), net) && inet->num == hnum &&
> +			!ipv6_only_sock(sk)) {
> +		__be32 rcv_saddr = inet->rcv_saddr;
> +		score = sk->sk_family == PF_INET ? 1 : 0;
> +		if (rcv_saddr) {
> +			if (rcv_saddr != daddr)
> +				return -1;
> +			score += 2;
> +		}
> +		if (sk->sk_bound_dev_if) {
> +			if (sk->sk_bound_dev_if != dif)
> +				return -1;
> +			score += 2;
> +		}
> +	}
> +	return score;
> +}
> +
>  /*
>   * Don't inline this cruft. Here are some nice properties to exploit here. The
>   * BSD API does not allow a listening sock to specify the remote port nor the
>   * remote address for the connection. So always assume those are both
>   * wildcarded during the search since they can never be otherwise.
>   */
> -static struct sock *inet_lookup_listener_slow(struct net *net,
> -					      const struct hlist_head *head,
> -					      const __be32 daddr,
> -					      const unsigned short hnum,
> -					      const int dif)
> -{
> -	struct sock *result = NULL, *sk;
> -	const struct hlist_node *node;
> -	int hiscore = -1;
> -
> -	sk_for_each(sk, node, head) {
> -		const struct inet_sock *inet = inet_sk(sk);
> -
> -		if (net_eq(sock_net(sk), net) && inet->num == hnum &&
> -				!ipv6_only_sock(sk)) {
> -			const __be32 rcv_saddr = inet->rcv_saddr;
> -			int score = sk->sk_family == PF_INET ? 1 : 0;
> -
> -			if (rcv_saddr) {
> -				if (rcv_saddr != daddr)
> -					continue;
> -				score += 2;
> -			}
> -			if (sk->sk_bound_dev_if) {
> -				if (sk->sk_bound_dev_if != dif)
> -					continue;
> -				score += 2;
> -			}
> -			if (score == 5)
> -				return sk;
> -			if (score > hiscore) {
> -				hiscore	= score;
> -				result	= sk;
> -			}
> -		}
> -	}
> -	return result;
> -}
> 
> -/* Optimize the common listener case. */
> +
>  struct sock *__inet_lookup_listener(struct net *net,
>  				    struct inet_hashinfo *hashinfo,
>  				    const __be32 daddr, const unsigned short hnum,
>  				    const int dif)
>  {
> -	struct sock *sk = NULL;
> -	struct inet_listen_hashbucket *ilb;
> +	struct sock *sk, *result;
> +	struct hlist_nulls_node *node;
> +	unsigned int hash = inet_lhashfn(net, hnum);
> +	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
> +	int score, hiscore;
> 
> -	ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
> -	spin_lock(&ilb->lock);
> -	if (!hlist_empty(&ilb->head)) {
> -		const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));
> -
> -		if (inet->num == hnum && !sk->sk_node.next &&
> -		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
> -		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
> -		    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
> -			goto sherry_cache;
> -		sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
> +	rcu_read_lock();
> +begin:
> +	result = NULL;
> +	hiscore = -1;
> +	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
> +		score = compute_score(sk, net, hnum, daddr, dif);
> +		if (score > hiscore) {
> +			result = sk;
> +			hiscore = score;
> +		}
>  	}
> -	if (sk) {
> -sherry_cache:
> -		sock_hold(sk);
> +	/*
> +	 * if the nulls value we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 * We probably met an item that was moved to another chain.
> +	 */
> +	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
> +		goto begin;
> +	if (result) {
> +		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
> +			result = NULL;
> +		else if (unlikely(compute_score(result, net, hnum, daddr,
> +				  dif) < hiscore)) {
> +			sock_put(result);
> +			goto begin;
> +		}
>  	}
> -	spin_unlock(&ilb->lock);
> -	return sk;
> +	rcu_read_unlock();
> +	return result;
>  }
>  EXPORT_SYMBOL_GPL(__inet_lookup_listener);
> 
> @@ -370,7 +371,7 @@ static void __inet_hash(struct sock *sk)
>  	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
> 
>  	spin_lock(&ilb->lock);
> -	__sk_add_node(sk, &ilb->head);
> +	__sk_nulls_add_node_rcu(sk, &ilb->head);
>  	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
>  	spin_unlock(&ilb->lock);
>  }
> @@ -388,26 +389,22 @@ EXPORT_SYMBOL_GPL(inet_hash);
>  void inet_unhash(struct sock *sk)
>  {
>  	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
> +	spinlock_t *lock;
> +	int done;
> 
>  	if (sk_unhashed(sk))
>  		return;
> 
> -	if (sk->sk_state == TCP_LISTEN) {
> -		struct inet_listen_hashbucket *ilb;
> +	if (sk->sk_state == TCP_LISTEN)
> +		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
> +	else
> +		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
> 
> -		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
> -		spin_lock_bh(&ilb->lock);
> -		if (__sk_del_node_init(sk))
> -			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> -		spin_unlock_bh(&ilb->lock);
> -	} else {
> -		spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
> -
> -		spin_lock_bh(lock);
> -		if (__sk_nulls_del_node_init_rcu(sk))
> -			sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> -		spin_unlock_bh(lock);
> -	}
> +	spin_lock_bh(lock);
> +	done = __sk_nulls_del_node_init_rcu(sk);
> +	spin_unlock_bh(lock);
> +	if (done)
> +		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>  }
>  EXPORT_SYMBOL_GPL(inet_unhash);
> 
> @@ -526,8 +523,11 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
>  {
>  	int i;
> 
> -	for (i = 0; i < INET_LHTABLE_SIZE; i++)
> +	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
>  		spin_lock_init(&h->listening_hash[i].lock);
> +		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
> +				      i + LISTENING_NULLS_BASE);
> +		}
>  }
> 
>  EXPORT_SYMBOL_GPL(inet_hashinfo_init);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index a81caa1..cab2458 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1868,7 +1868,7 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
>  static void *listening_get_next(struct seq_file *seq, void *cur)
>  {
>  	struct inet_connection_sock *icsk;
> -	struct hlist_node *node;
> +	struct hlist_nulls_node *node;
>  	struct sock *sk = cur;
>  	struct inet_listen_hashbucket *ilb;
>  	struct tcp_iter_state *st = seq->private;
> @@ -1878,7 +1878,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
>  		st->bucket = 0;
>  		ilb = &tcp_hashinfo.listening_hash[0];
>  		spin_lock_bh(&ilb->lock);
> -		sk = sk_head(&ilb->head);
> +		sk = sk_nulls_head(&ilb->head);
>  		goto get_sk;
>  	}
>  	ilb = &tcp_hashinfo.listening_hash[st->bucket];
> @@ -1914,7 +1914,7 @@ get_req:
>  		sk = sk_next(sk);
>  	}
>  get_sk:
> -	sk_for_each_from(sk, node) {
> +	sk_nulls_for_each_from(sk, node) {
>  		if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
>  			cur = sk;
>  			goto out;
> @@ -1935,7 +1935,7 @@ start_req:
>  	if (++st->bucket < INET_LHTABLE_SIZE) {
>  		ilb = &tcp_hashinfo.listening_hash[st->bucket];
>  		spin_lock_bh(&ilb->lock);
> -		sk = sk_head(&ilb->head);
> +		sk = sk_nulls_head(&ilb->head);
>  		goto get_sk;
>  	}
>  	cur = NULL;
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index e0fd681..8fe267f 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -33,7 +33,7 @@ void __inet6_hash(struct sock *sk)
> 
>  		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
>  		spin_lock(&ilb->lock);
> -		__sk_add_node(sk, &ilb->head);
> +		__sk_nulls_add_node_rcu(sk, &ilb->head);
>  		spin_unlock(&ilb->lock);
>  	} else {
>  		unsigned int hash;
> @@ -118,47 +118,71 @@ out:
>  }
>  EXPORT_SYMBOL(__inet6_lookup_established);
> 
> +static inline int compute_score(struct sock *sk, struct net *net,
> +				const unsigned short hnum,
> +				const struct in6_addr *daddr,
> +				const int dif)
> +{
> +	int score = -1;
> +
> +	if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
> +	    sk->sk_family == PF_INET6) {
> +		const struct ipv6_pinfo *np = inet6_sk(sk);
> +
> +		score = 1;
> +		if (!ipv6_addr_any(&np->rcv_saddr)) {
> +			if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
> +				return -1;
> +			score++;
> +		}
> +		if (sk->sk_bound_dev_if) {
> +			if (sk->sk_bound_dev_if != dif)
> +				return -1;
> +			score++;
> +		}
> +	}
> +	return score;
> +}
> +
>  struct sock *inet6_lookup_listener(struct net *net,
>  		struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
>  		const unsigned short hnum, const int dif)
>  {
>  	struct sock *sk;
> -	const struct hlist_node *node;
> -	struct sock *result = NULL;
> -	int score, hiscore = 0;
> -	struct inet_listen_hashbucket *ilb;
> -
> -	ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
> -	spin_lock(&ilb->lock);
> -	sk_for_each(sk, node, &ilb->head) {
> -		if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
> -				sk->sk_family == PF_INET6) {
> -			const struct ipv6_pinfo *np = inet6_sk(sk);
> -
> -			score = 1;
> -			if (!ipv6_addr_any(&np->rcv_saddr)) {
> -				if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
> -					continue;
> -				score++;
> -			}
> -			if (sk->sk_bound_dev_if) {
> -				if (sk->sk_bound_dev_if != dif)
> -					continue;
> -				score++;
> -			}
> -			if (score == 3) {
> -				result = sk;
> -				break;
> -			}
> -			if (score > hiscore) {
> -				hiscore = score;
> -				result = sk;
> -			}
> +	const struct hlist_nulls_node *node;
> +	struct sock *result;
> +	int score, hiscore;
> +	unsigned int hash = inet_lhashfn(net, hnum);
> +	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
> +
> +	rcu_read_lock();
> +begin:
> +	result = NULL;
> +	hiscore = -1;
> +	sk_nulls_for_each(sk, node, &ilb->head) {
> +		score = compute_score(sk, net, hnum, daddr, dif);
> +		if (score > hiscore) {
> +			hiscore = score;
> +			result = sk;
>  		}
>  	}
> -	if (result)
> -		sock_hold(result);
> -	spin_unlock(&ilb->lock);
> +	/*
> +	 * if the nulls value we got at the end of this lookup is
> +	 * not the expected one, we must restart lookup.
> +	 * We probably met an item that was moved to another chain.
> +	 */
> +	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
> +		goto begin;
> +	if (result) {
> +		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
> +			result = NULL;
> +		else if (unlikely(compute_score(result, net, hnum, daddr,
> +				  dif) < hiscore)) {
> +			sock_put(result);
> +			goto begin;
> +		}
> +	}
> +	rcu_read_unlock();
>  	return result;
>  }
> 


^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
  2008-11-23 15:59                                                           ` Paul E. McKenney
@ 2008-11-23 18:42                                                             ` Eric Dumazet
  2008-11-23 19:17                                                               ` Paul E. McKenney
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-11-23 18:42 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

Paul E. McKenney wrote:
> On Sun, Nov 23, 2008 at 10:33:28AM +0100, Eric Dumazet wrote:
>> Hi David
>>
>> Please find patch to convert TCP/DCCP listening hash tables
>> to RCU.
>>
>> A followup patch will cleanup all sk_node fields and macros
>> that are not used anymore.
>>
>> Thanks
>>
>> [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
>>
>> This is the last step to be able to perform full RCU lookups
>> in __inet_lookup() : After established/timewait tables, we
>> add RCU lookups to listening hash table.
>>
>> The only trick here is that a socket of a given type (TCP ipv4,
>> TCP ipv6, ...) can now move between two different tables
>> (established and listening) during a RCU grace period, so we
>> must use different 'nulls' end-of-chain values for two tables.
>>
>> We define a large value :
>>
>> #define LISTENING_NULLS_BASE (1U << 29)
> 
> I do like this use of the full set of upper bits!  However, wouldn't it
> be a good idea to use a larger base value for 64-bit systems, perhaps
> using CONFIG_64BIT to choose?  500M entries might not seem like that
> many in a few years time...
> 

Well, this value is correct up to 2^29 slots, and a hash table of 2^32 bytes
(8 bytes per pointer)

A TCP socket uses about 1472 bytes on 64bit arches, so 2^29 sessions
would need 800 GB of ram, not counting dentries, inodes, ...

I really doubt a machine, even with 4096 cpus should/can handle so many
tcp sessions :)



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
  2008-11-23 18:42                                                             ` Eric Dumazet
@ 2008-11-23 19:17                                                               ` Paul E. McKenney
  2008-11-23 20:18                                                                 ` Eric Dumazet
  0 siblings, 1 reply; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-23 19:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

On Sun, Nov 23, 2008 at 07:42:14PM +0100, Eric Dumazet wrote:
> Paul E. McKenney wrote:
>> On Sun, Nov 23, 2008 at 10:33:28AM +0100, Eric Dumazet wrote:
>>> Hi David
>>>
>>> Please find patch to convert TCP/DCCP listening hash tables
>>> to RCU.
>>>
>>> A followup patch will cleanup all sk_node fields and macros
>>> that are not used anymore.
>>>
>>> Thanks
>>>
>>> [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
>>>
>>> This is the last step to be able to perform full RCU lookups
>>> in __inet_lookup() : After established/timewait tables, we
>>> add RCU lookups to listening hash table.
>>>
>>> The only trick here is that a socket of a given type (TCP ipv4,
>>> TCP ipv6, ...) can now move between two different tables
>>> (established and listening) during a RCU grace period, so we
>>> must use different 'nulls' end-of-chain values for two tables.
>>>
>>> We define a large value :
>>>
>>> #define LISTENING_NULLS_BASE (1U << 29)
>> I do like this use of the full set of upper bits!  However, wouldn't it
>> be a good idea to use a larger base value for 64-bit systems, perhaps
>> using CONFIG_64BIT to choose?  500M entries might not seem like that
>> many in a few years time...
>
> Well, this value is correct up to 2^29 slots, and a hash table of 2^32 
> bytes
> (8 bytes per pointer)
>
> A TCP socket uses about 1472 bytes on 64bit arches, so 2^29 sessions
> would need 800 GB of ram, not counting dentries, inodes, ...
>
> I really doubt a machine, even with 4096 cpus should/can handle so many
> tcp sessions :)

200MB per CPU, right?

But yes, now that you mention it, 800GB of memory dedicated to TCP
connections sounds almost as ridiculous as did 640K of memory in the
late 1970s.  ;-)

Nevertheless, I don't have an overwhelming objection to the current
code.  Easy enough to change should it become a problem, right?

						Thanx, Paul

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
  2008-11-23 19:17                                                               ` Paul E. McKenney
@ 2008-11-23 20:18                                                                 ` Eric Dumazet
  2008-11-23 22:33                                                                   ` Paul E. McKenney
  0 siblings, 1 reply; 134+ messages in thread
From: Eric Dumazet @ 2008-11-23 20:18 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

Paul E. McKenney wrote:
> On Sun, Nov 23, 2008 at 07:42:14PM +0100, Eric Dumazet wrote:
>> Paul E. McKenney wrote:
>>> On Sun, Nov 23, 2008 at 10:33:28AM +0100, Eric Dumazet wrote:
>>>> Hi David
>>>>
>>>> Please find patch to convert TCP/DCCP listening hash tables
>>>> to RCU.
>>>>
>>>> A followup patch will cleanup all sk_node fields and macros
>>>> that are not used anymore.
>>>>
>>>> Thanks
>>>>
>>>> [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
>>>>
>>>> This is the last step to be able to perform full RCU lookups
>>>> in __inet_lookup() : After established/timewait tables, we
>>>> add RCU lookups to listening hash table.
>>>>
>>>> The only trick here is that a socket of a given type (TCP ipv4,
>>>> TCP ipv6, ...) can now move between two different tables
>>>> (established and listening) during a RCU grace period, so we
>>>> must use different 'nulls' end-of-chain values for two tables.
>>>>
>>>> We define a large value :
>>>>
>>>> #define LISTENING_NULLS_BASE (1U << 29)
>>> I do like this use of the full set of upper bits!  However, wouldn't it
>>> be a good idea to use a larger base value for 64-bit systems, perhaps
>>> using CONFIG_64BIT to choose?  500M entries might not seem like that
>>> many in a few years time...
>> Well, this value is correct up to 2^29 slots, and a hash table of 2^32 
>> bytes
>> (8 bytes per pointer)
>>
>> A TCP socket uses about 1472 bytes on 64bit arches, so 2^29 sessions
>> would need 800 GB of ram, not counting dentries, inodes, ...
>>
>> I really doubt a machine, even with 4096 cpus should/can handle so many
>> tcp sessions :)
> 
> 200MB per CPU, right?
> 
> But yes, now that you mention it, 800GB of memory dedicated to TCP
> connections sounds almost as ridiculous as did 640K of memory in the
> late 1970s.  ;-)

;)

> 
> Nevertheless, I don't have an overwhelming objection to the current
> code.  Easy enough to change should it become a problem, right?

Sure. By that time, cpus might be 128 bits or 256 bits anyway :)



^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
  2008-11-23 20:18                                                                 ` Eric Dumazet
@ 2008-11-23 22:33                                                                   ` Paul E. McKenney
  0 siblings, 0 replies; 134+ messages in thread
From: Paul E. McKenney @ 2008-11-23 22:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Corey Minyard, Stephen Hemminger, benny+usenet,
	Linux Netdev List, Christoph Lameter, Peter Zijlstra,
	Evgeniy Polyakov, Christian Bell

On Sun, Nov 23, 2008 at 09:18:17PM +0100, Eric Dumazet wrote:
> Paul E. McKenney wrote:
>> On Sun, Nov 23, 2008 at 07:42:14PM +0100, Eric Dumazet wrote:
>>> Paul E. McKenney wrote:
>>>> On Sun, Nov 23, 2008 at 10:33:28AM +0100, Eric Dumazet wrote:
>>>>> Hi David
>>>>>
>>>>> Please find patch to convert TCP/DCCP listening hash tables
>>>>> to RCU.
>>>>>
>>>>> A followup patch will cleanup all sk_node fields and macros
>>>>> that are not used anymore.
>>>>>
>>>>> Thanks
>>>>>
>>>>> [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
>>>>>
>>>>> This is the last step to be able to perform full RCU lookups
>>>>> in __inet_lookup() : After established/timewait tables, we
>>>>> add RCU lookups to listening hash table.
>>>>>
>>>>> The only trick here is that a socket of a given type (TCP ipv4,
>>>>> TCP ipv6, ...) can now move between two different tables
>>>>> (established and listening) during a RCU grace period, so we
>>>>> must use different 'nulls' end-of-chain values for two tables.
>>>>>
>>>>> We define a large value :
>>>>>
>>>>> #define LISTENING_NULLS_BASE (1U << 29)
>>>> I do like this use of the full set of upper bits!  However, wouldn't it
>>>> be a good idea to use a larger base value for 64-bit systems, perhaps
>>>> using CONFIG_64BIT to choose?  500M entries might not seem like that
>>>> many in a few years time...
>>> Well, this value is correct up to 2^29 slots, and a hash table of 2^32 
>>> bytes
>>> (8 bytes per pointer)
>>>
>>> A TCP socket uses about 1472 bytes on 64bit arches, so 2^29 sessions
>>> would need 800 GB of ram, not counting dentries, inodes, ...
>>>
>>> I really doubt a machine, even with 4096 cpus should/can handle so many
>>> tcp sessions :)
>> 200MB per CPU, right?
>> But yes, now that you mention it, 800GB of memory dedicated to TCP
>> connections sounds almost as ridiculous as did 640K of memory in the
>> late 1970s.  ;-)
>
> ;)
>
>> Nevertheless, I don't have an overwhelming objection to the current
>> code.  Easy enough to change should it become a problem, right?
>
> Sure. By that time, cpus might be 128 bits or 256 bits anyway :)

Or even 640K bits.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 134+ messages in thread

* Re: [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU
  2008-11-23  9:33                                                         ` [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU Eric Dumazet
  2008-11-23 15:59                                                           ` Paul E. McKenney
@ 2008-11-24  1:23                                                           ` David Miller
  1 sibling, 0 replies; 134+ messages in thread
From: David Miller @ 2008-11-24  1:23 UTC (permalink / raw)
  To: dada1
  Cc: paulmck, minyard, shemminger, benny+usenet, netdev, cl,
	a.p.zijlstra, zbr, christian

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 23 Nov 2008 10:33:28 +0100

> [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU

Applied, thanks Eric.

^ permalink raw reply	[flat|nested] 134+ messages in thread

end of thread

Thread overview: 134+ messages
2008-10-06 18:50 [PATCH 3/3] Convert the UDP hash lock to RCU Corey Minyard
2008-10-06 21:22 ` Eric Dumazet
2008-10-06 21:40   ` David Miller
2008-10-06 23:08     ` Corey Minyard
2008-10-07  8:37       ` Evgeniy Polyakov
2008-10-07 14:16         ` Christoph Lameter
2008-10-07 14:29           ` Evgeniy Polyakov
2008-10-07 14:38             ` Christoph Lameter
2008-10-07 14:33           ` Paul E. McKenney
2008-10-07 14:45             ` Christoph Lameter
2008-10-07 15:07               ` Eric Dumazet
2008-10-07 15:07               ` Paul E. McKenney
2008-10-07  5:24     ` Eric Dumazet
2008-10-07  8:54       ` Benny Amorsen
2008-10-07 12:59         ` Eric Dumazet
2008-10-07 14:07           ` Stephen Hemminger
2008-10-07 20:55             ` David Miller
2008-10-07 21:20               ` Stephen Hemminger
2008-10-08 13:55               ` Eric Dumazet
2008-10-08 18:45                 ` David Miller
2008-10-28 20:37                   ` [PATCH 1/2] udp: introduce struct udp_table and multiple rwlocks Eric Dumazet
2008-10-28 21:23                     ` Christian Bell
2008-10-28 21:31                       ` Evgeniy Polyakov
2008-10-28 21:48                       ` Eric Dumazet
2008-10-28 21:28                     ` Evgeniy Polyakov
2008-10-28 20:42                   ` [PATCH 2/2] udp: RCU handling for Unicast packets Eric Dumazet
2008-10-28 22:45                     ` Eric Dumazet
2008-10-29  5:05                       ` David Miller
2008-10-29  8:23                         ` Eric Dumazet
2008-10-29  8:56                           ` David Miller
2008-10-29 10:19                             ` Eric Dumazet
2008-10-29 18:19                               ` David Miller
2008-10-29  9:04                           ` Eric Dumazet
2008-10-29  9:17                             ` David Miller
2008-10-29 13:17                             ` Corey Minyard
2008-10-29 14:36                               ` Eric Dumazet
2008-10-29 15:34                                 ` Corey Minyard
2008-10-29 16:09                                   ` Eric Dumazet
2008-10-29 16:37                                     ` Paul E. McKenney
2008-10-29 17:22                                       ` Corey Minyard
2008-10-29 17:45                                         ` Eric Dumazet
2008-10-29 18:28                                           ` Corey Minyard
2008-10-29 18:52                                             ` Paul E. McKenney
2008-10-29 20:00                                               ` Eric Dumazet
2008-10-29 20:17                                                 ` Paul E. McKenney
2008-10-29 21:29                                                   ` Corey Minyard
2008-10-29 21:57                                                     ` Eric Dumazet
2008-10-29 21:58                                                     ` Paul E. McKenney
2008-10-29 22:08                                                   ` Eric Dumazet
2008-10-30  3:22                                                     ` Corey Minyard
2008-10-30  5:50                                                       ` Eric Dumazet
2008-11-02  4:19                                                         ` David Miller
2008-10-30  5:40                                                     ` David Miller
2008-10-30  5:51                                                       ` Eric Dumazet
2008-10-30  7:04                                                         ` Eric Dumazet
2008-10-30  7:05                                                           ` David Miller
2008-10-30 15:40                                                     ` [PATCH] udp: Introduce special NULL pointers for hlist termination Eric Dumazet
2008-10-30 15:51                                                       ` Stephen Hemminger
2008-10-30 16:28                                                         ` Corey Minyard
2008-10-31 14:37                                                           ` Eric Dumazet
2008-10-31 14:55                                                             ` Pavel Emelyanov
2008-11-02  4:22                                                               ` David Miller
2008-10-30 17:12                                                         ` Eric Dumazet
2008-10-31  7:51                                                           ` David Miller
2008-10-30 16:01                                                       ` Peter Zijlstra
2008-10-31  0:14                                                       ` Keith Owens
2008-11-13 13:13                                                       ` [PATCH 0/3] net: RCU lookups for UDP, DCCP and TCP protocol Eric Dumazet
2008-11-13 17:20                                                         ` Andi Kleen
2008-11-17  3:41                                                         ` David Miller
2008-11-19 19:52                                                           ` Christoph Lameter
2008-11-13 13:14                                                       ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Eric Dumazet
2008-11-13 13:29                                                         ` Peter Zijlstra
2008-11-13 13:44                                                           ` Eric Dumazet
2008-11-13 16:02                                                             ` [PATCH 4/3] rcu: documents rculist_nulls Eric Dumazet
2008-11-14 15:16                                                               ` Peter Zijlstra
2008-11-17  3:36                                                                 ` David Miller
2008-11-19 17:07                                                               ` Paul E. McKenney
2008-11-14 15:16                                                         ` [PATCH 1/3] rcu: Introduce hlist_nulls variant of hlist Peter Zijlstra
2008-11-19 17:01                                                         ` Paul E. McKenney
2008-11-19 17:53                                                           ` Eric Dumazet
2008-11-19 18:46                                                             ` Paul E. McKenney
2008-11-19 18:53                                                               ` Arnaldo Carvalho de Melo
2008-11-19 21:17                                                                 ` Paul E. McKenney
2008-11-19 20:39                                                               ` Eric Dumazet
2008-11-19 21:21                                                                 ` Paul E. McKenney
2008-11-13 13:15                                                       ` [PATCH 2/3] udp: Use hlist_nulls in UDP RCU code Eric Dumazet
2008-11-19 17:29                                                         ` Paul E. McKenney
2008-11-19 17:53                                                           ` Eric Dumazet
2008-11-13 13:15                                                       ` [PATCH 3/3] net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls Eric Dumazet
2008-11-13 13:34                                                         ` Peter Zijlstra
2008-11-13 13:51                                                           ` Eric Dumazet
2008-11-13 14:08                                                             ` Christoph Lameter
2008-11-13 14:22                                                             ` Peter Zijlstra
2008-11-13 14:27                                                               ` Christoph Lameter
2008-11-19 17:53                                                         ` Paul E. McKenney
2008-11-23  9:33                                                         ` [PATCH] net: Convert TCP/DCCP listening hash tables to use RCU Eric Dumazet
2008-11-23 15:59                                                           ` Paul E. McKenney
2008-11-23 18:42                                                             ` Eric Dumazet
2008-11-23 19:17                                                               ` Paul E. McKenney
2008-11-23 20:18                                                                 ` Eric Dumazet
2008-11-23 22:33                                                                   ` Paul E. McKenney
2008-11-24  1:23                                                           ` David Miller
2008-10-30 11:04                                                 ` [PATCH 2/2] udp: RCU handling for Unicast packets Peter Zijlstra
2008-10-30 11:30                                                   ` Eric Dumazet
2008-10-30 18:25                                                     ` Paul E. McKenney
2008-10-31 16:40                                                       ` Eric Dumazet
2008-11-01  3:10                                                         ` Paul E. McKenney
2008-10-29 17:32                                       ` Eric Dumazet
2008-10-29 18:11                                         ` Paul E. McKenney
2008-10-29 18:29                                           ` David Miller
2008-10-29 18:38                                             ` Paul E. McKenney
2008-10-29 18:36                                           ` Eric Dumazet
2008-10-29 18:20                                 ` David Miller
2008-10-30 11:12                                 ` Peter Zijlstra
2008-10-30 11:29                                   ` Eric Dumazet
2008-10-28 20:37                 ` [PATCH 0/2] udp: Convert the UDP hash lock to RCU Eric Dumazet
2008-10-28 21:28                   ` Stephen Hemminger
2008-10-28 21:50                     ` Eric Dumazet
2008-10-07 16:43           ` [PATCH 3/3] " Corey Minyard
2008-10-07 18:26       ` David Miller
2008-10-08  8:35         ` Eric Dumazet
2008-10-08 16:38           ` David Miller
2008-10-07  8:31     ` Peter Zijlstra
2008-10-07 14:36       ` Paul E. McKenney
2008-10-07 18:29       ` David Miller
2008-10-06 22:07   ` Corey Minyard
2008-10-07  8:17   ` Peter Zijlstra
2008-10-07  9:24     ` Eric Dumazet
2008-10-07 14:15       ` Christoph Lameter
2008-10-07 14:38         ` Paul E. McKenney
2008-10-07 14:50         ` Eric Dumazet
2008-10-07 15:05           ` Paul E. McKenney
2008-10-07 15:09           ` Peter Zijlstra
2008-10-07 15:23           ` Christoph Lameter
