* [PATCH 3/3] Convert the UDP hash lock to RCU
@ 2008-10-06 18:50 Corey Minyard
2008-10-06 21:22 ` Eric Dumazet
0 siblings, 1 reply; 36+ messages in thread
From: Corey Minyard @ 2008-10-06 18:50 UTC (permalink / raw)
To: Linux Kernel, netdev; +Cc: shemminger, paulmck
Change the UDP hash lock from an rwlock to RCU.
Signed-off-by: Corey Minyard <cminyard@mvista.com>
---
include/net/udp.h | 9 +++++----
net/ipv4/udp.c | 47 +++++++++++++++++++++++++++--------------------
net/ipv6/udp.c | 17 +++++++++--------
3 files changed, 41 insertions(+), 32 deletions(-)
diff --git a/include/net/udp.h b/include/net/udp.h
index addcdc6..35aa104 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -51,7 +51,7 @@ struct udp_skb_cb {
#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-extern rwlock_t udp_hash_lock;
+extern spinlock_t udp_hash_wlock;
/* Note: this must match 'valbool' in sock_setsockopt */
@@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
static inline void udp_lib_unhash(struct sock *sk)
{
- write_lock_bh(&udp_hash_lock);
- if (sk_del_node_init(sk)) {
+ spin_lock_bh(&udp_hash_wlock);
+ if (sk_del_node_init_rcu(sk)) {
inet_sk(sk)->num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
- write_unlock_bh(&udp_hash_lock);
+ spin_unlock_bh(&udp_hash_wlock);
+ synchronize_rcu();
}
static inline void udp_lib_close(struct sock *sk, long timeout)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 57e26fa..1b65cb6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -112,7 +112,8 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
EXPORT_SYMBOL(udp_stats_in6);
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
-DEFINE_RWLOCK(udp_hash_lock);
+DEFINE_SPINLOCK(udp_hash_wlock);
+EXPORT_SYMBOL(udp_hash_wlock);
int sysctl_udp_mem[3] __read_mostly;
int sysctl_udp_rmem_min __read_mostly;
@@ -155,7 +156,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
int error = 1;
struct net *net = sock_net(sk);
- write_lock_bh(&udp_hash_lock);
+ spin_lock_bh(&udp_hash_wlock);
if (!snum) {
int i, low, high, remaining;
@@ -225,12 +226,12 @@ gotit:
sk->sk_hash = snum;
if (sk_unhashed(sk)) {
head = &udptable[udp_hashfn(net, snum)];
- sk_add_node(sk, head);
+ sk_add_node_rcu(sk, head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}
error = 0;
fail:
- write_unlock_bh(&udp_hash_lock);
+ spin_unlock_bh(&udp_hash_wlock);
return error;
}
@@ -260,8 +261,8 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned short hnum = ntohs(dport);
int badness = -1;
- read_lock(&udp_hash_lock);
- sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
+ rcu_read_lock();
+ sk_for_each_rcu(sk, node, &udptable[udp_hashfn(net, hnum)]) {
struct inet_sock *inet = inet_sk(sk);
if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
@@ -296,9 +297,17 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
}
}
}
+ /*
+ * Note that this is safe, even with an RCU lock.
+ * udp_lib_unhash() is the removal function, it calls
+ * synchronize_rcu() and the socket counter cannot go to
+ * zero until it returns. So if we increment it inside the
+ * RCU read lock, it should never go to zero and then be
+ * incremented again.
+ */
if (result)
sock_hold(result);
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
return result;
}
@@ -311,7 +320,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
struct sock *s = sk;
unsigned short hnum = ntohs(loc_port);
- sk_for_each_from(s, node) {
+ sk_for_each_from_rcu(s, node) {
struct inet_sock *inet = inet_sk(s);
if (s->sk_hash != hnum ||
@@ -1094,8 +1103,8 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct sock *sk;
int dif;
- read_lock(&udp_hash_lock);
- sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+ rcu_read_lock();
+ sk = sk_head_rcu(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
dif = skb->dev->ifindex;
sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
if (sk) {
@@ -1104,8 +1113,9 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
do {
struct sk_buff *skb1 = skb;
- sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
- uh->source, saddr, dif);
+ sknext = udp_v4_mcast_next(sk_next_rcu(sk), uh->dest,
+ daddr, uh->source, saddr,
+ dif);
if (sknext)
skb1 = skb_clone(skb, GFP_ATOMIC);
@@ -1120,7 +1130,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
} while (sknext);
} else
kfree_skb(skb);
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
return 0;
}
@@ -1543,13 +1553,13 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
struct net *net = seq_file_net(seq);
do {
- sk = sk_next(sk);
+ sk = sk_next_rcu(sk);
try_again:
;
} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
- sk = sk_head(state->hashtable + state->bucket);
+ sk = sk_head_rcu(state->hashtable + state->bucket);
goto try_again;
}
return sk;
@@ -1566,9 +1576,8 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
}
static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(udp_hash_lock)
{
- read_lock(&udp_hash_lock);
+ rcu_read_lock();
return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
@@ -1586,9 +1595,8 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void udp_seq_stop(struct seq_file *seq, void *v)
- __releases(udp_hash_lock)
{
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
}
static int udp_seq_open(struct inode *inode, struct file *file)
@@ -1732,7 +1740,6 @@ void __init udp_init(void)
EXPORT_SYMBOL(udp_disconnect);
EXPORT_SYMBOL(udp_hash);
-EXPORT_SYMBOL(udp_hash_lock);
EXPORT_SYMBOL(udp_ioctl);
EXPORT_SYMBOL(udp_prot);
EXPORT_SYMBOL(udp_sendmsg);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a6aecf7..b807de7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -64,8 +64,8 @@ static struct sock *__udp6_lib_lookup(struct net *net,
unsigned short hnum = ntohs(dport);
int badness = -1;
- read_lock(&udp_hash_lock);
- sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
+ rcu_read_lock();
+ sk_for_each_rcu(sk, node, &udptable[udp_hashfn(net, hnum)]) {
struct inet_sock *inet = inet_sk(sk);
if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
@@ -101,9 +101,10 @@ static struct sock *__udp6_lib_lookup(struct net *net,
}
}
}
+ /* See comment in __udp4_lib_lookup on why this is safe. */
if (result)
sock_hold(result);
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
return result;
}
@@ -322,7 +323,7 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
struct sock *s = sk;
unsigned short num = ntohs(loc_port);
- sk_for_each_from(s, node) {
+ sk_for_each_from_rcu(s, node) {
struct inet_sock *inet = inet_sk(s);
if (sock_net(s) != sock_net(sk))
@@ -365,8 +366,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
const struct udphdr *uh = udp_hdr(skb);
int dif;
- read_lock(&udp_hash_lock);
- sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
+ rcu_read_lock();
+ sk = sk_head_rcu(&udptable[udp_hashfn(net, ntohs(uh->dest))]);
dif = inet6_iif(skb);
sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
if (!sk) {
@@ -375,7 +376,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
}
sk2 = sk;
- while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr,
+ while ((sk2 = udp_v6_mcast_next(sk_next_rcu(sk2), uh->dest, daddr,
uh->source, saddr, dif))) {
struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
if (buff) {
@@ -394,7 +395,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
out:
- read_unlock(&udp_hash_lock);
+ rcu_read_unlock();
return 0;
}
--
1.5.6.5
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 18:50 [PATCH 3/3] Convert the UDP hash lock to RCU Corey Minyard
@ 2008-10-06 21:22 ` Eric Dumazet
2008-10-06 21:40 ` David Miller
` (2 more replies)
0 siblings, 3 replies; 36+ messages in thread
From: Eric Dumazet @ 2008-10-06 21:22 UTC (permalink / raw)
To: minyard; +Cc: Linux Kernel, netdev, shemminger, paulmck
Corey Minyard a écrit :
> Change the UDP hash lock from an rwlock to RCU.
>
> Signed-off-by: Corey Minyard <cminyard@mvista.com>
> ---
> include/net/udp.h | 9 +++++----
> net/ipv4/udp.c | 47 +++++++++++++++++++++++++++--------------------
> net/ipv6/udp.c | 17 +++++++++--------
> 3 files changed, 41 insertions(+), 32 deletions(-)
>
> diff --git a/include/net/udp.h b/include/net/udp.h
> index addcdc6..35aa104 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -51,7 +51,7 @@ struct udp_skb_cb {
> #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
>
> extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
> -extern rwlock_t udp_hash_lock;
> +extern spinlock_t udp_hash_wlock;
>
>
> /* Note: this must match 'valbool' in sock_setsockopt */
> @@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
>
> static inline void udp_lib_unhash(struct sock *sk)
> {
> - write_lock_bh(&udp_hash_lock);
> - if (sk_del_node_init(sk)) {
> + spin_lock_bh(&udp_hash_wlock);
> + if (sk_del_node_init_rcu(sk)) {
> inet_sk(sk)->num = 0;
> sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
> }
> - write_unlock_bh(&udp_hash_lock);
> + spin_unlock_bh(&udp_hash_wlock);
> + synchronize_rcu();
UDP central rwlock can hurt performance, because of cache line ping pong,
so your patch really makes sense.
Me wondering what impact this synchronize_rcu() can have on mono-threaded
VOIP applications using lot of UDP sockets. What is the maximum delay of
this function ?
For "struct file" freeing, we chose call_rcu() instead of synchronize_rcu()
Maybe we could add a generic rcu head to struct sock, and use call_rcu() in
sk_prot_free() for sockets needing RCU (udp after your patch is applied, maybe
tcp on future patches, while I believe previous work on the subject concluded
RCU was not giving good results for short lived http sessions) ?
Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free()
done in sk_prot_free() can defer freeing to RCU...
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 21:22 ` Eric Dumazet
@ 2008-10-06 21:40 ` David Miller
2008-10-06 23:08 ` Corey Minyard
` (2 more replies)
2008-10-06 22:07 ` Corey Minyard
2008-10-07 8:17 ` Peter Zijlstra
2 siblings, 3 replies; 36+ messages in thread
From: David Miller @ 2008-10-06 21:40 UTC (permalink / raw)
To: dada1; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 06 Oct 2008 23:22:31 +0200
> Me wondering what impact this synchronize_rcu() can have on mono-threaded
> VOIP applications using lot of UDP sockets. What is the maximum delay of
> this function ?
The cost is enormous, we really can't use it here.
I have a patch that did top-level socket destruction using RCU,
and that didn't use synchronize_rcu(), and that killed connection
rates by up to 20%.
I can only imagine what the cost would be if I had to add such a call
in there.
Really, I can't consider these changes seriously, as-is.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 21:22 ` Eric Dumazet
2008-10-06 21:40 ` David Miller
@ 2008-10-06 22:07 ` Corey Minyard
2008-10-07 8:17 ` Peter Zijlstra
2 siblings, 0 replies; 36+ messages in thread
From: Corey Minyard @ 2008-10-06 22:07 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel, netdev, shemminger, paulmck
Eric Dumazet wrote:
> Corey Minyard a écrit :
>> Change the UDP hash lock from an rwlock to RCU.
>>
>> Signed-off-by: Corey Minyard <cminyard@mvista.com>
>> ---
>> include/net/udp.h | 9 +++++----
>> net/ipv4/udp.c | 47
>> +++++++++++++++++++++++++++--------------------
>> net/ipv6/udp.c | 17 +++++++++--------
>> 3 files changed, 41 insertions(+), 32 deletions(-)
>>
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index addcdc6..35aa104 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -51,7 +51,7 @@ struct udp_skb_cb {
>> #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
>>
>> extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
>> -extern rwlock_t udp_hash_lock;
>> +extern spinlock_t udp_hash_wlock;
>>
>>
>> /* Note: this must match 'valbool' in sock_setsockopt */
>> @@ -112,12 +112,13 @@ static inline void udp_lib_hash(struct sock *sk)
>>
>> static inline void udp_lib_unhash(struct sock *sk)
>> {
>> - write_lock_bh(&udp_hash_lock);
>> - if (sk_del_node_init(sk)) {
>> + spin_lock_bh(&udp_hash_wlock);
>> + if (sk_del_node_init_rcu(sk)) {
>> inet_sk(sk)->num = 0;
>> sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
>> }
>> - write_unlock_bh(&udp_hash_lock);
>> + spin_unlock_bh(&udp_hash_wlock);
>> + synchronize_rcu();
>
> UDP central rwlock can hurt performance, because of cache line ping pong,
> so your patch really makes sense.
>
> Me wondering what impact this synchronize_rcu() can have on mono-threaded
> VOIP applications using lot of UDP sockets. What is the maximum delay of
> this function ?
It delays until all currently executing RCU read-side sections have
executed (new ones don't count, just currently executing ones). I'm not
sure what this delay is, but I would expect it to be fairly small. This
function is only called when a socket is closed, too, so it's not a
high-runner. Paul would certainly know better than me.
>
> For "struct file" freeing, we chose call_rcu() instead of
> synchronize_rcu()
I'd prefer that, too, but that would mean adding another member to the
socket structure.
>
> Maybe we could add a generic rcu head to struct sock, and use
> call_rcu() in
> sk_prot_free() for sockets needing RCU (udp after your patch is
> applied, maybe
> tcp on future patches, while I believe previous work on the subject
> concluded
> RCU was not giving good results for short lived http sessions) ?
RCU probably wouldn't be a good choice for short-lived http sessions,
since you will only get a couple of messages that would matter. I'm not
against adding an item to struct sock, but this is not a common thing
and struct sock was already big and ugly.
>
> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
> in sk_prot_free() can defer freeing to RCU...
That's an interesting thought; I didn't know that capability was there.
I can look at that. With this, the short-lived TCP sessions might not
matter, though that's a different issue.
-corey
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 21:40 ` David Miller
@ 2008-10-06 23:08 ` Corey Minyard
2008-10-07 8:37 ` Evgeniy Polyakov
2008-10-07 5:24 ` Eric Dumazet
2008-10-07 8:31 ` Peter Zijlstra
2 siblings, 1 reply; 36+ messages in thread
From: Corey Minyard @ 2008-10-06 23:08 UTC (permalink / raw)
To: David Miller; +Cc: dada1, linux-kernel, netdev, shemminger, paulmck
David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
>
>
>> Me wondering what impact this synchronize_rcu() can have on mono-threaded
>> VOIP applications using lot of UDP sockets. What is the maximum delay of
>> this function ?
>>
>
> The cost is enormous, we really can't use it here.
>
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.
>
> I can only imagine what the cost would be if I had to add such a call
> in there.
>
> Really, I can't consider these changes seriously, as-is.
>
>
Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?
-corey
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 21:40 ` David Miller
2008-10-06 23:08 ` Corey Minyard
@ 2008-10-07 5:24 ` Eric Dumazet
2008-10-07 8:54 ` Benny Amorsen
2008-10-07 18:26 ` David Miller
2008-10-07 8:31 ` Peter Zijlstra
2 siblings, 2 replies; 36+ messages in thread
From: Eric Dumazet @ 2008-10-07 5:24 UTC (permalink / raw)
To: David Miller; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck
David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
>
>> Me wondering what impact this synchronize_rcu() can have on mono-threaded
>> VOIP applications using lot of UDP sockets. What is the maximum delay of
>> this function ?
>
> The cost is enormous, we really can't use it here.
>
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.
>
> I can only imagine what the cost would be if I had to add such a call
> in there.
>
> Really, I can't consider these changes seriously, as-is.
Yes, I suppose you are right about TCP sessions, that should stay as they are.
Then if we use call_rcu() RCU freeing only for UDP sockets, we should get rid of
taking this rwlock each time we handle an incoming datagram, and introduce
no extra cost for other sockets.
Most UDP sockets are set up for long periods (RTP traffic), or if an application really
wants to {open/send or receive one UDP frame/close} many sockets, it already hits
RCU handling of its file structures and should not be slowed down that much.
By 'long period' I mean thousands of packets sent/received by each RTP session, being
voice (50 packets/second) or even worse video...
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 21:22 ` Eric Dumazet
2008-10-06 21:40 ` David Miller
2008-10-06 22:07 ` Corey Minyard
@ 2008-10-07 8:17 ` Peter Zijlstra
2008-10-07 9:24 ` Eric Dumazet
2 siblings, 1 reply; 36+ messages in thread
From: Peter Zijlstra @ 2008-10-07 8:17 UTC (permalink / raw)
To: Eric Dumazet; +Cc: minyard, Linux Kernel, netdev, shemminger, paulmck
On Mon, 2008-10-06 at 23:22 +0200, Eric Dumazet wrote:
> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free()
> done in sk_prot_free() can defer freeing to RCU...
Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
RCU-freed, this means that slab object pointers stay pointing to valid
memory, but it does _NOT_ mean those slab objects themselves remain
valid.
The slab allocator is free to re-use those objects at any time -
irrespective of the rcu-grace period. Therefore you will have to be able
to validate that the object you point to is indeed the object you
expect, otherwise strange and wonderful things will happen.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 21:40 ` David Miller
2008-10-06 23:08 ` Corey Minyard
2008-10-07 5:24 ` Eric Dumazet
@ 2008-10-07 8:31 ` Peter Zijlstra
2008-10-07 14:36 ` Paul E. McKenney
2008-10-07 18:29 ` David Miller
2 siblings, 2 replies; 36+ messages in thread
From: Peter Zijlstra @ 2008-10-07 8:31 UTC (permalink / raw)
To: David Miller; +Cc: dada1, minyard, linux-kernel, netdev, shemminger, paulmck
On Mon, 2008-10-06 at 14:40 -0700, David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 06 Oct 2008 23:22:31 +0200
>
> > Me wondering what impact this synchronize_rcu() can have on mono-threaded
> > VOIP applications using lot of UDP sockets. What is the maximum delay of
> > this function ?
>
> The cost is enormous, we really can't use it here.
>
> I have a patch that did top-level socket destruction using RCU,
> and that didn't use synchronize_rcu(), and that killed connection
> rates by up to %20.
Did you ever figure out why you lost those 20% ?
> I can only imagine what the cost would be if I had to add such a call
> in there.
Yeah, sync_rcu() is ridiculously expensive, at best 3 jiffies IIRC.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-06 23:08 ` Corey Minyard
@ 2008-10-07 8:37 ` Evgeniy Polyakov
2008-10-07 14:16 ` Christoph Lameter
0 siblings, 1 reply; 36+ messages in thread
From: Evgeniy Polyakov @ 2008-10-07 8:37 UTC (permalink / raw)
To: Corey Minyard
Cc: David Miller, dada1, linux-kernel, netdev, shemminger, paulmck
On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@acm.org) wrote:
> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?
I tested skb destruction via RCU path, and got 2.5 times worse numbers
with small-packets-bulk-transfer workload.
For more details
http://tservice.net.ru/~s0mbre/blog/devel/networking/2006_12_05_1.html
--
Evgeniy Polyakov
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 5:24 ` Eric Dumazet
@ 2008-10-07 8:54 ` Benny Amorsen
2008-10-07 12:59 ` Eric Dumazet
2008-10-07 18:26 ` David Miller
1 sibling, 1 reply; 36+ messages in thread
From: Benny Amorsen @ 2008-10-07 8:54 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, minyard, linux-kernel, netdev, shemminger, paulmck
Eric Dumazet <dada1@cosmosbay.com> writes:
> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> RCU handling of its file structures and should not be slowed down that much.
>
> By 'long period' I mean thousand of packets sent/received by each RTP session, being
> voice (50 packets/second) or even worse video...
Does DNS with port randomization need short lived sockets?
/Benny
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 8:17 ` Peter Zijlstra
@ 2008-10-07 9:24 ` Eric Dumazet
2008-10-07 14:15 ` Christoph Lameter
0 siblings, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2008-10-07 9:24 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: minyard, Linux Kernel, netdev, shemminger, paulmck
Peter Zijlstra a écrit :
> On Mon, 2008-10-06 at 23:22 +0200, Eric Dumazet wrote:
>
>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free()
>> done in sk_prot_free() can defer freeing to RCU...
>
> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
> RCU-freed, this means that slab object pointers stay pointing to valid
> memory, but it does _NOT_ mean those slab objects themselves remain
> valid.
>
> The slab allocator is free to re-use those objects at any time -
> irrespective of the rcu-grace period. Therefore you will have to be able
> to validate that the object you point to is indeed the object you
> expect, otherwise strange and wonderful things will happen.
>
Thanks for this clarification. I guess we really need a rcu head then :)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 8:54 ` Benny Amorsen
@ 2008-10-07 12:59 ` Eric Dumazet
2008-10-07 14:07 ` Stephen Hemminger
2008-10-07 16:43 ` Corey Minyard
0 siblings, 2 replies; 36+ messages in thread
From: Eric Dumazet @ 2008-10-07 12:59 UTC (permalink / raw)
To: Benny Amorsen
Cc: David Miller, minyard, linux-kernel, netdev, shemminger, paulmck
Benny Amorsen a écrit :
> Eric Dumazet <dada1@cosmosbay.com> writes:
>
>> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
>> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
>> RCU handling of its file structures and should not be slowed down that much.
>>
I should have said 'Many' instead of 'Most' :)
>> By 'long period' I mean thousand of packets sent/received by each RTP session, being
>> voice (50 packets/second) or even worse video...
>
> Does DNS with port randomization need short lived sockets?
>
Yes very true, but current allocation of a random port can be very expensive,
since we scan all the UDP hash table to select the smaller hash chain.
We stop the scan if we find an empty slot, but on machines with say more than 200
bound UDP sockets, there are probably no empty slots. (UDP_HTABLE_SIZE is 128)
bind(NULL port) algo is then O(N), N being number of bound UDP sockets.
So heavy DNS servers/proxies probably use a pool/range of pre-allocated sockets
to avoid costs of allocating/freeing them ? If they don't care about that cost,
the extra call_rcu() will be unnoticed.
For pathological (yet very common :) ) cases like single DNS query/answer, RCU
would mean :
Pros :
- one fewer rwlock hit when receiving the answer (if any)
Cons :
- one call_rcu() to delay socket freeing/reuse after RCU period.
So it might be a little bit more expensive than without RCU
I agree I am more interested in optimizing UDP stack for heavy users like RTP
servers/proxies handling xxx.000 packets/second than DNS users/servers.
Shame on me :)
(2 weeks ago, Corey mentioned a 10x increase on UDP throughput on a 16-way machine,
that sounds promising)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 12:59 ` Eric Dumazet
@ 2008-10-07 14:07 ` Stephen Hemminger
2008-10-07 20:55 ` David Miller
2008-10-07 16:43 ` Corey Minyard
1 sibling, 1 reply; 36+ messages in thread
From: Stephen Hemminger @ 2008-10-07 14:07 UTC (permalink / raw)
To: Eric Dumazet
Cc: Benny Amorsen, David Miller, minyard, linux-kernel, netdev,
paulmck
On Tue, 07 Oct 2008 14:59:20 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:
> Benny Amorsen a écrit :
> > Eric Dumazet <dada1@cosmosbay.com> writes:
> >
> >> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> >> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> >> RCU handling of its file structures and should not be slowed down that much.
> >>
>
> I should have say 'Many' instead of 'Most' :)
>
> >> By 'long period' I mean thousand of packets sent/received by each RTP session, being
> >> voice (50 packets/second) or even worse video...
> >
> > Does DNS with port randomization need short lived sockets?
> >
>
> Yes very true, but current allocation of a random port can be very expensive,
> since we scan all the UDP hash table to select the smaller hash chain.
>
> We stop the scan if we find an empty slot, but on machines with say more than 200
> bound UDP sockets, they are probably no empty slots. (UDP_HTABLE_SIZE is 128)
>
> bind(NULL port) algo is then O(N), N being number of bound UDP sockets.
>
> So heavy DNS servers/proxies probably use a pool/range of pre-allocated sockets
> to avoid costs of allocating/freeing them ? If they dont care about that cost,
> the extra call_rcu() will be unnoticed.
>
> For pathological (yet very common :) ) cases like single DNS query/answer, RCU
> would mean :
>
> Pros :
> - one few rwlock hit when receiving the answer (if any)
> Cons :
> - one call_rcu() to delay socket freeing/reuse after RCU period.
>
> So it might be a litle bit more expensive than without RCU
>
> I agree I am more interested in optimizing UDP stack for heavy users like RTP
> servers/proxies handling xxx.000 packets/second than DNS users/servers.
> Shame on me :)
>
> (2 weeks ago, Corey mentioned a 10x increase on UDP throughput on a 16-way machine,
> that sounds promising)
The idea of keeping chains short is the problem. That code should just be pulled because
it doesn't help that much, and also creates bias on the port randomization.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 9:24 ` Eric Dumazet
@ 2008-10-07 14:15 ` Christoph Lameter
2008-10-07 14:38 ` Paul E. McKenney
2008-10-07 14:50 ` Eric Dumazet
0 siblings, 2 replies; 36+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:15 UTC (permalink / raw)
To: Eric Dumazet
Cc: Peter Zijlstra, minyard, Linux Kernel, netdev, shemminger,
paulmck
Eric Dumazet wrote:
>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
>>> in sk_prot_free() can defer freeing to RCU...
>>
>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
>> RCU-freed, this means that slab object pointers stay pointing to valid
>> memory, but it does _NOT_ mean those slab objects themselves remain
>> valid.
>>
>> The slab allocator is free to re-use those objects at any time -
>> irrespective of the rcu-grace period. Therefore you will have to be able
>> to validate that the object you point to is indeed the object you
>> expect, otherwise strange and wonderful things will happen.
>>
> Thanks for this clarification. I guess we really need a rcu head then :)
No you just need to make sure that the object you located is still active
(f.e. refcount > 0) and that it is really a match (hash pointers may be
updated asynchronously and therefore point to the object that has been reused
for something else).
Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
cache hot advantages of the objects. Regular RCU freeing will let the object
expire for a tick or so which will result in the cacheline cooling down.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 8:37 ` Evgeniy Polyakov
@ 2008-10-07 14:16 ` Christoph Lameter
2008-10-07 14:29 ` Evgeniy Polyakov
2008-10-07 14:33 ` Paul E. McKenney
0 siblings, 2 replies; 36+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:16 UTC (permalink / raw)
To: Evgeniy Polyakov
Cc: Corey Minyard, David Miller, dada1, linux-kernel, netdev,
shemminger, paulmck
Evgeniy Polyakov wrote:
> On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@acm.org) wrote:
>> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?
>
> I tested skb destruction via RCU path, and got 2.5 times worse numbers
> with small-packets-bulk-transfer workload.
Was this with regular RCU freeing? This will cool down the cacheline before
frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:16 ` Christoph Lameter
@ 2008-10-07 14:29 ` Evgeniy Polyakov
2008-10-07 14:38 ` Christoph Lameter
2008-10-07 14:33 ` Paul E. McKenney
1 sibling, 1 reply; 36+ messages in thread
From: Evgeniy Polyakov @ 2008-10-07 14:29 UTC (permalink / raw)
To: Christoph Lameter
Cc: Corey Minyard, David Miller, dada1, linux-kernel, netdev,
shemminger, paulmck
On Tue, Oct 07, 2008 at 09:16:13AM -0500, Christoph Lameter (cl@linux-foundation.org) wrote:
> > I tested skb destruction via RCU path, and got 2.5 times worse numbers
> > with small-packets-bulk-transfer workload.
>
> Was this with regular RCU freeing? This will cool down the cacheline before
> frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.
I believe there was no SLAB_DESTROY_BY_RCU 2 years ago :)
It was pure call_rcu(&skb->rcu, real_skb_freeing), where
real_skb_freeing() just did usual kfree().
--
Evgeniy Polyakov
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:16 ` Christoph Lameter
2008-10-07 14:29 ` Evgeniy Polyakov
@ 2008-10-07 14:33 ` Paul E. McKenney
2008-10-07 14:45 ` Christoph Lameter
1 sibling, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2008-10-07 14:33 UTC (permalink / raw)
To: Christoph Lameter
Cc: Evgeniy Polyakov, Corey Minyard, David Miller, dada1,
linux-kernel, netdev, shemminger
On Tue, Oct 07, 2008 at 09:16:13AM -0500, Christoph Lameter wrote:
> Evgeniy Polyakov wrote:
> > On Mon, Oct 06, 2008 at 06:08:09PM -0500, Corey Minyard (minyard@acm.org) wrote:
> >> Would using SLAB_DESTROY_BY_RCU be ok, or would that be too expensive?
> >
> > I tested skb destruction via RCU path, and got 2.5 times worse numbers
> > with small-packets-bulk-transfer workload.
>
> Was this with regular RCU freeing? This will cool down the cacheline before
> frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.
Indeed!
But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
and reallocated while a reader holds a reference. The only guarantee is
that the -type- of the data structure will not change while a reader holds
a reference. With something like UDP, this might well be sufficient.
Just be careful! ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 8:31 ` Peter Zijlstra
@ 2008-10-07 14:36 ` Paul E. McKenney
2008-10-07 18:29 ` David Miller
1 sibling, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2008-10-07 14:36 UTC (permalink / raw)
To: Peter Zijlstra
Cc: David Miller, dada1, minyard, linux-kernel, netdev, shemminger
On Tue, Oct 07, 2008 at 10:31:30AM +0200, Peter Zijlstra wrote:
> On Mon, 2008-10-06 at 14:40 -0700, David Miller wrote:
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Mon, 06 Oct 2008 23:22:31 +0200
> >
> > > Me wondering what impact this synchronize_rcu() can have on mono-threaded
> > > VOIP applications using lot of UDP sockets. What is the maximum delay of
> > > this function ?
> >
> > The cost is enormous, we really can't use it here.
> >
> > I have a patch that did top-level socket destruction using RCU,
> > and that didn't use synchronize_rcu(), and that killed connection
> > rates by up to %20.
>
> Did you ever figure out why you lost those 20% ?
>
> > I can only imagine what the cost would be if I had to add such a call
> > in there.
>
> Yeah, sync_rcu() is ridiculously expensive, at best 3 jiffies IIRC.
I could make it -much- faster, but at the expense of -serious- CPU
overhead. Still, might be useful during boot time (when the system
can't do anything useful anyway) to accelerate getting data structures
initialized.
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:15 ` Christoph Lameter
@ 2008-10-07 14:38 ` Paul E. McKenney
2008-10-07 14:50 ` Eric Dumazet
1 sibling, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2008-10-07 14:38 UTC (permalink / raw)
To: Christoph Lameter
Cc: Eric Dumazet, Peter Zijlstra, minyard, Linux Kernel, netdev,
shemminger
On Tue, Oct 07, 2008 at 09:15:00AM -0500, Christoph Lameter wrote:
> Eric Dumazet wrote:
> >>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> >>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
> >>> in sk_prot_free() can defer freeing to RCU...
> >>
> >> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
> >> RCU-freed, this means that slab object pointers stay pointing to valid
> >> memory, but it does _NOT_ mean those slab objects themselves remain
> >> valid.
> >>
> >> The slab allocator is free to re-use those objects at any time -
> >> irrespective of the rcu-grace period. Therefore you will have to be able
> >> to validate that the object you point to is indeed the object you
> >> expect, otherwise strange and wonderful things will happen.
> >>
> > Thanks for this clarification. I guess we really need a rcu head then :)
>
> No you just need to make sure that the object you located is still active
> (f.e. refcount > 0) and that it is really a match (hash pointers may be
> updated asynchronously and therefore point to the object that has been reused
> for something else).
In some cases, you might be able to not care, but yes, most of the time,
you will need to validate the object.
> Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
> cache hot advantages of the objects. Regular RCU freeing will let the object
> expire for a tick or so which will result in the cacheline cooling down.
And SLAB_DESTROY_BY_RCU guarantees that the type of the object will
remain the same during any given RCU read-side critical section.
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:29 ` Evgeniy Polyakov
@ 2008-10-07 14:38 ` Christoph Lameter
0 siblings, 0 replies; 36+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:38 UTC (permalink / raw)
To: Evgeniy Polyakov
Cc: Corey Minyard, David Miller, dada1, linux-kernel, netdev,
shemminger, paulmck, Hugh Dickins
Evgeniy Polyakov wrote:
> On Tue, Oct 07, 2008 at 09:16:13AM -0500, Christoph Lameter (cl@linux-foundation.org) wrote:
>>> I tested skb destruction via RCU path, and got 2.5 times worse numbers
>>> with small-packets-bulk-transfer workload.
>> Was this with regular RCU freeing? This will cool down the cacheline before
>> frees. You need SLAB_DESTROY_BY_RCU to keep the objects cache hot.
>
> I believe there was no SLAB_DESTROY_BY_RCU 2 years ago :)
It's been a while. Hugh created it.
> It was pure call_rcu(&skb->rcu, real_skb_freeing), where
> real_skb_freeing() just did usual kfree().
Right. That results in cacheline cooldown. You'd want to recycle the object as
they are cache hot on a per cpu basis. That is screwed up by the delayed
regular rcu processing. We have seen multiple regressions due to cacheline
cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:33 ` Paul E. McKenney
@ 2008-10-07 14:45 ` Christoph Lameter
2008-10-07 15:07 ` Eric Dumazet
2008-10-07 15:07 ` Paul E. McKenney
0 siblings, 2 replies; 36+ messages in thread
From: Christoph Lameter @ 2008-10-07 14:45 UTC (permalink / raw)
To: paulmck
Cc: Evgeniy Polyakov, Corey Minyard, David Miller, dada1,
linux-kernel, netdev, shemminger
Paul E. McKenney wrote:
> But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
> and reallocated while a reader holds a reference. The only guarantee is
> that the -type- of the data structure will not change while a reader holds
> a reference. With something like UDP, this might well be sufficient.
Right so after the hash lookup operation you are not assured that the object
has not been freed or even reallocated for a different purpose. So after
finding the pointer to the object two things need to happen (under rcu_lock):
1. Verify that the object is still in use
2. Verify that the object is matching the hash
If not then the operation needs to be redone because we have a stale hash pointer.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:15 ` Christoph Lameter
2008-10-07 14:38 ` Paul E. McKenney
@ 2008-10-07 14:50 ` Eric Dumazet
2008-10-07 15:05 ` Paul E. McKenney
` (2 more replies)
1 sibling, 3 replies; 36+ messages in thread
From: Eric Dumazet @ 2008-10-07 14:50 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, minyard, Linux Kernel, netdev, shemminger,
paulmck
Christoph Lameter a écrit :
> Eric Dumazet wrote:
>>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
>>>> in sk_prot_free() can defer freeing to RCU...
>>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
>>> RCU-freed, this means that slab object pointers stay pointing to valid
>>> memory, but it does _NOT_ mean those slab objects themselves remain
>>> valid.
>>>
>>> The slab allocator is free to re-use those objects at any time -
>>> irrespective of the rcu-grace period. Therefore you will have to be able
>>> to validate that the object you point to is indeed the object you
>>> expect, otherwise strange and wonderful things will happen.
>>>
>> Thanks for this clarification. I guess we really need a rcu head then :)
>
> No you just need to make sure that the object you located is still active
> (f.e. refcount > 0) and that it is really a match (hash pointers may be
> updated asynchronously and therefore point to the object that has been reused
> for something else).
>
> Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
> cache hot advantages of the objects. Regular RCU freeing will let the object
> expire for a tick or so which will result in the cacheline cooling down.
Seems really good to master this SLAB_DESTROY_BY_RCU thing (I see almost no use
of it in current kernel)
1) Hum, do you know why "struct file" objects dont use SLAB_DESTROY_BY_RCU then,
since we noticed a performance regression for several workloads at RCUification
of file structures ?
2) What prevents an object to be *freed* (and deleted from a hash chain), then
re-allocated and inserted to another chain (different keys) ? (final refcount=1)
If the lookup detects a key mismatch, how will it continue to the next item,
since 'next' pointer will have been reused for the new chain insertion...
Me confused...
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:50 ` Eric Dumazet
@ 2008-10-07 15:05 ` Paul E. McKenney
2008-10-07 15:09 ` Peter Zijlstra
2008-10-07 15:23 ` Christoph Lameter
2 siblings, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2008-10-07 15:05 UTC (permalink / raw)
To: Eric Dumazet
Cc: Christoph Lameter, Peter Zijlstra, minyard, Linux Kernel, netdev,
shemminger
On Tue, Oct 07, 2008 at 04:50:47PM +0200, Eric Dumazet wrote:
> Christoph Lameter a écrit :
>> Eric Dumazet wrote:
>>>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
>>>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
>>>>> in sk_prot_free() can defer freeing to RCU...
>>>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
>>>> RCU-freed, this means that slab object pointers stay pointing to valid
>>>> memory, but it does _NOT_ mean those slab objects themselves remain
>>>> valid.
>>>>
>>>> The slab allocator is free to re-use those objects at any time -
>>>> irrespective of the rcu-grace period. Therefore you will have to be able
>>>> to validate that the object you point to is indeed the object you
>>>> expect, otherwise strange and wonderful things will happen.
>>>>
>>> Thanks for this clarification. I guess we really need a rcu head then :)
>> No you just need to make sure that the object you located is still active
>> (f.e. refcount > 0) and that it is really a match (hash pointers may be
>> updated asynchronously and therefore point to the object that has been
>> reused
>> for something else).
>> Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves
>> the
>> cache hot advantages of the objects. Regular RCU freeing will let the
>> object
>> expire for a tick or so which will result in the cacheline cooling down.
>
> Seems really good to master this SLAB_DESTROY_BY_RCU thing (I see almost no
> use of it in current kernel)
It is not the easiest thing to use...
> 1) Hum, do you know why "struct file" objects dont use SLAB_DESTROY_BY_RCU
> then, since we noticed a performance regression for several workloads at
> RCUification of file structures ?
>
> 2) What prevents an object to be *freed* (and deleted from a hash chain),
> then re-allocated and inserted to another chain (different keys) ? (final
> refcount=1)
Nothing prevents this from happening. You either have to have some sort
of validation step based on object identity (e.g., a generation number
that is incremented on each allocation), or have an algorithm that
doesn't care if searches sometimes spuriously fail to find something
that really is in the list.
Which is one of the reasons that its use is rare. But perhaps more
experience with it will show more/better ways to use it.
> If the lookup detects a key mismatch, how will it continue to the next
> item, since 'next' pointer will have been reused for the new chain
> insertion...
>
> Me confused...
One way to approach this is to have a generation number that is
incremented each time the object is recycled along with a pointer to the
list header. The list header contains the most recent generation number
of any element in the list. Then if either the generation number of
a give element is greater than that of the header when you started the
search, or the element's pointer no longer references the list header
you started your search from, restart the search. Read-side memory
barriers may also be required in some cases.
It may be possible to simplify this in some special cases.
There are probably better ways to approach this, but that is one way.
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:45 ` Christoph Lameter
@ 2008-10-07 15:07 ` Eric Dumazet
2008-10-07 15:07 ` Paul E. McKenney
1 sibling, 0 replies; 36+ messages in thread
From: Eric Dumazet @ 2008-10-07 15:07 UTC (permalink / raw)
To: Christoph Lameter
Cc: paulmck, Evgeniy Polyakov, Corey Minyard, David Miller,
linux-kernel, netdev, shemminger
Christoph Lameter a écrit :
> Paul E. McKenney wrote:
>
>> But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
>> and reallocated while a reader holds a reference. The only guarantee is
>> that the -type- of the data structure will not change while a reader holds
>> a reference. With something like UDP, this might well be sufficient.
>
> Right so after the hash lookup operation you are not assured that the object
> has not been freed or even reallocated for a different purpose. So after
> finding the pointer to the object two things need to happen (under rcu_lock):
>
> 1. Verify that the object is still in use
> 2. Verify that the object is matching the hash
>
> If not then the operation needs to be redone because we have a stale hash pointer.
OK... so restart the full lookup at the beginning of the hash chain if we detect
points 1 or 2 are invalid.
Not that expensive since everything should be cache hot :)
One has to take care to group all components (keys to compute the hash,
and the *next* pointer) in one cache line to minimize cache misses,
since we now need to access them all to compute/check hash value.
Now if a freed object is re-inserted with same hash value,
same hash chain, we'll also restart the lookup, but it is harmless.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:45 ` Christoph Lameter
2008-10-07 15:07 ` Eric Dumazet
@ 2008-10-07 15:07 ` Paul E. McKenney
1 sibling, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2008-10-07 15:07 UTC (permalink / raw)
To: Christoph Lameter
Cc: Evgeniy Polyakov, Corey Minyard, David Miller, dada1,
linux-kernel, netdev, shemminger
On Tue, Oct 07, 2008 at 09:45:43AM -0500, Christoph Lameter wrote:
> Paul E. McKenney wrote:
>
> > But care is required -- SLAB_DESTROY_BY_RCU permits objects to be freed
> > and reallocated while a reader holds a reference. The only guarantee is
> > that the -type- of the data structure will not change while a reader holds
> > a reference. With something like UDP, this might well be sufficient.
>
> Right so after the hash lookup operation you are not assured that the object
> has not been freed or even reallocated for a different purpose. So after
> finding the pointer to the object two things need to happen (under rcu_lock):
>
> 1. Verify that the object is still in use
> 2. Verify that the object is matching the hash
>
> If not then the operation needs to be redone because we have a stale hash
> pointer.
There is also the possibility that the element will be reused, but placed
in the same list that it resided in last time. A reader referencing
that item during this process might then be relocated in the list, which
could either cause the reader to skip elements in the list (if the new
element is relocated farther down the list) or endlessly loop through
the list (if new elements were relocated closer to the list head and this
free-reallocate process repeated and the reader was insanely unlucky).
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:50 ` Eric Dumazet
2008-10-07 15:05 ` Paul E. McKenney
@ 2008-10-07 15:09 ` Peter Zijlstra
2008-10-07 15:23 ` Christoph Lameter
2 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2008-10-07 15:09 UTC (permalink / raw)
To: Eric Dumazet
Cc: Christoph Lameter, minyard, Linux Kernel, netdev, shemminger,
paulmck, Hugh Dickins, Nick Piggin
On Tue, 2008-10-07 at 16:50 +0200, Eric Dumazet wrote:
> Christoph Lameter a écrit :
> > Eric Dumazet wrote:
> >>>> Or just add SLAB_DESTROY_BY_RCU to slab creation in proto_register()
> >>>> for "struct proto udp_prot/udpv6_prot" so that kmem_cache_free() done
> >>>> in sk_prot_free() can defer freeing to RCU...
> >>> Be careful!, SLAB_DESTROY_BY_RCU just means the slab page gets
> >>> RCU-freed, this means that slab object pointers stay pointing to valid
> >>> memory, but it does _NOT_ mean those slab objects themselves remain
> >>> valid.
> >>>
> >>> The slab allocator is free to re-use those objects at any time -
> >>> irrespective of the rcu-grace period. Therefore you will have to be able
> >>> to validate that the object you point to is indeed the object you
> >>> expect, otherwise strange and wonderful things will happen.
> >>>
> >> Thanks for this clarification. I guess we really need a rcu head then :)
> >
> > No you just need to make sure that the object you located is still active
> > (f.e. refcount > 0) and that it is really a match (hash pointers may be
> > updated asynchronously and therefore point to the object that has been reused
> > for something else).
> >
> > Generally it is advisable to use SLAB_DESTROY_BY_RCU because it preserves the
> > cache hot advantages of the objects. Regular RCU freeing will let the object
> > expire for a tick or so which will result in the cacheline cooling down.
>
> Seems really good to master this SLAB_DESTROY_BY_RCU thing (I see almost no use
> of it in current kernel)
There is (AFAIK) only 1 user, the anon_vma stuff.
> 1) Hum, do you know why "struct file" objects dont use SLAB_DESTROY_BY_RCU then,
> since we noticed a performance regression for several workloads at RCUification
> of file structures ?
>
> 2) What prevents an object to be *freed* (and deleted from a hash chain), then
> re-allocated and inserted to another chain (different keys) ? (final refcount=1)
>
> If the lookup detects a key mismatch, how will it continue to the next item,
> since 'next' pointer will have been reused for the new chain insertion...
Right, you can't have lists with items like that. You can only do
matching lookups. What you do is:
find_get_obj(key)
{
rcu_read_lock()
again:
obj = lookup(key);
if (!obj)
goto out;
/*
* if we can't get a ref, the item got freed concurrently
* try again
*/
if (!get_ref_unless_zero(obj))
goto again;
/*
* if we did get a ref, but its not the object we expected
* try again
*/
if (obj->key != key) {
put_ref(obj);
goto again;
}
out:
rcu_read_unlock();
return obj;
}
Which is basically what we do with the lockless pagecache, where we
don't need the RCU because the page-frames are never freed.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:50 ` Eric Dumazet
2008-10-07 15:05 ` Paul E. McKenney
2008-10-07 15:09 ` Peter Zijlstra
@ 2008-10-07 15:23 ` Christoph Lameter
2 siblings, 0 replies; 36+ messages in thread
From: Christoph Lameter @ 2008-10-07 15:23 UTC (permalink / raw)
To: Eric Dumazet
Cc: Peter Zijlstra, minyard, Linux Kernel, netdev, shemminger,
paulmck
Eric Dumazet wrote:
>
> 1) Hum, do you know why "struct file" objects dont use
> SLAB_DESTROY_BY_RCU then,
> since we noticed a performance regression for several workloads at
> RCUification
> of file structures ?
Because my patches were not accepted that fix the issue.
http://lkml.org/lkml/2006/6/16/144
> 2) What prevents an object to be *freed* (and deleted from a hash
> chain), then
> re-allocated and inserted to another chain (different keys) ? (final
> refcount=1)
Nothing.
> If the lookup detects a key mismatch, how will it continue to the next
> item,
> since 'next' pointer will have been reused for the new chain insertion...
>
> Me confused...
If there is a mismatch then you have to do another hash lookup. Do an rcu
unlock and start over.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 12:59 ` Eric Dumazet
2008-10-07 14:07 ` Stephen Hemminger
@ 2008-10-07 16:43 ` Corey Minyard
1 sibling, 0 replies; 36+ messages in thread
From: Corey Minyard @ 2008-10-07 16:43 UTC (permalink / raw)
To: Eric Dumazet
Cc: Benny Amorsen, David Miller, linux-kernel, netdev, shemminger,
paulmck
Eric Dumazet wrote:
>
> (2 weeks ago, Corey mentioned a 10x increase on UDP throughput on a
> 16-way machine,
> that sounds promising)
Just to be clear, that was 10x with preempt RT, which converts rwlocks
into PI mutexes. So 16 processors going for the same lock is pretty ugly.
Under heavy loads there is also a writer starvation problem, I believe
in non-RT. You can't actually create or destroy a UDP socket when the
load is high because there's always a reader holding the lock. RCU also
solves that problem.
-corey
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 5:24 ` Eric Dumazet
2008-10-07 8:54 ` Benny Amorsen
@ 2008-10-07 18:26 ` David Miller
2008-10-08 8:35 ` Eric Dumazet
1 sibling, 1 reply; 36+ messages in thread
From: David Miller @ 2008-10-07 18:26 UTC (permalink / raw)
To: dada1; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 07 Oct 2008 07:24:45 +0200
> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
> RCU handling of its file structures and should not be slowed down that much.
As stated, I added RCU destruction generically for socket objects, and it
showed up clearly.
So "not be slowed down that much" has been disproven, at least to me,
already :-)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 8:31 ` Peter Zijlstra
2008-10-07 14:36 ` Paul E. McKenney
@ 2008-10-07 18:29 ` David Miller
1 sibling, 0 replies; 36+ messages in thread
From: David Miller @ 2008-10-07 18:29 UTC (permalink / raw)
To: a.p.zijlstra; +Cc: dada1, minyard, linux-kernel, netdev, shemminger, paulmck
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 07 Oct 2008 10:31:30 +0200
> On Mon, 2008-10-06 at 14:40 -0700, David Miller wrote:
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Mon, 06 Oct 2008 23:22:31 +0200
> >
> > > Me wondering what impact this synchronize_rcu() can have on mono-threaded
> > > VOIP applications using lot of UDP sockets. What is the maximum delay of
> > > this function ?
> >
> > The cost is enormous, we really can't use it here.
> >
> > I have a patch that did top-level socket destruction using RCU,
> > and that didn't use synchronize_rcu(), and that killed connection
> > rates by up to %20.
>
> Did you ever figure out why you lost those 20% ?
Probably the RCU delay on a 128 cpu machine :-)
Also I bet batching the socket destruction eliminates all of
the cached local state we have in the cpu at the actual socket
destruction time.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 14:07 ` Stephen Hemminger
@ 2008-10-07 20:55 ` David Miller
2008-10-07 21:20 ` Stephen Hemminger
2008-10-08 13:55 ` Eric Dumazet
0 siblings, 2 replies; 36+ messages in thread
From: David Miller @ 2008-10-07 20:55 UTC (permalink / raw)
To: shemminger; +Cc: dada1, benny+usenet, minyard, linux-kernel, netdev, paulmck
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 7 Oct 2008 16:07:29 +0200
> The idea of keeping chains short is the problem. That code should
> just be pulled because it doesn't help that much, and also creates
> bias on the port randomization.
I have that patch from Vitaly Mayatskikh which does exactly this.
I keep looking at it, but I can't bring myself to apply it since
I'm not completely convinced.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 20:55 ` David Miller
@ 2008-10-07 21:20 ` Stephen Hemminger
2008-10-08 13:55 ` Eric Dumazet
1 sibling, 0 replies; 36+ messages in thread
From: Stephen Hemminger @ 2008-10-07 21:20 UTC (permalink / raw)
To: David Miller; +Cc: dada1, benny+usenet, minyard, linux-kernel, netdev, paulmck
On Tue, 07 Oct 2008 13:55:48 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 7 Oct 2008 16:07:29 +0200
>
> > The idea of keeping chains short is the problem. That code should
> > just be pulled because it doesn't help that much, and also creates
> > bias on the port randomization.
>
> I have that patch from Vitaly Mayatskikh which does exactly this.
>
> I keep looking at it, but I can't bring myself to apply it since
> I'm not completely convinced.
Some one on a busy server should run it and measure the delta in hash chains.
I would, but don't run anything that has more than a few UDP's open.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 18:26 ` David Miller
@ 2008-10-08 8:35 ` Eric Dumazet
2008-10-08 16:38 ` David Miller
0 siblings, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2008-10-08 8:35 UTC (permalink / raw)
To: David Miller; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck
David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Tue, 07 Oct 2008 07:24:45 +0200
>
>> Most UDP sockets are setup for long periods (RTP trafic), or if an application really
>> wants to {open/send or receive one UDP frame/close} many sockets, it already hits
>> RCU handling of its file structures and should not be slowed down that much.
>
> As stated, I added RCU destruction generically for socket objects, and it
> showed up clearly.
>
> So "not be slowed down that much" has been disproven, at least to me,
> already :-)
>
>
RCU in hash table is ok for managing read mostly data, since
only during the read access you avoid to dirty a rwlock...
If we have a workload that insert/delete sockets as hell but
receive few frames (that hit the hash table in a read only way),
then you defeat the purpose of RCU, and pay the price of
throwing away (in rcu queue) hot data that will become cold
before reuse...
BTW is there any chance your results were obtained before October 2005 ?
At that time, RCU was able to queue an unlimited number of events.
a single loop doing close(open("/dev/null",0)) could exhaust RAM...
Refs: commit 5ee832dbc6770135ec8d63296af0a4374557bb79
and many others...
Anyway we can probably code something without call_rcu() cache blower
for UDP, if time permits :)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-07 20:55 ` David Miller
2008-10-07 21:20 ` Stephen Hemminger
@ 2008-10-08 13:55 ` Eric Dumazet
2008-10-08 18:45 ` David Miller
1 sibling, 1 reply; 36+ messages in thread
From: Eric Dumazet @ 2008-10-08 13:55 UTC (permalink / raw)
To: David Miller
Cc: shemminger, benny+usenet, minyard, linux-kernel, netdev, paulmck
[-- Attachment #1: Type: text/plain, Size: 1456 bytes --]
David Miller a écrit :
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 7 Oct 2008 16:07:29 +0200
>
>> The idea of keeping chains short is the problem. That code should
>> just be pulled because it doesn't help that much, and also creates
>> bias on the port randomization.
>
> I have that patch from Vitaly Mayatskikh which does exactly this.
>
> I keep looking at it, but I can't bring myself to apply it since
> I'm not completely convinced.
Vitaly patch might be appropriate if only few UDP ports are opened.
We could zap the code to search short chains and extend Vitaly's
idea with following patch :
[PATCH] udp: Improve port randomization
Current UDP port allocation is suboptimal.
We select the shortest chain to chose a port (out of 512)
that will hash in this shortest chain.
First, it can lead to ports that are not very random, giving
attackers more opportunities to break the system.
Second, it can consume a lot of CPU to scan all table
in order to find the shortest chain.
Third, in some pathological cases we can fail to find
a free port even if there are plenty of them.
This patch zaps the search for a short chain and only
uses one random seed. The problem of getting long chains
should be addressed in another way, since we can
obtain long chains with non random ports.
Based on a report and patch from Vitaly Mayatskikh
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: udp_random.patch --]
[-- Type: text/plain, Size: 1773 bytes --]
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 85f8e8e..67d8430 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -155,55 +155,23 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
write_lock_bh(&udp_hash_lock);
if (!snum) {
- int i, low, high, remaining;
- unsigned rover, best, best_size_so_far;
+ int low, high, remaining;
+ unsigned rand;
+ unsigned short first;
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
- best_size_so_far = UINT_MAX;
- best = rover = net_random() % remaining + low;
-
- /* 1st pass: look for empty (or shortest) hash chain */
- for (i = 0; i < UDP_HTABLE_SIZE; i++) {
- int size = 0;
-
- head = &udptable[udp_hashfn(net, rover)];
- if (hlist_empty(head))
- goto gotit;
-
- sk_for_each(sk2, node, head) {
- if (++size >= best_size_so_far)
- goto next;
- }
- best_size_so_far = size;
- best = rover;
- next:
- /* fold back if end of range */
- if (++rover > high)
- rover = low + ((rover - low)
- & (UDP_HTABLE_SIZE - 1));
-
-
- }
-
- /* 2nd pass: find hole in shortest hash chain */
- rover = best;
- for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
- if (! __udp_lib_lport_inuse(net, rover, udptable))
- goto gotit;
- rover += UDP_HTABLE_SIZE;
- if (rover > high)
- rover = low + ((rover - low)
- & (UDP_HTABLE_SIZE - 1));
+ rand = net_random();
+ snum = first = rand % remaining + low;
+ rand |= 1;
+ while (__udp_lib_lport_inuse(net, snum, udptable)) {
+ do {
+ snum = snum + rand;
+ } while (snum < low || snum > high);
+ if (snum == first)
+ goto fail;
}
-
-
- /* All ports in use! */
- goto fail;
-
-gotit:
- snum = rover;
} else {
head = &udptable[udp_hashfn(net, snum)];
^ permalink raw reply related [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-08 8:35 ` Eric Dumazet
@ 2008-10-08 16:38 ` David Miller
0 siblings, 0 replies; 36+ messages in thread
From: David Miller @ 2008-10-08 16:38 UTC (permalink / raw)
To: dada1; +Cc: minyard, linux-kernel, netdev, shemminger, paulmck
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 08 Oct 2008 10:35:07 +0200
> BTW is there any chance your results were obtained before October 2005 ?
No, the timestamp on the saved patch file I have is June 16, 2008 :-)
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 3/3] Convert the UDP hash lock to RCU
2008-10-08 13:55 ` Eric Dumazet
@ 2008-10-08 18:45 ` David Miller
0 siblings, 0 replies; 36+ messages in thread
From: David Miller @ 2008-10-08 18:45 UTC (permalink / raw)
To: dada1; +Cc: shemminger, benny+usenet, minyard, linux-kernel, netdev, paulmck
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 08 Oct 2008 15:55:36 +0200
> David Miller a écrit :
> > From: Stephen Hemminger <shemminger@vyatta.com>
> > Date: Tue, 7 Oct 2008 16:07:29 +0200
> >
> >> The idea of keeping chains short is the problem. That code should
> >> just be pulled because it doesn't help that much, and also creates
> >> bias on the port randomization.
> > I have that patch from Vitaly Mayatskikh which does exactly this.
> > I keep looking at it, but I can't bring myself to apply it since
> > I'm not completely convinced.
>
> Vitaly patch might be appropriate if only few UDP ports are opened.
>
> We could zap the code to search short chains and extend Vitaly's
> idea with following patch :
I really like this, and I've applied it to net-next-2.6
I think the "increment until back in range" do/while loop can
be improved a bit. It can spin for more than 60,000 iterations
in some edge case scenarios as-is :-)
Ugh, there's also that expensive divide in there for the modulus.
^ permalink raw reply [flat|nested] 36+ messages in thread
end of thread, other threads:[~2008-10-08 18:46 UTC | newest]
Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-10-06 18:50 [PATCH 3/3] Convert the UDP hash lock to RCU Corey Minyard
2008-10-06 21:22 ` Eric Dumazet
2008-10-06 21:40 ` David Miller
2008-10-06 23:08 ` Corey Minyard
2008-10-07 8:37 ` Evgeniy Polyakov
2008-10-07 14:16 ` Christoph Lameter
2008-10-07 14:29 ` Evgeniy Polyakov
2008-10-07 14:38 ` Christoph Lameter
2008-10-07 14:33 ` Paul E. McKenney
2008-10-07 14:45 ` Christoph Lameter
2008-10-07 15:07 ` Eric Dumazet
2008-10-07 15:07 ` Paul E. McKenney
2008-10-07 5:24 ` Eric Dumazet
2008-10-07 8:54 ` Benny Amorsen
2008-10-07 12:59 ` Eric Dumazet
2008-10-07 14:07 ` Stephen Hemminger
2008-10-07 20:55 ` David Miller
2008-10-07 21:20 ` Stephen Hemminger
2008-10-08 13:55 ` Eric Dumazet
2008-10-08 18:45 ` David Miller
2008-10-07 16:43 ` Corey Minyard
2008-10-07 18:26 ` David Miller
2008-10-08 8:35 ` Eric Dumazet
2008-10-08 16:38 ` David Miller
2008-10-07 8:31 ` Peter Zijlstra
2008-10-07 14:36 ` Paul E. McKenney
2008-10-07 18:29 ` David Miller
2008-10-06 22:07 ` Corey Minyard
2008-10-07 8:17 ` Peter Zijlstra
2008-10-07 9:24 ` Eric Dumazet
2008-10-07 14:15 ` Christoph Lameter
2008-10-07 14:38 ` Paul E. McKenney
2008-10-07 14:50 ` Eric Dumazet
2008-10-07 15:05 ` Paul E. McKenney
2008-10-07 15:09 ` Peter Zijlstra
2008-10-07 15:23 ` Christoph Lameter
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).