[PATCH] tcp: efficient port randomisation

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] tcp: efficient port randomisation
       [not found] ` <200410291048.01955.michael.vittrup.larsen@ericsson.com>
@ 2004-10-29 17:28   ` Stephen Hemminger
  2004-11-01  9:58     ` Michael Vittrup Larsen
  0 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2004-10-29 17:28 UTC (permalink / raw)
  To: David S. Miller; +Cc: Michael Vittrup Larsen, netdev

Provide port randomization for incoming connections using variation of
existing sequence number hash. Replace tcp_portalloc_lock and tcp_port_rover
with atomic operation to allow better parallelism.

This is based on 
http://www.ietf.org/internet-drafts/draft-larsen-tsvwg-port-randomisation-00.txt
(with confirmation of no IPR issues).

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

diff -urNp -X dontdiff linux-2.6/drivers/char/random.c port-2.6/drivers/char/random.c
--- linux-2.6/drivers/char/random.c	2004-10-28 11:06:09.000000000 -0700
+++ port-2.6/drivers/char/random.c	2004-10-28 15:53:50.000000000 -0700
@@ -2352,6 +2352,24 @@ __u32 secure_ip_id(__u32 daddr)
 	return halfMD4Transform(hash, keyptr->secret);
 }
 
+/* Generate secure starting point for ephemeral TCP port search */
+__u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
+{
+	struct keydata *keyptr = get_keyptr();
+	u32 hash[4];
+
+	/*
+	 *  Pick a unique starting offset for each ephemeral port search
+	 *  (saddr, daddr, dport).
+	 */
+	hash[0] = saddr;
+	hash[1] = daddr;
+	hash[2] = dport << 16 | smp_processor_id();
+	hash[3] = keyptr->secret[11];
+
+	return halfMD4Transform(hash, keyptr->secret);
+}
+
 #ifdef CONFIG_SYN_COOKIES
 /*
  * Secure SYN cookie computation. This is the algorithm worked out by
diff -urNp -X dontdiff linux-2.6/include/linux/random.h port-2.6/include/linux/random.h
--- linux-2.6/include/linux/random.h	2004-09-13 09:09:19.000000000 -0700
+++ port-2.6/include/linux/random.h	2004-10-28 15:13:34.000000000 -0700
@@ -54,6 +54,7 @@ extern void get_random_bytes(void *buf, 
 void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
+extern __u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 					__u16 sport, __u16 dport);
 extern __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr,
diff -urNp -X dontdiff linux-2.6/include/net/tcp.h port-2.6/include/net/tcp.h
--- linux-2.6/include/net/tcp.h	2004-10-26 16:57:47.000000000 -0700
+++ port-2.6/include/net/tcp.h	2004-10-28 15:13:34.000000000 -0700
@@ -140,7 +140,6 @@ extern struct tcp_hashinfo {
 	rwlock_t __tcp_lhash_lock ____cacheline_aligned;
 	atomic_t __tcp_lhash_users;
 	wait_queue_head_t __tcp_lhash_wait;
-	spinlock_t __tcp_portalloc_lock;
 } tcp_hashinfo;
 
 #define tcp_ehash	(tcp_hashinfo.__tcp_ehash)
@@ -151,14 +150,19 @@ extern struct tcp_hashinfo {
 #define tcp_lhash_lock	(tcp_hashinfo.__tcp_lhash_lock)
 #define tcp_lhash_users	(tcp_hashinfo.__tcp_lhash_users)
 #define tcp_lhash_wait	(tcp_hashinfo.__tcp_lhash_wait)
-#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock)
 
 extern kmem_cache_t *tcp_bucket_cachep;
 extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
 						 unsigned short snum);
 extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb);
 extern void tcp_bucket_unlock(struct sock *sk);
-extern int tcp_port_rover;
+extern atomic_t tcp_rover_next;
+
+/* offset in ephemeral port space to start next scan */
+static inline u32 tcp_port_rover(void)
+{
+	return (u32) atomic_inc_return(&tcp_rover_next);
+}
 
 /* These are AF independent. */
 static __inline__ int tcp_bhashfn(__u16 lport)
diff -urNp -X dontdiff linux-2.6/net/ipv4/tcp.c port-2.6/net/ipv4/tcp.c
--- linux-2.6/net/ipv4/tcp.c	2004-10-28 11:06:09.000000000 -0700
+++ port-2.6/net/ipv4/tcp.c	2004-10-28 15:13:34.000000000 -0700
@@ -2342,7 +2342,6 @@ void __init tcp_init(void)
 		sysctl_tcp_max_orphans >>= (3 - order);
 		sysctl_max_syn_backlog = 128;
 	}
-	tcp_port_rover = sysctl_local_port_range[0] - 1;
 
 	sysctl_tcp_mem[0] =  768 << order;
 	sysctl_tcp_mem[1] = 1024 << order;
diff -urNp -X dontdiff linux-2.6/net/ipv4/tcp_ipv4.c port-2.6/net/ipv4/tcp_ipv4.c
--- linux-2.6/net/ipv4/tcp_ipv4.c	2004-10-26 16:57:48.000000000 -0700
+++ port-2.6/net/ipv4/tcp_ipv4.c	2004-10-28 15:13:34.000000000 -0700
@@ -93,7 +93,6 @@ struct tcp_hashinfo __cacheline_aligned 
 	.__tcp_lhash_users	=	ATOMIC_INIT(0),
 	.__tcp_lhash_wait
 	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
-	.__tcp_portalloc_lock	=	SPIN_LOCK_UNLOCKED
 };
 
 /*
@@ -102,7 +101,8 @@ struct tcp_hashinfo __cacheline_aligned 
  * 32768-61000
  */
 int sysctl_local_port_range[2] = { 1024, 4999 };
-int tcp_port_rover = 1024 - 1;
+
+atomic_t tcp_rover_next = ATOMIC_INIT(0);
 
 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 				 __u32 faddr, __u16 fport)
@@ -219,14 +219,10 @@ static int tcp_v4_get_port(struct sock *
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
-		int rover;
+		__u16 rover;
 
-		spin_lock(&tcp_portalloc_lock);
-		rover = tcp_port_rover;
+		rover = low + tcp_port_rover() % (high - low);
 		do {
-			rover++;
-			if (rover < low || rover > high)
-				rover = low;
 			head = &tcp_bhash[tcp_bhashfn(rover)];
 			spin_lock(&head->lock);
 			tb_for_each(tb, node, &head->chain)
@@ -235,9 +231,9 @@ static int tcp_v4_get_port(struct sock *
 			break;
 		next:
 			spin_unlock(&head->lock);
+			if (++rover >= high)
+				rover = low;
 		} while (--remaining > 0);
-		tcp_port_rover = rover;
-		spin_unlock(&tcp_portalloc_lock);
 
 		/* Exhausted local port range during search? */
 		ret = 1;
@@ -634,6 +630,13 @@ not_unique:
 	return -EADDRNOTAVAIL;
 }
 
+static inline u32 connect_port_offset(const struct sock *sk)
+{
+	const struct inet_opt *inet = inet_sk(sk);
+	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
+					 inet->dport);
+}
+
 /*
  * Bind a port for a connect operation and hash it.
  */
@@ -645,35 +648,17 @@ static int tcp_v4_hash_connect(struct so
 	int ret;
 
  	if (!snum) {
- 		int rover;
  		int low = sysctl_local_port_range[0];
  		int high = sysctl_local_port_range[1];
  		int remaining = (high - low) + 1;
 		struct hlist_node *node;
  		struct tcp_tw_bucket *tw = NULL;
+		__u16 rover;
 
+		rover = low + (tcp_port_rover() + connect_port_offset(sk)) 
+			      % (high - low);
  		local_bh_disable();
-
- 		/* TODO. Actually it is not so bad idea to remove
- 		 * tcp_portalloc_lock before next submission to Linus.
- 		 * As soon as we touch this place at all it is time to think.
- 		 *
- 		 * Now it protects single _advisory_ variable tcp_port_rover,
- 		 * hence it is mostly useless.
- 		 * Code will work nicely if we just delete it, but
- 		 * I am afraid in contented case it will work not better or
- 		 * even worse: another cpu just will hit the same bucket
- 		 * and spin there.
- 		 * So some cpu salt could remove both contention and
- 		 * memory pingpong. Any ideas how to do this in a nice way?
- 		 */
- 		spin_lock(&tcp_portalloc_lock);
- 		rover = tcp_port_rover;
-
  		do {
- 			rover++;
- 			if ((rover < low) || (rover > high))
- 				rover = low;
  			head = &tcp_bhash[tcp_bhashfn(rover)];
  			spin_lock(&head->lock);
 
@@ -704,9 +689,10 @@ static int tcp_v4_hash_connect(struct so
 
  		next_port:
  			spin_unlock(&head->lock);
+
+			if (++rover >= high)
+				rover = low;
  		} while (--remaining > 0);
- 		tcp_port_rover = rover;
- 		spin_unlock(&tcp_portalloc_lock);
 
  		local_bh_enable();
 
@@ -714,9 +700,6 @@ static int tcp_v4_hash_connect(struct so
 
 ok:
  		/* All locks still held and bhs disabled */
- 		tcp_port_rover = rover;
- 		spin_unlock(&tcp_portalloc_lock);
-
  		tcp_bind_hash(sk, tb, rover);
 		if (sk_unhashed(sk)) {
  			inet_sk(sk)->sport = htons(rover);
@@ -2646,8 +2629,8 @@ EXPORT_SYMBOL(tcp_bucket_create);
 EXPORT_SYMBOL(tcp_hashinfo);
 EXPORT_SYMBOL(tcp_inherit_port);
 EXPORT_SYMBOL(tcp_listen_wlock);
-EXPORT_SYMBOL(tcp_port_rover);
 EXPORT_SYMBOL(tcp_prot);
+EXPORT_SYMBOL(tcp_rover_next);
 EXPORT_SYMBOL(tcp_put_port);
 EXPORT_SYMBOL(tcp_unhash);
 EXPORT_SYMBOL(tcp_v4_conn_request);
diff -urNp -X dontdiff linux-2.6/net/ipv6/tcp_ipv6.c port-2.6/net/ipv6/tcp_ipv6.c
--- linux-2.6/net/ipv6/tcp_ipv6.c	2004-10-26 16:57:48.000000000 -0700
+++ port-2.6/net/ipv6/tcp_ipv6.c	2004-10-28 15:13:34.000000000 -0700
@@ -136,13 +136,10 @@ static int tcp_v6_get_port(struct sock *
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
-		int rover;
+		u16 rover;
 
-		spin_lock(&tcp_portalloc_lock);
-		rover = tcp_port_rover;
-		do {	rover++;
-			if ((rover < low) || (rover > high))
-				rover = low;
+		rover = low + tcp_port_rover() % (high - low);
+		do {
 			head = &tcp_bhash[tcp_bhashfn(rover)];
 			spin_lock(&head->lock);
 			tb_for_each(tb, node, &head->chain)
@@ -151,9 +148,9 @@ static int tcp_v6_get_port(struct sock *
 			break;
 		next:
 			spin_unlock(&head->lock);
+			if (++rover >= high)
+				rover = low;
 		} while (--remaining > 0);
-		tcp_port_rover = rover;
-		spin_unlock(&tcp_portalloc_lock);
 
 		/* Exhausted local port range during search? */
 		ret = 1;

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation
  2004-10-29 17:28   ` [PATCH] tcp: efficient port randomisation Stephen Hemminger
@ 2004-11-01  9:58     ` Michael Vittrup Larsen
  2004-11-01 17:20       ` Stephen Hemminger
  0 siblings, 1 reply; 21+ messages in thread
From: Michael Vittrup Larsen @ 2004-11-01  9:58 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Friday 29 October 2004 19:28, Stephen Hemminger wrote:
> Provide port randomization for incoming connections using variation of
> existing sequence number hash. Replace tcp_portalloc_lock and
> tcp_port_rover with atomic operation to allow better parallelism.
>
> This is based on
> http://www.ietf.org/internet-drafts/draft-larsen-tsvwg-port-randomisation-0
>0.txt (with confirmation of of no IPR issues).

I have looked through this, and have a few comments:

* It is probably a good strategy to set 'tcp_rover_next' such that
  the next search is resumed from the previous port found to be free.
  (similar to the old algorithm).  I don't see this in your patch,
  but of course I could have missed it.

* connect_port_offset() does not (at least from an algorithm point
  of view) need to return an u32, an u16 is sufficient.

Michael Larsen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation
  2004-11-01  9:58     ` Michael Vittrup Larsen
@ 2004-11-01 17:20       ` Stephen Hemminger
  2004-11-02  7:54         ` Michael Vittrup Larsen
  0 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2004-11-01 17:20 UTC (permalink / raw)
  To: Michael Vittrup Larsen; +Cc: David S. Miller, netdev

On Mon, 1 Nov 2004 11:58:23 +0200
Michael Vittrup Larsen <michael.vittrup.larsen@ericsson.com> wrote:

> On Friday 29 October 2004 19:28, Stephen Hemminger wrote:
> > Provide port randomization for incoming connections using variation of
> > existing sequence number hash. Replace tcp_portalloc_lock and
> > tcp_port_rover with atomic operation to allow better parallelism.
> >
> > This is based on
> > http://www.ietf.org/internet-drafts/draft-larsen-tsvwg-port-randomisation-0
> >0.txt (with confirmation of of no IPR issues).
> 
> I have looked through this, and have a few comments:
> 
> * It is probably a good strategy to set 'tcp_rover_next' such that
>   the next search is resumed from the previous port found to be free.
>   (similar to the old algorithm).  I don't see this in your patch,
>   but of course I could have missed it.

It was intentional since it would require holding a lock around the search. The tradeoff
is better SMP performance in the sparsely filled port space (more typical) vs.
better UP performance in the case of a mostly full port space.


> * connect_port_offset() does not (at least from an algorithm point
>   of view) need to return an u32, an u16 is sufficient.

If it is truncated to u16, then the compiler has to take extra effort to truncate,
which is unnecessary given the later modulo operation.

> Michael Larsen

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation
  2004-11-01 17:20       ` Stephen Hemminger
@ 2004-11-02  7:54         ` Michael Vittrup Larsen
  2004-11-04 18:01           ` Stephen Hemminger
  0 siblings, 1 reply; 21+ messages in thread
From: Michael Vittrup Larsen @ 2004-11-02  7:54 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Monday 01 November 2004 18:20, Stephen Hemminger wrote:
> > * It is probably a good strategy to set 'tcp_rover_next' such that
> >   the next search is resumed from the previous port found to be free.
> >   (similar to the old algorithm).  I don't see this in your patch,
> >   but of course I could have missed it.
>
> It was intentional since it would require holding a lock around the search.
> The tradeoff is better SMP performance in the sparsely filled port space
> (more typical) vs. better UP performance in the case of a mostly full port
> space.

I think a typical scenario is many short-lived (e.g. minutes) TCP connections, 
few long-lived (e.g. hours) connections and an ephemeral port wrap-around 
probably also in hours - at least a long time compared to the life-time of 
the short-lived connections.

This would result in a closely spaced 'group' of ports being occupied 
somewhere in the ephemeral port range, and 'tcp_rover_next' would point at 
the uppermost extreme of this group and thus always guarantee a free port on 
first try (collisions will only happen with long-lived connections).  If you 
don't update 'tcp_rover_next', and this somehow gets to lag behind this 
'group' of ports (say point at the lower extreme) you will need to search 
through this group first before you enter the unoccupied port space.

Your scheme works initially because you do not lag behind the free port space, 
but eventually you will, and I think this will result in less optimal 
performance compared to the old behaviour.

Since updating the 'tcp_rover_next' practically always result in a free port 
on first try, I think SMP performance will not suffer even though the lock 
was held all through the port search (except when the port space is very 
crowded).

And yes, I do use Linux exclusively, so I do care :-))

From a statistical point of view, if the connection life-times are uniformly
distributed from zero to infinity (theoretical scenario), it does not matter
what starting point you use. However, as soon as life-times are not uniformly
distributed, this kind of search algorithm will benefit from a good starting
point defining where the probability of used vs. unused ports drops from high
to low.

The BSD solution with a pure random rover suffers similarly, especially when 
the port space becomes crowded.

> > * connect_port_offset() does not (at least from an algorithm point
> >   of view) need to return an u32, an u16 is sufficient.
>
> If it is truncated to u16, then compiler has to take extra effort to
> truncate is unnecessary given later  modulo operation.

I agree (in fact thats what I argued in the draft) - it probably depends on 
your platform - you are assuming a 32-bit platform I guess.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation
  2004-11-02  7:54         ` Michael Vittrup Larsen
@ 2004-11-04 18:01           ` Stephen Hemminger
  2004-11-05 10:03             ` Michael Vittrup Larsen
  0 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2004-11-04 18:01 UTC (permalink / raw)
  To: Michael Vittrup Larsen; +Cc: David S. Miller, netdev

On Tue, 2 Nov 2004 09:54:44 +0200
Michael Vittrup Larsen <michael.vittrup.larsen@ericsson.com> wrote:

> On Monday 01 November 2004 18:20, Stephen Hemminger wrote:
> > > * It is probably a good strategy to set 'tcp_rover_next' such that
> > >   the next search is resumed from the previous port found to be free.
> > >   (similar to the old algorithm).  I don't see this in your patch,
> > >   but of course I could have missed it.
> >
> > It was intentional since it would require holding a lock around the search.
> > The tradeoff is better SMP performance in the sparsely filled port space
> > (more typical) vs. better UP performance in the case of a mostly full port
> > space.
> 
> I think a typical scenario is many short-lived (e.g. minutes) TCP connections, 
> few long-lived (e.g. hours) connections and an ephemeral port wrap-around 
> probably also in hours - at least a long time compared to the life-time of 
> the short-lived connections.

But because of the hashing most ports will be scattered all over the port space,
because they come from different hosts.

> This would result in a closely spaced 'group' of ports being occupied 
> somewhere in the ephemeral port range, and 'tcp_rover_next' would point at 
> the uppermost extreme of this group and thus always guarantee a free port on 
> first try (collisions will only happen with long-lived connections).  If you 
> don't update 'tcp_rover_next', and this somehow gets to lag behind this 
> 'group' of ports (say point at the lower extreme) you will need to search 
> through this group first before you enter the unoccupied port space.

Also, Linux TCP will reuse ports if (saddr, daddr, sport) are different.
Look at __tcp_v4_check_established.  This means that the ports actually have
to be in use with real connections to the same host.

> Your scheme works initially because you do not lag behind the free port space, 
> but eventually you will, and I think this will result in less optimal 
> performance compared to the old behaviour.

Free port space should be evenly distributed because of the hash function.

> Since updating the 'tcp_rover_next' practically always result in a free port 
> on first try, I think SMP performance will not suffer even though the lock 
> was held all through the port search (except when the port space is very 
> crowded).

But by not having a global lock on port allocation, different CPUs can be
searching different hash trees.  This would matter under a DoS attack with
multiple interfaces.

> And yes, I do use Linux exclusively, so I do care :-))
> 
> >From a statistically point of view, if the connection life-times are uniformly 
> distributed from zero to infinite (theoretical scenario), it does not matter 
> what starting point you use. However, soon as life-times are not uniformly 
> distributed, this kind of search algorithm will benefit from good starting 
> point defining where the probability of used vs. unused port drop from high 
> to low.
> 
> The BSD solution with a pure random rover suffers similarly, especially when 
> the port space becomes crowded.
> 
> 
> > > * connect_port_offset() does not (at least from an algorithm point
> > >   of view) need to return an u32, an u16 is sufficient.
> >
> > If it is truncated to u16, then compiler has to take extra effort to
> > truncate is unnecessary given later  modulo operation.
> 
> I agree (in fact thats what I argued in the draft) - it probably depends on 
> your platform - you are assuming a 32-bit platform I guess.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation
  2004-11-04 18:01           ` Stephen Hemminger
@ 2004-11-05 10:03             ` Michael Vittrup Larsen
  2004-11-17 23:30               ` [PATCH] tcp: efficient port randomisation (revised) Stephen Hemminger
  0 siblings, 1 reply; 21+ messages in thread
From: Michael Vittrup Larsen @ 2004-11-05 10:03 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Thursday 04 November 2004 19:01, Stephen Hemminger wrote:
> But because of the hashing most ports will be scattered all over the port
> space, because they come from different hosts.
>
> Also, Linux TCP will reuse ports if (saddr, daddr, sport) are different.
> Look at __tcp_v4_check_established.  This means that the ports actually
> have to be in use with real connections to the same host.

__tcp_v4_check_established is the Linux version of the uniqueness test from
the draft, and of course ports can be reused when at least one of the
other parameters (saddr, daddr, sport) is different.

I focus on the situation where (saddr, daddr, sport) is constant since this is 
where we get collisions and need to try another port.  Not storing the port 
found to be unused will result in situations like:

tcp_rover_next is say 2000

Ports 2000-2010 are already used because you are browsing www.osdl.org

Your search will find 2011 to be unused after 10 retries and tcp_rover_next 
will be 2001.

Your next search (you continue to browse www.osdl.org) will result in 2012 - 
again after 10 retries.

In a simple browsing scenario like this, you will usually not have holes 
because of TCP TIME-WAIT and your rover will continue to lag behind and you 
will continue to make 10 retries on ports.

The question is then, when does the rover begin to lag behind?

Everytime you meet a long-lived connection in the port space your rover will 
lag one port behind the real 'unused' rover.  Using wget and browsing 
www.osdl.org may easily produce this problem.

I understand your argument for not holding the lock, and maybe the following 
algorithm is a compromise:

1. Use the current algorithm that does not hold the lock

2. If a port was found in first try, then exit (you have already
   incremented tcp_rover_next by 1 so this is up to date as per
   the old algorithm).

3. If more than one port try was necessary, compute the difference between
   the initial rover to the current unused port, and atomic_add() this to
   tcp_rover_next.

The only drawback of this is, that tcp_rover_next may 'run' a little too fast 
in contention cases, which only has theoretical impact on performance. Also, 
this only happens when we meet a long-lived connection, which we usually do 
not have many of.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] tcp: efficient port randomisation (revised)
  2004-11-05 10:03             ` Michael Vittrup Larsen
@ 2004-11-17 23:30               ` Stephen Hemminger
  2004-11-19  7:38                 ` Michael Vittrup Larsen
  2004-12-01  5:46                 ` David S. Miller
  0 siblings, 2 replies; 21+ messages in thread
From: Stephen Hemminger @ 2004-11-17 23:30 UTC (permalink / raw)
  To: David S. Miller; +Cc: Michael Vittrup Larsen, netdev

Here is a more conservative version of the earlier patch that keeps the same port rover locking and global port rover. This randomizes TCP ephemeral ports
of incoming connections using a variation of the existing sequence number hash.

Thanks to original author Michael Larsen. 
http://www.ietf.org/internet-drafts/draft-larsen-tsvwg-port-randomisation-00.txt

It behaves correctly if someone is perverse and sets low > high
and it separates the outgoing port rover (tcp_port_rover) from the incoming port rover (rover_start).

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

diff -Nru a/drivers/char/random.c b/drivers/char/random.c
--- a/drivers/char/random.c	2004-11-17 15:21:43 -08:00
+++ b/drivers/char/random.c	2004-11-17 15:21:43 -08:00
@@ -2347,6 +2347,24 @@
 	return halfMD4Transform(hash, keyptr->secret);
 }
 
+/* Generate secure starting point for ephemeral TCP port search */
+u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
+{
+	struct keydata *keyptr = get_keyptr();
+	u32 hash[4];
+
+	/*
+	 *  Pick a unique starting offset for each ephemeral port search
+	 *  (saddr, daddr, dport) and 48bits of random data.
+	 */
+	hash[0] = saddr;
+	hash[1] = daddr;
+	hash[2] = dport ^ keyptr->secret[10];
+	hash[3] = keyptr->secret[11];
+
+	return halfMD4Transform(hash, keyptr->secret);
+}
+
 #ifdef CONFIG_SYN_COOKIES
 /*
  * Secure SYN cookie computation. This is the algorithm worked out by
diff -Nru a/include/linux/random.h b/include/linux/random.h
--- a/include/linux/random.h	2004-11-17 15:21:43 -08:00
+++ b/include/linux/random.h	2004-11-17 15:21:43 -08:00
@@ -52,6 +52,7 @@
 void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
+extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 					__u16 sport, __u16 dport);
 extern __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr,
diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c	2004-11-17 15:21:43 -08:00
+++ b/net/ipv4/tcp_ipv4.c	2004-11-17 15:21:43 -08:00
@@ -636,6 +636,13 @@
 	return -EADDRNOTAVAIL;
 }
 
+static inline u32 connect_port_offset(const struct sock *sk)
+{
+	const struct inet_opt *inet = inet_sk(sk);
+	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
+					 inet->dport);
+}
+
 /*
  * Bind a port for a connect operation and hash it.
  */
@@ -647,10 +654,12 @@
 	int ret;
 
  	if (!snum) {
- 		int rover;
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
- 		int remaining = (high - low) + 1;
+ 		const u16 low = sysctl_local_port_range[0];
+ 		const u16 high = sysctl_local_port_range[1];
+ 		u16 rover = low;
+ 		int remaining = (high-low) + 1;
+		u32 offset = connect_port_offset(sk);
+		static u32 rover_start;
 		struct hlist_node *node;
  		struct tcp_tw_bucket *tw = NULL;
 
@@ -660,7 +669,7 @@
  		 * tcp_portalloc_lock before next submission to Linus.
  		 * As soon as we touch this place at all it is time to think.
  		 *
- 		 * Now it protects single _advisory_ variable tcp_port_rover,
+ 		 * Now it protects single _advisory_ variable rover_start,
  		 * hence it is mostly useless.
  		 * Code will work nicely if we just delete it, but
  		 * I am afraid in contented case it will work not better or
@@ -670,12 +679,9 @@
  		 * memory pingpong. Any ideas how to do this in a nice way?
  		 */
  		spin_lock(&tcp_portalloc_lock);
- 		rover = tcp_port_rover;
-
- 		do {
- 			rover++;
- 			if ((rover < low) || (rover > high))
- 				rover = low;
+		while (remaining > 0) {
+			rover = low + (rover_start + offset) % 
+				(high - low);
  			head = &tcp_bhash[tcp_bhashfn(rover)];
  			spin_lock(&head->lock);
 
@@ -706,8 +712,9 @@
 
  		next_port:
  			spin_unlock(&head->lock);
- 		} while (--remaining > 0);
- 		tcp_port_rover = rover;
+			--remaining;
+			++rover_start;
+ 		}
  		spin_unlock(&tcp_portalloc_lock);
 
  		local_bh_enable();
@@ -716,7 +723,6 @@
 
 ok:
  		/* All locks still held and bhs disabled */
- 		tcp_port_rover = rover;
  		spin_unlock(&tcp_portalloc_lock);
 
  		tcp_bind_hash(sk, tb, rover);

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
  2004-11-17 23:30               ` [PATCH] tcp: efficient port randomisation (revised) Stephen Hemminger
@ 2004-11-19  7:38                 ` Michael Vittrup Larsen
  2004-12-01  5:46                 ` David S. Miller
  1 sibling, 0 replies; 21+ messages in thread
From: Michael Vittrup Larsen @ 2004-11-19  7:38 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

I have looked through this patch and found no problems - thank you for 
implementing the draft.

/Michael

On Thursday 18 November 2004 00:30, Stephen Hemminger wrote:
> Here is a more conservative version of earlier patch vthat keeps the same
> port rover locking and global port rover. This randomizes TCP ephemeral
> ports of incoming connections using variation of existing sequence number
> hash.
>
> Thanks to original author Michael Larsen.
> http://www.ietf.org/internet-drafts/draft-larsen-tsvwg-port-randomisation-0
>0.txt
>
> It behaves correctly if someone is perverse and sets low > high
> and it separates the outgoing port rover (tcp_port_rover) from the incoming
> port rover (start_rover).
>
> Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
>
> diff -Nru a/drivers/char/random.c b/drivers/char/random.c
> --- a/drivers/char/random.c 2004-11-17 15:21:43 -08:00
> +++ b/drivers/char/random.c 2004-11-17 15:21:43 -08:00
> @@ -2347,6 +2347,24 @@
>   return halfMD4Transform(hash, keyptr->secret);
>  }
>
> +/* Generate secure starting point for ephemeral TCP port search */
> +u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
> +{
> + struct keydata *keyptr = get_keyptr();
> + u32 hash[4];
> +
> + /*
> +  *  Pick a unique starting offset for each ephemeral port search
> +  *  (saddr, daddr, dport) and 48bits of random data.
> +  */
> + hash[0] = saddr;
> + hash[1] = daddr;
> + hash[2] = dport ^ keyptr->secret[10];
> + hash[3] = keyptr->secret[11];
> +
> + return halfMD4Transform(hash, keyptr->secret);
> +}
> +
>  #ifdef CONFIG_SYN_COOKIES
>  /*
>   * Secure SYN cookie computation. This is the algorithm worked out by
> diff -Nru a/include/linux/random.h b/include/linux/random.h
> --- a/include/linux/random.h 2004-11-17 15:21:43 -08:00
> +++ b/include/linux/random.h 2004-11-17 15:21:43 -08:00
> @@ -52,6 +52,7 @@
>  void generate_random_uuid(unsigned char uuid_out[16]);
>
>  extern __u32 secure_ip_id(__u32 daddr);
> +extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16
> dport); extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
> __u16 sport, __u16 dport);
>  extern __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr,
> diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> --- a/net/ipv4/tcp_ipv4.c 2004-11-17 15:21:43 -08:00
> +++ b/net/ipv4/tcp_ipv4.c 2004-11-17 15:21:43 -08:00
> @@ -636,6 +636,13 @@
>   return -EADDRNOTAVAIL;
>  }
>
> +static inline u32 connect_port_offset(const struct sock *sk)
> +{
> + const struct inet_opt *inet = inet_sk(sk);
> + return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
> +      inet->dport);
> +}
> +
>  /*
>   * Bind a port for a connect operation and hash it.
>   */
> @@ -647,10 +654,12 @@
>   int ret;
>
>    if (!snum) {
> -   int rover;
> -   int low = sysctl_local_port_range[0];
> -   int high = sysctl_local_port_range[1];
> -   int remaining = (high - low) + 1;
> +   const u16 low = sysctl_local_port_range[0];
> +   const u16 high = sysctl_local_port_range[1];
> +   u16 rover = low;
> +   int remaining = (high-low) + 1;
> +  u32 offset = connect_port_offset(sk);
> +  static u32 rover_start;
>    struct hlist_node *node;
>     struct tcp_tw_bucket *tw = NULL;
>
> @@ -660,7 +669,7 @@
>      * tcp_portalloc_lock before next submission to Linus.
>      * As soon as we touch this place at all it is time to think.
>      *
> -    * Now it protects single _advisory_ variable tcp_port_rover,
> +    * Now it protects single _advisory_ variable rover_start,
>      * hence it is mostly useless.
>      * Code will work nicely if we just delete it, but
>      * I am afraid in contented case it will work not better or
> @@ -670,12 +679,9 @@
>      * memory pingpong. Any ideas how to do this in a nice way?
>      */
>     spin_lock(&tcp_portalloc_lock);
> -   rover = tcp_port_rover;
> -
> -   do {
> -    rover++;
> -    if ((rover < low) || (rover > high))
> -     rover = low;
> +  while (remaining > 0) {
> +   rover = low + (rover_start + offset) %
> +    (high - low);
>      head = &tcp_bhash[tcp_bhashfn(rover)];
>      spin_lock(&head->lock);
>
> @@ -706,8 +712,9 @@
>
>     next_port:
>      spin_unlock(&head->lock);
> -   } while (--remaining > 0);
> -   tcp_port_rover = rover;
> +   --remaining;
> +   ++rover_start;
> +   }
>     spin_unlock(&tcp_portalloc_lock);
>
>     local_bh_enable();
> @@ -716,7 +723,6 @@
>
>  ok:
>     /* All locks still held and bhs disabled */
> -   tcp_port_rover = rover;
>     spin_unlock(&tcp_portalloc_lock);
>
>     tcp_bind_hash(sk, tb, rover);

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
  2004-11-17 23:30               ` [PATCH] tcp: efficient port randomisation (revised) Stephen Hemminger
  2004-11-19  7:38                 ` Michael Vittrup Larsen
@ 2004-12-01  5:46                 ` David S. Miller
       [not found]                   ` <20041201152446.3a0d5ce3@dxpl.pdx.osdl.net>
  1 sibling, 1 reply; 21+ messages in thread
From: David S. Miller @ 2004-12-01  5:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: michael.vittrup.larsen, netdev

On Wed, 17 Nov 2004 15:30:25 -0800
Stephen Hemminger <shemminger@osdl.org> wrote:

> Here is a more conservative version of earlier patch that keeps the
> same port rover locking and global port rover. This randomizes TCP
> ephemeral ports of incoming connections using variation of existing
> sequence number hash.
> 
> Thanks to original author Michael Larsen. 
> http://www.ietf.org/internet-drafts/draft-larsen-tsvwg-port-randomisation-00.txt
> 
> It behaves correctly if someone is perverse and sets low > high
> and it separates the outgoing port rover (tcp_port_rover) from the
> incoming port rover (start_rover).

I'm fine with this patch semantically.  What do the
before/after microbenchmarks look like?  We're adding
a MD4 transform plus a modulus for every local port
select operation.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
       [not found]                     ` <20041201204622.7b760400.davem@davemloft.net>
@ 2004-12-02 21:49                       ` Stephen Hemminger
  2004-12-02 21:52                         ` David S. Miller
  0 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2004-12-02 21:49 UTC (permalink / raw)
  To: David S. Miller; +Cc: michael.vittrup.larsen, netdev

On Wed, 1 Dec 2004 20:46:22 -0800
"David S. Miller" <davem@davemloft.net> wrote:


> I'm more interested in simple things like "lat_connect"
> from lmbench run over loopback.

Oh, that was easy using OSDL PLM/STP which gives an easy way to run
local tests.  

Baseline...
[STP 299123] lmbench_long results  Kernel: patch-2.6.10-rc2 PLM # 3869
*Local* Communication latencies in microseconds - smaller is better
-------------------------------------------------------------------
Host                 OS 2p/0K  Pipe AF     UDP  RPC/   TCP  RPC/ TCP
                        ctxsw       UNIX         UDP         TCP conn
--------- ------------- ----- ----- ---- ----- ----- ----- ----- ----
stp2-001  Linux 2.6.10- 8.270  38.6 24.3  61.6  48.5  45.9  76.6 74.6
stp2-001  Linux 2.6.10- 8.170  43.5 24.5  58.0  54.8  45.6  63.4 74.7
stp2-001  Linux 2.6.10- 2.740  50.6 29.9  40.3  48.3  59.8  75.1 101.
stp2-001  Linux 2.6.10- 8.140  46.6 29.7  57.6  48.8  45.5  72.0 74.4
stp2-001  Linux 2.6.10- 2.690  47.1 26.3  40.8  48.9  45.5  75.4 74.8
-----------------
Patched...
[STP 299118] lmbench_long results  Kernel: tcp-port-randomization-2 PLM # 3907
*Local* Communication latencies in microseconds - smaller is better
-------------------------------------------------------------------
Host                 OS 2p/0K  Pipe AF     UDP  RPC/   TCP  RPC/ TCP
                        ctxsw       UNIX         UDP         TCP conn
--------- ------------- ----- ----- ---- ----- ----- ----- ----- ----
stp2-001  Linux 2.6.10- 2.770  46.3 25.0  64.4  49.5  44.3  75.2 75.6
stp2-001  Linux 2.6.10- 2.780  44.1 21.2  63.5  55.6  45.3  63.5 75.2
stp2-001  Linux 2.6.10- 2.790  47.5 24.8  40.4  48.5  45.5  63.7 76.9
stp2-001  Linux 2.6.10- 8.330  47.5 24.8  40.7  55.6  44.6  63.8 75.1
stp2-001  Linux 2.6.10- 8.150  47.9 25.7  41.2  49.6  45.2  72.7 75.1


These are run on a relatively slow machine (2 way Pentium III 850Mhz)
and it looks like the results are no change (in the noise).

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
  2004-12-02 21:49                       ` Stephen Hemminger
@ 2004-12-02 21:52                         ` David S. Miller
  2004-12-02 22:51                           ` Stephen Hemminger
                                             ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: David S. Miller @ 2004-12-02 21:52 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: michael.vittrup.larsen, netdev

On Thu, 2 Dec 2004 13:49:30 -0800
Stephen Hemminger <shemminger@osdl.org> wrote:

> These are run on a relatively slow machine (2 way Pentium III 850Mhz)
> and it looks like the results are no change (in the noise).

Or averaged out, about 1us more expensive.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
  2004-12-02 21:52                         ` David S. Miller
@ 2004-12-02 22:51                           ` Stephen Hemminger
  2004-12-02 23:01                           ` Stephen Hemminger
  2004-12-04  5:42                           ` Stephen Hemminger
  2 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2004-12-02 22:51 UTC (permalink / raw)
  To: David S. Miller; +Cc: michael.vittrup.larsen, netdev

On Thu, 2 Dec 2004 13:52:52 -0800
"David S. Miller" <davem@davemloft.net> wrote:

> On Thu, 2 Dec 2004 13:49:30 -0800
> Stephen Hemminger <shemminger@osdl.org> wrote:
> 
> > These are run on a relatively slow machine (2 way Pentium III 850Mhz)
> > and it looks like the results are no change (in the noise).
> 
> Or averaged out, about 1us more expensive.

I am writing my own test, since that one seems so noisy.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
  2004-12-02 21:52                         ` David S. Miller
  2004-12-02 22:51                           ` Stephen Hemminger
@ 2004-12-02 23:01                           ` Stephen Hemminger
  2004-12-04  5:42                           ` Stephen Hemminger
  2 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2004-12-02 23:01 UTC (permalink / raw)
  To: David S. Miller; +Cc: michael.vittrup.larsen, netdev

On Thu, 2 Dec 2004 13:52:52 -0800
"David S. Miller" <davem@davemloft.net> wrote:

> On Thu, 2 Dec 2004 13:49:30 -0800
> Stephen Hemminger <shemminger@osdl.org> wrote:
> 
> > These are run on a relatively slow machine (2 way Pentium III 850Mhz)
> > and it looks like the results are no change (in the noise).
> 
> Or averaged out, about 1us more expensive.

We could always special-case the loopback case since there is
no risk of man-in-the-middle attacks.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
  2004-12-02 21:52                         ` David S. Miller
  2004-12-02 22:51                           ` Stephen Hemminger
  2004-12-02 23:01                           ` Stephen Hemminger
@ 2004-12-04  5:42                           ` Stephen Hemminger
  2004-12-06  8:18                             ` Michael Vittrup Larsen
  2 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2004-12-04  5:42 UTC (permalink / raw)
  To: David S. Miller; +Cc: michael.vittrup.larsen, netdev

If I special case to handle loopback, and get rid of the portalloc
lock, it comes out much better.  These numbers are on the 800Mhz
PIII SMP; on a fast box like the dual Opterons it makes no difference
(always 30us).

Before TCP connection latency mean 79.9 std 10.55

*Local* Communication latencies in microseconds - smaller is better
-------------------------------------------------------------------
Host                 OS 2p/0K  Pipe AF     UDP  RPC/   TCP  RPC/ TCP
                         ctxsw       UNIX         UDP         TCP conn
--------- ------------- ----- ----- ---- ----- ----- ----- ----- ----
stp2-001  Linux 2.6.10- 8.270  38.6 24.3  61.6  48.5  45.9  76.6 74.6
stp2-001  Linux 2.6.10- 8.170  43.5 24.5  58.0  54.8  45.6  63.4 74.7
stp2-001  Linux 2.6.10- 2.740  50.6 29.9  40.3  48.3  59.8  75.1 101.
stp2-001  Linux 2.6.10- 8.140  46.6 29.7  57.6  48.8  45.5  72.0 74.4
stp2-001  Linux 2.6.10- 2.690  47.1 26.3  40.8  48.9  45.5  75.4 74.8


After TCP connection latency mean 73.8 std 0.55

*Local* Communication latencies in microseconds - smaller is better
-------------------------------------------------------------------
Host                 OS 2p/0K  Pipe AF     UDP  RPC/   TCP  RPC/ TCP
                         ctxsw       UNIX         UDP         TCP conn
--------- ------------- ----- ----- ---- ----- ----- ----- ----- ----
stp2-001  Linux 2.6.10- 8.260  38.1 25.7  63.3  48.1  66.6  75.4 74.9
stp2-001  Linux 2.6.10- 8.090  46.2 26.0  63.4  55.5  45.9  63.6 73.5
stp2-001  Linux 2.6.10- 8.210  39.0 21.2  63.1  55.4  58.8  63.8 73.5
stp2-001  Linux 2.6.10- 2.850  46.5 26.0  64.8  54.6  45.5  74.0 73.6
stp2-001  Linux 2.6.10- 8.200  42.9 21.5  64.9  55.6  62.4  64.1 73.5

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (revised)
  2004-12-04  5:42                           ` Stephen Hemminger
@ 2004-12-06  8:18                             ` Michael Vittrup Larsen
  2004-12-06 17:42                               ` [PATCH] tcp: efficient port randomisation (rev 3) Stephen Hemminger
  0 siblings, 1 reply; 21+ messages in thread
From: Michael Vittrup Larsen @ 2004-12-06  8:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

Measuring non-blocking connect using the loopback address I agree with 
Stephen's conclusion, that the cost of the MD4 gets lost in the noise.

I did not measure the variance, since this probably describes scheduling and 
not the actual ephemeral bind.  However, I did measure the minimum value, and 
the assumption is that this is close to a measurement of an uninterrupted 
connect.  Median filtering probably would be more correct...

Below are the results from 10 successive tests.

Unmodified (average and minimum values):

connect 24433 (min 21820) [ticks/op]
connect 24504 (min 21927) [ticks/op]
connect 24530 (min 21952) [ticks/op]
connect 24244 (min 21607) [ticks/op]
connect 24220 (min 21613) [ticks/op]
connect 24117 (min 21665) [ticks/op]
connect 24148 (min 21663) [ticks/op]
connect 24079 (min 21648) [ticks/op]
connect 23998 (min 21700) [ticks/op]
connect 23906 (min 21682) [ticks/op]

Modified (average and minimum values):

connect 23961 (min 21774) [ticks/op]
connect 23894 (min 21750) [ticks/op]
connect 23927 (min 21776) [ticks/op]
connect 23881 (min 21757) [ticks/op]
connect 23956 (min 21749) [ticks/op]
connect 23872 (min 21710) [ticks/op]
connect 23848 (min 21694) [ticks/op]
connect 23729 (min 21769) [ticks/op]
connect 23656 (min 21618) [ticks/op]
connect 23723 (min 21699) [ticks/op]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] tcp: efficient port randomisation (rev 3)
  2004-12-06  8:18                             ` Michael Vittrup Larsen
@ 2004-12-06 17:42                               ` Stephen Hemminger
  2004-12-09  7:55                                 ` David S. Miller
  0 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2004-12-06 17:42 UTC (permalink / raw)
  To: David S. Miller; +Cc: Michael Vittrup Larsen, netdev

Third revision of the TCP port randomization patch. It randomizes
TCP ephemeral ports of incoming connections using variation of existing
sequence number hash. This one avoids the MD4 for the loopback case since 
there is no reason to bother over loopback and it improves benchmark numbers.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

Thanks to original author Michael Larsen. 
http://www.ietf.org/internet-drafts/draft-larsen-tsvwg-port-randomisation-00.txt 

diff -urNp -X dontdiff test-2.6/drivers/char/random.c tcpport/drivers/char/random.c
--- test-2.6/drivers/char/random.c	2004-11-30 16:26:41.000000000 -0800
+++ tcpport/drivers/char/random.c	2004-12-03 17:04:18.267850607 -0800
@@ -2347,6 +2347,24 @@ __u32 secure_ip_id(__u32 daddr)
 	return halfMD4Transform(hash, keyptr->secret);
 }
 
+/* Generate secure starting point for ephemeral TCP port search */
+u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
+{
+	struct keydata *keyptr = get_keyptr();
+	u32 hash[4];
+
+	/*
+	 *  Pick a unique starting offset for each ephemeral port search
+	 *  (saddr, daddr, dport) and 48bits of random data.
+	 */
+	hash[0] = saddr;
+	hash[1] = daddr;
+	hash[2] = dport ^ keyptr->secret[10];
+	hash[3] = keyptr->secret[11];
+
+	return halfMD4Transform(hash, keyptr->secret);
+}
+
 #ifdef CONFIG_SYN_COOKIES
 /*
  * Secure SYN cookie computation. This is the algorithm worked out by
diff -urNp -X dontdiff test-2.6/include/linux/random.h tcpport/include/linux/random.h
--- test-2.6/include/linux/random.h	2004-11-30 16:26:51.000000000 -0800
+++ tcpport/include/linux/random.h	2004-12-02 17:07:13.000000000 -0800
@@ -52,6 +52,7 @@ extern void get_random_bytes(void *buf, 
 void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
+extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 					__u16 sport, __u16 dport);
 extern __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr,
diff -urNp -X dontdiff test-2.6/net/ipv4/tcp_ipv4.c tcpport/net/ipv4/tcp_ipv4.c
--- test-2.6/net/ipv4/tcp_ipv4.c	2004-11-30 16:26:51.000000000 -0800
+++ tcpport/net/ipv4/tcp_ipv4.c	2004-12-03 17:04:26.454562583 -0800
@@ -636,10 +636,18 @@ not_unique:
 	return -EADDRNOTAVAIL;
 }
 
+static inline u32 connect_port_offset(const struct sock *sk)
+{
+	const struct inet_opt *inet = inet_sk(sk);
+
+	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
+					 inet->dport);
+}
+
 /*
  * Bind a port for a connect operation and hash it.
  */
-static int tcp_v4_hash_connect(struct sock *sk)
+static int tcp_v4_hash_connect(struct sock *sk, int loopback)
 {
 	unsigned short snum = inet_sk(sk)->num;
  	struct tcp_bind_hashbucket *head;
@@ -647,36 +655,23 @@ static int tcp_v4_hash_connect(struct so
 	int ret;
 
  	if (!snum) {
- 		int rover;
  		int low = sysctl_local_port_range[0];
  		int high = sysctl_local_port_range[1];
- 		int remaining = (high - low) + 1;
+		int range = high - low;
+ 		int i;
+		int port;
+		static u32 hint;
+		u32 offset = hint;
 		struct hlist_node *node;
  		struct tcp_tw_bucket *tw = NULL;
 
+		if (!loopback) 
+			offset += connect_port_offset(sk);
+		
  		local_bh_disable();
-
- 		/* TODO. Actually it is not so bad idea to remove
- 		 * tcp_portalloc_lock before next submission to Linus.
- 		 * As soon as we touch this place at all it is time to think.
- 		 *
- 		 * Now it protects single _advisory_ variable tcp_port_rover,
- 		 * hence it is mostly useless.
- 		 * Code will work nicely if we just delete it, but
- 		 * I am afraid in contented case it will work not better or
- 		 * even worse: another cpu just will hit the same bucket
- 		 * and spin there.
- 		 * So some cpu salt could remove both contention and
- 		 * memory pingpong. Any ideas how to do this in a nice way?
- 		 */
- 		spin_lock(&tcp_portalloc_lock);
- 		rover = tcp_port_rover;
-
- 		do {
- 			rover++;
- 			if ((rover < low) || (rover > high))
- 				rover = low;
- 			head = &tcp_bhash[tcp_bhashfn(rover)];
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &tcp_bhash[tcp_bhashfn(port)];
  			spin_lock(&head->lock);
 
  			/* Does not bother with rcv_saddr checks,
@@ -684,19 +679,19 @@ static int tcp_v4_hash_connect(struct so
  			 * unique enough.
  			 */
 			tb_for_each(tb, node, &head->chain) {
- 				if (tb->port == rover) {
+ 				if (tb->port == port) {
  					BUG_TRAP(!hlist_empty(&tb->owners));
  					if (tb->fastreuse >= 0)
  						goto next_port;
  					if (!__tcp_v4_check_established(sk,
-									rover,
+									port,
 									&tw))
  						goto ok;
  					goto next_port;
  				}
  			}
 
- 			tb = tcp_bucket_create(head, rover);
+ 			tb = tcp_bucket_create(head, port);
  			if (!tb) {
  				spin_unlock(&head->lock);
  				break;
@@ -706,22 +701,18 @@ static int tcp_v4_hash_connect(struct so
 
  		next_port:
  			spin_unlock(&head->lock);
- 		} while (--remaining > 0);
- 		tcp_port_rover = rover;
- 		spin_unlock(&tcp_portalloc_lock);
-
+ 		}
  		local_bh_enable();
 
  		return -EADDRNOTAVAIL;
 
 ok:
- 		/* All locks still held and bhs disabled */
- 		tcp_port_rover = rover;
- 		spin_unlock(&tcp_portalloc_lock);
+		hint += i;
 
- 		tcp_bind_hash(sk, tb, rover);
+ 		/* Head lock still held and bh's disabled */
+ 		tcp_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(rover);
+ 			inet_sk(sk)->sport = htons(port);
  			__tcp_v4_hash(sk, 0);
  		}
  		spin_unlock(&head->lock);
@@ -832,7 +823,7 @@ int tcp_v4_connect(struct sock *sk, stru
 	 * complete initialization after this.
 	 */
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = tcp_v4_hash_connect(sk);
+	err = tcp_v4_hash_connect(sk, rt->rt_flags & RTCF_LOCAL);
 	if (err)
 		goto failure;
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomisation (rev 3)
  2004-12-06 17:42                               ` [PATCH] tcp: efficient port randomisation (rev 3) Stephen Hemminger
@ 2004-12-09  7:55                                 ` David S. Miller
  2004-12-11  1:09                                   ` [PATCH] tcp: efficient port randomistion " Stephen Hemminger
  0 siblings, 1 reply; 21+ messages in thread
From: David S. Miller @ 2004-12-09  7:55 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: michael.vittrup.larsen, netdev

On Mon, 6 Dec 2004 09:42:34 -0800
Stephen Hemminger <shemminger@osdl.org> wrote:

> Third revision of the TCP port randomization patch. It randomizes
> TCP ephemeral ports of incoming connections using variation of existing
> sequence number hash. This one avoids the MD4 for the loopback case since 
> there is no reason to bother over loopback and it improves benchmark numbers.

I don't think the loopback optimization is really necessary.
And in any event, RTCF_LOCAL doesn't necessarily mean that
the connection doesn't go "on the wire" especially when using
Julian's "send to self" patch which I might add at some point.

Anyways, please resend to me the version without the loopback
hack and I'll add it to my 2.6.11 queue.

Thanks Stephen and Michael.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomistion (rev 3)
  2004-12-09  7:55                                 ` David S. Miller
@ 2004-12-11  1:09                                   ` Stephen Hemminger
  2004-12-20 23:39                                     ` David S. Miller
  0 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2004-12-11  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: michael.vittrup.larsen, netdev

okay, here is the revised version. Testing shows that it
is more consistent, and just as fast as existing code,
probably because of getting rid of portalloc_lock and
better distribution.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

diff -Nru a/drivers/char/random.c b/drivers/char/random.c
--- a/drivers/char/random.c	2004-12-09 23:08:07 -08:00
+++ b/drivers/char/random.c	2004-12-09 23:08:07 -08:00
@@ -2347,6 +2347,24 @@
 	return halfMD4Transform(hash, keyptr->secret);
 }
 
+/* Generate secure starting point for ephemeral TCP port search */
+u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
+{
+	struct keydata *keyptr = get_keyptr();
+	u32 hash[4];
+
+	/*
+	 *  Pick a unique starting offset for each ephemeral port search
+	 *  (saddr, daddr, dport) and 48bits of random data.
+	 */
+	hash[0] = saddr;
+	hash[1] = daddr;
+	hash[2] = dport ^ keyptr->secret[10];
+	hash[3] = keyptr->secret[11];
+
+	return halfMD4Transform(hash, keyptr->secret);
+}
+
 #ifdef CONFIG_SYN_COOKIES
 /*
  * Secure SYN cookie computation. This is the algorithm worked out by
diff -Nru a/include/linux/random.h b/include/linux/random.h
--- a/include/linux/random.h	2004-12-09 23:08:07 -08:00
+++ b/include/linux/random.h	2004-12-09 23:08:07 -08:00
@@ -52,6 +52,7 @@
 void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
+extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 					__u16 sport, __u16 dport);
 extern __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr,
diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c	2004-12-09 23:08:07 -08:00
+++ b/net/ipv4/tcp_ipv4.c	2004-12-09 23:08:07 -08:00
@@ -636,10 +636,18 @@
 	return -EADDRNOTAVAIL;
 }
 
+static inline u32 connect_port_offset(const struct sock *sk)
+{
+	const struct inet_opt *inet = inet_sk(sk);
+
+	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
+					 inet->dport);
+}
+
 /*
  * Bind a port for a connect operation and hash it.
  */
-static int tcp_v4_hash_connect(struct sock *sk)
+static inline int tcp_v4_hash_connect(struct sock *sk)
 {
 	unsigned short snum = inet_sk(sk)->num;
  	struct tcp_bind_hashbucket *head;
@@ -647,36 +655,20 @@
 	int ret;
 
  	if (!snum) {
- 		int rover;
  		int low = sysctl_local_port_range[0];
  		int high = sysctl_local_port_range[1];
- 		int remaining = (high - low) + 1;
+		int range = high - low;
+ 		int i;
+		int port;
+		static u32 hint;
+		u32 offset = hint + connect_port_offset(sk);
 		struct hlist_node *node;
  		struct tcp_tw_bucket *tw = NULL;
 
  		local_bh_disable();
-
- 		/* TODO. Actually it is not so bad idea to remove
- 		 * tcp_portalloc_lock before next submission to Linus.
- 		 * As soon as we touch this place at all it is time to think.
- 		 *
- 		 * Now it protects single _advisory_ variable tcp_port_rover,
- 		 * hence it is mostly useless.
- 		 * Code will work nicely if we just delete it, but
- 		 * I am afraid in contented case it will work not better or
- 		 * even worse: another cpu just will hit the same bucket
- 		 * and spin there.
- 		 * So some cpu salt could remove both contention and
- 		 * memory pingpong. Any ideas how to do this in a nice way?
- 		 */
- 		spin_lock(&tcp_portalloc_lock);
- 		rover = tcp_port_rover;
-
- 		do {
- 			rover++;
- 			if ((rover < low) || (rover > high))
- 				rover = low;
- 			head = &tcp_bhash[tcp_bhashfn(rover)];
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &tcp_bhash[tcp_bhashfn(port)];
  			spin_lock(&head->lock);
 
  			/* Does not bother with rcv_saddr checks,
@@ -684,19 +676,19 @@
  			 * unique enough.
  			 */
 			tb_for_each(tb, node, &head->chain) {
- 				if (tb->port == rover) {
+ 				if (tb->port == port) {
  					BUG_TRAP(!hlist_empty(&tb->owners));
  					if (tb->fastreuse >= 0)
  						goto next_port;
  					if (!__tcp_v4_check_established(sk,
-									rover,
+									port,
 									&tw))
  						goto ok;
  					goto next_port;
  				}
  			}
 
- 			tb = tcp_bucket_create(head, rover);
+ 			tb = tcp_bucket_create(head, port);
  			if (!tb) {
  				spin_unlock(&head->lock);
  				break;
@@ -706,22 +698,18 @@
 
  		next_port:
  			spin_unlock(&head->lock);
- 		} while (--remaining > 0);
- 		tcp_port_rover = rover;
- 		spin_unlock(&tcp_portalloc_lock);
-
+ 		}
  		local_bh_enable();
 
  		return -EADDRNOTAVAIL;
 
 ok:
- 		/* All locks still held and bhs disabled */
- 		tcp_port_rover = rover;
- 		spin_unlock(&tcp_portalloc_lock);
+		hint += i;
 
- 		tcp_bind_hash(sk, tb, rover);
+ 		/* Head lock still held and bh's disabled */
+ 		tcp_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(rover);
+ 			inet_sk(sk)->sport = htons(port);
  			__tcp_v4_hash(sk, 0);
  		}
  		spin_unlock(&head->lock);

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomistion (rev 3)
  2004-12-11  1:09                                   ` [PATCH] tcp: efficient port randomistion " Stephen Hemminger
@ 2004-12-20 23:39                                     ` David S. Miller
  2005-06-22  9:17                                       ` Michael Vittrup Larsen
  0 siblings, 1 reply; 21+ messages in thread
From: David S. Miller @ 2004-12-20 23:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: michael.vittrup.larsen, netdev

On Fri, 10 Dec 2004 17:09:00 -0800
Stephen Hemminger <shemminger@osdl.org> wrote:

> okay, here is the revised version. Testing shows that it
> is more consistent, and just as fast as existing code,
> probably because of getting rid of portalloc_lock and
> better distribution.
> 
> Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

Queued up for 2.6.11, thanks Stephen.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomistion (rev 3)
  2004-12-20 23:39                                     ` David S. Miller
@ 2005-06-22  9:17                                       ` Michael Vittrup Larsen
  2005-06-22 16:44                                         ` Stephen Hemminger
  0 siblings, 1 reply; 21+ messages in thread
From: Michael Vittrup Larsen @ 2005-06-22  9:17 UTC (permalink / raw)
  To: David S. Miller; +Cc: Stephen Hemminger, netdev

On Tuesday 21 December 2004 00:39, David S. Miller wrote:
> On Fri, 10 Dec 2004 17:09:00 -0800
>
> Stephen Hemminger <shemminger@osdl.org> wrote:
> > okay, here is the revised version. Testing shows that it
> > is more consistent, and just as fast as existing code,
> > probably because of getting rid of portalloc_lock and
> > better distribution.
> >
> > Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
>
> Queued up for 2.6.11, thanks Stephen.

What's the status of this - I see it is not part of 2.6.12?

Is there a general dislike of the port randomisation mechanism or?

/Michael

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] tcp: efficient port randomistion (rev 3)
  2005-06-22  9:17                                       ` Michael Vittrup Larsen
@ 2005-06-22 16:44                                         ` Stephen Hemminger
  0 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2005-06-22 16:44 UTC (permalink / raw)
  To: Michael Vittrup Larsen; +Cc: David S. Miller, netdev

On Wed, 22 Jun 2005 11:17:03 +0200
Michael Vittrup Larsen <michael.vittrup.larsen@ericsson.com> wrote:

> On Tuesday 21 December 2004 00:39, David S. Miller wrote:
> > On Fri, 10 Dec 2004 17:09:00 -0800
> >
> > Stephen Hemminger <shemminger@osdl.org> wrote:
> > > okay, here is the revised version. Testing shows that it
> > > is more consistent, and just as fast as existing code,
> > > probably because of getting rid of portalloc_lock and
> > > better distribution.
> > >
> > > Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
> >
> > Queued up for 2.6.11, thanks Stephen.
> 
> What's the status of this - I see it is not part of 2.6.12?
> 
> Is there a general dislike of the port randomisation mechanism or?

There is port randomization in 2.6.11 and 2.6.12, look for
secure_tcp_port_ephemeral in the source. 2.6.12 also does random port
allocation for IPV6.

We still do the non-random stuff for explicit binds (tcp_v4_get_port),
but there is no state to seed in that case and it only impacts apps 
that do an explicit bind to 0.

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2005-06-22 16:44 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20041027092531.78fe438c@guest-251-240.pdx.osdl.net>
     [not found] ` <200410291048.01955.michael.vittrup.larsen@ericsson.com>
2004-10-29 17:28   ` [PATCH] tcp: efficient port randomisation Stephen Hemminger
2004-11-01  9:58     ` Michael Vittrup Larsen
2004-11-01 17:20       ` Stephen Hemminger
2004-11-02  7:54         ` Michael Vittrup Larsen
2004-11-04 18:01           ` Stephen Hemminger
2004-11-05 10:03             ` Michael Vittrup Larsen
2004-11-17 23:30               ` [PATCH] tcp: efficient port randomisation (revised) Stephen Hemminger
2004-11-19  7:38                 ` Michael Vittrup Larsen
2004-12-01  5:46                 ` David S. Miller
     [not found]                   ` <20041201152446.3a0d5ce3@dxpl.pdx.osdl.net>
     [not found]                     ` <20041201204622.7b760400.davem@davemloft.net>
2004-12-02 21:49                       ` Stephen Hemminger
2004-12-02 21:52                         ` David S. Miller
2004-12-02 22:51                           ` Stephen Hemminger
2004-12-02 23:01                           ` Stephen Hemminger
2004-12-04  5:42                           ` Stephen Hemminger
2004-12-06  8:18                             ` Michael Vittrup Larsen
2004-12-06 17:42                               ` [PATCH] tcp: efficient port randomisation (rev 3) Stephen Hemminger
2004-12-09  7:55                                 ` David S. Miller
2004-12-11  1:09                                   ` [PATCH] tcp: efficient port randomistion " Stephen Hemminger
2004-12-20 23:39                                     ` David S. Miller
2005-06-22  9:17                                       ` Michael Vittrup Larsen
2005-06-22 16:44                                         ` Stephen Hemminger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).