All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 0/3][netns] fix and wipeout timewait sockets
@ 2007-09-24 13:29 Daniel Lezcano
  2007-09-24 13:29 ` [patch 1/3][netns] add a reference to the netns for timewait Daniel Lezcano
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Daniel Lezcano @ 2007-09-24 13:29 UTC (permalink / raw)
  To: ebiederm-aS9lmoZGLiVWk0Htik3J/w; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Denis Lunev spotted that using a reference to the network namespace
with the timewait sockets will be a waste of time because they
are pointless while we will remove the network stack at network
namespace exit.

The following patches do the following:
	- fix missing network namespace reference in timewait 
	socket
	- do some changes in timewait socket code to prepare 
	the next patch, especially split code taking a lock
	- do the effective timewait socket cleanup at network
	namespace exit.

The following code is a test program which creates 100 timewait
sockets.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/poll.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>

#include <unistd.h>

#define MAXCONN 100

int client(int *fds)
{
        int i;
        struct sockaddr_in addr;

        close(fds[1]);

        memset(&addr, 0, sizeof(addr));

        addr.sin_family = AF_INET;
        addr.sin_port =  htons(10000);
        addr.sin_addr.s_addr = inet_addr("127.0.0.1");

        if (read(fds[0], &i, sizeof(i)) == -1) {
                perror("read");
                return 1;
        }

        for (i = 0; i < MAXCONN; i++) {
                int fd = socket(PF_INET, SOCK_STREAM, 0);
                if (fd == -1) {
                        perror("socket");
                        return 1;
                }

                if (connect(fd, (const struct sockaddr *)&addr, sizeof(addr))) {
                        perror("connect");
                        return 1;
                }
        }

        return 0;
}

int server(int *fds)
{
        int i, fd, fdpoll[MAXCONN];
        struct sockaddr_in addr;
        socklen_t socklen = sizeof(addr);

        close(fds[0]);

        fd = socket(PF_INET, SOCK_STREAM, 0);
        if (fd == -1) {
                perror("socket");
                return 1;
        }

        memset(&addr, 0, sizeof(addr));

        addr.sin_family = AF_INET;
        addr.sin_port = htons(10000);
        addr.sin_addr.s_addr = inet_addr("127.0.0.1");

        if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr))) {
                perror("bind");
                return 1;
        }

        if (listen(fd, MAXCONN)) {
                perror("listen");
                return 1;
        }

        if (write(fds[1], &i, sizeof(i)) == -1) {
                perror("write");
                return 1;
        }

        for (i = 0; i < MAXCONN; i++) {
                int f = accept(fd, (struct sockaddr *)&addr, &socklen);
                if (f == -1) {
                        perror("accept");
                        return 1;
                }
                fdpoll[i] = f;
        }

        return 0;
}

int main(int argc, char *argv[])
{
        int fds[2];
        int pid;

        if (pipe(fds)) {
                perror("pipe");
                return 1;
        }

        pid = fork();
        if (pid == -1) {
                perror("fork");
                return 1;
        }

        if (!pid)
                return client(fds);
        else
                return server(fds);
}

-- 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [patch 1/3][netns] add a reference to the netns for timewait
  2007-09-24 13:29 [patch 0/3][netns] fix and wipeout timewait sockets Daniel Lezcano
@ 2007-09-24 13:29 ` Daniel Lezcano
  2007-09-24 13:29 ` [patch 2/3][netns] make timewait unhash lock free Daniel Lezcano
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Daniel Lezcano @ 2007-09-24 13:29 UTC (permalink / raw)
  To: ebiederm-aS9lmoZGLiVWk0Htik3J/w; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

[-- Attachment #1: net-fix-timewait-reference.patch --]
[-- Type: text/plain, Size: 1937 bytes --]

From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>

When a socket changes to a timewait socket, the network namespace
is not copied from the original socket.

Here we hold a usage reference, not the ref count on the network
namespace, so the network namespace will be freed either the usage
reference is not 0. The network namespace cleanup function will 
fail if there is any usage of it. In this case, we should ensure
there is no usage of the network namespace.

Signed-off-by: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
---
 include/net/inet_timewait_sock.h |    2 ++
 net/ipv4/inet_timewait_sock.c    |    1 +
 2 files changed, 3 insertions(+)

Index: linux-2.6-netns/include/net/inet_timewait_sock.h
===================================================================
--- linux-2.6-netns.orig/include/net/inet_timewait_sock.h
+++ linux-2.6-netns/include/net/inet_timewait_sock.h
@@ -197,12 +197,14 @@ static inline void inet_twsk_put(struct 
 {
 	if (atomic_dec_and_test(&tw->tw_refcnt)) {
 		struct module *owner = tw->tw_prot->owner;
+		struct net *net = tw->tw_net;
 		twsk_destructor((struct sock *)tw);
 #ifdef SOCK_REFCNT_DEBUG
 		printk(KERN_DEBUG "%s timewait_sock %p released\n",
 		       tw->tw_prot->name, tw);
 #endif
 		kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
+		release_net(net);
 		module_put(owner);
 	}
 }
Index: linux-2.6-netns/net/ipv4/inet_timewait_sock.c
===================================================================
--- linux-2.6-netns.orig/net/ipv4/inet_timewait_sock.c
+++ linux-2.6-netns/net/ipv4/inet_timewait_sock.c
@@ -108,6 +108,7 @@ struct inet_timewait_sock *inet_twsk_all
 		tw->tw_hash	    = sk->sk_hash;
 		tw->tw_ipv6only	    = 0;
 		tw->tw_prot	    = sk->sk_prot_creator;
+		tw->tw_net          = hold_net(sk->sk_net);
 		atomic_set(&tw->tw_refcnt, 1);
 		inet_twsk_dead_node_init(tw);
 		__module_get(tw->tw_prot->owner);

-- 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [patch 2/3][netns] make timewait unhash lock free
  2007-09-24 13:29 [patch 0/3][netns] fix and wipeout timewait sockets Daniel Lezcano
  2007-09-24 13:29 ` [patch 1/3][netns] add a reference to the netns for timewait Daniel Lezcano
@ 2007-09-24 13:29 ` Daniel Lezcano
  2007-09-24 13:29 ` [patch 3/3][netns] remove timewait sockets at cleanup Daniel Lezcano
       [not found] ` <20070924132935.398625515-WECHFHqYCmGD/CxQmPlnQ0FT0OZdM7KVQQ4Iyu8u01E@public.gmane.org>
  3 siblings, 0 replies; 7+ messages in thread
From: Daniel Lezcano @ 2007-09-24 13:29 UTC (permalink / raw)
  To: ebiederm-aS9lmoZGLiVWk0Htik3J/w; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

[-- Attachment #1: net-make-inet-deschedule-lockless.patch --]
[-- Type: text/plain, Size: 3696 bytes --]

From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>

The network namespace cleanup will remove all timewait sockets
related to it because there are pointless. 

The problem is we need to browse the established hash table and 
for that we need to take the lock. For each timesocket we call 
inet_deschedule and this one take the established hash table lock
too.

The following patchset split the removing of the established hash
into two parts, one removing the node from the hash and another
taking the lock and calling the first one.

The network namespace cleanup can be done calling the lock free
function.

Signed-off-by: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
---
 include/net/inet_timewait_sock.h |   13 ++++++++++++
 net/ipv4/inet_timewait_sock.c    |   40 +++++++++++++++++++++++++++------------
 2 files changed, 41 insertions(+), 12 deletions(-)

Index: linux-2.6-netns/net/ipv4/inet_timewait_sock.c
===================================================================
--- linux-2.6-netns.orig/net/ipv4/inet_timewait_sock.c
+++ linux-2.6-netns/net/ipv4/inet_timewait_sock.c
@@ -13,25 +13,28 @@
 #include <net/inet_timewait_sock.h>
 #include <net/ip.h>
 
-/* Must be called with locally disabled BHs. */
-static void __inet_twsk_kill(struct inet_timewait_sock *tw,
-			     struct inet_hashinfo *hashinfo)
+static inline int inet_twsk_unehash(struct inet_timewait_sock *tw,
+				    struct inet_hashinfo *hashinfo)
 {
-	struct inet_bind_hashbucket *bhead;
-	struct inet_bind_bucket *tb;
-	/* Unlink from established hashes. */
-	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+	struct inet_ehash_bucket *ehead =
+		inet_ehash_bucket(hashinfo, tw->tw_hash);
 
 	write_lock(&ehead->lock);
-	if (hlist_unhashed(&tw->tw_node)) {
+	if (__inet_twsk_unehash(tw)) {
 		write_unlock(&ehead->lock);
-		return;
+		return 1;
 	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
 	write_unlock(&ehead->lock);
 
-	/* Disassociate with bind bucket. */
+	return 0;
+}
+
+void inet_twsk_unbhash(struct inet_timewait_sock *tw,
+		       struct inet_hashinfo *hashinfo)
+{
+	struct inet_bind_hashbucket *bhead;
+	struct inet_bind_bucket *tb;
+
 	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_net, tw->tw_num, hashinfo->bhash_size)];
 	spin_lock(&bhead->lock);
 	tb = tw->tw_tb;
@@ -39,6 +42,19 @@ static void __inet_twsk_kill(struct inet
 	tw->tw_tb = NULL;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
 	spin_unlock(&bhead->lock);
+}
+
+/* Must be called with locally disabled BHs. */
+static void __inet_twsk_kill(struct inet_timewait_sock *tw,
+			     struct inet_hashinfo *hashinfo)
+{
+	/* Unlink from established hashes. */
+	if (inet_twsk_unehash(tw, hashinfo))
+		return;
+
+	/* Disassociate with bind bucket. */
+	inet_twsk_unbhash(tw, hashinfo);
+
 #ifdef SOCK_REFCNT_DEBUG
 	if (atomic_read(&tw->tw_refcnt) != 1) {
 		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
Index: linux-2.6-netns/include/net/inet_timewait_sock.h
===================================================================
--- linux-2.6-netns.orig/include/net/inet_timewait_sock.h
+++ linux-2.6-netns/include/net/inet_timewait_sock.h
@@ -173,6 +173,19 @@ static inline int inet_twsk_del_dead_nod
 	return 0;
 }
 
+static inline int __inet_twsk_unehash(struct inet_timewait_sock *tw)
+{
+	if (hlist_unhashed(&tw->tw_node))
+		return 1;
+	__hlist_del(&tw->tw_node);
+	sk_node_init(&tw->tw_node);
+
+	return 0;
+}
+
+extern void inet_twsk_unbhash(struct inet_timewait_sock *tw,
+			      struct inet_hashinfo *hashinfo);
+
 #define inet_twsk_for_each(tw, node, head) \
 	hlist_for_each_entry(tw, node, head, tw_node)
 

-- 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [patch 3/3][netns] remove timewait sockets at cleanup
  2007-09-24 13:29 [patch 0/3][netns] fix and wipeout timewait sockets Daniel Lezcano
  2007-09-24 13:29 ` [patch 1/3][netns] add a reference to the netns for timewait Daniel Lezcano
  2007-09-24 13:29 ` [patch 2/3][netns] make timewait unhash lock free Daniel Lezcano
@ 2007-09-24 13:29 ` Daniel Lezcano
       [not found]   ` <20070924133315.075225268-WECHFHqYCmGD/CxQmPlnQ0FT0OZdM7KVQQ4Iyu8u01E@public.gmane.org>
       [not found] ` <20070924132935.398625515-WECHFHqYCmGD/CxQmPlnQ0FT0OZdM7KVQQ4Iyu8u01E@public.gmane.org>
  3 siblings, 1 reply; 7+ messages in thread
From: Daniel Lezcano @ 2007-09-24 13:29 UTC (permalink / raw)
  To: ebiederm-aS9lmoZGLiVWk0Htik3J/w; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

[-- Attachment #1: net-wipe-out-timewait-at-cleanup_net.patch --]
[-- Type: text/plain, Size: 3087 bytes --]

From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>

Denis Lunev spotted that if we take a reference to the network namespace
with the timewait sockets, we will need to wait for their expiration to
have the network namespace freed. This is a waste of time, the timewait
sockets are for avoiding to receive a duplicate packet from the network,
if the network namespace is freed, the network stack is removed, so no
chance to receive any packets from the outside world.

This patchset remove/destroy the timewait sockets when the
network namespace is freed.

Signed-off-by: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
---
 net/core/net_namespace.c |   55 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

Index: linux-2.6-netns/net/core/net_namespace.c
===================================================================
--- linux-2.6-netns.orig/net/core/net_namespace.c
+++ linux-2.6-netns/net/core/net_namespace.c
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/kallsyms.h>
 #include <net/net_namespace.h>
+#include <net/tcp.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -59,6 +60,57 @@ static int pernet_debug_max=-1;
 module_param(pernet_debug, int, S_IRUSR|S_IWUSR);
 module_param(pernet_debug_max, int, S_IRUSR|S_IWUSR);
 
+/*
+ * This function is called when the network namespace is freed.
+ * It allows to wipe out all timewait sockets. They are useless
+ * because the network namespace will be destroyed and the network
+ * stack with it, so no risks to have a duplicate packet coming
+ * from the outside world.
+ */
+static void clean_up_timewait(struct net *net)
+{
+	struct inet_timewait_sock *tw;
+	struct sock *sk;
+	struct hlist_node *node, *tmp;
+	int h;
+
+	/* Browse the the established hash table */
+	for (h = 0; h < (tcp_hashinfo.ehash_size); h++) {
+                struct inet_ehash_bucket *head =
+                        inet_ehash_bucket(&tcp_hashinfo, h);
+
+		/* Take the look and disable bh */
+ 		write_lock_bh(&head->lock);
+
+		sk_for_each_safe(sk, node, tmp, &head->twchain) {
+
+			tw = inet_twsk(sk);
+			if (tw->tw_net != net)
+				continue;
+
+			/* deschedule the timewait socket */
+			spin_lock(&tcp_death_row.death_lock);
+			if (inet_twsk_del_dead_node(tw)) {
+				inet_twsk_put(tw);
+				if (--tcp_death_row.tw_count == 0)
+					del_timer(&tcp_death_row.tw_timer);
+			}
+			spin_unlock(&tcp_death_row.death_lock);
+
+			/* remove it from the established hash table */
+			__inet_twsk_unehash(tw);
+
+			/* remove it from the bind hash table */
+			inet_twsk_unbhash(tw, tcp_death_row.hashinfo);
+
+			/* last put */
+			inet_twsk_put(tw);
+		}
+
+		write_unlock_bh(&head->lock);
+	}
+}
+
 static void cleanup_net(struct work_struct *work)
 {
 	struct pernet_operations *ops;
@@ -96,6 +148,9 @@ static void cleanup_net(struct work_stru
 
 	mutex_unlock(&net_mutex);
 
+	/* The timewait sockets are pointless */
+	clean_up_timewait(net);
+
 	/* Ensure there are no outstanding rcu callbacks using this
 	 * network namespace.
 	 */

-- 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 3/3][netns] remove timewait sockets at cleanup
       [not found]   ` <20070924133315.075225268-WECHFHqYCmGD/CxQmPlnQ0FT0OZdM7KVQQ4Iyu8u01E@public.gmane.org>
@ 2007-09-26 19:22     ` Eric W. Biederman
       [not found]       ` <m1r6klteod.fsf-T1Yj925okcoyDheHMi7gv2pdwda3JcWeAL8bYrjMMd8@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Eric W. Biederman @ 2007-09-26 19:22 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:

> From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
>
> Denis Lunev spotted that if we take a reference to the network namespace
> with the timewait sockets, we will need to wait for their expiration to
> have the network namespace freed. This is a waste of time, the timewait
> sockets are for avoiding to receive a duplicate packet from the network,
> if the network namespace is freed, the network stack is removed, so no
> chance to receive any packets from the outside world.
>
> This patchset remove/destroy the timewait sockets when the
> network namespace is freed.

This code is in the wrong place.  Please do the register_net_subsys
thing so we can keep the code in  net/ipv4/inet_timewait_sock.c

This code just need to be an exit method.

Eric

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 0/3][netns] fix and wipeout timewait sockets
       [not found] ` <20070924132935.398625515-WECHFHqYCmGD/CxQmPlnQ0FT0OZdM7KVQQ4Iyu8u01E@public.gmane.org>
@ 2007-09-26 19:24   ` Eric W. Biederman
  0 siblings, 0 replies; 7+ messages in thread
From: Eric W. Biederman @ 2007-09-26 19:24 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:

> Denis Lunev spotted that using a reference to the network namespace
> with the timewait sockets will be a waste of time because they
> are pointless while we will remove the network stack at network
> namespace exit.

The patches look close and look like they are in the right general
direction.

I haven't reviewed them closely yet, so I suspect there are
a few more nits but generally they look good.

Please don't let me forget about this issue.

Thanks,
Eric

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [patch 3/3][netns] remove timewait sockets at cleanup
       [not found]       ` <m1r6klteod.fsf-T1Yj925okcoyDheHMi7gv2pdwda3JcWeAL8bYrjMMd8@public.gmane.org>
@ 2007-09-27  8:36         ` Daniel Lezcano
  0 siblings, 0 replies; 7+ messages in thread
From: Daniel Lezcano @ 2007-09-27  8:36 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Eric W. Biederman wrote:
> Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org> writes:
> 
>> From: Daniel Lezcano <dlezcano-NmTC/0ZBporQT0dZR+AlfA@public.gmane.org>
>>
>> Denis Lunev spotted that if we take a reference to the network namespace
>> with the timewait sockets, we will need to wait for their expiration to
>> have the network namespace freed. This is a waste of time, the timewait
>> sockets are for avoiding to receive a duplicate packet from the network,
>> if the network namespace is freed, the network stack is removed, so no
>> chance to receive any packets from the outside world.
>>
>> This patchset remove/destroy the timewait sockets when the
>> network namespace is freed.
> 
> This code is in the wrong place.  Please do the register_net_subsys
> thing so we can keep the code in  net/ipv4/inet_timewait_sock.c
> 
> This code just need to be an exit method.

Thanks Eric, I will fix that.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2007-09-27  8:36 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-09-24 13:29 [patch 0/3][netns] fix and wipeout timewait sockets Daniel Lezcano
2007-09-24 13:29 ` [patch 1/3][netns] add a reference to the netns for timewait Daniel Lezcano
2007-09-24 13:29 ` [patch 2/3][netns] make timewait unhash lock free Daniel Lezcano
2007-09-24 13:29 ` [patch 3/3][netns] remove timewait sockets at cleanup Daniel Lezcano
     [not found]   ` <20070924133315.075225268-WECHFHqYCmGD/CxQmPlnQ0FT0OZdM7KVQQ4Iyu8u01E@public.gmane.org>
2007-09-26 19:22     ` Eric W. Biederman
     [not found]       ` <m1r6klteod.fsf-T1Yj925okcoyDheHMi7gv2pdwda3JcWeAL8bYrjMMd8@public.gmane.org>
2007-09-27  8:36         ` Daniel Lezcano
     [not found] ` <20070924132935.398625515-WECHFHqYCmGD/CxQmPlnQ0FT0OZdM7KVQQ4Iyu8u01E@public.gmane.org>
2007-09-26 19:24   ` [patch 0/3][netns] fix and wipeout timewait sockets Eric W. Biederman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.