Files, sockets, and closing

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Files, sockets, and closing
@ 2007-10-26 21:03 Stephen Hemminger
  2007-10-26 21:45 ` Al Viro
  2007-10-27 18:03 ` Andi Kleen
  0 siblings, 2 replies; 5+ messages in thread
From: Stephen Hemminger @ 2007-10-26 21:03 UTC (permalink / raw)
  To: David S. Miller, Al Viro; +Cc: netdev

Looking at this bug:
http://bugzilla.kernel.org/show_bug.cgi?id=9149

Exposes some rather deep issues in the filesystem/socket/inet/tcp
layering. It seems that sys_close() zaps the file table entry, but
since each thread has a separate reference, the actual tcp_close()
doesn't happen until the last thread calls close/exits.

I am no VFS expert.
The semantically correct fix appears to be complex.
  * add a flush handle to the socket_file_ops
  * propagate the flush through socket to inet and tcp
  * split tcp_close into two parts.
     1) tcp_flush - flush buffers and wakeup all other threads on the socket 
     2) tcp_release - release last reference and cleanup.

Bogus patch for explanation purposes:

--- a/include/linux/net.h	2007-10-26 12:44:41.000000000 -0700
+++ b/include/linux/net.h	2007-10-26 13:56:39.000000000 -0700
@@ -128,6 +128,7 @@ struct proto_ops {
 	int		family;
 	struct module	*owner;
 	int		(*release)   (struct socket *sock);
+	void		(*flush)     (struct socket *sock);
 	int		(*bind)	     (struct socket *sock,
 				      struct sockaddr *myaddr,
 				      int sockaddr_len);
--- a/include/net/tcp.h	2007-10-26 12:44:42.000000000 -0700
+++ b/include/net/tcp.h	2007-10-26 13:06:53.000000000 -0700
@@ -367,6 +367,7 @@ extern void			tcp_enter_loss(struct sock
 extern void			tcp_clear_retrans(struct tcp_sock *tp);
 extern void			tcp_update_metrics(struct sock *sk);
 
+extern void			tcp_flush(struct sock *sk);
 extern void			tcp_close(struct sock *sk, 
 					  long timeout);
 extern unsigned int		tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait);
--- a/net/ipv4/af_inet.c	2007-10-26 12:44:46.000000000 -0700
+++ b/net/ipv4/af_inet.c	2007-10-26 13:56:00.000000000 -0700
@@ -386,6 +386,14 @@ out_rcu_unlock:
 	goto out;
 }
 
+/* The file handle is closed, but not all threads maybe gone */
+void inet_flush(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk && sk->sk_prot->flush)
+		sk->sk_prot->flush(sk);
+}
 
 /*
  *	The peer socket should always be NULL (or else). When we call this
@@ -822,6 +830,7 @@ int inet_ioctl(struct socket *sock, unsi
 const struct proto_ops inet_stream_ops = {
 	.family		   = PF_INET,
 	.owner		   = THIS_MODULE,
+	.flush		   = inet_flush,
 	.release	   = inet_release,
 	.bind		   = inet_bind,
 	.connect	   = inet_stream_connect,
--- a/net/ipv4/tcp.c	2007-10-26 12:44:46.000000000 -0700
+++ b/net/ipv4/tcp.c	2007-10-26 14:00:42.000000000 -0700
@@ -1557,11 +1557,10 @@ void tcp_shutdown(struct sock *sk, int h
 	}
 }
 
-void tcp_close(struct sock *sk, long timeout)
+void tcp_flush(struct sock *sk)
 {
 	struct sk_buff *skb;
 	int data_was_unread = 0;
-	int state;
 
 	lock_sock(sk);
 	sk->sk_shutdown = SHUTDOWN_MASK;
@@ -1572,7 +1571,7 @@ void tcp_close(struct sock *sk, long tim
 		/* Special case. */
 		inet_csk_listen_stop(sk);
 
-		goto adjudge_to_death;
+		return;
 	}
 
 	/*  We need to flush the recv. buffs.  We do this only on the
@@ -1632,10 +1631,16 @@ void tcp_close(struct sock *sk, long tim
 		 */
 		tcp_send_fin(sk);
 	}
+	release_sock(sk);
+}
 
+void tcp_close(struct sock *sk, long timeout)
+{
+	int state;
+
+	lock_sock(sk);
 	sk_stream_wait_close(sk, timeout);
 
-adjudge_to_death:
 	state = sk->sk_state;
 	sock_hold(sk);
 	sock_orphan(sk);
@@ -2524,6 +2529,7 @@ void __init tcp_init(void)
 	tcp_register_congestion_control(&tcp_reno);
 }
 
+EXPORT_SYMBOL(tcp_flush);
 EXPORT_SYMBOL(tcp_close);
 EXPORT_SYMBOL(tcp_disconnect);
 EXPORT_SYMBOL(tcp_getsockopt);
--- a/net/ipv4/tcp_ipv4.c	2007-10-26 12:44:46.000000000 -0700
+++ b/net/ipv4/tcp_ipv4.c	2007-10-26 13:06:04.000000000 -0700
@@ -2420,6 +2420,7 @@ void tcp4_proc_exit(void)
 struct proto tcp_prot = {
 	.name			= "TCP",
 	.owner			= THIS_MODULE,
+	.flush			= tcp_flush,
 	.close			= tcp_close,
 	.connect		= tcp_v4_connect,
 	.disconnect		= tcp_disconnect,
--- a/net/ipv6/tcp_ipv6.c	2007-10-26 12:44:47.000000000 -0700
+++ b/net/ipv6/tcp_ipv6.c	2007-10-26 13:06:25.000000000 -0700
@@ -2109,6 +2109,7 @@ void tcp6_proc_exit(void)
 struct proto tcpv6_prot = {
 	.name			= "TCPv6",
 	.owner			= THIS_MODULE,
+	.flush			= tcp_flush,
 	.close			= tcp_close,
 	.connect		= tcp_v6_connect,
 	.disconnect		= tcp_disconnect,
--- a/net/socket.c	2007-10-26 12:44:48.000000000 -0700
+++ b/net/socket.c	2007-10-26 13:47:48.000000000 -0700
@@ -100,7 +100,7 @@ static ssize_t sock_aio_read(struct kioc
 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			  unsigned long nr_segs, loff_t pos);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
-
+static int sock_flush(struct file *file, fl_owner_t id);
 static int sock_close(struct inode *inode, struct file *file);
 static unsigned int sock_poll(struct file *file,
 			      struct poll_table_struct *wait);
@@ -130,6 +130,7 @@ static const struct file_operations sock
 #endif
 	.mmap =		sock_mmap,
 	.open =		sock_no_open,	/* special open code to disallow open via /proc */
+	.flush =	sock_flush,
 	.release =	sock_close,
 	.fasync =	sock_fasync,
 	.sendpage =	sock_sendpage,
@@ -961,6 +962,15 @@ static int sock_mmap(struct file *file, 
 	return sock->ops->mmap(file, sock, vma);
 }
 
+static int sock_flush(struct file *file, fl_owner_t id)
+{
+	struct socket *sock = file->private_data;
+
+	if (sock->ops && sock->ops->flush)
+		sock->ops->flush(sock);
+	return 0;
+}
+
 static int sock_close(struct inode *inode, struct file *filp)
 {
 	/*
--- a/include/net/sock.h	2007-10-26 12:44:42.000000000 -0700
+++ b/include/net/sock.h	2007-10-26 13:50:03.000000000 -0700
@@ -515,6 +515,7 @@ struct timewait_sock_ops;
 struct proto {
 	void			(*close)(struct sock *sk, 
 					long timeout);
+	void			(*flush)(struct sock *sk);
 	int			(*connect)(struct sock *sk,
 				        struct sockaddr *uaddr, 
 					int addr_len);

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Files, sockets, and closing
  2007-10-26 21:03 Files, sockets, and closing Stephen Hemminger
@ 2007-10-26 21:45 ` Al Viro
  2007-10-26 22:09   ` Stephen Hemminger
  2007-10-27 18:03 ` Andi Kleen
  1 sibling, 1 reply; 5+ messages in thread
From: Al Viro @ 2007-10-26 21:45 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Fri, Oct 26, 2007 at 02:03:19PM -0700, Stephen Hemminger wrote:
> Looking at this bug:
> http://bugzilla.kernel.org/show_bug.cgi?id=9149
> 
> Exposes some rather deep issues in the filesystem/socket/inet/tcp
> layering. It seems that sys_close() zaps the file table entry, but
> since each thread has a separate reference, the actual tcp_close()
> doesn't happen until the last thread calls close/exits.

No.  It's not about that at all.  Threads in his case _share_ descriptor
table and the thing he's complaining about is that another thread has
removed the descriptor from their (shared) descriptor table and he's
not getting notified.  It's not about struct file (or socket) at all;
it's all on descriptor level.

The reference to struct file is held by accept() itself, _not_ by descriptor
table.  And he would have the same problem if the opened socket had been
inherited from parent (and still opened by it) - it's really not about
the damn thing getting shut down, etc.

What happens is that there is a mapping from descriptors to opened files,
a reference to opened file is obtained by it once per syscall and that file
remains open at least until the end of syscall.   Whether the descriptor
you've passed remains refering to the same file is up to userland code.
If you have another thread and that thread rips the descriptor out of your
shared descriptor table, it's your responsibility to keep them sane and
happy.

close() from another thread is not a way to abort blocked accept().  Never
promised to be that.  Just as close() from another thread is not a way to
abort blocked write() or read() or sendmsg() or...

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Files, sockets, and closing
  2007-10-26 21:45 ` Al Viro
@ 2007-10-26 22:09   ` Stephen Hemminger
  2007-10-26 22:46     ` Al Viro
  0 siblings, 1 reply; 5+ messages in thread
From: Stephen Hemminger @ 2007-10-26 22:09 UTC (permalink / raw)
  To: Al Viro; +Cc: David S. Miller, netdev

On Fri, 26 Oct 2007 22:45:13 +0100
Al Viro <viro@ftp.linux.org.uk> wrote:

> On Fri, Oct 26, 2007 at 02:03:19PM -0700, Stephen Hemminger wrote:
> > Looking at this bug:
> > http://bugzilla.kernel.org/show_bug.cgi?id=9149
> > 
> > Exposes some rather deep issues in the filesystem/socket/inet/tcp
> > layering. It seems that sys_close() zaps the file table entry, but
> > since each thread has a separate reference, the actual tcp_close()
> > doesn't happen until the last thread calls close/exits.
> 
> No.  It's not about that at all.  Threads in his case _share_ descriptor
> table and the thing he's complaining about is that another thread has
> removed the descriptor from their (shared) descriptor table and he's
> not getting notified.  It's not about struct file (or socket) at all;
> it's all on descriptor level.
> 
> The reference to struct file is held by accept() itself, _not_ by descriptor
> table.  And he would have the same problem if the opened socket had been
> inherited from parent (and still opened by it) - it's really not about
> the damn thing getting shut down, etc.
> 
> What happens is that there is a mapping from descriptors to opened files,
> a reference to opened file is obtained by it once per syscall and that file
> remains open at least until the end of syscall.   Whether the descriptor
> you've passed remains refering to the same file is up to userland code.
> If you have another thread and that thread rips the descriptor out of your
> shared descriptor table, it's your responsibility to keep them sane and
> happy.
> 
> close() from another thread is not a way to abort blocked accept().  Never
> promised to be that.  Just as close() from another thread is not a way to
> abort blocked write() or read() or sendmsg() or...

The problem is the Linux interpretation conflicts with the expectation
of applications that run on other Unix systems.  Most likely, it is
one of those corner cases not covered by SUS or Posix specs otherwise
it would have come up earlier. The existing Linux behavior works fine
it just isn't expected (or well documented).

I'm fine with just closing the bug (which is what I did initially), but
where should this get documented?

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Files, sockets, and closing
  2007-10-26 22:09   ` Stephen Hemminger
@ 2007-10-26 22:46     ` Al Viro
  0 siblings, 0 replies; 5+ messages in thread
From: Al Viro @ 2007-10-26 22:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev

On Fri, Oct 26, 2007 at 03:09:01PM -0700, Stephen Hemminger wrote:

> > close() from another thread is not a way to abort blocked accept().  Never
> > promised to be that.  Just as close() from another thread is not a way to
> > abort blocked write() or read() or sendmsg() or...
> 
> The problem is the Linux interpretation conflicts with the expectation
> of applications that run on other Unix systems.  Most likely, it is
> one of those corner cases not covered by SUS or Posix specs otherwise
> it would have come up earlier. The existing Linux behavior works fine
> it just isn't expected (or well documented).
> 
> I'm fine with just closing the bug (which is what I did initially), but
> where should this get documented?

close(2), perhaps?  "System call on opened file holds a reference to
opened file regardless of what happens to descriptor originally passed
to it" or something to the same effect...

That's what really happens - you get the same effect as if there had been
an additional temporary opened descriptor for that sucker.  And really,
multithreaded application that has one thread rip descriptors from under
another should be damn careful on _any_ system.  Anything that goes
"I've got -EBADF, guess another thread had removed that descriptor,
got to recover" is insane - in effect, it calls accept() blindly and
hopes that race will play out nicely, without hitting
	* thread A calls accept(3)
	* thread B calls close()
	* thread B calls e.g. dup() for unrelated reason and gets the same
descriptor reused
	* thread A finally gets from libc to accept(2), sees no EBADF and
proceeds with accept() on completely unrelated socket, with no indication of
the problem (or returns giving you a bogus errno, depending on what the
hell that descriptor happens to be).

IOW, if you rely on -EBADF to deal with such (userland) races, you are
extremely likely to be screwed.  On Linux, on FreeBSD, on Solaris, whatever.
In very controlled circumstances you might get away with that, but it's
almost certainly a Very Bad Idea(tm).

The bottom line: if descriptor table is a shared resource in your
multithreaded program, treat it as such.  Kernel will survive having
descriptors closed in the middle of syscall just fine; your userland
code is a different story.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Files, sockets, and closing
  2007-10-26 21:03 Files, sockets, and closing Stephen Hemminger
  2007-10-26 21:45 ` Al Viro
@ 2007-10-27 18:03 ` Andi Kleen
  1 sibling, 0 replies; 5+ messages in thread
From: Andi Kleen @ 2007-10-27 18:03 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, Al Viro, netdev


This particular problem has come up many times over the years. So far
the standard strategy has been to ignore it. It's doubtful that the
complexity needed for a fix is worth it.

Workaround at user space level: call shutdown() first instead of close()

-Andi

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2007-10-27 18:03 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-26 21:03 Files, sockets, and closing Stephen Hemminger
2007-10-26 21:45 ` Al Viro
2007-10-26 22:09   ` Stephen Hemminger
2007-10-26 22:46     ` Al Viro
2007-10-27 18:03 ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).