[PATCH] alternative to sys_indirect, part 1

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] alternative to sys_indirect, part 1
@ 2008-04-24  4:03 Ulrich Drepper
  2008-04-24 10:25 ` Alan Cox
  2008-04-24 12:27 ` Michael Kerrisk
  0 siblings, 2 replies; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24  4:03 UTC (permalink / raw)
  To: linux-kernel, netdev; +Cc: akpm, torvalds

The alternative to using sys_indirect is to create a whole bunch of new
syscalls.  Here is the beginning.  These are the socket interfaces which
create file descriptors and therefore need a flags parameter to let
the caller decide about setting the close-on-exec bit.

There will be more new syscalls (e.g., a dup2 extension) but one
thing after the other.

There are a few noteworthy things about the patch:

- the new syscall names are chosen by adding the number of parameters
  of the new syscall to the name.  Not very witty but this is in the
  kernel itself only.  I might still do something different at
  userlevel.

- I decided against using the O_* flags here.  Most are not useful and
  we might need the bits for something else at some time.  Hence the
  new SOCKFL_* flag.  The intent is to define SOCKFL_CLOEXEC and
  O_CLOEXEC to the same value.  In this case there is zero overhead.

- for x86 I decided to extend the socketcall instead of starting to
  use syscalls.  It's just more consistent and this consistency can
  be of advantage at userlevel.


The following is a test program which checks the new functionality of
all three new syscalls and checks the old calls haven't changed.  The
patch is against the current git tree and changes for x86 and x86-64
are included.  This would be one of the advantages of the sys_indirect
approach: no work on part of the arch maintainers needed.


#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/syscall.h>

/* Per-architecture numbers for the three new calls.  The values mirror
   the definitions added by the accompanying kernel patch.  */
#ifdef __x86_64__
/* x86-64: each new call has its own syscall number.  */
#define __NR_socket4 288
#define __NR_accept4 289
#define __NR_socketpair5 290
/* Flag requesting close-on-exec on the created descriptor(s).  */
#define SOCKFL_CLOEXEC 02000000
#elif __i386__
/* i386: the new calls are sub-calls of the socketcall multiplexer.  */
#define SYS_SOCKET4     18
#define SYS_ACCEPT4     19
#define SYS_SOCKETPAIR5 20
/* Selects the __NR_socketcall code paths in main.  */
#define USE_SOCKETCALL 1
/* Flag requesting close-on-exec on the created descriptor(s).  */
#define SOCKFL_CLOEXEC 02000000
#else
/* NOTE(review): the message says "error numbers", but what is missing
   for other architectures is the syscall numbers above.  */
#error "define error numbers for this architecture"
#endif

/* TCP port on the loopback interface used for the accept() round;
   PORT + 1 is used for the accept4 round.  */
#define PORT 57392

/* Barrier used to sequence main and the helper thread tf; main
   initializes it for two participants.  */
static pthread_barrier_t b;

/* Open a TCP socket, try to connect it to the loopback address on
   PORT_NO and close it again.  Failures are not reported from here.  */
static void
tf_connect_once (int port_no)
{
  /* The designated initializer zero-fills the remaining members
     (including the sin_zero padding), so no indeterminate bytes are
     handed to connect.  */
  struct sockaddr_in sin = { .sin_family = AF_INET };
  sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
  sin.sin_port = htons (port_no);

  int s = socket (AF_INET, SOCK_STREAM, 0);
  if (s < 0)
    return;

  connect (s, (const struct sockaddr *) &sin, sizeof (sin));
  close (s);
}

/* Helper thread: acts as the client for the two accept tests in main.

   The thread waits on the barrier B three times, matching the three
   waits in main:
     1. before connecting to PORT (the accept test),
     2. after that connection has been closed,
     3. before connecting to PORT + 1 (the accept4 test).

   ARG is unused.  Always returns NULL.  */
static void *
tf (void *arg)
{
  (void) arg;

  pthread_barrier_wait (&b);
  tf_connect_once (PORT);
  pthread_barrier_wait (&b);

  pthread_barrier_wait (&b);
  tf_connect_once (PORT + 1);
  return NULL;
}

int
main (void)
{
  alarm (5);

  int status = 0;
  int s;
  int sp[2];

  s = socket (PF_UNIX, SOCK_STREAM, 0);

  if (s < 0)
    {
      puts ("socket failed");
      status = 1;
    }
  else
    {
      int fl = fcntl(s, F_GETFD);
      if ((fl & FD_CLOEXEC) != 0)
	{
	  puts ("socket did set CLOEXEC");
	  status = 1;
	}

      close (s);
    }

#if USE_SOCKETCALL
  s = syscall(__NR_socketcall, SYS_SOCKET4, PF_UNIX, SOCK_STREAM, 0, SOCKFL_CLOEXEC);
#else
  s = syscall(__NR_socket4, PF_UNIX, SOCK_STREAM, 0, SOCKFL_CLOEXEC);
#endif

  if (s < 0)
    {
      puts ("socket4 failed");
      status = 1;
    }
  else
    {
      int fl = fcntl(s, F_GETFD);
      if ((fl & FD_CLOEXEC) == 0)
	{
	  puts ("socket4 did not set CLOEXEC");
	  status = 1;
	}

      close (s);
    }

  if (socketpair (PF_UNIX, SOCK_STREAM, 0, sp) < 0)
    {
      puts ("socketpair failed");
      status = 1;
    }
  else
    {
      int fl1 = fcntl(sp[0], F_GETFD);
      int fl2 = fcntl(sp[1], F_GETFD);
      if ((fl1 & FD_CLOEXEC) != 0 || (fl2 & FD_CLOEXEC) != 0)
	{
	  puts ("socketpair did set CLOEXEC");
	  status = 1;
	}

      close (sp[0]);
      close (sp[1]);
    }

#if USE_SOCKETCALL
  s = syscall(__NR_socketcall, SYS_SOCKETPAIR5, PF_UNIX, SOCK_STREAM, 0, sp, SOCKFL_CLOEXEC);
#else
  s = syscall(__NR_socketpair5, PF_UNIX, SOCK_STREAM, 0, sp, SOCKFL_CLOEXEC);
#endif
  if (s < 0)
    {
      puts ("socketpair5 failed");
      status = 1;
    }
  else
    {
      int fl1 = fcntl(sp[0], F_GETFD);
      int fl2 = fcntl(sp[1], F_GETFD);
      if ((fl1 & FD_CLOEXEC) == 0 || (fl2 & FD_CLOEXEC) == 0)
	{
	  puts ("socketpair did not set CLOEXEC");
	  status = 1;
	}

      close (sp[0]);
      close (sp[1]);
    }

  pthread_barrier_init (&b, NULL, 2);
  
  pthread_t th;
  if (pthread_create (&th, NULL, tf, NULL) != 0)
    {
      puts ("pthread_create failed");
      status = 1;
    }
  else
    {
      int s = socket (AF_INET, SOCK_STREAM, 0);
      struct sockaddr_in sin;
      sin.sin_family = AF_INET;
      sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
      sin.sin_port = htons (PORT);
      bind (s, (struct sockaddr *) &sin, sizeof (sin));
      listen (s, SOMAXCONN);

      pthread_barrier_wait (&b);

      int s2 = accept (s, NULL, 0);
      if (s2 < 0)
	{
	  puts ("accept failed");
	  status = 1;
	}
      else
	{
	  int fl = fcntl(s2, F_GETFD);
	  if ((fl & FD_CLOEXEC) != 0)
	    {
	      puts ("accept did set CLOEXEC");
	      status = 1;
	    }

	  close (s2);
        }

      close (s);

      pthread_barrier_wait (&b);

      sin.sin_family = AF_INET;
      sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
      sin.sin_port = htons (PORT + 1);
      s = socket (AF_INET, SOCK_STREAM, 0);
      bind (s, (struct sockaddr *) &sin, sizeof (sin));
      listen (s, SOMAXCONN);

      pthread_barrier_wait (&b);

#if USE_SOCKETCALL
      s2 = syscall (__NR_socketcall, SYS_ACCEPT4, s, NULL, 0, SOCKFL_CLOEXEC);
#else
      s2 = syscall (__NR_accept4, s, NULL, 0, SOCKFL_CLOEXEC);
#endif
      if (s2 < 0)
	{
	  puts ("accept4 failed");
	  status = 1;
	}
      else
	{
	  int fl = fcntl(s2, F_GETFD);
	  if ((fl & FD_CLOEXEC) == 0)
	    {
	      puts ("accept4 did not set CLOEXEC");
	      status = 1;
	    }

	  close (s2);
        }

      close (s);
    }

  return status;
}


Signed-off-by: Ulrich Drepper <drepper@redhat.com>

 include/asm-x86/unistd_64.h |    6 ++
 include/linux/net.h         |    3 +
 include/linux/socket.h      |    8 ++
 include/linux/syscalls.h    |    3 +
 net/compat.c                |   22 +++++---
 net/socket.c                |  118 ++++++++++++++++++++++++++++++++++++--------
 6 files changed, 133 insertions(+), 27 deletions(-)


diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h
index fe26e36..0d4aed0 100644
--- a/include/asm-x86/unistd_64.h
+++ b/include/asm-x86/unistd_64.h
@@ -639,6 +639,12 @@ __SYSCALL(__NR_fallocate, sys_fallocate)
 __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
 #define __NR_timerfd_gettime			287
 __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
+#define __NR_socket4				288
+__SYSCALL(__NR_socket4, sys_socket4)
+#define __NR_accept4				289
+__SYSCALL(__NR_accept4, sys_accept4)
+#define __NR_socketpair5			290
+__SYSCALL(__NR_socketpair5, sys_socketpair5)
 
 
 #ifndef __NO_STUBS
diff --git a/include/linux/net.h b/include/linux/net.h
index 71f7dd5..5e4a774 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -46,6 +46,9 @@ struct net;
 #define SYS_GETSOCKOPT	15		/* sys_getsockopt(2)		*/
 #define SYS_SENDMSG	16		/* sys_sendmsg(2)		*/
 #define SYS_RECVMSG	17		/* sys_recvmsg(2)		*/
+#define SYS_SOCKET4	18		/* sys_socket4(2)		*/
+#define SYS_ACCEPT4	19		/* sys_accept4(2)		*/
+#define SYS_SOCKETPAIR5	20		/* sys_socketpair5(2)		*/
 
 typedef enum {
 	SS_FREE = 0,			/* not allocated		*/
diff --git a/include/linux/socket.h b/include/linux/socket.h
index bd2b30a..ceaf57f 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -32,6 +32,14 @@ extern void socket_seq_show(struct seq_file *seq);
 typedef unsigned short	sa_family_t;
 
 /*
+ * Flags for the socket functions.
+ */
+#ifndef SOCKFL_CLOEXEC
+#define SOCKFL_CLOEXEC	02000000	/* Created file descriptor(s) have
+					 * close-on-exec flag set.  */
+#endif 
+
+/*
  *	1003.1g requires sa_family_t and that sa_data is char.
  */
  
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 8df6d13..bb50a68 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -407,6 +407,7 @@ asmlinkage long sys_getsockopt(int fd, int level, int optname,
 asmlinkage long sys_bind(int, struct sockaddr __user *, int);
 asmlinkage long sys_connect(int, struct sockaddr __user *, int);
 asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *);
+asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
 asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *);
 asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
 asmlinkage long sys_send(int, void __user *, size_t, unsigned);
@@ -418,7 +419,9 @@ asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
 				struct sockaddr __user *, int __user *);
 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
 asmlinkage long sys_socket(int, int, int);
+asmlinkage long sys_socket4(int, int, int, int);
 asmlinkage long sys_socketpair(int, int, int, int __user *);
+asmlinkage long sys_socketpair5(int, int, int, int __user *, int);
 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
 asmlinkage long sys_listen(int, int);
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
diff --git a/net/compat.c b/net/compat.c
index 80013fb..6e71b68 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -523,9 +523,10 @@ asmlinkage long compat_sys_getsockopt(int fd, int level, int optname,
 }
 /* Argument list sizes for compat_sys_socketcall */
 #define AL(x) ((x) * sizeof(u32))
-static unsigned char nas[18]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
+static unsigned char nas[21]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 				AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
-				AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)};
+				AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
+				AL(4),AL(4),AL(5)};
 #undef AL
 
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
@@ -544,7 +545,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 	u32 a[6];
 	u32 a0, a1;
 
-	if (call < SYS_SOCKET || call > SYS_RECVMSG)
+	if (call < SYS_SOCKET || call > SYS_SOCKETPAIR5)
 		return -EINVAL;
 	if (copy_from_user(a, args, nas[call]))
 		return -EFAULT;
@@ -553,7 +554,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 
 	switch (call) {
 	case SYS_SOCKET:
-		ret = sys_socket(a0, a1, a[2]);
+		ret = sys_socket4(a0, a1, a[2], 0);
 		break;
 	case SYS_BIND:
 		ret = sys_bind(a0, compat_ptr(a1), a[2]);
@@ -565,7 +566,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 		ret = sys_listen(a0, a1);
 		break;
 	case SYS_ACCEPT:
-		ret = sys_accept(a0, compat_ptr(a1), compat_ptr(a[2]));
+		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
 		break;
 	case SYS_GETSOCKNAME:
 		ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
@@ -574,7 +575,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 		ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
 		break;
 	case SYS_SOCKETPAIR:
-		ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
+		ret = sys_socketpair5(a0, a1, a[2], compat_ptr(a[3]), 0);
 		break;
 	case SYS_SEND:
 		ret = sys_send(a0, compat_ptr(a1), a[2], a[3]);
@@ -605,6 +606,15 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 	case SYS_RECVMSG:
 		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
 		break;
+	case SYS_SOCKET4:
+		ret = sys_socket4(a0, a1, a[2], a[3]);
+		break;
+	case SYS_ACCEPT4:
+		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
+		break;
+	case SYS_SOCKETPAIR5:
+		ret = sys_socketpair5(a0, a1, a[2], compat_ptr(a[3]), a[4]);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/net/socket.c b/net/socket.c
index 9b5c917..339bf5a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -348,11 +348,11 @@ static struct dentry_operations sockfs_dentry_operations = {
  *	but we take care of internal coherence yet.
  */
 
-static int sock_alloc_fd(struct file **filep)
+static int sock_alloc_fd(struct file **filep, int flags)
 {
 	int fd;
 
-	fd = get_unused_fd();
+	fd = get_unused_fd_flags(flags);
 	if (likely(fd >= 0)) {
 		struct file *file = get_empty_filp();
 
@@ -395,10 +395,10 @@ static int sock_attach_fd(struct socket *sock, struct file *file)
 	return 0;
 }
 
-int sock_map_fd(struct socket *sock)
+static int sock_map_fd_flags(struct socket *sock, int flags)
 {
 	struct file *newfile;
-	int fd = sock_alloc_fd(&newfile);
+	int fd = sock_alloc_fd(&newfile, flags);
 
 	if (likely(fd >= 0)) {
 		int err = sock_attach_fd(sock, newfile);
@@ -413,10 +413,15 @@ int sock_map_fd(struct socket *sock)
 	return fd;
 }
 
+int sock_map_fd(struct socket *sock)
+{
+	return sock_map_fd_flags(sock, 0);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
 	if (file->f_op == &socket_file_ops)
-		return file->private_data;	/* set in sock_map_fd */
+		return file->private_data;	/* set in sock_map_fd_flags */
 
 	*err = -ENOTSOCK;
 	return NULL;
@@ -1213,16 +1218,30 @@ int sock_create_kern(int family, int type, int protocol, struct socket **res)
 	return __sock_create(&init_net, family, type, protocol, res, 1);
 }
 
-asmlinkage long sys_socket(int family, int type, int protocol)
+asmlinkage long sys_socket4(int family, int type, int protocol, int flags)
 {
 	int retval;
 	struct socket *sock;
+	int fflags;
+
+	if ((flags & ~SOCKFL_CLOEXEC) != 0)
+		return -EINVAL;
+
+	/*
+	 * Convert socket flags into appropriate file system flags.
+	 * The compiler should completely eliminate this code and
+	 * the fflags variable if no transformation is needed.
+	 */
+	if (SOCKFL_CLOEXEC == O_CLOEXEC)
+		fflags = flags;
+	else
+		fflags = (flags & SOCKFL_CLOEXEC) ? O_CLOEXEC : 0;
 
 	retval = sock_create(family, type, protocol, &sock);
 	if (retval < 0)
 		goto out;
 
-	retval = sock_map_fd(sock);
+	retval = sock_map_fd_flags(sock, fflags);
 	if (retval < 0)
 		goto out_release;
 
@@ -1235,16 +1254,35 @@ out_release:
 	return retval;
 }
 
+asmlinkage long sys_socket(int family, int type, int protocol)
+{
+	return sys_socket4(family, type, protocol, 0);
+}
+
 /*
  *	Create a pair of connected sockets.
  */
 
-asmlinkage long sys_socketpair(int family, int type, int protocol,
-			       int __user *usockvec)
+asmlinkage long sys_socketpair5(int family, int type, int protocol,
+			        int __user *usockvec, int flags)
 {
 	struct socket *sock1, *sock2;
 	int fd1, fd2, err;
 	struct file *newfile1, *newfile2;
+	int fflags;
+
+	if ((flags & ~SOCKFL_CLOEXEC) != 0)
+		return -EINVAL;
+
+	/*
+	 * Convert socket flags into appropriate file system flags.
+	 * The compiler should completely eliminate this code and
+	 * the fflags variable if no transformation is needed.
+	 */
+	if (SOCKFL_CLOEXEC == O_CLOEXEC)
+		fflags = flags;
+	else
+		fflags = (flags & SOCKFL_CLOEXEC) ? O_CLOEXEC : 0;
 
 	/*
 	 * Obtain the first socket and check if the underlying protocol
@@ -1263,13 +1301,13 @@ asmlinkage long sys_socketpair(int family, int type, int protocol,
 	if (err < 0)
 		goto out_release_both;
 
-	fd1 = sock_alloc_fd(&newfile1);
+	fd1 = sock_alloc_fd(&newfile1, fflags);
 	if (unlikely(fd1 < 0)) {
 		err = fd1;
 		goto out_release_both;
 	}
 
-	fd2 = sock_alloc_fd(&newfile2);
+	fd2 = sock_alloc_fd(&newfile2, fflags);
 	if (unlikely(fd2 < 0)) {
 		err = fd2;
 		put_filp(newfile1);
@@ -1330,6 +1368,12 @@ out_fd:
 	goto out;
 }
 
+asmlinkage long sys_socketpair(int family, int type, int protocol,
+			       int __user *usockvec)
+{
+	return sys_socketpair5(family, type, protocol, usockvec, 0);
+}
+
 /*
  *	Bind a name to a socket. Nothing much to do here since it's
  *	the protocol's responsibility to handle the local address.
@@ -1400,13 +1444,27 @@ asmlinkage long sys_listen(int fd, int backlog)
  *	clean when we restucture accept also.
  */
 
-asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
-			   int __user *upeer_addrlen)
+asmlinkage long sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
+			    int __user *upeer_addrlen, int flags)
 {
 	struct socket *sock, *newsock;
 	struct file *newfile;
 	int err, len, newfd, fput_needed;
 	char address[MAX_SOCK_ADDR];
+	int fflags;
+
+	if ((flags & ~SOCKFL_CLOEXEC) != 0)
+		return -EINVAL;
+
+	/*
+	 * Convert socket flags into appropriate file system flags.
+	 * The compiler should completely eliminate this code and
+	 * the fflags variable if no transformation is needed.
+	 */
+	if (SOCKFL_CLOEXEC == O_CLOEXEC)
+		fflags = flags;
+	else
+		fflags = (flags & SOCKFL_CLOEXEC) ? O_CLOEXEC : 0;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (!sock)
@@ -1425,7 +1483,7 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
 	 */
 	__module_get(newsock->ops->owner);
 
-	newfd = sock_alloc_fd(&newfile);
+	newfd = sock_alloc_fd(&newfile, fflags);
 	if (unlikely(newfd < 0)) {
 		err = newfd;
 		sock_release(newsock);
@@ -1478,6 +1536,12 @@ out_fd:
 	goto out_put;
 }
 
+asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
+			   int __user *upeer_addrlen)
+{
+	return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
+}
+
 /*
  *	Attempt to connect to a socket with the server address.  The address
  *	is in user space so we verify it is OK and move it to kernel space.
@@ -1988,10 +2052,11 @@ out:
 
 /* Argument list sizes for sys_socketcall */
 #define AL(x) ((x) * sizeof(unsigned long))
-static const unsigned char nargs[18]={
+static const unsigned char nargs[21]={
 	AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 	AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
-	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)
+	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
+	AL(4),AL(4),AL(5)
 };
 
 #undef AL
@@ -2010,7 +2075,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	unsigned long a0, a1;
 	int err;
 
-	if (call < 1 || call > SYS_RECVMSG)
+	if (call < 1 || call > SYS_SOCKETPAIR5)
 		return -EINVAL;
 
 	/* copy_from_user should be SMP safe. */
@@ -2026,7 +2091,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 
 	switch (call) {
 	case SYS_SOCKET:
-		err = sys_socket(a0, a1, a[2]);
+		err = sys_socket4(a0, a1, a[2], 0);
 		break;
 	case SYS_BIND:
 		err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
@@ -2039,8 +2104,8 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 		break;
 	case SYS_ACCEPT:
 		err =
-		    sys_accept(a0, (struct sockaddr __user *)a1,
-			       (int __user *)a[2]);
+		    sys_accept4(a0, (struct sockaddr __user *)a1,
+			        (int __user *)a[2], 0);
 		break;
 	case SYS_GETSOCKNAME:
 		err =
@@ -2053,7 +2118,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 				    (int __user *)a[2]);
 		break;
 	case SYS_SOCKETPAIR:
-		err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
+		err = sys_socketpair5(a0, a1, a[2], (int __user *)a[3], 0);
 		break;
 	case SYS_SEND:
 		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
@@ -2087,6 +2152,17 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	case SYS_RECVMSG:
 		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
+	case SYS_SOCKET4:
+		err = sys_socket4(a0, a1, a[2], a[3]);
+		break;
+	case SYS_ACCEPT4:
+		err =
+		    sys_accept4(a0, (struct sockaddr __user *)a1,
+			        (int __user *)a[2], a[3]);
+		break;
+	case SYS_SOCKETPAIR5:
+		err = sys_socketpair5(a0, a1, a[2], (int __user *)a[3], a[4]);
+		break;
 	default:
 		err = -EINVAL;
 		break;

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24  4:03 [PATCH] alternative to sys_indirect, part 1 Ulrich Drepper
@ 2008-04-24 10:25 ` Alan Cox
  2008-04-24 12:34   ` Michael Kerrisk
                     ` (2 more replies)
  2008-04-24 12:27 ` Michael Kerrisk
  1 sibling, 3 replies; 36+ messages in thread
From: Alan Cox @ 2008-04-24 10:25 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel, netdev, akpm, torvalds

> - I decided against using the O_* flags here.  Most are not useful and
>   we might need the bits for something else at some time.  Hence the
>   new SOCKFL_* flag.  The intend is to define SOCKFL_CLOEXEC and
>   O_CLOEXEC to the same value.  In this case there is zero overhead.

Given we will never have 2^32 socket types, and in a sense this is part
of the type why not just use

	socket(PF_INET, SOCK_STREAM|SOCK_CLOEXEC, ...)

that would be far far cleaner, no new syscalls on the socket side at all.

Alan

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24  4:03 [PATCH] alternative to sys_indirect, part 1 Ulrich Drepper
  2008-04-24 10:25 ` Alan Cox
@ 2008-04-24 12:27 ` Michael Kerrisk
  2008-04-24 12:46   ` David Collier-Brown
  1 sibling, 1 reply; 36+ messages in thread
From: Michael Kerrisk @ 2008-04-24 12:27 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel, netdev, akpm, torvalds

On 4/24/08, Ulrich Drepper <drepper@redhat.com> wrote:
> The alternative to using sys_indirect is to create a whole bunch of new
>  syscalls.  Here the beginning.  These are the socket interfaces which
>  create file descriptors and therefore need a flags parameter to let
>  the caller decide about setting the close-on-exit bit.

Ulrich,

Could you please CC me on patches that make kernel-userland API changes.

Cheers,

Michael

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 10:25 ` Alan Cox
@ 2008-04-24 12:34   ` Michael Kerrisk
       [not found]     ` <517f3f820804240534r3bbbdc52s52a6dfe3f2d14b7f-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2008-04-24 14:18   ` Ulrich Drepper
  2008-04-24 15:29   ` Linus Torvalds
  2 siblings, 1 reply; 36+ messages in thread
From: Michael Kerrisk @ 2008-04-24 12:34 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ulrich Drepper, linux-kernel, netdev, akpm, Linus Torvalds,
	Michael Kerrisk, michael.kerrisk, linux-man

On 4/24/08, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> > - I decided against using the O_* flags here.  Most are not useful and
>  >   we might need the bits for something else at some time.  Hence the
>  >   new SOCKFL_* flag.  The intend is to define SOCKFL_CLOEXEC and
>  >   O_CLOEXEC to the same value.  In this case there is zero overhead.
>
>
> Given we will never have 2^32 socket types, and in a sense this is part
>  of the type why not just use
>
>         socket(PF_INET, SOCK_STREAM|SOCK_CLOEXEC, ...)
>
>  that would be far far cleaner, no new syscalls on the socket side at all.

That's not quite true.  There is still the problem of accept().

It's worth trying to summarize all of the syscalls that create file
descriptors to get a handle on how many new syscalls might really be
required.  AFAIK, the list below is all of the syscalls that create
FDs on Linux.

The following system calls all have a flags argument that either
already has a O_CLOEXEC functionality, or to which that functionality
could be added:

* open()
* openat()
* fcntl(F_DUPFD)
* timerfd_create()
* mq_open() (on Linux MQ descriptors are really just file descriptors)

For the following system calls, we could overload another argument for
the purpose:
* socket() (using the 'type' argument, as per Alan's suggestion)

The following syscalls don't have a flags argument, but does it
matter?  For each of them there is an alternative API that can be used
instead, if the functionality is required.

* dup2() -- use fcntl(F_DUPFD) instead
* dup() -- use fcntl(F_DUPFD) instead
* creat() -- use open() instead

The following system call doesn't have a flags argument, but we could
conceivably overload the existing 'fd' argument.  When creating a new
file descriptor, the 'fd' argument must be -1.  We could say that to
create a new fd, the argument must be say NEW_SIGNALFD, defined as
-MAXINT, ORed with the desired flags.

* signalfd()  (glibc API supplies a flags argument, but the syscall
doesn't have one)

The following system calls don't have a flags argument, and the only
way to solve the problem is a new syscall, or sys_indirect().

* eventfd()  (glibc API supplies a flags argument, but the syscall
doesn't have one)
* accept()
* pipe()
* inotify_init()
* epoll_create()

So the alternative to sys_indirect(), at least for the purpose of
O_CLOEXEC and similar, would be to create 5 new system calls (or six,
if one finds the signalfd() hack too ugly, which perhaps it is; or 7
if one doesn't like Alan's suggestion for socket() -- if one went the
route of new syscalls, then I'd suggest creating a new socket()-type
syscall with a flags argument).

Cheers,

Michael


-- 
I'll likely only see replies if they are CCed to mtk.manpages at gmail dot com

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 12:27 ` Michael Kerrisk
@ 2008-04-24 12:46   ` David Collier-Brown
  0 siblings, 0 replies; 36+ messages in thread
From: David Collier-Brown @ 2008-04-24 12:46 UTC (permalink / raw)
  To: Michael Kerrisk; +Cc: Ulrich Drepper, linux-kernel, netdev, akpm, torvalds

  Just FYI, if you end up looking at having to make lots of changes to a 
(probably user-side) interface, I did an open-source version of the
"score" fast-porting tool, described at http://datacenterworks.com/stories/port.html

It makes fixing annoying API changes somewhat less evil (;-))

--dave (at work) c-b

Michael Kerrisk wrote:
> On 4/24/08, Ulrich Drepper <drepper@redhat.com> wrote:
> 
>>The alternative to using sys_indirect is to create a whole bunch of new
>> syscalls.  Here the beginning.  These are the socket interfaces which
>> create file descriptors and therefore need a flags parameter to let
>> the caller decide about setting the close-on-exit bit.
> 
> 
> Ulrich,
> 
> Could you please CC me on patches that make kernel-userland API changes.
> 
> Cheers,
> 
> Michael
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-- 
David Collier-Brown            | Always do right. This will gratify
Sun Microsystems, Toronto      | some people and astonish the rest
davecb@sun.com                 |                      -- Mark Twain
(905) 943-1983, cell: (647) 833-9377, (800) 555-9786 x56583
bridge: (877) 385-4099 code: 506 9191#

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 10:25 ` Alan Cox
  2008-04-24 12:34   ` Michael Kerrisk
@ 2008-04-24 14:18   ` Ulrich Drepper
  2008-04-24 14:24     ` Alan Cox
  2008-04-24 16:49     ` Evgeniy Polyakov
  2008-04-24 15:29   ` Linus Torvalds
  2 siblings, 2 replies; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 14:18 UTC (permalink / raw)
  To: Alan Cox; +Cc: linux-kernel, netdev, akpm, torvalds

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> Given we will never have 2^32 socket types, and in a sense this is part
> of the type why not just use
> 
> 	socket(PF_INET, SOCK_STREAM|SOCK_CLOEXEC, ...)
> 
> that would be far far cleaner, no new syscalls on the socket side at all.

You have a strange sense of "clean" I must say.

I don't think this is a viable approach because it is not about the
range.  People can and do select arbitrary values for those types.
Until a value is officially recognized and registered it is in fact best
to choose a (possibly large) random value to not conflict with anything
else.  Who can guarantee that whatever bit is chosen for SOCK_CLOEXEC
isn't already used by someone?

Add to this that it's not a complete solution (no such hack possible for
accept) and I think using a new interface is cleaner(tm).

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIEJbD2ijCOnn/RHQRAnUBAKDFxC7Xkl8Qlo5u7PS8XBx4WrNzRQCgm2Ic
mV6zeglZaTJMn3IuGv3tB60=
=06jC
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 14:18   ` Ulrich Drepper
@ 2008-04-24 14:24     ` Alan Cox
  2008-04-24 15:16       ` Ulrich Drepper
  2008-04-24 16:49     ` Evgeniy Polyakov
  1 sibling, 1 reply; 36+ messages in thread
From: Alan Cox @ 2008-04-24 14:24 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel, netdev, akpm, torvalds

> I don't think this is a viable approach because it is not about the
> range.  People can and do select arbitrary values for those types.
> Until a value is officially recognized and registered it is in fact best
> to choose a (possibly large) random value to not conflict with anything
> else.  Who can guarantee that whatever bit is chosen for SOCK_CLOEXEC
> isn't already used by someone?

There are only a small number of valid socket types recognized by POSIX
plus a few BSD plus a few Linux ones so Linux can happily assign the
upper bits for a different purpose.

> Add to this that it's not a complete solution (no such hack possible for
> accept) and I think using a new interface is cleaner(tm).

Every other property of a socket via accept() is inherited from the
parent. Making one property different would be bizarre and ugly.

Alan

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
       [not found]         ` <48109DFB.900-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2008-04-24 14:42           ` Alan Cox
  2008-04-24 15:19             ` Ulrich Drepper
  2008-04-24 15:05           ` Michael Kerrisk
  1 sibling, 1 reply; 36+ messages in thread
From: Alan Cox @ 2008-04-24 14:42 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Michael Kerrisk, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b, Linus Torvalds,
	michael.kerrisk-Re5JQEeQqe8AvxtiuMwx3w,
	linux-man-u79uwXL29TY76Z2rM5mHXA

> Somebody please make a call and then let's go on with life.  I don't
> care much either way anymore.  I do hope nobody thinks this is an issue
> which can be completely ignored (see, e.g., the bug I pointed to the
> other day).

It's very hard to do that without knowing what other Unixlike and POSIX OS
plan to do and if we'll end up with a third yet again different API to
support when it reaches a standards body.

We shouldn't be seeing this as a Linux problem, it's a BSD problem, an AIX
problem, a Solaris problem and an HPUX problem and really wants one
solution applying across distributions or it will harm everyone in the
long term.

Alan
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
       [not found]     ` <517f3f820804240534r3bbbdc52s52a6dfe3f2d14b7f-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2008-04-24 14:49       ` Ulrich Drepper
       [not found]         ` <48109DFB.900-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 14:49 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: Alan Cox, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b, Linus Torvalds,
	michael.kerrisk-Re5JQEeQqe8AvxtiuMwx3w,
	linux-man-u79uwXL29TY76Z2rM5mHXA

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Michael Kerrisk wrote:
> * dup2() -- use fcntl(F_DUPFD) instead

Wrong.  You cannot implement dup2 with fcntl since the latter won't use
a file descriptor which is already in use.

> So the alternative to sys_indirect(), at least for the purpose of
> O_CLOEXEC and similar, would be to create 5 new system calls (or six,
> if one finds the signalfd() hack too ugly, which perhaps it is; or 7
> if one doesn't like Alan's suggestion for socket()

Without changing the socket interfaces (plural, socketpair) there would
have to be 7 new syscalls, with changing socket* to an IMO cleaner
interface 9.

Or we just add sys_indirect (which is also usable for other syscall
extensions, not just the CLOEXEC stuff) and let userlevel (i.e., me)
worry about adding new interfaces to libc.  As you can see, for the more
recent interfaces like signalfd I have already added an additional
parameter so the number of interface changes would be reduced.

Somebody please make a call and then let's go on with life.  I don't
care much either way anymore.  I do hope nobody thinks this is an issue
which can be completely ignored (see, e.g., the bug I pointed to the
other day).

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIEJ372ijCOnn/RHQRAorwAJ4u78MYFrJbPqcqW1fPae2liLxxhwCfa7e7
r2vc7FnEpzyJmqEKU6aCd5E=
=EG3n
-----END PGP SIGNATURE-----
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:16       ` Ulrich Drepper
@ 2008-04-24 15:03         ` Alan Cox
  2008-04-24 15:44           ` Jakub Jelinek
                             ` (2 more replies)
  0 siblings, 3 replies; 36+ messages in thread
From: Alan Cox @ 2008-04-24 15:03 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel, netdev, akpm, torvalds

> You didn't read what I wrote.

The feeling is mutual

> For those the implementer must ensure that during the development no
> value is used which can conflict with any current and future assigned
> value and not with any other development.

Kernel socket type values are assigned by the kernel team so that
isn't a problem.

> > Every other property of a socket via accept() is inherited from the
> > parent. Making one property different would be bizarre and ugly.
> 
> Implementing this would visibly change existing code and it would
> actively violate POSIX.  Not a good idea.

POSIX has no interface for this new behaviour you propose so that is
complete crap. The moment you use one of these features you stepped
outside of the POSIX spec - and you know that. If there was an existing
standard we wouldn't have a problem.

Alan

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
       [not found]         ` <48109DFB.900-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2008-04-24 14:42           ` Alan Cox
@ 2008-04-24 15:05           ` Michael Kerrisk
  1 sibling, 0 replies; 36+ messages in thread
From: Michael Kerrisk @ 2008-04-24 15:05 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Michael Kerrisk, Alan Cox, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b, Linus Torvalds,
	michael.kerrisk-Re5JQEeQqe8AvxtiuMwx3w,
	linux-man-u79uwXL29TY76Z2rM5mHXA

On Thu, Apr 24, 2008 at 4:49 PM, Ulrich Drepper <drepper-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
>  Hash: SHA1
>
>
>  Michael Kerrisk wrote:
>  > * dup2() -- use fcntl(F_DUPFD) instead
>
>  Wrong.  You cannot implement dup2 with fcntl since the latter won't use
>  a file descriptor which is already in use.

True.  One could add a flag to fcntl() to provide that behavior.

>  > So the alternative to sys_indirect(), at least for the purpose of
>  > O_CLOEXEC and similar, would be to create 5 new system calls (or six,
>  > if one finds the signalfd() hack too ugly, which perhaps it is; or 7
>  > if one doesn't like Alan's suggestion for socket()
>
>  Without changing the socket interfaces (plural, socketpair) there would

Yes, I overlooked socketpair()...

>  have to be 7 new syscalls, with changing socket* to an IMO cleaner
>  interface 9.
>
>
>  Or we just add sys_indirect (which is also usable for other syscall
>  extensions, not just the CLOEXEC stuff) and let userlevel (i.e., me)
>  worry about adding new interfaces to libc.  As you can see, for the more
>  recent interfaces like signalfd I have already added an additional
>  parameter so the number of interface changes would be reduced.
>
>  Somebody please make a call and then let's go on with life.  I don't
>  care much either way anymore.  I do hope nobody thinks this is an issue
>  which can be completely ignored (see, e.g., the bug I pointed to the
>  other day).

Since I had to go search, here it is again
http://bugzilla.redhat.com/show_bug.cgi?id=443321
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 14:24     ` Alan Cox
@ 2008-04-24 15:16       ` Ulrich Drepper
  2008-04-24 15:03         ` Alan Cox
  0 siblings, 1 reply; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 15:16 UTC (permalink / raw)
  To: Alan Cox; +Cc: linux-kernel, netdev, akpm, torvalds

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> There are only a small number of valid socket types recognized by POSIX
> plus a few BSD plus a few Linux ones so Linux can happily assign the
> upper bits for a different purpose.

You didn't read what I wrote.  It's about the not-yet-assigned types.
For those the implementer must ensure that during the development no
value is used which can conflict with any current and future assigned
value and not with any other development.  Hence common practice is to
use a random value over the entire range.

I don't know about a case for socket but this is definitely how (sane)
development elsewhere works.

> Every other property of a socket via accept() is inherited from the
> parent. Making one property different would be bizarre and ugly.

Implementing this would visibly change existing code and it would
actively violate POSIX.  Not a good idea.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIEKRr2ijCOnn/RHQRAoltAJ98g8GHGSYceIJxyddjCRI6otoVagCfeTXC
TfgaalHo6XQEzehnST+unhk=
=T272
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 14:42           ` Alan Cox
@ 2008-04-24 15:19             ` Ulrich Drepper
  0 siblings, 0 replies; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 15:19 UTC (permalink / raw)
  To: Alan Cox
  Cc: Michael Kerrisk, linux-kernel, netdev, akpm, Linus Torvalds,
	michael.kerrisk, linux-man

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> We shouldn't be seeing this as a Linux problem, its a BSD problem, an AIX
> problem, a Solaris problem and an HPUX problem and really wants one
> solution applying across distributions or it will harm everyone in the
> long term.

Look at the other interfaces.  I've proposed them for POSIX and they are
going to be in the next revision (O_CLOEXEC, F_DUPFD_CLOEXEC).  I'm
confident that whatever we'll do will be endorsed this way.  It's just
too late for the next revision, it'd have to wait a few years until work
on the next revision (esp. reviews) starts.  That's too long.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIEKT/2ijCOnn/RHQRAkyoAJ9/FRwN3r6zg2cSvzVrA2TiGlfergCdG1r9
H/UghrO5A99ieG6O7wXvu7I=
=pUfi
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:44           ` Jakub Jelinek
@ 2008-04-24 15:24             ` Alan Cox
  2008-04-24 16:00               ` David Miller
  0 siblings, 1 reply; 36+ messages in thread
From: Alan Cox @ 2008-04-24 15:24 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Ulrich Drepper, linux-kernel, netdev, akpm, torvalds

> Doing:
> 
> int fd = socket (PF_INET, SOCK_STREAM, 0);
> fcntl (fd, F_SETFD, F_CLOEXEC);
> ...
> int fd2 = accept (fd, addr, addrlen);
> 
> certainly doesn't use any of the new interfaces, yet if accept inherits
> the CLOEXEC flag from the socket, would visibly change existing programs.

The example you give doesn't involve the new interfaces so is outside of
the proposed changes but it is true that a different accept behaviour
would be odd here. Also accept() has several really really irritatingly
undefined semantics already that cause BSD v Linux pain and suffering so
maybe accept() is one case that could benefit from a flag passing version
*and* the ability to pass FNDELAY to accept to do non blocking accepts on
an fd without flipping it to/from NDELAY itself.

BTW in 4.4BSD and derivatives if I remember rightly F_CLOEXEC *is*
inherited across accept() so I doubt any user space software will be too
upset by such a shift.

Alan

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:45           ` Ulrich Drepper
@ 2008-04-24 15:27             ` Alan Cox
  2008-04-24 16:04               ` Ulrich Drepper
  0 siblings, 1 reply; 36+ messages in thread
From: Alan Cox @ 2008-04-24 15:27 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel, netdev, akpm, torvalds

> Alan Cox wrote:
> > Kernel socket type values are assigned by the kernel team so that
> > isn't a problem.
> 
> Again, it's not about assigned values.  It about those not yet assigned.

Earth calling Ulrich, Earth calling Ulrich...

"Kernel socket type values are assigned by the kernel team so that isn't a
problem."

Believe it or not we have the compute capability between us to not
accidentally reassign values we assigned to one thing to something else.

> Oh really?  You open a server socket, use fcntl(FD_CLOEXEC), and then
> accept(). 

And your behaviour just became OS specific....

Alan

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 10:25 ` Alan Cox
  2008-04-24 12:34   ` Michael Kerrisk
  2008-04-24 14:18   ` Ulrich Drepper
@ 2008-04-24 15:29   ` Linus Torvalds
  2008-04-24 15:39     ` David Miller
  2 siblings, 1 reply; 36+ messages in thread
From: Linus Torvalds @ 2008-04-24 15:29 UTC (permalink / raw)
  To: Alan Cox; +Cc: Ulrich Drepper, linux-kernel, netdev, akpm



On Thu, 24 Apr 2008, Alan Cox wrote:
> 
> Given we will never have 2^32 socket types, and in a sense this is part
> of the type why not just use
> 
> 	socket(PF_INET, SOCK_STREAM|SOCK_CLOEXEC, ...)

Ok, I have to admit that I find this very appealing. It looks much 
cleaner, but perhaps more importantly, it also looks both readable _and_ 
easier to use for the user-space programmer.

		Linus

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 16:00               ` David Miller
@ 2008-04-24 15:38                 ` Alan Cox
  2008-04-24 16:09                   ` David Miller
  2008-04-24 16:45                   ` Michael Kerrisk
  0 siblings, 2 replies; 36+ messages in thread
From: Alan Cox @ 2008-04-24 15:38 UTC (permalink / raw)
  To: David Miller; +Cc: jakub, drepper, linux-kernel, netdev, akpm, torvalds

On Thu, 24 Apr 2008 09:00:08 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Alan Cox <alan@lxorguk.ukuu.org.uk>
> Date: Thu, 24 Apr 2008 16:24:44 +0100
> 
> > BTW in 4.4BSD and derivatives if I remember rightly F_CLOEXEC *is*
> > inherited across accept() so I doubt any user space software will be too
> > upset by such a shift.
> 
> It actually doesn't.
> 
> Just like in Linux, no file descriptor flags are inherited.

NDELAY certainly appears to be, looking at Stevens.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:29   ` Linus Torvalds
@ 2008-04-24 15:39     ` David Miller
  2008-04-24 16:03       ` Michael Kerrisk
  0 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2008-04-24 15:39 UTC (permalink / raw)
  To: torvalds; +Cc: alan, drepper, linux-kernel, netdev, akpm

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 24 Apr 2008 08:29:14 -0700 (PDT)

> 
> 
> On Thu, 24 Apr 2008, Alan Cox wrote:
> > 
> > Given we will never have 2^32 socket types, and in a sense this is part
> > of the type why not just use
> > 
> > 	socket(PF_INET, SOCK_STREAM|SOCK_CLOEXEC, ...)
> 
> Ok, I have to admit that I find this very appealing. It looks much 
> cleaner, but perhaps more importantly, it also looks both readable _and_ 
> easier to use for the user-space programmer.

Me too.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 16:03       ` Michael Kerrisk
@ 2008-04-24 15:42         ` Alan Cox
  2008-04-24 16:48           ` Michael Kerrisk
  2008-04-24 16:30         ` Linus Torvalds
  1 sibling, 1 reply; 36+ messages in thread
From: Alan Cox @ 2008-04-24 15:42 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: David Miller, torvalds, drepper, linux-kernel, netdev, akpm

> But this approach fixes just one of the interfaces.  There are 7 or 8
> other interfaces that need to solve the same problem.  What about
> those?

Actually it seems to fix most of them. I accept Jakub's observation we
need a "paccept()" or similar.

> It strikes me to be cleanest to use the same solution for all of them
> -- i.e., new syscalls (seems simplest) or sys_indirect() -- including
> socket().

New syscalls make the interface more complex and harder to learn. They
make it harder to tweak applications neatly to use the new API if
present. They are not immediately obvious from knowing the existing API.

What we don't want to do is to end up with a thousand weird system calls
as Windows NT did where nobody can actually understand chunks of code
without looking calls up in books as they go.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:03         ` Alan Cox
@ 2008-04-24 15:44           ` Jakub Jelinek
  2008-04-24 15:24             ` Alan Cox
  2008-04-24 15:45           ` Ulrich Drepper
  2008-04-24 16:06           ` Michael Kerrisk
  2 siblings, 1 reply; 36+ messages in thread
From: Jakub Jelinek @ 2008-04-24 15:44 UTC (permalink / raw)
  To: Alan Cox; +Cc: Ulrich Drepper, linux-kernel, netdev, akpm, torvalds

On Thu, Apr 24, 2008 at 04:03:52PM +0100, Alan Cox wrote:
> > > Every other property of a socket via accept() is inherited from the
> > > parent. Making one property different would be bizarre and ugly.
> > 
> > Implementing this would visibly change existing code and it would
> > actively violate POSIX.  Not a good idea.
> 
> POSIX has no interface for this new behaviour you propose so that is
> complete crap. The moment you use one of these features you stepped
> outside of the POSIX spec - and you know that. If there was an existing
> standard we wouldn't have a problem.

Doing:

int fd = socket (PF_INET, SOCK_STREAM, 0);
fcntl (fd, F_SETFD, F_CLOEXEC);
...
int fd2 = accept (fd, addr, addrlen);

certainly doesn't use any of the new interfaces, yet if accept inherits
the CLOEXEC flag from the socket, would visibly change existing programs.

	Jakub

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 16:04               ` Ulrich Drepper
@ 2008-04-24 15:45                 ` Alan Cox
  0 siblings, 0 replies; 36+ messages in thread
From: Alan Cox @ 2008-04-24 15:45 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: linux-kernel, netdev, akpm, torvalds

> Once again, this is not about assigned values.  This is about the time
> before you get a value assigned.  Not every experiment out there will
> have a value assigned before it starts development.

And no value used by a random experiment on the internet belongs in any
one elses code. When it hits the kernel main tree it becomes definitive
and will remain so. Until then it remains someones devel hack.

The same is true about syscall numbers so your argument on this is
slightly less than sound.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:03         ` Alan Cox
  2008-04-24 15:44           ` Jakub Jelinek
@ 2008-04-24 15:45           ` Ulrich Drepper
  2008-04-24 15:27             ` Alan Cox
  2008-04-24 16:06           ` Michael Kerrisk
  2 siblings, 1 reply; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 15:45 UTC (permalink / raw)
  To: Alan Cox; +Cc: linux-kernel, netdev, akpm, torvalds

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> Kernel socket type values are assigned by the kernel team so that
> isn't a problem.

Again, it's not about assigned values.  It's about those not yet assigned.

> POSIX has no interface for this new behaviour you propose so that is
> complete crap. The moment you use one of these features you stepped
> outside of the POSIX spec - and you know that. If there was an existing
> standard we wouldn't have a problem.

Oh really?  You open a server socket, use fcntl(FD_CLOEXEC), and then
accept().  This is identical to the new behavior and it is very well
defined in POSIX.  Hence, code which uses fcntl() this way today will
see a change with your proposal.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIEKsp2ijCOnn/RHQRAl7yAJ40IosF+DLjiFtmuOt/t9LShBt46ACfbVi6
BXwZE0fKUAh0Iqmrme3On6A=
=iDY5
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:24             ` Alan Cox
@ 2008-04-24 16:00               ` David Miller
  2008-04-24 15:38                 ` Alan Cox
  0 siblings, 1 reply; 36+ messages in thread
From: David Miller @ 2008-04-24 16:00 UTC (permalink / raw)
  To: alan; +Cc: jakub, drepper, linux-kernel, netdev, akpm, torvalds

From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 24 Apr 2008 16:24:44 +0100

> BTW in 4.4BSD and derivatives if I remember rightly F_CLOEXEC *is*
> inherited across accept() so I doubt any user space software will be too
> upset by such a shift.

It actually doesn't.

Just like in Linux, no file descriptor flags are inherited.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:39     ` David Miller
@ 2008-04-24 16:03       ` Michael Kerrisk
  2008-04-24 15:42         ` Alan Cox
  2008-04-24 16:30         ` Linus Torvalds
  0 siblings, 2 replies; 36+ messages in thread
From: Michael Kerrisk @ 2008-04-24 16:03 UTC (permalink / raw)
  To: David Miller; +Cc: torvalds, alan, drepper, linux-kernel, netdev, akpm

On 4/24/08, David Miller <davem@davemloft.net> wrote:
> From: Linus Torvalds <torvalds@linux-foundation.org>
>  Date: Thu, 24 Apr 2008 08:29:14 -0700 (PDT)
>
>
>  >
>  >
>  > On Thu, 24 Apr 2008, Alan Cox wrote:
>  > >
>  > > Given we will never have 2^32 socket types, and in a sense this is part
>  > > of the type why not just use
>  > >
>  > >     socket(PF_INET, SOCK_STREAM|SOCK_CLOEXEC, ...)
>  >
>  > Ok, I have to admit that I find this very appealing. It looks much
>  > cleaner, but perhaps more importantly, it also looks both readable _and_
>  > easier to use for the user-space programmer.
>
>
> Me too.

But this approach fixes just one of the interfaces.  There are 7 or 8
other interfaces that need to solve the same problem.  What about
those?

It strikes me to be cleanest to use the same solution for all of them
-- i.e., new syscalls (seems simplest) or sys_indirect() -- including
socket().

-- 
I'll likely only see replies if they are CCed to mtk.manpages at gmail dot com

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:27             ` Alan Cox
@ 2008-04-24 16:04               ` Ulrich Drepper
  2008-04-24 15:45                 ` Alan Cox
  0 siblings, 1 reply; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 16:04 UTC (permalink / raw)
  To: Alan Cox; +Cc: linux-kernel, netdev, akpm, torvalds

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Alan Cox wrote:
> Believe it or not we have the compute capability between us to not
> accidentally reassign values we assigned to one thing to something else.

Once again, this is not about assigned values.  This is about the time
before you get a value assigned.  Not every experiment out there will
have a value assigned before it starts development.

But it really doesn't matter to me. I'm not the one who would introduce
the problem.  Patch is forthcoming.

>> Oh really?  You open a server socket, use fcntl(FD_CLOEXEC), and then
>> accept(). 
> 
> And your behaviour just became OS specific....

Not according to POSIX.  If some OSes deliberately violate POSIX that's
their problem.  All POSIX OSes have, up to today, returned a new file
descriptor without the close-on-exec flag set at all times.  Just read
the spec.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIEK+f2ijCOnn/RHQRAtYRAJ9Ve9XSkMriqkHkiCL00wsXzJJbYgCgmqzQ
3uexpcjM0NvU7qgngOs7LDA=
=uNKi
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:03         ` Alan Cox
  2008-04-24 15:44           ` Jakub Jelinek
  2008-04-24 15:45           ` Ulrich Drepper
@ 2008-04-24 16:06           ` Michael Kerrisk
  2 siblings, 0 replies; 36+ messages in thread
From: Michael Kerrisk @ 2008-04-24 16:06 UTC (permalink / raw)
  To: Alan Cox; +Cc: Ulrich Drepper, linux-kernel, netdev, akpm, torvalds

On 4/24/08, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> > You didn't read what I wrote.
>
>
> The feeling is mutual
>
>
>  > For those the implementer must ensure that during the development no
>  > value is used which can conflict with any current and future assigned
>  > value and not with any other development.
>
>
> Kernel socket type values are assigned by the kernel team so that
>  isn't a problem.
>
>
>  > > Every other property of a socket via accept() is inherited from the
>  > > parent. Making one property different would be bizarre and ugly.
>  >
>  > Implementing this would visibly change existing code and it would
>  > actively violate POSIX.  Not a good idea.
>
>
> POSIX has no interface for this new behaviour you propose so that is
>  complete crap. The moment you use one of these features you stepped
>  outside of the POSIX spec - and you know that. If there was an existing
>  standard we wouldn't have a problem.

Alan, I agree with your analysis of the standard on that last para,
but I'm still not convinced that having the behavior inherited from
accept() would be good.  The problem (IIUC) is that after the
accept(), a userland programmer might want to immediately change the
O_CLOEXEC for the descriptor, and there would be the same race there
that this whole thread is about avoiding.

-- 
I'll likely only see replies if they are CCed to mtk.manpages at gmail dot com

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:38                 ` Alan Cox
@ 2008-04-24 16:09                   ` David Miller
  2008-04-24 16:45                   ` Michael Kerrisk
  1 sibling, 0 replies; 36+ messages in thread
From: David Miller @ 2008-04-24 16:09 UTC (permalink / raw)
  To: alan; +Cc: jakub, drepper, linux-kernel, netdev, akpm, torvalds

From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 24 Apr 2008 16:38:35 +0100

> On Thu, 24 Apr 2008 09:00:08 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
> 
> > From: Alan Cox <alan@lxorguk.ukuu.org.uk>
> > Date: Thu, 24 Apr 2008 16:24:44 +0100
> > 
> > > BTW in 4.4BSD and derivatives if I remember rightly F_CLOEXEC *is*
> > > inherited across accept() so I doubt any user space software will be too
> > > upset by such a shift.
> > 
> > It actually doesn't.
> > 
> > Just like in Linux, no file descriptor flags are inherited.
> 
> NDELAY certainly appears to be looking at Stevens.

It's checking NDELAY, for the accept() call itself, on the
parent socket.

It allocates the new FD for the process and sets FREAD and
FWRITE in the flags, that's it.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 16:03       ` Michael Kerrisk
  2008-04-24 15:42         ` Alan Cox
@ 2008-04-24 16:30         ` Linus Torvalds
  2008-04-24 16:52           ` Ulrich Drepper
  1 sibling, 1 reply; 36+ messages in thread
From: Linus Torvalds @ 2008-04-24 16:30 UTC (permalink / raw)
  To: Michael Kerrisk; +Cc: David Miller, alan, drepper, linux-kernel, netdev, akpm

On Thu, 24 Apr 2008, Michael Kerrisk wrote:
> 
> It strikes me to be cleanest to use the same solution for all of them
> -- i.e., new syscalls (seems simplest) or sys_indirect() -- including
> socket().

I certainly don't dislike sys_indirect either, but I've also done user 
mode programming, and when it comes to OS-specific things (and especially 
if they are even _version_-specific) I can tell you that basically nobody 
will ever use them if you cannot decide to use them dynamically.

Here's an example of a *successful* use of something like that:

	#ifndef O_NOATIME
	#define O_NOATIME 0
	#endif

	static unsigned int sha1_file_open_flag = O_NOATIME;

	...
	        fd = open(filename, O_RDONLY | sha1_file_open_flag);
	        if (fd < 0) {
	                /* See if it works without O_NOATIME */
	                switch (sha1_file_open_flag) {
	                default:
	                        fd = open(filename, O_RDONLY);
	                        if (fd >= 0)
	                                break;
	                /* Fallthrough */
	                case 0:
	                        return NULL;
	                }

	                /* If it failed once, it will probably fail again.
	                 * Stop using O_NOATIME
	                 */
	                sha1_file_open_flag = 0;
        }
	...

see? This is something where I actually used Linux-specific code. And 
dammit, I'm _Linus_. Think of your normal programmer that isn't quite as 
Linux-oriented.

And that's the problem with anything that isn't flags-based. Once you do 
new system calls, doing the above is really quite nasty. How do you 
statically even _test_ that you have a system call? Now you need to add a 
whole autoconf thing for it existing, and when it does exist you still 
need to test whether it works, and you can't even do it in the slow-path 
like the above (which turns the failure into a fast-path _without_ the 
flag).

So while I don't dislike the indirect system call, I do think that if we 
can handle a large case of the problems with an added flag to already 
existing system calls, that does have huge advantages. Because it allows 
code like the above, which needs absolutely zero autoconf for linking 
errors etc..

			Linus

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:38                 ` Alan Cox
  2008-04-24 16:09                   ` David Miller
@ 2008-04-24 16:45                   ` Michael Kerrisk
  2008-04-26 22:41                     ` dean gaudet
  1 sibling, 1 reply; 36+ messages in thread
From: Michael Kerrisk @ 2008-04-24 16:45 UTC (permalink / raw)
  To: Alan Cox; +Cc: David Miller, jakub, drepper, linux-kernel, netdev, akpm,
	torvalds

On 4/24/08, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> On Thu, 24 Apr 2008 09:00:08 -0700 (PDT)
>
> David Miller <davem@davemloft.net> wrote:
>
>
> > From: Alan Cox <alan@lxorguk.ukuu.org.uk>
>  > Date: Thu, 24 Apr 2008 16:24:44 +0100
>  >
>  > > BTW in 4.4BSD and derivatives if I remember rightly F_CLOEXEC *is*
>  > > inherited across accept() so I doubt any user space software will be too
>  > > upset by such a shift.
>  >
>  > It actually doesn't.
>  >
>  > Just like in Linux, no file descriptor flags are inherited.
>
>
> NDELAY certainly appears to be looking at Stevens.

A while back I did some testing of this point.  These were the results I noted:

FreeBSD 4.8
O_NONBLOCK and O_ASYNC are inherited
FD_CLOEXEC is not inherited

Solaris 8
O_NONBLOCK and O_ASYNC are inherited
FD_CLOEXEC is not inherited

Tru64 5.1 (sep 03, testdrive)
No F_SETFL flags are inherited
FD_CLOEXEC is not inherited

HP-UX 11
No F_SETFL flags are inherited
FD_CLOEXEC is not inherited

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 15:42         ` Alan Cox
@ 2008-04-24 16:48           ` Michael Kerrisk
  2008-04-24 17:20             ` H. Peter Anvin
  0 siblings, 1 reply; 36+ messages in thread
From: Michael Kerrisk @ 2008-04-24 16:48 UTC (permalink / raw)
  To: Alan Cox
  Cc: David Miller, torvalds, drepper, linux-kernel, netdev,
	akpm@linux-foundation.org Jakub Jelinek

On 4/24/08, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> > But this approach fixes just one of the interfaces.  There are 7 or 8
>  > other interfaces that need to solve the same problem.  What about
>  > those?
>
>
> Actually it seems to fix most of them.

Am I missing something?  How?  There are a number of system calls that
have neither a flags argument, nor another argument that we can
overload (as you propose with socket()).  For those, we'd need new
system calls or sys_indirect().

> I accept Jakub's observation we
>  need a "paccept()" or similar.

True, that would be nice.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 14:18   ` Ulrich Drepper
  2008-04-24 14:24     ` Alan Cox
@ 2008-04-24 16:49     ` Evgeniy Polyakov
  1 sibling, 0 replies; 36+ messages in thread
From: Evgeniy Polyakov @ 2008-04-24 16:49 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Alan Cox, linux-kernel, netdev, akpm, torvalds

On Thu, Apr 24, 2008 at 07:18:43AM -0700, Ulrich Drepper (drepper@redhat.com) wrote:
> I don't think this is a viable approach because it is not about the
> range.  People can and do select arbitrary values for those types.
> Until a value is officially recognized and registered it is in fact best
> to choose a (possibly large) random value to not conflict with anything
> else.  Who can guarantee that whatever bit is chosen for SOCK_CLOEXEC
> isn't already used by someone?

type argument is limited to SOCK_MAX, higher half of the word can be
used for flags. It is much cleaner than implementing socket4() for the
single bit.

> Add to this that it's not a complete solution (no such hack possible for
> accept) and I think using a new interface is cleaner(tm).

It can inherit flags from parent by default.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 16:30         ` Linus Torvalds
@ 2008-04-24 16:52           ` Ulrich Drepper
  0 siblings, 0 replies; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 16:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Michael Kerrisk, David Miller, alan, linux-kernel, netdev, akpm

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Linus Torvalds wrote:
> So while I don't dislike the indirect system call, I do think that if we 
> can handle a large case of the problems with an added flag to already 
> existing system calls,

The easy, clean cases I already handled back when.  I wouldn't have
implemented socket this way to preserve the function signature but
that's just me.  It's hopefully over now.

What remains isn't that easy to fix.  We need syscall interface changes.
 Yes, I'd like to avoid them, too.  But sometimes the existing
interfaces are just wrong and now we have to make a decision: new
syscalls or sys_indirect.  No way around it.

As far as the userlevel interface is concerned, this is not quite the
same.  As explained before, I've anticipated some of the problems.
signalfd, eventfd have no flags parameter in the syscall but I have them
in the userlevel interface.  I.e., any kernel change will be hidden.  At
least as far as the interface signature is concerned.

So, the question still is on the table: do you want sys_indirect?

If yes, then the new sys_accept would use sys_indirect instead of a new
entry point.  If you don't want sys_indirect, then I'll submit a new
sys_accept syscall (already have the patch here ready to go).

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIELrj2ijCOnn/RHQRAtewAJ4+826rxwtckEvvOaXdiNSr/5ECPACfWwTn
hgt5EYrrj/imBloPE7DxHJA=
=T6LW
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 16:48           ` Michael Kerrisk
@ 2008-04-24 17:20             ` H. Peter Anvin
  2008-04-24 17:31               ` Ulrich Drepper
  0 siblings, 1 reply; 36+ messages in thread
From: H. Peter Anvin @ 2008-04-24 17:20 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: Alan Cox, David Miller, torvalds, drepper, linux-kernel, netdev,
	akpm@linux-foundation.org Jakub Jelinek

Michael Kerrisk wrote:
> On 4/24/08, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
>>> But this approach fixes just one of the interfaces.  There are 7 or 8
>>  > other interfaces that need to solve the same problem.  What about
>>  > those?
>>
>>
>> Actually it seems to fix most of them.
> 
> Am I missing something?  How?  There are a number of system calls that
> have neither a flags argument, nor another argument that we can
> overload (as you propose with socket()).  For those, we'd need new
> system calls or sys_indirect().
> 

sys_indirect is a total red herring here, since it won't help one iota 
making the userspace interface comprehensible - it just introduces a 
different calling convention that the C library will have to thunk.

	-hpa

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 17:20             ` H. Peter Anvin
@ 2008-04-24 17:31               ` Ulrich Drepper
  2008-04-24 17:34                 ` H. Peter Anvin
  0 siblings, 1 reply; 36+ messages in thread
From: Ulrich Drepper @ 2008-04-24 17:31 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Michael Kerrisk, Alan Cox, David Miller, torvalds, linux-kernel,
	netdev, akpm@linux-foundation.org Jakub Jelinek

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

H. Peter Anvin wrote:
> sys_indirect is a total red herring here, since it won't help one iota
> making the userspace interface comprehensible - it just introduces a
> different calling convention that the C library will have to thunk.

Nobody ever suggested that sys_indirect is in any way visible at the
userlevel.  It's only meant to solve the problem of changing many
syscalls (and hence touch lots of arch-specific code).  Again, as said
several times, it could easily be used to fix the existing signalfd and
eventfd syscalls without any arch-specific changes and no userlevel
interface changes (the latter since we already have the correct interface).

Yes, you don't like sys_indirect, we know it.  But don't deliberately
misrepresent the approach.

- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.7 (GNU/Linux)

iD8DBQFIEMPx2ijCOnn/RHQRAr7uAJ0aHkZ+bbjk2nsMhhN2xzslA/yhKgCghi8r
9PZw8zfW5fxTVTfrbsHIII0=
=SmAT
-----END PGP SIGNATURE-----

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 17:31               ` Ulrich Drepper
@ 2008-04-24 17:34                 ` H. Peter Anvin
  0 siblings, 0 replies; 36+ messages in thread
From: H. Peter Anvin @ 2008-04-24 17:34 UTC (permalink / raw)
  To: Ulrich Drepper
  Cc: Michael Kerrisk, Alan Cox, David Miller, torvalds, linux-kernel,
	netdev, akpm@linux-foundation.org Jakub Jelinek

Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> H. Peter Anvin wrote:
>> sys_indirect is a total red herring here, since it won't help one iota
>> making the userspace interface comprehensible - it just introduces a
>> different calling convention that the C library will have to thunk.
> 
> Nobody ever suggested that sys_indirect is in any way visible at the
> userlevel.  It's only meant to solve the problem of changing many
> syscalls (and hence touch lots of arch-specific code).  Again, as said
> several times, it could easily be used to fix the existing signalfd and
> eventfd syscalls without any arch-specific changes and no userlevel
> interface changes (the latter since we already have the correct interface).
> 
> Yes, you don't like sys_indirect, we know it.  But don't deliberately
> misrepresent the approach.
> 

I wasn't misrepresenting anything.  I was pointing out to the parent 
post -- not to you -- that sys_indirect does neither hide nor hair for 
what *he* was concerned about, which was the comprehensibility of the 
user-level interface.

	-hpa

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] alternative to sys_indirect, part 1
  2008-04-24 16:45                   ` Michael Kerrisk
@ 2008-04-26 22:41                     ` dean gaudet
  0 siblings, 0 replies; 36+ messages in thread
From: dean gaudet @ 2008-04-26 22:41 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: Alan Cox, David Miller, jakub, drepper, linux-kernel, netdev,
	akpm, torvalds

On Thu, 24 Apr 2008, Michael Kerrisk wrote:

> A while back I did some testing of this point.  These were the results I noted:
> 
> FreeBSD 4.8
> O_NONBLOCK and O_ASYNC are inherited
> FD_CLOEXEC is not inherited
> 
> Solaris 8
> O_NONBLOCK and O_ASYNC are inherited
> FD_CLOEXEC is not inherited
> 
> Tru64 5.1 (sep 03, testdrive)
> No F_SETFL flags are inherited
> FD_CLOEXEC is not inherited
> 
> HP-UX 11
> No F_SETFL flags are inherited
> FD_CLOEXEC is not inherited

invent FD_CLOEXEC_INHERITED to handle accept()?

-dean

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2008-04-26 22:41 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-24  4:03 [PATCH] alternative to sys_indirect, part 1 Ulrich Drepper
2008-04-24 10:25 ` Alan Cox
2008-04-24 12:34   ` Michael Kerrisk
     [not found]     ` <517f3f820804240534r3bbbdc52s52a6dfe3f2d14b7f-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2008-04-24 14:49       ` Ulrich Drepper
     [not found]         ` <48109DFB.900-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2008-04-24 14:42           ` Alan Cox
2008-04-24 15:19             ` Ulrich Drepper
2008-04-24 15:05           ` Michael Kerrisk
2008-04-24 14:18   ` Ulrich Drepper
2008-04-24 14:24     ` Alan Cox
2008-04-24 15:16       ` Ulrich Drepper
2008-04-24 15:03         ` Alan Cox
2008-04-24 15:44           ` Jakub Jelinek
2008-04-24 15:24             ` Alan Cox
2008-04-24 16:00               ` David Miller
2008-04-24 15:38                 ` Alan Cox
2008-04-24 16:09                   ` David Miller
2008-04-24 16:45                   ` Michael Kerrisk
2008-04-26 22:41                     ` dean gaudet
2008-04-24 15:45           ` Ulrich Drepper
2008-04-24 15:27             ` Alan Cox
2008-04-24 16:04               ` Ulrich Drepper
2008-04-24 15:45                 ` Alan Cox
2008-04-24 16:06           ` Michael Kerrisk
2008-04-24 16:49     ` Evgeniy Polyakov
2008-04-24 15:29   ` Linus Torvalds
2008-04-24 15:39     ` David Miller
2008-04-24 16:03       ` Michael Kerrisk
2008-04-24 15:42         ` Alan Cox
2008-04-24 16:48           ` Michael Kerrisk
2008-04-24 17:20             ` H. Peter Anvin
2008-04-24 17:31               ` Ulrich Drepper
2008-04-24 17:34                 ` H. Peter Anvin
2008-04-24 16:30         ` Linus Torvalds
2008-04-24 16:52           ` Ulrich Drepper
2008-04-24 12:27 ` Michael Kerrisk
2008-04-24 12:46   ` David Collier-Brown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).