* [RFC PATCH v2] waitfd
@ 2009-01-06 18:11 Casey Dahlin
2009-01-06 18:27 ` Alan Cox
` (3 more replies)
0 siblings, 4 replies; 50+ messages in thread
From: Casey Dahlin @ 2009-01-06 18:11 UTC (permalink / raw)
To: Linux Kernel
Linux now exposes signals, timers, and events via file descriptors
through signalfd, timerfd, and eventfd. This means programmers can use a
single select/[e]poll call to monitor all change in their program. This
patch aims to expose child death via the same mechanism.
waitfd provides a file descriptor out of which may be read a series of
siginfo_t objects describing child death. A child process is reaped as
soon as its information is read. This means child monitoring too can be
performed with that same poll call.
Patch is against v2.6.28
--CJD
diff --git a/arch/x86/include/asm/unistd_32.h
b/arch/x86/include/asm/unistd_32.h
index f2bba78..134d83c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
#define __NR_dup3 330
#define __NR_pipe2 331
#define __NR_inotify_init1 332
+#define __NR_waitfd 333
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h
b/arch/x86/include/asm/unistd_64.h
index d2e415e..b28eb07 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
__SYSCALL(__NR_pipe2, sys_pipe2)
#define __NR_inotify_init1 294
__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_waitfd 295
+__SYSCALL(__NR_waitfd, sys_waitfd)
#ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S
b/arch/x86/kernel/syscall_table_32.S
index d44395f..c796a8b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
.long sys_dup3 /* 330 */
.long sys_pipe2
.long sys_inotify_init1
+ .long sys_waitfd
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe..74c31fb 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
+obj-$(CONFIG_WAITFD) += waitfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
diff --git a/fs/waitfd.c b/fs/waitfd.c
new file mode 100644
index 0000000..0155a83
--- /dev/null
+++ b/fs/waitfd.c
@@ -0,0 +1,117 @@
+/*
+ * fs/waitfd.c
+ *
+ * Copyright (C) 2008 Red Hat, Casey Dahlin <cdahlin@redhat.com>
+ *
+ * Largely derived from fs/signalfd.c
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru);
+
+struct waitfd_ctx {
+ int ops;
+ int which;
+ pid_t upid;
+};
+
+static int waitfd_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static unsigned int waitfd_poll(struct file *file, poll_table *wait)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ long value;
+
+ poll_wait(file, ¤t->signal->wait_chldexit, wait);
+
+ value = do_waitid(ctx->which, ctx->upid, NULL,
+ ctx->ops | WNOHANG | WNOWAIT, NULL);
+ if (value > 0 || value == -ECHILD)
+ return POLLIN;
+
+ return 0;
+}
+
+/*
+ * Returns a multiple of the size of a struct siginfo, or a negative
+ * error code. The "count" parameter must be at least sizeof(struct
siginfo)
+ */
+static ssize_t waitfd_read(struct file *file, char __user *buf, size_t
count,
+ loff_t *ppos)
+{
+ struct waitfd_ctx *ctx = file->private_data;
+ struct siginfo __user *info_addr = (struct siginfo *)buf;
+ int flags = ctx->ops;
+ ssize_t ret, total = 0;
+
+ count /= sizeof(struct siginfo);
+ if (!count)
+ return -EINVAL;
+
+ do {
+ ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
+ if (ret == 0)
+ ret = -EAGAIN;
+ if (ret == -ECHILD)
+ ret = 0;
+ if (ret <= 0)
+ break;
+
+ info_addr++;
+ total += sizeof(struct siginfo);
+ flags |= WNOHANG;
+ } while (--count);
+
+ return total ? total: ret;
+}
+
+static const struct file_operations waitfd_fops = {
+ .release = waitfd_release,
+ .poll = waitfd_poll,
+ .read = waitfd_read,
+};
+
+asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
+{
+ int ufd;
+ struct waitfd_ctx *ctx;
+
+ /* Just to make sure we don't end up with a sys_waitfd4 */
+ (void)unused;
+
+ if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
+ return -EINVAL;
+ if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+ return -EINVAL;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ops = options;
+ ctx->upid = upid;
+ ctx->which = which;
+
+ ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
+ (options & WNOHANG) ? O_NONBLOCK : 0);
+ if (ufd < 0)
+ kfree(ctx);
+
+ return ufd;
+}
diff --git a/init/Kconfig b/init/Kconfig
index f763762..bc34871 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config EPOLL
Disabling this option will cause the kernel to be built without
support for epoll family of system calls.
+config WAITFD
+ bool "Enable waitfd() system call" if EMBEDDED
+ select ANON_INODES
+ default y
+ help
+ Enable the waitfd() system call that allows receving child state
+ changes from a file descriptor.
+
+ If unsure, say Y.
+
config SIGNALFD
bool "Enable signalfd() system call" if EMBEDDED
select ANON_INODES
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..b53e8ba 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct
task_struct *p, pid_t pid, uid_t uid,
int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
put_task_struct(p);
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
+ if (infop) {
+ if (!retval)
+ retval = put_user(SIGCHLD, &infop->si_signo);
+ if (!retval)
+ retval = put_user(0, &infop->si_errno);
+ if (!retval)
+ retval = put_user((short)why, &infop->si_code);
+ if (!retval)
+ retval = put_user(pid, &infop->si_pid);
+ if (!retval)
+ retval = put_user(uid, &infop->si_uid);
+ if (!retval)
+ retval = put_user(status, &infop->si_status);
+ }
if (!retval)
retval = pid;
return retval;
@@ -1727,35 +1729,12 @@ repeat:
end:
current->state = TASK_RUNNING;
remove_wait_queue(¤t->signal->wait_chldexit,&wait);
- if (infop) {
- if (retval > 0)
- retval = 0;
- else {
- /*
- * For a WNOHANG return, clear out all the fields
- * we would set so the user can easily tell the
- * difference.
- */
- if (!retval)
- retval = put_user(0, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user(0, &infop->si_code);
- if (!retval)
- retval = put_user(0, &infop->si_pid);
- if (!retval)
- retval = put_user(0, &infop->si_uid);
- if (!retval)
- retval = put_user(0, &infop->si_status);
- }
- }
return retval;
}
-asmlinkage long sys_waitid(int which, pid_t upid,
- struct siginfo __user *infop, int options,
- struct rusage __user *ru)
+long do_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
{
struct pid *pid = NULL;
enum pid_type type;
@@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
ret = do_wait(type, pid, options, infop, NULL, ru);
put_pid(pid);
+ return ret;
+}
+
+asmlinkage long sys_waitid(int which, pid_t upid,
+ struct siginfo __user *infop, int options,
+ struct rusage __user *ru)
+{
+ long ret;
+
+ ret = do_waitid(which, upid, infop, options, ru);
+
+ if (ret > 0)
+ ret = 0;
+ else {
+ /*
+ * For a WNOHANG return, clear out all the fields
+ * we would set so the user can easily tell the
+ * difference.
+ */
+ if (!ret)
+ ret = put_user(0, &infop->si_signo);
+ if (!ret)
+ ret = put_user(0, &infop->si_errno);
+ if (!ret)
+ ret = put_user(0, &infop->si_code);
+ if (!ret)
+ ret = put_user(0, &infop->si_pid);
+ if (!ret)
+ ret = put_user(0, &infop->si_uid);
+ if (!ret)
+ ret = put_user(0, &infop->si_status);
+ }
+
/* avoid REGPARM breakage on x86: */
asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a232..e8d4da6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set);
cond_syscall(sys_ioprio_get);
/* New file descriptors */
+cond_syscall(sys_waitfd);
cond_syscall(sys_signalfd);
cond_syscall(sys_signalfd4);
cond_syscall(compat_sys_signalfd);
^ permalink raw reply related [flat|nested] 50+ messages in thread* Re: [RFC PATCH v2] waitfd 2009-01-06 18:11 [RFC PATCH v2] waitfd Casey Dahlin @ 2009-01-06 18:27 ` Alan Cox 2009-01-06 18:31 ` Randy Dunlap ` (2 subsequent siblings) 3 siblings, 0 replies; 50+ messages in thread From: Alan Cox @ 2009-01-06 18:27 UTC (permalink / raw) To: Casey Dahlin; +Cc: Linux Kernel > waitfd provides a file descriptor out of which may be read a series of > siginfo_t objects describing child death. A child process is reaped as > soon as its information is read. This means child monitoring too can be > performed with that same poll call. > > Patch is against v2.6.28 Same comment as last time - the right way to do this is to extend the existing notify APIs to the /proc file system ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RFC PATCH v2] waitfd 2009-01-06 18:11 [RFC PATCH v2] waitfd Casey Dahlin 2009-01-06 18:27 ` Alan Cox @ 2009-01-06 18:31 ` Randy Dunlap 2009-01-06 18:45 ` Casey Dahlin 2009-01-06 18:48 ` Andi Kleen 2009-01-06 19:07 ` [RESEND][RFC " Casey Dahlin 3 siblings, 1 reply; 50+ messages in thread From: Randy Dunlap @ 2009-01-06 18:31 UTC (permalink / raw) To: Casey Dahlin; +Cc: Linux Kernel Casey Dahlin wrote: > Linux now exposes signals, timers, and events via file descriptors > through signalfd, timerfd, and eventfd. This means programmers can use a > single select/[e]poll call to monitor all change in their program. This > patch aims to expose child death via the same mechanism. > > waitfd provides a file descriptor out of which may be read a series of > siginfo_t objects describing child death. A child process is reaped as > soon as its information is read. This means child monitoring too can be > performed with that same poll call. > > Patch is against v2.6.28 > > --CJD > > diff --git a/arch/x86/include/asm/unistd_32.h > b/arch/x86/include/asm/unistd_32.h > index f2bba78..134d83c 100644 > --- a/arch/x86/include/asm/unistd_32.h > +++ b/arch/x86/include/asm/unistd_32.h > @@ -338,6 +338,7 @@ > #define __NR_dup3 330 > #define __NR_pipe2 331 > #define __NR_inotify_init1 332 > +#define __NR_waitfd 333 > > #ifdef __KERNEL__ > > diff --git a/arch/x86/include/asm/unistd_64.h > b/arch/x86/include/asm/unistd_64.h > index d2e415e..b28eb07 100644 > --- a/arch/x86/include/asm/unistd_64.h > +++ b/arch/x86/include/asm/unistd_64.h > @@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3) > __SYSCALL(__NR_pipe2, sys_pipe2) > #define __NR_inotify_init1 294 > __SYSCALL(__NR_inotify_init1, sys_inotify_init1) > +#define __NR_waitfd 295 > +__SYSCALL(__NR_waitfd, sys_waitfd) > Only for x86?? > > #ifndef __NO_STUBS > diff --git a/arch/x86/kernel/syscall_table_32.S > b/arch/x86/kernel/syscall_table_32.S > index d44395f..c796a8b 100644 > --- a/arch/x86/kernel/syscall_table_32.S > +++ b/arch/x86/kernel/syscall_table_32.S > @@ -332,3 +332,4 @@ ENTRY(sys_call_table) > .long sys_dup3 /* 330 */ > .long sys_pipe2 > .long sys_inotify_init1 > + .long sys_waitfd > diff --git a/fs/Makefile b/fs/Makefile > index d9f8afe..74c31fb 100644 > --- a/fs/Makefile > +++ b/fs/Makefile > @@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o > obj-$(CONFIG_EPOLL) += eventpoll.o > obj-$(CONFIG_ANON_INODES) += anon_inodes.o > obj-$(CONFIG_SIGNALFD) += signalfd.o > +obj-$(CONFIG_WAITFD) += waitfd.o > obj-$(CONFIG_TIMERFD) += timerfd.o > obj-$(CONFIG_EVENTFD) += eventfd.o > obj-$(CONFIG_AIO) += aio.o > diff --git a/fs/waitfd.c b/fs/waitfd.c > new file mode 100644 > index 0000000..0155a83 > --- /dev/null > +++ b/fs/waitfd.c > @@ -0,0 +1,117 @@ > +/* > + * fs/waitfd.c > + * > + * Copyright (C) 2008 Red Hat, Casey Dahlin <cdahlin@redhat.com> > + * > + * Largely derived from fs/signalfd.c > + */ > + > +#include <linux/file.h> > +#include <linux/poll.h> > +#include <linux/init.h> > +#include <linux/fs.h> > +#include <linux/sched.h> > +#include <linux/kernel.h> > +#include <linux/signal.h> > +#include <linux/list.h> > +#include <linux/anon_inodes.h> > +#include <linux/syscalls.h> > + > +long do_waitid(int which, pid_t upid, > + struct siginfo __user *infop, int options, > + struct rusage __user *ru); > + > +struct waitfd_ctx { > + int ops; > + int which; > + pid_t upid; > +}; > + Please use kernel coding style: use tabs to indent, not <lots-of-spaces>, and struct members, functions, etc., are indented by one tab stop minimum. > +static int waitfd_release(struct inode *inode, struct file *file) > +{ > + kfree(file->private_data); > + return 0; > +} > + > +static unsigned int waitfd_poll(struct file *file, poll_table *wait) > +{ > + struct waitfd_ctx *ctx = file->private_data; > + long value; > + > + poll_wait(file, ¤t->signal->wait_chldexit, wait); > + > + value = do_waitid(ctx->which, ctx->upid, NULL, > + ctx->ops | WNOHANG | WNOWAIT, NULL); > + if (value > 0 || value == -ECHILD) > + return POLLIN; > + > + return 0; > +} > + > +/* > + * Returns a multiple of the size of a struct siginfo, or a negative > + * error code. The "count" parameter must be at least sizeof(struct > siginfo) > + */ > +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t > count, > + loff_t *ppos) > +{ > + struct waitfd_ctx *ctx = file->private_data; > + struct siginfo __user *info_addr = (struct siginfo *)buf; > + int flags = ctx->ops; > + ssize_t ret, total = 0; > + > + count /= sizeof(struct siginfo); > + if (!count) > + return -EINVAL; > + > + do { > + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL); > + if (ret == 0) > + ret = -EAGAIN; > + if (ret == -ECHILD) > + ret = 0; > + if (ret <= 0) > + break; > + > + info_addr++; > + total += sizeof(struct siginfo); > + flags |= WNOHANG; > + } while (--count); > + > + return total ? total: ret; return total ? total : ret; please. > +} > + > +static const struct file_operations waitfd_fops = { > + .release = waitfd_release, > + .poll = waitfd_poll, > + .read = waitfd_read, > +}; > + > +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) > +{ > + int ufd; > + struct waitfd_ctx *ctx; > + > + /* Just to make sure we don't end up with a sys_waitfd4 */ > + (void)unused; > + > + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) > + return -EINVAL; > + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) > + return -EINVAL; Use spaces around '|'. > + > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); > + if (!ctx) > + return -ENOMEM; > + > + ctx->ops = options; > + ctx->upid = upid; > + ctx->which = which; > + > + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, > + (options & WNOHANG) ? O_NONBLOCK : 0); > + if (ufd < 0) > + kfree(ctx); > + > + return ufd; > +} > diff --git a/init/Kconfig b/init/Kconfig > index f763762..bc34871 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -683,6 +683,16 @@ config EPOLL > Disabling this option will cause the kernel to be built without > support for epoll family of system calls. > > +config WAITFD > + bool "Enable waitfd() system call" if EMBEDDED > + select ANON_INODES > + default y > + help > + Enable the waitfd() system call that allows receving child state receiving Kconfig help text should be indented by <tab><space><space>. > + changes from a file descriptor. > + > + If unsure, say Y. > + > config SIGNALFD > bool "Enable signalfd() system call" if EMBEDDED > select ANON_INODES > diff --git a/kernel/exit.c b/kernel/exit.c > index 2d8be7e..b53e8ba 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct > task_struct *p, pid_t pid, uid_t uid, > int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; > > put_task_struct(p); > - if (!retval) > - retval = put_user(SIGCHLD, &infop->si_signo); > - if (!retval) > - retval = put_user(0, &infop->si_errno); > - if (!retval) > - retval = put_user((short)why, &infop->si_code); > - if (!retval) > - retval = put_user(pid, &infop->si_pid); > - if (!retval) > - retval = put_user(uid, &infop->si_uid); > - if (!retval) > - retval = put_user(status, &infop->si_status); > + if (infop) { > + if (!retval) > + retval = put_user(SIGCHLD, &infop->si_signo); > + if (!retval) > + retval = put_user(0, &infop->si_errno); > + if (!retval) > + retval = put_user((short)why, &infop->si_code); > + if (!retval) > + retval = put_user(pid, &infop->si_pid); > + if (!retval) > + retval = put_user(uid, &infop->si_uid); > + if (!retval) > + retval = put_user(status, &infop->si_status); > + } > if (!retval) > retval = pid; > return retval; > @@ -1727,35 +1729,12 @@ repeat: > end: > current->state = TASK_RUNNING; > remove_wait_queue(¤t->signal->wait_chldexit,&wait); > - if (infop) { > - if (retval > 0) > - retval = 0; > - else { > - /* > - * For a WNOHANG return, clear out all the fields > - * we would set so the user can easily tell the > - * difference. > - */ > - if (!retval) > - retval = put_user(0, &infop->si_signo); > - if (!retval) > - retval = put_user(0, &infop->si_errno); > - if (!retval) > - retval = put_user(0, &infop->si_code); > - if (!retval) > - retval = put_user(0, &infop->si_pid); > - if (!retval) > - retval = put_user(0, &infop->si_uid); > - if (!retval) > - retval = put_user(0, &infop->si_status); > - } > - } > return retval; > } > > -asmlinkage long sys_waitid(int which, pid_t upid, > - struct siginfo __user *infop, int options, > - struct rusage __user *ru) > +long do_waitid(int which, pid_t upid, > + struct siginfo __user *infop, int options, > + struct rusage __user *ru) > { > struct pid *pid = NULL; > enum pid_type type; > @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid, > ret = do_wait(type, pid, options, infop, NULL, ru); > put_pid(pid); > > + return ret; > +} > + > +asmlinkage long sys_waitid(int which, pid_t upid, > + struct siginfo __user *infop, int options, > + struct rusage __user *ru) > +{ > + long ret; > + > + ret = do_waitid(which, upid, infop, options, ru); > + > + if (ret > 0) > + ret = 0; > + else { > + /* > + * For a WNOHANG return, clear out all the fields > + * we would set so the user can easily tell the > + * difference. > + */ > + if (!ret) > + ret = put_user(0, &infop->si_signo); > + if (!ret) > + ret = put_user(0, &infop->si_errno); > + if (!ret) > + ret = put_user(0, &infop->si_code); > + if (!ret) > + ret = put_user(0, &infop->si_pid); > + if (!ret) > + ret = put_user(0, &infop->si_uid); > + if (!ret) > + ret = put_user(0, &infop->si_status); > + } > + > /* avoid REGPARM breakage on x86: */ > asmlinkage_protect(5, ret, which, upid, infop, options, ru); > return ret; > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index e14a232..e8d4da6 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set); > cond_syscall(sys_ioprio_get); > > /* New file descriptors */ > +cond_syscall(sys_waitfd); > cond_syscall(sys_signalfd); > cond_syscall(sys_signalfd4); > cond_syscall(compat_sys_signalfd); -- ~Randy ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RFC PATCH v2] waitfd 2009-01-06 18:31 ` Randy Dunlap @ 2009-01-06 18:45 ` Casey Dahlin 2009-01-06 18:50 ` Randy Dunlap 0 siblings, 1 reply; 50+ messages in thread From: Casey Dahlin @ 2009-01-06 18:45 UTC (permalink / raw) To: Randy Dunlap; +Cc: Linux Kernel Randy Dunlap wrote: > Casey Dahlin wrote: > >> Linux now exposes signals, timers, and events via file descriptors >> through signalfd, timerfd, and eventfd. This means programmers can use a >> single select/[e]poll call to monitor all change in their program. This >> patch aims to expose child death via the same mechanism. >> >> waitfd provides a file descriptor out of which may be read a series of >> siginfo_t objects describing child death. A child process is reaped as >> soon as its information is read. This means child monitoring too can be >> performed with that same poll call. >> >> Patch is against v2.6.28 >> >> --CJD >> >> diff --git a/arch/x86/include/asm/unistd_32.h >> b/arch/x86/include/asm/unistd_32.h >> index f2bba78..134d83c 100644 >> --- a/arch/x86/include/asm/unistd_32.h >> +++ b/arch/x86/include/asm/unistd_32.h >> @@ -338,6 +338,7 @@ >> #define __NR_dup3 330 >> #define __NR_pipe2 331 >> #define __NR_inotify_init1 332 >> +#define __NR_waitfd 333 >> >> #ifdef __KERNEL__ >> >> diff --git a/arch/x86/include/asm/unistd_64.h >> b/arch/x86/include/asm/unistd_64.h >> index d2e415e..b28eb07 100644 >> --- a/arch/x86/include/asm/unistd_64.h >> +++ b/arch/x86/include/asm/unistd_64.h >> @@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3) >> __SYSCALL(__NR_pipe2, sys_pipe2) >> #define __NR_inotify_init1 294 >> __SYSCALL(__NR_inotify_init1, sys_inotify_init1) >> +#define __NR_waitfd 295 >> +__SYSCALL(__NR_waitfd, sys_waitfd) >> >> > > Only for x86?? > > At the moment. I should have mentioned this earlier but I haven't made the syscall table entries for archs I don't test on. That will change once the rest of the change has settled out. >> #ifndef __NO_STUBS >> diff --git a/arch/x86/kernel/syscall_table_32.S >> b/arch/x86/kernel/syscall_table_32.S >> index d44395f..c796a8b 100644 >> --- a/arch/x86/kernel/syscall_table_32.S >> +++ b/arch/x86/kernel/syscall_table_32.S >> @@ -332,3 +332,4 @@ ENTRY(sys_call_table) >> .long sys_dup3 /* 330 */ >> .long sys_pipe2 >> .long sys_inotify_init1 >> + .long sys_waitfd >> diff --git a/fs/Makefile b/fs/Makefile >> index d9f8afe..74c31fb 100644 >> --- a/fs/Makefile >> +++ b/fs/Makefile >> @@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o >> obj-$(CONFIG_EPOLL) += eventpoll.o >> obj-$(CONFIG_ANON_INODES) += anon_inodes.o >> obj-$(CONFIG_SIGNALFD) += signalfd.o >> +obj-$(CONFIG_WAITFD) += waitfd.o >> obj-$(CONFIG_TIMERFD) += timerfd.o >> obj-$(CONFIG_EVENTFD) += eventfd.o >> obj-$(CONFIG_AIO) += aio.o >> diff --git a/fs/waitfd.c b/fs/waitfd.c >> new file mode 100644 >> index 0000000..0155a83 >> --- /dev/null >> +++ b/fs/waitfd.c >> @@ -0,0 +1,117 @@ >> +/* >> + * fs/waitfd.c >> + * >> + * Copyright (C) 2008 Red Hat, Casey Dahlin <cdahlin@redhat.com> >> + * >> + * Largely derived from fs/signalfd.c >> + */ >> + >> +#include <linux/file.h> >> +#include <linux/poll.h> >> +#include <linux/init.h> >> +#include <linux/fs.h> >> +#include <linux/sched.h> >> +#include <linux/kernel.h> >> +#include <linux/signal.h> >> +#include <linux/list.h> >> +#include <linux/anon_inodes.h> >> +#include <linux/syscalls.h> >> + >> +long do_waitid(int which, pid_t upid, >> + struct siginfo __user *infop, int options, >> + struct rusage __user *ru); >> + >> +struct waitfd_ctx { >> + int ops; >> + int which; >> + pid_t upid; >> +}; >> + >> > > Please use kernel coding style: use tabs to indent, not <lots-of-spaces>, > and struct members, functions, etc., are indented by one tab stop minimum. > > Damnit. This is a mailer artifact. This is the first time thunderbird has eaten a patch on me. I'll look in to it. >> +static int waitfd_release(struct inode *inode, struct file *file) >> +{ >> + kfree(file->private_data); >> + return 0; >> +} >> + >> +static unsigned int waitfd_poll(struct file *file, poll_table *wait) >> +{ >> + struct waitfd_ctx *ctx = file->private_data; >> + long value; >> + >> + poll_wait(file, ¤t->signal->wait_chldexit, wait); >> + >> + value = do_waitid(ctx->which, ctx->upid, NULL, >> + ctx->ops | WNOHANG | WNOWAIT, NULL); >> + if (value > 0 || value == -ECHILD) >> + return POLLIN; >> + >> + return 0; >> +} >> + >> +/* >> + * Returns a multiple of the size of a struct siginfo, or a negative >> + * error code. The "count" parameter must be at least sizeof(struct >> siginfo) >> + */ >> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t >> count, >> + loff_t *ppos) >> +{ >> + struct waitfd_ctx *ctx = file->private_data; >> + struct siginfo __user *info_addr = (struct siginfo *)buf; >> + int flags = ctx->ops; >> + ssize_t ret, total = 0; >> + >> + count /= sizeof(struct siginfo); >> + if (!count) >> + return -EINVAL; >> + >> + do { >> + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL); >> + if (ret == 0) >> + ret = -EAGAIN; >> + if (ret == -ECHILD) >> + ret = 0; >> + if (ret <= 0) >> + break; >> + >> + info_addr++; >> + total += sizeof(struct siginfo); >> + flags |= WNOHANG; >> + } while (--count); >> + >> + return total ? total: ret; >> > > return total ? total : ret; > please. > > I actually saw this used elsewhere in the kernel. Assumed I'd missed it in the style guidelines :) >> +} >> + >> +static const struct file_operations waitfd_fops = { >> + .release = waitfd_release, >> + .poll = waitfd_poll, >> + .read = waitfd_read, >> +}; >> + >> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) >> +{ >> + int ufd; >> + struct waitfd_ctx *ctx; >> + >> + /* Just to make sure we don't end up with a sys_waitfd4 */ >> + (void)unused; >> + >> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) >> + return -EINVAL; >> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) >> + return -EINVAL; >> > > Use spaces around '|'. > > Those 4 lines are copied almost exactly from kernel/exit.c. Is there motivation to keep them consistent? >> + >> + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); >> + if (!ctx) >> + return -ENOMEM; >> + >> + ctx->ops = options; >> + ctx->upid = upid; >> + ctx->which = which; >> + >> + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, >> + (options & WNOHANG) ? O_NONBLOCK : 0); >> + if (ufd < 0) >> + kfree(ctx); >> + >> + return ufd; >> +} >> diff --git a/init/Kconfig b/init/Kconfig >> index f763762..bc34871 100644 >> --- a/init/Kconfig >> +++ b/init/Kconfig >> @@ -683,6 +683,16 @@ config EPOLL >> Disabling this option will cause the kernel to be built without >> support for epoll family of system calls. >> >> +config WAITFD >> + bool "Enable waitfd() system call" if EMBEDDED >> + select ANON_INODES >> + default y >> + help >> + Enable the waitfd() system call that allows receving child state >> > > receiving > > Kconfig help text should be indented by <tab><space><space>. > > Likely more of the mailer eating the patch. >> + changes from a file descriptor. >> + >> + If unsure, say Y. >> + >> config SIGNALFD >> bool "Enable signalfd() system call" if EMBEDDED >> select ANON_INODES >> diff --git a/kernel/exit.c b/kernel/exit.c >> index 2d8be7e..b53e8ba 100644 >> --- a/kernel/exit.c >> +++ b/kernel/exit.c >> @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct >> task_struct *p, pid_t pid, uid_t uid, >> int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; >> >> put_task_struct(p); >> - if (!retval) >> - retval = put_user(SIGCHLD, &infop->si_signo); >> - if (!retval) >> - retval = put_user(0, &infop->si_errno); >> - if (!retval) >> - retval = put_user((short)why, &infop->si_code); >> - if (!retval) >> - retval = put_user(pid, &infop->si_pid); >> - if (!retval) >> - retval = put_user(uid, &infop->si_uid); >> - if (!retval) >> - retval = put_user(status, &infop->si_status); >> + if (infop) { >> + if (!retval) >> + retval = put_user(SIGCHLD, &infop->si_signo); >> + if (!retval) >> + retval = put_user(0, &infop->si_errno); >> + if (!retval) >> + retval = put_user((short)why, &infop->si_code); >> + if (!retval) >> + retval = put_user(pid, &infop->si_pid); >> + if (!retval) >> + retval = put_user(uid, &infop->si_uid); >> + if (!retval) >> + retval = put_user(status, &infop->si_status); >> + } >> if (!retval) >> retval = pid; >> return retval; >> @@ -1727,35 +1729,12 @@ repeat: >> end: >> current->state = TASK_RUNNING; >> remove_wait_queue(¤t->signal->wait_chldexit,&wait); >> - if (infop) { >> - if (retval > 0) >> - retval = 0; >> - else { >> - /* >> - * For a WNOHANG return, clear out all the fields >> - * we would set so the user can easily tell the >> - * difference. >> - */ >> - if (!retval) >> - retval = put_user(0, &infop->si_signo); >> - if (!retval) >> - retval = put_user(0, &infop->si_errno); >> - if (!retval) >> - retval = put_user(0, &infop->si_code); >> - if (!retval) >> - retval = put_user(0, &infop->si_pid); >> - if (!retval) >> - retval = put_user(0, &infop->si_uid); >> - if (!retval) >> - retval = put_user(0, &infop->si_status); >> - } >> - } >> return retval; >> } >> >> -asmlinkage long sys_waitid(int which, pid_t upid, >> - struct siginfo __user *infop, int options, >> - struct rusage __user *ru) >> +long do_waitid(int which, pid_t upid, >> + struct siginfo __user *infop, int options, >> + struct rusage __user *ru) >> { >> struct pid *pid = NULL; >> enum pid_type type; >> @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid, >> ret = do_wait(type, pid, options, infop, NULL, ru); >> put_pid(pid); >> >> + return ret; >> +} >> + >> +asmlinkage long sys_waitid(int which, pid_t upid, >> + struct siginfo __user *infop, int options, >> + struct rusage __user *ru) >> +{ >> + long ret; >> + >> + ret = do_waitid(which, upid, infop, options, ru); >> + >> + if (ret > 0) >> + ret = 0; >> + else { >> + /* >> + * For a WNOHANG return, clear out all the fields >> + * we would set so the user can easily tell the >> + * difference. >> + */ >> + if (!ret) >> + ret = put_user(0, &infop->si_signo); >> + if (!ret) >> + ret = put_user(0, &infop->si_errno); >> + if (!ret) >> + ret = put_user(0, &infop->si_code); >> + if (!ret) >> + ret = put_user(0, &infop->si_pid); >> + if (!ret) >> + ret = put_user(0, &infop->si_uid); >> + if (!ret) >> + ret = put_user(0, &infop->si_status); >> + } >> + >> /* avoid REGPARM breakage on x86: */ >> asmlinkage_protect(5, ret, which, upid, infop, options, ru); >> return ret; >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c >> index e14a232..e8d4da6 100644 >> --- a/kernel/sys_ni.c >> +++ b/kernel/sys_ni.c >> @@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set); >> cond_syscall(sys_ioprio_get); >> >> /* New file descriptors */ >> +cond_syscall(sys_waitfd); >> cond_syscall(sys_signalfd); >> cond_syscall(sys_signalfd4); >> cond_syscall(compat_sys_signalfd); >> Thanks for the review. --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RFC PATCH v2] waitfd 2009-01-06 18:45 ` Casey Dahlin @ 2009-01-06 18:50 ` Randy Dunlap 0 siblings, 0 replies; 50+ messages in thread From: Randy Dunlap @ 2009-01-06 18:50 UTC (permalink / raw) To: Casey Dahlin; +Cc: Linux Kernel Casey Dahlin wrote: > Randy Dunlap wrote: >> Casey Dahlin wrote: >> >>> Linux now exposes signals, timers, and events via file descriptors >>> through signalfd, timerfd, and eventfd. This means programmers can use a >>> single select/[e]poll call to monitor all change in their program. This >>> patch aims to expose child death via the same mechanism. >>> >>> waitfd provides a file descriptor out of which may be read a series of >>> siginfo_t objects describing child death. A child process is reaped as >>> soon as its information is read. This means child monitoring too can be >>> performed with that same poll call. >>> >>> Patch is against v2.6.28 >>> >>> --CJD >>> >>> diff --git a/arch/x86/include/asm/unistd_32.h >>> b/arch/x86/include/asm/unistd_32.h >>> index f2bba78..134d83c 100644 >>> --- a/arch/x86/include/asm/unistd_32.h >>> +++ b/arch/x86/include/asm/unistd_32.h >>> @@ -338,6 +338,7 @@ >>> #define __NR_dup3 330 >>> #define __NR_pipe2 331 >>> #define __NR_inotify_init1 332 >>> +#define __NR_waitfd 333 >>> >>> #ifdef __KERNEL__ >>> >>> diff --git a/arch/x86/include/asm/unistd_64.h >>> b/arch/x86/include/asm/unistd_64.h >>> index d2e415e..b28eb07 100644 >>> --- a/arch/x86/include/asm/unistd_64.h >>> +++ b/arch/x86/include/asm/unistd_64.h >>> @@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3) >>> __SYSCALL(__NR_pipe2, sys_pipe2) >>> #define __NR_inotify_init1 294 >>> __SYSCALL(__NR_inotify_init1, sys_inotify_init1) >>> +#define __NR_waitfd 295 >>> +__SYSCALL(__NR_waitfd, sys_waitfd) >>> >>> >> >> Only for x86?? >> >> > > At the moment. I should have mentioned this earlier but I haven't made > the syscall table entries for archs I don't test on. That will change > once the rest of the change has settled out. >>> diff --git a/fs/waitfd.c b/fs/waitfd.c >>> new file mode 100644 >>> index 0000000..0155a83 >>> --- /dev/null >>> +++ b/fs/waitfd.c >>> @@ -0,0 +1,117 @@ >>> +/* >>> + * fs/waitfd.c >>> + * >>> + * Copyright (C) 2008 Red Hat, Casey Dahlin <cdahlin@redhat.com> >>> + * >>> + * Largely derived from fs/signalfd.c >>> + */ >>> + >>> +#include <linux/file.h> >>> +#include <linux/poll.h> >>> +#include <linux/init.h> >>> +#include <linux/fs.h> >>> +#include <linux/sched.h> >>> +#include <linux/kernel.h> >>> +#include <linux/signal.h> >>> +#include <linux/list.h> >>> +#include <linux/anon_inodes.h> >>> +#include <linux/syscalls.h> >>> + >>> +long do_waitid(int which, pid_t upid, >>> + struct siginfo __user *infop, int options, >>> + struct rusage __user *ru); >>> + >>> +struct waitfd_ctx { >>> + int ops; >>> + int which; >>> + pid_t upid; >>> +}; >>> + >>> >> >> Please use kernel coding style: use tabs to indent, not >> <lots-of-spaces>, >> and struct members, functions, etc., are indented by one tab stop >> minimum. >> >> > > Damnit. This is a mailer artifact. This is the first time thunderbird > has eaten a patch on me. I'll look in to it. OK. You can see if Documentation/email-clients.txt helps you any. >>> +} >>> + >>> +static const struct file_operations waitfd_fops = { >>> + .release = waitfd_release, >>> + .poll = waitfd_poll, >>> + .read = waitfd_read, >>> +}; >>> + >>> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int >>> unused) >>> +{ >>> + int ufd; >>> + struct waitfd_ctx *ctx; >>> + >>> + /* Just to make sure we don't end up with a sys_waitfd4 */ >>> + (void)unused; >>> + >>> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) >>> + return -EINVAL; >>> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) >>> + return -EINVAL; >>> >> >> Use spaces around '|'. >> >> > > Those 4 lines are copied almost exactly from kernel/exit.c. Is there > motivation to keep them consistent? I would say no. -- ~Randy ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RFC PATCH v2] waitfd 2009-01-06 18:11 [RFC PATCH v2] waitfd Casey Dahlin 2009-01-06 18:27 ` Alan Cox 2009-01-06 18:31 ` Randy Dunlap @ 2009-01-06 18:48 ` Andi Kleen 2009-01-06 19:07 ` [RESEND][RFC " Casey Dahlin 3 siblings, 0 replies; 50+ messages in thread From: Andi Kleen @ 2009-01-06 18:48 UTC (permalink / raw) To: Casey Dahlin; +Cc: Linux Kernel Casey Dahlin <cdahlin@redhat.com> writes: > Linux now exposes signals, timers, and events via file descriptors > through signalfd, timerfd, and eventfd. This means programmers can use > a single select/[e]poll call to monitor all change in their > program. This patch aims to expose child death via the same mechanism. > > waitfd provides a file descriptor out of which may be read a series of > siginfo_t objects describing child death. A child process is reaped as > soon as its information is read. This means child monitoring too can > be performed with that same poll call. > > Patch is against v2.6.28 Can you please provide a manpage with the exact proposed semantics of the interface. Without that it's very hard to evaluate if the interface makes sense or not. In general a single paragraph is not enough to justify a new system call. -Andi -- ak@linux.intel.com ^ permalink raw reply [flat|nested] 50+ messages in thread
* [RESEND][RFC PATCH v2] waitfd 2009-01-06 18:11 [RFC PATCH v2] waitfd Casey Dahlin ` (2 preceding siblings ...) 2009-01-06 18:48 ` Andi Kleen @ 2009-01-06 19:07 ` Casey Dahlin 2009-01-07 12:34 ` Ingo Molnar 3 siblings, 1 reply; 50+ messages in thread From: Casey Dahlin @ 2009-01-06 19:07 UTC (permalink / raw) To: Linux Kernel; +Cc: Randy Dunlap Same as last time, but the formatting should be right now. Original description: Linux now exposes signals, timers, and events via file descriptors through signalfd, timerfd, and eventfd. This means programmers can use a single select/[e]poll call to monitor all change in their program. This patch aims to expose child death via the same mechanism. waitfd provides a file descriptor out of which may be read a series of siginfo_t objects describing child death. A child process is reaped as soon as its information is read. This means child monitoring too can be performed with that same poll call. Patch is against v2.6.28 diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78..134d83c 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,7 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_waitfd 333 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d2e415e..b28eb07 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) +#define __NR_waitfd 295 +__SYSCALL(__NR_waitfd, sys_waitfd) #ifndef __NO_STUBS diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395f..c796a8b 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,4 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_waitfd diff --git a/fs/Makefile b/fs/Makefile index d9f8afe..74c31fb 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o +obj-$(CONFIG_WAITFD) += waitfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o diff --git a/fs/waitfd.c b/fs/waitfd.c new file mode 100644 index 0000000..0155a83 --- /dev/null +++ b/fs/waitfd.c @@ -0,0 +1,117 @@ +/* + * fs/waitfd.c + * + * Copyright (C) 2008 Red Hat, Casey Dahlin <cdahlin@redhat.com> + * + * Largely derived from fs/signalfd.c + */ + +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/list.h> +#include <linux/anon_inodes.h> +#include <linux/syscalls.h> + +long do_waitid(int which, pid_t upid, + struct siginfo __user *infop, int options, + struct rusage __user *ru); + +struct waitfd_ctx { + int ops; + int which; + pid_t upid; +}; + +static int waitfd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static unsigned int waitfd_poll(struct file *file, poll_table *wait) +{ + struct waitfd_ctx *ctx = file->private_data; + long value; + + poll_wait(file, ¤t->signal->wait_chldexit, wait); + + value = do_waitid(ctx->which, ctx->upid, NULL, + ctx->ops | WNOHANG | WNOWAIT, NULL); + if (value > 0 || value == -ECHILD) + return POLLIN; + + return 0; +} + +/* + * Returns a multiple of the size of a struct siginfo, or a negative + * error code. The "count" parameter must be at least sizeof(struct siginfo) + */ +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct waitfd_ctx *ctx = file->private_data; + struct siginfo __user *info_addr = (struct siginfo *)buf; + int flags = ctx->ops; + ssize_t ret, total = 0; + + count /= sizeof(struct siginfo); + if (!count) + return -EINVAL; + + do { + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL); + if (ret == 0) + ret = -EAGAIN; + if (ret == -ECHILD) + ret = 0; + if (ret <= 0) + break; + + info_addr++; + total += sizeof(struct siginfo); + flags |= WNOHANG; + } while (--count); + + return total ? total: ret; +} + +static const struct file_operations waitfd_fops = { + .release = waitfd_release, + .poll = waitfd_poll, + .read = waitfd_read, +}; + +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) +{ + int ufd; + struct waitfd_ctx *ctx; + + /* Just to make sure we don't end up with a sys_waitfd4 */ + (void)unused; + + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) + return -EINVAL; + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) + return -EINVAL; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ops = options; + ctx->upid = upid; + ctx->which = which; + + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, + (options & WNOHANG) ? O_NONBLOCK : 0); + if (ufd < 0) + kfree(ctx); + + return ufd; +} diff --git a/init/Kconfig b/init/Kconfig index f763762..bc34871 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -683,6 +683,16 @@ config EPOLL Disabling this option will cause the kernel to be built without support for epoll family of system calls. +config WAITFD + bool "Enable waitfd() system call" if EMBEDDED + select ANON_INODES + default y + help + Enable the waitfd() system call that allows receving child state + changes from a file descriptor. + + If unsure, say Y. + config SIGNALFD bool "Enable signalfd() system call" if EMBEDDED select ANON_INODES diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7e..b53e8ba 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; put_task_struct(p); - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; @@ -1727,35 +1729,12 @@ repeat: end: current->state = TASK_RUNNING; remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (infop) { - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } return retval; } -asmlinkage long sys_waitid(int which, pid_t upid, - struct siginfo __user *infop, int options, - struct rusage __user *ru) +long do_waitid(int which, pid_t upid, + struct siginfo __user *infop, int options, + struct rusage __user *ru) { struct pid *pid = NULL; enum pid_type type; @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid, ret = do_wait(type, pid, options, infop, NULL, ru); put_pid(pid); + return ret; +} + +asmlinkage long sys_waitid(int which, pid_t upid, + struct siginfo __user *infop, int options, + struct rusage __user *ru) +{ + long ret; + + ret = do_waitid(which, upid, infop, options, ru); + + if (ret > 0) + ret = 0; + else { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + /* avoid REGPARM breakage on x86: */ asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e14a232..e8d4da6 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set); cond_syscall(sys_ioprio_get); /* New file descriptors */ +cond_syscall(sys_waitfd); cond_syscall(sys_signalfd); cond_syscall(sys_signalfd4); cond_syscall(compat_sys_signalfd); ^ permalink raw reply related [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-06 19:07 ` [RESEND][RFC " Casey Dahlin @ 2009-01-07 12:34 ` Ingo Molnar 2009-01-07 13:05 ` Casey Dahlin ` (2 more replies) 0 siblings, 3 replies; 50+ messages in thread From: Ingo Molnar @ 2009-01-07 12:34 UTC (permalink / raw) To: Casey Dahlin Cc: Linux Kernel, Randy Dunlap, Roland McGrath, Oleg Nesterov, Davide Libenzi, Peter Zijlstra (Cc:-ed a few more folks who might be interested in this) * Casey Dahlin <cdahlin@redhat.com> wrote: > Original description: > > Linux now exposes signals, timers, and events via file descriptors > through signalfd, timerfd, and eventfd. This means programmers can use a > single select/[e]poll call to monitor all change in their program. This > patch aims to expose child death via the same mechanism. > > waitfd provides a file descriptor out of which may be read a series of > siginfo_t objects describing child death. A child process is reaped as > soon as its information is read. This means child monitoring too can be > performed with that same poll call. > > Patch is against v2.6.28 The principle looks sound and acceptable - to complete the epoll mechanism to all event sources in the system. There's a few small (mostly stylistic) details though: > index 0000000..0155a83 > --- /dev/null > +++ b/fs/waitfd.c > +#include <linux/anon_inodes.h> > +#include <linux/syscalls.h> > + > +long do_waitid(int which, pid_t upid, > + struct siginfo __user *infop, int options, > + struct rusage __user *ru); please move this prototype into sched.h. > +struct waitfd_ctx { > + int ops; > + int which; > + pid_t upid; > +}; please align structure fields vertically, similarly to how you aligned other definitions in your patch. Something like: > +struct waitfd_ctx { > + int ops; > + int which; > + pid_t upid; > +}; (otherwise it looks all a bit too crammed together) > +static int waitfd_release(struct inode *inode, struct file *file) > +{ > + kfree(file->private_data); > + return 0; > +} > + > +static unsigned int waitfd_poll(struct file *file, poll_table *wait) > +{ > + struct waitfd_ctx *ctx = file->private_data; > + long value; > + > + poll_wait(file, ¤t->signal->wait_chldexit, wait); > + > + value = do_waitid(ctx->which, ctx->upid, NULL, > + ctx->ops | WNOHANG | WNOWAIT, NULL); > + if (value > 0 || value == -ECHILD) > + return POLLIN; > + > + return 0; > +} > + > +/* > + * Returns a multiple of the size of a struct siginfo, or a negative > + * error code. The "count" parameter must be at least sizeof(struct siginfo) > + */ > +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count, > + loff_t *ppos) > +{ > + struct waitfd_ctx *ctx = file->private_data; > + struct siginfo __user *info_addr = (struct siginfo *)buf; > + int flags = ctx->ops; > + ssize_t ret, total = 0; > + > + count /= sizeof(struct siginfo); > + if (!count) > + return -EINVAL; > + > + do { > + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL); > + if (ret == 0) > + ret = -EAGAIN; > + if (ret == -ECHILD) > + ret = 0; > + if (ret <= 0) > + break; > + > + info_addr++; > + total += sizeof(struct siginfo); > + flags |= WNOHANG; > + } while (--count); > + > + return total ? total: ret; please use symmetric spacing, i.e.: > + return total ? total : ret; > +} > + > +static const struct file_operations waitfd_fops = { > + .release = waitfd_release, > + .poll = waitfd_poll, > + .read = waitfd_read, > +}; > + > +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) > +{ > + int ufd; > + struct waitfd_ctx *ctx; > + > + /* Just to make sure we don't end up with a sys_waitfd4 */ > + (void)unused; looks a bit silly ... > + > + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) > + return -EINVAL; > + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) > + return -EINVAL; > + > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); > + if (!ctx) > + return -ENOMEM; > + > + ctx->ops = options; > + ctx->upid = upid; > + ctx->which = which; > + > + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, > + (options & WNOHANG) ? O_NONBLOCK : 0); > + if (ufd < 0) > + kfree(ctx); > + > + return ufd; > +} > diff --git a/init/Kconfig b/init/Kconfig > index f763762..bc34871 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -683,6 +683,16 @@ config EPOLL > Disabling this option will cause the kernel to be built without > support for epoll family of system calls. > > +config WAITFD > + bool "Enable waitfd() system call" if EMBEDDED > + select ANON_INODES > + default y > + help > + Enable the waitfd() system call that allows receving child state typo. > + changes from a file descriptor. > + > + If unsure, say Y. > + > config SIGNALFD > bool "Enable signalfd() system call" if EMBEDDED > select ANON_INODES > diff --git a/kernel/exit.c b/kernel/exit.c > index 2d8be7e..b53e8ba 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, > int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; > > put_task_struct(p); > - if (!retval) > - retval = put_user(SIGCHLD, &infop->si_signo); > - if (!retval) > - retval = put_user(0, &infop->si_errno); > - if (!retval) > - retval = put_user((short)why, &infop->si_code); > - if (!retval) > - retval = put_user(pid, &infop->si_pid); > - if (!retval) > - retval = put_user(uid, &infop->si_uid); > - if (!retval) > - retval = put_user(status, &infop->si_status); > + if (infop) { > + if (!retval) > + retval = put_user(SIGCHLD, &infop->si_signo); > + if (!retval) > + retval = put_user(0, &infop->si_errno); > + if (!retval) > + retval = put_user((short)why, &infop->si_code); > + if (!retval) > + retval = put_user(pid, &infop->si_pid); > + if (!retval) > + retval = put_user(uid, &infop->si_uid); > + if (!retval) > + retval = put_user(status, &infop->si_status); > + } > if (!retval) > retval = pid; > return retval; > @@ -1727,35 +1729,12 @@ repeat: > end: > current->state = TASK_RUNNING; > remove_wait_queue(¤t->signal->wait_chldexit,&wait); > - if (infop) { > - if (retval > 0) > - retval = 0; > - else { > - /* > - * For a WNOHANG return, clear out all the fields > - * we would set so the user can easily tell the > - * difference. > - */ > - if (!retval) > - retval = put_user(0, &infop->si_signo); > - if (!retval) > - retval = put_user(0, &infop->si_errno); > - if (!retval) > - retval = put_user(0, &infop->si_code); > - if (!retval) > - retval = put_user(0, &infop->si_pid); > - if (!retval) > - retval = put_user(0, &infop->si_uid); > - if (!retval) > - retval = put_user(0, &infop->si_status); > - } > - } > return retval; > } > > -asmlinkage long sys_waitid(int which, pid_t upid, > - struct siginfo __user *infop, int options, > - struct rusage __user *ru) > +long do_waitid(int which, pid_t upid, > + struct siginfo __user *infop, int options, > + struct rusage __user *ru) > { > struct pid *pid = NULL; > enum pid_type type; > @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid, > ret = do_wait(type, pid, options, infop, NULL, ru); > put_pid(pid); > > + return ret; > +} > + > +asmlinkage long sys_waitid(int which, pid_t upid, > + struct siginfo __user *infop, int options, > + struct rusage __user *ru) > +{ > + long ret; > + > + ret = do_waitid(which, upid, infop, options, ru); > + > + if (ret > 0) > + ret = 0; > + else { > + /* > + * For a WNOHANG return, clear out all the fields > + * we would set so the user can easily tell the > + * difference. > + */ > + if (!ret) > + ret = put_user(0, &infop->si_signo); > + if (!ret) > + ret = put_user(0, &infop->si_errno); > + if (!ret) > + ret = put_user(0, &infop->si_code); > + if (!ret) > + ret = put_user(0, &infop->si_pid); > + if (!ret) > + ret = put_user(0, &infop->si_uid); > + if (!ret) > + ret = put_user(0, &infop->si_status); even if this just moves existing code around, if we touch this, it would be far cleaner (and faster as well) to do what other bits of the signal code do: > + ret = put_user(0, &infop->si_signo); > + ret |= put_user(0, &infop->si_errno); > + ret |= put_user(0, &infop->si_code); > + ret |= put_user(0, &infop->si_pid); > + ret |= put_user(0, &infop->si_uid); > + ret |= put_user(0, &infop->si_status); Since put_user() can only return -EFAULT or zero. (same for wait_noreap_copyout()) Ingo ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 12:34 ` Ingo Molnar @ 2009-01-07 13:05 ` Casey Dahlin 2009-01-07 15:00 ` Ingo Molnar 2009-01-07 17:19 ` Oleg Nesterov 2009-01-07 20:53 ` Roland McGrath 2 siblings, 1 reply; 50+ messages in thread From: Casey Dahlin @ 2009-01-07 13:05 UTC (permalink / raw) To: Ingo Molnar Cc: Linux Kernel, Randy Dunlap, Roland McGrath, Oleg Nesterov, Davide Libenzi, Peter Zijlstra Ingo Molnar wrote: > (Cc:-ed a few more folks who might be interested in this) > > * Casey Dahlin <cdahlin@redhat.com> wrote: > >> Original description: >> >> Linux now exposes signals, timers, and events via file descriptors >> through signalfd, timerfd, and eventfd. This means programmers can use a >> single select/[e]poll call to monitor all change in their program. This >> patch aims to expose child death via the same mechanism. >> >> waitfd provides a file descriptor out of which may be read a series of >> siginfo_t objects describing child death. A child process is reaped as >> soon as its information is read. This means child monitoring too can be >> performed with that same poll call. >> >> Patch is against v2.6.28 > > The principle looks sound and acceptable - to complete the epoll mechanism > to all event sources in the system. There's a few small (mostly stylistic) > details though: > >> index 0000000..0155a83 >> --- /dev/null >> +++ b/fs/waitfd.c > >> +#include <linux/anon_inodes.h> >> +#include <linux/syscalls.h> >> + >> +long do_waitid(int which, pid_t upid, >> + struct siginfo __user *infop, int options, >> + struct rusage __user *ru); > > please move this prototype into sched.h. > Sure thing. Knew it shouldn't be here, just didn't know where it went. I'm surprised you're the first to have complained :) >> +struct waitfd_ctx { >> + int ops; >> + int which; >> + pid_t upid; >> +}; > > please align structure fields vertically, similarly to how you aligned > other definitions in your patch. Something like: > >> +struct waitfd_ctx { >> + int ops; >> + int which; >> + pid_t upid; >> +}; > > (otherwise it looks all a bit too crammed together) > >> +static int waitfd_release(struct inode *inode, struct file *file) >> +{ >> + kfree(file->private_data); >> + return 0; >> +} >> + >> +static unsigned int waitfd_poll(struct file *file, poll_table *wait) >> +{ >> + struct waitfd_ctx *ctx = file->private_data; >> + long value; >> + >> + poll_wait(file, ¤t->signal->wait_chldexit, wait); >> + >> + value = do_waitid(ctx->which, ctx->upid, NULL, >> + ctx->ops | WNOHANG | WNOWAIT, NULL); >> + if (value > 0 || value == -ECHILD) >> + return POLLIN; >> + >> + return 0; >> +} >> + >> +/* >> + * Returns a multiple of the size of a struct siginfo, or a negative >> + * error code. The "count" parameter must be at least sizeof(struct siginfo) >> + */ >> +static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count, >> + loff_t *ppos) >> +{ >> + struct waitfd_ctx *ctx = file->private_data; >> + struct siginfo __user *info_addr = (struct siginfo *)buf; >> + int flags = ctx->ops; >> + ssize_t ret, total = 0; >> + >> + count /= sizeof(struct siginfo); >> + if (!count) >> + return -EINVAL; >> + >> + do { >> + ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL); >> + if (ret == 0) >> + ret = -EAGAIN; >> + if (ret == -ECHILD) >> + ret = 0; >> + if (ret <= 0) >> + break; >> + >> + info_addr++; >> + total += sizeof(struct siginfo); >> + flags |= WNOHANG; >> + } while (--count); >> + >> + return total ? total: ret; > > please use symmetric spacing, i.e.: > >> + return total ? total : ret; > >> +} >> + >> +static const struct file_operations waitfd_fops = { >> + .release = waitfd_release, >> + .poll = waitfd_poll, >> + .read = waitfd_read, >> +}; >> + >> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) >> +{ >> + int ufd; >> + struct waitfd_ctx *ctx; >> + >> + /* Just to make sure we don't end up with a sys_waitfd4 */ >> + (void)unused; > > looks a bit silly ... Do you mean the principle of having an unused argument around for future use or the cast to void? The cast to void is there to suppress the "Waning: unused argument" messages and make gcc happy. >> + >> + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) >> + return -EINVAL; >> + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) >> + return -EINVAL; >> + >> + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); >> + if (!ctx) >> + return -ENOMEM; >> + >> + ctx->ops = options; >> + ctx->upid = upid; >> + ctx->which = which; >> + >> + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, >> + (options & WNOHANG) ? O_NONBLOCK : 0); >> + if (ufd < 0) >> + kfree(ctx); >> + >> + return ufd; >> +} >> diff --git a/init/Kconfig b/init/Kconfig >> index f763762..bc34871 100644 >> --- a/init/Kconfig >> +++ b/init/Kconfig >> @@ -683,6 +683,16 @@ config EPOLL >> Disabling this option will cause the kernel to be built without >> support for epoll family of system calls. >> >> +config WAITFD >> + bool "Enable waitfd() system call" if EMBEDDED >> + select ANON_INODES >> + default y >> + help >> + Enable the waitfd() system call that allows receving child state > > typo. > >> + changes from a file descriptor. >> + >> + If unsure, say Y. >> + >> config SIGNALFD >> bool "Enable signalfd() system call" if EMBEDDED >> select ANON_INODES >> diff --git a/kernel/exit.c b/kernel/exit.c >> index 2d8be7e..b53e8ba 100644 >> --- a/kernel/exit.c >> +++ b/kernel/exit.c >> @@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, >> int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; >> >> put_task_struct(p); >> - if (!retval) >> - retval = put_user(SIGCHLD, &infop->si_signo); >> - if (!retval) >> - retval = put_user(0, &infop->si_errno); >> - if (!retval) >> - retval = put_user((short)why, &infop->si_code); >> - if (!retval) >> - retval = put_user(pid, &infop->si_pid); >> - if (!retval) >> - retval = put_user(uid, &infop->si_uid); >> - if (!retval) >> - retval = put_user(status, &infop->si_status); >> + if (infop) { >> + if (!retval) >> + retval = put_user(SIGCHLD, &infop->si_signo); >> + if (!retval) >> + retval = put_user(0, &infop->si_errno); >> + if (!retval) >> + retval = put_user((short)why, &infop->si_code); >> + if (!retval) >> + retval = put_user(pid, &infop->si_pid); >> + if (!retval) >> + retval = put_user(uid, &infop->si_uid); >> + if (!retval) >> + retval = put_user(status, &infop->si_status); >> + } >> if (!retval) >> retval = pid; >> return retval; >> @@ -1727,35 +1729,12 @@ repeat: >> end: >> current->state = TASK_RUNNING; >> remove_wait_queue(¤t->signal->wait_chldexit,&wait); >> - if (infop) { >> - if (retval > 0) >> - retval = 0; >> - else { >> - /* >> - * For a WNOHANG return, clear out all the fields >> - * we would set so the user can easily tell the >> - * difference. >> - */ >> - if (!retval) >> - retval = put_user(0, &infop->si_signo); >> - if (!retval) >> - retval = put_user(0, &infop->si_errno); >> - if (!retval) >> - retval = put_user(0, &infop->si_code); >> - if (!retval) >> - retval = put_user(0, &infop->si_pid); >> - if (!retval) >> - retval = put_user(0, &infop->si_uid); >> - if (!retval) >> - retval = put_user(0, &infop->si_status); >> - } >> - } >> return retval; >> } >> >> -asmlinkage long sys_waitid(int which, pid_t upid, >> - struct siginfo __user *infop, int options, >> - struct rusage __user *ru) >> +long do_waitid(int which, pid_t upid, >> + struct siginfo __user *infop, int options, >> + struct rusage __user *ru) >> { >> struct pid *pid = NULL; >> enum pid_type type; >> @@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid, >> ret = do_wait(type, pid, options, infop, NULL, ru); >> put_pid(pid); >> >> + return ret; >> +} >> + >> +asmlinkage long sys_waitid(int which, pid_t upid, >> + struct siginfo __user *infop, int options, >> + struct rusage __user *ru) >> +{ >> + long ret; >> + >> + ret = do_waitid(which, upid, infop, options, ru); >> + >> + if (ret > 0) >> + ret = 0; >> + else { >> + /* >> + * For a WNOHANG return, clear out all the fields >> + * we would set so the user can easily tell the >> + * difference. >> + */ >> + if (!ret) >> + ret = put_user(0, &infop->si_signo); >> + if (!ret) >> + ret = put_user(0, &infop->si_errno); >> + if (!ret) >> + ret = put_user(0, &infop->si_code); >> + if (!ret) >> + ret = put_user(0, &infop->si_pid); >> + if (!ret) >> + ret = put_user(0, &infop->si_uid); >> + if (!ret) >> + ret = put_user(0, &infop->si_status); > > even if this just moves existing code around, if we touch this, it would > be far cleaner (and faster as well) to do what other bits of the signal > code do: > >> + ret = put_user(0, &infop->si_signo); >> + ret |= put_user(0, &infop->si_errno); >> + ret |= put_user(0, &infop->si_code); >> + ret |= put_user(0, &infop->si_pid); >> + ret |= put_user(0, &infop->si_uid); >> + ret |= put_user(0, &infop->si_status); > > Since put_user() can only return -EFAULT or zero. > > (same for wait_noreap_copyout()) > I'll change this in both of those places. I'm hoping to find time at some point to submit some general cleanups to do_wait. There's a lot about it I don't like. --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 13:05 ` Casey Dahlin @ 2009-01-07 15:00 ` Ingo Molnar 0 siblings, 0 replies; 50+ messages in thread From: Ingo Molnar @ 2009-01-07 15:00 UTC (permalink / raw) To: Casey Dahlin Cc: Linux Kernel, Randy Dunlap, Roland McGrath, Oleg Nesterov, Davide Libenzi, Peter Zijlstra * Casey Dahlin <cdahlin@redhat.com> wrote: > >> +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) > >> +{ > >> + int ufd; > >> + struct waitfd_ctx *ctx; > >> + > >> + /* Just to make sure we don't end up with a sys_waitfd4 */ > >> + (void)unused; > > > > looks a bit silly ... > > > Do you mean the principle of having an unused argument around for future > use or the cast to void? The cast to void is there to suppress the > "Waning: unused argument" messages and make gcc happy. gcc will not warn about unused function arguments - only about unused local variables. The 'unused' argument should either be removed altogether, or replaced with a properly named parameter and a check returning -ENOSYS if the argument is not zero (or something like that). (It's generally better to keep such syscalls extensible via such trivial means, if there's a remote chance for ever needing to extend that syscall.) Ingo ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 12:34 ` Ingo Molnar 2009-01-07 13:05 ` Casey Dahlin @ 2009-01-07 17:19 ` Oleg Nesterov 2009-01-07 17:24 ` Ingo Molnar ` (2 more replies) 2009-01-07 20:53 ` Roland McGrath 2 siblings, 3 replies; 50+ messages in thread From: Oleg Nesterov @ 2009-01-07 17:19 UTC (permalink / raw) To: Ingo Molnar Cc: Casey Dahlin, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra On 01/07, Ingo Molnar wrote: > > (Cc:-ed a few more folks who might be interested in this) > > * Casey Dahlin <cdahlin@redhat.com> wrote: > > > +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) > > +{ > > + int ufd; > > + struct waitfd_ctx *ctx; > > + > > + /* Just to make sure we don't end up with a sys_waitfd4 */ > > + (void)unused; > > looks a bit silly ... > > > + > > + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) > > + return -EINVAL; > > + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) > > + return -EINVAL; > > + > > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); > > + if (!ctx) > > + return -ENOMEM; > > + > > + ctx->ops = options; > > + ctx->upid = upid; > > + ctx->which = which; > > + > > + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, > > + (options & WNOHANG) ? O_NONBLOCK : 0); minor nit... Please note that unlike other sys_...fd() syscalls, sys_waitfd() doesn't allow to pass O_CLOEXEC. Looks like we need a separate "flags" argument... Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on waitfd, not very good. I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat (->f_flags & O_NONBLOCK) as WNOHANG. (can't resist, ->ops is not the best name ;) Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 17:19 ` Oleg Nesterov @ 2009-01-07 17:24 ` Ingo Molnar 2009-01-07 17:52 ` Davide Libenzi 2009-01-10 14:47 ` Scott James Remnant 2 siblings, 0 replies; 50+ messages in thread From: Ingo Molnar @ 2009-01-07 17:24 UTC (permalink / raw) To: Oleg Nesterov Cc: Casey Dahlin, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra * Oleg Nesterov <oleg@redhat.com> wrote: > On 01/07, Ingo Molnar wrote: > > > > (Cc:-ed a few more folks who might be interested in this) > > > > * Casey Dahlin <cdahlin@redhat.com> wrote: > > > > > +asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused) > > > +{ > > > + int ufd; > > > + struct waitfd_ctx *ctx; > > > + > > > + /* Just to make sure we don't end up with a sys_waitfd4 */ > > > + (void)unused; > > > > looks a bit silly ... > > > > > + > > > + if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED)) > > > + return -EINVAL; > > > + if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) > > > + return -EINVAL; > > > + > > > + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); > > > + if (!ctx) > > > + return -ENOMEM; > > > + > > > + ctx->ops = options; > > > + ctx->upid = upid; > > > + ctx->which = which; > > > + > > > + ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx, > > > + (options & WNOHANG) ? O_NONBLOCK : 0); > > minor nit... > > Please note that unlike other sys_...fd() syscalls, sys_waitfd() > doesn't allow to pass O_CLOEXEC. Looks like we need a separate > "flags" argument... > > Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on > waitfd, not very good. > > I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat > (->f_flags & O_NONBLOCK) as WNOHANG. > > (can't resist, ->ops is not the best name ;) yeah, ->ops should be ->options. The name ->ops is generally use for method vectors and so. Ingo ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 17:19 ` Oleg Nesterov 2009-01-07 17:24 ` Ingo Molnar @ 2009-01-07 17:52 ` Davide Libenzi 2009-01-07 20:38 ` Casey Dahlin 2009-01-10 14:47 ` Scott James Remnant 2 siblings, 1 reply; 50+ messages in thread From: Davide Libenzi @ 2009-01-07 17:52 UTC (permalink / raw) To: Oleg Nesterov Cc: Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Roland McGrath, Peter Zijlstra On Wed, 7 Jan 2009, Oleg Nesterov wrote: > minor nit... > > Please note that unlike other sys_...fd() syscalls, sys_waitfd() > doesn't allow to pass O_CLOEXEC. Looks like we need a separate > "flags" argument... I said this even during the first post. Not a minor nit unless we want to have a sys_waitfd2 soon after. > Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on > waitfd, not very good. Ack. > I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat > (->f_flags & O_NONBLOCK) as WNOHANG. Ack. > (can't resist, ->ops is not the best name ;) Ack. "flags" looks a pretty nice shot at it. - Davide ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 17:52 ` Davide Libenzi @ 2009-01-07 20:38 ` Casey Dahlin 0 siblings, 0 replies; 50+ messages in thread From: Casey Dahlin @ 2009-01-07 20:38 UTC (permalink / raw) To: Davide Libenzi Cc: Oleg Nesterov, Ingo Molnar, Linux Kernel, Randy Dunlap, Roland McGrath, Peter Zijlstra Davide Libenzi wrote: > On Wed, 7 Jan 2009, Oleg Nesterov wrote: > >> minor nit... >> >> Please note that unlike other sys_...fd() syscalls, sys_waitfd() >> doesn't allow to pass O_CLOEXEC. Looks like we need a separate >> "flags" argument... > > I said this even during the first post. Not a minor nit unless we want to > have a sys_waitfd2 soon after. > I don't think I properly understood the complaint last time you made it. Fix on the way --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 17:19 ` Oleg Nesterov 2009-01-07 17:24 ` Ingo Molnar 2009-01-07 17:52 ` Davide Libenzi @ 2009-01-10 14:47 ` Scott James Remnant 2009-01-10 21:14 ` Casey Dahlin 2 siblings, 1 reply; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 14:47 UTC (permalink / raw) To: Oleg Nesterov Cc: Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 847 bytes --] On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote: > Please note that unlike other sys_...fd() syscalls, sys_waitfd() > doesn't allow to pass O_CLOEXEC. Looks like we need a separate > "flags" argument... > > Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on > waitfd, not very good. > > I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat > (->f_flags & O_NONBLOCK) as WNOHANG. > > (can't resist, ->ops is not the best name ;) > Definitely agree here, waitfd() doesn't need WNOHANG - we already have ONONBLOCK. That also solves one of the strangest behaves of waitid when you use WNOHANG (it returns zero and you have to check whether it changed the struct), now you just read() - if no child you get EAGAIN, if a child you read a struct. Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 14:47 ` Scott James Remnant @ 2009-01-10 21:14 ` Casey Dahlin 2009-01-10 21:20 ` Scott James Remnant 2009-01-10 22:31 ` Oleg Nesterov 0 siblings, 2 replies; 50+ messages in thread From: Casey Dahlin @ 2009-01-10 21:14 UTC (permalink / raw) To: Scott James Remnant Cc: Oleg Nesterov, Ingo Molnar, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra Scott James Remnant wrote: > On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote: > > >> Please note that unlike other sys_...fd() syscalls, sys_waitfd() >> doesn't allow to pass O_CLOEXEC. Looks like we need a separate >> "flags" argument... >> >> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on >> waitfd, not very good. >> >> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat >> (->f_flags & O_NONBLOCK) as WNOHANG. >> >> (can't resist, ->ops is not the best name ;) >> >> > Definitely agree here, waitfd() doesn't need WNOHANG - we already have > ONONBLOCK. > > That also solves one of the strangest behaves of waitid when you use > WNOHANG (it returns zero and you have to check whether it changed the > struct), now you just read() - if no child you get EAGAIN, if a child > you read a struct. > > Scott > From the perspective of waitfd, the only difference between WNOHANG and O_NONBLOCK is which argument you put the flags in. The API should only support one or the other, but internally they would imply the same thing. --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 21:14 ` Casey Dahlin @ 2009-01-10 21:20 ` Scott James Remnant 2009-01-10 22:08 ` Casey Dahlin 2009-01-10 22:31 ` Oleg Nesterov 1 sibling, 1 reply; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 21:20 UTC (permalink / raw) To: Casey Dahlin Cc: Oleg Nesterov, Ingo Molnar, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 522 bytes --] On Sat, 2009-01-10 at 16:14 -0500, Casey Dahlin wrote: > From the perspective of waitfd, the only difference between WNOHANG and > O_NONBLOCK is which argument you put the flags in. The API should only > support one or the other, but internally they would imply the same thing. > Well, you get O_NONBLOCK for free by having a file descriptor; and you can't turn off people trying to turn it on/off with fcntl() - so you may as well just use that, no? :-) Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 21:20 ` Scott James Remnant @ 2009-01-10 22:08 ` Casey Dahlin 0 siblings, 0 replies; 50+ messages in thread From: Casey Dahlin @ 2009-01-10 22:08 UTC (permalink / raw) To: Scott James Remnant Cc: Oleg Nesterov, Ingo Molnar, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra Scott James Remnant wrote: > On Sat, 2009-01-10 at 16:14 -0500, Casey Dahlin wrote: > > >> From the perspective of waitfd, the only difference between WNOHANG and >> O_NONBLOCK is which argument you put the flags in. The API should only >> support one or the other, but internally they would imply the same thing. >> >> > Well, you get O_NONBLOCK for free by having a file descriptor; and you > can't turn off people trying to turn it on/off with fcntl() - so you may > as well just use that, no? :-) > > Scott > Its purely an api question. We could easily take the WNOHANG flag and just unset it when we get it and set O_NONBLOCK instead. We need O_CLOEXEC anyway though, and the only reason to do it would be to get rid of the O_ options and take only one type of flag (that and just a little more waitid consistency). --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 21:14 ` Casey Dahlin 2009-01-10 21:20 ` Scott James Remnant @ 2009-01-10 22:31 ` Oleg Nesterov 2009-01-10 22:37 ` Casey Dahlin 1 sibling, 1 reply; 50+ messages in thread From: Oleg Nesterov @ 2009-01-10 22:31 UTC (permalink / raw) To: Casey Dahlin Cc: Scott James Remnant, Ingo Molnar, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra On 01/10, Casey Dahlin wrote: > > Scott James Remnant wrote: >> On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote: >> >>> Please note that unlike other sys_...fd() syscalls, sys_waitfd() >>> doesn't allow to pass O_CLOEXEC. Looks like we need a separate >>> "flags" argument... >>> >>> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on >>> waitfd, not very good. >>> >>> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat >>> (->f_flags & O_NONBLOCK) as WNOHANG. >>> >>> (can't resist, ->ops is not the best name ;) >>> >>> >> Definitely agree here, waitfd() doesn't need WNOHANG - we already have >> ONONBLOCK. >> >> That also solves one of the strangest behaves of waitid when you use >> WNOHANG (it returns zero and you have to check whether it changed the >> struct), now you just read() - if no child you get EAGAIN, if a child >> you read a struct. >> > From the perspective of waitfd, the only difference between WNOHANG and > O_NONBLOCK is which argument you put the flags in. No. Please see the note about ioctl/fcntl above. Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 22:31 ` Oleg Nesterov @ 2009-01-10 22:37 ` Casey Dahlin 2009-01-10 22:46 ` Oleg Nesterov 0 siblings, 1 reply; 50+ messages in thread From: Casey Dahlin @ 2009-01-10 22:37 UTC (permalink / raw) To: Oleg Nesterov Cc: Scott James Remnant, Ingo Molnar, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra Oleg Nesterov wrote: > On 01/10, Casey Dahlin wrote: > >> Scott James Remnant wrote: >> >>> On Wed, 2009-01-07 at 18:19 +0100, Oleg Nesterov wrote: >>> >>> >>>> Please note that unlike other sys_...fd() syscalls, sys_waitfd() >>>> doesn't allow to pass O_CLOEXEC. Looks like we need a separate >>>> "flags" argument... >>>> >>>> Also, ioctl(FIONBIO) or fcntl(O_NONBLOCK) have no effect on >>>> waitfd, not very good. >>>> >>>> I'd suggest to remove WNOHANG from waitfd_ctx->ops and treat >>>> (->f_flags & O_NONBLOCK) as WNOHANG. >>>> >>>> (can't resist, ->ops is not the best name ;) >>>> >>>> >>>> >>> Definitely agree here, waitfd() doesn't need WNOHANG - we already have >>> ONONBLOCK. >>> >>> That also solves one of the strangest behaves of waitid when you use >>> WNOHANG (it returns zero and you have to check whether it changed the >>> struct), now you just read() - if no child you get EAGAIN, if a child >>> you read a struct. >>> >>> >> From the perspective of waitfd, the only difference between WNOHANG and >> O_NONBLOCK is which argument you put the flags in. >> > > No. Please see the note about ioctl/fcntl above. > > Oleg. > Yes but the actual waitfd call could simply set O_NONBLOCK on the descriptor when it receive WNOHANG in the flags, and read the descriptor flags going forward. --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 22:37 ` Casey Dahlin @ 2009-01-10 22:46 ` Oleg Nesterov 0 siblings, 0 replies; 50+ messages in thread From: Oleg Nesterov @ 2009-01-10 22:46 UTC (permalink / raw) To: Casey Dahlin Cc: Scott James Remnant, Ingo Molnar, Linux Kernel, Randy Dunlap, Roland McGrath, Davide Libenzi, Peter Zijlstra On 01/10, Casey Dahlin wrote: > > Oleg Nesterov wrote: >>> From the perspective of waitfd, the only difference between WNOHANG and >>> O_NONBLOCK is which argument you put the flags in. >> >> No. Please see the note about ioctl/fcntl above. >> >> Oleg. >> > Yes but the actual waitfd call could simply set O_NONBLOCK on the > descriptor when it receive WNOHANG in the flags, and read the descriptor > flags going forward. Ah, I misunderstood your message as if we shouldn't check f_flags at all. Yes sure. Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 12:34 ` Ingo Molnar 2009-01-07 13:05 ` Casey Dahlin 2009-01-07 17:19 ` Oleg Nesterov @ 2009-01-07 20:53 ` Roland McGrath 2009-01-07 20:58 ` Ingo Molnar ` (4 more replies) 2 siblings, 5 replies; 50+ messages in thread From: Roland McGrath @ 2009-01-07 20:53 UTC (permalink / raw) To: Ingo Molnar Cc: Casey Dahlin, Linux Kernel, Randy Dunlap, Oleg Nesterov, Davide Libenzi, Peter Zijlstra New syscall should have gone to linux-api, I think. Do we really need another one for this? How about using signalfd plus setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of SIGCHLD? It's slightly more magical for the userland process to know to do that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't really have to add, maybe better. Thanks, Roland ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 20:53 ` Roland McGrath @ 2009-01-07 20:58 ` Ingo Molnar 2009-01-07 21:05 ` Davide Libenzi 2009-01-07 21:02 ` Ulrich Drepper ` (3 subsequent siblings) 4 siblings, 1 reply; 50+ messages in thread From: Ingo Molnar @ 2009-01-07 20:58 UTC (permalink / raw) To: Roland McGrath Cc: Casey Dahlin, Linux Kernel, Randy Dunlap, Oleg Nesterov, Davide Libenzi, Peter Zijlstra * Roland McGrath <roland@redhat.com> wrote: > New syscall should have gone to linux-api, I think. > > Do we really need another one for this? How about using signalfd plus > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead > of SIGCHLD? It's slightly more magical for the userland process to know > to do that (fork -> clone SIGRTMIN). But compared to adding a syscall > we don't really have to add, maybe better. hm, i think it's cleaner conceptually than trying to wrap this into signalfd. Since we already have: #define __NR_signalfd 321 #define __NR_timerfd_create 322 #define __NR_timerfd_settime 325 #define __NR_timerfd_gettime 326 #define __NR_signalfd4 327 is one more really such an issue? Ingo ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 20:58 ` Ingo Molnar @ 2009-01-07 21:05 ` Davide Libenzi 2009-01-07 21:50 ` Ingo Molnar 0 siblings, 1 reply; 50+ messages in thread From: Davide Libenzi @ 2009-01-07 21:05 UTC (permalink / raw) To: Ingo Molnar Cc: Roland McGrath, Casey Dahlin, Linux Kernel, Randy Dunlap, Oleg Nesterov, Peter Zijlstra On Wed, 7 Jan 2009, Ingo Molnar wrote: > > * Roland McGrath <roland@redhat.com> wrote: > > > New syscall should have gone to linux-api, I think. > > > > Do we really need another one for this? How about using signalfd plus > > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead > > of SIGCHLD? It's slightly more magical for the userland process to know > > to do that (fork -> clone SIGRTMIN). But compared to adding a syscall > > we don't really have to add, maybe better. > > hm, i think it's cleaner conceptually than trying to wrap this into > signalfd. Since we already have: > > #define __NR_signalfd 321 > #define __NR_timerfd_create 322 > #define __NR_timerfd_settime 325 > #define __NR_timerfd_gettime 326 > #define __NR_signalfd4 327 > > is one more really such an issue? And what did eventfd do to you? :) I partially agree with Roland (and I stated this during Casey's first post), this can be achieved in a not too troublesome way already. A new dedicated interface is easier for the challenged userspace coder, but I dunno if it's worth the new code (although it does have little footprint). Both ways are fine from my POV. - Davide ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 21:05 ` Davide Libenzi @ 2009-01-07 21:50 ` Ingo Molnar 0 siblings, 0 replies; 50+ messages in thread From: Ingo Molnar @ 2009-01-07 21:50 UTC (permalink / raw) To: Davide Libenzi Cc: Roland McGrath, Casey Dahlin, Linux Kernel, Randy Dunlap, Oleg Nesterov, Peter Zijlstra * Davide Libenzi <davidel@xmailserver.org> wrote: > On Wed, 7 Jan 2009, Ingo Molnar wrote: > > > > > * Roland McGrath <roland@redhat.com> wrote: > > > > > New syscall should have gone to linux-api, I think. > > > > > > Do we really need another one for this? How about using signalfd plus > > > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead > > > of SIGCHLD? It's slightly more magical for the userland process to know > > > to do that (fork -> clone SIGRTMIN). But compared to adding a syscall > > > we don't really have to add, maybe better. > > > > hm, i think it's cleaner conceptually than trying to wrap this into > > signalfd. Since we already have: > > > > #define __NR_signalfd 321 > > #define __NR_timerfd_create 322 > > #define __NR_timerfd_settime 325 > > #define __NR_timerfd_gettime 326 > > #define __NR_signalfd4 327 > > > > is one more really such an issue? > > And what did eventfd do to you? :) :) > I partially agree with Roland (and I stated this during Casey's first > post), this can be achieved in a not too troublesome way already. A new > dedicated interface is easier for the challenged userspace coder, but I > dunno if it's worth the new code (although it does have little > footprint). Both ways are fine from my POV. i think we should be defensive and generous and do a clean separate syscall for this - we have a pretty bad track record in syscall interface cleanliness, today's borderline hack is tomorrow's limitation causing headaches. We never know what that extra flexibility that will buy us in the future. Ingo ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 20:53 ` Roland McGrath 2009-01-07 20:58 ` Ingo Molnar @ 2009-01-07 21:02 ` Ulrich Drepper 2009-01-08 14:32 ` Oleg Nesterov 2009-01-08 22:04 ` Michael Kerrisk ` (2 subsequent siblings) 4 siblings, 1 reply; 50+ messages in thread From: Ulrich Drepper @ 2009-01-07 21:02 UTC (permalink / raw) To: Roland McGrath Cc: Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Oleg Nesterov, Davide Libenzi, Peter Zijlstra On Wed, Jan 7, 2009 at 12:53 PM, Roland McGrath <roland@redhat.com> wrote: > Do we really need another one for this? How about using signalfd plus > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of > SIGCHLD? It's slightly more magical for the userland process to know to do > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't > really have to add, maybe better. Since waitfd shouldn't consume the child termination notification waitfd should be more widely usable than the wait*() interfaces. I.e., it's not necessary to restrict the use to parents. Any process with the same UID should be allowed to call waitfd. This would allow some new user cases. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 21:02 ` Ulrich Drepper @ 2009-01-08 14:32 ` Oleg Nesterov 2009-01-08 19:35 ` Roland McGrath 0 siblings, 1 reply; 50+ messages in thread From: Oleg Nesterov @ 2009-01-08 14:32 UTC (permalink / raw) To: Ulrich Drepper Cc: Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 01/07, Ulrich Drepper wrote: > > On Wed, Jan 7, 2009 at 12:53 PM, Roland McGrath <roland@redhat.com> wrote: > > Do we really need another one for this? How about using signalfd plus > > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of > > SIGCHLD? It's slightly more magical for the userland process to know to do > > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't > > really have to add, maybe better. > > Since waitfd shouldn't consume the child termination notification > waitfd should be more widely usable than the wait*() interfaces. yes, it doesn't eat the notification (SIGCHLD), but it reaps a zombie, clears ->exit_code TASK_STOPPED/TASK_TRACED tasks, clears SIGNAL_STOP_CONTINUED. > I.e., it's not necessary to restrict the use to parents. Any process > with the same UID should be allowed to call waitfd. This would allow > some new user cases. I don't see how it is possible to implement this... The parent can sleep on ->wait_chldexit and it will be notified, but how can we wait for the unrelated process with the same UID ? Even if sys_waitfd() uses P_PID, we can't use task->parent->signal->wait_chldexit, task->parent can exit before task exits. Or I misunderstood you? Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-08 14:32 ` Oleg Nesterov @ 2009-01-08 19:35 ` Roland McGrath 2009-01-08 20:36 ` Casey Dahlin 0 siblings, 1 reply; 50+ messages in thread From: Roland McGrath @ 2009-01-08 19:35 UTC (permalink / raw) To: Oleg Nesterov Cc: Ulrich Drepper, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra > > Since waitfd shouldn't consume the child termination notification > > waitfd should be more widely usable than the wait*() interfaces. waitid can be used that way with WNOWAIT. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-08 19:35 ` Roland McGrath @ 2009-01-08 20:36 ` Casey Dahlin 2009-01-08 21:39 ` Oleg Nesterov 2009-01-10 14:50 ` Scott James Remnant 0 siblings, 2 replies; 50+ messages in thread From: Casey Dahlin @ 2009-01-08 20:36 UTC (permalink / raw) To: Roland McGrath Cc: Oleg Nesterov, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra Roland McGrath wrote: >>> Since waitfd shouldn't consume the child termination notification >>> waitfd should be more widely usable than the wait*() interfaces. > > waitid can be used that way with WNOWAIT. Yes, but waitfd does not have this flag. The reason being waitfd just calls waitid internally, and there is no guarantee (afaik) that calling waitid with WNOWAIT multiple times in succession will yield different results each time. This breaks the streaming behavior of the descriptor. --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-08 20:36 ` Casey Dahlin @ 2009-01-08 21:39 ` Oleg Nesterov 2009-01-10 14:52 ` Scott James Remnant 2009-01-10 14:50 ` Scott James Remnant 1 sibling, 1 reply; 50+ messages in thread From: Oleg Nesterov @ 2009-01-08 21:39 UTC (permalink / raw) To: Casey Dahlin Cc: Roland McGrath, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 01/08, Casey Dahlin wrote: > > Roland McGrath wrote: > >>> Since waitfd shouldn't consume the child termination notification > >>> waitfd should be more widely usable than the wait*() interfaces. > > > > waitid can be used that way with WNOWAIT. > > Yes, but waitfd does not have this flag. The reason being waitfd just > calls waitid internally, and there is no guarantee (afaik) that calling > waitid with WNOWAIT multiple times in succession will yield different > results each time. This breaks the streaming behavior of the descriptor. Yes, do_wait(WNOWAIT) will return the same result on each call. Until the next child dies, and this child is closer to the head of ->children list. But the reason is not that waitfd just calls waitid() internally. Let's suppose we add another helper (or waitfd_read() can do this by hand) so that read(waitfd_with_WNOWAIT_flag) correctly returns the info about all childs. Now, what should the next read() do? Btw. It is not that I am trying to argue against sys_waitfd(), but do you have the "real life" example when it can be useful? Yes, poll(). But we have signalfd. SIGCHLD is not rt signal, but afaics this is not the problem actually. Just curious. Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-08 21:39 ` Oleg Nesterov @ 2009-01-10 14:52 ` Scott James Remnant 2009-01-10 16:19 ` Oleg Nesterov 0 siblings, 1 reply; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 14:52 UTC (permalink / raw) To: Oleg Nesterov Cc: Casey Dahlin, Roland McGrath, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 446 bytes --] On Thu, 2009-01-08 at 22:39 +0100, Oleg Nesterov wrote: > Btw. It is not that I am trying to argue against sys_waitfd(), but do > you have the "real life" example when it can be useful? Yes, poll(). > But we have signalfd. SIGCHLD is not rt signal, but afaics this is not > the problem actually. Just curious. > signalfd() can't currently be made to work in the way you describe. Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 14:52 ` Scott James Remnant @ 2009-01-10 16:19 ` Oleg Nesterov 2009-01-10 17:09 ` Scott James Remnant 0 siblings, 1 reply; 50+ messages in thread From: Oleg Nesterov @ 2009-01-10 16:19 UTC (permalink / raw) To: Scott James Remnant Cc: Casey Dahlin, Roland McGrath, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 01/10, Scott James Remnant wrote: > > On Thu, 2009-01-08 at 22:39 +0100, Oleg Nesterov wrote: > > > Btw. It is not that I am trying to argue against sys_waitfd(), but do > > you have the "real life" example when it can be useful? Yes, poll(). > > But we have signalfd. SIGCHLD is not rt signal, but afaics this is not > > the problem actually. Just curious. > > > signalfd() can't currently be made to work in the way you describe. Hmm. Could you clarify? I am not sure we are talking about the same thing, but afaics poll() + signalfd can work to (say) reap the childs. Actually, ppoll() alone is enough. No? Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 16:19 ` Oleg Nesterov @ 2009-01-10 17:09 ` Scott James Remnant 2009-01-10 18:21 ` Oleg Nesterov 0 siblings, 1 reply; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 17:09 UTC (permalink / raw) To: Oleg Nesterov Cc: Casey Dahlin, Roland McGrath, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 481 bytes --] On Sat, 2009-01-10 at 17:19 +0100, Oleg Nesterov wrote: > I am not sure we are talking about the same thing, but afaics poll() + > signalfd can work to (say) reap the childs. Actually, ppoll() alone is > enough. > Last time I checked, ppoll() was not actually implemented across all architectures in a manner that solved the race it was intended to solve. I'd be delighted to learn that this had been fixed? :-) Soctt -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 17:09 ` Scott James Remnant @ 2009-01-10 18:21 ` Oleg Nesterov 2009-01-10 18:46 ` Scott James Remnant 0 siblings, 1 reply; 50+ messages in thread From: Oleg Nesterov @ 2009-01-10 18:21 UTC (permalink / raw) To: Scott James Remnant Cc: Casey Dahlin, Roland McGrath, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 01/10, Scott James Remnant wrote: > > On Sat, 2009-01-10 at 17:19 +0100, Oleg Nesterov wrote: > > > I am not sure we are talking about the same thing, but afaics poll() + > > signalfd can work to (say) reap the childs. Actually, ppoll() alone is > > enough. > > > Last time I checked, ppoll() was not actually implemented across all > architectures in a manner that solved the race it was intended to solve. > As I said, this is imho unfair. But I mentioned ppol() "just in case". My questiong was why do you think that "signalfd() can't currently be made to work in the way you describe". You have dropped this part to change the topic? Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 18:21 ` Oleg Nesterov @ 2009-01-10 18:46 ` Scott James Remnant 0 siblings, 0 replies; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 18:46 UTC (permalink / raw) To: Oleg Nesterov Cc: Casey Dahlin, Roland McGrath, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 1481 bytes --] On Sat, 2009-01-10 at 19:21 +0100, Oleg Nesterov wrote: > On 01/10, Scott James Remnant wrote: > > > > On Sat, 2009-01-10 at 17:19 +0100, Oleg Nesterov wrote: > > > > > I am not sure we are talking about the same thing, but afaics poll() + > > > signalfd can work to (say) reap the childs. Actually, ppoll() alone is > > > enough. > > > > > Last time I checked, ppoll() was not actually implemented across all > > architectures in a manner that solved the race it was intended to solve. > > > > As I said, this is imho unfair. But I mentioned ppol() "just in case". > > My questiong was why do you think that "signalfd() can't currently be > made to work in the way you describe". You have dropped this part to > change the topic? > Sorry, I may not be following LKML etiquette correctly. These couple of recent threads (other than some bugs I found in wait last year) are my first real attempt to participate here. I wasn't intending to "change the topic" or dropping the parts about changing signalfd() to somehow sweet it under the carpet. Rather than posting repeatedly across the thread, I tried to consolidate my responses into the other post you've replied to. You made an interesting point about ppoll here, so I only responded to that to find out whether the situation of that syscall had been improved. Not so much changing the topic, but asking a side-bar question ;) Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-08 20:36 ` Casey Dahlin 2009-01-08 21:39 ` Oleg Nesterov @ 2009-01-10 14:50 ` Scott James Remnant 2009-01-10 21:20 ` Casey Dahlin 1 sibling, 1 reply; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 14:50 UTC (permalink / raw) To: Casey Dahlin Cc: Roland McGrath, Oleg Nesterov, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 828 bytes --] On Thu, 2009-01-08 at 15:36 -0500, Casey Dahlin wrote: > Roland McGrath wrote: > >>> Since waitfd shouldn't consume the child termination notification > >>> waitfd should be more widely usable than the wait*() interfaces. > > > > waitid can be used that way with WNOWAIT. > > Yes, but waitfd does not have this flag. The reason being waitfd just > calls waitid internally, and there is no guarantee (afaik) that > calling waitid with WNOWAIT multiple times in succession will yield > different results each time. This breaks the streaming behavior of the > descriptor. > This would definitely be a Nice To Have though! Being able to use waitid() on another process of the same uid, with WNOHANG, in a streaming fashion would be a *very* cool thing. Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 14:50 ` Scott James Remnant @ 2009-01-10 21:20 ` Casey Dahlin 0 siblings, 0 replies; 50+ messages in thread From: Casey Dahlin @ 2009-01-10 21:20 UTC (permalink / raw) To: Scott James Remnant Cc: Roland McGrath, Oleg Nesterov, Ulrich Drepper, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra Scott James Remnant wrote: > On Thu, 2009-01-08 at 15:36 -0500, Casey Dahlin wrote: > > >> Roland McGrath wrote: >> >>>>> Since waitfd shouldn't consume the child termination notification >>>>> waitfd should be more widely usable than the wait*() interfaces. >>>>> >>> waitid can be used that way with WNOWAIT. >>> >> Yes, but waitfd does not have this flag. The reason being waitfd just >> calls waitid internally, and there is no guarantee (afaik) that >> calling waitid with WNOWAIT multiple times in succession will yield >> different results each time. This breaks the streaming behavior of the >> descriptor. >> >> > This would definitely be a Nice To Have though! > > Being able to use waitid() on another process of the same uid, with > WNOHANG, in a streaming fashion would be a *very* cool thing. > > Scott > That could be done in a separate patch for waitid itself, and its a simple change to propagate it through waitfd (just one less EINVAL case). In terms of waitfd we'd be supersetting the interface, so its no big deal to add. In terms of waitid I'd have to check and make sure that we wouldn't be breaking compliance by allowing it to return something different on every call (this would likely affect cases where waitid(WNOWAIT) was used even in a normal fashion as well). --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 20:53 ` Roland McGrath 2009-01-07 20:58 ` Ingo Molnar 2009-01-07 21:02 ` Ulrich Drepper @ 2009-01-08 22:04 ` Michael Kerrisk 2009-01-10 14:09 ` Scott James Remnant 2009-01-10 14:45 ` Scott James Remnant 4 siblings, 0 replies; 50+ messages in thread From: Michael Kerrisk @ 2009-01-08 22:04 UTC (permalink / raw) To: Roland McGrath, Casey Dahlin Cc: Ingo Molnar, Linux Kernel, Randy Dunlap, Oleg Nesterov, Davide Libenzi, Peter Zijlstra, Andi Kleen, Ulrich Drepper On Thu, Jan 8, 2009 at 9:53 AM, Roland McGrath <roland@redhat.com> wrote: > New syscall should have gone to linux-api, I think. Yes, precisely. This requirement has been documented in SubmittingPatches for several months now. More details here: http://thread.gmane.org/gmane.linux.ltp/5658 Casey, *please* don't submit a patch for a system call without also providing a test program, and some attempt at userspace documentation. (Andi already pointed this out. From my POV, I don't need you to write a full blown man page -- if you send me the text, I'll do the *roff stuff. But that text should accompany the patch that implements the syscall.) Cheers, Michael > > Do we really need another one for this? How about using signalfd plus > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of > SIGCHLD? It's slightly more magical for the userland process to know to do > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't > really have to add, maybe better. > > > Thanks, > Roland > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Found a documentation bug? http://www.kernel.org/doc/man-pages/reporting_bugs.html ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 20:53 ` Roland McGrath ` (2 preceding siblings ...) 2009-01-08 22:04 ` Michael Kerrisk @ 2009-01-10 14:09 ` Scott James Remnant 2009-01-10 14:45 ` Scott James Remnant 4 siblings, 0 replies; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 14:09 UTC (permalink / raw) To: Roland McGrath Cc: Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Oleg Nesterov, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 588 bytes --] On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote: > New syscall should have gone to linux-api, I think. > > Do we really need another one for this? How about using signalfd plus > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of > SIGCHLD? It's slightly more magical for the userland process to know to do > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't > really have to add, maybe better. > Doesn't that also change the wait() options you need as well? Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-07 20:53 ` Roland McGrath ` (3 preceding siblings ...) 2009-01-10 14:09 ` Scott James Remnant @ 2009-01-10 14:45 ` Scott James Remnant 2009-01-10 15:57 ` Oleg Nesterov 4 siblings, 1 reply; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 14:45 UTC (permalink / raw) To: Roland McGrath Cc: Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Oleg Nesterov, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 1381 bytes --] On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote: > New syscall should have gone to linux-api, I think. > > Do we really need another one for this? How about using signalfd plus > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of > SIGCHLD? It's slightly more magical for the userland process to know to do > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't > really have to add, maybe better. > This wouldn't help the init daemon case: - the exit_signal is set on the child, not on the parent. While the init daemon could clone() every new process and set exit_signal, this would not be set for processes reparented to init. Even if we had a new syscall to change the exit_signal of a given process, *and* had the init reparent notification patch, this still wouldn't be sufficient; you'd have a race condition between the time you were notified of the reparent, and the time you set exit_signal, in which the child could die. Since exit_signal is always reset to SIGCHLD before reparenting, this could be done by resetting it to a different signal; but at this point we're getting into a rather twisty method full of traps. - exit_signal is reset to SIGCHLD on exec(). Pretty much a plan-killer ;) Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 14:45 ` Scott James Remnant @ 2009-01-10 15:57 ` Oleg Nesterov 2009-01-10 17:07 ` Scott James Remnant 2011-03-02 1:37 ` Denys Vlasenko 0 siblings, 2 replies; 50+ messages in thread From: Oleg Nesterov @ 2009-01-10 15:57 UTC (permalink / raw) To: Scott James Remnant Cc: Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 01/10, Scott James Remnant wrote: > > On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote: > > > New syscall should have gone to linux-api, I think. > > > > Do we really need another one for this? How about using signalfd plus > > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of > > SIGCHLD? It's slightly more magical for the userland process to know to do > > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't > > really have to add, maybe better. > > > This wouldn't help the init daemon case: > > - the exit_signal is set on the child, not on the parent. > > While the init daemon could clone() every new process and set > exit_signal, this would not be set for processes reparented to init. > > Even if we had a new syscall to change the exit_signal of a given > process, *and* had the init reparent notification patch, this still > wouldn't be sufficient; you'd have a race condition between the time > you were notified of the reparent, and the time you set exit_signal, > in which the child could die. > > Since exit_signal is always reset to SIGCHLD before reparenting, this > could be done by resetting it to a different signal; but at this point > we're getting into a rather twisty method full of traps. > > - exit_signal is reset to SIGCHLD on exec(). > > Pretty much a plan-killer ;) I can't understand why should we change ->exit_signal if we want to use signalfd. Yes, SIGCHLD is not rt. So what? We do not need multiple signals in queue if we want to reap multiple zombies. Once we have a single SIGCHLD (reported by signalfd or whatever) we can do do_wait(WNOHANG) in a loop. Confused. Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 15:57 ` Oleg Nesterov @ 2009-01-10 17:07 ` Scott James Remnant 2009-01-10 18:13 ` Oleg Nesterov ` (2 more replies) 2011-03-02 1:37 ` Denys Vlasenko 1 sibling, 3 replies; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 17:07 UTC (permalink / raw) To: Oleg Nesterov Cc: Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 7181 bytes --] On Sat, 2009-01-10 at 16:57 +0100, Oleg Nesterov wrote: > I can't understand why should we change ->exit_signal if we want to > use signalfd. Yes, SIGCHLD is not rt. So what? > > We do not need multiple signals in queue if we want to reap multiple > zombies. Once we have a single SIGCHLD (reported by signalfd or > whatever) we can do do_wait(WNOHANG) in a loop. > Well, a good reason why is that it makes things much easier to do in userspace. The NOTES sections of many of the syscall manpages are already too long with strange behaviours that you have to take into account every time (and which most people fail to do). You may as well ask why we have signalfd() at all, and what was wrong with sigaction() and ordinary signal handlers? Well, lots of things actually; for a start, you couldn't really do much in the signal handler, so from userspace all we ended up doing was writing a byte to a pipe so we could wake up our main loop. You then had to remember to exhaust this pipe *before* checking what signals were triggered, just in case a signal was delivered while you were checking (so at least you'd wake the main loop up once more to check). signalfd() improves matters a lot; we don't have to worry about any strange behaviour, we just read() to know a signal is pending. But there were two competing implementations: one would just poll for read if signals were pending, but have no other detail; the other (which is what we have) gives us a siginfo_t for each pending signal. Except or non-RT signals of course, in which it just gives us the siginfo_t for the first pending signal of that type; future ones are discarded. I've suggested before that that's a bug in of itself, and that signalfd should always queue signals even if they're non-RT. But since, other than SIGCHLD, the only other signals with useful data are SIGILL, SIGFPE, SIGSEGV and SIGBUS (which have si_addr) which you kinda need to catch in a signal handler; and SIGPOLL (which has si_band and si_fd) which is negated by being in poll anyway, that's kinda hard to argue. So let's compare userspace code for trying to reap children using signalfd(); to save space, I've omitted error handling and the select()/poll()/etc. call - assuming that the top half is initialisation, and the bottom half is inside the select() handler. First, what we have today: sigemptyset (&mask); sigaddset (&mask, SIGCHLD); /* Block normal delivery and receive by sigfd instead */ sigprocmask (SIG_BLOCK, &mask, NULL); sigfd = signalfd (-1, &mask, 0); for (;;) { read (sigfd, &fd_siginfo, sizeof siginfo); /* throw away fd_siginfo, we're reading SIGCHLD and * can't use it :-( */ /* SIGCHLD means _at_least_one_ child is pending, there * may be more; so we have to loop AND expect to find * nothing */ for (;;) { /* ARGH! waitid returns 0 with WNOHANG if there * are no children. * * AND the structure, despite being logically * the same, isn't the same as the signalfd * one :-/ */ memset (&w_siginfo, 0, sizeof w_siginfo); waitid (P_ALL, 0, &w_siginfo, WEXITED | WNOHANG); /* Did we find anything? */ if (! w_siginfo.si_pid) break; /* NOW we have the siginfo_t for a recently * deceased process */ mourn (&w_siginfo); } /* Oh-HO! * While we were looping in waitid(), other children may * have died, and we probably already cleaned them up! * * But we'll still have a pending SIGCHLD, it might be * tempting to clear that *BUT* the child could have * died after the last waitid() call but before we clear * it. * * We have no choice in this situation but to loop back * through our entire main loop again, and do nothing. */ } Pros: - code exists today Cons: - having siginfo_t returned by read() is pointless, we can't use it - double loop isn't pretty - strange waitid() API in case of WNOHANG and no child - incompatible structures for signalfd()'s read result and waitid(), despite being logically the same structure! :-/ - can't simultaneously clear pending signal and wait, so we always have to go back round the main loop if a child dies after the read() In fact, that's not what userspace code does at all. Since there's no point listening to SIGCHLD, it's a complete no-op, we don't respond to it at all. We only need to use it to wake up the main loop. The wait() loop tends to be at the bottom of the main loop somewhere, completely outside of the fd processing. Now, what if signalfd() would always queue pending signals even if they're non-RT? This would also be the same code if we could change SIGCHLD to SIGRTMIN for the *waiting* process's children: sigemptyset (&mask); sigaddset (&mask, SIGCHLD); /* Block normal delivery and receive by sigfd instead */ sigprocmask (SIG_BLOCK, &mask, NULL); sigfd = signalfd (-1, &mask, 0); for (;;) { read (sigfd, &siginfo, sizeof siginfo); /* siginfo is immediately useful! */ mourn (&siginfo); /* we didn't clear off the wait queue, but that's easy * since we have the pid from signalfd() */ waitid (P_PID, siginfo.si_pid, NULL, WEXITED); } Pros: - single siginfo_t structure type returned by read() - we get information for each signal, we don't need a signal loop to find out what the signal is telling us - exact match between the signal and the wait call - no need to go round the main loop again just in case! - child signal can now be processed just like anything else. This makes "standard main loop functions" (g_main_loop, etc.) much easier to write. Cons: - needs kernel patch Personally, I think the fact this solves the case where you wait on a process that wasn't part of the original set the signal was pending for, so you have to process an extra SIGCHLD with nothing to do, is a good enough reason in of itself. The overhead of going back through a main loop of a userspace process just to find out whether you already responded to a notification is a waste of cycles. That's pretty neat, much nicer than what we had before. So what about waitfd() [I think I've slightly changed Casey's API here, or he changed my proposed one <g>]? wfd = waitfd (-1, P_ALL, 0, WEXITED); for (;;) { read (wfd, &siginfo, sizeof siginfo); /* siginfo is immediately useful AND the process has * been reaped! */ mourn (&siginfo); } Pros: - we don't need to care about SIGCHLD anymore - why should we listen to a notification to call wait, if we can just call wait? - the absolute easiest for a generic main loop; signals, timers and child death (as well as inotify, netlink, etc.) are now just cases of "select an fd, read structs of known size, process them" - possibility to allow waitfd (WNOWAIT) on children outside of your own usual process tree - notification without reaping? Maybe too much? Cons: - needs kernel patch - new API Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 17:07 ` Scott James Remnant @ 2009-01-10 18:13 ` Oleg Nesterov 2009-01-10 20:13 ` Scott James Remnant 2009-01-10 22:25 ` Casey Dahlin 2009-01-10 23:11 ` Davide Libenzi 2 siblings, 1 reply; 50+ messages in thread From: Oleg Nesterov @ 2009-01-10 18:13 UTC (permalink / raw) To: Scott James Remnant Cc: Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 01/10, Scott James Remnant wrote: > > On Sat, 2009-01-10 at 16:57 +0100, Oleg Nesterov wrote: > > > I can't understand why should we change ->exit_signal if we want to > > use signalfd. Yes, SIGCHLD is not rt. So what? > > > > We do not need multiple signals in queue if we want to reap multiple > > zombies. Once we have a single SIGCHLD (reported by signalfd or > > whatever) we can do do_wait(WNOHANG) in a loop. > > > Well, a good reason why is that it makes things much easier to do in > userspace. I never argued with this. And, let me repeat. I am not arguing against waitfd! Actually, I always try to avoid the "do we need this feature" discussions. What I disagree with is that waitfd adds the functionality which does not exists currently. > You may as well ask why we have signalfd() at all, and what was wrong > with sigaction() and ordinary signal handlers? Well, lots of things Cough. You don't have to explain me why signalfd is nice ;) I participated in discussion when it was created. > So let's compare userspace code for trying to reap children using > signalfd(); > > First, what we have today: > > [...snip the code...] > > Pros: > - code exists today That is what I meant. Not more. > Cons: > - having siginfo_t returned by read() is pointless, we can't use it Indeed. We use read() only to wait for the signal death. > - double loop isn't pretty Nice argument to add the new syscall ;) > - strange waitid() API in case of WNOHANG and no child Heh. I also don't like this ;) A reason for waitfd ? > - incompatible structures for signalfd()'s read result and waitid(), > despite being logically the same structure! :-/ I could blaim waitfd because it fills siginfo in the manner which is not compatible with signalfd, despite logically the same structure. > - can't simultaneously clear pending signal and wait, so we always have > to go back round the main loop if a child dies after the read() Can't understand... waitfd doesn't clear the signal too? And you forget to mention another drwaback with the current code: a lot of pathetic comments ;) > Since there's no point listening to SIGCHLD, it's a complete no-op, we > don't respond to it at all. We only need to use it to wake up the main > loop. Yes, sure, indeed, of course. > The wait() loop tends to be at the bottom of the main loop > somewhere, completely outside of the fd processing. Huh. > Now, what if signalfd() would always queue pending signals even if > they're non-RT? Well, I think this is off-topic, and more importantly I don't think this change is possible. > So what about > waitfd() Yes, the user-space code (for this particular artificial example) becomes simpler. Following this logic, let's add sys_copyfile() to kernel? From time to time I regret we don't have it... (from another thread) > > I am not sure we are talking about the same thing, but afaics poll() + > > signalfd can work to (say) reap the childs. Actually, ppoll() alone is > > enough. > > > Last time I checked, ppoll() was not actually implemented across all > architectures in a manner that solved the race it was intended to solve. > > I'd be delighted to learn that this had been fixed? :-) Scott, this is unfair. Yes, some arches do not implement restore_sigmask() logic. So what? Let's suppose ppoll() has a bug. So, this means we should add waitfd? No, let's fix ppol(), and waitfd is orthogonal. Imho. Again, again, again. Please don't forget about "I am not arguing against". But I don't buy your arguments. Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 18:13 ` Oleg Nesterov @ 2009-01-10 20:13 ` Scott James Remnant 2009-01-10 22:24 ` Oleg Nesterov 0 siblings, 1 reply; 50+ messages in thread From: Scott James Remnant @ 2009-01-10 20:13 UTC (permalink / raw) To: Oleg Nesterov Cc: Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra [-- Attachment #1: Type: text/plain, Size: 12929 bytes --] On Sat, 2009-01-10 at 19:13 +0100, Oleg Nesterov wrote: > On 01/10, Scott James Remnant wrote: > > > > On Sat, 2009-01-10 at 16:57 +0100, Oleg Nesterov wrote: > > > > > I can't understand why should we change ->exit_signal if we want to > > > use signalfd. Yes, SIGCHLD is not rt. So what? > > > > > > We do not need multiple signals in queue if we want to reap multiple > > > zombies. Once we have a single SIGCHLD (reported by signalfd or > > > whatever) we can do do_wait(WNOHANG) in a loop. > > > > > Well, a good reason why is that it makes things much easier to do in > > userspace. > > I never argued with this. And, let me repeat. I am not arguing against > waitfd! Actually, I always try to avoid the "do we need this feature" > discussions. > Unless I'm misinterpreting you, you're saying that you don't understand why we should change any current behaviour? My post is attempting to illustrate why we should. > What I disagree with is that waitfd adds the functionality which does > not exists currently. > I'm not saying that it doesn't at all; in fact I gave an example of how you implement the exact same functionality today. Changing the behaviour of signalfd() or introducing a call like waitfd() makes it easier to do things in userspace. > > Cons: > > - having siginfo_t returned by read() is pointless, we can't use it > > Indeed. We use read() only to wait for the signal death. > In fact, because main loops use select()/poll(), for the SIGCHLD case you'd never use signalfd() at all! Unless I'm missing something, the following two examples are identical in behaviour: using signalfd: sigemptyset (&mask); sigaddset (&mask, SIGCHLD); sigprocmask (SIG_BLOCK, &mask, NULL); sigfd = signalfd (-1, &mask, 0); /* prepare signal set */ FD_SET (sigfd, &readfds); nfds = sigfd > nfds ? sigfd + 1 : nfds; for (;;) { select (nfds, &readfds, NULL, NULL, NULL); if (FD_ISSET (sigfd, &readfds)) for (;;) read (sigfd, &fdsi, sizeof fdsi); /* SIGCHLD only notifies us that wait() can be called * at least once without blocking. * * Since we have to call it multiple times anyway, * and since it may block, all SIGCHLD really does it * wakes up our main loop. * * Thus unconditionally loop on wait every time through */ while ((pid = waitpid (-1, &status, WNOHANG)) == 0) mourn (pid, status); } using pselect: sigemptyset (&mask); sigaddset (&mask, SIGCHLD); sigprocmask (SIG_BLOCK, &mask, &origmask); /* prepare signal set */ for (;;) { pselect (nfds, &readnfds, NULL, NULL, NULL, &origmask); /* SIGCHLD only needs to wake up our main loop, * unconditionally loop on wait every time through */ while ((pid = waitpid (-1, &status, WNOHANG)) == 0) mourn (pid, status); } But the pselect() version is neater. Which is why I started the previous reply off with "why have signalfd() at all?" > > - can't simultaneously clear pending signal and wait, so we always have > > to go back round the main loop if a child dies after the read() > > Can't understand... waitfd doesn't clear the signal too? > > And you forget to mention another drwaback with the current code: > a lot of pathetic comments ;) > The comments were illustrating the sources of userspace frustration with the current syscalls. One of them was attempting to explain what you don't understand here, I'll try and be more verbose... We only want to be woken up in our main loop if we have something to do. One of those things that we need to do is reap dead child processes. The kernel provides a couple of methods of being woken up because of that event; we can use signalfd() for SIGCHLD, or we could use pselect() and otherwise mask SIGCHLD so that any pending signal wakes up the main loop. That notification only tells us that *at least one* process has died; SIGCHLD may only be pending once at a time. If further children die before we clear the signal, nothing will happen. We clear the signal with read() on the signalfd, or it's inherently cleared inside pselect() by allowing it to be delivered to ourselves. It will not be cleared again until we read() again, or call pselect() again. Because it only tells us that *at least one* process has died, we have to call waitpid() repeatedly until we have exhausted the wait queue. ~~Calling waitpid() does not clear the pending signal.~~ This is the important bit. If a further process dies while we're inside the waitpid() loop, we will most likely reap that straight away. But this does not clear the pending signal. The main loop will be woken up again, even though it does not need to be. Thus: - child process #1 dies - main loop woken up by SIGCHLD - pending status of signal cleared - enter wait loop - child process #2 dies - SIGCHLD pending again - waitpid() called first time, child process #1 reaped - waitpid() called second time, child process #2 reaped (SIGCHLD still pending) - waitpid() called third time, no child processes remain - exit wait loop - back to top of main loop, immediately woken up by pending SIGCHLD - pending status of signal cleared - enter wait loop - waitpid() called first time, but no child processes remain (we reaped it last time round) - exit wait loop - back to top of main loop, sleep A rookie mistake is to assume that you receive SIGCHLD every time a child process dies. Nearly everyone writes this first time out: signal (SIGCHLD, reaper); void reaper (int signum) { pid = waitpid (-1, &status, 0); printf ("%d died! :-(\n", pid); // yes, I know } You may even be encouraged to believe that will work, because you've read that SIGCHLD is masked while the signal handler is running. Of course, we all know that many children may die between the time the first child dies (that makes the signal pending) and the signal handler actually executing. W. Richard Stevens even makes this mistake in Advanced Programming in the UNIX Environment (p. 605), he even justifies called waitpid() in the signal handler because multiple children can die. He doesn't help matters in the second edition of UNIX Network Programming v1 where he defines exactly the same function (p. 122) though he at least goes on to explain all the errors and define a correct function that loops on waitpid() (p. 128) Seasoned developers might smile wryly, but I've seen them make another error. Presumably because they know about select()/poll(), they appear to treat SIGCHLD accordingly. "SIGCHLD is to waitpid() as select() is to read()" or perhaps more verbosely: "A pending SIGCHLD means that a call to waitpid() won't block" We know that this is completely wrong. SIGCHLD and the wait queue are entirely separate, the only connection is that SIGCHLD will be made pending, if not already, when a new entry is added to the wait queue. However since the two are cleared by different means, the following behaviours can all be exhibited: - SIGCHLD not pending, but waitpid() will not block This is true in all example usage; after you've called the read() on the signalfd - or the pselect() has woken, SIGCHLD is probably no longer pending but waitpid() will not block Compare with select() behaviour; if you fail to read() from the fd, select() wakes up yet again - SIGCHLD pending, but waitpid() will block This is true if you exhaust the wait queue in a loop, All SIGCHLD is useful for is to get your main loop out of select()/poll(); you must always exhaust the wait queue every time you have woken up. (I think we agreed on this, I'm trying to explain the next paragraph.) Unfortunately because the wake up, and the wait queue, are *not* connected; we can be woken up when we do not need to be. > > The wait() loop tends to be at the bottom of the main loop > > somewhere, completely outside of the fd processing. > > Huh. > From the above reasoning, SIGCHLD is only there to wake up your main loop. If you fail to exhaust the wait queue, you will not be woken up again until another child process dies, so you must exhaust the wait queue. Now, we can try and work out whether it was SIGCHLD that woke up our main loop. If using signalfd(), we would read() and check ssi_pid; if using pselect() we'd have to have a signal handler that sets a sigatomic_t. We could check that, and then only loop on waitpid() if set. But why? We've proved above that SIGCHLD being pending does not mean that waitpid() will not block. We're not actually saving ourselves from the extra iteration of the main loop, or even saving ourselves from the first call to waitpid() that returns -1. Arguably the extra effort to check whether SIGCHLD was pending is more expensive (certainly in code readability) than just calling waitpid() anyway. Thus it's most common (certainly in the standard libraries I've read) to just always loop on waitpid(). > > Now, what if signalfd() would always queue pending signals even if > > they're non-RT? > > Well, I think this is off-topic, and more importantly I don't think > this change is possible. > From the argument above, it should be reasonably clear that what this patch is attempting to do is join up the "waking up the main loop" from the "clearing of the wait queue". In simpler words, make wait behave more like select(). waitfd() does that absolutely directly. It allows you to select() on the wait queue itself. Clearing the wait queue, and the main loop wake-up, are directly connected. We eliminate the problem of the extra main loop wake-up and iteration, because if we exhaust the wait queue then the waitfd socket will not poll for reading. We also eliminate the problem where if we fail to exhaust the wait queue, we will not be woken up, because the waitfd socket will still poll for reading. If you're using waitfd, SIGCHLD is unnecessary. I'm not personally arguing for waitfd() *itself*, just that there be some way of coupling the main loop wake up with the actual contents of the wait queue. If signalfd() can be used for this, I'd be happy as a clam. But today, signalfd() doesn't help at all because it still has all the same drawbacks. We have an extra main loop wake-up and iteration if a process dies while we're exhausting the wait queue because SIGCHLD is pending again. If we fail to exhaust the wait queue, the main loop will not be woken up because SIGCHLD is not pending - despite the wait queue not being empty. It's actually completely possible, in fact, it's almost a one-line patch. --- kernel/signal.c~ 2009-01-10 20:04:50.000000000 +0000 +++ kernel/signal.c 2009-01-10 20:05:24.000000000 +0000 @@ -816,8 +816,10 @@ * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ - if (legacy_queue(pending, sig)) + if (legacy_queue(pending, sig)) { + signalfd_notify(t, sig); return 0; + } /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. There might be considerations here I haven't thought of, but it worked pretty well for me. This isn't waitfd(), you're not actually selecting on the wait queue. But it means you'll have a queue of SIGCHLDs for each entry in the wait queue. You still have to remember that for each SIGCHLD you read from the signalfd, you must call waitpid on that pid, but if you don't do that - it'll still poll and you can go round again. Personally I think waitfd() is more elegant, and has a rather nice symmetry with the other system calls. > > So what about > > waitfd() > > Yes, the user-space code (for this particular artificial example) > becomes simpler. Following this logic, let's add sys_copyfile() > to kernel? From time to time I regret we don't have it... > Well, I don't think that argument is quite orthogonal because copyfile() can be implemented in userspace with identical behaviour to a kernel syscall (open, fstat, sendfile, chown/etc.). A more orthogonal example would be pselect(). That implemented, in the kernel, a syscall that it actually wasn't possible to implement in userspace in _quite_ the same way. The procmask/select sequence was known to be racy, so we all worked around it with an interrupt pipe. The argument for waitfd() or similar in the kernel is because there are races in userspace that we can't solve. Hopefully you can see my argument now? :-) > (from another thread) > As noted there, I replied separately and cut the discussion about this because I didn't want to confuse the matter. Scott -- Scott James Remnant scott@canonical.com [-- Attachment #2: This is a digitally signed message part --] [-- Type: application/pgp-signature, Size: 197 bytes --] ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 20:13 ` Scott James Remnant @ 2009-01-10 22:24 ` Oleg Nesterov 2009-01-10 23:14 ` Davide Libenzi 0 siblings, 1 reply; 50+ messages in thread From: Oleg Nesterov @ 2009-01-10 22:24 UTC (permalink / raw) To: Scott James Remnant Cc: Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 01/10, Scott James Remnant wrote: > > On Sat, 2009-01-10 at 19:13 +0100, Oleg Nesterov wrote: > > > I never argued with this. And, let me repeat. I am not arguing against > > waitfd! Actually, I always try to avoid the "do we need this feature" > > discussions. > > > Unless I'm misinterpreting you, you're saying that you don't understand > why we should change any current behaviour? My post is attempting to > illustrate why we should. Scott. How many times should I repeat: I am _not_ arguing against waitfd. But to clarify, neither I vote for it. I don't really care. Except I do care about the code if it will be merged, that is why I entered this thread. > > What I disagree with is that waitfd adds the functionality which does > > not exists currently. > > > I'm not saying that it doesn't at all; in fact I gave an example of how > you implement the exact same functionality today. This means I was confused. Because I thought you point is we can't poll for childs without signalfd. And all I asked was: why do you think so. I do understand that waitfd can be handy. > In fact, because main loops use select()/poll(), for the SIGCHLD case > you'd never use signalfd() at all! > > Unless I'm missing something, the following two examples are identical > in behaviour: > > using signalfd: > ... > using pselect: Yes, and that is why I mentioned that ppoll() alone is enough. > But the pselect() version is neater. Which is why I started the > previous reply off with "why have signalfd() at all?" Unlike waitfd, there are things which we just can not do without signalfd, even if we have ppol/pselect. For example: wait for the signal, but not dequeue it. > One of them was attempting to explain what you don't understand here, > I'll try and be more verbose... > ... > ~~Calling waitpid() does not clear the pending signal.~~ > > This is the important bit. > > If a further process dies while we're inside the waitpid() loop, we will > most likely reap that straight away. But this does not clear the > pending signal. The main loop will be woken up again, even though it > does not need to be. > > Thus: > > - child process #1 dies > - main loop woken up by SIGCHLD > - pending status of signal cleared > - enter wait loop > - child process #2 dies > - SIGCHLD pending again > - waitpid() called first time, child process #1 reaped > - waitpid() called second time, child process #2 reaped > (SIGCHLD still pending) > - waitpid() called third time, no child processes remain > - exit wait loop > - back to top of main loop, immediately woken up by pending SIGCHLD > - pending status of signal cleared > - enter wait loop > - waitpid() called first time, but no child processes remain > (we reaped it last time round) > - exit wait loop > - back to top of main loop, sleep Scott, I don't really understand why are you trying to explain this all to me. I do understand this. At least I hope ;) Yes this is possible, and I see no problems here. > - SIGCHLD not pending, but waitpid() will not block > > This is true in all example usage; after you've called the read() on > the signalfd - or the pselect() has woken, SIGCHLD is probably no > longer pending but waitpid() will not block > > Compare with select() behaviour; if you fail to read() from the fd, > select() wakes up yet again > > - SIGCHLD pending, but waitpid() will block > > This is true if you exhaust the wait queue in a loop, ... and this too. > All SIGCHLD is useful for is to get your main loop out of > select()/poll(); you must always exhaust the wait queue every time you > have woken up. Yes, and yes, and yes. Scott, I am sorry, I failed to read to the end so perhaps I missed something ;) > --- kernel/signal.c~ 2009-01-10 20:04:50.000000000 +0000 > +++ kernel/signal.c 2009-01-10 20:05:24.000000000 +0000 > @@ -816,8 +816,10 @@ > * exactly one non-rt signal, so that we can get more > * detailed information about the cause of the signal. > */ > - if (legacy_queue(pending, sig)) > + if (legacy_queue(pending, sig)) { > + signalfd_notify(t, sig); > return 0; > + } I'd prefer to not discuss this here, but I am not sure I understand. There should not be no threads which need the wakeup from here, and I can't see how this change can help. > A more orthogonal example would be pselect(). That implemented, in the > kernel, a syscall that it actually wasn't possible to implement in > userspace Yes, exactly, > The argument for waitfd() or similar in the kernel is because there are > races in userspace that we can't solve. And now I don't understand you again. Please show me which races we _can not_ solve in userspace without waitfd? Yes we can race with the exiting childs while doing waitpid() in a loop, so we can make the unnecessary syscall. But please do not tell me _this_ is the race we can't solve. This is _harmless_. Unlike the problems with the poor user-space implementations of pselect/ppol. Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 22:24 ` Oleg Nesterov @ 2009-01-10 23:14 ` Davide Libenzi 0 siblings, 0 replies; 50+ messages in thread From: Davide Libenzi @ 2009-01-10 23:14 UTC (permalink / raw) To: Oleg Nesterov Cc: Scott James Remnant, Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Peter Zijlstra On Sat, 10 Jan 2009, Oleg Nesterov wrote: > > Thus: > > > > - child process #1 dies > > - main loop woken up by SIGCHLD > > - pending status of signal cleared > > - enter wait loop > > - child process #2 dies > > - SIGCHLD pending again > > - waitpid() called first time, child process #1 reaped > > - waitpid() called second time, child process #2 reaped > > (SIGCHLD still pending) > > - waitpid() called third time, no child processes remain > > - exit wait loop > > - back to top of main loop, immediately woken up by pending SIGCHLD > > - pending status of signal cleared > > - enter wait loop > > - waitpid() called first time, but no child processes remain > > (we reaped it last time round) > > - exit wait loop > > - back to top of main loop, sleep > > Scott, I don't really understand why are you trying to explain this > all to me. I do understand this. At least I hope ;) Indeed :) Scott, you're teaching Linux signals to the guy that is likely the de-facto mantainer of the subsystem. See the irony? - Davide ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 17:07 ` Scott James Remnant 2009-01-10 18:13 ` Oleg Nesterov @ 2009-01-10 22:25 ` Casey Dahlin 2009-01-10 23:11 ` Davide Libenzi 2 siblings, 0 replies; 50+ messages in thread From: Casey Dahlin @ 2009-01-10 22:25 UTC (permalink / raw) To: Scott James Remnant Cc: Oleg Nesterov, Roland McGrath, Ingo Molnar, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra Scott James Remnant wrote: > That's pretty neat, much nicer than what we had before. So what about > waitfd() [I think I've slightly changed Casey's API here, or he changed > my proposed one <g>]? > I think you might have changed the argument order slighly, but other than that, counting the corrections I've made from feedback here, its about the same. --CJD ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 17:07 ` Scott James Remnant 2009-01-10 18:13 ` Oleg Nesterov 2009-01-10 22:25 ` Casey Dahlin @ 2009-01-10 23:11 ` Davide Libenzi 2 siblings, 0 replies; 50+ messages in thread From: Davide Libenzi @ 2009-01-10 23:11 UTC (permalink / raw) To: Scott James Remnant Cc: Oleg Nesterov, Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Peter Zijlstra On Sat, 10 Jan 2009, Scott James Remnant wrote: > First, what we have today: > > sigemptyset (&mask); > sigaddset (&mask, SIGCHLD); > > /* Block normal delivery and receive by sigfd instead */ > sigprocmask (SIG_BLOCK, &mask, NULL); > sigfd = signalfd (-1, &mask, 0); > > for (;;) { > read (sigfd, &fd_siginfo, sizeof siginfo); > > /* throw away fd_siginfo, we're reading SIGCHLD and > * can't use it :-( > */ > > /* SIGCHLD means _at_least_one_ child is pending, there > * may be more; so we have to loop AND expect to find > * nothing > */ > for (;;) { > /* ARGH! waitid returns 0 with WNOHANG if there > * are no children. > * > * AND the structure, despite being logically > * the same, isn't the same as the signalfd > * one :-/ > */ > memset (&w_siginfo, 0, sizeof w_siginfo); > > waitid (P_ALL, 0, &w_siginfo, > WEXITED | WNOHANG); > > /* Did we find anything? */ > if (! w_siginfo.si_pid) > break; > > /* NOW we have the siginfo_t for a recently > * deceased process > */ > > mourn (&w_siginfo); > } > That, once all the glamorous comments are removed, can be solved pretty much with this much userspace code: int waitfd(int flags) { sigemptyset(&mask); sigaddset(&mask, SIGCHLD); sigprocmask(SIG_BLOCK, &mask, NULL); return signalfd(-1, &mask, flags); } int waitfd_read(int fd, siginfo_t *si) { for (;;) { if (read(sigfd, &fd_siginfo, sizeof fd_siginfo) != sizeof(fd_siginfo) retrun 0; memset(si, 0, sizeof *si); waitid(P_ALL, 0, si, WEXITED | WNOHANG); if (si->si_pid) break; } return sizeof *si; } About the exit_signal, that would have made probably more sense for it to be a parent property, instead of childs one. And be: do_notify_parent(p, p->parent->exit_signal); instead of: do_notify_parent(p, p->exit_signal); After all, is the parent that is going to make use of the notification, not the child. - Davide ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2009-01-10 15:57 ` Oleg Nesterov 2009-01-10 17:07 ` Scott James Remnant @ 2011-03-02 1:37 ` Denys Vlasenko 2011-03-02 13:55 ` Oleg Nesterov 1 sibling, 1 reply; 50+ messages in thread From: Denys Vlasenko @ 2011-03-02 1:37 UTC (permalink / raw) To: Oleg Nesterov Cc: Scott James Remnant, Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On Sat, Jan 10, 2009 at 4:57 PM, Oleg Nesterov <oleg@redhat.com> wrote: > On 01/10, Scott James Remnant wrote: >> On Wed, 2009-01-07 at 12:53 -0800, Roland McGrath wrote: >> > Do we really need another one for this? How about using signalfd plus >> > setting the child's exit_signal to a queuing (SIGRTMIN+n) signal instead of >> > SIGCHLD? It's slightly more magical for the userland process to know to do >> > that (fork -> clone SIGRTMIN). But compared to adding a syscall we don't >> > really have to add, maybe better. >> > >> This wouldn't help the init daemon case: >> >> - the exit_signal is set on the child, not on the parent. >> >> While the init daemon could clone() every new process and set >> exit_signal, this would not be set for processes reparented to init. >> >> Even if we had a new syscall to change the exit_signal of a given >> process, *and* had the init reparent notification patch, this still >> wouldn't be sufficient; you'd have a race condition between the time >> you were notified of the reparent, and the time you set exit_signal, >> in which the child could die. >> >> Since exit_signal is always reset to SIGCHLD before reparenting, this >> could be done by resetting it to a different signal; but at this point >> we're getting into a rather twisty method full of traps. >> >> - exit_signal is reset to SIGCHLD on exec(). >> >> Pretty much a plan-killer ;) > > I can't understand why should we change ->exit_signal if we want to > use signalfd. Yes, SIGCHLD is not rt. So what? > > We do not need multiple signals in queue if we want to reap multiple > zombies. Once we have a single SIGCHLD (reported by signalfd or > whatever) we can do do_wait(WNOHANG) in a loop. > > Confused. I know I am terribly late for the party :) "do_wait(WNOHANG) in a loop" is a performance problem. Oleg, do you remember that strace bug when it was swamped with gazillions of stop notifications from a multithreaded task, then by dealing with them one-by-one it was causing unfairness and ultimately "this program never finishes when run under strace" bug? And another typical nuisance that running multithreaded stuff under strace is much slower, even with -e option which limits the set of decoded syscalls? Having waitfd would help both cases: strace can gulp a lot of waitpid notifications in one go, and batch process them. -- vda ^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [RESEND][RFC PATCH v2] waitfd 2011-03-02 1:37 ` Denys Vlasenko @ 2011-03-02 13:55 ` Oleg Nesterov 0 siblings, 0 replies; 50+ messages in thread From: Oleg Nesterov @ 2011-03-02 13:55 UTC (permalink / raw) To: Denys Vlasenko Cc: Scott James Remnant, Roland McGrath, Ingo Molnar, Casey Dahlin, Linux Kernel, Randy Dunlap, Davide Libenzi, Peter Zijlstra On 03/02, Denys Vlasenko wrote: > > On Sat, Jan 10, 2009 at 4:57 PM, Oleg Nesterov <oleg@redhat.com> wrote: > > > > We do not need multiple signals in queue if we want to reap multiple > > zombies. Once we have a single SIGCHLD (reported by signalfd or > > whatever) we can do do_wait(WNOHANG) in a loop. > > > > Confused. > > I know I am terribly late for the party :) > > "do_wait(WNOHANG) in a loop" is a performance problem. Yes. > Oleg, do you remember that strace bug when it was swamped > with gazillions of stop notifications from a multithreaded > task, then by dealing with them one-by-one it was causing > unfairness and ultimately "this program never finishes > when run under strace" bug? Yes. But, iirc, this was not connected to the performance problems with do_wait(). The problem was, strace did a single do_wait() instead of wait-them-all. > And another typical nuisance that running multithreaded > stuff under strace is much slower, even with -e option > which limits the set of decoded syscalls? IIUC, this is also because strace is single-threaded, I mean it doesn't scale well. > Having waitfd would help both cases: strace can gulp > a lot of waitpid notifications in one go, and > batch process them. Perhaps. I do not know how much do_wait() contributes to the slowness though. And it is not exactly clear how we can implement the "fast" waitfd. For example, this patch (iirc!) just calls do_wait() in a loop. I doubt very much it can really help to improve the performance. Oh. Can't resist. The real problem is that ptrace API should not be per-thread, and it should not use wait() at all. But this is offtopic. Oleg. ^ permalink raw reply [flat|nested] 50+ messages in thread
end of thread, other threads:[~2011-03-02 14:03 UTC | newest] Thread overview: 50+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2009-01-06 18:11 [RFC PATCH v2] waitfd Casey Dahlin 2009-01-06 18:27 ` Alan Cox 2009-01-06 18:31 ` Randy Dunlap 2009-01-06 18:45 ` Casey Dahlin 2009-01-06 18:50 ` Randy Dunlap 2009-01-06 18:48 ` Andi Kleen 2009-01-06 19:07 ` [RESEND][RFC " Casey Dahlin 2009-01-07 12:34 ` Ingo Molnar 2009-01-07 13:05 ` Casey Dahlin 2009-01-07 15:00 ` Ingo Molnar 2009-01-07 17:19 ` Oleg Nesterov 2009-01-07 17:24 ` Ingo Molnar 2009-01-07 17:52 ` Davide Libenzi 2009-01-07 20:38 ` Casey Dahlin 2009-01-10 14:47 ` Scott James Remnant 2009-01-10 21:14 ` Casey Dahlin 2009-01-10 21:20 ` Scott James Remnant 2009-01-10 22:08 ` Casey Dahlin 2009-01-10 22:31 ` Oleg Nesterov 2009-01-10 22:37 ` Casey Dahlin 2009-01-10 22:46 ` Oleg Nesterov 2009-01-07 20:53 ` Roland McGrath 2009-01-07 20:58 ` Ingo Molnar 2009-01-07 21:05 ` Davide Libenzi 2009-01-07 21:50 ` Ingo Molnar 2009-01-07 21:02 ` Ulrich Drepper 2009-01-08 14:32 ` Oleg Nesterov 2009-01-08 19:35 ` Roland McGrath 2009-01-08 20:36 ` Casey Dahlin 2009-01-08 21:39 ` Oleg Nesterov 2009-01-10 14:52 ` Scott James Remnant 2009-01-10 16:19 ` Oleg Nesterov 2009-01-10 17:09 ` Scott James Remnant 2009-01-10 18:21 ` Oleg Nesterov 2009-01-10 18:46 ` Scott James Remnant 2009-01-10 14:50 ` Scott James Remnant 2009-01-10 21:20 ` Casey Dahlin 2009-01-08 22:04 ` Michael Kerrisk 2009-01-10 14:09 ` Scott James Remnant 2009-01-10 14:45 ` Scott James Remnant 2009-01-10 15:57 ` Oleg Nesterov 2009-01-10 17:07 ` Scott James Remnant 2009-01-10 18:13 ` Oleg Nesterov 2009-01-10 20:13 ` Scott James Remnant 2009-01-10 22:24 ` Oleg Nesterov 2009-01-10 23:14 ` Davide Libenzi 2009-01-10 22:25 ` Casey Dahlin 2009-01-10 23:11 ` Davide Libenzi 2011-03-02 1:37 ` Denys Vlasenko 2011-03-02 13:55 ` Oleg Nesterov
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox