All of lore.kernel.org
 help / color / mirror / Atom feed
From: Casey Dahlin <cdahlin@redhat.com>
To: Linux Kernel <linux-kernel@vger.kernel.org>
Subject: [RFC PATCH v2] waitfd
Date: Tue, 06 Jan 2009 13:11:04 -0500	[thread overview]
Message-ID: <49639EB8.40204@redhat.com> (raw)

Linux now exposes signals, timers, and events via file descriptors 
through signalfd, timerfd, and eventfd. This means programmers can use a 
single select/[e]poll call to monitor all changes in their program. This 
patch aims to expose child death via the same mechanism.

waitfd provides a file descriptor out of which may be read a series of 
siginfo_t objects describing child death. A child process is reaped as 
soon as its information is read. This means child monitoring too can be 
performed with that same poll call.

Patch is against v2.6.28

--CJD

diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78..134d83c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
 #define __NR_dup3        330
 #define __NR_pipe2        331
 #define __NR_inotify_init1    332
+#define __NR_waitfd        333
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e..b28eb07 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,6 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
 __SYSCALL(__NR_pipe2, sys_pipe2)
 #define __NR_inotify_init1            294
 __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_waitfd                295
+__SYSCALL(__NR_waitfd, sys_waitfd)
 
 
 #ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395f..c796a8b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
     .long sys_dup3            /* 330 */
     .long sys_pipe2
     .long sys_inotify_init1
+    .long sys_waitfd
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe..74c31fb 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_INOTIFY_USER)    += inotify_user.o
 obj-$(CONFIG_EPOLL)        += eventpoll.o
 obj-$(CONFIG_ANON_INODES)    += anon_inodes.o
 obj-$(CONFIG_SIGNALFD)        += signalfd.o
+obj-$(CONFIG_WAITFD)        += waitfd.o
 obj-$(CONFIG_TIMERFD)        += timerfd.o
 obj-$(CONFIG_EVENTFD)        += eventfd.o
 obj-$(CONFIG_AIO)               += aio.o
diff --git a/fs/waitfd.c b/fs/waitfd.c
new file mode 100644
index 0000000..0155a83
--- /dev/null
+++ b/fs/waitfd.c
@@ -0,0 +1,117 @@
+/*
+ *  fs/waitfd.c
+ *
+ *  Copyright (C) 2008  Red Hat, Casey Dahlin <cdahlin@redhat.com>
+ *
+ *  Largely derived from fs/signalfd.c
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+
+/*
+ * do_waitid() is defined in kernel/exit.c; it is declared here because the
+ * poll and read handlers in this file call it directly.
+ */
+long do_waitid(int which, pid_t upid,
+           struct siginfo __user *infop, int options,
+           struct rusage __user *ru);
+
+/* Per-descriptor state: allocated in sys_waitfd(), freed in waitfd_release(). */
+struct waitfd_ctx {
+    int ops;    /* wait options given to sys_waitfd() (WEXITED, WNOHANG, ...) */
+    int which;  /* passed through as the 'which' argument of do_waitid() */
+    pid_t upid; /* passed through as the 'upid' argument of do_waitid() */
+};
+
+/*
+ * Release handler: frees the context stored in file->private_data (the
+ * waitfd_ctx that sys_waitfd() handed to anon_inode_getfd()). Always
+ * returns 0.
+ */
+static int waitfd_release(struct inode *inode, struct file *file)
+{
+    kfree(file->private_data);
+    return 0;
+}
+
+/*
+ * Poll handler for a waitfd descriptor.
+ *
+ * Registers on the calling task's signal->wait_chldexit queue, then probes
+ * with do_waitid() using the stored options plus WNOHANG | WNOWAIT and no
+ * siginfo buffer. Returns POLLIN if the probe yields a positive value or
+ * -ECHILD, and 0 otherwise.
+ */
+static unsigned int waitfd_poll(struct file *file, poll_table *wait)
+{
+    struct waitfd_ctx *ctx = file->private_data;
+    long value;
+
+    poll_wait(file, &current->signal->wait_chldexit, wait);
+
+    /* WNOHANG keeps the probe from blocking; WNOWAIT asks it not to reap */
+    value = do_waitid(ctx->which, ctx->upid, NULL,
+               ctx->ops | WNOHANG | WNOWAIT, NULL);
+    /* -ECHILD also counts as readable: waitfd_read() turns it into a 0 return */
+    if (value > 0 || value == -ECHILD)
+        return POLLIN;
+
+    return 0;
+}
+
+/*
+ * Read handler: fills buf with one struct siginfo per child state change
+ * reported by do_waitid().
+ *
+ * "count" must be at least sizeof(struct siginfo), otherwise -EINVAL is
+ * returned. Returns the number of bytes accounted for (a multiple of
+ * sizeof(struct siginfo)), 0 when do_waitid() reports -ECHILD, -EAGAIN
+ * when do_waitid() returns 0, or another negative error code.
+ */
+static ssize_t waitfd_read(struct file *file, char __user *buf, size_t count,
+                 loff_t *ppos)
+{
+    struct waitfd_ctx *ctx = file->private_data;
+    /* Keep the __user annotation on the cast so sparse can check the pointer */
+    struct siginfo __user *info_addr = (struct siginfo __user *)buf;
+    int flags = ctx->ops;
+    ssize_t ret, total = 0;
+
+    /*
+     * sys_waitfd() maps WNOHANG to O_NONBLOCK at creation time; honour
+     * O_NONBLOCK here too so that setting it later with fcntl() works.
+     */
+    if (file->f_flags & O_NONBLOCK)
+        flags |= WNOHANG;
+
+    /* From here on "count" is a number of siginfo slots, not bytes */
+    count /= sizeof(struct siginfo);
+    if (!count)
+        return -EINVAL;
+
+    do {
+        ret = do_waitid(ctx->which, ctx->upid, info_addr, flags, NULL);
+        if (ret == 0)
+            ret = -EAGAIN;
+        if (ret == -ECHILD)
+            ret = 0;
+        if (ret <= 0)
+            break;
+
+        info_addr++;
+        total += sizeof(struct siginfo);
+        /* Only the first wait may block; later ones just drain what is ready */
+        flags |= WNOHANG;
+    } while (--count);
+
+    /* Report a partial read in preference to the status that ended the loop */
+    return total ? total : ret;
+}
+
+/* File operations passed to anon_inode_getfd() for every waitfd descriptor. */
+static const struct file_operations waitfd_fops = {
+    .release    = waitfd_release,
+    .poll        = waitfd_poll,
+    .read        = waitfd_read,
+};
+
+/*
+ * sys_waitfd - create a file descriptor for reading child state changes
+ * @which:   stored and later passed as the 'which' argument of do_waitid()
+ * @upid:    stored and later passed as the 'upid' argument of do_waitid()
+ * @options: any of WNOHANG, WEXITED, WSTOPPED, WCONTINUED; at least one of
+ *           WEXITED, WSTOPPED or WCONTINUED is required
+ * @unused:  reserved for future use, must be zero
+ *
+ * Returns the new descriptor, -EINVAL for bad arguments, -ENOMEM if the
+ * context cannot be allocated, or the error from anon_inode_getfd().
+ */
+asmlinkage long sys_waitfd(int which, pid_t upid, int options, int unused)
+{
+    int ufd;
+    struct waitfd_ctx *ctx;
+
+    /*
+     * Reject a nonzero reserved argument now, so that it can be given a
+     * meaning later without needing a sys_waitfd4.
+     */
+    if (unused)
+        return -EINVAL;
+
+    if (options & ~(WNOHANG|WEXITED|WSTOPPED|WCONTINUED))
+        return -EINVAL;
+    if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+        return -EINVAL;
+
+    ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+    if (!ctx)
+        return -ENOMEM;
+
+    ctx->ops = options;
+    ctx->upid = upid;
+    ctx->which = which;
+
+    /* WNOHANG at creation time makes the descriptor non-blocking */
+    ufd = anon_inode_getfd("[waitfd]", &waitfd_fops, ctx,
+                   (options & WNOHANG) ? O_NONBLOCK : 0);
+    /* On failure no file owns ctx, so it has to be freed here */
+    if (ufd < 0)
+        kfree(ctx);
+
+    return ufd;
+}
diff --git a/init/Kconfig b/init/Kconfig
index f763762..bc34871 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config EPOLL
       Disabling this option will cause the kernel to be built without
       support for epoll family of system calls.
 
+config WAITFD
+    bool "Enable waitfd() system call" if EMBEDDED
+    select ANON_INODES
+    default y
+    help
+      Enable the waitfd() system call that allows receiving child state
+      changes from a file descriptor.
+
+      If unsure, say Y.
+
 config SIGNALFD
     bool "Enable signalfd() system call" if EMBEDDED
     select ANON_INODES
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..b53e8ba 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1233,18 +1233,20 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
     int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
 
     put_task_struct(p);
-    if (!retval)
-        retval = put_user(SIGCHLD, &infop->si_signo);
-    if (!retval)
-        retval = put_user(0, &infop->si_errno);
-    if (!retval)
-        retval = put_user((short)why, &infop->si_code);
-    if (!retval)
-        retval = put_user(pid, &infop->si_pid);
-    if (!retval)
-        retval = put_user(uid, &infop->si_uid);
-    if (!retval)
-        retval = put_user(status, &infop->si_status);
+    if (infop) {
+        if (!retval)
+            retval = put_user(SIGCHLD, &infop->si_signo);
+        if (!retval)
+            retval = put_user(0, &infop->si_errno);
+        if (!retval)
+            retval = put_user((short)why, &infop->si_code);
+        if (!retval)
+            retval = put_user(pid, &infop->si_pid);
+        if (!retval)
+            retval = put_user(uid, &infop->si_uid);
+        if (!retval)
+            retval = put_user(status, &infop->si_status);
+    }
     if (!retval)
         retval = pid;
     return retval;
@@ -1727,35 +1729,12 @@ repeat:
 end:
     current->state = TASK_RUNNING;
     remove_wait_queue(&current->signal->wait_chldexit,&wait);
-    if (infop) {
-        if (retval > 0)
-            retval = 0;
-        else {
-            /*
-             * For a WNOHANG return, clear out all the fields
-             * we would set so the user can easily tell the
-             * difference.
-             */
-            if (!retval)
-                retval = put_user(0, &infop->si_signo);
-            if (!retval)
-                retval = put_user(0, &infop->si_errno);
-            if (!retval)
-                retval = put_user(0, &infop->si_code);
-            if (!retval)
-                retval = put_user(0, &infop->si_pid);
-            if (!retval)
-                retval = put_user(0, &infop->si_uid);
-            if (!retval)
-                retval = put_user(0, &infop->si_status);
-        }
-    }
     return retval;
 }
 
-asmlinkage long sys_waitid(int which, pid_t upid,
-               struct siginfo __user *infop, int options,
-               struct rusage __user *ru)
+long do_waitid(int which, pid_t upid,
+           struct siginfo __user *infop, int options,
+           struct rusage __user *ru)
 {
     struct pid *pid = NULL;
     enum pid_type type;
@@ -1789,6 +1768,39 @@ asmlinkage long sys_waitid(int which, pid_t upid,
     ret = do_wait(type, pid, options, infop, NULL, ru);
     put_pid(pid);
 
+    return ret;
+}
+
+asmlinkage long sys_waitid(int which, pid_t upid,
+               struct siginfo __user *infop, int options,
+               struct rusage __user *ru)
+{
+    long ret;
+
+    ret = do_waitid(which, upid, infop, options, ru);
+
+    if (ret > 0)
+        ret = 0;
+    else {
+        /*
+         * For a WNOHANG return, clear out all the fields
+         * we would set so the user can easily tell the
+         * difference.
+         */
+        if (!ret)
+            ret = put_user(0, &infop->si_signo);
+        if (!ret)
+            ret = put_user(0, &infop->si_errno);
+        if (!ret)
+            ret = put_user(0, &infop->si_code);
+        if (!ret)
+            ret = put_user(0, &infop->si_pid);
+        if (!ret)
+            ret = put_user(0, &infop->si_uid);
+        if (!ret)
+            ret = put_user(0, &infop->si_status);
+    }
+
     /* avoid REGPARM breakage on x86: */
     asmlinkage_protect(5, ret, which, upid, infop, options, ru);
     return ret;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a232..e8d4da6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -163,6 +163,7 @@ cond_syscall(sys_ioprio_set);
 cond_syscall(sys_ioprio_get);
 
 /* New file descriptors */
+cond_syscall(sys_waitfd);
 cond_syscall(sys_signalfd);
 cond_syscall(sys_signalfd4);
 cond_syscall(compat_sys_signalfd);


             reply	other threads:[~2009-01-06 18:10 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-01-06 18:11 Casey Dahlin [this message]
2009-01-06 18:27 ` [RFC PATCH v2] waitfd Alan Cox
2009-01-06 18:31 ` Randy Dunlap
2009-01-06 18:45   ` Casey Dahlin
2009-01-06 18:50     ` Randy Dunlap
2009-01-06 18:48 ` Andi Kleen
2009-01-06 19:07 ` [RESEND][RFC " Casey Dahlin
2009-01-07 12:34   ` Ingo Molnar
2009-01-07 13:05     ` Casey Dahlin
2009-01-07 15:00       ` Ingo Molnar
2009-01-07 17:19     ` Oleg Nesterov
2009-01-07 17:24       ` Ingo Molnar
2009-01-07 17:52       ` Davide Libenzi
2009-01-07 20:38         ` Casey Dahlin
2009-01-10 14:47       ` Scott James Remnant
2009-01-10 21:14         ` Casey Dahlin
2009-01-10 21:20           ` Scott James Remnant
2009-01-10 22:08             ` Casey Dahlin
2009-01-10 22:31           ` Oleg Nesterov
2009-01-10 22:37             ` Casey Dahlin
2009-01-10 22:46               ` Oleg Nesterov
2009-01-07 20:53     ` Roland McGrath
2009-01-07 20:58       ` Ingo Molnar
2009-01-07 21:05         ` Davide Libenzi
2009-01-07 21:50           ` Ingo Molnar
2009-01-07 21:02       ` Ulrich Drepper
2009-01-08 14:32         ` Oleg Nesterov
2009-01-08 19:35           ` Roland McGrath
2009-01-08 20:36             ` Casey Dahlin
2009-01-08 21:39               ` Oleg Nesterov
2009-01-10 14:52                 ` Scott James Remnant
2009-01-10 16:19                   ` Oleg Nesterov
2009-01-10 17:09                     ` Scott James Remnant
2009-01-10 18:21                       ` Oleg Nesterov
2009-01-10 18:46                         ` Scott James Remnant
2009-01-10 14:50               ` Scott James Remnant
2009-01-10 21:20                 ` Casey Dahlin
2009-01-08 22:04       ` Michael Kerrisk
2009-01-10 14:09       ` Scott James Remnant
2009-01-10 14:45       ` Scott James Remnant
2009-01-10 15:57         ` Oleg Nesterov
2009-01-10 17:07           ` Scott James Remnant
2009-01-10 18:13             ` Oleg Nesterov
2009-01-10 20:13               ` Scott James Remnant
2009-01-10 22:24                 ` Oleg Nesterov
2009-01-10 23:14                   ` Davide Libenzi
2009-01-10 22:25             ` Casey Dahlin
2009-01-10 23:11             ` Davide Libenzi
2011-03-02  1:37           ` Denys Vlasenko
2011-03-02 13:55             ` Oleg Nesterov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=49639EB8.40204@redhat.com \
    --to=cdahlin@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.