* + do_wait-wakeup-optimization.patch added to -mm tree
@ 2008-11-21 20:15 akpm
2008-11-23 21:39 ` Oleg Nesterov
0 siblings, 1 reply; 9+ messages in thread
From: akpm @ 2008-11-21 20:15 UTC (permalink / raw)
To: mm-commits; +Cc: roland, mingo, oleg, rnalumasu
The patch titled
do_wait() wakeup optimization
has been added to the -mm tree. Its filename is
do_wait-wakeup-optimization.patch
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/SubmitChecklist when testing your code ***
See http://userweb.kernel.org/~akpm/stuff/added-to-mm.txt to find
out what to do about this
The current -mm tree may be found at http://userweb.kernel.org/~akpm/mmotm/
------------------------------------------------------
Subject: do_wait() wakeup optimization
From: Roland McGrath <roland@redhat.com>
Ratan Nalumasu reported that in a process with many threads doing
unnecessary wakeups. Every waiting thread in the process wakes up to loop
through the children and see that the only ones it cares about are still
not ready.
Change do_wait() to use init_waitqueue_func_entry with a custom wake
function. This skips the wakeup for a do_wait() call that is not
interested in the child that's doing wake_up on wait_chldexit.
Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Ratan Nalumasu <rnalumasu@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
kernel/exit.c | 90 +++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 78 insertions(+), 12 deletions(-)
diff -puN kernel/exit.c~do_wait-wakeup-optimization kernel/exit.c
--- a/kernel/exit.c~do_wait-wakeup-optimization
+++ a/kernel/exit.c
@@ -1195,10 +1195,8 @@ static struct pid *task_pid_type(struct
}
static int eligible_child(enum pid_type type, struct pid *pid, int options,
- struct task_struct *p)
+ struct task_struct *p, int exit_signal)
{
- int err;
-
if (type < PIDTYPE_MAX) {
if (task_pid_type(p, type) != pid)
return 0;
@@ -1209,14 +1207,10 @@ static int eligible_child(enum pid_type
* set; otherwise, wait for non-clone children *only*. (Note:
* A "clone" child here is one that reports to its parent
* using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+ if (((exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
&& !(options & __WALL))
return 0;
- err = security_task_wait(p);
- if (err)
- return err;
-
return 1;
}
@@ -1563,10 +1557,11 @@ static int wait_consider_task(struct tas
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
- int ret = eligible_child(type, pid, options, p);
+ int ret = eligible_child(type, pid, options, p, p->exit_signal);
if (!ret)
return ret;
+ ret = security_task_wait(p);
if (unlikely(ret < 0)) {
/*
* If we have not yet seen any eligible child,
@@ -1665,17 +1660,88 @@ static int ptrace_do_wait(struct task_st
return 0;
}
+/*
+ * This sits on the stack of a thread that is blocked in do_wait().
+ * @wq.private holds the task_struct pointer of that thread.
+ */
+struct do_wait_queue_entry {
+ wait_queue_t wq;
+ struct pid *pid;
+ enum pid_type type;
+ int options;
+};
+
+/*
+ * Here current (@task) is a thread calling do_notify_parent().
+ * Return zero to optimize out the wake-up of a parent thread in
+ * do_wait() that doesn't care about this child. An extra wake-up
+ * is permissible, but missing one is not.
+ */
+static int needs_wakeup(struct task_struct *task, struct do_wait_queue_entry *w)
+{
+ if ((w->options & __WNOTHREAD) && task->parent != w->wq.private)
+ return 0;
+
+ if (eligible_child(w->type, w->pid, w->options,
+ task, task->exit_signal))
+ return 1;
+
+ if (thread_group_leader(task)) {
+ /*
+ * In a group leader, do_notify_parent() may have
+ * just reset task->exit_signal because SIGCHLD was
+ * ignored, but that doesn't prevent the wakeup.
+ */
+ if (!task_detached(task) ||
+ !eligible_child(w->type, w->pid, w->options,
+ task, SIGCHLD))
+ return 0;
+ } else {
+ /*
+ * In a non-leader, this might be the release_task()
+ * case, where it's the leader rather than task
+ * whose parent is being woken.
+ */
+ if (!eligible_child(w->type, w->pid, w->options,
+ task->group_leader,
+ task_detached(task->group_leader) ?
+ SIGCHLD : task->group_leader->exit_signal))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int do_wait_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
+{
+ struct task_struct *task = current;
+ struct do_wait_queue_entry *w =
+ container_of(curr, struct do_wait_queue_entry, wq);
+
+ if (!needs_wakeup(task, w))
+ return 0;
+
+ return default_wake_function(curr, mode, sync, key);
+}
+
static long do_wait(enum pid_type type, struct pid *pid, int options,
struct siginfo __user *infop, int __user *stat_addr,
struct rusage __user *ru)
{
- DECLARE_WAITQUEUE(wait, current);
+ struct do_wait_queue_entry wait;
struct task_struct *tsk;
int retval;
trace_sched_process_wait(pid);
- add_wait_queue(¤t->signal->wait_chldexit,&wait);
+ init_waitqueue_func_entry(&wait.wq, do_wait_wake_function);
+ wait.wq.private = current;
+ wait.type = type;
+ wait.pid = pid;
+ wait.options = options;
+
+ add_wait_queue(¤t->signal->wait_chldexit, &wait.wq);
repeat:
/*
* If there is nothing that can match our critiera just get out.
@@ -1722,7 +1788,7 @@ repeat:
end:
current->state = TASK_RUNNING;
- remove_wait_queue(¤t->signal->wait_chldexit,&wait);
+ remove_wait_queue(¤t->signal->wait_chldexit, &wait.wq);
if (infop) {
if (retval > 0)
retval = 0;
_
Patches currently in -mm which might be from roland@redhat.com are
posix-timers-use-struct-pid-instead-of-struct-task_struct.patch
posix-timers-check-it_signal-instead-of-it_pid-to-validate-the-timer.patch
posix-timers-simplify-de_thread-exit_itimers-path.patch
forkc-cleanup-for-copy_sighand.patch
do_wait-wakeup-optimization.patch
signals-protect-sbin-init-from-unwanted-signals-more.patch
signals-simplify-sig_ignored-pathes.patch
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: + do_wait-wakeup-optimization.patch added to -mm tree
2008-11-21 20:15 + do_wait-wakeup-optimization.patch added to -mm tree akpm
@ 2008-11-23 21:39 ` Oleg Nesterov
2008-11-23 21:55 ` do_wait() vs do_notify_parent_cldstop() theoretical race? Oleg Nesterov
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Oleg Nesterov @ 2008-11-23 21:39 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, roland, mingo, rnalumasu
> From: Roland McGrath <roland@redhat.com>
>
> +static int needs_wakeup(struct task_struct *task, struct do_wait_queue_entry *w)
> +{
> + if ((w->options & __WNOTHREAD) && task->parent != w->wq.private)
> + return 0;
> +
> + if (eligible_child(w->type, w->pid, w->options,
> + task, task->exit_signal))
> + return 1;
> +
> + if (thread_group_leader(task)) {
> + /*
> + * In a group leader, do_notify_parent() may have
> + * just reset task->exit_signal because SIGCHLD was
> + * ignored, but that doesn't prevent the wakeup.
> + */
> + if (!task_detached(task) ||
> + !eligible_child(w->type, w->pid, w->options,
> + task, SIGCHLD))
> + return 0;
> + } else {
> + /*
> + * In a non-leader, this might be the release_task()
> + * case, where it's the leader rather than task
> + * whose parent is being woken.
> + */
> + if (!eligible_child(w->type, w->pid, w->options,
> + task->group_leader,
> + task_detached(task->group_leader) ?
> + SIGCHLD : task->group_leader->exit_signal))
> + return 0;
> + }
> +
> + return 1;
> +}
Unless I missed something, this is not right.
This "task" is current, iow it is the caller of do_notify_parent(). Sometime
it is OK (release_task, exit_notify), but in general not, afaics.
Let's suppose the ptracer finds the EXIT_ZOMBIE tracee and notifies its
->real_parent which sleeps in do_wait(). In that case the usage of
eligible_child(task == ptracer) above is bogus, and checking for
group_leader is not rifgt too.
> +static int do_wait_wake_function(wait_queue_t *curr, unsigned mode, int sync,
> + void *key)
> +{
> + struct task_struct *task = current;
I think we can fix (and simplify) this code if we change __wake_up_parent(),
it should call __wake_up(key => p), so we can do
struct task_struct *task = key;
> + if (!needs_wakeup(task, w))
> + return 0;
> +
> + return default_wake_function(curr, mode, sync, key);
perhaps autoremove_wake_function() makes more sense.
Oleg.
^ permalink raw reply [flat|nested] 9+ messages in thread* do_wait() vs do_notify_parent_cldstop() theoretical race?
2008-11-23 21:39 ` Oleg Nesterov
@ 2008-11-23 21:55 ` Oleg Nesterov
2008-11-24 7:31 ` Roland McGrath
2008-12-04 1:05 ` Roland McGrath
2008-11-24 7:26 ` + do_wait-wakeup-optimization.patch added to -mm tree Roland McGrath
2008-12-04 1:06 ` Roland McGrath
2 siblings, 2 replies; 9+ messages in thread
From: Oleg Nesterov @ 2008-11-23 21:55 UTC (permalink / raw)
To: akpm; +Cc: linux-kernel, roland, mingo, rnalumasu
Looking at do_wait(), suddenly I am starting to suspect we have the
highly theoretical race with do_notify_parent_cldstop().
do_wait:
add_wait_queue(...);
current->state = TASK_INTERRUPTIBLE;
read_lock(tasklist_lock);
... try to find the "interesting" task ...
read_unlock(tasklist_lock);
if (!retval) // not found
schedule();
We don't race with do_notify_parent() because it takes tasklist
for writing. But do_notify_parent_cldstop() can run in parallel
under read_lock(tasklist).
Now suppose that "->state = TASK_INTERRUPTIBLE" leaks deeply into
the critical section. In theory, it is possible that wait_consider_task()
checks task_is_stopped_or_traced() or SIGNAL_STOP_CONTINUED first, then
CPU sets state = TASK_INTERRUPTIBLE. And we can miss the event if
do_notify_parent_cldstop() happens in between.
No?
Oleg.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: do_wait() vs do_notify_parent_cldstop() theoretical race?
2008-11-23 21:55 ` do_wait() vs do_notify_parent_cldstop() theoretical race? Oleg Nesterov
@ 2008-11-24 7:31 ` Roland McGrath
2008-12-04 1:05 ` Roland McGrath
1 sibling, 0 replies; 9+ messages in thread
From: Roland McGrath @ 2008-11-24 7:31 UTC (permalink / raw)
To: Oleg Nesterov; +Cc: akpm, linux-kernel, mingo, rnalumasu
I guess. There might well be something in there implicitly that's actually
a memory barrier already. But sounds like that should use set_current_state().
Thanks,
Roland
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: do_wait() vs do_notify_parent_cldstop() theoretical race?
2008-11-23 21:55 ` do_wait() vs do_notify_parent_cldstop() theoretical race? Oleg Nesterov
2008-11-24 7:31 ` Roland McGrath
@ 2008-12-04 1:05 ` Roland McGrath
1 sibling, 0 replies; 9+ messages in thread
From: Roland McGrath @ 2008-12-04 1:05 UTC (permalink / raw)
To: Oleg Nesterov; +Cc: akpm, linux-kernel, mingo, rnalumasu
I guess. There might well be something in there implicitly that's actually
a memory barrier already. But sounds like that should use set_current_state().
Thanks,
Roland
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: + do_wait-wakeup-optimization.patch added to -mm tree
2008-11-23 21:39 ` Oleg Nesterov
2008-11-23 21:55 ` do_wait() vs do_notify_parent_cldstop() theoretical race? Oleg Nesterov
@ 2008-11-24 7:26 ` Roland McGrath
2008-12-04 15:26 ` Oleg Nesterov
2008-12-04 1:06 ` Roland McGrath
2 siblings, 1 reply; 9+ messages in thread
From: Roland McGrath @ 2008-11-24 7:26 UTC (permalink / raw)
To: Oleg Nesterov; +Cc: akpm, linux-kernel, mingo, rnalumasu
> Let's suppose the ptracer finds the EXIT_ZOMBIE tracee and notifies its
> ->real_parent which sleeps in do_wait(). In that case the usage of
> eligible_child(task == ptracer) above is bogus, and checking for
> group_leader is not rifgt too.
I had overlooked that do_notify_parent() call.
> > +static int do_wait_wake_function(wait_queue_t *curr, unsigned mode, int sync,
> > + void *key)
> > +{
> > + struct task_struct *task = current;
>
> I think we can fix (and simplify) this code if we change __wake_up_parent(),
> it should call __wake_up(key => p), so we can do
>
> struct task_struct *task = key;
I had not looked into the bowels of various __wake_up variants, just
assumed it would stay as it is and use wake_up_interruptible_sync.
That would certainly be cleaner. Then do_wait_wake_function would not need
the second of its special cases, only the one double-check for the
thread_group_leader && task_detached case.
I don't see an exposed __wake_up* variant that both passes a "key" pointer
through and does "sync". For __wake_up_parent, "sync" is quite desireable.
> > + if (!needs_wakeup(task, w))
> > + return 0;
> > +
> > + return default_wake_function(curr, mode, sync, key);
>
> perhaps autoremove_wake_function() makes more sense.
Why? The do_wait loop will have to go through again and still might just
sleep again. The explicit remove at the end of do_wait seems fine to me.
Thanks,
Roland
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: + do_wait-wakeup-optimization.patch added to -mm tree
2008-11-24 7:26 ` + do_wait-wakeup-optimization.patch added to -mm tree Roland McGrath
@ 2008-12-04 15:26 ` Oleg Nesterov
2008-12-04 20:59 ` Roland McGrath
0 siblings, 1 reply; 9+ messages in thread
From: Oleg Nesterov @ 2008-12-04 15:26 UTC (permalink / raw)
To: Roland McGrath; +Cc: akpm, linux-kernel, mingo, rnalumasu
On 11/23, Roland McGrath wrote:
>
> > > +static int do_wait_wake_function(wait_queue_t *curr, unsigned mode, int sync,
> > > + void *key)
> > > +{
> > > + struct task_struct *task = current;
> >
> > I think we can fix (and simplify) this code if we change __wake_up_parent(),
> > it should call __wake_up(key => p), so we can do
> >
> > struct task_struct *task = key;
>
> I don't see an exposed __wake_up* variant that both passes a "key" pointer
> through and does "sync". For __wake_up_parent, "sync" is quite desireable.
Well, yes... and __wake_up_common() is static. Perhaps we can make a new
helper. I must admit, I don't understand what "sync" actually means nowadays.
> > > + if (!needs_wakeup(task, w))
> > > + return 0;
> > > +
> > > + return default_wake_function(curr, mode, sync, key);
> >
> > perhaps autoremove_wake_function() makes more sense.
>
> Why? The do_wait loop will have to go through again and still might just
> sleep again. The explicit remove at the end of do_wait seems fine to me.
Yes, yes, I was wrong. I forgot about "repeat:" in do_wait().
Oleg.
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: + do_wait-wakeup-optimization.patch added to -mm tree
2008-12-04 15:26 ` Oleg Nesterov
@ 2008-12-04 20:59 ` Roland McGrath
0 siblings, 0 replies; 9+ messages in thread
From: Roland McGrath @ 2008-12-04 20:59 UTC (permalink / raw)
To: Oleg Nesterov; +Cc: akpm, linux-kernel, mingo, rnalumasu
> > I don't see an exposed __wake_up* variant that both passes a "key" pointer
> > through and does "sync". For __wake_up_parent, "sync" is quite desireable.
>
> Well, yes... and __wake_up_common() is static. Perhaps we can make a new
> helper.
Right, that's what I was suggesting (and not volunteering to do ;-).
> I must admit, I don't understand what "sync" actually means nowadays.
I don't claim to know any actual scheduler innards. But the meaning as I
understand it is to "make it runnable, but don't try to reschedule right
now because current will block quite soon anyway. If this does indeed
reduce work done to immediately reschedule, then it seems quite desireable
to avoid that flutter since the dying/stopping thread is very few cycles
away from yielding, and in the death case it will be for the last time and
rescheduling earlier just means a later unnecessary switch back and delayed
put_task_struct processing after the reap.
Thanks,
Roland
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: + do_wait-wakeup-optimization.patch added to -mm tree
2008-11-23 21:39 ` Oleg Nesterov
2008-11-23 21:55 ` do_wait() vs do_notify_parent_cldstop() theoretical race? Oleg Nesterov
2008-11-24 7:26 ` + do_wait-wakeup-optimization.patch added to -mm tree Roland McGrath
@ 2008-12-04 1:06 ` Roland McGrath
2 siblings, 0 replies; 9+ messages in thread
From: Roland McGrath @ 2008-12-04 1:06 UTC (permalink / raw)
To: Oleg Nesterov; +Cc: akpm, linux-kernel, mingo, rnalumasu
> Let's suppose the ptracer finds the EXIT_ZOMBIE tracee and notifies its
> ->real_parent which sleeps in do_wait(). In that case the usage of
> eligible_child(task == ptracer) above is bogus, and checking for
> group_leader is not rifgt too.
I had overlooked that do_notify_parent() call.
> > +static int do_wait_wake_function(wait_queue_t *curr, unsigned mode, int sync,
> > + void *key)
> > +{
> > + struct task_struct *task = current;
>
> I think we can fix (and simplify) this code if we change __wake_up_parent(),
> it should call __wake_up(key => p), so we can do
>
> struct task_struct *task = key;
I had not looked into the bowels of various __wake_up variants, just
assumed it would stay as it is and use wake_up_interruptible_sync.
That would certainly be cleaner. Then do_wait_wake_function would not need
the second of its special cases, only the one double-check for the
thread_group_leader && task_detached case.
I don't see an exposed __wake_up* variant that both passes a "key" pointer
through and does "sync". For __wake_up_parent, "sync" is quite desireable.
> > + if (!needs_wakeup(task, w))
> > + return 0;
> > +
> > + return default_wake_function(curr, mode, sync, key);
>
> perhaps autoremove_wake_function() makes more sense.
Why? The do_wait loop will have to go through again and still might just
sleep again. The explicit remove at the end of do_wait seems fine to me.
Thanks,
Roland
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2008-12-04 21:00 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-21 20:15 + do_wait-wakeup-optimization.patch added to -mm tree akpm
2008-11-23 21:39 ` Oleg Nesterov
2008-11-23 21:55 ` do_wait() vs do_notify_parent_cldstop() theoretical race? Oleg Nesterov
2008-11-24 7:31 ` Roland McGrath
2008-12-04 1:05 ` Roland McGrath
2008-11-24 7:26 ` + do_wait-wakeup-optimization.patch added to -mm tree Roland McGrath
2008-12-04 15:26 ` Oleg Nesterov
2008-12-04 20:59 ` Roland McGrath
2008-12-04 1:06 ` Roland McGrath
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.