From: Andrew Morton <akpm@digeo.com>
To: Linus Torvalds <torvalds@transmeta.com>
Cc: mingo@elte.hu, zwane@holomorphy.com, cw@f00f.org,
linux-kernel@vger.kernel.org, mbligh@aracnet.com,
wli@holomorphy.com
Subject: Re: doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots)
Date: Thu, 20 Feb 2003 12:50:21 -0800 [thread overview]
Message-ID: <20030220125021.03c6d39c.akpm@digeo.com> (raw)
In-Reply-To: <Pine.LNX.4.44.0302201207320.12127-100000@penguin.transmeta.com>
Linus Torvalds <torvalds@transmeta.com> wrote:
>
> wait_task_inactive()
There are two other bugs in this exact area. I received the below from Bill
Irwin and Rick Lindsley yesterday. Can someone take this off my hands?
Fixes two deadlocks in the scheduler exit path:
1: We're calling mmdrop() under spin_lock_irq(&rq->lock). But mmdrop
calls vfree(), which calls smp_call_function().
It is not legal to call smp_call_function() with irq's off. Because
another CPU may be running smp_call_function() against _this_ CPU, which
deadlocks.
So the patch arranges for mmdrop() to not be called under
spin_lock_irq(&rq->lock).
2: We are leaving local interrupts disabled coming out of exit_notify().
But we are about to call wait_task_inactive() which spins, waiting for
another CPU to end a task. If that CPU has issued smp_call_function() to
this CPU, deadlock.
So the patch enables interrupts again before returning from exit_notify().
Also, exit_notify() returns with preemption disabled, so there is no
need to perform another preempt_disable() in do_exit().
exit.c | 17 +++++++++++------
sched.c | 43 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 49 insertions(+), 11 deletions(-)
diff -puN kernel/exit.c~wli-mem-leak-fix kernel/exit.c
--- 25/kernel/exit.c~wli-mem-leak-fix 2003-02-20 03:10:08.000000000 -0800
+++ 25-akpm/kernel/exit.c 2003-02-20 03:10:35.000000000 -0800
@@ -674,13 +674,19 @@ static void exit_notify(struct task_stru
tsk->state = TASK_ZOMBIE;
/*
- * No need to unlock IRQs, we'll schedule() immediately
- * anyway. In the preemption case this also makes it
- * impossible for the task to get runnable again (thus
- * the "_raw_" unlock - to make sure we don't try to
- * preempt here).
+ * In the preemption case it must be impossible for the task
+ * to get runnable again, so use "_raw_" unlock to keep
+ * preempt_count elevated until we schedule().
+ *
+ * To avoid deadlock on SMP, interrupts must be unmasked. If we
+ * don't, subsequently called functions (e.g, wait_task_inactive()
+ * via release_task()) will spin, with interrupt flags
+ * unwittingly blocked, until the other task sleeps. That task
+ * may itself be waiting for smp_call_function() to answer and
+ * complete, and with interrupts blocked that will never happen.
*/
_raw_write_unlock(&tasklist_lock);
+ local_irq_enable();
}
NORET_TYPE void do_exit(long code)
@@ -727,7 +733,6 @@ NORET_TYPE void do_exit(long code)
tsk->exit_code = code;
exit_notify(tsk);
- preempt_disable();
if (tsk->exit_signal == -1)
release_task(tsk);
diff -puN kernel/sched.c~wli-mem-leak-fix kernel/sched.c
--- 25/kernel/sched.c~wli-mem-leak-fix 2003-02-20 03:10:08.000000000 -0800
+++ 25-akpm/kernel/sched.c 2003-02-20 03:10:08.000000000 -0800
@@ -152,6 +152,7 @@ struct runqueue {
unsigned long nr_running, nr_switches, expired_timestamp,
nr_uninterruptible;
task_t *curr, *idle;
+ struct mm_struct *prev_mm;
prio_array_t *active, *expired, arrays[2];
int prev_nr_running[NR_CPUS];
#ifdef CONFIG_NUMA
@@ -388,7 +389,10 @@ static inline void resched_task(task_t *
* wait_task_inactive - wait for a thread to unschedule.
*
* The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time.
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
*/
void wait_task_inactive(task_t * p)
{
@@ -558,10 +562,24 @@ void sched_exit(task_t * p)
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+if (mm)
*/
asmlinkage void schedule_tail(task_t *prev)
{
- finish_arch_switch(this_rq(), prev);
+ runqueue_t *rq = this_rq();
+ struct mm_struct *mm = rq->prev_mm;
+
+ rq->prev_mm = NULL;
+ finish_arch_switch(rq, prev);
+ if (mm)
+ mmdrop(mm);
+
if (current->set_child_tid)
put_user(current->pid, current->set_child_tid);
}
@@ -570,7 +588,7 @@ asmlinkage void schedule_tail(task_t *pr
* context_switch - switch to the new MM and the new
* thread's register state.
*/
-static inline task_t * context_switch(task_t *prev, task_t *next)
+static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
@@ -584,7 +602,8 @@ static inline task_t * context_switch(ta
if (unlikely(!prev->mm)) {
prev->active_mm = NULL;
- mmdrop(oldmm);
+ WARN_ON(rq->prev_mm);
+ rq->prev_mm = oldmm;
}
/* Here we just switch the register state and the stack. */
@@ -1223,14 +1242,28 @@ switch_tasks:
RCU_qsctr(prev->thread_info->cpu)++;
if (likely(prev != next)) {
+ struct mm_struct *prev_mm;
rq->nr_switches++;
rq->curr = next;
prepare_arch_switch(rq, next);
- prev = context_switch(prev, next);
+ prev = context_switch(rq, prev, next);
barrier();
rq = this_rq();
+ prev_mm = rq->prev_mm;
+ rq->prev_mm = NULL;
+
+ /*
+ * It's extremely improtant to drop the runqueue lock
+ * before mmdrop(): on i386, destroy_context(), called
+ * by mmdrop(), can potentially vfree() LDT's. This may
+ * generate interrupts to processors spinning (with
+ * interrupts blocked) on the runqueue lock we're holding.
+ */
finish_arch_switch(rq, prev);
+
+ if (prev_mm)
+ mmdrop(prev_mm);
} else
spin_unlock_irq(&rq->lock);
_
next prev parent reply other threads:[~2003-02-20 20:42 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <Pine.LNX.4.44.0302201830580.474-100000@localhost.localdomain>
2003-02-20 18:01 ` doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots) Linus Torvalds
2003-02-20 18:23 ` Linus Torvalds
2003-02-20 19:36 ` Ingo Molnar
2003-02-20 19:53 ` Ingo Molnar
2003-02-20 19:57 ` Ingo Molnar
2003-02-20 20:14 ` Ingo Molnar
2003-02-20 20:17 ` Linus Torvalds
2003-02-20 20:50 ` Andrew Morton [this message]
2003-02-20 22:04 ` Ingo Molnar
2003-02-20 22:42 ` William Lee Irwin III
2003-02-21 7:05 ` Ingo Molnar
2003-02-20 22:00 ` Ingo Molnar
2003-02-20 22:32 ` Linus Torvalds
2003-02-20 22:40 ` Linus Torvalds
2003-02-20 22:45 ` Linus Torvalds
2003-02-20 22:57 ` John Levon
2003-02-20 23:21 ` Ingo Molnar
2003-02-20 23:36 ` Linus Torvalds
2003-02-21 7:00 ` Ingo Molnar
2003-02-21 15:05 ` Linus Torvalds
2003-02-20 19:00 ` Ingo Molnar
2003-02-18 23:01 Linux v2.5.62 --- spontaneous reboots Chris Wedgwood
2003-02-19 23:35 ` doublefault debugging (was Re: Linux v2.5.62 --- spontaneous reboots) Linus Torvalds
2003-02-20 2:22 ` Zwane Mwaikambo
2003-02-20 2:26 ` William Lee Irwin III
2003-02-20 2:55 ` Zwane Mwaikambo
2003-02-20 3:15 ` William Lee Irwin III
2003-02-20 4:52 ` Linus Torvalds
2003-02-20 5:07 ` William Lee Irwin III
2003-02-20 6:05 ` Zwane Mwaikambo
2003-02-20 11:46 ` Ingo Molnar
2003-02-20 12:12 ` William Lee Irwin III
2003-02-20 12:33 ` Ingo Molnar
2003-02-20 14:03 ` Zwane Mwaikambo
2003-02-20 14:00 ` Zwane Mwaikambo
2003-02-20 15:43 ` Linus Torvalds
2003-02-20 15:52 ` Ingo Molnar
2003-02-20 16:11 ` Martin J. Bligh
2003-02-20 16:54 ` Linus Torvalds
2003-02-20 17:24 ` Jeff Garzik
2003-02-20 21:21 ` Alan Cox
2003-02-20 20:20 ` Linus Torvalds
2003-02-20 20:23 ` Martin J. Bligh
2003-02-20 20:42 ` William Lee Irwin III
2003-02-20 20:51 ` Linus Torvalds
2003-02-27 18:50 ` Randy.Dunlap
2003-02-27 19:39 ` Muli Ben-Yehuda
2003-02-27 19:47 ` Randy.Dunlap
2003-03-02 6:12 ` Keith Owens
2003-02-27 23:32 ` Randy.Dunlap
2003-02-20 23:09 ` Chris Wedgwood
2003-02-20 16:44 ` Ingo Molnar
2003-02-20 20:13 ` Chris Wedgwood
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20030220125021.03c6d39c.akpm@digeo.com \
--to=akpm@digeo.com \
--cc=cw@f00f.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mbligh@aracnet.com \
--cc=mingo@elte.hu \
--cc=torvalds@transmeta.com \
--cc=wli@holomorphy.com \
--cc=zwane@holomorphy.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox