All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oleg Nesterov <oleg@redhat.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Ingo Molnar <mingo@elte.hu>, Lai Jiangshan <laijs@cn.fujitsu.com>,
	Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-kernel@vger.kernel.org, Li Zefan <lizf@cn.fujitsu.com>,
	Miao Xie <miaox@cn.fujitsu.com>, Paul Menage <menage@google.com>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Gautham R Shenoy <ego@in.ibm.com>
Subject: [PATCH] cpusets: fix deadlock with cpu_down()->cpuset_lock()
Date: Wed, 29 Jul 2009 23:22:16 +0200	[thread overview]
Message-ID: <20090729212216.GB16970@redhat.com> (raw)
In-Reply-To: <20090729212125.GA16970@redhat.com>

I strongly believe the bug does exist, but this patch needs the review
from maintainers.

Suppose that task T bound to CPU takes callback_mutex. If cpu_down(CPU)
happens before T drops callback_mutex we have a deadlock.

stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take
cpuset_lock() and hangs forever because CPU is already dead and thus
T can't be scheduled.

This patch unexports cpuset_lock/cpuset_unlock and converts kernel/cpuset.c
to use these helpers instead of lock/unlock of callback_mutex. The only
reason for migration_call()->cpuset_lock() was cpuset_cpus_allowed_locked()
called by move_task_off_dead_cpu(), so this patch adds get_online_cpus() to
cpuset_lock().

IOW, with this patch migration_call(CPU_DEAD) runs without callback_mutex,
but kernel/cpuset.c always takes get_online_cpus() before callback_mutex.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---

 include/linux/cpuset.h |    6 ---
 kernel/sched.c         |    2 -
 kernel/cpuset.c        |   86 +++++++++++++++++++++----------------------------
 3 files changed, 37 insertions(+), 57 deletions(-)

--- CPUHP/include/linux/cpuset.h~CPU_SET_LOCK	2009-06-17 14:11:26.000000000 +0200
+++ CPUHP/include/linux/cpuset.h	2009-07-29 22:17:41.000000000 +0200
@@ -69,9 +69,6 @@ struct seq_file;
 extern void cpuset_task_status_allowed(struct seq_file *m,
 					struct task_struct *task);
 
-extern void cpuset_lock(void);
-extern void cpuset_unlock(void);
-
 extern int cpuset_mem_spread_node(void);
 
 static inline int cpuset_do_page_mem_spread(void)
@@ -157,9 +154,6 @@ static inline void cpuset_task_status_al
 {
 }
 
-static inline void cpuset_lock(void) {}
-static inline void cpuset_unlock(void) {}
-
 static inline int cpuset_mem_spread_node(void)
 {
 	return 0;
--- CPUHP/kernel/sched.c~CPU_SET_LOCK	2009-07-23 17:06:39.000000000 +0200
+++ CPUHP/kernel/sched.c	2009-07-29 22:18:33.000000000 +0200
@@ -7550,7 +7550,6 @@ migration_call(struct notifier_block *nf
 
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
 		migrate_live_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
@@ -7565,7 +7564,6 @@ migration_call(struct notifier_block *nf
 		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
 		spin_unlock_irq(&rq->lock);
-		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
 		calc_global_load_remove(rq);
--- CPUHP/kernel/cpuset.c~CPU_SET_LOCK	2009-06-17 14:11:26.000000000 +0200
+++ CPUHP/kernel/cpuset.c	2009-07-29 22:49:30.000000000 +0200
@@ -215,6 +215,21 @@ static struct cpuset top_cpuset = {
 
 static DEFINE_MUTEX(callback_mutex);
 
+static void cpuset_lock(void)
+{
+	/* Protect against cpu_down(), move_task_off_dead_cpu() needs
+	 * cpuset_cpus_allowed_locked()
+	 */
+	get_online_cpus();
+	mutex_lock(&callback_mutex);
+}
+
+static void cpuset_unlock(void)
+{
+	mutex_unlock(&callback_mutex);
+	put_online_cpus();
+}
+
 /*
  * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
  * buffers.  They are statically allocated to prevent using excess stack
@@ -890,9 +905,9 @@ static int update_cpumask(struct cpuset 
 
 	is_load_balanced = is_sched_load_balance(trialcs);
 
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 
 	/*
 	 * Scan tasks in the cpuset, and update the cpumasks of any
@@ -1093,9 +1108,9 @@ static int update_nodemask(struct cpuset
 	if (retval < 0)
 		goto done;
 
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	cs->mems_allowed = trialcs->mems_allowed;
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 
 	update_tasks_nodemask(cs, &oldmem, &heap);
 
@@ -1207,9 +1222,9 @@ static int update_flag(cpuset_flagbits_t
 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
 
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	cs->flags = trialcs->flags;
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		async_rebuild_sched_domains();
@@ -1516,9 +1531,9 @@ static int cpuset_sprintf_cpulist(char *
 {
 	int ret;
 
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 
 	return ret;
 }
@@ -1527,9 +1542,9 @@ static int cpuset_sprintf_memlist(char *
 {
 	nodemask_t mask;
 
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	mask = cs->mems_allowed;
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1980,12 +1995,12 @@ static void scan_for_empty_cpusets(struc
 		oldmems = cp->mems_allowed;
 
 		/* Remove offline cpus and mems from this cpuset. */
-		mutex_lock(&callback_mutex);
+		cpuset_lock();
 		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
 			    cpu_online_mask);
 		nodes_and(cp->mems_allowed, cp->mems_allowed,
 						node_states[N_HIGH_MEMORY]);
-		mutex_unlock(&callback_mutex);
+		cpuset_unlock();
 
 		/* Move tasks from the empty cpuset to a parent */
 		if (cpumask_empty(cp->cpus_allowed) ||
@@ -2029,9 +2044,9 @@ static int cpuset_track_online_cpus(stru
 	}
 
 	cgroup_lock();
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 	scan_for_empty_cpusets(&top_cpuset);
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
@@ -2055,9 +2070,9 @@ static int cpuset_track_online_nodes(str
 	switch (action) {
 	case MEM_ONLINE:
 	case MEM_OFFLINE:
-		mutex_lock(&callback_mutex);
+		cpuset_lock();
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-		mutex_unlock(&callback_mutex);
+		cpuset_unlock();
 		if (action == MEM_OFFLINE)
 			scan_for_empty_cpusets(&top_cpuset);
 		break;
@@ -2100,9 +2115,9 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	cpuset_cpus_allowed_locked(tsk, pmask);
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 }
 
 /**
@@ -2135,11 +2150,11 @@ nodemask_t cpuset_mems_allowed(struct ta
 {
 	nodemask_t mask;
 
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 	task_lock(tsk);
 	guarantee_online_mems(task_cs(tsk), &mask);
 	task_unlock(tsk);
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 
 	return mask;
 }
@@ -2252,14 +2267,14 @@ int __cpuset_node_allowed_softwall(int n
 		return 1;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	mutex_lock(&callback_mutex);
+	cpuset_lock();
 
 	task_lock(current);
 	cs = nearest_hardwall_ancestor(task_cs(current));
 	task_unlock(current);
 
 	allowed = node_isset(node, cs->mems_allowed);
-	mutex_unlock(&callback_mutex);
+	cpuset_unlock();
 	return allowed;
 }
 
@@ -2302,33 +2317,6 @@ int __cpuset_node_allowed_hardwall(int n
 }
 
 /**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset.  Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list.  The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-	mutex_lock(&callback_mutex);
-}
-
-/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
-	mutex_unlock(&callback_mutex);
-}
-
-/**
  * cpuset_mem_spread_node() - On which node to begin search for a page
  *
  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for


  reply	other threads:[~2009-07-29 21:26 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-07-29  2:33 + cpu_hotplug-dont-affect-current-tasks-affinity.patch added to -mm tree Oleg Nesterov
2009-07-29 21:21 ` Oleg Nesterov
2009-07-29 21:22   ` Oleg Nesterov [this message]
2009-07-29 23:00     ` [PATCH] cpusets: fix deadlock with cpu_down()->cpuset_lock() Oleg Nesterov
2009-07-30  1:53       ` Lai Jiangshan
2009-07-30 17:51         ` Oleg Nesterov
2009-07-31  2:23           ` Lai Jiangshan
2009-08-01  4:42             ` [PATCH] cpusets: rework guarantee_online_cpus() to fix deadlock with cpu_down() Oleg Nesterov
2009-08-01  5:34               ` Oleg Nesterov
2009-08-02  2:18               ` Lai Jiangshan
2009-08-02  6:55                 ` Oleg Nesterov
2009-08-04  7:35                   ` Lai Jiangshan
2009-08-04 16:36                     ` Oleg Nesterov
2009-07-29 21:42   ` [PATCH 0/1] cpu_hotplug: don't play with current->cpus_allowed Oleg Nesterov
2009-07-29 21:43   ` [PATCH 1/1] " Oleg Nesterov
2009-07-30  2:01     ` Lai Jiangshan
2009-07-30 16:32       ` Oleg Nesterov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090729212216.GB16970@redhat.com \
    --to=oleg@redhat.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=akpm@linux-foundation.org \
    --cc=ego@in.ibm.com \
    --cc=laijs@cn.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=menage@google.com \
    --cc=miaox@cn.fujitsu.com \
    --cc=mingo@elte.hu \
    --cc=rusty@rustcorp.com.au \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.