From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: stable-review@kernel.org, torvalds@linux-foundation.org,
akpm@linux-foundation.org, alan@lxorguk.ukuu.org.uk,
Ingo Molnar <mingo@elte.hu>,
Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>,
Oleg Nesterov <oleg@redhat.com>, Mike Galbraith <efault@gmx.de>
Subject: [093/123] sched: Kill the broken and deadlockable cpuset_lock/cpuset_cpus_allowed_locked code
Date: Sat, 18 Sep 2010 11:58:57 -0700 [thread overview]
Message-ID: <20100918190000.783766814@clark.site> (raw)
In-Reply-To: <20100918190024.GA14388@kroah.com>
[-- Attachment #1: sched-kill-the-broken-and-deadlockable-cpuset_lock-cpuset_cpus_allowed_locked-code.patch --]
[-- Type: text/plain, Size: 5548 bytes --]
From: Oleg Nesterov <oleg@redhat.com>
commit 897f0b3c3ff40b443c84e271bef19bd6ae885195 upstream
This patch just states the fact that the cpusets/cpuhotplug interaction is
broken and removes the deadlockable code which only pretends to work.
- cpuset_lock() doesn't really work. It is needed for
cpuset_cpus_allowed_locked() but we can't take this lock in
try_to_wake_up()->select_fallback_rq() path.
- cpuset_lock() is deadlockable. Suppose that a task T bound to CPU takes
callback_mutex. If cpu_down(CPU) happens before T drops callback_mutex and
stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take
cpuset_lock() and hangs forever because CPU is already dead and thus
T can't be scheduled.
- cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock()
which is not irq-safe, but try_to_wake_up() can be called from irq.
Kill them, and change select_fallback_rq() to use cpu_possible_mask, like
we currently do without CONFIG_CPUSETS.
Also, with or without this patch, with or without CONFIG_CPUSETS, the
callers of select_fallback_rq() can race with each other or with
set_cpus_allowed() paths.
The subsequent patches try to fix these problems.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091003.GA9123@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
include/linux/cpuset.h | 13 -------------
kernel/cpuset.c | 27 +--------------------------
kernel/sched.c | 10 +++-------
3 files changed, 4 insertions(+), 46 deletions(-)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,8 +21,6 @@ extern int number_of_cpusets; /* How man
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern void cpuset_cpus_allowed_locked(struct task_struct *p,
- struct cpumask *mask);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
@@ -69,9 +67,6 @@ struct seq_file;
extern void cpuset_task_status_allowed(struct seq_file *m,
struct task_struct *task);
-extern void cpuset_lock(void);
-extern void cpuset_unlock(void);
-
extern int cpuset_mem_spread_node(void);
static inline int cpuset_do_page_mem_spread(void)
@@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(s
{
cpumask_copy(mask, cpu_possible_mask);
}
-static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
- struct cpumask *mask)
-{
- cpumask_copy(mask, cpu_possible_mask);
-}
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
@@ -157,9 +147,6 @@ static inline void cpuset_task_status_al
{
}
-static inline void cpuset_lock(void) {}
-static inline void cpuset_unlock(void) {}
-
static inline int cpuset_mem_spread_node(void)
{
return 0;
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2145,19 +2145,10 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
mutex_lock(&callback_mutex);
- cpuset_cpus_allowed_locked(tsk, pmask);
- mutex_unlock(&callback_mutex);
-}
-
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
-{
task_lock(tsk);
guarantee_online_cpus(task_cs(tsk), pmask);
task_unlock(tsk);
+ mutex_unlock(&callback_mutex);
}
void cpuset_init_current_mems_allowed(void)
@@ -2346,22 +2337,6 @@ int __cpuset_node_allowed_hardwall(int n
}
/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
- mutex_lock(&callback_mutex);
-}
-
-/**
* cpuset_unlock - release lock on cpuset changes
*
* Undo the lock taken in a previous cpuset_lock() call.
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2349,11 +2349,9 @@ static int select_fallback_rq(int cpu, s
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (dest_cpu >= nr_cpu_ids) {
- rcu_read_lock();
- cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
- rcu_read_unlock();
- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+ if (unlikely(dest_cpu >= nr_cpu_ids)) {
+ cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+ dest_cpu = cpumask_any(cpu_active_mask);
/*
* Don't tell them about moving exiting tasks or
@@ -7833,7 +7831,6 @@ migration_call(struct notifier_block *nf
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
/* Idle task back to normal (off runqueue, low prio) */
@@ -7844,7 +7841,6 @@ migration_call(struct notifier_block *nf
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
spin_unlock_irq(&rq->lock);
- cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
calc_global_load_remove(rq);
next prev parent reply other threads:[~2010-09-18 19:13 UTC|newest]
Thread overview: 132+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20100918185724.290702750@clark.site>
2010-09-18 19:00 ` [000/123] 2.6.32.22-stable review Greg KH
2010-09-18 18:57 ` [001/123] hwmon: (k8temp) Differentiate between AM2 and ASB1 Greg KH
2010-09-18 18:57 ` [002/123] xen: handle events as edge-triggered Greg KH
2010-09-18 18:57 ` [003/123] xen: use percpu interrupts for IPIs and VIRQs Greg KH
2010-09-18 18:57 ` [004/123] ALSA: hda - Rename iMic to Int Mic on Lenovo NB0763 Greg KH
2010-09-18 18:57 ` [005/123] sata_mv: fix broken DSM/TRIM support (v2) Greg KH
2010-09-18 18:57 ` [006/123] x86, tsc, sched: Recompute cyc2ns_offsets during resume from sleep states Greg KH
2010-09-18 18:57 ` [007/123] PCI: MSI: Remove unsafe and unnecessary hardware access Greg KH
2010-09-18 18:57 ` [008/123] PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc() Greg KH
2010-09-18 18:57 ` [009/123] sched: kill migration thread in CPU_POST_DEAD instead of CPU_DEAD Greg KH
2010-09-18 18:57 ` [010/123] sched: revert stable c6fc81a sched: Fix a race between ttwu() and migrate_task() Greg KH
2010-09-18 18:57 ` [011/123] staging: hv: Fix missing functions for net_device_ops Greg KH
2010-09-18 18:57 ` [012/123] staging: hv: Fixed bounce kmap problem by using correct index Greg KH
2010-09-18 18:57 ` [013/123] staging: hv: Fixed the value of the 64bit-hole inside ring buffer Greg KH
2010-09-18 18:57 ` [014/123] staging: hv: Increased storvsc ringbuffer and max_io_requests Greg KH
2010-09-18 18:57 ` [015/123] staging: hv: Fixed lockup problem with bounce_buffer scatter list Greg KH
2010-09-18 18:57 ` [016/123] fuse: flush background queue on connection close Greg KH
2010-09-18 18:57 ` [017/123] ath9k_hw: fix parsing of HT40 5 GHz CTLs Greg KH
2010-09-18 18:57 ` [018/123] ocfs2: Fix incorrect checksum validation error Greg KH
2010-09-18 18:57 ` [019/123] USB: ehci-ppc-of: problems in unwind Greg KH
2010-09-18 18:57 ` [020/123] USB: Fix kernel oops with g_ether and Windows Greg KH
2010-09-18 18:57 ` [021/123] USB: CP210x Add new device ID Greg KH
2010-09-18 18:57 ` [022/123] USB: cp210x: Add B&G H3000 link cable ID Greg KH
2010-09-18 18:57 ` [023/123] USB: ftdi_sio: Added custom PIDs for ChamSys products Greg KH
2010-09-18 18:57 ` [024/123] USB: serial: Extra device/vendor ID for mos7840 driver Greg KH
2010-09-18 18:57 ` [025/123] usb: serial: mos7840: Add USB ID to support the B&B Electronics USOPTL4-2P Greg KH
2010-09-18 18:57 ` [026/123] USB: mos7840: fix DMA buffers on stack and endianess bugs Greg KH
2010-09-18 18:57 ` [027/123] usb: serial: mos7840: Add USB IDs to support more B&B USB/RS485 converters Greg KH
2010-09-18 18:57 ` [028/123] USB: Exposing second ACM channel as tty for Nokia S60 phones Greg KH
2010-09-18 18:57 ` [029/123] USB: cdc-acm: add another device quirk Greg KH
2010-09-18 18:57 ` [030/123] USB: Expose vendor-specific ACM channel on Nokia 5230 Greg KH
2010-09-18 18:57 ` [031/123] USB: cdc-acm: Adding second ACM channel support for various Nokia and one Samsung phones Greg KH
2010-09-18 18:57 ` [032/123] USB: cdc-acm: Add pseudo modem without AT command capabilities Greg KH
2010-09-18 18:57 ` [033/123] USB: cdc-acm: Fixing crash when ACM probing interfaces with no endpoint descriptors Greg KH
2010-09-18 18:57 ` [034/123] ALSA: hda - Fix auto-parser of ALC269vb for HP pin NID 0x21 Greg KH
2010-09-18 18:57 ` [035/123] ALSA: seq/oss - Fix double-free at error path of snd_seq_oss_open() Greg KH
2010-09-18 18:58 ` [036/123] sysfs: checking for NULL instead of ERR_PTR Greg KH
2010-09-18 18:58 ` [037/123] tun: Dont add sysfs attributes to devices without sysfs directories Greg KH
2010-09-18 18:58 ` [038/123] oprofile: fix crash when accessing freed task structs Greg KH
2010-09-18 18:58 ` [039/123] oprofile, x86: fix init_sysfs error handling Greg KH
2010-09-18 18:58 ` [040/123] oprofile, x86: fix init_sysfs() function stub Greg KH
2010-09-18 18:58 ` [041/123] HID: usbhid: initialize interface pointers early enough Greg KH
2010-09-18 18:58 ` [042/123] HID: fix suspend crash by moving initializations earlier Greg KH
2010-09-18 18:58 ` [043/123] libata: skip EH autopsy and recovery during suspend Greg KH
2010-09-18 18:58 ` [044/123] tracing: Fix a race in function profile Greg KH
2010-09-18 18:58 ` [045/123] tracing: Do not allow llseek to set_ftrace_filter Greg KH
2010-09-18 18:58 ` [046/123] tracing: t_start: reset FTRACE_ITER_HASH in case of seek/pread Greg KH
2010-09-18 18:58 ` [047/123] irda: off by one Greg KH
2010-09-18 18:58 ` [048/123] gcov: fix null-pointer dereference for certain module types Greg KH
2010-09-18 18:58 ` [049/123] tmio_mmc: dont clear unhandled pending interrupts Greg KH
2010-09-18 18:58 ` [050/123] mmc: fix the use of kunmap_atomic() in tmio_mmc.h Greg KH
2010-09-18 18:58 ` [051/123] bounce: call flush_dcache_page() after bounce_copy_vec() Greg KH
2010-09-18 18:58 ` [052/123] kernel/groups.c: fix integer overflow in groups_search Greg KH
2010-09-18 18:58 ` [053/123] binfmt_misc: fix binfmt_misc priority Greg KH
2010-09-18 18:58 ` [054/123] Input: i8042 - fix device removal on unload Greg KH
2010-09-18 18:58 ` [055/123] memory hotplug: fix next block calculation in is_removable Greg KH
2010-09-18 18:58 ` [056/123] perf: Initialize callchains rootss childen hits Greg KH
2010-09-18 18:58 ` [057/123] p54: fix tx feedback status flag check Greg KH
2010-09-18 18:58 ` [058/123] ath5k: check return value of ieee80211_get_tx_rate Greg KH
2010-09-18 18:58 ` [059/123] wireless extensions: fix kernel heap content leak Greg KH
2010-09-18 18:58 ` [060/123] x86, tsc: Fix a preemption leak in restore_sched_clock_state() Greg KH
2010-09-18 18:58 ` [061/123] x86-64, compat: Test %rax for the syscall number, not %eax Greg KH
2010-09-18 18:58 ` [062/123] compat: Make compat_alloc_user_space() incorporate the access_ok() Greg KH
2010-09-18 18:58 ` [063/123] x86-64, compat: Retruncate rax after ia32 syscall entry tracing Greg KH
2010-09-18 18:58 ` [064/123] sched: Protect task->cpus_allowed access in sched_getaffinity() Greg KH
2010-09-18 20:32 ` [Stable-review] " Ben Hutchings
2010-09-18 22:19 ` Greg KH
2010-09-19 5:10 ` Mike Galbraith
2010-09-19 9:40 ` Mike Galbraith
2010-09-18 18:58 ` [065/123] sched: Protect sched_rr_get_param() access to task->sched_class Greg KH
2010-09-18 18:58 ` [066/123] sched: Consolidate select_task_rq() callers Greg KH
2010-09-18 18:58 ` [067/123] sched: Remove unused cpu_nr_migrations() Greg KH
2010-09-18 18:58 ` [068/123] sched: Remove rq->clock coupling from set_task_cpu() Greg KH
2010-09-18 18:58 ` [069/123] sched: Clean up ttwu() rq locking Greg KH
2010-09-18 18:58 ` [070/123] sched: Sanitize fork() handling Greg KH
2010-09-18 18:58 ` [071/123] sched: Remove forced2_migrations stats Greg KH
2010-09-18 18:58 ` [072/123] sched: Make wakeup side and atomic variants of completion API irq safe Greg KH
2010-09-18 18:58 ` [073/123] sched: Use rcu in sys_sched_getscheduler/sys_sched_getparam() Greg KH
2010-09-18 18:58 ` [074/123] sched: Use rcu in sched_get/set_affinity() Greg KH
2010-09-18 18:58 ` [075/123] sched: Use rcu in sched_get_rr_param() Greg KH
2010-09-18 18:58 ` [076/123] sched: Fix set_cpu_active() in cpu_down() Greg KH
2010-09-18 18:58 ` [077/123] sched: Use TASK_WAKING for fork wakups Greg KH
2010-09-18 18:58 ` [078/123] sched: Ensure set_task_cpu() is never called on blocked tasks Greg KH
2010-09-18 18:58 ` [079/123] sched: Make warning less noisy Greg KH
2010-09-18 18:58 ` [080/123] sched: Fix broken assertion Greg KH
2010-09-18 18:58 ` [081/123] sched: Fix sched_exec() balancing Greg KH
2010-09-18 18:58 ` [082/123] sched: Fix select_task_rq() vs hotplug issues Greg KH
2010-09-18 18:58 ` [083/123] sched: Add pre and post wakeup hooks Greg KH
2010-09-18 18:58 ` [084/123] sched: Remove the cfs_rq dependency from set_task_cpu() Greg KH
2010-09-18 18:58 ` [085/123] sched: Fix hotplug hang Greg KH
2010-09-18 18:58 ` [086/123] sched: Fix fork vs hotplug vs cpuset namespaces Greg KH
2010-09-18 18:58 ` [087/123] sched: Fix incorrect sanity check Greg KH
2010-09-18 18:58 ` [088/123] sched: Fix race between ttwu() and task_rq_lock() Greg KH
2010-09-18 18:58 ` [089/123] sched: Extend enqueue_task to allow head queueing Greg KH
2010-09-18 18:58 ` [090/123] sched: Implement head queueing for sched_rt Greg KH
2010-09-18 18:58 ` [091/123] sched: Queue a deboosted task to the head of the RT prio queue Greg KH
2010-09-18 18:58 ` [092/123] sched: set_cpus_allowed_ptr(): Dont use rq->migration_thread after unlock Greg KH
2010-09-18 18:58 ` Greg KH [this message]
2010-09-18 18:58 ` [094/123] sched: move_task_off_dead_cpu(): Take rq->lock around select_fallback_rq() Greg KH
2010-09-18 18:58 ` [095/123] sched: move_task_off_dead_cpu(): Remove retry logic Greg KH
2010-09-18 18:59 ` [096/123] sched: sched_exec(): Remove the select_fallback_rq() logic Greg KH
2010-09-18 18:59 ` [097/123] sched: _cpu_down(): Dont play with current->cpus_allowed Greg KH
2010-09-18 18:59 ` [098/123] sched: Make select_fallback_rq() cpuset friendly Greg KH
2010-09-18 18:59 ` [099/123] sched: Fix TASK_WAKING vs fork deadlock Greg KH
2010-09-18 18:59 ` [100/123] sched: Optimize task_rq_lock() Greg KH
2010-09-18 18:59 ` [101/123] sched: Fix nr_uninterruptible count Greg KH
2010-09-18 18:59 ` [102/123] sched: Fix rq->clock synchronization when migrating tasks Greg KH
2010-09-18 18:59 ` [103/123] sched: Remove unnecessary RCU exclusion Greg KH
2010-09-18 18:59 ` [104/123] sched: apply RCU protection to wake_affine() Greg KH
2010-09-18 18:59 ` [105/123] sched: Cleanup select_task_rq_fair() Greg KH
2010-09-18 18:59 ` [106/123] sched: More generic WAKE_AFFINE vs select_idle_sibling() Greg KH
2010-09-18 18:59 ` [107/123] sched: Fix vmark regression on big machines Greg KH
2010-09-18 18:59 ` [108/123] sched: Fix select_idle_sibling() Greg KH
2010-09-18 18:59 ` [109/123] sched: Pre-compute cpumask_weight(sched_domain_span(sd)) Greg KH
2010-09-18 18:59 ` [110/123] sched: Fix select_idle_sibling() logic in select_task_rq_fair() Greg KH
2010-09-18 18:59 ` [111/123] sched: cpuacct: Use bigger percpu counter batch values for stats counters Greg KH
2010-09-18 18:59 ` [112/123] ALSA: hda - Handle missing NID 0x1b on ALC259 codec Greg KH
2010-09-18 18:59 ` [113/123] ALSA: hda - Handle pin NID 0x1a on ALC259/269 Greg KH
2010-09-18 18:59 ` [114/123] arm: fix really nasty sigreturn bug Greg KH
2010-09-18 18:59 ` [115/123] hwmon: (f75375s) Shift control mode to the correct bit position Greg KH
2010-09-18 18:59 ` [116/123] hwmon: (f75375s) Do not overwrite values read from registers Greg KH
2010-09-18 18:59 ` [117/123] apm_power: Add missing break statement Greg KH
2010-09-18 18:59 ` [118/123] NFS: Fix a typo in nfs_sockaddr_match_ipaddr6 Greg KH
2010-09-18 18:59 ` [119/123] SUNRPC: Fix race corrupting rpc upcall Greg KH
2010-09-18 18:59 ` [120/123] i915: return -EFAULT if copy_to_user fails Greg KH
2010-09-18 18:59 ` [121/123] i915_gem: " Greg KH
2010-09-18 18:59 ` [122/123] drm/i915: Prevent double dpms on Greg KH
2010-09-18 18:59 ` [123/123] drm: Only decouple the old_fb from the crtc is we call mode_set* Greg KH
2010-09-18 21:39 ` [Stable-review] [000/123] 2.6.32.22-stable review Willy Tarreau
2010-09-19 4:00 ` Greg KH
2010-09-19 5:45 ` Willy Tarreau
2010-09-19 4:34 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100918190000.783766814@clark.site \
--to=gregkh@suse.de \
--cc=a.p.zijlstra@chello.nl \
--cc=akpm@linux-foundation.org \
--cc=alan@lxorguk.ukuu.org.uk \
--cc=efault@gmx.de \
--cc=greg@kroah.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=oleg@redhat.com \
--cc=stable-review@kernel.org \
--cc=stable@kernel.org \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox