* [RFC][PATCH] stop_machine: Fix deadlock between multiple stop_two_cpus()
@ 2015-06-05 15:30 Peter Zijlstra
2015-06-05 17:08 ` Rik van Riel
2015-06-19 18:00 ` [tip:sched/core] sched/stop_machine: " tip-bot for Peter Zijlstra
0 siblings, 2 replies; 3+ messages in thread
From: Peter Zijlstra @ 2015-06-05 15:30 UTC (permalink / raw)
To: Ingo Molnar, Rik van Riel, Jiri Olsa; +Cc: linux-kernel
Jiri reported a machine stuck in multi_cpu_stop() with
migrate_swap_stop() as function and with the following src,dst cpu
pairs: {11, 4} {13, 11} { 4, 13}
4 11 13
cpuM: queue(4 ,13)
*Ma
cpuN: queue(13,11)
*N Na
*M Mb
cpuO: queue(11, 4)
*O Oa
*Nb
*Ob
Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes
the first/second queued work.
You'll observe the top of the workqueue for each cpu: 4,11,13 to be work
from cpus: M, O, N resp. IOW. deadlock.
Do away with the queueing trickery and introduce lg_double_lock() to
lock both CPUs and fully serialize the stop_two_cpus() callers instead
of the partial (and buggy) serialization we have now.
Completely untested..
Reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/lglock.h | 5 +++++
kernel/locking/lglock.c | 22 ++++++++++++++++++++++
kernel/stop_machine.c | 41 +++++------------------------------------
3 files changed, 32 insertions(+), 36 deletions(-)
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 0081f000e34b..c92ebd100d9b 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -52,10 +52,15 @@ struct lglock {
static struct lglock name = { .lock = &name ## _lock }
void lg_lock_init(struct lglock *lg, char *name);
+
void lg_local_lock(struct lglock *lg);
void lg_local_unlock(struct lglock *lg);
void lg_local_lock_cpu(struct lglock *lg, int cpu);
void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
+
void lg_global_lock(struct lglock *lg);
void lg_global_unlock(struct lglock *lg);
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..951cfcd10b4a 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
}
EXPORT_SYMBOL(lg_local_unlock_cpu);
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+ BUG_ON(cpu1 == cpu2);
+
+ /* lock in cpu order, just like lg_global_lock */
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+ preempt_enable();
+}
+
void lg_global_lock(struct lglock *lg)
{
int i;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 695f0c6cd169..fa6b6d2af6b2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
return err;
}
-struct irq_cpu_stop_queue_work_info {
- int cpu1;
- int cpu2;
- struct cpu_stop_work *work1;
- struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
- struct irq_cpu_stop_queue_work_info *info = arg;
- cpu_stop_queue_work(info->cpu1, info->work1);
- cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
/**
* stop_two_cpus - stops two cpus
* @cpu1: the cpu to stop
@@ -262,13 +243,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
.done = &done
};
- call_args = (struct irq_cpu_stop_queue_work_info){
- .cpu1 = cpu1,
- .cpu2 = cpu2,
- .work1 = &work1,
- .work2 = &work2,
- };
-
cpu_stop_init_done(&done, 2);
set_state(&msdata, MULTI_STOP_PREPARE);
@@ -285,16 +259,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
return -ENOENT;
}
- lg_local_lock(&stop_cpus_lock);
- /*
- * Queuing needs to be done by the lowest numbered CPU, to ensure
- * that works are always queued in the same order on every CPU.
- * This prevents deadlocks.
- */
- smp_call_function_single(min(cpu1, cpu2),
- &irq_cpu_stop_queue_work,
- &call_args, 1);
- lg_local_unlock(&stop_cpus_lock);
+ lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+ cpu_stop_queue_work(cpu1, work1);
+ cpu_stop_queue_work(cpu2, work2);
+ lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
preempt_enable();
wait_for_completion(&done.completion);
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [RFC][PATCH] stop_machine: Fix deadlock between multiple stop_two_cpus()
2015-06-05 15:30 [RFC][PATCH] stop_machine: Fix deadlock between multiple stop_two_cpus() Peter Zijlstra
@ 2015-06-05 17:08 ` Rik van Riel
2015-06-19 18:00 ` [tip:sched/core] sched/stop_machine: " tip-bot for Peter Zijlstra
1 sibling, 0 replies; 3+ messages in thread
From: Rik van Riel @ 2015-06-05 17:08 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Jiri Olsa; +Cc: linux-kernel
On 06/05/2015 11:30 AM, Peter Zijlstra wrote:
>
> Jiri reported a machine stuck in multi_cpu_stop() with
> migrate_swap_stop() as function and with the following src,dst cpu
> pairs: {11, 4} {13, 11} { 4, 13}
>
> 4 11 13
>
> cpuM: queue(4 ,13)
> *Ma
> cpuN: queue(13,11)
> *N Na
> *M Mb
> cpuO: queue(11, 4)
> *O Oa
> *Nb
> *Ob
>
> Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes
> the first/second queued work.
>
> You'll observe the top of the workqueue for each cpu: 4,11,13 to be work
> from cpus: M, O, N resp. IOW. deadlock.
>
> Do away with the queueing trickery and introduce lg_double_lock() to
> lock both CPUs and fully serialize the stop_two_cpus() callers instead
> of the partial (and buggy) serialization we have now.
>
> Completely untested..
>
> Reported-by: Jiri Olsa <jolsa@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Certainly looks like it would work.
I suspect we should probably apply this patch after some basic
testing, since the race is so incredibly hard to reproduce.
--
All rights reversed
^ permalink raw reply [flat|nested] 3+ messages in thread
* [tip:sched/core] sched/stop_machine: Fix deadlock between multiple stop_two_cpus()
2015-06-05 15:30 [RFC][PATCH] stop_machine: Fix deadlock between multiple stop_two_cpus() Peter Zijlstra
2015-06-05 17:08 ` Rik van Riel
@ 2015-06-19 18:00 ` tip-bot for Peter Zijlstra
1 sibling, 0 replies; 3+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-19 18:00 UTC (permalink / raw)
To: linux-tip-commits
Cc: akpm, bp, torvalds, linux-kernel, peterz, mingo, jolsa, riel, hpa,
oleg, tglx
Commit-ID: b17718d02f54b90978d0e0146368b512b11c3e84
Gitweb: http://git.kernel.org/tip/b17718d02f54b90978d0e0146368b512b11c3e84
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 5 Jun 2015 17:30:23 +0200
Committer: Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 19 Jun 2015 10:03:12 +0200
sched/stop_machine: Fix deadlock between multiple stop_two_cpus()
Jiri reported a machine stuck in multi_cpu_stop() with
migrate_swap_stop() as function and with the following src,dst cpu
pairs: {11, 4} {13, 11} { 4, 13}
4 11 13
cpuM: queue(4 ,13)
*Ma
cpuN: queue(13,11)
*N Na
*M Mb
cpuO: queue(11, 4)
*O Oa
*Nb
*Ob
Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes
the first/second queued work.
You'll observe the top of the workqueue for each cpu: 4,11,13 to be work
from cpus: M, O, N resp. IOW. deadlock.
Do away with the queueing trickery and introduce lg_double_lock() to
lock both CPUs and fully serialize the stop_two_cpus() callers instead
of the partial (and buggy) serialization we have now.
Reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150605153023.GH19282@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
include/linux/lglock.h | 5 +++++
kernel/locking/lglock.c | 22 ++++++++++++++++++++++
kernel/stop_machine.c | 42 +++++-------------------------------------
3 files changed, 32 insertions(+), 37 deletions(-)
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 0081f00..c92ebd1 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -52,10 +52,15 @@ struct lglock {
static struct lglock name = { .lock = &name ## _lock }
void lg_lock_init(struct lglock *lg, char *name);
+
void lg_local_lock(struct lglock *lg);
void lg_local_unlock(struct lglock *lg);
void lg_local_lock_cpu(struct lglock *lg, int cpu);
void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
+
void lg_global_lock(struct lglock *lg);
void lg_global_unlock(struct lglock *lg);
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2ae..951cfcd 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
}
EXPORT_SYMBOL(lg_local_unlock_cpu);
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+ BUG_ON(cpu1 == cpu2);
+
+ /* lock in cpu order, just like lg_global_lock */
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
+
+ preempt_disable();
+ lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+ lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+ arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+ preempt_enable();
+}
+
void lg_global_lock(struct lglock *lg)
{
int i;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 695f0c6..fd643d8 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
return err;
}
-struct irq_cpu_stop_queue_work_info {
- int cpu1;
- int cpu2;
- struct cpu_stop_work *work1;
- struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
- struct irq_cpu_stop_queue_work_info *info = arg;
- cpu_stop_queue_work(info->cpu1, info->work1);
- cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
/**
* stop_two_cpus - stops two cpus
* @cpu1: the cpu to stop
@@ -245,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
{
struct cpu_stop_done done;
struct cpu_stop_work work1, work2;
- struct irq_cpu_stop_queue_work_info call_args;
struct multi_stop_data msdata;
preempt_disable();
@@ -262,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
.done = &done
};
- call_args = (struct irq_cpu_stop_queue_work_info){
- .cpu1 = cpu1,
- .cpu2 = cpu2,
- .work1 = &work1,
- .work2 = &work2,
- };
-
cpu_stop_init_done(&done, 2);
set_state(&msdata, MULTI_STOP_PREPARE);
@@ -285,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
return -ENOENT;
}
- lg_local_lock(&stop_cpus_lock);
- /*
- * Queuing needs to be done by the lowest numbered CPU, to ensure
- * that works are always queued in the same order on every CPU.
- * This prevents deadlocks.
- */
- smp_call_function_single(min(cpu1, cpu2),
- &irq_cpu_stop_queue_work,
- &call_args, 1);
- lg_local_unlock(&stop_cpus_lock);
+ lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+ cpu_stop_queue_work(cpu1, &work1);
+ cpu_stop_queue_work(cpu2, &work2);
+ lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
preempt_enable();
wait_for_completion(&done.completion);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2015-06-19 18:01 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-06-05 15:30 [RFC][PATCH] stop_machine: Fix deadlock between multiple stop_two_cpus() Peter Zijlstra
2015-06-05 17:08 ` Rik van Riel
2015-06-19 18:00 ` [tip:sched/core] sched/stop_machine: " tip-bot for Peter Zijlstra
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.