* [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug
@ 2025-06-02 7:22 Kuyo Chang
2025-06-05 10:00 ` Peter Zijlstra
0 siblings, 1 reply; 8+ messages in thread
From: Kuyo Chang @ 2025-06-02 7:22 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
Valentin Schneider, Matthias Brugger, AngeloGioacchino Del Regno
Cc: kuyo chang, linux-kernel, linux-arm-kernel, linux-mediatek
From: kuyo chang <kuyo.chang@mediatek.com>
Sporadic failures are encountered during CPU hotplug stress tests.
[Syndrome]
The kernel log shows the list_add corruption below:
kmemleak: list_add corruption.
prev->next should be next (ffffff82812c7a00),
but was 0000000000000000. (prev=ffffff82812c3208).
kmemleak: kernel BUG at lib/list_debug.c:34!
kmemleak: Call trace:
kmemleak: __list_add_valid_or_report+0x11c/0x144
kmemleak: cpu_stop_queue_work+0x440/0x474
kmemleak: stop_one_cpu_nowait+0xe4/0x138
kmemleak: balance_push+0x1f4/0x3e4
kmemleak: __schedule+0x1adc/0x23bc
kmemleak: preempt_schedule_common+0x68/0xd0
kmemleak: preempt_schedule+0x60/0x80
kmemleak: _raw_spin_unlock_irqrestore+0x9c/0xa0
kmemleak: scan_gray_list+0x220/0x3e4
kmemleak: kmemleak_scan+0x410/0x740
kmemleak: kmemleak_scan_thread+0xb0/0xdc
kmemleak: kthread+0x2bc/0x494
kmemleak: ret_from_fork+0x10/0x20
[Analysis]
In the failure case, the memory dump shows
cpu_stopper.enabled == true,
but the local wakeq is empty (migrate/1 is already on another wake queue):
static bool cpu_stop_queue_work(...)
{
	...
	enabled = stopper->enabled;
	if (enabled)
		__cpu_stop_queue_work(stopper, work, &wakeq);
	...
	wake_up_q(&wakeq);	/* <-- wakeq is empty !! */
	preempt_enable();

	return enabled;
}
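For reference, the local wakeq stays empty because wake_q_add() refuses to queue a task
that is already on some other wake queue; such a task is expected to get its wakeup from
whoever queued it first. A simplified sketch of __wake_q_add(), paraphrased from
kernel/sched/core.c (details may differ between kernel versions):

static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically claim the task: if ->wake_q.next is already non-NULL,
	 * the task is on another wake queue (here: the one built by
	 * cpu_stop_queue_two_works() for migrate_swap()) and will be woken
	 * from there, so it must not be added again.
	 */
	smp_mb__before_atomic();
	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
		return false;	/* already queued -> our local wakeq stays empty */

	/* The head is context local, there can be no concurrency. */
	*head->lastp = node;
	head->lastp = &node->next;
	return true;
}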
Analysis of the CPU0 call trace and memory dump shows:
CPU0: migration/0, pid: 43, priority: 99
Native callstack:
vmlinux __kern_my_cpu_offset() <arch/arm64/include/asm/percpu.h:40>
vmlinux ct_state_inc(incby=8) <include/linux/context_tracking.h:137>
vmlinux rcu_momentary_eqs() + 72 <kernel/rcu/tree.c:375>
vmlinux multi_cpu_stop() + 316 <kernel/stop_machine.c:278>
vmlinux cpu_stopper_thread() + 676 <kernel/stop_machine.c:563>
vmlinux smpboot_thread_fn(data=0) + 1188 <kernel/smpboot.c:164>
vmlinux kthread() + 696 <kernel/kthread.c:389>
vmlinux 0xFFFFFFC08005941C() <arch/arm64/kernel/entry.S:845>
(struct migration_swap_arg *)0xFFFFFFC08FF87A40 = (
    src_task = 0xFFFFFF80FF519740,
    dst_task = 0xFFFFFF802A579740,
    src_cpu  = 0x0,
    dst_cpu  = 0x1)
(struct multi_stop_data *)0xFFFFFFC08FF87930 = (
    fn = 0xFFFFFFC0802657F4 = migrate_swap_stop,
    data = 0xFFFFFFC08FF87A40,
    num_threads = 0x2,
    active_cpus = cpu_bit_bitmap[1] -> (bits = (0x2)),
    state = MULTI_STOP_PREPARE = 0x1,
    thread_ack = (counter = 0x1))
By the cpumask memory dump:
    ((const struct cpumask *)&__cpu_online_mask)   (bits = (0xFF))
    ((const struct cpumask *)&__cpu_dying_mask)    (bits = (0x2))
    ((const struct cpumask *)&__cpu_active_mask)   (bits = (0xFD))
    ((const struct cpumask *)&__cpu_possible_mask) (bits = (0xFF))
=> This implies CPU1 is dying and no longer active.
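For reference, bit n in these masks corresponds to CPU n. A hypothetical helper
(illustrative only, not part of the dump) showing how the values above map to the
usual CPU-state predicates:

#include <linux/cpumask.h>
#include <linux/printk.h>

/* Illustrative only: interpret the dumped mask values for CPU1. */
static void __maybe_unused dump_cpu1_state(void)
{
	pr_info("cpu1: online=%d dying=%d active=%d possible=%d\n",
		cpu_online(1),		/* __cpu_online_mask   = 0xFF -> 1 */
		cpu_dying(1),		/* __cpu_dying_mask    = 0x02 -> 1 */
		cpu_active(1),		/* __cpu_active_mask   = 0xFD -> 0 */
		cpu_possible(1));	/* __cpu_possible_mask = 0xFF -> 1 */
}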
So, the potential race scenario is:
CPU0                                    CPU1
// doing migrate_swap(cpu0/cpu1)
stop_two_cpus()
...
                                        // doing _cpu_down()
                                        sched_cpu_deactivate()
                                          set_cpu_active(cpu, false);
                                          balance_push_set(cpu, true);
cpu_stop_queue_two_works
  __cpu_stop_queue_work(stopper1,...);
  __cpu_stop_queue_work(stopper2,..);
  stop_cpus_in_progress -> true
  preempt_enable();
                                        ...
                                        1st balance_push
                                          stop_one_cpu_nowait
                                            cpu_stop_queue_work
                                              __cpu_stop_queue_work
                                                list_add_tail -> 1st add push_work
                                                wake_up_q(&wakeq); -> "wakeq is empty.
                                                This implies that the stopper is at wakeq@migrate_swap."
preempt_disable
wake_up_q(&wakeq);
  wake_up_process // wakeup migrate/0
    try_to_wake_up
      ttwu_queue
        ttwu_queue_cond -> meet below case
          if (cpu == smp_processor_id())
              return false;
        ttwu_do_activate
        // migrate/0 wakeup done
  wake_up_process // wakeup migrate/1
    try_to_wake_up
      ttwu_queue
        ttwu_queue_cond
          ttwu_queue_wakelist
            __ttwu_queue_wakelist
              __smp_call_single_queue
preempt_enable();
                                        2nd balance_push
                                          stop_one_cpu_nowait
                                            cpu_stop_queue_work
                                              __cpu_stop_queue_work
                                                list_add_tail -> 2nd add push_work, so the double list add is detected
                                        ...
                                        ...
                                        cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1
[Solution]
Fix this race condition by adding cpus_read_lock/cpus_read_unlock around stop_two_cpus().
This ensures that no CPUs can come up or go down during this operation.
Signed-off-by: kuyo chang <kuyo.chang@mediatek.com>
---
kernel/sched/core.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62b3416f5e43..1b371575206f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3441,6 +3441,8 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
.dst_cpu = target_cpu,
};
+ /* Make sure no CPUs can come up or down */
+ cpus_read_lock();
if (arg.src_cpu == arg.dst_cpu)
goto out;
@@ -3461,6 +3463,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
out:
+ cpus_read_unlock();
return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
--
2.45.2
* Re: [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug
2025-06-02 7:22 [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug Kuyo Chang
@ 2025-06-05 10:00 ` Peter Zijlstra
2025-06-06 3:46 ` Kuyo Chang
2025-07-01 13:13 ` [tip: sched/urgent] " tip-bot2 for Peter Zijlstra
0 siblings, 2 replies; 8+ messages in thread
From: Peter Zijlstra @ 2025-06-05 10:00 UTC (permalink / raw)
To: Kuyo Chang
Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Matthias Brugger, AngeloGioacchino Del Regno, linux-kernel,
linux-arm-kernel, linux-mediatek
On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote:
How easy can you reproduce this?
> So, the potential race scenario is:
>
> CPU0 CPU1
> // doing migrate_swap(cpu0/cpu1)
> stop_two_cpus()
> ...
> // doing _cpu_down()
> sched_cpu_deactivate()
> set_cpu_active(cpu, false);
> balance_push_set(cpu, true);
> cpu_stop_queue_two_works
> __cpu_stop_queue_work(stopper1,...);
> __cpu_stop_queue_work(stopper2,..);
> stop_cpus_in_progress -> true
> preempt_enable();
> ...
> 1st balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail -> 1st add push_work
> wake_up_q(&wakeq); -> "wakeq is empty.
> This implies that the stopper is at wakeq@migrate_swap."
> preempt_disable
> wake_up_q(&wakeq);
> wake_up_process // wakeup migrate/0
> try_to_wake_up
> ttwu_queue
> ttwu_queue_cond ->meet below case
> if (cpu == smp_processor_id())
> return false;
> ttwu_do_activate
> //migrate/0 wakeup done
> wake_up_process // wakeup migrate/1
> try_to_wake_up
> ttwu_queue
> ttwu_queue_cond
> ttwu_queue_wakelist
> __ttwu_queue_wakelist
> __smp_call_single_queue
> preempt_enable();
>
> 2nd balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail -> 2nd add push_work, so the double list add is detected
> ...
> ...
> cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1
>
So this balance_push() is part of schedule(), and schedule() is supposed
to switch to stopper task, but because of this race condition, stopper
task is stuck in WAKING state and not actually visible to be picked.
Therefore CPU1 can do another schedule() and end up doing another
balance_push() even though the last one hasn't been done yet.
So how about we do something like this? Does this help?
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62b3416f5e43..c37b80bd53e6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3939,6 +3939,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
if (!scx_allow_ttwu_queue(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d0562115b..8855a50cc216 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work,
- struct wake_q_head *wakeq)
+ struct cpu_stop_work *work)
{
list_add_tail(&work->list, &stopper->works);
- wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
@@ -101,12 +98,12 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work, &wakeq);
+ __cpu_stop_queue_work(stopper, work);
else if (work->done)
cpu_stop_signal_done(work->done);
raw_spin_unlock_irqrestore(&stopper->lock, flags);
- wake_up_q(&wakeq);
+ wake_up_process(stopper->thread);
preempt_enable();
return enabled;
@@ -264,7 +261,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
- DEFINE_WAKE_Q(wakeq);
int err;
retry:
@@ -300,8 +296,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
}
err = 0;
- __cpu_stop_queue_work(stopper1, work1, &wakeq);
- __cpu_stop_queue_work(stopper2, work2, &wakeq);
+ __cpu_stop_queue_work(stopper1, work1);
+ __cpu_stop_queue_work(stopper2, work2);
unlock:
raw_spin_unlock(&stopper2->lock);
@@ -316,7 +312,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
goto retry;
}
- wake_up_q(&wakeq);
+ wake_up_process(stopper1->thread);
+ wake_up_process(stopper2->thread);
preempt_enable();
return err;
* Re: [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug
2025-06-05 10:00 ` Peter Zijlstra
@ 2025-06-06 3:46 ` Kuyo Chang
2025-06-06 8:28 ` Peter Zijlstra
2025-07-01 13:13 ` [tip: sched/urgent] " tip-bot2 for Peter Zijlstra
1 sibling, 1 reply; 8+ messages in thread
From: Kuyo Chang @ 2025-06-06 3:46 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Matthias Brugger, AngeloGioacchino Del Regno, linux-kernel,
linux-arm-kernel, linux-mediatek
On Thu, 2025-06-05 at 12:00 +0200, Peter Zijlstra wrote:
> On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote:
>
> How easy can you reproduce this?
>
The probability of reproducing this is very low, roughly once every 1-2 weeks.
I think this issue can only occur when all of the races below happen together:
1. stop_two_cpus() vs. hotplug
2. cpu1 schedule()
3. ttwu queue IPI latency
So my initial intention was to fix (1) by adding
cpus_read_lock()/cpus_read_unlock() around stop_two_cpus().
> > So, the potential race scenario is:
> >
> > CPU0                                    CPU1
> > // doing migrate_swap(cpu0/cpu1)
> > stop_two_cpus()
> > ...
> >                                         // doing _cpu_down()
> >                                         sched_cpu_deactivate()
> >                                           set_cpu_active(cpu, false);
> >                                           balance_push_set(cpu, true);
> > cpu_stop_queue_two_works
> >   __cpu_stop_queue_work(stopper1,...);
> >   __cpu_stop_queue_work(stopper2,..);
> >   stop_cpus_in_progress -> true
> >   preempt_enable();
> >                                         ...
> >                                         1st balance_push
> >                                           stop_one_cpu_nowait
> >                                             cpu_stop_queue_work
> >                                               __cpu_stop_queue_work
> >                                                 list_add_tail -> 1st add push_work
> >                                                 wake_up_q(&wakeq); -> "wakeq is empty.
> >                                                 This implies that the stopper is at wakeq@migrate_swap."
> > preempt_disable
> > wake_up_q(&wakeq);
> >   wake_up_process // wakeup migrate/0
> >     try_to_wake_up
> >       ttwu_queue
> >         ttwu_queue_cond -> meet below case
> >           if (cpu == smp_processor_id())
> >               return false;
> >         ttwu_do_activate
> >         // migrate/0 wakeup done
> >   wake_up_process // wakeup migrate/1
> >     try_to_wake_up
> >       ttwu_queue
> >         ttwu_queue_cond
> >           ttwu_queue_wakelist
> >             __ttwu_queue_wakelist
> >               __smp_call_single_queue
> > preempt_enable();
> >                                         2nd balance_push
> >                                           stop_one_cpu_nowait
> >                                             cpu_stop_queue_work
> >                                               __cpu_stop_queue_work
> >                                                 list_add_tail -> 2nd add push_work, so the double list add is detected
> >                                         ...
> >                                         ...
> >                                         cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1
> >
>
> So this balance_push() is part of schedule(), and schedule() is
> supposed
> to switch to stopper task, but because of this race condition,
> stopper
> task is stuck in WAKING state and not actually visible to be picked.
>
> Therefore CPU1 can do another schedule() and end up doing another
> balance_push() even though the last one hasn't been done yet.
>
> So how about we do something like this? Does this help?
>
Thank you for your patch.
I believe this patch also effectively addresses this race condition.
I will queue it in our test pool for testing.
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 62b3416f5e43..c37b80bd53e6 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3939,6 +3939,11 @@ static inline bool ttwu_queue_cond(struct
> task_struct *p, int cpu)
> if (!scx_allow_ttwu_queue(p))
> return false;
>
> +#ifdef CONFIG_SMP
> + if (p->sched_class == &stop_sched_class)
> + return false;
> +#endif
> +
> /*
> * Do not complicate things with the async wake_list while
> the CPU is
> * in hotplug state.
> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
> index 5d2d0562115b..8855a50cc216 100644
> --- a/kernel/stop_machine.c
> +++ b/kernel/stop_machine.c
> @@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct
> cpu_stop_done *done)
> }
>
> static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
> - struct cpu_stop_work *work,
> - struct wake_q_head *wakeq)
> + struct cpu_stop_work *work)
> {
> list_add_tail(&work->list, &stopper->works);
> - wake_q_add(wakeq, stopper->thread);
> }
>
> /* queue @work to @stopper. if offline, @work is completed
> immediately */
> static bool cpu_stop_queue_work(unsigned int cpu, struct
> cpu_stop_work *work)
> {
> struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
> - DEFINE_WAKE_Q(wakeq);
> unsigned long flags;
> bool enabled;
>
> @@ -101,12 +98,12 @@ static bool cpu_stop_queue_work(unsigned int
> cpu, struct cpu_stop_work *work)
> raw_spin_lock_irqsave(&stopper->lock, flags);
> enabled = stopper->enabled;
> if (enabled)
> - __cpu_stop_queue_work(stopper, work, &wakeq);
> + __cpu_stop_queue_work(stopper, work);
> else if (work->done)
> cpu_stop_signal_done(work->done);
> raw_spin_unlock_irqrestore(&stopper->lock, flags);
>
> - wake_up_q(&wakeq);
> + wake_up_process(stopper->thread);
BTW, should we add enabled check here?
	if (enabled)
		wake_up_process(stopper->thread);
> preempt_enable();
>
> return enabled;
> @@ -264,7 +261,6 @@ static int cpu_stop_queue_two_works(int cpu1,
> struct cpu_stop_work *work1,
> {
> struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper,
> cpu1);
> struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper,
> cpu2);
> - DEFINE_WAKE_Q(wakeq);
> int err;
>
> retry:
> @@ -300,8 +296,8 @@ static int cpu_stop_queue_two_works(int cpu1,
> struct cpu_stop_work *work1,
> }
>
> err = 0;
> - __cpu_stop_queue_work(stopper1, work1, &wakeq);
> - __cpu_stop_queue_work(stopper2, work2, &wakeq);
> + __cpu_stop_queue_work(stopper1, work1);
> + __cpu_stop_queue_work(stopper2, work2);
>
> unlock:
> raw_spin_unlock(&stopper2->lock);
> @@ -316,7 +312,8 @@ static int cpu_stop_queue_two_works(int cpu1,
> struct cpu_stop_work *work1,
> goto retry;
> }
>
> - wake_up_q(&wakeq);
> + wake_up_process(stopper1->thread);
> + wake_up_process(stopper2->thread);
> preempt_enable();
>
> return err;
* Re: [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug
2025-06-06 3:46 ` Kuyo Chang
@ 2025-06-06 8:28 ` Peter Zijlstra
2025-06-13 7:47 ` Kuyo Chang
2025-06-26 2:23 ` [SPAM]Re: " Kuyo Chang
0 siblings, 2 replies; 8+ messages in thread
From: Peter Zijlstra @ 2025-06-06 8:28 UTC (permalink / raw)
To: Kuyo Chang
Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Matthias Brugger, AngeloGioacchino Del Regno, linux-kernel,
linux-arm-kernel, linux-mediatek
On Fri, Jun 06, 2025 at 11:46:57AM +0800, Kuyo Chang wrote:
> Thank you for your patch.
> I believe this patch also effectively addresses this race condition.
> I will queue it in our test pool for testing.
Thank you; I shall await the results!
> > @@ -101,12 +98,12 @@ static bool cpu_stop_queue_work(unsigned int
> > cpu, struct cpu_stop_work *work)
> > raw_spin_lock_irqsave(&stopper->lock, flags);
> > enabled = stopper->enabled;
> > if (enabled)
> > - __cpu_stop_queue_work(stopper, work, &wakeq);
> > + __cpu_stop_queue_work(stopper, work);
> > else if (work->done)
> > cpu_stop_signal_done(work->done);
> > raw_spin_unlock_irqrestore(&stopper->lock, flags);
> >
> > - wake_up_q(&wakeq);
> > + wake_up_process(stopper->thread);
>
> BTW, should we add enabled check here?
> if (enabled)
> wake_up_process(stopper->thread);
Ah yes. Spuriously waking the stopper thread is harmless, but wasteful.
> > preempt_enable();
> >
> > return enabled;
* Re: [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug
2025-06-06 8:28 ` Peter Zijlstra
@ 2025-06-13 7:47 ` Kuyo Chang
2025-06-26 12:53 ` Peter Zijlstra
2025-06-26 2:23 ` [SPAM]Re: " Kuyo Chang
1 sibling, 1 reply; 8+ messages in thread
From: Kuyo Chang @ 2025-06-13 7:47 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Matthias Brugger, AngeloGioacchino Del Regno, linux-kernel,
linux-arm-kernel, linux-mediatek
On Fri, 2025-06-06 at 10:28 +0200, Peter Zijlstra wrote:
> On Fri, Jun 06, 2025 at 11:46:57AM +0800, Kuyo Chang wrote:
>
> > Thank you for your patch.
> > I believe this patch also effectively addresses this race
> > condition.
> > I will queue it in our test pool for testing.
>
> Thank you; I shall await the results!
>
It works well during both regular and hotplug tests (one week).
I believe the patch is workable.
Please help to merge it, thanks.
>
> > > @@ -101,12 +98,12 @@ static bool cpu_stop_queue_work(unsigned int
> > > cpu, struct cpu_stop_work *work)
> > > raw_spin_lock_irqsave(&stopper->lock, flags);
> > > enabled = stopper->enabled;
> > > if (enabled)
> > > - __cpu_stop_queue_work(stopper, work, &wakeq);
> > > + __cpu_stop_queue_work(stopper, work);
> > > else if (work->done)
> > > cpu_stop_signal_done(work->done);
> > > raw_spin_unlock_irqrestore(&stopper->lock, flags);
> > >
> > > - wake_up_q(&wakeq);
> > > + wake_up_process(stopper->thread);
> >
> > BTW, should we add enabled check here?
> > if (enabled)
> > wake_up_process(stopper->thread);
>
> Ah yes. Spuriously waking the stopper thread is harmless, but
> wasteful.
>
> > > preempt_enable();
> > >
> > > return enabled;
* Re: [SPAM]Re: [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug
2025-06-06 8:28 ` Peter Zijlstra
2025-06-13 7:47 ` Kuyo Chang
@ 2025-06-26 2:23 ` Kuyo Chang
1 sibling, 0 replies; 8+ messages in thread
From: Kuyo Chang @ 2025-06-26 2:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Matthias Brugger, AngeloGioacchino Del Regno, linux-kernel,
linux-arm-kernel, linux-mediatek
On Fri, 2025-06-06 at 10:28 +0200, Peter Zijlstra wrote:
> On Fri, Jun 06, 2025 at 11:46:57AM +0800, Kuyo Chang wrote:
>
> > Thank you for your patch.
> > I believe this patch also effectively addresses this race
> > condition.
> > I will queue it in our test pool for testing.
>
> Thank you; I shall await the results!
>
>
Gentle ping in case this was forgotten.
It works well during both regular and hotplug tests (more than two weeks).
May I know if this patch can be merged into mainline?
Thanks!
> > > @@ -101,12 +98,12 @@ static bool cpu_stop_queue_work(unsigned int
> > > cpu, struct cpu_stop_work *work)
> > > raw_spin_lock_irqsave(&stopper->lock, flags);
> > > enabled = stopper->enabled;
> > > if (enabled)
> > > - __cpu_stop_queue_work(stopper, work, &wakeq);
> > > + __cpu_stop_queue_work(stopper, work);
> > > else if (work->done)
> > > cpu_stop_signal_done(work->done);
> > > raw_spin_unlock_irqrestore(&stopper->lock, flags);
> > >
> > > - wake_up_q(&wakeq);
> > > + wake_up_process(stopper->thread);
> >
> > BTW, should we add enabled check here?
> > if (enabled)
> > wake_up_process(stopper->thread);
>
> Ah yes. Spuriously waking the stopper thread is harmless, but
> wasteful.
>
> > > preempt_enable();
> > >
> > > return enabled;
* Re: [PATCH 1/1] sched/core: Fix migrate_swap() vs. hotplug
2025-06-13 7:47 ` Kuyo Chang
@ 2025-06-26 12:53 ` Peter Zijlstra
0 siblings, 0 replies; 8+ messages in thread
From: Peter Zijlstra @ 2025-06-26 12:53 UTC (permalink / raw)
To: Kuyo Chang
Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Matthias Brugger, AngeloGioacchino Del Regno, linux-kernel,
linux-arm-kernel, linux-mediatek
On Fri, Jun 13, 2025 at 03:47:47PM +0800, Kuyo Chang wrote:
> On Fri, 2025-06-06 at 10:28 +0200, Peter Zijlstra wrote:
> > On Fri, Jun 06, 2025 at 11:46:57AM +0800, Kuyo Chang wrote:
> >
> > > Thank you for your patch.
> > > I believe this patch also effectively addresses this race
> > > condition.
> > > I will queue it in our test pool for testing.
> >
> > Thank you; I shall await the results!
> >
> It works well during both regular and hotplug tests (one week).
> I believe the patch is workable.
Thanks! I'll queue the below in tip/sched/urgent.
---
Subject: sched/core: Fix migrate_swap() vs. hotplug
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 5 Jun 2025 12:00:09 +0200
On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote:
> So, the potential race scenario is:
>
> CPU0 CPU1
> // doing migrate_swap(cpu0/cpu1)
> stop_two_cpus()
> ...
> // doing _cpu_down()
> sched_cpu_deactivate()
> set_cpu_active(cpu, false);
> balance_push_set(cpu, true);
> cpu_stop_queue_two_works
> __cpu_stop_queue_work(stopper1,...);
> __cpu_stop_queue_work(stopper2,..);
> stop_cpus_in_progress -> true
> preempt_enable();
> ...
> 1st balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail -> 1st add push_work
> wake_up_q(&wakeq); -> "wakeq is empty.
> This implies that the stopper is at wakeq@migrate_swap."
> preempt_disable
> wake_up_q(&wakeq);
> wake_up_process // wakeup migrate/0
> try_to_wake_up
> ttwu_queue
> ttwu_queue_cond ->meet below case
> if (cpu == smp_processor_id())
> return false;
> ttwu_do_activate
> //migrate/0 wakeup done
> wake_up_process // wakeup migrate/1
> try_to_wake_up
> ttwu_queue
> ttwu_queue_cond
> ttwu_queue_wakelist
> __ttwu_queue_wakelist
> __smp_call_single_queue
> preempt_enable();
>
> 2nd balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail -> 2nd add push_work, so the double list add is detected
> ...
> ...
> cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1
>
So this balance_push() is part of schedule(), and schedule() is supposed
to switch to stopper task, but because of this race condition, stopper
task is stuck in WAKING state and not actually visible to be picked.
Therefore CPU1 can do another schedule() and end up doing another
balance_push() even though the last one hasn't been done yet.
This is a confluence of fail, where both wake_q and ttwu_wakelist can
cause crucial wakeups to be delayed, resulting in the malfunction of
balance_push.
Since there is only a single stopper thread to be woken, the wake_q
doesn't really add anything here, and can be removed in favour of
direct wakeups of the stopper thread.
Then add a clause to ttwu_queue_cond() to ensure the stopper threads
are never queued / delayed.
Of all 3 moving parts, the last addition was the balance_push()
machinery, so pick that as the point the bug was introduced.
Fixes: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug")
Reported-by: Kuyo Chang <kuyo.chang@mediatek.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kuyo Chang <kuyo.chang@mediatek.com>
Link: https://lkml.kernel.org/r/20250605100009.GO39944@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 5 +++++
kernel/stop_machine.c | 20 ++++++++++----------
2 files changed, 15 insertions(+), 10 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3943,6 +3943,11 @@ static inline bool ttwu_queue_cond(struc
if (!scx_allow_ttwu_queue(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work,
- struct wake_q_head *wakeq)
+ struct cpu_stop_work *work)
{
list_add_tail(&work->list, &stopper->works);
- wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
@@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned
raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work, &wakeq);
+ __cpu_stop_queue_work(stopper, work);
else if (work->done)
cpu_stop_signal_done(work->done);
raw_spin_unlock_irqrestore(&stopper->lock, flags);
- wake_up_q(&wakeq);
+ if (enabled)
+ wake_up_process(stopper->thread);
preempt_enable();
return enabled;
@@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
- DEFINE_WAKE_Q(wakeq);
int err;
retry:
@@ -300,8 +297,8 @@ static int cpu_stop_queue_two_works(int
}
err = 0;
- __cpu_stop_queue_work(stopper1, work1, &wakeq);
- __cpu_stop_queue_work(stopper2, work2, &wakeq);
+ __cpu_stop_queue_work(stopper1, work1);
+ __cpu_stop_queue_work(stopper2, work2);
unlock:
raw_spin_unlock(&stopper2->lock);
@@ -316,7 +313,10 @@ static int cpu_stop_queue_two_works(int
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ wake_up_process(stopper1->thread);
+ wake_up_process(stopper2->thread);
+ }
preempt_enable();
return err;
* [tip: sched/urgent] sched/core: Fix migrate_swap() vs. hotplug
2025-06-05 10:00 ` Peter Zijlstra
2025-06-06 3:46 ` Kuyo Chang
@ 2025-07-01 13:13 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 8+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-07-01 13:13 UTC (permalink / raw)
To: linux-tip-commits; +Cc: Kuyo Chang, Peter Zijlstra (Intel), x86, linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 009836b4fa52f92cba33618e773b1094affa8cd2
Gitweb: https://git.kernel.org/tip/009836b4fa52f92cba33618e773b1094affa8cd2
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 05 Jun 2025 12:00:09 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 01 Jul 2025 15:02:03 +02:00
sched/core: Fix migrate_swap() vs. hotplug
On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote:
> So, the potential race scenario is:
>
> CPU0 CPU1
> // doing migrate_swap(cpu0/cpu1)
> stop_two_cpus()
> ...
> // doing _cpu_down()
> sched_cpu_deactivate()
> set_cpu_active(cpu, false);
> balance_push_set(cpu, true);
> cpu_stop_queue_two_works
> __cpu_stop_queue_work(stopper1,...);
> __cpu_stop_queue_work(stopper2,..);
> stop_cpus_in_progress -> true
> preempt_enable();
> ...
> 1st balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail -> 1st add push_work
> wake_up_q(&wakeq); -> "wakeq is empty.
> This implies that the stopper is at wakeq@migrate_swap."
> preempt_disable
> wake_up_q(&wakeq);
> wake_up_process // wakeup migrate/0
> try_to_wake_up
> ttwu_queue
> ttwu_queue_cond ->meet below case
> if (cpu == smp_processor_id())
> return false;
> ttwu_do_activate
> //migrate/0 wakeup done
> wake_up_process // wakeup migrate/1
> try_to_wake_up
> ttwu_queue
> ttwu_queue_cond
> ttwu_queue_wakelist
> __ttwu_queue_wakelist
> __smp_call_single_queue
> preempt_enable();
>
> 2nd balance_push
> stop_one_cpu_nowait
> cpu_stop_queue_work
> __cpu_stop_queue_work
> list_add_tail -> 2nd add push_work, so the double list add is detected
> ...
> ...
> cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1
>
So this balance_push() is part of schedule(), and schedule() is supposed
to switch to stopper task, but because of this race condition, stopper
task is stuck in WAKING state and not actually visible to be picked.
Therefore CPU1 can do another schedule() and end up doing another
balance_push() even though the last one hasn't been done yet.
This is a confluence of fail, where both wake_q and ttwu_wakelist can
cause crucial wakeups to be delayed, resulting in the malfunction of
balance_push.
Since there is only a single stopper thread to be woken, the wake_q
doesn't really add anything here, and can be removed in favour of
direct wakeups of the stopper thread.
Then add a clause to ttwu_queue_cond() to ensure the stopper threads
are never queued / delayed.
Of all 3 moving parts, the last addition was the balance_push()
machinery, so pick that as the point the bug was introduced.
Fixes: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug")
Reported-by: Kuyo Chang <kuyo.chang@mediatek.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kuyo Chang <kuyo.chang@mediatek.com>
Link: https://lkml.kernel.org/r/20250605100009.GO39944@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 5 +++++
kernel/stop_machine.c | 20 ++++++++++----------
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cd80b66..ec68fc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3943,6 +3943,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
if (!scx_allow_ttwu_queue(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d056..3fe6b0c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work,
- struct wake_q_head *wakeq)
+ struct cpu_stop_work *work)
{
list_add_tail(&work->list, &stopper->works);
- wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
@@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work, &wakeq);
+ __cpu_stop_queue_work(stopper, work);
else if (work->done)
cpu_stop_signal_done(work->done);
raw_spin_unlock_irqrestore(&stopper->lock, flags);
- wake_up_q(&wakeq);
+ if (enabled)
+ wake_up_process(stopper->thread);
preempt_enable();
return enabled;
@@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
- DEFINE_WAKE_Q(wakeq);
int err;
retry:
@@ -300,8 +297,8 @@ retry:
}
err = 0;
- __cpu_stop_queue_work(stopper1, work1, &wakeq);
- __cpu_stop_queue_work(stopper2, work2, &wakeq);
+ __cpu_stop_queue_work(stopper1, work1);
+ __cpu_stop_queue_work(stopper2, work2);
unlock:
raw_spin_unlock(&stopper2->lock);
@@ -316,7 +313,10 @@ unlock:
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ wake_up_process(stopper1->thread);
+ wake_up_process(stopper2->thread);
+ }
preempt_enable();
return err;