[PATCH 1/2] sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback
From: Tejun Heo @ 2026-03-29 0:18 UTC
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Christian Loehle, Emil Tsalapatis, sched-ext, linux-kernel,
Tejun Heo, stable
SCX_KICK_WAIT busy-waits in kick_cpus_irq_workfn() using
smp_cond_load_acquire() until the target CPU's kick_sync advances. Because
the irq_work runs in hardirq context, the waiting CPU cannot reschedule and
its own kick_sync never advances. If multiple CPUs form a wait cycle, all
CPUs deadlock.
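To make the cycle concrete, here is a minimal userspace model (plain
pthreads and C11 atomics, not kernel code; everything below is
illustrative). Each thread stands in for a CPU stuck in
kick_cpus_irq_workfn(): it spins until the other side's kick_sync
advances, but can only advance its own counter after leaving the spin,
so once both threads arrive neither makes progress:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <unistd.h>

	static atomic_ulong kick_sync[2];

	static void *cpu_fn(void *arg)
	{
		int me = (int)(long)arg, other = !me;
		unsigned long snap = atomic_load(&kick_sync[other]);

		/* "hardirq": no rescheduling, own kick_sync frozen */
		while (atomic_load_explicit(&kick_sync[other],
					    memory_order_acquire) == snap)
			;	/* spins forever once both threads are here */

		/* the wait-breaking store, unreachable in the cyclic case */
		atomic_fetch_add(&kick_sync[me], 1);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[2];
		long i;

		for (i = 0; i < 2; i++)
			pthread_create(&t[i], NULL, cpu_fn, (void *)i);
		sleep(2);
		printf("cyclic kick_sync wait: both threads still spinning\n");
		return 0;	/* exiting abandons the stuck threads */
	}

The fix below breaks exactly this pattern: the wait runs with IRQs
enabled and keeps advancing the waiter's own kick_sync, so a cycle
cannot hold.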
Replace the busy-wait in kick_cpus_irq_workfn() with resched_curr() to
force the CPU through do_pick_task_scx(), which queues a balance callback
to perform the wait. The balance callback drops the rq lock and enables
IRQs following the sched_core_balance() pattern, so the CPU can process
IPIs while waiting. The local CPU's kick_sync is advanced on entry to
do_pick_task_scx() and continuously during the wait, ensuring any CPU that
starts waiting for us sees the advancement and cannot form cyclic
dependencies.
Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
Cc: stable@vger.kernel.org # v6.12+
Reported-by: Christian Loehle <christian.loehle@arm.com>
Link: https://lore.kernel.org/r/20260316100249.1651641-1-christian.loehle@arm.com
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 95 ++++++++++++++++++++++++++++++++------------
kernel/sched/sched.h | 3 ++
2 files changed, 73 insertions(+), 25 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 26a6ac2f8826..d5bdcdb3f700 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
{
struct scx_sched *sch = scx_root;
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
update_curr_scx(rq);
@@ -2447,6 +2447,48 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
switch_class(rq, next);
}
+static void kick_sync_wait_bal_cb(struct rq *rq)
+{
+ struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
+ unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
+ bool waited;
+ s32 cpu;
+
+ /*
+ * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
+ * — a target CPU may be waiting for us to process an IPI (e.g. TLB
+ * flush) while we wait for its kick_sync to advance.
+ *
+ * Also, keep advancing our own kick_sync so that new kick_sync waits
+ * targeting us, which can start after we drop the lock, cannot form
+ * cyclic dependencies.
+ */
+retry:
+ waited = false;
+ for_each_cpu(cpu, rq->scx.cpus_to_sync) {
+ /*
+ * smp_load_acquire() pairs with smp_store_release() on
+ * kick_sync updates on the target CPUs.
+ */
+ if (cpu == cpu_of(rq) ||
+ smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
+ cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
+ continue;
+ }
+
+ raw_spin_rq_unlock_irq(rq);
+ while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+ cpu_relax();
+ }
+ raw_spin_rq_lock_irq(rq);
+ waited = true;
+ }
+
+ if (waited)
+ goto retry;
+}
+
static struct task_struct *first_local_task(struct rq *rq)
{
return list_first_entry_or_null(&rq->scx.local_dsq.list,
@@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
bool keep_prev;
struct task_struct *p;
- /* see kick_cpus_irq_workfn() */
+ /* see kick_sync_wait_bal_cb() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
rq_modified_begin(rq, &ext_sched_class);
@@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
rq_repin_lock(rq, rf);
maybe_queue_balance_callback(rq);
+ /*
+ * Defer to a balance callback which can drop rq lock and enable
+ * IRQs. Waiting directly in the pick path would deadlock against
+ * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
+ */
+ if (unlikely(rq->scx.kick_sync_pending)) {
+ rq->scx.kick_sync_pending = false;
+ queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
+ kick_sync_wait_bal_cb);
+ }
+
/*
* If any higher-priority sched class enqueued a runnable task on
* this rq during balance_one(), abort and return RETRY_TASK, so
@@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
if (!cpumask_empty(rq->scx.cpus_to_wait))
dump_line(&ns, " cpus_to_wait : %*pb",
cpumask_pr_args(rq->scx.cpus_to_wait));
+ if (!cpumask_empty(rq->scx.cpus_to_sync))
+ dump_line(&ns, " cpus_to_sync : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_sync));
used = seq_buf_used(&ns);
if (SCX_HAS_OP(sch, dump_cpu)) {
@@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
if (cur_class == &ext_sched_class) {
+ cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
ksyncs[cpu] = rq->scx.kick_sync;
should_wait = true;
- } else {
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
}
resched_curr(rq);
@@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
}
- if (!should_wait)
- return;
-
- for_each_cpu(cpu, this_scx->cpus_to_wait) {
- unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
-
- /*
- * Busy-wait until the task running at the time of kicking is no
- * longer running. This can be used to implement e.g. core
- * scheduling.
- *
- * smp_cond_load_acquire() pairs with store_releases in
- * pick_task_scx() and put_prev_task_scx(). The former breaks
- * the wait if SCX's scheduling path is entered even if the same
- * task is picked subsequently. The latter is necessary to break
- * the wait when $cpu is taken by a higher sched class.
- */
- if (cpu != cpu_of(this_rq))
- smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
-
- cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ /*
+ * Can't wait in hardirq — kick_sync can't advance, deadlocking if
+ * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
+ */
+ if (should_wait) {
+ raw_spin_rq_lock(this_rq);
+ this_scx->kick_sync_pending = true;
+ resched_curr(this_rq);
+ raw_spin_rq_unlock(this_rq);
}
}
@@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..1ef9ba480f51 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,9 +805,12 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
+ cpumask_var_t cpus_to_sync;
+ bool kick_sync_pending;
unsigned long kick_sync;
local_t reenq_local_deferred;
struct balance_callback deferred_bal_cb;
+ struct balance_callback kick_sync_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
struct scx_dispatch_q bypass_dsq;
--
2.53.0
Re: [PATCH 1/2] sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback
From: Andrea Righi @ 2026-03-29 16:26 UTC
To: Tejun Heo
Cc: David Vernet, Changwoo Min, Christian Loehle, Emil Tsalapatis,
sched-ext, linux-kernel, stable
Hi Tejun,
On Sat, Mar 28, 2026 at 02:18:55PM -1000, Tejun Heo wrote:
> SCX_KICK_WAIT busy-waits in kick_cpus_irq_workfn() using
> smp_cond_load_acquire() until the target CPU's kick_sync advances. Because
> the irq_work runs in hardirq context, the waiting CPU cannot reschedule and
> its own kick_sync never advances. If multiple CPUs form a wait cycle, all
> CPUs deadlock.
>
> Replace the busy-wait in kick_cpus_irq_workfn() with resched_curr() to
> force the CPU through do_pick_task_scx(), which queues a balance callback
> to perform the wait. The balance callback drops the rq lock and enables
> IRQs following the sched_core_balance() pattern, so the CPU can process
> IPIs while waiting. The local CPU's kick_sync is advanced on entry to
> do_pick_task_scx() and continuously during the wait, ensuring any CPU that
> starts waiting for us sees the advancement and cannot form cyclic
> dependencies.
>
> Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
> Cc: stable@vger.kernel.org # v6.12+
> Reported-by: Christian Loehle <christian.loehle@arm.com>
> Link: https://lore.kernel.org/r/20260316100249.1651641-1-christian.loehle@arm.com
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> kernel/sched/ext.c | 95 ++++++++++++++++++++++++++++++++------------
> kernel/sched/sched.h | 3 ++
> 2 files changed, 73 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 26a6ac2f8826..d5bdcdb3f700 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
> {
> struct scx_sched *sch = scx_root;
>
> - /* see kick_cpus_irq_workfn() */
> + /* see kick_sync_wait_bal_cb() */
> smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
>
> update_curr_scx(rq);
> @@ -2447,6 +2447,48 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
> switch_class(rq, next);
> }
>
> +static void kick_sync_wait_bal_cb(struct rq *rq)
> +{
> + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
> + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
> + bool waited;
> + s32 cpu;
> +
> + /*
> + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
> + * — a target CPU may be waiting for us to process an IPI (e.g. TLB
nit: s/—/-/
> + * flush) while we wait for its kick_sync to advance.
> + *
> + * Also, keep advancing our own kick_sync so that new kick_sync waits
> + * targeting us, which can start after we drop the lock, cannot form
> + * cyclic dependencies.
> + */
> +retry:
> + waited = false;
> + for_each_cpu(cpu, rq->scx.cpus_to_sync) {
> + /*
> + * smp_load_acquire() pairs with smp_store_release() on
> + * kick_sync updates on the target CPUs.
> + */
> + if (cpu == cpu_of(rq) ||
> + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
> + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
> + continue;
> + }
Should we add something like:
if (cpu != cpu_of(rq) && !cpu_online(cpu)) {
cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
continue;
}
> +
> + raw_spin_rq_unlock_irq(rq);
> + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {
And here:
if (cpu != cpu_of(rq) && !cpu_online(cpu))
break;
(see below)
> + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
> + cpu_relax();
> + }
> + raw_spin_rq_lock_irq(rq);
> + waited = true;
> + }
> +
> + if (waited)
> + goto retry;
> +}
> +
> static struct task_struct *first_local_task(struct rq *rq)
> {
> return list_first_entry_or_null(&rq->scx.local_dsq.list,
> @@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
> bool keep_prev;
> struct task_struct *p;
>
> - /* see kick_cpus_irq_workfn() */
> + /* see kick_sync_wait_bal_cb() */
> smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
>
> rq_modified_begin(rq, &ext_sched_class);
> @@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
> rq_repin_lock(rq, rf);
> maybe_queue_balance_callback(rq);
>
> + /*
> + * Defer to a balance callback which can drop rq lock and enable
> + * IRQs. Waiting directly in the pick path would deadlock against
> + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
> + */
> + if (unlikely(rq->scx.kick_sync_pending)) {
> + rq->scx.kick_sync_pending = false;
> + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
> + kick_sync_wait_bal_cb);
queue_balance_callback() is a no-op if the rq is in balance_push, but I
guess it's ok to just clear kick_sync_pending in that case if we add the
checks above.
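For example (untested, and placing it in rq_offline_scx() is just a
guess on my part), something like:

	/* dying CPU: drop any deferred kick_sync wait */
	rq->scx.kick_sync_pending = false;
	cpumask_clear(rq->scx.cpus_to_sync);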
> + }
> +
> /*
> * If any higher-priority sched class enqueued a runnable task on
> * this rq during balance_one(), abort and return RETRY_TASK, so
> @@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
> if (!cpumask_empty(rq->scx.cpus_to_wait))
> dump_line(&ns, " cpus_to_wait : %*pb",
> cpumask_pr_args(rq->scx.cpus_to_wait));
> + if (!cpumask_empty(rq->scx.cpus_to_sync))
> + dump_line(&ns, " cpus_to_sync : %*pb",
> + cpumask_pr_args(rq->scx.cpus_to_sync));
>
> used = seq_buf_used(&ns);
> if (SCX_HAS_OP(sch, dump_cpu)) {
> @@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
>
> if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
> if (cur_class == &ext_sched_class) {
> + cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
> ksyncs[cpu] = rq->scx.kick_sync;
> should_wait = true;
> - } else {
> - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
> }
> + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
> }
>
> resched_curr(rq);
> @@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
> cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
> }
>
> - if (!should_wait)
> - return;
> -
> - for_each_cpu(cpu, this_scx->cpus_to_wait) {
> - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
> -
> - /*
> - * Busy-wait until the task running at the time of kicking is no
> - * longer running. This can be used to implement e.g. core
> - * scheduling.
> - *
> - * smp_cond_load_acquire() pairs with store_releases in
> - * pick_task_scx() and put_prev_task_scx(). The former breaks
> - * the wait if SCX's scheduling path is entered even if the same
> - * task is picked subsequently. The latter is necessary to break
> - * the wait when $cpu is taken by a higher sched class.
> - */
> - if (cpu != cpu_of(this_rq))
> - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
> -
> - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
> + /*
> + * Can't wait in hardirq — kick_sync can't advance, deadlocking if
> + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
> + */
> + if (should_wait) {
> + raw_spin_rq_lock(this_rq);
> + this_scx->kick_sync_pending = true;
> + resched_curr(this_rq);
> + raw_spin_rq_unlock(this_rq);
> }
> }
>
> @@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
> BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
> BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
> BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
> + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
> rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
> rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 43bbf0693cca..1ef9ba480f51 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -805,9 +805,12 @@ struct scx_rq {
> cpumask_var_t cpus_to_kick_if_idle;
> cpumask_var_t cpus_to_preempt;
> cpumask_var_t cpus_to_wait;
> + cpumask_var_t cpus_to_sync;
> + bool kick_sync_pending;
> unsigned long kick_sync;
> local_t reenq_local_deferred;
> struct balance_callback deferred_bal_cb;
> + struct balance_callback kick_sync_bal_cb;
> struct irq_work deferred_irq_work;
> struct irq_work kick_cpus_irq_work;
> struct scx_dispatch_q bypass_dsq;
> --
> 2.53.0
>
Thanks,
-Andrea