public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed
From: Andrea Righi <arighi@nvidia.com>
To: Tejun Heo <tj@kernel.org>
Cc: David Vernet <void@manifault.com>,
	Changwoo Min <changwoo@igalia.com>,
	Christian Loehle <christian.loehle@arm.com>,
	Emil Tsalapatis <emil@etsalapatis.com>,
	sched-ext@lists.linux.dev, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org
Subject: Re: [PATCH 1/2] sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback
Date: Sun, 29 Mar 2026 18:26:10 +0200	[thread overview]
Message-ID: <aclSomF-cmT6_UsN@gpd4> (raw)
In-Reply-To: <20260329001856.835643-2-tj@kernel.org>

Hi Tejun,

On Sat, Mar 28, 2026 at 02:18:55PM -1000, Tejun Heo wrote:
> SCX_KICK_WAIT busy-waits in kick_cpus_irq_workfn() using
> smp_cond_load_acquire() until the target CPU's kick_sync advances. Because
> the irq_work runs in hardirq context, the waiting CPU cannot reschedule and
> its own kick_sync never advances. If multiple CPUs form a wait cycle, all
> CPUs deadlock.
> 
> Replace the busy-wait in kick_cpus_irq_workfn() with resched_curr() to
> force the CPU through do_pick_task_scx(), which queues a balance callback
> to perform the wait. The balance callback drops the rq lock and enables
> IRQs following the sched_core_balance() pattern, so the CPU can process
> IPIs while waiting. The local CPU's kick_sync is advanced on entry to
> do_pick_task_scx() and continuously during the wait, ensuring any CPU that
> starts waiting for us sees the advancement and cannot form cyclic
> dependencies.
> 
> Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
> Cc: stable@vger.kernel.org # v6.12+
> Reported-by: Christian Loehle <christian.loehle@arm.com>
> Link: https://lore.kernel.org/r/20260316100249.1651641-1-christian.loehle@arm.com
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  kernel/sched/ext.c   | 95 ++++++++++++++++++++++++++++++++------------
>  kernel/sched/sched.h |  3 ++
>  2 files changed, 73 insertions(+), 25 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 26a6ac2f8826..d5bdcdb3f700 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
>  {
>  	struct scx_sched *sch = scx_root;
>  
> -	/* see kick_cpus_irq_workfn() */
> +	/* see kick_sync_wait_bal_cb() */
>  	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
>  
>  	update_curr_scx(rq);
> @@ -2447,6 +2447,48 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
>  		switch_class(rq, next);
>  }
>  
> +static void kick_sync_wait_bal_cb(struct rq *rq)
> +{
> +	struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs);
> +	unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs;
> +	bool waited;
> +	s32 cpu;
> +
> +	/*
> +	 * Drop rq lock and enable IRQs while waiting. IRQs must be enabled
> +	 * — a target CPU may be waiting for us to process an IPI (e.g. TLB

nit: s/—/-/

> +	 * flush) while we wait for its kick_sync to advance.
> +	 *
> +	 * Also, keep advancing our own kick_sync so that new kick_sync waits
> +	 * targeting us, which can start after we drop the lock, cannot form
> +	 * cyclic dependencies.
> +	 */
> +retry:
> +	waited = false;
> +	for_each_cpu(cpu, rq->scx.cpus_to_sync) {
> +		/*
> +		 * smp_load_acquire() pairs with smp_store_release() on
> +		 * kick_sync updates on the target CPUs.
> +		 */
> +		if (cpu == cpu_of(rq) ||
> +		    smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) {
> +			cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
> +			continue;
> +		}

Should we add something like:

		if (cpu != cpu_of(rq) && !cpu_online(cpu)) {
			cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync);
			continue;
		}

> +
> +		raw_spin_rq_unlock_irq(rq);
> +		while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) {

And here:
			if (cpu != cpu_of(rq) && !cpu_online(cpu))
				break;

(see below)

> +			smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
> +			cpu_relax();
> +		}
> +		raw_spin_rq_lock_irq(rq);
> +		waited = true;
> +	}
> +
> +	if (waited)
> +		goto retry;
> +}
> +
>  static struct task_struct *first_local_task(struct rq *rq)
>  {
>  	return list_first_entry_or_null(&rq->scx.local_dsq.list,
> @@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
>  	bool keep_prev;
>  	struct task_struct *p;
>  
> -	/* see kick_cpus_irq_workfn() */
> +	/* see kick_sync_wait_bal_cb() */
>  	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
>  
>  	rq_modified_begin(rq, &ext_sched_class);
> @@ -2470,6 +2512,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
>  	rq_repin_lock(rq, rf);
>  	maybe_queue_balance_callback(rq);
>  
> +	/*
> +	 * Defer to a balance callback which can drop rq lock and enable
> +	 * IRQs. Waiting directly in the pick path would deadlock against
> +	 * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them.
> +	 */
> +	if (unlikely(rq->scx.kick_sync_pending)) {
> +		rq->scx.kick_sync_pending = false;
> +		queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb,
> +				       kick_sync_wait_bal_cb);

queue_balance_callback() is a no-op if the rq is in balance_push, but I
guess it's ok to just clear the kick_sync_pending if we add the checks
above.

> +	}
> +
>  	/*
>  	 * If any higher-priority sched class enqueued a runnable task on
>  	 * this rq during balance_one(), abort and return RETRY_TASK, so
> @@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
>  		if (!cpumask_empty(rq->scx.cpus_to_wait))
>  			dump_line(&ns, "  cpus_to_wait   : %*pb",
>  				  cpumask_pr_args(rq->scx.cpus_to_wait));
> +		if (!cpumask_empty(rq->scx.cpus_to_sync))
> +			dump_line(&ns, "  cpus_to_sync   : %*pb",
> +				  cpumask_pr_args(rq->scx.cpus_to_sync));
>  
>  		used = seq_buf_used(&ns);
>  		if (SCX_HAS_OP(sch, dump_cpu)) {
> @@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
>  
>  		if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
>  			if (cur_class == &ext_sched_class) {
> +				cpumask_set_cpu(cpu, this_scx->cpus_to_sync);
>  				ksyncs[cpu] = rq->scx.kick_sync;
>  				should_wait = true;
> -			} else {
> -				cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
>  			}
> +			cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
>  		}
>  
>  		resched_curr(rq);
> @@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
>  		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
>  	}
>  
> -	if (!should_wait)
> -		return;
> -
> -	for_each_cpu(cpu, this_scx->cpus_to_wait) {
> -		unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
> -
> -		/*
> -		 * Busy-wait until the task running at the time of kicking is no
> -		 * longer running. This can be used to implement e.g. core
> -		 * scheduling.
> -		 *
> -		 * smp_cond_load_acquire() pairs with store_releases in
> -		 * pick_task_scx() and put_prev_task_scx(). The former breaks
> -		 * the wait if SCX's scheduling path is entered even if the same
> -		 * task is picked subsequently. The latter is necessary to break
> -		 * the wait when $cpu is taken by a higher sched class.
> -		 */
> -		if (cpu != cpu_of(this_rq))
> -			smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
> -
> -		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
> +	/*
> +	 * Can't wait in hardirq — kick_sync can't advance, deadlocking if
> +	 * CPUs wait for each other. Defer to kick_sync_wait_bal_cb().
> +	 */
> +	if (should_wait) {
> +		raw_spin_rq_lock(this_rq);
> +		this_scx->kick_sync_pending = true;
> +		resched_curr(this_rq);
> +		raw_spin_rq_unlock(this_rq);
>  	}
>  }
>  
> @@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void)
>  		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
>  		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
>  		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
> +		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n));
>  		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
>  		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
>  
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 43bbf0693cca..1ef9ba480f51 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -805,9 +805,12 @@ struct scx_rq {
>  	cpumask_var_t		cpus_to_kick_if_idle;
>  	cpumask_var_t		cpus_to_preempt;
>  	cpumask_var_t		cpus_to_wait;
> +	cpumask_var_t		cpus_to_sync;
> +	bool			kick_sync_pending;
>  	unsigned long		kick_sync;
>  	local_t			reenq_local_deferred;
>  	struct balance_callback	deferred_bal_cb;
> +	struct balance_callback	kick_sync_bal_cb;
>  	struct irq_work		deferred_irq_work;
>  	struct irq_work		kick_cpus_irq_work;
>  	struct scx_dispatch_q	bypass_dsq;
> -- 
> 2.53.0
> 

Thanks,
-Andrea

      reply	other threads:[~2026-03-29 16:26 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20260329001856.835643-1-tj@kernel.org>
2026-03-29  0:18 ` [PATCH 1/2] sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback Tejun Heo
2026-03-29 16:26   ` Andrea Righi [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aclSomF-cmT6_UsN@gpd4 \
    --to=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=christian.loehle@arm.com \
    --cc=emil@etsalapatis.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=stable@vger.kernel.org \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox