Re: [PATCH v22 4/6] sched: Handle blocked-waiter migration (and return migration)

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: John Stultz <jstultz@google.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Joel Fernandes <joelagnelf@nvidia.com>,
	Qais Yousef <qyousef@layalina.io>, Ingo Molnar <mingo@redhat.com>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Valentin Schneider <vschneid@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>,
	Zimuzo Ezeozue <zezeozue@google.com>,
	Mel Gorman <mgorman@suse.de>, Will Deacon <will@kernel.org>,
	Waiman Long <longman@redhat.com>,
	Boqun Feng <boqun.feng@gmail.com>,
	"Paul E. McKenney" <paulmck@kernel.org>,
	Metin Kaya <Metin.Kaya@arm.com>,
	Xuewen Yan <xuewen.yan94@gmail.com>,
	K Prateek Nayak <kprateek.nayak@amd.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Daniel Lezcano <daniel.lezcano@linaro.org>,
	Suleiman Souhlal <suleiman@google.com>,
	kuyo chang <kuyo.chang@mediatek.com>, hupu <hupu.gm@gmail.com>,
	kernel-team@android.com
Subject: Re: [PATCH v22 4/6] sched: Handle blocked-waiter migration (and return migration)
Date: Wed, 8 Oct 2025 15:32:23 +0200	[thread overview]
Message-ID: <20251008133223.GT4067720@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20250926032931.27663-5-jstultz@google.com>

On Fri, Sep 26, 2025 at 03:29:12AM +0000, John Stultz wrote:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 7bba05c07a79d..d063d2c9bd5aa 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3157,6 +3157,14 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
>  
>  	__do_set_cpus_allowed(p, ctx);
>  
> +	/*
> +	 * It might be that the p->wake_cpu is no longer
> +	 * allowed, so set it to the dest_cpu so return
> +	 * migration doesn't send it to an invalid cpu
> +	 */

This comment isn't quite right; ->wake_cpu is a mere preference, it does
not have correctness concerns. That is, it is okay for ->wake_cpu to not
be inside cpus_allowed.

> +	if (!is_cpu_allowed(p, p->wake_cpu))
> +		p->wake_cpu = dest_cpu;
> +
>  	return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
>  
>  out:
> @@ -3717,6 +3725,72 @@ static inline void ttwu_do_wakeup(struct task_struct *p)
>  	trace_sched_wakeup(p);
>  }
>  
> +#ifdef CONFIG_SCHED_PROXY_EXEC
> +static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
> +{
> +	unsigned int wake_cpu;
> +
> +	/*
> +	 * Since we are enqueuing a blocked task on a cpu it may
> +	 * not be able to run on, preserve wake_cpu when we
> +	 * __set_task_cpu so we can return the task to where it
> +	 * was previously runnable.
> +	 */
> +	wake_cpu = p->wake_cpu;
> +	__set_task_cpu(p, cpu);
> +	p->wake_cpu = wake_cpu;

Humm, perhaps add an argument to __set_task_cpu() to not set ->wake_cpu
instead?

> +}
> +
> +static bool proxy_task_runnable_but_waking(struct task_struct *p)
> +{
> +	if (!sched_proxy_exec())
> +		return false;
> +	return (READ_ONCE(p->__state) == TASK_RUNNING &&
> +		READ_ONCE(p->blocked_on_state) == BO_WAKING);
> +}
> +
> +/*
> + * Checks to see if task p has been proxy-migrated to another rq
> + * and needs to be returned. If so, we deactivate the task here
> + * so that it can be properly woken up on the p->wake_cpu
> + * (or whichever cpu select_task_rq() picks at the bottom of
> + * try_to_wake_up()
> + */
> +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p)
> +{
> +	bool ret = false;
> +
> +	if (!sched_proxy_exec())
> +		return false;
> +
> +	raw_spin_lock(&p->blocked_lock);
> +	if (__get_task_blocked_on(p) && p->blocked_on_state == BO_WAKING) {
> +		if (!task_current(rq, p) && (p->wake_cpu != cpu_of(rq))) {
> +			if (task_current_donor(rq, p)) {
> +				put_prev_task(rq, p);
> +				rq_set_donor(rq, rq->idle);
> +			}
> +			deactivate_task(rq, p, DEQUEUE_NOCLOCK);
> +			ret = true;
> +		}
> +		__set_blocked_on_runnable(p);
> +		resched_curr(rq);
> +	}
> +	raw_spin_unlock(&p->blocked_lock);
> +	return ret;
> +}
> +#else /* !CONFIG_SCHED_PROXY_EXEC */
> +static bool proxy_task_runnable_but_waking(struct task_struct *p)
> +{
> +	return false;
> +}
> +
> +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p)
> +{
> +	return false;
> +}
> +#endif /* CONFIG_SCHED_PROXY_EXEC */
> +
>  static void
>  ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
>  		 struct rq_flags *rf)
> @@ -3802,6 +3876,8 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  		update_rq_clock(rq);
>  		if (p->se.sched_delayed)
>  			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
> +		if (proxy_needs_return(rq, p))
> +			goto out;
>  		if (!task_on_cpu(rq, p)) {
>  			/*
>  			 * When on_rq && !on_cpu the task is preempted, see if
> @@ -3812,6 +3888,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
>  		ttwu_do_wakeup(p);
>  		ret = 1;
>  	}
> +out:
>  	__task_rq_unlock(rq, &rf);
>  
>  	return ret;
> @@ -4199,6 +4276,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  		 *    it disabling IRQs (this allows not taking ->pi_lock).
>  		 */
>  		WARN_ON_ONCE(p->se.sched_delayed);
> +		/* If current is waking up, we know we can run here, so set BO_RUNNBLE */
> +		set_blocked_on_runnable(p);
>  		if (!ttwu_state_match(p, state, &success))
>  			goto out;
>  
> @@ -4215,8 +4294,15 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  	 */
>  	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
>  		smp_mb__after_spinlock();
> -		if (!ttwu_state_match(p, state, &success))
> -			break;
> +		if (!ttwu_state_match(p, state, &success)) {
> +			/*
> +			 * If we're already TASK_RUNNING, and BO_WAKING
> +			 * continue on to ttwu_runnable check to force
> +			 * proxy_needs_return evaluation
> +			 */
> +			if (!proxy_task_runnable_but_waking(p))
> +				break;
> +		}
>  
>  		trace_sched_waking(p);

Oh gawd :-( why !?!? 

So AFAICT this makes ttwu() do dequeue when it finds WAKING, and then it
falls through to do the normal wakeup. So why can't we do dequeue to
begin with -- instead of setting WAKING in the first place?

next prev parent reply	other threads:[~2025-10-08 13:32 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-26  3:29 [PATCH v22 0/6] Donor Migration for Proxy Execution (v22) John Stultz
2025-09-26  3:29 ` [PATCH v22 1/6] locking: Add task::blocked_lock to serialize blocked_on state John Stultz
2025-10-08 10:27   ` Peter Zijlstra
2025-09-26  3:29 ` [PATCH v22 2/6] sched/locking: Add blocked_on_state to provide necessary tri-state for proxy return-migration John Stultz
2025-10-08 11:26   ` Peter Zijlstra
2025-10-09  0:07     ` John Stultz
2025-10-09 11:43       ` Peter Zijlstra
2025-10-09 11:45         ` Peter Zijlstra
2025-10-14  2:43         ` John Stultz
2025-10-16 22:23           ` John Stultz
2025-09-26  3:29 ` [PATCH v22 3/6] sched: Add logic to zap balance callbacks if we pick again John Stultz
2025-10-08 11:37   ` Peter Zijlstra
2025-09-26  3:29 ` [PATCH v22 4/6] sched: Handle blocked-waiter migration (and return migration) John Stultz
2025-10-08 13:32   ` Peter Zijlstra [this message]
2025-10-16  0:15     ` John Stultz
2025-09-26  3:29 ` [PATCH v22 5/6] sched: Add blocked_donor link to task for smarter mutex handoffs John Stultz
2025-09-26  3:29 ` [PATCH v22 6/6] sched: Migrate whole chain in proxy_migrate_task() John Stultz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251008133223.GT4067720@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=Metin.Kaya@arm.com \
    --cc=boqun.feng@gmail.com \
    --cc=bsegall@google.com \
    --cc=daniel.lezcano@linaro.org \
    --cc=dietmar.eggemann@arm.com \
    --cc=hupu.gm@gmail.com \
    --cc=joelagnelf@nvidia.com \
    --cc=jstultz@google.com \
    --cc=juri.lelli@redhat.com \
    --cc=kernel-team@android.com \
    --cc=kprateek.nayak@amd.com \
    --cc=kuyo.chang@mediatek.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=paulmck@kernel.org \
    --cc=qyousef@layalina.io \
    --cc=rostedt@goodmis.org \
    --cc=suleiman@google.com \
    --cc=tglx@linutronix.de \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    --cc=will@kernel.org \
    --cc=xuewen.yan94@gmail.com \
    --cc=zezeozue@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.