Date: Thu, 19 Mar 2026 13:49:56 +0100
From: Peter Zijlstra
To: John Stultz
Cc: LKML, Joel Fernandes, Qais Yousef, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Valentin Schneider, Steven Rostedt,
	Ben Segall, Zimuzo Ezeozue, Mel Gorman, Will Deacon, Waiman Long,
	Boqun Feng, "Paul E.
McKenney" , Metin Kaya , Xuewen Yan , K Prateek Nayak , Thomas Gleixner , Daniel Lezcano , Suleiman Souhlal , kuyo chang , hupu , kernel-team@android.com Subject: Re: [PATCH v25 9/9] sched: Handle blocked-waiter migration (and return migration) Message-ID: <20260319124956.GV3738010@noisy.programming.kicks-ass.net> References: <20260313023022.2902479-1-jstultz@google.com> <20260313023022.2902479-10-jstultz@google.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20260313023022.2902479-10-jstultz@google.com> On Fri, Mar 13, 2026 at 02:30:10AM +0000, John Stultz wrote: > +/* > + * If the blocked-on relationship crosses CPUs, migrate @p to the > + * owner's CPU. > + * > + * This is because we must respect the CPU affinity of execution > + * contexts (owner) but we can ignore affinity for scheduling > + * contexts (@p). So we have to move scheduling contexts towards > + * potential execution contexts. > + * > + * Note: The owner can disappear, but simply migrate to @target_cpu > + * and leave that CPU to sort things out. > + */ > +static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, > + struct task_struct *p, int target_cpu) > { > + struct rq *target_rq = cpu_rq(target_cpu); > + > + lockdep_assert_rq_held(rq); > + > + /* > + * Since we're going to drop @rq, we have to put(@rq->donor) first, > + * otherwise we have a reference that no longer belongs to us. > + * > + * Additionally, as we put_prev_task(prev) earlier, its possible that > + * prev will migrate away as soon as we drop the rq lock, however we > + * still have it marked as rq->curr, as we've not yet switched tasks. > + * > + * So call proxy_resched_idle() to let go of the references before > + * we release the lock. > + */ > + proxy_resched_idle(rq); This comment confuses the heck out of me. It seems to imply we need to schedule before dropping rq->lock. > + > + WARN_ON(p == rq->curr); > + > + deactivate_task(rq, p, DEQUEUE_NOCLOCK); > + proxy_set_task_cpu(p, target_cpu); > + > + /* > + * We have to zap callbacks before unlocking the rq > + * as another CPU may jump in and call sched_balance_rq > + * which can trip the warning in rq_pin_lock() if we > + * leave callbacks set. > + */ It might be good to explain where these callbacks come from. > + zap_balance_callbacks(rq); > + rq_unpin_lock(rq, rf); > + raw_spin_rq_unlock(rq); > + > + attach_one_task(target_rq, p); > + > + raw_spin_rq_lock(rq); > + rq_repin_lock(rq, rf); > + update_rq_clock(rq); > +} > + > +static void proxy_force_return(struct rq *rq, struct rq_flags *rf, > + struct task_struct *p) > +{ > + struct rq *this_rq, *target_rq; > + struct rq_flags this_rf; > + int cpu, wake_flag = WF_TTWU; > + > + lockdep_assert_rq_held(rq); > + WARN_ON(p == rq->curr); > + > + /* > + * We have to zap callbacks before unlocking the rq > + * as another CPU may jump in and call sched_balance_rq > + * which can trip the warning in rq_pin_lock() if we > + * leave callbacks set. > + */ idem > + zap_balance_callbacks(rq); > + rq_unpin_lock(rq, rf); > + raw_spin_rq_unlock(rq); This is in fact the very same sequence as above. > + > + /* > + * We drop the rq lock, and re-grab task_rq_lock to get > + * the pi_lock (needed for select_task_rq) as well. 
> +	 */
> +	this_rq = task_rq_lock(p, &this_rf);
> +
> +	/*
> +	 * Since we let go of the rq lock, the task may have been
> +	 * woken or migrated to another rq before we got the
> +	 * task_rq_lock. So re-check we're on the same RQ. If
> +	 * not, the task has already been migrated and that CPU
> +	 * will handle any futher migrations.
> +	 */
> +	if (this_rq != rq)
> +		goto err_out;
> +
> +	/* Similarly, if we've been dequeued, someone else will wake us */
> +	if (!task_on_rq_queued(p))
> +		goto err_out;
> +
> +	/*
> +	 * Since we should only be calling here from __schedule()
> +	 * -> find_proxy_task(), no one else should have
> +	 * assigned current out from under us. But check and warn
> +	 * if we see this, then bail.
> +	 */
> +	if (task_current(this_rq, p) || task_on_cpu(this_rq, p)) {
> +		WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
> +			  __func__, cpu_of(this_rq),
> +			  p->comm, p->pid, p->on_cpu);
> +		goto err_out;
> 	}
> -	return NULL;
> +
> +	update_rq_clock(this_rq);
> +	proxy_resched_idle(this_rq);
> +	deactivate_task(this_rq, p, DEQUEUE_NOCLOCK);
> +	cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
> +	set_task_cpu(p, cpu);
> +	target_rq = cpu_rq(cpu);
> +	clear_task_blocked_on(p, NULL);
> +	task_rq_unlock(this_rq, p, &this_rf);
> +
> +	attach_one_task(target_rq, p);
> +
> +	/* Finally, re-grab the origianl rq lock and return to pick-again */
> +	raw_spin_rq_lock(rq);
> +	rq_repin_lock(rq, rf);
> +	update_rq_clock(rq);
> +	return;
> +
> +err_out:
> +	task_rq_unlock(this_rq, p, &this_rf);
> +	raw_spin_rq_lock(rq);
> +	rq_repin_lock(rq, rf);
> +	update_rq_clock(rq);
> }

Hurm... how about something like so?

---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6662,6 +6662,28 @@ static bool proxy_deactivate(struct rq *
 	return try_to_block_task(rq, donor, &state, true);
 }
 
+static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
+	__releases(__rq_lockp(rq))
+{
+	/*
+	 * We have to zap callbacks before unlocking the rq
+	 * as another CPU may jump in and call sched_balance_rq
+	 * which can trip the warning in rq_pin_lock() if we
+	 * leave callbacks set.
+	 */
+	zap_balance_callbacks(rq);
+	rq_unpin_lock(rq, rf);
+	raw_spin_rq_unlock(rq);
+}
+
+static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
+	__acquires(__rq_lockp(rq))
+{
+	raw_spin_rq_lock(rq);
+	rq_repin_lock(rq, rf);
+	update_rq_clock(rq);
+}
+
 /*
  * If the blocked-on relationship crosses CPUs, migrate @p to the
  * owner's CPU.
@@ -6676,6 +6698,7 @@ static bool proxy_deactivate(struct rq *
  */
 static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
 			       struct task_struct *p, int target_cpu)
+	__must_hold(__rq_lockp(rq))
 {
 	struct rq *target_rq = cpu_rq(target_cpu);
 
@@ -6699,98 +6722,72 @@ static void proxy_migrate_task(struct rq
 	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
 	proxy_set_task_cpu(p, target_cpu);
 
-	/*
-	 * We have to zap callbacks before unlocking the rq
-	 * as another CPU may jump in and call sched_balance_rq
-	 * which can trip the warning in rq_pin_lock() if we
-	 * leave callbacks set.
-	 */
-	zap_balance_callbacks(rq);
-	rq_unpin_lock(rq, rf);
-	raw_spin_rq_unlock(rq);
+	proxy_release_rq_lock(rq, rf);
 
 	attach_one_task(target_rq, p);
 
-	raw_spin_rq_lock(rq);
-	rq_repin_lock(rq, rf);
-	update_rq_clock(rq);
+	proxy_reacquire_rq_lock(rq, rf);
 }
 
 static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
 			       struct task_struct *p)
+	__must_hold(__rq_lockp(rq))
 {
-	struct rq *this_rq, *target_rq;
-	struct rq_flags this_rf;
+	struct rq *task_rq, *target_rq = NULL;
 	int cpu, wake_flag = WF_TTWU;
 
 	lockdep_assert_rq_held(rq);
 	WARN_ON(p == rq->curr);
 
-	/*
-	 * We have to zap callbacks before unlocking the rq
-	 * as another CPU may jump in and call sched_balance_rq
-	 * which can trip the warning in rq_pin_lock() if we
-	 * leave callbacks set.
-	 */
-	zap_balance_callbacks(rq);
-	rq_unpin_lock(rq, rf);
-	raw_spin_rq_unlock(rq);
+	proxy_release_rq_lock(rq, rf);
 
 	/*
 	 * We drop the rq lock, and re-grab task_rq_lock to get
 	 * the pi_lock (needed for select_task_rq) as well.
 	 */
-	this_rq = task_rq_lock(p, &this_rf);
+	scoped_guard (task_rq_lock, p) {
+		task_rq = scope.rq;
 
-	/*
-	 * Since we let go of the rq lock, the task may have been
-	 * woken or migrated to another rq before we got the
-	 * task_rq_lock. So re-check we're on the same RQ. If
-	 * not, the task has already been migrated and that CPU
-	 * will handle any futher migrations.
-	 */
-	if (this_rq != rq)
-		goto err_out;
-
-	/* Similarly, if we've been dequeued, someone else will wake us */
-	if (!task_on_rq_queued(p))
-		goto err_out;
-
-	/*
-	 * Since we should only be calling here from __schedule()
-	 * -> find_proxy_task(), no one else should have
-	 * assigned current out from under us. But check and warn
-	 * if we see this, then bail.
-	 */
-	if (task_current(this_rq, p) || task_on_cpu(this_rq, p)) {
-		WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
-			  __func__, cpu_of(this_rq),
-			  p->comm, p->pid, p->on_cpu);
-		goto err_out;
+		/*
+		 * Since we let go of the rq lock, the task may have been
+		 * woken or migrated to another rq before we got the
+		 * task_rq_lock. So re-check we're on the same RQ. If
+		 * not, the task has already been migrated and that CPU
+		 * will handle any futher migrations.
+		 */
+		if (task_rq != rq)
+			break;
+
+		/* Similarly, if we've been dequeued, someone else will wake us */
+		if (!task_on_rq_queued(p))
+			break;
+
+		/*
+		 * Since we should only be calling here from __schedule()
+		 * -> find_proxy_task(), no one else should have
+		 * assigned current out from under us. But check and warn
+		 * if we see this, then bail.
+		 */
+		if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
+			WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
+				  __func__, cpu_of(task_rq),
+				  p->comm, p->pid, p->on_cpu);
+			break;
+		}
+
+		update_rq_clock(task_rq);
+		proxy_resched_idle(task_rq);
+		deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
+		cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
+		set_task_cpu(p, cpu);
+		target_rq = cpu_rq(cpu);
+		clear_task_blocked_on(p, NULL);
 	}
 
-	update_rq_clock(this_rq);
-	proxy_resched_idle(this_rq);
-	deactivate_task(this_rq, p, DEQUEUE_NOCLOCK);
-	cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
-	set_task_cpu(p, cpu);
-	target_rq = cpu_rq(cpu);
-	clear_task_blocked_on(p, NULL);
-	task_rq_unlock(this_rq, p, &this_rf);
+	if (target_rq)
+		attach_one_task(target_rq, p);
 
-	attach_one_task(target_rq, p);
-
-	/* Finally, re-grab the origianl rq lock and return to pick-again */
-	raw_spin_rq_lock(rq);
-	rq_repin_lock(rq, rf);
-	update_rq_clock(rq);
-	return;
-
-err_out:
-	task_rq_unlock(this_rq, p, &this_rf);
-	raw_spin_rq_lock(rq);
-	rq_repin_lock(rq, rf);
-	update_rq_clock(rq);
+	proxy_reacquire_rq_lock(rq, rf);
 }
 
 /*
@@ -6811,6 +6808,7 @@ static void proxy_force_return(struct rq
  */
 static struct task_struct *
 find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+	__must_hold(__rq_lockp(rq))
 {
 	enum { FOUND, DEACTIVATE_DONOR, MIGRATE, NEEDS_RETURN } action = FOUND;
 	struct task_struct *owner = NULL;
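
For reference, the scoped_guard() used in the diff is the lock-guard helper
from include/linux/cleanup.h, paired here with the task_rq_lock guard class
in kernel/sched/sched.h that exposes scope.rq. The guard expands to a
one-iteration for-loop whose unlock hangs off a cleanup attribute, which is
why a plain 'break' can stand in for the goto err_out unlock path. A minimal
userspace sketch of that shape (an analogue only, not the kernel
implementation; the names scoped_lock/lock_scope are invented for
illustration, and a pthread mutex stands in for the rq/pi locks):

/* Build with: gcc -pthread demo.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

struct lock_scope { pthread_mutex_t *lock; };

static struct lock_scope lock_scope_enter(pthread_mutex_t *lock)
{
	pthread_mutex_lock(lock);
	return (struct lock_scope){ .lock = lock };
}

/* Cleanup handler: runs whenever a struct lock_scope goes out of scope. */
static void lock_scope_exit(struct lock_scope *scope)
{
	pthread_mutex_unlock(scope->lock);
	printf("unlocked on scope exit\n");
}

/*
 * One-shot for-loop, the same shape as the kernel's scoped_guard(): the
 * lock is taken in the init clause and released by the cleanup attribute
 * when 'scope' dies, i.e. when the loop is left, normally or via 'break'.
 */
#define scoped_lock(lockp)						\
	for (struct lock_scope scope					\
		__attribute__((cleanup(lock_scope_exit))) =		\
			lock_scope_enter(lockp),			\
	     *once = &scope; once; once = NULL)

int main(void)
{
	bool queued = false;	/* pretend !task_on_rq_queued(p) */

	scoped_lock(&demo_lock) {
		if (!queued) {
			printf("bailing out early\n");
			break;	/* no goto err_out needed; unlock still runs */
		}
		printf("doing the migration work\n");
	}
	return 0;
}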