public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Shrikanth Hegde <sshegde@linux.ibm.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org, juri.lelli@redhat.com,
	dietmar.eggemann@arm.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	tj@kernel.org, void@manifault.com, arighi@nvidia.com,
	changwoo@igalia.com, sched-ext@lists.linux.dev, mingo@kernel.org,
	vincent.guittot@linaro.org
Subject: Re: [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()
Date: Sat, 29 Nov 2025 23:38:49 +0530	[thread overview]
Message-ID: <3d9d0e43-b73c-4bed-b59c-dc1387d183e4@linux.ibm.com> (raw)
In-Reply-To: <20251127154725.901391274@infradead.org>



On 11/27/25 9:09 PM, Peter Zijlstra wrote:
> Change sched_class::wakeup_preempt() to also get called for
> cross-class wakeups, specifically those where the woken task is of a
> higher class than the previous highest class.
> 
> In order to do this, track the current highest class of the runqueue
> in rq::next_class and have wakeup_preempt() track this upwards for
> each new wakeup. Additionally have set_next_task() re-set the value to
> the current class.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   kernel/sched/core.c      |   32 +++++++++++++++++++++++---------
>   kernel/sched/deadline.c  |   14 +++++++++-----
>   kernel/sched/ext.c       |    9 ++++-----
>   kernel/sched/fair.c      |   17 ++++++++++-------
>   kernel/sched/idle.c      |    3 ---
>   kernel/sched/rt.c        |    9 ++++++---
>   kernel/sched/sched.h     |   26 ++------------------------
>   kernel/sched/stop_task.c |    3 ---
>   8 files changed, 54 insertions(+), 59 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct
>   	 */
>   	uclamp_rq_inc(rq, p, flags);
>   
> -	rq->queue_mask |= p->sched_class->queue_mask;
>   	p->sched_class->enqueue_task(rq, p, flags);
>   
>   	psi_enqueue(p, flags);
> @@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq,
>   	 * and mark the task ->sched_delayed.
>   	 */
>   	uclamp_rq_dec(rq, p);
> -	rq->queue_mask |= p->sched_class->queue_mask;
>   	return p->sched_class->dequeue_task(rq, p, flags);
>   }
>   
> @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struc
>   {
>   	struct task_struct *donor = rq->donor;
>   
> -	if (p->sched_class == donor->sched_class)
> -		donor->sched_class->wakeup_preempt(rq, p, flags);
> -	else if (sched_class_above(p->sched_class, donor->sched_class))
> +	if (p->sched_class == rq->next_class) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);
> +
> +	} else if (sched_class_above(p->sched_class, rq->next_class)) {
> +		rq->next_class->wakeup_preempt(rq, p, flags);

What's the logic of calling wakeup_preempt() here?

Say the rq was running CFS and now an RT task is waking up. But the first thing wakeup_preempt_fair() does is return if the task is not
fair_sched_class, so this is effectively just resched_curr(), right?

>   		resched_curr(rq);
> +		rq->next_class = p->sched_class;

Since a resched will happen and __schedule() can set next_class, is it really necessary to set it
even earlier here?

> +	}
>   
>   	/*
>   	 * A queue event has occurred, and we're going to schedule.  In
> @@ -6797,6 +6799,7 @@ static void __sched notrace __schedule(i
>   pick_again:
>   	next = pick_next_task(rq, rq->donor, &rf);
>   	rq_set_donor(rq, next);
> +	rq->next_class = next->sched_class;
>   	if (unlikely(task_is_blocked(next))) {
>   		next = find_proxy_task(rq, next, &rf);
>   		if (!next)
> @@ -8646,6 +8649,8 @@ void __init sched_init(void)
>   		rq->rt.rt_runtime = global_rt_runtime();
>   		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
>   #endif
> +		rq->next_class = &idle_sched_class;
> +
>   		rq->sd = NULL;
>   		rq->rd = NULL;
>   		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
> @@ -10771,10 +10776,8 @@ struct sched_change_ctx *sched_change_be
>   		flags |= DEQUEUE_NOCLOCK;
>   	}
>   
> -	if (flags & DEQUEUE_CLASS) {
> -		if (p->sched_class->switching_from)
> -			p->sched_class->switching_from(rq, p);
> -	}
> +	if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
> +		p->sched_class->switching_from(rq, p);
>   
>   	*ctx = (struct sched_change_ctx){
>   		.p = p,
> @@ -10827,6 +10830,17 @@ void sched_change_end(struct sched_chang
>   			p->sched_class->switched_to(rq, p);
>   
>   		/*
> +		 * If this was a class promotion; let the old class know it
> +		 * got preempted. Note that none of the switch*_from() methods
> +		 * know the new class and none of the switch*_to() methods
> +		 * know the old class.
> +		 */
> +		if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
> +			rq->next_class->wakeup_preempt(rq, p, 0);
> +			rq->next_class = p->sched_class;
> +		}
> +
> +		/*
>   		 * If this was a degradation in class someone should have set
>   		 * need_resched by now.
>   		 */
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, str
>    * Only called when both the current and waking task are -deadline
>    * tasks.
>    */
> -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
> -				  int flags)
> +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
>   {
> +	/*
> +	 * Can only get preempted by stop-class, and those should be
> +	 * few and short lived, doesn't really make sense to push
> +	 * anything away for that.
> +	 */
> +	if (p->sched_class != &dl_sched_class)
> +		return;
> +
>   	if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
>   		resched_curr(rq);
>   		return;
> @@ -3304,9 +3311,6 @@ static int task_is_throttled_dl(struct t
>   #endif
>   
>   DEFINE_SCHED_CLASS(dl) = {
> -
> -	.queue_mask		= 8,
> -
>   	.enqueue_task		= enqueue_task_dl,
>   	.dequeue_task		= dequeue_task_dl,
>   	.yield_task		= yield_task_dl,
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -2338,12 +2338,12 @@ static struct task_struct *pick_task_scx
>   	bool keep_prev, kick_idle = false;
>   	struct task_struct *p;
>   
> -	rq_modified_clear(rq);
> +	rq->next_class = &ext_sched_class;
>   	rq_unpin_lock(rq, rf);
>   	balance_one(rq, prev);
>   	rq_repin_lock(rq, rf);
>   	maybe_queue_balance_callback(rq);
> -	if (rq_modified_above(rq, &ext_sched_class))
> +	if (sched_class_above(rq->next_class, &ext_sched_class))
>   		return RETRY_TASK;
>   
>   	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
> @@ -2967,7 +2967,8 @@ static void switched_from_scx(struct rq
>   	scx_disable_task(p);
>   }
>   
> -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
> +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
> +
>   static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
>   
>   int scx_check_setscheduler(struct task_struct *p, int policy)
> @@ -3216,8 +3217,6 @@ static void scx_cgroup_unlock(void) {}
>    *   their current sched_class. Call them directly from sched core instead.
>    */
>   DEFINE_SCHED_CLASS(ext) = {
> -	.queue_mask		= 1,
> -
>   	.enqueue_task		= enqueue_task_scx,
>   	.dequeue_task		= dequeue_task_scx,
>   	.yield_task		= yield_task_scx,
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8697,7 +8697,7 @@ preempt_sync(struct rq *rq, int wake_fla
>   /*
>    * Preempt the current task with a newly woken task if needed:
>    */
> -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
> +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
>   {
>   	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
>   	struct task_struct *donor = rq->donor;
> @@ -8705,6 +8705,12 @@ static void check_preempt_wakeup_fair(st
>   	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
>   	int cse_is_idle, pse_is_idle;
>   
> +	/*
> +	 * XXX Getting preempted by higher class, try and find idle CPU?
> +	 */
> +	if (p->sched_class != &fair_sched_class)
> +		return;
> +
>   	if (unlikely(se == pse))
>   		return;
>   
> @@ -12872,7 +12878,7 @@ static int sched_balance_newidle(struct
>   	t0 = sched_clock_cpu(this_cpu);
>   	__sched_balance_update_blocked_averages(this_rq);
>   
> -	rq_modified_clear(this_rq);
> +	this_rq->next_class = &fair_sched_class;
>   	raw_spin_rq_unlock(this_rq);
>   
>   	for_each_domain(this_cpu, sd) {
> @@ -12939,7 +12945,7 @@ static int sched_balance_newidle(struct
>   		pulled_task = 1;
>   
>   	/* If a higher prio class was modified, restart the pick */
> -	if (rq_modified_above(this_rq, &fair_sched_class))
> +	if (sched_class_above(this_rq->next_class, &fair_sched_class))
>   		pulled_task = -1;
>   
>   out:
> @@ -13837,15 +13843,12 @@ static unsigned int get_rr_interval_fair
>    * All the scheduling class methods:
>    */
>   DEFINE_SCHED_CLASS(fair) = {
> -
> -	.queue_mask		= 2,
> -
>   	.enqueue_task		= enqueue_task_fair,
>   	.dequeue_task		= dequeue_task_fair,
>   	.yield_task		= yield_task_fair,
>   	.yield_to_task		= yield_to_task_fair,
>   
> -	.wakeup_preempt		= check_preempt_wakeup_fair,
> +	.wakeup_preempt		= wakeup_preempt_fair,
>   
>   	.pick_task		= pick_task_fair,
>   	.pick_next_task		= pick_next_task_fair,
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -534,9 +534,6 @@ static void update_curr_idle(struct rq *
>    * Simple, special scheduling class for the per-CPU idle tasks:
>    */
>   DEFINE_SCHED_CLASS(idle) = {
> -
> -	.queue_mask		= 0,
> -
>   	/* no enqueue/yield_task for idle tasks */
>   
>   	/* dequeue is not valid, we print a debug message there: */
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq
>   {
>   	struct task_struct *donor = rq->donor;
>   
> +	/*
> +	 * XXX If we're preempted by DL, queue a push?
> +	 */
> +	if (p->sched_class != &rt_sched_class)
> +		return;
> +
>   	if (p->prio < donor->prio) {
>   		resched_curr(rq);
>   		return;
> @@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct t
>   #endif /* CONFIG_SCHED_CORE */
>   
>   DEFINE_SCHED_CLASS(rt) = {
> -
> -	.queue_mask		= 4,
> -
>   	.enqueue_task		= enqueue_task_rt,
>   	.dequeue_task		= dequeue_task_rt,
>   	.yield_task		= yield_task_rt,
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1119,7 +1119,6 @@ struct rq {
>   	raw_spinlock_t		__lock;
>   
>   	/* Per class runqueue modification mask; bits in class order. */
> -	unsigned int		queue_mask;
>   	unsigned int		nr_running;
>   #ifdef CONFIG_NUMA_BALANCING
>   	unsigned int		nr_numa_running;
> @@ -1179,6 +1178,7 @@ struct rq {
>   	struct sched_dl_entity	*dl_server;
>   	struct task_struct	*idle;
>   	struct task_struct	*stop;
> +	const struct sched_class *next_class;
>   	unsigned long		next_balance;
>   	struct mm_struct	*prev_mm;
>   
> @@ -2426,15 +2426,6 @@ struct sched_class {
>   #ifdef CONFIG_UCLAMP_TASK
>   	int uclamp_enabled;
>   #endif
> -	/*
> -	 * idle:  0
> -	 * ext:   1
> -	 * fair:  2
> -	 * rt:    4
> -	 * dl:    8
> -	 * stop: 16
> -	 */
> -	unsigned int queue_mask;
>   
>   	/*
>   	 * move_queued_task/activate_task/enqueue_task: rq->lock
> @@ -2593,20 +2584,6 @@ struct sched_class {
>   #endif
>   };
>   
> -/*
> - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
> - */
> -static inline void rq_modified_clear(struct rq *rq)
> -{
> -	rq->queue_mask = 0;
> -}
> -
> -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
> -{
> -	unsigned int mask = class->queue_mask;
> -	return rq->queue_mask & ~((mask << 1) - 1);
> -}
> -
>   static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
>   {
>   	WARN_ON_ONCE(rq->donor != prev);
> @@ -3899,6 +3876,7 @@ void move_queued_task_locked(struct rq *
>   	deactivate_task(src_rq, task, 0);
>   	set_task_cpu(task, dst_rq->cpu);
>   	activate_task(dst_rq, task, 0);
> +	wakeup_preempt(dst_rq, task, 0);

What's the need for wakeup_preempt() here?

In all places, move_queued_task_locked() is followed by resched_curr(),
except in __migrate_swap_task(), which does the same wakeup_preempt().


>   }
>   
>   static inline
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *
>    * Simple, special scheduling class for the per-CPU stop tasks:
>    */
>   DEFINE_SCHED_CLASS(stop) = {
> -
> -	.queue_mask		= 16,
> -
>   	.enqueue_task		= enqueue_task_stop,
>   	.dequeue_task		= dequeue_task_stop,
>   	.yield_task		= yield_task_stop,
> 
> 


  parent reply	other threads:[~2025-11-29 18:09 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-27 15:39 [PATCH 0/5] sched: Random collection of patches Peter Zijlstra
2025-11-27 15:39 ` [PATCH 1/5] sched/fair: Fold the sched_avg update Peter Zijlstra
2025-12-14  7:46   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-12-14  7:46   ` [tip: sched/core] <linux/compiler_types.h>: Add the __signed_scalar_typeof() helper tip-bot2 for Peter Zijlstra
2025-11-27 15:39 ` [PATCH 2/5] sched/fair: Avoid rq->lock bouncing in sched_balance_newidle() Peter Zijlstra
2025-11-29 18:59   ` Shrikanth Hegde
2025-12-14  7:46   ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-11-27 15:39 ` [PATCH 3/5] sched: Change rcu_dereference_check_sched_domain() to rcu-sched Peter Zijlstra
2025-11-28 10:57   ` Peter Zijlstra
2025-11-28 11:04     ` Peter Zijlstra
2025-11-28 11:21       ` Paul E. McKenney
2025-11-28 11:37         ` Peter Zijlstra
2025-12-14  7:46   ` [tip: sched/core] sched/fair: Remove superfluous rcu_read_lock() tip-bot2 for Peter Zijlstra
2025-11-27 15:39 ` [PATCH 4/5] sched: Add assertions to QUEUE_CLASS Peter Zijlstra
2025-12-14  7:46   ` [tip: sched/core] sched/core: " tip-bot2 for Peter Zijlstra
2025-12-18 10:09   ` [PATCH 4/5] sched: " Marek Szyprowski
2025-12-18 10:12     ` Peter Zijlstra
2025-11-27 15:39 ` [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*() Peter Zijlstra
2025-11-28 13:26   ` Kuba Piecuch
2025-11-28 13:36     ` Peter Zijlstra
2025-11-28 13:44       ` Peter Zijlstra
2025-11-28 22:29   ` Andrea Righi
2025-11-29 18:08   ` Shrikanth Hegde [this message]
2025-11-30 11:32     ` Peter Zijlstra
2025-11-30 13:03       ` Shrikanth Hegde
2025-12-02 23:27   ` Tejun Heo
2025-12-14  7:46   ` [tip: sched/core] sched/core: " tip-bot2 for Peter Zijlstra
2025-12-15  6:07   ` error: implicit declaration of function ‘rq_modified_clear’ (was [PATCH 5/5] sched: Rework sched_class::wakeup_preempt() and rq_modified_*()) Thorsten Leemhuis
2025-12-15  7:12     ` Ingo Molnar
2025-12-15 11:51       ` Nathan Chancellor
2025-12-16  7:02         ` Thorsten Leemhuis
2025-12-16 18:40           ` Tejun Heo
2025-12-16 21:42             ` Peter Zijlstra
2025-12-17  9:58               ` Peter Zijlstra
2025-12-15  7:59   ` [tip: sched/core] sched/core: Rework sched_class::wakeup_preempt() and rq_modified_*() tip-bot2 for Peter Zijlstra
2025-12-17 10:02   ` tip-bot2 for Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3d9d0e43-b73c-4bed-b59c-dc1387d183e4@linux.ibm.com \
    --to=sshegde@linux.ibm.com \
    --cc=arighi@nvidia.com \
    --cc=bsegall@google.com \
    --cc=changwoo@igalia.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=void@manifault.com \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox