public inbox for cgroups@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: tj@kernel.org
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org,
	mingo@kernel.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	vschneid@redhat.com, longman@redhat.com, hannes@cmpxchg.org,
	mkoutny@suse.com, void@manifault.com, arighi@nvidia.com,
	changwoo@igalia.com, cgroups@vger.kernel.org,
	sched-ext@lists.linux.dev, liuwenfang@honor.com,
	tglx@linutronix.de
Subject: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Date: Mon, 06 Oct 2025 12:44:05 +0200	[thread overview]
Message-ID: <20251006104526.861755244@infradead.org> (raw)
In-Reply-To: 20251006104402.946760805@infradead.org

Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
the change pattern. This completes and makes the pattern more
symmetric.

This changes the order of callbacks slightly:

				|
				|  switching_from()
  dequeue_task();		|  dequeue_task()
  put_prev_task();		|  put_prev_task()
				|  switched_from()
				|
  ... change task ...		|  ... change task ...
				|
  switching_to();		|  switching_to()
  enqueue_task();		|  enqueue_task()
  set_next_task();		|  set_next_task()
  prev_class->switched_from()	|
  switched_to()			|  switched_to()
				|

Notably, it moves the switched_from() callback right after the
dequeue/put. Existing implementations don't appear to be affected by
this change in location -- specifically the task isn't enqueued on the
class in question in either location.

Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
when changing scheduling classes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/core.c      |   56 +++++++++++++++++++++--------------------------
 kernel/sched/ext.c       |   26 ++++++++++++++++-----
 kernel/sched/idle.c      |    4 +--
 kernel/sched/rt.c        |    2 -
 kernel/sched/sched.h     |   22 ++++++------------
 kernel/sched/stop_task.c |    4 +--
 kernel/sched/syscalls.c  |    9 +++++--
 7 files changed, 66 insertions(+), 57 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,34 +2169,9 @@ inline int task_curr(const struct task_s
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
-			  const struct sched_class *prev_class)
+void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (prev_class != p->sched_class && p->sched_class->switching_to)
-		p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
-			 const struct sched_class *prev_class,
-			 int oldprio)
-{
-	if (prev_class != p->sched_class) {
-		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p);
-
-		p->sched_class->switched_to(rq, p);
-	} else if (oldprio != p->prio || dl_task(p))
+	if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
@@ -7388,6 +7363,11 @@ void rt_mutex_setprio(struct task_struct
 	prev_class = p->sched_class;
 	next_class = __setscheduler_class(p->policy, prio);
 
+	if (prev_class != next_class) {
+		queue_flag |= DEQUEUE_CLASS;
+		queue_flag &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+	}
+
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
@@ -7424,11 +7404,10 @@ void rt_mutex_setprio(struct task_struct
 
 		p->sched_class = next_class;
 		p->prio = prio;
-
-		check_class_changing(rq, p, prev_class);
 	}
 
-	check_class_changed(rq, p, prev_class, oldprio);
+	if (!(queue_flag & DEQUEUE_CLASS))
+		check_prio_changed(rq, p, oldprio);
 out_unlock:
 	/* Avoid rq from going away on us: */
 	preempt_disable();
@@ -10862,6 +10841,14 @@ struct sched_change_ctx *sched_change_be
 
 	lockdep_assert_rq_held(rq);
 
+	if (flags & DEQUEUE_CLASS) {
+		if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
+			flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+
+		if (p->sched_class->switching_from)
+			p->sched_class->switching_from(rq, p);
+	}
+
 	*ctx = (struct sched_change_ctx){
 		.p = p,
 		.flags = flags,
@@ -10874,6 +10861,9 @@ struct sched_change_ctx *sched_change_be
 	if (ctx->running)
 		put_prev_task(rq, p);
 
+	if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+		p->sched_class->switched_from(rq, p);
+
 	return ctx;
 }
 
@@ -10884,8 +10874,14 @@ void sched_change_end(struct sched_chang
 
 	lockdep_assert_rq_held(rq);
 
+	if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+		p->sched_class->switching_to(rq, p);
+
 	if (ctx->queued)
 		enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
 	if (ctx->running)
 		set_next_task(rq, p);
+
+	if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
+		p->sched_class->switched_to(rq, p);
 }
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3912,21 +3912,28 @@ static void scx_disable_workfn(struct kt
 
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
 
 		update_rq_clock(task_rq(p));
 
+		if (old_class != new_class) {
+			queue_flags |= DEQUEUE_CLASS;
+			queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+		}
+
 		if (old_class != new_class && p->se.sched_delayed)
 			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+		scoped_guard (sched_change, p, queue_flags) {
 			p->sched_class = new_class;
-			check_class_changing(task_rq(p), p, old_class);
 		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
+		if (!(queue_flags & DEQUEUE_CLASS))
+			check_prio_changed(task_rq(p), p, p->prio);
+
 		scx_exit_task(p);
 	}
 	scx_task_iter_stop(&sti);
@@ -4655,6 +4662,7 @@ static int scx_enable(struct sched_ext_o
 	percpu_down_write(&scx_fork_rwsem);
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
@@ -4664,16 +4672,22 @@ static int scx_enable(struct sched_ext_o
 
 		update_rq_clock(task_rq(p));
 
+		if (old_class != new_class) {
+			queue_flags |= DEQUEUE_CLASS;
+			queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+		}
+
 		if (old_class != new_class && p->se.sched_delayed)
 			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
-		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+		scoped_guard (sched_change, p, queue_flags) {
 			p->scx.slice = SCX_SLICE_DFL;
 			p->sched_class = new_class;
-			check_class_changing(task_rq(p), p, old_class);
 		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
+		if (!(queue_flags & DEQUEUE_CLASS))
+			check_prio_changed(task_rq(p), p, p->prio);
+
 		put_task_struct(p);
 	}
 	scx_task_iter_stop(&sti);
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -498,7 +498,7 @@ static void task_tick_idle(struct rq *rq
 {
 }
 
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
 {
 	BUG();
 }
@@ -536,6 +536,6 @@ DEFINE_SCHED_CLASS(idle) = {
 	.task_tick		= task_tick_idle,
 
 	.prio_changed		= prio_changed_idle,
-	.switched_to		= switched_to_idle,
+	.switching_to		= switching_to_idle,
 	.update_curr		= update_curr_idle,
 };
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2589,8 +2589,8 @@ DEFINE_SCHED_CLASS(rt) = {
 
 	.get_rr_interval	= get_rr_interval_rt,
 
-	.prio_changed		= prio_changed_rt,
 	.switched_to		= switched_to_rt,
+	.prio_changed		= prio_changed_rt,
 
 	.update_curr		= update_curr_rt,
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -20,7 +20,6 @@
 #include <linux/sched/task_flags.h>
 #include <linux/sched/task.h>
 #include <linux/sched/topology.h>
-
 #include <linux/atomic.h>
 #include <linux/bitmap.h>
 #include <linux/bug.h>
@@ -2369,6 +2368,7 @@ extern const u32		sched_prio_to_wmult[40
 
 #define DEQUEUE_MIGRATING	0x0010 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS		0x0040 /* Matches ENQUEUE_CLASS */
 
 #define DEQUEUE_SPECIAL		0x00010000
 #define DEQUEUE_THROTTLE	0x00020000
@@ -2380,6 +2380,7 @@ extern const u32		sched_prio_to_wmult[40
 
 #define ENQUEUE_MIGRATING	0x0010
 #define ENQUEUE_DELAYED		0x0020
+#define ENQUEUE_CLASS		0x0040
 
 #define ENQUEUE_HEAD		0x00010000
 #define ENQUEUE_REPLENISH	0x00020000
@@ -2443,14 +2444,11 @@ struct sched_class {
 	void (*task_fork)(struct task_struct *p);
 	void (*task_dead)(struct task_struct *p);
 
-	/*
-	 * The switched_from() call is allowed to drop rq->lock, therefore we
-	 * cannot assume the switched_from/switched_to pair is serialized by
-	 * rq->lock. They are however serialized by p->pi_lock.
-	 */
-	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
-	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
-	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
+	void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+	void (*switching_to)  (struct rq *this_rq, struct task_struct *task);
+	void (*switched_to)   (struct rq *this_rq, struct task_struct *task);
+
 	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
 			      const struct load_weight *lw);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -3879,11 +3877,7 @@ extern void set_load_weight(struct task_
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
 
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
-				 const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
-				const struct sched_class *prev_class,
-				int oldprio);
+extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
 
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -75,7 +75,7 @@ static void task_tick_stop(struct rq *rq
 {
 }
 
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
 {
 	BUG(); /* its impossible to change to this class */
 }
@@ -112,6 +112,6 @@ DEFINE_SCHED_CLASS(stop) = {
 	.task_tick		= task_tick_stop,
 
 	.prio_changed		= prio_changed_stop,
-	.switched_to		= switched_to_stop,
+	.switching_to		= switching_to_stop,
 	.update_curr		= update_curr_stop,
 };
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,6 +684,11 @@ int __sched_setscheduler(struct task_str
 	prev_class = p->sched_class;
 	next_class = __setscheduler_class(policy, newprio);
 
+	if (prev_class != next_class) {
+		queue_flags |= DEQUEUE_CLASS;
+		queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+	}
+
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
 
@@ -695,7 +700,6 @@ int __sched_setscheduler(struct task_str
 			p->prio = newprio;
 		}
 		__setscheduler_uclamp(p, attr);
-		check_class_changing(rq, p, prev_class);
 
 		if (scope->queued) {
 			/*
@@ -707,7 +711,8 @@ int __sched_setscheduler(struct task_str
 		}
 	}
 
-	check_class_changed(rq, p, prev_class, oldprio);
+	if (!(queue_flags & DEQUEUE_CLASS))
+		check_prio_changed(rq, p, oldprio);
 
 	/* Avoid rq from going away on us: */
 	preempt_disable();



  parent reply	other threads:[~2025-10-06 10:46 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
2025-10-06 10:44 ` [PATCH 01/12] sched: Employ sched_change guards Peter Zijlstra
2025-10-07  8:20   ` Andrea Righi
2025-10-08  6:51     ` Peter Zijlstra
2025-10-08  6:58       ` Andrea Righi
2025-10-07 16:58   ` Valentin Schneider
2025-10-08 14:02     ` Peter Zijlstra
2025-10-06 10:44 ` [PATCH 02/12] sched: Re-arrange the {EN,DE}QUEUE flags Peter Zijlstra
2025-10-06 10:44 ` Peter Zijlstra [this message]
2025-10-09 13:30   ` [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern Dietmar Eggemann
2025-10-09 13:54     ` Peter Zijlstra
2025-10-09 14:09       ` Peter Zijlstra
2025-10-09 16:50         ` Dietmar Eggemann
2025-10-13 10:23           ` Peter Zijlstra
2025-10-06 10:44 ` [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches Peter Zijlstra
2025-10-07 15:22   ` Vincent Guittot
2025-10-06 10:44 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
2026-01-12 20:44   ` Pierre Gondois
2026-01-13  4:12     ` K Prateek Nayak
2026-01-13 10:45       ` Pierre Gondois
2026-01-13 11:05         ` K Prateek Nayak
2026-01-13 11:53           ` Peter Zijlstra
2026-01-13 11:56             ` Peter Zijlstra
2026-01-13 13:07               ` Pierre Gondois
2026-01-13 13:10               ` Pierre Gondois
2026-01-13 11:47         ` Peter Zijlstra
2026-01-14  6:47           ` K Prateek Nayak
2026-01-14 10:23             ` Peter Zijlstra
2026-01-14 13:05               ` Peter Zijlstra
2026-01-14 14:04                 ` luca abeni
2026-01-14 14:20                 ` Juri Lelli
2026-01-14 15:25                   ` luca abeni
2026-01-15  8:24                   ` Peter Zijlstra
2026-01-15  9:05                     ` Peter Zijlstra
2026-01-15 13:13                       ` Pierre Gondois
2026-01-15 13:56                         ` Juri Lelli
2025-10-06 10:44 ` [PATCH 06/12] sched: Fix migrate_disable_switch() locking Peter Zijlstra
2025-10-06 10:44 ` [PATCH 07/12] sched: Fix do_set_cpus_allowed() locking Peter Zijlstra
2025-10-24 14:58   ` [REGRESSION] Deadlock during CPU hotplug caused by abfc01077df6 Jan Polensky
2025-10-06 10:44 ` [PATCH 08/12] sched: Rename do_set_cpus_allowed() Peter Zijlstra
2025-10-06 10:44 ` [PATCH 09/12] sched: Make __do_set_cpus_allowed() use the sched_change pattern Peter Zijlstra
2025-10-06 10:44 ` [PATCH 10/12] sched: Add locking comments to sched_class methods Peter Zijlstra
2025-10-07  9:54   ` Juri Lelli
2025-10-08  7:04     ` Peter Zijlstra
2025-10-08  7:33       ` Greg Kroah-Hartman
2025-10-08  9:43         ` Juri Lelli
2025-10-08 10:06           ` Greg Kroah-Hartman
2025-10-08 14:34             ` Steven Rostedt
2025-10-06 10:44 ` [PATCH 11/12] sched: Match __task_rq_{,un}lock() Peter Zijlstra
2025-10-07 20:44   ` Tejun Heo
2025-10-06 10:44 ` [PATCH 12/12] sched: Cleanup the sched_change NOCLOCK usage Peter Zijlstra
2025-10-07  8:25 ` [PATCH 00/12] sched: Cleanup the change-pattern and related locking Andrea Righi
2025-10-07  9:55 ` Juri Lelli
2025-10-07 15:23 ` Vincent Guittot
2025-10-07 20:46 ` Tejun Heo
2025-10-08 13:54 ` Valentin Schneider

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251006104526.861755244@infradead.org \
    --to=peterz@infradead.org \
    --cc=arighi@nvidia.com \
    --cc=bsegall@google.com \
    --cc=cgroups@vger.kernel.org \
    --cc=changwoo@igalia.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=liuwenfang@honor.com \
    --cc=longman@redhat.com \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=mkoutny@suse.com \
    --cc=rostedt@goodmis.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=void@manifault.com \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox