From: Peter Zijlstra <peterz@infradead.org>
To: tj@kernel.org
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org,
mingo@kernel.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
vschneid@redhat.com, longman@redhat.com, hannes@cmpxchg.org,
mkoutny@suse.com, void@manifault.com, arighi@nvidia.com,
changwoo@igalia.com, cgroups@vger.kernel.org,
sched-ext@lists.linux.dev, liuwenfang@honor.com,
tglx@linutronix.de
Subject: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Date: Mon, 06 Oct 2025 12:44:05 +0200 [thread overview]
Message-ID: <20251006104526.861755244@infradead.org> (raw)
In-Reply-To: 20251006104402.946760805@infradead.org
Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
the change pattern. This completes the pattern and makes it more
symmetric.
This changes the order of callbacks slightly (old order on the left,
new order on the right):

                                |
                                |  switching_from()
  dequeue_task();               |  dequeue_task()
  put_prev_task();              |  put_prev_task()
                                |  switched_from()
                                |
  ... change task ...           |  ... change task ...
                                |
  switching_to();               |  switching_to()
  enqueue_task();               |  enqueue_task()
  set_next_task();              |  set_next_task()
  prev_class->switched_from()   |
  switched_to()                 |  switched_to()
                                |
Notably, it moves the switched_from() callback right after the
dequeue/put. Existing implementations don't appear to be affected by
this change in location -- specifically the task isn't enqueued on the
class in question in either location.
Make {DE,EN}QUEUE_CLASS mutually exclusive with {DE,EN}QUEUE_SAVE and
{DE,EN}QUEUE_MOVE, i.e. (CLASS)^(SAVE|MOVE), because there is nothing
to save and restore when changing scheduling classes.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/core.c | 56 +++++++++++++++++++++--------------------------
kernel/sched/ext.c | 26 ++++++++++++++++-----
kernel/sched/idle.c | 4 +--
kernel/sched/rt.c | 2 -
kernel/sched/sched.h | 22 ++++++------------
kernel/sched/stop_task.c | 4 +--
kernel/sched/syscalls.c | 9 +++++--
7 files changed, 66 insertions(+), 57 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,34 +2169,9 @@ inline int task_curr(const struct task_s
return cpu_curr(task_cpu(p)) == p;
}
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class)
+void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (prev_class != p->sched_class && p->sched_class->switching_to)
- p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
-{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
-
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
+ if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
@@ -7388,6 +7363,11 @@ void rt_mutex_setprio(struct task_struct
prev_class = p->sched_class;
next_class = __setscheduler_class(p->policy, prio);
+ if (prev_class != next_class) {
+ queue_flag |= DEQUEUE_CLASS;
+ queue_flag &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -7424,11 +7404,10 @@ void rt_mutex_setprio(struct task_struct
p->sched_class = next_class;
p->prio = prio;
-
- check_class_changing(rq, p, prev_class);
}
- check_class_changed(rq, p, prev_class, oldprio);
+ if (!(queue_flag & DEQUEUE_CLASS))
+ check_prio_changed(rq, p, oldprio);
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
@@ -10862,6 +10841,14 @@ struct sched_change_ctx *sched_change_be
lockdep_assert_rq_held(rq);
+ if (flags & DEQUEUE_CLASS) {
+ if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
+ flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
*ctx = (struct sched_change_ctx){
.p = p,
.flags = flags,
@@ -10874,6 +10861,9 @@ struct sched_change_ctx *sched_change_be
if (ctx->running)
put_prev_task(rq, p);
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
return ctx;
}
@@ -10884,8 +10874,14 @@ void sched_change_end(struct sched_chang
lockdep_assert_rq_held(rq);
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
if (ctx->queued)
enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
if (ctx->running)
set_next_task(rq, p);
+
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
}
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3912,21 +3912,28 @@ static void scx_disable_workfn(struct kt
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
update_rq_clock(task_rq(p));
+ if (old_class != new_class) {
+ queue_flags |= DEQUEUE_CLASS;
+ queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, queue_flags) {
p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
}
- check_class_changed(task_rq(p), p, old_class, p->prio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(task_rq(p), p, p->prio);
+
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4655,6 +4662,7 @@ static int scx_enable(struct sched_ext_o
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
@@ -4664,16 +4672,22 @@ static int scx_enable(struct sched_ext_o
update_rq_clock(task_rq(p));
+ if (old_class != new_class) {
+ queue_flags |= DEQUEUE_CLASS;
+ queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, queue_flags) {
p->scx.slice = SCX_SLICE_DFL;
p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
}
- check_class_changed(task_rq(p), p, old_class, p->prio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(task_rq(p), p, p->prio);
+
put_task_struct(p);
}
scx_task_iter_stop(&sti);
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -498,7 +498,7 @@ static void task_tick_idle(struct rq *rq
{
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
@@ -536,6 +536,6 @@ DEFINE_SCHED_CLASS(idle) = {
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
+ .switching_to = switching_to_idle,
.update_curr = update_curr_idle,
};
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2589,8 +2589,8 @@ DEFINE_SCHED_CLASS(rt) = {
.get_rr_interval = get_rr_interval_rt,
- .prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+ .prio_changed = prio_changed_rt,
.update_curr = update_curr_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -20,7 +20,6 @@
#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
#include <linux/sched/topology.h>
-
#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/bug.h>
@@ -2369,6 +2368,7 @@ extern const u32 sched_prio_to_wmult[40
#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */
#define DEQUEUE_SPECIAL 0x00010000
#define DEQUEUE_THROTTLE 0x00020000
@@ -2380,6 +2380,7 @@ extern const u32 sched_prio_to_wmult[40
#define ENQUEUE_MIGRATING 0x0010
#define ENQUEUE_DELAYED 0x0020
+#define ENQUEUE_CLASS 0x0040
#define ENQUEUE_HEAD 0x00010000
#define ENQUEUE_REPLENISH 0x00020000
@@ -2443,14 +2444,11 @@ struct sched_class {
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
- /*
- * The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serialized by
- * rq->lock. They are however serialized by p->pi_lock.
- */
- void (*switching_to) (struct rq *this_rq, struct task_struct *task);
- void (*switched_from)(struct rq *this_rq, struct task_struct *task);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -3879,11 +3877,7 @@ extern void set_load_weight(struct task_
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio);
+extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -75,7 +75,7 @@ static void task_tick_stop(struct rq *rq
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
@@ -112,6 +112,6 @@ DEFINE_SCHED_CLASS(stop) = {
.task_tick = task_tick_stop,
.prio_changed = prio_changed_stop,
- .switched_to = switched_to_stop,
+ .switching_to = switching_to_stop,
.update_curr = update_curr_stop,
};
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,6 +684,11 @@ int __sched_setscheduler(struct task_str
prev_class = p->sched_class;
next_class = __setscheduler_class(policy, newprio);
+ if (prev_class != next_class) {
+ queue_flags |= DEQUEUE_CLASS;
+ queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -695,7 +700,6 @@ int __sched_setscheduler(struct task_str
p->prio = newprio;
}
__setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
if (scope->queued) {
/*
@@ -707,7 +711,8 @@ int __sched_setscheduler(struct task_str
}
}
- check_class_changed(rq, p, prev_class, oldprio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(rq, p, oldprio);
/* Avoid rq from going away on us: */
preempt_disable();
next prev parent reply other threads:[~2025-10-06 10:46 UTC|newest]
Thread overview: 56+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
2025-10-06 10:44 ` [PATCH 01/12] sched: Employ sched_change guards Peter Zijlstra
2025-10-07 8:20 ` Andrea Righi
2025-10-08 6:51 ` Peter Zijlstra
2025-10-08 6:58 ` Andrea Righi
2025-10-07 16:58 ` Valentin Schneider
2025-10-08 14:02 ` Peter Zijlstra
2025-10-06 10:44 ` [PATCH 02/12] sched: Re-arrange the {EN,DE}QUEUE flags Peter Zijlstra
2025-10-06 10:44 ` Peter Zijlstra [this message]
2025-10-09 13:30 ` [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern Dietmar Eggemann
2025-10-09 13:54 ` Peter Zijlstra
2025-10-09 14:09 ` Peter Zijlstra
2025-10-09 16:50 ` Dietmar Eggemann
2025-10-13 10:23 ` Peter Zijlstra
2025-10-06 10:44 ` [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches Peter Zijlstra
2025-10-07 15:22 ` Vincent Guittot
2025-10-06 10:44 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
2026-01-12 20:44 ` Pierre Gondois
2026-01-13 4:12 ` K Prateek Nayak
2026-01-13 10:45 ` Pierre Gondois
2026-01-13 11:05 ` K Prateek Nayak
2026-01-13 11:53 ` Peter Zijlstra
2026-01-13 11:56 ` Peter Zijlstra
2026-01-13 13:07 ` Pierre Gondois
2026-01-13 13:10 ` Pierre Gondois
2026-01-13 11:47 ` Peter Zijlstra
2026-01-14 6:47 ` K Prateek Nayak
2026-01-14 10:23 ` Peter Zijlstra
2026-01-14 13:05 ` Peter Zijlstra
2026-01-14 14:04 ` luca abeni
2026-01-14 14:20 ` Juri Lelli
2026-01-14 15:25 ` luca abeni
2026-01-15 8:24 ` Peter Zijlstra
2026-01-15 9:05 ` Peter Zijlstra
2026-01-15 13:13 ` Pierre Gondois
2026-01-15 13:56 ` Juri Lelli
2025-10-06 10:44 ` [PATCH 06/12] sched: Fix migrate_disable_switch() locking Peter Zijlstra
2025-10-06 10:44 ` [PATCH 07/12] sched: Fix do_set_cpus_allowed() locking Peter Zijlstra
2025-10-24 14:58 ` [REGRESSION] Deadlock during CPU hotplug caused by abfc01077df6 Jan Polensky
2025-10-06 10:44 ` [PATCH 08/12] sched: Rename do_set_cpus_allowed() Peter Zijlstra
2025-10-06 10:44 ` [PATCH 09/12] sched: Make __do_set_cpus_allowed() use the sched_change pattern Peter Zijlstra
2025-10-06 10:44 ` [PATCH 10/12] sched: Add locking comments to sched_class methods Peter Zijlstra
2025-10-07 9:54 ` Juri Lelli
2025-10-08 7:04 ` Peter Zijlstra
2025-10-08 7:33 ` Greg Kroah-Hartman
2025-10-08 9:43 ` Juri Lelli
2025-10-08 10:06 ` Greg Kroah-Hartman
2025-10-08 14:34 ` Steven Rostedt
2025-10-06 10:44 ` [PATCH 11/12] sched: Match __task_rq_{,un}lock() Peter Zijlstra
2025-10-07 20:44 ` Tejun Heo
2025-10-06 10:44 ` [PATCH 12/12] sched: Cleanup the sched_change NOCLOCK usage Peter Zijlstra
2025-10-07 8:25 ` [PATCH 00/12] sched: Cleanup the change-pattern and related locking Andrea Righi
2025-10-07 9:55 ` Juri Lelli
2025-10-07 15:23 ` Vincent Guittot
2025-10-07 20:46 ` Tejun Heo
2025-10-08 13:54 ` Valentin Schneider
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251006104526.861755244@infradead.org \
--to=peterz@infradead.org \
--cc=arighi@nvidia.com \
--cc=bsegall@google.com \
--cc=cgroups@vger.kernel.org \
--cc=changwoo@igalia.com \
--cc=dietmar.eggemann@arm.com \
--cc=hannes@cmpxchg.org \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=liuwenfang@honor.com \
--cc=longman@redhat.com \
--cc=mgorman@suse.de \
--cc=mingo@kernel.org \
--cc=mkoutny@suse.com \
--cc=rostedt@goodmis.org \
--cc=sched-ext@lists.linux.dev \
--cc=tglx@linutronix.de \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=void@manifault.com \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox