* [PATCH 01/12] sched: Employ sched_change guards
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-07 8:20 ` Andrea Righi
` (2 more replies)
2025-10-06 10:44 ` [PATCH 02/12] sched: Re-arrange the {EN,DE}QUEUE flags Peter Zijlstra
` (15 subsequent siblings)
16 siblings, 3 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
include/linux/cleanup.h | 5 +
kernel/sched/core.c | 157 ++++++++++++++++++------------------------------
kernel/sched/ext.c | 39 +++++------
kernel/sched/sched.h | 21 +++---
kernel/sched/syscalls.c | 65 +++++++------------
5 files changed, 116 insertions(+), 171 deletions(-)
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -340,6 +340,11 @@ _label:
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \
+ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
+ static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+ { return (void *)1; }
+
#define __GUARD_IS_ERR(_ptr) \
({ \
unsigned long _rc = (__force unsigned long)(_ptr); \
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7326,7 +7326,7 @@ void rt_mutex_post_schedule(void)
*/
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int prio, oldprio, queued, running, queue_flag =
+ int prio, oldprio, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
@@ -7391,52 +7391,42 @@ void rt_mutex_setprio(struct task_struct
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
-
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_prio(pi_task->prio) &&
- dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.pi_se = pi_task->dl.pi_se;
- queue_flag |= ENQUEUE_REPLENISH;
+ scoped_guard (sched_change, p, queue_flag) {
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
} else {
- p->dl.pi_se = &p->dl;
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
}
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (oldprio < prio)
- queue_flag |= ENQUEUE_HEAD;
- } else {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- }
- p->sched_class = next_class;
- p->prio = prio;
+ p->sched_class = next_class;
+ p->prio = prio;
- check_class_changing(rq, p, prev_class);
-
- if (queued)
- enqueue_task(rq, p, queue_flag);
- if (running)
- set_next_task(rq, p);
+ check_class_changing(rq, p, prev_class);
+ }
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -8084,26 +8074,9 @@ int migrate_task_to(struct task_struct *
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- bool queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
- task_rq_unlock(rq, p, &rf);
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -9205,8 +9178,9 @@ static void sched_change_group(struct ta
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- int queued, running, queue_flags =
+ unsigned int queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9214,29 +9188,16 @@ void sched_move_task(struct task_struct
update_rq_clock(rq);
- running = task_current_donor(rq, tsk);
- queued = task_on_rq_queued(tsk);
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
+ }
- if (queued)
- dequeue_task(rq, tsk, queue_flags);
- if (running)
- put_prev_task(rq, tsk);
-
- sched_change_group(tsk);
- if (!for_autogroup)
- scx_cgroup_move_task(tsk);
-
- if (queued)
- enqueue_task(rq, tsk, queue_flags);
- if (running) {
- set_next_task(rq, tsk);
- /*
- * After changing group, the running task may have joined a
- * throttled one but it's still the running task. Trigger a
- * resched to make sure that task can still run.
- */
+ if (resched)
resched_curr(rq);
- }
}
static struct cgroup_subsys_state *
@@ -10892,37 +10853,39 @@ void sched_mm_cid_fork(struct task_struc
}
#endif /* CONFIG_SCHED_MM_CID */
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
- *ctx = (struct sched_enq_and_set_ctx){
+ *ctx = (struct sched_change_ctx){
.p = p,
- .queue_flags = queue_flags,
+ .flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current(rq, p),
};
- update_rq_clock(rq);
if (ctx->queued)
- dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);
+
+ return ctx;
}
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
{
- struct rq *rq = task_rq(ctx->p);
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
if (ctx->queued)
- enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
if (ctx->running)
- set_next_task(rq, ctx->p);
+ set_next_task(rq, p);
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3780,11 +3780,10 @@ static void scx_bypass(bool bypass)
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
- struct sched_enq_and_set_ctx ctx;
-
/* cycling deq/enq is enough, see the function comment */
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
+ }
}
/* resched to restore ticks and idle state */
@@ -3916,17 +3915,16 @@ static void scx_disable_workfn(struct kt
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
-
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ update_rq_clock(task_rq(p));
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+ }
check_class_changed(task_rq(p), p, old_class, p->prio);
scx_exit_task(p);
@@ -4660,21 +4658,20 @@ static int scx_enable(struct sched_ext_o
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
if (!tryget_task_struct(p))
continue;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ update_rq_clock(task_rq(p));
- p->scx.slice = SCX_SLICE_DFL;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+ }
check_class_changed(task_rq(p), p, old_class, p->prio);
put_task_struct(p);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3885,23 +3885,22 @@ extern void check_class_changed(struct r
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#ifdef CONFIG_SCHED_CLASS_EXT
-/*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
- */
-struct sched_enq_and_set_ctx {
+struct sched_change_ctx {
struct task_struct *p;
- int queue_flags;
+ int flags;
bool queued;
bool running;
};
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+ sched_change_end(_T),
+ sched_change_begin(p, flags),
+ struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
#include "ext.h"
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_st
void set_user_nice(struct task_struct *p, long nice)
{
- bool queued, running;
struct rq *rq;
int old_prio;
@@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p
return;
}
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ }
/*
* If the task increased its priority or is running and
@@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_str
bool user, bool pi)
{
int oldpolicy = -1, policy = attr->sched_policy;
- int retval, oldprio, newprio, queued, running;
+ int retval, oldprio, newprio;
const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
@@ -698,33 +687,25 @@ int __sched_setscheduler(struct task_str
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
-
- if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
- __setscheduler_params(p, attr);
- p->sched_class = next_class;
- p->prio = newprio;
- }
- __setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
+ scoped_guard (sched_change, p, queue_flags) {
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ p->sched_class = next_class;
+ p->prio = newprio;
+ }
+ __setscheduler_uclamp(p, attr);
+ check_class_changing(rq, p, prev_class);
- enqueue_task(rq, p, queue_flags);
+ if (scope->queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ scope->flags |= ENQUEUE_HEAD;
+ }
}
- if (running)
- set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 01/12] sched: Employ sched_change guards
2025-10-06 10:44 ` [PATCH 01/12] sched: Employ sched_change guards Peter Zijlstra
@ 2025-10-07 8:20 ` Andrea Righi
2025-10-08 6:51 ` Peter Zijlstra
2025-10-07 16:58 ` Valentin Schneider
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2 siblings, 1 reply; 74+ messages in thread
From: Andrea Righi @ 2025-10-07 8:20 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, changwoo, cgroups, sched-ext, liuwenfang,
tglx
Hi Peter,
On Mon, Oct 06, 2025 at 12:44:03PM +0200, Peter Zijlstra wrote:
> As proposed a long while ago -- and half done by scx -- wrap the
> scheduler's 'change' pattern in a guard helper.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Acked-by: Tejun Heo <tj@kernel.org>
> ---
...
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3885,23 +3885,22 @@ extern void check_class_changed(struct r
> extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>
> -#ifdef CONFIG_SCHED_CLASS_EXT
> -/*
> - * Used by SCX in the enable/disable paths to move tasks between sched_classes
> - * and establish invariants.
> - */
> -struct sched_enq_and_set_ctx {
Not necessarily for this patch, we can add it later, but I kinda liked the
comment that briefly explained how the context is used. Maybe having
something along these lines could be helpful?
/*
* Used to ensure the correct sequence of task state transitions, such as
* switching between sched_classes, changing CPU affinity, priority, or
* updating the queued/running state.
*/
> +struct sched_change_ctx {
> struct task_struct *p;
> - int queue_flags;
> + int flags;
> bool queued;
> bool running;
> };
>
> -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
> - struct sched_enq_and_set_ctx *ctx);
> -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
> +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
> +void sched_change_end(struct sched_change_ctx *ctx);
>
> -#endif /* CONFIG_SCHED_CLASS_EXT */
> +DEFINE_CLASS(sched_change, struct sched_change_ctx *,
> + sched_change_end(_T),
> + sched_change_begin(p, flags),
> + struct task_struct *p, unsigned int flags)
> +
> +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
>
> #include "ext.h"
>
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -64,7 +64,6 @@ static int effective_prio(struct task_st
>
> void set_user_nice(struct task_struct *p, long nice)
> {
> - bool queued, running;
> struct rq *rq;
> int old_prio;
>
> @@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p
> return;
> }
>
> - queued = task_on_rq_queued(p);
> - running = task_current_donor(rq, p);
> - if (queued)
> - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
> - if (running)
> - put_prev_task(rq, p);
> -
> - p->static_prio = NICE_TO_PRIO(nice);
> - set_load_weight(p, true);
> - old_prio = p->prio;
> - p->prio = effective_prio(p);
> -
> - if (queued)
> - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
> - if (running)
> - set_next_task(rq, p);
> + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
> + p->static_prio = NICE_TO_PRIO(nice);
> + set_load_weight(p, true);
> + old_prio = p->prio;
> + p->prio = effective_prio(p);
> + }
>
> /*
> * If the task increased its priority or is running and
> @@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_str
> bool user, bool pi)
> {
> int oldpolicy = -1, policy = attr->sched_policy;
> - int retval, oldprio, newprio, queued, running;
> + int retval, oldprio, newprio;
> const struct sched_class *prev_class, *next_class;
> struct balance_callback *head;
> struct rq_flags rf;
> @@ -698,33 +687,25 @@ int __sched_setscheduler(struct task_str
> if (prev_class != next_class && p->se.sched_delayed)
> dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
>
> - queued = task_on_rq_queued(p);
> - running = task_current_donor(rq, p);
> - if (queued)
> - dequeue_task(rq, p, queue_flags);
> - if (running)
> - put_prev_task(rq, p);
> -
> - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
> - __setscheduler_params(p, attr);
> - p->sched_class = next_class;
> - p->prio = newprio;
> - }
> - __setscheduler_uclamp(p, attr);
> - check_class_changing(rq, p, prev_class);
> + scoped_guard (sched_change, p, queue_flags) {
>
> - if (queued) {
> - /*
> - * We enqueue to tail when the priority of a task is
> - * increased (user space view).
> - */
> - if (oldprio < p->prio)
> - queue_flags |= ENQUEUE_HEAD;
> + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
> + __setscheduler_params(p, attr);
> + p->sched_class = next_class;
> + p->prio = newprio;
> + }
> + __setscheduler_uclamp(p, attr);
> + check_class_changing(rq, p, prev_class);
>
> - enqueue_task(rq, p, queue_flags);
> + if (scope->queued) {
> + /*
> + * We enqueue to tail when the priority of a task is
> + * increased (user space view).
> + */
> + if (oldprio < p->prio)
> + scope->flags |= ENQUEUE_HEAD;
> + }
> }
> - if (running)
> - set_next_task(rq, p);
>
> check_class_changed(rq, p, prev_class, oldprio);
>
>
>
Thanks,
-Andrea
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 01/12] sched: Employ sched_change guards
2025-10-07 8:20 ` Andrea Righi
@ 2025-10-08 6:51 ` Peter Zijlstra
2025-10-08 6:58 ` Andrea Righi
0 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-08 6:51 UTC (permalink / raw)
To: Andrea Righi
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, changwoo, cgroups, sched-ext, liuwenfang,
tglx
On Tue, Oct 07, 2025 at 10:20:44AM +0200, Andrea Righi wrote:
> Hi Peter,
>
> On Mon, Oct 06, 2025 at 12:44:03PM +0200, Peter Zijlstra wrote:
> > As proposed a long while ago -- and half done by scx -- wrap the
> > scheduler's 'change' pattern in a guard helper.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > Acked-by: Tejun Heo <tj@kernel.org>
> > ---
> ...
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3885,23 +3885,22 @@ extern void check_class_changed(struct r
> > extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> > extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
> >
> > -#ifdef CONFIG_SCHED_CLASS_EXT
> > -/*
> > - * Used by SCX in the enable/disable paths to move tasks between sched_classes
> > - * and establish invariants.
> > - */
> > -struct sched_enq_and_set_ctx {
>
> Not necessarily for this patch, we can add it later, but I kinda liked the
> comment that briefly explained how the context is used. Maybe having
> something along these lines could be helpful?
I have changed it thus:
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3885,6 +3885,22 @@ extern void check_class_changed(struct r
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+/*
+ * The 'sched_change' pattern is the safe, easy and slow way of changing a
+ * task's scheduling properties. It dequeues a task, such that the scheduler
+ * is fully unaware of it; at which point its properties can be modified;
+ * after which it is enqueued again.
+ *
+ * Typically this must be called while holding task_rq_lock, since most/all
+ * properties are serialized under those locks. There is currently one
+ * exception to this rule in sched/ext which only holds rq->lock.
+ */
+
+/*
+ * This structure is a temporary, used to preserve/convey the queueing state
+ * of the task between sched_change_begin() and sched_change_end(). Ensuring
+ * the task's queueing state is idempotent across the operation.
+ */
struct sched_change_ctx {
struct task_struct *p;
int flags;
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 01/12] sched: Employ sched_change guards
2025-10-08 6:51 ` Peter Zijlstra
@ 2025-10-08 6:58 ` Andrea Righi
0 siblings, 0 replies; 74+ messages in thread
From: Andrea Righi @ 2025-10-08 6:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, changwoo, cgroups, sched-ext, liuwenfang,
tglx
Hi Peter,
On Wed, Oct 08, 2025 at 08:51:03AM +0200, Peter Zijlstra wrote:
> On Tue, Oct 07, 2025 at 10:20:44AM +0200, Andrea Righi wrote:
> > Hi Peter,
> >
> > On Mon, Oct 06, 2025 at 12:44:03PM +0200, Peter Zijlstra wrote:
> > > As proposed a long while ago -- and half done by scx -- wrap the
> > > scheduler's 'change' pattern in a guard helper.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > > Acked-by: Tejun Heo <tj@kernel.org>
> > > ---
> > ...
> > > --- a/kernel/sched/sched.h
> > > +++ b/kernel/sched/sched.h
> > > @@ -3885,23 +3885,22 @@ extern void check_class_changed(struct r
> > > extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> > > extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
> > >
> > > -#ifdef CONFIG_SCHED_CLASS_EXT
> > > -/*
> > > - * Used by SCX in the enable/disable paths to move tasks between sched_classes
> > > - * and establish invariants.
> > > - */
> > > -struct sched_enq_and_set_ctx {
> >
> > Not necessarily for this patch, we can add it later, but I kinda liked the
> > comment that briefly explained how the context is used. Maybe having
> > something along these lines could be helpful?
>
> I have changed it thus:
>
>
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3885,6 +3885,22 @@ extern void check_class_changed(struct r
> extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>
> +/*
> + * The 'sched_change' pattern is the safe, easy and slow way of changing a
> + * task's scheduling properties. It dequeues a task, such that the scheduler
> + * is fully unaware of it; at which point its properties can be modified;
> + * after which it is enqueued again.
> + *
> + * Typically this must be called while holding task_rq_lock, since most/all
> + * properties are serialized under those locks. There is currently one
> + * exception to this rule in sched/ext which only holds rq->lock.
> + */
> +
> +/*
> + * This structure is a temporary, used to preserve/convey the queueing state
> + * of the task between sched_change_begin() and sched_change_end(). Ensuring
> + * the task's queueing state is idempotent across the operation.
> + */
Looks great and very clear, thanks!
-Andrea
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH 01/12] sched: Employ sched_change guards
2025-10-06 10:44 ` [PATCH 01/12] sched: Employ sched_change guards Peter Zijlstra
2025-10-07 8:20 ` Andrea Righi
@ 2025-10-07 16:58 ` Valentin Schneider
2025-10-08 14:02 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2 siblings, 1 reply; 74+ messages in thread
From: Valentin Schneider @ 2025-10-07 16:58 UTC (permalink / raw)
To: Peter Zijlstra, tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, longman, hannes,
mkoutny, void, arighi, changwoo, cgroups, sched-ext, liuwenfang,
tglx
On 06/10/25 12:44, Peter Zijlstra wrote:
> @@ -7391,52 +7391,42 @@ void rt_mutex_setprio(struct task_struct
> if (prev_class != next_class && p->se.sched_delayed)
> dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
>
> - queued = task_on_rq_queued(p);
> - running = task_current_donor(rq, p);
I'm not sure how that plays with sched_ext, but for the "standard" change
pattern such as this one & the others in core.c, that becomes a
task_current() per sched_change_begin(). I'm guessing we want to make
sched_change_begin() use task_current_donor() instead?
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH 01/12] sched: Employ sched_change guards
2025-10-07 16:58 ` Valentin Schneider
@ 2025-10-08 14:02 ` Peter Zijlstra
0 siblings, 0 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-08 14:02 UTC (permalink / raw)
To: Valentin Schneider
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, longman, hannes,
mkoutny, void, arighi, changwoo, cgroups, sched-ext, liuwenfang,
tglx
On Tue, Oct 07, 2025 at 06:58:44PM +0200, Valentin Schneider wrote:
> On 06/10/25 12:44, Peter Zijlstra wrote:
> > @@ -7391,52 +7391,42 @@ void rt_mutex_setprio(struct task_struct
> > if (prev_class != next_class && p->se.sched_delayed)
> > dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> >
> > - queued = task_on_rq_queued(p);
> > - running = task_current_donor(rq, p);
>
>
> I'm not sure how that plays with sched_ext, but for the "standard" change
> pattern such as this one & the others in core.c, that becomes a
> task_current() per sched_change_begin(). I'm guessing we want to make
> sched_change_begin() use task_current_donor() instead?
Argh yeah, rebase fail. Let me go fix.
^ permalink raw reply [flat|nested] 74+ messages in thread
* [tip: sched/core] sched: Employ sched_change guards
2025-10-06 10:44 ` [PATCH 01/12] sched: Employ sched_change guards Peter Zijlstra
2025-10-07 8:20 ` Andrea Righi
2025-10-07 16:58 ` Valentin Schneider
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
2 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: e9139f765ac7048cadc9981e962acdf8b08eabf3
Gitweb: https://git.kernel.org/tip/e9139f765ac7048cadc9981e962acdf8b08eabf3
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 30 Oct 2024 13:43:43 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:50 +02:00
sched: Employ sched_change guards
As proposed a long while ago -- and half done by scx -- wrap the
scheduler's 'change' pattern in a guard helper.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
include/linux/cleanup.h | 5 +-
kernel/sched/core.c | 159 ++++++++++++++-------------------------
kernel/sched/ext.c | 39 ++++------
kernel/sched/sched.h | 33 +++++---
kernel/sched/syscalls.c | 65 +++++-----------
5 files changed, 131 insertions(+), 170 deletions(-)
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585..ae38167 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -340,6 +340,11 @@ _label: \
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
+#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \
+ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
+ static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
+ { return (void *)1; }
+
#define __GUARD_IS_ERR(_ptr) \
({ \
unsigned long _rc = (__force unsigned long)(_ptr); \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 198d2dd..eca40df 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7326,7 +7326,7 @@ void rt_mutex_post_schedule(void)
*/
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int prio, oldprio, queued, running, queue_flag =
+ int prio, oldprio, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
@@ -7391,52 +7391,42 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
-
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_prio(pi_task->prio) &&
- dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.pi_se = pi_task->dl.pi_se;
- queue_flag |= ENQUEUE_REPLENISH;
+ scoped_guard (sched_change, p, queue_flag) {
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
} else {
- p->dl.pi_se = &p->dl;
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
}
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (oldprio < prio)
- queue_flag |= ENQUEUE_HEAD;
- } else {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- }
- p->sched_class = next_class;
- p->prio = prio;
+ p->sched_class = next_class;
+ p->prio = prio;
- check_class_changing(rq, p, prev_class);
-
- if (queued)
- enqueue_task(rq, p, queue_flag);
- if (running)
- set_next_task(rq, p);
+ check_class_changing(rq, p, prev_class);
+ }
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -8084,26 +8074,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- bool queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
- task_rq_unlock(rq, p, &rf);
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -9205,8 +9178,9 @@ static void sched_change_group(struct task_struct *tsk)
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- int queued, running, queue_flags =
+ unsigned int queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
@@ -9214,29 +9188,16 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
update_rq_clock(rq);
- running = task_current_donor(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, queue_flags);
- if (running)
- put_prev_task(rq, tsk);
-
- sched_change_group(tsk);
- if (!for_autogroup)
- scx_cgroup_move_task(tsk);
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
+ }
- if (queued)
- enqueue_task(rq, tsk, queue_flags);
- if (running) {
- set_next_task(rq, tsk);
- /*
- * After changing group, the running task may have joined a
- * throttled one but it's still the running task. Trigger a
- * resched to make sure that task can still run.
- */
+ if (resched)
resched_curr(rq);
- }
}
static struct cgroup_subsys_state *
@@ -10892,37 +10853,39 @@ void sched_mm_cid_fork(struct task_struct *t)
}
#endif /* CONFIG_SCHED_MM_CID */
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
- *ctx = (struct sched_enq_and_set_ctx){
+ *ctx = (struct sched_change_ctx){
.p = p,
- .queue_flags = queue_flags,
+ .flags = flags,
.queued = task_on_rq_queued(p),
- .running = task_current(rq, p),
+ .running = task_current_donor(rq, p),
};
- update_rq_clock(rq);
if (ctx->queued)
- dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);
+
+ return ctx;
}
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
{
- struct rq *rq = task_rq(ctx->p);
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
if (ctx->queued)
- enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
if (ctx->running)
- set_next_task(rq, ctx->p);
+ set_next_task(rq, p);
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2b0e882..4566a7c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3780,11 +3780,10 @@ static void scx_bypass(bool bypass)
*/
list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
scx.runnable_node) {
- struct sched_enq_and_set_ctx ctx;
-
/* cycling deq/enq is enough, see the function comment */
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
+ }
}
/* resched to restore ticks and idle state */
@@ -3916,17 +3915,16 @@ static void scx_disable_workfn(struct kthread_work *work)
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ update_rq_clock(task_rq(p));
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+ }
check_class_changed(task_rq(p), p, old_class, p->prio);
scx_exit_task(p);
@@ -4660,21 +4658,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- struct sched_enq_and_set_ctx ctx;
if (!tryget_task_struct(p))
continue;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
- sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ update_rq_clock(task_rq(p));
- p->scx.slice = SCX_SLICE_DFL;
- p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- sched_enq_and_set_task(&ctx);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+ }
check_class_changed(task_rq(p), p, old_class, p->prio);
put_task_struct(p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1f5d070..6546849 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3885,23 +3885,38 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#ifdef CONFIG_SCHED_CLASS_EXT
/*
- * Used by SCX in the enable/disable paths to move tasks between sched_classes
- * and establish invariants.
+ * The 'sched_change' pattern is the safe, easy and slow way of changing a
+ * task's scheduling properties. It dequeues a task, such that the scheduler
+ * is fully unaware of it; at which point its properties can be modified;
+ * after which it is enqueued again.
+ *
+ * Typically this must be called while holding task_rq_lock, since most/all
+ * properties are serialized under those locks. There is currently one
+ * exception to this rule in sched/ext which only holds rq->lock.
+ */
+
+/*
+ * This structure is a temporary, used to preserve/convey the queueing state
+ * of the task between sched_change_begin() and sched_change_end(), ensuring
+ * the task's queueing state is idempotent across the operation.
*/
-struct sched_enq_and_set_ctx {
+struct sched_change_ctx {
struct task_struct *p;
- int queue_flags;
+ int flags;
bool queued;
bool running;
};
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx);
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
-#endif /* CONFIG_SCHED_CLASS_EXT */
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+ sched_change_end(_T),
+ sched_change_begin(p, flags),
+ struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
#include "ext.h"
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 77ae87f..09ffe91 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_struct *p)
void set_user_nice(struct task_struct *p, long nice)
{
- bool queued, running;
struct rq *rq;
int old_prio;
@@ -90,22 +89,12 @@ void set_user_nice(struct task_struct *p, long nice)
return;
}
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ }
/*
* If the task increased its priority or is running and
@@ -515,7 +504,7 @@ int __sched_setscheduler(struct task_struct *p,
bool user, bool pi)
{
int oldpolicy = -1, policy = attr->sched_policy;
- int retval, oldprio, newprio, queued, running;
+ int retval, oldprio, newprio;
const struct sched_class *prev_class, *next_class;
struct balance_callback *head;
struct rq_flags rf;
@@ -698,33 +687,25 @@ change:
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
-
- if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
- __setscheduler_params(p, attr);
- p->sched_class = next_class;
- p->prio = newprio;
- }
- __setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
+ scoped_guard (sched_change, p, queue_flags) {
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ p->sched_class = next_class;
+ p->prio = newprio;
+ }
+ __setscheduler_uclamp(p, attr);
+ check_class_changing(rq, p, prev_class);
- enqueue_task(rq, p, queue_flags);
+ if (scope->queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ scope->flags |= ENQUEUE_HEAD;
+ }
}
- if (running)
- set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 02/12] sched: Re-arrange the {EN,DE}QUEUE flags
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
2025-10-06 10:44 ` [PATCH 01/12] sched: Employ sched_change guards Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern Peter Zijlstra
` (14 subsequent siblings)
16 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
Ensure the matched flags are in the low word while the unmatched flags
go into the second word.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/sched.h | 45 ++++++++++++++++++++++++---------------------
1 file changed, 24 insertions(+), 21 deletions(-)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2362,27 +2362,30 @@ extern const u32 sched_prio_to_wmult[40
*
*/
-#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
-#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
-#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
-#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
-#define DEQUEUE_SPECIAL 0x10
-#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
-#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
-#define DEQUEUE_THROTTLE 0x800
-
-#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_RESTORE 0x02
-#define ENQUEUE_MOVE 0x04
-#define ENQUEUE_NOCLOCK 0x08
-
-#define ENQUEUE_HEAD 0x10
-#define ENQUEUE_REPLENISH 0x20
-#define ENQUEUE_MIGRATED 0x40
-#define ENQUEUE_INITIAL 0x80
-#define ENQUEUE_MIGRATING 0x100
-#define ENQUEUE_DELAYED 0x200
-#define ENQUEUE_RQ_SELECTED 0x400
+#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
+#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */
+
+#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+
+#define DEQUEUE_SPECIAL 0x00010000
+#define DEQUEUE_THROTTLE 0x00020000
+
+#define ENQUEUE_WAKEUP 0x0001
+#define ENQUEUE_RESTORE 0x0002
+#define ENQUEUE_MOVE 0x0004
+#define ENQUEUE_NOCLOCK 0x0008
+
+#define ENQUEUE_MIGRATING 0x0010
+#define ENQUEUE_DELAYED 0x0020
+
+#define ENQUEUE_HEAD 0x00010000
+#define ENQUEUE_REPLENISH 0x00020000
+#define ENQUEUE_MIGRATED 0x00040000
+#define ENQUEUE_INITIAL 0x00080000
+#define ENQUEUE_RQ_SELECTED 0x00100000
#define RETRY_TASK ((void *)-1UL)
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Re-arrange the {EN,DE}QUEUE flags
2025-10-06 10:44 ` [PATCH 02/12] sched: Re-arrange the {EN,DE}QUEUE flags Peter Zijlstra
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 376f8963bbda5fee838eb1823b07562368104024
Gitweb: https://git.kernel.org/tip/376f8963bbda5fee838eb1823b07562368104024
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 30 Oct 2024 13:52:05 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:50 +02:00
sched: Re-arrange the {EN,DE}QUEUE flags
Ensure the matched flags are in the low word while the unmatched flags
go into the second word.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/sched.h | 45 ++++++++++++++++++++++---------------------
1 file changed, 24 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6546849..24b3c6c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2362,27 +2362,30 @@ extern const u32 sched_prio_to_wmult[40];
*
*/
-#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
-#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
-#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
-#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
-#define DEQUEUE_SPECIAL 0x10
-#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
-#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
-#define DEQUEUE_THROTTLE 0x800
-
-#define ENQUEUE_WAKEUP 0x01
-#define ENQUEUE_RESTORE 0x02
-#define ENQUEUE_MOVE 0x04
-#define ENQUEUE_NOCLOCK 0x08
-
-#define ENQUEUE_HEAD 0x10
-#define ENQUEUE_REPLENISH 0x20
-#define ENQUEUE_MIGRATED 0x40
-#define ENQUEUE_INITIAL 0x80
-#define ENQUEUE_MIGRATING 0x100
-#define ENQUEUE_DELAYED 0x200
-#define ENQUEUE_RQ_SELECTED 0x400
+#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
+#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */
+
+#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+
+#define DEQUEUE_SPECIAL 0x00010000
+#define DEQUEUE_THROTTLE 0x00020000
+
+#define ENQUEUE_WAKEUP 0x0001
+#define ENQUEUE_RESTORE 0x0002
+#define ENQUEUE_MOVE 0x0004
+#define ENQUEUE_NOCLOCK 0x0008
+
+#define ENQUEUE_MIGRATING 0x0010
+#define ENQUEUE_DELAYED 0x0020
+
+#define ENQUEUE_HEAD 0x00010000
+#define ENQUEUE_REPLENISH 0x00020000
+#define ENQUEUE_MIGRATED 0x00040000
+#define ENQUEUE_INITIAL 0x00080000
+#define ENQUEUE_RQ_SELECTED 0x00100000
#define RETRY_TASK ((void *)-1UL)
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
2025-10-06 10:44 ` [PATCH 01/12] sched: Employ sched_change guards Peter Zijlstra
2025-10-06 10:44 ` [PATCH 02/12] sched: Re-arrange the {EN,DE}QUEUE flags Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-09 13:30 ` Dietmar Eggemann
2025-10-16 9:33 ` [tip: sched/core] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches Peter Zijlstra
` (13 subsequent siblings)
16 siblings, 2 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
the change pattern. This completes the pattern and makes it more
symmetric.
This changes the order of callbacks slightly:
|
| switching_from()
dequeue_task(); | dequeue_task()
put_prev_task(); | put_prev_task()
| switched_from()
|
... change task ... | ... change task ...
|
switching_to(); | switching_to()
enqueue_task(); | enqueue_task()
set_next_task(); | set_next_task()
prev_class->switched_from() |
switched_to() | switched_to()
|
Notably, it moves the switched_from() callback right after the
dequeue/put. Existing implementations don't appear to be affected by
this change in location -- specifically the task isn't enqueued on the
class in question in either location.
Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
when changing scheduling classes.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/core.c | 56 +++++++++++++++++++++--------------------------
kernel/sched/ext.c | 26 ++++++++++++++++-----
kernel/sched/idle.c | 4 +--
kernel/sched/rt.c | 2 -
kernel/sched/sched.h | 22 ++++++------------
kernel/sched/stop_task.c | 4 +--
kernel/sched/syscalls.c | 9 +++++--
7 files changed, 66 insertions(+), 57 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,34 +2169,9 @@ inline int task_curr(const struct task_s
return cpu_curr(task_cpu(p)) == p;
}
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class)
+void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (prev_class != p->sched_class && p->sched_class->switching_to)
- p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
-{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
-
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
+ if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
@@ -7388,6 +7363,11 @@ void rt_mutex_setprio(struct task_struct
prev_class = p->sched_class;
next_class = __setscheduler_class(p->policy, prio);
+ if (prev_class != next_class) {
+ queue_flag |= DEQUEUE_CLASS;
+ queue_flag &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -7424,11 +7404,10 @@ void rt_mutex_setprio(struct task_struct
p->sched_class = next_class;
p->prio = prio;
-
- check_class_changing(rq, p, prev_class);
}
- check_class_changed(rq, p, prev_class, oldprio);
+ if (!(queue_flag & DEQUEUE_CLASS))
+ check_prio_changed(rq, p, oldprio);
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
@@ -10862,6 +10841,14 @@ struct sched_change_ctx *sched_change_be
lockdep_assert_rq_held(rq);
+ if (flags & DEQUEUE_CLASS) {
+ if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
+ flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
*ctx = (struct sched_change_ctx){
.p = p,
.flags = flags,
@@ -10874,6 +10861,9 @@ struct sched_change_ctx *sched_change_be
if (ctx->running)
put_prev_task(rq, p);
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
return ctx;
}
@@ -10884,8 +10874,14 @@ void sched_change_end(struct sched_chang
lockdep_assert_rq_held(rq);
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
if (ctx->queued)
enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
if (ctx->running)
set_next_task(rq, p);
+
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
}
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3912,21 +3912,28 @@ static void scx_disable_workfn(struct kt
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
update_rq_clock(task_rq(p));
+ if (old_class != new_class) {
+ queue_flags |= DEQUEUE_CLASS;
+ queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, queue_flags) {
p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
}
- check_class_changed(task_rq(p), p, old_class, p->prio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(task_rq(p), p, p->prio);
+
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4655,6 +4662,7 @@ static int scx_enable(struct sched_ext_o
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
@@ -4664,16 +4672,22 @@ static int scx_enable(struct sched_ext_o
update_rq_clock(task_rq(p));
+ if (old_class != new_class) {
+ queue_flags |= DEQUEUE_CLASS;
+ queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, queue_flags) {
p->scx.slice = SCX_SLICE_DFL;
p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
}
- check_class_changed(task_rq(p), p, old_class, p->prio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(task_rq(p), p, p->prio);
+
put_task_struct(p);
}
scx_task_iter_stop(&sti);
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -498,7 +498,7 @@ static void task_tick_idle(struct rq *rq
{
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
@@ -536,6 +536,6 @@ DEFINE_SCHED_CLASS(idle) = {
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
+ .switching_to = switching_to_idle,
.update_curr = update_curr_idle,
};
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2589,8 +2589,8 @@ DEFINE_SCHED_CLASS(rt) = {
.get_rr_interval = get_rr_interval_rt,
- .prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+ .prio_changed = prio_changed_rt,
.update_curr = update_curr_rt,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -20,7 +20,6 @@
#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
#include <linux/sched/topology.h>
-
#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/bug.h>
@@ -2369,6 +2368,7 @@ extern const u32 sched_prio_to_wmult[40
#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */
#define DEQUEUE_SPECIAL 0x00010000
#define DEQUEUE_THROTTLE 0x00020000
@@ -2380,6 +2380,7 @@ extern const u32 sched_prio_to_wmult[40
#define ENQUEUE_MIGRATING 0x0010
#define ENQUEUE_DELAYED 0x0020
+#define ENQUEUE_CLASS 0x0040
#define ENQUEUE_HEAD 0x00010000
#define ENQUEUE_REPLENISH 0x00020000
@@ -2443,14 +2444,11 @@ struct sched_class {
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
- /*
- * The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serialized by
- * rq->lock. They are however serialized by p->pi_lock.
- */
- void (*switching_to) (struct rq *this_rq, struct task_struct *task);
- void (*switched_from)(struct rq *this_rq, struct task_struct *task);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -3879,11 +3877,7 @@ extern void set_load_weight(struct task_
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio);
+extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -75,7 +75,7 @@ static void task_tick_stop(struct rq *rq
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
@@ -112,6 +112,6 @@ DEFINE_SCHED_CLASS(stop) = {
.task_tick = task_tick_stop,
.prio_changed = prio_changed_stop,
- .switched_to = switched_to_stop,
+ .switching_to = switching_to_stop,
.update_curr = update_curr_stop,
};
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,6 +684,11 @@ int __sched_setscheduler(struct task_str
prev_class = p->sched_class;
next_class = __setscheduler_class(policy, newprio);
+ if (prev_class != next_class) {
+ queue_flags |= DEQUEUE_CLASS;
+ queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
+ }
+
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -695,7 +700,6 @@ int __sched_setscheduler(struct task_str
p->prio = newprio;
}
__setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
if (scope->queued) {
/*
@@ -707,7 +711,8 @@ int __sched_setscheduler(struct task_str
}
}
- check_class_changed(rq, p, prev_class, oldprio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(rq, p, oldprio);
/* Avoid rq from going away on us: */
preempt_disable();
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
2025-10-06 10:44 ` [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern Peter Zijlstra
@ 2025-10-09 13:30 ` Dietmar Eggemann
2025-10-09 13:54 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern tip-bot2 for Peter Zijlstra
1 sibling, 1 reply; 74+ messages in thread
From: Dietmar Eggemann @ 2025-10-09 13:30 UTC (permalink / raw)
To: Peter Zijlstra, tj
Cc: linux-kernel, mingo, juri.lelli, vincent.guittot, rostedt,
bsegall, mgorman, vschneid, longman, hannes, mkoutny, void,
arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
On 06.10.25 12:44, Peter Zijlstra wrote:
> Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
> the change pattern. This completes and makes the pattern more
> symmetric.
>
> This changes the order of callbacks slightly:
>
> |
> | switching_from()
> dequeue_task(); | dequeue_task()
> put_prev_task(); | put_prev_task()
> | switched_from()
> |
> ... change task ... | ... change task ...
> |
> switching_to(); | switching_to()
> enqueue_task(); | enqueue_task()
> set_next_task(); | set_next_task()
> prev_class->switched_from() |
> switched_to() | switched_to()
> |
>
> Notably, it moves the switched_from() callback right after the
> dequeue/put. Existing implementations don't appear to be affected by
> this change in location -- specifically the task isn't enqueued on the
> class in question in either location.
>
> Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
> when changing scheduling classes.
This one causes a DL bw related warning when I run a simple 1 DL task
rt-app workload:
# rt-app ./rt-app/dl10.json
[rt-app] <notice> thread_data_set_unique_name 0 thread0-0
[rt-app] <notice> [0] starting thread ...
[rt-app] <notice> [0] Starting with SCHED_DEADLINE policy with priority 0
[ 16.390272] sched: DL replenish lagged too much
[ 16.390327] ------------[ cut here ]------------
[ 16.390329] WARNING: CPU: 2 PID: 591 at kernel/sched/deadline.c:239 sub_running_bw.isra.0+0xf4/0x150
[ 16.391849] Modules linked in:
[ 16.392107] CPU: 2 UID: 0 PID: 591 Comm: thread0-0 Not tainted 6.17.0-rc4-00020-ga6b63e5ce187 #46 PREEMPT
[ 16.392885] Hardware name: linux,dummy-virt (DT)
[ 16.393265] pstate: 014000c5 (nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
[ 16.393783] pc : sub_running_bw.isra.0+0xf4/0x150
[ 16.394153] lr : sub_running_bw.isra.0+0x118/0x150
[ 16.394636] sp : ffff80008137bb10
[ 16.394864] x29: ffff80008137bb10 x28: ffff0000ff7b39c0 x27: ffff0000ce73dd60
[ 16.395333] x26: 0000000000000000 x25: ffffa1134d945000 x24: ffff0000ff7b42c8
[ 16.395805] x23: ffffa1134d944000 x22: ffffa1134d944000 x21: 000000000000cccc
[ 16.396267] x20: 0000000000060000 x19: ffff0000ff7b42c8 x18: fffffffffffe6f58
[ 16.396742] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000001
[ 16.397202] x14: fffffffffffc6f57 x13: 0a6863756d206f6f x12: ffffa1134e743f60
[ 16.397674] x11: 00000000000000c0 x10: 0000000000000001 x9 : 0000000000000000
[ 16.398130] x8 : ffff0000c001e490 x7 : 0000000000000008 x6 : ffff0000c0029968
[ 16.398883] x5 : 00000000ffffffff x4 : 0000000000000064 x3 : ffff0000c0029fa8
[ 16.399432] x2 : ffff5eedb1f6e000 x1 : 000000000000cccc x0 : fffffffffffacccc
[ 16.399962] Call trace:
[ 16.400147] sub_running_bw.isra.0+0xf4/0x150 (P)
[ 16.400510] task_non_contending+0x248/0x2ac
[ 16.400831] dequeue_task_dl+0x178/0x2d4
[ 16.401122] __schedule+0x6ac/0x1038
[ 16.401401] schedule+0x4c/0x164
[ 16.401627] do_nanosleep+0x6c/0x190
[ 16.401862] hrtimer_nanosleep+0xbc/0x200
[ 16.402156] common_nsleep_timens+0x50/0x90
[ 16.402522] __arm64_sys_clock_nanosleep+0xd0/0x150
[ 16.402813] invoke_syscall+0x48/0x104
[ 16.403043] el0_svc_common.constprop.0+0x40/0xe0
[ 16.403327] do_el0_svc+0x1c/0x28
[ 16.403520] el0_svc+0x4c/0x160
[ 16.403711] el0t_64_sync_handler+0xa0/0xf0
[ 16.403950] el0t_64_sync+0x198/0x19c
[ 16.404226] irq event stamp: 196
[ 16.404451] hardirqs last enabled at (195): [<ffffa1134c8021d8>] _raw_spin_unlock_irqrestore+0x6c/0x74
[ 16.405086] hardirqs last disabled at (196): [<ffffa1134c7f7850>] __schedule+0x4e8/0x1038
[ 16.405629] softirqs last enabled at (154): [<ffffa1134b4e157c>] handle_softirqs+0x44c/0x498
[ 16.406218] softirqs last disabled at (145): [<ffffa1134b410774>] __do_softirq+0x14/0x20
with extra logging and removing underflow WARN_ON_ONCE:
# rt-app ./rt-app/dl10.json
[rt-app] <notice> thread_data_set_unique_name 0 thread0-0
[rt-app] <notice> [0] starting thread ...
[rt-app] <notice> [0] Starting with SCHED_DEADLINE policy with priority 0
[ 18.494469] sched: DL replenish lagged too much
[ 18.494483] cpu=3 p->comm=thread0-0 p->pid=592
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[ 18.494486] __sub_running_bw() cpu=3 dl_rq->running_bw=18446744073709210828 dl_bw=393216 old=52428
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dl_rq->running_bw underflow in task_non_contending()
[ 18.494492] CPU: 3 UID: 0 PID: 592 Comm: thread0-0 Not tainted 6.17.0-rc4-00020-ga6b63e5ce187-dirty #44 PREEMPT
[ 18.494495] Hardware name: linux,dummy-virt (DT)
[ 18.494497] Call trace:
[ 18.494498] show_stack+0x18/0x24 (C)
[ 18.494510] dump_stack_lvl+0x70/0x98
[ 18.494514] dump_stack+0x18/0x24
[ 18.494516] sub_running_bw.isra.0+0x164/0x180
[ 18.494539] task_non_contending+0x298/0x2e8
[ 18.494541] dequeue_task_dl+0x188/0x31c
[ 18.494544] __schedule+0x6ac/0x1038
[ 18.494574] schedule+0x4c/0x164
[ 18.494578] do_nanosleep+0x6c/0x190
[ 18.494580] hrtimer_nanosleep+0xbc/0x200
[ 18.494594] common_nsleep_timens+0x50/0x90
[ 18.494599] __arm64_sys_clock_nanosleep+0xd0/0x150
[ 18.494602] invoke_syscall+0x48/0x104
[ 18.494610] el0_svc_common.constprop.0+0x40/0xe0
[ 18.494612] do_el0_svc+0x1c/0x28
[ 18.494615] el0_svc+0x4c/0x160
[ 18.494617] el0t_64_sync_handler+0xa0/0xf0
[ 18.494620] el0t_64_sync+0x198/0x19c
Not sure yet how this is related to switched_from_dl() now being called earlier.
[...]
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
2025-10-09 13:30 ` Dietmar Eggemann
@ 2025-10-09 13:54 ` Peter Zijlstra
2025-10-09 14:09 ` Peter Zijlstra
0 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-09 13:54 UTC (permalink / raw)
To: Dietmar Eggemann
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot, rostedt,
bsegall, mgorman, vschneid, longman, hannes, mkoutny, void,
arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
On Thu, Oct 09, 2025 at 03:30:02PM +0200, Dietmar Eggemann wrote:
> On 06.10.25 12:44, Peter Zijlstra wrote:
> > Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
> > the change pattern. This completes and makes the pattern more
> > symmetric.
> >
> > This changes the order of callbacks slightly:
> >
> > |
> > | switching_from()
> > dequeue_task(); | dequeue_task()
> > put_prev_task(); | put_prev_task()
> > | switched_from()
> > |
> > ... change task ... | ... change task ...
> > |
> > switching_to(); | switching_to()
> > enqueue_task(); | enqueue_task()
> > set_next_task(); | set_next_task()
> > prev_class->switched_from() |
> > switched_to() | switched_to()
> > |
> >
> > Notably, it moves the switched_from() callback right after the
> > dequeue/put. Existing implementations don't appear to be affected by
> > this change in location -- specifically the task isn't enqueued on the
> > class in question in either location.
> >
> > Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
> > when changing scheduling classes.
>
> This one causes a DL bw related warning when I run a simple 1 DL task
> rt-app workload:
> Not sure yet how this is related to switched_from_dl() now being called earlier.
Ooh, I might see a problem. task_non_contending() uses dl_task(), which
uses p->prio. The move above means it is now called using the 'old'
prio, whereas it used to run with the 'new' prio.
I suppose it does this to distinguish 'real' DL tasks from PI boosted DL
tasks.
Let me see if I can figure out something for this.
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
2025-10-09 13:54 ` Peter Zijlstra
@ 2025-10-09 14:09 ` Peter Zijlstra
2025-10-09 16:50 ` Dietmar Eggemann
2025-10-16 9:33 ` [tip: sched/core] sched/deadline: Prepare for switched_from() change tip-bot2 for Peter Zijlstra
0 siblings, 2 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-09 14:09 UTC (permalink / raw)
To: Dietmar Eggemann
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot, rostedt,
bsegall, mgorman, vschneid, longman, hannes, mkoutny, void,
arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
On Thu, Oct 09, 2025 at 03:54:08PM +0200, Peter Zijlstra wrote:
> On Thu, Oct 09, 2025 at 03:30:02PM +0200, Dietmar Eggemann wrote:
> > On 06.10.25 12:44, Peter Zijlstra wrote:
> > > Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
> > > the change pattern. This completes and makes the pattern more
> > > symmetric.
> > >
> > > This changes the order of callbacks slightly:
> > >
> > > |
> > > | switching_from()
> > > dequeue_task(); | dequeue_task()
> > > put_prev_task(); | put_prev_task()
> > > | switched_from()
> > > |
> > > ... change task ... | ... change task ...
> > > |
> > > switching_to(); | switching_to()
> > > enqueue_task(); | enqueue_task()
> > > set_next_task(); | set_next_task()
> > > prev_class->switched_from() |
> > > switched_to() | switched_to()
> > > |
> > >
> > > Notably, it moves the switched_from() callback right after the
> > > dequeue/put. Existing implementations don't appear to be affected by
> > > this change in location -- specifically the task isn't enqueued on the
> > > class in question in either location.
> > >
> > > Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
> > > when changing scheduling classes.
> >
> > This one causes a DL bw related warning when I run a simple 1 DL task
> > rt-app workload:
>
> > Not sure yet how this is related to switched_from_dl() now being called earlier.
>
> Ooh, I might see a problem. task_non_contending() uses dl_task(), which
> uses p->prio. The move above means it is now called using the 'old'
> prio, whereas it used to run with the 'new' prio.
>
> Let me see if I can figure out something for this.
Does this help? /me goes find rt-app.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 615411a0a881..fe2272c812b2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -405,7 +405,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct sched_dl_entity *dl_se)
+static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
{
struct hrtimer *timer = &dl_se->inactive_timer;
struct rq *rq = rq_of_dl_se(dl_se);
@@ -444,10 +444,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
} else {
struct task_struct *p = dl_task_of(dl_se);
- if (dl_task(p))
+ if (dl_task)
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (READ_ONCE(p->__state) == TASK_DEAD)
@@ -2045,7 +2045,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
* or "inactive")
*/
if (flags & DEQUEUE_SLEEP)
- task_non_contending(dl_se);
+ task_non_contending(dl_se, true);
}
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -2970,7 +2970,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(&p->dl);
+ task_non_contending(&p->dl, false);
/*
* In case a task is setscheduled out from SCHED_DEADLINE we need to
^ permalink raw reply related [flat|nested] 74+ messages in thread* Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
2025-10-09 14:09 ` Peter Zijlstra
@ 2025-10-09 16:50 ` Dietmar Eggemann
2025-10-13 10:23 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] sched/deadline: Prepare for switched_from() change tip-bot2 for Peter Zijlstra
1 sibling, 1 reply; 74+ messages in thread
From: Dietmar Eggemann @ 2025-10-09 16:50 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot, rostedt,
bsegall, mgorman, vschneid, longman, hannes, mkoutny, void,
arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
On 09.10.25 16:09, Peter Zijlstra wrote:
> On Thu, Oct 09, 2025 at 03:54:08PM +0200, Peter Zijlstra wrote:
>> On Thu, Oct 09, 2025 at 03:30:02PM +0200, Dietmar Eggemann wrote:
>>> On 06.10.25 12:44, Peter Zijlstra wrote:
>>>> Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
>>>> the change pattern. This completes and makes the pattern more
>>>> symmetric.
>>>>
>>>> This changes the order of callbacks slightly:
>>>>
>>>> |
>>>> | switching_from()
>>>> dequeue_task(); | dequeue_task()
>>>> put_prev_task(); | put_prev_task()
>>>> | switched_from()
>>>> |
>>>> ... change task ... | ... change task ...
>>>> |
>>>> switching_to(); | switching_to()
>>>> enqueue_task(); | enqueue_task()
>>>> set_next_task(); | set_next_task()
>>>> prev_class->switched_from() |
>>>> switched_to() | switched_to()
>>>> |
>>>>
>>>> Notably, it moves the switched_from() callback right after the
>>>> dequeue/put. Existing implementations don't appear to be affected by
>>>> this change in location -- specifically the task isn't enqueued on the
>>>> class in question in either location.
>>>>
>>>> Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
>>>> when changing scheduling classes.
>>>
>>> This one causes a DL bw related warning when I run a simple 1 DL task
>>> rt-app workload:
>>
>>> Not sure yet how this is related to switched_from_dl() being now called earlier?
>>
>> Ooh, I might see a problem. task_non_contending() uses dl_task(), which
>> uses p->prio. The move above means it is now called using the 'old'
>> prio, whereas it used to run with the 'new' prio.
>>
>> Let me see if I can figure out something for this.
>
> Does this help? /me goes find rt-app.
Yes, but there seems to be more ... missing DEQUEUE_SAVE (a.k.a.
ENQUEUE_RESTORE) in
enqueue_dl_entity()
if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING))
^^^^^^^^^^^^^^^
...
add_running_bw(dl_se, dl_rq)
and
__sched_setscheduler()
...
if (prev_class != next_class)
queue_flags |= DEQUEUE_CLASS;
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
^^^^^^^^^^^^
as well as
sched_change_begin()
...
if (flags & DEQUEUE_CLASS) {
if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
^^^^^^^^^^^^
With your patch and this the issue went away:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 884926d3dd95..35074799e9ad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10844,9 +10844,6 @@ struct sched_change_ctx
*sched_change_begin(struct task_struct *p, unsigned int
lockdep_assert_rq_held(rq);
if (flags & DEQUEUE_CLASS) {
- if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
- flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
-
if (p->sched_class->switching_from)
p->sched_class->switching_from(rq, p);
}
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 007d1440374b..bcef5c72d287 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,10 +684,8 @@ int __sched_setscheduler(struct task_struct *p,
prev_class = p->sched_class;
next_class = __setscheduler_class(policy, newprio);
- if (prev_class != next_class) {
+ if (prev_class != next_class)
queue_flags |= DEQUEUE_CLASS;
- queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
- }
^ permalink raw reply related [flat|nested] 74+ messages in thread* Re: [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
2025-10-09 16:50 ` Dietmar Eggemann
@ 2025-10-13 10:23 ` Peter Zijlstra
0 siblings, 0 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-13 10:23 UTC (permalink / raw)
To: Dietmar Eggemann
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot, rostedt,
bsegall, mgorman, vschneid, longman, hannes, mkoutny, void,
arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
On Thu, Oct 09, 2025 at 06:50:55PM +0200, Dietmar Eggemann wrote:
> Yes, but there seems to be more ... missing DEQUEUE_SAVE (a.k.a.
> ENQUEUE_RESTORE) in
>
> enqueue_dl_entity()
>
> if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING))
> ^^^^^^^^^^^^^^^
> ...
> add_running_bw(dl_se, dl_rq)
>
> and
>
> __sched_setscheduler()
>
> ...
> if (prev_class != next_class)
> queue_flags |= DEQUEUE_CLASS;
> queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
> ^^^^^^^^^^^^
>
> as well as
>
> sched_change_begin()
>
> ...
> if (flags & DEQUEUE_CLASS) {
> if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
> flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
> ^^^^^^^^^^^^
>
Urgh.. SAVE/RESTORE while changing CLASS is so weird.
But yeah, let me take that bit out for now -- I'll make a note in a
comment that we should look at perhaps cleaning that up instead.
^ permalink raw reply [flat|nested] 74+ messages in thread
* [tip: sched/core] sched/deadline: Prepare for switched_from() change
2025-10-09 14:09 ` Peter Zijlstra
2025-10-09 16:50 ` Dietmar Eggemann
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Dietmar Eggemann, Peter Zijlstra (Intel), x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 5e42d4c123ba9b89ce19b3aa7e22b7684cbfa49c
Gitweb: https://git.kernel.org/tip/5e42d4c123ba9b89ce19b3aa7e22b7684cbfa49c
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 09 Oct 2025 16:09:25 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:51 +02:00
sched/deadline: Prepare for switched_from() change
Prepare for the sched_class::switch*() methods getting folded into the
change pattern. As a result of that, the location of switched_from
will change slightly. SCHED_DEADLINE is affected by this change in
location:
OLD NEW
|
| switching_from()
dequeue_task(); | dequeue_task()
put_prev_task(); | put_prev_task()
| switched_from()
|
... change task ... | ... change task ...
|
switching_to(); | switching_to()
enqueue_task(); | enqueue_task()
set_next_task(); | set_next_task()
prev_class->switched_from() |
switched_to() | switched_to()
|
Notably, where switched_from() was called *after* the change to the
task, it will get called before it. Specifically, switched_from_dl()
uses dl_task(p) which uses p->prio; which is changed when switching
class (it might be the reason to switch class in case of PI).
When switched_from_dl() gets called, the task will have left the
deadline class and dl_task() must be false, while when doing
dequeue_dl_entity() the task must be a dl_task(), otherwise we'd have
called a different dequeue method.
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/deadline.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 933bd1f..fd147a7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -405,7 +405,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct sched_dl_entity *dl_se)
+static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
{
struct hrtimer *timer = &dl_se->inactive_timer;
struct rq *rq = rq_of_dl_se(dl_se);
@@ -444,10 +444,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
} else {
struct task_struct *p = dl_task_of(dl_se);
- if (dl_task(p))
+ if (dl_task)
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
if (READ_ONCE(p->__state) == TASK_DEAD)
@@ -2045,7 +2045,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
* or "inactive")
*/
if (flags & DEQUEUE_SLEEP)
- task_non_contending(dl_se);
+ task_non_contending(dl_se, true);
}
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -2970,7 +2970,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(&p->dl);
+ task_non_contending(&p->dl, false);
/*
* In case a task is setscheduled out from SCHED_DEADLINE we need to
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [tip: sched/core] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
2025-10-06 10:44 ` [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern Peter Zijlstra
2025-10-09 13:30 ` Dietmar Eggemann
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 637b0682821b144d5993211cf0a768b322138a69
Gitweb: https://git.kernel.org/tip/637b0682821b144d5993211cf0a768b322138a69
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 30 Oct 2024 15:08:15 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:51 +02:00
sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern
Add {DE,EN}QUEUE_CLASS and fold the sched_class::switch* methods into
the change pattern. This completes and makes the pattern more
symmetric.
This changes the order of callbacks slightly:
OLD NEW
|
| switching_from()
dequeue_task(); | dequeue_task()
put_prev_task(); | put_prev_task()
| switched_from()
|
... change task ... | ... change task ...
|
switching_to(); | switching_to()
enqueue_task(); | enqueue_task()
set_next_task(); | set_next_task()
prev_class->switched_from() |
switched_to() | switched_to()
|
Notably, it moves the switched_from() callback right after the
dequeue/put. Existing implementations don't appear to be affected by
this change in location -- specifically the task isn't enqueued on the
class in question in either location.
Make (CLASS)^(SAVE|MOVE), because there is nothing to save-restore
when changing scheduling classes.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 51 ++++++++++++++++-----------------------
kernel/sched/ext.c | 22 ++++++++++++-----
kernel/sched/idle.c | 4 +--
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 22 ++++++-----------
kernel/sched/stop_task.c | 4 +--
kernel/sched/syscalls.c | 7 +++--
7 files changed, 55 insertions(+), 57 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eca40df..4dbd206 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,34 +2169,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class)
-{
- if (prev_class != p->sched_class && p->sched_class->switching_to)
- p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
+void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
-
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
+ if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
@@ -7388,6 +7363,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
prev_class = p->sched_class;
next_class = __setscheduler_class(p->policy, prio);
+ if (prev_class != next_class)
+ queue_flag |= DEQUEUE_CLASS;
+
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -7424,11 +7402,10 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->sched_class = next_class;
p->prio = prio;
-
- check_class_changing(rq, p, prev_class);
}
- check_class_changed(rq, p, prev_class, oldprio);
+ if (!(queue_flag & DEQUEUE_CLASS))
+ check_prio_changed(rq, p, oldprio);
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
@@ -10862,6 +10839,11 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
lockdep_assert_rq_held(rq);
+ if (flags & DEQUEUE_CLASS) {
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
*ctx = (struct sched_change_ctx){
.p = p,
.flags = flags,
@@ -10874,6 +10856,9 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
if (ctx->running)
put_prev_task(rq, p);
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
return ctx;
}
@@ -10884,8 +10869,14 @@ void sched_change_end(struct sched_change_ctx *ctx)
lockdep_assert_rq_held(rq);
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
if (ctx->queued)
enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
if (ctx->running)
set_next_task(rq, p);
+
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4566a7c..a408c39 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3912,21 +3912,26 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
update_rq_clock(task_rq(p));
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, queue_flags) {
p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
}
- check_class_changed(task_rq(p), p, old_class, p->prio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(task_rq(p), p, p->prio);
+
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4655,6 +4660,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
@@ -4664,16 +4670,20 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
update_rq_clock(task_rq(p));
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, queue_flags) {
p->scx.slice = SCX_SLICE_DFL;
p->sched_class = new_class;
- check_class_changing(task_rq(p), p, old_class);
}
- check_class_changed(task_rq(p), p, old_class, p->prio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(task_rq(p), p, p->prio);
+
put_task_struct(p);
}
scx_task_iter_stop(&sti);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c39b089..f02dced 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -498,7 +498,7 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
@@ -536,6 +536,6 @@ DEFINE_SCHED_CLASS(idle) = {
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
+ .switching_to = switching_to_idle,
.update_curr = update_curr_idle,
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7936d43..6b2e811 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2589,8 +2589,8 @@ DEFINE_SCHED_CLASS(rt) = {
.get_rr_interval = get_rr_interval_rt,
- .prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+ .prio_changed = prio_changed_rt,
.update_curr = update_curr_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24b3c6c..e3f4215 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -20,7 +20,6 @@
#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
#include <linux/sched/topology.h>
-
#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/bug.h>
@@ -2369,6 +2368,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */
#define DEQUEUE_SPECIAL 0x00010000
#define DEQUEUE_THROTTLE 0x00020000
@@ -2380,6 +2380,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_MIGRATING 0x0010
#define ENQUEUE_DELAYED 0x0020
+#define ENQUEUE_CLASS 0x0040
#define ENQUEUE_HEAD 0x00010000
#define ENQUEUE_REPLENISH 0x00020000
@@ -2443,14 +2444,11 @@ struct sched_class {
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
- /*
- * The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serialized by
- * rq->lock. They are however serialized by p->pi_lock.
- */
- void (*switching_to) (struct rq *this_rq, struct task_struct *task);
- void (*switched_from)(struct rq *this_rq, struct task_struct *task);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_from)(struct rq *this_rq, struct task_struct *task);
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -3879,11 +3877,7 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class);
-extern void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio);
+extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 2d4e279..fcc4c54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -75,7 +75,7 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
@@ -112,6 +112,6 @@ DEFINE_SCHED_CLASS(stop) = {
.task_tick = task_tick_stop,
.prio_changed = prio_changed_stop,
- .switched_to = switched_to_stop,
+ .switching_to = switching_to_stop,
.update_curr = update_curr_stop,
};
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 09ffe91..bcef5c7 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -684,6 +684,9 @@ change:
prev_class = p->sched_class;
next_class = __setscheduler_class(policy, newprio);
+ if (prev_class != next_class)
+ queue_flags |= DEQUEUE_CLASS;
+
if (prev_class != next_class && p->se.sched_delayed)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -695,7 +698,6 @@ change:
p->prio = newprio;
}
__setscheduler_uclamp(p, attr);
- check_class_changing(rq, p, prev_class);
if (scope->queued) {
/*
@@ -707,7 +709,8 @@ change:
}
}
- check_class_changed(rq, p, prev_class, oldprio);
+ if (!(queue_flags & DEQUEUE_CLASS))
+ check_prio_changed(rq, p, oldprio);
/* Avoid rq from going away on us: */
preempt_disable();
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (2 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 03/12] sched: Fold sched_class::switch{ing,ed}_{to,from}() into the change pattern Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-07 15:22 ` Vincent Guittot
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
` (12 subsequent siblings)
16 siblings, 2 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
Use the new sched_class::switching_from() method to dequeue delayed
tasks before switching to another class.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/core.c | 12 ++++++++----
kernel/sched/ext.c | 6 ------
kernel/sched/fair.c | 7 +++++++
kernel/sched/syscalls.c | 3 ---
4 files changed, 15 insertions(+), 13 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7368,9 +7368,6 @@ void rt_mutex_setprio(struct task_struct
queue_flag &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
}
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flag) {
/*
* Boosting condition are:
@@ -10845,8 +10842,15 @@ struct sched_change_ctx *sched_change_be
if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (p->sched_class->switching_from)
+ if (p->sched_class->switching_from) {
+ /*
+ * switching_from_fair() assumes CLASS implies NOCLOCK;
+ * fixing this assumption would mean switching_from()
+ * would need to be able to change flags.
+ */
+ WARN_ON(!(flags & DEQUEUE_NOCLOCK));
p->sched_class->switching_from(rq, p);
+ }
}
*ctx = (struct sched_change_ctx){
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3924,9 +3924,6 @@ static void scx_disable_workfn(struct kt
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
}
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flags) {
p->sched_class = new_class;
}
@@ -4677,9 +4674,6 @@ static int scx_enable(struct sched_ext_o
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
}
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flags) {
p->scx.slice = SCX_SLICE_DFL;
p->sched_class = new_class;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13237,6 +13237,12 @@ static void attach_task_cfs_rq(struct ta
attach_entity_cfs_rq(se);
}
+static void switching_from_fair(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -13638,6 +13644,7 @@ DEFINE_SCHED_CLASS(fair) = {
.reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
+ .switching_from = switching_from_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -689,9 +689,6 @@ int __sched_setscheduler(struct task_str
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
}
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flags) {
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches
2025-10-06 10:44 ` [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches Peter Zijlstra
@ 2025-10-07 15:22 ` Vincent Guittot
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: Vincent Guittot @ 2025-10-07 15:22 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, juri.lelli, dietmar.eggemann, rostedt,
bsegall, mgorman, vschneid, longman, hannes, mkoutny, void,
arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
On Mon, 6 Oct 2025 at 12:45, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Use the new sched_class::switching_from() method to dequeue delayed
> tasks before switching to another class.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
> kernel/sched/core.c | 12 ++++++++----
> kernel/sched/ext.c | 6 ------
> kernel/sched/fair.c | 7 +++++++
> kernel/sched/syscalls.c | 3 ---
> 4 files changed, 15 insertions(+), 13 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7368,9 +7368,6 @@ void rt_mutex_setprio(struct task_struct
> queue_flag &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
> }
>
> - if (prev_class != next_class && p->se.sched_delayed)
> - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> -
> scoped_guard (sched_change, p, queue_flag) {
> /*
> * Boosting condition are:
> @@ -10845,8 +10842,15 @@ struct sched_change_ctx *sched_change_be
> if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
> flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
>
> - if (p->sched_class->switching_from)
> + if (p->sched_class->switching_from) {
> + /*
> + * switching_from_fair() assumes CLASS implies NOCLOCK;
> + * fixing this assumption would mean switching_from()
> + * would need to be able to change flags.
> + */
> + WARN_ON(!(flags & DEQUEUE_NOCLOCK));
> p->sched_class->switching_from(rq, p);
> + }
> }
>
> *ctx = (struct sched_change_ctx){
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -3924,9 +3924,6 @@ static void scx_disable_workfn(struct kt
> queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
> }
>
> - if (old_class != new_class && p->se.sched_delayed)
> - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> -
> scoped_guard (sched_change, p, queue_flags) {
> p->sched_class = new_class;
> }
> @@ -4677,9 +4674,6 @@ static int scx_enable(struct sched_ext_o
> queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
> }
>
> - if (old_class != new_class && p->se.sched_delayed)
> - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> -
> scoped_guard (sched_change, p, queue_flags) {
> p->scx.slice = SCX_SLICE_DFL;
> p->sched_class = new_class;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13237,6 +13237,12 @@ static void attach_task_cfs_rq(struct ta
> attach_entity_cfs_rq(se);
> }
>
> +static void switching_from_fair(struct rq *rq, struct task_struct *p)
> +{
> + if (p->se.sched_delayed)
> + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> +}
> +
> static void switched_from_fair(struct rq *rq, struct task_struct *p)
> {
> detach_task_cfs_rq(p);
> @@ -13638,6 +13644,7 @@ DEFINE_SCHED_CLASS(fair) = {
>
> .reweight_task = reweight_task_fair,
> .prio_changed = prio_changed_fair,
> + .switching_from = switching_from_fair,
> .switched_from = switched_from_fair,
> .switched_to = switched_to_fair,
>
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -689,9 +689,6 @@ int __sched_setscheduler(struct task_str
> queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
> }
>
> - if (prev_class != next_class && p->se.sched_delayed)
> - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
> -
> scoped_guard (sched_change, p, queue_flags) {
>
> if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
>
>
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Cleanup sched_delayed handling for class switches
2025-10-06 10:44 ` [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches Peter Zijlstra
2025-10-07 15:22 ` Vincent Guittot
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Vincent Guittot, Juri Lelli, Tejun Heo,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 1ae5f5dfe5adc64a90b1b0ab5bd9bd7c9d140c28
Gitweb: https://git.kernel.org/tip/1ae5f5dfe5adc64a90b1b0ab5bd9bd7c9d140c28
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 30 Oct 2024 15:47:46 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:51 +02:00
sched: Cleanup sched_delayed handling for class switches
Use the new sched_class::switching_from() method to dequeue delayed
tasks before switching to another class.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/core.c | 12 ++++++++----
kernel/sched/ext.c | 6 ------
kernel/sched/fair.c | 7 +++++++
kernel/sched/syscalls.c | 3 ---
4 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4dbd206..bd2c551 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7366,9 +7366,6 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (prev_class != next_class)
queue_flag |= DEQUEUE_CLASS;
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flag) {
/*
* Boosting condition are:
@@ -10840,8 +10837,15 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
lockdep_assert_rq_held(rq);
if (flags & DEQUEUE_CLASS) {
- if (p->sched_class->switching_from)
+ if (p->sched_class->switching_from) {
+ /*
+ * switching_from_fair() assumes CLASS implies NOCLOCK;
+ * fixing this assumption would mean switching_from()
+ * would need to be able to change flags.
+ */
+ WARN_ON(!(flags & DEQUEUE_NOCLOCK));
p->sched_class->switching_from(rq, p);
+ }
}
*ctx = (struct sched_change_ctx){
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a408c39..b0a1e2a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3922,9 +3922,6 @@ static void scx_disable_workfn(struct kthread_work *work)
if (old_class != new_class)
queue_flags |= DEQUEUE_CLASS;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flags) {
p->sched_class = new_class;
}
@@ -4673,9 +4670,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (old_class != new_class)
queue_flags |= DEQUEUE_CLASS;
- if (old_class != new_class && p->se.sched_delayed)
- dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flags) {
p->scx.slice = SCX_SLICE_DFL;
p->sched_class = new_class;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac881df..6c462e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13249,6 +13249,12 @@ static void attach_task_cfs_rq(struct task_struct *p)
attach_entity_cfs_rq(se);
}
+static void switching_from_fair(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -13650,6 +13656,7 @@ DEFINE_SCHED_CLASS(fair) = {
.reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
+ .switching_from = switching_from_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index bcef5c7..6583faf 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -687,9 +687,6 @@ change:
if (prev_class != next_class)
queue_flags |= DEQUEUE_CLASS;
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
scoped_guard (sched_change, p, queue_flags) {
if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (3 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 04/12] sched: Cleanup sched_delayed handling for class switches Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2026-01-12 20:44 ` [PATCH 05/12] " Pierre Gondois
2025-10-06 10:44 ` [PATCH 06/12] sched: Fix migrate_disable_switch() locking Peter Zijlstra
` (11 subsequent siblings)
16 siblings, 2 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
Move sched_class::prio_changed() into the change pattern.
And while there, extend it with sched_class::get_prio() in order to
fix the deadline situation.
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/core.c | 24 +++++++++++++-----------
kernel/sched/deadline.c | 20 +++++++++++---------
kernel/sched/ext.c | 8 +-------
kernel/sched/fair.c | 8 ++++++--
kernel/sched/idle.c | 5 ++++-
kernel/sched/rt.c | 5 ++++-
kernel/sched/sched.h | 7 ++++---
kernel/sched/stop_task.c | 5 ++++-
kernel/sched/syscalls.c | 9 ---------
9 files changed, 47 insertions(+), 44 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,12 +2169,6 @@ inline int task_curr(const struct task_s
return cpu_curr(task_cpu(p)) == p;
}
-void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
-{
- if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -7402,9 +7396,6 @@ void rt_mutex_setprio(struct task_struct
p->sched_class = next_class;
p->prio = prio;
}
-
- if (!(queue_flag & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
@@ -10860,6 +10851,13 @@ struct sched_change_ctx *sched_change_be
.running = task_current(rq, p),
};
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
dequeue_task(rq, p, flags);
if (ctx->running)
@@ -10886,6 +10884,10 @@ void sched_change_end(struct sched_chang
if (ctx->running)
set_next_task(rq, p);
- if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
- p->sched_class->switched_to(rq, p);
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3042,23 +3042,24 @@ static void switched_to_dl(struct rq *rq
}
}
+static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
+{
+ return p->dl.deadline;
+}
+
/*
* If the scheduling parameters of a -deadline task changed,
* a push or pull operation might be needed.
*/
-static void prio_changed_dl(struct rq *rq, struct task_struct *p,
- int oldprio)
+static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
{
if (!task_on_rq_queued(p))
return;
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
+ if (p->dl.deadline == old_deadline)
+ return;
+
+ if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
if (task_current_donor(rq, p)) {
@@ -3113,6 +3114,7 @@ DEFINE_SCHED_CLASS(dl) = {
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
+ .get_prio = get_prio_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2961,7 +2961,7 @@ static void reweight_task_scx(struct rq
p, p->scx.weight);
}
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}
@@ -3928,9 +3928,6 @@ static void scx_disable_workfn(struct kt
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4679,9 +4676,6 @@ static int scx_enable(struct sched_ext_o
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
put_task_struct(p);
}
scx_task_iter_stop(&sti);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13138,11 +13138,14 @@ static void task_fork_fair(struct task_s
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (rq->cfs.nr_queued == 1)
return;
@@ -13154,8 +13157,9 @@ prio_changed_fair(struct rq *rq, struct
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -504,8 +504,11 @@ static void switching_to_idle(struct rq
}
static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG();
}
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq
* us to initiate a push or pull.
*/
static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2451,8 +2451,10 @@ struct sched_class {
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
+
+ u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ u64 oldprio);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
@@ -3877,12 +3879,11 @@ extern void set_load_weight(struct task_
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
-
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
struct sched_change_ctx {
+ u64 prio;
struct task_struct *p;
int flags;
bool queued;
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -81,8 +81,11 @@ static void switching_to_stop(struct rq
}
static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG(); /* how!?, what priority? */
}
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -95,12 +95,6 @@ void set_user_nice(struct task_struct *p
old_prio = p->prio;
p->prio = effective_prio(p);
}
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
}
EXPORT_SYMBOL(set_user_nice);
@@ -708,9 +702,6 @@ int __sched_setscheduler(struct task_str
}
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
-
/* Avoid rq from going away on us: */
preempt_disable();
head = splice_balance_callbacks(rq);
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Move sched_class::prio_changed() into the change pattern
2025-10-06 10:44 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
2026-01-12 20:44 ` [PATCH 05/12] " Pierre Gondois
1 sibling, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Tejun Heo, Peter Zijlstra (Intel), Juri Lelli, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 6455ad5346c9cf755fa9dda6e326c4028fb3c853
Gitweb: https://git.kernel.org/tip/6455ad5346c9cf755fa9dda6e326c4028fb3c853
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Fri, 01 Nov 2024 14:16:10 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:52 +02:00
sched: Move sched_class::prio_changed() into the change pattern
Move sched_class::prio_changed() into the change pattern.
And while there, extend it with sched_class::get_prio() in order to
fix the deadline situation.
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 24 +++++++++++++-----------
kernel/sched/deadline.c | 20 +++++++++++---------
kernel/sched/ext.c | 8 +-------
kernel/sched/fair.c | 8 ++++++--
kernel/sched/idle.c | 5 ++++-
kernel/sched/rt.c | 5 ++++-
kernel/sched/sched.h | 7 ++++---
kernel/sched/stop_task.c | 5 ++++-
kernel/sched/syscalls.c | 9 ---------
9 files changed, 47 insertions(+), 44 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bd2c551..4a4dbce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,12 +2169,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
-{
- if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -7400,9 +7394,6 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->sched_class = next_class;
p->prio = prio;
}
-
- if (!(queue_flag & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
@@ -10855,6 +10846,13 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
.running = task_current_donor(rq, p),
};
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
dequeue_task(rq, p, flags);
if (ctx->running)
@@ -10881,6 +10879,10 @@ void sched_change_end(struct sched_change_ctx *ctx)
if (ctx->running)
set_next_task(rq, p);
- if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
- p->sched_class->switched_to(rq, p);
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fd147a7..1f94994 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3042,23 +3042,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
}
}
+static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
+{
+ return p->dl.deadline;
+}
+
/*
* If the scheduling parameters of a -deadline task changed,
* a push or pull operation might be needed.
*/
-static void prio_changed_dl(struct rq *rq, struct task_struct *p,
- int oldprio)
+static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
{
if (!task_on_rq_queued(p))
return;
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
+ if (p->dl.deadline == old_deadline)
+ return;
+
+ if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
if (task_current_donor(rq, p)) {
@@ -3113,6 +3114,7 @@ DEFINE_SCHED_CLASS(dl) = {
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
+ .get_prio = get_prio_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b0a1e2a..ad371b6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2961,7 +2961,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
p, p->scx.weight);
}
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
{
}
@@ -3926,9 +3926,6 @@ static void scx_disable_workfn(struct kthread_work *work)
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
scx_exit_task(p);
}
scx_task_iter_stop(&sti);
@@ -4675,9 +4672,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
p->sched_class = new_class;
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(task_rq(p), p, p->prio);
-
put_task_struct(p);
}
scx_task_iter_stop(&sti);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6c462e4..77a713e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13150,11 +13150,14 @@ static void task_fork_fair(struct task_struct *p)
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (rq->cfs.nr_queued == 1)
return;
@@ -13166,8 +13169,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
+ } else {
wakeup_preempt(rq, p, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f02dced..dee6e01 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -504,8 +504,11 @@ static void switching_to_idle(struct rq *rq, struct task_struct *p)
}
static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG();
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6b2e811..c2347e4 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* us to initiate a push or pull.
*/
static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e3f4215..bcde43d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2451,8 +2451,10 @@ struct sched_class {
void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
const struct load_weight *lw);
+
+ u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ u64 oldprio);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
@@ -3877,8 +3879,6 @@ extern void set_load_weight(struct task_struct *p, bool update_load);
extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio);
-
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
@@ -3899,6 +3899,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
* the task's queueing state is idempotent across the operation.
*/
struct sched_change_ctx {
+ u64 prio;
struct task_struct *p;
int flags;
bool queued;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fcc4c54..73aa8de 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -81,8 +81,11 @@ static void switching_to_stop(struct rq *rq, struct task_struct *p)
}
static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG(); /* how!?, what priority? */
}
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 6583faf..20af564 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -95,12 +95,6 @@ void set_user_nice(struct task_struct *p, long nice)
old_prio = p->prio;
p->prio = effective_prio(p);
}
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
}
EXPORT_SYMBOL(set_user_nice);
@@ -706,9 +700,6 @@ change:
}
}
- if (!(queue_flags & DEQUEUE_CLASS))
- check_prio_changed(rq, p, oldprio);
-
/* Avoid rq from going away on us: */
preempt_disable();
head = splice_balance_callbacks(rq);
^ permalink raw reply related [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2025-10-06 10:44 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
@ 2026-01-12 20:44 ` Pierre Gondois
2026-01-13 4:12 ` K Prateek Nayak
1 sibling, 1 reply; 74+ messages in thread
From: Pierre Gondois @ 2026-01-12 20:44 UTC (permalink / raw)
To: Peter Zijlstra, tj
Cc: linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, Christian Loehle
Hello Peter,
It seems this patch:
6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change
pattern")
is triggering the following warning:
rq_pin_lock()
\-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback !=
&balance_push_callback);
On an arm64 Juno, it can be reproduced by creating and killing a
deadline task:
chrt -d -T 1000000 -P 1000000 0 yes > /dev/null
[ 49.518832] Hardware name: ARM LTD ARM Juno Development Platform/ARM
Juno Development Platform, BIOS EDK II Jul 11 2025
[ 49.518838] Call trace:
[ 49.518842] show_stack (arch/arm64/kernel/stacktrace.c:501) (C)
[ 49.518864] dump_stack_lvl (lib/dump_stack.c:122)
[ 49.518878] dump_stack (lib/dump_stack.c:130)
[ 49.518889] prio_changed_dl (kernel/sched/deadline.c:0
kernel/sched/deadline.c:3343)
[ 49.518903] sched_change_end (kernel/sched/core.c:0)
[ 49.518916] sched_move_task (kernel/sched/core.c:9167)
[ 49.518927] sched_autogroup_exit_task (kernel/sched/autogroup.c:157)
[ 49.518940] do_exit (kernel/exit.c:975)
[ 49.518950] do_group_exit (kernel/exit.c:0)
[ 49.518960] get_signal (kernel/signal.c:0)
[ 49.518970] arch_do_signal_or_restart (arch/arm64/kernel/signal.c:1619)
[ 49.518983] exit_to_user_mode_loop (kernel/entry/common.c:43
kernel/entry/common.c:75)
[ 49.518994] el0_svc (./include/linux/irq-entry-common.h:0
./include/linux/irq-entry-common.h:242
arch/arm64/kernel/entry-common.c:81 arch/arm64/kernel/entry-common.c:725)
[ 49.519009] el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:0)
[ 49.519023] el0t_64_sync (arch/arm64/kernel/entry.S:596)
[ 49.519119] ------------[ cut here ]------------
[ 49.519124] WARNING: kernel/sched/sched.h:1829 at
__schedule+0x404/0xf78, CPU#1: yes/326
[ 49.612674] Modules linked in:
[ 49.615737] CPU: 1 UID: 0 PID: 326 Comm: yes Not tainted
6.19.0-rc4-next-20260109-g8be7ad74b7e4 #261 PREEMPT
[ 49.625670] Hardware name: ARM LTD ARM Juno Development Platform/ARM
Juno Development Platform, BIOS EDK II Jul 11 2025
[ 49.636470] pstate: 800000c5 (Nzcv daIF -PAN -UAO -TCO -DIT -SSBS
BTYPE=--)
[ 49.643443] pc : __schedule (kernel/sched/core.c:0
kernel/sched/sched.h:1907 kernel/sched/core.c:6798)
[ 49.647287] lr : __schedule (kernel/sched/sched.h:1827
kernel/sched/sched.h:1907 kernel/sched/core.c:6798)
[ 49.651130] sp : ffff800081d739e0
[ 49.654445] x29: ffff800081d73a40 x28: ffff000809548908 x27:
ffffddc6d7c532e8
[ 49.661604] x26: ffff000809548000 x25: 00000000400004d8 x24:
0000000000000009
[ 49.668762] x23: 0000000000000001 x22: ffffddc6d7bf8500 x21:
ffffddc6d5b9bdb0
[ 49.675919] x20: ffff00097681c500 x19: ffff000809548000 x18:
ffff800081d735b8
[ 49.683076] x17: 0000000000000063 x16: 0000000000000000 x15:
0000000000000004
[ 49.690233] x14: ffff000809548aa0 x13: 000000000dc48bda x12:
000000002edb68e5
[ 49.697391] x11: 0000000000000000 x10: 0000000000000001 x9 :
ffffddc6d7c7b388
[ 49.704548] x8 : ffff000976636420 x7 : ffffddc6d5b9ae64 x6 :
0000000000000000
[ 49.711704] x5 : 0000000000000001 x4 : 0000000000000001 x3 :
0000000000000000
[ 49.718861] x2 : 0000000000000008 x1 : ffff00097681c518 x0 :
0000000000008629
[ 49.726017] Call trace:
[ 49.728462] __schedule (kernel/sched/core.c:0
kernel/sched/sched.h:1907 kernel/sched/core.c:6798) (P)
[ 49.732308] preempt_schedule_common
(./arch/arm64/include/asm/preempt.h:53 kernel/sched/core.c:7080)
[ 49.736762] preempt_schedule (kernel/sched/core.c:0)
[ 49.740606] _raw_spin_unlock_irqrestore
(./include/linux/spinlock_api_smp.h:0 kernel/locking/spinlock.c:194)
[ 49.745410] sched_move_task (kernel/sched/sched.h:0)
[ 49.749341] sched_autogroup_exit_task (kernel/sched/autogroup.c:157)
[ 49.753969] do_exit (kernel/exit.c:975)
[ 49.757202] do_group_exit (kernel/exit.c:0)
[ 49.760782] get_signal (kernel/signal.c:0)
[ 49.764277] arch_do_signal_or_restart (arch/arm64/kernel/signal.c:1619)
[ 49.769078] exit_to_user_mode_loop (kernel/entry/common.c:43
kernel/entry/common.c:75)
[ 49.773530] el0_svc (./include/linux/irq-entry-common.h:0
./include/linux/irq-entry-common.h:242
arch/arm64/kernel/entry-common.c:81 arch/arm64/kernel/entry-common.c:725)
[ 49.776767] el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:0)
[ 49.781048] el0t_64_sync (arch/arm64/kernel/entry.S:596)
[ 49.784716] irq event stamp: 80194
[ 49.788118] hardirqs last enabled at (80193): irqentry_exit
(kernel/entry/common.c:0)
[ 49.796575] hardirqs last disabled at (80194): __schedule
(kernel/sched/core.c:6755)
[ 49.804858] softirqs last enabled at (77126): handle_softirqs
(./arch/arm64/include/asm/preempt.h:12 kernel/softirq.c:469
kernel/softirq.c:654)
[ 49.813575] softirqs last disabled at (77121): __do_softirq
(kernel/softirq.c:661)
[ 49.821856] ---[ end trace 0000000000000000 ]---
The first stack dump comes from this:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1f94994984038..4647fea76d748 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -632,11 +640,17 @@ static inline void
deadline_queue_push_tasks(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return;
+ if (sysctl_sched_debug_local)
+ dump_stack();
+
queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu),
push_dl_tasks);
}
static inline void deadline_queue_pull_task(struct rq *rq)
{
+ if (sysctl_sched_debug_local)
+ dump_stack();
+
queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu),
pull_dl_task);
}
On 10/6/25 12:44, Peter Zijlstra wrote:
> Move sched_class::prio_changed() into the change pattern.
>
> And while there, extend it with sched_class::get_prio() in order to
> fix the deadline sitation.
>
> Suggested-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Acked-by: Tejun Heo <tj@kernel.org>
> ---
> kernel/sched/core.c | 24 +++++++++++++-----------
> kernel/sched/deadline.c | 20 +++++++++++---------
> kernel/sched/ext.c | 8 +-------
> kernel/sched/fair.c | 8 ++++++--
> kernel/sched/idle.c | 5 ++++-
> kernel/sched/rt.c | 5 ++++-
> kernel/sched/sched.h | 7 ++++---
> kernel/sched/stop_task.c | 5 ++++-
> kernel/sched/syscalls.c | 9 ---------
> 9 files changed, 47 insertions(+), 44 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2169,12 +2169,6 @@ inline int task_curr(const struct task_s
> return cpu_curr(task_cpu(p)) == p;
> }
>
> -void check_prio_changed(struct rq *rq, struct task_struct *p, int oldprio)
> -{
> - if (oldprio != p->prio || dl_task(p))
> - p->sched_class->prio_changed(rq, p, oldprio);
> -}
> -
> void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
> {
> struct task_struct *donor = rq->donor;
> @@ -7402,9 +7396,6 @@ void rt_mutex_setprio(struct task_struct
> p->sched_class = next_class;
> p->prio = prio;
> }
> -
> - if (!(queue_flag & DEQUEUE_CLASS))
> - check_prio_changed(rq, p, oldprio);
> out_unlock:
> /* Avoid rq from going away on us: */
> preempt_disable();
The cause might be the above. This used to call __balance_callbacks()
while holding the rq lock.
> @@ -10860,6 +10851,13 @@ struct sched_change_ctx *sched_change_be
> .running = task_current(rq, p),
> };
>
> + if (!(flags & DEQUEUE_CLASS)) {
> + if (p->sched_class->get_prio)
> + ctx->prio = p->sched_class->get_prio(rq, p);
> + else
> + ctx->prio = p->prio;
> + }
> +
> if (ctx->queued)
> dequeue_task(rq, p, flags);
> if (ctx->running)
> @@ -10886,6 +10884,10 @@ void sched_change_end(struct sched_chang
> if (ctx->running)
> set_next_task(rq, p);
>
> - if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switched_to)
> - p->sched_class->switched_to(rq, p);
> + if (ctx->flags & ENQUEUE_CLASS) {
> + if (p->sched_class->switched_to)
> + p->sched_class->switched_to(rq, p);
> + } else {
> + p->sched_class->prio_changed(rq, p, ctx->prio);
> + }
Now this is not the case anymore it seems. prio_changed_dl() sets the
balance_callback and rq_pin_lock() is called with a non-NULL value.
^ permalink raw reply related [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-12 20:44 ` [PATCH 05/12] " Pierre Gondois
@ 2026-01-13 4:12 ` K Prateek Nayak
2026-01-13 10:45 ` Pierre Gondois
0 siblings, 1 reply; 74+ messages in thread
From: K Prateek Nayak @ 2026-01-13 4:12 UTC (permalink / raw)
To: Pierre Gondois, Peter Zijlstra, tj
Cc: linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, Christian Loehle
Hello Pierre,
On 1/13/2026 2:14 AM, Pierre Gondois wrote:
> Hello Peter,
>
> It seems this patch:
> 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
> is triggering the following warning:
> rq_pin_lock()
> \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
Can you check if the following solution helps your case too:
https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
--
Thanks and Regards,
Prateek
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 4:12 ` K Prateek Nayak
@ 2026-01-13 10:45 ` Pierre Gondois
2026-01-13 11:05 ` K Prateek Nayak
2026-01-13 11:47 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
0 siblings, 2 replies; 74+ messages in thread
From: Pierre Gondois @ 2026-01-13 10:45 UTC (permalink / raw)
To: K Prateek Nayak, Peter Zijlstra, tj
Cc: linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, Christian Loehle
Hello Prateek,
On 1/13/26 05:12, K Prateek Nayak wrote:
> Hello Pierre,
>
> On 1/13/2026 2:14 AM, Pierre Gondois wrote:
>> Hello Peter,
>>
>> It seems this patch:
>> 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
>> is triggering the following warning:
>> rq_pin_lock()
>> \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
> Can you check if the following solution helps your case too:
> https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
>
I can still see the issue.
It seems the task deadline is also updated in:
sched_change_end()
\-enqueue_task_dl()
\-enqueue_dl_entity()
\-setup_new_dl_entity()
\-replenish_dl_new_period()
if the task's period finished.
So in sched_change_end(), the task priority (i.e. p->dl.deadline) is
updated.
This results in having an old_deadline earlier than the new p->dl.deadline.
Thus the rq->balance_callback:
prio_changed_dl() {
...
if (dl_time_before(old_deadline, p->dl.deadline))
deadline_queue_pull_task(rq);
...
}
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 10:45 ` Pierre Gondois
@ 2026-01-13 11:05 ` K Prateek Nayak
2026-01-13 11:53 ` Peter Zijlstra
2026-01-13 11:47 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
1 sibling, 1 reply; 74+ messages in thread
From: K Prateek Nayak @ 2026-01-13 11:05 UTC (permalink / raw)
To: Pierre Gondois, Peter Zijlstra, tj
Cc: linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, Christian Loehle
Hello Pierre,
On 1/13/2026 4:15 PM, Pierre Gondois wrote:
> Hello Prateek,
>
> On 1/13/26 05:12, K Prateek Nayak wrote:
>> Hello Pierre,
>>
>> On 1/13/2026 2:14 AM, Pierre Gondois wrote:
>>> Hello Peter,
>>>
>>> It seems this patch:
>>> 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
>>> is triggering the following warning:
>>> rq_pin_lock()
>>> \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
>> Can you check if the following solution helps your case too:
>> https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
>>
> I can still see the issue.
> It seems the task deadline is also updated in:
> sched_change_end()
> \-enqueue_task_dl()
> \-enqueue_dl_entity()
> \-setup_new_dl_entity()
> \-replenish_dl_new_period()
> if the task's period finished.
Ah! Got it. Thank you for testing the fix.
I'm curious, why is setup_new_dl_entity() doing an
update_rq_clock()? That can advance the rq->clock and make it look like
we need a replenish.
Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
hitting this warning?
>
> So in sched_change_end(), the task priority (i.e. p->dl.deadline) is updated.
> This results in having an old_deadline earlier than the new p->dl.deadline.
> Thus the rq->balance_callback:
>
> prio_changed_dl() {
> ...
> if (dl_time_before(old_deadline, p->dl.deadline))
> deadline_queue_pull_task(rq);
> ...
> }
>
Thank you for your analysis.
--
Thanks and Regards,
Prateek
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 11:05 ` K Prateek Nayak
@ 2026-01-13 11:53 ` Peter Zijlstra
2026-01-13 11:56 ` Peter Zijlstra
0 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2026-01-13 11:53 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Pierre Gondois, tj, linux-kernel, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle
On Tue, Jan 13, 2026 at 04:35:02PM +0530, K Prateek Nayak wrote:
> Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
> hitting this warning?
setup_new_dl_entity() -> update_rq_clock() seems like it will trip that
in this case.
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 11:53 ` Peter Zijlstra
@ 2026-01-13 11:56 ` Peter Zijlstra
2026-01-13 13:07 ` Pierre Gondois
` (2 more replies)
0 siblings, 3 replies; 74+ messages in thread
From: Peter Zijlstra @ 2026-01-13 11:56 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Pierre Gondois, tj, linux-kernel, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle
On Tue, Jan 13, 2026 at 12:53:09PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 13, 2026 at 04:35:02PM +0530, K Prateek Nayak wrote:
>
> > Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
> > hitting this warning?
>
> setup_new_dl_entity() -> update_rq_clock() seems like it will trip that
> in this case.
Something like so to fix: 9f239df55546 ("sched/deadline: Initialize dl_servers after SMP")
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(s
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- update_rq_clock(rq);
-
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1834,6 +1832,7 @@ void sched_init_dl_servers(void)
rq = cpu_rq(cpu);
guard(rq_lock_irq)(rq);
+ update_rq_clock(rq);
dl_se = &rq->fair_server;
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 11:56 ` Peter Zijlstra
@ 2026-01-13 13:07 ` Pierre Gondois
2026-01-13 13:10 ` Pierre Gondois
2026-01-15 21:01 ` [tip: sched/urgent] sched/deadline: Avoid double update_rq_clock() tip-bot2 for Peter Zijlstra
2 siblings, 0 replies; 74+ messages in thread
From: Pierre Gondois @ 2026-01-13 13:07 UTC (permalink / raw)
To: Peter Zijlstra, K Prateek Nayak
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, Christian Loehle
On 1/13/26 12:56, Peter Zijlstra wrote:
> On Tue, Jan 13, 2026 at 12:53:09PM +0100, Peter Zijlstra wrote:
>> On Tue, Jan 13, 2026 at 04:35:02PM +0530, K Prateek Nayak wrote:
>>
>>> Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
>>> hitting this warning?
>> setup_new_dl_entity() -> update_rq_clock() seems like it will trip that
>> in this case.
> Something like so to fix: 9f239df55546 ("sched/deadline: Initialize dl_servers after SMP")
>
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(s
> struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> struct rq *rq = rq_of_dl_rq(dl_rq);
>
> - update_rq_clock(rq);
> -
> WARN_ON(is_dl_boosted(dl_se));
> WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
>
> @@ -1834,6 +1832,7 @@ void sched_init_dl_servers(void)
> rq = cpu_rq(cpu);
>
> guard(rq_lock_irq)(rq);
> + update_rq_clock(rq);
>
> dl_se = &rq->fair_server;
>
Yes right, enabling WARN_DOUBLE_CLOCK detects the double clock update
and this fixes it.
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 11:56 ` Peter Zijlstra
2026-01-13 13:07 ` Pierre Gondois
@ 2026-01-13 13:10 ` Pierre Gondois
2026-01-15 21:01 ` [tip: sched/urgent] sched/deadline: Avoid double update_rq_clock() tip-bot2 for Peter Zijlstra
2 siblings, 0 replies; 74+ messages in thread
From: Pierre Gondois @ 2026-01-13 13:10 UTC (permalink / raw)
To: Peter Zijlstra, K Prateek Nayak
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, Christian Loehle
On 1/13/26 12:56, Peter Zijlstra wrote:
> On Tue, Jan 13, 2026 at 12:53:09PM +0100, Peter Zijlstra wrote:
>> On Tue, Jan 13, 2026 at 04:35:02PM +0530, K Prateek Nayak wrote:
>>
>>> Does enabling WARN_DOUBLE_CLOCK warn of a double clock update before
>>> hitting this warning?
>> setup_new_dl_entity() -> update_rq_clock() seems like it will trip that
>> in this case.
> Something like so to fix: 9f239df55546 ("sched/deadline: Initialize dl_servers after SMP")
>
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(s
> struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> struct rq *rq = rq_of_dl_rq(dl_rq);
>
> - update_rq_clock(rq);
> -
> WARN_ON(is_dl_boosted(dl_se));
> WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
>
> @@ -1834,6 +1832,7 @@ void sched_init_dl_servers(void)
> rq = cpu_rq(cpu);
>
> guard(rq_lock_irq)(rq);
> + update_rq_clock(rq);
>
> dl_se = &rq->fair_server;
>
Yes right, enabling WARN_DOUBLE_CLOCK detects the double clock update
and this fixes it.
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/urgent] sched/deadline: Avoid double update_rq_clock()
2026-01-13 11:56 ` Peter Zijlstra
2026-01-13 13:07 ` Pierre Gondois
2026-01-13 13:10 ` Pierre Gondois
@ 2026-01-15 21:01 ` tip-bot2 for Peter Zijlstra
2 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-01-15 21:01 UTC (permalink / raw)
To: linux-tip-commits
Cc: Pierre Gondois, Peter Zijlstra (Intel), x86, linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 4de9ff76067b40c3660df73efaea57389e62ea7a
Gitweb: https://git.kernel.org/tip/4de9ff76067b40c3660df73efaea57389e62ea7a
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 13 Jan 2026 12:57:14 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:52 +01:00
sched/deadline: Avoid double update_rq_clock()
When setup_new_dl_entity() is called from enqueue_task_dl() ->
enqueue_dl_entity(), the rq-clock should already be updated, and
calling update_rq_clock() again is not right.
Move the update_rq_clock() to the one other caller of
setup_new_dl_entity(): sched_init_dl_server().
Fixes: 9f239df55546 ("sched/deadline: Initialize dl_servers after SMP")
Reported-by: Pierre Gondois <pierre.gondois@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Link: https://patch.msgid.link/20260113115622.GA831285@noisy.programming.kicks-ass.net
---
kernel/sched/deadline.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b7acf74..5d6f3cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- update_rq_clock(rq);
-
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1839,6 +1837,7 @@ void sched_init_dl_servers(void)
rq = cpu_rq(cpu);
guard(rq_lock_irq)(rq);
+ update_rq_clock(rq);
dl_se = &rq->fair_server;
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 10:45 ` Pierre Gondois
2026-01-13 11:05 ` K Prateek Nayak
@ 2026-01-13 11:47 ` Peter Zijlstra
2026-01-14 6:47 ` K Prateek Nayak
1 sibling, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2026-01-13 11:47 UTC (permalink / raw)
To: Pierre Gondois
Cc: K Prateek Nayak, tj, linux-kernel, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle
On Tue, Jan 13, 2026 at 11:45:43AM +0100, Pierre Gondois wrote:
> Hello Prateek,
>
> On 1/13/26 05:12, K Prateek Nayak wrote:
> > Hello Pierre,
> >
> > On 1/13/2026 2:14 AM, Pierre Gondois wrote:
> > > Hello Peter,
> > >
> > > It seems this patch:
> > > 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
> > > is triggering the following warning:
> > > rq_pin_lock()
> > > \-WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
> > Can you check if the following solution helps your case too:
> > https://lore.kernel.org/all/20260106104113.GX3707891@noisy.programming.kicks-ass.net/
> >
> I can still see the issue.
> It seems the task deadline is also updated in:
> sched_change_end()
> \-enqueue_task_dl()
> \-enqueue_dl_entity()
> \-setup_new_dl_entity()
> \-replenish_dl_new_period()
> if the task's period finished.
>
> So in sched_change_end(), the task priority (i.e. p->dl.deadline) is
> updated.
> This results in having an old_deadline earlier than the new p->dl.deadline.
> Thus the rq->balance_callback:
>
> prio_changed_dl() {
> ...
> if (dl_time_before(old_deadline, p->dl.deadline))
> deadline_queue_pull_task(rq);
> ...
> }
Hum... so this one is a little more tricky.
So the normal rules are that DEQUEUE_SAVE + ENQUEUE_RESTORE should be as
invariant as possible.
But what I think happens here is that at the point of dequeue we are
effectively ready to throttle/replenish, but we don't.
Then at enqueue, we do. The replenish changes the deadline and we're up
a creek.
Let me think about this for a bit...
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-13 11:47 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
@ 2026-01-14 6:47 ` K Prateek Nayak
2026-01-14 10:23 ` Peter Zijlstra
0 siblings, 1 reply; 74+ messages in thread
From: K Prateek Nayak @ 2026-01-14 6:47 UTC (permalink / raw)
To: Peter Zijlstra, Pierre Gondois
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, Christian Loehle
Hello Peter,
On 1/13/2026 5:17 PM, Peter Zijlstra wrote:
> Hum... so this one is a little more tricky.
>
> So the normal rules are that DEQUEUE_SAVE + ENQUEUE_RESTORE should be as
> invariant as possible.
>
> But what I think happens here is that at the point of dequeue we are
> effectively ready to throttle/replenish, but we don't.
>
> Then at enqueue, we do. The replenish changes the deadline and we're up
> a creek.
I've the following data from the scenario in which I observe
the same splat as Pierre's with the two fixes on top of tip:
yes-4108 [194] d..2. 53.396872: get_prio_dl: get_prio_dl: clock(53060728757)
yes-4108 [194] d..2. 53.396873: update_curr_dl_se: update_curr_dl_se: past throttle label
yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_throttled(0) dl_overrun(0) timer_queued(0) server?(0)
yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_se->runtime(190623) rq->dl.overloaded(0)
yes-4108 [194] d..2. 53.396874: get_prio_dl: get_prio_dl: deadline(53060017809)
yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: ENQUEUE_RESTORE update_dl_entity
yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: setup_new_dl_entity
yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: Replenish: Old: 53060017809 dl_deadline(1000000)
yes-4108 [194] d..2. 53.396879: enqueue_dl_entity: Replenish: New: 53061728757
yes-4108 [194] d..2. 53.396882: prio_changed_dl.part.0: Woops! prio_changed_dl: CPU(194) clock(53060728757) overloaded(0): Task: yes(4108), Curr: yes(4108) deadline: 53060017809 -> 53061728757
get_prio_dl() sees "deadline < rq->clock" but dl_se->runtime is still
positive so update_curr_dl_se() doesn't fiddle with the deadline.
ENQUEUE_RESTORE sees "deadline" before "rq->clock" and calls
setup_new_dl_entity() which calls replenish.
sched_change_end() will call prio_changed() with the old deadline from
get_prio_dl() but enqueue advanced the deadline so we land in a
pickle.
>
> Let me think about this for a bit...
Should prio_changed_dl() care about "dl_se->dl_deadline" having changed
within the sched_change guard since that is the attribute that can be
changed using sched_setattr() right?
--
Thanks and Regards,
Prateek
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-14 6:47 ` K Prateek Nayak
@ 2026-01-14 10:23 ` Peter Zijlstra
2026-01-14 13:05 ` Peter Zijlstra
0 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2026-01-14 10:23 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Pierre Gondois, tj, linux-kernel, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle,
luca.abeni
On Wed, Jan 14, 2026 at 12:17:11PM +0530, K Prateek Nayak wrote:
> Hello Peter,
>
> On 1/13/2026 5:17 PM, Peter Zijlstra wrote:
> > Hum... so this one is a little more tricky.
> >
> > So the normal rules are that DEQUEUE_SAVE + ENQUEUE_RESTORE should be as
> > invariant as possible.
> >
> > But what I think happens here is that at the point of dequeue we are
> > effectively ready to throttle/replenish, but we don't.
> >
> > Then at enqueue, we do. The replenish changes the deadline and we're up
> > a creek.
>
> I've the following data from the scenario in which I observe
> the same splat as Pierre splat wit the two fixes on top of tip:
>
> yes-4108 [194] d..2. 53.396872: get_prio_dl: get_prio_dl: clock(53060728757)
> yes-4108 [194] d..2. 53.396873: update_curr_dl_se: update_curr_dl_se: past throttle label
> yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_throttled(0) dl_overrun(0) timer_queued(0) server?(0)
> yes-4108 [194] d..2. 53.396873: update_curr_dl_se: dl_se->runtime(190623) rq->dl.overloaded(0)
> yes-4108 [194] d..2. 53.396874: get_prio_dl: get_prio_dl: deadline(53060017809)
>
> yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: ENQUEUE_RESTORE update_dl_entity
> yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: setup_new_dl_entity
> yes-4108 [194] d..2. 53.396878: enqueue_dl_entity: Replenish: Old: 53060017809 dl_deadline(1000000)
> yes-4108 [194] d..2. 53.396879: enqueue_dl_entity: Replenish: New: 53061728757
> yes-4108 [194] d..2. 53.396882: prio_changed_dl.part.0: Woops! prio_changed_dl: CPU(194) clock(53060728757) overloaded(0): Task: yes(4108), Curr: yes(4108) deadline: 53060017809 -> 53061728757
>
> get_prio_dl() sees "deadline < rq->clock" but dl_se->runtime is still
> positive so update_curr_dl_se() doesn't fiddle with the deadline.
>
> ENQUEUE_RESTORE sees "deadline" before "rq->clock" and calls
> setup_new_dl_entity() which calls replenish.
Right this. That's more or less where I ended up as well. Just don't
know what to do about that. It doesn't feel right.
That is, it means that a task behaves differently depending on whether an
(unrelated) sched_change comes in between.
If undisturbed it will be allowed to exhaust its runtime, irrespective
of it missing its deadline (valid for G-EDF); while when it gets
disturbed it will be forced to replenish.
Juri, Luca, I'm tempted to suggest to simply remove the replenish on
RESTORE entirely -- that would allow the task to continue as it had
been, irrespective of it being 'late'.
Something like so -- what would this break?
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
- !is_dl_boosted(dl_se) &&
- dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
- setup_new_dl_entity(dl_se);
}
/*
> > Let me think about this for a bit...
>
> Should prio_changed_dl() care about "dl_se->dl_deadline" having changed
> within the sched_change guard since that is the attribute that can be
> changed using sched_setattr() right?
__setparam_dl() changes dl_se->dl_deadline, as you say, but that does
not immediately affect the current dl_se->deadline. It will take effect
the next replenish.
That is, changing dl task attributes changes the next activation, not
the current. And since DL is a dynamic priority scheme, it doesn't
affect the current priority.
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-14 10:23 ` Peter Zijlstra
@ 2026-01-14 13:05 ` Peter Zijlstra
2026-01-14 14:04 ` luca abeni
` (5 more replies)
0 siblings, 6 replies; 74+ messages in thread
From: Peter Zijlstra @ 2026-01-14 13:05 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Pierre Gondois, tj, linux-kernel, mingo, juri.lelli,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle,
luca.abeni
On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
> Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> RESTORE entirely -- that would allow the task to continue as it had
> been, irrespective of it being 'late'.
>
> Something like so -- what would this break?
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> update_dl_entity(dl_se);
> } else if (flags & ENQUEUE_REPLENISH) {
> replenish_dl_entity(dl_se);
> - } else if ((flags & ENQUEUE_RESTORE) &&
> - !is_dl_boosted(dl_se) &&
> - dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> - setup_new_dl_entity(dl_se);
> }
>
> /*
Ah, this is de-boost, right? Boosting allows one to break the CBS rules
and then we have to rein in the excesses.
But we have {DE,EN}QUEUE_MOVE for this, that explicitly allows priority
to change and is set for rt_mutex_setprio() (among others).
So doing s/RESTORE/MOVE/ above.
The corollary to all this is that everybody that sets MOVE must be able
to deal with balance callbacks, so audit that too.
This then gives something like so.. which builds and boots for me, but
clearly I haven't been able to trigger these funny cases.
---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
+ if (rf)
+ rq_unpin_lock(rq, rf);
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+ if (rf)
+ rq_repin_lock(rq, rf);
}
void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
* prev into current:
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
@@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
proxy_tag_curr(rq, next);
rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
- if (oldprio == prio)
+ if (oldprio == prio && !dl_prio(prio))
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
@@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
out_unlock:
/* Caller holds task_struct::pi_lock, IRQs are still disabled */
- rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
- rq_repin_lock(rq, &rf);
+ __balance_callbacks(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
@@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
if (resched)
resched_curr(rq);
+
+ __balance_callbacks(rq, &rq_guard.rf);
}
static struct cgroup_subsys_state *
@@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
resched_curr(rq);
}
} else {
+ /*
+ * XXX validate prio only really changed when ENQUEUE_MOVE is set.
+ */
p->sched_class->prio_changed(rq, p, ctx->prio);
}
}
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
+ } else if ((flags & ENQUEUE_MOVE) &&
!is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
+ /*
+ * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
+ * allows us to change priority. Callers are expected to deal
+ * with balance_callbacks.
+ */
setup_new_dl_entity(dl_se);
}
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked_task) {
+ __balance_callbacks(iter->rq, &iter->rf);
task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
iter->locked_task = NULL;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2430,7 +2430,8 @@ extern const u32 sched_prio_to_wmult[40
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- * in the runqueue.
+ * in the runqueue. IOW the priority is allowed to change. Callers
+ * must expect to deal with balance callbacks.
*
* NOCLOCK - skip the update_rq_clock() (avoids double updates)
*
@@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
/*
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
* itself.
*/
newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
+ if (newprio == oldprio && !dl_prio(newprio))
queue_flags &= ~DEQUEUE_MOVE;
}
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-14 13:05 ` Peter Zijlstra
@ 2026-01-14 14:04 ` luca abeni
2026-01-14 14:20 ` Juri Lelli
` (4 subsequent siblings)
5 siblings, 0 replies; 74+ messages in thread
From: luca abeni @ 2026-01-14 14:04 UTC (permalink / raw)
To: Peter Zijlstra
Cc: K Prateek Nayak, Pierre Gondois, tj, linux-kernel, mingo,
juri.lelli, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
mgorman, vschneid, longman, hannes, mkoutny, void, arighi,
changwoo, cgroups, sched-ext, liuwenfang, tglx, Christian Loehle
Hi Peter,
On Wed, 14 Jan 2026 14:05:28 +0100
Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
>
> > Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> > RESTORE entirely -- that would allow the task to continue as it had
> > been, irrespective of it being 'late'.
> >
> > Something like so -- what would this break?
> >
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> > update_dl_entity(dl_se);
> > } else if (flags & ENQUEUE_REPLENISH) {
> > replenish_dl_entity(dl_se);
> > - } else if ((flags & ENQUEUE_RESTORE) &&
> > - !is_dl_boosted(dl_se) &&
> > - dl_time_before(dl_se->deadline,
> > rq_clock(rq_of_dl_se(dl_se)))) {
> > - setup_new_dl_entity(dl_se);
> > }
> >
> > /*
>
> Ah, this is de-boost, right? Boosting allows one to break the CBS
> rules and then we have to rein in the excesses.
Sorry, I am missing a little bit of context (I am trying to catch up
reading the mailing list archives)... But I agree that the call to
setup_new_dl_entity() mentioned above does not make too much sense.
I suspect the hunk above could be directly removed, as you originally
suggested (on de-boosting, the task returns to its original deadline,
which is larger than the inherited one, so I am not sure whether we
should generate a new deadline or just leave it as it is, even if it
has been missed).
Luca
>
> But we have {DE,EN}QUEUE_MOVE for this, that explicitly allows
> priority to change and is set for rt_mutex_setprio() (among others).
>
> So doing s/RESTORE/MOVE/ above.
>
> The corollary to all this is that everybody that sets MOVE must be
> able to deal with balance callbacks, so audit that too.
>
> This then gives something like so.. which builds and boots for me, but
> clearly I haven't been able to trigger these funny cases.
>
> ---
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
> return __splice_balance_callbacks(rq, true);
> }
>
> -static void __balance_callbacks(struct rq *rq)
> +void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
> {
> + if (rf)
> + rq_unpin_lock(rq, rf);
> do_balance_callbacks(rq, __splice_balance_callbacks(rq,
> false));
> + if (rf)
> + rq_repin_lock(rq, rf);
> }
>
> void balance_callbacks(struct rq *rq, struct balance_callback *head)
> @@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
> * prev into current:
> */
> spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
>
> @@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
> proxy_tag_curr(rq, next);
>
> rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
> trace_sched_exit_tp(is_switch);
> @@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
> trace_sched_pi_setprio(p, pi_task);
> oldprio = p->prio;
>
> - if (oldprio == prio)
> + if (oldprio == prio && !dl_prio(prio))
> queue_flag &= ~DEQUEUE_MOVE;
>
> prev_class = p->sched_class;
> @@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
> out_unlock:
> /* Caller holds task_struct::pi_lock, IRQs are still
> disabled */
> - rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> - rq_repin_lock(rq, &rf);
> + __balance_callbacks(rq, &rf);
> __task_rq_unlock(rq, p, &rf);
> }
> #endif /* CONFIG_RT_MUTEXES */
> @@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
>
> if (resched)
> resched_curr(rq);
> +
> + __balance_callbacks(rq, &rq_guard.rf);
> }
>
> static struct cgroup_subsys_state *
> @@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
> resched_curr(rq);
> }
> } else {
> + /*
> + * XXX validate prio only really changed when
> ENQUEUE_MOVE is set.
> + */
> p->sched_class->prio_changed(rq, p, ctx->prio);
> }
> }
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
> update_dl_entity(dl_se);
> } else if (flags & ENQUEUE_REPLENISH) {
> replenish_dl_entity(dl_se);
> - } else if ((flags & ENQUEUE_RESTORE) &&
> + } else if ((flags & ENQUEUE_MOVE) &&
> !is_dl_boosted(dl_se) &&
> dl_time_before(dl_se->deadline,
> rq_clock(rq_of_dl_se(dl_se)))) {
> + /*
> + * Deals with the de-boost case, and ENQUEUE_MOVE
> explicitly
> + * allows us to change priority. Callers are
> expected to deal
> + * with balance_callbacks.
> + */
> setup_new_dl_entity(dl_se);
> }
>
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
> static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> {
> if (iter->locked_task) {
> + __balance_callbacks(iter->rq, &iter->rf);
> task_rq_unlock(iter->rq, iter->locked_task,
> &iter->rf); iter->locked_task = NULL;
> }
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2430,7 +2430,8 @@ extern const u32
> sched_prio_to_wmult[40
> * should preserve as much state as possible.
> *
> * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the
> location
> - * in the runqueue.
> + * in the runqueue. IOW the priority is allowed to change.
> Callers
> + * must expect to deal with balance callbacks.
> *
> * NOCLOCK - skip the update_rq_clock() (avoids double updates)
> *
> @@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
> extern bool dequeue_task(struct rq *rq, struct task_struct *p, int
> flags);
> extern struct balance_callback *splice_balance_callbacks(struct rq
> *rq); +
> +extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
> extern void balance_callbacks(struct rq *rq, struct balance_callback
> *head);
> /*
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> * itself.
> */
> newprio = rt_effective_prio(p, newprio);
> - if (newprio == oldprio)
> + if (newprio == oldprio && !dl_prio(newprio))
> queue_flags &= ~DEQUEUE_MOVE;
> }
>
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-14 13:05 ` Peter Zijlstra
2026-01-14 14:04 ` luca abeni
@ 2026-01-14 14:20 ` Juri Lelli
2026-01-14 15:25 ` luca abeni
2026-01-15 8:24 ` Peter Zijlstra
2026-01-15 21:00 ` [tip: sched/urgent] sched/deadline: Use ENQUEUE_MOVE to allow priority change tip-bot2 for Peter Zijlstra
` (3 subsequent siblings)
5 siblings, 2 replies; 74+ messages in thread
From: Juri Lelli @ 2026-01-14 14:20 UTC (permalink / raw)
To: Peter Zijlstra
Cc: K Prateek Nayak, Pierre Gondois, tj, linux-kernel, mingo,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle,
luca.abeni
On 14/01/26 14:05, Peter Zijlstra wrote:
> On Wed, Jan 14, 2026 at 11:23:36AM +0100, Peter Zijlstra wrote:
>
> > Juri, Luca, I'm tempted to suggest to simply remove the replenish on
> > RESTORE entirely -- that would allow the task to continue as it had
> > been, irrespective of it being 'late'.
> >
> > Something like so -- what would this break?
> >
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> > update_dl_entity(dl_se);
> > } else if (flags & ENQUEUE_REPLENISH) {
> > replenish_dl_entity(dl_se);
> > - } else if ((flags & ENQUEUE_RESTORE) &&
> > - !is_dl_boosted(dl_se) &&
> > - dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> > - setup_new_dl_entity(dl_se);
> > }
> >
> > /*
>
> Ah, this is de-boost, right? Boosting allows one to break the CBS rules
> and then we have to rein in the excesses.
>
> But we have {DE,EN}QUEUE_MOVE for this, that explicitly allows priority
> to change and is set for rt_mutex_setprio() (among others).
>
> So doing s/RESTORE/MOVE/ above.
>
> The corollary to all this is that everybody that sets MOVE must be able
> to deal with balance callbacks, so audit that too.
>
> This then gives something like so.. which builds and boots for me, but
> clearly I haven't been able to trigger these funny cases.
>
> ---
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4969,9 +4969,13 @@ struct balance_callback *splice_balance_
> return __splice_balance_callbacks(rq, true);
> }
>
> -static void __balance_callbacks(struct rq *rq)
> +void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
> {
> + if (rf)
> + rq_unpin_lock(rq, rf);
> do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
> + if (rf)
> + rq_repin_lock(rq, rf);
> }
>
> void balance_callbacks(struct rq *rq, struct balance_callback *head)
> @@ -5018,7 +5022,7 @@ static inline void finish_lock_switch(st
> * prev into current:
> */
> spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
>
> @@ -6901,7 +6905,7 @@ static void __sched notrace __schedule(i
> proxy_tag_curr(rq, next);
>
> rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> + __balance_callbacks(rq, NULL);
> raw_spin_rq_unlock_irq(rq);
> }
> trace_sched_exit_tp(is_switch);
> @@ -7350,7 +7354,7 @@ void rt_mutex_setprio(struct task_struct
> trace_sched_pi_setprio(p, pi_task);
> oldprio = p->prio;
>
> - if (oldprio == prio)
> + if (oldprio == prio && !dl_prio(prio))
> queue_flag &= ~DEQUEUE_MOVE;
>
> prev_class = p->sched_class;
> @@ -7396,9 +7400,7 @@ void rt_mutex_setprio(struct task_struct
> out_unlock:
> /* Caller holds task_struct::pi_lock, IRQs are still disabled */
>
> - rq_unpin_lock(rq, &rf);
> - __balance_callbacks(rq);
> - rq_repin_lock(rq, &rf);
> + __balance_callbacks(rq, &rf);
> __task_rq_unlock(rq, p, &rf);
> }
> #endif /* CONFIG_RT_MUTEXES */
> @@ -9167,6 +9169,8 @@ void sched_move_task(struct task_struct
>
> if (resched)
> resched_curr(rq);
> +
> + __balance_callbacks(rq, &rq_guard.rf);
> }
>
> static struct cgroup_subsys_state *
> @@ -10891,6 +10895,9 @@ void sched_change_end(struct sched_chang
> resched_curr(rq);
> }
> } else {
> + /*
> + * XXX validate prio only really changed when ENQUEUE_MOVE is set.
> + */
> p->sched_class->prio_changed(rq, p, ctx->prio);
> }
> }
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2214,9 +2214,14 @@ enqueue_dl_entity(struct sched_dl_entity
> update_dl_entity(dl_se);
> } else if (flags & ENQUEUE_REPLENISH) {
> replenish_dl_entity(dl_se);
> - } else if ((flags & ENQUEUE_RESTORE) &&
> + } else if ((flags & ENQUEUE_MOVE) &&
> !is_dl_boosted(dl_se) &&
> dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
> + /*
> + * Deals with the de-boost case, and ENQUEUE_MOVE explicitly
> + * allows us to change priority. Callers are expected to deal
> + * with balance_callbacks.
> + */
> setup_new_dl_entity(dl_se);
> }
>
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -545,6 +545,7 @@ static void scx_task_iter_start(struct s
> static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
> {
> if (iter->locked_task) {
> + __balance_callbacks(iter->rq, &iter->rf);
> task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
> iter->locked_task = NULL;
> }
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2430,7 +2430,8 @@ extern const u32 sched_prio_to_wmult[40
> * should preserve as much state as possible.
> *
> * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
> - * in the runqueue.
> + * in the runqueue. IOW the priority is allowed to change. Callers
> + * must expect to deal with balance callbacks.
> *
> * NOCLOCK - skip the update_rq_clock() (avoids double updates)
> *
> @@ -4019,6 +4020,8 @@ extern void enqueue_task(struct rq *rq,
> extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
>
> extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
> +
> +extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
> extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
>
> /*
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> * itself.
> */
> newprio = rt_effective_prio(p, newprio);
> - if (newprio == oldprio)
> + if (newprio == oldprio && !dl_prio(newprio))
> queue_flags &= ~DEQUEUE_MOVE;
> }
We have been using (improperly?) ENQUEUE_SAVE also to know when a new
entity gets setscheduled to DEADLINE (or its parameters are changed) and
it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
quick first look, it does sound good to me.
Thanks!
Juri
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-14 14:20 ` Juri Lelli
@ 2026-01-14 15:25 ` luca abeni
2026-01-15 8:24 ` Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: luca abeni @ 2026-01-14 15:25 UTC (permalink / raw)
To: Juri Lelli
Cc: Peter Zijlstra, K Prateek Nayak, Pierre Gondois, tj, linux-kernel,
mingo, vincent.guittot, dietmar.eggemann, rostedt, bsegall,
mgorman, vschneid, longman, hannes, mkoutny, void, arighi,
changwoo, cgroups, sched-ext, liuwenfang, tglx, Christian Loehle
Hi Juri,
On Wed, 14 Jan 2026 15:20:48 +0100
Juri Lelli <juri.lelli@redhat.com> wrote:
[...]
> > > --- a/kernel/sched/deadline.c
> > > +++ b/kernel/sched/deadline.c
> > > @@ -2214,10 +2214,6 @@ enqueue_dl_entity(struct sched_dl_entity
> > > update_dl_entity(dl_se);
> > > } else if (flags & ENQUEUE_REPLENISH) {
> > > replenish_dl_entity(dl_se);
> > > - } else if ((flags & ENQUEUE_RESTORE) &&
> > > - !is_dl_boosted(dl_se) &&
> > > - dl_time_before(dl_se->deadline,
> > > rq_clock(rq_of_dl_se(dl_se)))) {
> > > - setup_new_dl_entity(dl_se);
> > > }
> > >
> > > /*
[...]
> > --- a/kernel/sched/syscalls.c
> > +++ b/kernel/sched/syscalls.c
> > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > * itself.
> > */
> > newprio = rt_effective_prio(p, newprio);
> > - if (newprio == oldprio)
> > + if (newprio == oldprio && !dl_prio(newprio))
> > queue_flags &= ~DEQUEUE_MOVE;
> > }
>
> We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> entity gets setscheduled to DEADLINE (or its parameters are changed)
> and it looks like this keeps that happening with DEQUEUE_MOVE.
You are right: double thinking about it, I seem to remember that the
"flags & ENQUEUE_RESTORE" check above was introduced to fix tasks
switching to SCHED_DEADLINE...
So, I agree that changing "ENQUEUE_RESTORE" to "ENQUEUE_MOVE" should be
the right thing to do
Luca
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-14 14:20 ` Juri Lelli
2026-01-14 15:25 ` luca abeni
@ 2026-01-15 8:24 ` Peter Zijlstra
2026-01-15 9:05 ` Peter Zijlstra
1 sibling, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2026-01-15 8:24 UTC (permalink / raw)
To: Juri Lelli
Cc: K Prateek Nayak, Pierre Gondois, tj, linux-kernel, mingo,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle,
luca.abeni
On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
> > --- a/kernel/sched/syscalls.c
> > +++ b/kernel/sched/syscalls.c
> > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > * itself.
> > */
> > newprio = rt_effective_prio(p, newprio);
> > - if (newprio == oldprio)
> > + if (newprio == oldprio && !dl_prio(newprio))
> > queue_flags &= ~DEQUEUE_MOVE;
> > }
>
> We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> entity gets setscheduled to DEADLINE (or its parameters are changed) and
> it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
> quick first look, it does sound good to me.
If this is strictly about tasks coming into SCHED_DEADLINE there are a
number of alternative options:
- there are the sched_class::switch{ing,ed}_to() callbacks;
- there is (the fairly recent) ENQUEUE_CLASS.
Anyway, let me break up this one patch into individual bits and write
changelogs. I'll stick them in queue/sched/urgent for now; hopefully
Pierre can give them a spin and report back if it all sorts his
problem.
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-15 8:24 ` Peter Zijlstra
@ 2026-01-15 9:05 ` Peter Zijlstra
2026-01-15 13:13 ` Pierre Gondois
0 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2026-01-15 9:05 UTC (permalink / raw)
To: Juri Lelli
Cc: K Prateek Nayak, Pierre Gondois, tj, linux-kernel, mingo,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle,
luca.abeni
On Thu, Jan 15, 2026 at 09:24:31AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
>
> > > --- a/kernel/sched/syscalls.c
> > > +++ b/kernel/sched/syscalls.c
> > > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > > * itself.
> > > */
> > > newprio = rt_effective_prio(p, newprio);
> > > - if (newprio == oldprio)
> > > + if (newprio == oldprio && !dl_prio(newprio))
> > > queue_flags &= ~DEQUEUE_MOVE;
> > > }
> >
> > We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> > entity gets setscheduled to DEADLINE (or its parameters are changed) and
> > it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
> > quick first look, it does sound good to me.
>
> If this is strictly about tasks coming into SCHED_DEADLINE there are a
> number of alternative options:
>
> - there are the sched_class::switch{ing,ed}_to() callbacks;
> - there is (the fairly recent) ENQUEUE_CLASS.
>
> Anyway, let me break up this one patch into individual bits and write
> changelogs. I'll stick them in queue/sched/urgent for now; hopefully
> Pierre can give them a spin and report back if it all sorts his
> problem.
Now live at:
https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=sched/urgent
Please test.
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-15 9:05 ` Peter Zijlstra
@ 2026-01-15 13:13 ` Pierre Gondois
2026-01-15 13:56 ` Juri Lelli
0 siblings, 1 reply; 74+ messages in thread
From: Pierre Gondois @ 2026-01-15 13:13 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Juri Lelli, K Prateek Nayak, tj, linux-kernel, mingo,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle,
luca.abeni
Hello Peter,
On 1/15/26 10:05, Peter Zijlstra wrote:
> On Thu, Jan 15, 2026 at 09:24:31AM +0100, Peter Zijlstra wrote:
>> On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
>>
>>>> --- a/kernel/sched/syscalls.c
>>>> +++ b/kernel/sched/syscalls.c
>>>> @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
>>>> * itself.
>>>> */
>>>> newprio = rt_effective_prio(p, newprio);
>>>> - if (newprio == oldprio)
>>>> + if (newprio == oldprio && !dl_prio(newprio))
>>>> queue_flags &= ~DEQUEUE_MOVE;
>>>> }
>>> We have been using (improperly?) ENQUEUE_SAVE also to know when a new
>>> entity gets setscheduled to DEADLINE (or its parameters are changed) and
>>> it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
>>> quick first look, it does sound good to me.
>> If this is strictly about tasks coming into SCHED_DEADLINE there are a
>> number of alternative options:
>>
>> - there are the sched_class::switch{ing,ed}_to() callbacks;
>> - there is (the fairly recent) ENQUEUE_CLASS.
>>
>> Anyway, let me break up this one patch into individual bits and write
>> changelogs. I'll stick them in queue/sched/urgent for now; hopefully
>> Pierre can give them a spin and report back if it all sorts his
>> problem.
> Now live at:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=sched/urgent
>
> Please test.
I don't see the balance_callback or the double clock update warnings
anymore.
Thanks for the branch,
Regards,
Pierre
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern
2026-01-15 13:13 ` Pierre Gondois
@ 2026-01-15 13:56 ` Juri Lelli
0 siblings, 0 replies; 74+ messages in thread
From: Juri Lelli @ 2026-01-15 13:56 UTC (permalink / raw)
To: Pierre Gondois
Cc: Peter Zijlstra, K Prateek Nayak, tj, linux-kernel, mingo,
vincent.guittot, dietmar.eggemann, rostedt, bsegall, mgorman,
vschneid, longman, hannes, mkoutny, void, arighi, changwoo,
cgroups, sched-ext, liuwenfang, tglx, Christian Loehle,
luca.abeni
On 15/01/26 14:13, Pierre Gondois wrote:
> Hello Peter,
>
> On 1/15/26 10:05, Peter Zijlstra wrote:
> > On Thu, Jan 15, 2026 at 09:24:31AM +0100, Peter Zijlstra wrote:
> > > On Wed, Jan 14, 2026 at 03:20:48PM +0100, Juri Lelli wrote:
> > >
> > > > > --- a/kernel/sched/syscalls.c
> > > > > +++ b/kernel/sched/syscalls.c
> > > > > @@ -639,7 +639,7 @@ int __sched_setscheduler(struct task_str
> > > > > * itself.
> > > > > */
> > > > > newprio = rt_effective_prio(p, newprio);
> > > > > - if (newprio == oldprio)
> > > > > + if (newprio == oldprio && !dl_prio(newprio))
> > > > > queue_flags &= ~DEQUEUE_MOVE;
> > > > > }
> > > > We have been using (improperly?) ENQUEUE_SAVE also to know when a new
> > > > entity gets setscheduled to DEADLINE (or its parameters are changed) and
> > > > it looks like this keeps that happening with DEQUEUE_MOVE. So, from a
> > > > quick first look, it does sound good to me.
> > > If this is strictly about tasks coming into SCHED_DEADLINE there are a
> > > number of alternative options:
> > >
> > > - there are the sched_class::switch{ing,ed}_to() callbacks;
> > > - there is (the fairly recent) ENQUEUE_CLASS.
> > >
> > > Anyway, let me break up this one patch into individual bits and write
> > > changelogs. I'll stick them in queue/sched/urgent for now; hopefully
> > > Pierre can give them a spin and report back if it all sorts his
> > > problem.
> > Now live at:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/log/?h=sched/urgent
> >
> > Please test.
> I don't see the balance_callback or the double clock update warnings
> anymore.
FWIW (as I wasn't seeing the reported issue) I had a look as well and
tested locally. Patches look good and nothing to report on the test
side.
Thanks!
Juri
^ permalink raw reply [flat|nested] 74+ messages in thread
* [tip: sched/urgent] sched/deadline: Use ENQUEUE_MOVE to allow priority change
2026-01-14 13:05 ` Peter Zijlstra
2026-01-14 14:04 ` luca abeni
2026-01-14 14:20 ` Juri Lelli
@ 2026-01-15 21:00 ` tip-bot2 for Peter Zijlstra
2026-01-15 21:00 ` [tip: sched/urgent] sched: Deadline has dynamic priority tip-bot2 for Peter Zijlstra
` (2 subsequent siblings)
5 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-01-15 21:00 UTC (permalink / raw)
To: linux-tip-commits
Cc: Pierre Gondois, Peter Zijlstra (Intel), Juri Lelli, x86,
linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 627cc25f84466d557d86e5dc67b43a4eea604c80
Gitweb: https://git.kernel.org/tip/627cc25f84466d557d86e5dc67b43a4eea604c80
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:27:22 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:53 +01:00
sched/deadline: Use ENQUEUE_MOVE to allow priority change
Pierre reported hitting balance callback warnings for deadline tasks
after commit 6455ad5346c9 ("sched: Move sched_class::prio_changed()
into the change pattern").
It turns out that DEQUEUE_SAVE+ENQUEUE_RESTORE does not preserve DL
priority and subsequently trips a balance pass -- where one was not
expected.
From discussion with Juri and Luca, the purpose of this clause was to
deal with tasks new to DL and all those sites will have MOVE set (as
well as CLASS, but MOVE is more conservative at this point).
Per the previous patches MOVE is audited to always run the balance
callbacks, so switch enqueue_dl_entity() to use MOVE for this case.
Fixes: 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
Reported-by: Pierre Gondois <pierre.gondois@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/deadline.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5d6f3cc..c509f2e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2214,7 +2214,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
+ } else if ((flags & ENQUEUE_MOVE) &&
!is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
setup_new_dl_entity(dl_se);
^ permalink raw reply related [flat|nested] 74+ messages in thread* [tip: sched/urgent] sched: Deadline has dynamic priority
2026-01-14 13:05 ` Peter Zijlstra
` (2 preceding siblings ...)
2026-01-15 21:00 ` [tip: sched/urgent] sched/deadline: Use ENQUEUE_MOVE to allow priority change tip-bot2 for Peter Zijlstra
@ 2026-01-15 21:00 ` tip-bot2 for Peter Zijlstra
2026-01-15 21:01 ` [tip: sched/urgent] sched: Audit MOVE vs balance_callbacks tip-bot2 for Peter Zijlstra
2026-01-15 21:01 ` [tip: sched/urgent] sched: Fold rq-pin swizzle into __balance_callbacks() tip-bot2 for Peter Zijlstra
5 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-01-15 21:00 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Pierre Gondois, Juri Lelli, x86,
linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: e008ec6c7904ed99d3b2cb634b6545b008a99288
Gitweb: https://git.kernel.org/tip/e008ec6c7904ed99d3b2cb634b6545b008a99288
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:25:37 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:53 +01:00
sched: Deadline has dynamic priority
While FIFO/RR have static priority, DEADLINE is a dynamic priority
scheme. Notably it has static priority -1. Do not assume the priority
doesn't change for deadline tasks just because the static priority
doesn't change.
This ensures DL always sees {DE,EN}QUEUE_MOVE where appropriate.
Fixes: ff77e4685359 ("sched/rt: Fix PI handling vs. sched_setscheduler()")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 2 +-
kernel/sched/syscalls.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d925d7..045f83a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7320,7 +7320,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
- if (oldprio == prio)
+ if (oldprio == prio && !dl_prio(prio))
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index cb337de..6f10db3 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -639,7 +639,7 @@ change:
* itself.
*/
newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
+ if (newprio == oldprio && !dl_prio(newprio))
queue_flags &= ~DEQUEUE_MOVE;
}
^ permalink raw reply related [flat|nested] 74+ messages in thread* [tip: sched/urgent] sched: Audit MOVE vs balance_callbacks
2026-01-14 13:05 ` Peter Zijlstra
` (3 preceding siblings ...)
2026-01-15 21:00 ` [tip: sched/urgent] sched: Deadline has dynamic priority tip-bot2 for Peter Zijlstra
@ 2026-01-15 21:01 ` tip-bot2 for Peter Zijlstra
2026-01-15 21:01 ` [tip: sched/urgent] sched: Fold rq-pin swizzle into __balance_callbacks() tip-bot2 for Peter Zijlstra
5 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-01-15 21:01 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Pierre Gondois, Juri Lelli, x86,
linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 53439363c0a111f11625982b69c88ee2ce8608ec
Gitweb: https://git.kernel.org/tip/53439363c0a111f11625982b69c88ee2ce8608ec
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:17:49 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:53 +01:00
sched: Audit MOVE vs balance_callbacks
The {DE,EN}QUEUE_MOVE flag indicates a task is allowed to change
priority, which means there could be balance callbacks queued.
Therefore audit all MOVE users and make sure they do run balance
callbacks before dropping rq-lock.
Fixes: 6455ad5346c9 ("sched: Move sched_class::prio_changed() into the change pattern")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 4 +++-
kernel/sched/ext.c | 1 +
kernel/sched/sched.h | 5 ++++-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 842a3ad..4d925d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4950,7 +4950,7 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
if (rf)
rq_unpin_lock(rq, rf);
@@ -9126,6 +9126,8 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
if (resched)
resched_curr(rq);
+
+ __balance_callbacks(rq, &rq_guard.rf);
}
static struct cgroup_subsys_state *
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8f6d8d7..afe28c0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked_task) {
+ __balance_callbacks(iter->rq, &iter->rf);
task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
iter->locked_task = NULL;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e885a93..93fce4b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2388,7 +2388,8 @@ extern const u32 sched_prio_to_wmult[40];
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- * in the runqueue.
+ * in the runqueue. IOW the priority is allowed to change. Callers
+ * must expect to deal with balance callbacks.
*
* NOCLOCK - skip the update_rq_clock() (avoids double updates)
*
@@ -3969,6 +3970,8 @@ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
/*
^ permalink raw reply related [flat|nested] 74+ messages in thread* [tip: sched/urgent] sched: Fold rq-pin swizzle into __balance_callbacks()
2026-01-14 13:05 ` Peter Zijlstra
` (4 preceding siblings ...)
2026-01-15 21:01 ` [tip: sched/urgent] sched: Audit MOVE vs balance_callbacks tip-bot2 for Peter Zijlstra
@ 2026-01-15 21:01 ` tip-bot2 for Peter Zijlstra
5 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2026-01-15 21:01 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Pierre Gondois, Juri Lelli, x86,
linux-kernel
The following commit has been merged into the sched/urgent branch of tip:
Commit-ID: 49041e87f9cd3e6be8926b80b3fee71e89323e1c
Gitweb: https://git.kernel.org/tip/49041e87f9cd3e6be8926b80b3fee71e89323e1c
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 15 Jan 2026 09:16:44 +01:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 15 Jan 2026 21:57:52 +01:00
sched: Fold rq-pin swizzle into __balance_callbacks()
Prepare for more users needing the rq-pin swizzle.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://patch.msgid.link/20260114130528.GB831285@noisy.programming.kicks-ass.net
---
kernel/sched/core.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 60afadb..842a3ad 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4950,9 +4950,13 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
return __splice_balance_callbacks(rq, true);
}
-static void __balance_callbacks(struct rq *rq)
+static void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
+ if (rf)
+ rq_unpin_lock(rq, rf);
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+ if (rf)
+ rq_repin_lock(rq, rf);
}
void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -4991,7 +4995,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
@@ -6867,7 +6871,7 @@ keep_resched:
proxy_tag_curr(rq, next);
rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
+ __balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -7362,9 +7366,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Caller holds task_struct::pi_lock, IRQs are still disabled */
- rq_unpin_lock(rq, &rf);
- __balance_callbacks(rq);
- rq_repin_lock(rq, &rf);
+ __balance_callbacks(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 06/12] sched: Fix migrate_disable_switch() locking
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (4 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 05/12] sched: Move sched_class::prio_changed() into the change pattern Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 07/12] sched: Fix do_set_cpus_allowed() locking Peter Zijlstra
` (10 subsequent siblings)
16 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
For some reason migrate_disable_switch() was more complicated than it
needs to be, resulting in mind bending locking of dubious quality.
Recognise that migrate_disable_switch() must be called before a
context switch, but any place before that switch is equally good.
Since the current place results in troubled locking, simply move the
thing before taking rq->lock.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 27 ++++++---------------------
1 file changed, 6 insertions(+), 21 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2346,10 +2346,10 @@ static void migrate_disable_switch(struc
if (p->cpus_ptr != &p->cpus_mask)
return;
- /*
- * Violates locking rules! See comment in __do_set_cpus_allowed().
- */
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (task_rq_lock, p) {
+ update_rq_clock(scope.rq);
+ __do_set_cpus_allowed(p, &ac);
+ }
}
void ___migrate_enable(void)
@@ -2667,22 +2667,7 @@ __do_set_cpus_allowed(struct task_struct
struct rq *rq = task_rq(p);
bool queued, running;
- /*
- * This here violates the locking rules for affinity, since we're only
- * supposed to change these variables while holding both rq->lock and
- * p->pi_lock.
- *
- * HOWEVER, it magically works, because ttwu() is the only code that
- * accesses these variables under p->pi_lock and only does so after
- * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
- * before finish_task().
- *
- * XXX do further audits, this smells like something putrid.
- */
- if (ctx->flags & SCA_MIGRATE_DISABLE)
- WARN_ON_ONCE(!p->on_cpu);
- else
- lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
@@ -6781,6 +6766,7 @@ static void __sched notrace __schedule(i
local_irq_disable();
rcu_note_context_switch(preempt);
+ migrate_disable_switch(rq, prev);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6887,7 +6873,6 @@ static void __sched notrace __schedule(i
*/
++*switch_count;
- migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
prev->se.sched_delayed);
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Fix migrate_disable_switch() locking
2025-10-06 10:44 ` [PATCH 06/12] sched: Fix migrate_disable_switch() locking Peter Zijlstra
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 942b8db965006cf655d356162f7091a9238da94e
Gitweb: https://git.kernel.org/tip/942b8db965006cf655d356162f7091a9238da94e
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 10 Sep 2025 09:46:44 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:52 +02:00
sched: Fix migrate_disable_switch() locking
For some reason migrate_disable_switch() was more complicated than it
needs to be, resulting in mind bending locking of dubious quality.
Recognise that migrate_disable_switch() must be called before a
context switch, but any place before that switch is equally good.
Since the current place results in troubled locking, simply move the
thing before taking rq->lock.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 27 ++++++---------------------
1 file changed, 6 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a4dbce..f2d16d1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2346,10 +2346,10 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
if (p->cpus_ptr != &p->cpus_mask)
return;
- /*
- * Violates locking rules! See comment in __do_set_cpus_allowed().
- */
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (task_rq_lock, p) {
+ update_rq_clock(scope.rq);
+ __do_set_cpus_allowed(p, &ac);
+ }
}
void ___migrate_enable(void)
@@ -2667,22 +2667,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
struct rq *rq = task_rq(p);
bool queued, running;
- /*
- * This here violates the locking rules for affinity, since we're only
- * supposed to change these variables while holding both rq->lock and
- * p->pi_lock.
- *
- * HOWEVER, it magically works, because ttwu() is the only code that
- * accesses these variables under p->pi_lock and only does so after
- * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
- * before finish_task().
- *
- * XXX do further audits, this smells like something putrid.
- */
- if (ctx->flags & SCA_MIGRATE_DISABLE)
- WARN_ON_ONCE(!p->on_cpu);
- else
- lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
@@ -6781,6 +6766,7 @@ static void __sched notrace __schedule(int sched_mode)
local_irq_disable();
rcu_note_context_switch(preempt);
+ migrate_disable_switch(rq, prev);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6887,7 +6873,6 @@ keep_resched:
*/
++*switch_count;
- migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
prev->se.sched_delayed);
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 07/12] sched: Fix do_set_cpus_allowed() locking
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (5 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 06/12] sched: Fix migrate_disable_switch() locking Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-24 14:58 ` [REGRESSION] Deadlock during CPU hotplug caused by abfc01077df6 Jan Polensky
2025-10-06 10:44 ` [PATCH 08/12] sched: Rename do_set_cpus_allowed() Peter Zijlstra
` (9 subsequent siblings)
16 siblings, 2 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
All callers of do_set_cpus_allowed() only take p->pi_lock, which is
not sufficient to actually change the cpumask. Again, this is mostly
ok in these cases, but it results in unnecessarily complicated
reasoning.
Furthermore, there is no reason what so ever to not just take all the
required locks, so do just that.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/kthread.c | 15 +++++----------
kernel/sched/core.c | 21 +++++++--------------
kernel/sched/sched.h | 5 +++++
3 files changed, 17 insertions(+), 24 deletions(-)
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- unsigned long flags;
-
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, mask);
+
/* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task
{
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
- unsigned long flags;
int ret = 0;
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
kthread_fetch_affinity(kthread, affinity);
- /* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, affinity);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2668,18 +2668,14 @@ __do_set_cpus_allowed(struct task_struct
bool queued, running;
lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_rq_held(rq);
queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
+ if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- }
+
if (running)
put_prev_task(rq, p);
@@ -2708,7 +2704,10 @@ void do_set_cpus_allowed(struct task_str
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p) {
+ update_rq_clock(scope.rq);
+ __do_set_cpus_allowed(p, &ac);
+ }
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -3483,12 +3482,6 @@ static int select_fallback_rq(int cpu, s
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1847,6 +1847,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+ _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+ __task_rq_unlock(_T->rq, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Fix do_set_cpus_allowed() locking
2025-10-06 10:44 ` [PATCH 07/12] sched: Fix do_set_cpus_allowed() locking Peter Zijlstra
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
2025-10-24 14:58 ` [REGRESSION] Deadlock during CPU hotplug caused by abfc01077df6 Jan Polensky
1 sibling, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: abfc01077df66593f128d966fdad1d042facc9ac
Gitweb: https://git.kernel.org/tip/abfc01077df66593f128d966fdad1d042facc9ac
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 10 Sep 2025 09:51:06 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:52 +02:00
sched: Fix do_set_cpus_allowed() locking
All callers of do_set_cpus_allowed() only take p->pi_lock, which is
not sufficient to actually change the cpumask. Again, this is mostly
ok in these cases, but it results in unnecessarily complicated
reasoning.
Furthermore, there is no reason what so ever to not just take all the
required locks, so do just that.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/kthread.c | 15 +++++----------
kernel/sched/core.c | 21 +++++++--------------
kernel/sched/sched.h | 5 +++++
3 files changed, 17 insertions(+), 24 deletions(-)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 31b072e..832bd2a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- unsigned long flags;
-
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, mask);
+
/* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
@@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
struct kthread *kthread = to_kthread(p);
cpumask_var_t affinity;
- unsigned long flags;
int ret = 0;
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
@@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
kthread_fetch_affinity(kthread, affinity);
- /* It's safe because the task is inactive. */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- do_set_cpus_allowed(p, affinity);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ do_set_cpus_allowed(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f2d16d1..805e650 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2668,18 +2668,14 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
bool queued, running;
lockdep_assert_held(&p->pi_lock);
+ lockdep_assert_rq_held(rq);
queued = task_on_rq_queued(p);
running = task_current_donor(rq, p);
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
+ if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- }
+
if (running)
put_prev_task(rq, p);
@@ -2708,7 +2704,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p) {
+ update_rq_clock(scope.rq);
+ __do_set_cpus_allowed(p, &ac);
+ }
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -3483,12 +3482,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bcde43d..b23ce9c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1847,6 +1847,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+ _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+ __task_rq_unlock(_T->rq, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
^ permalink raw reply related [flat|nested] 74+ messages in thread* [REGRESSION] Deadlock during CPU hotplug caused by abfc01077df6
2025-10-06 10:44 ` [PATCH 07/12] sched: Fix do_set_cpus_allowed() locking Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
@ 2025-10-24 14:58 ` Jan Polensky
1 sibling, 0 replies; 74+ messages in thread
From: Jan Polensky @ 2025-10-24 14:58 UTC (permalink / raw)
To: peterz
Cc: arighi, bsegall, cgroups, changwoo, dietmar.eggemann, hannes,
juri.lelli, linux-kernel, liuwenfang, longman, mgorman, mingo,
mkoutny, rostedt, sched-ext, tglx, tj, vincent.guittot, void,
vschneid
We've identified a regression introduced by commit abfc01077df6 ("sched: Fix
do_set_cpus_allowed() locking") that causes a reproducible deadlock during CPU
hotplug testing on s390x.
While running the cpuhotplug02.sh test from LTP, which dynamically
offlines and onlines CPUs, the system consistently enters a stalled
state.
Observed behavior:
- migration/N attempts to migrate a task currently executing on another
CPU.
- Concurrently, rcu_sched tries to complete an RCU grace period.
- Both threads are blocked on spinlocks (e.g., arch_spin_lock_wait),
likely due to lock contention.
- Neither thread progresses; the grace period stalls.
- The kernel detects the stall and triggers a crash dump.
Sys info:
RELEASE: 6.18.0-20251021.rc2.git224.fe45352cd106.63.fc42.s390x+next
CPUS: 32
TASKS: 623
MEMORY: 16 GB
Crash log excerpt:
[ 6146.992159] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[ 6146.992173] rcu: 1-...0: (5 ticks this GP) idle=cea4/1/0x4000000000000000 softirq=1055899/1055901 fqs=4769
[ 6146.992236] rcu: (detected by 3, t=240013 jiffies, g=2041729, q=14778 ncpus=32)
[ 6146.992240] Task dump for CPU 1:
[ 6146.992241] task:migration/1 state:R running task stack:0 pid:22 tgid:22 ppid:2 task_flags:0x4208040 flags:0x00000000
[ 6146.992246] Stopper: __balance_push_cpu_stop+0x0/0x230 <- balance_push+0xea/0x170
[ 6146.992254] Call Trace:
[ 6146.992255] [<000000009d9e2300>] 0x9d9e2300
[ 6146.992280] rcu: rcu_sched kthread starved for 210010 jiffies! g2041729 f0x2 RCU_GP_DOING_FQS(6) ->state=0x0 ->cpu=23
[ 6146.992287] rcu: Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
[ 6146.992288] rcu: RCU grace-period kthread stack dump:
[ 6146.992289] task:rcu_sched state:R running task stack:0 pid:16 tgid:16 ppid:2 task_flags:0x208040 flags:0x00000010
[ 6146.992294] Call Trace:
[ 6146.992295] [<0700000000000001>] 0x700000000000001
[ 6146.992298] [<000002e1fb072998>] arch_spin_lock_wait+0xc8/0x110
[ 6146.992303] [<000002e1fa239d06>] raw_spin_rq_lock_nested+0x96/0xc0
[ 6146.992306] [<000002e1fa23bc90>] resched_cpu+0x50/0xc0
[ 6146.992309] [<000002e1fa29d646>] force_qs_rnp+0x306/0x3e0
[ 6146.992314] [<000002e1fa29ed30>] rcu_gp_fqs_loop+0x430/0x6e0
[ 6146.992316] [<000002e1fa2a1b0e>] rcu_gp_kthread+0x1ee/0x270
[ 6146.992320] [<000002e1fa228edc>] kthread+0x12c/0x250
[ 6146.992323] [<000002e1fa19ccfc>] __ret_from_fork+0x3c/0x150
[ 6146.992328] [<000002e1fb0800ba>] ret_from_fork+0xa/0x30
^ permalink raw reply [flat|nested] 74+ messages in thread
* [PATCH 08/12] sched: Rename do_set_cpus_allowed()
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (6 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 07/12] sched: Fix do_set_cpus_allowed() locking Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 09/12] sched: Make __do_set_cpus_allowed() use the sched_change pattern Peter Zijlstra
` (8 subsequent siblings)
16 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
Hopefully saner naming.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 4 ++--
kernel/cgroup/cpuset.c | 2 +-
kernel/kthread.c | 4 ++--
kernel/sched/core.c | 16 ++++++++--------
kernel/sched/sched.h | 2 +-
5 files changed, 14 insertions(+), 14 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_s
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
-/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);
/**
* set_cpus_allowed_ptr - set CPU affinity mask of a task
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
- do_set_cpus_allowed(tsk, cs_mask);
+ set_cpus_allowed_force(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -599,7 +599,7 @@ static void __kthread_bind_mask(struct t
}
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
- do_set_cpus_allowed(p, mask);
+ set_cpus_allowed_force(p, mask);
/* It's safe because the task is inactive. */
p->flags |= PF_NO_SETAFFINITY;
@@ -880,7 +880,7 @@ int kthread_affine_preferred(struct task
kthread_fetch_affinity(kthread, affinity);
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
- do_set_cpus_allowed(p, affinity);
+ set_cpus_allowed_force(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2331,7 +2331,7 @@ unsigned long wait_task_inactive(struct
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
@@ -2348,7 +2348,7 @@ static void migrate_disable_switch(struc
scoped_guard (task_rq_lock, p) {
update_rq_clock(scope.rq);
- __do_set_cpus_allowed(p, &ac);
+ do_set_cpus_allowed(p, &ac);
}
}
@@ -2662,7 +2662,7 @@ void set_cpus_allowed_common(struct task
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
@@ -2692,7 +2692,7 @@ __do_set_cpus_allowed(struct task_struct
* Used for kthread_bind() and select_fallback_rq(), in both cases the user
* affinity (if any) should be destroyed too.
*/
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
@@ -2706,7 +2706,7 @@ void do_set_cpus_allowed(struct task_str
scoped_guard (__task_rq_lock, p) {
update_rq_clock(scope.rq);
- __do_set_cpus_allowed(p, &ac);
+ do_set_cpus_allowed(p, &ac);
}
/*
@@ -2745,7 +2745,7 @@ int dup_user_cpus_ptr(struct task_struct
* Use pi_lock to protect content of user_cpus_ptr
*
* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
- * do_set_cpus_allowed().
+ * set_cpus_allowed_force().
*/
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
@@ -3073,7 +3073,7 @@ static int __set_cpus_allowed_ptr_locked
goto out;
}
- __do_set_cpus_allowed(p, ctx);
+ do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
@@ -3482,7 +3482,7 @@ static int select_fallback_rq(int cpu, s
}
fallthrough;
case possible:
- do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
+ set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2617,7 +2617,7 @@ static inline bool task_allowed_on_cpu(s
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
/*
- * See do_set_cpus_allowed() above for the rcu_head usage.
+ * See set_cpus_allowed_force() above for the rcu_head usage.
*/
int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Rename do_set_cpus_allowed()
2025-10-06 10:44 ` [PATCH 08/12] sched: Rename do_set_cpus_allowed() Peter Zijlstra
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: b079d93796528053cde322f2ca838c2d21c297e7
Gitweb: https://git.kernel.org/tip/b079d93796528053cde322f2ca838c2d21c297e7
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 10 Sep 2025 10:08:05 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:53 +02:00
sched: Rename do_set_cpus_allowed()
Hopefully saner naming.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
include/linux/sched.h | 4 ++--
kernel/cgroup/cpuset.c | 2 +-
kernel/kthread.c | 4 ++--
kernel/sched/core.c | 16 ++++++++--------
kernel/sched/sched.h | 2 +-
5 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340..77426c3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
-/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);
/**
* set_cpus_allowed_ptr - set CPU affinity mask of a task
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 52468d2..185e820 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
- do_set_cpus_allowed(tsk, cs_mask);
+ set_cpus_allowed_force(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 832bd2a..99a3808 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -599,7 +599,7 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
}
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
- do_set_cpus_allowed(p, mask);
+ set_cpus_allowed_force(p, mask);
/* It's safe because the task is inactive. */
p->flags |= PF_NO_SETAFFINITY;
@@ -880,7 +880,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
kthread_fetch_affinity(kthread, affinity);
scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
- do_set_cpus_allowed(p, affinity);
+ set_cpus_allowed_force(p, affinity);
mutex_unlock(&kthreads_hotplug_lock);
out:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 805e650..638bffd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2331,7 +2331,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
@@ -2348,7 +2348,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
scoped_guard (task_rq_lock, p) {
update_rq_clock(scope.rq);
- __do_set_cpus_allowed(p, &ac);
+ do_set_cpus_allowed(p, &ac);
}
}
@@ -2662,7 +2662,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
@@ -2692,7 +2692,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
* Used for kthread_bind() and select_fallback_rq(), in both cases the user
* affinity (if any) should be destroyed too.
*/
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
@@ -2706,7 +2706,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
scoped_guard (__task_rq_lock, p) {
update_rq_clock(scope.rq);
- __do_set_cpus_allowed(p, &ac);
+ do_set_cpus_allowed(p, &ac);
}
/*
@@ -2745,7 +2745,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
* Use pi_lock to protect content of user_cpus_ptr
*
* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
- * do_set_cpus_allowed().
+ * set_cpus_allowed_force().
*/
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
@@ -3073,7 +3073,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
goto out;
}
- __do_set_cpus_allowed(p, ctx);
+ do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
@@ -3482,7 +3482,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
- do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
+ set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b23ce9c..ea2ea8f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2617,7 +2617,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
/*
- * See do_set_cpus_allowed() above for the rcu_head usage.
+ * See set_cpus_allowed_force() above for the rcu_head usage.
*/
int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 09/12] sched: Make __do_set_cpus_allowed() use the sched_change pattern
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (7 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 08/12] sched: Rename do_set_cpus_allowed() Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 10/12] sched: Add locking comments to sched_class methods Peter Zijlstra
` (7 subsequent siblings)
16 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
Now that do_set_cpus_allowed() holds all the regular locks, convert it
to use the sched_change pattern helper.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 26 +++++---------------------
1 file changed, 5 insertions(+), 21 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2664,28 +2664,12 @@ void set_cpus_allowed_common(struct task
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- struct rq *rq = task_rq(p);
- bool queued, running;
+ u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK;
- lockdep_assert_held(&p->pi_lock);
- lockdep_assert_rq_held(rq);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
-
- if (running)
- put_prev_task(rq, p);
-
- p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
+ scoped_guard (sched_change, p, flags) {
+ p->sched_class->set_cpus_allowed(p, ctx);
+ mm_set_cpus_allowed(p->mm, ctx->new_mask);
+ }
}
/*
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Make __do_set_cpus_allowed() use the sched_change pattern
2025-10-06 10:44 ` [PATCH 09/12] sched: Make __do_set_cpus_allowed() use the sched_change pattern Peter Zijlstra
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 650952d3fb3889b04cbda722351b5d6090a1c10b
Gitweb: https://git.kernel.org/tip/650952d3fb3889b04cbda722351b5d6090a1c10b
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 09 Sep 2025 13:16:23 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:53 +02:00
sched: Make __do_set_cpus_allowed() use the sched_change pattern
Now that do_set_cpus_allowed() holds all the regular locks, convert it
to use the sched_change pattern helper.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 26 +++++---------------------
1 file changed, 5 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 638bffd..e932439 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2664,28 +2664,12 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- struct rq *rq = task_rq(p);
- bool queued, running;
-
- lockdep_assert_held(&p->pi_lock);
- lockdep_assert_rq_held(rq);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK;
- if (running)
- put_prev_task(rq, p);
-
- p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
+ scoped_guard (sched_change, p, flags) {
+ p->sched_class->set_cpus_allowed(p, ctx);
+ mm_set_cpus_allowed(p->mm, ctx->new_mask);
+ }
}
/*
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 10/12] sched: Add locking comments to sched_class methods
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (8 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 09/12] sched: Make __do_set_cpus_allowed() use the sched_change pattern Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-07 9:54 ` Juri Lelli
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 11/12] sched: Match __task_rq_{,un}lock() Peter Zijlstra
` (6 subsequent siblings)
16 siblings, 2 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
'Document' the locking context the various sched_class methods are
called under.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 6 +-
kernel/sched/sched.h | 106 ++++++++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 103 insertions(+), 9 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -583,8 +583,8 @@ EXPORT_SYMBOL(__trace_set_current_state)
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -4162,7 +4162,7 @@ int try_to_wake_up(struct task_struct *p
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2345,8 +2345,7 @@ extern const u32 sched_prio_to_wmult[40
/*
* {de,en}queue flags:
*
- * DEQUEUE_SLEEP - task is no longer runnable
- * ENQUEUE_WAKEUP - task just became runnable
+ * SLEEP/WAKEUP - task is no-longer/just-became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
@@ -2359,6 +2358,11 @@ extern const u32 sched_prio_to_wmult[40
*
* MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
*
+ * DELAYED - de/re-queue a sched_delayed task
+ *
+ * CLASS - going to update p->sched_class; makes sched_change call the
+ * various switch methods.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
@@ -2409,14 +2413,50 @@ struct sched_class {
int uclamp_enabled;
#endif
+ /*
+ * move_queued_task/activate_task/enqueue_task: rq->lock
+ * ttwu_do_activate/activate_task/enqueue_task: rq->lock
+ * wake_up_new_task/activate_task/enqueue_task: task_rq_lock
+ * ttwu_runnable/enqueue_task: task_rq_lock
+ * proxy_task_current: rq->lock
+ * sched_change_end
+ */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * move_queued_task/deactivate_task/dequeue_task: rq->lock
+ * __schedule/block_task/dequeue_task: rq->lock
+ * proxy_task_current: rq->lock
+ * wait_task_inactive: task_rq_lock
+ * sched_change_begin
+ */
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * do_sched_yield: rq->lock
+ */
void (*yield_task) (struct rq *rq);
+ /*
+ * yield_to: rq->lock (double)
+ */
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
+ /*
+ * move_queued_task: rq->lock
+ * __migrate_swap_task: rq->lock
+ * ttwu_do_activate: rq->lock
+ * ttwu_runnable: task_rq_lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * schedule/pick_next_task/prev_balance: rq->lock
+ */
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+
+ /*
+ * schedule/pick_next_task: rq->lock
+ */
struct task_struct *(*pick_task)(struct rq *rq);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
@@ -2429,48 +2469,102 @@ struct sched_class {
*/
struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+ /*
+ * sched_change:
+ * __schedule: rq->lock
+ */
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ /*
+ * select_task_rq: p->pi_lock
+ * sched_exec: p->pi_lock
+ */
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+ /*
+ * set_task_cpu: p->pi_lock || rq->lock (ttwu like)
+ */
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
+ /*
+ * ttwu_do_activate: rq->lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
+ /*
+ * do_set_cpus_allowed: task_rq_lock + sched_change
+ */
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
+ /*
+ * sched_set_rq_{on,off}line: rq->lock
+ */
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+ /*
+ * push_cpu_stop: p->pi_lock && rq->lock
+ */
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
+ /*
+ * hrtick: rq->lock
+ * sched_tick: rq->lock
+ * sched_tick_remote: rq->lock
+ */
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ /*
+ * sched_cgroup_fork: p->pi_lock
+ */
void (*task_fork)(struct task_struct *p);
+ /*
+ * finish_task_switch: no locks
+ */
void (*task_dead)(struct task_struct *p);
+ /*
+ * sched_change
+ */
void (*switching_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
-
- void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
- const struct load_weight *lw);
-
u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
u64 oldprio);
+ /*
+ * set_load_weight: task_rq_lock + sched_change
+ * __setscheduler_parms: task_rq_lock + sched_change
+ */
+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
+ const struct load_weight *lw);
+
+ /*
+ * sched_rr_get_interval: task_rq_lock
+ */
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
+ /*
+ * task_sched_runtime: task_rq_lock
+ */
void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * sched_change_group: task_rq_lock + sched_change
+ */
void (*task_change_group)(struct task_struct *p);
#endif
#ifdef CONFIG_SCHED_CORE
+ /*
+ * pick_next_task: rq->lock
+ * try_steal_cookie: rq->lock (double)
+ */
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 10/12] sched: Add locking comments to sched_class methods
2025-10-06 10:44 ` [PATCH 10/12] sched: Add locking comments to sched_class methods Peter Zijlstra
@ 2025-10-07 9:54 ` Juri Lelli
2025-10-08 7:04 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
1 sibling, 1 reply; 74+ messages in thread
From: Juri Lelli @ 2025-10-07 9:54 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, longman, hannes, mkoutny,
void, arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
Hi Peter,
On 06/10/25 12:44, Peter Zijlstra wrote:
> 'Document' the locking context the various sched_class methods are
> called under.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
...
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2345,8 +2345,7 @@ extern const u32 sched_prio_to_wmult[40
> /*
> * {de,en}queue flags:
> *
> - * DEQUEUE_SLEEP - task is no longer runnable
> - * ENQUEUE_WAKEUP - task just became runnable
> + * SLEEP/WAKEUP - task is no-longer/just-became runnable
> *
> * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
> * are in a known state which allows modification. Such pairs
> @@ -2359,6 +2358,11 @@ extern const u32 sched_prio_to_wmult[40
> *
> * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
> *
> + * DELAYED - de/re-queue a sched_delayed task
> + *
> + * CLASS - going to update p->sched_class; makes sched_change call the
> + * various switch methods.
> + *
> * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
> * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
> * ENQUEUE_MIGRATED - the task was migrated during wakeup
Not for this patch, but I wondered if, while we are at it, we wanted to
complete documentation of these flags. My new AI friend is suggesting
the following, is it very much garbage? :)
Thanks,
Juri
---
From: Claude <claude-sonnet-4-5@anthropic.com>
Date: Mon, 7 Oct 2025 12:44:13 +0200
Subject: sched: Document remaining DEQUEUE/ENQUEUE flags
Complete the flag documentation by adding descriptions for the three
previously undocumented flags: DEQUEUE_SPECIAL, DEQUEUE_THROTTLE, and
ENQUEUE_INITIAL.
DEQUEUE_SPECIAL is used when dequeuing tasks in special states (stopped,
traced, parked, dead, or frozen) that don't use the normal wait-loop
pattern and must not use delayed dequeue.
DEQUEUE_THROTTLE is used when removing tasks from the runqueue due to
CFS bandwidth throttling, preventing delayed dequeue to ensure proper
throttling behavior.
ENQUEUE_INITIAL is used when enqueueing newly created tasks in
wake_up_new_task(), allowing the fair scheduler to give them preferential
initial placement (half vslice when PLACE_DEADLINE_INITIAL is enabled).
Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>
---
kernel/sched/sched.h | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4c222fa8f908..1a2b3c8d9e4f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2364,10 +2364,20 @@ extern const u32 sched_prio_to_wmult[40];
* CLASS - going to update p->sched_class; makes sched_change call the
* various switch methods.
*
+ * DEQUEUE_SPECIAL - task is in a special state (STOPPED, TRACED, PARKED,
+ * DEAD, FROZEN) that doesn't use the normal wait-loop;
+ * disables delayed dequeue.
+ *
+ * DEQUEUE_THROTTLE - dequeuing due to CFS bandwidth throttling; disables
+ * delayed dequeue to ensure proper throttling.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
+ * ENQUEUE_INITIAL - enqueuing a newly created task in wake_up_new_task();
+ * fair scheduler may give preferential initial placement
+ * (e.g., half vslice with PLACE_DEADLINE_INITIAL).
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
*/
^ permalink raw reply related [flat|nested] 74+ messages in thread* Re: [PATCH 10/12] sched: Add locking comments to sched_class methods
2025-10-07 9:54 ` Juri Lelli
@ 2025-10-08 7:04 ` Peter Zijlstra
2025-10-08 7:33 ` Greg Kroah-Hartman
0 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-08 7:04 UTC (permalink / raw)
To: Juri Lelli
Cc: tj, linux-kernel, mingo, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, longman, hannes, mkoutny,
void, arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx,
Greg Kroah-Hartman
On Tue, Oct 07, 2025 at 11:54:18AM +0200, Juri Lelli wrote:
> Not for this patch, but I wondered if, while we are at it, we wanted to
> complete documentation of these flags. My new AI friend is suggesting
> the following, is it very much garbage? :)
Heh; it's not terrible. I've been playing with local LLMs, but mostly
I've found they struggle with getting enough context to not be utterly
demented. And when you up the context window, they get unusably slow :/
Setting up and configuring the whole pile of subtly interlocking stacks
of software to get anything useful out of this stuff is non-trivial (it
reminds me of the sendmail m4 days).
> ---
>
> From: Claude <claude-sonnet-4-5@anthropic.com>
> Date: Mon, 7 Oct 2025 12:44:13 +0200
> Subject: sched: Document remaining DEQUEUE/ENQUEUE flags
>
> Complete the flag documentation by adding descriptions for the three
> previously undocumented flags: DEQUEUE_SPECIAL, DEQUEUE_THROTTLE, and
> ENQUEUE_INITIAL.
>
> DEQUEUE_SPECIAL is used when dequeuing tasks in special states (stopped,
> traced, parked, dead, or frozen) that don't use the normal wait-loop
> pattern and must not use delayed dequeue.
>
> DEQUEUE_THROTTLE is used when removing tasks from the runqueue due to
> CFS bandwidth throttling, preventing delayed dequeue to ensure proper
> throttling behavior.
>
> ENQUEUE_INITIAL is used when enqueueing newly created tasks in
> wake_up_new_task(), allowing the fair scheduler to give them preferential
> initial placement (half vslice when PLACE_DEADLINE_INITIAL is enabled).
>
> Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
> Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>
Is this the generally acceptable form of attribution for these things?
I'm not sure what the official guidance is on using these AI tools.
Greg, you have any insights here?
> ---
> kernel/sched/sched.h | 10 ++++++++++
> 1 file changed, 10 insertions(+)
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 4c222fa8f908..1a2b3c8d9e4f 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2364,10 +2364,20 @@ extern const u32 sched_prio_to_wmult[40];
> * CLASS - going to update p->sched_class; makes sched_change call the
> * various switch methods.
> *
> + * DEQUEUE_SPECIAL - task is in a special state (STOPPED, TRACED, PARKED,
> + * DEAD, FROZEN) that doesn't use the normal wait-loop;
> + * disables delayed dequeue.
> + *
> + * DEQUEUE_THROTTLE - dequeuing due to CFS bandwidth throttling; disables
> + * delayed dequeue to ensure proper throttling.
> + *
> * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
> * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
> * ENQUEUE_MIGRATED - the task was migrated during wakeup
> + * ENQUEUE_INITIAL - enqueuing a newly created task in wake_up_new_task();
> + * fair scheduler may give preferential initial placement
> + * (e.g., half vslice with PLACE_DEADLINE_INITIAL).
> * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
> *
> */
>
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH 10/12] sched: Add locking comments to sched_class methods
2025-10-08 7:04 ` Peter Zijlstra
@ 2025-10-08 7:33 ` Greg Kroah-Hartman
2025-10-08 9:43 ` Juri Lelli
0 siblings, 1 reply; 74+ messages in thread
From: Greg Kroah-Hartman @ 2025-10-08 7:33 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Juri Lelli, tj, linux-kernel, mingo, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
On Wed, Oct 08, 2025 at 09:04:19AM +0200, Peter Zijlstra wrote:
> On Tue, Oct 07, 2025 at 11:54:18AM +0200, Juri Lelli wrote:
>
> > Not for this patch, but I wondered if, while we are at it, we wanted to
> > complete documentation of these flags. My new AI friend is suggesting
> > the following, is it very much garbage? :)
>
> Heh; its not terrible. I've been playing with local LLMs, but mostly
> I've found they struggle with getting enough context to not be utterly
> demented. And when you up the context window, they get unusable slow :/
>
> Setting up and configuring the whole pile of subtly interlocking stacks
> of software to get anything useful out of this stuff is non-trivial (it
> reminds me of the sendmail m4 days).
>
> > ---
> >
> > From: Claude <claude-sonnet-4-5@anthropic.com>
> > Date: Mon, 7 Oct 2025 12:44:13 +0200
> > Subject: sched: Document remaining DEQUEUE/ENQUEUE flags
> >
> > Complete the flag documentation by adding descriptions for the three
> > previously undocumented flags: DEQUEUE_SPECIAL, DEQUEUE_THROTTLE, and
> > ENQUEUE_INITIAL.
> >
> > DEQUEUE_SPECIAL is used when dequeuing tasks in special states (stopped,
> > traced, parked, dead, or frozen) that don't use the normal wait-loop
> > pattern and must not use delayed dequeue.
> >
> > DEQUEUE_THROTTLE is used when removing tasks from the runqueue due to
> > CFS bandwidth throttling, preventing delayed dequeue to ensure proper
> > throttling behavior.
> >
> > ENQUEUE_INITIAL is used when enqueueing newly created tasks in
> > wake_up_new_task(), allowing the fair scheduler to give them preferential
> > initial placement (half vslice when PLACE_DEADLINE_INITIAL is enabled).
> >
> > Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
> > Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>
>
> Is this the generally acceptable form of attribution for these things?
> I'm not sure what the official guidance is on using these AI tools.
>
> Greg, you have any insights here?
First off, Claude can NOT sign off on anything, so that's a non-starter.
All Red Hat people should know that :)
Otherwise, there is a draft of something that was going to address stuff
like this floating around by Dave Hansen, I'll go poke him to see what
the status of that is.
thanks,
greg k-h
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH 10/12] sched: Add locking comments to sched_class methods
2025-10-08 7:33 ` Greg Kroah-Hartman
@ 2025-10-08 9:43 ` Juri Lelli
2025-10-08 10:06 ` Greg Kroah-Hartman
0 siblings, 1 reply; 74+ messages in thread
From: Juri Lelli @ 2025-10-08 9:43 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: Peter Zijlstra, tj, linux-kernel, mingo, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
On 08/10/25 09:33, Greg Kroah-Hartman wrote:
> On Wed, Oct 08, 2025 at 09:04:19AM +0200, Peter Zijlstra wrote:
> > On Tue, Oct 07, 2025 at 11:54:18AM +0200, Juri Lelli wrote:
> >
> > > Not for this patch, but I wondered if, while we are at it, we wanted to
> > > complete documentation of these flags. My new AI friend is suggesting
> > > the following, is it very much garbage? :)
> >
> > Heh; its not terrible. I've been playing with local LLMs, but mostly
> > I've found they struggle with getting enough context to not be utterly
> > demented. And when you up the context window, they get unusable slow :/
> >
> > Setting up and configuring the whole pile of subtly interlocking stacks
> > of software to get anything useful out of this stuff is non-trivial (it
> > reminds me of the sendmail m4 days).
> >
> > > ---
> > >
> > > From: Claude <claude-sonnet-4-5@anthropic.com>
> > > Date: Mon, 7 Oct 2025 12:44:13 +0200
> > > Subject: sched: Document remaining DEQUEUE/ENQUEUE flags
> > >
> > > Complete the flag documentation by adding descriptions for the three
> > > previously undocumented flags: DEQUEUE_SPECIAL, DEQUEUE_THROTTLE, and
> > > ENQUEUE_INITIAL.
> > >
> > > DEQUEUE_SPECIAL is used when dequeuing tasks in special states (stopped,
> > > traced, parked, dead, or frozen) that don't use the normal wait-loop
> > > pattern and must not use delayed dequeue.
> > >
> > > DEQUEUE_THROTTLE is used when removing tasks from the runqueue due to
> > > CFS bandwidth throttling, preventing delayed dequeue to ensure proper
> > > throttling behavior.
> > >
> > > ENQUEUE_INITIAL is used when enqueueing newly created tasks in
> > > wake_up_new_task(), allowing the fair scheduler to give them preferential
> > > initial placement (half vslice when PLACE_DEADLINE_INITIAL is enabled).
> > >
> > > Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
> > > Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>
> >
> > Is this the generally acceptable form of attribution for these things?
> > I'm not sure what the official guidance is on using these AI tools.
> >
> > Greg, you have any insights here?
>
> First off, Claude can NOT sign off on anything, so that's a non-starter.
> All Red Hat people should know that :)
Yep, knew that. But I felt guilty nonetheless as I didn't touch the
change at all. Current SoB was kind of a (silly) joke. :)
> Otherwise, there is a draft of something that was going to address stuff
> like this floating around by Dave Hansen, I'll go poke him to see what
> the status of that is.
I believe it was suggested something like Co-developed-by: <model> and
then Signed-off-by: <human>, but indeed curious to know how that
discussion ended.
Thanks!
Juri
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH 10/12] sched: Add locking comments to sched_class methods
2025-10-08 9:43 ` Juri Lelli
@ 2025-10-08 10:06 ` Greg Kroah-Hartman
2025-10-08 14:34 ` Steven Rostedt
0 siblings, 1 reply; 74+ messages in thread
From: Greg Kroah-Hartman @ 2025-10-08 10:06 UTC (permalink / raw)
To: Juri Lelli
Cc: Peter Zijlstra, tj, linux-kernel, mingo, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
On Wed, Oct 08, 2025 at 11:43:21AM +0200, Juri Lelli wrote:
> On 08/10/25 09:33, Greg Kroah-Hartman wrote:
> > On Wed, Oct 08, 2025 at 09:04:19AM +0200, Peter Zijlstra wrote:
> > > On Tue, Oct 07, 2025 at 11:54:18AM +0200, Juri Lelli wrote:
> > >
> > > > Not for this patch, but I wondered if, while we are at it, we wanted to
> > > > complete documentation of these flags. My new AI friend is suggesting
> > > > the following, is it very much garbage? :)
> > >
> > > Heh; its not terrible. I've been playing with local LLMs, but mostly
> > > I've found they struggle with getting enough context to not be utterly
> > > demented. And when you up the context window, they get unusable slow :/
> > >
> > > Setting up and configuring the whole pile of subtly interlocking stacks
> > > of software to get anything useful out of this stuff is non-trivial (it
> > > reminds me of the sendmail m4 days).
> > >
> > > > ---
> > > >
> > > > From: Claude <claude-sonnet-4-5@anthropic.com>
> > > > Date: Mon, 7 Oct 2025 12:44:13 +0200
> > > > Subject: sched: Document remaining DEQUEUE/ENQUEUE flags
> > > >
> > > > Complete the flag documentation by adding descriptions for the three
> > > > previously undocumented flags: DEQUEUE_SPECIAL, DEQUEUE_THROTTLE, and
> > > > ENQUEUE_INITIAL.
> > > >
> > > > DEQUEUE_SPECIAL is used when dequeuing tasks in special states (stopped,
> > > > traced, parked, dead, or frozen) that don't use the normal wait-loop
> > > > pattern and must not use delayed dequeue.
> > > >
> > > > DEQUEUE_THROTTLE is used when removing tasks from the runqueue due to
> > > > CFS bandwidth throttling, preventing delayed dequeue to ensure proper
> > > > throttling behavior.
> > > >
> > > > ENQUEUE_INITIAL is used when enqueueing newly created tasks in
> > > > wake_up_new_task(), allowing the fair scheduler to give them preferential
> > > > initial placement (half vslice when PLACE_DEADLINE_INITIAL is enabled).
> > > >
> > > > Signed-off-by: Claude <claude-sonnet-4-5@anthropic.com>
> > > > Not-so-sure-yet: Juri Lelli <juri.lelli@redhat.com>
> > >
> > > Is this the generally acceptable form of attribution for these things?
> > > I'm not sure what the official guidance is on using these AI tools.
> > >
> > > Greg, you have any insights here?
> >
> > First off, Claude can NOT sign off on anything, so that's a non-starter.
> > All Red Hat people should know that :)
>
> Yep, knew that. But I felt guilty nonetheless as I didn't touch the
> change at all. Current SoB was kind of a (silly) joke. :)
>
> > Otherwise, there is a draft of something that was going to address stuff
> > like this floating around by Dave Hansen, I'll go poke him to see what
> > the status of that is.
>
> I believe it was suggested something like Co-developed-by: <model> and
> then Signed-off-by: <human>, but indeed curious to know how that
> discussion ended.
The general answer is "you better know the copyright ownership
information of the output of the tool you use" before you do anything
with any of these tools. Be careful about this, because adding your
signed-off-by to a patch like this makes it your responsibility :)
After that, treat it like any other tool that you use to generate a
patch, document what you used and why/how, and you should be fine. You
have to do this today if you were to use any type of tool, so in that
way, "AI" is no different, with the exception of the ownership of the
output result (again, consult the terms of the tool used.)
Hopefully documentation updates to our process documents will reflect
this in the near future.
thanks,
greg k-h
^ permalink raw reply [flat|nested] 74+ messages in thread
* Re: [PATCH 10/12] sched: Add locking comments to sched_class methods
2025-10-08 10:06 ` Greg Kroah-Hartman
@ 2025-10-08 14:34 ` Steven Rostedt
0 siblings, 0 replies; 74+ messages in thread
From: Steven Rostedt @ 2025-10-08 14:34 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: Juri Lelli, Peter Zijlstra, tj, linux-kernel, mingo,
vincent.guittot, dietmar.eggemann, bsegall, mgorman, vschneid,
longman, hannes, mkoutny, void, arighi, changwoo, cgroups,
sched-ext, liuwenfang, tglx
On Wed, 8 Oct 2025 12:06:56 +0200
Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote:
> The general answer is "you better know the copyright ownership
> information of the output of the tool you use" before you do anything
> with any of these tools. Be careful about this, because adding your
> signed-off-by to a patch like this makes it your responsibility :)
And there are a lot of copyright battles going on in courts wrt AI right
now. It's best to see how that plays out too.
>
> After that, treat it like any other tool that you use to generate a
> patch, document what you used and why/how, and you should be fine. You
> have to do this today if you were to use any type of tool, so in that
> way, "AI" is no different, with the exception of the ownership of the
> output result (again, consult the terms of the tool used.)
>
> Hopefully documentation updates to our process documents will reflect
> this in the near future.
Yeah, I need to help Dave on that too.
Thanks for the reminder,
-- Steve
^ permalink raw reply [flat|nested] 74+ messages in thread
* [tip: sched/core] sched: Add locking comments to sched_class methods
2025-10-06 10:44 ` [PATCH 10/12] sched: Add locking comments to sched_class methods Peter Zijlstra
2025-10-07 9:54 ` Juri Lelli
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 46a177fb01e52ec0e3f9eab9b217a0f7c8909eeb
Gitweb: https://git.kernel.org/tip/46a177fb01e52ec0e3f9eab9b217a0f7c8909eeb
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Tue, 09 Sep 2025 11:58:02 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:53 +02:00
sched: Add locking comments to sched_class methods
'Document' the locking context the various sched_class methods are
called under.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 6 +-
kernel/sched/sched.h | 108 +++++++++++++++++++++++++++++++++++++++---
2 files changed, 105 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e932439..8c55740 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -583,8 +583,8 @@ EXPORT_SYMBOL(__trace_set_current_state);
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -4162,7 +4162,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ea2ea8f..3462145 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2345,8 +2345,7 @@ extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
*
- * DEQUEUE_SLEEP - task is no longer runnable
- * ENQUEUE_WAKEUP - task just became runnable
+ * SLEEP/WAKEUP - task is no-longer/just-became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
@@ -2359,11 +2358,18 @@ extern const u32 sched_prio_to_wmult[40];
*
* MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
*
+ * DELAYED - de/re-queue a sched_delayed task
+ *
+ * CLASS - going to update p->sched_class; makes sched_change call the
+ * various switch methods.
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
* ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
+ * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but
+ * SCHED_DEADLINE seems to rely on this for now.
*/
#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
@@ -2409,14 +2415,50 @@ struct sched_class {
int uclamp_enabled;
#endif
+ /*
+ * move_queued_task/activate_task/enqueue_task: rq->lock
+ * ttwu_do_activate/activate_task/enqueue_task: rq->lock
+ * wake_up_new_task/activate_task/enqueue_task: task_rq_lock
+ * ttwu_runnable/enqueue_task: task_rq_lock
+ * proxy_task_current: rq->lock
+ * sched_change_end
+ */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * move_queued_task/deactivate_task/dequeue_task: rq->lock
+ * __schedule/block_task/dequeue_task: rq->lock
+ * proxy_task_current: rq->lock
+ * wait_task_inactive: task_rq_lock
+ * sched_change_begin
+ */
bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * do_sched_yield: rq->lock
+ */
void (*yield_task) (struct rq *rq);
+ /*
+ * yield_to: rq->lock (double)
+ */
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
+ /*
+ * move_queued_task: rq->lock
+ * __migrate_swap_task: rq->lock
+ * ttwu_do_activate: rq->lock
+ * ttwu_runnable: task_rq_lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * schedule/pick_next_task/prev_balance: rq->lock
+ */
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+
+ /*
+ * schedule/pick_next_task: rq->lock
+ */
struct task_struct *(*pick_task)(struct rq *rq);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
@@ -2429,48 +2471,102 @@ struct sched_class {
*/
struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+ /*
+ * sched_change:
+ * __schedule: rq->lock
+ */
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+ /*
+ * select_task_rq: p->pi_lock
+ * sched_exec: p->pi_lock
+ */
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+ /*
+ * set_task_cpu: p->pi_lock || rq->lock (ttwu like)
+ */
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
+ /*
+ * ttwu_do_activate: rq->lock
+ * wake_up_new_task: task_rq_lock
+ */
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
+ /*
+ * do_set_cpus_allowed: task_rq_lock + sched_change
+ */
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
+ /*
+ * sched_set_rq_{on,off}line: rq->lock
+ */
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+ /*
+ * push_cpu_stop: p->pi_lock && rq->lock
+ */
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
+ /*
+ * hrtick: rq->lock
+ * sched_tick: rq->lock
+ * sched_tick_remote: rq->lock
+ */
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ /*
+ * sched_cgroup_fork: p->pi_lock
+ */
void (*task_fork)(struct task_struct *p);
+ /*
+ * finish_task_switch: no locks
+ */
void (*task_dead)(struct task_struct *p);
+ /*
+ * sched_change
+ */
void (*switching_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
-
- void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
- const struct load_weight *lw);
-
u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
u64 oldprio);
+ /*
+ * set_load_weight: task_rq_lock + sched_change
+ * __setscheduler_parms: task_rq_lock + sched_change
+ */
+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
+ const struct load_weight *lw);
+
+ /*
+ * sched_rr_get_interval: task_rq_lock
+ */
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
+ /*
+ * task_sched_runtime: task_rq_lock
+ */
void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * sched_change_group: task_rq_lock + sched_change
+ */
void (*task_change_group)(struct task_struct *p);
#endif
#ifdef CONFIG_SCHED_CORE
+ /*
+ * pick_next_task: rq->lock
+ * try_steal_cookie: rq->lock (double)
+ */
int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 11/12] sched: Match __task_rq_{,un}lock()
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (9 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 10/12] sched: Add locking comments to sched_class methods Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-07 20:44 ` Tejun Heo
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-06 10:44 ` [PATCH 12/12] sched: Cleanup the sched_change NOCLOCK usage Peter Zijlstra
` (5 subsequent siblings)
16 siblings, 2 replies; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
In preparation to adding more rules to __task_rq_lock(), such that
__task_rq_unlock() will no longer be requivalent to rq_unlock(),
make sure every __task_rq_lock() is matched by a __task_rq_unlock()
and vice-versa.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 13 ++++++++-----
kernel/sched/sched.h | 8 ++++----
kernel/sched/stats.h | 2 +-
3 files changed, 13 insertions(+), 10 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2582,7 +2582,8 @@ static int migration_cpu_stop(void *data
*/
WARN_ON_ONCE(!pending->stop_pending);
preempt_disable();
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
preempt_enable();
@@ -2591,7 +2592,8 @@ static int migration_cpu_stop(void *data
out:
if (pending)
pending->stop_pending = false;
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
if (complete)
complete_all(&pending->done);
@@ -3708,7 +3710,7 @@ static int ttwu_runnable(struct task_str
ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -4301,7 +4303,7 @@ int task_call_func(struct task_struct *p
ret = func(p, arg);
if (rq)
- rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
@@ -7364,7 +7366,8 @@ void rt_mutex_setprio(struct task_struct
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_rq_unlock(rq);
+ rq_repin_lock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
preempt_enable();
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1825,7 +1825,8 @@ struct rq *task_rq_lock(struct task_stru
__acquires(p->pi_lock)
__acquires(rq->lock);
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void
+__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
@@ -1837,8 +1838,7 @@ task_rq_unlock(struct rq *rq, struct tas
__releases(rq->lock)
__releases(p->pi_lock)
{
- rq_unpin_lock(rq, rf);
- raw_spin_rq_unlock(rq);
+ __task_rq_unlock(rq, p, rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1849,7 +1849,7 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct
DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
_T->rq = __task_rq_lock(_T->lock, &_T->rf),
- __task_rq_unlock(_T->rq, &_T->rf),
+ __task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(stru
rq = __task_rq_lock(p, &rf);
psi_task_change(p, p->psi_flags, 0);
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
}
}
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 11/12] sched: Match __task_rq_{,un}lock()
2025-10-06 10:44 ` [PATCH 11/12] sched: Match __task_rq_{,un}lock() Peter Zijlstra
@ 2025-10-07 20:44 ` Tejun Heo
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: Tejun Heo @ 2025-10-07 20:44 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
On Mon, Oct 06, 2025 at 12:44:13PM +0200, Peter Zijlstra wrote:
> In preparation to adding more rules to __task_rq_lock(), such that
> __task_rq_unlock() will no longer be requivalent to rq_unlock(),
^
typo
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Match __task_rq_{,un}lock()
2025-10-06 10:44 ` [PATCH 11/12] sched: Match __task_rq_{,un}lock() Peter Zijlstra
2025-10-07 20:44 ` Tejun Heo
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
1 sibling, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: Peter Zijlstra (Intel), Juri Lelli, Tejun Heo, Vincent Guittot,
x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 5892cbd85dbf9059b8a3a7dd8ab64c0fce671029
Gitweb: https://git.kernel.org/tip/5892cbd85dbf9059b8a3a7dd8ab64c0fce671029
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 25 Sep 2025 11:26:22 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:54 +02:00
sched: Match __task_rq_{,un}lock()
In preparation to adding more rules to __task_rq_lock(), such that
__task_rq_unlock() will no longer be equivalent to rq_unlock(),
make sure every __task_rq_lock() is matched by a __task_rq_unlock()
and vice-versa.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 13 ++++++++-----
kernel/sched/sched.h | 8 ++++----
kernel/sched/stats.h | 2 +-
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8c55740..e715147 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2582,7 +2582,8 @@ static int migration_cpu_stop(void *data)
*/
WARN_ON_ONCE(!pending->stop_pending);
preempt_disable();
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
preempt_enable();
@@ -2591,7 +2592,8 @@ static int migration_cpu_stop(void *data)
out:
if (pending)
pending->stop_pending = false;
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
if (complete)
complete_all(&pending->done);
@@ -3708,7 +3710,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -4301,7 +4303,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
ret = func(p, arg);
if (rq)
- rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
@@ -7362,7 +7364,8 @@ out_unlock:
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_rq_unlock(rq);
+ rq_repin_lock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
preempt_enable();
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3462145..e3d2710 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1825,7 +1825,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
-static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void
+__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
@@ -1837,8 +1838,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
- rq_unpin_lock(rq, rf);
- raw_spin_rq_unlock(rq);
+ __task_rq_unlock(rq, p, rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1849,7 +1849,7 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
_T->rq = __task_rq_lock(_T->lock, &_T->rf),
- __task_rq_unlock(_T->rq, &_T->rf),
+ __task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 26f3fd4..cbf7206 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
rq = __task_rq_lock(p, &rf);
psi_task_change(p, p->psi_flags, 0);
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
}
}
^ permalink raw reply related [flat|nested] 74+ messages in thread
* [PATCH 12/12] sched: Cleanup the sched_change NOCLOCK usage
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (10 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 11/12] sched: Match __task_rq_{,un}lock() Peter Zijlstra
@ 2025-10-06 10:44 ` Peter Zijlstra
2025-10-16 9:33 ` [tip: sched/core] " tip-bot2 for Peter Zijlstra
2025-10-07 8:25 ` [PATCH 00/12] sched: Cleanup the change-pattern and related locking Andrea Righi
` (4 subsequent siblings)
16 siblings, 1 reply; 74+ messages in thread
From: Peter Zijlstra @ 2025-10-06 10:44 UTC (permalink / raw)
To: tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx, K Prateek Nayak
Teach the sched_change pattern how to do update_rq_clock(); this
allows for some simplifications / cleanups.
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 33 +++++++++++----------------------
kernel/sched/ext.c | 8 ++------
kernel/sched/syscalls.c | 8 ++------
3 files changed, 15 insertions(+), 34 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2346,10 +2346,8 @@ static void migrate_disable_switch(struc
if (p->cpus_ptr != &p->cpus_mask)
return;
- scoped_guard (task_rq_lock, p) {
- update_rq_clock(scope.rq);
+ scoped_guard (task_rq_lock, p)
do_set_cpus_allowed(p, &ac);
- }
}
void ___migrate_enable(void)
@@ -2666,9 +2664,7 @@ void set_cpus_allowed_common(struct task
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK;
-
- scoped_guard (sched_change, p, flags) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
p->sched_class->set_cpus_allowed(p, ctx);
mm_set_cpus_allowed(p->mm, ctx->new_mask);
}
@@ -2690,10 +2686,8 @@ void set_cpus_allowed_force(struct task_
struct rcu_head rcu;
};
- scoped_guard (__task_rq_lock, p) {
- update_rq_clock(scope.rq);
+ scoped_guard (__task_rq_lock, p)
do_set_cpus_allowed(p, &ac);
- }
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -9110,16 +9104,13 @@ static void sched_change_group(struct ta
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- unsigned int queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
scoped_guard (sched_change, tsk, queue_flags) {
sched_change_group(tsk);
if (!for_autogroup)
@@ -10794,19 +10785,17 @@ struct sched_change_ctx *sched_change_be
lockdep_assert_rq_held(rq);
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
if (flags & DEQUEUE_CLASS) {
if (WARN_ON_ONCE(flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)))
flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (p->sched_class->switching_from) {
- /*
- * switching_from_fair() assumes CLASS implies NOCLOCK;
- * fixing this assumption would mean switching_from()
- * would need to be able to change flags.
- */
- WARN_ON(!(flags & DEQUEUE_NOCLOCK));
+ if (p->sched_class->switching_from)
p->sched_class->switching_from(rq, p);
- }
}
*ctx = (struct sched_change_ctx){
@@ -10845,7 +10834,7 @@ void sched_change_end(struct sched_chang
p->sched_class->switching_to(rq, p);
if (ctx->queued)
- enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags);
if (ctx->running)
set_next_task(rq, p);
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3912,13 +3912,11 @@ static void scx_disable_workfn(struct kt
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
- unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
- update_rq_clock(task_rq(p));
-
if (old_class != new_class) {
queue_flags |= DEQUEUE_CLASS;
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
@@ -4656,7 +4654,7 @@ static int scx_enable(struct sched_ext_o
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
- unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
@@ -4664,8 +4662,6 @@ static int scx_enable(struct sched_ext_o
if (!tryget_task_struct(p))
continue;
- update_rq_clock(task_rq(p));
-
if (old_class != new_class) {
queue_flags |= DEQUEUE_CLASS;
queue_flags &= ~(DEQUEUE_SAVE | DEQUEUE_MOVE);
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_st
void set_user_nice(struct task_struct *p, long nice)
{
- struct rq *rq;
int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -73,10 +72,7 @@ void set_user_nice(struct task_struct *p
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- CLASS(task_rq_lock, rq_guard)(p);
- rq = rq_guard.rq;
-
- update_rq_clock(rq);
+ guard(task_rq_lock)(p);
/*
* The RT priorities are set via sched_setscheduler(), but we still
@@ -89,7 +85,7 @@ void set_user_nice(struct task_struct *p
return;
}
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
^ permalink raw reply [flat|nested] 74+ messages in thread* [tip: sched/core] sched: Cleanup the sched_change NOCLOCK usage
2025-10-06 10:44 ` [PATCH 12/12] sched: Cleanup the sched_change NOCLOCK usage Peter Zijlstra
@ 2025-10-16 9:33 ` tip-bot2 for Peter Zijlstra
0 siblings, 0 replies; 74+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2025-10-16 9:33 UTC (permalink / raw)
To: linux-tip-commits
Cc: K Prateek Nayak, Peter Zijlstra (Intel), Juri Lelli, Tejun Heo,
Vincent Guittot, x86, linux-kernel
The following commit has been merged into the sched/core branch of tip:
Commit-ID: d4c64207b88a60dd15a38c790bb73c0b6f9a8c40
Gitweb: https://git.kernel.org/tip/d4c64207b88a60dd15a38c790bb73c0b6f9a8c40
Author: Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Sep 2025 12:09:19 +02:00
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 16 Oct 2025 11:13:54 +02:00
sched: Cleanup the sched_change NOCLOCK usage
Teach the sched_change pattern how to do update_rq_clock(); this
allows for some simplifications / cleanups.
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/core.c | 33 +++++++++++----------------------
kernel/sched/ext.c | 4 +---
kernel/sched/syscalls.c | 8 ++------
3 files changed, 14 insertions(+), 31 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e715147..3d5659f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2346,10 +2346,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
if (p->cpus_ptr != &p->cpus_mask)
return;
- scoped_guard (task_rq_lock, p) {
- update_rq_clock(scope.rq);
+ scoped_guard (task_rq_lock, p)
do_set_cpus_allowed(p, &ac);
- }
}
void ___migrate_enable(void)
@@ -2666,9 +2664,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- u32 flags = DEQUEUE_SAVE | DEQUEUE_NOCLOCK;
-
- scoped_guard (sched_change, p, flags) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
p->sched_class->set_cpus_allowed(p, ctx);
mm_set_cpus_allowed(p->mm, ctx->new_mask);
}
@@ -2690,10 +2686,8 @@ void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mas
struct rcu_head rcu;
};
- scoped_guard (__task_rq_lock, p) {
- update_rq_clock(scope.rq);
+ scoped_guard (__task_rq_lock, p)
do_set_cpus_allowed(p, &ac);
- }
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -9108,16 +9102,13 @@ static void sched_change_group(struct task_struct *tsk)
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- unsigned int queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
scoped_guard (sched_change, tsk, queue_flags) {
sched_change_group(tsk);
if (!for_autogroup)
@@ -10792,16 +10783,14 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
lockdep_assert_rq_held(rq);
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
if (flags & DEQUEUE_CLASS) {
- if (p->sched_class->switching_from) {
- /*
- * switching_from_fair() assumes CLASS implies NOCLOCK;
- * fixing this assumption would mean switching_from()
- * would need to be able to change flags.
- */
- WARN_ON(!(flags & DEQUEUE_NOCLOCK));
+ if (p->sched_class->switching_from)
p->sched_class->switching_from(rq, p);
- }
}
*ctx = (struct sched_change_ctx){
@@ -10840,7 +10829,7 @@ void sched_change_end(struct sched_change_ctx *ctx)
p->sched_class->switching_to(rq, p);
if (ctx->queued)
- enqueue_task(rq, p, ctx->flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags);
if (ctx->running)
set_next_task(rq, p);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ad371b6..5717042 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4654,7 +4654,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
percpu_down_write(&scx_fork_rwsem);
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
- unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
const struct sched_class *new_class =
__setscheduler_class(p->policy, p->prio);
@@ -4662,8 +4662,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!tryget_task_struct(p))
continue;
- update_rq_clock(task_rq(p));
-
if (old_class != new_class)
queue_flags |= DEQUEUE_CLASS;
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 20af564..8f0f603 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -64,7 +64,6 @@ static int effective_prio(struct task_struct *p)
void set_user_nice(struct task_struct *p, long nice)
{
- struct rq *rq;
int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -73,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- CLASS(task_rq_lock, rq_guard)(p);
- rq = rq_guard.rq;
-
- update_rq_clock(rq);
+ guard(task_rq_lock)(p);
/*
* The RT priorities are set via sched_setscheduler(), but we still
@@ -89,7 +85,7 @@ void set_user_nice(struct task_struct *p, long nice)
return;
}
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p, true);
old_prio = p->prio;
^ permalink raw reply related [flat|nested] 74+ messages in thread
* Re: [PATCH 00/12] sched: Cleanup the change-pattern and related locking
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (11 preceding siblings ...)
2025-10-06 10:44 ` [PATCH 12/12] sched: Cleanup the sched_change NOCLOCK usage Peter Zijlstra
@ 2025-10-07 8:25 ` Andrea Righi
2025-10-07 9:55 ` Juri Lelli
` (3 subsequent siblings)
16 siblings, 0 replies; 74+ messages in thread
From: Andrea Righi @ 2025-10-07 8:25 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, changwoo, cgroups, sched-ext, liuwenfang,
tglx
Hi Peter,
On Mon, Oct 06, 2025 at 12:44:02PM +0200, Peter Zijlstra wrote:
>
> Hi,
>
> These patches clean up the scheduler 'change' pattern and related locking
> some. They are the less controversial bit of some proposed sched_ext changes
> and stand on their own.
>
> I would like to queue them into sched/core after the merge window.
They all look like sane changes to me. I've also stress-tested this quite a bit,
even with the sched_ext dl_server patch set applied, and everything appears
to work well.
Therefore, from a sched_ext perspective:
Acked-by: Andrea Righi <arighi@nvidia.com>
Thanks!
-Andrea
>
>
> Also in:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/cleanup
>
> ---
> include/linux/cleanup.h | 5 +
> include/linux/sched.h | 4 +-
> kernel/cgroup/cpuset.c | 2 +-
> kernel/kthread.c | 15 +--
> kernel/sched/core.c | 327 ++++++++++++++++++-----------------------------
> kernel/sched/deadline.c | 20 +--
> kernel/sched/ext.c | 47 +++----
> kernel/sched/fair.c | 15 ++-
> kernel/sched/idle.c | 9 +-
> kernel/sched/rt.c | 7 +-
> kernel/sched/sched.h | 198 ++++++++++++++++++++--------
> kernel/sched/stats.h | 2 +-
> kernel/sched/stop_task.c | 9 +-
> kernel/sched/syscalls.c | 84 ++++--------
> 14 files changed, 373 insertions(+), 371 deletions(-)
>
>
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 00/12] sched: Cleanup the change-pattern and related locking
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (12 preceding siblings ...)
2025-10-07 8:25 ` [PATCH 00/12] sched: Cleanup the change-pattern and related locking Andrea Righi
@ 2025-10-07 9:55 ` Juri Lelli
2025-10-07 15:23 ` Vincent Guittot
` (2 subsequent siblings)
16 siblings, 0 replies; 74+ messages in thread
From: Juri Lelli @ 2025-10-07 9:55 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, vincent.guittot, dietmar.eggemann,
rostedt, bsegall, mgorman, vschneid, longman, hannes, mkoutny,
void, arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
Hi Peter,
On 06/10/25 12:44, Peter Zijlstra wrote:
>
> Hi,
>
> These patches clean up the scheduler 'change' pattern and related locking
> some. They are the less controversial bit of some proposed sched_ext changes
> and stand on their own.
>
> I would like to queue them into sched/core after the merge window.
The set looks good to me.
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
For the DEADLINE bits also
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Best,
Juri
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 00/12] sched: Cleanup the change-pattern and related locking
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (13 preceding siblings ...)
2025-10-07 9:55 ` Juri Lelli
@ 2025-10-07 15:23 ` Vincent Guittot
2025-10-07 20:46 ` Tejun Heo
2025-10-08 13:54 ` Valentin Schneider
16 siblings, 0 replies; 74+ messages in thread
From: Vincent Guittot @ 2025-10-07 15:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: tj, linux-kernel, mingo, juri.lelli, dietmar.eggemann, rostedt,
bsegall, mgorman, vschneid, longman, hannes, mkoutny, void,
arighi, changwoo, cgroups, sched-ext, liuwenfang, tglx
On Mon, 6 Oct 2025 at 12:46, Peter Zijlstra <peterz@infradead.org> wrote:
>
>
> Hi,
>
> These patches clean up the scheduler 'change' pattern and related locking
> some. They are the less controversial bit of some proposed sched_ext changes
> and stand on their own.
>
> I would like to queue them into sched/core after the merge window.
Acked-by: Vincent Guittot <vincent.guittot@linaro.org> for the series
in addition to the reviewed-by for patch 4
>
>
> Also in:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/cleanup
>
> ---
> include/linux/cleanup.h | 5 +
> include/linux/sched.h | 4 +-
> kernel/cgroup/cpuset.c | 2 +-
> kernel/kthread.c | 15 +--
> kernel/sched/core.c | 327 ++++++++++++++++++-----------------------------
> kernel/sched/deadline.c | 20 +--
> kernel/sched/ext.c | 47 +++----
> kernel/sched/fair.c | 15 ++-
> kernel/sched/idle.c | 9 +-
> kernel/sched/rt.c | 7 +-
> kernel/sched/sched.h | 198 ++++++++++++++++++++--------
> kernel/sched/stats.h | 2 +-
> kernel/sched/stop_task.c | 9 +-
> kernel/sched/syscalls.c | 84 ++++--------
> 14 files changed, 373 insertions(+), 371 deletions(-)
>
>
>
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 00/12] sched: Cleanup the change-pattern and related locking
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (14 preceding siblings ...)
2025-10-07 15:23 ` Vincent Guittot
@ 2025-10-07 20:46 ` Tejun Heo
2025-10-08 13:54 ` Valentin Schneider
16 siblings, 0 replies; 74+ messages in thread
From: Tejun Heo @ 2025-10-07 20:46 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, vschneid, longman,
hannes, mkoutny, void, arighi, changwoo, cgroups, sched-ext,
liuwenfang, tglx
On Mon, Oct 06, 2025 at 12:44:02PM +0200, Peter Zijlstra wrote:
>
> Hi,
>
> These patches clean up the scheduler 'change' pattern and related locking
> some. They are the less controversial bit of some proposed sched_ext changes
> and stand on their own.
>
> I would like to queue them into sched/core after the merge window.
FWIW, all look good to me.
Acked-by: Tejun Heo <tj@kernel.org>
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 74+ messages in thread* Re: [PATCH 00/12] sched: Cleanup the change-pattern and related locking
2025-10-06 10:44 [PATCH 00/12] sched: Cleanup the change-pattern and related locking Peter Zijlstra
` (15 preceding siblings ...)
2025-10-07 20:46 ` Tejun Heo
@ 2025-10-08 13:54 ` Valentin Schneider
16 siblings, 0 replies; 74+ messages in thread
From: Valentin Schneider @ 2025-10-08 13:54 UTC (permalink / raw)
To: Peter Zijlstra, tj
Cc: linux-kernel, peterz, mingo, juri.lelli, vincent.guittot,
dietmar.eggemann, rostedt, bsegall, mgorman, longman, hannes,
mkoutny, void, arighi, changwoo, cgroups, sched-ext, liuwenfang,
tglx
On 06/10/25 12:44, Peter Zijlstra wrote:
> Hi,
>
> These patches clean up the scheduler 'change' pattern and related locking
> some. They are the less controversial bit of some proposed sched_ext changes
> and stand on their own.
>
> I would like to queue them into sched/core after the merge window.
>
>
> Also in:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/cleanup
>
Other than what's already been said, that LGTM. It's good to finally have a
canonical change pattern... pattern? :-)
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
^ permalink raw reply [flat|nested] 74+ messages in thread