From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org
Cc: longman@redhat.com, chenridong@huaweicloud.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org,
mkoutny@suse.com, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org, jstultz@google.com,
kprateek.nayak@amd.com, qyousef@layalina.io
Subject: [PATCH v2 09/10] sched: Remove sched_class::pick_next_task()
Date: Mon, 11 May 2026 13:31:13 +0200 [thread overview]
Message-ID: <20260511120628.057634261@infradead.org> (raw)
In-Reply-To: 20260511113104.563854162@infradead.org
The reason for pick_next_task_fair() is the put/set optimization that
avoids touching the common ancestors. However, it is possible to
implement this in the put_prev_task() and set_next_task() calls as
used in put_prev_set_next_task().
Notably, put_prev_set_next_task() is the only site that:
- calls put_prev_task() with a .next argument;
- calls set_next_task() with .first = true.
This means that put_prev_task() can determine the common hierarchy and
stop there, and then set_next_task() can terminate where put_prev_task
stopped.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 27 +++------
kernel/sched/fair.c | 139 +++++++++++++++++----------------------------------
kernel/sched/sched.h | 14 -----
3 files changed, 57 insertions(+), 123 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5980,16 +5980,15 @@ __pick_next_task(struct rq *rq, struct t
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_queued)) {
- p = pick_next_task_fair(rq, prev, rf);
+ p = pick_task_fair(rq, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assume the next prioritized class is idle_sched_class */
- if (!p) {
+ if (!p)
p = pick_task_idle(rq, rf);
- put_prev_set_next_task(rq, prev, p);
- }
+ put_prev_set_next_task(rq, prev, p);
return p;
}
@@ -5997,20 +5996,12 @@ __pick_next_task(struct rq *rq, struct t
prev_balance(rq, prev, rf);
for_each_active_class(class) {
- if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev, rf);
- if (unlikely(p == RETRY_TASK))
- goto restart;
- if (p)
- return p;
- } else {
- p = class->pick_task(rq, rf);
- if (unlikely(p == RETRY_TASK))
- goto restart;
- if (p) {
- put_prev_set_next_task(rq, prev, p);
- return p;
- }
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
+ if (p) {
+ put_prev_set_next_task(rq, prev, p);
+ return p;
}
}
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9214,7 +9214,7 @@ static void wakeup_preempt_fair(struct r
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
+struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
struct sched_entity *se;
@@ -9257,72 +9257,6 @@ static struct task_struct *pick_task_fai
return NULL;
}
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-
-struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
- __must_hold(__rq_lockp(rq))
-{
- struct sched_entity *se;
- struct task_struct *p;
-
- p = pick_task_fair(rq, rf);
- if (unlikely(p == RETRY_TASK))
- return p;
- if (!p)
- return p;
- se = &p->se;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
- goto simple;
-
- __put_prev_set_next_dl_server(rq, prev, p);
-
- /*
- * Because of the set_next_buddy() in dequeue_task_fair() it is rather
- * likely that a next task is from the same cgroup as the current.
- *
- * Therefore attempt to avoid putting and setting the entire cgroup
- * hierarchy, only change the part that actually changes.
- *
- * Since we haven't yet done put_prev_entity and if the selected task
- * is a different task than we started out with, try and touch the
- * least amount of cfs_rqs.
- */
- if (prev != p) {
- struct sched_entity *pse = &prev->se;
- struct cfs_rq *cfs_rq;
-
- while (!(cfs_rq = is_same_group(se, pse))) {
- int se_depth = se->depth;
- int pse_depth = pse->depth;
-
- if (se_depth <= pse_depth) {
- put_prev_entity(cfs_rq_of(pse), pse);
- pse = parent_entity(pse);
- }
- if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se, true);
- se = parent_entity(se);
- }
- }
-
- put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se, true);
-
- __set_next_task_fair(rq, p, true);
- }
-
- return p;
-
-simple:
-#endif /* CONFIG_FAIR_GROUP_SCHED */
- put_prev_set_next_task(rq, prev, p);
- return p;
-}
-
static struct task_struct *
fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
__must_hold(__rq_lockp(dl_se->rq))
@@ -9346,10 +9280,33 @@ static void put_prev_task_fair(struct rq
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
+ struct sched_entity *nse = NULL;
- for_each_sched_entity(se) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (next && next->sched_class == &fair_sched_class)
+ nse = &next->se;
+#endif
+
+ while (se) {
cfs_rq = cfs_rq_of(se);
- put_prev_entity(cfs_rq, se);
+ if (!nse || cfs_rq->curr)
+ put_prev_entity(cfs_rq, se);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (nse) {
+ if (is_same_group(se, nse))
+ break;
+
+ int d = nse->depth - se->depth;
+ if (d >= 0) {
+ /* nse has equal or greater depth, ascend */
+ nse = parent_entity(nse);
+ /* if nse is the deeper, do not ascend se */
+ if (d > 0)
+ continue;
+ }
+ }
+#endif
+ se = parent_entity(se);
}
}
@@ -13896,10 +13853,30 @@ static void switched_to_fair(struct rq *
}
}
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+/*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) &&
+ first && cfs_rq->curr)
+ break;
+
+ set_next_entity(cfs_rq, se, first);
+ /* ensure bandwidth has been allocated on our new cfs_rq */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ }
+
+ se = &p->se;
+
if (task_on_rq_queued(p)) {
/*
* Move the next running task to the front of the list, so our
@@ -13919,27 +13896,6 @@ static void __set_next_task_fair(struct
sched_fair_update_stop_tick(rq, p);
}
-/*
- * Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
-{
- struct sched_entity *se = &p->se;
-
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- set_next_entity(cfs_rq, se, first);
- /* ensure bandwidth has been allocated on our new cfs_rq */
- account_cfs_rq_runtime(cfs_rq, 0);
- }
-
- __set_next_task_fair(rq, p, first);
-}
-
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
@@ -14251,7 +14207,6 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
- .pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2555,17 +2555,6 @@ struct sched_class {
* schedule/pick_next_task: rq->lock
*/
struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
- /*
- * Optional! When implemented pick_next_task() should be equivalent to:
- *
- * next = pick_task();
- * if (next) {
- * put_prev_task(prev);
- * set_next_task_first(next);
- * }
- */
- struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
/*
* sched_change:
@@ -2789,8 +2778,7 @@ static inline bool sched_fair_runnable(s
return rq->cfs.nr_queued > 0;
}
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
+extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf);
extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
#define SCA_CHECK 0x01
next prev parent reply other threads:[~2026-05-11 12:07 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-11 11:31 [PATCH v2 00/10] sched: Flatten the pick Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 01/10] sched/debug: Use char * instead of char (*)[] Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 02/10] sched: Use {READ,WRITE}_ONCE() for preempt_dynamic_mode Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 03/10] sched/debug: Collapse subsequent CONFIG_SCHED_CLASS_EXT sections Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 04/10] sched/fair: Add cgroup_mode switch Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 05/10] sched/fair: Add cgroup_mode: UP Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 06/10] sched/fair: Add cgroup_mode: MAX Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 07/10] sched/fair: Add cgroup_mode: CONCUR Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 08/10] sched/fair: Add newidle balance to pick_task_fair() Peter Zijlstra
2026-05-12 5:37 ` K Prateek Nayak
2026-05-12 9:45 ` Peter Zijlstra
2026-05-11 11:31 ` Peter Zijlstra [this message]
2026-05-11 11:31 ` [PATCH v2 10/10] sched/eevdf: Move to a single runqueue Peter Zijlstra
2026-05-11 16:21 ` K Prateek Nayak
2026-05-12 11:09 ` Peter Zijlstra
2026-05-11 19:23 ` [PATCH v2 00/10] sched: Flatten the pick Tejun Heo
2026-05-12 8:10 ` Peter Zijlstra
2026-05-12 8:42 ` Vincent Guittot
2026-05-12 9:20 ` Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260511120628.057634261@infradead.org \
--to=peterz@infradead.org \
--cc=bsegall@google.com \
--cc=cgroups@vger.kernel.org \
--cc=chenridong@huaweicloud.com \
--cc=dietmar.eggemann@arm.com \
--cc=hannes@cmpxchg.org \
--cc=jstultz@google.com \
--cc=juri.lelli@redhat.com \
--cc=kprateek.nayak@amd.com \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=mgorman@suse.de \
--cc=mingo@kernel.org \
--cc=mkoutny@suse.com \
--cc=qyousef@layalina.io \
--cc=rostedt@goodmis.org \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox