From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org
Cc: longman@redhat.com, chenridong@huaweicloud.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org,
mkoutny@suse.com, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org, jstultz@google.com,
kprateek.nayak@amd.com, qyousef@layalina.io
Subject: [PATCH v2 09/10] sched: Remove sched_class::pick_next_task()
Date: Mon, 11 May 2026 13:31:13 +0200
Message-ID: <20260511120628.057634261@infradead.org>
In-Reply-To: <20260511113104.563854162@infradead.org>

The only reason pick_next_task_fair() exists is the put/set
optimization that avoids touching the cgroup ancestors common to prev
and next. However, it is equally possible to implement this
optimization in the put_prev_task() and set_next_task() calls as used
by put_prev_set_next_task().
Notably, put_prev_set_next_task() is the only site that:
- calls put_prev_task() with a .next argument;
- calls set_next_task() with .first = true.
This means that put_prev_task() can determine the common part of the
hierarchy and stop there, and set_next_task() can in turn terminate
where put_prev_task() stopped.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 27 +++------
kernel/sched/fair.c | 139 +++++++++++++++++----------------------------------
kernel/sched/sched.h | 14 -----
3 files changed, 57 insertions(+), 123 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5980,16 +5980,15 @@ __pick_next_task(struct rq *rq, struct t
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_queued)) {
- p = pick_next_task_fair(rq, prev, rf);
+ p = pick_task_fair(rq, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assume the next prioritized class is idle_sched_class */
- if (!p) {
+ if (!p)
p = pick_task_idle(rq, rf);
- put_prev_set_next_task(rq, prev, p);
- }
+ put_prev_set_next_task(rq, prev, p);
return p;
}
@@ -5997,20 +5996,12 @@ __pick_next_task(struct rq *rq, struct t
prev_balance(rq, prev, rf);
for_each_active_class(class) {
- if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev, rf);
- if (unlikely(p == RETRY_TASK))
- goto restart;
- if (p)
- return p;
- } else {
- p = class->pick_task(rq, rf);
- if (unlikely(p == RETRY_TASK))
- goto restart;
- if (p) {
- put_prev_set_next_task(rq, prev, p);
- return p;
- }
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
+ if (p) {
+ put_prev_set_next_task(rq, prev, p);
+ return p;
}
}
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9214,7 +9214,7 @@ static void wakeup_preempt_fair(struct r
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
+struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
__must_hold(__rq_lockp(rq))
{
struct sched_entity *se;
@@ -9257,72 +9257,6 @@ static struct task_struct *pick_task_fai
return NULL;
}
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
-
-struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
- __must_hold(__rq_lockp(rq))
-{
- struct sched_entity *se;
- struct task_struct *p;
-
- p = pick_task_fair(rq, rf);
- if (unlikely(p == RETRY_TASK))
- return p;
- if (!p)
- return p;
- se = &p->se;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
- goto simple;
-
- __put_prev_set_next_dl_server(rq, prev, p);
-
- /*
- * Because of the set_next_buddy() in dequeue_task_fair() it is rather
- * likely that a next task is from the same cgroup as the current.
- *
- * Therefore attempt to avoid putting and setting the entire cgroup
- * hierarchy, only change the part that actually changes.
- *
- * Since we haven't yet done put_prev_entity and if the selected task
- * is a different task than we started out with, try and touch the
- * least amount of cfs_rqs.
- */
- if (prev != p) {
- struct sched_entity *pse = &prev->se;
- struct cfs_rq *cfs_rq;
-
- while (!(cfs_rq = is_same_group(se, pse))) {
- int se_depth = se->depth;
- int pse_depth = pse->depth;
-
- if (se_depth <= pse_depth) {
- put_prev_entity(cfs_rq_of(pse), pse);
- pse = parent_entity(pse);
- }
- if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se, true);
- se = parent_entity(se);
- }
- }
-
- put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se, true);
-
- __set_next_task_fair(rq, p, true);
- }
-
- return p;
-
-simple:
-#endif /* CONFIG_FAIR_GROUP_SCHED */
- put_prev_set_next_task(rq, prev, p);
- return p;
-}
-
static struct task_struct *
fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
__must_hold(__rq_lockp(dl_se->rq))
@@ -9346,10 +9280,33 @@ static void put_prev_task_fair(struct rq
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
+ struct sched_entity *nse = NULL;
- for_each_sched_entity(se) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (next && next->sched_class == &fair_sched_class)
+ nse = &next->se;
+#endif
+
+ while (se) {
cfs_rq = cfs_rq_of(se);
- put_prev_entity(cfs_rq, se);
+ if (!nse || cfs_rq->curr)
+ put_prev_entity(cfs_rq, se);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (nse) {
+ if (is_same_group(se, nse))
+ break;
+
+ int d = nse->depth - se->depth;
+ if (d >= 0) {
+ /* nse has equal or greater depth, ascend */
+ nse = parent_entity(nse);
+ /* if nse is the deeper, do not ascend se */
+ if (d > 0)
+ continue;
+ }
+ }
+#endif
+ se = parent_entity(se);
}
}
@@ -13896,10 +13853,30 @@ static void switched_to_fair(struct rq *
}
}
-static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+/*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) &&
+ first && cfs_rq->curr)
+ break;
+
+ set_next_entity(cfs_rq, se, first);
+ /* ensure bandwidth has been allocated on our new cfs_rq */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ }
+
+ se = &p->se;
+
if (task_on_rq_queued(p)) {
/*
* Move the next running task to the front of the list, so our
@@ -13919,27 +13896,6 @@ static void __set_next_task_fair(struct
sched_fair_update_stop_tick(rq, p);
}
-/*
- * Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
-{
- struct sched_entity *se = &p->se;
-
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- set_next_entity(cfs_rq, se, first);
- /* ensure bandwidth has been allocated on our new cfs_rq */
- account_cfs_rq_runtime(cfs_rq, 0);
- }
-
- __set_next_task_fair(rq, p, first);
-}
-
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
@@ -14251,7 +14207,6 @@ DEFINE_SCHED_CLASS(fair) = {
.wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
- .pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2555,17 +2555,6 @@ struct sched_class {
* schedule/pick_next_task: rq->lock
*/
struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
- /*
- * Optional! When implemented pick_next_task() should be equivalent to:
- *
- * next = pick_task();
- * if (next) {
- * put_prev_task(prev);
- * set_next_task_first(next);
- * }
- */
- struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
/*
* sched_change:
@@ -2789,8 +2778,7 @@ static inline bool sched_fair_runnable(s
return rq->cfs.nr_queued > 0;
}
-extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf);
+extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf);
extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
#define SCA_CHECK 0x01