From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org
Cc: longman@redhat.com, chenridong@huaweicloud.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org,
mkoutny@suse.com, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org, jstultz@google.com,
kprateek.nayak@amd.com, qyousef@layalina.io
Subject: [PATCH v2 07/10] sched/fair: Add cgroup_mode: CONCUR
Date: Mon, 11 May 2026 13:31:11 +0200 [thread overview]
Message-ID: <20260511120627.829222375@infradead.org> (raw)
In-Reply-To: <20260511113104.563854162@infradead.org>
A variation of MAX; instead of assuming maximal concurrency, this scales
with 'min(nr_tasks, nr_cpus)'. This handles the low concurrency cases more
gracefully, with the exception of CPU affinity.
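For illustration, a stand-alone sketch of the scaling rule (simplified; the
in-kernel calc_concur_shares() below additionally distributes the result
across the runqueues via __calc_smp_shares(), and the MAX_SHARES constant
here stands in for scale_load(sched_prio_to_weight[0])):

#include <stdio.h>

#define MAX_SHARES	88761	/* weight of nice -20; sched_prio_to_weight[0] */

static long concur_shares(long tg_shares, int nr_tasks, int nr_cpus)
{
	/* scale with the actual concurrency, not with nr_cpus */
	int nr = nr_tasks < nr_cpus ? nr_tasks : nr_cpus;
	long shares = nr * tg_shares;

	/* clamp at the nice -20 weight, as with MAX */
	return shares < MAX_SHARES ? shares : MAX_SHARES;
}

int main(void)
{
	/* default weight cgroup, 2 runnable tasks, 8 CPUs */
	printf("concur: %ld\n", concur_shares(1024, 2, 8)); /* 2048 */
	/* with nr_tasks >= nr_cpus this degenerates to MAX */
	printf("max:    %ld\n", concur_shares(1024, 8, 8)); /* 8192 */
	return 0;
}

That is, a mostly-idle group no longer gets weighted as if it could occupy
every CPU.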
Note: the tracking of tg->tasks is somewhat expensive :-/ -- it adds another
atomic RMW on the shared task_group cacheline in the update_tg_load_avg()
path.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/debug.c | 1 +
kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++---
kernel/sched/sched.h | 3 +++
3 files changed, 40 insertions(+), 3 deletions(-)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -594,6 +594,7 @@ int cgroup_mode = 1;
static const char *cgroup_mode_str[] = {
"up",
"smp",
+ "concur",
"max",
};
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4211,6 +4211,30 @@ static long calc_max_shares(struct cfs_r
return __calc_smp_shares(cfs_rq, tg_shares * nr, max_shares);
}
+static inline int tg_tasks(struct task_group *tg)
+{
+ return max(1, atomic_long_read(&tg->tasks));
+}
+
+/*
+ * Func: min(fraction(num * tg->shares), nice -20); where
+ * num = min(nr_tasks, nr_cpus)
+ *
+ * Similar to MAX, except we scale with min(nr_tasks, nr_cpus), which gives
+ * a far more natural distribution. Edge cases can still be created using
+ * CPU affinity.
+ */
+static long calc_concur_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ int nr_cpus = tg_cpus(tg);
+ int nr_tasks = tg_tasks(tg);
+ int nr = min(nr_tasks, nr_cpus);
+ long tg_shares = READ_ONCE(tg->shares);
+ long max_shares = scale_load(sched_prio_to_weight[0]);
+ return __calc_smp_shares(cfs_rq, nr * tg_shares, max_shares);
+}
+
/*
* Func: fraction(tg->shares)
*
@@ -4240,6 +4264,9 @@ static long calc_group_shares(struct cfs
return calc_up_shares(cfs_rq);
if (mode == 2)
+ return calc_concur_shares(cfs_rq);
+
+ if (mode == 3)
return calc_max_shares(cfs_rq);
return calc_smp_shares(cfs_rq);
@@ -4385,7 +4412,7 @@ static inline bool cfs_rq_is_decayed(str
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
- long delta;
+ long delta, dt;
u64 now;
/*
@@ -4407,16 +4434,19 @@ static inline void update_tg_load_avg(st
return;
delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
- if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ dt = cfs_rq->h_nr_queued - cfs_rq->tg_tasks_contrib;
+ if (dt || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ atomic_long_add(dt, &cfs_rq->tg->tasks);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ cfs_rq->tg_tasks_contrib = cfs_rq->h_nr_queued;
cfs_rq->last_update_tg_load_avg = now;
}
}
static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
{
- long delta;
+ long delta, dt;
u64 now;
/*
@@ -4427,8 +4457,11 @@ static inline void clear_tg_load_avg(str
now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
delta = 0 - cfs_rq->tg_load_avg_contrib;
+ dt = 0 - cfs_rq->tg_tasks_contrib;
atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ atomic_long_add(dt, &cfs_rq->tg->tasks);
cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->tg_tasks_contrib = 0;
cfs_rq->last_update_tg_load_avg = now;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -491,6 +491,8 @@ struct task_group {
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
+ atomic_long_t tasks;
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -720,6 +722,7 @@ struct cfs_rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
u64 last_update_tg_load_avg;
unsigned long tg_load_avg_contrib;
+ unsigned long tg_tasks_contrib;
long propagate;
long prop_runnable_sum;
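For completeness, with the switch added in patch 04 the new mode should be
selectable at runtime; assuming it is exposed as a debugfs file (hypothetical
path, the exact knob comes from the earlier patch):

  # modes: 0=up, 1=smp (default), 2=concur, 3=max
  echo concur > /sys/kernel/debug/sched/cgroup_mode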