From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org
Cc: longman@redhat.com, chenridong@huaweicloud.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org,
mkoutny@suse.com, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org, jstultz@google.com,
kprateek.nayak@amd.com, qyousef@layalina.io
Subject: [PATCH v3 3/7] sched/fair: Add cgroup_mode: max
Date: Fri, 05 Jun 2026 14:40:16 +0200 [thread overview]
Message-ID: <20260605124051.589618504@infradead.org> (raw)
In-Reply-To: 20260605105513.354837583@infradead.org
In order to avoid the average CPU fraction avg(F_g_n) becoming tiny '1/N',
assume each cgroup is maximally concurrent and distrubute 'N*weight', such
that:
F_g_n' = N * F_g_n
Giving:
avg(F_g_n') = N*avg(F_g_n) ~ N * 1/N = 1
And while this sounds like it solves things, remember what that ~ meant. There
is the corner case when a cgroup is minimally loaded, eg a single runnable
task, therefore limit the CPU fraction to that of a nice -20 task to avoid
getting too much load.
This last bit is what makes it different from a previous proposal to allow
raising cpu.weight to '100 * N', that would not limit the mininal concurrency
case and results in a very large F_g_n. And just like F_g_n << 1 is
problematic, so is F_g_n >> 1 for the exact same reasons (it would drown the
kthreads, but it also risks overflowing the load values).
So while this might appear to be a better scheme than the current default
scheme, it doesn't really handle less than maximal concurrency nicely -- it
clips and introduces artificially large weights. So where the traditional SMP
mode works well when nr_tasks << nr_cpus, MAX doesn't work well in that regime
and vice-versa.
The meaning of "cpu.weight" would be: weight per allowed CPU.
Included for completeness (and infrastructure).
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/cpuset.h | 6 +++++
kernel/cgroup/cpuset.c | 15 ++++++++++++++
kernel/sched/debug.c | 1
kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 69 insertions(+), 5 deletions(-)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -80,6 +80,7 @@ extern void lockdep_assert_cpuset_lock_h
extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern int cpuset_num_cpus(struct cgroup *cgroup);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
@@ -216,6 +217,11 @@ static inline bool cpuset_cpus_allowed_f
return false;
}
+static inline int cpuset_num_cpus(struct cgroup *cgroup)
+{
+ return num_online_cpus();
+}
+
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
return node_possible_map;
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4116,6 +4116,21 @@ bool cpuset_cpus_allowed_fallback(struct
return changed;
}
+int cpuset_num_cpus(struct cgroup *cgrp)
+{
+ int nr = num_online_cpus();
+ struct cpuset *cs;
+
+ if (is_in_v2_mode()) {
+ guard(rcu)();
+ cs = css_cs(cgroup_e_css(cgrp, &cpuset_cgrp_subsys));
+ if (cs)
+ nr = cpumask_weight(cs->effective_cpus);
+ }
+
+ return nr;
+}
+
void __init cpuset_init_current_mems_allowed(void)
{
nodes_setall(current->mems_allowed);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -641,6 +641,7 @@ static int cgroup_mode = 1;
static const char *cgroup_mode_str[] = {
"up",
"smp",
+ "max",
};
static int sched_cgroup_mode(const char *str)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4801,12 +4801,10 @@ static inline int throttled_hierarchy(st
*
* hence icky!
*/
-static long calc_smp(struct cfs_rq *cfs_rq)
+static long __calc_smp_shares(struct cfs_rq *cfs_rq, long tg_shares, long shares_max)
{
- long tg_weight, tg_shares, load, shares;
struct task_group *tg = cfs_rq->tg;
-
- tg_shares = READ_ONCE(tg->shares);
+ long tg_weight, load, shares;
load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
@@ -4832,7 +4830,48 @@ static long calc_smp(struct cfs_rq *cfs_
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- return clamp_t(long, shares, MIN_SHARES, tg_shares);
+ return clamp_t(long, shares, MIN_SHARES, shares_max);
+}
+
+static int tg_cpus(struct task_group *tg)
+{
+ int nr = num_online_cpus();
+
+ if (cpusets_enabled()) {
+ struct cgroup *cgrp = tg->css.cgroup;
+ if (cgrp)
+ nr = cpuset_num_cpus(cgrp);
+ }
+
+ return nr;
+}
+
+/*
+ * Func: min(fraction(nr_cpus * tg->shares), nice -20)
+ *
+ * Scale tg->shares by the maximal number of CPUs; but clip the max shares at
+ * nice -20, otherwise a single spinner on a 512 CPU machine would result in
+ * 512*NICE_0_LOAD, which is also crazy.
+ */
+static long calc_max_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ int nr = tg_cpus(tg);
+ long tg_shares = READ_ONCE(tg->shares);
+ long max_shares = scale_load(sched_prio_to_weight[0]);
+ return __calc_smp_shares(cfs_rq, tg_shares * nr, max_shares);
+}
+
+/*
+ * Func: fraction(tg->shares)
+ *
+ * This infamously results in tiny shares when you have many CPUs.
+ */
+static long calc_smp_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long tg_shares = READ_ONCE(tg->shares);
+ return __calc_smp_shares(cfs_rq, tg_shares, tg_shares);
}
/*
@@ -4857,6 +4896,9 @@ void __sched_cgroup_mode_update(int mode
default:
func = &calc_smp_shares;
break;
+ case 2:
+ func = &calc_max_shares;
+ break;
}
static_call_update(calc_group_shares, func);
}
next prev parent reply other threads:[~2026-06-05 12:43 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-05 12:40 [PATCH v3 0/7] sched: Flatten the pick Peter Zijlstra
2026-06-05 12:40 ` [PATCH v3 1/7] sched/fair: Add cgroup_mode switch Peter Zijlstra
2026-06-05 12:40 ` [PATCH v3 2/7] sched/fair: Add cgroup_mode: up Peter Zijlstra
2026-06-05 15:07 ` Peter Zijlstra
2026-06-05 12:40 ` Peter Zijlstra [this message]
2026-06-10 15:09 ` [PATCH v3 3/7] sched/fair: Add cgroup_mode: max Waiman Long
2026-06-10 15:42 ` Waiman Long
2026-06-11 13:49 ` Peter Zijlstra
2026-06-11 13:47 ` Peter Zijlstra
2026-06-11 20:57 ` Waiman Long
2026-06-05 12:40 ` [PATCH v3 4/7] sched/fair: Add cgroup_mode: concur Peter Zijlstra
2026-06-05 12:40 ` [PATCH v3 5/7] sched/fair: Add cgroup_mode: tasks Peter Zijlstra
2026-06-05 12:40 ` [PATCH v3 6/7] sched/fair: Change the default cgroup_mode to concur Peter Zijlstra
2026-06-05 12:40 ` [PATCH v3 7/7] sched/eevdf: Move to a single runqueue Peter Zijlstra
2026-06-09 5:37 ` [PATCH v3 0/7] sched: Flatten the pick K Prateek Nayak
2026-06-12 2:29 ` Shubhang Kaushik
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260605124051.589618504@infradead.org \
--to=peterz@infradead.org \
--cc=bsegall@google.com \
--cc=cgroups@vger.kernel.org \
--cc=chenridong@huaweicloud.com \
--cc=dietmar.eggemann@arm.com \
--cc=hannes@cmpxchg.org \
--cc=jstultz@google.com \
--cc=juri.lelli@redhat.com \
--cc=kprateek.nayak@amd.com \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=mgorman@suse.de \
--cc=mingo@kernel.org \
--cc=mkoutny@suse.com \
--cc=qyousef@layalina.io \
--cc=rostedt@goodmis.org \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox