From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org
Cc: longman@redhat.com, chenridong@huaweicloud.com,
peterz@infradead.org, juri.lelli@redhat.com,
vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org,
mkoutny@suse.com, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org, jstultz@google.com,
kprateek.nayak@amd.com
Subject: [RFC][PATCH 4/8] sched/fair: Add cgroup_mode: MAX
Date: Tue, 17 Mar 2026 10:51:17 +0100 [thread overview]
Message-ID: <20260317104342.815599388@infradead.org> (raw)
In-Reply-To: 20260317095113.387450089@infradead.org
In order to avoid the CPU shares becoming tiny '1 / nr_cpus', assume each
cgroup is maximally concurrent and distribute 'nr_cpus * tg->shares',
such that each CPU ends up with a 'tg->shares' sized fraction (on
average).
There is a corner case when a cgroup is minimally loaded, e.g. a
single spinner; therefore limit the CPU shares to that of a nice -20
task to avoid getting too much load.
It was previously suggested to allow raising cpu.weight to '100 * nr_cpus'
to combat this same problem, but that runs into the above corner case:
admitting multiple cgroups with such immense weight onto the runqueue has
significant problems.
It would drown the kthreads, and it also risks overflowing the load values.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/cpuset.h | 6 +++++
kernel/cgroup/cpuset.c | 15 ++++++++++++++
kernel/sched/debug.c | 1
kernel/sched/fair.c | 50 ++++++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 67 insertions(+), 5 deletions(-)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -80,6 +80,7 @@ extern void lockdep_assert_cpuset_lock_h
extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern int cpuset_num_cpus(struct cgroup *cgroup);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
@@ -216,6 +217,11 @@ static inline bool cpuset_cpus_allowed_f
return false;
}
+static inline int cpuset_num_cpus(struct cgroup *cgroup)
+{
+ return num_online_cpus();
+}
+
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
return node_possible_map;
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4097,6 +4097,21 @@ bool cpuset_cpus_allowed_fallback(struct
return changed;
}
+int cpuset_num_cpus(struct cgroup *cgrp)
+{
+ int nr = num_online_cpus();
+ struct cpuset *cs;
+
+ if (is_in_v2_mode()) {
+ guard(rcu)();
+ cs = css_cs(cgroup_e_css(cgrp, &cpuset_cgrp_subsys));
+ if (cs)
+ nr = cpumask_weight(cs->effective_cpus);
+ }
+
+ return nr;
+}
+
void __init cpuset_init_current_mems_allowed(void)
{
nodes_setall(current->mems_allowed);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -593,6 +593,7 @@ int cgroup_mode = 1;
static const char *cgroup_mode_str[] = {
"up",
"smp",
+ "max",
};
static int sched_cgroup_mode(const char *str)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4150,12 +4150,10 @@ static inline int throttled_hierarchy(st
*
* hence icky!
*/
-static long calc_smp_shares(struct cfs_rq *cfs_rq)
+static long __calc_smp_shares(struct cfs_rq *cfs_rq, long tg_shares, long shares_max)
{
- long tg_weight, tg_shares, load, shares;
struct task_group *tg = cfs_rq->tg;
-
- tg_shares = READ_ONCE(tg->shares);
+ long tg_weight, load, shares;
load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
@@ -4181,7 +4179,47 @@ static long calc_smp_shares(struct cfs_r
* case no task is runnable on a CPU MIN_SHARES=2 should be returned
* instead of 0.
*/
- return clamp_t(long, shares, MIN_SHARES, tg_shares);
+ return clamp_t(long, shares, MIN_SHARES, shares_max);
+}
+
+static int tg_cpus(struct task_group *tg)
+{
+ int nr = num_online_cpus();
+
+ if (cpusets_enabled()) {
+ struct cgroup *cgrp = tg->css.cgroup;
+ nr = cpuset_num_cpus(cgrp);
+ }
+
+ return nr;
+}
+
+/*
+ * Func: min(fraction(num_cpus * tg->shares), nice -20)
+ *
+ * Scale tg->shares by the maximal number of CPUs; but clip the max shares at
+ * nice -20, otherwise a single spinner on a 512 CPU machine would result in
+ * 512*NICE_0_LOAD, which is also crazy.
+ */
+static long calc_max_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ int nr = tg_cpus(tg);
+ long tg_shares = READ_ONCE(tg->shares);
+ long max_shares = scale_load(sched_prio_to_weight[0]);
+ return __calc_smp_shares(cfs_rq, tg_shares * nr, max_shares);
+}
+
+/*
+ * Func: fraction(tg->shares)
+ *
+ * This infamously results in tiny shares when you have many CPUs.
+ */
+static long calc_smp_shares(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long tg_shares = READ_ONCE(tg->shares);
+ return __calc_smp_shares(cfs_rq, tg_shares, tg_shares);
}
/*
@@ -4197,6 +4235,8 @@ static long calc_group_shares(struct cfs
{
if (cgroup_mode == 0)
return calc_up_shares(cfs_rq);
+ if (cgroup_mode == 2)
+ return calc_max_shares(cfs_rq);
return calc_smp_shares(cfs_rq);
}
next prev parent reply other threads:[~2026-03-17 10:47 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-17 9:51 [RFC][PATCH 0/8] sched: Flatten the pick Peter Zijlstra
2026-03-17 9:51 ` [RFC][PATCH 1/8] sched/debug: Collapse subsequent CONFIG_SCHED_CLASS_EXT sections Peter Zijlstra
2026-03-17 9:51 ` [RFC][PATCH 2/8] sched/fair: Add cgroup_mode switch Peter Zijlstra
2026-03-17 9:51 ` [RFC][PATCH 3/8] sched/fair: Add cgroup_mode: UP Peter Zijlstra
2026-03-17 9:51 ` Peter Zijlstra [this message]
2026-03-17 9:51 ` [RFC][PATCH 5/8] sched/fair: Add cgroup_mode: CONCUR Peter Zijlstra
2026-03-17 9:51 ` [RFC][PATCH 6/8] sched/fair: Add newidle balance to pick_task_fair() Peter Zijlstra
2026-03-17 9:51 ` [RFC][PATCH 7/8] sched: Remove sched_class::pick_next_task() Peter Zijlstra
2026-03-17 9:51 ` [RFC][PATCH 8/8] sched/eevdf: Move to a single runqueue Peter Zijlstra
2026-03-17 17:46 ` K Prateek Nayak
2026-03-18 9:02 ` Peter Zijlstra
2026-03-18 9:32 ` K Prateek Nayak
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260317104342.815599388@infradead.org \
--to=peterz@infradead.org \
--cc=bsegall@google.com \
--cc=cgroups@vger.kernel.org \
--cc=chenridong@huaweicloud.com \
--cc=dietmar.eggemann@arm.com \
--cc=hannes@cmpxchg.org \
--cc=jstultz@google.com \
--cc=juri.lelli@redhat.com \
--cc=kprateek.nayak@amd.com \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=mgorman@suse.de \
--cc=mingo@kernel.org \
--cc=mkoutny@suse.com \
--cc=rostedt@goodmis.org \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox