All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: mingo@kernel.org
Cc: longman@redhat.com, chenridong@huaweicloud.com,
	peterz@infradead.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	vschneid@redhat.com, tj@kernel.org, hannes@cmpxchg.org,
	mkoutny@suse.com, cgroups@vger.kernel.org,
	linux-kernel@vger.kernel.org, jstultz@google.com,
	kprateek.nayak@amd.com, qyousef@layalina.io
Subject: [PATCH v2 07/10] sched/fair: Add cgroup_mode: CONCUR
Date: Mon, 11 May 2026 13:31:11 +0200	[thread overview]
Message-ID: <20260511120627.829222375@infradead.org> (raw)
In-Reply-To: 20260511113104.563854162@infradead.org

A variation of MAX; where instead of assuming maximal concurrency, this scales
with 'min(nr_tasks, nr_cpus)'. This handles the low concurrency cases more
gracefully, with the exception of CPU affinity.

Note: the tracking of tg->tasks is somewhat expensive :-/

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/debug.c |    1 +
 kernel/sched/fair.c  |   39 ++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h |    3 +++
 3 files changed, 40 insertions(+), 3 deletions(-)

--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -594,6 +594,7 @@ int cgroup_mode = 1;
 static const char *cgroup_mode_str[] = {
 	"up",
 	"smp",
+	"concur",
 	"max",
 };
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4211,6 +4211,30 @@ static long calc_max_shares(struct cfs_r
 	return __calc_smp_shares(cfs_rq, tg_shares * nr, max_shares);
 }
 
+static inline int tg_tasks(struct task_group *tg)
+{
+	return max(1, atomic_long_read(&tg->tasks));
+}
+
+/*
+ * Func: min(fraction(num * tg->shares), nice -20); where
+ *       num = min(nr_tasks, nr_cpus)
+ *
+ * Similar to max, except scale with min(nr_tasks, nr_cpus), which gives
+ * a far more natural distribution. Can still create edge cases using CPU
+ * affinity.
+ */
+static long calc_concur_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	int nr_cpus = tg_cpus(tg);
+	int nr_tasks = tg_tasks(tg);
+	int nr = min(nr_tasks, nr_cpus);
+	long tg_shares = READ_ONCE(tg->shares);
+	long max_shares = scale_load(sched_prio_to_weight[0]);
+	return __calc_smp_shares(cfs_rq, nr * tg_shares, max_shares);
+}
+
 /*
  * Func: fraction(tg->shares)
  *
@@ -4240,6 +4264,9 @@ static long calc_group_shares(struct cfs
 		return calc_up_shares(cfs_rq);
 
 	if (mode == 2)
+		return calc_concur_shares(cfs_rq);
+
+	if (mode == 3)
 		return calc_max_shares(cfs_rq);
 
 	return calc_smp_shares(cfs_rq);
@@ -4385,7 +4412,7 @@ static inline bool cfs_rq_is_decayed(str
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-	long delta;
+	long delta, dt;
 	u64 now;
 
 	/*
@@ -4407,16 +4434,19 @@ static inline void update_tg_load_avg(st
 		return;
 
 	delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
-	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+	dt = cfs_rq->h_nr_queued - cfs_rq->tg_tasks_contrib;
+	if (dt || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
+		atomic_long_add(dt, &cfs_rq->tg->tasks);
 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+		cfs_rq->tg_tasks_contrib = cfs_rq->h_nr_queued;
 		cfs_rq->last_update_tg_load_avg = now;
 	}
 }
 
 static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-	long delta;
+	long delta, dt;
 	u64 now;
 
 	/*
@@ -4427,8 +4457,11 @@ static inline void clear_tg_load_avg(str
 
 	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
 	delta = 0 - cfs_rq->tg_load_avg_contrib;
+	dt = 0 - cfs_rq->tg_tasks_contrib;
 	atomic_long_add(delta, &cfs_rq->tg->load_avg);
+	atomic_long_add(dt, &cfs_rq->tg->tasks);
 	cfs_rq->tg_load_avg_contrib = 0;
+	cfs_rq->tg_tasks_contrib = 0;
 	cfs_rq->last_update_tg_load_avg = now;
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -491,6 +491,8 @@ struct task_group {
 	 * will also be accessed at each tick.
 	 */
 	atomic_long_t		load_avg ____cacheline_aligned;
+	atomic_long_t		tasks;
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -720,6 +722,7 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u64			last_update_tg_load_avg;
 	unsigned long		tg_load_avg_contrib;
+	unsigned long		tg_tasks_contrib;
 	long			propagate;
 	long			prop_runnable_sum;
 



  parent reply	other threads:[~2026-05-11 12:07 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-11 11:31 [PATCH v2 00/10] sched: Flatten the pick Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 01/10] sched/debug: Use char * instead of char (*)[] Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 02/10] sched: Use {READ,WRITE}_ONCE() for preempt_dynamic_mode Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 03/10] sched/debug: Collapse subsequent CONFIG_SCHED_CLASS_EXT sections Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 04/10] sched/fair: Add cgroup_mode switch Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 05/10] sched/fair: Add cgroup_mode: UP Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 06/10] sched/fair: Add cgroup_mode: MAX Peter Zijlstra
2026-05-11 11:31 ` Peter Zijlstra [this message]
2026-05-11 11:31 ` [PATCH v2 08/10] sched/fair: Add newidle balance to pick_task_fair() Peter Zijlstra
2026-05-12  5:37   ` K Prateek Nayak
2026-05-12  9:45     ` Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 09/10] sched: Remove sched_class::pick_next_task() Peter Zijlstra
2026-05-11 11:31 ` [PATCH v2 10/10] sched/eevdf: Move to a single runqueue Peter Zijlstra
2026-05-11 16:21   ` K Prateek Nayak
2026-05-12 11:09     ` Peter Zijlstra
2026-05-13  7:01       ` K Prateek Nayak
2026-05-13  7:25         ` Peter Zijlstra
2026-05-13  4:51   ` John Stultz
2026-05-13  5:00     ` John Stultz
2026-05-14  1:36       ` John Stultz
2026-05-14  2:53         ` K Prateek Nayak
2026-05-14  3:14           ` John Stultz
2026-05-11 19:23 ` [PATCH v2 00/10] sched: Flatten the pick Tejun Heo
2026-05-12  8:10   ` Peter Zijlstra
2026-05-12 18:45     ` Tejun Heo
2026-05-12  8:42 ` Vincent Guittot
2026-05-12  9:20   ` Peter Zijlstra
2026-05-12 18:24     ` Peter Zijlstra
2026-05-12 18:25       ` Peter Zijlstra
2026-05-12 18:32         ` Vincent Guittot
2026-05-13  7:25           ` Peter Zijlstra
2026-05-13 11:35   ` Peter Zijlstra
2026-05-13 12:43     ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260511120627.829222375@infradead.org \
    --to=peterz@infradead.org \
    --cc=bsegall@google.com \
    --cc=cgroups@vger.kernel.org \
    --cc=chenridong@huaweicloud.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=jstultz@google.com \
    --cc=juri.lelli@redhat.com \
    --cc=kprateek.nayak@amd.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=mkoutny@suse.com \
    --cc=qyousef@layalina.io \
    --cc=rostedt@goodmis.org \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.