From: Peter Zijlstra <peterz@infradead.org>
To: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: mingo@kernel.org, longman@redhat.com, chenridong@huaweicloud.com,
juri.lelli@redhat.com, vincent.guittot@linaro.org,
dietmar.eggemann@arm.com, rostedt@goodmis.org,
bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com,
cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
jstultz@google.com, qyousef@layalina.io
Subject: Re: [PATCH v2 10/10] sched/eevdf: Move to a single runqueue
Date: Tue, 12 May 2026 13:09:32 +0200
Message-ID: <20260512110932.GB1889694@noisy.programming.kicks-ass.net>
In-Reply-To: <133c4d08-5dfb-4f4f-83cb-f9652d4212ef@amd.com>

On Mon, May 11, 2026 at 09:51:57PM +0530, K Prateek Nayak wrote:
> Hello Peter,
>
> On 5/11/2026 5:01 PM, Peter Zijlstra wrote:
> > @@ -9291,34 +9206,25 @@ static void wakeup_preempt_fair(struct r
> > + se = pick_next_entity(rq, true);
> > + if (!se)
> > + goto again;
> >
> > p = task_of(se);
> > - if (unlikely(throttled))
> > + if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
> > task_throttle_setup_work(p);
>
> I think this bit should also be replicated in set_next_task() after
> account_cfs_rq_runtime() since any part of the hierarchy may get
> throttled as a result of failing to grab runtime.
>
> Also, check_cfs_rq_runtime() only sees whether the cfs_rq itself is
> throttled, but the task can also fail to run if it is on a
> throttled_hierarchy(), so that should be the correct check here.
>
> Something like below (only build tested on queue/sched/flat):
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e54da4c6c945..950c072244b2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9224,7 +9224,19 @@ struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
> goto again;
>
> p = task_of(se);
> - if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
> + /*
> + * For cases where prev is picked again after
> + * being throttled, entity_tick() would have
> + * already marked its hierarchy as throttled.
> + *
> + * Add throttle work here since
> + * put_prev_set_next_task() is skipped on
> + * same task's selection.
> + *
> + * For the other case, set_next_task_fair() will
> + * handle adding the throttle work.
> + */
> + if (throttled_hierarchy(cfs_rq_of(se)))
> task_throttle_setup_work(p);

Ah, right; because we've not accumulated runtime, it doesn't make sense
to use check_cfs_rq_runtime() at pick time; all we need to do is check
whether the task should be throttled.
However, since set_next_task_fair() will walk the entire hierarchy
anyway, we can remove it here entirely and fully rely on that.
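
(Aside: a tiny userspace toy of that per-group vs hierarchy distinction;
this is not kernel code, the explicit parent walk merely stands in for
the throttle_count based check that throttled_hierarchy() really does,
and all toy_* names are made up.)

#include <stdbool.h>
#include <stdio.h>

struct toy_cfs_rq {
	struct toy_cfs_rq *parent;	/* NULL at the root */
	bool throttled;			/* this group ran out of quota */
};

/* like cfs_rq_throttled(): only looks at this one group */
static bool toy_rq_throttled(struct toy_cfs_rq *cfs_rq)
{
	return cfs_rq->throttled;
}

/* like throttled_hierarchy(): a throttled ancestor counts too */
static bool toy_hierarchy_throttled(struct toy_cfs_rq *cfs_rq)
{
	for (; cfs_rq; cfs_rq = cfs_rq->parent) {
		if (cfs_rq->throttled)
			return true;
	}
	return false;
}

int main(void)
{
	struct toy_cfs_rq root = { .parent = NULL, .throttled = true  };
	struct toy_cfs_rq leaf = { .parent = &root, .throttled = false };

	/* the leaf itself still has quota, but it must not run because
	 * its parent is throttled */
	printf("leaf: throttled=%d hierarchy_throttled=%d\n",
	       toy_rq_throttled(&leaf), toy_hierarchy_throttled(&leaf));
	return 0;
}
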
> return p;
>
> @@ -13819,6 +13831,12 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> if (on_rq)
> weight = __calc_prop_weight(cfs_rq, se, weight);
> }
> + /*
> + * Add throttle work if the bandwidth allocation above failed
> + * to grab any runtime and throttled the task's hierarchy.
> + */
> + if (throttled_hierarchy(task_cfs_rq(p)))
> + task_throttle_setup_work(p);

We already call into account_cfs_rq_runtime(), which basically does all
we need.

I think the distinction between account_cfs_rq_runtime() and
check_cfs_rq_runtime() no longer makes sense. We can throttle a cfs_rq
at any point now, since we no longer remove the cfs_rq, but instead
make the tasks suspend themselves until the cfs_rq naturally dequeues
once it is empty.

Something like so perhaps?
---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -488,7 +488,7 @@ static int se_is_idle(struct sched_entit
#endif /* !CONFIG_FAIR_GROUP_SCHED */
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -1420,12 +1420,22 @@ static void update_curr(struct cfs_rq *c
}
}
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static inline void task_throttle_setup_work(struct task_struct *p);
+
static void update_curr_fair(struct rq *rq)
{
struct sched_entity *se = &rq->donor->se;
+ bool throttled = false;
- for_each_sched_entity(se)
- update_curr(cfs_rq_of(se));
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+ throttled |= cfs_rq_throttled(cfs_rq);
+ }
+
+ if (throttled)
+ task_throttle_setup_work(rq->donor);
}
static inline void
@@ -5627,7 +5637,6 @@ place_entity(struct cfs_rq *cfs_rq, stru
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -5830,8 +5839,6 @@ pick_next_entity(struct rq *rq, bool pro
return se;
}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
@@ -5841,9 +5848,6 @@ static void put_prev_entity(struct cfs_r
if (prev->on_rq)
update_curr(cfs_rq);
- /* throttle cfs_rqs exceeding runtime */
- check_cfs_rq_runtime(cfs_rq);
-
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -5976,44 +5980,29 @@ static int __assign_cfs_rq_runtime(struc
return cfs_rq->runtime_remaining > 0;
}
-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- int ret;
-
- raw_spin_lock(&cfs_b->lock);
- ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
- raw_spin_unlock(&cfs_b->lock);
-
- return ret;
-}
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq);
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
if (likely(cfs_rq->runtime_remaining > 0))
- return;
+ return false;
if (cfs_rq->throttled)
- return;
- /*
- * if we're unable to extend our runtime we resched so that the active
- * hierarchy can be throttled
- */
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->h_curr))
- resched_curr(rq_of(cfs_rq));
+ return true;
+
+ return throttle_cfs_rq(cfs_rq);
}
static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
- return;
+ return false;
- __account_cfs_rq_runtime(cfs_rq, delta_exec);
+ return __account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6284,9 +6273,9 @@ static bool throttle_cfs_rq(struct cfs_r
* We have raced with bandwidth becoming available, and if we
* actually throttled the timer might not unthrottle us for an
* entire period. We additionally needed to make sure that any
- * subsequent check_cfs_rq_runtime calls agree not to throttle
- * us, as we may commit to do cfs put_prev+pick_next, so we ask
- * for 1ns of runtime rather than just check cfs_b.
+ * subsequent account_cfs_rq_runtime() calls agree not to
+ * throttle us, as we may commit to do cfs put_prev+pick_next,
+ * so we ask for 1ns of runtime rather than just check cfs_b.
*/
dequeue = 0;
} else {
@@ -6711,8 +6700,6 @@ static void check_enqueue_throttle(struc
/* update runtime allocation */
account_cfs_rq_runtime(cfs_rq, 0);
- if (cfs_rq->runtime_remaining <= 0)
- throttle_cfs_rq(cfs_rq);
}
static void sync_throttle(struct task_group *tg, int cpu)
@@ -6742,25 +6729,6 @@ static void sync_throttle(struct task_gr
cfs_rq->pelt_clock_throttled = 1;
}
-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- if (!cfs_bandwidth_used())
- return false;
-
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
-
- /*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
- */
- if (cfs_rq_throttled(cfs_rq))
- return true;
-
- return throttle_cfs_rq(cfs_rq);
-}
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -7015,8 +6983,7 @@ static void sched_fair_update_stop_tick(
#else /* !CONFIG_CFS_BANDWIDTH: */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9208,7 +9175,6 @@ struct task_struct *pick_task_fair(struc
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
- struct task_struct *p;
int new_tasks;
again:
@@ -9223,10 +9189,7 @@ struct task_struct *pick_task_fair(struc
if (!se)
goto again;
- p = task_of(se);
- if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
- task_throttle_setup_work(p);
- return p;
+ return task_of(se);
idle:
new_tasks = sched_balance_newidle(rq, rf);
@@ -13618,6 +13581,7 @@ static void task_tick_fair(struct rq *rq
{
struct sched_entity *se = &curr->se;
unsigned long weight = NICE_0_LOAD;
+ bool throttled = false;
struct cfs_rq *cfs_rq;
for_each_sched_entity(se) {
@@ -13625,8 +13589,13 @@ static void task_tick_fair(struct rq *rq
entity_tick(cfs_rq, se, queued);
weight = __calc_prop_weight(cfs_rq, se, weight);
+
+ throttled |= cfs_rq_throttled(cfs_rq);
}
+ if (throttled)
+ task_throttle_setup_work(curr);
+
se = &curr->se;
reweight_eevdf(cfs_rq, se, weight, se->on_rq);
@@ -13800,6 +13769,7 @@ static void set_next_task_fair(struct rq
struct cfs_rq *cfs_rq = &rq->cfs;
unsigned long weight = NICE_0_LOAD;
bool on_rq = se->on_rq;
+ bool throttled = false;
clear_buddies(cfs_rq, se);
@@ -13814,12 +13784,15 @@ static void set_next_task_fair(struct rq
set_next_entity(cfs_rq, se);
/* ensure bandwidth has been allocated on our new cfs_rq */
- account_cfs_rq_runtime(cfs_rq, 0);
+ throttled |= account_cfs_rq_runtime(cfs_rq, 0);
if (on_rq)
weight = __calc_prop_weight(cfs_rq, se, weight);
}
+ if (throttled)
+ task_throttle_setup_work(p);
+
se = &p->se;
cfs_rq->curr = se;
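
(For completeness, a minimal userspace sketch of the merged semantics;
not kernel code, all toy_* names are made up, and a flag merely stands
in for the task work that task_throttle_setup_work() queues. The point
is that accounting runtime can now itself throttle the group, and the
caller only has to queue the throttle work for the current task.)

#include <stdbool.h>
#include <stdio.h>

struct toy_group {
	long long runtime_remaining;	/* ns of quota left this period */
	bool throttled;
};

/* returns true when the group is (now) throttled, in the spirit of the
 * reworked account_cfs_rq_runtime() above */
static bool toy_account_runtime(struct toy_group *grp, long long delta_exec)
{
	grp->runtime_remaining -= delta_exec;
	if (grp->runtime_remaining > 0)
		return false;
	grp->throttled = true;		/* stand-in for throttle_cfs_rq() */
	return true;
}

struct toy_task {
	const char *name;
	bool throttle_work_pending;
};

/* stand-in for task_throttle_setup_work(): the task will park itself
 * the next time it reaches a safe point */
static void toy_setup_throttle_work(struct toy_task *p)
{
	p->throttle_work_pending = true;
}

int main(void)
{
	struct toy_group grp = { .runtime_remaining = 1000, .throttled = false };
	struct toy_task  p   = { .name = "worker", .throttle_work_pending = false };

	/* tick path: account what ran; if that throttled us, queue the work */
	if (toy_account_runtime(&grp, 1500))
		toy_setup_throttle_work(&p);

	printf("%s: group throttled=%d, work pending=%d\n",
	       p.name, grp.throttled, p.throttle_work_pending);
	return 0;
}
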