From: Peter Zijlstra <peterz@infradead.org>
To: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: mingo@kernel.org, longman@redhat.com, chenridong@huaweicloud.com,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	dietmar.eggemann@arm.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	jstultz@google.com, qyousef@layalina.io
Subject: Re: [PATCH v2 10/10] sched/eevdf: Move to a single runqueue
Date: Tue, 12 May 2026 13:09:32 +0200
Message-ID: <20260512110932.GB1889694@noisy.programming.kicks-ass.net>
In-Reply-To: <133c4d08-5dfb-4f4f-83cb-f9652d4212ef@amd.com>

On Mon, May 11, 2026 at 09:51:57PM +0530, K Prateek Nayak wrote:
> Hello Peter,
> 
> On 5/11/2026 5:01 PM, Peter Zijlstra wrote:
> > @@ -9291,34 +9206,25 @@ static void wakeup_preempt_fair(struct r
> > +	se = pick_next_entity(rq, true);
> > +	if (!se)
> > +		goto again;
> >  
> >  	p = task_of(se);
> > -	if (unlikely(throttled))
> > +	if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
> >  		task_throttle_setup_work(p);
> 
> I think this bit should also be replicated in set_next_task() after
> account_cfs_rq_runtime() since any part of the hierarchy may get
> throttled as a result of failing to grab runtime.
> 
> Also, check_cfs_rq_runtime() only sees if the cfs_rq is throttled,
> but the task can fail to run if it is on a throttled_hierarchy() too,
> so that should be the correct check here.
> 
> Something like below (only build tested on queue/sched/flat):
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e54da4c6c945..950c072244b2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9224,7 +9224,19 @@ struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
>  		goto again;
>  
>  	p = task_of(se);
> -	if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
> +	/*
> +	 * For cases where prev is picked again after
> +	 * being throttled, entity_tick() would have
> +	 * already marked its hierarchy as throttled.
> +	 *
> +	 * Add throttle work here since
> +	 * put_prev_set_next_task() is skipped on
> +	 * same task's selection.
> +	 *
> +	 * For the other case, set_next_task_fair() will
> +	 * handle adding the throttle work.
> +	 */
> +	if (throttled_hierarchy(cfs_rq_of(se)))
>  		task_throttle_setup_work(p);

Ah, right: because we've not accumulated runtime, it doesn't make sense to
use check_cfs_rq_runtime() at pick time; all we need to do is check whether
the task should be throttled.
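
For reference, throttled_hierarchy() keys off cfs_rq->throttle_count, which
also counts throttled ancestors, so it covers the case of a task sitting
below a throttled group that a pure runtime check on the local cfs_rq
misses. Assuming the mainline helper is unchanged on queue/sched/flat, it
is roughly:

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	/* non-zero if this cfs_rq or any of its ancestors is throttled */
	return cfs_bandwidth_used() && cfs_rq->throttle_count;
}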

However, since set_next_task_fair() will walk the entire hierarchy
anyway, we can remove it here entirely and fully rely on that.

>  	return p;
>  
> @@ -13819,6 +13831,12 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>  		if (on_rq)
>  			weight = __calc_prop_weight(cfs_rq, se, weight);
>  	}
> +	/*
> +	 * Add throttle work if the bandwidth allocation above failed
> +	 * to grab any runtime and throttled the task's hierarchy.
> +	 */
> +	if (throttled_hierarchy(task_cfs_rq(p)))
> +		task_throttle_setup_work(p);

We already call into account_cfs_rq_runtime(), which basically does all
we need.

I think the distinction between account_cfs_rq_runtime() and
check_cfs_rq_runtime() no longer makes sense. We can throttle a cfs_rq
at any point now, since we no longer remove the cfs_rq; instead we make
the tasks suspend themselves until the cfs_rq naturally dequeues once
it is empty.
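
Condensed, the three call sites below all follow the same pattern: walk the
task's hierarchy, note whether anything in it ended up throttled, and queue
the throttle work on the task (sketch only; the set_next_task_fair() hunk
folds the check into account_cfs_rq_runtime()'s return value):

	bool throttled = false;

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		/* update_curr() / entity_tick() / account_cfs_rq_runtime() */
		throttled |= cfs_rq_throttled(cfs_rq);
	}

	if (throttled)
		task_throttle_setup_work(p);	/* rq->donor / curr / p per call site */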

Something like so perhaps?

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -488,7 +488,7 @@ static int se_is_idle(struct sched_entit
 #endif /* !CONFIG_FAIR_GROUP_SCHED */
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
@@ -1420,12 +1420,22 @@ static void update_curr(struct cfs_rq *c
 	}
 }
 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static inline void task_throttle_setup_work(struct task_struct *p);
+
 static void update_curr_fair(struct rq *rq)
 {
 	struct sched_entity *se = &rq->donor->se;
+	bool throttled = false;
 
-	for_each_sched_entity(se)
-		update_curr(cfs_rq_of(se));
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		update_curr(cfs_rq);
+		throttled |= cfs_rq_throttled(cfs_rq);
+	}
+
+	if (throttled)
+		task_throttle_setup_work(rq->donor);
 }
 
 static inline void
@@ -5627,7 +5637,6 @@ place_entity(struct cfs_rq *cfs_rq, stru
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
 
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -5830,8 +5839,6 @@ pick_next_entity(struct rq *rq, bool pro
 	return se;
 }
 
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
 	/*
@@ -5841,9 +5848,6 @@ static void put_prev_entity(struct cfs_r
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
-	/* throttle cfs_rqs exceeding runtime */
-	check_cfs_rq_runtime(cfs_rq);
-
 	if (prev->on_rq) {
 		update_stats_wait_start_fair(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
@@ -5976,44 +5980,29 @@ static int __assign_cfs_rq_runtime(struc
 	return cfs_rq->runtime_remaining > 0;
 }
 
-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	int ret;
-
-	raw_spin_lock(&cfs_b->lock);
-	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
-	raw_spin_unlock(&cfs_b->lock);
-
-	return ret;
-}
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq);
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
 
 	if (likely(cfs_rq->runtime_remaining > 0))
-		return;
+		return false;
 
 	if (cfs_rq->throttled)
-		return;
-	/*
-	 * if we're unable to extend our runtime we resched so that the active
-	 * hierarchy can be throttled
-	 */
-	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->h_curr))
-		resched_curr(rq_of(cfs_rq));
+		return true;
+
+	return throttle_cfs_rq(cfs_rq);
 }
 
 static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
+bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
-		return;
+		return false;
 
-	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+	return __account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6284,9 +6273,9 @@ static bool throttle_cfs_rq(struct cfs_r
 		 * We have raced with bandwidth becoming available, and if we
 		 * actually throttled the timer might not unthrottle us for an
 		 * entire period. We additionally needed to make sure that any
-		 * subsequent check_cfs_rq_runtime calls agree not to throttle
-		 * us, as we may commit to do cfs put_prev+pick_next, so we ask
-		 * for 1ns of runtime rather than just check cfs_b.
+		 * subsequent account_cfs_rq_runtime() calls agree not to
+		 * throttle us, as we may commit to do cfs put_prev+pick_next,
+		 * so we ask for 1ns of runtime rather than just check cfs_b.
 		 */
 		dequeue = 0;
 	} else {
@@ -6711,8 +6700,6 @@ static void check_enqueue_throttle(struc
 
 	/* update runtime allocation */
 	account_cfs_rq_runtime(cfs_rq, 0);
-	if (cfs_rq->runtime_remaining <= 0)
-		throttle_cfs_rq(cfs_rq);
 }
 
 static void sync_throttle(struct task_group *tg, int cpu)
@@ -6742,25 +6729,6 @@ static void sync_throttle(struct task_gr
 		cfs_rq->pelt_clock_throttled = 1;
 }
 
-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-	if (!cfs_bandwidth_used())
-		return false;
-
-	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
-		return false;
-
-	/*
-	 * it's possible for a throttled entity to be forced into a running
-	 * state (e.g. set_curr_task), in this case we're finished.
-	 */
-	if (cfs_rq_throttled(cfs_rq))
-		return true;
-
-	return throttle_cfs_rq(cfs_rq);
-}
-
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
@@ -7015,8 +6983,7 @@ static void sched_fair_update_stop_tick(
 
 #else /* !CONFIG_CFS_BANDWIDTH: */
 
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
+static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -9208,7 +9175,6 @@ struct task_struct *pick_task_fair(struc
 {
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
-	struct task_struct *p;
 	int new_tasks;
 
 again:
@@ -9223,10 +9189,7 @@ struct task_struct *pick_task_fair(struc
 	if (!se)
 		goto again;
 
-	p = task_of(se);
-	if (unlikely(check_cfs_rq_runtime(cfs_rq_of(se))))
-		task_throttle_setup_work(p);
-	return p;
+	return task_of(se);
 
 idle:
 	new_tasks = sched_balance_newidle(rq, rf);
@@ -13618,6 +13581,7 @@ static void task_tick_fair(struct rq *rq
 {
 	struct sched_entity *se = &curr->se;
 	unsigned long weight = NICE_0_LOAD;
+	bool throttled = false;
 	struct cfs_rq *cfs_rq;
 
 	for_each_sched_entity(se) {
@@ -13625,8 +13589,13 @@ static void task_tick_fair(struct rq *rq
 		entity_tick(cfs_rq, se, queued);
 
 		weight = __calc_prop_weight(cfs_rq, se, weight);
+
+		throttled |= cfs_rq_throttled(cfs_rq);
 	}
 
+	if (throttled)
+		task_throttle_setup_work(curr);
+
 	se = &curr->se;
 	reweight_eevdf(cfs_rq, se, weight, se->on_rq);
 
@@ -13800,6 +13769,7 @@ static void set_next_task_fair(struct rq
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	unsigned long weight = NICE_0_LOAD;
 	bool on_rq = se->on_rq;
+	bool throttled = false;
 
 	clear_buddies(cfs_rq, se);
 
@@ -13814,12 +13784,15 @@ static void set_next_task_fair(struct rq
 			set_next_entity(cfs_rq, se);
 
 		/* ensure bandwidth has been allocated on our new cfs_rq */
-		account_cfs_rq_runtime(cfs_rq, 0);
+		throttled |= account_cfs_rq_runtime(cfs_rq, 0);
 
 		if (on_rq)
 			weight = __calc_prop_weight(cfs_rq, se, weight);
 	}
 
+	if (throttled)
+		task_throttle_setup_work(p);
+
 	se = &p->se;
 	cfs_rq->curr = se;
 

