* [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
@ 2025-03-13 7:21 ` Aaron Lu
2025-03-17 10:28 ` Valentin Schneider
2025-03-13 7:21 ` [RFC PATCH 2/7] sched/fair: Handle throttle path " Aaron Lu
` (5 subsequent siblings)
6 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-13 7:21 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
From: Valentin Schneider <vschneid@redhat.com>
Add related data structures for this new throttle functionality.
[aaronlu: extracted from Valentin's original patches]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
include/linux/sched.h | 4 ++++
kernel/sched/core.c | 3 +++
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/sched.h | 2 ++
4 files changed, 21 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d6..eec9087232660 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -858,6 +858,10 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct callback_head sched_throttle_work;
+ struct list_head throttle_node;
+#endif
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 621cfc731c5be..56e2ea14ac3b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4493,6 +4493,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
+#ifdef CONFIG_CFS_BANDWIDTH
+ init_cfs_throttle_work(p);
+#endif
#endif
#ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9dafb374d76d9..60eb5329bf526 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5832,6 +5832,18 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static void throttle_cfs_rq_work(struct callback_head *work)
+{
+}
+
+void init_cfs_throttle_work(struct task_struct *p)
+{
+ init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
+ /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+ INIT_LIST_HEAD(&p->throttle_node);
+}
+
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 023b844159c94..c8bfa3d708081 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2724,6 +2724,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_entity(struct sched_dl_entity *dl_se);
+extern void init_cfs_throttle_work(struct task_struct *p);
+
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
--
2.39.5
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
2025-03-13 7:21 ` [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle Aaron Lu
@ 2025-03-13 7:21 ` Aaron Lu
2025-03-13 18:14 ` K Prateek Nayak
` (3 more replies)
2025-03-13 7:21 ` [RFC PATCH 3/7] sched/fair: Handle unthrottle " Aaron Lu
` (4 subsequent siblings)
6 siblings, 4 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-13 7:21 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
From: Valentin Schneider <vschneid@redhat.com>
Once a cfs_rq gets throttled, add a task work to every task belonging to
this cfs_rq, so that when those tasks return to user space, the actual
throttle/dequeue can happen.
Note that since the throttle/dequeue now always happens on a per-task
basis when the task returns to user space, it is no longer necessary for
check_cfs_rq_runtime() to return a value for pick_task_fair() to act on,
so check_cfs_rq_runtime() is changed to return void.
[aaronlu: extracted from Valentin's original patches.
Fixed a problem where curr is not in the timeline tree and has to be
dealt with explicitly;
Made check_cfs_rq_runtime() void.]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 201 ++++++++++++++++++++++++-------------------
kernel/sched/sched.h | 1 +
2 files changed, 112 insertions(+), 90 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 60eb5329bf526..ab403ff7d53c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5607,7 +5607,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
return se;
}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -5832,8 +5832,49 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ struct rq_flags rf;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ rq = task_rq_lock(p, &rf);
+
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ goto out_unlock;
+
+ /*
+ * If not in limbo, then either replenish has happened or this task got
+ * migrated out of the throttled cfs_rq, move along
+ */
+ if (!cfs_rq->throttle_count)
+ goto out_unlock;
+
+ update_rq_clock(rq);
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ resched_curr(rq);
+
+out_unlock:
+ task_rq_unlock(rq, p, &rf);
}
void init_cfs_throttle_work(struct task_struct *p)
@@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ if (task_has_throttle_work(p))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p;
+ struct rb_node *node;
+
+ cfs_rq->throttle_count++;
+ if (cfs_rq->throttle_count > 1)
+ return 0;
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
- SCHED_WARN_ON(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
+ /*
+ * rq_lock is held, current is (obviously) executing this in kernelspace.
+ *
+ * All other tasks enqueued on this rq have their saved PC at the
+ * context switch, so they will go through the kernel before returning
+ * to userspace. Thus, there are no tasks-in-userspace to handle, just
+ * install the task_work on all of them.
+ */
+ node = rb_first(&cfs_rq->tasks_timeline.rb_root);
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ if (!entity_is_task(se))
+ goto next;
+
+ p = task_of(se);
+ task_throttle_setup_work(p);
+next:
+ node = rb_next(node);
+ }
+
+ /* curr is not in the timeline tree */
+ if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
+ p = task_of(cfs_rq->curr);
+ task_throttle_setup_work(p);
}
- cfs_rq->throttle_count++;
return 0;
}
-static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta, dequeue = 1;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
+ int dequeue = 1;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5919,74 +6009,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_b->lock);
if (!dequeue)
- return false; /* Throttle no longer required. */
-
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ return; /* Throttle no longer required. */
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- int flags;
-
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- /*
- * Abuse SPECIAL to avoid delayed dequeue in this instance.
- * This avoids teaching dequeue_entities() about throttled
- * entities and keeps things relatively simple.
- */
- flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
- if (se->sched_delayed)
- flags |= DEQUEUE_DELAYED;
- dequeue_entity(qcfs_rq, se, flags);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
-
- if (qcfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
- break;
- }
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- goto done;
-
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued -= queued_delta;
- qcfs_rq->h_nr_runnable -= runnable_delta;
- qcfs_rq->h_nr_idle -= idle_delta;
- }
-
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, queued_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
-done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5995,7 +6024,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
SCHED_WARN_ON(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
- return true;
+ return;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -6471,22 +6500,22 @@ static void sync_throttle(struct task_group *tg, int cpu)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return false;
+ return;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
+ return;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return true;
+ return;
- return throttle_cfs_rq(cfs_rq);
+ throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -6582,6 +6611,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7117,10 +7147,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
-
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
@@ -7157,10 +7183,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- return 0;
}
sub_nr_running(rq, h_nr_queued);
@@ -8869,8 +8891,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
+ check_cfs_rq_runtime(cfs_rq);
se = pick_next_entity(rq, cfs_rq);
if (!se)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8bfa3d708081..5c2af5a70163c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -742,6 +742,7 @@ struct cfs_rq {
int throttle_count;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
--
2.39.5
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
2025-03-13 7:21 ` [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle Aaron Lu
2025-03-13 7:21 ` [RFC PATCH 2/7] sched/fair: Handle throttle path " Aaron Lu
@ 2025-03-13 7:21 ` Aaron Lu
2025-03-14 3:53 ` K Prateek Nayak
2025-03-13 7:21 ` [RFC PATCH 4/7] sched/fair: Take care of migrated task " Aaron Lu
` (3 subsequent siblings)
6 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-13 7:21 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
From: Valentin Schneider <vschneid@redhat.com>
On unthrottle, enqueue throttled tasks back so they can continue to run.
Note that with this task based throttling, the only throttle point is on
return to user space, so as long as a task is enqueued, it is allowed to
run until it reaches that throttle point, no matter whether its cfs_rq
is throttled.
The leaf_cfs_rq list is handled differently now: as long as a task is
enqueued to a cfs_rq, throttled or not, that cfs_rq is added to the
list; when a cfs_rq is throttled and all its tasks are dequeued, it is
removed from the list. I chose this approach because it is easy to
reason about.
[aaronlu: extracted from Valentin's original patches. I also changed the
implementation to using enqueue_task_fair() for queuing back tasks to
unthrottled cfs_rq]
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
1 file changed, 45 insertions(+), 87 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab403ff7d53c8..4a95fe3785e43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq)) {
- list_add_leaf_cfs_rq(cfs_rq);
- } else {
+ list_add_leaf_cfs_rq(cfs_rq);
#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
struct rq *rq = rq_of(cfs_rq);
if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
cfs_rq->throttled_clock = rq_clock(rq);
if (!cfs_rq->throttled_clock_self)
cfs_rq->throttled_clock_self = rq_clock(rq);
-#endif
}
+#endif
}
}
@@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_queued == 0)
+ if (cfs_rq->nr_queued == 0) {
update_idle_cfs_rq_clock_pelt(cfs_rq);
+ if (throttled_hierarchy(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
return true;
}
@@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return !list_empty(&p->throttle_node);
+}
+
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static void throttle_cfs_rq_work(struct callback_head *work)
{
@@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
INIT_LIST_HEAD(&p->throttle_node);
}
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
cfs_rq->throttle_count--;
- if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
- cfs_rq->throttled_clock_pelt;
+ if (cfs_rq->throttle_count)
+ return 0;
- /* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
- if (cfs_rq->throttled_clock_self) {
- u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
- cfs_rq->throttled_clock_self = 0;
+ cfs_rq->throttled_clock_self = 0;
- if (SCHED_WARN_ON((s64)delta < 0))
- delta = 0;
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
- cfs_rq->throttled_clock_self_time += delta;
- }
+ cfs_rq->throttled_clock_self_time += delta;
}
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ /*
+ * FIXME: p may not be allowed to run on this rq anymore
+ * due to affinity change while p is throttled.
+ */
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ }
+
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
return 0;
}
@@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- list_del_leaf_cfs_rq(cfs_rq);
SCHED_WARN_ON(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ return 0;
+ }
+
WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
/*
* rq_lock is held, current is (obviously) executing this in kernelspace.
@@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long queued_delta, runnable_delta, idle_delta;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
-
- se = cfs_rq->tg->se[cpu_of(rq)];
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
cfs_rq->throttled = 0;
@@ -6063,62 +6083,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
break;
}
- goto unthrottle_throttle;
}
- queued_delta = cfs_rq->h_nr_queued;
- runnable_delta = cfs_rq->h_nr_runnable;
- idle_delta = cfs_rq->h_nr_idle;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- /* Handle any unfinished DELAY_DEQUEUE business first. */
- if (se->sched_delayed) {
- int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
-
- dequeue_entity(qcfs_rq, se, flags);
- } else if (se->on_rq)
- break;
- enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- update_load_avg(qcfs_rq, se, UPDATE_TG);
- se_update_runnable(se);
-
- if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_delta = cfs_rq->h_nr_queued;
-
- qcfs_rq->h_nr_queued += queued_delta;
- qcfs_rq->h_nr_runnable += runnable_delta;
- qcfs_rq->h_nr_idle += idle_delta;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(qcfs_rq))
- goto unthrottle_throttle;
- }
-
- /* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
- dl_server_start(&rq->fair_server);
-
- /* At this point se is NULL and we are at root level*/
- add_nr_running(rq, queued_delta);
-
-unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -6989,6 +6955,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
util_est_enqueue(&rq->cfs, p);
if (flags & ENQUEUE_DELAYED) {
+ SCHED_WARN_ON(task_is_throttled(p));
requeue_delayed_entity(se);
return;
}
@@ -7031,10 +6998,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
-
flags = ENQUEUE_WAKEUP;
}
@@ -7056,10 +7019,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = 1;
-
- /* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
- goto enqueue_throttle;
}
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
@@ -7089,7 +7048,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!task_new)
check_update_overutilized_status(rq);
-enqueue_throttle:
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
--
2.39.5
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [RFC PATCH 4/7] sched/fair: Take care of migrated task for task based throttle
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
` (2 preceding siblings ...)
2025-03-13 7:21 ` [RFC PATCH 3/7] sched/fair: Handle unthrottle " Aaron Lu
@ 2025-03-13 7:21 ` Aaron Lu
2025-03-14 4:03 ` K Prateek Nayak
2025-03-13 7:21 ` [RFC PATCH 5/7] sched/fair: Take care of group/affinity/sched_class change for throttled task Aaron Lu
` (2 subsequent siblings)
6 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-13 7:21 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
If a task is migrated to a new CPU, it is possible the task is not
throttled but the new cfs_rq is throttled, or vice versa. Take care of
these situations in the enqueue path.
Note that we can't handle this in migrate_task_rq_fair() because there
the dst CPU's rq lock is not held, so things like checking whether the
new cfs_rq needs throttling can be racy.
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a95fe3785e43..9e036f18d73e6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7051,6 +7051,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ /*
+ * This is for migrate_task_rq_fair(): the new_cpu's rq lock is not held
+ * in migrate_task_rq_fair() so we have to do these things in enqueue
+ * time when the dst cpu's rq lock is held. Doing this check in enqueue
+ * time also takes care of newly woken up tasks, e.g. a task wakes up
+ * into a throttled cfs_rq.
+ *
+ * It's possible the task has a throttle work added but this new cfs_rq
+ * is not in throttled hierarchy but that's OK, throttle_cfs_rq_work()
+ * will take care of it.
+ */
+ if (throttled_hierarchy(cfs_rq_of(&p->se)))
+ task_throttle_setup_work(p);
}
static void set_next_buddy(struct sched_entity *se);
--
2.39.5
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [RFC PATCH 5/7] sched/fair: Take care of group/affinity/sched_class change for throttled task
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
` (3 preceding siblings ...)
2025-03-13 7:21 ` [RFC PATCH 4/7] sched/fair: Take care of migrated task " Aaron Lu
@ 2025-03-13 7:21 ` Aaron Lu
2025-03-14 4:51 ` K Prateek Nayak
2025-03-13 7:22 ` [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle Aaron Lu
2025-03-13 7:22 ` [RFC PATCH 7/7] sched/fair: Make sure cfs_rq has enough runtime_remaining on unthrottle path Aaron Lu
6 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-13 7:21 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
On a task group change, for a queued task, the core will dequeue it and
then requeue it. A throttled task is still considered queued by the core
because p->on_rq is still set, so the core will dequeue it too; but
since the task was already dequeued on throttle, handle this case
properly in the fair class code.
Affinity and sched class changes are similar.
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9e036f18d73e6..f26d53ac143fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5876,8 +5876,8 @@ static void throttle_cfs_rq_work(struct callback_head *work)
update_rq_clock(rq);
WARN_ON_ONCE(!list_empty(&p->throttle_node));
- list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
resched_curr(rq);
out_unlock:
@@ -5920,10 +5920,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* Re-enqueue the tasks that have been throttled at this level. */
list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
list_del_init(&p->throttle_node);
- /*
- * FIXME: p may not be allowed to run on this rq anymore
- * due to affinity change while p is throttled.
- */
enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
@@ -7194,6 +7190,16 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_is_throttled(p)) {
+ /* sched/core wants to dequeue this throttled task. */
+ SCHED_WARN_ON(p->se.on_rq);
+ SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+
+ list_del_init(&p->throttle_node);
+
+ return true;
+ }
+
if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
util_est_dequeue(&rq->cfs, p);
--
2.39.5
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
` (4 preceding siblings ...)
2025-03-13 7:21 ` [RFC PATCH 5/7] sched/fair: Take care of group/affinity/sched_class change for throttled task Aaron Lu
@ 2025-03-13 7:22 ` Aaron Lu
2025-03-14 4:14 ` K Prateek Nayak
2025-03-13 7:22 ` [RFC PATCH 7/7] sched/fair: Make sure cfs_rq has enough runtime_remaining on unthrottle path Aaron Lu
6 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-13 7:22 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Tasks throttled on the exit-to-user path are scheduled out by
cond_resched() in task_work_run(), but that is a preemption schedule and
does not mark an RCU quiescent state for the task.
Fix this by directly calling schedule() in throttle_cfs_rq_work().
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f26d53ac143fe..be96f7d32998c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5847,6 +5847,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
struct cfs_rq *cfs_rq;
struct rq *rq;
struct rq_flags rf;
+ bool sched = false;
WARN_ON_ONCE(p != current);
p->sched_throttle_work.next = &p->sched_throttle_work;
@@ -5879,9 +5880,13 @@ static void throttle_cfs_rq_work(struct callback_head *work)
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
resched_curr(rq);
+ sched = true;
out_unlock:
task_rq_unlock(rq, p, &rf);
+
+ if (sched)
+ schedule();
}
void init_cfs_throttle_work(struct task_struct *p)
--
2.39.5
^ permalink raw reply related [flat|nested] 58+ messages in thread
* [RFC PATCH 7/7] sched/fair: Make sure cfs_rq has enough runtime_remaining on unthrottle path
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
` (5 preceding siblings ...)
2025-03-13 7:22 ` [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle Aaron Lu
@ 2025-03-13 7:22 ` Aaron Lu
2025-03-14 4:18 ` K Prateek Nayak
6 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-13 7:22 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
It's possible unthrottle_cfs_rq() is called with !runtime_remaining,
e.g. because the user changed the quota setting (see
tg_set_cfs_bandwidth()), or because an async unthrottle left us with a
positive runtime_remaining that other still-running entities consumed
before we got here.
Either way, we can't unthrottle this cfs_rq without any runtime
remaining, because a task enqueue during unthrottle could immediately
trigger a throttle via check_enqueue_throttle(), which should never
happen.
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
---
kernel/sched/fair.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index be96f7d32998c..d646451d617c1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6058,6 +6058,19 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+ /*
+ * It's possible we are called with !runtime_remaining, e.g. because
+ * the user changed the quota setting (see tg_set_cfs_bandwidth()) or
+ * because an async unthrottle left us a positive runtime_remaining
+ * that other still-running entities consumed before we got here.
+ *
+ * Either way, we can't unthrottle this cfs_rq without any runtime
+ * remaining: any enqueue below would immediately trigger a throttle,
+ * which must not happen on the unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && !cfs_rq->runtime_remaining)
+ return;
+
cfs_rq->throttled = 0;
update_rq_clock(rq);
--
2.39.5
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-13 7:21 ` [RFC PATCH 2/7] sched/fair: Handle throttle path " Aaron Lu
@ 2025-03-13 18:14 ` K Prateek Nayak
2025-03-14 8:48 ` Aaron Lu
2025-03-14 3:28 ` K Prateek Nayak
` (2 subsequent siblings)
3 siblings, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-13 18:14 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hello Aaron,
P.S. I've fixed the wrapped lines and have been testing the series. So
far I haven't run into any issues yet on my machine. Will report back if
anything surfaces.
I have a few comments inlined below.
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
> +static inline void task_throttle_setup_work(struct task_struct *p)
> +{
> + /*
> + * Kthreads and exiting tasks don't return to userspace, so adding the
> + * work is pointless
> + */
> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> + return;
> +
> + if (task_has_throttle_work(p))
> + return;
> +
> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
Does it make sense to add a throttle work to a delayed task? It may be
dequeued soon and when it is queued back, the throttle situation might
have changed but the work is unnecessarily run. Could the throttle work
be instead added at the point of enqueue for delayed tasks?
> +}
> +
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p;
> + struct rb_node *node;
> +
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
>
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
One concern here is that PELT is seemingly frozen despite the
hierarchy being runnable. I've still not tracked down whether it'll
cause any problems once unthrottled, when all the throttled time is
negated from the pelt clock, but are there any concerns here?
Maybe this can be done at dequeue, when cfs_rq->nr_queued on a
throttled_hierarchy() reaches 0.
> + list_del_leaf_cfs_rq(cfs_rq);
>
> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
> +
> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> + /*
> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> + *
> + * All other tasks enqueued on this rq have their saved PC at the
> + * context switch, so they will go through the kernel before returning
> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> + * install the task_work on all of them.
> + */
> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + if (!entity_is_task(se))
> + goto next;
> +
> + p = task_of(se);
> + task_throttle_setup_work(p);
> +next:
> + node = rb_next(node);
> + }
> +
> + /* curr is not in the timeline tree */
> + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
I believe we can reach here from pick_next_task_fair() ->
check_cfs_rq_runtime() -> throttle_cfs_rq() in which case cfs_rq->curr
will still be set despite the task being blocked since put_prev_entity()
has not been called yet.
I believe there should be a check for task_on_rq_queued() here for the
current task.
> + p = task_of(cfs_rq->curr);
> + task_throttle_setup_work(p);
> }
> - cfs_rq->throttle_count++;
>
> return 0;
> }
>
[..snip..]
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-13 7:21 ` [RFC PATCH 2/7] sched/fair: Handle throttle path " Aaron Lu
2025-03-13 18:14 ` K Prateek Nayak
@ 2025-03-14 3:28 ` K Prateek Nayak
2025-03-14 8:57 ` Aaron Lu
2025-03-14 8:39 ` Chengming Zhou
2025-03-16 3:25 ` Josh Don
3 siblings, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 3:28 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hello Aaron,
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
>
> +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> + struct task_struct *p = container_of(work, struct task_struct,
> + sched_throttle_work);
> + struct sched_entity *se;
> + struct cfs_rq *cfs_rq;
> + struct rq *rq;
> + struct rq_flags rf;
> +
> + WARN_ON_ONCE(p != current);
> + p->sched_throttle_work.next = &p->sched_throttle_work;
> +
> + /*
> + * If task is exiting, then there won't be a return to userspace, so we
> + * don't have to bother with any of this.
> + */
> + if ((p->flags & PF_EXITING))
> + return;
> +
> + rq = task_rq_lock(p, &rf);
nit. With CLASS(task_rq_lock, rq_guard)(p), you can fetch the rq with
"rq_guard.rq", and the "goto out_unlock" can be replaced with a simple
return.
> +
> + se = &p->se;
> + cfs_rq = cfs_rq_of(se);
> +
> + /* Raced, forget */
> + if (p->sched_class != &fair_sched_class)
> + goto out_unlock;
> +
> + /*
> + * If not in limbo, then either replenish has happened or this task got
> + * migrated out of the throttled cfs_rq, move along
> + */
> + if (!cfs_rq->throttle_count)
> + goto out_unlock;
> +
> + update_rq_clock(rq);
> + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> + resched_curr(rq);
> +
> +out_unlock:
> + task_rq_unlock(rq, p, &rf);
> }
>
> void init_cfs_throttle_work(struct task_struct *p)
> @@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> return 0;
> }
>
> +static inline bool task_has_throttle_work(struct task_struct *p)
> +{
> + return p->sched_throttle_work.next != &p->sched_throttle_work;
> +}
> +
> +static inline void task_throttle_setup_work(struct task_struct *p)
> +{
> + /*
> + * Kthreads and exiting tasks don't return to userspace, so adding the
> + * work is pointless
> + */
> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> + return;
> +
> + if (task_has_throttle_work(p))
> + return;
> +
> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> +}
> +
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p;
> + struct rb_node *node;
> +
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
General question: Do we need the throttled_lb_pair() check in
can_migrate_task() with the per-task throttle? Moving a throttled task
to another CPU can ensure that the task runs sooner and exits to
user space as quickly as possible, and once the task dequeues, it will
remove itself from the list of fair tasks, making it unreachable for
the load balancer. Thoughts?
>
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> + list_del_leaf_cfs_rq(cfs_rq);
>
> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
> +
> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> + /*
> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> + *
> + * All other tasks enqueued on this rq have their saved PC at the
> + * context switch, so they will go through the kernel before returning
> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> + * install the task_work on all of them.
> + */
> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + if (!entity_is_task(se))
> + goto next;
> +
> + p = task_of(se);
> + task_throttle_setup_work(p);
> +next:
> + node = rb_next(node);
> + }
> +
> + /* curr is not in the timeline tree */
> + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
> + p = task_of(cfs_rq->curr);
> + task_throttle_setup_work(p);
> }
> - cfs_rq->throttle_count++;
>
> return 0;
> }
>
[..snip..]
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-03-13 7:21 ` [RFC PATCH 3/7] sched/fair: Handle unthrottle " Aaron Lu
@ 2025-03-14 3:53 ` K Prateek Nayak
2025-03-14 4:06 ` K Prateek Nayak
2025-03-14 10:43 ` Aaron Lu
0 siblings, 2 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 3:53 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hello Aaron,
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
> ---
> kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
> 1 file changed, 45 insertions(+), 87 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ab403ff7d53c8..4a95fe3785e43 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>
> if (cfs_rq->nr_queued == 1) {
> check_enqueue_throttle(cfs_rq);
> - if (!throttled_hierarchy(cfs_rq)) {
> - list_add_leaf_cfs_rq(cfs_rq);
> - } else {
> + list_add_leaf_cfs_rq(cfs_rq);
> #ifdef CONFIG_CFS_BANDWIDTH
> + if (throttled_hierarchy(cfs_rq)) {
> struct rq *rq = rq_of(cfs_rq);
>
> if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> cfs_rq->throttled_clock = rq_clock(rq);
> if (!cfs_rq->throttled_clock_self)
> cfs_rq->throttled_clock_self = rq_clock(rq);
These bits probably need revisiting. From what I understand, these
stats were maintained to know when a task was woken up on a
throttled hierarchy which was not connected to the parent, essentially
tracking the amount of time runnable tasks were waiting on the
cfs_rq before an unthrottle event allowed them to be picked.
With per-task throttle, these semantics no longer apply since a woken
task will run and dequeue itself when exiting to userspace.
Josh, do you have any thoughts on this?
> -#endif
> }
> +#endif
> }
> }
>
> @@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (flags & DEQUEUE_DELAYED)
> finish_delayed_dequeue_entity(se);
>
> - if (cfs_rq->nr_queued == 0)
> + if (cfs_rq->nr_queued == 0) {
> update_idle_cfs_rq_clock_pelt(cfs_rq);
> + if (throttled_hierarchy(cfs_rq))
> + list_del_leaf_cfs_rq(cfs_rq);
> + }
>
> return true;
> }
> @@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
> throttled_hierarchy(dest_cfs_rq);
> }
>
> +static inline bool task_is_throttled(struct task_struct *p)
> +{
> + return !list_empty(&p->throttle_node);
> +}
> +
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> @@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
> INIT_LIST_HEAD(&p->throttle_node);
> }
>
> +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static int tg_unthrottle_up(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p, *tmp;
>
> cfs_rq->throttle_count--;
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> - cfs_rq->throttled_clock_pelt;
> + if (cfs_rq->throttle_count)
> + return 0;
>
> - /* Add cfs_rq with load or one or more already running entities to the list */
> - if (!cfs_rq_is_decayed(cfs_rq))
> - list_add_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> + cfs_rq->throttled_clock_pelt;
>
> - if (cfs_rq->throttled_clock_self) {
> - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> + if (cfs_rq->throttled_clock_self) {
> + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
>
> - cfs_rq->throttled_clock_self = 0;
> + cfs_rq->throttled_clock_self = 0;
>
> - if (SCHED_WARN_ON((s64)delta < 0))
> - delta = 0;
> + if (SCHED_WARN_ON((s64)delta < 0))
> + delta = 0;
>
> - cfs_rq->throttled_clock_self_time += delta;
> - }
> + cfs_rq->throttled_clock_self_time += delta;
> }
>
> + /* Re-enqueue the tasks that have been throttled at this level. */
> + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list,
> + throttle_node) {
> + list_del_init(&p->throttle_node);
> + /*
> + * FIXME: p may not be allowed to run on this rq anymore
> + * due to affinity change while p is throttled.
> + */
Using dequeue_task_fair() for throttle should ensure that the core now
sees task_on_rq_queued(), which should make it go through a full dequeue
cycle that removes the task from the "throttled_limbo_list", and the
enqueue should then put it back on the correct runqueue.
Is the above comment inaccurate with your changes, or did I miss
something?
> + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> + }
> +
> + /* Add cfs_rq with load or one or more already running entities to the list */
> + if (!cfs_rq_is_decayed(cfs_rq))
> + list_add_leaf_cfs_rq(cfs_rq);
> +
> return 0;
> }
>
> @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>
> /* group is entering throttled state, stop time */
> cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
>
> SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> if (cfs_rq->nr_queued)
> cfs_rq->throttled_clock_self = rq_clock(rq);
>
> + if (!cfs_rq->nr_queued) {
> + list_del_leaf_cfs_rq(cfs_rq);
> + return 0;
> + }
> +
This bit can perhaps go in Patch 2?
> WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> /*
> * rq_lock is held, current is (obviously) executing this in kernelspace.
> @@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> {
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> - struct sched_entity *se;
> - long queued_delta, runnable_delta, idle_delta;
> - long rq_h_nr_queued = rq->cfs.h_nr_queued;
> -
> - se = cfs_rq->tg->se[cpu_of(rq)];
> + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
>
> cfs_rq->throttled = 0;
>
[..snip..]
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 4/7] sched/fair: Take care of migrated task for task based throttle
2025-03-13 7:21 ` [RFC PATCH 4/7] sched/fair: Take care of migrated task " Aaron Lu
@ 2025-03-14 4:03 ` K Prateek Nayak
2025-03-14 9:49 ` [External] " Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 4:03 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hello Aaron,
On 3/13/2025 12:51 PM, Aaron Lu wrote:
> If a task is migrated to a new cpu, it is possible this task is not
> throttled but the new cfs_rq is throttled or vice versa. Take care of
> these situations in enqueue path.
>
> Note that we can't handle this in migrate_task_rq_fair() because there,
> the dst cpu's rq lock is not held and things like checking if the new
> cfs_rq needs throttle can be racy.
>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> ---
> kernel/sched/fair.c | 17 +++++++++++++++++
> 1 file changed, 17 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4a95fe3785e43..9e036f18d73e6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7051,6 +7051,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> assert_list_leaf_cfs_rq(rq);
>
> hrtick_update(rq);
> +
> + if (!cfs_bandwidth_used())
> + return;
> +
> + /*
> + * This is for migrate_task_rq_fair(): the new_cpu's rq lock is not held
> + * in migrate_task_rq_fair() so we have to do these things in enqueue
> + * time when the dst cpu's rq lock is held. Doing this check in enqueue
> + * time also takes care of newly woken up tasks, e.g. a task wakes up
> + * into a throttled cfs_rq.
> + *
> + * It's possible the task has a throttle work added but this new cfs_rq
> + * is not in throttled hierarchy but that's OK, throttle_cfs_rq_work()
> + * will take care of it.
> + */
> + if (throttled_hierarchy(cfs_rq_of(&p->se)))
> + task_throttle_setup_work(p);
Any reason we can't move this to somewhere towards the top? The
throttled_hierarchy() check should be cheap enough, and we probably
don't need the cfs_bandwidth_used() guarding check unless there are
other concerns that I may have missed.
> }
>
> static void set_next_buddy(struct sched_entity *se);
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-03-14 3:53 ` K Prateek Nayak
@ 2025-03-14 4:06 ` K Prateek Nayak
2025-03-14 10:43 ` Aaron Lu
1 sibling, 0 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 4:06 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
On 3/14/2025 9:23 AM, K Prateek Nayak wrote:
[..snip..]
>>
>> + /* Re-enqueue the tasks that have been throttled at this level. */
>> + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list,
>> + throttle_node) {
>> + list_del_init(&p->throttle_node);
>> + /*
>> + * FIXME: p may not be allowed to run on this rq anymore
>> + * due to affinity change while p is throttled.
>> + */
>
> Using dequeue_task_fair() for throttle should ensure that the core now
> sees task_on_rq_queued() which should make it go throgh a full dequeue
> cycle which will remove the task from the "throttled_limbo_list" and
> the enqueue should put it back on the correct runqueue.
>
> Is the above comment inaccurate with your changes or did I miss
> something?
Please ignore this, I just reached Patch 5. Sorry for the noise.
>
>> + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
>> + }
>> +
>> + /* Add cfs_rq with load or one or more already running entities to the list */
>> + if (!cfs_rq_is_decayed(cfs_rq))
>> + list_add_leaf_cfs_rq(cfs_rq);
>> +
>> return 0;
>> }
>>
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle
2025-03-13 7:22 ` [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle Aaron Lu
@ 2025-03-14 4:14 ` K Prateek Nayak
2025-03-14 11:37 ` [External] " Aaron Lu
2025-03-31 6:19 ` Aaron Lu
0 siblings, 2 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 4:14 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hello Aaron,
On 3/13/2025 12:52 PM, Aaron Lu wrote:
> Tasks throttled on the exit-to-user path are scheduled by cond_resched() in
> task_work_run(), but that is a preempt schedule and doesn't mark a task
> rcu quiescent state.
>
> Fix this by directly calling schedule() in throttle_cfs_rq_work().
Perhaps that can be gotten around by just using set_ti_thread_flag():
resched_curr() will also call set_preempt_need_resched(), which allows
cond_resched() to resched the task.
Since exit_to_user_mode_loop() will run once again on seeing that
TIF_NEED_RESCHED is set, schedule() should follow soon. Thoughts?
>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> ---
> kernel/sched/fair.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f26d53ac143fe..be96f7d32998c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5847,6 +5847,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> struct cfs_rq *cfs_rq;
> struct rq *rq;
> struct rq_flags rf;
> + bool sched = false;
>
> WARN_ON_ONCE(p != current);
> p->sched_throttle_work.next = &p->sched_throttle_work;
> @@ -5879,9 +5880,13 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> resched_curr(rq);
> + sched = true;
>
> out_unlock:
> task_rq_unlock(rq, p, &rf);
> +
> + if (sched)
> + schedule();
> }
>
> void init_cfs_throttle_work(struct task_struct *p)
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 7/7] sched/fair: Make sure cfs_rq has enough runtime_remaining on unthrottle path
2025-03-13 7:22 ` [RFC PATCH 7/7] sched/fair: Make sure cfs_rq has enough runtime_remaining on unthrottle path Aaron Lu
@ 2025-03-14 4:18 ` K Prateek Nayak
2025-03-14 11:39 ` [External] " Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 4:18 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hello Aaron,
On 3/13/2025 12:52 PM, Aaron Lu wrote:
> It's possible unthrottle_cfs_rq() is called with !runtime_remaining,
> e.g. because the user changed the quota setting (see
> tg_set_cfs_bandwidth()) or because an async unthrottle left us a
> positive runtime_remaining that other still-running entities consumed
> before we got here.
>
> Either way, we can't unthrottle this cfs_rq without any runtime
> remaining, because a task enqueue during unthrottle could immediately
> trigger another throttle via check_enqueue_throttle(), which should
> never happen.
>
> Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> ---
> kernel/sched/fair.c | 13 +++++++++++++
> 1 file changed, 13 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index be96f7d32998c..d646451d617c1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6058,6 +6058,19 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
>
> + /*
> + * It's possible we are called with !runtime_remaining, e.g. because
> + * the user changed the quota setting (see tg_set_cfs_bandwidth()) or
> + * because an async unthrottle left us a positive runtime_remaining
> + * that other still-running entities consumed before we got here.
> + *
> + * Either way, we can't unthrottle this cfs_rq without any runtime
> + * remaining: any enqueue below would immediately trigger a throttle,
> + * which must not happen on the unthrottle path.
> + */
> + if (cfs_rq->runtime_enabled && !cfs_rq->runtime_remaining)
Should this be "cfs_rq->runtime_remaining <= 0", since slack could have
built up by the time we come here?
> + return;
> +
> cfs_rq->throttled = 0;
>
> update_rq_clock(rq);
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 5/7] sched/fair: Take care of group/affinity/sched_class change for throttled task
2025-03-13 7:21 ` [RFC PATCH 5/7] sched/fair: Take care of group/affinity/sched_class change for throttled task Aaron Lu
@ 2025-03-14 4:51 ` K Prateek Nayak
2025-03-14 11:40 ` [External] " Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 4:51 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hello Aaron,
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5876,8 +5876,8 @@ static void throttle_cfs_rq_work(struct callback_head *work)
>
> update_rq_clock(rq);
> WARN_ON_ONCE(!list_empty(&p->throttle_node));
> - list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> resched_curr(rq);
nit. Perhaps this bit can be moved to Patch 2 to consolidate all
changes in throttle_cfs_rq_work()
>
> out_unlock:
[..snip..]
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-13 7:21 ` [RFC PATCH 2/7] sched/fair: Handle throttle path " Aaron Lu
2025-03-13 18:14 ` K Prateek Nayak
2025-03-14 3:28 ` K Prateek Nayak
@ 2025-03-14 8:39 ` Chengming Zhou
2025-03-14 8:49 ` K Prateek Nayak
2025-03-14 9:42 ` Aaron Lu
2025-03-16 3:25 ` Josh Don
3 siblings, 2 replies; 58+ messages in thread
From: Chengming Zhou @ 2025-03-14 8:39 UTC (permalink / raw)
To: Aaron Lu, Valentin Schneider, Ben Segall, K Prateek Nayak,
Peter Zijlstra, Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chuyi Zhou
On 2025/3/13 15:21, Aaron Lu wrote:
> From: Valentin Schneider <vschneid@redhat.com>
>
> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
> add a task work to them so that when those tasks return to user, the
> actual throttle/dequeue can happen.
>
> Note that since the throttle/dequeue always happens on a task basis when
> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
> to return a value and pick_task_fair() acts differently according to that
> return value, so check_cfs_rq_runtime() is changed to not return a
> value.
Previously, with the per-cfs_rq throttling, we used the update_curr() -> put()
path to throttle the cfs_rq and dequeue it from the cfs_rq tree.
Now, with your per-task throttling, maybe things can become simpler: we
could just throttle_cfs_rq() (the cfs_rq subtree) during curr accounting
to mark these throttled.
Then, if we pick a task from a throttled cfs_rq subtree, we can set up the
task work for it, so we don't bother with the delayed_dequeue task case
that Prateek mentioned.
WDYT?
Thanks.
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-13 18:14 ` K Prateek Nayak
@ 2025-03-14 8:48 ` Aaron Lu
2025-03-14 9:00 ` K Prateek Nayak
0 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 8:48 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Thu, Mar 13, 2025 at 11:44:49PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> P.S. I've fixed the wrapped lines and have been testing the series. So
> far I haven't run into any issues yet on my machine. Will report back if
> anything surface.
Thanks a lot for taking the time to review and test.
>
> I've few comments inlined below.
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>
> [..snip..]
>
> > +static inline void task_throttle_setup_work(struct task_struct *p)
> > +{
> > + /*
> > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > + * work is pointless
> > + */
> > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > + return;
> > +
> > + if (task_has_throttle_work(p))
> > + return;
> > +
> > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>
> Does it make sense to add a throttle work to a delayed task? It may be
I missed the case that a delayed task can still be on the cfs_rq, and I
agree there is no need to add the throttle work to a delayed task.
> dequeued soon and when it is queued back, the throttle situation might
> have changed but the work is unnecessarily run. Could the throttle work
> be instead added at the point of enqueue for delayed tasks?
Yes. If a delayed task gets re-queued and its cfs_rq is in a throttled
hierarchy, it should have the throttle work added.
>
> > +}
> > +
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > + struct task_struct *p;
> > + struct rb_node *node;
> > +
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
> >
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>
> One concern here is that PELT is seemingly frozen despite the
> hierarchy being runnable. I've still not tracked down whether it'll
> cause any problems once unthrottled, when all the throttled time is
> negated from the pelt clock, but are there any concerns here?
I chose to do it this way because:
1 I expect that most of the time, if a task has to continue to run after
its cfs_rq gets throttled, that time is relatively small, so it should
not cause much impact. But I agree there can be times when a task runs
relatively long;
2 I think the original intent of freezing the cfs_rq's pelt clock on
throttle is so that on unthrottle, it can restore its load (without its
load being decayed etc.). If I chose not to freeze its pelt clock on
throttle because some task is still running in kernel mode, then since
some of this cfs_rq's tasks are throttled, its load can become smaller
and this can impact its load on unthrottle.
I think neither approach is perfect, so I chose the simple one for now
:) Not sure if my thinking is correct though.
> Maybe this can be done at dequeue, when cfs_rq->nr_queued on a
> throttled_hierarchy() reaches 0.
Yes, this looks more consistent, maybe I should change to this approach.
> > + list_del_leaf_cfs_rq(cfs_rq);
> >
> > - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > +
> > + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > + /*
> > + * rq_lock is held, current is (obviously) executing this in kernelspace.
> > + *
> > + * All other tasks enqueued on this rq have their saved PC at the
> > + * context switch, so they will go through the kernel before returning
> > + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> > + * install the task_work on all of them.
> > + */
> > + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> > + while (node) {
> > + struct sched_entity *se = __node_2_se(node);
> > +
> > + if (!entity_is_task(se))
> > + goto next;
> > +
> > + p = task_of(se);
> > + task_throttle_setup_work(p);
> > +next:
> > + node = rb_next(node);
> > + }
> > +
> > + /* curr is not in the timeline tree */
> > + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>
> I believe we can reach here from pick_next_task_fair() ->
> check_cfs_rq_runtime() -> throttle_cfs_rq() in which case cfs_rq->curr
> will still be set despite the task being blocked since put_prev_entity()
> has not been called yet.
>
> I believe there should be a check for task_on_rq_queued() here for the
> current task.
Ah right, I'll see how to fix this.
Thanks,
Aaron
> > + p = task_of(cfs_rq->curr);
> > + task_throttle_setup_work(p);
> > }
> > - cfs_rq->throttle_count++;
> >
> > return 0;
> > }
> >
>
> [..snip..]
>
> --
> Thanks and Regards,
> Prateek
>
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 8:39 ` Chengming Zhou
@ 2025-03-14 8:49 ` K Prateek Nayak
2025-03-14 9:42 ` Aaron Lu
1 sibling, 0 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 8:49 UTC (permalink / raw)
To: Chengming Zhou, Aaron Lu, Valentin Schneider, Ben Segall,
Peter Zijlstra, Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chuyi Zhou
On 3/14/2025 2:09 PM, Chengming Zhou wrote:
> On 2025/3/13 15:21, Aaron Lu wrote:
>> From: Valentin Schneider <vschneid@redhat.com>
>>
>> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
>> add a task work to them so that when those tasks return to user, the
>> actual throttle/dequeue can happen.
>>
>> Note that since the throttle/dequeue always happens on a task basis when
>> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
>> to return a value and pick_task_fair() acts differently according to that
>> return value, so check_cfs_rq_runtime() is changed to not return a
>> value.
>
> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>
> Now with your per-task throttling, maybe things can become simpler: we
> can just call throttle_cfs_rq() (on the cfs_rq subtree) during curr accounting
> to mark these throttled.
>
> Then if we pick a task from a throttled cfs_rq subtree, we can set up task work
> for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
+1
That seems like a good idea with the per-task approach.
>
> WDYT?
>
> Thanks.
--
Thanks and Regards,
Prateek
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 3:28 ` K Prateek Nayak
@ 2025-03-14 8:57 ` Aaron Lu
2025-03-14 9:12 ` K Prateek Nayak
0 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 8:57 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 08:58:14AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>
> [..snip..]
>
> >
> > +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> > static void throttle_cfs_rq_work(struct callback_head *work)
> > {
> > + struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
> > + struct sched_entity *se;
> > + struct cfs_rq *cfs_rq;
> > + struct rq *rq;
> > + struct rq_flags rf;
> > +
> > + WARN_ON_ONCE(p != current);
> > + p->sched_throttle_work.next = &p->sched_throttle_work;
> > +
> > + /*
> > + * If task is exiting, then there won't be a return to userspace, so we
> > + * don't have to bother with any of this.
> > + */
> > + if ((p->flags & PF_EXITING))
> > + return;
> > +
> > + rq = task_rq_lock(p, &rf);
>
> nit. With CLASS(task_rq_lock, rq_guard)(p), you can fetch the rq with
> "rq_guard.rq" and the "goto out_unlock" can be replaced with a simple
> return.
Got it, thanks for the suggestion.
> > +
> > + se = &p->se;
> > + cfs_rq = cfs_rq_of(se);
> > +
> > + /* Raced, forget */
> > + if (p->sched_class != &fair_sched_class)
> > + goto out_unlock;
> > +
> > + /*
> > + * If not in limbo, then either replenish has happened or this task got
> > + * migrated out of the throttled cfs_rq, move along
> > + */
> > + if (!cfs_rq->throttle_count)
> > + goto out_unlock;
> > +
> > + update_rq_clock(rq);
> > + WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > + resched_curr(rq);
> > +
> > +out_unlock:
> > + task_rq_unlock(rq, p, &rf);
> > }
> >
> > void init_cfs_throttle_work(struct task_struct *p)
> > @@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
> > return 0;
> > }
> >
> > +static inline bool task_has_throttle_work(struct task_struct *p)
> > +{
> > + return p->sched_throttle_work.next != &p->sched_throttle_work;
> > +}
> > +
> > +static inline void task_throttle_setup_work(struct task_struct *p)
> > +{
> > + /*
> > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > + * work is pointless
> > + */
> > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > + return;
> > +
> > + if (task_has_throttle_work(p))
> > + return;
> > +
> > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> > +}
> > +
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > + struct task_struct *p;
> > + struct rb_node *node;
> > +
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
>
> General question: Do we need the throttled_lb_pair() check in
> can_migrate_task() with the per-task throttle? Moving a throttled task
> to another CPU can ensure that the task can run quicker and exit to
> user space as quickly as possible and once the task dequeues, it will
> remove itself from the list of fair tasks making it unreachable for
> the load balancer. Thoughts?
That's a good point.
The current approach dequeues the task and removes it from the rq's
cfs_tasks list, causing it to lose the load balance opportunity. This is
pretty sad.
I'll need to think about this. I hope we can somehow keep the throttled
tasks in cfs_tasks list, I'll see how to make this happen.
Thanks,
Aaron
> >
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > + list_del_leaf_cfs_rq(cfs_rq);
> >
> > - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > +
> > + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > + /*
> > + * rq_lock is held, current is (obviously) executing this in kernelspace.
> > + *
> > + * All other tasks enqueued on this rq have their saved PC at the
> > + * context switch, so they will go through the kernel before returning
> > + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> > + * install the task_work on all of them.
> > + */
> > + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> > + while (node) {
> > + struct sched_entity *se = __node_2_se(node);
> > +
> > + if (!entity_is_task(se))
> > + goto next;
> > +
> > + p = task_of(se);
> > + task_throttle_setup_work(p);
> > +next:
> > + node = rb_next(node);
> > + }
> > +
> > + /* curr is not in the timeline tree */
> > + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
> > + p = task_of(cfs_rq->curr);
> > + task_throttle_setup_work(p);
> > }
> > - cfs_rq->throttle_count++;
> >
> > return 0;
> > }
> >
>
> [..snip..]
>
> --
> Thanks and Regards,
> Prateek
>
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 8:48 ` Aaron Lu
@ 2025-03-14 9:00 ` K Prateek Nayak
0 siblings, 0 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 9:00 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
Hello Aaron,
On 3/14/2025 2:18 PM, Aaron Lu wrote:
>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>> {
>>> struct rq *rq = data;
>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>> + struct task_struct *p;
>>> + struct rb_node *node;
>>> +
>>> + cfs_rq->throttle_count++;
>>> + if (cfs_rq->throttle_count > 1)
>>> + return 0;
>>>
>>> /* group is entering throttled state, stop time */
>>> - if (!cfs_rq->throttle_count) {
>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> - list_del_leaf_cfs_rq(cfs_rq);
>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>
>> One concern here is that the PELT is seemingly frozen despite the
>> hierarchy being runnable. I've still not tracked down whether it'll
>> cause any problems once unthrottled and all throttled time is negated
>> from the pelt clock, but are there any concerns here?
>
> I chose to do it this way because:
> 1. I expect that most of the time, if a task has to continue to run after its
> cfs_rq gets throttled, the remaining time is relatively small so it should not
> cause much impact. But I agree there can be times when a task runs relatively long;
> 2. I think the original intent of freezing a cfs_rq's pelt clock on throttle
> is so that on unthrottle, it can restore its load (without its load being
> decayed etc.). If I chose not to freeze its pelt clock on throttle
> because some task is still running in kernel mode, then since some of this
> cfs_rq's tasks are throttled, its load can become smaller and this can
> impact its load on unthrottle.
>
> I think neither approach is perfect, so I chose the simple one for now
> :) Not sure if my thinking is correct though.
>
>> Maybe this can be done at dequeue when cfs_rq->nr_queued on a
>> throttled_hierarchy() reached 0.
>
> Yes, this looks more consistent; maybe I should switch to this approach.
I agree the time might be small in most cases but some syscalls with
enough contention in the system can take a while to exit to user mode.
Even I'm not sure what the correct approach is here - should a
subtree's PELT be frozen when the last task dequeues or should we
freeze it for the whole hierarchy once the throttled cfs_rq dequeues?
I'll wait for other folks to chime in since they know these bits
better than me.
>
>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>
>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> - if (cfs_rq->nr_queued)
>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> + if (cfs_rq->nr_queued)
>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>> +
>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>> + /*
>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>> + *
>>> + * All other tasks enqueued on this rq have their saved PC at the
>>> + * context switch, so they will go through the kernel before returning
>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>> + * install the task_work on all of them.
>>> + */
>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>> + while (node) {
>>> + struct sched_entity *se = __node_2_se(node);
>>> +
>>> + if (!entity_is_task(se))
>>> + goto next;
>>> +
>>> + p = task_of(se);
>>> + task_throttle_setup_work(p);
>>> +next:
>>> + node = rb_next(node);
>>> + }
>>> +
>>> + /* curr is not in the timeline tree */
>>> + if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>>
>> I believe we can reach here from pick_next_task_fair() ->
>> check_cfs_rq_runtime() -> throttle_cfs_rq() in which case cfs_rq->curr
>> will still be set despite the task being blocked since put_prev_entity()
>> has not been called yet.
>>
>> I believe there should be a check for task_on_rq_queued() here for the
>> current task.
>
> Ah right, I'll see how to fix this.
It may not be necessary with the recent suggestion from Chengming where
you can just add the task work if the task was picked on a throttled
hierarchy.
--
Thanks and Regards,
Prateek
>
> Thanks,
> Aaron
>
>>> + p = task_of(cfs_rq->curr);
>>> + task_throttle_setup_work(p);
>>> }
>>> - cfs_rq->throttle_count++;
>>>
>>> return 0;
>>> }
>>>
>>
>> [..snip..]
>>
>> --
>> Thanks and Regards,
>> Prateek
>>
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 8:57 ` Aaron Lu
@ 2025-03-14 9:12 ` K Prateek Nayak
2025-03-14 15:10 ` Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 9:12 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
Hello Aaron,
On 3/14/2025 2:27 PM, Aaron Lu wrote:
>>>
>>> +static inline bool task_has_throttle_work(struct task_struct *p)
>>> +{
>>> + return p->sched_throttle_work.next != &p->sched_throttle_work;
>>> +}
>>> +
>>> +static inline void task_throttle_setup_work(struct task_struct *p)
>>> +{
>>> + /*
>>> + * Kthreads and exiting tasks don't return to userspace, so adding the
>>> + * work is pointless
>>> + */
>>> + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
>>> + return;
>>> +
>>> + if (task_has_throttle_work(p))
>>> + return;
>>> +
>>> + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>>> +}
>>> +
>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>> {
>>> struct rq *rq = data;
>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>> + struct task_struct *p;
>>> + struct rb_node *node;
>>> +
>>> + cfs_rq->throttle_count++;
>>> + if (cfs_rq->throttle_count > 1)
>>> + return 0;
>>
>> General question: Do we need the throttled_lb_pair() check in
>> can_migrate_task() with the per-task throttle? Moving a throttled task
>> to another CPU can ensure that the task can run quicker and exit to
>> user space as quickly as possible and once the task dequeues, it will
>> remove itself from the list of fair tasks making it unreachable for
>> the load balancer. Thoughts?
>
> That's a good point.
>
> The current approach dequeues the task and removes it from the rq's
> cfs_tasks list, causing it to lose the load balance opportunity. This is
> pretty sad.
That is fine. Today we have the throttled_lb_pair() check since tasks
on a throttled hierarchy remain on the fair tasks list; the load
balancer should not move them around since they don't contribute to
the current load in the throttled state, and moving them around will not
change anything: they'll still be waiting on another CPU for unthrottle.
With per-task throttle, we know that all the tasks on the fair task
list are runnable (except for the delayed ones, but they contribute to
the load) - tasks on a throttled hierarchy that exit to userspace will
dequeue themselves, removing themselves from the fair task list too.
Since a task that hasn't dequeued itself on a throttled hierarchy is
runnable, I'm suggesting we get rid of the throttled_lb_pair() check
in can_migrate_task() entirely.
>
> I'll need to think about this. I hope we can somehow keep the throttled
> tasks in cfs_tasks list, I'll see how to make this happen.
>
> Thanks,
> Aaron
>
[..snip..]
--
Thanks and Regards,
Prateek
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 8:39 ` Chengming Zhou
2025-03-14 8:49 ` K Prateek Nayak
@ 2025-03-14 9:42 ` Aaron Lu
2025-03-14 10:26 ` K Prateek Nayak
2025-03-14 11:07 ` Chengming Zhou
1 sibling, 2 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 9:42 UTC (permalink / raw)
To: Chengming Zhou
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
> On 2025/3/13 15:21, Aaron Lu wrote:
> > From: Valentin Schneider <vschneid@redhat.com>
> >
> > Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
> > add a task work to them so that when those tasks return to user, the
> > actual throttle/dequeue can happen.
> >
> > Note that since the throttle/dequeue always happens on a task basis when
> > it returns to user, it's no longer necessary for check_cfs_rq_runtime()
> > to return a value and pick_task_fair() acts differently according to that
> > return value, so check_cfs_rq_runtime() is changed to not return a
> > value.
>
> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>
>> Now with your per-task throttling, maybe things can become simpler: we
>> can just call throttle_cfs_rq() (on the cfs_rq subtree) during curr accounting
>> to mark these throttled.
Do I understand correctly that now in throttle_cfs_rq(), we just mark
this hierarchy as throttled, but do not add any throttle work to the
tasks in this hierarchy, and leave adding the throttle work to pick
time?
>> Then if we pick a task from a throttled cfs_rq subtree, we can set up task work
>> for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
If we add a check at pick time, maybe we can also avoid the check
at enqueue time. One thing I'm thinking is that a task may be picked
multiple times with only a single enqueue, so if we do the check at pick,
the overhead can be larger?
> WDYT?
Thanks for your suggestion. I'll try this approach and see how it turns
out.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [External] Re: [RFC PATCH 4/7] sched/fair: Take care of migrated task for task based throttle
2025-03-14 4:03 ` K Prateek Nayak
@ 2025-03-14 9:49 ` Aaron Lu
0 siblings, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 9:49 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 09:33:10AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
> > If a task is migrated to a new cpu, it is possible this task is not
> > throttled but the new cfs_rq is throttled or vice versa. Take care of
> > these situations in enqueue path.
> >
> > Note that we can't handle this in migrate_task_rq_fair() because there,
> > the dst cpu's rq lock is not held and things like checking if the new
> > cfs_rq needs throttle can be racy.
> >
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > ---
> > kernel/sched/fair.c | 17 +++++++++++++++++
> > 1 file changed, 17 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 4a95fe3785e43..9e036f18d73e6 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7051,6 +7051,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> > assert_list_leaf_cfs_rq(rq);
> >
> > hrtick_update(rq);
> > +
> > + if (!cfs_bandwidth_used())
> > + return;
> > +
> > + /*
> > + * This is for migrate_task_rq_fair(): the new_cpu's rq lock is not held
> > + * in migrate_task_rq_fair() so we have to do these things in enqueue
> > + * time when the dst cpu's rq lock is held. Doing this check in enqueue
> > + * time also takes care of newly woken up tasks, e.g. a task wakes up
> > + * into a throttled cfs_rq.
> > + *
> > + * It's possible the task has a throttle work added but this new cfs_rq
> > + * is not in throttled hierarchy but that's OK, throttle_cfs_rq_work()
> > + * will take care of it.
> > + */
> > + if (throttled_hierarchy(cfs_rq_of(&p->se)))
> > + task_throttle_setup_work(p);
>
> Any reason we can't move this to somewhere towards the top?
> The throttled_hierarchy() check should be cheap enough, and we probably don't
> need the cfs_bandwidth_used() guarding check unless there are other
> concerns that I may have missed.
I didn't consider the delayed dequeue case, so I placed this at the
bottom, but as you have mentioned, for delayed dequeue tasks that get
re-queued, this has to be at the top.
Will change it to the top in the next version.
Thanks!
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 9:42 ` Aaron Lu
@ 2025-03-14 10:26 ` K Prateek Nayak
2025-03-14 11:47 ` Aaron Lu
2025-03-14 11:07 ` Chengming Zhou
1 sibling, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 10:26 UTC (permalink / raw)
To: Aaron Lu, Chengming Zhou
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
Hello Aaron,
On 3/14/2025 3:12 PM, Aaron Lu wrote:
>> Then then if we pick a task from a throttled cfs_rq subtree, we can setup task work
>> for it, so we don't botter with the delayed_dequeue task case that Prateek mentioned.
> If we add a check point in pick time, maybe we can also avoid the check
> in enqueue time. One thing I'm thinking is, for a task, it may be picked
> multiple times with only a single enqueue so if we do the check in pick,
> the overhead can be larger?
I think it can be minimized to a good extent. Something like:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d646451d617c..ba6571368840 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
static inline void task_throttle_setup_work(struct task_struct *p)
{
+ if (task_has_throttle_work(p))
+ return;
+
/*
* Kthreads and exiting tasks don't return to userspace, so adding the
* work is pointless
@@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
if ((p->flags & (PF_EXITING | PF_KTHREAD)))
return;
- if (task_has_throttle_work(p))
- return;
-
task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
}
@@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
node = rb_next(node);
}
- /* curr is not in the timeline tree */
- if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
- p = task_of(cfs_rq->curr);
- task_throttle_setup_work(p);
- }
-
return 0;
}
@@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
SCHED_WARN_ON(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
+
+ /*
+ * If cfs_rq->curr is set, check if current task is queued
+ * and set up the throttle work proactively.
+ */
+ if (cfs_rq->curr) {
+ struct task_struct *p = rq->donor; /* scheduling context with proxy */
+
+ if (task_on_rq_queued(p))
+ task_throttle_setup_work(p);
+ }
+
return;
}
@@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
struct sched_entity *pse = &prev->se;
struct cfs_rq *cfs_rq;
+ /*
+ * Check if throttle work needs to be setup when
+ * switching to a different task.
+ */
+ if (throttled_hierarchy(cfs_rq_of(se)))
+ task_throttle_setup_work(p);
+
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
int pse_depth = pse->depth;
@@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
account_cfs_rq_runtime(cfs_rq, 0);
}
+ if (throttled_hierarchy(cfs_rq_of(se)))
+ task_throttle_setup_work(p);
+
__set_next_task_fair(rq, p, first);
}
--
.. with the additional plumbing in place of course.
--
Thanks and Regards,
Prateek
^ permalink raw reply related [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-03-14 3:53 ` K Prateek Nayak
2025-03-14 4:06 ` K Prateek Nayak
@ 2025-03-14 10:43 ` Aaron Lu
2025-03-14 17:52 ` K Prateek Nayak
1 sibling, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 10:43 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 09:23:47AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>
> [..snip..]
>
> > ---
> > kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
> > 1 file changed, 45 insertions(+), 87 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index ab403ff7d53c8..4a95fe3785e43 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> >
> > if (cfs_rq->nr_queued == 1) {
> > check_enqueue_throttle(cfs_rq);
> > - if (!throttled_hierarchy(cfs_rq)) {
> > - list_add_leaf_cfs_rq(cfs_rq);
> > - } else {
> > + list_add_leaf_cfs_rq(cfs_rq);
> > #ifdef CONFIG_CFS_BANDWIDTH
> > + if (throttled_hierarchy(cfs_rq)) {
> > struct rq *rq = rq_of(cfs_rq);
> >
> > if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> > cfs_rq->throttled_clock = rq_clock(rq);
> > if (!cfs_rq->throttled_clock_self)
> > cfs_rq->throttled_clock_self = rq_clock(rq);
>
> These bits probably need revisiting. From what I understand, these
> stats were maintained to know when a task was woken up on a
> throttled hierarchy which was not connected to the parent essentially
> tracking the amount of time runnable tasks were waiting on the
> cfs_rq before an unthrottle event allowed them to be picked.
Do you mean these throttled_clock stats?
I think they are here because we do not record the throttled_clock for
empty cfs_rqs, and once the cfs_rq has a task enqueued, it needs to record
its throttled_clock. This is done in commit 79462e8c879a ("sched: don't
account throttle time for empty groups") by Josh. I don't think per-task
throttle changes this.
That said, I think there is a gap in per-task throttle, i.e. when
all tasks are dequeued from this throttled cfs_rq, we should record its
throttled_time and clear its throttled_clock.
>
> With per-task throttle, these semantics no longer apply since a woken
> task will run and dequeue itself when exiting to userspace.
>
> Josh do you have any thoughts on this?
>
> > -#endif
> > }
> > +#endif
> > }
> > }
> >
> > @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)
> >
> > /* group is entering throttled state, stop time */
> > cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> >
> > SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > if (cfs_rq->nr_queued)
> > cfs_rq->throttled_clock_self = rq_clock(rq);
> >
> > + if (!cfs_rq->nr_queued) {
> > + list_del_leaf_cfs_rq(cfs_rq);
> > + return 0;
> > + }
> > +
>
> This bit can perhaps go in Patch 2?
I kept all the changes to leaf cfs_rq handling in one patch; I think it
is easier to review :-)
Thanks,
Aaron
> > WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > /*
> > * rq_lock is held, current is (obviously) executing this in kernelspace.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 9:42 ` Aaron Lu
2025-03-14 10:26 ` K Prateek Nayak
@ 2025-03-14 11:07 ` Chengming Zhou
2025-03-31 6:42 ` Aaron Lu
1 sibling, 1 reply; 58+ messages in thread
From: Chengming Zhou @ 2025-03-14 11:07 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
On 2025/3/14 17:42, Aaron Lu wrote:
> On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
>> On 2025/3/13 15:21, Aaron Lu wrote:
>>> From: Valentin Schneider <vschneid@redhat.com>
>>>
>>> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
>>> add a task work to them so that when those tasks return to user, the
>>> actual throttle/dequeue can happen.
>>>
>>> Note that since the throttle/dequeue always happens on a task basis when
>>> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
>>> to return a value and pick_task_fair() acts differently according to that
>>> return value, so check_cfs_rq_runtime() is changed to not return a
>>> value.
>>
>> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
>> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>>
>> Now with your per-task throttling, maybe things can become simpler: we
>> can just call throttle_cfs_rq() (on the cfs_rq subtree) during curr accounting
>> to mark these throttled.
>
> Do I understand correctly that now in throttle_cfs_rq(), we just mark
> this hierarchy as throttled, but do not add any throttle work to the
> tasks in this hierarchy, and leave adding the throttle work to pick
> time?
Right, we can move throttle_cfs_rq() forward to curr accounting time, which
just marks these throttled.
And move setup_task_work() back to task pick time, which makes that task
dequeue itself on return to user.
>
>> Then if we pick a task from a throttled cfs_rq subtree, we can set up task work
>> for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
>
> If we add a check at pick time, maybe we can also avoid the check
> at enqueue time. One thing I'm thinking is that a task may be picked
> multiple times with only a single enqueue, so if we do the check at pick,
> the overhead can be larger?
As Prateek already mentioned, this check cost is negligible.
>
>> WDYT?
>
> Thanks for your suggestion. I'll try this approach and see how it turns
> out.
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [External] Re: [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle
2025-03-14 4:14 ` K Prateek Nayak
@ 2025-03-14 11:37 ` Aaron Lu
2025-03-31 6:19 ` Aaron Lu
1 sibling, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 11:37 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 09:44:45AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:52 PM, Aaron Lu wrote:
> > Tasks throttled on the exit-to-user path are scheduled by cond_resched() in
> > task_work_run(), but that is a preempt schedule and doesn't mark a task
> > RCU quiescent state.
> >
> > Fix this by directly calling schedule() in throttle_cfs_rq_work().
>
> Perhaps that can be gotten around by just using set_ti_thread_flag();
> resched_curr() will also call set_preempt_need_resched(), which allows
> cond_resched() to resched the task.
>
> Since exit_to_user_mode_loop() will run once again seeing that
> TIF_NEED_RESCHED is set, schedule() should follow soon. Thoughts?
Might also work, will give it a shot, thanks for the suggestion.
Best regards,
Aaron
> >
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > ---
> > kernel/sched/fair.c | 5 +++++
> > 1 file changed, 5 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index f26d53ac143fe..be96f7d32998c 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5847,6 +5847,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> > struct cfs_rq *cfs_rq;
> > struct rq *rq;
> > struct rq_flags rf;
> > + bool sched = false;
> >
> > WARN_ON_ONCE(p != current);
> > p->sched_throttle_work.next = &p->sched_throttle_work;
> > @@ -5879,9 +5880,13 @@ static void throttle_cfs_rq_work(struct
> > callback_head *work)
> > dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > resched_curr(rq);
> > + sched = true;
> >
> > out_unlock:
> > task_rq_unlock(rq, p, &rf);
> > +
> > + if (sched)
> > + schedule();
> > }
> >
> > void init_cfs_throttle_work(struct task_struct *p)
>
> --
> Thanks and Regards,
> Prateek
>
* Re: [External] Re: [RFC PATCH 7/7] sched/fair: Make sure cfs_rq has enough runtime_remaining on unthrottle path
2025-03-14 4:18 ` K Prateek Nayak
@ 2025-03-14 11:39 ` Aaron Lu
0 siblings, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 11:39 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 09:48:00AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:52 PM, Aaron Lu wrote:
> > It's possible unthrottle_cfs_rq() is called with !runtime_remaining
> > due to things like a user changing the quota setting (see tg_set_cfs_bandwidth()),
> > or an async unthrottle leaving us a positive runtime_remaining that other
> > still-running entities consumed before we reached this point.
> >
> > Anyway, we can't unthrottle this cfs_rq without any runtime remaining
> > because task enqueue during unthrottle can immediately trigger a throttle
> > by check_enqueue_throttle(), which should never happen.
> >
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > ---
> > kernel/sched/fair.c | 13 +++++++++++++
> > 1 file changed, 13 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index be96f7d32998c..d646451d617c1 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6058,6 +6058,19 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> > struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> > struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
> >
> > + /*
> > + * It's possible we are called with !runtime_remaining due to things
> > + * like a user changing the quota setting (see tg_set_cfs_bandwidth()) or async
> > + * unthrottled us with a positive runtime_remaining but other still
> > + * running entities consumed those runtime before we reach here.
> > + *
> > + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining
> > + * because any enqueue below will immediately trigger a throttle, which
> > + * is not supposed to happen on unthrottle path.
> > + */
> > + if (cfs_rq->runtime_enabled && !cfs_rq->runtime_remaining)
>
> Should this be "cfs_rq->runtime_remaining <= 0" since slack could have
> built up by the time we come here?
Absolutely!
Thanks for pointing this out.
Best regards,
Aaron
> > + return;
> > +
> > cfs_rq->throttled = 0;
> >
> > update_rq_clock(rq);
>
> --
> Thanks and Regards,
> Prateek
>
* Re: [External] Re: [RFC PATCH 5/7] sched/fair: Take care of group/affinity/sched_class change for throttled task
2025-03-14 4:51 ` K Prateek Nayak
@ 2025-03-14 11:40 ` Aaron Lu
0 siblings, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 11:40 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 10:21:15AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:51 PM, Aaron Lu wrote:
> [..snip..]
>
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5876,8 +5876,8 @@ static void throttle_cfs_rq_work(struct
> > callback_head *work)
> >
> > update_rq_clock(rq);
> > WARN_ON_ONCE(!list_empty(&p->throttle_node));
> > - list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > resched_curr(rq);
>
> nit. Perhaps this bit can be moved to Patch 2 to consolidate all
> changes in throttle_cfs_rq_work()
No problem.
I placed it here to better illustrate why list_add() has to be done
after dequeue_task_fair().
Thanks,
Aaron
> >
> > out_unlock:
>
> [..snip..]
>
> --
> Thanks and Regards,
> Prateek
>
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 10:26 ` K Prateek Nayak
@ 2025-03-14 11:47 ` Aaron Lu
2025-03-14 15:58 ` Chengming Zhou
2025-03-14 18:04 ` K Prateek Nayak
0 siblings, 2 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 11:47 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Chengming Zhou, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
Hi Prateek,
On Fri, Mar 14, 2025 at 03:56:26PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/14/2025 3:12 PM, Aaron Lu wrote:
> > > Then if we pick a task from a throttled cfs_rq subtree, we can set up the task
> > > work for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
> > If we add a check at pick time, maybe we can also avoid the check
> > at enqueue time. One thing I'm thinking is that a task may be picked
> > multiple times with only a single enqueue, so if we do the check at pick,
> > the overhead can be larger?
>
> I think it can be minimized to a good extent. Something like:
I see, thanks for the illustration.
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d646451d617c..ba6571368840 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
> static inline void task_throttle_setup_work(struct task_struct *p)
> {
> + if (task_has_throttle_work(p))
> + return;
> +
> /*
> * Kthreads and exiting tasks don't return to userspace, so adding the
> * work is pointless
> @@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
> if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> return;
> - if (task_has_throttle_work(p))
> - return;
> -
> task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> }
> @@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
> node = rb_next(node);
> }
> - /* curr is not in the timeline tree */
> - if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
> - p = task_of(cfs_rq->curr);
> - task_throttle_setup_work(p);
> - }
> -
Should we also remove adding throttle work for those tasks in
cfs_rq->tasks_timeline?
> return 0;
> }
> @@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> SCHED_WARN_ON(cfs_rq->throttled_clock);
> if (cfs_rq->nr_queued)
> cfs_rq->throttled_clock = rq_clock(rq);
> +
> + /*
> + * If cfs_rq->curr is set, check if current task is queued
> + * and set up the throttle work proactively.
> + */
> + if (cfs_rq->curr) {
> + struct task_struct *p = rq->donor; /* scheduling context with proxy */
I'll have to check what rq->donor means.
I think the point is to proactively add throttle work for rq->curr if
rq->curr is in this throttled hierarchy? Because the only point where
throttle work gets added will be at pick time, and curr will probably not
be picked anytime soon.
Thanks,
Aaron
> +
> + if (task_on_rq_queued(p))
> + task_throttle_setup_work(p);
> + }
> +
> return;
> }
> @@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
> struct sched_entity *pse = &prev->se;
> struct cfs_rq *cfs_rq;
> + /*
> + * Check if throttle work needs to be setup when
> + * switching to a different task.
> + */
> + if (throttled_hierarchy(cfs_rq_of(se)))
> + task_throttle_setup_work(p);
> +
> while (!(cfs_rq = is_same_group(se, pse))) {
> int se_depth = se->depth;
> int pse_depth = pse->depth;
> @@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
> account_cfs_rq_runtime(cfs_rq, 0);
> }
> + if (throttled_hierarchy(cfs_rq_of(se)))
> + task_throttle_setup_work(p);
> +
> __set_next_task_fair(rq, p, first);
> }
> --
>
> .. with the additional plumbing in place of course.
>
> --
> Thanks and Regards,
> Prateek
>
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 9:12 ` K Prateek Nayak
@ 2025-03-14 15:10 ` Aaron Lu
0 siblings, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-14 15:10 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 02:42:46PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/14/2025 2:27 PM, Aaron Lu wrote:
> > > >
> > > > +static inline bool task_has_throttle_work(struct task_struct *p)
> > > > +{
> > > > + return p->sched_throttle_work.next != &p->sched_throttle_work;
> > > > +}
> > > > +
> > > > +static inline void task_throttle_setup_work(struct task_struct *p)
> > > > +{
> > > > + /*
> > > > + * Kthreads and exiting tasks don't return to userspace, so adding the
> > > > + * work is pointless
> > > > + */
> > > > + if ((p->flags & (PF_EXITING | PF_KTHREAD)))
> > > > + return;
> > > > +
> > > > + if (task_has_throttle_work(p))
> > > > + return;
> > > > +
> > > > + task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
> > > > +}
> > > > +
> > > > static int tg_throttle_down(struct task_group *tg, void *data)
> > > > {
> > > > struct rq *rq = data;
> > > > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > > > + struct task_struct *p;
> > > > + struct rb_node *node;
> > > > +
> > > > + cfs_rq->throttle_count++;
> > > > + if (cfs_rq->throttle_count > 1)
> > > > + return 0;
> > >
> > > General question: Do we need the throttled_lb_pair() check in
> > > can_migrate_task() with the per-task throttle? Moving a throttled task
> > > to another CPU can ensures that the task can run quicker and exit to
> > > user space as quickly as possible and once the task dequeues, it will
> > > remove itself from the list of fair tasks making it unreachable for
> > > the load balancer. Thoughts?
> >
> > That's a good point.
> >
> > The current approach dequeues the task and removes it from the rq's
> > cfs_tasks list, causing it to lose the load balance opportunity. This is
> > pretty sad.
>
> That is fine. Today we have the throttled_lb_pair() check since tasks
> on a throttled hierarchy remain on the fair tasks list, and the load
> balancer should not move them around since they don't contribute to the
> current load in the throttled state, and moving them around will not change
> anything since they'll still be waiting on another CPU for unthrottle.
OK, I misunderstood. I thought tasks in a throttled hierarchy still had a
chance to participate in load balancing.
> With per-task throttle, we know that all the tasks on the fair task
> list are runnable (except for the delayed ones but they contribute to
> the load) - tasks on throttled hierarchy that exit to userspace will
> dequeue themselves, removing them from the fair task list too.
>
> Since a task that hasn't dequeued itself on a throttled hierarchy is
> runnable, I'm suggesting we get rid of the throttled_lb_pair() check
> in can_migrate_task() entirely.
Agree, will fix this in next version, thanks.
Best regards,
Aaron
> >
> > I'll need to think about this. I hope we can somehow keep the throttled
> > tasks in cfs_tasks list, I'll see how to make this happen.
> >
> > Thanks,
> > Aaron
> >
>
> [..snip..]
>
> --
> Thanks and Regards,
> Prateek
>
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 11:47 ` Aaron Lu
@ 2025-03-14 15:58 ` Chengming Zhou
2025-03-14 18:04 ` K Prateek Nayak
1 sibling, 0 replies; 58+ messages in thread
From: Chengming Zhou @ 2025-03-14 15:58 UTC (permalink / raw)
To: Aaron Lu, K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
On 2025/3/14 19:47, Aaron Lu wrote:
> Hi Prateek,
>
> On Fri, Mar 14, 2025 at 03:56:26PM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>>
>> On 3/14/2025 3:12 PM, Aaron Lu wrote:
>>>> Then if we pick a task from a throttled cfs_rq subtree, we can set up the task
>>>> work for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
>>> If we add a check at pick time, maybe we can also avoid the check
>>> at enqueue time. One thing I'm thinking is that a task may be picked
>>> multiple times with only a single enqueue, so if we do the check at pick,
>>> the overhead can be larger?
>>
>> I think it can be minimized to a good extent. Something like:
>
> I see, thanks for the illustration.
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index d646451d617c..ba6571368840 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
>> static inline void task_throttle_setup_work(struct task_struct *p)
>> {
>> + if (task_has_throttle_work(p))
>> + return;
>> +
>> /*
>> * Kthreads and exiting tasks don't return to userspace, so adding the
>> * work is pointless
>> @@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
>> if ((p->flags & (PF_EXITING | PF_KTHREAD)))
>> return;
>> - if (task_has_throttle_work(p))
>> - return;
>> -
>> task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>> }
>> @@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>> node = rb_next(node);
>> }
>> - /* curr is not in the timeline tree */
>> - if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>> - p = task_of(cfs_rq->curr);
>> - task_throttle_setup_work(p);
>> - }
>> -
>
> Should we also remove adding throttle work for those tasks in
> cfs_rq->tasks_timeline?
>
>> return 0;
>> }
>> @@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>> SCHED_WARN_ON(cfs_rq->throttled_clock);
>> if (cfs_rq->nr_queued)
>> cfs_rq->throttled_clock = rq_clock(rq);
>> +
>> + /*
>> + * If cfs_rq->curr is set, check if current task is queued
>> + * and set up the throttle work proactively.
>> + */
>> + if (cfs_rq->curr) {
>> + struct task_struct *p = rq->donor; /* scheduling context with proxy */
>
> I'll have to check what rq->donor means.
> I think the point is to proactively add throttle work for rq->curr if
> rq->curr is in this throttled hierarchy? Because the only check point to
> add throttle work will be at pick time and curr will probably not be
> picked anytime soon.
The cfs_rq-based throttle uses update_curr() -> put_prev_task(), so it just
does resched_curr() in update_curr().
With per-task throttle, we don't need to call resched_curr(); we should
set up throttle work for this curr, which will dequeue & resched on return to user.
Thanks.
>
> Thanks,
> Aaron
>
>> +
>> + if (task_on_rq_queued(p))
>> + task_throttle_setup_work(p);
>> + }
>> +
>> return;
>> }
>> @@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>> struct sched_entity *pse = &prev->se;
>> struct cfs_rq *cfs_rq;
>> + /*
>> + * Check if throttle work needs to be setup when
>> + * switching to a different task.
>> + */
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> while (!(cfs_rq = is_same_group(se, pse))) {
>> int se_depth = se->depth;
>> int pse_depth = pse->depth;
>> @@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>> account_cfs_rq_runtime(cfs_rq, 0);
>> }
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> __set_next_task_fair(rq, p, first);
>> }
>> --
>>
>> .. with the additional plumbing in place of course.
>>
>> --
>> Thanks and Regards,
>> Prateek
>>
* Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-03-14 10:43 ` Aaron Lu
@ 2025-03-14 17:52 ` K Prateek Nayak
2025-03-17 5:48 ` Aaron Lu
2025-04-02 9:25 ` Aaron Lu
0 siblings, 2 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 17:52 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
Hello Aaron,
On 3/14/2025 4:13 PM, Aaron Lu wrote:
> On Fri, Mar 14, 2025 at 09:23:47AM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>>
>> On 3/13/2025 12:51 PM, Aaron Lu wrote:
>>
>> [..snip..]
>>
>>> ---
>>> kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
>>> 1 file changed, 45 insertions(+), 87 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index ab403ff7d53c8..4a95fe3785e43 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
>>> sched_entity *se, int flags)
>>>
>>> if (cfs_rq->nr_queued == 1) {
>>> check_enqueue_throttle(cfs_rq);
>>> - if (!throttled_hierarchy(cfs_rq)) {
>>> - list_add_leaf_cfs_rq(cfs_rq);
>>> - } else {
>>> + list_add_leaf_cfs_rq(cfs_rq);
>>> #ifdef CONFIG_CFS_BANDWIDTH
>>> + if (throttled_hierarchy(cfs_rq)) {
>>> struct rq *rq = rq_of(cfs_rq);
>>>
>>> if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
>>> cfs_rq->throttled_clock = rq_clock(rq);
>>> if (!cfs_rq->throttled_clock_self)
>>> cfs_rq->throttled_clock_self = rq_clock(rq);
>>
>> These bits probably need revisiting. From what I understand, these
>> stats were maintained to know when a task was woken up on a
>> throttled hierarchy which was not connected to the parent essentially
>> tracking the amount of time runnable tasks were waiting on the
>> cfs_rq before an unthrottle event allowed them to be picked.
>
> Do you mean these throttled_clock stats?
>
> I think they are here because we do not record the throttled_clock for
> empty cfs_rqs, and once the cfs_rq has a task enqueued, it needs to record
> its throttled_clock. This was done in commit 79462e8c879a ("sched: don't
> account throttle time for empty groups") by Josh. I don't think per-task
> throttle changes this.
>
> With this said, I think there is a gap in per-task throttle, i.e. when
> all tasks are dequeued from this throttled cfs_rq, we should record its
> throttled_time and clear its throttled_clock.
Yes, but then what it'll track is the amount of time tasks were running
while the cfs_rq was on a throttled hierarchy. Is that what we want to
track, or something else?
The commit log for 677ea015f231 ("sched: add throttled time stat for
throttled children") says the following for "throttled_clock_self_time":
We currently export the total throttled time for cgroups that are given
a bandwidth limit. This patch extends this accounting to also account
the total time that each children cgroup has been throttled.
This is useful to understand the degree to which children have been
affected by the throttling control. Children which are not runnable
during the entire throttled period, for example, will not show any
self-throttling time during this period.
but with the per-task model, it is probably the amount of time that the
"throttled_limbo_list" has a task on it, since those tasks are runnable
but are in fact waiting for an unthrottle.
If a task enqueues itself on a throttled hierarchy and then blocks
again before exiting to userspace, it should not count in
"throttled_clock_self_time" since the task was runnable the whole
time despite the hierarchy being frozen.
>
>>
>> With per-task throttle, these semantics no longer apply since a woken
>> task will run and dequeue itself when exiting to userspace.
>>
>> Josh do you have any thoughts on this?
>>
>>> -#endif
>>> }
>>> +#endif
>>> }
>>> }
>>>
>
>>> @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group
>>> *tg, void *data)
>>>
>>> /* group is entering throttled state, stop time */
>>> cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> - list_del_leaf_cfs_rq(cfs_rq);
>>>
>>> SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> if (cfs_rq->nr_queued)
>>> cfs_rq->throttled_clock_self = rq_clock(rq);
>>>
>>> + if (!cfs_rq->nr_queued) {
>>> + list_del_leaf_cfs_rq(cfs_rq);
>>> + return 0;
>>> + }
>>> +
>>
>> This bit can perhaps go in Patch 2?
>
> I kept all the changes to leaf cfs_rq handling in one patch, I think it
> is easier to review :-)
Thank you! That would be great.
>
> Thanks,
> Aaron
>
>>> WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>> /*
>>> * rq_lock is held, current is (obviously) executing this in kernelspace.
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 11:47 ` Aaron Lu
2025-03-14 15:58 ` Chengming Zhou
@ 2025-03-14 18:04 ` K Prateek Nayak
1 sibling, 0 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-14 18:04 UTC (permalink / raw)
To: Aaron Lu
Cc: Chengming Zhou, Valentin Schneider, Ben Segall, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou,
John Stultz
(+jstultz for some clarification around rq->donor)
Hello Aaron,
On 3/14/2025 5:17 PM, Aaron Lu wrote:
> Hi Prateek,
>
> On Fri, Mar 14, 2025 at 03:56:26PM +0530, K Prateek Nayak wrote:
>> Hello Aaron,
>>
>> On 3/14/2025 3:12 PM, Aaron Lu wrote:
>>>> Then if we pick a task from a throttled cfs_rq subtree, we can set up the task
>>>> work for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
>>> If we add a check at pick time, maybe we can also avoid the check
>>> at enqueue time. One thing I'm thinking is that a task may be picked
>>> multiple times with only a single enqueue, so if we do the check at pick,
>>> the overhead can be larger?
>>
>> I think it can be minimized to a good extent. Something like:
>
> I see, thanks for the illustration.
>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index d646451d617c..ba6571368840 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5942,6 +5942,9 @@ static inline bool task_has_throttle_work(struct task_struct *p)
>> static inline void task_throttle_setup_work(struct task_struct *p)
>> {
>> + if (task_has_throttle_work(p))
>> + return;
>> +
>> /*
>> * Kthreads and exiting tasks don't return to userspace, so adding the
>> * work is pointless
>> @@ -5949,9 +5952,6 @@ static inline void task_throttle_setup_work(struct task_struct *p)
>> if ((p->flags & (PF_EXITING | PF_KTHREAD)))
>> return;
>> - if (task_has_throttle_work(p))
>> - return;
>> -
>> task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
>> }
>> @@ -6000,12 +6000,6 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>> node = rb_next(node);
>> }
>> - /* curr is not in the timeline tree */
>> - if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
>> - p = task_of(cfs_rq->curr);
>> - task_throttle_setup_work(p);
>> - }
>> -
>
> Should we also remove adding throttle work for those tasks in
> cfs_rq->tasks_timeline?
Yes. I don't think that is required with this approach suggested by
Chengming.
>
>> return 0;
>> }
>> @@ -6049,6 +6043,18 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>> SCHED_WARN_ON(cfs_rq->throttled_clock);
>> if (cfs_rq->nr_queued)
>> cfs_rq->throttled_clock = rq_clock(rq);
>> +
>> + /*
>> + * If cfs_rq->curr is set, check if current task is queued
>> + * and set up the throttle work proactively.
>> + */
>> + if (cfs_rq->curr) {
>> + struct task_struct *p = rq->donor; /* scheduling context with proxy */
>
> I'll have to check what rq->donor means.
Essentially "rq->donor" is the scheduling context (where the time
accounting happens), whereas "rq->curr" may point to a mutex holder that
is running with the context of "rq->donor" to unblock itself as quickly
as possible.
If we experience a throttle, it is because the time accounting to the
"rq->donor" hierarchy couldn't fetch any more bandwidth.
TBH, adding the task work to either of them doesn't make sense since
"rq->donor" will be picked to run again once the mutex holder does a
handoff, and the throttle work can be added then too.
> I think the point is to proactively add throttle work for rq->curr if
> rq->curr is in this throttled hierarchy? Because the only check point to
> add throttle work will be at pick time and curr will probably not be
> picked anytime soon.
John knows these bits best and maybe he can correct me if I'm wrong
here. For the time being, rq->curr and rq->donor both point to the
same task, but that will change when Part 2 of the proxy execution
series lands. We can just clarify these bits now to avoid any conflicts later.
--
Thanks and Regards,
Prateek
>
> Thanks,
> Aaron
>
>> +
>> + if (task_on_rq_queued(p))
>> + task_throttle_setup_work(p);
>> + }
>> +
>> return;
>> }
>> @@ -8938,6 +8944,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
>> struct sched_entity *pse = &prev->se;
>> struct cfs_rq *cfs_rq;
>> + /*
>> + * Check if throttle work needs to be setup when
>> + * switching to a different task.
>> + */
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> while (!(cfs_rq = is_same_group(se, pse))) {
>> int se_depth = se->depth;
>> int pse_depth = pse->depth;
>> @@ -13340,6 +13353,9 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
>> account_cfs_rq_runtime(cfs_rq, 0);
>> }
>> + if (throttled_hierarchy(cfs_rq_of(se)))
>> + task_throttle_setup_work(p);
>> +
>> __set_next_task_fair(rq, p, first);
>> }
>> --
>>
>> .. with the additional plumbing in place of course.
>>
>> --
>> Thanks and Regards,
>> Prateek
>>
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-13 7:21 ` [RFC PATCH 2/7] sched/fair: Handle throttle path " Aaron Lu
` (2 preceding siblings ...)
2025-03-14 8:39 ` Chengming Zhou
@ 2025-03-16 3:25 ` Josh Don
2025-03-17 2:54 ` Chengming Zhou
` (2 more replies)
3 siblings, 3 replies; 58+ messages in thread
From: Josh Don @ 2025-03-16 3:25 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou, Xi Wang
Hi Aaron,
> static int tg_throttle_down(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p;
> + struct rb_node *node;
> +
> + cfs_rq->throttle_count++;
> + if (cfs_rq->throttle_count > 1)
> + return 0;
>
> /* group is entering throttled state, stop time */
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> + list_del_leaf_cfs_rq(cfs_rq);
>
> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> - if (cfs_rq->nr_queued)
> - cfs_rq->throttled_clock_self = rq_clock(rq);
> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> + if (cfs_rq->nr_queued)
> + cfs_rq->throttled_clock_self = rq_clock(rq);
> +
> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> + /*
> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> + *
> + * All other tasks enqueued on this rq have their saved PC at the
> + * context switch, so they will go through the kernel before returning
> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> + * install the task_work on all of them.
> + */
> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> + while (node) {
> + struct sched_entity *se = __node_2_se(node);
> +
> + if (!entity_is_task(se))
> + goto next;
> +
> + p = task_of(se);
> + task_throttle_setup_work(p);
> +next:
> + node = rb_next(node);
> + }
I'd like to strongly push back on this approach. This adds quite a lot
of extra computation to an already expensive path
(throttle/unthrottle). E.g., this function is part of the cgroup walk,
so it is already O(cgroups) in the number of cgroups in the
hierarchy being throttled. This gets even worse when you consider that
we repeat this separately across all the CPUs that the
bandwidth-constrained group is running on. Keep in mind that
throttle/unthrottle is done with the rq lock held and IRQs disabled.
In K Prateek's last RFC, there was discussion of using context
tracking; did you consider that approach any further? We could keep
track of the number of threads within a cgroup hierarchy currently in
kernel mode (similar to h_nr_runnable), and thus simplify down the
throttling code here.
Best,
Josh
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-16 3:25 ` Josh Don
@ 2025-03-17 2:54 ` Chengming Zhou
2025-03-20 6:59 ` K Prateek Nayak
2025-03-19 13:43 ` Aaron Lu
2025-03-20 6:53 ` K Prateek Nayak
2 siblings, 1 reply; 58+ messages in thread
From: Chengming Zhou @ 2025-03-17 2:54 UTC (permalink / raw)
To: Josh Don, Aaron Lu
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou, Xi Wang
On 2025/3/16 11:25, Josh Don wrote:
> Hi Aaron,
>
>> static int tg_throttle_down(struct task_group *tg, void *data)
>> {
>> struct rq *rq = data;
>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>> + struct task_struct *p;
>> + struct rb_node *node;
>> +
>> + cfs_rq->throttle_count++;
>> + if (cfs_rq->throttle_count > 1)
>> + return 0;
>>
>> /* group is entering throttled state, stop time */
>> - if (!cfs_rq->throttle_count) {
>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> - list_del_leaf_cfs_rq(cfs_rq);
>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> + list_del_leaf_cfs_rq(cfs_rq);
>>
>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> - if (cfs_rq->nr_queued)
>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> + if (cfs_rq->nr_queued)
>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>> +
>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>> + /*
>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>> + *
>> + * All other tasks enqueued on this rq have their saved PC at the
>> + * context switch, so they will go through the kernel before returning
>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>> + * install the task_work on all of them.
>> + */
>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>> + while (node) {
>> + struct sched_entity *se = __node_2_se(node);
>> +
>> + if (!entity_is_task(se))
>> + goto next;
>> +
>> + p = task_of(se);
>> + task_throttle_setup_work(p);
>> +next:
>> + node = rb_next(node);
>> + }
>
> I'd like to strongly push back on this approach. This adds quite a lot
> of extra computation to an already expensive path
> (throttle/unthrottle). e.g. this function is part of the cgroup walk
Actually, with my suggestion in another email that we only need to mark
these cfs_rqs throttled when the quota is used up, and set up the throttle
task work when the tasks under those cfs_rqs get picked, the cost of the
throttle path is much like that of the dual-tree approach.
As for the unthrottle path, you're right: it has to iterate over each task on
a list to get it queued into the cfs_rq tree, so it needs more thought.
> and so it is already O(cgroups) for the number of cgroups in the
> hierarchy being throttled. This gets even worse when you consider that
> we repeat this separately across all the cpus that the
> bandwidth-constrained group is running on. Keep in mind that
> throttle/unthrottle is done with rq lock held and IRQ disabled.
Maybe we can avoid holding the rq lock when unthrottling? The per-task
unthrottle is much like just waking up lots of tasks, so maybe we can reuse
the ttwu path to wake those tasks, which could use remote IPIs to avoid
holding remote rq locks. I'm not sure, just some random thinking...
>
> In K Prateek's last RFC, there was discussion of using context
> tracking; did you consider that approach any further? We could keep
> track of the number of threads within a cgroup hierarchy currently in
> kernel mode (similar to h_nr_runnable), and thus simplify down the
Yeah, I think Prateek's approach is very creative! The only downside is
that the code becomes much more complex... on an already complex codebase.
Anyway, it would be great if that approach or this one could be merged into
the mainline kernel.
Lastly, I wonder if it is possible to implement cgroup-level bandwidth
control instead of doing it in each sched_class? Then SCX tasks could be
controlled too, without implementing it in BPF code...
Thanks!
> throttling code here.
>
> Best,
> Josh
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-03-14 17:52 ` K Prateek Nayak
@ 2025-03-17 5:48 ` Aaron Lu
2025-04-02 9:25 ` Aaron Lu
1 sibling, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-17 5:48 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Fri, Mar 14, 2025 at 11:22:20PM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/14/2025 4:13 PM, Aaron Lu wrote:
> > On Fri, Mar 14, 2025 at 09:23:47AM +0530, K Prateek Nayak wrote:
> > > Hello Aaron,
> > >
> > > On 3/13/2025 12:51 PM, Aaron Lu wrote:
> > >
> > > [..snip..]
> > >
> > > > ---
> > > > kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
> > > > 1 file changed, 45 insertions(+), 87 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index ab403ff7d53c8..4a95fe3785e43 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct
> > > > sched_entity *se, int flags)
> > > >
> > > > if (cfs_rq->nr_queued == 1) {
> > > > check_enqueue_throttle(cfs_rq);
> > > > - if (!throttled_hierarchy(cfs_rq)) {
> > > > - list_add_leaf_cfs_rq(cfs_rq);
> > > > - } else {
> > > > + list_add_leaf_cfs_rq(cfs_rq);
> > > > #ifdef CONFIG_CFS_BANDWIDTH
> > > > + if (throttled_hierarchy(cfs_rq)) {
> > > > struct rq *rq = rq_of(cfs_rq);
> > > >
> > > > if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> > > > cfs_rq->throttled_clock = rq_clock(rq);
> > > > if (!cfs_rq->throttled_clock_self)
> > > > cfs_rq->throttled_clock_self = rq_clock(rq);
> > >
> > > These bits probably need revisiting. From what I understand, these
> > > stats were maintained to know when a task was woken up on a
> > > throttled hierarchy which was not connected to the parent, essentially
> > > tracking the amount of time runnable tasks were waiting on the
> > > cfs_rq before an unthrottle event allowed them to be picked.
> >
> > Do you mean these throttled_clock stats?
> >
> > I think they are here because we do not record the throttled_clock for
> > empty cfs_rqs, and once the cfs_rq has a task enqueued, it needs to record
> > its throttled_clock. This was done in commit 79462e8c879a ("sched: don't
> > account throttle time for empty groups") by Josh. I don't think per-task
> > throttle changes this.
> >
> > With this said, I think there is a gap in per-task throttle, i.e. when
> > all tasks are dequeued from this throttled cfs_rq, we should record its
> > throttled_time and clear its throttled_clock.
>
> Yes, but then what it'll track is the amount of time tasks were running
> while the cfs_rq was on a throttled hierarchy. Is that what we want to
> track, or something else?
Right, my last comment was not correct.
Basically, my current approach tries to mimic the existing accounting,
i.e. when a task is enqueued in a throttled cfs_rq, start recording
that cfs_rq's throttled_clock. With this per-task throttle model it kind
of over-accounts the throttled time for the cfs_rq, because some tasks can
still be running in kernel mode while the cfs_rq is throttled.
> The commit log for 677ea015f231 ("sched: add throttled time stat for
> throttled children") says the following for "throttled_clock_self_time":
>
> We currently export the total throttled time for cgroups that are given
> a bandwidth limit. This patch extends this accounting to also account
> the total time that each children cgroup has been throttled.
>
> This is useful to understand the degree to which children have been
> affected by the throttling control. Children which are not runnable
> during the entire throttled period, for example, will not show any
> self-throttling time during this period.
>
> but with the per-task model, it is probably the amount of time that
> the "throttled_limbo_list" has a task on it, since those tasks are runnable
> but are in fact waiting for an unthrottle.
>
> If a task enqueues itself on a throttled hierarchy and then blocks
> again before exiting to the userspace, it should not count in
> "throttled_clock_self_time" since the task was runnable the whole
> time despite the hierarchy being frozen.
I think there is a mismatch between per-task throttle and per-cfs_rq
stats; it's hard to make the accounting perfect. Assume a throttled
cfs_rq has 4 tasks, with 2 tasks parked on the limbo_list and 2 tasks still
running in kernel mode. Should we treat this time as throttled or not
for this cfs_rq?
This is similar to the pelt clock freeze problem. For the above example,
should we freeze the cfs_rq's pelt clock or let it continue when this
cfs_rq is throttled with some task blocked on limbo_list and some task
still running in kernel mode?
My understanding is that neither approach is perfect, so I just chose the
simpler one for now. Please correct me if my understanding is wrong.
Thanks,
Aaron
* Re: [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle
2025-03-13 7:21 ` [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle Aaron Lu
@ 2025-03-17 10:28 ` Valentin Schneider
2025-03-17 11:02 ` Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: Valentin Schneider @ 2025-03-17 10:28 UTC (permalink / raw)
To: Aaron Lu, Ben Segall, K Prateek Nayak, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
Hi, just an FYI, I didn't receive the cover letter (PATCH 0/7) and it can't
be found on lore either:
https://lore.kernel.org/lkml/20250313072030.1032893-1-ziqianlu@bytedance.com/
Not sure what went wrong there, but could you consider resending that?
Thanks.
* [RFC PATCH 0/7] Defer throttle when task exits to user
@ 2025-03-17 10:56 Aaron Lu
2025-03-13 7:21 ` [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle Aaron Lu
` (6 more replies)
0 siblings, 7 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-17 10:56 UTC (permalink / raw)
To: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot
Cc: linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chengming Zhou, Chuyi Zhou
This is a continuation of Valentin Schneider's work posted here:
Subject: [RFC PATCH v3 00/10] sched/fair: Defer CFS throttle to user entry
https://lore.kernel.org/lkml/20240711130004.2157737-1-vschneid@redhat.com/
Valentin has described the problem very well in the above link. We also
hit task hung problems from time to time in our environment due to cfs
quota. It is mostly visible with rwsem: when a reader is throttled, a
writer comes in and has to wait, and the writer then makes all subsequent
readers wait, causing priority inversion or even a whole-system hang.
Changes I've made since Valentin's v3:
- Use enqueue_task_fair() and dequeue_task_fair() in cfs_rq's throttle
and unthrottle path;
- Get rid of irq_work, since the task work that is supposed to throttle
the task can figure things out and act accordingly, so there is no need
for an irq_work to cancel a no-longer-needed task work;
- Several fixes, like taking care of task group change, sched class
change, etc. for throttled tasks;
- A tasks_rcu fix with this task-based throttle.
Tests:
- A basic test to verify functionality like limiting cgroup cpu time and
changing task group, affinity, etc.
- A script that tries to mimic a large cgroup setup is used to see how
bad it is to unthrottle cfs_rqs and enqueue back a large number of tasks
in hrtimer context.
The test was done on a 2-socket/384-thread AMD CPU with the following
cgroup setup: 2 first-level cgroups with quota set, each having 100
child cgroups, and each child cgroup having 10 leaf child cgroups, for a
total of 2000 cgroups. In each leaf child cgroup, 10 cpu hog
tasks are created. Below are the durations of
distribute_cfs_runtime() during a 1-minute window:
@durations:
[8K, 16K) 274 |@@@@@@@@@@@@@@@@@@@@@ |
[16K, 32K) 132 |@@@@@@@@@@ |
[32K, 64K) 6 | |
[64K, 128K) 0 | |
[128K, 256K) 2 | |
[256K, 512K) 0 | |
[512K, 1M) 117 |@@@@@@@@@ |
[1M, 2M) 665 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2M, 4M) 10 | |
So the biggest durations are in the 2-4ms range in this hrtimer context.
How bad is this number? I think it is acceptable, but maybe the setup I
created is not complex enough?
In older kernels where async unthrottle is not available, the largest
durations can be 100ms+.
Patches:
The patchset is arranged to get the basic functionality done first and
then deal with special cases. I hope this can make it easier to review.
Patch1 is preparation work;
Patch2-3 provide the main functionality.
Patch2 deals with the throttle path: when a cfs_rq is to be throttled, add
a task work to each of its tasks so that when those tasks return to user
space, the task work can throttle them by dequeuing the task and
remembering this by adding the task to its cfs_rq's limbo list;
Patch3 deals with the unthrottle path: when a cfs_rq is to be unthrottled,
enqueue back the tasks on its limbo list;
Patch4-5 deal with special cases.
Patch4 deals with task migration: if a task migrates to a throttled
cfs_rq, set up the throttle work for it. If, on the other hand, a task
that already has the task work added migrates to an unthrottled cfs_rq,
its task work will remain: the work handler will figure things out and
skip the throttle. This also covers setting up the throttle task work for
tasks that switched to the fair class, changed group, etc., because all
these things need to enqueue the task on the target cfs_rq;
Patch5 deals with the dequeue path when a task changes group, sched class,
etc. A throttled task is dequeued in fair, but task->on_rq is
still set, so when it changes task group or sched class, or has its
affinity changed, the core will first dequeue it. Since such a task is
already dequeued in the fair class, this patch handles that situation.
Patch6-7 are two fixes found while testing. I can also fold them in if
that is better.
Patch6 makes CONFIG_TASKS_RCU happy. Throttled tasks get scheduled in
task_work_run() by cond_resched(), but that is a preempt schedule and
doesn't mark a task RCU quiescent state, so I added a schedule() call in
the throttle task work directly.
Patch7 fixes a problem where the unthrottle path can cause throttling to
happen again when enqueuing tasks.
All the patches' changelogs were written by me, so if the changelogs look
poor, it's my bad.
Comments are welcome. If you see any problems or issues with this
approach, please feel free to let me know, thanks.
Base commit: tip/sched/core, commit fd881d0a085f("rseq: Fix segfault on
registration when rseq_cs is non-zero").
Known issues:
- !CONFIG_CFS_BANDWIDTH is totally not tested yet;
- task_is_throttled_fair() could probably be replaced with
task_is_throttled() now but I'll leave this to next version.
- The cfs_rq's pelt clock is stopped on throttle while it can still have
tasks running (e.g. some task is still running in kernel space).
It's also possible to keep its pelt clock running till its last task
is throttled/dequeued, but in that case this cfs_rq's load may be
decayed too much since many of its tasks are throttled. For now,
keep it simple by keeping the current behavior.
Aaron Lu (4):
sched/fair: Take care of migrated task for task based throttle
sched/fair: Take care of group/affinity/sched_class change for
throttled task
sched/fair: fix tasks_rcu with task based throttle
sched/fair: Make sure cfs_rq has enough runtime_remaining on
unthrottle path
Valentin Schneider (3):
sched/fair: Add related data structure for task based throttle
sched/fair: Handle throttle path for task based throttle
sched/fair: Handle unthrottle path for task based throttle
include/linux/sched.h | 4 +
kernel/sched/core.c | 3 +
kernel/sched/fair.c | 380 +++++++++++++++++++++++-------------------
kernel/sched/sched.h | 3 +
4 files changed, 216 insertions(+), 174 deletions(-)
--
2.39.5
* Re: [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle
2025-03-17 10:28 ` Valentin Schneider
@ 2025-03-17 11:02 ` Aaron Lu
0 siblings, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-17 11:02 UTC (permalink / raw)
To: Valentin Schneider
Cc: Ben Segall, K Prateek Nayak, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Mon, Mar 17, 2025 at 11:28:23AM +0100, Valentin Schneider wrote:
> Hi, just an FYI, I didn't receive the cover letter (PATCH 0/7) and it can't
> be found on lore either:
>
> https://lore.kernel.org/lkml/20250313072030.1032893-1-ziqianlu@bytedance.com/
>
> Not sure what went wrong there, but could you consider resending that?
> Thanks.
Sorry for the trouble. The original cover letter mail's message id was
changed to CANCG0Gfn-BENDNqJmWC2BCxXLA8pQWrAwNibx22Dv_yUzyNV5g@mail.gmail.com
for some reason, probably because I used the wrong smtp server when
sending the series.
Seeing your mail, I just realized I could re-send the cover letter with
the correct message id to get this issue fixed.
I have re-sent the cover letter and saw it on lore now:
https://lore.kernel.org/lkml/20250313072030.1032893-1-ziqianlu@bytedance.com/
Sorry again for the trouble.
Best regards,
Aaron
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-16 3:25 ` Josh Don
2025-03-17 2:54 ` Chengming Zhou
@ 2025-03-19 13:43 ` Aaron Lu
2025-03-20 1:06 ` Josh Don
2025-03-20 6:53 ` K Prateek Nayak
2 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-19 13:43 UTC (permalink / raw)
To: Josh Don
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou, Xi Wang
Hi Josh,
On Sat, Mar 15, 2025 at 08:25:53PM -0700, Josh Don wrote:
> Hi Aaron,
>
> > static int tg_throttle_down(struct task_group *tg, void *data)
> > {
> > struct rq *rq = data;
> > struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> > + struct task_struct *p;
> > + struct rb_node *node;
> > +
> > + cfs_rq->throttle_count++;
> > + if (cfs_rq->throttle_count > 1)
> > + return 0;
> >
> > /* group is entering throttled state, stop time */
> > - if (!cfs_rq->throttle_count) {
> > - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > - list_del_leaf_cfs_rq(cfs_rq);
> > + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> > + list_del_leaf_cfs_rq(cfs_rq);
> >
> > - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > - if (cfs_rq->nr_queued)
> > - cfs_rq->throttled_clock_self = rq_clock(rq);
> > + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> > + if (cfs_rq->nr_queued)
> > + cfs_rq->throttled_clock_self = rq_clock(rq);
> > +
> > + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> > + /*
> > + * rq_lock is held, current is (obviously) executing this in kernelspace.
> > + *
> > + * All other tasks enqueued on this rq have their saved PC at the
> > + * context switch, so they will go through the kernel before returning
> > + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> > + * install the task_work on all of them.
> > + */
> > + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> > + while (node) {
> > + struct sched_entity *se = __node_2_se(node);
> > +
> > + if (!entity_is_task(se))
> > + goto next;
> > +
> > + p = task_of(se);
> > + task_throttle_setup_work(p);
> > +next:
> > + node = rb_next(node);
> > + }
>
> I'd like to strongly push back on this approach. This adds quite a lot
> of extra computation to an already expensive path
> (throttle/unthrottle). e.g. this function is part of the cgroup walk
> and so it is already O(cgroups) for the number of cgroups in the
> hierarchy being throttled. This gets even worse when you consider that
> we repeat this separately across all the cpus that the
> bandwidth-constrained group is running on. Keep in mind that
> throttle/unthrottle is done with rq lock held and IRQ disabled.
Agreed that it's not good to do this O(nr_tasks) work in the
throttle/unthrottle path. As Chengming mentioned, the throttle path can
avoid it, but the unthrottle path does not have an easy way to.
> In K Prateek's last RFC, there was discussion of using context
> tracking; did you consider that approach any further? We could keep
I haven't tried that approach yet.
> track of the number of threads within a cgroup hierarchy currently in
> kernel mode (similar to h_nr_runnable), and thus simplify down the
> throttling code here.
My initial feeling is the implementation looks pretty complex. If it can
be simplified somehow, that would be great.
Best regards,
Aaron
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-19 13:43 ` Aaron Lu
@ 2025-03-20 1:06 ` Josh Don
0 siblings, 0 replies; 58+ messages in thread
From: Josh Don @ 2025-03-20 1:06 UTC (permalink / raw)
To: Aaron Lu, Chengming Zhou, Xi Wang
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
On Wed, Mar 19, 2025 at 6:43 AM Aaron Lu <ziqianlu@bytedance.com> wrote:
[snip]
> > track of the number of threads within a cgroup hierarchy currently in
> > kernel mode (similar to h_nr_runnable), and thus simplify down the
> > throttling code here.
>
> My initial feeling is the implementation looks pretty complex. If it can
> be simplified somehow, that would be great.
Xi (cc'd on this message) spoke about this at LPC last year; Xi,
can you describe your approach in a little more concrete detail?
Best,
Josh
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-16 3:25 ` Josh Don
2025-03-17 2:54 ` Chengming Zhou
2025-03-19 13:43 ` Aaron Lu
@ 2025-03-20 6:53 ` K Prateek Nayak
2 siblings, 0 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-20 6:53 UTC (permalink / raw)
To: Josh Don, Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Ingo Molnar,
Vincent Guittot, linux-kernel, Juri Lelli, Dietmar Eggemann,
Steven Rostedt, Mel Gorman, Chengming Zhou, Chuyi Zhou, Xi Wang
Hello Josh,
On 3/16/2025 8:55 AM, Josh Don wrote:
> Hi Aaron,
>
>> static int tg_throttle_down(struct task_group *tg, void *data)
>> {
>> struct rq *rq = data;
>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>> + struct task_struct *p;
>> + struct rb_node *node;
>> +
>> + cfs_rq->throttle_count++;
>> + if (cfs_rq->throttle_count > 1)
>> + return 0;
>>
>> /* group is entering throttled state, stop time */
>> - if (!cfs_rq->throttle_count) {
>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> - list_del_leaf_cfs_rq(cfs_rq);
>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>> + list_del_leaf_cfs_rq(cfs_rq);
>>
>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> - if (cfs_rq->nr_queued)
>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>> + if (cfs_rq->nr_queued)
>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>> +
>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>> + /*
>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>> + *
>> + * All other tasks enqueued on this rq have their saved PC at the
>> + * context switch, so they will go through the kernel before returning
>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>> + * install the task_work on all of them.
>> + */
>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>> + while (node) {
>> + struct sched_entity *se = __node_2_se(node);
>> +
>> + if (!entity_is_task(se))
>> + goto next;
>> +
>> + p = task_of(se);
>> + task_throttle_setup_work(p);
>> +next:
>> + node = rb_next(node);
>> + }
>
> I'd like to strongly push back on this approach. This adds quite a lot
> of extra computation to an already expensive path
> (throttle/unthrottle). e.g. this function is part of the cgroup walk
> and so it is already O(cgroups) for the number of cgroups in the
> hierarchy being throttled. This gets even worse when you consider that
> we repeat this separately across all the cpus that the
> bandwidth-constrained group is running on. Keep in mind that
> throttle/unthrottle is done with rq lock held and IRQ disabled.
On this note, do you have any statistics for how many tasks are
throttled per-CPU on your system? The info from:
sudo ./bpftrace -e "kprobe:throttle_cfs_rq { \
@nr_queued[((struct cfs_rq *)$1)->h_nr_queued] = count(); \
@nr_runnable[((struct cfs_rq *)$1)->h_nr_runnable] = count(); \
}"
could help estimate the worst-case times we are expecting with
per-task throttling.
>
> In K Prateek's last RFC, there was discussion of using context
> tracking; did you consider that approach any further? We could keep
> track of the number of threads within a cgroup hierarchy currently in
> kernel mode (similar to h_nr_runnable), and thus simplify down the
> throttling code here.
Based on Chengming's latest suggestion, we can keep tg_throttle_down()
as is and tag the task at pick time using throttled_hierarchy(), which
will work too.
Since it'll most likely end up doing:
if (throttled_hierarchy(cfs_rq_of(&p->se)))
task_throttle_setup_work(p);
The only overhead for users not using CFS bandwidth is the
cfs_rq->throttle_count check. If it is set, you are simply moving
the overhead of setting up the throttle work from the throttle path to
the pick path, for the throttled tasks only, and it also avoids
adding unnecessary work to tasks that may never get picked before
unthrottle.
>
> Best,
> Josh
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-17 2:54 ` Chengming Zhou
@ 2025-03-20 6:59 ` K Prateek Nayak
2025-03-20 8:39 ` Chengming Zhou
2025-03-28 22:47 ` Benjamin Segall
0 siblings, 2 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-03-20 6:59 UTC (permalink / raw)
To: Chengming Zhou, Josh Don, Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Ingo Molnar,
Vincent Guittot, linux-kernel, Juri Lelli, Dietmar Eggemann,
Steven Rostedt, Mel Gorman, Chuyi Zhou, Xi Wang
Hello Chengming,
On 3/17/2025 8:24 AM, Chengming Zhou wrote:
> On 2025/3/16 11:25, Josh Don wrote:
>> Hi Aaron,
>>
>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>> {
>>> struct rq *rq = data;
>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>> + struct task_struct *p;
>>> + struct rb_node *node;
>>> +
>>> + cfs_rq->throttle_count++;
>>> + if (cfs_rq->throttle_count > 1)
>>> + return 0;
>>>
>>> /* group is entering throttled state, stop time */
>>> - if (!cfs_rq->throttle_count) {
>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> - list_del_leaf_cfs_rq(cfs_rq);
>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>
>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> - if (cfs_rq->nr_queued)
>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>> + if (cfs_rq->nr_queued)
>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>> +
>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>> + /*
>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>> + *
>>> + * All other tasks enqueued on this rq have their saved PC at the
>>> + * context switch, so they will go through the kernel before returning
>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>> + * install the task_work on all of them.
>>> + */
>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>> + while (node) {
>>> + struct sched_entity *se = __node_2_se(node);
>>> +
>>> + if (!entity_is_task(se))
>>> + goto next;
>>> +
>>> + p = task_of(se);
>>> + task_throttle_setup_work(p);
>>> +next:
>>> + node = rb_next(node);
>>> + }
>>
>> I'd like to strongly push back on this approach. This adds quite a lot
>> of extra computation to an already expensive path
>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
>
> Actually, with my suggestion in another email that we only need to mark
> these cfs_rqs throttled when the quota is used up, and set up the throttle
> task work when the tasks under those cfs_rqs get picked, the cost of the
> throttle path is much like the dual-tree approach.
>
> As for the unthrottle path, you're right: it has to iterate over each task
> on a list to get it queued back into the cfs_rq tree, so it needs more
> thinking.
>
>> and so it is already O(cgroups) for the number of cgroups in the
>> hierarchy being throttled. This gets even worse when you consider that
>> we repeat this separately across all the cpus that the
>> bandwidth-constrained group is running on. Keep in mind that
>> throttle/unthrottle is done with rq lock held and IRQ disabled.
>
> Maybe we can avoid holding the rq lock when unthrottling? The per-task
> unthrottle is much like just waking up lots of tasks, so maybe we can reuse
> the ttwu path to wake those tasks, which could use remote IPIs to avoid
> holding remote rq locks. I'm not sure, just some random thinking...
>
>>
>> In K Prateek's last RFC, there was discussion of using context
>> tracking; did you consider that approach any further? We could keep
>> track of the number of threads within a cgroup hierarchy currently in
>> kernel mode (similar to h_nr_runnable), and thus simplify down the
>
> Yeah, I think Prateek's approach is very creative! The only downside is
> that the code becomes much more complex... on an already complex codebase.
> Anyway, it would be great if that approach or this one could be merged into
> the mainline kernel.
I think the consensus is to move to per-task throttling since it'll
simplify the efforts to move to a flat hierarchy sometime in the near
future.
My original approach was complex since I wanted to seamlessly resume the
pick part on unthrottle. In Ben's patch [1] there is a TODO in
pick_next_entity(); that probably worked with the older vruntime-based
CFS algorithm but can break EEVDF guarantees.
[1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
Unfortunately, keeping a single rbtree meant replicating the stats, and
that indeed adds to complexity.
>
> Lastly, I wonder if it is possible to implement cgroup-level bandwidth
> control instead of doing it in each sched_class? Then SCX tasks could be
> controlled too, without implementing it in BPF code...
>
> Thanks!
>
>> throttling code here.
>>
>> Best,
>> Josh
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-20 6:59 ` K Prateek Nayak
@ 2025-03-20 8:39 ` Chengming Zhou
2025-03-20 18:40 ` Xi Wang
2025-03-28 22:47 ` Benjamin Segall
1 sibling, 1 reply; 58+ messages in thread
From: Chengming Zhou @ 2025-03-20 8:39 UTC (permalink / raw)
To: K Prateek Nayak, Josh Don, Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Ingo Molnar,
Vincent Guittot, linux-kernel, Juri Lelli, Dietmar Eggemann,
Steven Rostedt, Mel Gorman, Chuyi Zhou, Xi Wang
Hello Prateek,
On 2025/3/20 14:59, K Prateek Nayak wrote:
> Hello Chengming,
>
> On 3/17/2025 8:24 AM, Chengming Zhou wrote:
>> On 2025/3/16 11:25, Josh Don wrote:
>>> Hi Aaron,
>>>
>>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>>> {
>>>> struct rq *rq = data;
>>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>>> + struct task_struct *p;
>>>> + struct rb_node *node;
>>>> +
>>>> + cfs_rq->throttle_count++;
>>>> + if (cfs_rq->throttle_count > 1)
>>>> + return 0;
>>>>
>>>> /* group is entering throttled state, stop time */
>>>> - if (!cfs_rq->throttle_count) {
>>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> - list_del_leaf_cfs_rq(cfs_rq);
>>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>>
>>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> - if (cfs_rq->nr_queued)
>>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> + if (cfs_rq->nr_queued)
>>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> +
>>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>>> + /*
>>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>>> + *
>>>> + * All other tasks enqueued on this rq have their saved PC at the
>>>> + * context switch, so they will go through the kernel before returning
>>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>>> + * install the task_work on all of them.
>>>> + */
>>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>>> + while (node) {
>>>> + struct sched_entity *se = __node_2_se(node);
>>>> +
>>>> + if (!entity_is_task(se))
>>>> + goto next;
>>>> +
>>>> + p = task_of(se);
>>>> + task_throttle_setup_work(p);
>>>> +next:
>>>> + node = rb_next(node);
>>>> + }
>>>
>>> I'd like to strongly push back on this approach. This adds quite a lot
>>> of extra computation to an already expensive path
>>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
>>
>> Actually, with my suggestion in another email that we only need to mark
>> these cfs_rqs throttled when the quota is used up, and set up the throttle
>> task work when the tasks under those cfs_rqs get picked, the cost of the
>> throttle path is much like the dual-tree approach.
>>
>> As for the unthrottle path, you're right: it has to iterate over each task
>> on a list to get it queued back into the cfs_rq tree, so it needs more
>> thinking.
>>
>>> and so it is already O(cgroups) for the number of cgroups in the
>>> hierarchy being throttled. This gets even worse when you consider that
>>> we repeat this separately across all the cpus that the
>>> bandwidth-constrained group is running on. Keep in mind that
>>> throttle/unthrottle is done with rq lock held and IRQ disabled.
>>
>> Maybe we can avoid holding the rq lock when unthrottling? The per-task
>> unthrottle is much like just waking up lots of tasks, so maybe we can reuse
>> the ttwu path to wake those tasks, which could use remote IPIs to avoid
>> holding remote rq locks. I'm not sure, just some random thinking...
>>
>>>
>>> In K Prateek's last RFC, there was discussion of using context
>>> tracking; did you consider that approach any further? We could keep
>>> track of the number of threads within a cgroup hierarchy currently in
>>> kernel mode (similar to h_nr_runnable), and thus simplify down the
>>
>> Yeah, I think Prateek's approach is very creative! The only downside of
>> it is that the code becomes much more complex... on an already complex codebase.
>> Anyway, it would be great if that or this could be merged into the mainline kernel.
>
> I think the consensus is to move to per-task throttling since it'll
> simplify the efforts to move to a flat hierarchy sometime in the near
> future.
Ah, agree! And I'm very much looking forward to seeing a flat hierarchy!
In our clusters, there are often too many levels (3-6) of cgroups, which
cause too much overhead in scheduler activities.
>
> My original approach was complex since I wanted to seamlessly resume the
> pick part on unthrottle. In Ben's patch [1] there is a TODO in
> pick_next_entity() and that probably worked with the older vruntime based
> CFS algorithm but can break EEVDF guarantees.
>
> [1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
>
> Unfortunately keeping a single rbtree meant replicating the stats and
> that indeed adds to complexity.
Ok, got it.
Thanks!
>
>>
>> Lastly, I wonder whether it is possible to implement cgroup-level
>> bandwidth control, instead of doing it in each sched_class? Then SCX
>> tasks could be controlled too, without implementing it in BPF code...
>>
>> Thanks!
>>
>>> throttling code here.
>>>
>>> Best,
>>> Josh
>
^ permalink raw reply [flat|nested] 58+ messages in thread
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-20 8:39 ` Chengming Zhou
@ 2025-03-20 18:40 ` Xi Wang
2025-03-24 8:58 ` Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: Xi Wang @ 2025-03-20 18:40 UTC (permalink / raw)
To: Chengming Zhou
Cc: K Prateek Nayak, Josh Don, Aaron Lu, Valentin Schneider,
Ben Segall, Peter Zijlstra, Ingo Molnar, Vincent Guittot,
linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chuyi Zhou
On Thu, Mar 20, 2025 at 1:39 AM Chengming Zhou <chengming.zhou@linux.dev> wrote:
>
> Hello Prateek,
>
> On 2025/3/20 14:59, K Prateek Nayak wrote:
> > Hello Chengming,
> >
> > On 3/17/2025 8:24 AM, Chengming Zhou wrote:
> >> On 2025/3/16 11:25, Josh Don wrote:
> >>> Hi Aaron,
> >>>
> >>>> static int tg_throttle_down(struct task_group *tg, void *data)
> >>>> {
> >>>> struct rq *rq = data;
> >>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> >>>> + struct task_struct *p;
> >>>> + struct rb_node *node;
> >>>> +
> >>>> + cfs_rq->throttle_count++;
> >>>> + if (cfs_rq->throttle_count > 1)
> >>>> + return 0;
> >>>>
> >>>> /* group is entering throttled state, stop time */
> >>>> - if (!cfs_rq->throttle_count) {
> >>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> >>>> - list_del_leaf_cfs_rq(cfs_rq);
> >>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> >>>> + list_del_leaf_cfs_rq(cfs_rq);
> >>>>
> >>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> >>>> - if (cfs_rq->nr_queued)
> >>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
> >>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> >>>> + if (cfs_rq->nr_queued)
> >>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
> >>>> +
> >>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> >>>> + /*
> >>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
> >>>> + *
> >>>> + * All other tasks enqueued on this rq have their saved PC at the
> >>>> + * context switch, so they will go through the kernel before returning
> >>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
> >>>> + * install the task_work on all of them.
> >>>> + */
> >>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
> >>>> + while (node) {
> >>>> + struct sched_entity *se = __node_2_se(node);
> >>>> +
> >>>> + if (!entity_is_task(se))
> >>>> + goto next;
> >>>> +
> >>>> + p = task_of(se);
> >>>> + task_throttle_setup_work(p);
> >>>> +next:
> >>>> + node = rb_next(node);
> >>>> + }
> >>>
> >>> I'd like to strongly push back on this approach. This adds quite a lot
> >>> of extra computation to an already expensive path
> >>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
> >>
> >> Actually, with my suggestion in another email that we only need to mark
> >> these cfs_rqs throttled when the quota is used up, and set up the throttle
> >> task work when the tasks under those cfs_rqs get picked, the cost of the
> >> throttle path is much like the dual-tree approach.
> >>
> >> As for unthrottle path, you're right, it has to iterate each task on
> >> a list to get it queued into the cfs_rq tree, so it needs more thinking.
> >>
> >>> and so it is already O(cgroups) for the number of cgroups in the
> >>> hierarchy being throttled. This gets even worse when you consider that
> >>> we repeat this separately across all the cpus that the
> >>> bandwidth-constrained group is running on. Keep in mind that
> >>> throttle/unthrottle is done with rq lock held and IRQ disabled.
> >>
> >> Maybe we can avoid holding the rq lock when unthrottling? As for per-task
> >> unthrottle, it's much like just waking up lots of tasks, so maybe we
> >> can reuse the ttwu path to wake up those tasks, which could utilise remote
> >> IPIs to avoid holding remote rq locks. I'm not sure, just some random thinking...
> >>
> >>>
> >>> In K Prateek's last RFC, there was discussion of using context
> >>> tracking; did you consider that approach any further? We could keep
> >>> track of the number of threads within a cgroup hierarchy currently in
> >>> kernel mode (similar to h_nr_runnable), and thus simplify down the
> >>
> >> Yeah, I think Prateek's approach is very creative! The only downside of
> >> it is that the code becomes much more complex... on an already complex codebase.
> >> Anyway, it would be great if that or this could be merged into the mainline kernel.
> >
> > I think the consensus is to move to per-task throttling since it'll
> > simplify the efforts to move to a flat hierarchy sometime in the near
> > future.
>
> Ah, agree! And I'm very much looking forward to seeing a flat hierarchy!
>
> In our clusters, there are often too many levels (3-6) of cgroups, which
> cause too much overhead in scheduler activities.
>
> >
> > My original approach was complex since I wanted to seamlessly resume the
> > pick part on unthrottle. In Ben's patch [1] there is a TODO in
> > pick_next_entity() and that probably worked with the older vruntime based
> > CFS algorithm but can break EEVDF guarantees.
> >
> > [1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
> >
> > Unfortunately keeping a single rbtree meant replicating the stats and
> > that indeed adds to complexity.
>
> Ok, got it.
>
> Thanks!
>
> >
> >>
> >> Lastly, I wonder whether it is possible to implement cgroup-level
> >> bandwidth control, instead of doing it in each sched_class? Then SCX
> >> tasks could be controlled too, without implementing it in BPF code...
> >>
> >> Thanks!
> >>
> >>> throttling code here.
> >>>
> >>> Best,
> >>> Josh
> >
I am a bit unsure about the overhead experiment results. Maybe we can add some
counters to check how many cgroups per cpu are actually touched and how many
threads are actually dequeued / enqueued for throttling / unthrottling?
Looks like busy-loop workloads were used for the experiment. With throttling
deferred to exit_to_user_mode, it would only be triggered by ticks. A large
runtime debt can accumulate before the on-cpu threads are actually dequeued.
(Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
distribute_cfs_runtime would finish early if the quotas are used up by the first
few cpus, which would also result in throttling/unthrottling for only a few
runqueues per period. An intermittent workload like hackbench may give us more
information.
See slide 10 of my presentation for more info:
https://lpc.events/event/18/contributions/1883/attachments/1420/3040/Priority%20Inheritance%20for%20CFS%20Bandwidth%20Control.pdf
Indeed we are seeing more cfsb scalability problems with larger servers. Both
the irq off time from the cgroup walk and the overheads from per task actions
can be problematic.
-Xi
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-20 18:40 ` Xi Wang
@ 2025-03-24 8:58 ` Aaron Lu
2025-03-25 10:02 ` Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-24 8:58 UTC (permalink / raw)
To: Xi Wang
Cc: Chengming Zhou, K Prateek Nayak, Josh Don, Valentin Schneider,
Ben Segall, Peter Zijlstra, Ingo Molnar, Vincent Guittot,
linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chuyi Zhou
[-- Attachment #1: Type: text/plain, Size: 3165 bytes --]
On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
...
> I am a bit unsure about the overhead experiment results. Maybe we can add some
> counters to check how many cgroups per cpu are actually touched and how many
> threads are actually dequeued / enqueued for throttling / unthrottling?
Sure thing.
> Looks like busy loop workloads were used for the experiment. With throttling
> deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> runtime debt can accumulate before the on cpu threads are actually dequeued.
> (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
>
> distribute_cfs_runtime would finish early if the quotas are used up by the first
> few cpus, which would also result in throttling/unthrottling for only a few
> runqueues per period. An intermittent workload like hackbench may give us more
> information.
I've added some trace prints and noticed it already involved almost all
cpu rqs on that 2-socket/384-cpu test system, so I suppose it's OK to
continue using that setup as described before:
https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
Below is one print sample:
<idle>-0 [214] d.h1. 1879.281972: distribute_cfs_runtime: cpu214: begins
<idle>-0 [214] dNh2. 1879.283564: distribute_cfs_runtime: cpu214: finishes. unthrottled rqs=383, unthrottled_cfs_rq=1101, unthrottled_task=69
With async unthrottle, it's not possible to account exactly how many
cfs_rqs are unthrottled and how many tasks are enqueued back, just
how many rqs are involved and how many cfs_rqs/tasks the local cpu has
unthrottled. So this sample means in distribute_cfs_runtime(), 383 rqs
are involved and the local cpu has unthrottled 1101 cfs_rqs and a total
of 69 tasks are enqueued back.
The corresponding bpftrace histogram (duration of distribute_cfs_runtime(),
in nanoseconds) is:
@durations:
[4K, 8K) 9 | |
[8K, 16K) 227 |@@@@@@@@@@@@@@ |
[16K, 32K) 120 |@@@@@@@ |
[32K, 64K) 70 |@@@@ |
[64K, 128K) 0 | |
[128K, 256K) 0 | |
[256K, 512K) 0 | |
[512K, 1M) 158 |@@@@@@@@@ |
[1M, 2M) 832 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2M, 4M) 177 |@@@@@@@@@@@ |
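As a quick cross-check, the two timestamps in the cpu214 trace sample above bound that particular invocation's duration, and it lands in the modal [1M, 2M) bucket of the histogram (this is just arithmetic on the printed numbers, shown in Python for convenience):

```python
# Timestamps (in seconds) from the cpu214 trace sample above.
begin, end = 1879.281972, 1879.283564

# Duration of that distribute_cfs_runtime() invocation, in nanoseconds.
duration_ns = (end - begin) * 1e9  # ~1.59e6 ns, inside the [1M, 2M) bucket
```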
Thanks,
Aaron
> See slide 10 of my presentation for more info:
> https://lpc.events/event/18/contributions/1883/attachments/1420/3040/Priority%20Inheritance%20for%20CFS%20Bandwidth%20Control.pdf
>
> Indeed we are seeing more cfsb scalability problems with larger servers. Both
> the irq off time from the cgroup walk and the overheads from per task actions
> can be problematic.
>
> -Xi
[-- Attachment #2: upstream.diff --]
[-- Type: text/x-diff, Size: 2512 bytes --]
Subject: [DEBUG PATCH] sched/fair: add profile for distribute_cfs_runtime()
---
kernel/sched/fair.c | 10 ++++++++++
kernel/sched/sched.h | 2 ++
2 files changed, 12 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d646451d617c1..a4e3780c076e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5922,10 +5922,12 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttled_clock_self_time += delta;
}
+ rq->unthrottled_cfs_rq++;
/* Re-enqueue the tasks that have been throttled at this level. */
list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
list_del_init(&p->throttle_node);
enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ rq->unthrottled_task++;
}
/* Add cfs_rq with load or one or more already running entities to the list */
@@ -6192,6 +6194,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
struct rq_flags rf;
struct rq *rq;
LIST_HEAD(local_unthrottle);
+ unsigned int unthrottled_rqs = 0;
+
+ trace_printk("cpu%d: begins\n", this_cpu);
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -6228,6 +6233,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
if (cfs_rq->runtime_remaining > 0) {
if (cpu_of(rq) != this_cpu) {
unthrottle_cfs_rq_async(cfs_rq);
+ unthrottled_rqs++;
} else {
/*
* We currently only expect to be unthrottling
@@ -6250,12 +6256,16 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
struct rq *rq = rq_of(cfs_rq);
rq_lock_irqsave(rq, &rf);
+ rq->unthrottled_cfs_rq = rq->unthrottled_task = 0;
list_del_init(&cfs_rq->throttled_csd_list);
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
+ trace_printk("cpu%d: finishes. unthrottled rqs=%u, unthrottled_cfs_rq=%d, unthrottled_task=%d\n",
+ raw_smp_processor_id(), unthrottled_rqs,
+ rq->unthrottled_cfs_rq, rq->unthrottled_task);
rq_unlock_irqrestore(rq, &rf);
}
SCHED_WARN_ON(!list_empty(&local_unthrottle));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5c2af5a70163c..d004da2bc9071 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1309,6 +1309,8 @@ struct rq {
#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
+ unsigned int unthrottled_cfs_rq;
+ unsigned int unthrottled_task;
#endif
};
--
2.39.5
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-24 8:58 ` Aaron Lu
@ 2025-03-25 10:02 ` Aaron Lu
2025-03-28 0:11 ` Xi Wang
0 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-25 10:02 UTC (permalink / raw)
To: Xi Wang
Cc: Chengming Zhou, K Prateek Nayak, Josh Don, Valentin Schneider,
Ben Segall, Peter Zijlstra, Ingo Molnar, Vincent Guittot,
linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chuyi Zhou
[-- Attachment #1: Type: text/plain, Size: 3519 bytes --]
On Mon, Mar 24, 2025 at 04:58:22PM +0800, Aaron Lu wrote:
> On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
> ...
> > I am a bit unsure about the overhead experiment results. Maybe we can add some
> > counters to check how many cgroups per cpu are actually touched and how many
> > threads are actually dequeued / enqueued for throttling / unthrottling?
>
> Sure thing.
>
> > Looks like busy loop workloads were used for the experiment. With throttling
> > deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> > runtime debt can accumulate before the on cpu threads are actually dequeued.
> > (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
> >
> > distribute_cfs_runtime would finish early if the quotas are used up by the first
> > few cpus, which would also result in throttling/unthrottling for only a few
> > runqueues per period. An intermittent workload like hackbench may give us more
> > information.
>
> I've added some trace prints and noticed it already involved almost all
> cpu rqs on that 2-socket/384-cpu test system, so I suppose it's OK to
> continue using that setup as described before:
> https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
One more data point that might be interesting: I've tested this on a
v5.15-based kernel where async unthrottle is not available yet, so things
should be worse.
As Xi mentioned, since the test program is a cpu hog, I tweaked the quota
setting to make throttling happen more likely.
The bpftrace duration of distribute_cfs_runtime() is:
@durations:
[4K, 8K) 1 | |
[8K, 16K) 8 | |
[16K, 32K) 1 | |
[32K, 64K) 0 | |
[64K, 128K) 0 | |
[128K, 256K) 0 | |
[256K, 512K) 0 | |
[512K, 1M) 0 | |
[1M, 2M) 0 | |
[2M, 4M) 0 | |
[4M, 8M) 0 | |
[8M, 16M) 0 | |
[16M, 32M) 0 | |
[32M, 64M) 376 |@@@@@@@@@@@@@@@@@@@@@@@ |
[64M, 128M) 824 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
One random trace point from the trace prints is:
<idle>-0 [117] d.h1. 83206.734588: distribute_cfs_runtime: cpu117: begins
<idle>-0 [117] dnh1. 83206.801902: distribute_cfs_runtime: cpu117: finishes: unthrottled_rqs=384, unthrottled_cfs_rq=422784, unthrottled_task=10000
So for the above trace point, distribute_cfs_runtime() unthrottled 384
rqs with a total of 422784 cfs_rqs and enqueued back 10000 tasks; this
took about 70ms.
Note that other things like rq lock contention might make things worse -
I did not notice any lock contention in this setup.
I've attached the corresponding debug diff in case it's not clear what
this trace print means.
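For a rough sense of scale (back-of-envelope arithmetic on the numbers printed above; the per-item figure lumps the 10000 task enqueues in with the cfs_rq iterations, so it is only a crude average):

```python
# Numbers from the v5.15 trace sample above (no async unthrottle).
begin, end = 83206.734588, 83206.801902
unthrottled_cfs_rq = 422784
unthrottled_task = 10000

duration_ns = (end - begin) * 1e9  # ~6.7e7 ns, i.e. ~67 ms,
                                   # in the [64M, 128M) histogram bucket
per_cfs_rq_ns = duration_ns / unthrottled_cfs_rq  # ~160 ns per cfs_rq walked
```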
[-- Attachment #2: 5.15.diff --]
[-- Type: text/x-diff, Size: 2898 bytes --]
Subject: [DEBUG PATCH] sched/fair: profile distribute_cfs_runtime()
---
kernel/sched/fair.c | 17 +++++++++++++++++
kernel/sched/sched.h | 2 ++
2 files changed, 19 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da3f728b27725..e3546274a162d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5009,6 +5009,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
+ rq->unthrottled_cfs_rq++;
+
/* Re-enqueue the tasks that have been throttled at this level. */
list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
list_del_init(&p->throttle_node);
@@ -5017,6 +5019,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
* due to affinity change while p is throttled.
*/
enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ rq->unthrottled_task++;
}
/* Add cfs_rq with load or one or more already running entities to the list */
@@ -5193,7 +5196,9 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
struct cfs_rq *cfs_rq;
u64 runtime, remaining = 1;
+ unsigned int unthrottled_rqs = 0, unthrottled_cfs_rq = 0, unthrottled_task = 0;
+ trace_printk("cpu%d: begins\n", raw_smp_processor_id());
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
@@ -5201,6 +5206,7 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
+ rq->unthrottled_cfs_rq = rq->unthrottled_task = 0;
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -5222,12 +5228,23 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
unthrottle_cfs_rq(cfs_rq);
next:
+ trace_printk("cpu%d: cpu%d unthrottled_cfs_rq=%d/%d, unthrottled_task=%d/%d, remaining=%Lu\n",
+ raw_smp_processor_id(), cpu_of(rq),
+ rq->unthrottled_cfs_rq, unthrottled_cfs_rq,
+ rq->unthrottled_task, unthrottled_task, remaining);
+
+ unthrottled_cfs_rq += rq->unthrottled_cfs_rq;
+ unthrottled_task += rq->unthrottled_task;
+ unthrottled_rqs++;
+ rq->unthrottled_cfs_rq = rq->unthrottled_task = 0;
rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
}
rcu_read_unlock();
+ trace_printk("cpu%d: finishes: unthrottled_rqs=%u, unthrottled_cfs_rq=%u, unthrottled_task=%u\n",
+ raw_smp_processor_id(), unthrottled_rqs, unthrottled_cfs_rq, unthrottled_task);
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e05847855f0..bd3a11582d5b6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,6 +1118,8 @@ struct rq {
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
#endif
+ unsigned int unthrottled_cfs_rq;
+ unsigned int unthrottled_task;
};
#ifdef CONFIG_FAIR_GROUP_SCHED
--
2.39.5
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-25 10:02 ` Aaron Lu
@ 2025-03-28 0:11 ` Xi Wang
2025-03-28 3:11 ` Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: Xi Wang @ 2025-03-28 0:11 UTC (permalink / raw)
To: Aaron Lu
Cc: Chengming Zhou, K Prateek Nayak, Josh Don, Valentin Schneider,
Ben Segall, Peter Zijlstra, Ingo Molnar, Vincent Guittot,
linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chuyi Zhou
On Tue, Mar 25, 2025 at 3:02 AM Aaron Lu <ziqianlu@bytedance.com> wrote:
>
> On Mon, Mar 24, 2025 at 04:58:22PM +0800, Aaron Lu wrote:
> > On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
> > ...
> > > I am a bit unsure about the overhead experiment results. Maybe we can add some
> > > counters to check how many cgroups per cpu are actually touched and how many
> > > threads are actually dequeued / enqueued for throttling / unthrottling?
> >
> > Sure thing.
> >
> > > Looks like busy loop workloads were used for the experiment. With throttling
> > > deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> > > runtime debt can accumulate before the on cpu threads are actually dequeued.
> > > (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
> > >
> > > distribute_cfs_runtime would finish early if the quotas are used up by the first
> > > few cpus, which would also result in throttling/unthrottling for only a few
> > > runqueues per period. An intermittent workload like hackbench may give us more
> > > information.
> >
> > I've added some trace prints and noticed it already involved almost all
> > cpu rqs on that 2-socket/384-cpu test system, so I suppose it's OK to
> > continue using that setup as described before:
> > https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
>
> One more data point that might be interesting. I've tested this on a
> v5.15 based kernel where async unthrottle is not available yet so things
> should be worse.
>
> As Xi mentioned, since the test program is cpu hog, I tweaked the quota
> setting to make throttle happen more likely.
>
> The bpftrace duration of distribute_cfs_runtime() is:
>
> @durations:
> [4K, 8K) 1 | |
> [8K, 16K) 8 | |
> [16K, 32K) 1 | |
> [32K, 64K) 0 | |
> [64K, 128K) 0 | |
> [128K, 256K) 0 | |
> [256K, 512K) 0 | |
> [512K, 1M) 0 | |
> [1M, 2M) 0 | |
> [2M, 4M) 0 | |
> [4M, 8M) 0 | |
> [8M, 16M) 0 | |
> [16M, 32M) 0 | |
> [32M, 64M) 376 |@@@@@@@@@@@@@@@@@@@@@@@ |
> [64M, 128M) 824 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>
> One random trace point from the trace prints is:
>
> <idle>-0 [117] d.h1. 83206.734588: distribute_cfs_runtime: cpu117: begins
> <idle>-0 [117] dnh1. 83206.801902: distribute_cfs_runtime: cpu117: finishes: unthrottled_rqs=384, unthrottled_cfs_rq=422784, unthrottled_task=10000
>
> So for the above trace point, distribute_cfs_runtime() unthrottled 384
> rqs with a total of 422784 cfs_rqs and enqueued back 10000 tasks, this
> took about 70ms.
>
> Note that other things like rq lock contention might make things worse -
> I did not notice any lock contention in this setup.
>
> I've attached the corresponding debug diff in case it's not clear what
> this trace print means.
Thanks for getting the test results!
My understanding is that you now have 2 test configurations and new vs
old throttling mechanisms. I think the two groups of results were
test1 + new method and test2 + old method. Is that the case?
For test1 + new method, we have "..in distribute_cfs_runtime(), 383
rqs are involved and the local cpu has unthrottled 1101 cfs_rqs and a
total of 69 tasks are enqueued back". I think if the workload is in a
steady and persistently over limit state we'd have 1000+ tasks
periodically being throttled and unthrottled, at least with the old
method. So "1101 cfs_rqs and a total of 69 tasks are enqueued back"
might be a special case?
I'll also try to create a stress test that mimics our production
problems but it would take some time.
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-28 0:11 ` Xi Wang
@ 2025-03-28 3:11 ` Aaron Lu
0 siblings, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-03-28 3:11 UTC (permalink / raw)
To: Xi Wang
Cc: Chengming Zhou, K Prateek Nayak, Josh Don, Valentin Schneider,
Ben Segall, Peter Zijlstra, Ingo Molnar, Vincent Guittot,
linux-kernel, Juri Lelli, Dietmar Eggemann, Steven Rostedt,
Mel Gorman, Chuyi Zhou
On Thu, Mar 27, 2025 at 05:11:42PM -0700, Xi Wang wrote:
> On Tue, Mar 25, 2025 at 3:02 AM Aaron Lu <ziqianlu@bytedance.com> wrote:
> >
> > On Mon, Mar 24, 2025 at 04:58:22PM +0800, Aaron Lu wrote:
> > > On Thu, Mar 20, 2025 at 11:40:11AM -0700, Xi Wang wrote:
> > > ...
> > > > I am a bit unsure about the overhead experiment results. Maybe we can add some
> > > > counters to check how many cgroups per cpu are actually touched and how many
> > > > threads are actually dequeued / enqueued for throttling / unthrottling?
> > >
> > > Sure thing.
> > >
> > > > Looks like busy loop workloads were used for the experiment. With throttling
> > > > deferred to exit_to_user_mode, it would only be triggered by ticks. A large
> > > > runtime debt can accumulate before the on cpu threads are actually dequeued.
> > > > (Also noted in https://lore.kernel.org/lkml/20240711130004.2157737-11-vschneid@redhat.com/)
> > > >
> > > > distribute_cfs_runtime would finish early if the quotas are used up by the first
> > > > few cpus, which would also result in throttling/unthrottling for only a few
> > > > runqueues per period. An intermittent workload like hackbench may give us more
> > > > information.
> > >
> > > I've added some trace prints and noticed it already involved almost all
> > > cpu rqs on that 2-socket/384-cpu test system, so I suppose it's OK to
> > > continue using that setup as described before:
> > > https://lore.kernel.org/lkml/CANCG0GdOwS7WO0k5Fb+hMd8R-4J_exPTt2aS3-0fAMUC5pVD8g@mail.gmail.com/
> >
> > One more data point that might be interesting. I've tested this on a
> > v5.15 based kernel where async unthrottle is not available yet so things
> > should be worse.
> >
> > As Xi mentioned, since the test program is cpu hog, I tweaked the quota
> > setting to make throttle happen more likely.
> >
> > The bpftrace duration of distribute_cfs_runtime() is:
> >
> > @durations:
> > [4K, 8K) 1 | |
> > [8K, 16K) 8 | |
> > [16K, 32K) 1 | |
> > [32K, 64K) 0 | |
> > [64K, 128K) 0 | |
> > [128K, 256K) 0 | |
> > [256K, 512K) 0 | |
> > [512K, 1M) 0 | |
> > [1M, 2M) 0 | |
> > [2M, 4M) 0 | |
> > [4M, 8M) 0 | |
> > [8M, 16M) 0 | |
> > [16M, 32M) 0 | |
> > [32M, 64M) 376 |@@@@@@@@@@@@@@@@@@@@@@@ |
> > [64M, 128M) 824 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> >
> > One random trace point from the trace prints is:
> >
> > <idle>-0 [117] d.h1. 83206.734588: distribute_cfs_runtime: cpu117: begins
> > <idle>-0 [117] dnh1. 83206.801902: distribute_cfs_runtime: cpu117: finishes: unthrottled_rqs=384, unthrottled_cfs_rq=422784, unthrottled_task=10000
> >
> > So for the above trace point, distribute_cfs_runtime() unthrottled 384
> > rqs with a total of 422784 cfs_rqs and enqueued back 10000 tasks, this
> > took about 70ms.
> >
> > Note that other things like rq lock contention might make things worse -
> > I did not notice any lock contention in this setup.
> >
> > I've attached the corresponding debug diff in case it's not clear what
> > this trace print means.
>
> Thanks for getting the test results!
>
> My understanding is that you now have 2 test configurations and new vs
> old throttling mechanisms. I think the two groups of results were
> test1 + new method and test2 + old method. Is that the case?
Sorry for the confusion.
The first result was done using this patch series on top of the latest
tip/sched/core branch, which has the async unthrottle feature. The second
result was done using this patch series (adjusted to run on an old kernel,
of course) on top of a v5.15-based kernel where async unthrottle is not
available yet.
>
> For test1 + new method, we have "..in distribute_cfs_runtime(), 383
> rqs are involved and the local cpu has unthrottled 1101 cfs_rqs and a
> total of 69 tasks are enqueued back". I think if the workload is in a
> steady and persistently over limit state we'd have 1000+ tasks
> periodically being throttled and unthrottled, at least with the old
> method. So "1101 cfs_rqs and a total of 69 tasks are enqueued back"
> might be a special case?
With async unthrottle, distribute_cfs_runtime() only deals with the
local cpu's unthrottle; the other cpus' unthrottle is done by sending
an IPI to let those cpus deal with their own unthrottle.
Since there are a total of 2000 leaf cfs_rqs and 20K tasks, on this
384-cpu machine each cpu should have roughly 52 tasks, and one top-level
hierarchy has about 1000 leaf cfs_rqs. That's why 1101 cfs_rqs were
unthrottled and 69 tasks were enqueued. These numbers mean that on the
specific cpu alone where the hrtimer fired, it iterated 1101 cfs_rqs and
enqueued back 69 tasks; all other tasks are enqueued in the other CPUs'
IPI handler __cfsb_csd_unthrottle(). With this said, for this setup, "1101
cfs_rqs and 69 tasks" is not a special case but a worse-than-normal case,
if not the worst.
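The arithmetic above can be checked directly; assuming (as the trace suggests) roughly the same 1101 throttled cfs_rqs on every cpu, the per-cpu numbers also reproduce the 422784 total seen in the earlier v5.15 trace:

```python
cpus = 384
total_tasks = 20_000
local_cfs_rqs = 1101  # cfs_rqs the local cpu unthrottled in the trace sample

tasks_per_cpu = total_tasks / cpus    # ~52 tasks per cpu
total_cfs_rqs = cpus * local_cfs_rqs  # 422784, matching the v5.15 trace
```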
When the async unthrottle feature is not available (like in the 2nd result),
distribute_cfs_runtime() has to iterate all cpus' throttled
cfs_rqs and enqueue back all those throttled tasks. Since these numbers
are much larger, I showed them for the sole purpose of demonstrating
how bad the duration can be when 10K tasks have to be enqueued back in
one go.
> I'll also try to create a stress test that mimics our production
> problems but it would take some time.
That would be good to have, thanks.
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-20 6:59 ` K Prateek Nayak
2025-03-20 8:39 ` Chengming Zhou
@ 2025-03-28 22:47 ` Benjamin Segall
1 sibling, 0 replies; 58+ messages in thread
From: Benjamin Segall @ 2025-03-28 22:47 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Chengming Zhou, Josh Don, Aaron Lu, Valentin Schneider,
Peter Zijlstra, Ingo Molnar, Vincent Guittot, linux-kernel,
Juri Lelli, Dietmar Eggemann, Steven Rostedt, Mel Gorman,
Chuyi Zhou, Xi Wang
K Prateek Nayak <kprateek.nayak@amd.com> writes:
> Hello Chengming,
>
> On 3/17/2025 8:24 AM, Chengming Zhou wrote:
>> On 2025/3/16 11:25, Josh Don wrote:
>>> Hi Aaron,
>>>
>>>> static int tg_throttle_down(struct task_group *tg, void *data)
>>>> {
>>>> struct rq *rq = data;
>>>> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
>>>> + struct task_struct *p;
>>>> + struct rb_node *node;
>>>> +
>>>> + cfs_rq->throttle_count++;
>>>> + if (cfs_rq->throttle_count > 1)
>>>> + return 0;
>>>>
>>>> /* group is entering throttled state, stop time */
>>>> - if (!cfs_rq->throttle_count) {
>>>> - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> - list_del_leaf_cfs_rq(cfs_rq);
>>>> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
>>>> + list_del_leaf_cfs_rq(cfs_rq);
>>>>
>>>> - SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> - if (cfs_rq->nr_queued)
>>>> - cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> + SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>>>> + if (cfs_rq->nr_queued)
>>>> + cfs_rq->throttled_clock_self = rq_clock(rq);
>>>> +
>>>> + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>>>> + /*
>>>> + * rq_lock is held, current is (obviously) executing this in kernelspace.
>>>> + *
>>>> + * All other tasks enqueued on this rq have their saved PC at the
>>>> + * context switch, so they will go through the kernel before returning
>>>> + * to userspace. Thus, there are no tasks-in-userspace to handle, just
>>>> + * install the task_work on all of them.
>>>> + */
>>>> + node = rb_first(&cfs_rq->tasks_timeline.rb_root);
>>>> + while (node) {
>>>> + struct sched_entity *se = __node_2_se(node);
>>>> +
>>>> + if (!entity_is_task(se))
>>>> + goto next;
>>>> +
>>>> + p = task_of(se);
>>>> + task_throttle_setup_work(p);
>>>> +next:
>>>> + node = rb_next(node);
>>>> + }
>>>
>>> I'd like to strongly push back on this approach. This adds quite a lot
>>> of extra computation to an already expensive path
>>> (throttle/unthrottle). e.g. this function is part of the cgroup walk
>> Actually, with my suggestion in another email that we only need to mark
>> these cfs_rqs throttled when quota is used up, and set up the throttle task
>> work when the tasks under those cfs_rqs get picked, the cost of the throttle
>> path is much like the dual tree approach.
>> As for unthrottle path, you're right, it has to iterate each task on
>> a list to get it queued into the cfs_rq tree, so it needs more thinking.
>>
>>> and so it is already O(cgroups) for the number of cgroups in the
>>> hierarchy being throttled. This gets even worse when you consider that
>>> we repeat this separately across all the cpus that the
>>> bandwidth-constrained group is running on. Keep in mind that
>>> throttle/unthrottle is done with rq lock held and IRQ disabled.
>> Maybe we can avoid holding the rq lock when unthrottling? As for per-task
>> unthrottle, it's much like just waking up lots of tasks, so maybe we
>> can reuse the ttwu path to wake up those tasks, which could utilise remote
>> IPI to avoid holding remote rq locks. I'm not sure, just some random thinking...
>>
Yeah, looping over all the fully throttled tasks in unthrottle still
isn't great (and needing to do a full enqueue operation for each). It's
probably much better than looping over all the runnable tasks here
though.
Remote IPI shouldn't be very helpful, because we already have that in
async unthrottle. Whether or not it's useful to occasionally release the
lock while doing all the per-task unthrottles, like loop_break does, who
knows; but it's certainly easy enough to do while just staying under RCU.
>>>
>>> In K Prateek's last RFC, there was discussion of using context
>>> tracking; did you consider that approach any further? We could keep
>>> track of the number of threads within a cgroup hierarchy currently in
>>> kernel mode (similar to h_nr_runnable), and thus simplify down the
>> Yeah, I think Prateek's approach is very creative! The only downside of
>> it is that the code becomes much more complex... on an already complex
>> codebase. Anyway, it would be great if that or this could be merged into the mainline kernel.
>
> I think the consensus is to move to per-task throttling since it'll
> simplify the efforts to move to a flat hierarchy sometime in the near
> future.
>
> My original approach was complex since I wanted to seamlessly resume the
> pick part on unthrottle. In Ben's patch [1] there is a TODO in
> pick_next_entity() and that probably worked with the older vruntime-based
> CFS algorithm but can break EEVDF guarantees.
>
> [1] https://lore.kernel.org/all/xm26edfxpock.fsf@bsegall-linux.svl.corp.google.com/
>
> Unfortunately keeping a single rbtree meant replicating the stats and
> that indeed adds to complexity.
>
Does anything actually rely on those guarantees for correctness in the
scheduler? Would anything actually break if something overrode
pick_next_task_fair to just pick a random runnable task from the rq, or
similar? I'd only expect us to lose out on fairness, and only to the
extent that we're overriding the pick (and not as an ongoing
repercussion from a single unfair pick).
There are still plenty of potential reasons to want to provide better
fairness even between "throttled tasks still running in the kernel", but
I don't think that halfassing it would break EEVDF more than CFS. It
would however be significantly more annoying to duplicate the tree
nowadays with the additional data required by entity_eligible.
* Re: [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle
2025-03-14 4:14 ` K Prateek Nayak
2025-03-14 11:37 ` [External] " Aaron Lu
@ 2025-03-31 6:19 ` Aaron Lu
2025-04-01 3:17 ` K Prateek Nayak
1 sibling, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-31 6:19 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
Hi Prateek,
On Fri, Mar 14, 2025 at 09:44:45AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/13/2025 12:52 PM, Aaron Lu wrote:
> > Tasks throttled on the exit to user path are scheduled by cond_resched() in
> > task_work_run() but that is a preempt schedule and doesn't mark a task
> > rcu quiescent state.
> >
> > Fix this by directly calling schedule() in throttle_cfs_rq_work().
>
> Perhaps that can be gotten around by just using set_ti_thread_flag();
> resched_curr() will also call set_preempt_need_resched(), which allows
> cond_resched() to resched the task.
>
> Since exit_to_user_mode_loop() will run once again seeing that
> TIF_NEED_RESCHED is set, schedule() should follow soon. Thoughts?
>
I tried this and noticed an unexpected consequence: get_signal() will
also run task works, and if the signal is a kill, the following can happen:
exit_to_user_mode_loop() -> get_signal() -> throttle_task_work() ->
do_exit() -> exit_signals() -> percpu_rwsem_wait() -> schedule() ->
try_to_block_task() -> dequeue_task_fair().
I would like to avoid this path, at least for now. I want to make sure
that for throttled tasks, only events like task group change, affinity change
etc. can cause them to be dequeued by core code; that's why I added
SCHED_WARN_ON(flags & DEQUEUE_SLEEP) in dequeue_task_fair(). I think
this can help me capture any unexpected events like this.
Besides, I think explicitly calling schedule() has the advantage of not
relying on other code, i.e. it doesn't matter if someone removes
cond_resched() from task_work_run() some time later or changes the
logic in exit_to_user_mode_loop().
So I hope you don't mind that I keep schedule() in throttle_cfs_rq_work(),
but if you see anything wrong with doing this, feel free to let me know,
thanks.
Best regards,
Aaron
> > Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
> > ---
> > kernel/sched/fair.c | 5 +++++
> > 1 file changed, 5 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index f26d53ac143fe..be96f7d32998c 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5847,6 +5847,7 @@ static void throttle_cfs_rq_work(struct
> > callback_head *work)
> > struct cfs_rq *cfs_rq;
> > struct rq *rq;
> > struct rq_flags rf;
> > + bool sched = false;
> >
> > WARN_ON_ONCE(p != current);
> > p->sched_throttle_work.next = &p->sched_throttle_work;
> > @@ -5879,9 +5880,13 @@ static void throttle_cfs_rq_work(struct
> > callback_head *work)
> > dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
> > list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> > resched_curr(rq);
> > + sched = true;
> >
> > out_unlock:
> > task_rq_unlock(rq, p, &rf);
> > +
> > + if (sched)
> > + schedule();
> > }
> >
> > void init_cfs_throttle_work(struct task_struct *p)
>
> --
> Thanks and Regards,
> Prateek
>
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-14 11:07 ` Chengming Zhou
@ 2025-03-31 6:42 ` Aaron Lu
2025-03-31 9:14 ` Chengming Zhou
0 siblings, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-03-31 6:42 UTC (permalink / raw)
To: Chengming Zhou
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
Hi Chengming,
On Fri, Mar 14, 2025 at 07:07:10PM +0800, Chengming Zhou wrote:
> On 2025/3/14 17:42, Aaron Lu wrote:
> > On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
> > > On 2025/3/13 15:21, Aaron Lu wrote:
> > > > From: Valentin Schneider <vschneid@redhat.com>
> > > >
> > > > Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
> > > > add a task work to them so that when those tasks return to user, the
> > > > actual throttle/dequeue can happen.
> > > >
> > > > Note that since the throttle/dequeue always happens on a task basis when
> > > > it returns to user, it's no longer necessary for check_cfs_rq_runtime()
> > > > to return a value and pick_task_fair() acts differently according to that
> > > > return value, so check_cfs_rq_runtime() is changed to not return a
> > > > value.
> > >
> > > Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
> > > to throttle the cfs_rq and dequeue it from the cfs_rq tree.
> > >
> > > Now with your per-task throttling, maybe things can become simpler, in that we
> > > can just throttle_cfs_rq() (the cfs_rq subtree) at curr accounting time to mark these
> > > throttled.
> >
> > Do I understand correctly that now in throttle_cfs_rq(), we just mark
> > this hierarchy as throttled, but do not add any throttle work to the
> > tasks in this hierarchy, and leave adding the throttle work to pick
> > time?
>
> Right, we can move throttle_cfs_rq() forward to the curr accounting time, which
> just marks these throttled.
While preparing the next version, I found that if I move
throttle_cfs_rq() to accounting time, like in __account_cfs_rq_runtime(),
then it is possible on the unthrottle path that the following happens:
unthrottle_cfs_rq() -> enqueue_task_fair() -> update_curr() ->
account_cfs_rq_runtime() -> throttle_cfs_rq()...
Initially I was confused why update_curr() can notice a non-null curr
while this cfs_rq is being unthrottled, but then I realized that in this
task-based throttling model, it is possible some task woke up in that
throttled cfs_rq and got cfs_rq->curr set before the cfs_rq gets
unthrottled.
So I suppose I'll keep the existing way of marking a cfs_rq as
throttled by calling check_cfs_rq_runtime() in the following two places:
- in pick_task_fair(), so that the to-be-picked cfs_rq can be marked for
throttle;
- in put_prev_entity() for prev runnable task's cfs_rq.
> And move setup_task_work() afterward to the pick task time, which makes that task
> dequeue on ret2user.
No problem for this part as far as my test goes :-)
Thanks,
Aaron
> >
> > > Then if we pick a task from a throttled cfs_rq subtree, we can set up a task work
> > > for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
> >
> > If we add a check point in pick time, maybe we can also avoid the check
> > in enqueue time. One thing I'm thinking is, for a task, it may be picked
> > multiple times with only a single enqueue so if we do the check in pick,
> > the overhead can be larger?
>
> As Prateek already mentioned, this check cost is negligible.
>
> >
> > > WDYT?
> >
> > Thanks for your suggestion. I'll try this approach and see how it turned
> > out.
* Re: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle
2025-03-31 6:42 ` Aaron Lu
@ 2025-03-31 9:14 ` Chengming Zhou
0 siblings, 0 replies; 58+ messages in thread
From: Chengming Zhou @ 2025-03-31 9:14 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, K Prateek Nayak, Peter Zijlstra,
Josh Don, Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chuyi Zhou
On 2025/3/31 14:42, Aaron Lu wrote:
> Hi Chengming,
>
> On Fri, Mar 14, 2025 at 07:07:10PM +0800, Chengming Zhou wrote:
>> On 2025/3/14 17:42, Aaron Lu wrote:
>>> On Fri, Mar 14, 2025 at 04:39:41PM +0800, Chengming Zhou wrote:
>>>> On 2025/3/13 15:21, Aaron Lu wrote:
>>>>> From: Valentin Schneider <vschneid@redhat.com>
>>>>>
>>>>> Once a cfs_rq gets throttled, for all tasks belonging to this cfs_rq,
>>>>> add a task work to them so that when those tasks return to user, the
>>>>> actual throttle/dequeue can happen.
>>>>>
>>>>> Note that since the throttle/dequeue always happens on a task basis when
>>>>> it returns to user, it's no longer necessary for check_cfs_rq_runtime()
>>>>> to return a value and pick_task_fair() acts differently according to that
>>>>> return value, so check_cfs_rq_runtime() is changed to not return a
>>>>> value.
>>>>
>>>> Previously with the per-cfs_rq throttling, we use update_curr() -> put() path
>>>> to throttle the cfs_rq and dequeue it from the cfs_rq tree.
>>>>
>>>> Now with your per-task throttling, maybe things can become simpler, in that we
>>>> can just throttle_cfs_rq() (the cfs_rq subtree) at curr accounting time to mark these
>>>> throttled.
>>>
>>> Do I understand correctly that now in throttle_cfs_rq(), we just mark
>>> this hierarchy as throttled, but do not add any throttle work to the
>>> tasks in this hierarchy, and leave adding the throttle work to pick
>>> time?
>>
>> Right, we can move throttle_cfs_rq() forward to the curr accounting time, which
>> just marks these throttled.
>
> While preparing the next version, I found that if I move
> throttle_cfs_rq() to accounting time, like in __account_cfs_rq_runtime(),
> then it is possible on the unthrottle path that the following happens:
> unthrottle_cfs_rq() -> enqueue_task_fair() -> update_curr() ->
> account_cfs_rq_runtime() -> throttle_cfs_rq()...
Ah, right, then it's best to leave throttle_cfs_rq() where it is.
>
> Initially I was confused why update_curr() can notice a non-null curr
> while this cfs_rq is being unthrottled, but then I realized that in this
> task-based throttling model, it is possible some task woke up in that
> throttled cfs_rq and got cfs_rq->curr set before the cfs_rq gets
> unthrottled.
>
> So I suppose I'll keep the existing way of marking a cfs_rq as
> throttled by calling check_cfs_rq_runtime() in the following two places:
> - in pick_task_fair(), so that the to-be-picked cfs_rq can be marked for
> throttle;
> - in put_prev_entity() for prev runnable task's cfs_rq.
>
>> And move setup_task_work() afterward to the pick task time, which makes that task
>> dequeue on ret2user.
>
> No problem for this part as far as my test goes :-)
Good to hear.
Thanks!
>
> Thanks,
> Aaron
>
>>>
>>>> Then if we pick a task from a throttled cfs_rq subtree, we can set up a task work
>>>> for it, so we don't bother with the delayed_dequeue task case that Prateek mentioned.
>>>
>>> If we add a check point in pick time, maybe we can also avoid the check
>>> in enqueue time. One thing I'm thinking is, for a task, it may be picked
>>> multiple times with only a single enqueue so if we do the check in pick,
>>> the overhead can be larger?
>>
>> As Prateek already mentioned, this check cost is negligible.
>>
>>>
>>>> WDYT?
>>>
>>> Thanks for your suggestion. I'll try this approach and see how it turned
>>> out.
* Re: [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle
2025-03-31 6:19 ` Aaron Lu
@ 2025-04-01 3:17 ` K Prateek Nayak
2025-04-01 8:48 ` Aaron Lu
0 siblings, 1 reply; 58+ messages in thread
From: K Prateek Nayak @ 2025-04-01 3:17 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
Hello Aaron,
On 3/31/2025 11:49 AM, Aaron Lu wrote:
>>> Tasks throttled on the exit to user path are scheduled by cond_resched() in
>>> task_work_run() but that is a preempt schedule and doesn't mark a task
>>> rcu quiescent state.
So browsing through kernel/rcu/tasks.h, I found
cond_resched_tasks_rcu_qs(), which basically clears the task holdout
before calling schedule(). The question is, is it safe to use in the
task_work_run() context? I have no idea, but you can do a resched_curr()
followed by a cond_resched_tasks_rcu_qs() in throttle_cfs_rq_work() and
that should give you the same effect as doing a schedule().
Thoughts?
>>>
>>> Fix this by directly calling schedule() in throttle_cfs_rq_work().
>> Perhaps that can be gotten around by just using set_ti_thread_flag();
>> resched_curr() will also call set_preempt_need_resched(), which allows
>> cond_resched() to resched the task.
>>
>> Since exit_to_user_mode_loop() will run once again seeing that
>> TIF_NEED_RESCHED is set, schedule() should follow soon. Thoughts?
>>
> I tried this and noticed an unexpected consequence: get_signal() will
> also run task works, and if the signal is a kill, the following can happen:
> exit_to_user_mode_loop() -> get_signal() -> throttle_task_work() ->
> do_exit() -> exit_signals() -> percpu_rwsem_wait() -> schedule() ->
> try_to_block_task() -> dequeue_task_fair().
>
> I would like to avoid this path, at least for now. I want to make sure
> that for throttled tasks, only events like task group change, affinity change
> etc. can cause them to be dequeued by core code; that's why I added
> SCHED_WARN_ON(flags & DEQUEUE_SLEEP) in dequeue_task_fair(). I think
> this can help me capture any unexpected events like this.
>
> Besides, I think explicitly calling schedule() has the advantage of not
> relying on other code, i.e. it doesn't matter if someone removes
> cond_resched() from task_work_run() some time later or changes the
> logic in exit_to_user_mode_loop().
>
> So I hope you don't mind that I keep schedule() in throttle_cfs_rq_work(),
> but if you see anything wrong with doing this, feel free to let me know,
> thanks.
I don't have any strong feelings. Just that the open-coded schedule()
stuck out like a sore thumb, and since you mention future changes, the
"local_irq_enable_exit_to_user(ti_work)" could perhaps one day be
extended to not disable IRQs in exit_to_user_mode_loop(), in which case
a direct call to schedule() can cause scheduling-while-atomic warnings.
IMO using cond_resched_tasks_rcu_qs() is cleaner and it is obvious that
the throttle work wants to call schedule() asap while also clearing the
RCU holdout but I'll wait for others to comment if it is safe to do so
or if we are missing something.
--
Thanks and Regards,
Prateek
* Re: [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle
2025-04-01 3:17 ` K Prateek Nayak
@ 2025-04-01 8:48 ` Aaron Lu
0 siblings, 0 replies; 58+ messages in thread
From: Aaron Lu @ 2025-04-01 8:48 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
On Tue, Apr 01, 2025 at 08:47:02AM +0530, K Prateek Nayak wrote:
> Hello Aaron,
>
> On 3/31/2025 11:49 AM, Aaron Lu wrote:
> > > > Tasks throttled on the exit to user path are scheduled by cond_resched() in
> > > > task_work_run() but that is a preempt schedule and doesn't mark a task
> > > > rcu quiescent state.
>
> So browsing through kernel/rcu/tasks.h, I found
> cond_resched_tasks_rcu_qs(), which basically clears the task holdout
> before calling schedule(). The question is, is it safe to use in the
> task_work_run() context? I have no idea, but you can do a resched_curr()
> followed by a cond_resched_tasks_rcu_qs() in throttle_cfs_rq_work() and
> that should give you the same effect as doing a schedule().
>
> Thoughts?
>
Looks good to me.
> > > >
> > > > Fix this by directly calling schedule() in throttle_cfs_rq_work().
> > > Perhaps that can be gotten around by just using set_ti_thread_flag();
> > > resched_curr() will also call set_preempt_need_resched(), which allows
> > > cond_resched() to resched the task.
> > >
> > > Since exit_to_user_mode_loop() will run once again seeing that
> > > TIF_NEED_RESCHED is set, schedule() should follow soon. Thoughts?
> > >
> > I tried this and noticed an unexpected consequence: get_signal() will
> > also run task works, and if the signal is a kill, the following can happen:
> > exit_to_user_mode_loop() -> get_signal() -> throttle_task_work() ->
> > do_exit() -> exit_signals() -> percpu_rwsem_wait() -> schedule() ->
> > try_to_block_task() -> dequeue_task_fair().
> >
> > I would like to avoid this path, at least for now. I want to make sure
> > that for throttled tasks, only events like task group change, affinity change
> > etc. can cause them to be dequeued by core code; that's why I added
> > SCHED_WARN_ON(flags & DEQUEUE_SLEEP) in dequeue_task_fair(). I think
> > this can help me capture any unexpected events like this.
> >
> > Besides, I think explicitly calling schedule() has the advantage of not
> > relying on other code, i.e. it doesn't matter if someone removes
> > cond_resched() from task_work_run() some time later or changes the
> > logic in exit_to_user_mode_loop().
> >
> > So I hope you don't mind that I keep schedule() in throttle_cfs_rq_work(),
> > but if you see anything wrong with doing this, feel free to let me know,
> > thanks.
>
> I don't have any strong feelings. Just that the open-coded schedule()
> stuck out like a sore thumb, and since you mention future changes, the
> "local_irq_enable_exit_to_user(ti_work)" could perhaps one day be
> extended to not disable IRQs in exit_to_user_mode_loop(), in which case
> a direct call to schedule() can cause scheduling-while-atomic warnings.
>
> IMO using cond_resched_tasks_rcu_qs() is cleaner and it is obvious that
> the throttle work wants to call schedule() asap while also clearing the
> RCU holdout but I'll wait for others to comment if it is safe to do so
> or if we are missing something.
I'm running some tests and I'll follow your suggestion if no problem is
found. Thanks for the suggestion.
* Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-03-14 17:52 ` K Prateek Nayak
2025-03-17 5:48 ` Aaron Lu
@ 2025-04-02 9:25 ` Aaron Lu
2025-04-02 17:24 ` K Prateek Nayak
1 sibling, 1 reply; 58+ messages in thread
From: Aaron Lu @ 2025-04-02 9:25 UTC (permalink / raw)
To: K Prateek Nayak
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
Hi Prateek,
On Fri, Mar 14, 2025 at 11:22:20PM +0530, K Prateek Nayak wrote:
... ...
> but with the per-task model, it is probably the amount of time that
> "throttled_limbo_list" has a task on it, since they are runnable
> but are in fact waiting for an unthrottle.
I tried this way of accounting and realized a problem. Assume a
hierarchy like this: /sys/fs/cgroup/1/1_1, with quota configured at the
/sys/fs/cgroup/1 level. When throttle happens and tasks of 1_1 get
throttled, the limbo list of /sys/fs/cgroup/1 will always be empty, so
its "throttled_clock_self_time" is always 0... This doesn't match
throttled_clock_self_time's semantics. "throttled_time" is similar.
I suppose we can somehow fix this by introducing something like
h_nr_throttled, but I feel that's an overkill. So I'll probably just
keep the current accounting as is in the next version; feel free to let
me know if you have other thoughts.
Thanks.
> If a task enqueues itself on a throttled hierarchy and then blocks
> again before exiting to the userspace, it should not count in
> "throttled_clock_self_time" since the task was runnable the whole
> time despite the hierarchy being frozen.
* Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based throttle
2025-04-02 9:25 ` Aaron Lu
@ 2025-04-02 17:24 ` K Prateek Nayak
0 siblings, 0 replies; 58+ messages in thread
From: K Prateek Nayak @ 2025-04-02 17:24 UTC (permalink / raw)
To: Aaron Lu
Cc: Valentin Schneider, Ben Segall, Peter Zijlstra, Josh Don,
Ingo Molnar, Vincent Guittot, linux-kernel, Juri Lelli,
Dietmar Eggemann, Steven Rostedt, Mel Gorman, Chengming Zhou,
Chuyi Zhou
Hello Aaron,
On 4/2/2025 2:55 PM, Aaron Lu wrote:
> Hi Prateek,
>
> On Fri, Mar 14, 2025 at 11:22:20PM +0530, K Prateek Nayak wrote:
> ... ...
>> but with the per-task model, it is probably the amount of time that
>> "throttled_limbo_list" has a task on it, since they are runnable
>> but are in fact waiting for an unthrottle.
>
> I tried this way of accounting and realized a problem. Assume a
> hierarchy like this: /sys/fs/cgroup/1/1_1, with quota configured at the
> /sys/fs/cgroup/1 level. When throttle happens and tasks of 1_1 get
> throttled, the limbo list of /sys/fs/cgroup/1 will always be empty, so
> its "throttled_clock_self_time" is always 0... This doesn't match
> throttled_clock_self_time's semantics. "throttled_time" is similar.
>
> I suppose we can somehow fix this by introducing something like
> h_nr_throttled, but I feel that's an overkill. So I'll probably just
> keep the current accounting as is in the next version; feel free to let
> me know if you have other thoughts.
I agree it might be an overkill. We can discuss this more on v2.
--
Thanks and Regards,
Prateek
>
> Thanks.
>
>> If a task enqueues itself on a throttled hierarchy and then blocks
>> again before exiting to the userspace, it should not count in
>> "throttled_clock_self_time" since the task was runnable the whole
>> time despite the hierarchy being frozen.
end of thread, other threads:[~2025-04-02 17:25 UTC | newest]
Thread overview: 58+ messages
2025-03-17 10:56 [RFC PATCH 0/7] Defer throttle when task exits to user Aaron Lu
2025-03-13 7:21 ` [RFC PATCH 1/7] sched/fair: Add related data structure for task based throttle Aaron Lu
2025-03-17 10:28 ` Valentin Schneider
2025-03-17 11:02 ` Aaron Lu
2025-03-13 7:21 ` [RFC PATCH 2/7] sched/fair: Handle throttle path " Aaron Lu
2025-03-13 18:14 ` K Prateek Nayak
2025-03-14 8:48 ` Aaron Lu
2025-03-14 9:00 ` K Prateek Nayak
2025-03-14 3:28 ` K Prateek Nayak
2025-03-14 8:57 ` Aaron Lu
2025-03-14 9:12 ` K Prateek Nayak
2025-03-14 15:10 ` Aaron Lu
2025-03-14 8:39 ` Chengming Zhou
2025-03-14 8:49 ` K Prateek Nayak
2025-03-14 9:42 ` Aaron Lu
2025-03-14 10:26 ` K Prateek Nayak
2025-03-14 11:47 ` Aaron Lu
2025-03-14 15:58 ` Chengming Zhou
2025-03-14 18:04 ` K Prateek Nayak
2025-03-14 11:07 ` Chengming Zhou
2025-03-31 6:42 ` Aaron Lu
2025-03-31 9:14 ` Chengming Zhou
2025-03-16 3:25 ` Josh Don
2025-03-17 2:54 ` Chengming Zhou
2025-03-20 6:59 ` K Prateek Nayak
2025-03-20 8:39 ` Chengming Zhou
2025-03-20 18:40 ` Xi Wang
2025-03-24 8:58 ` Aaron Lu
2025-03-25 10:02 ` Aaron Lu
2025-03-28 0:11 ` Xi Wang
2025-03-28 3:11 ` Aaron Lu
2025-03-28 22:47 ` Benjamin Segall
2025-03-19 13:43 ` Aaron Lu
2025-03-20 1:06 ` Josh Don
2025-03-20 6:53 ` K Prateek Nayak
2025-03-13 7:21 ` [RFC PATCH 3/7] sched/fair: Handle unthrottle " Aaron Lu
2025-03-14 3:53 ` K Prateek Nayak
2025-03-14 4:06 ` K Prateek Nayak
2025-03-14 10:43 ` Aaron Lu
2025-03-14 17:52 ` K Prateek Nayak
2025-03-17 5:48 ` Aaron Lu
2025-04-02 9:25 ` Aaron Lu
2025-04-02 17:24 ` K Prateek Nayak
2025-03-13 7:21 ` [RFC PATCH 4/7] sched/fair: Take care of migrated task " Aaron Lu
2025-03-14 4:03 ` K Prateek Nayak
2025-03-14 9:49 ` [External] " Aaron Lu
2025-03-13 7:21 ` [RFC PATCH 5/7] sched/fair: Take care of group/affinity/sched_class change for throttled task Aaron Lu
2025-03-14 4:51 ` K Prateek Nayak
2025-03-14 11:40 ` [External] " Aaron Lu
2025-03-13 7:22 ` [RFC PATCH 6/7] sched/fair: fix tasks_rcu with task based throttle Aaron Lu
2025-03-14 4:14 ` K Prateek Nayak
2025-03-14 11:37 ` [External] " Aaron Lu
2025-03-31 6:19 ` Aaron Lu
2025-04-01 3:17 ` K Prateek Nayak
2025-04-01 8:48 ` Aaron Lu
2025-03-13 7:22 ` [RFC PATCH 7/7] sched/fair: Make sure cfs_rq has enough runtime_remaining on unthrottle path Aaron Lu
2025-03-14 4:18 ` K Prateek Nayak
2025-03-14 11:39 ` [External] " Aaron Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox