public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC 0/8] Add a deadline server for sched_ext tasks
@ 2025-03-15  2:21 Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 1/8] sched: Add support to pick functions to take rf Joel Fernandes
                   ` (7 more replies)
  0 siblings, 8 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo
  Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
	Joel Fernandes, bpf

sched_ext tasks currently are starved by RT hoggers especially since RT
throttling was replaced by deadline servers to boost only CFS tasks. Several
users in the community have reported issues with RT stalling sched_ext tasks.
Add a sched_ext deadline server as well so that sched_ext tasks are also
boosted and do not suffer starvation.

A kselftest is also provided to verify the starvation issues are now fixed.

Andrea Righi (1):
  selftests/sched_ext: Add test for sched_ext dl_server

Joel Fernandes (7):
  sched: Add support to pick functions to take rf
  sched: Add a server arg to dl_server_update_idle_time()
  sched/ext: Add a DL server for sched_ext tasks
  sched/debug: Fix updating of ppos on server write ops
  sched/debug: Stop and start server based on if it was active
  sched/debug: Add support to change sched_ext server params
  sched/deadline: Clear defer params

 include/linux/sched.h                         |   2 +-
 kernel/sched/core.c                           |  19 +-
 kernel/sched/deadline.c                       |  30 +--
 kernel/sched/debug.c                          |  96 ++++----
 kernel/sched/ext.c                            |  64 +++++-
 kernel/sched/fair.c                           |  15 +-
 kernel/sched/idle.c                           |   4 +-
 kernel/sched/rt.c                             |   2 +-
 kernel/sched/sched.h                          |  12 +-
 kernel/sched/stop_task.c                      |   2 +-
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../selftests/sched_ext/rt_stall.bpf.c        |  23 ++
 tools/testing/selftests/sched_ext/rt_stall.c  | 213 ++++++++++++++++++
 13 files changed, 406 insertions(+), 77 deletions(-)
 create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c

-- 
2.43.0


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH RFC 1/8] sched: Add support to pick functions to take rf
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 2/8] sched: Add a server arg to dl_server_update_idle_time() Joel Fernandes
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
	David Vernet, Changwoo Min
  Cc: Luigi De Matteis, paulmck, boqun.feng, Joel Fernandes

Some pick functions, like the internal pick_next_task_fair(), already take
rf but some others don't. We need this for scx's server pick function.
Prepare for this by having pick functions accept it.

Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 include/linux/sched.h    |  2 +-
 kernel/sched/core.c      | 16 ++++++++--------
 kernel/sched/deadline.c  |  8 ++++----
 kernel/sched/ext.c       |  2 +-
 kernel/sched/fair.c      | 13 ++++++++-----
 kernel/sched/idle.c      |  2 +-
 kernel/sched/rt.c        |  2 +-
 kernel/sched/sched.h     |  7 ++++---
 kernel/sched/stop_task.c |  2 +-
 9 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d..08b1a611bbd5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -609,7 +609,7 @@ struct sched_rt_entity {
 } __randomize_layout;
 
 typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, void *);
 
 struct sched_dl_entity {
 	struct rb_node			rb_node;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9aecd914ac69..d19e4b7a0020 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6035,7 +6035,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 		/* Assume the next prioritized class is idle_sched_class */
 		if (!p) {
-			p = pick_task_idle(rq);
+			p = pick_task_idle(rq, rf);
 			put_prev_set_next_task(rq, prev, p);
 		}
 
@@ -6047,11 +6047,11 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 	for_each_active_class(class) {
 		if (class->pick_next_task) {
-			p = class->pick_next_task(rq, prev);
+			p = class->pick_next_task(rq, prev, rf);
 			if (p)
 				return p;
 		} else {
-			p = class->pick_task(rq);
+			p = class->pick_task(rq, rf);
 			if (p) {
 				put_prev_set_next_task(rq, prev, p);
 				return p;
@@ -6081,7 +6081,7 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
 	return a->core_cookie == b->core_cookie;
 }
 
-static inline struct task_struct *pick_task(struct rq *rq)
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -6089,7 +6089,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
 	rq->dl_server = NULL;
 
 	for_each_active_class(class) {
-		p = class->pick_task(rq);
+		p = class->pick_task(rq, rf);
 		if (p)
 			return p;
 	}
@@ -6189,7 +6189,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	 * and there are no cookied tasks running on siblings.
 	 */
 	if (!need_sync) {
-		next = pick_task(rq);
+		next = pick_task(rq, rf);
 		if (!next->core_cookie) {
 			rq->core_pick = NULL;
 			rq->core_dl_server = NULL;
@@ -6220,7 +6220,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
 			update_rq_clock(rq_i);
 
-		rq_i->core_pick = p = pick_task(rq_i);
+		rq_i->core_pick = p = pick_task(rq_i, rf);
 		rq_i->core_dl_server = rq_i->dl_server;
 
 		if (!max || prio_less(max, p, fi_before))
@@ -6242,7 +6242,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			if (cookie)
 				p = sched_core_find(rq_i, cookie);
 			if (!p)
-				p = idle_sched_class.pick_task(rq_i);
+				p = idle_sched_class.pick_task(rq_i, rf);
 		}
 
 		rq_i->core_pick = p;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 38e4537790af..3f7fb7251805 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2421,7 +2421,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
  * __pick_next_task_dl - Helper to pick the next -deadline task to run.
  * @rq: The runqueue to pick the next task from.
  */
-static struct task_struct *__pick_task_dl(struct rq *rq)
+static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf)
 {
 	struct sched_dl_entity *dl_se;
 	struct dl_rq *dl_rq = &rq->dl;
@@ -2435,7 +2435,7 @@ static struct task_struct *__pick_task_dl(struct rq *rq)
 	WARN_ON_ONCE(!dl_se);
 
 	if (dl_server(dl_se)) {
-		p = dl_se->server_pick_task(dl_se);
+		p = dl_se->server_pick_task(dl_se, rf);
 		if (!p) {
 			if (dl_server_active(dl_se)) {
 				dl_se->dl_yielded = 1;
@@ -2451,9 +2451,9 @@ static struct task_struct *__pick_task_dl(struct rq *rq)
 	return p;
 }
 
-static struct task_struct *pick_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf)
 {
-	return __pick_task_dl(rq);
+	return __pick_task_dl(rq, rf);
 }
 
 static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5a81d9a1e31f..636b08977d19 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3113,7 +3113,7 @@ static struct task_struct *first_local_task(struct rq *rq)
 					struct task_struct, scx.dsq_list.node);
 }
 
-static struct task_struct *pick_task_scx(struct rq *rq)
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
 {
 	struct task_struct *prev = rq->curr;
 	struct task_struct *p;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1c0ef435a7aa..734c22fb0e2d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8816,7 +8816,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	resched_curr_lazy(rq);
 }
 
-static struct task_struct *pick_task_fair(struct rq *rq)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
@@ -8854,7 +8854,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	int new_tasks;
 
 again:
-	p = pick_task_fair(rq);
+	p = pick_task_fair(rq, rf);
 	if (!p)
 		goto idle;
 	se = &p->se;
@@ -8933,7 +8933,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	return NULL;
 }
 
-static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev,
+												 struct rq_flags *rf)
 {
 	return pick_next_task_fair(rq, prev, NULL);
 }
@@ -8943,9 +8944,11 @@ static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
 	return !!dl_se->rq->cfs.nr_queued;
 }
 
-static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
+static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se, void *flags)
 {
-	return pick_task_fair(dl_se->rq);
+	struct rq_flags *rf = flags;
+
+	return pick_task_fair(dl_se->rq, rf);
 }
 
 void fair_server_init(struct rq *rq)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2c85c86b455f..01e9612deefe 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -463,7 +463,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
 	next->se.exec_start = rq_clock_task(rq);
 }
 
-struct task_struct *pick_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
 {
 	scx_update_idle(rq, true, false);
 	return rq->idle;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..f946a4b091e8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1735,7 +1735,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return rt_task_of(rt_se);
 }
 
-static struct task_struct *pick_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
 {
 	struct task_struct *p;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8512a9fb022..ef6fbc49449f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2422,7 +2422,7 @@ struct sched_class {
 	void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
 
 	int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-	struct task_struct *(*pick_task)(struct rq *rq);
+	struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
 	/*
 	 * Optional! When implemented pick_next_task() should be equivalent to:
 	 *
@@ -2432,7 +2432,8 @@ struct sched_class {
 	 *       set_next_task_first(next);
 	 *   }
 	 */
-	struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+	struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
+										  struct rq_flags *rf);
 
 	void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
 	void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
@@ -2595,7 +2596,7 @@ static inline bool sched_fair_runnable(struct rq *rq)
 }
 
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_task_idle(struct rq *rq);
+extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
 
 #define SCA_CHECK		0x01
 #define SCA_MIGRATE_DISABLE	0x02
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 058dd42e3d9b..1c70123cb6a4 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -33,7 +33,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
 	stop->se.exec_start = rq_clock_task(rq);
 }
 
-static struct task_struct *pick_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf)
 {
 	if (!sched_stop_runnable(rq))
 		return NULL;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 2/8] sched: Add a server arg to dl_server_update_idle_time()
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 1/8] sched: Add support to pick functions to take rf Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
  Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
	Joel Fernandes

Since we are adding more servers, make dl_server_update_idle_time()
accept a server argument rather than assuming a specific server.

Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/sched/deadline.c | 17 +++++++++--------
 kernel/sched/fair.c     |  2 +-
 kernel/sched/idle.c     |  2 +-
 kernel/sched/sched.h    |  3 ++-
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3f7fb7251805..ef592751417f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1610,28 +1610,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
  * as time available for the fair server, avoiding a penalty for the
  * rt scheduler that did not consumed that time.
  */
-void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
+void dl_server_update_idle_time(struct rq *rq, struct task_struct *p,
+			       struct sched_dl_entity *rq_dl_server)
 {
 	s64 delta_exec, scaled_delta_exec;
 
-	if (!rq->fair_server.dl_defer)
+	if (!rq_dl_server->dl_defer)
 		return;
 
 	/* no need to discount more */
-	if (rq->fair_server.runtime < 0)
+	if (rq_dl_server->runtime < 0)
 		return;
 
 	delta_exec = rq_clock_task(rq) - p->se.exec_start;
 	if (delta_exec < 0)
 		return;
 
-	scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
+	scaled_delta_exec = dl_scaled_delta_exec(rq, rq_dl_server, delta_exec);
 
-	rq->fair_server.runtime -= scaled_delta_exec;
+	rq_dl_server->runtime -= scaled_delta_exec;
 
-	if (rq->fair_server.runtime < 0) {
-		rq->fair_server.dl_defer_running = 0;
-		rq->fair_server.runtime = 0;
+	if (rq_dl_server->runtime < 0) {
+		rq_dl_server->dl_defer_running = 0;
+		rq_dl_server->runtime = 0;
 	}
 
 	p->se.exec_start = rq_clock_task(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 734c22fb0e2d..f6511eaf0389 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7006,7 +7006,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
 		/* Account for idle runtime */
 		if (!rq->nr_running)
-			dl_server_update_idle_time(rq, rq->curr);
+			dl_server_update_idle_time(rq, rq->curr, &rq->fair_server);
 		dl_server_start(&rq->fair_server);
 	}
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 01e9612deefe..13a3d20d35e2 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -451,7 +451,7 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
 
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
-	dl_server_update_idle_time(rq, prev);
+	dl_server_update_idle_time(rq, prev, &rq->fair_server);
 	scx_update_idle(rq, false, true);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef6fbc49449f..b3d1201b8f3d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -392,7 +392,8 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
 		    dl_server_pick_f pick_task);
 
 extern void dl_server_update_idle_time(struct rq *rq,
-		    struct task_struct *p);
+		    struct task_struct *p,
+		    struct sched_dl_entity *rq_dl_server);
 extern void fair_server_init(struct rq *rq);
 extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
 extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 1/8] sched: Add support to pick functions to take rf Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 2/8] sched: Add a server arg to dl_server_update_idle_time() Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15  7:22   ` Peter Zijlstra
  2025-03-15 17:56   ` Andrea Righi
  2025-03-15  2:21 ` [PATCH RFC 4/8] sched/debug: Fix updating of ppos on server write ops Joel Fernandes
                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
	David Vernet, Changwoo Min
  Cc: Luigi De Matteis, paulmck, boqun.feng, Joel Fernandes

sched_ext currently suffers starvation due to RT. The same workload when
converted to EXT can get zero runtime if RT is 100% running, causing EXT
processes to stall. Fix it by adding a DL server for EXT.

A kselftest is also provided later to verify:

./runner -t rt_stall
===== START =====
TEST: rt_stall
DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks
OUTPUT:
TAP version 13
1..1
ok 1 PASS: CFS task got more than 4.00% of runtime

Cc: Luigi De Matteis <ldematteis123@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/sched/core.c     |  3 ++
 kernel/sched/deadline.c |  2 +-
 kernel/sched/ext.c      | 62 +++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h    |  2 ++
 4 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d19e4b7a0020..09bff60c22d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8617,6 +8617,9 @@ void __init sched_init(void)
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
 		fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+		ext_server_init(rq);
+#endif
 
 #ifdef CONFIG_SCHED_CORE
 		rq->core = rq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ef592751417f..bcb66d9692ae 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1571,7 +1571,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
 	 * The fair server (sole dl_server) does not account for real-time
 	 * workload because it is running fair work.
 	 */
-	if (dl_se == &rq->fair_server)
+	if (dl_se == &rq->fair_server || dl_se == &rq->ext_server)
 		return;
 
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 636b08977d19..553d3e6087fe 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1677,6 +1677,9 @@ static void update_curr_scx(struct rq *rq)
 		if (!curr->scx.slice)
 			touch_core_sched(rq, curr);
 	}
+
+	if (dl_server_active(&rq->ext_server))
+		dl_server_update(&rq->ext_server, delta_exec);
 }
 
 static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -2147,6 +2150,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	if (enq_flags & SCX_ENQ_WAKEUP)
 		touch_core_sched(rq, p);
 
+	if (rq->scx.nr_running == 1) {
+		/* Account for idle runtime */
+		if (!rq->nr_running)
+			dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
+
+		/* Start dl_server if this is the first task being enqueued */
+		dl_server_start(&rq->ext_server);
+	}
+
 	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
 out:
 	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2238,6 +2250,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 	sub_nr_running(rq, 1);
 
 	dispatch_dequeue(rq, p);
+
+	/* Stop the server if this was the last task */
+	if (rq->scx.nr_running == 0)
+		dl_server_stop(&rq->ext_server);
+
 	return true;
 }
 
@@ -4207,6 +4224,15 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
 {
 	scx_ops_disable_task(p);
+
+	/*
+	 * After class switch, if the DL server is still active, restart it so
+	 * that DL timers will be queued, in case SCX switched to higher class.
+	 */
+	if (dl_server_active(&rq->ext_server)) {
+		dl_server_stop(&rq->ext_server);
+		dl_server_start(&rq->ext_server);
+	}
 }
 
 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
@@ -7440,8 +7466,8 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
  * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
  * schedutil cpufreq governor chooses the target frequency.
  *
- * The actual performance level chosen, CPU grouping, and the overhead and
- * latency of the operations are dependent on the hardware and cpufreq driver in
+ * The actual performance level chosen, CPU grouping, and the overhead and latency
+ * of the operations are dependent on the hardware and cpufreq driver in
  * use. Consult hardware and cpufreq documentation for more information. The
  * current performance level can be monitored using scx_bpf_cpuperf_cur().
  */
@@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_now)
 BTF_KFUNCS_END(scx_kfunc_ids_any)
 
+/*
+ * Check if ext scheduler has tasks ready to run.
+ */
+static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+	return !!dl_se->rq->scx.nr_running;
+}
+
+/*
+ * Select the next task to run from the ext scheduling class.
+ */
+static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
+						void *flags)
+{
+	struct rq_flags *rf = flags;
+
+	balance_scx(dl_se->rq, dl_se->rq->curr, rf);
+	return pick_task_scx(dl_se->rq, rf);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+	struct sched_dl_entity *dl_se = &rq->ext_server;
+
+	init_dl_entity(dl_se);
+
+	dl_server_init(dl_se, rq, ext_server_has_tasks, ext_server_pick_task);
+}
+
 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
 	.owner			= THIS_MODULE,
 	.set			= &scx_kfunc_ids_any,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3d1201b8f3d..8421eb56c50b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -395,6 +395,7 @@ extern void dl_server_update_idle_time(struct rq *rq,
 		    struct task_struct *p,
 		    struct sched_dl_entity *rq_dl_server);
 extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
 extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
 extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
 		    u64 runtime, u64 period, bool init);
@@ -1141,6 +1142,7 @@ struct rq {
 #endif
 
 	struct sched_dl_entity	fair_server;
+	struct sched_dl_entity	ext_server;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this CPU: */
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 4/8] sched/debug: Fix updating of ppos on server write ops
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
                   ` (2 preceding siblings ...)
  2025-03-15  2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 5/8] sched/debug: Stop and start server based on if it was active Joel Fernandes
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
  Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
	Joel Fernandes

Updating "ppos" on error conditions does not make much sense. The pattern
is to return the error code directly without modifying the position, or
modify the position on success and return the number of bytes written.

Since on success, the return value of apply is 0, there is no point in
modifying ppos either. Fix it by removing all this and just returning
error code or number of bytes written on success.

Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/sched/debug.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ef047add7f9e..1ccc7687e1a8 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -348,8 +348,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
 	long cpu = (long) ((struct seq_file *) filp->private_data)->private;
 	struct rq *rq = cpu_rq(cpu);
 	u64 runtime, period;
+	int retval = 0;
 	size_t err;
-	int retval;
 	u64 value;
 
 	err = kstrtoull_from_user(ubuf, cnt, 10, &value);
@@ -385,8 +385,6 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
 		}
 
 		retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
-		if (retval)
-			cnt = retval;
 
 		if (!runtime)
 			printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
@@ -394,6 +392,9 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
 
 		if (rq->cfs.h_nr_queued)
 			dl_server_start(&rq->fair_server);
+
+		if (retval < 0)
+			return retval;
 	}
 
 	*ppos += cnt;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 5/8] sched/debug: Stop and start server based on if it was active
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
                   ` (3 preceding siblings ...)
  2025-03-15  2:21 ` [PATCH RFC 4/8] sched/debug: Fix updating of ppos on server write ops Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 6/8] sched/debug: Add support to change sched_ext server params Joel Fernandes
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
  Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
	Joel Fernandes

Currently the DL server interface for applying parameters checks
CFS internals to identify if the server is active. This is error-prone
and makes it difficult to add new servers in the future.

Fix it by using dl_server_active(), which is also used by the DL server
code to determine if the DL server was started.

Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/sched/debug.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1ccc7687e1a8..83cb695d6d46 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -347,6 +347,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
 {
 	long cpu = (long) ((struct seq_file *) filp->private_data)->private;
 	struct rq *rq = cpu_rq(cpu);
+	bool was_active = false;
 	u64 runtime, period;
 	int retval = 0;
 	size_t err;
@@ -379,7 +380,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
 			return  -EINVAL;
 		}
 
-		if (rq->cfs.h_nr_queued) {
+		if (dl_server_active(&rq->fair_server)) {
+			was_active = true;
 			update_rq_clock(rq);
 			dl_server_stop(&rq->fair_server);
 		}
@@ -390,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
 			printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
 					cpu_of(rq));
 
-		if (rq->cfs.h_nr_queued)
+		if (was_active)
 			dl_server_start(&rq->fair_server);
 
 		if (retval < 0)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 6/8] sched/debug: Add support to change sched_ext server params
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
                   ` (4 preceding siblings ...)
  2025-03-15  2:21 ` [PATCH RFC 5/8] sched/debug: Stop and start server based on if it was active Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 7/8] sched/deadline: Clear defer params Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server Joel Fernandes
  7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
  Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
	Joel Fernandes

When a sched_ext server is loaded, tasks in CFS are converted to run in
sched_ext class. Modify the ext server parameters as well along with the
fair ones.

Re-use the existing interface to modify both the ext and fair servers, to
keep the number of interfaces small (as it is, we have a per-cpu interface).

Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/sched/debug.c | 91 ++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 37 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 83cb695d6d46..218b3e239128 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -339,17 +339,18 @@ enum dl_param {
 	DL_PERIOD,
 };
 
-static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
-static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC;     /* 100 us */
+static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC;     /* 100 us */
 
-static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
+static ssize_t sched_dl_server_write(struct file *filp, const char __user *ubuf,
 				       size_t cnt, loff_t *ppos, enum dl_param param)
 {
 	long cpu = (long) ((struct seq_file *) filp->private_data)->private;
 	struct rq *rq = cpu_rq(cpu);
-	bool was_active = false;
+	bool was_active_fair = false;
+	bool was_active_ext = false;
+	int retval = 0, retval2 = 0;
 	u64 runtime, period;
-	int retval = 0;
 	size_t err;
 	u64 value;
 
@@ -375,41 +376,57 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
 		}
 
 		if (runtime > period ||
-		    period > fair_server_period_max ||
-		    period < fair_server_period_min) {
+		    period > dl_server_period_max ||
+		    period < dl_server_period_min) {
 			return  -EINVAL;
 		}
 
 		if (dl_server_active(&rq->fair_server)) {
-			was_active = true;
+			was_active_fair = true;
 			update_rq_clock(rq);
 			dl_server_stop(&rq->fair_server);
 		}
 
+		if (dl_server_active(&rq->ext_server)) {
+			was_active_ext = true;
+			update_rq_clock(rq);
+			dl_server_stop(&rq->ext_server);
+		}
+
 		retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
+		retval2 = dl_server_apply_params(&rq->ext_server, runtime, period, 0);
 
 		if (!runtime)
-			printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
+			printk_deferred("Deadline servers are disabled on CPU %d, system may crash due to starvation.\n",
 					cpu_of(rq));
 
-		if (was_active)
+		if (was_active_fair)
 			dl_server_start(&rq->fair_server);
 
+		if (was_active_ext)
+			dl_server_start(&rq->ext_server);
+
 		if (retval < 0)
 			return retval;
+		if (retval2 < 0)
+			return retval2;
 	}
 
 	*ppos += cnt;
 	return cnt;
 }
 
-static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+static size_t sched_dl_server_show(struct seq_file *m, void *v, enum dl_param param)
 {
 	unsigned long cpu = (unsigned long) m->private;
 	struct rq *rq = cpu_rq(cpu);
 	u64 value;
 
 	switch (param) {
+	/*
+	 * The params for fair server and ext server as set via debugfs
+	 * are the same, so we can just use one of them
+	 */
 	case DL_RUNTIME:
 		value = rq->fair_server.dl_runtime;
 		break;
@@ -424,50 +441,50 @@ static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param
 }
 
 static ssize_t
-sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
+sched_dl_server_runtime_write(struct file *filp, const char __user *ubuf,
 				size_t cnt, loff_t *ppos)
 {
-	return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+	return sched_dl_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
 }
 
-static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
+static int sched_dl_server_runtime_show(struct seq_file *m, void *v)
 {
-	return sched_fair_server_show(m, v, DL_RUNTIME);
+	return sched_dl_server_show(m, v, DL_RUNTIME);
 }
 
-static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
+static int sched_dl_server_runtime_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
+	return single_open(filp, sched_dl_server_runtime_show, inode->i_private);
 }
 
-static const struct file_operations fair_server_runtime_fops = {
-	.open		= sched_fair_server_runtime_open,
-	.write		= sched_fair_server_runtime_write,
+static const struct file_operations dl_server_runtime_fops = {
+	.open		= sched_dl_server_runtime_open,
+	.write		= sched_dl_server_runtime_write,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
 
 static ssize_t
-sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+sched_dl_server_period_write(struct file *filp, const char __user *ubuf,
 			       size_t cnt, loff_t *ppos)
 {
-	return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+	return sched_dl_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
 }
 
-static int sched_fair_server_period_show(struct seq_file *m, void *v)
+static int sched_dl_server_period_show(struct seq_file *m, void *v)
 {
-	return sched_fair_server_show(m, v, DL_PERIOD);
+	return sched_dl_server_show(m, v, DL_PERIOD);
 }
 
-static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+static int sched_dl_server_period_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_fair_server_period_show, inode->i_private);
+	return single_open(filp, sched_dl_server_period_show, inode->i_private);
 }
 
-static const struct file_operations fair_server_period_fops = {
-	.open		= sched_fair_server_period_open,
-	.write		= sched_fair_server_period_write,
+static const struct file_operations dl_server_period_fops = {
+	.open		= sched_dl_server_period_open,
+	.write		= sched_dl_server_period_write,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= single_release,
@@ -475,13 +492,13 @@ static const struct file_operations fair_server_period_fops = {
 
 static struct dentry *debugfs_sched;
 
-static void debugfs_fair_server_init(void)
+static void debugfs_dl_server_init(void)
 {
-	struct dentry *d_fair;
+	struct dentry *d_server;
 	unsigned long cpu;
 
-	d_fair = debugfs_create_dir("fair_server", debugfs_sched);
-	if (!d_fair)
+	d_server = debugfs_create_dir("dl_server", debugfs_sched);
+	if (!d_server)
 		return;
 
 	for_each_possible_cpu(cpu) {
@@ -489,10 +506,10 @@ static void debugfs_fair_server_init(void)
 		char buf[32];
 
 		snprintf(buf, sizeof(buf), "cpu%lu", cpu);
-		d_cpu = debugfs_create_dir(buf, d_fair);
+		d_cpu = debugfs_create_dir(buf, d_server);
 
-		debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
-		debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
+		debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &dl_server_runtime_fops);
+		debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &dl_server_period_fops);
 	}
 }
 
@@ -535,7 +552,7 @@ static __init int sched_init_debug(void)
 
 	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
 
-	debugfs_fair_server_init();
+	debugfs_dl_server_init();
 
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 7/8] sched/deadline: Clear defer params
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
                   ` (5 preceding siblings ...)
  2025-03-15  2:21 ` [PATCH RFC 6/8] sched/debug: Add support to change sched_ext server params Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15  2:21 ` [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server Joel Fernandes
  7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
	Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
	Daniel Bristot de Oliveira
  Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
	Joel Fernandes

The defer params were not cleared in __dl_clear_params. Clear them.

Without this, some of my test cases are flaking and the DL timer is
not starting correctly AFAICS.

Fixes: a110a81c52a9 ("sched/deadline: Deferrable dl server")
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
 kernel/sched/deadline.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index bcb66d9692ae..1a9c697a795c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3427,6 +3427,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
 	dl_se->dl_non_contending	= 0;
 	dl_se->dl_overrun		= 0;
 	dl_se->dl_server		= 0;
+	dl_se->dl_defer			= 0;
+	dl_se->dl_defer_running	= 0;
+	dl_se->dl_defer_armed	= 0;
 
 #ifdef CONFIG_RT_MUTEXES
 	dl_se->pi_se			= dl_se;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server
  2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
                   ` (6 preceding siblings ...)
  2025-03-15  2:21 ` [PATCH RFC 7/8] sched/deadline: Clear defer params Joel Fernandes
@ 2025-03-15  2:21 ` Joel Fernandes
  2025-03-15 23:22   ` Joel Fernandes
  7 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15  2:21 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi, Tejun Heo, David Vernet, Changwoo Min,
	Shuah Khan
  Cc: Luigi De Matteis, paulmck, boqun.feng, Joel Fernandes,
	linux-kselftest, bpf

From: Andrea Righi <arighi@nvidia.com>

Add a selftest to validate the correct behavior of the deadline server
for the ext_sched_class.

[ Joel: Replaced occurrences of CFS in the test with EXT. ]

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../selftests/sched_ext/rt_stall.bpf.c        |  23 ++
 tools/testing/selftests/sched_ext/rt_stall.c  | 213 ++++++++++++++++++
 3 files changed, 237 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 011762224600..802e3d8d038f 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -180,6 +180,7 @@ auto-test-targets :=			\
 	select_cpu_dispatch_bad_dsq	\
 	select_cpu_dispatch_dbl_dsp	\
 	select_cpu_vtime		\
+	rt_stall			\
 	test_example			\
 
 testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
new file mode 100644
index 000000000000..80086779dd1e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that verifies whether RT tasks can stall SCHED_EXT tasks.
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops rt_stall_ops = {
+	.exit			= (void *)rt_stall_exit,
+	.name			= "rt_stall",
+};
diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
new file mode 100644
index 000000000000..d4cb545ebfd8
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <linux/sched.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "rt_stall.bpf.skel.h"
+#include "scx_test.h"
+#include "../kselftest.h"
+
+#define CORE_ID		0	/* CPU to pin tasks to */
+#define RUN_TIME        5	/* How long to run the test in seconds */
+
+/* Simple busy-wait function for test tasks */
+static void process_func(void)
+{
+	while (1) {
+		/* Busy wait */
+		for (volatile unsigned long i = 0; i < 10000000UL; i++);
+	}
+}
+
+/* Set CPU affinity to a specific core */
+static void set_affinity(int cpu)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+		perror("sched_setaffinity");
+		exit(EXIT_FAILURE);
+	}
+}
+
+/* Set task scheduling policy and priority */
+static void set_sched(int policy, int priority)
+{
+	struct sched_param param;
+
+	param.sched_priority = priority;
+	if (sched_setscheduler(0, policy, &param) != 0) {
+		perror("sched_setscheduler");
+		exit(EXIT_FAILURE);
+	}
+}
+
+/* Get process runtime from /proc/<pid>/stat */
+static float get_process_runtime(int pid)
+{
+	char path[256];
+	FILE *file;
+	long utime, stime;
+	int fields;
+
+	snprintf(path, sizeof(path), "/proc/%d/stat", pid);
+	file = fopen(path, "r");
+	if (file == NULL) {
+		perror("Failed to open stat file");
+		return -1;
+	}
+
+	/* Skip the first 13 fields and read the 14th and 15th */
+	fields = fscanf(file,
+			"%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
+			&utime, &stime);
+	fclose(file);
+
+	if (fields != 2) {
+		fprintf(stderr, "Failed to read stat file\n");
+		return -1;
+	}
+
+	/* Calculate the total time spent in the process */
+	long total_time = utime + stime;
+	long ticks_per_second = sysconf(_SC_CLK_TCK);
+	float runtime_seconds = total_time * 1.0 / ticks_per_second;
+
+	return runtime_seconds;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct rt_stall *skel;
+
+	skel = rt_stall__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static bool sched_stress_test(void)
+{
+	float cfs_runtime, rt_runtime;
+	int cfs_pid, rt_pid;
+	float expected_min_ratio = 0.04; /* 4% */
+
+	ksft_print_header();
+	ksft_set_plan(1);
+
+	/* Create and set up a EXT task */
+	cfs_pid = fork();
+	if (cfs_pid == 0) {
+		set_affinity(CORE_ID);
+		process_func();
+		exit(0);
+	} else if (cfs_pid < 0) {
+		perror("fork for EXT task");
+		ksft_exit_fail();
+	}
+
+	/* Create an RT task */
+	rt_pid = fork();
+	if (rt_pid == 0) {
+		set_affinity(CORE_ID);
+		set_sched(SCHED_FIFO, 50);
+		process_func();
+		exit(0);
+	} else if (rt_pid < 0) {
+		perror("fork for RT task");
+		ksft_exit_fail();
+	}
+
+	/* Let the processes run for the specified time */
+	sleep(RUN_TIME);
+
+	/* Get runtime for the EXT task */
+	cfs_runtime = get_process_runtime(cfs_pid);
+	if (cfs_runtime != -1)
+		ksft_print_msg("Runtime of EXT task (PID %d) is %f seconds\n", cfs_pid, cfs_runtime);
+	else
+		ksft_exit_fail_msg("Error getting runtime for EXT task (PID %d)\n", cfs_pid);
+
+	/* Get runtime for the RT task */
+	rt_runtime = get_process_runtime(rt_pid);
+	if (rt_runtime != -1)
+		ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime);
+	else
+		ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid);
+
+	/* Kill the processes */
+	kill(cfs_pid, SIGKILL);
+	kill(rt_pid, SIGKILL);
+	waitpid(cfs_pid, NULL, 0);
+	waitpid(rt_pid, NULL, 0);
+
+	/* Verify that the scx task got enough runtime */
+	float actual_ratio = cfs_runtime / (cfs_runtime + rt_runtime);
+	ksft_print_msg("EXT task got %.2f%% of total runtime\n", actual_ratio * 100);
+
+	if (actual_ratio >= expected_min_ratio) {
+		ksft_test_result_pass("PASS: EXT task got more than %.2f%% of runtime\n",
+				      expected_min_ratio * 100);
+		return true;
+	} else {
+		ksft_test_result_fail("FAIL: EXT task got less than %.2f%% of runtime\n",
+				      expected_min_ratio * 100);
+		return false;
+	}
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct rt_stall *skel = ctx;
+	struct bpf_link *link;
+	bool res;
+
+	link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	res = sched_stress_test();
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+	bpf_link__destroy(link);
+
+	if (!res)
+		ksft_exit_fail();
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct rt_stall *skel = ctx;
+
+	rt_stall__destroy(skel);
+}
+
+struct scx_test rt_stall = {
+	.name = "rt_stall",
+	.description = "Verify that RT tasks cannot stall SCHED_EXT tasks",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&rt_stall)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-15  2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
@ 2025-03-15  7:22   ` Peter Zijlstra
  2025-03-15 23:15     ` Joel Fernandes
  2025-03-15 17:56   ` Andrea Righi
  1 sibling, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2025-03-15  7:22 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
	Luigi De Matteis, paulmck, boqun.feng

On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> sched_ext currently suffers starvation due to RT. The same workload when
> converted to EXT can get zero runtime if RT is 100% running, causing EXT
> processes to stall. Fix it by adding a DL server for EXT.

This needs a lot more words on why you need a second server. Because I
don't think you do.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-15  2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
  2025-03-15  7:22   ` Peter Zijlstra
@ 2025-03-15 17:56   ` Andrea Righi
  2025-03-15 23:17     ` Joel Fernandes
  1 sibling, 1 reply; 23+ messages in thread
From: Andrea Righi @ 2025-03-15 17:56 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: linux-kernel, Tejun Heo, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
	Luigi De Matteis, paulmck, boqun.feng

Hi Joel,

On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
...
> @@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
>  BTF_ID_FLAGS(func, scx_bpf_now)
>  BTF_KFUNCS_END(scx_kfunc_ids_any)
>  
> +/*
> + * Check if ext scheduler has tasks ready to run.
> + */
> +static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
> +{
> +	return !!dl_se->rq->scx.nr_running;
> +}
> +
> +/*
> + * Select the next task to run from the ext scheduling class.
> + */
> +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
> +						void *flags)
> +{
> +	struct rq_flags *rf = flags;
> +

It'd be nice to add a comment here to clarify that we need to call
balance_scx() before pick_task_scx(), so that we can trigger ops.dispatch()
and consume tasks that may be pending in the BPF scheduler's DSQs,
otherwise pick_task_scx() may not find any scx task to run, reducing the
effectiveness of the dl_server.

> +	balance_scx(dl_se->rq, dl_se->rq->curr, rf);
> +	return pick_task_scx(dl_se->rq, rf);
> +}

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-15  7:22   ` Peter Zijlstra
@ 2025-03-15 23:15     ` Joel Fernandes
  2025-03-17 10:31       ` Peter Zijlstra
  0 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 23:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
	Luigi De Matteis, paulmck, boqun.feng



On 3/15/2025 3:22 AM, Peter Zijlstra wrote:
> On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
>> sched_ext currently suffers starvation due to RT. The same workload when
>> converted to EXT can get zero runtime if RT is 100% running, causing EXT
>> processes to stall. Fix it by adding a DL server for EXT.
> 
> This needs a lot more words on why you need a second server. Because I
> don't think you do.

Sure, I will add more words to the change log to explain rationale. When you say
"I don't think you do", do you mean that both FAIR and EXT could be served by
the same server? If so, that will not handle the case where the system has both
FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
We still need bandwidth allocated to EXT in such a situation. So we do need an
EXT server. Or did you mean something else?

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-15 17:56   ` Andrea Righi
@ 2025-03-15 23:17     ` Joel Fernandes
  0 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 23:17 UTC (permalink / raw)
  To: Andrea Righi
  Cc: linux-kernel, Tejun Heo, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
	Luigi De Matteis, paulmck, boqun.feng



On 3/15/2025 1:56 PM, Andrea Righi wrote:
> Hi Joel,
> 
> On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> ...
>> @@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
>>  BTF_ID_FLAGS(func, scx_bpf_now)
>>  BTF_KFUNCS_END(scx_kfunc_ids_any)
>>  
>> +/*
>> + * Check if ext scheduler has tasks ready to run.
>> + */
>> +static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
>> +{
>> +	return !!dl_se->rq->scx.nr_running;
>> +}
>> +
>> +/*
>> + * Select the next task to run from the ext scheduling class.
>> + */
>> +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
>> +						void *flags)
>> +{
>> +	struct rq_flags *rf = flags;
>> +
> 
> It'd be nice to add a comment here to clarify that we need to call
> balance_scx() before pick_task_scx(), so that we can trigger ops.dispatch()
> and consume tasks that may be pending in the BPF scheduler's DSQs,
> otherwise pick_task_scx() may not find any scx task to run, reducing the
> effectiveness of the dl_server.

Thanks for pointing this out, I will add rationale for the balance as you mentioned.

 - Joel


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server
  2025-03-15  2:21 ` [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server Joel Fernandes
@ 2025-03-15 23:22   ` Joel Fernandes
  0 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 23:22 UTC (permalink / raw)
  To: linux-kernel, Andrea Righi

This patch triggered a build robot warning, adding to thread for tracking the issue:

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git
sched/scx-dlserver-boost
branch HEAD: d4faa4aa813acff703cea3c301bb556f69b5210c  TODO_BEFORE_SUBMIT

Warning (recently discovered and may have been fixed):

    https://lore.kernel.org/oe-kbuild-all/202503152220.gAuzob4W-lkp@intel.com

    tools/testing/selftests/sched_ext/rt_stall.c: sys/wait.h is included more
than once.

Warning ids grouped by kconfigs:

recent_errors
`-- x86_64-allnoconfig
    `--
tools-testing-selftests-sched_ext-rt_stall.c:sys-wait.h-is-included-more-than-once.

elapsed time: 1448m

configs tested: 93
configs skipped: 1

tested configs:
alpha                             allnoconfig    gcc-14.2.0
alpha                            allyesconfig    gcc-14.2.0
arc                              allmodconfig    gcc-13.2.0
arc                               allnoconfig    gcc-13.2.0
arc                              allyesconfig    gcc-13.2.0
arc                   randconfig-001-20250315    gcc-13.2.0
arc                   randconfig-002-20250315    gcc-13.2.0
arm                              allmodconfig    gcc-14.2.0
arm                               allnoconfig    clang-17
arm                              allyesconfig    gcc-14.2.0
arm                            mmp2_defconfig    gcc-14.2.0
arm                        mvebu_v7_defconfig    clang-15
arm                   randconfig-001-20250315    gcc-14.2.0
arm                   randconfig-002-20250315    clang-21
arm                   randconfig-003-20250315    clang-21
arm                   randconfig-004-20250315    gcc-14.2.0
arm                        spear3xx_defconfig    clang-16
arm64                            allmodconfig    clang-18
arm64                 randconfig-001-20250315    gcc-14.2.0
arm64                 randconfig-002-20250315    gcc-14.2.0
arm64                 randconfig-003-20250315    clang-16
arm64                 randconfig-004-20250315    gcc-14.2.0
csky                  randconfig-001-20250315    gcc-14.2.0
csky                  randconfig-002-20250315    gcc-14.2.0
hexagon                          allmodconfig    clang-21
hexagon                          allyesconfig    clang-18
hexagon               randconfig-001-20250315    clang-21
hexagon               randconfig-002-20250315    clang-17
i386                             allmodconfig    gcc-12
i386                              allnoconfig    gcc-12
i386        buildonly-randconfig-001-20250315    gcc-12
i386        buildonly-randconfig-002-20250315    clang-19
i386        buildonly-randconfig-003-20250315    clang-19
i386        buildonly-randconfig-004-20250315    clang-19
i386        buildonly-randconfig-005-20250315    gcc-11
i386        buildonly-randconfig-006-20250315    gcc-12
i386                                defconfig    clang-19
loongarch             randconfig-001-20250315    gcc-14.2.0
loongarch             randconfig-002-20250315    gcc-14.2.0
m68k                              allnoconfig    gcc-14.2.0
m68k                             allyesconfig    gcc-14.2.0
microblaze                        allnoconfig    gcc-14.2.0
mips                              allnoconfig    gcc-14.2.0
nios2                             allnoconfig    gcc-14.2.0
nios2                 randconfig-001-20250315    gcc-14.2.0
nios2                 randconfig-002-20250315    gcc-14.2.0
openrisc                          allnoconfig    gcc-14.2.0
parisc                            allnoconfig    gcc-14.2.0
parisc                randconfig-001-20250315    gcc-14.2.0
parisc                randconfig-002-20250315    gcc-14.2.0
powerpc                           allnoconfig    gcc-14.2.0
powerpc               randconfig-001-20250315    clang-21
powerpc               randconfig-002-20250315    gcc-14.2.0
powerpc               randconfig-003-20250315    clang-18
powerpc64             randconfig-001-20250315    gcc-14.2.0
powerpc64             randconfig-002-20250315    clang-18
powerpc64             randconfig-003-20250315    gcc-14.2.0
riscv                             allnoconfig    gcc-14.2.0
riscv                 randconfig-001-20250315    gcc-14.2.0
riscv                 randconfig-002-20250315    gcc-14.2.0
s390                             allmodconfig    clang-19
s390                              allnoconfig    clang-15
s390                             allyesconfig    gcc-14.2.0
s390                  randconfig-001-20250315    clang-19
s390                  randconfig-002-20250315    gcc-14.2.0
sh                               allmodconfig    gcc-14.2.0
sh                                allnoconfig    gcc-14.2.0
sh                               allyesconfig    gcc-14.2.0
sh                    randconfig-001-20250315    gcc-14.2.0
sh                    randconfig-002-20250315    gcc-14.2.0
sparc                            allmodconfig    gcc-14.2.0
sparc                             allnoconfig    gcc-14.2.0
sparc                 randconfig-001-20250315    gcc-14.2.0
sparc                 randconfig-002-20250315    gcc-14.2.0
sparc64               randconfig-001-20250315    gcc-14.2.0
sparc64               randconfig-002-20250315    gcc-14.2.0
um                               allmodconfig    clang-21
um                                allnoconfig    clang-18
um                               allyesconfig    gcc-12
um                    randconfig-001-20250315    gcc-12
um                    randconfig-002-20250315    clang-18
x86_64                            allnoconfig    clang-19
x86_64                           allyesconfig    clang-19
x86_64      buildonly-randconfig-001-20250315    gcc-12
x86_64      buildonly-randconfig-002-20250315    clang-19
x86_64      buildonly-randconfig-003-20250315    clang-19
x86_64      buildonly-randconfig-004-20250315    clang-19
x86_64      buildonly-randconfig-005-20250315    clang-19
x86_64      buildonly-randconfig-006-20250315    gcc-12
x86_64                              defconfig    gcc-11
xtensa                            allnoconfig    gcc-14.2.0
xtensa                randconfig-001-20250315    gcc-14.2.0
xtensa                randconfig-002-20250315    gcc-14.2.0

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki



On 3/14/2025 10:21 PM, Joel Fernandes wrote:
> From: Andrea Righi <arighi@nvidia.com>
> 
> Add a selftest to validate the correct behavior of the deadline server
> for the ext_sched_class.
> 
> [ Joel: Replaced occurences of CFS in the test with EXT. ]
> 
> Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
>  tools/testing/selftests/sched_ext/Makefile    |   1 +
>  .../selftests/sched_ext/rt_stall.bpf.c        |  23 ++
>  tools/testing/selftests/sched_ext/rt_stall.c  | 213 ++++++++++++++++++
>  3 files changed, 237 insertions(+)
>  create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
>  create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c
> 
> diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
> index 011762224600..802e3d8d038f 100644
> --- a/tools/testing/selftests/sched_ext/Makefile
> +++ b/tools/testing/selftests/sched_ext/Makefile
> @@ -180,6 +180,7 @@ auto-test-targets :=			\
>  	select_cpu_dispatch_bad_dsq	\
>  	select_cpu_dispatch_dbl_dsp	\
>  	select_cpu_vtime		\
> +	rt_stall			\
>  	test_example			\
>  
>  testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
> diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
> new file mode 100644
> index 000000000000..80086779dd1e
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
> @@ -0,0 +1,23 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A scheduler that verified if RT tasks can stall SCHED_EXT tasks.
> + *
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +
> +#include <scx/common.bpf.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +UEI_DEFINE(uei);
> +
> +void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
> +{
> +	UEI_RECORD(uei, ei);
> +}
> +
> +SEC(".struct_ops.link")
> +struct sched_ext_ops rt_stall_ops = {
> +	.exit			= (void *)rt_stall_exit,
> +	.name			= "rt_stall",
> +};
> diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
> new file mode 100644
> index 000000000000..d4cb545ebfd8
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/rt_stall.c
> @@ -0,0 +1,213 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sched.h>
> +#include <sys/prctl.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +#include <time.h>
> +#include <linux/sched.h>
> +#include <signal.h>
> +#include <bpf/bpf.h>
> +#include <scx/common.h>
> +#include <sys/wait.h>
> +#include <unistd.h>
> +#include "rt_stall.bpf.skel.h"
> +#include "scx_test.h"
> +#include "../kselftest.h"
> +
> +#define CORE_ID		0	/* CPU to pin tasks to */
> +#define RUN_TIME        5	/* How long to run the test in seconds */
> +
> +/* Simple busy-wait function for test tasks */
> +static void process_func(void)
> +{
> +	while (1) {
> +		/* Busy wait */
> +		for (volatile unsigned long i = 0; i < 10000000UL; i++);
> +	}
> +}
> +
> +/* Set CPU affinity to a specific core */
> +static void set_affinity(int cpu)
> +{
> +	cpu_set_t mask;
> +
> +	CPU_ZERO(&mask);
> +	CPU_SET(cpu, &mask);
> +	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
> +		perror("sched_setaffinity");
> +		exit(EXIT_FAILURE);
> +	}
> +}
> +
> +/* Set task scheduling policy and priority */
> +static void set_sched(int policy, int priority)
> +{
> +	struct sched_param param;
> +
> +	param.sched_priority = priority;
> +	if (sched_setscheduler(0, policy, &param) != 0) {
> +		perror("sched_setscheduler");
> +		exit(EXIT_FAILURE);
> +	}
> +}
> +
> +/* Get process runtime from /proc/<pid>/stat */
> +static float get_process_runtime(int pid)
> +{
> +	char path[256];
> +	FILE *file;
> +	long utime, stime;
> +	int fields;
> +
> +	snprintf(path, sizeof(path), "/proc/%d/stat", pid);
> +	file = fopen(path, "r");
> +	if (file == NULL) {
> +		perror("Failed to open stat file");
> +		return -1;
> +	}
> +
> +	/* Skip the first 13 fields and read the 14th and 15th */
> +	fields = fscanf(file,
> +			"%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
> +			&utime, &stime);
> +	fclose(file);
> +
> +	if (fields != 2) {
> +		fprintf(stderr, "Failed to read stat file\n");
> +		return -1;
> +	}
> +
> +	/* Calculate the total time spent in the process */
> +	long total_time = utime + stime;
> +	long ticks_per_second = sysconf(_SC_CLK_TCK);
> +	float runtime_seconds = total_time * 1.0 / ticks_per_second;
> +
> +	return runtime_seconds;
> +}
> +
> +static enum scx_test_status setup(void **ctx)
> +{
> +	struct rt_stall *skel;
> +
> +	skel = rt_stall__open();
> +	SCX_FAIL_IF(!skel, "Failed to open");
> +	SCX_ENUM_INIT(skel);
> +	SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel");
> +
> +	*ctx = skel;
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static bool sched_stress_test(void)
> +{
> +	float cfs_runtime, rt_runtime;
> +	int cfs_pid, rt_pid;
> +	float expected_min_ratio = 0.04; /* 4% */
> +
> +	ksft_print_header();
> +	ksft_set_plan(1);
> +
> +	/* Create and set up an EXT task */
> +	cfs_pid = fork();
> +	if (cfs_pid == 0) {
> +		set_affinity(CORE_ID);
> +		process_func();
> +		exit(0);
> +	} else if (cfs_pid < 0) {
> +		perror("fork for EXT task");
> +		ksft_exit_fail();
> +	}
> +
> +	/* Create an RT task */
> +	rt_pid = fork();
> +	if (rt_pid == 0) {
> +		set_affinity(CORE_ID);
> +		set_sched(SCHED_FIFO, 50);
> +		process_func();
> +		exit(0);
> +	} else if (rt_pid < 0) {
> +		perror("fork for RT task");
> +		ksft_exit_fail();
> +	}
> +
> +	/* Let the processes run for the specified time */
> +	sleep(RUN_TIME);
> +
> +	/* Get runtime for the EXT task */
> +	cfs_runtime = get_process_runtime(cfs_pid);
> +	if (cfs_runtime != -1)
> +		ksft_print_msg("Runtime of EXT task (PID %d) is %f seconds\n", cfs_pid, cfs_runtime);
> +	else
> +		ksft_exit_fail_msg("Error getting runtime for EXT task (PID %d)\n", cfs_pid);
> +
> +	/* Get runtime for the RT task */
> +	rt_runtime = get_process_runtime(rt_pid);
> +	if (rt_runtime != -1)
> +		ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime);
> +	else
> +		ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid);
> +
> +	/* Kill the processes */
> +	kill(cfs_pid, SIGKILL);
> +	kill(rt_pid, SIGKILL);
> +	waitpid(cfs_pid, NULL, 0);
> +	waitpid(rt_pid, NULL, 0);
> +
> +	/* Verify that the scx task got enough runtime */
> +	float actual_ratio = cfs_runtime / (cfs_runtime + rt_runtime);
> +	ksft_print_msg("EXT task got %.2f%% of total runtime\n", actual_ratio * 100);
> +
> +	if (actual_ratio >= expected_min_ratio) {
> +		ksft_test_result_pass("PASS: EXT task got more than %.2f%% of runtime\n",
> +				      expected_min_ratio * 100);
> +		return true;
> +	} else {
> +		ksft_test_result_fail("FAIL: EXT task got less than %.2f%% of runtime\n",
> +				      expected_min_ratio * 100);
> +		return false;
> +	}
> +}
> +
> +static enum scx_test_status run(void *ctx)
> +{
> +	struct rt_stall *skel = ctx;
> +	struct bpf_link *link;
> +	bool res;
> +
> +	link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops);
> +	SCX_FAIL_IF(!link, "Failed to attach scheduler");
> +
> +	res = sched_stress_test();
> +
> +	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
> +	bpf_link__destroy(link);
> +
> +	if (!res)
> +		ksft_exit_fail();
> +
> +	return SCX_TEST_PASS;
> +}
> +
> +static void cleanup(void *ctx)
> +{
> +	struct rt_stall *skel = ctx;
> +
> +	rt_stall__destroy(skel);
> +}
> +
> +struct scx_test rt_stall = {
> +	.name = "rt_stall",
> +	.description = "Verify that RT tasks cannot stall SCHED_EXT tasks",
> +	.setup = setup,
> +	.run = run,
> +	.cleanup = cleanup,
> +};
> +REGISTER_SCX_TEST(&rt_stall)


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-15 23:15     ` Joel Fernandes
@ 2025-03-17 10:31       ` Peter Zijlstra
  2025-03-17 16:57         ` Tejun Heo
  2025-03-17 21:53         ` Joel Fernandes
  0 siblings, 2 replies; 23+ messages in thread
From: Peter Zijlstra @ 2025-03-17 10:31 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
	Luigi De Matteis, paulmck, boqun.feng

On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
> 
> 
> On 3/15/2025 3:22 AM, Peter Zijlstra wrote:
> > On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> >> sched_ext currently suffers starvation due to RT. The same workload when
> >> converted to EXT can get zero runtime if RT is 100% running, causing EXT
> >> processes to stall. Fix it by adding a DL server for EXT.
> > 
> > This needs a lot more words on why you need a second server. Because I
> > don't think you do.
> 
> Sure, I will add more words to the change log to explain rationale. When you say
> "I don't think you do", do you mean that both FAIR and EXT could be served by
> the same server? 

Yeah, because now you get two deadline entities both having a
reservation on bandwidth. One of which is not going to be used -- this
is not nice.

> If so, that will not handle the case where the system has both
> FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.

Well, you did not mention that issue, you only babbled about RT.

I did point out that issue with ext, and TJ said this mixed mode wasn't
really meant to be used or somesuch.

So if that's changed, this needs a separate discussion.

Also; I gotta ask, why is nvidia looking at ext ?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 10:31       ` Peter Zijlstra
@ 2025-03-17 16:57         ` Tejun Heo
  2025-03-17 17:06           ` Peter Zijlstra
  2025-03-17 21:53         ` Joel Fernandes
  1 sibling, 1 reply; 23+ messages in thread
From: Tejun Heo @ 2025-03-17 16:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Joel Fernandes, linux-kernel, Andrea Righi, Ingo Molnar,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
	Changwoo Min, Luigi De Matteis, paulmck, boqun.feng

Hello,

On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
> On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
> > If so, that will not handle the case where the system has both
> > FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> > made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
> 
> Well, you did not mention that issue, you only babbled about RT.
> 
> I did point out that issue with ext, and TJ said this mixed mode wasn't
> really meant to be used or somesuch.

It's true that most of the current use cases don't use mixed mode. That
said, some folks are interested in it and if we can prevent starvation from
fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
Would it be possible to toggle the reservations depending on the ext's
operation mode?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 16:57         ` Tejun Heo
@ 2025-03-17 17:06           ` Peter Zijlstra
  2025-03-17 21:48             ` Joel Fernandes
  0 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2025-03-17 17:06 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Joel Fernandes, linux-kernel, Andrea Righi, Ingo Molnar,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
	Changwoo Min, Luigi De Matteis, paulmck, boqun.feng

On Mon, Mar 17, 2025 at 06:57:19AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
> > On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
> > > If so, that will not handle the case where the system has both
> > > FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> > > made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
> > 
> > Well, you did not mention that issue, you only babbled about RT.
> > 
> > I did point out that issue with ext, and TJ said this mixed mode wasn't
> > really meant to be used or somesuch.
> 
> It's true that most of the current use cases don't use mixed mode. That
> said, some folks are interested in it and if we can prevent starvation from
> fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
> Would it be possible to toggle the reservations depending on the ext's
> operation mode?

Yeah, that should be doable.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 17:06           ` Peter Zijlstra
@ 2025-03-17 21:48             ` Joel Fernandes
  2025-03-17 22:16               ` Tejun Heo
  0 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-17 21:48 UTC (permalink / raw)
  To: Peter Zijlstra, Tejun Heo
  Cc: linux-kernel, Andrea Righi, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
	Luigi De Matteis, paulmck, boqun.feng

Hello, Peter, Tejun,

On 3/17/2025 6:06 PM, Peter Zijlstra wrote:
> On Mon, Mar 17, 2025 at 06:57:19AM -1000, Tejun Heo wrote:
>> Hello,
>>
>> On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
>>> On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
>>>> If so, that will not handle the case where the system has both
>>>> FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
>>>> made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
>>>
>>> Well, you did not mention that issue, you only babbled about RT.

You are right, I will add more details about this to the change log.

>>>
>>> I did point out that issue with ext, and TJ said this mixed mode wasn't
>>> really meant to be used or somesuch.
>>
>> It's true that most of the current use cases don't use mixed mode. That
>> said, some folks are interested in it and if we can prevent starvation from
>> fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
>> Would it be possible to toggle the reservations depending on the ext's
>> operation mode?
> 
> Yeah, that should be doable.

Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
independent of FAIR. And in normal mode, we boost both FAIR+EXT, because well
- nothing would be running as fair anyway.

But what is the point of doing that, if we have boost EXT independent of FAIR
anyway? We need that code _anyway_ due to mixed mode so it would not simplify
anything.

Or did Tejun mean something else about "toggle the reservations"?

thanks,

 - Joel









^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 10:31       ` Peter Zijlstra
  2025-03-17 16:57         ` Tejun Heo
@ 2025-03-17 21:53         ` Joel Fernandes
  1 sibling, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-17 21:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
	Luigi De Matteis, paulmck, boqun.feng

Hello, Peter,

I replied to other parts of your email in another thread so I am just replying
to this part:

On 3/17/2025 11:31 AM, Peter Zijlstra wrote:
> 
> Also; I gotta ask, why is nvidia looking at ext ?

There are some complex CPU topologies which perform poorly with the existing
FAIR scheduler as reported by people (I have not seen the data though so there's
that).

There are also workloads where it is beneficial to schedule on cores which have
the data in their cache and are submitting work to GPGPU, which makes the GPGPU
operations faster.

thanks,

 - Joel



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 21:48             ` Joel Fernandes
@ 2025-03-17 22:16               ` Tejun Heo
  2025-03-17 22:39                 ` Joel Fernandes
  0 siblings, 1 reply; 23+ messages in thread
From: Tejun Heo @ 2025-03-17 22:16 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
	Changwoo Min, Luigi De Matteis, paulmck, boqun.feng

Hello,

On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
...
> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
> independent of FAIR. And in normal mode, we we boost both FAIR+EXT, because well
> - nothing would be running as fair anyway.
> 
> But what is the point of doing that, if we have boost EXT independent of FAIR
> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
> anything.
> 
> Or did Tejun mean something else about "toggle the reservations"?

My understanding is that if we have both FAIR and EXT's DL servers reserving
execution time all the time, we'd be reserving execution time for something
which can't be active, so the only change necessary I think is just
retracting FAIR's or EXT's reservation when we know they are not active
(ie. if EXT is not loaded or EXT is loaded in full-sys mode).

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 22:16               ` Tejun Heo
@ 2025-03-17 22:39                 ` Joel Fernandes
  2025-03-17 22:48                   ` Tejun Heo
  0 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-17 22:39 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
	Changwoo Min, Luigi De Matteis, paulmck, boqun.feng



On 3/17/2025 11:16 PM, Tejun Heo wrote:
> Hello,
> 
> On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
> ...
>> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
>> independent of FAIR. And in normal mode, we we boost both FAIR+EXT, because well
>> - nothing would be running as fair anyway.
>>
>> But what is the point of doing that, if we have boost EXT independent of FAIR
>> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
>> anything.
>>
>> Or did Tejun mean something else about "toggle the reservations"?
> 
> My understanding is that if we have both FAIR and EXT's DL servers reserving
> execution time all the time, we'd be reserving execution time for something
> which can't be active, so the only change necessary I think is just
> retracting FAIR's or EXT's reservation whent we know they are not active
> (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
> 
Ah, I see what you mean. We already have a 'toggle' like that though because if
FAIR or EXT is not running (due to whatever reason), we would have already
called 'dl_server_stop()' or would never have called 'dl_server_start()'.

On the other hand, even if full-sys-mode, we need the EXT server to boost it to
above RT if EXT is running, so we need its server initialized and ready to go.

Let me know if I missed anything though, thanks,

 - Joel




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 22:39                 ` Joel Fernandes
@ 2025-03-17 22:48                   ` Tejun Heo
  2025-03-18 10:07                     ` Joel Fernandes
  0 siblings, 1 reply; 23+ messages in thread
From: Tejun Heo @ 2025-03-17 22:48 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
	Changwoo Min, Luigi De Matteis, paulmck, boqun.feng

On Mon, Mar 17, 2025 at 11:39:32PM +0100, Joel Fernandes wrote:
> On 3/17/2025 11:16 PM, Tejun Heo wrote:
> > On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
> > ...
> >> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
> >> independent of FAIR. And in normal mode, we we boost both FAIR+EXT, because well
> >> - nothing would be running as fair anyway.
> >>
> >> But what is the point of doing that, if we have boost EXT independent of FAIR
> >> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
> >> anything.
> >>
> >> Or did Tejun mean something else about "toggle the reservations"?
> > 
> > My understanding is that if we have both FAIR and EXT's DL servers reserving
> > execution time all the time, we'd be reserving execution time for something
> > which can't be active, so the only change necessary I think is just
> > retracting FAIR's or EXT's reservation whent we know they are not active
> > (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
> > 
> Ah, I see what you mean. We already have a 'toggle' like that though because if
> FAIR or EXT is not running (due to whatever reason), we would have already
> called 'dl_server_stop()' or would never have called 'dl_server_start()'.
> 
> On the other hand, even if full-sys-mode, we need the EXT server to boost it to
> above RT if EXT is running, so we need its server initialized and ready to go.
> 
> Let me know if I missed anything though, thanks,

I'm not very familiar with DL but it looks like a stopped DL server would
still be reserving bandwidth which limits what other actual DL users would
be able to reserve without causing overflow. It looks like EXT's activation
modes should be calling into dl_bw_manage() so that FAIR's and EXT's
reservations can be retracted when not in use.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
  2025-03-17 22:48                   ` Tejun Heo
@ 2025-03-18 10:07                     ` Joel Fernandes
  0 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-18 10:07 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
	Changwoo Min, Luigi De Matteis, paulmck, boqun.feng



On 3/17/2025 11:48 PM, Tejun Heo wrote:
> On Mon, Mar 17, 2025 at 11:39:32PM +0100, Joel Fernandes wrote:
>> On 3/17/2025 11:16 PM, Tejun Heo wrote:
>>> On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
>>> ...
>>>> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
>>>> independent of FAIR. And in normal mode, we we boost both FAIR+EXT, because well
>>>> - nothing would be running as fair anyway.
>>>>
>>>> But what is the point of doing that, if we have boost EXT independent of FAIR
>>>> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
>>>> anything.
>>>>
>>>> Or did Tejun mean something else about "toggle the reservations"?
>>> My understanding is that if we have both FAIR and EXT's DL servers reserving
>>> execution time all the time, we'd be reserving execution time for something
>>> which can't be active, so the only change necessary I think is just
>>> retracting FAIR's or EXT's reservation whent we know they are not active
>>> (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
>>>
>> Ah, I see what you mean. We already have a 'toggle' like that though because if
>> FAIR or EXT is not running (due to whatever reason), we would have already
>> called 'dl_server_stop()' or would never have called 'dl_server_start()'.
>>
>> On the other hand, even if full-sys-mode, we need the EXT server to boost it to
>> above RT if EXT is running, so we need its server initialized and ready to go.
>>
>> Let me know if I missed anything though, thanks,
> I'm not very familiar with DL but it looks like a stopped DL server would
> still be reserving bandwidth which limits what other actual DL users would
> be able to reserve without causing overflow. It looks like EXT's activation
> modes should be calling into dl_bw_manage() so that FAIR's and EXT's
> reservations can be retracted when not in use.

Ah, you raise a good point. Sorry, you were on to something and that makes sense
to me. Let me see how to wire it up. Basically, when we switch to full-mode from
say partial, we could/should remove the bandwidth reservation of the servers. I
think I confused the concept of "server not running" to "server reserving
bandwidth". My bad!

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2025-03-18 10:07 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-15  2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 1/8] sched: Add support to pick functions to take rf Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 2/8] sched: Add a server arg to dl_server_update_idle_time() Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
2025-03-15  7:22   ` Peter Zijlstra
2025-03-15 23:15     ` Joel Fernandes
2025-03-17 10:31       ` Peter Zijlstra
2025-03-17 16:57         ` Tejun Heo
2025-03-17 17:06           ` Peter Zijlstra
2025-03-17 21:48             ` Joel Fernandes
2025-03-17 22:16               ` Tejun Heo
2025-03-17 22:39                 ` Joel Fernandes
2025-03-17 22:48                   ` Tejun Heo
2025-03-18 10:07                     ` Joel Fernandes
2025-03-17 21:53         ` Joel Fernandes
2025-03-15 17:56   ` Andrea Righi
2025-03-15 23:17     ` Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 4/8] sched/debug: Fix updating of ppos on server write ops Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 5/8] sched/debug: Stop and start server based on if it was active Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 6/8] sched/debug: Add support to change sched_ext server params Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 7/8] sched/deadline: Clear defer params Joel Fernandes
2025-03-15  2:21 ` [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server Joel Fernandes
2025-03-15 23:22   ` Joel Fernandes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox