* [PATCH RFC 1/8] sched: Add support to pick functions to take rf
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 2/8] sched: Add a server arg to dl_server_update_idle_time() Joel Fernandes
` (6 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
David Vernet, Changwoo Min
Cc: Luigi De Matteis, paulmck, boqun.feng, Joel Fernandes
Some pick functions like the internal pick_next_task_fair() already take
rf but some others don't. We need this for scx's server pick function.
Prepare for this by having pick functions accept it.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
include/linux/sched.h | 2 +-
kernel/sched/core.c | 16 ++++++++--------
kernel/sched/deadline.c | 8 ++++----
kernel/sched/ext.c | 2 +-
kernel/sched/fair.c | 13 ++++++++-----
kernel/sched/idle.c | 2 +-
kernel/sched/rt.c | 2 +-
kernel/sched/sched.h | 7 ++++---
kernel/sched/stop_task.c | 2 +-
9 files changed, 29 insertions(+), 25 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d..08b1a611bbd5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -609,7 +609,7 @@ struct sched_rt_entity {
} __randomize_layout;
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, void *);
struct sched_dl_entity {
struct rb_node rb_node;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9aecd914ac69..d19e4b7a0020 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6035,7 +6035,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
- p = pick_task_idle(rq);
+ p = pick_task_idle(rq, rf);
put_prev_set_next_task(rq, prev, p);
}
@@ -6047,11 +6047,11 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_active_class(class) {
if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev);
+ p = class->pick_next_task(rq, prev, rf);
if (p)
return p;
} else {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
if (p) {
put_prev_set_next_task(rq, prev, p);
return p;
@@ -6081,7 +6081,7 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}
-static inline struct task_struct *pick_task(struct rq *rq)
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -6089,7 +6089,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
rq->dl_server = NULL;
for_each_active_class(class) {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
if (p)
return p;
}
@@ -6189,7 +6189,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* and there are no cookied tasks running on siblings.
*/
if (!need_sync) {
- next = pick_task(rq);
+ next = pick_task(rq, rf);
if (!next->core_cookie) {
rq->core_pick = NULL;
rq->core_dl_server = NULL;
@@ -6220,7 +6220,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
- rq_i->core_pick = p = pick_task(rq_i);
+ rq_i->core_pick = p = pick_task(rq_i, rf);
rq_i->core_dl_server = rq_i->dl_server;
if (!max || prio_less(max, p, fi_before))
@@ -6242,7 +6242,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (cookie)
p = sched_core_find(rq_i, cookie);
if (!p)
- p = idle_sched_class.pick_task(rq_i);
+ p = idle_sched_class.pick_task(rq_i, rf);
}
rq_i->core_pick = p;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 38e4537790af..3f7fb7251805 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2421,7 +2421,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
* __pick_next_task_dl - Helper to pick the next -deadline task to run.
* @rq: The runqueue to pick the next task from.
*/
-static struct task_struct *__pick_task_dl(struct rq *rq)
+static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -2435,7 +2435,7 @@ static struct task_struct *__pick_task_dl(struct rq *rq)
WARN_ON_ONCE(!dl_se);
if (dl_server(dl_se)) {
- p = dl_se->server_pick_task(dl_se);
+ p = dl_se->server_pick_task(dl_se, rf);
if (!p) {
if (dl_server_active(dl_se)) {
dl_se->dl_yielded = 1;
@@ -2451,9 +2451,9 @@ static struct task_struct *__pick_task_dl(struct rq *rq)
return p;
}
-static struct task_struct *pick_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf)
{
- return __pick_task_dl(rq);
+ return __pick_task_dl(rq, rf);
}
static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5a81d9a1e31f..636b08977d19 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3113,7 +3113,7 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}
-static struct task_struct *pick_task_scx(struct rq *rq)
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *prev = rq->curr;
struct task_struct *p;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1c0ef435a7aa..734c22fb0e2d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8816,7 +8816,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
resched_curr_lazy(rq);
}
-static struct task_struct *pick_task_fair(struct rq *rq)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
@@ -8854,7 +8854,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
- p = pick_task_fair(rq);
+ p = pick_task_fair(rq, rf);
if (!p)
goto idle;
se = &p->se;
@@ -8933,7 +8933,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
return NULL;
}
-static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
{
return pick_next_task_fair(rq, prev, NULL);
}
@@ -8943,9 +8944,11 @@ static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
return !!dl_se->rq->cfs.nr_queued;
}
-static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
+static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se, void *flags)
{
- return pick_task_fair(dl_se->rq);
+ struct rq_flags *rf = flags;
+
+ return pick_task_fair(dl_se->rq, rf);
}
void fair_server_init(struct rq *rq)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2c85c86b455f..01e9612deefe 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -463,7 +463,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
next->se.exec_start = rq_clock_task(rq);
}
-struct task_struct *pick_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
{
scx_update_idle(rq, true, false);
return rq->idle;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..f946a4b091e8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1735,7 +1735,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *pick_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *p;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8512a9fb022..ef6fbc49449f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2422,7 +2422,7 @@ struct sched_class {
void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
- struct task_struct *(*pick_task)(struct rq *rq);
+ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
/*
* Optional! When implemented pick_next_task() should be equivalent to:
*
@@ -2432,7 +2432,8 @@ struct sched_class {
* set_next_task_first(next);
* }
*/
- struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
+ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf);
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
@@ -2595,7 +2596,7 @@ static inline bool sched_fair_runnable(struct rq *rq)
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_task_idle(struct rq *rq);
+extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 058dd42e3d9b..1c70123cb6a4 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -33,7 +33,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *pick_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf)
{
if (!sched_stop_runnable(rq))
return NULL;
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH RFC 2/8] sched: Add a server arg to dl_server_update_idle_time()
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 1/8] sched: Add support to pick functions to take rf Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
` (5 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
Joel Fernandes
Since we are adding more servers, make dl_server_update_idle_time()
accept a server argument rather than assuming a specific server.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/deadline.c | 17 +++++++++--------
kernel/sched/fair.c | 2 +-
kernel/sched/idle.c | 2 +-
kernel/sched/sched.h | 3 ++-
4 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3f7fb7251805..ef592751417f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1610,28 +1610,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
* as time available for the fair server, avoiding a penalty for the
* rt scheduler that did not consumed that time.
*/
-void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
+void dl_server_update_idle_time(struct rq *rq, struct task_struct *p,
+ struct sched_dl_entity *rq_dl_server)
{
s64 delta_exec, scaled_delta_exec;
- if (!rq->fair_server.dl_defer)
+ if (!rq_dl_server->dl_defer)
return;
/* no need to discount more */
- if (rq->fair_server.runtime < 0)
+ if (rq_dl_server->runtime < 0)
return;
delta_exec = rq_clock_task(rq) - p->se.exec_start;
if (delta_exec < 0)
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
+ scaled_delta_exec = dl_scaled_delta_exec(rq, rq_dl_server, delta_exec);
- rq->fair_server.runtime -= scaled_delta_exec;
+ rq_dl_server->runtime -= scaled_delta_exec;
- if (rq->fair_server.runtime < 0) {
- rq->fair_server.dl_defer_running = 0;
- rq->fair_server.runtime = 0;
+ if (rq_dl_server->runtime < 0) {
+ rq_dl_server->dl_defer_running = 0;
+ rq_dl_server->runtime = 0;
}
p->se.exec_start = rq_clock_task(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 734c22fb0e2d..f6511eaf0389 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7006,7 +7006,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
/* Account for idle runtime */
if (!rq->nr_running)
- dl_server_update_idle_time(rq, rq->curr);
+ dl_server_update_idle_time(rq, rq->curr, &rq->fair_server);
dl_server_start(&rq->fair_server);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 01e9612deefe..13a3d20d35e2 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -451,7 +451,7 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- dl_server_update_idle_time(rq, prev);
+ dl_server_update_idle_time(rq, prev, &rq->fair_server);
scx_update_idle(rq, false, true);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef6fbc49449f..b3d1201b8f3d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -392,7 +392,8 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_pick_f pick_task);
extern void dl_server_update_idle_time(struct rq *rq,
- struct task_struct *p);
+ struct task_struct *p,
+ struct sched_dl_entity *rq_dl_server);
extern void fair_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 1/8] sched: Add support to pick functions to take rf Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 2/8] sched: Add a server arg to dl_server_update_idle_time() Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 7:22 ` Peter Zijlstra
2025-03-15 17:56 ` Andrea Righi
2025-03-15 2:21 ` [PATCH RFC 4/8] sched/debug: Fix updating of ppos on server write ops Joel Fernandes
` (4 subsequent siblings)
7 siblings, 2 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
David Vernet, Changwoo Min
Cc: Luigi De Matteis, paulmck, boqun.feng, Joel Fernandes
sched_ext currently suffers starvation due to RT. The same workload when
converted to EXT can get zero runtime if RT is 100% running, causing EXT
processes to stall. Fix it by adding a DL server for EXT.
A kselftest is also provided later to verify:
./runner -t rt_stall
===== START =====
TEST: rt_stall
DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks
OUTPUT:
TAP version 13
1..1
ok 1 PASS: CFS task got more than 4.00% of runtime
Cc: Luigi De Matteis <ldematteis123@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/core.c | 3 ++
kernel/sched/deadline.c | 2 +-
kernel/sched/ext.c | 62 +++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 2 ++
4 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d19e4b7a0020..09bff60c22d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8617,6 +8617,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ef592751417f..bcb66d9692ae 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1571,7 +1571,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
* The fair server (sole dl_server) does not account for real-time
* workload because it is running fair work.
*/
- if (dl_se == &rq->fair_server)
+ if (dl_se == &rq->fair_server || dl_se == &rq->ext_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 636b08977d19..553d3e6087fe 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1677,6 +1677,9 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ if (dl_server_active(&rq->ext_server))
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -2147,6 +2150,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ if (rq->scx.nr_running == 1) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
+
+ /* Start dl_server if this is the first task being enqueued */
+ dl_server_start(&rq->ext_server);
+ }
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2238,6 +2250,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
sub_nr_running(rq, 1);
dispatch_dequeue(rq, p);
+
+ /* Stop the server if this was the last task */
+ if (rq->scx.nr_running == 0)
+ dl_server_stop(&rq->ext_server);
+
return true;
}
@@ -4207,6 +4224,15 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
scx_ops_disable_task(p);
+
+ /*
+ * After class switch, if the DL server is still active, restart it so
+ * that DL timers will be queued, in case SCX switched to higher class.
+ */
+ if (dl_server_active(&rq->ext_server)) {
+ dl_server_stop(&rq->ext_server);
+ dl_server_start(&rq->ext_server);
+ }
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
@@ -7440,8 +7466,8 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
* schedutil cpufreq governor chooses the target frequency.
*
- * The actual performance level chosen, CPU grouping, and the overhead and
- * latency of the operations are dependent on the hardware and cpufreq driver in
+ * The actual performance level chosen, CPU grouping, and the overhead and latency
+ * of the operations are dependent on the hardware and cpufreq driver in
* use. Consult hardware and cpufreq documentation for more information. The
* current performance level can be monitored using scx_bpf_cpuperf_cur().
*/
@@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_now)
BTF_KFUNCS_END(scx_kfunc_ids_any)
+/*
+ * Check if ext scheduler has tasks ready to run.
+ */
+static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+ return !!dl_se->rq->scx.nr_running;
+}
+
+/*
+ * Select the next task to run from the ext scheduling class.
+ */
+static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
+ void *flags)
+{
+ struct rq_flags *rf = flags;
+
+ balance_scx(dl_se->rq, dl_se->rq->curr, rf);
+ return pick_task_scx(dl_se->rq, rf);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_has_tasks, ext_server_pick_task);
+}
+
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
.owner = THIS_MODULE,
.set = &scx_kfunc_ids_any,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3d1201b8f3d..8421eb56c50b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -395,6 +395,7 @@ extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p,
struct sched_dl_entity *rq_dl_server);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -1141,6 +1142,7 @@ struct rq {
#endif
struct sched_dl_entity fair_server;
+ struct sched_dl_entity ext_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-15 2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
@ 2025-03-15 7:22 ` Peter Zijlstra
2025-03-15 23:15 ` Joel Fernandes
2025-03-15 17:56 ` Andrea Righi
1 sibling, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2025-03-15 7:22 UTC (permalink / raw)
To: Joel Fernandes
Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
Luigi De Matteis, paulmck, boqun.feng
On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> sched_ext currently suffers starvation due to RT. The same workload when
> converted to EXT can get zero runtime if RT is 100% running, causing EXT
> processes to stall. Fix it by adding a DL server for EXT.
This needs a lot more words on why you need a second server. Because I
don't think you do.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-15 7:22 ` Peter Zijlstra
@ 2025-03-15 23:15 ` Joel Fernandes
2025-03-17 10:31 ` Peter Zijlstra
0 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 23:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
Luigi De Matteis, paulmck, boqun.feng
On 3/15/2025 3:22 AM, Peter Zijlstra wrote:
> On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
>> sched_ext currently suffers starvation due to RT. The same workload when
>> converted to EXT can get zero runtime if RT is 100% running, causing EXT
>> processes to stall. Fix it by adding a DL server for EXT.
>
> This needs a lot more words on why you need a second server. Because I
> don't think you do.
Sure, I will add more words to the change log to explain rationale. When you say
"I don't think you do", do you mean that both FAIR and EXT could be served by
the same server? If so, that will not handle the case where the system has both
FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
We still need bandwidth allocated to EXT in such a situation. So we do need an
EXT server. Or did you mean something else?
thanks,
- Joel
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-15 23:15 ` Joel Fernandes
@ 2025-03-17 10:31 ` Peter Zijlstra
2025-03-17 16:57 ` Tejun Heo
2025-03-17 21:53 ` Joel Fernandes
0 siblings, 2 replies; 23+ messages in thread
From: Peter Zijlstra @ 2025-03-17 10:31 UTC (permalink / raw)
To: Joel Fernandes
Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
Luigi De Matteis, paulmck, boqun.feng
On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
>
>
> On 3/15/2025 3:22 AM, Peter Zijlstra wrote:
> > On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> >> sched_ext currently suffers starvation due to RT. The same workload when
> >> converted to EXT can get zero runtime if RT is 100% running, causing EXT
> >> processes to stall. Fix it by adding a DL server for EXT.
> >
> > This needs a lot more words on why you need a second server. Because I
> > don't think you do.
>
> Sure, I will add more words to the change log to explain rationale. When you say
> "I don't think you do", do you mean that both FAIR and EXT could be served by
> the same server?
Yeah, because now you get two deadline entities both having a
reservation on bandwidth. One of which is not going to be used -- this
is not nice.
> If so, that will not handle the case where the system has both
> FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
Well, you did not mention that issue, you only babbled about RT.
I did point out that issue with ext, and TJ said this mixed mode wasn't
really meant to be used or somesuch.
So if that's changed, this needs a separate discussion.
Also; I gotta ask, why is nvidia looking at ext ?
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 10:31 ` Peter Zijlstra
@ 2025-03-17 16:57 ` Tejun Heo
2025-03-17 17:06 ` Peter Zijlstra
2025-03-17 21:53 ` Joel Fernandes
1 sibling, 1 reply; 23+ messages in thread
From: Tejun Heo @ 2025-03-17 16:57 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Joel Fernandes, linux-kernel, Andrea Righi, Ingo Molnar,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
Changwoo Min, Luigi De Matteis, paulmck, boqun.feng
Hello,
On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
> On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
> > If so, that will not handle the case where the system has both
> > FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> > made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
>
> Well, you did not mention that issue, you only babbled about RT.
>
> I did point out that issue with ext, and TJ said this mixed mode wasn't
> really meant to be used or somesuch.
It's true that most of the current use cases don't use mixed mode. That
said, some folks are interested in it and if we can prevent starvation from
fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
Would it be possible to toggle the reservations depending on the ext's
operation mode?
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 16:57 ` Tejun Heo
@ 2025-03-17 17:06 ` Peter Zijlstra
2025-03-17 21:48 ` Joel Fernandes
0 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2025-03-17 17:06 UTC (permalink / raw)
To: Tejun Heo
Cc: Joel Fernandes, linux-kernel, Andrea Righi, Ingo Molnar,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
Changwoo Min, Luigi De Matteis, paulmck, boqun.feng
On Mon, Mar 17, 2025 at 06:57:19AM -1000, Tejun Heo wrote:
> Hello,
>
> On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
> > On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
> > > If so, that will not handle the case where the system has both
> > > FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
> > > made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
> >
> > Well, you did not mention that issue, you only babbled about RT.
> >
> > I did point out that issue with ext, and TJ said this mixed mode wasn't
> > really meant to be used or somesuch.
>
> It's true that most of the current use cases don't use mixed mode. That
> said, some folks are interested in it and if we can prevent starvation from
> fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
> Would it be possible to toggle the reservations depending on the ext's
> operation mode?
Yeah, that should be doable.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 17:06 ` Peter Zijlstra
@ 2025-03-17 21:48 ` Joel Fernandes
2025-03-17 22:16 ` Tejun Heo
0 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-17 21:48 UTC (permalink / raw)
To: Peter Zijlstra, Tejun Heo
Cc: linux-kernel, Andrea Righi, Ingo Molnar, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
Luigi De Matteis, paulmck, boqun.feng
Hello, Peter, Tejun,
On 3/17/2025 6:06 PM, Peter Zijlstra wrote:
> On Mon, Mar 17, 2025 at 06:57:19AM -1000, Tejun Heo wrote:
>> Hello,
>>
>> On Mon, Mar 17, 2025 at 11:31:01AM +0100, Peter Zijlstra wrote:
>>> On Sat, Mar 15, 2025 at 07:15:27PM -0400, Joel Fernandes wrote:
>>>> If so, that will not handle the case where the system has both
>>>> FAIR and EXT tasks in the mix (EXT has a partial mode where certain tasks can be
>>>> made EXT with certain others left as FAIR) and FAIR runs 100% and starves EXT.
>>>
>>> Well, you did not mention that issue, you only babbled about RT.
You are right, I will add more details about this to the change log.
>>>
>>> I did point out that issue with ext, and TJ said this mixed mode wasn't
>>> really meant to be used or somesuch.
>>
>> It's true that most of the current use cases don't use mixed mode. That
>> said, some folks are interested in it and if we can prevent starvation from
>> fair saturating CPUs in mixed mode with a DL server, that'd be really nice.
>> Would it be possible to toggle the reservations depending on the ext's
>> operation mode?
>
> Yeah, that should be doable.
Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
independent of FAIR. And in normal mode, we boost both FAIR+EXT, because well
- nothing would be running as fair anyway.
But what is the point of doing that, if we have boost EXT independent of FAIR
anyway? We need that code _anyway_ due to mixed mode so it would not simplify
anything.
Or did Tejun mean something else about "toggle the reservations"?
thanks,
- Joel
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 21:48 ` Joel Fernandes
@ 2025-03-17 22:16 ` Tejun Heo
2025-03-17 22:39 ` Joel Fernandes
0 siblings, 1 reply; 23+ messages in thread
From: Tejun Heo @ 2025-03-17 22:16 UTC (permalink / raw)
To: Joel Fernandes
Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
Changwoo Min, Luigi De Matteis, paulmck, boqun.feng
Hello,
On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
...
> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
> independent of FAIR. And in normal mode, we boost both FAIR+EXT, because well
> - nothing would be running as fair anyway.
>
> But what is the point of doing that, if we have boost EXT independent of FAIR
> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
> anything.
>
> Or did Tejun mean something else about "toggle the reservations"?
My understanding is that if we have both FAIR and EXT's DL servers reserving
execution time all the time, we'd be reserving execution time for something
which can't be active, so the only change necessary I think is just
retracting FAIR's or EXT's reservation when we know they are not active
(ie. if EXT is not loaded or EXT is loaded in full-sys mode).
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 22:16 ` Tejun Heo
@ 2025-03-17 22:39 ` Joel Fernandes
2025-03-17 22:48 ` Tejun Heo
0 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-17 22:39 UTC (permalink / raw)
To: Tejun Heo
Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
Changwoo Min, Luigi De Matteis, paulmck, boqun.feng
On 3/17/2025 11:16 PM, Tejun Heo wrote:
> Hello,
>
> On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
> ...
>> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
>> independent of FAIR. And in normal mode, we we boost both FAIR+EXT, because well
>> - nothing would be running as fair anyway.
>>
>> But what is the point of doing that, if we have boost EXT independent of FAIR
>> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
>> anything.
>>
>> Or did Tejun mean something else about "toggle the reservations"?
>
> My understanding is that if we have both FAIR and EXT's DL servers reserving
> execution time all the time, we'd be reserving execution time for something
> which can't be active, so the only change necessary I think is just
> retracting FAIR's or EXT's reservation whent we know they are not active
> (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
>
Ah, I see what you mean. We already have a 'toggle' like that though because if
FAIR or EXT is not running (due to whatever reason), we would have already
called 'dl_server_stop()' or would never have called 'dl_server_start()'.
On the other hand, even if full-sys-mode, we need the EXT server to boost it to
above RT if EXT is running, so we need its server initialized and ready to go.
Let me know if I missed anything though, thanks,
- Joel
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 22:39 ` Joel Fernandes
@ 2025-03-17 22:48 ` Tejun Heo
2025-03-18 10:07 ` Joel Fernandes
0 siblings, 1 reply; 23+ messages in thread
From: Tejun Heo @ 2025-03-17 22:48 UTC (permalink / raw)
To: Joel Fernandes
Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
Changwoo Min, Luigi De Matteis, paulmck, boqun.feng
On Mon, Mar 17, 2025 at 11:39:32PM +0100, Joel Fernandes wrote:
> On 3/17/2025 11:16 PM, Tejun Heo wrote:
> > On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
> > ...
> >> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
> >> independent of FAIR. And in normal mode, we we boost both FAIR+EXT, because well
> >> - nothing would be running as fair anyway.
> >>
> >> But what is the point of doing that, if we have boost EXT independent of FAIR
> >> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
> >> anything.
> >>
> >> Or did Tejun mean something else about "toggle the reservations"?
> >
> > My understanding is that if we have both FAIR and EXT's DL servers reserving
> > execution time all the time, we'd be reserving execution time for something
> > which can't be active, so the only change necessary I think is just
> > retracting FAIR's or EXT's reservation whent we know they are not active
> > (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
> >
> Ah, I see what you mean. We already have a 'toggle' like that though because if
> FAIR or EXT is not running (due to whatever reason), we would have already
> called 'dl_server_stop()' or would never have called 'dl_server_start()'.
>
> On the other hand, even if full-sys-mode, we need the EXT server to boost it to
> above RT if EXT is running, so we need its server initialized and ready to go.
>
> Let me know if I missed anything though, thanks,
I'm not very familiar with DL but it looks like a stopped DL server would
still be reserving bandwidth which limits what other actual DL users would
be able to reserve without causing overflow. It looks like EXT's activation
modes should be calling into dl_bw_manage() so that FAIR's and EXT's
reservations can be retracted when not in use.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 22:48 ` Tejun Heo
@ 2025-03-18 10:07 ` Joel Fernandes
0 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-18 10:07 UTC (permalink / raw)
To: Tejun Heo
Cc: Peter Zijlstra, linux-kernel, Andrea Righi, Ingo Molnar,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Valentin Schneider, David Vernet,
Changwoo Min, Luigi De Matteis, paulmck, boqun.feng
On 3/17/2025 11:48 PM, Tejun Heo wrote:
> On Mon, Mar 17, 2025 at 11:39:32PM +0100, Joel Fernandes wrote:
>> On 3/17/2025 11:16 PM, Tejun Heo wrote:
>>> On Mon, Mar 17, 2025 at 10:48:16PM +0100, Joel Fernandes wrote:
>>> ...
>>>> Just to clarify, Tejun is suggesting that in mixed mode, we boost EXT
>>>> independent of FAIR. And in normal mode, we we boost both FAIR+EXT, because well
>>>> - nothing would be running as fair anyway.
>>>>
>>>> But what is the point of doing that, if we have boost EXT independent of FAIR
>>>> anyway? We need that code _anyway_ due to mixed mode so it would not simplify
>>>> anything.
>>>>
>>>> Or did Tejun mean something else about "toggle the reservations"?
>>> My understanding is that if we have both FAIR and EXT's DL servers reserving
>>> execution time all the time, we'd be reserving execution time for something
>>> which can't be active, so the only change necessary I think is just
>>> retracting FAIR's or EXT's reservation whent we know they are not active
>>> (ie. if EXT is not loaded or EXT is loaded in full-sys mode).
>>>
>> Ah, I see what you mean. We already have a 'toggle' like that though because if
>> FAIR or EXT is not running (due to whatever reason), we would have already
>> called 'dl_server_stop()' or would never have called 'dl_server_start()'.
>>
>> On the other hand, even if full-sys-mode, we need the EXT server to boost it to
>> above RT if EXT is running, so we need its server initialized and ready to go.
>>
>> Let me know if I missed anything though, thanks,
> I'm not very familiar with DL but it looks like a stopped DL server would
> still be reserving bandwidth which limits what other actual DL users would
> be able to reserve without causing overflow. It looks like EXT's activation
> modes should be calling into dl_bw_manage() so that FAIR's and EXT's
> reservations can be retracted when not in use.
Ah, you raise a good point. Sorry, you were on to something and that makes sense
to me. Let me see how to wire it up. Basically, when we switch to full-mode from
say partial, we could/should remove the bandwidth reservation of the servers. I
think I confused the concept of "server not running" to "server reserving
bandwidth". My bad!
thanks,
- Joel
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-17 10:31 ` Peter Zijlstra
2025-03-17 16:57 ` Tejun Heo
@ 2025-03-17 21:53 ` Joel Fernandes
1 sibling, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-17 21:53 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
Luigi De Matteis, paulmck, boqun.feng
Hello, Peter,
I replied to other parts of your email in another thread so I am just replying
to this part:
On 3/17/2025 11:31 AM, Peter Zijlstra wrote:
>
> Also; I gotta ask, why is nvidia looking at ext ?
There are some complex CPU topologies which perform poorly with the existing
FAIR scheduler as reported by people (I have not seen the data though so there's
that).
There are also workloads where it is beneficial to schedule on cores which have
the data in their cache and are submitting work to GPGPU, which makes the GPGPU
operations faster.
thanks,
- Joel
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-15 2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
2025-03-15 7:22 ` Peter Zijlstra
@ 2025-03-15 17:56 ` Andrea Righi
2025-03-15 23:17 ` Joel Fernandes
1 sibling, 1 reply; 23+ messages in thread
From: Andrea Righi @ 2025-03-15 17:56 UTC (permalink / raw)
To: Joel Fernandes
Cc: linux-kernel, Tejun Heo, Ingo Molnar, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
Luigi De Matteis, paulmck, boqun.feng
Hi Joel,
On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
...
> @@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
> BTF_ID_FLAGS(func, scx_bpf_now)
> BTF_KFUNCS_END(scx_kfunc_ids_any)
>
> +/*
> + * Check if ext scheduler has tasks ready to run.
> + */
> +static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
> +{
> + return !!dl_se->rq->scx.nr_running;
> +}
> +
> +/*
> + * Select the next task to run from the ext scheduling class.
> + */
> +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
> + void *flags)
> +{
> + struct rq_flags *rf = flags;
> +
It'd be nice to add a comment here to clarify that we need to call
balance_scx() before pick_task_scx(), so that we can trigger ops.dispatch()
and consume tasks that may be pending in the BPF scheduler's DSQs,
otherwise pick_task_scx() may not find any scx task to run, reducing the
effectiveness of the dl_server.
> + balance_scx(dl_se->rq, dl_se->rq->curr, rf);
> + return pick_task_scx(dl_se->rq, rf);
> +}
Thanks,
-Andrea
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks
2025-03-15 17:56 ` Andrea Righi
@ 2025-03-15 23:17 ` Joel Fernandes
0 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 23:17 UTC (permalink / raw)
To: Andrea Righi
Cc: linux-kernel, Tejun Heo, Ingo Molnar, Peter Zijlstra, Juri Lelli,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, Valentin Schneider, David Vernet, Changwoo Min,
Luigi De Matteis, paulmck, boqun.feng
On 3/15/2025 1:56 PM, Andrea Righi wrote:
> Hi Joel,
>
> On Fri, Mar 14, 2025 at 10:21:50PM -0400, Joel Fernandes wrote:
> ...
>> @@ -7793,6 +7819,38 @@ BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
>> BTF_ID_FLAGS(func, scx_bpf_now)
>> BTF_KFUNCS_END(scx_kfunc_ids_any)
>>
>> +/*
>> + * Check if ext scheduler has tasks ready to run.
>> + */
>> +static bool ext_server_has_tasks(struct sched_dl_entity *dl_se)
>> +{
>> + return !!dl_se->rq->scx.nr_running;
>> +}
>> +
>> +/*
>> + * Select the next task to run from the ext scheduling class.
>> + */
>> +static struct task_struct *ext_server_pick_task(struct sched_dl_entity *dl_se,
>> + void *flags)
>> +{
>> + struct rq_flags *rf = flags;
>> +
>
> It'd be nice to add a comment here to clarify that we need to call
> balance_scx() before pick_task_scx(), so that we can trigger ops.dispatch()
> and consume tasks that may be pending in the BPF scheduler's DSQs,
> otherwise pick_task_scx() may not find any scx task to run, reducing the
> effectiveness of the dl_server.
Thanks for pointing this out, I will add rationale for the balance as you mentioned.
- Joel
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH RFC 4/8] sched/debug: Fix updating of ppos on server write ops
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
` (2 preceding siblings ...)
2025-03-15 2:21 ` [PATCH RFC 3/8] sched/ext: Add a DL server for sched_ext tasks Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 5/8] sched/debug: Stop and start server based on if it was active Joel Fernandes
` (3 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
Joel Fernandes
Updating "ppos" on error conditions does not make much sense. The pattern
is to return the error code directly without modifying the position, or
modify the position on success and return the number of bytes written.
Since on success, the return value of apply is 0, there is no point in
modifying ppos either. Fix it by removing all this and just returning
error code or number of bytes written on success.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/debug.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ef047add7f9e..1ccc7687e1a8 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -348,8 +348,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
u64 runtime, period;
+ int retval = 0;
size_t err;
- int retval;
u64 value;
err = kstrtoull_from_user(ubuf, cnt, 10, &value);
@@ -385,8 +385,6 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
}
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
- if (retval)
- cnt = retval;
if (!runtime)
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
@@ -394,6 +392,9 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
if (rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
+
+ if (retval < 0)
+ return retval;
}
*ppos += cnt;
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH RFC 5/8] sched/debug: Stop and start server based on if it was active
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
` (3 preceding siblings ...)
2025-03-15 2:21 ` [PATCH RFC 4/8] sched/debug: Fix updating of ppos on server write ops Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 6/8] sched/debug: Add support to change sched_ext server params Joel Fernandes
` (2 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
Joel Fernandes
Currently the DL server interface for applying parameters checks
CFS-internals to identify if the server is active. This is error-prone
and makes it difficult to add new servers in the future.
Fix it, by using dl_server_active() which is also used by the DL server
code to determine if the DL server was started.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/debug.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1ccc7687e1a8..83cb695d6d46 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -347,6 +347,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
+ bool was_active = false;
u64 runtime, period;
int retval = 0;
size_t err;
@@ -379,7 +380,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_queued) {
+ if (dl_server_active(&rq->fair_server)) {
+ was_active = true;
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
@@ -390,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
- if (rq->cfs.h_nr_queued)
+ if (was_active)
dl_server_start(&rq->fair_server);
if (retval < 0)
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH RFC 6/8] sched/debug: Add support to change sched_ext server params
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
` (4 preceding siblings ...)
2025-03-15 2:21 ` [PATCH RFC 5/8] sched/debug: Stop and start server based on if it was active Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 7/8] sched/deadline: Clear defer params Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server Joel Fernandes
7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider
Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
Joel Fernandes
When a sched_ext server is loaded, tasks in CFS are converted to run in
sched_ext class. Modify the ext server parameters as well along with the
fair ones.
Re-use the existing interface to modify both ext and fair servers to
keep the number of interfaces small (as it is, we have a per-cpu interface).
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/debug.c | 91 ++++++++++++++++++++++++++------------------
1 file changed, 54 insertions(+), 37 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 83cb695d6d46..218b3e239128 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -339,17 +339,18 @@ enum dl_param {
DL_PERIOD,
};
-static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
-static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
-static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
+static ssize_t sched_dl_server_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos, enum dl_param param)
{
long cpu = (long) ((struct seq_file *) filp->private_data)->private;
struct rq *rq = cpu_rq(cpu);
- bool was_active = false;
+ bool was_active_fair = false;
+ bool was_active_ext = false;
+ int retval = 0, retval2 = 0;
u64 runtime, period;
- int retval = 0;
size_t err;
u64 value;
@@ -375,41 +376,57 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
}
if (runtime > period ||
- period > fair_server_period_max ||
- period < fair_server_period_min) {
+ period > dl_server_period_max ||
+ period < dl_server_period_min) {
return -EINVAL;
}
if (dl_server_active(&rq->fair_server)) {
- was_active = true;
+ was_active_fair = true;
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
+ if (dl_server_active(&rq->ext_server)) {
+ was_active_ext = true;
+ update_rq_clock(rq);
+ dl_server_stop(&rq->ext_server);
+ }
+
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
+ retval2 = dl_server_apply_params(&rq->ext_server, runtime, period, 0);
if (!runtime)
- printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
+ printk_deferred("Deadline servers are disabled on CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
- if (was_active)
+ if (was_active_fair)
dl_server_start(&rq->fair_server);
+ if (was_active_ext)
+ dl_server_start(&rq->ext_server);
+
if (retval < 0)
return retval;
+ if (retval2 < 0)
+ return retval2;
}
*ppos += cnt;
return cnt;
}
-static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+static size_t sched_dl_server_show(struct seq_file *m, void *v, enum dl_param param)
{
unsigned long cpu = (unsigned long) m->private;
struct rq *rq = cpu_rq(cpu);
u64 value;
switch (param) {
+ /*
+ * The params for fair server and ext server as set via debugfs
+ * are the same, so we can just use one of them
+ */
case DL_RUNTIME:
value = rq->fair_server.dl_runtime;
break;
@@ -424,50 +441,50 @@ static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param
}
static ssize_t
-sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
+sched_dl_server_runtime_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+ return sched_dl_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
}
-static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
+static int sched_dl_server_runtime_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_RUNTIME);
+ return sched_dl_server_show(m, v, DL_RUNTIME);
}
-static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
+static int sched_dl_server_runtime_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
+ return single_open(filp, sched_dl_server_runtime_show, inode->i_private);
}
-static const struct file_operations fair_server_runtime_fops = {
- .open = sched_fair_server_runtime_open,
- .write = sched_fair_server_runtime_write,
+static const struct file_operations dl_server_runtime_fops = {
+ .open = sched_dl_server_runtime_open,
+ .write = sched_dl_server_runtime_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static ssize_t
-sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+sched_dl_server_period_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+ return sched_dl_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
}
-static int sched_fair_server_period_show(struct seq_file *m, void *v)
+static int sched_dl_server_period_show(struct seq_file *m, void *v)
{
- return sched_fair_server_show(m, v, DL_PERIOD);
+ return sched_dl_server_show(m, v, DL_PERIOD);
}
-static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+static int sched_dl_server_period_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_fair_server_period_show, inode->i_private);
+ return single_open(filp, sched_dl_server_period_show, inode->i_private);
}
-static const struct file_operations fair_server_period_fops = {
- .open = sched_fair_server_period_open,
- .write = sched_fair_server_period_write,
+static const struct file_operations dl_server_period_fops = {
+ .open = sched_dl_server_period_open,
+ .write = sched_dl_server_period_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
@@ -475,13 +492,13 @@ static const struct file_operations fair_server_period_fops = {
static struct dentry *debugfs_sched;
-static void debugfs_fair_server_init(void)
+static void debugfs_dl_server_init(void)
{
- struct dentry *d_fair;
+ struct dentry *d_server;
unsigned long cpu;
- d_fair = debugfs_create_dir("fair_server", debugfs_sched);
- if (!d_fair)
+ d_server = debugfs_create_dir("dl_server", debugfs_sched);
+ if (!d_server)
return;
for_each_possible_cpu(cpu) {
@@ -489,10 +506,10 @@ static void debugfs_fair_server_init(void)
char buf[32];
snprintf(buf, sizeof(buf), "cpu%lu", cpu);
- d_cpu = debugfs_create_dir(buf, d_fair);
+ d_cpu = debugfs_create_dir(buf, d_server);
- debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
- debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &dl_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &dl_server_period_fops);
}
}
@@ -535,7 +552,7 @@ static __init int sched_init_debug(void)
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
- debugfs_fair_server_init();
+ debugfs_dl_server_init();
return 0;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH RFC 7/8] sched/deadline: Clear defer params
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
` (5 preceding siblings ...)
2025-03-15 2:21 ` [PATCH RFC 6/8] sched/debug: Add support to change sched_ext server params Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 2:21 ` [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server Joel Fernandes
7 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, Ingo Molnar,
Peter Zijlstra, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
Steven Rostedt, Ben Segall, Mel Gorman, Valentin Schneider,
Daniel Bristot de Oliveira
Cc: Luigi De Matteis, paulmck, boqun.feng, David Vernet,
Joel Fernandes
The defer params were not cleared in __dl_clear_params. Clear them.
Without this, some of my test cases are flaking and the DL timer is
not starting correctly AFAICS.
Fixes: a110a81c52a9 ("sched/deadline: Deferrable dl server")
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
---
kernel/sched/deadline.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index bcb66d9692ae..1a9c697a795c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3427,6 +3427,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
dl_se->dl_server = 0;
+ dl_se->dl_defer = 0;
+ dl_se->dl_defer_running = 0;
+ dl_se->dl_defer_armed = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server
2025-03-15 2:21 [PATCH RFC 0/8] Add a deadline server for sched_ext tasks Joel Fernandes
` (6 preceding siblings ...)
2025-03-15 2:21 ` [PATCH RFC 7/8] sched/deadline: Clear defer params Joel Fernandes
@ 2025-03-15 2:21 ` Joel Fernandes
2025-03-15 23:22 ` Joel Fernandes
7 siblings, 1 reply; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 2:21 UTC (permalink / raw)
To: linux-kernel, Andrea Righi, Tejun Heo, David Vernet, Changwoo Min,
Shuah Khan
Cc: Luigi De Matteis, paulmck, boqun.feng, Joel Fernandes,
linux-kselftest, bpf
From: Andrea Righi <arighi@nvidia.com>
Add a selftest to validate the correct behavior of the deadline server
for the ext_sched_class.
[ Joel: Replaced occurrences of CFS in the test with EXT. ]
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
tools/testing/selftests/sched_ext/Makefile | 1 +
.../selftests/sched_ext/rt_stall.bpf.c | 23 ++
tools/testing/selftests/sched_ext/rt_stall.c | 213 ++++++++++++++++++
3 files changed, 237 insertions(+)
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 011762224600..802e3d8d038f 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -180,6 +180,7 @@ auto-test-targets := \
select_cpu_dispatch_bad_dsq \
select_cpu_dispatch_dbl_dsp \
select_cpu_vtime \
+ rt_stall \
test_example \
testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
new file mode 100644
index 000000000000..80086779dd1e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that verifies whether RT tasks can stall SCHED_EXT tasks.
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops rt_stall_ops = {
+ .exit = (void *)rt_stall_exit,
+ .name = "rt_stall",
+};
diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
new file mode 100644
index 000000000000..d4cb545ebfd8
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <linux/sched.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "rt_stall.bpf.skel.h"
+#include "scx_test.h"
+#include "../kselftest.h"
+
+#define CORE_ID 0 /* CPU to pin tasks to */
+#define RUN_TIME 5 /* How long to run the test in seconds */
+
+/* Simple busy-wait function for test tasks */
+static void process_func(void)
+{
+ while (1) {
+ /* Busy wait */
+ for (volatile unsigned long i = 0; i < 10000000UL; i++);
+ }
+}
+
+/* Set CPU affinity to a specific core */
+static void set_affinity(int cpu)
+{
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+ perror("sched_setaffinity");
+ exit(EXIT_FAILURE);
+ }
+}
+
+/* Set task scheduling policy and priority */
+static void set_sched(int policy, int priority)
+{
+ struct sched_param param;
+
+ param.sched_priority = priority;
+ if (sched_setscheduler(0, policy, ¶m) != 0) {
+ perror("sched_setscheduler");
+ exit(EXIT_FAILURE);
+ }
+}
+
+/* Get process runtime from /proc/<pid>/stat */
+static float get_process_runtime(int pid)
+{
+ char path[256];
+ FILE *file;
+ long utime, stime;
+ int fields;
+
+ snprintf(path, sizeof(path), "/proc/%d/stat", pid);
+ file = fopen(path, "r");
+ if (file == NULL) {
+ perror("Failed to open stat file");
+ return -1;
+ }
+
+ /* Skip the first 13 fields and read the 14th and 15th */
+ fields = fscanf(file,
+ "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
+ &utime, &stime);
+ fclose(file);
+
+ if (fields != 2) {
+ fprintf(stderr, "Failed to read stat file\n");
+ return -1;
+ }
+
+ /* Calculate the total time spent in the process */
+ long total_time = utime + stime;
+ long ticks_per_second = sysconf(_SC_CLK_TCK);
+ float runtime_seconds = total_time * 1.0 / ticks_per_second;
+
+ return runtime_seconds;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct rt_stall *skel;
+
+ skel = rt_stall__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static bool sched_stress_test(void)
+{
+ float cfs_runtime, rt_runtime;
+ int cfs_pid, rt_pid;
+ float expected_min_ratio = 0.04; /* 4% */
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ /* Create and set up a EXT task */
+ cfs_pid = fork();
+ if (cfs_pid == 0) {
+ set_affinity(CORE_ID);
+ process_func();
+ exit(0);
+ } else if (cfs_pid < 0) {
+ perror("fork for EXT task");
+ ksft_exit_fail();
+ }
+
+ /* Create an RT task */
+ rt_pid = fork();
+ if (rt_pid == 0) {
+ set_affinity(CORE_ID);
+ set_sched(SCHED_FIFO, 50);
+ process_func();
+ exit(0);
+ } else if (rt_pid < 0) {
+ perror("fork for RT task");
+ ksft_exit_fail();
+ }
+
+ /* Let the processes run for the specified time */
+ sleep(RUN_TIME);
+
+ /* Get runtime for the EXT task */
+ cfs_runtime = get_process_runtime(cfs_pid);
+ if (cfs_runtime != -1)
+ ksft_print_msg("Runtime of EXT task (PID %d) is %f seconds\n", cfs_pid, cfs_runtime);
+ else
+ ksft_exit_fail_msg("Error getting runtime for EXT task (PID %d)\n", cfs_pid);
+
+ /* Get runtime for the RT task */
+ rt_runtime = get_process_runtime(rt_pid);
+ if (rt_runtime != -1)
+ ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime);
+ else
+ ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid);
+
+ /* Kill the processes */
+ kill(cfs_pid, SIGKILL);
+ kill(rt_pid, SIGKILL);
+ waitpid(cfs_pid, NULL, 0);
+ waitpid(rt_pid, NULL, 0);
+
+ /* Verify that the scx task got enough runtime */
+ float actual_ratio = cfs_runtime / (cfs_runtime + rt_runtime);
+ ksft_print_msg("EXT task got %.2f%% of total runtime\n", actual_ratio * 100);
+
+ if (actual_ratio >= expected_min_ratio) {
+ ksft_test_result_pass("PASS: EXT task got more than %.2f%% of runtime\n",
+ expected_min_ratio * 100);
+ return true;
+ } else {
+ ksft_test_result_fail("FAIL: EXT task got less than %.2f%% of runtime\n",
+ expected_min_ratio * 100);
+ return false;
+ }
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct rt_stall *skel = ctx;
+ struct bpf_link *link;
+ bool res;
+
+ link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops);
+ SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+ res = sched_stress_test();
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+ bpf_link__destroy(link);
+
+ if (!res)
+ ksft_exit_fail();
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct rt_stall *skel = ctx;
+
+ rt_stall__destroy(skel);
+}
+
+struct scx_test rt_stall = {
+ .name = "rt_stall",
+ .description = "Verify that RT tasks cannot stall SCHED_EXT tasks",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&rt_stall)
--
2.43.0
^ permalink raw reply related [flat|nested] 23+ messages in thread* Re: [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server
2025-03-15 2:21 ` [PATCH RFC 8/8] selftests/sched_ext: Add test for sched_ext dl_server Joel Fernandes
@ 2025-03-15 23:22 ` Joel Fernandes
0 siblings, 0 replies; 23+ messages in thread
From: Joel Fernandes @ 2025-03-15 23:22 UTC (permalink / raw)
To: linux-kernel, Andrea Righi
This patch triggered a build robot warning, adding to thread for tracking the issue:
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git
sched/scx-dlserver-boost
branch HEAD: d4faa4aa813acff703cea3c301bb556f69b5210c TODO_BEFORE_SUBMIT
Warning (recently discovered and may have been fixed):
https://lore.kernel.org/oe-kbuild-all/202503152220.gAuzob4W-lkp@intel.com
tools/testing/selftests/sched_ext/rt_stall.c: sys/wait.h is included more
than once.
Warning ids grouped by kconfigs:
recent_errors
`-- x86_64-allnoconfig
`--
tools-testing-selftests-sched_ext-rt_stall.c:sys-wait.h-is-included-more-than-once.
elapsed time: 1448m
configs tested: 93
configs skipped: 1
tested configs:
alpha allnoconfig gcc-14.2.0
alpha allyesconfig gcc-14.2.0
arc allmodconfig gcc-13.2.0
arc allnoconfig gcc-13.2.0
arc allyesconfig gcc-13.2.0
arc randconfig-001-20250315 gcc-13.2.0
arc randconfig-002-20250315 gcc-13.2.0
arm allmodconfig gcc-14.2.0
arm allnoconfig clang-17
arm allyesconfig gcc-14.2.0
arm mmp2_defconfig gcc-14.2.0
arm mvebu_v7_defconfig clang-15
arm randconfig-001-20250315 gcc-14.2.0
arm randconfig-002-20250315 clang-21
arm randconfig-003-20250315 clang-21
arm randconfig-004-20250315 gcc-14.2.0
arm spear3xx_defconfig clang-16
arm64 allmodconfig clang-18
arm64 randconfig-001-20250315 gcc-14.2.0
arm64 randconfig-002-20250315 gcc-14.2.0
arm64 randconfig-003-20250315 clang-16
arm64 randconfig-004-20250315 gcc-14.2.0
csky randconfig-001-20250315 gcc-14.2.0
csky randconfig-002-20250315 gcc-14.2.0
hexagon allmodconfig clang-21
hexagon allyesconfig clang-18
hexagon randconfig-001-20250315 clang-21
hexagon randconfig-002-20250315 clang-17
i386 allmodconfig gcc-12
i386 allnoconfig gcc-12
i386 buildonly-randconfig-001-20250315 gcc-12
i386 buildonly-randconfig-002-20250315 clang-19
i386 buildonly-randconfig-003-20250315 clang-19
i386 buildonly-randconfig-004-20250315 clang-19
i386 buildonly-randconfig-005-20250315 gcc-11
i386 buildonly-randconfig-006-20250315 gcc-12
i386 defconfig clang-19
loongarch randconfig-001-20250315 gcc-14.2.0
loongarch randconfig-002-20250315 gcc-14.2.0
m68k allnoconfig gcc-14.2.0
m68k allyesconfig gcc-14.2.0
microblaze allnoconfig gcc-14.2.0
mips allnoconfig gcc-14.2.0
nios2 allnoconfig gcc-14.2.0
nios2 randconfig-001-20250315 gcc-14.2.0
nios2 randconfig-002-20250315 gcc-14.2.0
openrisc allnoconfig gcc-14.2.0
parisc allnoconfig gcc-14.2.0
parisc randconfig-001-20250315 gcc-14.2.0
parisc randconfig-002-20250315 gcc-14.2.0
powerpc allnoconfig gcc-14.2.0
powerpc randconfig-001-20250315 clang-21
powerpc randconfig-002-20250315 gcc-14.2.0
powerpc randconfig-003-20250315 clang-18
powerpc64 randconfig-001-20250315 gcc-14.2.0
powerpc64 randconfig-002-20250315 clang-18
powerpc64 randconfig-003-20250315 gcc-14.2.0
riscv allnoconfig gcc-14.2.0
riscv randconfig-001-20250315 gcc-14.2.0
riscv randconfig-002-20250315 gcc-14.2.0
s390 allmodconfig clang-19
s390 allnoconfig clang-15
s390 allyesconfig gcc-14.2.0
s390 randconfig-001-20250315 clang-19
s390 randconfig-002-20250315 gcc-14.2.0
sh allmodconfig gcc-14.2.0
sh allnoconfig gcc-14.2.0
sh allyesconfig gcc-14.2.0
sh randconfig-001-20250315 gcc-14.2.0
sh randconfig-002-20250315 gcc-14.2.0
sparc allmodconfig gcc-14.2.0
sparc allnoconfig gcc-14.2.0
sparc randconfig-001-20250315 gcc-14.2.0
sparc randconfig-002-20250315 gcc-14.2.0
sparc64 randconfig-001-20250315 gcc-14.2.0
sparc64 randconfig-002-20250315 gcc-14.2.0
um allmodconfig clang-21
um allnoconfig clang-18
um allyesconfig gcc-12
um randconfig-001-20250315 gcc-12
um randconfig-002-20250315 clang-18
x86_64 allnoconfig clang-19
x86_64 allyesconfig clang-19
x86_64 buildonly-randconfig-001-20250315 gcc-12
x86_64 buildonly-randconfig-002-20250315 clang-19
x86_64 buildonly-randconfig-003-20250315 clang-19
x86_64 buildonly-randconfig-004-20250315 clang-19
x86_64 buildonly-randconfig-005-20250315 clang-19
x86_64 buildonly-randconfig-006-20250315 gcc-12
x86_64 defconfig gcc-11
xtensa allnoconfig gcc-14.2.0
xtensa randconfig-001-20250315 gcc-14.2.0
xtensa randconfig-002-20250315 gcc-14.2.0
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 3/14/2025 10:21 PM, Joel Fernandes wrote:
> From: Andrea Righi <arighi@nvidia.com>
>
> Add a selftest to validate the correct behavior of the deadline server
> for the ext_sched_class.
>
> + * [ Joel: Replaced occurrences of CFS in the test with EXT. ]
>
> Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
> tools/testing/selftests/sched_ext/Makefile | 1 +
> .../selftests/sched_ext/rt_stall.bpf.c | 23 ++
> tools/testing/selftests/sched_ext/rt_stall.c | 213 ++++++++++++++++++
> 3 files changed, 237 insertions(+)
> create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
> create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c
>
> diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
> index 011762224600..802e3d8d038f 100644
> --- a/tools/testing/selftests/sched_ext/Makefile
> +++ b/tools/testing/selftests/sched_ext/Makefile
> @@ -180,6 +180,7 @@ auto-test-targets := \
> select_cpu_dispatch_bad_dsq \
> select_cpu_dispatch_dbl_dsp \
> select_cpu_vtime \
> + rt_stall \
> test_example \
>
> testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
> diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
> new file mode 100644
> index 000000000000..80086779dd1e
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
> @@ -0,0 +1,23 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * A scheduler that verifies whether RT tasks can stall SCHED_EXT tasks.
> + *
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +
> +#include <scx/common.bpf.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +UEI_DEFINE(uei);
> +
> +void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
> +{
> + UEI_RECORD(uei, ei);
> +}
> +
> +SEC(".struct_ops.link")
> +struct sched_ext_ops rt_stall_ops = {
> + .exit = (void *)rt_stall_exit,
> + .name = "rt_stall",
> +};
> diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
> new file mode 100644
> index 000000000000..d4cb545ebfd8
> --- /dev/null
> +++ b/tools/testing/selftests/sched_ext/rt_stall.c
> @@ -0,0 +1,213 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2025 NVIDIA Corporation.
> + */
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sched.h>
> +#include <sys/prctl.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +#include <time.h>
> +#include <linux/sched.h>
> +#include <signal.h>
> +#include <bpf/bpf.h>
> +#include <scx/common.h>
> +#include <sys/wait.h>
> +#include <unistd.h>
> +#include "rt_stall.bpf.skel.h"
> +#include "scx_test.h"
> +#include "../kselftest.h"
> +
> +#define CORE_ID 0 /* CPU to pin tasks to */
> +#define RUN_TIME 5 /* How long to run the test in seconds */
> +
> +/* Simple busy-wait function for test tasks */
> +static void process_func(void)
> +{
> + while (1) {
> + /* Busy wait */
> + for (volatile unsigned long i = 0; i < 10000000UL; i++);
> + }
> +}
> +
> +/* Set CPU affinity to a specific core */
> +static void set_affinity(int cpu)
> +{
> + cpu_set_t mask;
> +
> + CPU_ZERO(&mask);
> + CPU_SET(cpu, &mask);
> + if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
> + perror("sched_setaffinity");
> + exit(EXIT_FAILURE);
> + }
> +}
> +
> +/* Set task scheduling policy and priority */
> +static void set_sched(int policy, int priority)
> +{
> + struct sched_param param;
> +
> + param.sched_priority = priority;
> + if (sched_setscheduler(0, policy, ¶m) != 0) {
> + perror("sched_setscheduler");
> + exit(EXIT_FAILURE);
> + }
> +}
> +
> +/* Get process runtime from /proc/<pid>/stat */
> +static float get_process_runtime(int pid)
> +{
> + char path[256];
> + FILE *file;
> + long utime, stime;
> + int fields;
> +
> + snprintf(path, sizeof(path), "/proc/%d/stat", pid);
> + file = fopen(path, "r");
> + if (file == NULL) {
> + perror("Failed to open stat file");
> + return -1;
> + }
> +
> + /* Skip the first 13 fields and read the 14th and 15th */
> + fields = fscanf(file,
> + "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
> + &utime, &stime);
> + fclose(file);
> +
> + if (fields != 2) {
> + fprintf(stderr, "Failed to read stat file\n");
> + return -1;
> + }
> +
> + /* Calculate the total time spent in the process */
> + long total_time = utime + stime;
> + long ticks_per_second = sysconf(_SC_CLK_TCK);
> + float runtime_seconds = total_time * 1.0 / ticks_per_second;
> +
> + return runtime_seconds;
> +}
> +
> +static enum scx_test_status setup(void **ctx)
> +{
> + struct rt_stall *skel;
> +
> + skel = rt_stall__open();
> + SCX_FAIL_IF(!skel, "Failed to open");
> + SCX_ENUM_INIT(skel);
> + SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel");
> +
> + *ctx = skel;
> +
> + return SCX_TEST_PASS;
> +}
> +
> +static bool sched_stress_test(void)
> +{
> + float cfs_runtime, rt_runtime;
> + int cfs_pid, rt_pid;
> + float expected_min_ratio = 0.04; /* 4% */
> +
> + ksft_print_header();
> + ksft_set_plan(1);
> +
> + /* Create and set up a EXT task */
> + cfs_pid = fork();
> + if (cfs_pid == 0) {
> + set_affinity(CORE_ID);
> + process_func();
> + exit(0);
> + } else if (cfs_pid < 0) {
> + perror("fork for EXT task");
> + ksft_exit_fail();
> + }
> +
> + /* Create an RT task */
> + rt_pid = fork();
> + if (rt_pid == 0) {
> + set_affinity(CORE_ID);
> + set_sched(SCHED_FIFO, 50);
> + process_func();
> + exit(0);
> + } else if (rt_pid < 0) {
> + perror("fork for RT task");
> + ksft_exit_fail();
> + }
> +
> + /* Let the processes run for the specified time */
> + sleep(RUN_TIME);
> +
> + /* Get runtime for the EXT task */
> + cfs_runtime = get_process_runtime(cfs_pid);
> + if (cfs_runtime != -1)
> + ksft_print_msg("Runtime of EXT task (PID %d) is %f seconds\n", cfs_pid, cfs_runtime);
> + else
> + ksft_exit_fail_msg("Error getting runtime for EXT task (PID %d)\n", cfs_pid);
> +
> + /* Get runtime for the RT task */
> + rt_runtime = get_process_runtime(rt_pid);
> + if (rt_runtime != -1)
> + ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime);
> + else
> + ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid);
> +
> + /* Kill the processes */
> + kill(cfs_pid, SIGKILL);
> + kill(rt_pid, SIGKILL);
> + waitpid(cfs_pid, NULL, 0);
> + waitpid(rt_pid, NULL, 0);
> +
> + /* Verify that the scx task got enough runtime */
> + float actual_ratio = cfs_runtime / (cfs_runtime + rt_runtime);
> + ksft_print_msg("EXT task got %.2f%% of total runtime\n", actual_ratio * 100);
> +
> + if (actual_ratio >= expected_min_ratio) {
> + ksft_test_result_pass("PASS: EXT task got more than %.2f%% of runtime\n",
> + expected_min_ratio * 100);
> + return true;
> + } else {
> + ksft_test_result_fail("FAIL: EXT task got less than %.2f%% of runtime\n",
> + expected_min_ratio * 100);
> + return false;
> + }
> +}
> +
> +static enum scx_test_status run(void *ctx)
> +{
> + struct rt_stall *skel = ctx;
> + struct bpf_link *link;
> + bool res;
> +
> + link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops);
> + SCX_FAIL_IF(!link, "Failed to attach scheduler");
> +
> + res = sched_stress_test();
> +
> + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
> + bpf_link__destroy(link);
> +
> + if (!res)
> + ksft_exit_fail();
> +
> + return SCX_TEST_PASS;
> +}
> +
> +static void cleanup(void *ctx)
> +{
> + struct rt_stall *skel = ctx;
> +
> + rt_stall__destroy(skel);
> +}
> +
> +struct scx_test rt_stall = {
> + .name = "rt_stall",
> + .description = "Verify that RT tasks cannot stall SCHED_EXT tasks",
> + .setup = setup,
> + .run = run,
> + .cleanup = cleanup,
> +};
> +REGISTER_SCX_TEST(&rt_stall)
^ permalink raw reply [flat|nested] 23+ messages in thread