* [PATCH 01/12] sched_ext: Introduce scx_sched
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP() Tejun Heo
` (10 subsequent siblings)
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
To support multiple scheduler instances, collect some of the global
variables that should be specific to a scheduler instance into the new
struct scx_sched. scx_root points to the root scheduler instance, which is
currently a static instance of struct scx_sched. Except for an extra
dereference through
the scx_root pointer, this patch makes no functional changes.
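For orientation before the full diff, the change boils down to the following
shape (condensed from the hunks below; most fields and call sites omitted):

    struct scx_sched {
            struct sched_ext_ops    ops;
            DECLARE_BITMAP(has_op, SCX_OPI_END);
            /* warned_zero_slice, exit_kind, exit_info, kobj, ... */
    };

    static struct scx_sched __scx_root = {
            .exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
    };

    static struct scx_sched *scx_root = &__scx_root;

    /* op-presence tests now take the instance explicitly */
    #define SCX_HAS_OP(sch, op)     test_bit(SCX_OP_IDX(op), (sch)->has_op)

Call sites are converted mechanically, e.g. SCX_HAS_OP(enqueue) becomes
SCX_HAS_OP(scx_root, enqueue) and scx_ops.flags becomes scx_root->ops.flags.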
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 243 +++++++++++++++++++++-------------------
kernel/sched/ext_idle.c | 3 +-
2 files changed, 131 insertions(+), 115 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6e530a91e944..975f6963a01b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -763,6 +763,18 @@ enum scx_opi {
SCX_OPI_END = SCX_OP_IDX(init),
};
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ bool warned_zero_slice;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject *kobj;
+};
+
enum scx_wake_flags {
/* expose select WF_* flags as enums */
SCX_WAKE_FORK = WF_FORK,
@@ -921,6 +933,12 @@ enum scx_ops_state {
#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
+static struct scx_sched __scx_root = {
+ .exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
+};
+
+static struct scx_sched *scx_root = &__scx_root;
+
/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
@@ -943,14 +961,6 @@ static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
-static struct sched_ext_ops scx_ops;
-static bool scx_warned_zero_slice;
-
-static DECLARE_BITMAP(scx_has_op, SCX_OPI_END);
-
-static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
-static struct scx_exit_info *scx_exit_info;
-
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -1053,7 +1063,6 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
-static struct kobject *scx_root_kobj;
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
@@ -1072,7 +1081,7 @@ static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
#define scx_error(fmt, args...) \
__scx_error(SCX_EXIT_ERROR, fmt, ##args)
-#define SCX_HAS_OP(op) test_bit(SCX_OP_IDX(op), scx_has_op)
+#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
{
@@ -1168,25 +1177,25 @@ do { \
update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- scx_ops.op(args); \
+ scx_root->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- scx_ops.op(args); \
+ scx_root->ops.op(args); \
} \
update_locked_rq(NULL); \
} while (0)
#define SCX_CALL_OP_RET(mask, op, rq, args...) \
({ \
- __typeof__(scx_ops.op(args)) __ret; \
+ __typeof__(scx_root->ops.op(args)) __ret; \
\
update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- __ret = scx_ops.op(args); \
+ __ret = scx_root->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- __ret = scx_ops.op(args); \
+ __ret = scx_root->ops.op(args); \
} \
update_locked_rq(NULL); \
__ret; \
@@ -1213,7 +1222,7 @@ do { \
#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...) \
({ \
- __typeof__(scx_ops.op(task, ##args)) __ret; \
+ __typeof__(scx_root->ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
__ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args); \
@@ -1223,7 +1232,7 @@ do { \
#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...) \
({ \
- __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
+ __typeof__(scx_root->ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
@@ -1825,7 +1834,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
lockdep_assert_rq_held(rq);
#ifdef CONFIG_SCHED_CORE
- if (unlikely(SCX_HAS_OP(core_sched_before)))
+ if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
touch_core_sched(rq, p);
#endif
}
@@ -2200,20 +2209,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
- if (!(scx_ops.flags & SCX_OPS_ENQ_EXITING) &&
+ if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
unlikely(p->flags & PF_EXITING)) {
__scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
}
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
- if (!(scx_ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
+ if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
is_migration_disabled(p)) {
__scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
}
- if (unlikely(!SCX_HAS_OP(enqueue)))
+ if (unlikely(!SCX_HAS_OP(scx_root, enqueue)))
goto global;
/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
@@ -2320,7 +2329,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
rq->scx.nr_running++;
add_nr_running(rq, 1);
- if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
+ if (SCX_HAS_OP(scx_root, runnable) && !task_on_rq_migrating(p))
SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
@@ -2355,7 +2364,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- if (SCX_HAS_OP(dequeue))
+ if (SCX_HAS_OP(scx_root, dequeue))
SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
@@ -2403,12 +2412,12 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
* information meaningful to the BPF scheduler and can be suppressed by
* skipping the callbacks if the task is !QUEUED.
*/
- if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+ if (SCX_HAS_OP(scx_root, stopping) && task_current(rq, p)) {
update_curr_scx(rq);
SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false);
}
- if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
+ if (SCX_HAS_OP(scx_root, quiescent) && !task_on_rq_migrating(p))
SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
@@ -2428,7 +2437,7 @@ static void yield_task_scx(struct rq *rq)
{
struct task_struct *p = rq->curr;
- if (SCX_HAS_OP(yield))
+ if (SCX_HAS_OP(scx_root, yield))
SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL);
else
p->scx.slice = 0;
@@ -2438,7 +2447,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
struct task_struct *from = rq->curr;
- if (SCX_HAS_OP(yield))
+ if (SCX_HAS_OP(scx_root, yield))
return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to);
else
return false;
@@ -2988,7 +2997,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
rq->scx.flags |= SCX_RQ_IN_BALANCE;
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
- if ((scx_ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
+ if ((scx_root->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
/*
* If the previous sched_class for the current CPU was not SCX,
@@ -2996,7 +3005,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* core. This callback complements ->cpu_release(), which is
* emitted in switch_class().
*/
- if (SCX_HAS_OP(cpu_acquire))
+ if (SCX_HAS_OP(scx_root, cpu_acquire))
SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -3027,7 +3036,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (consume_global_dsq(rq))
goto has_tasks;
- if (unlikely(!SCX_HAS_OP(dispatch)) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (unlikely(!SCX_HAS_OP(scx_root, dispatch)) ||
+ scx_rq_bypassing(rq) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -3077,7 +3087,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* %SCX_OPS_ENQ_LAST is in effect.
*/
if (prev_on_rq &&
- (!(scx_ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
+ (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
__scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
@@ -3163,7 +3173,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
p->se.exec_start = rq_clock_task(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
+ if (SCX_HAS_OP(scx_root, running) && (p->scx.flags & SCX_TASK_QUEUED))
SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
@@ -3217,7 +3227,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
#endif
- if (!(scx_ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
+ if (!(scx_root->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
/*
@@ -3239,7 +3249,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
* next time that balance_scx() is invoked.
*/
if (!rq->scx.cpu_released) {
- if (SCX_HAS_OP(cpu_release)) {
+ if (SCX_HAS_OP(scx_root, cpu_release)) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
@@ -3257,7 +3267,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+ if (SCX_HAS_OP(scx_root, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
@@ -3281,7 +3291,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* which should trigger an explicit follow-up scheduling event.
*/
if (sched_class_above(&ext_sched_class, next->sched_class)) {
- WARN_ON_ONCE(!(scx_ops.flags & SCX_OPS_ENQ_LAST));
+ WARN_ON_ONCE(!(scx_root->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
do_enqueue_task(rq, p, 0, -1);
@@ -3356,10 +3366,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
}
if (unlikely(!p->scx.slice)) {
- if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ if (!scx_rq_bypassing(rq) && !scx_root->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
- scx_warned_zero_slice = true;
+ scx_root->warned_zero_slice = true;
}
refill_task_slice_dfl(p);
}
@@ -3395,7 +3405,8 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
* calling ops.core_sched_before(). Accesses are controlled by the
* verifier.
*/
- if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
+ if (SCX_HAS_OP(scx_root, core_sched_before) &&
+ !scx_rq_bypassing(task_rq(a)))
return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL,
(struct task_struct *)a,
(struct task_struct *)b);
@@ -3424,7 +3435,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
return prev_cpu;
rq_bypass = scx_rq_bypassing(task_rq(p));
- if (likely(SCX_HAS_OP(select_cpu)) && !rq_bypass) {
+ if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3476,7 +3487,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* Fine-grained memory write control is enforced by BPF making the const
* designation pointless. Cast it away when calling the operation.
*/
- if (SCX_HAS_OP(set_cpumask))
+ if (SCX_HAS_OP(scx_root, set_cpumask))
SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL,
p, (struct cpumask *)p->cpus_ptr);
}
@@ -3488,11 +3499,11 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
if (scx_enabled())
- scx_idle_update_selcpu_topology(&scx_ops);
+ scx_idle_update_selcpu_topology(&scx_root->ops);
- if (online && SCX_HAS_OP(cpu_online))
+ if (online && SCX_HAS_OP(scx_root, cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, rq, cpu);
- else if (!online && SCX_HAS_OP(cpu_offline))
+ else if (!online && SCX_HAS_OP(scx_root, cpu_offline))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, rq, cpu);
else
scx_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
@@ -3595,7 +3606,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
if (scx_rq_bypassing(rq)) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
- } else if (SCX_HAS_OP(tick)) {
+ } else if (SCX_HAS_OP(scx_root, tick)) {
SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr);
}
@@ -3667,7 +3678,7 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
p->scx.disallow = false;
- if (SCX_HAS_OP(init_task)) {
+ if (SCX_HAS_OP(scx_root, init_task)) {
struct scx_init_task_args args = {
SCX_INIT_TASK_ARGS_CGROUP(tg)
.fork = fork,
@@ -3730,11 +3741,11 @@ static void scx_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
- if (SCX_HAS_OP(enable))
+ if (SCX_HAS_OP(scx_root, enable))
SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
- if (SCX_HAS_OP(set_weight))
+ if (SCX_HAS_OP(scx_root, set_weight))
SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
}
@@ -3745,7 +3756,7 @@ static void scx_disable_task(struct task_struct *p)
lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
- if (SCX_HAS_OP(disable))
+ if (SCX_HAS_OP(scx_root, disable))
SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -3774,7 +3785,7 @@ static void scx_exit_task(struct task_struct *p)
return;
}
- if (SCX_HAS_OP(exit_task))
+ if (SCX_HAS_OP(scx_root, exit_task))
SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3883,7 +3894,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
lockdep_assert_rq_held(task_rq(p));
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
- if (SCX_HAS_OP(set_weight))
+ if (SCX_HAS_OP(scx_root, set_weight))
SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
}
@@ -3899,7 +3910,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
* set_cpus_allowed_scx() is not called while @p is associated with a
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
- if (SCX_HAS_OP(set_cpumask))
+ if (SCX_HAS_OP(scx_root, set_cpumask))
SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq,
p, (struct cpumask *)p->cpus_ptr);
}
@@ -3958,7 +3969,7 @@ int scx_tg_online(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled) {
- if (SCX_HAS_OP(cgroup_init)) {
+ if (SCX_HAS_OP(scx_root, cgroup_init)) {
struct scx_cgroup_init_args args =
{ .weight = tg->scx_weight };
@@ -3983,7 +3994,7 @@ void scx_tg_offline(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
- if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
+ if (SCX_HAS_OP(scx_root, cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
@@ -4016,7 +4027,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
if (from == to)
continue;
- if (SCX_HAS_OP(cgroup_prep_move)) {
+ if (SCX_HAS_OP(scx_root, cgroup_prep_move)) {
ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
@@ -4030,7 +4041,8 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ if (SCX_HAS_OP(scx_root, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
@@ -4049,7 +4061,8 @@ void scx_cgroup_move_task(struct task_struct *p)
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
- if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ if (SCX_HAS_OP(scx_root, cgroup_move) &&
+ !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL,
p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
@@ -4069,7 +4082,8 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
goto out_unlock;
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ if (SCX_HAS_OP(scx_root, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
@@ -4083,7 +4097,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && tg->scx_weight != weight) {
- if (SCX_HAS_OP(cgroup_set_weight))
+ if (SCX_HAS_OP(scx_root, cgroup_set_weight))
SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
tg->scx_weight = weight;
@@ -4266,7 +4280,7 @@ static void scx_cgroup_exit(void)
continue;
tg->scx_flags &= ~SCX_TG_INITED;
- if (!scx_ops.cgroup_exit)
+ if (!scx_root->ops.cgroup_exit)
continue;
if (WARN_ON_ONCE(!css_tryget(css)))
@@ -4301,7 +4315,7 @@ static int scx_cgroup_init(void)
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
- if (!scx_ops.cgroup_init) {
+ if (!scx_root->ops.cgroup_init) {
tg->scx_flags |= SCX_TG_INITED;
continue;
}
@@ -4402,7 +4416,7 @@ static void scx_kobj_release(struct kobject *kobj)
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_ops.name);
+ return sysfs_emit(buf, "%s\n", scx_root->ops.name);
}
SCX_ATTR(ops);
@@ -4445,7 +4459,7 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+ return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4468,7 +4482,7 @@ bool task_should_scx(int policy)
bool scx_allow_ttwu_queue(const struct task_struct *p)
{
return !scx_enabled() ||
- (scx_ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
+ (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
p->sched_class != &ext_sched_class;
}
@@ -4497,7 +4511,7 @@ void scx_softlockup(u32 dur_s)
return;
printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_ops.name);
+ smp_processor_id(), dur_s, scx_root->ops.name);
/*
* Some CPUs may be trapped in the dispatch paths. Enable breather
@@ -4686,14 +4700,14 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_exit_info *ei = scx_root->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
int kind, cpu;
- kind = atomic_read(&scx_exit_kind);
+ kind = atomic_read(&scx_root->exit_kind);
while (true) {
/*
* NONE indicates that a new scx_ops has been registered since
@@ -4702,7 +4716,7 @@ static void scx_disable_workfn(struct kthread_work *work)
*/
if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
return;
- if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+ if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
ei->kind = kind;
@@ -4717,7 +4731,7 @@ static void scx_disable_workfn(struct kthread_work *work)
break;
case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
- scx_exit_info->msg);
+ scx_root->exit_info->msg);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
@@ -4784,25 +4798,26 @@ static void scx_disable_workfn(struct kthread_work *work)
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_enabled);
- bitmap_zero(scx_has_op, SCX_OPI_END);
+ bitmap_zero(scx_root->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ scx_root->ops.name, ei->reason);
if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+ pr_err("sched_ext: %s: %s\n",
+ scx_root->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
} else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ scx_root->ops.name, ei->reason);
}
- if (scx_ops.exit)
+ if (scx_root->ops.exit)
SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
@@ -4813,11 +4828,11 @@ static void scx_disable_workfn(struct kthread_work *work)
* asynchronously, sysfs could observe an object of the same name still
* in the hierarchy when another scheduler is loaded.
*/
- kobject_del(scx_root_kobj);
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
+ kobject_del(scx_root->kobj);
+ kobject_put(scx_root->kobj);
+ scx_root->kobj = NULL;
- memset(&scx_ops, 0, sizeof(scx_ops));
+ memset(&scx_root->ops, 0, sizeof(scx_root->ops));
rhashtable_walk_enter(&dsq_hash, &rht_iter);
do {
@@ -4834,8 +4849,8 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
+ free_exit_info(scx_root->exit_info);
+ scx_root->exit_info = NULL;
mutex_unlock(&scx_enable_mutex);
@@ -4865,7 +4880,7 @@ static void scx_disable(enum scx_exit_kind kind)
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR;
- atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
+ atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
schedule_scx_disable_work();
}
@@ -5007,7 +5022,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
- if (SCX_HAS_OP(dump_task)) {
+ if (SCX_HAS_OP(scx_root, dump_task)) {
ops_dump_init(s, " ");
SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p);
ops_dump_exit();
@@ -5054,7 +5069,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
}
- if (SCX_HAS_OP(dump)) {
+ if (SCX_HAS_OP(scx_root, dump)) {
ops_dump_init(&s, "");
SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx);
ops_dump_exit();
@@ -5077,7 +5092,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
- if (idle && !SCX_HAS_OP(dump_cpu))
+ if (idle && !SCX_HAS_OP(scx_root, dump_cpu))
goto next;
/*
@@ -5111,7 +5126,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
cpumask_pr_args(rq->scx.cpus_to_wait));
used = seq_buf_used(&ns);
- if (SCX_HAS_OP(dump_cpu)) {
+ if (SCX_HAS_OP(scx_root, dump_cpu)) {
ops_dump_init(&ns, " ");
SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle);
ops_dump_exit();
@@ -5167,10 +5182,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
static void scx_error_irq_workfn(struct irq_work *irq_work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_exit_info *ei = scx_root->exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
- scx_dump_state(ei, scx_ops.exit_dump_len);
+ scx_dump_state(ei, scx_root->ops.exit_dump_len);
schedule_scx_disable_work();
}
@@ -5180,11 +5195,11 @@ static DEFINE_IRQ_WORK(scx_error_irq_work, scx_error_irq_workfn);
static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
const char *fmt, ...)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_exit_info *ei = scx_root->exit_info;
int none = SCX_EXIT_NONE;
va_list args;
- if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+ if (!atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind))
return;
ei->exit_code = exit_code;
@@ -5327,19 +5342,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
- scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
- if (!scx_root_kobj) {
+ scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
+ if (!scx_root->kobj) {
ret = -ENOMEM;
goto err_unlock;
}
- scx_root_kobj->kset = scx_kset;
- ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
+ scx_root->kobj->kset = scx_kset;
+ ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
goto err;
- scx_exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!scx_exit_info) {
+ scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!scx_root->exit_info) {
ret = -ENOMEM;
goto err_del;
}
@@ -5348,12 +5363,12 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* Set scx_ops, transition to ENABLING and clear exit info to arm the
* disable path. Failure triggers full disabling from here on.
*/
- scx_ops = *ops;
+ scx_root->ops = *ops;
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
- atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
- scx_warned_zero_slice = false;
+ atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
+ scx_root->warned_zero_slice = false;
atomic_long_set(&scx_nr_rejected, 0);
@@ -5368,7 +5383,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_idle_enable(ops);
- if (scx_ops.init) {
+ if (scx_root->ops.init) {
ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
if (ret) {
ret = ops_sanitize_err("init", ret);
@@ -5380,7 +5395,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
- set_bit(i, scx_has_op);
+ set_bit(i, scx_root->has_op);
check_hotplug_seq(ops);
scx_idle_update_selcpu_topology(ops);
@@ -5421,10 +5436,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- set_bit(i, scx_has_op);
+ set_bit(i, scx_root->has_op);
- if (scx_ops.cpu_acquire || scx_ops.cpu_release)
- scx_ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+ if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
+ scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -5523,7 +5538,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
- WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5531,8 +5546,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
- scx_ops.name, scx_switched_all() ? "" : " (partial)");
- kobject_uevent(scx_root_kobj, KOBJ_ADD);
+ scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(scx_root->kobj, KOBJ_ADD);
mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
@@ -5540,13 +5555,13 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return 0;
err_del:
- kobject_del(scx_root_kobj);
+ kobject_del(scx_root->kobj);
err:
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
- if (scx_exit_info) {
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
+ kobject_put(scx_root->kobj);
+ scx_root->kobj = NULL;
+ if (scx_root->exit_info) {
+ free_exit_info(scx_root->exit_info);
+ scx_root->exit_info = NULL;
}
err_unlock:
mutex_unlock(&scx_enable_mutex);
@@ -6006,7 +6021,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
*/
if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
class != &ext_sched_class) {
- printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_root->ops.name,
scx_enable_state_str[state], all);
return;
}
@@ -6018,7 +6033,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
/* print everything onto one line to conserve console space */
printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
- log_lvl, scx_ops.name, scx_enable_state_str[state], all,
+ log_lvl, scx_root->ops.name, scx_enable_state_str[state], all,
runnable_at_buf);
}
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 35aa309c9584..6915685cd3d6 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -744,7 +744,8 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
* Idle transitions are indicated by do_notify being set to true,
* managed by put_prev_task_idle()/set_next_task_idle().
*/
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ if (SCX_HAS_OP(scx_root, update_idle) &&
+ do_notify && !scx_rq_bypassing(rq))
SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
/*
--
2.49.0
* [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP()
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
2025-04-23 23:44 ` [PATCH 01/12] " Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-24 7:23 ` Chengming Zhou
2025-04-23 23:44 ` [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched Tejun Heo
` (9 subsequent siblings)
11 siblings, 1 reply; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
SCX_HAS_OP() tests scx_root->has_op bitmap. The bitmap is currently in a
statically allocated struct scx_sched and initialized while loading the BPF
scheduler and cleared while unloading, and thus can be tested anytime.
However, scx_root will be switched to dynamic allocation and thus won't
always be dereferenceable.
Most usages of SCX_HAS_OP() are already protected by scx_enabled() either
directly or indirectly (e.g. through a task which is on SCX). However, there
are a couple places that could try to dereference NULL scx_root. Update them
so that scx_root is guaranteed to be valid before SCX_HAS_OP() is called.
- In handle_hotplug(), test whether scx_root is NULL before doing anything
else. This is safe because scx_root updates will be protected by
cpus_read_lock().
- In scx_tg_offline(), test scx_cgroup_enabled before invoking SCX_HAS_OP(),
which should guarantee that scx_root won't turn NULL. This is also in line
with other cgroup operations. As the code path is synchronized against
scx_cgroup_init/exit() through scx_cgroup_rwsem, this shouldn't cause any
behavior differences.
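For reference, SCX_HAS_OP() dereferences the passed-in instance
unconditionally, so a bare call would oops once scx_root can be NULL:

    /* SCX_HAS_OP(scx_root, cpu_online) expands to */
    test_bit(SCX_OP_IDX(cpu_online), scx_root->has_op);  /* NULL deref if scx_root == NULL */

hence the explicit guards in the two sites above.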
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 975f6963a01b..ad392890d2dd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3498,6 +3498,14 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
+ /*
+ * scx_root updates are protected by cpus_read_lock() and will stay
+ * stable here. Note that we can't depend on scx_enabled() test as the
+ * hotplug ops need to be enabled before __scx_enabled is set.
+ */
+ if (!scx_root)
+ return;
+
if (scx_enabled())
scx_idle_update_selcpu_topology(&scx_root->ops);
@@ -3994,7 +4002,8 @@ void scx_tg_offline(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
- if (SCX_HAS_OP(scx_root, cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
+ if (scx_cgroup_enabled && SCX_HAS_OP(scx_root, cgroup_exit) &&
+ (tg->scx_flags & SCX_TG_INITED))
SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
--
2.49.0
* Re: [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP()
2025-04-23 23:44 ` [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP() Tejun Heo
@ 2025-04-24 7:23 ` Chengming Zhou
2025-04-24 18:55 ` Tejun Heo
0 siblings, 1 reply; 19+ messages in thread
From: Chengming Zhou @ 2025-04-24 7:23 UTC (permalink / raw)
To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min, linux-kernel
On 2025/4/24 07:44, Tejun Heo wrote:
> SCX_HAS_OP() tests scx_root->has_op bitmap. The bitmap is currently in a
> statically allocated struct scx_sched and initialized while loading the BPF
> scheduler and cleared while unloading, and thus can be tested anytime.
> However, scx_root will be switched to dynamic allocation and thus won't
> always be dereferenceable.
>
> Most usages of SCX_HAS_OP() are already protected by scx_enabled() either
> directly or indirectly (e.g. through a task which is on SCX). However, there
> are a couple places that could try to dereference NULL scx_root. Update them
> so that scx_root is guaranteed to be valid before SCX_HAS_OP() is called.
>
> - In handle_hotplug(), test whether scx_root is NULL before doing anything
> else. This is safe because scx_root updates will be protected by
> cpus_read_lock().
>
> - In scx_tg_offline(), test scx_cgroup_enabled before invoking SCX_HAS_OP(),
> which should guarantee that scx_root won't turn NULL. This is also in line
> with other cgroup operations. As the code path is synchronized against
> scx_cgroup_init/exit() through scx_cgroup_rwsem, this shouldn't cause any
> behavior differences.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> kernel/sched/ext.c | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 975f6963a01b..ad392890d2dd 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -3498,6 +3498,14 @@ static void handle_hotplug(struct rq *rq, bool online)
>
> atomic_long_inc(&scx_hotplug_seq);
>
> + /*
> + * scx_root updates are protected by cpus_read_lock() and will stay
> + * stable here. Note that we can't depend on scx_enabled() test as the
> + * hotplug ops need to be enabled before __scx_enabled is set.
> + */
> + if (!scx_root)
> + return;
> +
> if (scx_enabled())
> scx_idle_update_selcpu_topology(&scx_root->ops);
Just curious, does the comment added above mean we shouldn't
check scx_enabled() here anymore?
Thanks!
>
> @@ -3994,7 +4002,8 @@ void scx_tg_offline(struct task_group *tg)
>
> percpu_down_read(&scx_cgroup_rwsem);
>
> - if (SCX_HAS_OP(scx_root, cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
> + if (scx_cgroup_enabled && SCX_HAS_OP(scx_root, cgroup_exit) &&
> + (tg->scx_flags & SCX_TG_INITED))
> SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
> tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
>
* Re: [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP()
2025-04-24 7:23 ` Chengming Zhou
@ 2025-04-24 18:55 ` Tejun Heo
0 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-24 18:55 UTC (permalink / raw)
To: Chengming Zhou; +Cc: David Vernet, Andrea Righi, Changwoo Min, linux-kernel
Hello,
On Thu, Apr 24, 2025 at 03:23:40PM +0800, Chengming Zhou wrote:
> > + /*
> > + * scx_root updates are protected by cpus_read_lock() and will stay
> > + * stable here. Note that we can't depend on scx_enabled() test as the
> > + * hotplug ops need to be enabled before __scx_enabled is set.
> > + */
> > + if (!scx_root)
> > + return;
> > +
> > if (scx_enabled())
> > scx_idle_update_selcpu_topology(&scx_root->ops);
>
> Just curious, does the comment added above mean we shouldn't
> check scx_enabled() here anymore?
There are two things happening in the function - calling the idle topology
update function and calling ops.cpu_on/offline(). The former needs both
scx_root and scx_enabled(); the latter needs only scx_root. So we still need
the additional scx_enabled() check for the former.
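Putting the two together, the function after this series looks roughly like
the following (reconstructed from the hunks in the patch; surrounding details
such as the cpu variable are abridged):

    static void handle_hotplug(struct rq *rq, bool online)
    {
            int cpu = cpu_of(rq);

            atomic_long_inc(&scx_hotplug_seq);

            /* ops.cpu_on/offline() only need a valid scx_root */
            if (!scx_root)
                    return;

            /* the idle topology update additionally needs scx_enabled() */
            if (scx_enabled())
                    scx_idle_update_selcpu_topology(&scx_root->ops);

            if (online && SCX_HAS_OP(scx_root, cpu_online))
                    SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, rq, cpu);
            else if (!online && SCX_HAS_OP(scx_root, cpu_offline))
                    SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, rq, cpu);
            else
                    scx_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
                             ...);  /* exit message elided */
    }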
Thanks.
--
tejun
* [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
2025-04-23 23:44 ` [PATCH 01/12] " Tejun Heo
2025-04-23 23:44 ` [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP() Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-25 10:14 ` Andrea Righi
2025-04-23 23:44 ` [PATCH 04/12] sched_ext: Inline create_dsq() into scx_bpf_create_dsq() Tejun Heo
` (8 subsequent siblings)
11 siblings, 1 reply; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
To prepare for supporting multiple schedulers, make scx_sched allocated
dynamically. scx_sched->kobj is now an embedded field and the kobj's
lifetime determines the lifetime of the containing scx_sched.
- Enable path is updated so that kobj init and addition are performed later.
- scx_sched freeing is initiated in scx_kobj_release() and also goes through
an rcu_work so that scx_root can be accessed from an unsynchronized path -
scx_disable().
No behavior changes intended.
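As an example of the second point, scx_disable() can now race with the
scheduler instance going away, so it looks up the instance under RCU
(abridged from the diff below):

    static void scx_disable(enum scx_exit_kind kind)
    {
            int none = SCX_EXIT_NONE;
            struct scx_sched *sch;

            if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
                    kind = SCX_EXIT_ERROR;

            rcu_read_lock();
            sch = rcu_dereference(scx_root);
            if (sch) {
                    atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
                    schedule_scx_disable_work();
            }
            rcu_read_unlock();
    }

The rcu_work in the free path is what guarantees that such a reader never
sees a freed scx_sched.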
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 151 ++++++++++++++++++++++++++-------------------
1 file changed, 86 insertions(+), 65 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ad392890d2dd..612232c66d13 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -772,7 +772,8 @@ struct scx_sched {
atomic_t exit_kind;
struct scx_exit_info *exit_info;
- struct kobject *kobj;
+ struct kobject kobj;
+ struct rcu_work rcu_work;
};
enum scx_wake_flags {
@@ -933,11 +934,7 @@ enum scx_ops_state {
#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
-static struct scx_sched __scx_root = {
- .exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
-};
-
-static struct scx_sched *scx_root = &__scx_root;
+static struct scx_sched __rcu *scx_root;
/*
* During exit, a task may schedule after losing its PIDs. When disabling the
@@ -4417,9 +4414,23 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs,
};
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+ struct rcu_work *rcu_work = to_rcu_work(work);
+ struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+
+ free_exit_info(sch->exit_info);
+ kfree(sch);
+}
+
static void scx_kobj_release(struct kobject *kobj)
{
- kfree(kobj);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+ queue_rcu_work(system_unbound_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
@@ -4709,14 +4720,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_exit_info *ei = scx_root->exit_info;
+ struct scx_sched *sch = scx_root;
+ struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
int kind, cpu;
- kind = atomic_read(&scx_root->exit_kind);
+ kind = atomic_read(&sch->exit_kind);
while (true) {
/*
* NONE indicates that a new scx_ops has been registered since
@@ -4725,7 +4737,7 @@ static void scx_disable_workfn(struct kthread_work *work)
*/
if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
return;
- if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
ei->kind = kind;
@@ -4740,7 +4752,7 @@ static void scx_disable_workfn(struct kthread_work *work)
break;
case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
- scx_root->exit_info->msg);
+ sch->exit_info->msg);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
@@ -4807,41 +4819,43 @@ static void scx_disable_workfn(struct kthread_work *work)
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_enabled);
- bitmap_zero(scx_root->has_op, SCX_OPI_END);
+ bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_root->ops.name, ei->reason);
+ sch->ops.name, ei->reason);
if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n",
- scx_root->ops.name, ei->msg);
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
} else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_root->ops.name, ei->reason);
+ sch->ops.name, ei->reason);
}
- if (scx_root->ops.exit)
+ if (sch->ops.exit)
SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
/*
- * Delete the kobject from the hierarchy eagerly in addition to just
- * dropping a reference. Otherwise, if the object is deleted
- * asynchronously, sysfs could observe an object of the same name still
- * in the hierarchy when another scheduler is loaded.
+ * scx_root clearing must be inside cpus_read_lock(). See
+ * handle_hotplug().
*/
- kobject_del(scx_root->kobj);
- kobject_put(scx_root->kobj);
- scx_root->kobj = NULL;
+ cpus_read_lock();
+ RCU_INIT_POINTER(scx_root, NULL);
+ cpus_read_unlock();
- memset(&scx_root->ops, 0, sizeof(scx_root->ops));
+ /*
+ * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+ * could observe an object of the same name still in the hierarchy when
+ * the next scheduler is loaded.
+ */
+ kobject_del(&sch->kobj);
rhashtable_walk_enter(&dsq_hash, &rht_iter);
do {
@@ -4858,9 +4872,6 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_exit_info(scx_root->exit_info);
- scx_root->exit_info = NULL;
-
mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4885,13 +4896,18 @@ static void schedule_scx_disable_work(void)
static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR;
- atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
-
- schedule_scx_disable_work();
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ schedule_scx_disable_work();
+ }
+ rcu_read_unlock();
}
static void dump_newline(struct seq_buf *s)
@@ -5288,6 +5304,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
+ struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
@@ -5351,33 +5368,32 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
- scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
- if (!scx_root->kobj) {
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch) {
ret = -ENOMEM;
goto err_unlock;
}
- scx_root->kobj->kset = scx_kset;
- ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
- goto err;
-
- scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!scx_root->exit_info) {
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
ret = -ENOMEM;
- goto err_del;
+ goto err_free;
}
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_free;
+
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ sch->ops = *ops;
+
/*
- * Set scx_ops, transition to ENABLING and clear exit info to arm the
- * disable path. Failure triggers full disabling from here on.
+ * Transition to ENABLING and clear exit info to arm the disable path.
+ * Failure triggers full disabling from here on.
*/
- scx_root->ops = *ops;
-
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
-
- atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
- scx_root->warned_zero_slice = false;
+ WARN_ON_ONCE(scx_root);
atomic_long_set(&scx_nr_rejected, 0);
@@ -5390,9 +5406,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
cpus_read_lock();
+ /*
+ * Make the scheduler instance visible. Must be inside cpus_read_lock().
+ * See handle_hotplug().
+ */
+ rcu_assign_pointer(scx_root, sch);
+
scx_idle_enable(ops);
- if (scx_root->ops.init) {
+ if (sch->ops.init) {
ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
if (ret) {
ret = ops_sanitize_err("init", ret);
@@ -5404,7 +5426,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
- set_bit(i, scx_root->has_op);
+ set_bit(i, sch->has_op);
check_hotplug_seq(ops);
scx_idle_update_selcpu_topology(ops);
@@ -5445,10 +5467,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- set_bit(i, scx_root->has_op);
+ set_bit(i, sch->has_op);
- if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
- scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+ if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+ sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -5547,7 +5569,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
- WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
+ WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5555,23 +5577,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
- scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
- kobject_uevent(scx_root->kobj, KOBJ_ADD);
+ sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
return 0;
-err_del:
- kobject_del(scx_root->kobj);
-err:
- kobject_put(scx_root->kobj);
- scx_root->kobj = NULL;
- if (scx_root->exit_info) {
- free_exit_info(scx_root->exit_info);
- scx_root->exit_info = NULL;
- }
+err_free:
+ if (sch->exit_info)
+ free_exit_info(sch->exit_info);
+ kfree(sch);
err_unlock:
mutex_unlock(&scx_enable_mutex);
return ret;
@@ -5593,6 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
scx_error("scx_enable() failed (%d)", ret);
kthread_flush_work(&scx_disable_work);
+ kobject_put(&sch->kobj);
return 0;
}
@@ -5741,8 +5759,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
+ struct scx_sched *sch = scx_root;
+
scx_disable(SCX_EXIT_UNREG);
kthread_flush_work(&scx_disable_work);
+ kobject_put(&sch->kobj);
}
static int bpf_scx_init(struct btf *btf)
--
2.49.0
* Re: [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
2025-04-23 23:44 ` [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched Tejun Heo
@ 2025-04-25 10:14 ` Andrea Righi
2025-04-25 19:48 ` Tejun Heo
0 siblings, 1 reply; 19+ messages in thread
From: Andrea Righi @ 2025-04-25 10:14 UTC (permalink / raw)
To: Tejun Heo; +Cc: David Vernet, Changwoo Min, linux-kernel
Hi Tejun,
On Wed, Apr 23, 2025 at 01:44:41PM -1000, Tejun Heo wrote:
> To prepare for supporting multiple schedulers, make scx_sched allocated
> dynamically. scx_sched->kobj is now an embedded field and the kobj's
> lifetime determines the lifetime of the containing scx_sched.
>
> - Enable path is updated so that kobj init and addition are performed later.
>
> - scx_sched freeing is initiated in scx_kobj_release() and also goes through
> an rcu_work so that scx_root can be accessed from an unsynchronized path -
> scx_disable().
>
> No behavior changes intended.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> kernel/sched/ext.c | 151 ++++++++++++++++++++++++++-------------------
> 1 file changed, 86 insertions(+), 65 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index ad392890d2dd..612232c66d13 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -772,7 +772,8 @@ struct scx_sched {
> atomic_t exit_kind;
> struct scx_exit_info *exit_info;
>
> - struct kobject *kobj;
> + struct kobject kobj;
> + struct rcu_work rcu_work;
> };
>
> enum scx_wake_flags {
> @@ -933,11 +934,7 @@ enum scx_ops_state {
> #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
> #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
>
> -static struct scx_sched __scx_root = {
> - .exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
> -};
> -
> -static struct scx_sched *scx_root = &__scx_root;
> +static struct scx_sched __rcu *scx_root;
>
> /*
> * During exit, a task may schedule after losing its PIDs. When disabling the
> @@ -4417,9 +4414,23 @@ static const struct attribute_group scx_global_attr_group = {
> .attrs = scx_global_attrs,
> };
>
> +static void free_exit_info(struct scx_exit_info *ei);
> +
> +static void scx_sched_free_rcu_work(struct work_struct *work)
> +{
> + struct rcu_work *rcu_work = to_rcu_work(work);
> + struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
> +
> + free_exit_info(sch->exit_info);
> + kfree(sch);
> +}
> +
> static void scx_kobj_release(struct kobject *kobj)
> {
> - kfree(kobj);
> + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
> +
> + INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
> + queue_rcu_work(system_unbound_wq, &sch->rcu_work);
> }
>
> static ssize_t scx_attr_ops_show(struct kobject *kobj,
> @@ -4709,14 +4720,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>
> static void scx_disable_workfn(struct kthread_work *work)
> {
> - struct scx_exit_info *ei = scx_root->exit_info;
> + struct scx_sched *sch = scx_root;
> + struct scx_exit_info *ei = sch->exit_info;
> struct scx_task_iter sti;
> struct task_struct *p;
> struct rhashtable_iter rht_iter;
> struct scx_dispatch_q *dsq;
> int kind, cpu;
>
> - kind = atomic_read(&scx_root->exit_kind);
> + kind = atomic_read(&sch->exit_kind);
> while (true) {
> /*
> * NONE indicates that a new scx_ops has been registered since
> @@ -4725,7 +4737,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> */
> if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
> return;
> - if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
> + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
> break;
> }
> ei->kind = kind;
> @@ -4740,7 +4752,7 @@ static void scx_disable_workfn(struct kthread_work *work)
> break;
> case SCX_DISABLED:
> pr_warn("sched_ext: ops error detected without ops (%s)\n",
> - scx_root->exit_info->msg);
> + sch->exit_info->msg);
> WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
> goto done;
> default:
> @@ -4807,41 +4819,43 @@ static void scx_disable_workfn(struct kthread_work *work)
>
> /* no task is on scx, turn off all the switches and flush in-progress calls */
> static_branch_disable(&__scx_enabled);
> - bitmap_zero(scx_root->has_op, SCX_OPI_END);
> + bitmap_zero(sch->has_op, SCX_OPI_END);
> scx_idle_disable();
> synchronize_rcu();
>
> if (ei->kind >= SCX_EXIT_ERROR) {
> pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> - scx_root->ops.name, ei->reason);
> + sch->ops.name, ei->reason);
>
> if (ei->msg[0] != '\0')
> - pr_err("sched_ext: %s: %s\n",
> - scx_root->ops.name, ei->msg);
> + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
> #ifdef CONFIG_STACKTRACE
> stack_trace_print(ei->bt, ei->bt_len, 2);
> #endif
> } else {
> pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> - scx_root->ops.name, ei->reason);
> + sch->ops.name, ei->reason);
> }
>
> - if (scx_root->ops.exit)
> + if (sch->ops.exit)
> SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
>
> cancel_delayed_work_sync(&scx_watchdog_work);
>
> /*
> - * Delete the kobject from the hierarchy eagerly in addition to just
> - * dropping a reference. Otherwise, if the object is deleted
> - * asynchronously, sysfs could observe an object of the same name still
> - * in the hierarchy when another scheduler is loaded.
> + * scx_root clearing must be inside cpus_read_lock(). See
> + * handle_hotplug().
> */
> - kobject_del(scx_root->kobj);
> - kobject_put(scx_root->kobj);
> - scx_root->kobj = NULL;
> + cpus_read_lock();
> + RCU_INIT_POINTER(scx_root, NULL);
> + cpus_read_unlock();
>
> - memset(&scx_root->ops, 0, sizeof(scx_root->ops));
> + /*
> + * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
> + * could observe an object of the same name still in the hierarchy when
> + * the next scheduler is loaded.
> + */
> + kobject_del(&sch->kobj);
>
> rhashtable_walk_enter(&dsq_hash, &rht_iter);
> do {
> @@ -4858,9 +4872,6 @@ static void scx_disable_workfn(struct kthread_work *work)
> scx_dsp_ctx = NULL;
> scx_dsp_max_batch = 0;
>
> - free_exit_info(scx_root->exit_info);
> - scx_root->exit_info = NULL;
> -
> mutex_unlock(&scx_enable_mutex);
>
> WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
> @@ -4885,13 +4896,18 @@ static void schedule_scx_disable_work(void)
> static void scx_disable(enum scx_exit_kind kind)
> {
> int none = SCX_EXIT_NONE;
> + struct scx_sched *sch;
>
> if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
> kind = SCX_EXIT_ERROR;
>
> - atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
> -
> - schedule_scx_disable_work();
> + rcu_read_lock();
> + sch = rcu_dereference(scx_root);
> + if (sch) {
> + atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
> + schedule_scx_disable_work();
> + }
> + rcu_read_unlock();
> }
>
> static void dump_newline(struct seq_buf *s)
> @@ -5288,6 +5304,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
>
> static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> {
> + struct scx_sched *sch;
> struct scx_task_iter sti;
> struct task_struct *p;
> unsigned long timeout;
> @@ -5351,33 +5368,32 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> goto err_unlock;
> }
>
> - scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
> - if (!scx_root->kobj) {
> + sch = kzalloc(sizeof(*sch), GFP_KERNEL);
> + if (!sch) {
> ret = -ENOMEM;
> goto err_unlock;
> }
>
> - scx_root->kobj->kset = scx_kset;
> - ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
> - if (ret < 0)
> - goto err;
> -
> - scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
> - if (!scx_root->exit_info) {
> + sch->exit_info = alloc_exit_info(ops->exit_dump_len);
> + if (!sch->exit_info) {
> ret = -ENOMEM;
> - goto err_del;
> + goto err_free;
> }
>
> + sch->kobj.kset = scx_kset;
> + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
> + if (ret < 0)
> + goto err_free;
> +
> + atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
> + sch->ops = *ops;
> +
> /*
> - * Set scx_ops, transition to ENABLING and clear exit info to arm the
> - * disable path. Failure triggers full disabling from here on.
> + * Transition to ENABLING and clear exit info to arm the disable path.
> + * Failure triggers full disabling from here on.
> */
> - scx_root->ops = *ops;
> -
> WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
> -
> - atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
> - scx_root->warned_zero_slice = false;
> + WARN_ON_ONCE(scx_root);
>
> atomic_long_set(&scx_nr_rejected, 0);
>
> @@ -5390,9 +5406,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> */
> cpus_read_lock();
>
> + /*
> + * Make the scheduler instance visible. Must be inside cpus_read_lock().
> + * See handle_hotplug().
> + */
> + rcu_assign_pointer(scx_root, sch);
> +
> scx_idle_enable(ops);
>
> - if (scx_root->ops.init) {
> + if (sch->ops.init) {
> ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
> if (ret) {
> ret = ops_sanitize_err("init", ret);
> @@ -5404,7 +5426,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
> if (((void (**)(void))ops)[i])
> - set_bit(i, scx_root->has_op);
> + set_bit(i, sch->has_op);
>
> check_hotplug_seq(ops);
> scx_idle_update_selcpu_topology(ops);
> @@ -5445,10 +5467,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
> if (((void (**)(void))ops)[i])
> - set_bit(i, scx_root->has_op);
> + set_bit(i, sch->has_op);
>
> - if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
> - scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
> + if (sch->ops.cpu_acquire || sch->ops.cpu_release)
> + sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
>
> /*
> * Lock out forks, cgroup on/offlining and moves before opening the
> @@ -5547,7 +5569,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> scx_bypass(false);
>
> if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
> - WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
> + WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
> goto err_disable;
> }
>
> @@ -5555,23 +5577,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> static_branch_enable(&__scx_switched_all);
>
> pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
> - scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
> - kobject_uevent(scx_root->kobj, KOBJ_ADD);
> + sch->ops.name, scx_switched_all() ? "" : " (partial)");
> + kobject_uevent(&sch->kobj, KOBJ_ADD);
> mutex_unlock(&scx_enable_mutex);
>
> atomic_long_inc(&scx_enable_seq);
>
> return 0;
>
> -err_del:
> - kobject_del(scx_root->kobj);
> -err:
> - kobject_put(scx_root->kobj);
> - scx_root->kobj = NULL;
> - if (scx_root->exit_info) {
> - free_exit_info(scx_root->exit_info);
> - scx_root->exit_info = NULL;
> - }
> +err_free:
> + if (sch->exit_info)
> + free_exit_info(sch->exit_info);
> + kfree(sch);
> err_unlock:
> mutex_unlock(&scx_enable_mutex);
> return ret;
> @@ -5593,6 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> */
> scx_error("scx_enable() failed (%d)", ret);
> kthread_flush_work(&scx_disable_work);
> + kobject_put(&sch->kobj);
> return 0;
> }
>
> @@ -5741,8 +5759,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
>
> static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
> {
> + struct scx_sched *sch = scx_root;
> +
> scx_disable(SCX_EXIT_UNREG);
> kthread_flush_work(&scx_disable_work);
> + kobject_put(&sch->kobj);
> }
We probably need to check sch != NULL here; I was able to trigger this bug
(using a buggy rustland):
[ 5.048913] sched_ext: rustland: invalid CPU -16 from ops.select_cpu()
[ 5.048984] ops_cpu_valid+0x4a/0x60
[ 5.049039] select_task_rq_scx+0x10f/0x200
[ 5.049100] try_to_wake_up+0x17a/0x890
[ 5.049149] ep_autoremove_wake_function+0x12/0x40
[ 5.049211] __wake_up_common+0x7f/0xc0
[ 5.049259] __wake_up+0x36/0x60
[ 5.049306] ep_poll_callback+0x265/0x320
[ 5.049354] __wake_up_common+0x7f/0xc0
[ 5.049401] __wake_up+0x36/0x60
[ 5.049448] __send_signal_locked+0x71e/0x740
[ 5.049508] group_send_sig_info+0xf3/0x1b0
[ 5.049567] kill_pid_info_type+0x79/0x1a0
[ 5.049627] kill_proc_info+0x5d/0x110
[ 5.049674] __x64_sys_kill+0x91/0xc0
[ 5.049789] do_syscall_64+0xbb/0x1d0
[ 5.049855] entry_SYSCALL_64_after_hwframe+0x77/0x7f
[ 5.050315] BUG: kernel NULL pointer dereference, address: 00000000000003b0
[ 5.050386] #PF: supervisor read access in kernel mode
[ 5.050439] #PF: error_code(0x0000) - not-present page
[ 5.050488] PGD 0 P4D 0
[ 5.050523] Oops: Oops: 0000 [#1] SMP NOPTI
[ 5.050571] CPU: 5 UID: 0 PID: 284 Comm: scx_rustland Not tainted 6.14.0-virtme #27 PREEMPT(full)
[ 5.050670] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[ 5.050782] RIP: 0010:kthread_flush_work+0x60/0x140
[ 5.050847] Code: 80 00 00 00 31 c0 48 8d 7c 24 18 48 89 24 24 48 89 64 24 08 48 c7 44 24 10 30 e6 58 b0 f3 48 ab 48 8d 7c 24 30 e8 40 6c 05 00 <48> 8b 6b 18 48 85 ed 74 69 4c 8d 65 08 4c 89 e7 e8 0b 0c d3 00 48
[ 5.051021] RSP: 0018:ffffad01816f7e08 EFLAGS: 00010246
[ 5.051066] RAX: 0000000000000000 RBX: 0000000000000398 RCX: 0000000000000000
[ 5.051131] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffb2f34d80
[ 5.051196] RBP: ffff9b140268e800 R08: 0000000000000002 R09: 0000000000000000
[ 5.051260] R10: 0000000000000001 R11: 0000000000000000 R12: ffff9b14804c1ea0
[ 5.051325] R13: ffff9b1401b6cc20 R14: ffff9b1403965728 R15: 0000000000000000
[ 5.051393] FS: 00007f1ff0550800(0000) GS:ffff9b14cbdb3000(0000) knlGS:0000000000000000
[ 5.051463] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5.051516] CR2: 00000000000003b0 CR3: 000000000553e002 CR4: 0000000000772ef0
[ 5.051582] PKRU: 55555554
[ 5.051606] Call Trace:
[ 5.051634] <TASK>
[ 5.051665] ? __pfx_kthread_flush_work_fn+0x10/0x10
[ 5.051726] bpf_scx_unreg+0x27/0x40
[ 5.051773] bpf_struct_ops_map_link_dealloc+0x36/0x50
[ 5.051824] bpf_link_release+0x18/0x20
[ 5.051863] __fput+0xf8/0x2c0
[ 5.051905] __x64_sys_close+0x3d/0x80
[ 5.051943] do_syscall_64+0xbb/0x1d0
[ 5.051983] entry_SYSCALL_64_after_hwframe+0x77/0x7f
Changing bpf_scx_unreg() as follows fixed the bug for me:
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d963aa5c99e1a..0e52a8dbd593e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5752,11 +5752,17 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch;
scx_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&sch->disable_work);
- kobject_put(&sch->kobj);
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
+ }
+ rcu_read_unlock();
}
static int bpf_scx_init(struct btf *btf)
Thanks,
-Andrea
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
2025-04-25 10:14 ` Andrea Righi
@ 2025-04-25 19:48 ` Tejun Heo
0 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-25 19:48 UTC (permalink / raw)
To: Andrea Righi; +Cc: David Vernet, Changwoo Min, linux-kernel
On Fri, Apr 25, 2025 at 12:14:13PM +0200, Andrea Righi wrote:
...
> > static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
> > {
> > + struct scx_sched *sch = scx_root;
> > +
> > scx_disable(SCX_EXIT_UNREG);
> > kthread_flush_work(&scx_disable_work);
> > + kobject_put(&sch->kobj);
> > }
>
> We probably need to check sch != NULL here; I was able to trigger this bug
> (using a buggy rustland):
>
> [ 5.048913] sched_ext: rustland: invalid CPU -16 from ops.select_cpu()
> [ 5.048984] ops_cpu_valid+0x4a/0x60
> [ 5.049039] select_task_rq_scx+0x10f/0x200
> [ 5.049100] try_to_wake_up+0x17a/0x890
> [ 5.049149] ep_autoremove_wake_function+0x12/0x40
> [ 5.049211] __wake_up_common+0x7f/0xc0
> [ 5.049259] __wake_up+0x36/0x60
> [ 5.049306] ep_poll_callback+0x265/0x320
> [ 5.049354] __wake_up_common+0x7f/0xc0
> [ 5.049401] __wake_up+0x36/0x60
> [ 5.049448] __send_signal_locked+0x71e/0x740
> [ 5.049508] group_send_sig_info+0xf3/0x1b0
> [ 5.049567] kill_pid_info_type+0x79/0x1a0
> [ 5.049627] kill_proc_info+0x5d/0x110
> [ 5.049674] __x64_sys_kill+0x91/0xc0
> [ 5.049789] do_syscall_64+0xbb/0x1d0
> [ 5.049855] entry_SYSCALL_64_after_hwframe+0x77/0x7f
> [ 5.050315] BUG: kernel NULL pointer dereference, address: 00000000000003b0
> [ 5.050386] #PF: supervisor read access in kernel mode
> [ 5.050439] #PF: error_code(0x0000) - not-present page
> [ 5.050488] PGD 0 P4D 0
> [ 5.050523] Oops: Oops: 0000 [#1] SMP NOPTI
> [ 5.050571] CPU: 5 UID: 0 PID: 284 Comm: scx_rustland Not tainted 6.14.0-virtme #27 PREEMPT(full)
> [ 5.050670] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
> [ 5.050782] RIP: 0010:kthread_flush_work+0x60/0x140
> [ 5.050847] Code: 80 00 00 00 31 c0 48 8d 7c 24 18 48 89 24 24 48 89 64 24 08 48 c7 44 24 10 30 e6 58 b0 f3 48 ab 48 8d 7c 24 30 e8 40 6c 05 00 <48> 8b 6b 18 48 85 ed 74 69 4c 8d 65 08 4c 89 e7 e8 0b 0c d3 00 48
> [ 5.051021] RSP: 0018:ffffad01816f7e08 EFLAGS: 00010246
> [ 5.051066] RAX: 0000000000000000 RBX: 0000000000000398 RCX: 0000000000000000
> [ 5.051131] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffb2f34d80
> [ 5.051196] RBP: ffff9b140268e800 R08: 0000000000000002 R09: 0000000000000000
> [ 5.051260] R10: 0000000000000001 R11: 0000000000000000 R12: ffff9b14804c1ea0
> [ 5.051325] R13: ffff9b1401b6cc20 R14: ffff9b1403965728 R15: 0000000000000000
> [ 5.051393] FS: 00007f1ff0550800(0000) GS:ffff9b14cbdb3000(0000) knlGS:0000000000000000
> [ 5.051463] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 5.051516] CR2: 00000000000003b0 CR3: 000000000553e002 CR4: 0000000000772ef0
> [ 5.051582] PKRU: 55555554
> [ 5.051606] Call Trace:
> [ 5.051634] <TASK>
> [ 5.051665] ? __pfx_kthread_flush_work_fn+0x10/0x10
> [ 5.051726] bpf_scx_unreg+0x27/0x40
> [ 5.051773] bpf_struct_ops_map_link_dealloc+0x36/0x50
> [ 5.051824] bpf_link_release+0x18/0x20
> [ 5.051863] __fput+0xf8/0x2c0
> [ 5.051905] __x64_sys_close+0x3d/0x80
> [ 5.051943] do_syscall_64+0xbb/0x1d0
> [ 5.051983] entry_SYSCALL_64_after_hwframe+0x77/0x7f
>
> Changing bpf_scx_unreg() as follows fixed the bug for me:
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index d963aa5c99e1a..0e52a8dbd593e 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -5752,11 +5752,17 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
>
> static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
> {
> - struct scx_sched *sch = scx_root;
> + struct scx_sched *sch;
>
> scx_disable(SCX_EXIT_UNREG);
> - kthread_flush_work(&sch->disable_work);
> - kobject_put(&sch->kobj);
> +
> + rcu_read_lock();
> + sch = rcu_dereference(scx_root);
> + if (sch) {
> + kthread_flush_work(&sch->disable_work);
> + kobject_put(&sch->kobj);
> + }
> + rcu_read_unlock();
> }
Oh, I didn't expect that. As scx_root can only be written by the preceding
bpf_scx_reg(), I don't think we need rcu_read_lock() here, as subtle as that
may be. I'll update the code.
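A minimal sketch of that direction (assumptions: sch->disable_work follows
Andrea's diff above, where a later patch in the series has already moved the
disable work into scx_sched; the final fix may well look different):

        static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
        {
                /*
                 * Sketch only: scx_root is assigned solely by the preceding
                 * bpf_scx_reg(), so a plain read plus NULL check should be
                 * enough -- no rcu_read_lock() around the access.
                 */
                struct scx_sched *sch = scx_root;

                scx_disable(SCX_EXIT_UNREG);
                if (sch) {
                        kthread_flush_work(&sch->disable_work);
                        kobject_put(&sch->kobj);
                }
        }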
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 04/12] sched_ext: Inline create_dsq() into scx_bpf_create_dsq()
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (2 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 05/12] sched_ext: Factor out scx_alloc_and_add_sched() Tejun Heo
` (7 subsequent siblings)
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
create_dsq() is only used by scx_bpf_create_dsq(), and the separation gets in
the way of making dsq_hash per scx_sched. Inline it into
scx_bpf_create_dsq(). While at it, add unlikely() around the
SCX_DSQ_FLAG_BUILTIN test.
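For context, a hedged usage sketch of the kfunc from the BPF side, mirroring
the in-tree example schedulers (e.g. scx_simple); the wrapper macro comes from
the scx common BPF headers and MY_DSQ_ID is an arbitrary placeholder, neither
is part of this patch:

        /*
         * Sketch only: create a custom DSQ at scheduler init time.  MY_DSQ_ID
         * is a scheduler-defined id without SCX_DSQ_FLAG_BUILTIN set; -1
         * (NUMA_NO_NODE) means no node preference.
         */
        #define MY_DSQ_ID 0

        s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
        {
                return scx_bpf_create_dsq(MY_DSQ_ID, -1);
        }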
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 42 ++++++++++++++++++------------------------
1 file changed, 18 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 612232c66d13..8ec32c458f16 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4193,29 +4193,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
dsq->id = dsq_id;
}
-static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
-{
- struct scx_dispatch_q *dsq;
- int ret;
-
- if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
- return ERR_PTR(-EINVAL);
-
- dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq)
- return ERR_PTR(-ENOMEM);
-
- init_dsq(dsq, dsq_id);
-
- ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
- dsq_hash_params);
- if (ret) {
- kfree(dsq);
- return ERR_PTR(ret);
- }
- return dsq;
-}
-
static void free_dsq_irq_workfn(struct irq_work *irq_work)
{
struct llist_node *to_free = llist_del_all(&dsqs_to_free);
@@ -6708,10 +6685,27 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{
+ struct scx_dispatch_q *dsq;
+ s32 ret;
+
if (unlikely(node >= (int)nr_node_ids ||
(node < 0 && node != NUMA_NO_NODE)))
return -EINVAL;
- return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
+ return -EINVAL;
+
+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq)
+ return -ENOMEM;
+
+ init_dsq(dsq, dsq_id);
+
+ ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ if (ret)
+ kfree(dsq);
+ return ret;
}
__bpf_kfunc_end_defs();
--
2.49.0
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 05/12] sched_ext: Factor out scx_alloc_and_add_sched()
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (3 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 04/12] sched_ext: Inline create_dsq() into scx_bpf_create_dsq() Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 06/12] sched_ext: Move dsq_hash into scx_sched Tejun Heo
` (6 subsequent siblings)
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
More will be moved into scx_sched. Factor out the allocation and kobject
addition path into scx_alloc_and_add_sched().
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 56 +++++++++++++++++++++++++++++-----------------
1 file changed, 35 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8ec32c458f16..88a0f6d9cb1e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5233,6 +5233,38 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
return helper;
}
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+{
+ struct scx_sched *sch;
+ int ret;
+
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch)
+ return ERR_PTR(-ENOMEM);
+
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
+ ret = -ENOMEM;
+ goto err_free_sch;
+ }
+
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ sch->ops = *ops;
+
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_free_ei;
+
+ return sch;
+
+err_free_ei:
+ free_exit_info(sch->exit_info);
+err_free_sch:
+ kfree(sch);
+ return ERR_PTR(ret);
+}
+
static void check_hotplug_seq(const struct sched_ext_ops *ops)
{
unsigned long long global_hotplug_seq;
@@ -5345,26 +5377,12 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock;
}
- sch = kzalloc(sizeof(*sch), GFP_KERNEL);
- if (!sch) {
- ret = -ENOMEM;
+ sch = scx_alloc_and_add_sched(ops);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
goto err_unlock;
}
- sch->exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!sch->exit_info) {
- ret = -ENOMEM;
- goto err_free;
- }
-
- sch->kobj.kset = scx_kset;
- ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
- goto err_free;
-
- atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
- sch->ops = *ops;
-
/*
* Transition to ENABLING and clear exit info to arm the disable path.
* Failure triggers full disabling from here on.
@@ -5562,10 +5580,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return 0;
-err_free:
- if (sch->exit_info)
- free_exit_info(sch->exit_info);
- kfree(sch);
err_unlock:
mutex_unlock(&scx_enable_mutex);
return ret;
--
2.49.0
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 06/12] sched_ext: Move dsq_hash into scx_sched
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (4 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 05/12] sched_ext: Factor out scx_alloc_and_add_sched() Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 07/12] sched_ext: Move global_dsqs " Tejun Heo
` (5 subsequent siblings)
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
User DSQs are going to become per scheduler instance. Move dsq_hash into
scx_sched. This shifts the code that assumes scx_root to be the only
scx_sched instance up the call stack but doesn't remove those assumptions yet.
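Distilled, the caller-side pattern this introduces looks roughly as below (a
sketch mirroring the kfunc updates in the diff; user_dsq_nr_queued() is a
made-up wrapper name, not something added by the patch):

        /*
         * Sketch only: resolve the scheduler instance under RCU, then look up
         * the user DSQ through it.
         */
        static s32 user_dsq_nr_queued(u64 dsq_id)
        {
                struct scx_sched *sch;
                struct scx_dispatch_q *dsq;
                s32 nr = -ENODEV;

                rcu_read_lock();
                sch = rcu_dereference(scx_root);
                if (sch) {
                        dsq = find_user_dsq(sch, dsq_id);
                        nr = dsq ? READ_ONCE(dsq->nr) : -ENOENT;
                }
                rcu_read_unlock();

                return nr;
        }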
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 120 ++++++++++++++++++++++++++++++---------------
1 file changed, 80 insertions(+), 40 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 88a0f6d9cb1e..a086b024301f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -767,6 +767,7 @@ struct scx_sched {
struct sched_ext_ops ops;
DECLARE_BITMAP(has_op, SCX_OPI_END);
+ struct rhashtable dsq_hash;
bool warned_zero_slice;
atomic_t exit_kind;
@@ -1013,7 +1014,6 @@ static const struct rhashtable_params dsq_hash_params = {
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
-static struct rhashtable dsq_hash;
static LLIST_HEAD(dsqs_to_free);
/* dispatch buf */
@@ -1111,9 +1111,9 @@ static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
return global_dsqs[cpu_to_node(task_cpu(p))];
}
-static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
/*
@@ -2056,7 +2056,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
-static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
+ struct rq *rq, u64 dsq_id,
struct task_struct *p)
{
struct scx_dispatch_q *dsq;
@@ -2076,7 +2077,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if (dsq_id == SCX_DSQ_GLOBAL)
dsq = find_global_dsq(p);
else
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
scx_error("non-existent DSQ 0x%llx for %s[%d]",
@@ -2117,11 +2118,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
p->scx.ddsp_enq_flags = enq_flags;
}
-static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq =
- find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
touch_core_sched_dispatch(rq, p);
@@ -2180,6 +2182,7 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
+ struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
unsigned long qseq;
@@ -2246,7 +2249,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
return;
direct:
- direct_dispatch(p, enq_flags);
+ direct_dispatch(sch, p, enq_flags);
return;
local:
@@ -2906,7 +2909,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* was valid in the first place. Make sure that the task is still owned by the
* BPF scheduler and claim the ownership before dispatching.
*/
-static void finish_dispatch(struct rq *rq, struct task_struct *p,
+static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *p,
unsigned long qseq_at_dispatch,
u64 dsq_id, u64 enq_flags)
{
@@ -2959,7 +2963,7 @@ static void finish_dispatch(struct rq *rq, struct task_struct *p,
BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
- dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
if (dsq->id == SCX_DSQ_LOCAL)
dispatch_to_local_dsq(rq, dsq, p, enq_flags);
@@ -2967,7 +2971,7 @@ static void finish_dispatch(struct rq *rq, struct task_struct *p,
dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}
-static void flush_dispatch_buf(struct rq *rq)
+static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
u32 u;
@@ -2975,7 +2979,7 @@ static void flush_dispatch_buf(struct rq *rq)
for (u = 0; u < dspc->cursor; u++) {
struct scx_dsp_buf_ent *ent = &dspc->buf[u];
- finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+ finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
ent->enq_flags);
}
@@ -2985,6 +2989,7 @@ static void flush_dispatch_buf(struct rq *rq)
static int balance_one(struct rq *rq, struct task_struct *prev)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
@@ -3052,7 +3057,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq),
prev_on_scx ? prev : NULL);
- flush_dispatch_buf(rq);
+ flush_dispatch_buf(sch, rq);
if (prev_on_rq && prev->scx.slice) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3146,11 +3151,12 @@ static void process_ddsp_deferred_locals(struct rq *rq)
*/
while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *dsq;
list_del_init(&p->scx.dsq_list.node);
- dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
}
@@ -4204,14 +4210,14 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
-static void destroy_dsq(u64 dsq_id)
+static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
{
struct scx_dispatch_q *dsq;
unsigned long flags;
rcu_read_lock();
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (!dsq)
goto out_unlock_rcu;
@@ -4223,7 +4229,8 @@ static void destroy_dsq(u64 dsq_id)
goto out_unlock_dsq;
}
- if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+ if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params))
goto out_unlock_dsq;
/*
@@ -4397,7 +4404,21 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
{
struct rcu_work *rcu_work = to_rcu_work(work);
struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+
+ rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(sch, dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
kfree(sch);
}
@@ -4701,8 +4722,6 @@ static void scx_disable_workfn(struct kthread_work *work)
struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
- struct rhashtable_iter rht_iter;
- struct scx_dispatch_q *dsq;
int kind, cpu;
kind = atomic_read(&sch->exit_kind);
@@ -4834,17 +4853,6 @@ static void scx_disable_workfn(struct kthread_work *work)
*/
kobject_del(&sch->kobj);
- rhashtable_walk_enter(&dsq_hash, &rht_iter);
- do {
- rhashtable_walk_start(&rht_iter);
-
- while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
- destroy_dsq(dsq->id);
-
- rhashtable_walk_stop(&rht_iter);
- } while (dsq == ERR_PTR(-EAGAIN));
- rhashtable_walk_exit(&rht_iter);
-
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
@@ -5248,16 +5256,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
goto err_free_sch;
}
+ ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
+ if (ret < 0)
+ goto err_free_ei;
+
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
sch->ops = *ops;
sch->kobj.kset = scx_kset;
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
- goto err_free_ei;
+ goto err_free_hash;
return sch;
+err_free_hash:
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
err_free_ei:
free_exit_info(sch->exit_info);
err_free_sch:
@@ -6098,7 +6112,6 @@ void __init init_sched_ext_class(void)
WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
SCX_TG_ONLINE);
- BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
scx_idle_init_masks();
scx_kick_cpus_pnt_seqs =
@@ -6299,6 +6312,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
@@ -6351,7 +6365,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
}
/* @p is still on $src_dsq and stable, determine the destination */
- dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+ dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
/*
* Apply vtime and slice updates before moving so that the new time is
@@ -6431,15 +6445,16 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
if (!scx_kf_allowed(SCX_KF_DISPATCH))
return false;
- flush_dispatch_buf(dspc->rq);
+ flush_dispatch_buf(sch, dspc->rq);
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
scx_error("invalid DSQ ID 0x%016llx", dsq_id);
return false;
@@ -6700,6 +6715,7 @@ __bpf_kfunc_start_defs();
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{
struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
s32 ret;
if (unlikely(node >= (int)nr_node_ids ||
@@ -6715,8 +6731,16 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
init_dsq(dsq, dsq_id);
- ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
- dsq_hash_params);
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ else
+ ret = -ENODEV;
+
+ rcu_read_unlock();
if (ret)
kfree(dsq);
return ret;
@@ -6815,11 +6839,18 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
*/
__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
{
+ struct scx_sched *sch;
struct scx_dispatch_q *dsq;
s32 ret;
preempt_disable();
+ sch = rcu_dereference_sched(scx_root);
+ if (!sch) {
+ ret = -ENODEV;
+ goto out;
+ }
+
if (dsq_id == SCX_DSQ_LOCAL) {
ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
goto out;
@@ -6831,7 +6862,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
goto out;
}
} else {
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (dsq) {
ret = READ_ONCE(dsq->nr);
goto out;
@@ -6854,7 +6885,11 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
*/
__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
{
- destroy_dsq(dsq_id);
+ struct scx_sched *sch;
+
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ destroy_dsq(sch, dsq_id);
}
/**
@@ -6871,16 +6906,21 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
u64 flags)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ struct scx_sched *sch;
BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
sizeof(struct bpf_iter_scx_dsq));
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ sch = rcu_dereference(scx_root);
+ if (!sch)
+ return -ENODEV;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
- kit->dsq = find_user_dsq(dsq_id);
+ kit->dsq = find_user_dsq(sch, dsq_id);
if (!kit->dsq)
return -ENOENT;
--
2.49.0
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 07/12] sched_ext: Move global_dsqs into scx_sched
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (5 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 06/12] sched_ext: Move dsq_hash into scx_sched Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 08/12] sched_ext: Relocate scx_event_stats definition Tejun Heo
` (4 subsequent siblings)
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
Global DSQs are going to become per scheduler instance. Move global_dsqs
into scx_sched. find_global_dsq() already takes a task_struct pointer as an
argument and should later be able to determine the scx_sched to use from
that. For now, assume scx_root.
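A hedged sketch of that eventual direction, not part of this series;
scx_task_sched() is a hypothetical accessor that would map a task to its
owning scheduler instance once multiple instances exist:

        static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
        {
                /* hypothetical: derive the instance from @p instead of scx_root */
                struct scx_sched *sch = scx_task_sched(p);

                return sch->global_dsqs[cpu_to_node(task_cpu(p))];
        }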
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 95 ++++++++++++++++++++++++----------------------
1 file changed, 49 insertions(+), 46 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a086b024301f..5beb2dd868ad 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -767,7 +767,17 @@ struct scx_sched {
struct sched_ext_ops ops;
DECLARE_BITMAP(has_op, SCX_OPI_END);
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+
bool warned_zero_slice;
atomic_t exit_kind;
@@ -998,16 +1008,6 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
*/
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
-/*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
- * to avoid live-locking in bypass mode where all tasks are dispatched to
- * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
- * sufficient, it can be further split.
- */
-static struct scx_dispatch_q **global_dsqs;
-
static const struct rhashtable_params dsq_hash_params = {
.key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
@@ -1108,7 +1108,9 @@ static bool u32_before(u32 a, u32 b)
static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
{
- return global_dsqs[cpu_to_node(task_cpu(p))];
+ struct scx_sched *sch = scx_root;
+
+ return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
@@ -2785,11 +2787,11 @@ static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
return false;
}
-static bool consume_global_dsq(struct rq *rq)
+static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
int node = cpu_to_node(cpu_of(rq));
- return consume_dispatch_q(rq, global_dsqs[node]);
+ return consume_dispatch_q(rq, sch->global_dsqs[node]);
}
/**
@@ -3035,7 +3037,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
if (unlikely(!SCX_HAS_OP(scx_root, dispatch)) ||
@@ -3065,7 +3067,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
}
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
/*
@@ -4406,6 +4408,11 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
+ int node;
+
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
do {
@@ -5244,7 +5251,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
{
struct scx_sched *sch;
- int ret;
+ int node, ret;
sch = kzalloc(sizeof(*sch), GFP_KERNEL);
if (!sch)
@@ -5260,16 +5267,40 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
if (ret < 0)
goto err_free_ei;
+ sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
+ GFP_KERNEL);
+ if (!sch->global_dsqs) {
+ ret = -ENOMEM;
+ goto err_free_hash;
+ }
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ ret = -ENOMEM;
+ goto err_free_gdsqs;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ sch->global_dsqs[node] = dsq;
+ }
+
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
sch->ops = *ops;
sch->kobj.kset = scx_kset;
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
- goto err_free_hash;
+ goto err_free_gdsqs;
return sch;
+err_free_gdsqs:
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
err_free_hash:
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
err_free_ei:
@@ -5331,7 +5362,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
- int i, cpu, node, ret;
+ int i, cpu, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
@@ -5358,34 +5389,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
}
}
- if (!global_dsqs) {
- struct scx_dispatch_q **dsqs;
-
- dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
- if (!dsqs) {
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- for_each_node_state(node, N_POSSIBLE) {
- struct scx_dispatch_q *dsq;
-
- dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq) {
- for_each_node_state(node, N_POSSIBLE)
- kfree(dsqs[node]);
- kfree(dsqs);
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- init_dsq(dsq, SCX_DSQ_GLOBAL);
- dsqs[node] = dsq;
- }
-
- global_dsqs = dsqs;
- }
-
if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY;
goto err_unlock;
--
2.49.0
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 08/12] sched_ext: Relocate scx_event_stats definition
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (6 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 07/12] sched_ext: Move global_dsqs " Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 09/12] sched_ext: Factor out scx_read_events() Tejun Heo
` (3 subsequent siblings)
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
In preparation for moving event_stats_cpu into scx_sched, move the
scx_event_stats definitions above the scx_sched definition. No functional changes.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 114 ++++++++++++++++++++++-----------------------
1 file changed, 57 insertions(+), 57 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5beb2dd868ad..244ccf134cee 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -763,6 +763,63 @@ enum scx_opi {
SCX_OPI_END = SCX_OP_IDX(init),
};
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
struct scx_sched {
struct sched_ext_ops ops;
DECLARE_BITMAP(has_op, SCX_OPI_END);
@@ -1539,63 +1596,6 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * Total number of times a task's time slice was refilled with the
- * default value (SCX_SLICE_DFL).
- */
- s64 SCX_EV_REFILL_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
/*
* The event counter is organized by a per-CPU variable to minimize the
* accounting overhead without synchronization. A system-wide view on the
--
2.49.0
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 09/12] sched_ext: Factor out scx_read_events()
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (7 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 08/12] sched_ext: Relocate scx_event_stats definition Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Tejun Heo
` (2 subsequent siblings)
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
In preparation for moving event_stats_cpu into scx_sched, factor
scx_read_events() out of scx_bpf_events() and update the in-kernel users -
scx_attr_events_show() and scx_dump_state() - to use scx_read_events()
instead of scx_bpf_events(). No observable behavior changes.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 45 ++++++++++++++++++++++++++-------------------
1 file changed, 26 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 244ccf134cee..4d2866db8cbe 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1648,7 +1648,7 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
} while (0)
-static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+static void scx_read_events(struct scx_event_stats *events);
static enum scx_enable_state scx_enable_state(void)
{
@@ -4455,7 +4455,7 @@ static ssize_t scx_attr_events_show(struct kobject *kobj,
struct scx_event_stats events;
int at = 0;
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(&events);
at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -5179,7 +5179,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_line(&s, "Event counters");
dump_line(&s, "--------------");
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(&events);
scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7401,6 +7401,27 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+static void scx_read_events(struct scx_event_stats *events)
+{
+ struct scx_event_stats *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into @events. */
+ memset(events, 0, sizeof(*events));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+ scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+}
+
/*
* scx_bpf_events - Get a system-wide event counter to
* @events: output buffer from a BPF program
@@ -7409,23 +7430,9 @@ __bpf_kfunc u64 scx_bpf_now(void)
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
size_t events__sz)
{
- struct scx_event_stats e_sys, *e_cpu;
- int cpu;
+ struct scx_event_stats e_sys;
- /* Aggregate per-CPU event counters into the system-wide counters. */
- memset(&e_sys, 0, sizeof(e_sys));
- for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_REFILL_SLICE_DFL);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
- }
+ scx_read_events(&e_sys);
/*
* We cannot entirely trust a BPF-provided size since a BPF program
--
2.49.0
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (8 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 09/12] sched_ext: Factor out scx_read_events() Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-25 5:38 ` Changwoo Min
2025-04-23 23:44 ` [PATCH 11/12] sched_ext: Move disable machinery " Tejun Heo
2025-04-23 23:44 ` [PATCH 12/12] sched_ext: Clean up SCX_EXIT_NONE handling in scx_disable_workfn() Tejun Heo
11 siblings, 1 reply; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
The event counters are going to become per scheduler instance. Move
event_stats_cpu into scx_sched.
- [__]scx_add_event() are updated to take @sch. While at it, add missing
parentheses around @cnt expansions.
- scx_read_events() is updated to take @sch.
- scx_bpf_events() accesses scx_root under RCU read lock.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 100 ++++++++++++++++++++++++++-------------------
1 file changed, 58 insertions(+), 42 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4d2866db8cbe..75c91b58430c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -835,6 +835,13 @@ struct scx_sched {
struct rhashtable dsq_hash;
struct scx_dispatch_q **global_dsqs;
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_get_event_stat().
+ */
+ struct scx_event_stats __percpu *event_stats_cpu;
+
bool warned_zero_slice;
atomic_t exit_kind;
@@ -1596,34 +1603,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
/**
* scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
* @cnt: the number of the event occured
*
* This can be used when preemption is not disabled.
*/
-#define scx_add_event(name, cnt) do { \
- this_cpu_add(event_stats_cpu.name, cnt); \
- trace_sched_ext_event(#name, cnt); \
+#define scx_add_event(sch, name, cnt) do { \
+ this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
} while(0)
/**
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
* @cnt: the number of the event occured
*
* This should be used only when preemption is disabled.
*/
-#define __scx_add_event(name, cnt) do { \
- __this_cpu_add(event_stats_cpu.name, cnt); \
+#define __scx_add_event(sch, name, cnt) do { \
+ __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
@@ -1648,7 +1650,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
} while (0)
-static void scx_read_events(struct scx_event_stats *events);
+static void scx_read_events(struct scx_sched *sch,
+ struct scx_event_stats *events);
static enum scx_enable_state scx_enable_state(void)
{
@@ -1874,7 +1877,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
static void refill_task_slice_dfl(struct task_struct *p)
{
p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1);
+ __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
}
static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -2203,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto local;
if (scx_rq_bypassing(rq)) {
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
goto global;
}
@@ -2213,14 +2216,14 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
/* see %SCX_OPS_ENQ_EXITING */
if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
unlikely(p->flags & PF_EXITING)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
}
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
is_migration_disabled(p)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
}
@@ -2343,7 +2346,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
unlikely(cpu_of(rq) != p->scx.selected_cpu))
- __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+ __scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1);
}
static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
@@ -2571,7 +2574,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(scx_root,
+ SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -3093,7 +3097,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (prev_on_rq &&
(!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
- __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3424,6 +3428,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ struct scx_sched *sch = scx_root;
bool rq_bypass;
/*
@@ -3440,7 +3445,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
return prev_cpu;
rq_bypass = scx_rq_bypassing(task_rq(p));
- if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
+ if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3469,7 +3474,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
p->scx.selected_cpu = cpu;
if (rq_bypass)
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -4410,6 +4415,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
struct scx_dispatch_q *dsq;
int node;
+ free_percpu(sch->event_stats_cpu);
+
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
kfree(sch->global_dsqs);
@@ -4452,10 +4459,11 @@ SCX_ATTR(ops);
static ssize_t scx_attr_events_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
struct scx_event_stats events;
int at = 0;
- scx_read_events(&events);
+ scx_read_events(sch, &events);
at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -4588,25 +4596,29 @@ static void scx_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
static unsigned long bypass_timestamp;
-
- int cpu;
+ struct scx_sched *sch;
unsigned long flags;
+ int cpu;
raw_spin_lock_irqsave(&bypass_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+
if (bypass) {
scx_bypass_depth++;
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
bypass_timestamp = ktime_get_ns();
- scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
} else {
scx_bypass_depth--;
WARN_ON_ONCE(scx_bypass_depth < 0);
if (scx_bypass_depth != 0)
goto unlock;
- scx_add_event(SCX_EV_BYPASS_DURATION,
- ktime_get_ns() - bypass_timestamp);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
atomic_inc(&scx_breather_depth);
@@ -5179,7 +5191,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_line(&s, "Event counters");
dump_line(&s, "--------------");
- scx_read_events(&events);
+ scx_read_events(scx_root, &events);
scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -5287,16 +5299,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sch->global_dsqs[node] = dsq;
}
+ sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+ if (!sch->event_stats_cpu)
+ goto err_free_gdsqs;
+
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
sch->ops = *ops;
sch->kobj.kset = scx_kset;
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
- goto err_free_gdsqs;
+ goto err_event_stats;
return sch;
+err_event_stats:
+ free_percpu(sch->event_stats_cpu);
err_free_gdsqs:
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -5372,15 +5390,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
mutex_lock(&scx_enable_mutex);
- /*
- * Clear event counters so a new scx scheduler gets
- * fresh event counter values.
- */
- for_each_possible_cpu(cpu) {
- struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
- memset(e, 0, sizeof(*e));
- }
-
if (!scx_helper) {
WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
if (!scx_helper) {
@@ -7401,7 +7410,7 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
-static void scx_read_events(struct scx_event_stats *events)
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
{
struct scx_event_stats *e_cpu;
int cpu;
@@ -7409,7 +7418,7 @@ static void scx_read_events(struct scx_event_stats *events)
/* Aggregate per-CPU event counters into @events. */
memset(events, 0, sizeof(*events));
for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+ e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7430,9 +7439,16 @@ static void scx_read_events(struct scx_event_stats *events)
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
size_t events__sz)
{
+ struct scx_sched *sch;
struct scx_event_stats e_sys;
- scx_read_events(&e_sys);
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ scx_read_events(sch, &e_sys);
+ else
+ memset(&e_sys, 0, sizeof(e_sys));
+ rcu_read_unlock();
/*
* We cannot entirely trust a BPF-provided size since a BPF program
--
2.49.0
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched
2025-04-23 23:44 ` [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Tejun Heo
@ 2025-04-25 5:38 ` Changwoo Min
0 siblings, 0 replies; 19+ messages in thread
From: Changwoo Min @ 2025-04-25 5:38 UTC (permalink / raw)
To: Tejun Heo, David Vernet, Andrea Righi, linux-kernel
Hi Tejun,
On 4/24/25 08:44, Tejun Heo wrote:
> The event counters are going to become per scheduler instance. Move
> event_stats_cpu into scx_sched.
>
> - [__]scx_add_event() are updated to take @sch. While at it, add missing
> parentheses around @cnt expansions.
>
> - scx_read_events() is updated to take @sch.
>
> - scx_bpf_events() accesses scx_root under RCU read lock.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> kernel/sched/ext.c | 100 ++++++++++++++++++++++++++-------------------
> 1 file changed, 58 insertions(+), 42 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 4d2866db8cbe..75c91b58430c 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -835,6 +835,13 @@ struct scx_sched {
> struct rhashtable dsq_hash;
> struct scx_dispatch_q **global_dsqs;
>
> + /*
> + * The event counters are in a per-CPU variable to minimize the
> + * accounting overhead. A system-wide view on the event counter is
> + * constructed when requested by scx_bpf_get_event_stat().
scx_bpf_get_event_stat() should be replaced with scx_bpf_events(). I think
the old function name, scx_bpf_get_event_stat(), crept into the original
code by mistake.
Other than that, the changes look straightforward and good to me.
Acked-by: Changwoo Min <changwoo@igalia.com>
Regards,
Changwoo Min
> + */
> + struct scx_event_stats __percpu *event_stats_cpu;
> +
> bool warned_zero_slice;
>
> atomic_t exit_kind;
> @@ -1596,34 +1603,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
> return p;
> }
>
> -/*
> - * The event counter is organized by a per-CPU variable to minimize the
> - * accounting overhead without synchronization. A system-wide view on the
> - * event counter is constructed when requested by scx_bpf_get_event_stat().
> - */
> -static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
> -
> /**
> * scx_add_event - Increase an event counter for 'name' by 'cnt'
> + * @sch: scx_sched to account events for
> * @name: an event name defined in struct scx_event_stats
> * @cnt: the number of the event occured
> *
> * This can be used when preemption is not disabled.
> */
> -#define scx_add_event(name, cnt) do { \
> - this_cpu_add(event_stats_cpu.name, cnt); \
> - trace_sched_ext_event(#name, cnt); \
> +#define scx_add_event(sch, name, cnt) do { \
> + this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
> + trace_sched_ext_event(#name, (cnt)); \
> } while(0)
>
> /**
> * __scx_add_event - Increase an event counter for 'name' by 'cnt'
> + * @sch: scx_sched to account events for
> * @name: an event name defined in struct scx_event_stats
> * @cnt: the number of the event occured
> *
> * This should be used only when preemption is disabled.
> */
> -#define __scx_add_event(name, cnt) do { \
> - __this_cpu_add(event_stats_cpu.name, cnt); \
> +#define __scx_add_event(sch, name, cnt) do { \
> + __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
> trace_sched_ext_event(#name, cnt); \
> } while(0)
>
> @@ -1648,7 +1650,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
> } while (0)
>
>
> -static void scx_read_events(struct scx_event_stats *events);
> +static void scx_read_events(struct scx_sched *sch,
> + struct scx_event_stats *events);
>
> static enum scx_enable_state scx_enable_state(void)
> {
> @@ -1874,7 +1877,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
> static void refill_task_slice_dfl(struct task_struct *p)
> {
> p->scx.slice = SCX_SLICE_DFL;
> - __scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1);
> + __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
> }
>
> static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
> @@ -2203,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> goto local;
>
> if (scx_rq_bypassing(rq)) {
> - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
> + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
> goto global;
> }
>
> @@ -2213,14 +2216,14 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
> /* see %SCX_OPS_ENQ_EXITING */
> if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
> unlikely(p->flags & PF_EXITING)) {
> - __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
> + __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
> goto local;
> }
>
> /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
> if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
> is_migration_disabled(p)) {
> - __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
> + __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
> goto local;
> }
>
> @@ -2343,7 +2346,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>
> if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
> unlikely(cpu_of(rq) != p->scx.selected_cpu))
> - __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
> + __scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1);
> }
>
> static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
> @@ -2571,7 +2574,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
>
> if (!scx_rq_online(rq)) {
> if (enforce)
> - __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
> + __scx_add_event(scx_root,
> + SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
> return false;
> }
>
> @@ -3093,7 +3097,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
> if (prev_on_rq &&
> (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
> rq->scx.flags |= SCX_RQ_BAL_KEEP;
> - __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
> + __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
> goto has_tasks;
> }
> rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
> @@ -3424,6 +3428,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
>
> static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
> {
> + struct scx_sched *sch = scx_root;
> bool rq_bypass;
>
> /*
> @@ -3440,7 +3445,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
> return prev_cpu;
>
> rq_bypass = scx_rq_bypassing(task_rq(p));
> - if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
> + if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
> s32 cpu;
> struct task_struct **ddsp_taskp;
>
> @@ -3469,7 +3474,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
> p->scx.selected_cpu = cpu;
>
> if (rq_bypass)
> - __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
> + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
> return cpu;
> }
> }
> @@ -4410,6 +4415,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
> struct scx_dispatch_q *dsq;
> int node;
>
> + free_percpu(sch->event_stats_cpu);
> +
> for_each_node_state(node, N_POSSIBLE)
> kfree(sch->global_dsqs[node]);
> kfree(sch->global_dsqs);
> @@ -4452,10 +4459,11 @@ SCX_ATTR(ops);
> static ssize_t scx_attr_events_show(struct kobject *kobj,
> struct kobj_attribute *ka, char *buf)
> {
> + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
> struct scx_event_stats events;
> int at = 0;
>
> - scx_read_events(&events);
> + scx_read_events(sch, &events);
> at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
> at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
> at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
> @@ -4588,25 +4596,29 @@ static void scx_bypass(bool bypass)
> {
> static DEFINE_RAW_SPINLOCK(bypass_lock);
> static unsigned long bypass_timestamp;
> -
> - int cpu;
> + struct scx_sched *sch;
> unsigned long flags;
> + int cpu;
>
> raw_spin_lock_irqsave(&bypass_lock, flags);
> + sch = rcu_dereference_bh(scx_root);
> +
> if (bypass) {
> scx_bypass_depth++;
> WARN_ON_ONCE(scx_bypass_depth <= 0);
> if (scx_bypass_depth != 1)
> goto unlock;
> bypass_timestamp = ktime_get_ns();
> - scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
> + if (sch)
> + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
> } else {
> scx_bypass_depth--;
> WARN_ON_ONCE(scx_bypass_depth < 0);
> if (scx_bypass_depth != 0)
> goto unlock;
> - scx_add_event(SCX_EV_BYPASS_DURATION,
> - ktime_get_ns() - bypass_timestamp);
> + if (sch)
> + scx_add_event(sch, SCX_EV_BYPASS_DURATION,
> + ktime_get_ns() - bypass_timestamp);
> }
>
> atomic_inc(&scx_breather_depth);
> @@ -5179,7 +5191,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
> dump_line(&s, "Event counters");
> dump_line(&s, "--------------");
>
> - scx_read_events(&events);
> + scx_read_events(scx_root, &events);
> scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
> scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
> scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
> @@ -5287,16 +5299,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
> sch->global_dsqs[node] = dsq;
> }
>
> + sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
> + if (!sch->event_stats_cpu)
> + goto err_free_gdsqs;
> +
> atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
> sch->ops = *ops;
>
> sch->kobj.kset = scx_kset;
> ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
> if (ret < 0)
> - goto err_free_gdsqs;
> + goto err_event_stats;
>
> return sch;
>
> +err_event_stats:
> + free_percpu(sch->event_stats_cpu);
> err_free_gdsqs:
> for_each_node_state(node, N_POSSIBLE)
> kfree(sch->global_dsqs[node]);
> @@ -5372,15 +5390,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> mutex_lock(&scx_enable_mutex);
>
> - /*
> - * Clear event counters so a new scx scheduler gets
> - * fresh event counter values.
> - */
> - for_each_possible_cpu(cpu) {
> - struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
> - memset(e, 0, sizeof(*e));
> - }
> -
> if (!scx_helper) {
> WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
> if (!scx_helper) {
> @@ -7401,7 +7410,7 @@ __bpf_kfunc u64 scx_bpf_now(void)
> return clock;
> }
>
> -static void scx_read_events(struct scx_event_stats *events)
> +static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
> {
> struct scx_event_stats *e_cpu;
> int cpu;
> @@ -7409,7 +7418,7 @@ static void scx_read_events(struct scx_event_stats *events)
> /* Aggregate per-CPU event counters into @events. */
> memset(events, 0, sizeof(*events));
> for_each_possible_cpu(cpu) {
> - e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
> + e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
> scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
> scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
> scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
> @@ -7430,9 +7439,16 @@ static void scx_read_events(struct scx_event_stats *events)
> __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
> size_t events__sz)
> {
> + struct scx_sched *sch;
> struct scx_event_stats e_sys;
>
> - scx_read_events(&e_sys);
> + rcu_read_lock();
> + sch = rcu_dereference(scx_root);
> + if (sch)
> + scx_read_events(sch, &e_sys);
> + else
> + memset(&e_sys, 0, sizeof(e_sys));
> + rcu_read_unlock();
>
> /*
> * We cannot entirely trust a BPF-provided size since a BPF program
* [PATCH 11/12] sched_ext: Move disable machinery into scx_sched
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (9 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
2025-04-23 23:44 ` [PATCH 12/12] sched_ext: Clean up SCX_EXIT_NONE handling in scx_disable_workfn() Tejun Heo
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
Because disabling can be triggered from any context and the BPF scheduler
cannot be trusted, the disable path uses an irq_work to bounce out of the
triggering context and a kthread_work executed on an RT helper kthread to
perform the actual disabling. These must be per scheduler instance to
guarantee forward progress. Move them into scx_sched.
- If an scx_sched is accessible, its helper kthread is always valid, making
the `helper` check in schedule_scx_disable_work() unnecessary. As the
function becomes trivial once that test is removed, inline it.
- scx_create_rt_helper() has only one user: creating the disable helper
kthread. Inline it into scx_alloc_and_add_sched().
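For reference, below is a minimal sketch of the per-instance disable plumbing
described above: an irq_work bounces the request out of whatever context it
was raised in, and a kthread_work queued on a per-instance RT helper does the
actual teardown. This is not the kernel change itself; the "sketch_*" names
are made up and the error handling is abbreviated, while the kthread and
irq_work calls (including the NULL check on the worker) follow the diff below.

/*
 * Illustrative sketch only -- not the actual kernel code. The fields
 * mirror what this patch adds to struct scx_sched.
 */
#include <linux/container_of.h>
#include <linux/irq_work.h>
#include <linux/kthread.h>
#include <linux/sched.h>

struct sketch_sched {
	struct kthread_worker	*helper;	/* per-instance RT worker */
	struct irq_work		error_irq_work;	/* bounce from any context */
	struct kthread_work	disable_work;	/* teardown runs here */
};

static void sketch_disable_workfn(struct kthread_work *work)
{
	struct sketch_sched *sch = container_of(work, struct sketch_sched,
						disable_work);
	/* tear down @sch; runs in the helper kthread and may sleep */
}

static void sketch_error_irq_workfn(struct irq_work *irq_work)
{
	struct sketch_sched *sch = container_of(irq_work, struct sketch_sched,
						error_irq_work);
	/* now in a context where queueing kthread work is safe */
	kthread_queue_work(sch->helper, &sch->disable_work);
}

static int sketch_sched_init(struct sketch_sched *sch)
{
	sch->helper = kthread_run_worker(0, "sketch_helper");
	if (!sch->helper)
		return -ENOMEM;
	sched_set_fifo(sch->helper->task);

	init_irq_work(&sch->error_irq_work, sketch_error_irq_workfn);
	kthread_init_work(&sch->disable_work, sketch_disable_workfn);
	return 0;
}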
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 73 ++++++++++++++++------------------------------
1 file changed, 25 insertions(+), 48 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 75c91b58430c..d27193010b6a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -848,6 +848,10 @@ struct scx_sched {
struct scx_exit_info *exit_info;
struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
struct rcu_work rcu_work;
};
@@ -1021,7 +1025,6 @@ static DEFINE_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
-static struct kthread_worker *scx_helper;
static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
@@ -4415,6 +4418,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
struct scx_dispatch_q *dsq;
int node;
+ kthread_stop(sch->helper->task);
free_percpu(sch->event_stats_cpu);
for_each_node_state(node, N_POSSIBLE)
@@ -4737,7 +4741,7 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
@@ -4883,20 +4887,6 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_bypass(false);
}
-static DEFINE_KTHREAD_WORK(scx_disable_work, scx_disable_workfn);
-
-static void schedule_scx_disable_work(void)
-{
- struct kthread_worker *helper = READ_ONCE(scx_helper);
-
- /*
- * We may be called spuriously before the first bpf_sched_ext_reg(). If
- * scx_helper isn't set up yet, there's nothing to do.
- */
- if (helper)
- kthread_queue_work(helper, &scx_disable_work);
-}
-
static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
@@ -4909,7 +4899,7 @@ static void scx_disable(enum scx_exit_kind kind)
sch = rcu_dereference(scx_root);
if (sch) {
atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
- schedule_scx_disable_work();
+ kthread_queue_work(sch->helper, &sch->disable_work);
}
rcu_read_unlock();
}
@@ -5211,16 +5201,15 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
static void scx_error_irq_workfn(struct irq_work *irq_work)
{
- struct scx_exit_info *ei = scx_root->exit_info;
+ struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+ struct scx_exit_info *ei = sch->exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
- scx_dump_state(ei, scx_root->ops.exit_dump_len);
+ scx_dump_state(ei, sch->ops.exit_dump_len);
- schedule_scx_disable_work();
+ kthread_queue_work(sch->helper, &sch->disable_work);
}
-static DEFINE_IRQ_WORK(scx_error_irq_work, scx_error_irq_workfn);
-
static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
const char *fmt, ...)
{
@@ -5247,17 +5236,7 @@ static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- irq_work_queue(&scx_error_irq_work);
-}
-
-static struct kthread_worker *scx_create_rt_helper(const char *name)
-{
- struct kthread_worker *helper;
-
- helper = kthread_run_worker(0, name);
- if (helper)
- sched_set_fifo(helper->task);
- return helper;
+ irq_work_queue(&scx_root->error_irq_work);
}
static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
@@ -5303,16 +5282,25 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
if (!sch->event_stats_cpu)
goto err_free_gdsqs;
+ sch->helper = kthread_run_worker(0, "sched_ext_helper");
+ if (!sch->helper)
+ goto err_event_stats;
+ sched_set_fifo(sch->helper->task);
+
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+ kthread_init_work(&sch->disable_work, scx_disable_workfn);
sch->ops = *ops;
sch->kobj.kset = scx_kset;
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
- goto err_event_stats;
+ goto err_stop_helper;
return sch;
+err_stop_helper:
+ kthread_stop(sch->helper->task);
err_event_stats:
free_percpu(sch->event_stats_cpu);
err_free_gdsqs:
@@ -5390,14 +5378,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
mutex_lock(&scx_enable_mutex);
- if (!scx_helper) {
- WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
- if (!scx_helper) {
- ret = -ENOMEM;
- goto err_unlock;
- }
- }
-
if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY;
goto err_unlock;
@@ -5626,7 +5606,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* completion.
*/
scx_error("scx_enable() failed (%d)", ret);
- kthread_flush_work(&scx_disable_work);
+ kthread_flush_work(&sch->disable_work);
kobject_put(&sch->kobj);
return 0;
}
@@ -5779,7 +5759,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
struct scx_sched *sch = scx_root;
scx_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&scx_disable_work);
+ kthread_flush_work(&sch->disable_work);
kobject_put(&sch->kobj);
}
@@ -5902,10 +5882,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
static void sysrq_handle_sched_ext_reset(u8 key)
{
- if (scx_helper)
- scx_disable(SCX_EXIT_SYSRQ);
- else
- pr_info("sched_ext: BPF scheduler not yet used\n");
+ scx_disable(SCX_EXIT_SYSRQ);
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
--
2.49.0
* [PATCH 12/12] sched_ext: Clean up SCX_EXIT_NONE handling in scx_disable_workfn()
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
` (10 preceding siblings ...)
2025-04-23 23:44 ` [PATCH 11/12] sched_ext: Move disable machinery " Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo
With the global states and disable machinery moved into scx_sched,
scx_disable_workfn() can only be scheduled and run for that specific
scheduler instance, which makes it impossible for it to see SCX_EXIT_NONE.
Turn that condition into a WARN_ON_ONCE().
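As an illustration of why this is now safe, the exit_kind hand-off boils down
to a compare-and-exchange claim: a requester moves NONE to its exit reason at
most once, and the work function moves the reason to DONE exactly once. The
sketch below uses made-up names (sketch_*) and stand-in enum values; only the
atomic_try_cmpxchg() pattern is taken from the diff below.

/*
 * Sketch of the exit_kind hand-off, not the kernel code itself.
 * SKETCH_EXIT_* stand in for the SCX_EXIT_* values.
 */
#include <linux/atomic.h>
#include <linux/bug.h>

enum { SKETCH_EXIT_NONE, SKETCH_EXIT_DONE, SKETCH_EXIT_ERROR };

static void sketch_request_disable(atomic_t *exit_kind, int reason)
{
	int none = SKETCH_EXIT_NONE;

	/* only the first requester's reason sticks */
	atomic_try_cmpxchg(exit_kind, &none, reason);
	/* ...then queue the per-instance disable work... */
}

static void sketch_disable_workfn(atomic_t *exit_kind)
{
	int kind = atomic_read(exit_kind);

	while (true) {
		if (kind == SKETCH_EXIT_DONE)	/* already disabled */
			return;
		/* with per-instance scheduling, NONE can no longer be seen */
		WARN_ON_ONCE(kind == SKETCH_EXIT_NONE);
		/* on failure, @kind is updated and the loop retries */
		if (atomic_try_cmpxchg(exit_kind, &kind, SKETCH_EXIT_DONE))
			break;
	}
	/* ...actual teardown of the claimed instance... */
}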
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d27193010b6a..d963aa5c99e1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4749,13 +4749,9 @@ static void scx_disable_workfn(struct kthread_work *work)
kind = atomic_read(&sch->exit_kind);
while (true) {
- /*
- * NONE indicates that a new scx_ops has been registered since
- * disable was scheduled - don't kill the new ops. DONE
- * indicates that the ops has already been disabled.
- */
- if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+ if (kind == SCX_EXIT_DONE) /* already disabled? */
return;
+ WARN_ON_ONCE(kind == SCX_EXIT_NONE);
if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
--
2.49.0