* [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel

In preparation for supporting multiple hierarchical schedulers, this patchset
packages the state which is currently global but needs to become per
scheduler instance into the new struct scx_sched.

Currently, the only supported scheduler instance is the system-wide root one
(scx_root) and the code assumes scx_root in many places. A follow-up
patchset will introduce multiple scheduler support, scrutinizing and updating
each such occurrence appropriately.

There's a significant amount of churn but most changes are straightforward
if not trivial, and no user-visible changes are expected.

This patchset contains the following 12 patches:

0001-sched_ext-Introduce-scx_sched.patch
0002-sched_ext-Avoid-NULL-scx_root-deref-through-SCX_HAS_.patch
0003-sched_ext-Use-dynamic-allocation-for-scx_sched.patch
0004-sched_ext-Inline-create_dsq-into-scx_bpf_create_dsq.patch
0005-sched_ext-Factor-out-scx_alloc_and_add_sched.patch
0006-sched_ext-Move-dsq_hash-into-scx_sched.patch
0007-sched_ext-Move-global_dsqs-into-scx_sched.patch
0008-sched_ext-Relocate-scx_event_stats-definition.patch
0009-sched_ext-Factor-out-scx_read_events.patch
0010-sched_ext-Move-event_stats_cpu-into-scx_sched.patch
0011-sched_ext-Move-disable-machinery-into-scx_sched.patch
0012-sched_ext-Clean-up-SCX_EXIT_NONE-handling-in-scx_dis.patch

and is also available in the following git branch:

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git scx_sched

diffstat follows. Thanks.

 kernel/sched/ext.c      |  886 ++++++++++++++++++++++++++----------------------
 kernel/sched/ext_idle.c |    3
 2 files changed, 491 insertions(+), 398 deletions(-)

--
tejun



* [PATCH 01/12] sched_ext: Introduce scx_sched
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

To support multiple scheduler instances, collect some of the global
variables that should be specific to a scheduler instance into the new
struct scx_sched. scx_root is the root scheduler instance and points to a
static instance of struct scx_sched. Except for an extra dereference through
the scx_root pointer, this patch makes no functional changes.
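
For orientation, a condensed view of the change, taken from the diff below:
the new container, the static root instance, and one representative call-site
conversion (the field set grows in later patches of the series):

  struct scx_sched {
  	struct sched_ext_ops	ops;
  	DECLARE_BITMAP(has_op, SCX_OPI_END);
  	bool			warned_zero_slice;
  	atomic_t		exit_kind;
  	struct scx_exit_info	*exit_info;
  	struct kobject		*kobj;
  };

  static struct scx_sched __scx_root = {
  	.exit_kind		= ATOMIC_INIT(SCX_EXIT_DONE),
  };
  static struct scx_sched *scx_root = &__scx_root;

  /* ops-presence tests now name the scheduler instance explicitly, e.g.: */
  -	if (unlikely(!SCX_HAS_OP(enqueue)))
  +	if (unlikely(!SCX_HAS_OP(scx_root, enqueue)))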

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c      | 243 +++++++++++++++++++++-------------------
 kernel/sched/ext_idle.c |   3 +-
 2 files changed, 131 insertions(+), 115 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6e530a91e944..975f6963a01b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -763,6 +763,18 @@ enum scx_opi {
 	SCX_OPI_END			= SCX_OP_IDX(init),
 };
 
+struct scx_sched {
+	struct sched_ext_ops	ops;
+	DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+	bool			warned_zero_slice;
+
+	atomic_t		exit_kind;
+	struct scx_exit_info	*exit_info;
+
+	struct kobject		*kobj;
+};
+
 enum scx_wake_flags {
 	/* expose select WF_* flags as enums */
 	SCX_WAKE_FORK		= WF_FORK,
@@ -921,6 +933,12 @@ enum scx_ops_state {
 #define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
 #define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
 
+static struct scx_sched __scx_root = {
+	.exit_kind		= ATOMIC_INIT(SCX_EXIT_DONE),
+};
+
+static struct scx_sched *scx_root = &__scx_root;
+
 /*
  * During exit, a task may schedule after losing its PIDs. When disabling the
  * BPF scheduler, we need to be able to iterate tasks in every state to
@@ -943,14 +961,6 @@ static bool scx_init_task_enabled;
 static bool scx_switching_all;
 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
 
-static struct sched_ext_ops scx_ops;
-static bool scx_warned_zero_slice;
-
-static DECLARE_BITMAP(scx_has_op, SCX_OPI_END);
-
-static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
-static struct scx_exit_info *scx_exit_info;
-
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
@@ -1053,7 +1063,6 @@ static struct scx_dump_data scx_dump_data = {
 
 /* /sys/kernel/sched_ext interface */
 static struct kset *scx_kset;
-static struct kobject *scx_root_kobj;
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched_ext.h>
@@ -1072,7 +1081,7 @@ static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
 #define scx_error(fmt, args...)							\
 	__scx_error(SCX_EXIT_ERROR, fmt, ##args)
 
-#define SCX_HAS_OP(op)	test_bit(SCX_OP_IDX(op), scx_has_op)
+#define SCX_HAS_OP(sch, op)	test_bit(SCX_OP_IDX(op), (sch)->has_op)
 
 static long jiffies_delta_msecs(unsigned long at, unsigned long now)
 {
@@ -1168,25 +1177,25 @@ do {										\
 	update_locked_rq(rq);							\
 	if (mask) {								\
 		scx_kf_allow(mask);						\
-		scx_ops.op(args);						\
+		scx_root->ops.op(args);						\
 		scx_kf_disallow(mask);						\
 	} else {								\
-		scx_ops.op(args);						\
+		scx_root->ops.op(args);						\
 	}									\
 	update_locked_rq(NULL);							\
 } while (0)
 
 #define SCX_CALL_OP_RET(mask, op, rq, args...)					\
 ({										\
-	__typeof__(scx_ops.op(args)) __ret;					\
+	__typeof__(scx_root->ops.op(args)) __ret;				\
 										\
 	update_locked_rq(rq);							\
 	if (mask) {								\
 		scx_kf_allow(mask);						\
-		__ret = scx_ops.op(args);					\
+		__ret = scx_root->ops.op(args);					\
 		scx_kf_disallow(mask);						\
 	} else {								\
-		__ret = scx_ops.op(args);					\
+		__ret = scx_root->ops.op(args);					\
 	}									\
 	update_locked_rq(NULL);							\
 	__ret;									\
@@ -1213,7 +1222,7 @@ do {										\
 
 #define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...)			\
 ({										\
-	__typeof__(scx_ops.op(task, ##args)) __ret;				\
+	__typeof__(scx_root->ops.op(task, ##args)) __ret;			\
 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task;					\
 	__ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args);			\
@@ -1223,7 +1232,7 @@ do {										\
 
 #define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...)		\
 ({										\
-	__typeof__(scx_ops.op(task0, task1, ##args)) __ret;			\
+	__typeof__(scx_root->ops.op(task0, task1, ##args)) __ret;		\
 	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task0;					\
 	current->scx.kf_tasks[1] = task1;					\
@@ -1825,7 +1834,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
 	lockdep_assert_rq_held(rq);
 
 #ifdef CONFIG_SCHED_CORE
-	if (unlikely(SCX_HAS_OP(core_sched_before)))
+	if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
 		touch_core_sched(rq, p);
 #endif
 }
@@ -2200,20 +2209,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 		goto direct;
 
 	/* see %SCX_OPS_ENQ_EXITING */
-	if (!(scx_ops.flags & SCX_OPS_ENQ_EXITING) &&
+	if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
 	    unlikely(p->flags & PF_EXITING)) {
 		__scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
 		goto local;
 	}
 
 	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
-	if (!(scx_ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
+	if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
 	    is_migration_disabled(p)) {
 		__scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
 		goto local;
 	}
 
-	if (unlikely(!SCX_HAS_OP(enqueue)))
+	if (unlikely(!SCX_HAS_OP(scx_root, enqueue)))
 		goto global;
 
 	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
@@ -2320,7 +2329,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	rq->scx.nr_running++;
 	add_nr_running(rq, 1);
 
-	if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
+	if (SCX_HAS_OP(scx_root, runnable) && !task_on_rq_migrating(p))
 		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags);
 
 	if (enq_flags & SCX_ENQ_WAKEUP)
@@ -2355,7 +2364,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 		 */
 		BUG();
 	case SCX_OPSS_QUEUED:
-		if (SCX_HAS_OP(dequeue))
+		if (SCX_HAS_OP(scx_root, dequeue))
 			SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags);
 
 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
@@ -2403,12 +2412,12 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 	 * information meaningful to the BPF scheduler and can be suppressed by
 	 * skipping the callbacks if the task is !QUEUED.
 	 */
-	if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+	if (SCX_HAS_OP(scx_root, stopping) && task_current(rq, p)) {
 		update_curr_scx(rq);
 		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false);
 	}
 
-	if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
+	if (SCX_HAS_OP(scx_root, quiescent) && !task_on_rq_migrating(p))
 		SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags);
 
 	if (deq_flags & SCX_DEQ_SLEEP)
@@ -2428,7 +2437,7 @@ static void yield_task_scx(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
 
-	if (SCX_HAS_OP(yield))
+	if (SCX_HAS_OP(scx_root, yield))
 		SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL);
 	else
 		p->scx.slice = 0;
@@ -2438,7 +2447,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
 {
 	struct task_struct *from = rq->curr;
 
-	if (SCX_HAS_OP(yield))
+	if (SCX_HAS_OP(scx_root, yield))
 		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to);
 	else
 		return false;
@@ -2988,7 +2997,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
 	rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
 
-	if ((scx_ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
+	if ((scx_root->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
 	    unlikely(rq->scx.cpu_released)) {
 		/*
 		 * If the previous sched_class for the current CPU was not SCX,
@@ -2996,7 +3005,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * core. This callback complements ->cpu_release(), which is
 		 * emitted in switch_class().
 		 */
-		if (SCX_HAS_OP(cpu_acquire))
+		if (SCX_HAS_OP(scx_root, cpu_acquire))
 			SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL);
 		rq->scx.cpu_released = false;
 	}
@@ -3027,7 +3036,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (consume_global_dsq(rq))
 		goto has_tasks;
 
-	if (unlikely(!SCX_HAS_OP(dispatch)) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+	if (unlikely(!SCX_HAS_OP(scx_root, dispatch)) ||
+	    scx_rq_bypassing(rq) || !scx_rq_online(rq))
 		goto no_tasks;
 
 	dspc->rq = rq;
@@ -3077,7 +3087,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	 * %SCX_OPS_ENQ_LAST is in effect.
 	 */
 	if (prev_on_rq &&
-	    (!(scx_ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
+	    (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
 		__scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
 		goto has_tasks;
@@ -3163,7 +3173,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 	p->se.exec_start = rq_clock_task(rq);
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
-	if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
+	if (SCX_HAS_OP(scx_root, running) && (p->scx.flags & SCX_TASK_QUEUED))
 		SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p);
 
 	clr_task_runnable(p, true);
@@ -3217,7 +3227,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
 	 */
 	smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
 #endif
-	if (!(scx_ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
+	if (!(scx_root->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
 		return;
 
 	/*
@@ -3239,7 +3249,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
 	 *  next time that balance_scx() is invoked.
 	 */
 	if (!rq->scx.cpu_released) {
-		if (SCX_HAS_OP(cpu_release)) {
+		if (SCX_HAS_OP(scx_root, cpu_release)) {
 			struct scx_cpu_release_args args = {
 				.reason = preempt_reason_from_class(next_class),
 				.task = next,
@@ -3257,7 +3267,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 	update_curr_scx(rq);
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
-	if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+	if (SCX_HAS_OP(scx_root, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
 		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true);
 
 	if (p->scx.flags & SCX_TASK_QUEUED) {
@@ -3281,7 +3291,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 		 * which should trigger an explicit follow-up scheduling event.
 		 */
 		if (sched_class_above(&ext_sched_class, next->sched_class)) {
-			WARN_ON_ONCE(!(scx_ops.flags & SCX_OPS_ENQ_LAST));
+			WARN_ON_ONCE(!(scx_root->ops.flags & SCX_OPS_ENQ_LAST));
 			do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
 		} else {
 			do_enqueue_task(rq, p, 0, -1);
@@ -3356,10 +3366,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 		}
 
 		if (unlikely(!p->scx.slice)) {
-			if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+			if (!scx_rq_bypassing(rq) && !scx_root->warned_zero_slice) {
 				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
 						p->comm, p->pid, __func__);
-				scx_warned_zero_slice = true;
+				scx_root->warned_zero_slice = true;
 			}
 			refill_task_slice_dfl(p);
 		}
@@ -3395,7 +3405,8 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 	 * calling ops.core_sched_before(). Accesses are controlled by the
 	 * verifier.
 	 */
-	if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
+	if (SCX_HAS_OP(scx_root, core_sched_before) &&
+	    !scx_rq_bypassing(task_rq(a)))
 		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL,
 					      (struct task_struct *)a,
 					      (struct task_struct *)b);
@@ -3424,7 +3435,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		return prev_cpu;
 
 	rq_bypass = scx_rq_bypassing(task_rq(p));
-	if (likely(SCX_HAS_OP(select_cpu)) && !rq_bypass) {
+	if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
 		s32 cpu;
 		struct task_struct **ddsp_taskp;
 
@@ -3476,7 +3487,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 	 * Fine-grained memory write control is enforced by BPF making the const
 	 * designation pointless. Cast it away when calling the operation.
 	 */
-	if (SCX_HAS_OP(set_cpumask))
+	if (SCX_HAS_OP(scx_root, set_cpumask))
 		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL,
 				 p, (struct cpumask *)p->cpus_ptr);
 }
@@ -3488,11 +3499,11 @@ static void handle_hotplug(struct rq *rq, bool online)
 	atomic_long_inc(&scx_hotplug_seq);
 
 	if (scx_enabled())
-		scx_idle_update_selcpu_topology(&scx_ops);
+		scx_idle_update_selcpu_topology(&scx_root->ops);
 
-	if (online && SCX_HAS_OP(cpu_online))
+	if (online && SCX_HAS_OP(scx_root, cpu_online))
 		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, rq, cpu);
-	else if (!online && SCX_HAS_OP(cpu_offline))
+	else if (!online && SCX_HAS_OP(scx_root, cpu_offline))
 		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, rq, cpu);
 	else
 		scx_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
@@ -3595,7 +3606,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 	if (scx_rq_bypassing(rq)) {
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
-	} else if (SCX_HAS_OP(tick)) {
+	} else if (SCX_HAS_OP(scx_root, tick)) {
 		SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr);
 	}
 
@@ -3667,7 +3678,7 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
 
 	p->scx.disallow = false;
 
-	if (SCX_HAS_OP(init_task)) {
+	if (SCX_HAS_OP(scx_root, init_task)) {
 		struct scx_init_task_args args = {
 			SCX_INIT_TASK_ARGS_CGROUP(tg)
 			.fork = fork,
@@ -3730,11 +3741,11 @@ static void scx_enable_task(struct task_struct *p)
 
 	p->scx.weight = sched_weight_to_cgroup(weight);
 
-	if (SCX_HAS_OP(enable))
+	if (SCX_HAS_OP(scx_root, enable))
 		SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p);
 	scx_set_task_state(p, SCX_TASK_ENABLED);
 
-	if (SCX_HAS_OP(set_weight))
+	if (SCX_HAS_OP(scx_root, set_weight))
 		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
 }
 
@@ -3745,7 +3756,7 @@ static void scx_disable_task(struct task_struct *p)
 	lockdep_assert_rq_held(rq);
 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
 
-	if (SCX_HAS_OP(disable))
+	if (SCX_HAS_OP(scx_root, disable))
 		SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p);
 	scx_set_task_state(p, SCX_TASK_READY);
 }
@@ -3774,7 +3785,7 @@ static void scx_exit_task(struct task_struct *p)
 		return;
 	}
 
-	if (SCX_HAS_OP(exit_task))
+	if (SCX_HAS_OP(scx_root, exit_task))
 		SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
@@ -3883,7 +3894,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 	lockdep_assert_rq_held(task_rq(p));
 
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
-	if (SCX_HAS_OP(set_weight))
+	if (SCX_HAS_OP(scx_root, set_weight))
 		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
 }
 
@@ -3899,7 +3910,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 	 * set_cpus_allowed_scx() is not called while @p is associated with a
 	 * different scheduler class. Keep the BPF scheduler up-to-date.
 	 */
-	if (SCX_HAS_OP(set_cpumask))
+	if (SCX_HAS_OP(scx_root, set_cpumask))
 		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq,
 				 p, (struct cpumask *)p->cpus_ptr);
 }
@@ -3958,7 +3969,7 @@ int scx_tg_online(struct task_group *tg)
 	percpu_down_read(&scx_cgroup_rwsem);
 
 	if (scx_cgroup_enabled) {
-		if (SCX_HAS_OP(cgroup_init)) {
+		if (SCX_HAS_OP(scx_root, cgroup_init)) {
 			struct scx_cgroup_init_args args =
 				{ .weight = tg->scx_weight };
 
@@ -3983,7 +3994,7 @@ void scx_tg_offline(struct task_group *tg)
 
 	percpu_down_read(&scx_cgroup_rwsem);
 
-	if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
+	if (SCX_HAS_OP(scx_root, cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
 		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
 	tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
 
@@ -4016,7 +4027,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 		if (from == to)
 			continue;
 
-		if (SCX_HAS_OP(cgroup_prep_move)) {
+		if (SCX_HAS_OP(scx_root, cgroup_prep_move)) {
 			ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL,
 					      p, from, css->cgroup);
 			if (ret)
@@ -4030,7 +4041,8 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 
 err:
 	cgroup_taskset_for_each(p, css, tset) {
-		if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+		if (SCX_HAS_OP(scx_root, cgroup_cancel_move) &&
+		    p->scx.cgrp_moving_from)
 			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
 				    p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
@@ -4049,7 +4061,8 @@ void scx_cgroup_move_task(struct task_struct *p)
 	 * @p must have ops.cgroup_prep_move() called on it and thus
 	 * cgrp_moving_from set.
 	 */
-	if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+	if (SCX_HAS_OP(scx_root, cgroup_move) &&
+	    !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
 		SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL,
 				 p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
 	p->scx.cgrp_moving_from = NULL;
@@ -4069,7 +4082,8 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
 		goto out_unlock;
 
 	cgroup_taskset_for_each(p, css, tset) {
-		if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+		if (SCX_HAS_OP(scx_root, cgroup_cancel_move) &&
+		    p->scx.cgrp_moving_from)
 			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
 				    p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
@@ -4083,7 +4097,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 	percpu_down_read(&scx_cgroup_rwsem);
 
 	if (scx_cgroup_enabled && tg->scx_weight != weight) {
-		if (SCX_HAS_OP(cgroup_set_weight))
+		if (SCX_HAS_OP(scx_root, cgroup_set_weight))
 			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
 				    tg_cgrp(tg), weight);
 		tg->scx_weight = weight;
@@ -4266,7 +4280,7 @@ static void scx_cgroup_exit(void)
 			continue;
 		tg->scx_flags &= ~SCX_TG_INITED;
 
-		if (!scx_ops.cgroup_exit)
+		if (!scx_root->ops.cgroup_exit)
 			continue;
 
 		if (WARN_ON_ONCE(!css_tryget(css)))
@@ -4301,7 +4315,7 @@ static int scx_cgroup_init(void)
 		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
 			continue;
 
-		if (!scx_ops.cgroup_init) {
+		if (!scx_root->ops.cgroup_init) {
 			tg->scx_flags |= SCX_TG_INITED;
 			continue;
 		}
@@ -4402,7 +4416,7 @@ static void scx_kobj_release(struct kobject *kobj)
 static ssize_t scx_attr_ops_show(struct kobject *kobj,
 				 struct kobj_attribute *ka, char *buf)
 {
-	return sysfs_emit(buf, "%s\n", scx_ops.name);
+	return sysfs_emit(buf, "%s\n", scx_root->ops.name);
 }
 SCX_ATTR(ops);
 
@@ -4445,7 +4459,7 @@ static const struct kobj_type scx_ktype = {
 
 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
 {
-	return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+	return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
 }
 
 static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4468,7 +4482,7 @@ bool task_should_scx(int policy)
 bool scx_allow_ttwu_queue(const struct task_struct *p)
 {
 	return !scx_enabled() ||
-		(scx_ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
+		(scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
 		p->sched_class != &ext_sched_class;
 }
 
@@ -4497,7 +4511,7 @@ void scx_softlockup(u32 dur_s)
 		return;
 
 	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
-			smp_processor_id(), dur_s, scx_ops.name);
+			smp_processor_id(), dur_s, scx_root->ops.name);
 
 	/*
 	 * Some CPUs may be trapped in the dispatch paths. Enable breather
@@ -4686,14 +4700,14 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 
 static void scx_disable_workfn(struct kthread_work *work)
 {
-	struct scx_exit_info *ei = scx_exit_info;
+	struct scx_exit_info *ei = scx_root->exit_info;
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	struct rhashtable_iter rht_iter;
 	struct scx_dispatch_q *dsq;
 	int kind, cpu;
 
-	kind = atomic_read(&scx_exit_kind);
+	kind = atomic_read(&scx_root->exit_kind);
 	while (true) {
 		/*
 		 * NONE indicates that a new scx_ops has been registered since
@@ -4702,7 +4716,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 		 */
 		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
 			return;
-		if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+		if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
 			break;
 	}
 	ei->kind = kind;
@@ -4717,7 +4731,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 		break;
 	case SCX_DISABLED:
 		pr_warn("sched_ext: ops error detected without ops (%s)\n",
-			scx_exit_info->msg);
+			scx_root->exit_info->msg);
 		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
 		goto done;
 	default:
@@ -4784,25 +4798,26 @@ static void scx_disable_workfn(struct kthread_work *work)
 
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
 	static_branch_disable(&__scx_enabled);
-	bitmap_zero(scx_has_op, SCX_OPI_END);
+	bitmap_zero(scx_root->has_op, SCX_OPI_END);
 	scx_idle_disable();
 	synchronize_rcu();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-		       scx_ops.name, ei->reason);
+		       scx_root->ops.name, ei->reason);
 
 		if (ei->msg[0] != '\0')
-			pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+			pr_err("sched_ext: %s: %s\n",
+			       scx_root->ops.name, ei->msg);
 #ifdef CONFIG_STACKTRACE
 		stack_trace_print(ei->bt, ei->bt_len, 2);
 #endif
 	} else {
 		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-			scx_ops.name, ei->reason);
+			scx_root->ops.name, ei->reason);
 	}
 
-	if (scx_ops.exit)
+	if (scx_root->ops.exit)
 		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
 
 	cancel_delayed_work_sync(&scx_watchdog_work);
@@ -4813,11 +4828,11 @@ static void scx_disable_workfn(struct kthread_work *work)
 	 * asynchronously, sysfs could observe an object of the same name still
 	 * in the hierarchy when another scheduler is loaded.
 	 */
-	kobject_del(scx_root_kobj);
-	kobject_put(scx_root_kobj);
-	scx_root_kobj = NULL;
+	kobject_del(scx_root->kobj);
+	kobject_put(scx_root->kobj);
+	scx_root->kobj = NULL;
 
-	memset(&scx_ops, 0, sizeof(scx_ops));
+	memset(&scx_root->ops, 0, sizeof(scx_root->ops));
 
 	rhashtable_walk_enter(&dsq_hash, &rht_iter);
 	do {
@@ -4834,8 +4849,8 @@ static void scx_disable_workfn(struct kthread_work *work)
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
 
-	free_exit_info(scx_exit_info);
-	scx_exit_info = NULL;
+	free_exit_info(scx_root->exit_info);
+	scx_root->exit_info = NULL;
 
 	mutex_unlock(&scx_enable_mutex);
 
@@ -4865,7 +4880,7 @@ static void scx_disable(enum scx_exit_kind kind)
 	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
 		kind = SCX_EXIT_ERROR;
 
-	atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
+	atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
 
 	schedule_scx_disable_work();
 }
@@ -5007,7 +5022,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 		  p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
 	dump_line(s, "      cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
 
-	if (SCX_HAS_OP(dump_task)) {
+	if (SCX_HAS_OP(scx_root, dump_task)) {
 		ops_dump_init(s, "    ");
 		SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p);
 		ops_dump_exit();
@@ -5054,7 +5069,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		dump_stack_trace(&s, "  ", ei->bt, ei->bt_len);
 	}
 
-	if (SCX_HAS_OP(dump)) {
+	if (SCX_HAS_OP(scx_root, dump)) {
 		ops_dump_init(&s, "");
 		SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx);
 		ops_dump_exit();
@@ -5077,7 +5092,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		idle = list_empty(&rq->scx.runnable_list) &&
 			rq->curr->sched_class == &idle_sched_class;
 
-		if (idle && !SCX_HAS_OP(dump_cpu))
+		if (idle && !SCX_HAS_OP(scx_root, dump_cpu))
 			goto next;
 
 		/*
@@ -5111,7 +5126,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 				  cpumask_pr_args(rq->scx.cpus_to_wait));
 
 		used = seq_buf_used(&ns);
-		if (SCX_HAS_OP(dump_cpu)) {
+		if (SCX_HAS_OP(scx_root, dump_cpu)) {
 			ops_dump_init(&ns, "  ");
 			SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle);
 			ops_dump_exit();
@@ -5167,10 +5182,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 
 static void scx_error_irq_workfn(struct irq_work *irq_work)
 {
-	struct scx_exit_info *ei = scx_exit_info;
+	struct scx_exit_info *ei = scx_root->exit_info;
 
 	if (ei->kind >= SCX_EXIT_ERROR)
-		scx_dump_state(ei, scx_ops.exit_dump_len);
+		scx_dump_state(ei, scx_root->ops.exit_dump_len);
 
 	schedule_scx_disable_work();
 }
@@ -5180,11 +5195,11 @@ static DEFINE_IRQ_WORK(scx_error_irq_work, scx_error_irq_workfn);
 static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
 				      const char *fmt, ...)
 {
-	struct scx_exit_info *ei = scx_exit_info;
+	struct scx_exit_info *ei = scx_root->exit_info;
 	int none = SCX_EXIT_NONE;
 	va_list args;
 
-	if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+	if (!atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind))
 		return;
 
 	ei->exit_code = exit_code;
@@ -5327,19 +5342,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		goto err_unlock;
 	}
 
-	scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
-	if (!scx_root_kobj) {
+	scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
+	if (!scx_root->kobj) {
 		ret = -ENOMEM;
 		goto err_unlock;
 	}
 
-	scx_root_kobj->kset = scx_kset;
-	ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
+	scx_root->kobj->kset = scx_kset;
+	ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
 	if (ret < 0)
 		goto err;
 
-	scx_exit_info = alloc_exit_info(ops->exit_dump_len);
-	if (!scx_exit_info) {
+	scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
+	if (!scx_root->exit_info) {
 		ret = -ENOMEM;
 		goto err_del;
 	}
@@ -5348,12 +5363,12 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 * Set scx_ops, transition to ENABLING and clear exit info to arm the
 	 * disable path. Failure triggers full disabling from here on.
 	 */
-	scx_ops = *ops;
+	scx_root->ops = *ops;
 
 	WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
 
-	atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
-	scx_warned_zero_slice = false;
+	atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
+	scx_root->warned_zero_slice = false;
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
@@ -5368,7 +5383,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	scx_idle_enable(ops);
 
-	if (scx_ops.init) {
+	if (scx_root->ops.init) {
 		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
@@ -5380,7 +5395,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
-			set_bit(i, scx_has_op);
+			set_bit(i, scx_root->has_op);
 
 	check_hotplug_seq(ops);
 	scx_idle_update_selcpu_topology(ops);
@@ -5421,10 +5436,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
-			set_bit(i, scx_has_op);
+			set_bit(i, scx_root->has_op);
 
-	if (scx_ops.cpu_acquire || scx_ops.cpu_release)
-		scx_ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+	if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
+		scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
 
 	/*
 	 * Lock out forks, cgroup on/offlining and moves before opening the
@@ -5523,7 +5538,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_bypass(false);
 
 	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
-		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+		WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
 		goto err_disable;
 	}
 
@@ -5531,8 +5546,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		static_branch_enable(&__scx_switched_all);
 
 	pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
-		scx_ops.name, scx_switched_all() ? "" : " (partial)");
-	kobject_uevent(scx_root_kobj, KOBJ_ADD);
+		scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
+	kobject_uevent(scx_root->kobj, KOBJ_ADD);
 	mutex_unlock(&scx_enable_mutex);
 
 	atomic_long_inc(&scx_enable_seq);
@@ -5540,13 +5555,13 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	return 0;
 
 err_del:
-	kobject_del(scx_root_kobj);
+	kobject_del(scx_root->kobj);
 err:
-	kobject_put(scx_root_kobj);
-	scx_root_kobj = NULL;
-	if (scx_exit_info) {
-		free_exit_info(scx_exit_info);
-		scx_exit_info = NULL;
+	kobject_put(scx_root->kobj);
+	scx_root->kobj = NULL;
+	if (scx_root->exit_info) {
+		free_exit_info(scx_root->exit_info);
+		scx_root->exit_info = NULL;
 	}
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
@@ -6006,7 +6021,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
 	 */
 	if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
 	    class != &ext_sched_class) {
-		printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
+		printk("%sSched_ext: %s (%s%s)", log_lvl, scx_root->ops.name,
 		       scx_enable_state_str[state], all);
 		return;
 	}
@@ -6018,7 +6033,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
 
 	/* print everything onto one line to conserve console space */
 	printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
-	       log_lvl, scx_ops.name, scx_enable_state_str[state], all,
+	       log_lvl, scx_root->ops.name, scx_enable_state_str[state], all,
 	       runnable_at_buf);
 }
 
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 35aa309c9584..6915685cd3d6 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -744,7 +744,8 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
 	 * Idle transitions are indicated by do_notify being set to true,
 	 * managed by put_prev_task_idle()/set_next_task_idle().
 	 */
-	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+	if (SCX_HAS_OP(scx_root, update_idle) &&
+	    do_notify && !scx_rq_bypassing(rq))
 		SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
 
 	/*
-- 
2.49.0



* [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP()
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

SCX_HAS_OP() tests the scx_root->has_op bitmap. The bitmap currently lives in
a statically allocated struct scx_sched, is initialized while loading the BPF
scheduler and cleared while unloading, and thus can be tested at any time.
However, scx_root will be switched to dynamic allocation and thus won't
always be dereferenceable.

Most usages of SCX_HAS_OP() are already protected by scx_enabled() either
directly or indirectly (e.g. through a task which is on SCX). However, there
are a couple of places that could try to dereference a NULL scx_root. Update
them so that scx_root is guaranteed to be valid before SCX_HAS_OP() is called.

- In handle_hotplug(), test whether scx_root is NULL before doing anything
  else. This is safe because scx_root updates will be protected by
  cpus_read_lock(), as sketched after this list.

- In scx_tg_offline(), test scx_cgroup_enabled before invoking SCX_HAS_OP(),
  which should guarantee that scx_root won't turn NULL. This is also in line
  with other cgroup operations. As the code path is synchronized against
  scx_cgroup_init/exit() through scx_cgroup_rwsem, this shouldn't cause any
  behavior differences.
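
The guard relies on scx_root being published and cleared under
cpus_read_lock(), which the next patch ("Use dynamic allocation for
scx_sched") makes explicit. A condensed sketch of the pairing, quoted from
that patch for context only:

  	/* enable path, inside cpus_read_lock() */
  	rcu_assign_pointer(scx_root, sch);

  	/* disable path */
  	cpus_read_lock();
  	RCU_INIT_POINTER(scx_root, NULL);
  	cpus_read_unlock();

  	/* hence, in handle_hotplug(), which runs under the hotplug lock: */
  	if (!scx_root)
  		return;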

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 975f6963a01b..ad392890d2dd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3498,6 +3498,14 @@ static void handle_hotplug(struct rq *rq, bool online)
 
 	atomic_long_inc(&scx_hotplug_seq);
 
+	/*
+	 * scx_root updates are protected by cpus_read_lock() and will stay
+	 * stable here. Note that we can't depend on scx_enabled() test as the
+	 * hotplug ops need to be enabled before __scx_enabled is set.
+	 */
+	if (!scx_root)
+		return;
+
 	if (scx_enabled())
 		scx_idle_update_selcpu_topology(&scx_root->ops);
 
@@ -3994,7 +4002,8 @@ void scx_tg_offline(struct task_group *tg)
 
 	percpu_down_read(&scx_cgroup_rwsem);
 
-	if (SCX_HAS_OP(scx_root, cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
+	if (scx_cgroup_enabled && SCX_HAS_OP(scx_root, cgroup_exit) &&
+	    (tg->scx_flags & SCX_TG_INITED))
 		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
 	tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
 
-- 
2.49.0



* [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

To prepare for supporting multiple schedulers, allocate scx_sched
dynamically. scx_sched->kobj is now an embedded field and the kobj's
lifetime determines the lifetime of the containing scx_sched.

- Enable path is updated so that kobj init and addition are performed later.

- scx_sched freeing is initiated in scx_kobj_release() and also goes through
  an rcu_work so that scx_root can be accessed from an unsynchronized path,
  scx_disable(). See the teardown sketch below.

No behavior changes intended.
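
The resulting teardown path, condensed from the diff below: the final
kobject_put() invokes the release callback, which defers the actual freeing
through an rcu_work so that RCU readers of scx_root remain safe:

  static void scx_sched_free_rcu_work(struct work_struct *work)
  {
  	struct rcu_work *rcu_work = to_rcu_work(work);
  	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);

  	free_exit_info(sch->exit_info);
  	kfree(sch);
  }

  static void scx_kobj_release(struct kobject *kobj)
  {
  	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);

  	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
  	queue_rcu_work(system_unbound_wq, &sch->rcu_work);
  }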

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 151 ++++++++++++++++++++++++++-------------------
 1 file changed, 86 insertions(+), 65 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ad392890d2dd..612232c66d13 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -772,7 +772,8 @@ struct scx_sched {
 	atomic_t		exit_kind;
 	struct scx_exit_info	*exit_info;
 
-	struct kobject		*kobj;
+	struct kobject		kobj;
+	struct rcu_work		rcu_work;
 };
 
 enum scx_wake_flags {
@@ -933,11 +934,7 @@ enum scx_ops_state {
 #define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
 #define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
 
-static struct scx_sched __scx_root = {
-	.exit_kind		= ATOMIC_INIT(SCX_EXIT_DONE),
-};
-
-static struct scx_sched *scx_root = &__scx_root;
+static struct scx_sched __rcu *scx_root;
 
 /*
  * During exit, a task may schedule after losing its PIDs. When disabling the
@@ -4417,9 +4414,23 @@ static const struct attribute_group scx_global_attr_group = {
 	.attrs = scx_global_attrs,
 };
 
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+	struct rcu_work *rcu_work = to_rcu_work(work);
+	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+
+	free_exit_info(sch->exit_info);
+	kfree(sch);
+}
+
 static void scx_kobj_release(struct kobject *kobj)
 {
-	kfree(kobj);
+	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+	queue_rcu_work(system_unbound_wq, &sch->rcu_work);
 }
 
 static ssize_t scx_attr_ops_show(struct kobject *kobj,
@@ -4709,14 +4720,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 
 static void scx_disable_workfn(struct kthread_work *work)
 {
-	struct scx_exit_info *ei = scx_root->exit_info;
+	struct scx_sched *sch = scx_root;
+	struct scx_exit_info *ei = sch->exit_info;
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	struct rhashtable_iter rht_iter;
 	struct scx_dispatch_q *dsq;
 	int kind, cpu;
 
-	kind = atomic_read(&scx_root->exit_kind);
+	kind = atomic_read(&sch->exit_kind);
 	while (true) {
 		/*
 		 * NONE indicates that a new scx_ops has been registered since
@@ -4725,7 +4737,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 		 */
 		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
 			return;
-		if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
+		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
 			break;
 	}
 	ei->kind = kind;
@@ -4740,7 +4752,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 		break;
 	case SCX_DISABLED:
 		pr_warn("sched_ext: ops error detected without ops (%s)\n",
-			scx_root->exit_info->msg);
+			sch->exit_info->msg);
 		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
 		goto done;
 	default:
@@ -4807,41 +4819,43 @@ static void scx_disable_workfn(struct kthread_work *work)
 
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
 	static_branch_disable(&__scx_enabled);
-	bitmap_zero(scx_root->has_op, SCX_OPI_END);
+	bitmap_zero(sch->has_op, SCX_OPI_END);
 	scx_idle_disable();
 	synchronize_rcu();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-		       scx_root->ops.name, ei->reason);
+		       sch->ops.name, ei->reason);
 
 		if (ei->msg[0] != '\0')
-			pr_err("sched_ext: %s: %s\n",
-			       scx_root->ops.name, ei->msg);
+			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
 #ifdef CONFIG_STACKTRACE
 		stack_trace_print(ei->bt, ei->bt_len, 2);
 #endif
 	} else {
 		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-			scx_root->ops.name, ei->reason);
+			sch->ops.name, ei->reason);
 	}
 
-	if (scx_root->ops.exit)
+	if (sch->ops.exit)
 		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
 
 	cancel_delayed_work_sync(&scx_watchdog_work);
 
 	/*
-	 * Delete the kobject from the hierarchy eagerly in addition to just
-	 * dropping a reference. Otherwise, if the object is deleted
-	 * asynchronously, sysfs could observe an object of the same name still
-	 * in the hierarchy when another scheduler is loaded.
+	 * scx_root clearing must be inside cpus_read_lock(). See
+	 * handle_hotplug().
 	 */
-	kobject_del(scx_root->kobj);
-	kobject_put(scx_root->kobj);
-	scx_root->kobj = NULL;
+	cpus_read_lock();
+	RCU_INIT_POINTER(scx_root, NULL);
+	cpus_read_unlock();
 
-	memset(&scx_root->ops, 0, sizeof(scx_root->ops));
+	/*
+	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+	 * could observe an object of the same name still in the hierarchy when
+	 * the next scheduler is loaded.
+	 */
+	kobject_del(&sch->kobj);
 
 	rhashtable_walk_enter(&dsq_hash, &rht_iter);
 	do {
@@ -4858,9 +4872,6 @@ static void scx_disable_workfn(struct kthread_work *work)
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
 
-	free_exit_info(scx_root->exit_info);
-	scx_root->exit_info = NULL;
-
 	mutex_unlock(&scx_enable_mutex);
 
 	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4885,13 +4896,18 @@ static void schedule_scx_disable_work(void)
 static void scx_disable(enum scx_exit_kind kind)
 {
 	int none = SCX_EXIT_NONE;
+	struct scx_sched *sch;
 
 	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
 		kind = SCX_EXIT_ERROR;
 
-	atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
-
-	schedule_scx_disable_work();
+	rcu_read_lock();
+	sch = rcu_dereference(scx_root);
+	if (sch) {
+		atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+		schedule_scx_disable_work();
+	}
+	rcu_read_unlock();
 }
 
 static void dump_newline(struct seq_buf *s)
@@ -5288,6 +5304,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
 
 static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 {
+	struct scx_sched *sch;
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	unsigned long timeout;
@@ -5351,33 +5368,32 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		goto err_unlock;
 	}
 
-	scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
-	if (!scx_root->kobj) {
+	sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+	if (!sch) {
 		ret = -ENOMEM;
 		goto err_unlock;
 	}
 
-	scx_root->kobj->kset = scx_kset;
-	ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
-	if (ret < 0)
-		goto err;
-
-	scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
-	if (!scx_root->exit_info) {
+	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+	if (!sch->exit_info) {
 		ret = -ENOMEM;
-		goto err_del;
+		goto err_free;
 	}
 
+	sch->kobj.kset = scx_kset;
+	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+	if (ret < 0)
+		goto err_free;
+
+	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+	sch->ops = *ops;
+
 	/*
-	 * Set scx_ops, transition to ENABLING and clear exit info to arm the
-	 * disable path. Failure triggers full disabling from here on.
+	 * Transition to ENABLING and clear exit info to arm the disable path.
+	 * Failure triggers full disabling from here on.
 	 */
-	scx_root->ops = *ops;
-
 	WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
-
-	atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
-	scx_root->warned_zero_slice = false;
+	WARN_ON_ONCE(scx_root);
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
@@ -5390,9 +5406,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 */
 	cpus_read_lock();
 
+	/*
+	 * Make the scheduler instance visible. Must be inside cpus_read_lock().
+	 * See handle_hotplug().
+	 */
+	rcu_assign_pointer(scx_root, sch);
+
 	scx_idle_enable(ops);
 
-	if (scx_root->ops.init) {
+	if (sch->ops.init) {
 		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
@@ -5404,7 +5426,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
-			set_bit(i, scx_root->has_op);
+			set_bit(i, sch->has_op);
 
 	check_hotplug_seq(ops);
 	scx_idle_update_selcpu_topology(ops);
@@ -5445,10 +5467,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
-			set_bit(i, scx_root->has_op);
+			set_bit(i, sch->has_op);
 
-	if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
-		scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+	if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+		sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
 
 	/*
 	 * Lock out forks, cgroup on/offlining and moves before opening the
@@ -5547,7 +5569,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_bypass(false);
 
 	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
-		WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
+		WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
 		goto err_disable;
 	}
 
@@ -5555,23 +5577,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		static_branch_enable(&__scx_switched_all);
 
 	pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
-		scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
-	kobject_uevent(scx_root->kobj, KOBJ_ADD);
+		sch->ops.name, scx_switched_all() ? "" : " (partial)");
+	kobject_uevent(&sch->kobj, KOBJ_ADD);
 	mutex_unlock(&scx_enable_mutex);
 
 	atomic_long_inc(&scx_enable_seq);
 
 	return 0;
 
-err_del:
-	kobject_del(scx_root->kobj);
-err:
-	kobject_put(scx_root->kobj);
-	scx_root->kobj = NULL;
-	if (scx_root->exit_info) {
-		free_exit_info(scx_root->exit_info);
-		scx_root->exit_info = NULL;
-	}
+err_free:
+	if (sch->exit_info)
+		free_exit_info(sch->exit_info);
+	kfree(sch);
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
 	return ret;
@@ -5593,6 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 */
 	scx_error("scx_enable() failed (%d)", ret);
 	kthread_flush_work(&scx_disable_work);
+	kobject_put(&sch->kobj);
 	return 0;
 }
 
@@ -5741,8 +5759,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 {
+	struct scx_sched *sch = scx_root;
+
 	scx_disable(SCX_EXIT_UNREG);
 	kthread_flush_work(&scx_disable_work);
+	kobject_put(&sch->kobj);
 }
 
 static int bpf_scx_init(struct btf *btf)
-- 
2.49.0



* [PATCH 04/12] sched_ext: Inline create_dsq() into scx_bpf_create_dsq()
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

create_dsq() is only used by scx_bpf_create_dsq() and the separation gets in
the way of making dsq_hash per scx_sched. Inline it into
scx_bpf_create_dsq(). While at it, add unlikely() around the
SCX_DSQ_FLAG_BUILTIN test.
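
For reference, a minimal BPF-side use of the kfunc being reshuffled here.
This mirrors the existing example schedulers and is not part of the patch;
MY_DSQ is an arbitrary scheduler-defined DSQ id:

  s32 BPF_STRUCT_OPS_SLEEPABLE(my_init)
  {
  	/* -1 == NUMA_NO_NODE: no node preference for the DSQ */
  	return scx_bpf_create_dsq(MY_DSQ, -1);
  }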

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 42 ++++++++++++++++++------------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 612232c66d13..8ec32c458f16 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4193,29 +4193,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
 	dsq->id = dsq_id;
 }
 
-static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
-{
-	struct scx_dispatch_q *dsq;
-	int ret;
-
-	if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
-		return ERR_PTR(-EINVAL);
-
-	dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
-	if (!dsq)
-		return ERR_PTR(-ENOMEM);
-
-	init_dsq(dsq, dsq_id);
-
-	ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
-					    dsq_hash_params);
-	if (ret) {
-		kfree(dsq);
-		return ERR_PTR(ret);
-	}
-	return dsq;
-}
-
 static void free_dsq_irq_workfn(struct irq_work *irq_work)
 {
 	struct llist_node *to_free = llist_del_all(&dsqs_to_free);
@@ -6708,10 +6685,27 @@ __bpf_kfunc_start_defs();
  */
 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 {
+	struct scx_dispatch_q *dsq;
+	s32 ret;
+
 	if (unlikely(node >= (int)nr_node_ids ||
 		     (node < 0 && node != NUMA_NO_NODE)))
 		return -EINVAL;
-	return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+
+	if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
+		return -EINVAL;
+
+	dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+	if (!dsq)
+		return -ENOMEM;
+
+	init_dsq(dsq, dsq_id);
+
+	ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
+					    dsq_hash_params);
+	if (ret)
+		kfree(dsq);
+	return ret;
 }
 
 __bpf_kfunc_end_defs();
-- 
2.49.0



* [PATCH 05/12] sched_ext: Factor out scx_alloc_and_add_sched()
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

More state will be moved into scx_sched. Factor out the allocation and
kobject addition path into scx_alloc_and_add_sched().
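
The caller-side shape this enables in scx_enable(), quoted from the diff
below:

  	sch = scx_alloc_and_add_sched(ops);
  	if (IS_ERR(sch)) {
  		ret = PTR_ERR(sch);
  		goto err_unlock;
  	}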

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 56 +++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8ec32c458f16..88a0f6d9cb1e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5233,6 +5233,38 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
 	return helper;
 }
 
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+{
+	struct scx_sched *sch;
+	int ret;
+
+	sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+	if (!sch)
+		return ERR_PTR(-ENOMEM);
+
+	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+	if (!sch->exit_info) {
+		ret = -ENOMEM;
+		goto err_free_sch;
+	}
+
+	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+	sch->ops = *ops;
+
+	sch->kobj.kset = scx_kset;
+	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+	if (ret < 0)
+		goto err_free_ei;
+
+	return sch;
+
+err_free_ei:
+	free_exit_info(sch->exit_info);
+err_free_sch:
+	kfree(sch);
+	return ERR_PTR(ret);
+}
+
 static void check_hotplug_seq(const struct sched_ext_ops *ops)
 {
 	unsigned long long global_hotplug_seq;
@@ -5345,26 +5377,12 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		goto err_unlock;
 	}
 
-	sch = kzalloc(sizeof(*sch), GFP_KERNEL);
-	if (!sch) {
-		ret = -ENOMEM;
+	sch = scx_alloc_and_add_sched(ops);
+	if (IS_ERR(sch)) {
+		ret = PTR_ERR(sch);
 		goto err_unlock;
 	}
 
-	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
-	if (!sch->exit_info) {
-		ret = -ENOMEM;
-		goto err_free;
-	}
-
-	sch->kobj.kset = scx_kset;
-	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
-	if (ret < 0)
-		goto err_free;
-
-	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
-	sch->ops = *ops;
-
 	/*
 	 * Transition to ENABLING and clear exit info to arm the disable path.
 	 * Failure triggers full disabling from here on.
@@ -5562,10 +5580,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	return 0;
 
-err_free:
-	if (sch->exit_info)
-		free_exit_info(sch->exit_info);
-	kfree(sch);
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
 	return ret;
-- 
2.49.0



* [PATCH 06/12] sched_ext: Move dsq_hash into scx_sched
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

User DSQs are going to become per scheduler instance. Move dsq_hash into
scx_sched. This shifts the code that assumes scx_root to be the only
scx_sched instance up the call stack but doesn't remove such assumptions yet.
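
Condensed from the diff below: the lookup now takes the scheduler instance,
and callers snapshot the instance once and pass it down; for now every
caller still just uses scx_root:

  static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
  {
  	return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
  }

  /* e.g. in do_enqueue_task() */
  	struct scx_sched *sch = scx_root;
  	...
  	direct_dispatch(sch, p, enq_flags);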

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 120 ++++++++++++++++++++++++++++++---------------
 1 file changed, 80 insertions(+), 40 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 88a0f6d9cb1e..a086b024301f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -767,6 +767,7 @@ struct scx_sched {
 	struct sched_ext_ops	ops;
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
+	struct rhashtable	dsq_hash;
 	bool			warned_zero_slice;
 
 	atomic_t		exit_kind;
@@ -1013,7 +1014,6 @@ static const struct rhashtable_params dsq_hash_params = {
 	.head_offset		= offsetof(struct scx_dispatch_q, hash_node),
 };
 
-static struct rhashtable dsq_hash;
 static LLIST_HEAD(dsqs_to_free);
 
 /* dispatch buf */
@@ -1111,9 +1111,9 @@ static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
 	return global_dsqs[cpu_to_node(task_cpu(p))];
 }
 
-static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
 {
-	return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+	return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
 }
 
 /*
@@ -2056,7 +2056,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
 		raw_spin_unlock(&dsq->lock);
 }
 
-static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
+						    struct rq *rq, u64 dsq_id,
 						    struct task_struct *p)
 {
 	struct scx_dispatch_q *dsq;
@@ -2076,7 +2077,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
 	if (dsq_id == SCX_DSQ_GLOBAL)
 		dsq = find_global_dsq(p);
 	else
-		dsq = find_user_dsq(dsq_id);
+		dsq = find_user_dsq(sch, dsq_id);
 
 	if (unlikely(!dsq)) {
 		scx_error("non-existent DSQ 0x%llx for %s[%d]",
@@ -2117,11 +2118,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
 	p->scx.ddsp_enq_flags = enq_flags;
 }
 
-static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
+			    u64 enq_flags)
 {
 	struct rq *rq = task_rq(p);
 	struct scx_dispatch_q *dsq =
-		find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+		find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
 
 	touch_core_sched_dispatch(rq, p);
 
@@ -2180,6 +2182,7 @@ static bool scx_rq_online(struct rq *rq)
 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 			    int sticky_cpu)
 {
+	struct scx_sched *sch = scx_root;
 	struct task_struct **ddsp_taskp;
 	unsigned long qseq;
 
@@ -2246,7 +2249,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	return;
 
 direct:
-	direct_dispatch(p, enq_flags);
+	direct_dispatch(sch, p, enq_flags);
 	return;
 
 local:
@@ -2906,7 +2909,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
  * was valid in the first place. Make sure that the task is still owned by the
  * BPF scheduler and claim the ownership before dispatching.
  */
-static void finish_dispatch(struct rq *rq, struct task_struct *p,
+static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
+			    struct task_struct *p,
 			    unsigned long qseq_at_dispatch,
 			    u64 dsq_id, u64 enq_flags)
 {
@@ -2959,7 +2963,7 @@ static void finish_dispatch(struct rq *rq, struct task_struct *p,
 
 	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
 
-	dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
 
 	if (dsq->id == SCX_DSQ_LOCAL)
 		dispatch_to_local_dsq(rq, dsq, p, enq_flags);
@@ -2967,7 +2971,7 @@ static void finish_dispatch(struct rq *rq, struct task_struct *p,
 		dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
 }
 
-static void flush_dispatch_buf(struct rq *rq)
+static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
 {
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	u32 u;
@@ -2975,7 +2979,7 @@ static void flush_dispatch_buf(struct rq *rq)
 	for (u = 0; u < dspc->cursor; u++) {
 		struct scx_dsp_buf_ent *ent = &dspc->buf[u];
 
-		finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+		finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
 				ent->enq_flags);
 	}
 
@@ -2985,6 +2989,7 @@ static void flush_dispatch_buf(struct rq *rq)
 
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
+	struct scx_sched *sch = scx_root;
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	bool prev_on_scx = prev->sched_class == &ext_sched_class;
 	bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
@@ -3052,7 +3057,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq),
 			    prev_on_scx ? prev : NULL);
 
-		flush_dispatch_buf(rq);
+		flush_dispatch_buf(sch, rq);
 
 		if (prev_on_rq && prev->scx.slice) {
 			rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3146,11 +3151,12 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 	 */
 	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
 				struct task_struct, scx.dsq_list.node))) {
+		struct scx_sched *sch = scx_root;
 		struct scx_dispatch_q *dsq;
 
 		list_del_init(&p->scx.dsq_list.node);
 
-		dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+		dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
 		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
 			dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
 	}
@@ -4204,14 +4210,14 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
 
 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
 
-static void destroy_dsq(u64 dsq_id)
+static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
 {
 	struct scx_dispatch_q *dsq;
 	unsigned long flags;
 
 	rcu_read_lock();
 
-	dsq = find_user_dsq(dsq_id);
+	dsq = find_user_dsq(sch, dsq_id);
 	if (!dsq)
 		goto out_unlock_rcu;
 
@@ -4223,7 +4229,8 @@ static void destroy_dsq(u64 dsq_id)
 		goto out_unlock_dsq;
 	}
 
-	if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+	if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
+				   dsq_hash_params))
 		goto out_unlock_dsq;
 
 	/*
@@ -4397,7 +4404,21 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 {
 	struct rcu_work *rcu_work = to_rcu_work(work);
 	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+	struct rhashtable_iter rht_iter;
+	struct scx_dispatch_q *dsq;
+
+	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
+	do {
+		rhashtable_walk_start(&rht_iter);
+
+		while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+			destroy_dsq(sch, dsq->id);
+
+		rhashtable_walk_stop(&rht_iter);
+	} while (dsq == ERR_PTR(-EAGAIN));
+	rhashtable_walk_exit(&rht_iter);
 
+	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
 	kfree(sch);
 }
@@ -4701,8 +4722,6 @@ static void scx_disable_workfn(struct kthread_work *work)
 	struct scx_exit_info *ei = sch->exit_info;
 	struct scx_task_iter sti;
 	struct task_struct *p;
-	struct rhashtable_iter rht_iter;
-	struct scx_dispatch_q *dsq;
 	int kind, cpu;
 
 	kind = atomic_read(&sch->exit_kind);
@@ -4834,17 +4853,6 @@ static void scx_disable_workfn(struct kthread_work *work)
 	 */
 	kobject_del(&sch->kobj);
 
-	rhashtable_walk_enter(&dsq_hash, &rht_iter);
-	do {
-		rhashtable_walk_start(&rht_iter);
-
-		while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
-			destroy_dsq(dsq->id);
-
-		rhashtable_walk_stop(&rht_iter);
-	} while (dsq == ERR_PTR(-EAGAIN));
-	rhashtable_walk_exit(&rht_iter);
-
 	free_percpu(scx_dsp_ctx);
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
@@ -5248,16 +5256,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 		goto err_free_sch;
 	}
 
+	ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
+	if (ret < 0)
+		goto err_free_ei;
+
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 	sch->ops = *ops;
 
 	sch->kobj.kset = scx_kset;
 	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
 	if (ret < 0)
-		goto err_free_ei;
+		goto err_free_hash;
 
 	return sch;
 
+err_free_hash:
+	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 err_free_ei:
 	free_exit_info(sch->exit_info);
 err_free_sch:
@@ -6098,7 +6112,6 @@ void __init init_sched_ext_class(void)
 	WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
 		   SCX_TG_ONLINE);
 
-	BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
 	scx_idle_init_masks();
 
 	scx_kick_cpus_pnt_seqs =
@@ -6299,6 +6312,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 			 struct task_struct *p, u64 dsq_id, u64 enq_flags)
 {
+	struct scx_sched *sch = scx_root;
 	struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
 	struct rq *this_rq, *src_rq, *locked_rq;
 	bool dispatched = false;
@@ -6351,7 +6365,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	}
 
 	/* @p is still on $src_dsq and stable, determine the destination */
-	dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+	dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
 
 	/*
 	 * Apply vtime and slice updates before moving so that the new time is
@@ -6431,15 +6445,16 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
  */
 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
 {
+	struct scx_sched *sch = scx_root;
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct scx_dispatch_q *dsq;
 
 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
 		return false;
 
-	flush_dispatch_buf(dspc->rq);
+	flush_dispatch_buf(sch, dspc->rq);
 
-	dsq = find_user_dsq(dsq_id);
+	dsq = find_user_dsq(sch, dsq_id);
 	if (unlikely(!dsq)) {
 		scx_error("invalid DSQ ID 0x%016llx", dsq_id);
 		return false;
@@ -6700,6 +6715,7 @@ __bpf_kfunc_start_defs();
 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 {
 	struct scx_dispatch_q *dsq;
+	struct scx_sched *sch;
 	s32 ret;
 
 	if (unlikely(node >= (int)nr_node_ids ||
@@ -6715,8 +6731,16 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 
 	init_dsq(dsq, dsq_id);
 
-	ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
-					    dsq_hash_params);
+	rcu_read_lock();
+
+	sch = rcu_dereference(scx_root);
+	if (sch)
+		ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
+						    dsq_hash_params);
+	else
+		ret = -ENODEV;
+
+	rcu_read_unlock();
 	if (ret)
 		kfree(dsq);
 	return ret;
@@ -6815,11 +6839,18 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
  */
 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
 {
+	struct scx_sched *sch;
 	struct scx_dispatch_q *dsq;
 	s32 ret;
 
 	preempt_disable();
 
+	sch = rcu_dereference_sched(scx_root);
+	if (!sch) {
+		ret = -ENODEV;
+		goto out;
+	}
+
 	if (dsq_id == SCX_DSQ_LOCAL) {
 		ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
 		goto out;
@@ -6831,7 +6862,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
 			goto out;
 		}
 	} else {
-		dsq = find_user_dsq(dsq_id);
+		dsq = find_user_dsq(sch, dsq_id);
 		if (dsq) {
 			ret = READ_ONCE(dsq->nr);
 			goto out;
@@ -6854,7 +6885,11 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
  */
 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
 {
-	destroy_dsq(dsq_id);
+	struct scx_sched *sch;
+
+	sch = rcu_dereference(scx_root);
+	if (sch)
+		destroy_dsq(sch, dsq_id);
 }
 
 /**
@@ -6871,16 +6906,21 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
 				     u64 flags)
 {
 	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+	struct scx_sched *sch;
 
 	BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
 		     sizeof(struct bpf_iter_scx_dsq));
 	BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
 		     __alignof__(struct bpf_iter_scx_dsq));
 
+	sch = rcu_dereference(scx_root);
+	if (!sch)
+		return -ENODEV;
+
 	if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
 		return -EINVAL;
 
-	kit->dsq = find_user_dsq(dsq_id);
+	kit->dsq = find_user_dsq(sch, dsq_id);
 	if (!kit->dsq)
 		return -ENOENT;
 
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 07/12] sched_ext: Move global_dsqs into scx_sched
  2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
                   ` (5 preceding siblings ...)
  2025-04-23 23:44 ` [PATCH 06/12] sched_ext: Move dsq_hash into scx_sched Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
  2025-04-23 23:44 ` [PATCH 08/12] sched_ext: Relocate scx_event_stats definition Tejun Heo
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

Global DSQs are going to become per scheduler instance. Move global_dsqs
into scx_sched. find_global_dsq() already takes a task_struct pointer as an
argument and should later be able to determine the scx_sched to use from
that. For now, assume scx_root.
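
For illustration, the global DSQ helpers reduce to a per-node index into the
scheduler instance. A sketch matching the hunks below, still assuming scx_root:

  static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
  {
          struct scx_sched *sch = scx_root;

          return sch->global_dsqs[cpu_to_node(task_cpu(p))];
  }

  static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
  {
          /* each CPU consumes from its own node's global DSQ */
          return consume_dispatch_q(rq, sch->global_dsqs[cpu_to_node(cpu_of(rq))]);
  }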

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 95 ++++++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 46 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a086b024301f..5beb2dd868ad 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -767,7 +767,17 @@ struct scx_sched {
 	struct sched_ext_ops	ops;
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
+	/*
+	 * Dispatch queues.
+	 *
+	 * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+	 * This is to avoid live-locking in bypass mode where all tasks are
+	 * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+	 * per-node split isn't sufficient, it can be further split.
+	 */
 	struct rhashtable	dsq_hash;
+	struct scx_dispatch_q	**global_dsqs;
+
 	bool			warned_zero_slice;
 
 	atomic_t		exit_kind;
@@ -998,16 +1008,6 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
  */
 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
 
-/*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
- * to avoid live-locking in bypass mode where all tasks are dispatched to
- * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
- * sufficient, it can be further split.
- */
-static struct scx_dispatch_q **global_dsqs;
-
 static const struct rhashtable_params dsq_hash_params = {
 	.key_len		= sizeof_field(struct scx_dispatch_q, id),
 	.key_offset		= offsetof(struct scx_dispatch_q, id),
@@ -1108,7 +1108,9 @@ static bool u32_before(u32 a, u32 b)
 
 static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
 {
-	return global_dsqs[cpu_to_node(task_cpu(p))];
+	struct scx_sched *sch = scx_root;
+
+	return sch->global_dsqs[cpu_to_node(task_cpu(p))];
 }
 
 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
@@ -2785,11 +2787,11 @@ static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
 	return false;
 }
 
-static bool consume_global_dsq(struct rq *rq)
+static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
 {
 	int node = cpu_to_node(cpu_of(rq));
 
-	return consume_dispatch_q(rq, global_dsqs[node]);
+	return consume_dispatch_q(rq, sch->global_dsqs[node]);
 }
 
 /**
@@ -3035,7 +3037,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (rq->scx.local_dsq.nr)
 		goto has_tasks;
 
-	if (consume_global_dsq(rq))
+	if (consume_global_dsq(sch, rq))
 		goto has_tasks;
 
 	if (unlikely(!SCX_HAS_OP(scx_root, dispatch)) ||
@@ -3065,7 +3067,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		}
 		if (rq->scx.local_dsq.nr)
 			goto has_tasks;
-		if (consume_global_dsq(rq))
+		if (consume_global_dsq(sch, rq))
 			goto has_tasks;
 
 		/*
@@ -4406,6 +4408,11 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
 	struct rhashtable_iter rht_iter;
 	struct scx_dispatch_q *dsq;
+	int node;
+
+	for_each_node_state(node, N_POSSIBLE)
+		kfree(sch->global_dsqs[node]);
+	kfree(sch->global_dsqs);
 
 	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
 	do {
@@ -5244,7 +5251,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 {
 	struct scx_sched *sch;
-	int ret;
+	int node, ret;
 
 	sch = kzalloc(sizeof(*sch), GFP_KERNEL);
 	if (!sch)
@@ -5260,16 +5267,40 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 	if (ret < 0)
 		goto err_free_ei;
 
+	sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
+				   GFP_KERNEL);
+	if (!sch->global_dsqs) {
+		ret = -ENOMEM;
+		goto err_free_hash;
+	}
+
+	for_each_node_state(node, N_POSSIBLE) {
+		struct scx_dispatch_q *dsq;
+
+		dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+		if (!dsq) {
+			ret = -ENOMEM;
+			goto err_free_gdsqs;
+		}
+
+		init_dsq(dsq, SCX_DSQ_GLOBAL);
+		sch->global_dsqs[node] = dsq;
+	}
+
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 	sch->ops = *ops;
 
 	sch->kobj.kset = scx_kset;
 	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
 	if (ret < 0)
-		goto err_free_hash;
+		goto err_free_gdsqs;
 
 	return sch;
 
+err_free_gdsqs:
+	for_each_node_state(node, N_POSSIBLE)
+		kfree(sch->global_dsqs[node]);
+	kfree(sch->global_dsqs);
 err_free_hash:
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 err_free_ei:
@@ -5331,7 +5362,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	unsigned long timeout;
-	int i, cpu, node, ret;
+	int i, cpu, ret;
 
 	if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
 			   cpu_possible_mask)) {
@@ -5358,34 +5389,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		}
 	}
 
-	if (!global_dsqs) {
-		struct scx_dispatch_q **dsqs;
-
-		dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
-		if (!dsqs) {
-			ret = -ENOMEM;
-			goto err_unlock;
-		}
-
-		for_each_node_state(node, N_POSSIBLE) {
-			struct scx_dispatch_q *dsq;
-
-			dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
-			if (!dsq) {
-				for_each_node_state(node, N_POSSIBLE)
-					kfree(dsqs[node]);
-				kfree(dsqs);
-				ret = -ENOMEM;
-				goto err_unlock;
-			}
-
-			init_dsq(dsq, SCX_DSQ_GLOBAL);
-			dsqs[node] = dsq;
-		}
-
-		global_dsqs = dsqs;
-	}
-
 	if (scx_enable_state() != SCX_DISABLED) {
 		ret = -EBUSY;
 		goto err_unlock;
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 08/12] sched_ext: Relocate scx_event_stats definition
  2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
                   ` (6 preceding siblings ...)
  2025-04-23 23:44 ` [PATCH 07/12] sched_ext: Move global_dsqs " Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
  2025-04-23 23:44 ` [PATCH 09/12] sched_ext: Factor out scx_read_events() Tejun Heo
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

In preparation for moving event_stats_cpu into scx_sched, move the
scx_event_stats definition above the scx_sched definition. No functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 114 ++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5beb2dd868ad..244ccf134cee 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -763,6 +763,63 @@ enum scx_opi {
 	SCX_OPI_END			= SCX_OP_IDX(init),
 };
 
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+	/*
+	 * If ops.select_cpu() returns a CPU which can't be used by the task,
+	 * the core scheduler code silently picks a fallback CPU.
+	 */
+	s64		SCX_EV_SELECT_CPU_FALLBACK;
+
+	/*
+	 * When dispatching to a local DSQ, the CPU may have gone offline in
+	 * the meantime. In this case, the task is bounced to the global DSQ.
+	 */
+	s64		SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+	/*
+	 * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+	 * continued to run because there were no other tasks on the CPU.
+	 */
+	s64		SCX_EV_DISPATCH_KEEP_LAST;
+
+	/*
+	 * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+	 * is dispatched to a local DSQ when exiting.
+	 */
+	s64		SCX_EV_ENQ_SKIP_EXITING;
+
+	/*
+	 * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+	 * migration disabled task skips ops.enqueue() and is dispatched to its
+	 * local DSQ.
+	 */
+	s64		SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+	/*
+	 * Total number of times a task's time slice was refilled with the
+	 * default value (SCX_SLICE_DFL).
+	 */
+	s64		SCX_EV_REFILL_SLICE_DFL;
+
+	/*
+	 * The total duration of bypass modes in nanoseconds.
+	 */
+	s64		SCX_EV_BYPASS_DURATION;
+
+	/*
+	 * The number of tasks dispatched in the bypassing mode.
+	 */
+	s64		SCX_EV_BYPASS_DISPATCH;
+
+	/*
+	 * The number of times the bypassing mode has been activated.
+	 */
+	s64		SCX_EV_BYPASS_ACTIVATE;
+};
+
 struct scx_sched {
 	struct sched_ext_ops	ops;
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
@@ -1539,63 +1596,6 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
 	return p;
 }
 
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
-	/*
-	 * If ops.select_cpu() returns a CPU which can't be used by the task,
-	 * the core scheduler code silently picks a fallback CPU.
-	 */
-	s64		SCX_EV_SELECT_CPU_FALLBACK;
-
-	/*
-	 * When dispatching to a local DSQ, the CPU may have gone offline in
-	 * the meantime. In this case, the task is bounced to the global DSQ.
-	 */
-	s64		SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
-	/*
-	 * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
-	 * continued to run because there were no other tasks on the CPU.
-	 */
-	s64		SCX_EV_DISPATCH_KEEP_LAST;
-
-	/*
-	 * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
-	 * is dispatched to a local DSQ when exiting.
-	 */
-	s64		SCX_EV_ENQ_SKIP_EXITING;
-
-	/*
-	 * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
-	 * migration disabled task skips ops.enqueue() and is dispatched to its
-	 * local DSQ.
-	 */
-	s64		SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
-	/*
-	 * Total number of times a task's time slice was refilled with the
-	 * default value (SCX_SLICE_DFL).
-	 */
-	s64		SCX_EV_REFILL_SLICE_DFL;
-
-	/*
-	 * The total duration of bypass modes in nanoseconds.
-	 */
-	s64		SCX_EV_BYPASS_DURATION;
-
-	/*
-	 * The number of tasks dispatched in the bypassing mode.
-	 */
-	s64		SCX_EV_BYPASS_DISPATCH;
-
-	/*
-	 * The number of times the bypassing mode has been activated.
-	 */
-	s64		SCX_EV_BYPASS_ACTIVATE;
-};
-
 /*
  * The event counter is organized by a per-CPU variable to minimize the
  * accounting overhead without synchronization. A system-wide view on the
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 09/12] sched_ext: Factor out scx_read_events()
  2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
                   ` (7 preceding siblings ...)
  2025-04-23 23:44 ` [PATCH 08/12] sched_ext: Relocate scx_event_stats definition Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
  2025-04-23 23:44 ` [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Tejun Heo
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

In preparation for moving event_stats_cpu into scx_sched, factor
scx_read_events() out of scx_bpf_events() and update the in-kernel users -
scx_attr_events_show() and scx_dump_state() - to use scx_read_events()
instead of scx_bpf_events(). No observable behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 244ccf134cee..4d2866db8cbe 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1648,7 +1648,7 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
 } while (0)
 
 
-static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+static void scx_read_events(struct scx_event_stats *events);
 
 static enum scx_enable_state scx_enable_state(void)
 {
@@ -4455,7 +4455,7 @@ static ssize_t scx_attr_events_show(struct kobject *kobj,
 	struct scx_event_stats events;
 	int at = 0;
 
-	scx_bpf_events(&events, sizeof(events));
+	scx_read_events(&events);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -5179,7 +5179,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	dump_line(&s, "Event counters");
 	dump_line(&s, "--------------");
 
-	scx_bpf_events(&events, sizeof(events));
+	scx_read_events(&events);
 	scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7401,6 +7401,27 @@ __bpf_kfunc u64 scx_bpf_now(void)
 	return clock;
 }
 
+static void scx_read_events(struct scx_event_stats *events)
+{
+	struct scx_event_stats *e_cpu;
+	int cpu;
+
+	/* Aggregate per-CPU event counters into @events. */
+	memset(events, 0, sizeof(*events));
+	for_each_possible_cpu(cpu) {
+		e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+		scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+		scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
+		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
+		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
+		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+	}
+}
+
 /*
  * scx_bpf_events - Get a system-wide event counter to
  * @events: output buffer from a BPF program
@@ -7409,23 +7430,9 @@ __bpf_kfunc u64 scx_bpf_now(void)
 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
 				size_t events__sz)
 {
-	struct scx_event_stats e_sys, *e_cpu;
-	int cpu;
+	struct scx_event_stats e_sys;
 
-	/* Aggregate per-CPU event counters into the system-wide counters. */
-	memset(&e_sys, 0, sizeof(e_sys));
-	for_each_possible_cpu(cpu) {
-		e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_REFILL_SLICE_DFL);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
-		scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
-	}
+	scx_read_events(&e_sys);
 
 	/*
 	 * We cannot entirely trust a BPF-provided size since a BPF program
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched
  2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
                   ` (8 preceding siblings ...)
  2025-04-23 23:44 ` [PATCH 09/12] sched_ext: Factor out scx_read_events() Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
  2025-04-25  5:38   ` Changwoo Min
  2025-04-23 23:44 ` [PATCH 11/12] sched_ext: Move disable machinery " Tejun Heo
  2025-04-23 23:44 ` [PATCH 12/12] sched_ext: Clean up SCX_EXIT_NONE handling in scx_disable_workfn() Tejun Heo
  11 siblings, 1 reply; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

The event counters are going to become per scheduler instance. Move
event_stats_cpu into scx_sched.

- [__]scx_add_event() are updated to take @sch. While at it, add missing
  parentheses around @cnt expansions.

- scx_read_events() is updated to take @sch.

- scx_bpf_events() accesses scx_root under RCU read lock.
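
For illustration, the per-instance counters follow the usual percpu pattern.
A simplified sketch with allocation error handling and the trace hook omitted:

  /* in scx_alloc_and_add_sched(): one set of counters per instance */
  sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);

  /* hot path, preemption disabled */
  __this_cpu_add(sch->event_stats_cpu->SCX_EV_DISPATCH_KEEP_LAST, 1);

  /* system-wide view: sum the per-CPU counters */
  memset(events, 0, sizeof(*events));
  for_each_possible_cpu(cpu) {
          struct scx_event_stats *e = per_cpu_ptr(sch->event_stats_cpu, cpu);

          events->SCX_EV_DISPATCH_KEEP_LAST += e->SCX_EV_DISPATCH_KEEP_LAST;
  }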

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 100 ++++++++++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 42 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4d2866db8cbe..75c91b58430c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -835,6 +835,13 @@ struct scx_sched {
 	struct rhashtable	dsq_hash;
 	struct scx_dispatch_q	**global_dsqs;
 
+	/*
+	 * The event counters are in a per-CPU variable to minimize the
+	 * accounting overhead. A system-wide view on the event counter is
+	 * constructed when requested by scx_bpf_get_event_stat().
+	 */
+	struct scx_event_stats __percpu *event_stats_cpu;
+
 	bool			warned_zero_slice;
 
 	atomic_t		exit_kind;
@@ -1596,34 +1603,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
 	return p;
 }
 
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
 /**
  * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
  * @name: an event name defined in struct scx_event_stats
  * @cnt: the number of the event occured
  *
  * This can be used when preemption is not disabled.
  */
-#define scx_add_event(name, cnt) do {						\
-	this_cpu_add(event_stats_cpu.name, cnt);				\
-	trace_sched_ext_event(#name, cnt);					\
+#define scx_add_event(sch, name, cnt) do {					\
+	this_cpu_add((sch)->event_stats_cpu->name, (cnt));			\
+	trace_sched_ext_event(#name, (cnt));					\
 } while(0)
 
 /**
  * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
  * @name: an event name defined in struct scx_event_stats
  * @cnt: the number of the event occured
  *
  * This should be used only when preemption is disabled.
  */
-#define __scx_add_event(name, cnt) do {						\
-	__this_cpu_add(event_stats_cpu.name, cnt);				\
+#define __scx_add_event(sch, name, cnt) do {					\
+	__this_cpu_add((sch)->event_stats_cpu->name, (cnt));			\
 	trace_sched_ext_event(#name, cnt);					\
 } while(0)
 
@@ -1648,7 +1650,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
 } while (0)
 
 
-static void scx_read_events(struct scx_event_stats *events);
+static void scx_read_events(struct scx_sched *sch,
+			    struct scx_event_stats *events);
 
 static enum scx_enable_state scx_enable_state(void)
 {
@@ -1874,7 +1877,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
 static void refill_task_slice_dfl(struct task_struct *p)
 {
 	p->scx.slice = SCX_SLICE_DFL;
-	__scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1);
+	__scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
 }
 
 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -2203,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 		goto local;
 
 	if (scx_rq_bypassing(rq)) {
-		__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
 		goto global;
 	}
 
@@ -2213,14 +2216,14 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	/* see %SCX_OPS_ENQ_EXITING */
 	if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
 	    unlikely(p->flags & PF_EXITING)) {
-		__scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+		__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
 		goto local;
 	}
 
 	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
 	if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
 	    is_migration_disabled(p)) {
-		__scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+		__scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
 		goto local;
 	}
 
@@ -2343,7 +2346,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 
 	if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
 	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
-		__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+		__scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1);
 }
 
 static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
@@ -2571,7 +2574,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 
 	if (!scx_rq_online(rq)) {
 		if (enforce)
-			__scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+			__scx_add_event(scx_root,
+					SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
 		return false;
 	}
 
@@ -3093,7 +3097,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (prev_on_rq &&
 	    (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
-		__scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
 		goto has_tasks;
 	}
 	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3424,6 +3428,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
 {
+	struct scx_sched *sch = scx_root;
 	bool rq_bypass;
 
 	/*
@@ -3440,7 +3445,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		return prev_cpu;
 
 	rq_bypass = scx_rq_bypassing(task_rq(p));
-	if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
+	if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
 		s32 cpu;
 		struct task_struct **ddsp_taskp;
 
@@ -3469,7 +3474,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		p->scx.selected_cpu = cpu;
 
 		if (rq_bypass)
-			__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+			__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
 		return cpu;
 	}
 }
@@ -4410,6 +4415,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_dispatch_q *dsq;
 	int node;
 
+	free_percpu(sch->event_stats_cpu);
+
 	for_each_node_state(node, N_POSSIBLE)
 		kfree(sch->global_dsqs[node]);
 	kfree(sch->global_dsqs);
@@ -4452,10 +4459,11 @@ SCX_ATTR(ops);
 static ssize_t scx_attr_events_show(struct kobject *kobj,
 				    struct kobj_attribute *ka, char *buf)
 {
+	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
 	struct scx_event_stats events;
 	int at = 0;
 
-	scx_read_events(&events);
+	scx_read_events(sch, &events);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -4588,25 +4596,29 @@ static void scx_bypass(bool bypass)
 {
 	static DEFINE_RAW_SPINLOCK(bypass_lock);
 	static unsigned long bypass_timestamp;
-
-	int cpu;
+	struct scx_sched *sch;
 	unsigned long flags;
+	int cpu;
 
 	raw_spin_lock_irqsave(&bypass_lock, flags);
+	sch = rcu_dereference_bh(scx_root);
+
 	if (bypass) {
 		scx_bypass_depth++;
 		WARN_ON_ONCE(scx_bypass_depth <= 0);
 		if (scx_bypass_depth != 1)
 			goto unlock;
 		bypass_timestamp = ktime_get_ns();
-		scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+		if (sch)
+			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
 	} else {
 		scx_bypass_depth--;
 		WARN_ON_ONCE(scx_bypass_depth < 0);
 		if (scx_bypass_depth != 0)
 			goto unlock;
-		scx_add_event(SCX_EV_BYPASS_DURATION,
-			      ktime_get_ns() - bypass_timestamp);
+		if (sch)
+			scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+				      ktime_get_ns() - bypass_timestamp);
 	}
 
 	atomic_inc(&scx_breather_depth);
@@ -5179,7 +5191,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	dump_line(&s, "Event counters");
 	dump_line(&s, "--------------");
 
-	scx_read_events(&events);
+	scx_read_events(scx_root, &events);
 	scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -5287,16 +5299,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 		sch->global_dsqs[node] = dsq;
 	}
 
+	sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+	if (!sch->event_stats_cpu)
+		goto err_free_gdsqs;
+
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 	sch->ops = *ops;
 
 	sch->kobj.kset = scx_kset;
 	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
 	if (ret < 0)
-		goto err_free_gdsqs;
+		goto err_event_stats;
 
 	return sch;
 
+err_event_stats:
+	free_percpu(sch->event_stats_cpu);
 err_free_gdsqs:
 	for_each_node_state(node, N_POSSIBLE)
 		kfree(sch->global_dsqs[node]);
@@ -5372,15 +5390,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	mutex_lock(&scx_enable_mutex);
 
-	/*
-	 * Clear event counters so a new scx scheduler gets
-	 * fresh event counter values.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
-		memset(e, 0, sizeof(*e));
-	}
-
 	if (!scx_helper) {
 		WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
 		if (!scx_helper) {
@@ -7401,7 +7410,7 @@ __bpf_kfunc u64 scx_bpf_now(void)
 	return clock;
 }
 
-static void scx_read_events(struct scx_event_stats *events)
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
 {
 	struct scx_event_stats *e_cpu;
 	int cpu;
@@ -7409,7 +7418,7 @@ static void scx_read_events(struct scx_event_stats *events)
 	/* Aggregate per-CPU event counters into @events. */
 	memset(events, 0, sizeof(*events));
 	for_each_possible_cpu(cpu) {
-		e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+		e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
 		scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
 		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
 		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7430,9 +7439,16 @@ static void scx_read_events(struct scx_event_stats *events)
 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
 				size_t events__sz)
 {
+	struct scx_sched *sch;
 	struct scx_event_stats e_sys;
 
-	scx_read_events(&e_sys);
+	rcu_read_lock();
+	sch = rcu_dereference(scx_root);
+	if (sch)
+		scx_read_events(sch, &e_sys);
+	else
+		memset(&e_sys, 0, sizeof(e_sys));
+	rcu_read_unlock();
 
 	/*
 	 * We cannot entirely trust a BPF-provided size since a BPF program
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 11/12] sched_ext: Move disable machinery into scx_sched
  2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
                   ` (9 preceding siblings ...)
  2025-04-23 23:44 ` [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
  2025-04-23 23:44 ` [PATCH 12/12] sched_ext: Clean up SCX_EXIT_NONE handling in scx_disable_workfn() Tejun Heo
  11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

Because disabling can be triggered from any place and the scheduler cannot be
trusted, the disable path uses an irq_work to bounce execution and a
kthread_work, executed on an RT helper kthread, to perform the actual
disabling. These must be per scheduler instance to guarantee forward
progress. Move them into scx_sched.

- If an scx_sched is accessible, its helper kthread is always valid, making
  the `helper` check in schedule_scx_disable_work() unnecessary. As the
  function becomes trivial after the removal of that test, inline it.

- scx_create_rt_helper() has only one user - creation of the disable helper
  kthread. Inline it into scx_alloc_and_add_sched().
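
For illustration, the per-instance bounce path comes together as follows. A
sketch assembled from the hunks below, with dump handling trimmed:

  /* in scx_alloc_and_add_sched(): one RT helper and one work item per instance */
  sch->helper = kthread_run_worker(0, "sched_ext_helper");
  sched_set_fifo(sch->helper->task);
  init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
  kthread_init_work(&sch->disable_work, scx_disable_workfn);

  /* error path: bounce from any context through the irq_work ... */
  irq_work_queue(&scx_root->error_irq_work);

  /* ... which hands off to the RT helper kthread to do the disabling */
  static void scx_error_irq_workfn(struct irq_work *irq_work)
  {
          struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);

          kthread_queue_work(sch->helper, &sch->disable_work);
  }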

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 73 ++++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 48 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 75c91b58430c..d27193010b6a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -848,6 +848,10 @@ struct scx_sched {
 	struct scx_exit_info	*exit_info;
 
 	struct kobject		kobj;
+
+	struct kthread_worker	*helper;
+	struct irq_work		error_irq_work;
+	struct kthread_work	disable_work;
 	struct rcu_work		rcu_work;
 };
 
@@ -1021,7 +1025,6 @@ static DEFINE_SPINLOCK(scx_tasks_lock);
 static LIST_HEAD(scx_tasks);
 
 /* ops enable/disable */
-static struct kthread_worker *scx_helper;
 static DEFINE_MUTEX(scx_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
@@ -4415,6 +4418,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_dispatch_q *dsq;
 	int node;
 
+	kthread_stop(sch->helper->task);
 	free_percpu(sch->event_stats_cpu);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -4737,7 +4741,7 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 
 static void scx_disable_workfn(struct kthread_work *work)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
 	struct scx_exit_info *ei = sch->exit_info;
 	struct scx_task_iter sti;
 	struct task_struct *p;
@@ -4883,20 +4887,6 @@ static void scx_disable_workfn(struct kthread_work *work)
 	scx_bypass(false);
 }
 
-static DEFINE_KTHREAD_WORK(scx_disable_work, scx_disable_workfn);
-
-static void schedule_scx_disable_work(void)
-{
-	struct kthread_worker *helper = READ_ONCE(scx_helper);
-
-	/*
-	 * We may be called spuriously before the first bpf_sched_ext_reg(). If
-	 * scx_helper isn't set up yet, there's nothing to do.
-	 */
-	if (helper)
-		kthread_queue_work(helper, &scx_disable_work);
-}
-
 static void scx_disable(enum scx_exit_kind kind)
 {
 	int none = SCX_EXIT_NONE;
@@ -4909,7 +4899,7 @@ static void scx_disable(enum scx_exit_kind kind)
 	sch = rcu_dereference(scx_root);
 	if (sch) {
 		atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
-		schedule_scx_disable_work();
+		kthread_queue_work(sch->helper, &sch->disable_work);
 	}
 	rcu_read_unlock();
 }
@@ -5211,16 +5201,15 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 
 static void scx_error_irq_workfn(struct irq_work *irq_work)
 {
-	struct scx_exit_info *ei = scx_root->exit_info;
+	struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+	struct scx_exit_info *ei = sch->exit_info;
 
 	if (ei->kind >= SCX_EXIT_ERROR)
-		scx_dump_state(ei, scx_root->ops.exit_dump_len);
+		scx_dump_state(ei, sch->ops.exit_dump_len);
 
-	schedule_scx_disable_work();
+	kthread_queue_work(sch->helper, &sch->disable_work);
 }
 
-static DEFINE_IRQ_WORK(scx_error_irq_work, scx_error_irq_workfn);
-
 static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
 				      const char *fmt, ...)
 {
@@ -5247,17 +5236,7 @@ static __printf(3, 4) void __scx_exit(enum scx_exit_kind kind, s64 exit_code,
 	ei->kind = kind;
 	ei->reason = scx_exit_reason(ei->kind);
 
-	irq_work_queue(&scx_error_irq_work);
-}
-
-static struct kthread_worker *scx_create_rt_helper(const char *name)
-{
-	struct kthread_worker *helper;
-
-	helper = kthread_run_worker(0, name);
-	if (helper)
-		sched_set_fifo(helper->task);
-	return helper;
+	irq_work_queue(&scx_root->error_irq_work);
 }
 
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
@@ -5303,16 +5282,25 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 	if (!sch->event_stats_cpu)
 		goto err_free_gdsqs;
 
+	sch->helper = kthread_run_worker(0, "sched_ext_helper");
+	if (!sch->helper)
+		goto err_event_stats;
+	sched_set_fifo(sch->helper->task);
+
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+	kthread_init_work(&sch->disable_work, scx_disable_workfn);
 	sch->ops = *ops;
 
 	sch->kobj.kset = scx_kset;
 	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
 	if (ret < 0)
-		goto err_event_stats;
+		goto err_stop_helper;
 
 	return sch;
 
+err_stop_helper:
+	kthread_stop(sch->helper->task);
 err_event_stats:
 	free_percpu(sch->event_stats_cpu);
 err_free_gdsqs:
@@ -5390,14 +5378,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	mutex_lock(&scx_enable_mutex);
 
-	if (!scx_helper) {
-		WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
-		if (!scx_helper) {
-			ret = -ENOMEM;
-			goto err_unlock;
-		}
-	}
-
 	if (scx_enable_state() != SCX_DISABLED) {
 		ret = -EBUSY;
 		goto err_unlock;
@@ -5626,7 +5606,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 * completion.
 	 */
 	scx_error("scx_enable() failed (%d)", ret);
-	kthread_flush_work(&scx_disable_work);
+	kthread_flush_work(&sch->disable_work);
 	kobject_put(&sch->kobj);
 	return 0;
 }
@@ -5779,7 +5759,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 	struct scx_sched *sch = scx_root;
 
 	scx_disable(SCX_EXIT_UNREG);
-	kthread_flush_work(&scx_disable_work);
+	kthread_flush_work(&sch->disable_work);
 	kobject_put(&sch->kobj);
 }
 
@@ -5902,10 +5882,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
 
 static void sysrq_handle_sched_ext_reset(u8 key)
 {
-	if (scx_helper)
-		scx_disable(SCX_EXIT_SYSRQ);
-	else
-		pr_info("sched_ext: BPF scheduler not yet used\n");
+	scx_disable(SCX_EXIT_SYSRQ);
 }
 
 static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 12/12] sched_ext: Clean up SCX_EXIT_NONE handling in scx_disable_workfn()
  2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
                   ` (10 preceding siblings ...)
  2025-04-23 23:44 ` [PATCH 11/12] sched_ext: Move disable machinery " Tejun Heo
@ 2025-04-23 23:44 ` Tejun Heo
  11 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-23 23:44 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, linux-kernel; +Cc: Tejun Heo

With the global states and disable machinery moved into scx_sched,
scx_disable_workfn() can only be scheduled and run for the specific
scheduler instance. This makes it impossible for scx_disable_workfn() to see
SCX_EXIT_NONE. Turn that condition into WARN_ON_ONCE().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d27193010b6a..d963aa5c99e1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4749,13 +4749,9 @@ static void scx_disable_workfn(struct kthread_work *work)
 
 	kind = atomic_read(&sch->exit_kind);
 	while (true) {
-		/*
-		 * NONE indicates that a new scx_ops has been registered since
-		 * disable was scheduled - don't kill the new ops. DONE
-		 * indicates that the ops has already been disabled.
-		 */
-		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+		if (kind == SCX_EXIT_DONE)	/* already disabled? */
 			return;
+		WARN_ON_ONCE(kind == SCX_EXIT_NONE);
 		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
 			break;
 	}
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP()
  2025-04-23 23:44 ` [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP() Tejun Heo
@ 2025-04-24  7:23   ` Chengming Zhou
  2025-04-24 18:55     ` Tejun Heo
  0 siblings, 1 reply; 19+ messages in thread
From: Chengming Zhou @ 2025-04-24  7:23 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min, linux-kernel

On 2025/4/24 07:44, Tejun Heo wrote:
> SCX_HAS_OP() tests scx_root->has_op bitmap. The bitmap is currently in a
> statically allocated struct scx_sched and initialized while loading the BPF
> scheduler and cleared while unloading, and thus can be tested anytime.
> However, scx_root will be switched to dynamic allocation and thus won't
> always be deferenceable.
> 
> Most usages of SCX_HAS_OP() are already protected by scx_enabled() either
> directly or indirectly (e.g. through a task which is on SCX). However, there
> are a couple places that could try to dereference NULL scx_root. Update them
> so that scx_root is guaranteed to be valid before SCX_HAS_OP() is called.
> 
> - In handle_hotplug(), test whether scx_root is NULL before doing anything
>    else. This is safe because scx_root updates will be protected by
>    cpus_read_lock().
> 
> - In scx_tg_offline(), test scx_cgroup_enabled before invoking SCX_HAS_OP(),
>    which should guarantee that scx_root won't turn NULL. This is also in line
>    with other cgroup operations. As the code path is synchronized against
>    scx_cgroup_init/exit() through scx_cgroup_rwsem, this shouldn't cause any
>    behavior differences.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>   kernel/sched/ext.c | 11 ++++++++++-
>   1 file changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 975f6963a01b..ad392890d2dd 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -3498,6 +3498,14 @@ static void handle_hotplug(struct rq *rq, bool online)
>   
>   	atomic_long_inc(&scx_hotplug_seq);
>   
> +	/*
> +	 * scx_root updates are protected by cpus_read_lock() and will stay
> +	 * stable here. Note that we can't depend on scx_enabled() test as the
> +	 * hotplug ops need to be enabled before __scx_enabled is set.
> +	 */
> +	if (!scx_root)
> +		return;
> +
>   	if (scx_enabled())
>   		scx_idle_update_selcpu_topology(&scx_root->ops);

Just curious, do the comments added above mean we shouldn't
check scx_enabled() here anymore?

Thanks!

>   
> @@ -3994,7 +4002,8 @@ void scx_tg_offline(struct task_group *tg)
>   
>   	percpu_down_read(&scx_cgroup_rwsem);
>   
> -	if (SCX_HAS_OP(scx_root, cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
> +	if (scx_cgroup_enabled && SCX_HAS_OP(scx_root, cgroup_exit) &&
> +	    (tg->scx_flags & SCX_TG_INITED))
>   		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
>   	tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
>   

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP()
  2025-04-24  7:23   ` Chengming Zhou
@ 2025-04-24 18:55     ` Tejun Heo
  0 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-24 18:55 UTC (permalink / raw)
  To: Chengming Zhou; +Cc: David Vernet, Andrea Righi, Changwoo Min, linux-kernel

Hello,

On Thu, Apr 24, 2025 at 03:23:40PM +0800, Chengming Zhou wrote:
> > +	/*
> > +	 * scx_root updates are protected by cpus_read_lock() and will stay
> > +	 * stable here. Note that we can't depend on scx_enabled() test as the
> > +	 * hotplug ops need to be enabled before __scx_enabled is set.
> > +	 */
> > +	if (!scx_root)
> > +		return;
> > +
> >   	if (scx_enabled())
> >   		scx_idle_update_selcpu_topology(&scx_root->ops);
> 
> Just be curious, does the comments added above mean we shouldn't
> check scx_enabled() here anymore?

There are two things happening in the function - calling the idle topology
update function and calling ops.cpu_on/offline(). The former needs both
scx_root and scx_enabled(). The latter needs only scx_root, so we still need
the additional scx_enabled() check for the former.
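
Put as a sketch (structure assumed from the patch above, unrelated parts
elided):

  static void handle_hotplug(struct rq *rq, bool online)
  {
          ...
          if (!scx_root)          /* no scheduler instance at all */
                  return;

          if (scx_enabled())      /* only the idle topology update needs this */
                  scx_idle_update_selcpu_topology(&scx_root->ops);

          /* the ops.cpu_online()/cpu_offline() invocations only need scx_root */
          ...
  }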

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched
  2025-04-23 23:44 ` [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Tejun Heo
@ 2025-04-25  5:38   ` Changwoo Min
  0 siblings, 0 replies; 19+ messages in thread
From: Changwoo Min @ 2025-04-25  5:38 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, linux-kernel

Hi Tejun,

On 4/24/25 08:44, Tejun Heo wrote:
> The event counters are going to become per scheduler instance. Move
> event_stats_cpu into scx_sched.
> 
> - [__]scx_add_event() are updated to take @sch. While at it, add missing
>    parentheses around @cnt expansions.
> 
> - scx_read_events() is updated to take @sch.
> 
> - scx_bpf_events() accesses scx_root under RCU read lock.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>   kernel/sched/ext.c | 100 ++++++++++++++++++++++++++-------------------
>   1 file changed, 58 insertions(+), 42 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 4d2866db8cbe..75c91b58430c 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -835,6 +835,13 @@ struct scx_sched {
>   	struct rhashtable	dsq_hash;
>   	struct scx_dispatch_q	**global_dsqs;
>   
> +	/*
> +	 * The event counters are in a per-CPU variable to minimize the
> +	 * accounting overhead. A system-wide view on the event counter is
> +	 * constructed when requested by scx_bpf_get_event_stat().

scx_bpf_get_event_stat() should be replaced with scx_bpf_events(). I
think the old function name, scx_bpf_get_event_stat(), crept into the
original code by mistake.

Other than that, the changes look straightforward and good to me.

Acked-by: Changwoo Min <changwoo@igalia.com>

Regards,
Changwoo Min


> +	 */
> +	struct scx_event_stats __percpu *event_stats_cpu;
> +
>   	bool			warned_zero_slice;
>   
>   	atomic_t		exit_kind;
> @@ -1596,34 +1603,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
>   	return p;
>   }
>   
> -/*
> - * The event counter is organized by a per-CPU variable to minimize the
> - * accounting overhead without synchronization. A system-wide view on the
> - * event counter is constructed when requested by scx_bpf_get_event_stat().
> - */
> -static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
> -
>   /**
>    * scx_add_event - Increase an event counter for 'name' by 'cnt'
> + * @sch: scx_sched to account events for
>    * @name: an event name defined in struct scx_event_stats
>    * @cnt: the number of the event occured
>    *
>    * This can be used when preemption is not disabled.
>    */
> -#define scx_add_event(name, cnt) do {						\
> -	this_cpu_add(event_stats_cpu.name, cnt);				\
> -	trace_sched_ext_event(#name, cnt);					\
> +#define scx_add_event(sch, name, cnt) do {					\
> +	this_cpu_add((sch)->event_stats_cpu->name, (cnt));			\
> +	trace_sched_ext_event(#name, (cnt));					\
>   } while(0)
>   
>   /**
>    * __scx_add_event - Increase an event counter for 'name' by 'cnt'
> + * @sch: scx_sched to account events for
>    * @name: an event name defined in struct scx_event_stats
>    * @cnt: the number of the event occured
>    *
>    * This should be used only when preemption is disabled.
>    */
> -#define __scx_add_event(name, cnt) do {						\
> -	__this_cpu_add(event_stats_cpu.name, cnt);				\
> +#define __scx_add_event(sch, name, cnt) do {					\
> +	__this_cpu_add((sch)->event_stats_cpu->name, (cnt));			\
>   	trace_sched_ext_event(#name, cnt);					\
>   } while(0)
>   
> @@ -1648,7 +1650,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
>   } while (0)
>   
>   
> -static void scx_read_events(struct scx_event_stats *events);
> +static void scx_read_events(struct scx_sched *sch,
> +			    struct scx_event_stats *events);
>   
>   static enum scx_enable_state scx_enable_state(void)
>   {
> @@ -1874,7 +1877,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
>   static void refill_task_slice_dfl(struct task_struct *p)
>   {
>   	p->scx.slice = SCX_SLICE_DFL;
> -	__scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1);
> +	__scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
>   }
>   
>   static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
> @@ -2203,7 +2206,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>   		goto local;
>   
>   	if (scx_rq_bypassing(rq)) {
> -		__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
> +		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
>   		goto global;
>   	}
>   
> @@ -2213,14 +2216,14 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
>   	/* see %SCX_OPS_ENQ_EXITING */
>   	if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
>   	    unlikely(p->flags & PF_EXITING)) {
> -		__scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
> +		__scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
>   		goto local;
>   	}
>   
>   	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
>   	if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
>   	    is_migration_disabled(p)) {
> -		__scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
> +		__scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
>   		goto local;
>   	}
>   
> @@ -2343,7 +2346,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>   
>   	if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
>   	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
> -		__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
> +		__scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1);
>   }
>   
>   static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
> @@ -2571,7 +2574,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
>   
>   	if (!scx_rq_online(rq)) {
>   		if (enforce)
> -			__scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
> +			__scx_add_event(scx_root,
> +					SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
>   		return false;
>   	}
>   
> @@ -3093,7 +3097,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
>   	if (prev_on_rq &&
>   	    (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
>   		rq->scx.flags |= SCX_RQ_BAL_KEEP;
> -		__scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
> +		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
>   		goto has_tasks;
>   	}
>   	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
> @@ -3424,6 +3428,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
>   
>   static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
>   {
> +	struct scx_sched *sch = scx_root;
>   	bool rq_bypass;
>   
>   	/*
> @@ -3440,7 +3445,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
>   		return prev_cpu;
>   
>   	rq_bypass = scx_rq_bypassing(task_rq(p));
> -	if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
> +	if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
>   		s32 cpu;
>   		struct task_struct **ddsp_taskp;
>   
> @@ -3469,7 +3474,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
>   		p->scx.selected_cpu = cpu;
>   
>   		if (rq_bypass)
> -			__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
> +			__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
>   		return cpu;
>   	}
>   }
> @@ -4410,6 +4415,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
>   	struct scx_dispatch_q *dsq;
>   	int node;
>   
> +	free_percpu(sch->event_stats_cpu);
> +
>   	for_each_node_state(node, N_POSSIBLE)
>   		kfree(sch->global_dsqs[node]);
>   	kfree(sch->global_dsqs);
> @@ -4452,10 +4459,11 @@ SCX_ATTR(ops);
>   static ssize_t scx_attr_events_show(struct kobject *kobj,
>   				    struct kobj_attribute *ka, char *buf)
>   {
> +	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
>   	struct scx_event_stats events;
>   	int at = 0;
>   
> -	scx_read_events(&events);
> +	scx_read_events(sch, &events);
>   	at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
>   	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
>   	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
> @@ -4588,25 +4596,29 @@ static void scx_bypass(bool bypass)
>   {
>   	static DEFINE_RAW_SPINLOCK(bypass_lock);
>   	static unsigned long bypass_timestamp;
> -
> -	int cpu;
> +	struct scx_sched *sch;
>   	unsigned long flags;
> +	int cpu;
>   
>   	raw_spin_lock_irqsave(&bypass_lock, flags);
> +	sch = rcu_dereference_bh(scx_root);
> +
>   	if (bypass) {
>   		scx_bypass_depth++;
>   		WARN_ON_ONCE(scx_bypass_depth <= 0);
>   		if (scx_bypass_depth != 1)
>   			goto unlock;
>   		bypass_timestamp = ktime_get_ns();
> -		scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
> +		if (sch)
> +			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
>   	} else {
>   		scx_bypass_depth--;
>   		WARN_ON_ONCE(scx_bypass_depth < 0);
>   		if (scx_bypass_depth != 0)
>   			goto unlock;
> -		scx_add_event(SCX_EV_BYPASS_DURATION,
> -			      ktime_get_ns() - bypass_timestamp);
> +		if (sch)
> +			scx_add_event(sch, SCX_EV_BYPASS_DURATION,
> +				      ktime_get_ns() - bypass_timestamp);
>   	}
>   
>   	atomic_inc(&scx_breather_depth);
> @@ -5179,7 +5191,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
>   	dump_line(&s, "Event counters");
>   	dump_line(&s, "--------------");
>   
> -	scx_read_events(&events);
> +	scx_read_events(scx_root, &events);
>   	scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
>   	scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
>   	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
> @@ -5287,16 +5299,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
>   		sch->global_dsqs[node] = dsq;
>   	}
>   
> +	sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
> +	if (!sch->event_stats_cpu)
> +		goto err_free_gdsqs;
> +
>   	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
>   	sch->ops = *ops;
>   
>   	sch->kobj.kset = scx_kset;
>   	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
>   	if (ret < 0)
> -		goto err_free_gdsqs;
> +		goto err_event_stats;
>   
>   	return sch;
>   
> +err_event_stats:
> +	free_percpu(sch->event_stats_cpu);
>   err_free_gdsqs:
>   	for_each_node_state(node, N_POSSIBLE)
>   		kfree(sch->global_dsqs[node]);
> @@ -5372,15 +5390,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>   
>   	mutex_lock(&scx_enable_mutex);
>   
> -	/*
> -	 * Clear event counters so a new scx scheduler gets
> -	 * fresh event counter values.
> -	 */
> -	for_each_possible_cpu(cpu) {
> -		struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
> -		memset(e, 0, sizeof(*e));
> -	}
> -
>   	if (!scx_helper) {
>   		WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
>   		if (!scx_helper) {
> @@ -7401,7 +7410,7 @@ __bpf_kfunc u64 scx_bpf_now(void)
>   	return clock;
>   }
>   
> -static void scx_read_events(struct scx_event_stats *events)
> +static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
>   {
>   	struct scx_event_stats *e_cpu;
>   	int cpu;
> @@ -7409,7 +7418,7 @@ static void scx_read_events(struct scx_event_stats *events)
>   	/* Aggregate per-CPU event counters into @events. */
>   	memset(events, 0, sizeof(*events));
>   	for_each_possible_cpu(cpu) {
> -		e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
> +		e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
>   		scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
>   		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
>   		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
> @@ -7430,9 +7439,16 @@ static void scx_read_events(struct scx_event_stats *events)
>   __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
>   				size_t events__sz)
>   {
> +	struct scx_sched *sch;
>   	struct scx_event_stats e_sys;
>   
> -	scx_read_events(&e_sys);
> +	rcu_read_lock();
> +	sch = rcu_dereference(scx_root);
> +	if (sch)
> +		scx_read_events(sch, &e_sys);
> +	else
> +		memset(&e_sys, 0, sizeof(e_sys));
> +	rcu_read_unlock();
>   
>   	/*
>   	 * We cannot entirely trust a BPF-provided size since a BPF program


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
  2025-04-23 23:44 ` [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched Tejun Heo
@ 2025-04-25 10:14   ` Andrea Righi
  2025-04-25 19:48     ` Tejun Heo
  0 siblings, 1 reply; 19+ messages in thread
From: Andrea Righi @ 2025-04-25 10:14 UTC (permalink / raw)
  To: Tejun Heo; +Cc: David Vernet, Changwoo Min, linux-kernel

Hi Tejun,

On Wed, Apr 23, 2025 at 01:44:41PM -1000, Tejun Heo wrote:
> To prepare for supporting multiple schedulers, make scx_sched allocated
> dynamically. scx_sched->kobj is now an embedded field and the kobj's
> lifetime determines the lifetime of the containing scx_sched.
> 
> - Enable path is updated so that kobj init and addition are performed later.
> 
> - scx_sched freeing is initiated in scx_kobj_release() and also goes through
>   an rcu_work so that scx_root can be accessed from an unsynchronized path -
>   scx_disable().
> 
> No behavior changes intended.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  kernel/sched/ext.c | 151 ++++++++++++++++++++++++++-------------------
>  1 file changed, 86 insertions(+), 65 deletions(-)
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index ad392890d2dd..612232c66d13 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -772,7 +772,8 @@ struct scx_sched {
>  	atomic_t		exit_kind;
>  	struct scx_exit_info	*exit_info;
>  
> -	struct kobject		*kobj;
> +	struct kobject		kobj;
> +	struct rcu_work		rcu_work;
>  };
>  
>  enum scx_wake_flags {
> @@ -933,11 +934,7 @@ enum scx_ops_state {
>  #define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
>  #define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
>  
> -static struct scx_sched __scx_root = {
> -	.exit_kind		= ATOMIC_INIT(SCX_EXIT_DONE),
> -};
> -
> -static struct scx_sched *scx_root = &__scx_root;
> +static struct scx_sched __rcu *scx_root;
>  
>  /*
>   * During exit, a task may schedule after losing its PIDs. When disabling the
> @@ -4417,9 +4414,23 @@ static const struct attribute_group scx_global_attr_group = {
>  	.attrs = scx_global_attrs,
>  };
>  
> +static void free_exit_info(struct scx_exit_info *ei);
> +
> +static void scx_sched_free_rcu_work(struct work_struct *work)
> +{
> +	struct rcu_work *rcu_work = to_rcu_work(work);
> +	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
> +
> +	free_exit_info(sch->exit_info);
> +	kfree(sch);
> +}
> +
>  static void scx_kobj_release(struct kobject *kobj)
>  {
> -	kfree(kobj);
> +	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
> +
> +	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
> +	queue_rcu_work(system_unbound_wq, &sch->rcu_work);
>  }
>  
>  static ssize_t scx_attr_ops_show(struct kobject *kobj,
> @@ -4709,14 +4720,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
>  
>  static void scx_disable_workfn(struct kthread_work *work)
>  {
> -	struct scx_exit_info *ei = scx_root->exit_info;
> +	struct scx_sched *sch = scx_root;
> +	struct scx_exit_info *ei = sch->exit_info;
>  	struct scx_task_iter sti;
>  	struct task_struct *p;
>  	struct rhashtable_iter rht_iter;
>  	struct scx_dispatch_q *dsq;
>  	int kind, cpu;
>  
> -	kind = atomic_read(&scx_root->exit_kind);
> +	kind = atomic_read(&sch->exit_kind);
>  	while (true) {
>  		/*
>  		 * NONE indicates that a new scx_ops has been registered since
> @@ -4725,7 +4737,7 @@ static void scx_disable_workfn(struct kthread_work *work)
>  		 */
>  		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
>  			return;
> -		if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
> +		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
>  			break;
>  	}
>  	ei->kind = kind;
> @@ -4740,7 +4752,7 @@ static void scx_disable_workfn(struct kthread_work *work)
>  		break;
>  	case SCX_DISABLED:
>  		pr_warn("sched_ext: ops error detected without ops (%s)\n",
> -			scx_root->exit_info->msg);
> +			sch->exit_info->msg);
>  		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
>  		goto done;
>  	default:
> @@ -4807,41 +4819,43 @@ static void scx_disable_workfn(struct kthread_work *work)
>  
>  	/* no task is on scx, turn off all the switches and flush in-progress calls */
>  	static_branch_disable(&__scx_enabled);
> -	bitmap_zero(scx_root->has_op, SCX_OPI_END);
> +	bitmap_zero(sch->has_op, SCX_OPI_END);
>  	scx_idle_disable();
>  	synchronize_rcu();
>  
>  	if (ei->kind >= SCX_EXIT_ERROR) {
>  		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> -		       scx_root->ops.name, ei->reason);
> +		       sch->ops.name, ei->reason);
>  
>  		if (ei->msg[0] != '\0')
> -			pr_err("sched_ext: %s: %s\n",
> -			       scx_root->ops.name, ei->msg);
> +			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
>  #ifdef CONFIG_STACKTRACE
>  		stack_trace_print(ei->bt, ei->bt_len, 2);
>  #endif
>  	} else {
>  		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
> -			scx_root->ops.name, ei->reason);
> +			sch->ops.name, ei->reason);
>  	}
>  
> -	if (scx_root->ops.exit)
> +	if (sch->ops.exit)
>  		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
>  
>  	cancel_delayed_work_sync(&scx_watchdog_work);
>  
>  	/*
> -	 * Delete the kobject from the hierarchy eagerly in addition to just
> -	 * dropping a reference. Otherwise, if the object is deleted
> -	 * asynchronously, sysfs could observe an object of the same name still
> -	 * in the hierarchy when another scheduler is loaded.
> +	 * scx_root clearing must be inside cpus_read_lock(). See
> +	 * handle_hotplug().
>  	 */
> -	kobject_del(scx_root->kobj);
> -	kobject_put(scx_root->kobj);
> -	scx_root->kobj = NULL;
> +	cpus_read_lock();
> +	RCU_INIT_POINTER(scx_root, NULL);
> +	cpus_read_unlock();
>  
> -	memset(&scx_root->ops, 0, sizeof(scx_root->ops));
> +	/*
> +	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
> +	 * could observe an object of the same name still in the hierarchy when
> +	 * the next scheduler is loaded.
> +	 */
> +	kobject_del(&sch->kobj);
>  
>  	rhashtable_walk_enter(&dsq_hash, &rht_iter);
>  	do {
> @@ -4858,9 +4872,6 @@ static void scx_disable_workfn(struct kthread_work *work)
>  	scx_dsp_ctx = NULL;
>  	scx_dsp_max_batch = 0;
>  
> -	free_exit_info(scx_root->exit_info);
> -	scx_root->exit_info = NULL;
> -
>  	mutex_unlock(&scx_enable_mutex);
>  
>  	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
> @@ -4885,13 +4896,18 @@ static void schedule_scx_disable_work(void)
>  static void scx_disable(enum scx_exit_kind kind)
>  {
>  	int none = SCX_EXIT_NONE;
> +	struct scx_sched *sch;
>  
>  	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
>  		kind = SCX_EXIT_ERROR;
>  
> -	atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
> -
> -	schedule_scx_disable_work();
> +	rcu_read_lock();
> +	sch = rcu_dereference(scx_root);
> +	if (sch) {
> +		atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
> +		schedule_scx_disable_work();
> +	}
> +	rcu_read_unlock();
>  }
>  
>  static void dump_newline(struct seq_buf *s)
> @@ -5288,6 +5304,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
>  
>  static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  {
> +	struct scx_sched *sch;
>  	struct scx_task_iter sti;
>  	struct task_struct *p;
>  	unsigned long timeout;
> @@ -5351,33 +5368,32 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  		goto err_unlock;
>  	}
>  
> -	scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
> -	if (!scx_root->kobj) {
> +	sch = kzalloc(sizeof(*sch), GFP_KERNEL);
> +	if (!sch) {
>  		ret = -ENOMEM;
>  		goto err_unlock;
>  	}
>  
> -	scx_root->kobj->kset = scx_kset;
> -	ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
> -	if (ret < 0)
> -		goto err;
> -
> -	scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
> -	if (!scx_root->exit_info) {
> +	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
> +	if (!sch->exit_info) {
>  		ret = -ENOMEM;
> -		goto err_del;
> +		goto err_free;
>  	}
>  
> +	sch->kobj.kset = scx_kset;
> +	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
> +	if (ret < 0)
> +		goto err_free;
> +
> +	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
> +	sch->ops = *ops;
> +
>  	/*
> -	 * Set scx_ops, transition to ENABLING and clear exit info to arm the
> -	 * disable path. Failure triggers full disabling from here on.
> +	 * Transition to ENABLING and clear exit info to arm the disable path.
> +	 * Failure triggers full disabling from here on.
>  	 */
> -	scx_root->ops = *ops;
> -
>  	WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
> -
> -	atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
> -	scx_root->warned_zero_slice = false;
> +	WARN_ON_ONCE(scx_root);
>  
>  	atomic_long_set(&scx_nr_rejected, 0);
>  
> @@ -5390,9 +5406,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  	 */
>  	cpus_read_lock();
>  
> +	/*
> +	 * Make the scheduler instance visible. Must be inside cpus_read_lock().
> +	 * See handle_hotplug().
> +	 */
> +	rcu_assign_pointer(scx_root, sch);
> +
>  	scx_idle_enable(ops);
>  
> -	if (scx_root->ops.init) {
> +	if (sch->ops.init) {
>  		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
>  		if (ret) {
>  			ret = ops_sanitize_err("init", ret);
> @@ -5404,7 +5426,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  
>  	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
>  		if (((void (**)(void))ops)[i])
> -			set_bit(i, scx_root->has_op);
> +			set_bit(i, sch->has_op);
>  
>  	check_hotplug_seq(ops);
>  	scx_idle_update_selcpu_topology(ops);
> @@ -5445,10 +5467,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  
>  	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
>  		if (((void (**)(void))ops)[i])
> -			set_bit(i, scx_root->has_op);
> +			set_bit(i, sch->has_op);
>  
> -	if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
> -		scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
> +	if (sch->ops.cpu_acquire || sch->ops.cpu_release)
> +		sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
>  
>  	/*
>  	 * Lock out forks, cgroup on/offlining and moves before opening the
> @@ -5547,7 +5569,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  	scx_bypass(false);
>  
>  	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
> -		WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
> +		WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
>  		goto err_disable;
>  	}
>  
> @@ -5555,23 +5577,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  		static_branch_enable(&__scx_switched_all);
>  
>  	pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
> -		scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
> -	kobject_uevent(scx_root->kobj, KOBJ_ADD);
> +		sch->ops.name, scx_switched_all() ? "" : " (partial)");
> +	kobject_uevent(&sch->kobj, KOBJ_ADD);
>  	mutex_unlock(&scx_enable_mutex);
>  
>  	atomic_long_inc(&scx_enable_seq);
>  
>  	return 0;
>  
> -err_del:
> -	kobject_del(scx_root->kobj);
> -err:
> -	kobject_put(scx_root->kobj);
> -	scx_root->kobj = NULL;
> -	if (scx_root->exit_info) {
> -		free_exit_info(scx_root->exit_info);
> -		scx_root->exit_info = NULL;
> -	}
> +err_free:
> +	if (sch->exit_info)
> +		free_exit_info(sch->exit_info);
> +	kfree(sch);
>  err_unlock:
>  	mutex_unlock(&scx_enable_mutex);
>  	return ret;
> @@ -5593,6 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>  	 */
>  	scx_error("scx_enable() failed (%d)", ret);
>  	kthread_flush_work(&scx_disable_work);
> +	kobject_put(&sch->kobj);
>  	return 0;
>  }
>  
> @@ -5741,8 +5759,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
>  
>  static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
>  {
> +	struct scx_sched *sch = scx_root;
> +
>  	scx_disable(SCX_EXIT_UNREG);
>  	kthread_flush_work(&scx_disable_work);
> +	kobject_put(&sch->kobj);
>  }

We probably need to check sch != NULL here; I was able to trigger this bug
(using a buggy rustland):

[    5.048913] sched_ext: rustland: invalid CPU -16 from ops.select_cpu()
[    5.048984]    ops_cpu_valid+0x4a/0x60
[    5.049039]    select_task_rq_scx+0x10f/0x200
[    5.049100]    try_to_wake_up+0x17a/0x890
[    5.049149]    ep_autoremove_wake_function+0x12/0x40
[    5.049211]    __wake_up_common+0x7f/0xc0
[    5.049259]    __wake_up+0x36/0x60
[    5.049306]    ep_poll_callback+0x265/0x320
[    5.049354]    __wake_up_common+0x7f/0xc0
[    5.049401]    __wake_up+0x36/0x60
[    5.049448]    __send_signal_locked+0x71e/0x740
[    5.049508]    group_send_sig_info+0xf3/0x1b0
[    5.049567]    kill_pid_info_type+0x79/0x1a0
[    5.049627]    kill_proc_info+0x5d/0x110
[    5.049674]    __x64_sys_kill+0x91/0xc0
[    5.049789]    do_syscall_64+0xbb/0x1d0
[    5.049855]    entry_SYSCALL_64_after_hwframe+0x77/0x7f
[    5.050315] BUG: kernel NULL pointer dereference, address: 00000000000003b0
[    5.050386] #PF: supervisor read access in kernel mode
[    5.050439] #PF: error_code(0x0000) - not-present page
[    5.050488] PGD 0 P4D 0
[    5.050523] Oops: Oops: 0000 [#1] SMP NOPTI
[    5.050571] CPU: 5 UID: 0 PID: 284 Comm: scx_rustland Not tainted 6.14.0-virtme #27 PREEMPT(full)
[    5.050670] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
[    5.050782] RIP: 0010:kthread_flush_work+0x60/0x140
[    5.050847] Code: 80 00 00 00 31 c0 48 8d 7c 24 18 48 89 24 24 48 89 64 24 08 48 c7 44 24 10 30 e6 58 b0 f3 48 ab 48 8d 7c 24 30 e8 40 6c 05 00 <48> 8b 6b 18 48 85 ed 74 69 4c 8d 65 08 4c 89 e7 e8 0b 0c d3 00 48
[    5.051021] RSP: 0018:ffffad01816f7e08 EFLAGS: 00010246
[    5.051066] RAX: 0000000000000000 RBX: 0000000000000398 RCX: 0000000000000000
[    5.051131] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffb2f34d80
[    5.051196] RBP: ffff9b140268e800 R08: 0000000000000002 R09: 0000000000000000
[    5.051260] R10: 0000000000000001 R11: 0000000000000000 R12: ffff9b14804c1ea0
[    5.051325] R13: ffff9b1401b6cc20 R14: ffff9b1403965728 R15: 0000000000000000
[    5.051393] FS:  00007f1ff0550800(0000) GS:ffff9b14cbdb3000(0000) knlGS:0000000000000000
[    5.051463] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    5.051516] CR2: 00000000000003b0 CR3: 000000000553e002 CR4: 0000000000772ef0
[    5.051582] PKRU: 55555554
[    5.051606] Call Trace:
[    5.051634]  <TASK>
[    5.051665]  ? __pfx_kthread_flush_work_fn+0x10/0x10
[    5.051726]  bpf_scx_unreg+0x27/0x40
[    5.051773]  bpf_struct_ops_map_link_dealloc+0x36/0x50
[    5.051824]  bpf_link_release+0x18/0x20
[    5.051863]  __fput+0xf8/0x2c0
[    5.051905]  __x64_sys_close+0x3d/0x80
[    5.051943]  do_syscall_64+0xbb/0x1d0
[    5.051983]  entry_SYSCALL_64_after_hwframe+0x77/0x7f

Changing bpf_scx_unreg() as follows fixed the bug for me:

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d963aa5c99e1a..0e52a8dbd593e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5752,11 +5752,17 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch;
 
 	scx_disable(SCX_EXIT_UNREG);
-	kthread_flush_work(&sch->disable_work);
-	kobject_put(&sch->kobj);
+
+	rcu_read_lock();
+	sch = rcu_dereference(scx_root);
+	if (sch) {
+		kthread_flush_work(&sch->disable_work);
+		kobject_put(&sch->kobj);
+	}
+	rcu_read_unlock();
 }
 
 static int bpf_scx_init(struct btf *btf)

Thanks,
-Andrea

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
  2025-04-25 10:14   ` Andrea Righi
@ 2025-04-25 19:48     ` Tejun Heo
  0 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-25 19:48 UTC (permalink / raw)
  To: Andrea Righi; +Cc: David Vernet, Changwoo Min, linux-kernel

On Fri, Apr 25, 2025 at 12:14:13PM +0200, Andrea Righi wrote:
...
> >  static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
> >  {
> > +	struct scx_sched *sch = scx_root;
> > +
> >  	scx_disable(SCX_EXIT_UNREG);
> >  	kthread_flush_work(&scx_disable_work);
> > +	kobject_put(&sch->kobj);
> >  }
> 
> We probably need to check sch != NULL here; I was able to trigger this bug
> (using a buggy rustland):
> 
> [    5.048913] sched_ext: rustland: invalid CPU -16 from ops.select_cpu()
> [    5.048984]    ops_cpu_valid+0x4a/0x60
> [    5.049039]    select_task_rq_scx+0x10f/0x200
> [    5.049100]    try_to_wake_up+0x17a/0x890
> [    5.049149]    ep_autoremove_wake_function+0x12/0x40
> [    5.049211]    __wake_up_common+0x7f/0xc0
> [    5.049259]    __wake_up+0x36/0x60
> [    5.049306]    ep_poll_callback+0x265/0x320
> [    5.049354]    __wake_up_common+0x7f/0xc0
> [    5.049401]    __wake_up+0x36/0x60
> [    5.049448]    __send_signal_locked+0x71e/0x740
> [    5.049508]    group_send_sig_info+0xf3/0x1b0
> [    5.049567]    kill_pid_info_type+0x79/0x1a0
> [    5.049627]    kill_proc_info+0x5d/0x110
> [    5.049674]    __x64_sys_kill+0x91/0xc0
> [    5.049789]    do_syscall_64+0xbb/0x1d0
> [    5.049855]    entry_SYSCALL_64_after_hwframe+0x77/0x7f
> [    5.050315] BUG: kernel NULL pointer dereference, address: 00000000000003b0
> [    5.050386] #PF: supervisor read access in kernel mode
> [    5.050439] #PF: error_code(0x0000) - not-present page
> [    5.050488] PGD 0 P4D 0
> [    5.050523] Oops: Oops: 0000 [#1] SMP NOPTI
> [    5.050571] CPU: 5 UID: 0 PID: 284 Comm: scx_rustland Not tainted 6.14.0-virtme #27 PREEMPT(full)
> [    5.050670] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014
> [    5.050782] RIP: 0010:kthread_flush_work+0x60/0x140
> [    5.050847] Code: 80 00 00 00 31 c0 48 8d 7c 24 18 48 89 24 24 48 89 64 24 08 48 c7 44 24 10 30 e6 58 b0 f3 48 ab 48 8d 7c 24 30 e8 40 6c 05 00 <48> 8b 6b 18 48 85 ed 74 69 4c 8d 65 08 4c 89 e7 e8 0b 0c d3 00 48
> [    5.051021] RSP: 0018:ffffad01816f7e08 EFLAGS: 00010246
> [    5.051066] RAX: 0000000000000000 RBX: 0000000000000398 RCX: 0000000000000000
> [    5.051131] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffb2f34d80
> [    5.051196] RBP: ffff9b140268e800 R08: 0000000000000002 R09: 0000000000000000
> [    5.051260] R10: 0000000000000001 R11: 0000000000000000 R12: ffff9b14804c1ea0
> [    5.051325] R13: ffff9b1401b6cc20 R14: ffff9b1403965728 R15: 0000000000000000
> [    5.051393] FS:  00007f1ff0550800(0000) GS:ffff9b14cbdb3000(0000) knlGS:0000000000000000
> [    5.051463] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    5.051516] CR2: 00000000000003b0 CR3: 000000000553e002 CR4: 0000000000772ef0
> [    5.051582] PKRU: 55555554
> [    5.051606] Call Trace:
> [    5.051634]  <TASK>
> [    5.051665]  ? __pfx_kthread_flush_work_fn+0x10/0x10
> [    5.051726]  bpf_scx_unreg+0x27/0x40
> [    5.051773]  bpf_struct_ops_map_link_dealloc+0x36/0x50
> [    5.051824]  bpf_link_release+0x18/0x20
> [    5.051863]  __fput+0xf8/0x2c0
> [    5.051905]  __x64_sys_close+0x3d/0x80
> [    5.051943]  do_syscall_64+0xbb/0x1d0
> [    5.051983]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
> 
> Changing bpf_scx_unreg() as follows fixed the bug for me:
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index d963aa5c99e1a..0e52a8dbd593e 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -5752,11 +5752,17 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
>  
>  static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
>  {
> -	struct scx_sched *sch = scx_root;
> +	struct scx_sched *sch;
>  
>  	scx_disable(SCX_EXIT_UNREG);
> -	kthread_flush_work(&sch->disable_work);
> -	kobject_put(&sch->kobj);
> +
> +	rcu_read_lock();
> +	sch = rcu_dereference(scx_root);
> +	if (sch) {
> +		kthread_flush_work(&sch->disable_work);
> +		kobject_put(&sch->kobj);
> +	}
> +	rcu_read_unlock();
>  }

Oh I didn't expect that. As scx_root can only be written by the preceding
bpf_scx_reg(), I don't think we need rcu_read_lock() here, as subtle as that
may be. I'll update the code.
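
A minimal sketch of that reasoning (illustrative only, assuming
bpf_scx_reg()/bpf_scx_unreg() are serialized against each other; the v2
patch below instead resolves the instance through ops->priv):

static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
	/*
	 * Plain read of scx_root: it is only written by the preceding
	 * bpf_scx_reg(), so no rcu_read_lock() is needed, but a NULL
	 * check still covers the early-init-failure case.
	 */
	struct scx_sched *sch = rcu_dereference_protected(scx_root, true);

	scx_disable(SCX_EXIT_UNREG);
	if (sch) {
		kthread_flush_work(&scx_disable_work);
		kobject_put(&sch->kobj);
	}
}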

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched
  2025-04-25 21:58 [PATCHSET v2 sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
@ 2025-04-25 21:58 ` Tejun Heo
  0 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2025-04-25 21:58 UTC (permalink / raw)
  To: void, arighi, multics69; +Cc: linux-kernel, sched-ext, Tejun Heo

To prepare for supporting multiple schedulers, make scx_sched allocated
dynamically. scx_sched->kobj is now an embedded field and the kobj's
lifetime determines the lifetime of the containing scx_sched.

- Enable path is updated so that kobj init and addition are performed later.

- scx_sched freeing is initiated in scx_kobj_release() and also goes through
  an rcu_work so that scx_root can be accessed from an unsynchronized path -
  scx_disable().

- sched_ext_ops->priv is added and used to point to scx_sched instance
  created for the ops instance. This is used by bpf_scx_unreg() to determine
  the scx_sched instance to disable and put.

No behavior changes intended.

v2: Andrea reported a kernel oops due to bpf_scx_unreg() trying to deref NULL
    scx_root after scheduler init failure. sched_ext_ops->priv added so that
    bpf_scx_unreg() can always find the scx_sched instance to unregister
    even if it failed early during init.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 157 ++++++++++++++++++++++++++-------------------
 1 file changed, 91 insertions(+), 66 deletions(-)
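
For quick reference, the ops->priv hand-off condensed from the hunks
below (error handling and unrelated changes omitted):

	/* scx_enable(): allocate, remember the instance in ops->priv,
	 * then publish it as scx_root */
	sch = kzalloc(sizeof(*sch), GFP_KERNEL);
	...
	sch->ops = *ops;
	ops->priv = sch;
	...
	rcu_assign_pointer(scx_root, sch);

	/* bpf_scx_unreg(): resolve the instance through ops->priv so it
	 * is found even if scx_root was never set or is already clear */
	struct sched_ext_ops *ops = kdata;
	struct scx_sched *sch = ops->priv;

	scx_disable(SCX_EXIT_UNREG);
	kthread_flush_work(&scx_disable_work);
	kobject_put(&sch->kobj);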

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ad392890d2dd..fe6044f527c4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -752,6 +752,9 @@ struct sched_ext_ops {
 	 * BPF scheduler is enabled.
 	 */
 	char name[SCX_OPS_NAME_LEN];
+
+	/* internal use only, must be NULL */
+	void *priv;
 };
 
 enum scx_opi {
@@ -772,7 +775,8 @@ struct scx_sched {
 	atomic_t		exit_kind;
 	struct scx_exit_info	*exit_info;
 
-	struct kobject		*kobj;
+	struct kobject		kobj;
+	struct rcu_work		rcu_work;
 };
 
 enum scx_wake_flags {
@@ -933,11 +937,7 @@ enum scx_ops_state {
 #define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
 #define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
 
-static struct scx_sched __scx_root = {
-	.exit_kind		= ATOMIC_INIT(SCX_EXIT_DONE),
-};
-
-static struct scx_sched *scx_root = &__scx_root;
+static struct scx_sched __rcu *scx_root;
 
 /*
  * During exit, a task may schedule after losing its PIDs. When disabling the
@@ -4417,9 +4417,23 @@ static const struct attribute_group scx_global_attr_group = {
 	.attrs = scx_global_attrs,
 };
 
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+	struct rcu_work *rcu_work = to_rcu_work(work);
+	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+
+	free_exit_info(sch->exit_info);
+	kfree(sch);
+}
+
 static void scx_kobj_release(struct kobject *kobj)
 {
-	kfree(kobj);
+	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+	queue_rcu_work(system_unbound_wq, &sch->rcu_work);
 }
 
 static ssize_t scx_attr_ops_show(struct kobject *kobj,
@@ -4709,14 +4723,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 
 static void scx_disable_workfn(struct kthread_work *work)
 {
-	struct scx_exit_info *ei = scx_root->exit_info;
+	struct scx_sched *sch = scx_root;
+	struct scx_exit_info *ei = sch->exit_info;
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	struct rhashtable_iter rht_iter;
 	struct scx_dispatch_q *dsq;
 	int kind, cpu;
 
-	kind = atomic_read(&scx_root->exit_kind);
+	kind = atomic_read(&sch->exit_kind);
 	while (true) {
 		/*
 		 * NONE indicates that a new scx_ops has been registered since
@@ -4725,7 +4740,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 		 */
 		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
 			return;
-		if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE))
+		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
 			break;
 	}
 	ei->kind = kind;
@@ -4740,7 +4755,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 		break;
 	case SCX_DISABLED:
 		pr_warn("sched_ext: ops error detected without ops (%s)\n",
-			scx_root->exit_info->msg);
+			sch->exit_info->msg);
 		WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
 		goto done;
 	default:
@@ -4807,41 +4822,43 @@ static void scx_disable_workfn(struct kthread_work *work)
 
 	/* no task is on scx, turn off all the switches and flush in-progress calls */
 	static_branch_disable(&__scx_enabled);
-	bitmap_zero(scx_root->has_op, SCX_OPI_END);
+	bitmap_zero(sch->has_op, SCX_OPI_END);
 	scx_idle_disable();
 	synchronize_rcu();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-		       scx_root->ops.name, ei->reason);
+		       sch->ops.name, ei->reason);
 
 		if (ei->msg[0] != '\0')
-			pr_err("sched_ext: %s: %s\n",
-			       scx_root->ops.name, ei->msg);
+			pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
 #ifdef CONFIG_STACKTRACE
 		stack_trace_print(ei->bt, ei->bt_len, 2);
 #endif
 	} else {
 		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
-			scx_root->ops.name, ei->reason);
+			sch->ops.name, ei->reason);
 	}
 
-	if (scx_root->ops.exit)
+	if (sch->ops.exit)
 		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
 
 	cancel_delayed_work_sync(&scx_watchdog_work);
 
 	/*
-	 * Delete the kobject from the hierarchy eagerly in addition to just
-	 * dropping a reference. Otherwise, if the object is deleted
-	 * asynchronously, sysfs could observe an object of the same name still
-	 * in the hierarchy when another scheduler is loaded.
+	 * scx_root clearing must be inside cpus_read_lock(). See
+	 * handle_hotplug().
 	 */
-	kobject_del(scx_root->kobj);
-	kobject_put(scx_root->kobj);
-	scx_root->kobj = NULL;
+	cpus_read_lock();
+	RCU_INIT_POINTER(scx_root, NULL);
+	cpus_read_unlock();
 
-	memset(&scx_root->ops, 0, sizeof(scx_root->ops));
+	/*
+	 * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+	 * could observe an object of the same name still in the hierarchy when
+	 * the next scheduler is loaded.
+	 */
+	kobject_del(&sch->kobj);
 
 	rhashtable_walk_enter(&dsq_hash, &rht_iter);
 	do {
@@ -4858,9 +4875,6 @@ static void scx_disable_workfn(struct kthread_work *work)
 	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
 
-	free_exit_info(scx_root->exit_info);
-	scx_root->exit_info = NULL;
-
 	mutex_unlock(&scx_enable_mutex);
 
 	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4885,13 +4899,18 @@ static void schedule_scx_disable_work(void)
 static void scx_disable(enum scx_exit_kind kind)
 {
 	int none = SCX_EXIT_NONE;
+	struct scx_sched *sch;
 
 	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
 		kind = SCX_EXIT_ERROR;
 
-	atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind);
-
-	schedule_scx_disable_work();
+	rcu_read_lock();
+	sch = rcu_dereference(scx_root);
+	if (sch) {
+		atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+		schedule_scx_disable_work();
+	}
+	rcu_read_unlock();
 }
 
 static void dump_newline(struct seq_buf *s)
@@ -5288,6 +5307,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
 
 static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 {
+	struct scx_sched *sch;
 	struct scx_task_iter sti;
 	struct task_struct *p;
 	unsigned long timeout;
@@ -5351,33 +5371,33 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		goto err_unlock;
 	}
 
-	scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL);
-	if (!scx_root->kobj) {
+	sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+	if (!sch) {
 		ret = -ENOMEM;
 		goto err_unlock;
 	}
 
-	scx_root->kobj->kset = scx_kset;
-	ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root");
-	if (ret < 0)
-		goto err;
-
-	scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
-	if (!scx_root->exit_info) {
+	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+	if (!sch->exit_info) {
 		ret = -ENOMEM;
-		goto err_del;
+		goto err_free;
 	}
 
+	sch->kobj.kset = scx_kset;
+	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+	if (ret < 0)
+		goto err_free;
+
+	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+	sch->ops = *ops;
+	ops->priv = sch;
+
 	/*
-	 * Set scx_ops, transition to ENABLING and clear exit info to arm the
-	 * disable path. Failure triggers full disabling from here on.
+	 * Transition to ENABLING and clear exit info to arm the disable path.
+	 * Failure triggers full disabling from here on.
 	 */
-	scx_root->ops = *ops;
-
 	WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
-
-	atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
-	scx_root->warned_zero_slice = false;
+	WARN_ON_ONCE(scx_root);
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
@@ -5390,9 +5410,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 */
 	cpus_read_lock();
 
+	/*
+	 * Make the scheduler instance visible. Must be inside cpus_read_lock().
+	 * See handle_hotplug().
+	 */
+	rcu_assign_pointer(scx_root, sch);
+
 	scx_idle_enable(ops);
 
-	if (scx_root->ops.init) {
+	if (sch->ops.init) {
 		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
@@ -5404,7 +5430,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
-			set_bit(i, scx_root->has_op);
+			set_bit(i, sch->has_op);
 
 	check_hotplug_seq(ops);
 	scx_idle_update_selcpu_topology(ops);
@@ -5445,10 +5471,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
-			set_bit(i, scx_root->has_op);
+			set_bit(i, sch->has_op);
 
-	if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release)
-		scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+	if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+		sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
 
 	/*
 	 * Lock out forks, cgroup on/offlining and moves before opening the
@@ -5547,7 +5573,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_bypass(false);
 
 	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
-		WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE);
+		WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
 		goto err_disable;
 	}
 
@@ -5555,23 +5581,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		static_branch_enable(&__scx_switched_all);
 
 	pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
-		scx_root->ops.name, scx_switched_all() ? "" : " (partial)");
-	kobject_uevent(scx_root->kobj, KOBJ_ADD);
+		sch->ops.name, scx_switched_all() ? "" : " (partial)");
+	kobject_uevent(&sch->kobj, KOBJ_ADD);
 	mutex_unlock(&scx_enable_mutex);
 
 	atomic_long_inc(&scx_enable_seq);
 
 	return 0;
 
-err_del:
-	kobject_del(scx_root->kobj);
-err:
-	kobject_put(scx_root->kobj);
-	scx_root->kobj = NULL;
-	if (scx_root->exit_info) {
-		free_exit_info(scx_root->exit_info);
-		scx_root->exit_info = NULL;
-	}
+err_free:
+	if (sch->exit_info)
+		free_exit_info(sch->exit_info);
+	kfree(sch);
 err_unlock:
 	mutex_unlock(&scx_enable_mutex);
 	return ret;
@@ -5589,7 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 * is notified through ops.exit() with all the details.
 	 *
 	 * Flush scx_disable_work to ensure that error is reported before init
-	 * completion.
+	 * completion. sch's base reference will be put by bpf_scx_unreg().
 	 */
 	scx_error("scx_enable() failed (%d)", ret);
 	kthread_flush_work(&scx_disable_work);
@@ -5741,8 +5762,12 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 {
+	struct sched_ext_ops *ops = kdata;
+	struct scx_sched *sch = ops->priv;
+
 	scx_disable(SCX_EXIT_UNREG);
 	kthread_flush_work(&scx_disable_work);
+	kobject_put(&sch->kobj);
 }
 
 static int bpf_scx_init(struct btf *btf)
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2025-04-25 21:58 UTC | newest]

Thread overview: 19+ messages
2025-04-23 23:44 [PATCHSET sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
2025-04-23 23:44 ` [PATCH 01/12] " Tejun Heo
2025-04-23 23:44 ` [PATCH 02/12] sched_ext: Avoid NULL scx_root deref through SCX_HAS_OP() Tejun Heo
2025-04-24  7:23   ` Chengming Zhou
2025-04-24 18:55     ` Tejun Heo
2025-04-23 23:44 ` [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched Tejun Heo
2025-04-25 10:14   ` Andrea Righi
2025-04-25 19:48     ` Tejun Heo
2025-04-23 23:44 ` [PATCH 04/12] sched_ext: Inline create_dsq() into scx_bpf_create_dsq() Tejun Heo
2025-04-23 23:44 ` [PATCH 05/12] sched_ext: Factor out scx_alloc_and_add_sched() Tejun Heo
2025-04-23 23:44 ` [PATCH 06/12] sched_ext: Move dsq_hash into scx_sched Tejun Heo
2025-04-23 23:44 ` [PATCH 07/12] sched_ext: Move global_dsqs " Tejun Heo
2025-04-23 23:44 ` [PATCH 08/12] sched_ext: Relocate scx_event_stats definition Tejun Heo
2025-04-23 23:44 ` [PATCH 09/12] sched_ext: Factor out scx_read_events() Tejun Heo
2025-04-23 23:44 ` [PATCH 10/12] sched_ext: Move event_stats_cpu into scx_sched Tejun Heo
2025-04-25  5:38   ` Changwoo Min
2025-04-23 23:44 ` [PATCH 11/12] sched_ext: Move disable machinery " Tejun Heo
2025-04-23 23:44 ` [PATCH 12/12] sched_ext: Clean up SCX_EXIT_NONE handling in scx_disable_workfn() Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2025-04-25 21:58 [PATCHSET v2 sched_ext/for-6.16] sched_ext: Introduce scx_sched Tejun Heo
2025-04-25 21:58 ` [PATCH 03/12] sched_ext: Use dynamic allocation for scx_sched Tejun Heo
