Sched_ext development
* [PATCH sched_ext/for-7.2] sched_ext: Refactor scx_root_enable_workfn() enablement task migration
@ 2026-04-21  7:41 Cheng-Yang Chou
From: Cheng-Yang Chou @ 2026-04-21  7:41 UTC (permalink / raw)
  To: sched-ext, Tejun Heo, David Vernet, Andrea Righi, Changwoo Min
  Cc: Ching-Chun Huang, Chia-Ping Tsai, yphbchou0911

The scx_root_enable_workfn() function handles several distinct phases of
scheduler enablement. Decomposing it into focused helpers clarifies the
high-level flow:

  +-----------------------+
  | 1. Prep & Sched Alloc |
  +-----------------------+
             |
             v
  +-----------------------+
  | 2. CPU & Subsys Init  |
  +-----------------------+
             |
             v
  +-----------------------+
  | 3. Task Preparation   | <-- Extracted to scx_ops_prep_tasks()
  +-----------------------+
             |
             v
  +-----------------------+
  | 4. THE COMMIT POINT   | <-- static_branch_enable()
  +-----------------------+
             |
             v
  +-----------------------+
  | 5. Task Switching     | <-- Extracted to scx_ops_switch_tasks()
  +-----------------------+
             |
             v
  +-----------------------+
  | 6. Finalize & Cleanup |
  +-----------------------+

Phases 3 and 5 previously consisted of large, deeply indented task-iteration
loops that made the critical transition point (Phase 4) hard to spot.

- scx_ops_prep_tasks() handles Phase 3: it acquires scx_fork_rwsem and the
  cgroup lock, runs task-specific initialization, and inserts tasks into the
  TID hash.

- scx_ops_switch_tasks() handles Phase 5, performing the final transition
  of READY tasks to the ext_sched_class.
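
After the refactor, the enable path reads roughly as follows (a condensed
sketch of the result, not the verbatim diff; the ellipses elide the
unchanged phases):

  static void scx_root_enable_workfn(struct kthread_work *work)
  {
  	...
  	/* Phase 3: initialize every task; forks and cgroup ops locked out */
  	ret = scx_ops_prep_tasks(sch);
  	if (ret)
  		goto err_disable;

  	/* Phase 4: the commit point */
  	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
  	static_branch_enable(&__scx_enabled);

  	/* Phase 5: move READY tasks onto their new sched class */
  	scx_ops_switch_tasks(sch);

  	/* Phase 6: finalize */
  	scx_bypass(sch, false);
  	...
  }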

No functional changes.

Test plan:
- Run all demo schedulers under stress-ng -c 10 and verify no crashes; an
  example invocation is sketched below.
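
For example (illustrative only; assumes the scx_simple demo scheduler has
been built from tools/sched_ext and is run with root privileges):

  $ sudo ./scx_simple &
  $ stress-ng -c 10 -t 60    # 10 CPU workers for 60 seconds
  $ sudo kill %1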

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
Currently, scx_root_enable_workfn() is ~240 lines long.
IMHO, this makes it hard to maintain and follow.

 kernel/sched/ext.c | 216 +++++++++++++++++++++++++--------------------
 1 file changed, 119 insertions(+), 97 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0a53a0dd64bf..21880db4e757 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6673,14 +6673,128 @@ struct scx_enable_cmd {
 	int			ret;
 };
 
+static int scx_ops_prep_tasks(struct scx_sched *sch)
+{
+	struct scx_task_iter sti;
+	struct task_struct *p;
+	int ret;
+
+	percpu_down_write(&scx_fork_rwsem);
+
+	WARN_ON_ONCE(scx_init_task_enabled);
+	scx_init_task_enabled = true;
+
+	/* flip under fork_rwsem; the iter below covers existing tasks */
+	if (sch->ops.flags & SCX_OPS_TID_TO_TASK)
+		static_branch_enable(&__scx_tid_to_task_enabled);
+
+	/*
+	 * Enable ops for every task. Fork is excluded by scx_fork_rwsem
+	 * preventing new tasks from being added. No need to exclude tasks
+	 * leaving as sched_ext_free() can handle both prepped and enabled
+	 * tasks. Prep all tasks first and then enable them with preemption
+	 * disabled.
+	 *
+	 * All cgroups should be initialized before scx_init_task() so that the
+	 * BPF scheduler can reliably track each task's cgroup membership from
+	 * scx_init_task(). Lock out cgroup on/offlining and task migrations
+	 * while tasks are being initialized so that scx_cgroup_can_attach()
+	 * never sees uninitialized tasks.
+	 */
+	scx_cgroup_lock();
+	set_cgroup_sched(sch_cgroup(sch), sch);
+	ret = scx_cgroup_init(sch);
+	if (ret)
+		goto err_unlock_all;
+
+	scx_task_iter_start(&sti, NULL);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		/*
+		 * @p may already be dead, have lost all its usages counts and
+		 * be waiting for RCU grace period before being freed. @p can't
+		 * be initialized for SCX in such cases and should be ignored.
+		 */
+		if (!tryget_task_struct(p))
+			continue;
+
+		scx_task_iter_unlock(&sti);
+
+		ret = scx_init_task(sch, p, false);
+		if (ret) {
+			put_task_struct(p);
+			scx_task_iter_stop(&sti);
+			scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
+				  ret, p->comm, p->pid);
+			goto err_unlock_all;
+		}
+
+		scx_set_task_sched(p, sch);
+		scx_set_task_state(p, SCX_TASK_READY);
+
+		/*
+		 * Insert into the tid hash under scx_tasks_lock so we can't
+		 * race sched_ext_dead() and leave a stale entry for an already
+		 * exited task.
+		 */
+		if (scx_tid_to_task_enabled()) {
+			guard(raw_spinlock_irq)(&scx_tasks_lock);
+			if (!list_empty(&p->scx.tasks_node))
+				scx_tid_hash_insert(p);
+		}
+
+		put_task_struct(p);
+	}
+	scx_task_iter_stop(&sti);
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+
+	return 0;
+
+err_unlock_all:
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+	return ret;
+}
+
+static void scx_ops_switch_tasks(struct scx_sched *sch)
+{
+	struct scx_task_iter sti;
+	struct task_struct *p;
+
+	percpu_down_write(&scx_fork_rwsem);
+
+	/*
+	 * We're fully committed and can't fail. The task READY -> ENABLED
+	 * transitions here are synchronized against sched_ext_free() through
+	 * scx_tasks_lock.
+	 */
+	scx_task_iter_start(&sti, NULL);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+		const struct sched_class *old_class = p->sched_class;
+		const struct sched_class *new_class = scx_setscheduler_class(p);
+
+		if (scx_get_task_state(p) != SCX_TASK_READY)
+			continue;
+
+		if (old_class != new_class)
+			queue_flags |= DEQUEUE_CLASS;
+
+		scoped_guard (sched_change, p, queue_flags) {
+			p->scx.slice = READ_ONCE(sch->slice_dfl);
+			p->sched_class = new_class;
+		}
+	}
+	scx_task_iter_stop(&sti);
+	percpu_up_write(&scx_fork_rwsem);
+}
+
 static void scx_root_enable_workfn(struct kthread_work *work)
 {
 	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
 	struct sched_ext_ops *ops = cmd->ops;
 	struct cgroup *cgrp = root_cgroup();
 	struct scx_sched *sch;
-	struct scx_task_iter sti;
-	struct task_struct *p;
 	int i, cpu, ret;
 
 	mutex_lock(&scx_enable_mutex);
@@ -6790,74 +6904,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	 * Lock out forks, cgroup on/offlining and moves before opening the
 	 * floodgate so that they don't wander into the operations prematurely.
 	 */
-	percpu_down_write(&scx_fork_rwsem);
-
-	WARN_ON_ONCE(scx_init_task_enabled);
-	scx_init_task_enabled = true;
-
-	/* flip under fork_rwsem; the iter below covers existing tasks */
-	if (ops->flags & SCX_OPS_TID_TO_TASK)
-		static_branch_enable(&__scx_tid_to_task_enabled);
-
-	/*
-	 * Enable ops for every task. Fork is excluded by scx_fork_rwsem
-	 * preventing new tasks from being added. No need to exclude tasks
-	 * leaving as sched_ext_free() can handle both prepped and enabled
-	 * tasks. Prep all tasks first and then enable them with preemption
-	 * disabled.
-	 *
-	 * All cgroups should be initialized before scx_init_task() so that the
-	 * BPF scheduler can reliably track each task's cgroup membership from
-	 * scx_init_task(). Lock out cgroup on/offlining and task migrations
-	 * while tasks are being initialized so that scx_cgroup_can_attach()
-	 * never sees uninitialized tasks.
-	 */
-	scx_cgroup_lock();
-	set_cgroup_sched(sch_cgroup(sch), sch);
-	ret = scx_cgroup_init(sch);
+	ret = scx_ops_prep_tasks(sch);
 	if (ret)
-		goto err_disable_unlock_all;
-
-	scx_task_iter_start(&sti, NULL);
-	while ((p = scx_task_iter_next_locked(&sti))) {
-		/*
-		 * @p may already be dead, have lost all its usages counts and
-		 * be waiting for RCU grace period before being freed. @p can't
-		 * be initialized for SCX in such cases and should be ignored.
-		 */
-		if (!tryget_task_struct(p))
-			continue;
-
-		scx_task_iter_unlock(&sti);
-
-		ret = scx_init_task(sch, p, false);
-		if (ret) {
-			put_task_struct(p);
-			scx_task_iter_stop(&sti);
-			scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
-				  ret, p->comm, p->pid);
-			goto err_disable_unlock_all;
-		}
-
-		scx_set_task_sched(p, sch);
-		scx_set_task_state(p, SCX_TASK_READY);
-
-		/*
-		 * Insert into the tid hash under scx_tasks_lock so we can't
-		 * race sched_ext_dead() and leave a stale entry for an already
-		 * exited task.
-		 */
-		if (scx_tid_to_task_enabled()) {
-			guard(raw_spinlock_irq)(&scx_tasks_lock);
-			if (!list_empty(&p->scx.tasks_node))
-				scx_tid_hash_insert(p);
-		}
-
-		put_task_struct(p);
-	}
-	scx_task_iter_stop(&sti);
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
+		goto err_disable;
 
 	/*
 	 * All tasks are READY. It's safe to turn on scx_enabled() and switch
@@ -6866,31 +6915,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
 	static_branch_enable(&__scx_enabled);
 
-	/*
-	 * We're fully committed and can't fail. The task READY -> ENABLED
-	 * transitions here are synchronized against sched_ext_free() through
-	 * scx_tasks_lock.
-	 */
-	percpu_down_write(&scx_fork_rwsem);
-	scx_task_iter_start(&sti, NULL);
-	while ((p = scx_task_iter_next_locked(&sti))) {
-		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
-		const struct sched_class *old_class = p->sched_class;
-		const struct sched_class *new_class = scx_setscheduler_class(p);
-
-		if (scx_get_task_state(p) != SCX_TASK_READY)
-			continue;
-
-		if (old_class != new_class)
-			queue_flags |= DEQUEUE_CLASS;
-
-		scoped_guard (sched_change, p, queue_flags) {
-			p->scx.slice = READ_ONCE(sch->slice_dfl);
-			p->sched_class = new_class;
-		}
-	}
-	scx_task_iter_stop(&sti);
-	percpu_up_write(&scx_fork_rwsem);
+	scx_ops_switch_tasks(sch);
 
 	scx_bypass(sch, false);
 
@@ -6922,9 +6947,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	cmd->ret = ret;
 	return;
 
-err_disable_unlock_all:
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 	/* we'll soon enter disable path, keep bypass on */
 err_disable:
 	mutex_unlock(&scx_enable_mutex);
-- 
2.48.1

