From: Tejun Heo <tj@kernel.org>
To: David Vernet <void@manifault.com>,
Andrea Righi <arighi@nvidia.com>,
Changwoo Min <changwoo@igalia.com>
Cc: sched-ext@lists.linux.dev, Emil Tsalapatis <emil@etsalapatis.com>,
linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH v2 sched_ext/for-7.3 4/4] sched_ext: Split sub-scheduler implementation into sub.c
Date: Wed, 1 Jul 2026 08:10:46 -1000
Message-ID: <20260701181046.2490390-5-tj@kernel.org>
In-Reply-To: <20260701181046.2490390-1-tj@kernel.org>
The sub-scheduler implementation has grown and will continue to expand. Move
the sub-scheduler functions from ext.c into a new kernel/sched/ext/sub.c.
sub.h holds the prototypes and the !CONFIG_EXT_SUB_SCHED no-op stubs.
scx_dispatch_sched() is shared: balance_one() in ext.c and the
scx_bpf_sub_dispatch() kfunc in sub.c both call it, and the latter re-enters
it as sub-scheduler dispatch nests. It moves into sub.h as a static
__always_inline so both callers keep it inlined and per-level stack stays
bounded across the recursion. The event macros it uses move to internal.h.
No functional change.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
v2: Fold the scx_dispatch_sched() sub.h promotion into this patch (was a
separate later patch in v1) so the split is self-contained (Andrea).
kernel/sched/build_policy.c | 2 +
kernel/sched/ext/ext.c | 811 +-----------------------------------
kernel/sched/ext/internal.h | 28 ++
kernel/sched/ext/sub.c | 668 +++++++++++++++++++++++++++++
kernel/sched/ext/sub.h | 161 +++++++
5 files changed, 860 insertions(+), 810 deletions(-)
create mode 100644 kernel/sched/ext/sub.c
create mode 100644 kernel/sched/ext/sub.h
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index d74b54f81992..01dc7bf89af8 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -66,10 +66,12 @@
# include "ext/cid.h"
# include "ext/arena.h"
# include "ext/idle.h"
+# include "ext/sub.h"
# include "ext/ext.c"
# include "ext/cid.c"
# include "ext/arena.c"
# include "ext/idle.c"
+# include "ext/sub.c"
#endif
#include "syscalls.c"
diff --git a/kernel/sched/ext/ext.c b/kernel/sched/ext/ext.c
index f48d15ecd736..cd18ca6c8a59 100644
--- a/kernel/sched/ext/ext.c
+++ b/kernel/sched/ext/ext.c
@@ -19,6 +19,7 @@
#include "cid.h"
#include "arena.h"
#include "idle.h"
+#include "sub.h"
DEFINE_RAW_SPINLOCK(scx_sched_lock);
@@ -272,58 +273,6 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
-#ifdef CONFIG_EXT_SUB_SCHED
-/**
- * scx_next_descendant_pre - find the next descendant for pre-order walk
- * @pos: the current position (%NULL to initiate traversal)
- * @root: sched whose descendants to walk
- *
- * To be used by scx_for_each_descendant_pre(). Find the next descendant to
- * visit for pre-order traversal of @root's descendants. @root is included in
- * the iteration and the first node to be visited.
- */
-static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
- struct scx_sched *root)
-{
- struct scx_sched *next;
-
- lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
- lockdep_is_held(&scx_sched_lock));
-
- /* if first iteration, visit @root */
- if (!pos)
- return root;
-
- /* visit the first child if exists */
- next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
- if (next)
- return next;
-
- /* no child, visit my or the closest ancestor's next sibling */
- while (pos != root) {
- if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
- return list_next_entry(pos, sibling);
- pos = scx_parent(pos);
- }
-
- return NULL;
-}
-
-static struct scx_sched *scx_find_sub_sched(u64 cgroup_id)
-{
- return rhashtable_lookup(&scx_sched_hash, &cgroup_id,
- scx_sched_hash_params);
-}
-
-static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
-{
- rcu_assign_pointer(p->scx.sched, sch);
-}
-#else /* CONFIG_EXT_SUB_SCHED */
-static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
-static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
-#endif /* CONFIG_EXT_SUB_SCHED */
-
/**
* scx_is_descendant - Test whether sched is a descendant
* @sch: sched to test
@@ -338,19 +287,6 @@ static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
return sch->ancestors[ancestor->level] == ancestor;
}
-/**
- * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
- * @pos: iteration cursor
- * @root: sched to walk the descendants of
- *
- * Walk @root's descendants. @root is included in the iteration and the first
- * node to be visited. Must be called with either scx_enable_mutex or
- * scx_sched_lock held.
- */
-#define scx_for_each_descendant_pre(pos, root) \
- for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \
- (pos) = scx_next_descendant_pre((pos), (root)))
-
static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu)
{
return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
@@ -936,32 +872,6 @@ struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return NULL;
}
-/**
- * scx_add_event - Increase an event counter for 'name' by 'cnt'
- * @sch: scx_sched to account events for
- * @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occurred
- *
- * This can be used when preemption is not disabled.
- */
-#define scx_add_event(sch, name, cnt) do { \
- this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
- trace_sched_ext_event(#name, (cnt)); \
-} while(0)
-
-/**
- * __scx_add_event - Increase an event counter for 'name' by 'cnt'
- * @sch: scx_sched to account events for
- * @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occurred
- *
- * This should be used only when preemption is disabled.
- */
-#define __scx_add_event(sch, name, cnt) do { \
- __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
- trace_sched_ext_event(#name, cnt); \
-} while(0)
-
/**
* scx_dump_event - Dump an event 'kind' in 'events' to 's'
* @s: output seq_buf
@@ -2682,115 +2592,6 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}
-/*
- * One user of this function is scx_bpf_dispatch() which can be called
- * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
- * from the call frame.
- */
-static __always_inline bool
-scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
- struct task_struct *prev, bool nested)
-{
- struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
- int nr_loops = SCX_DSP_MAX_LOOPS;
- s32 cpu = cpu_of(rq);
- bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
- scx_task_on_sched(sch, prev);
-
- if (scx_consume_global_dsq(sch, rq))
- return true;
-
- if (scx_bypass_dsp_enabled(sch)) {
- /* if @sch is bypassing, only the bypass DSQs are active */
- if (scx_bypassing(sch, cpu))
- return scx_consume_dispatch_q(sch, rq, scx_bypass_dsq(sch, cpu), 0);
-
-#ifdef CONFIG_EXT_SUB_SCHED
- /*
- * If @sch isn't bypassing but its children are, @sch is
- * responsible for making forward progress for both its own
- * tasks that aren't bypassing and the bypassing descendants'
- * tasks. The following implements a simple built-in behavior -
- * let each CPU try to run the bypass DSQ every Nth time.
- *
- * Later, if necessary, we can add an ops flag to suppress the
- * auto-consumption and a kfunc to consume the bypass DSQ and,
- * so that the BPF scheduler can fully control scheduling of
- * bypassed tasks.
- */
- struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
-
- if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
- scx_consume_dispatch_q(sch, rq, scx_bypass_dsq(sch, cpu), 0)) {
- __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
- return true;
- }
-#endif /* CONFIG_EXT_SUB_SCHED */
- }
-
- if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
- return false;
-
- dspc->rq = rq;
-
- /*
- * The dispatch loop. Because scx_flush_dispatch_buf() may drop the rq
- * lock, the local DSQ might still end up empty after a successful
- * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
- * produced some tasks, retry. The BPF scheduler may depend on this
- * looping behavior to simplify its implementation.
- */
- do {
- dspc->nr_tasks = 0;
-
- if (nested) {
- SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
- prev_on_sch ? prev : NULL);
- } else {
- /* stash @prev so that nested invocations can access it */
- rq->scx.sub_dispatch_prev = prev;
- SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
- prev_on_sch ? prev : NULL);
- rq->scx.sub_dispatch_prev = NULL;
- }
-
- scx_flush_dispatch_buf(sch, rq);
-
- if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
- rq->scx.flags |= SCX_RQ_BAL_KEEP;
- return true;
- }
- if (rq->scx.local_dsq.nr)
- return true;
- if (scx_consume_global_dsq(sch, rq))
- return true;
-
- /*
- * ops.dispatch() can trap us in this loop by repeatedly
- * dispatching ineligible tasks. Break out once in a while to
- * allow the watchdog to run. As IRQ can't be enabled in
- * balance(), we want to complete this scheduling cycle and then
- * start a new one. IOW, we want to call resched_curr() on the
- * next, most likely idle, task, not the current one. Use
- * __scx_bpf_kick_cpu() for deferred kicking.
- */
- if (unlikely(!--nr_loops)) {
- scx_kick_cpu(sch, cpu, 0);
- break;
- }
- } while (dspc->nr_tasks);
-
- /*
- * Prevent the CPU from going idle while bypassed descendants have tasks
- * queued. Without this fallback, bypassed tasks could stall if the host
- * scheduler's ops.dispatch() doesn't yield any tasks.
- */
- if (scx_bypass_dsp_enabled(sch))
- return scx_consume_dispatch_q(sch, rq, scx_bypass_dsq(sch, cpu), 0);
-
- return false;
-}
-
static int balance_one(struct rq *rq, struct task_struct *prev)
{
struct scx_sched *sch = scx_root;
@@ -4470,26 +4271,6 @@ static inline void scx_cgroup_lock(void) {}
static inline void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
-#ifdef CONFIG_EXT_SUB_SCHED
-static struct cgroup *sch_cgroup(struct scx_sched *sch)
-{
- return sch->cgrp;
-}
-
-/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
-static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
-{
- struct cgroup *pos;
- struct cgroup_subsys_state *css;
-
- cgroup_for_each_live_descendant_pre(pos, css, cgrp)
- rcu_assign_pointer(pos->scx_sched, sch);
-}
-#else /* CONFIG_EXT_SUB_SCHED */
-static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
-static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
-#endif /* CONFIG_EXT_SUB_SCHED */
-
/*
* Omitted operations:
*
@@ -5766,202 +5547,6 @@ void scx_log_sched_disable(struct scx_sched *sch)
}
}
-#ifdef CONFIG_EXT_SUB_SCHED
-static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
-
-static void drain_descendants(struct scx_sched *sch)
-{
- /*
- * Child scheds that finished the critical part of disabling will take
- * themselves off @sch->children. Wait for it to drain. As propagation
- * is recursive, empty @sch->children means that all proper descendant
- * scheds reached unlinking stage.
- */
- wait_event(scx_unlink_waitq, list_empty(&sch->children));
-}
-
-static void scx_fail_parent(struct scx_sched *sch,
- struct task_struct *failed, s32 fail_code)
-{
- struct scx_sched *parent = scx_parent(sch);
- struct scx_task_iter sti;
- struct task_struct *p;
-
- scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
- fail_code, failed->comm, failed->pid);
-
- /*
- * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
- * it. This may cause downstream failures on the BPF side but $parent is
- * dying anyway.
- */
- scx_bypass(parent, true);
-
- scx_task_iter_start(&sti, sch->cgrp);
- while ((p = scx_task_iter_next_locked(&sti))) {
- if (scx_task_on_sched(parent, p))
- continue;
-
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
- scx_disable_and_exit_task(sch, p);
- scx_set_task_sched(p, parent);
- }
- }
- scx_task_iter_stop(&sti);
-}
-
-static void scx_sub_disable(struct scx_sched *sch)
-{
- struct scx_sched *parent = scx_parent(sch);
- struct scx_task_iter sti;
- struct task_struct *p;
- int ret;
-
- /*
- * Guarantee forward progress and wait for descendants to be disabled.
- * To limit disruptions, $parent is not bypassed. Tasks are fully
- * prepped and then inserted back into $parent.
- */
- scx_bypass(sch, true);
- drain_descendants(sch);
-
- /*
- * Here, every runnable task is guaranteed to make forward progress and
- * we can safely use blocking synchronization constructs. Actually
- * disable ops.
- */
- mutex_lock(&scx_enable_mutex);
- percpu_down_write(&scx_fork_rwsem);
- scx_cgroup_lock();
-
- set_cgroup_sched(sch_cgroup(sch), parent);
-
- scx_task_iter_start(&sti, sch->cgrp);
- while ((p = scx_task_iter_next_locked(&sti))) {
- struct rq *rq;
- struct rq_flags rf;
-
- /* filter out duplicate visits */
- if (scx_task_on_sched(parent, p))
- continue;
-
- /*
- * By the time control reaches here, all descendant schedulers
- * should already have been disabled.
- */
- WARN_ON_ONCE(!scx_task_on_sched(sch, p));
-
- /*
- * @p is pinned by the iter: css_task_iter_next() takes a
- * reference and holds it until the next iter_next() call, so
- * @p->usage is guaranteed > 0.
- */
- get_task_struct(p);
-
- scx_task_iter_unlock(&sti);
-
- /*
- * $p is READY or ENABLED on @sch. Initialize for $parent,
- * disable and exit from @sch, and then switch over to $parent.
- *
- * If a task fails to initialize for $parent, the only available
- * action is disabling $parent too. While this allows disabling
- * of a child sched to cause the parent scheduler to fail, the
- * failure can only originate from ops.init_task() of the
- * parent. A child can't directly affect the parent through its
- * own failures.
- */
- ret = __scx_init_task(parent, p, false);
- if (ret) {
- scx_fail_parent(sch, p, ret);
- put_task_struct(p);
- break;
- }
-
- rq = task_rq_lock(p, &rf);
-
- if (scx_get_task_state(p) == SCX_TASK_DEAD) {
- /*
- * sched_ext_dead() raced us between __scx_init_task()
- * and this rq lock and ran exit_task() on @sch (the
- * sched @p was on at that point), not on $parent.
- * $parent's just-completed init is owed an exit_task()
- * and we issue it here.
- */
- scx_sub_init_cancel_task(parent, p);
- task_rq_unlock(rq, p, &rf);
- put_task_struct(p);
- continue;
- }
-
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
- /*
- * $p is initialized for $parent and still attached to
- * @sch. Disable and exit for @sch, switch over to
- * $parent, override the state to READY to account for
- * $p having already been initialized, and then enable.
- */
- scx_disable_and_exit_task(sch, p);
- scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
- scx_set_task_state(p, SCX_TASK_INIT);
- scx_set_task_sched(p, parent);
- scx_set_task_state(p, SCX_TASK_READY);
- scx_enable_task(parent, p);
- }
-
- task_rq_unlock(rq, p, &rf);
- put_task_struct(p);
- }
- scx_task_iter_stop(&sti);
-
- scx_disable_dump(sch);
-
- scx_cgroup_unlock();
- percpu_up_write(&scx_fork_rwsem);
-
- /*
- * All tasks are moved off of @sch but there may still be on-going
- * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
- * the expedited version as ancestors may be waiting in bypass mode.
- * Also, tell the parent that there is no need to keep running bypass
- * DSQs for us.
- */
- synchronize_rcu_expedited();
- scx_disable_bypass_dsp(sch);
-
- scx_unlink_sched(sch);
-
- mutex_unlock(&scx_enable_mutex);
-
- /*
- * @sch is now unlinked from the parent's children list. Notify and call
- * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
- * after unlinking and releasing all locks. See scx_claim_exit().
- */
- wake_up_all(&scx_unlink_waitq);
-
- if (parent->ops.sub_detach && sch->sub_attached) {
- struct scx_sub_detach_args sub_detach_args = {
- .ops = &sch->ops,
- .cgroup_path = sch->cgrp_path,
- };
- SCX_CALL_OP(parent, sub_detach, NULL,
- &sub_detach_args);
- }
-
- scx_log_sched_disable(sch);
-
- if (sch->ops.exit)
- SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
- if (sch->sub_kset)
- kobject_del(&sch->sub_kset->kobj);
- kobject_del(&sch->kobj);
-}
-#else /* CONFIG_EXT_SUB_SCHED */
-static inline void drain_descendants(struct scx_sched *sch) { }
-static inline void scx_sub_disable(struct scx_sched *sch) { }
-#endif /* CONFIG_EXT_SUB_SCHED */
-
static void scx_root_disable(struct scx_sched *sch)
{
struct scx_task_iter sti;
@@ -7351,347 +6936,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
cmd->ret = 0;
}
-#ifdef CONFIG_EXT_SUB_SCHED
-/* verify that a scheduler can be attached to @cgrp and return the parent */
-static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
-{
- struct scx_sched *parent = cgrp->scx_sched;
- struct scx_sched *pos;
-
- lockdep_assert_held(&scx_sched_lock);
-
- /* can't attach twice to the same cgroup */
- if (parent->cgrp == cgrp)
- return ERR_PTR(-EBUSY);
-
- /* does $parent allow sub-scheds? */
- if (!parent->ops.sub_attach)
- return ERR_PTR(-EOPNOTSUPP);
-
- /* can't insert between $parent and its exiting children */
- list_for_each_entry(pos, &parent->children, sibling)
- if (cgroup_is_descendant(pos->cgrp, cgrp))
- return ERR_PTR(-EBUSY);
-
- return parent;
-}
-
-static bool assert_task_ready_or_enabled(struct task_struct *p)
-{
- u32 state = scx_get_task_state(p);
-
- switch (state) {
- case SCX_TASK_READY:
- case SCX_TASK_ENABLED:
- return true;
- default:
- WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched",
- state, p->comm, p->pid);
- return false;
- }
-}
-
-static void scx_sub_enable_workfn(struct kthread_work *work)
-{
- struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
- struct sched_ext_ops *ops = cmd->ops;
- struct cgroup *cgrp;
- struct scx_sched *parent, *sch;
- struct scx_task_iter sti;
- struct task_struct *p;
- s32 i, ret;
-
- mutex_lock(&scx_enable_mutex);
-
- if (!scx_enabled()) {
- ret = -ENODEV;
- goto out_unlock;
- }
-
- /* See scx_root_enable_workfn() for the @ops->priv check. */
- if (rcu_access_pointer(ops->priv)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
- if (IS_ERR(cgrp)) {
- ret = PTR_ERR(cgrp);
- goto out_unlock;
- }
-
- raw_spin_lock_irq(&scx_sched_lock);
- parent = find_parent_sched(cgrp);
- if (IS_ERR(parent)) {
- raw_spin_unlock_irq(&scx_sched_lock);
- ret = PTR_ERR(parent);
- goto out_put_cgrp;
- }
- kobject_get(&parent->kobj);
- raw_spin_unlock_irq(&scx_sched_lock);
-
- /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */
- sch = scx_alloc_and_add_sched(cmd, cgrp, parent);
- kobject_put(&parent->kobj);
- if (IS_ERR(sch)) {
- ret = PTR_ERR(sch);
- goto out_unlock;
- }
-
- ret = scx_link_sched(sch);
- if (ret)
- goto err_disable;
-
- if (sch->level >= SCX_SUB_MAX_DEPTH) {
- scx_error(sch, "max nesting depth %d violated",
- SCX_SUB_MAX_DEPTH);
- goto err_disable;
- }
-
- if (sch->ops.init) {
- ret = SCX_CALL_OP_RET(sch, init, NULL);
- if (ret) {
- ret = scx_ops_sanitize_err(sch, "init", ret);
- scx_error(sch, "ops.init() failed (%d)", ret);
- goto err_disable;
- }
- sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
- }
-
- ret = scx_arena_pool_init(sch);
- if (ret)
- goto err_disable;
-
- ret = scx_set_cmask_scratch_alloc(sch);
- if (ret)
- goto err_disable;
-
- if (scx_validate_ops(sch, ops))
- goto err_disable;
-
- struct scx_sub_attach_args sub_attach_args = {
- .ops = &sch->ops,
- .cgroup_path = sch->cgrp_path,
- };
-
- ret = SCX_CALL_OP_RET(parent, sub_attach, NULL,
- &sub_attach_args);
- if (ret) {
- ret = scx_ops_sanitize_err(sch, "sub_attach", ret);
- scx_error(sch, "parent rejected (%d)", ret);
- goto err_disable;
- }
- sch->sub_attached = true;
-
- scx_bypass(sch, true);
-
- for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
- if (((void (**)(void))ops)[i])
- set_bit(i, sch->has_op);
-
- percpu_down_write(&scx_fork_rwsem);
- scx_cgroup_lock();
-
- /*
- * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see
- * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down.
- */
- set_cgroup_sched(sch_cgroup(sch), sch);
- if (!(cgrp->self.flags & CSS_ONLINE)) {
- scx_error(sch, "cgroup is not online");
- goto err_unlock_and_disable;
- }
-
- /*
- * Initialize tasks for the new child $sch without exiting them for
- * $parent so that the tasks can always be reverted back to $parent
- * sched on child init failure.
- */
- WARN_ON_ONCE(scx_enabling_sub_sched);
- scx_enabling_sub_sched = sch;
-
- scx_task_iter_start(&sti, sch->cgrp);
- while ((p = scx_task_iter_next_locked(&sti))) {
- struct rq *rq;
- struct rq_flags rf;
-
- /*
- * Task iteration may visit the same task twice when racing
- * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which
- * finished __scx_init_task() and skip if set.
- *
- * A task may exit and get freed between __scx_init_task()
- * completion and scx_enable_task(). In such cases,
- * scx_disable_and_exit_task() must exit the task for both the
- * parent and child scheds.
- */
- if (p->scx.flags & SCX_TASK_SUB_INIT)
- continue;
-
- /* @p is pinned by the iter; see scx_sub_disable() */
- get_task_struct(p);
-
- if (!assert_task_ready_or_enabled(p)) {
- ret = -EINVAL;
- goto abort;
- }
-
- scx_task_iter_unlock(&sti);
-
- /*
- * As $p is still on $parent, it can't be transitioned to INIT.
- * Let's worry about task state later. Use __scx_init_task().
- */
- ret = __scx_init_task(sch, p, false);
- if (ret)
- goto abort;
-
- rq = task_rq_lock(p, &rf);
-
- if (scx_get_task_state(p) == SCX_TASK_DEAD) {
- /*
- * sched_ext_dead() raced us between __scx_init_task()
- * and this rq lock and ran exit_task() on $parent (the
- * sched @p was on at that point), not on @sch. @sch's
- * just-completed init is owed an exit_task() and we
- * issue it here.
- */
- scx_sub_init_cancel_task(sch, p);
- task_rq_unlock(rq, p, &rf);
- put_task_struct(p);
- continue;
- }
-
- p->scx.flags |= SCX_TASK_SUB_INIT;
- task_rq_unlock(rq, p, &rf);
-
- put_task_struct(p);
- }
- scx_task_iter_stop(&sti);
-
- /*
- * All tasks are prepped. Disable/exit tasks for $parent and enable for
- * the new @sch.
- */
- scx_task_iter_start(&sti, sch->cgrp);
- while ((p = scx_task_iter_next_locked(&sti))) {
- /*
- * Use clearing of %SCX_TASK_SUB_INIT to detect and skip
- * duplicate iterations.
- */
- if (!(p->scx.flags & SCX_TASK_SUB_INIT))
- continue;
-
- scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
- /*
- * $p must be either READY or ENABLED. If ENABLED,
- * __scx_disabled_and_exit_task() first disables and
- * makes it READY. However, after exiting $p, it will
- * leave $p as READY.
- */
- assert_task_ready_or_enabled(p);
- __scx_disable_and_exit_task(parent, p);
-
- /*
- * $p is now only initialized for @sch and READY, which
- * is what we want. Assign it to @sch and enable.
- */
- scx_set_task_sched(p, sch);
- scx_enable_task(sch, p);
-
- p->scx.flags &= ~SCX_TASK_SUB_INIT;
- }
- }
- scx_task_iter_stop(&sti);
-
- scx_enabling_sub_sched = NULL;
-
- scx_cgroup_unlock();
- percpu_up_write(&scx_fork_rwsem);
-
- scx_bypass(sch, false);
-
- pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
- kobject_uevent(&sch->kobj, KOBJ_ADD);
- ret = 0;
- goto out_unlock;
-
-out_put_cgrp:
- cgroup_put(cgrp);
-out_unlock:
- mutex_unlock(&scx_enable_mutex);
- cmd->ret = ret;
- return;
-
-abort:
- put_task_struct(p);
- scx_task_iter_stop(&sti);
-
- /*
- * Undo __scx_init_task() for tasks we marked. scx_enable_task() never
- * ran for @sch on them, so calling scx_disable_task() here would invoke
- * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched
- * must stay set until SUB_INIT is cleared from every marked task -
- * scx_disable_and_exit_task() reads it when a task exits concurrently.
- */
- scx_task_iter_start(&sti, sch->cgrp);
- while ((p = scx_task_iter_next_locked(&sti))) {
- if (p->scx.flags & SCX_TASK_SUB_INIT) {
- scx_sub_init_cancel_task(sch, p);
- p->scx.flags &= ~SCX_TASK_SUB_INIT;
- }
- }
- scx_task_iter_stop(&sti);
- scx_enabling_sub_sched = NULL;
-err_unlock_and_disable:
- /* we'll soon enter disable path, keep bypass on */
- scx_cgroup_unlock();
- percpu_up_write(&scx_fork_rwsem);
-err_disable:
- mutex_unlock(&scx_enable_mutex);
- scx_flush_disable_work(sch);
- cmd->ret = 0;
-}
-
-static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct cgroup *cgrp = data;
- struct cgroup *parent = cgroup_parent(cgrp);
-
- if (!cgroup_on_dfl(cgrp))
- return NOTIFY_OK;
-
- switch (action) {
- case CGROUP_LIFETIME_ONLINE:
- /* inherit ->scx_sched from $parent */
- if (parent)
- rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
- break;
- case CGROUP_LIFETIME_OFFLINE:
- /* if there is a sched attached, shoot it down */
- if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp)
- scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
- SCX_ECODE_RSN_CGROUP_OFFLINE,
- "cgroup %llu going offline", cgroup_id(cgrp));
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block scx_cgroup_lifetime_nb = {
- .notifier_call = scx_cgroup_lifetime_notify,
-};
-
-static s32 __init scx_cgroup_lifetime_notifier_init(void)
-{
- return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
- &scx_cgroup_lifetime_nb);
-}
-core_initcall(scx_cgroup_lifetime_notifier_init);
-#endif /* CONFIG_EXT_SUB_SCHED */
-
static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link)
{
static struct kthread_worker *helper;
@@ -7838,20 +7082,6 @@ static int bpf_scx_init_member(const struct btf_type *t,
return 0;
}
-#ifdef CONFIG_EXT_SUB_SCHED
-static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
-{
- struct scx_sched *sch;
-
- guard(rcu)();
- sch = scx_prog_sched(prog->aux);
- if (unlikely(!sch))
- return;
-
- scx_error(sch, "dispatch recursion detected");
-}
-#endif /* CONFIG_EXT_SUB_SCHED */
-
static int bpf_scx_check_member(const struct btf_type *t,
const struct btf_member *member,
const struct bpf_prog *prog)
@@ -9022,45 +8252,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-#ifdef CONFIG_EXT_SUB_SCHED
-/**
- * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
- * @cgroup_id: cgroup ID of the child scheduler to dispatch
- * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
- *
- * Allows a parent scheduler to trigger dispatching on one of its direct
- * child schedulers. The child scheduler runs its dispatch operation to
- * move tasks from dispatch queues to the local runqueue.
- *
- * Returns: true on success, false if cgroup_id is invalid, not a direct
- * child, or caller lacks dispatch permission.
- */
-__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
-{
- struct rq *this_rq = this_rq();
- struct scx_sched *parent, *child;
-
- guard(rcu)();
- parent = scx_prog_sched(aux);
- if (unlikely(!parent))
- return false;
-
- child = scx_find_sub_sched(cgroup_id);
-
- if (unlikely(!child))
- return false;
-
- if (unlikely(scx_parent(child) != parent)) {
- scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
- cgroup_id);
- return false;
- }
-
- return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
- true);
-}
-#endif /* CONFIG_EXT_SUB_SCHED */
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
diff --git a/kernel/sched/ext/internal.h b/kernel/sched/ext/internal.h
index 5d861cb0727d..01c8d6eac8dd 100644
--- a/kernel/sched/ext/internal.h
+++ b/kernel/sched/ext/internal.h
@@ -11,6 +11,34 @@
#include "../sched.h"
#include "types.h"
+#include <trace/events/sched_ext.h>
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of event occurrences to add
+ *
+ * Also traces the event. This can be used when preemption is not disabled.
+ */
+#define scx_add_event(sch, name, cnt) do { \
+ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of event occurrences to add
+ *
+ * Also traces the event. Use only when preemption is already disabled.
+ */
+#define __scx_add_event(sch, name, cnt) do { \
+ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
+} while(0)
+
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
#define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void)))
diff --git a/kernel/sched/ext/sub.c b/kernel/sched/ext/sub.c
new file mode 100644
index 000000000000..050420427273
--- /dev/null
+++ b/kernel/sched/ext/sub.c
@@ -0,0 +1,668 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Sub-scheduler hierarchy support.
+ *
+ * A sub-scheduler is an scx_sched attached to a cgroup subtree under another
+ * scx_sched. This file holds the sub-scheduler implementation: the scheduler
+ * tree walk, capability delegation, per-shard cap state and its sync, and the
+ * sub-scheduler enable/disable paths. The core dispatch/enqueue machinery it
+ * builds on lives in ext.c.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#include <linux/rhashtable.h>
+#include "internal.h"
+#include "cid.h"
+#include "arena.h"
+#include "sub.h"
+
+#ifdef CONFIG_EXT_SUB_SCHED
+
+/**
+ * scx_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: sched whose descendants to walk
+ *
+ * To be used by scx_for_each_descendant_pre(). Find the next descendant to
+ * visit for pre-order traversal of @root's descendants. @root is included in
+ * the iteration and the first node to be visited.
+ */
+struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root)
+{
+ struct scx_sched *next;
+
+ lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
+ lockdep_is_held(&scx_sched_lock));
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
+ return list_next_entry(pos, sibling);
+ pos = scx_parent(pos);
+ }
+
+ return NULL;
+}
+
+static struct scx_sched *scx_find_sub_sched(u64 cgroup_id)
+{
+ return rhashtable_lookup(&scx_sched_hash, &cgroup_id,
+ scx_sched_hash_params);
+}
+
+void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
+{
+ rcu_assign_pointer(p->scx.sched, sch);
+}
+
+struct cgroup *sch_cgroup(struct scx_sched *sch)
+{
+ return sch->cgrp;
+}
+
+/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
+void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+ struct cgroup *pos;
+ struct cgroup_subsys_state *css;
+
+ cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+ rcu_assign_pointer(pos->scx_sched, sch);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+void drain_descendants(struct scx_sched *sch)
+{
+ /*
+ * Child scheds that finished the critical part of disabling will take
+ * themselves off @sch->children. Wait for it to drain. As propagation
+ * is recursive, empty @sch->children means that all proper descendant
+ * scheds reached unlinking stage.
+ */
+ wait_event(scx_unlink_waitq, list_empty(&sch->children));
+}
+
+static void scx_fail_parent(struct scx_sched *sch,
+ struct task_struct *failed, s32 fail_code)
+{
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+
+ scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+ fail_code, failed->comm, failed->pid);
+
+ /*
+ * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+ * it. This may cause downstream failures on the BPF side but $parent is
+ * dying anyway.
+ */
+ scx_bypass(parent, true);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_sched(p, parent);
+ }
+ }
+ scx_task_iter_stop(&sti);
+}
+
+void scx_sub_disable(struct scx_sched *sch)
+{
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+ * To limit disruptions, $parent is not bypassed. Tasks are fully
+ * prepped and then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
+ drain_descendants(sch);
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_enable_mutex);
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ set_cgroup_sched(sch_cgroup(sch), parent);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * @p is pinned by the iter: css_task_iter_next() takes a
+ * reference and holds it until the next iter_next() call, so
+ * @p->usage is guaranteed > 0.
+ */
+ get_task_struct(p);
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
+ break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() raced us between __scx_init_task()
+ * and this rq lock and ran exit_task() on @sch (the
+ * sched @p was on at that point), not on $parent.
+ * $parent's just-completed init is owed an exit_task()
+ * and we issue it here.
+ */
+ scx_sub_init_cancel_task(parent, p);
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ continue;
+ }
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ scx_set_task_sched(p, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_disable_dump(sch);
+
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
+ scx_disable_bypass_dsp(sch);
+
+ scx_unlink_sched(sch);
+
+ mutex_unlock(&scx_enable_mutex);
+
+ /*
+ * @sch is now unlinked from the parent's children list. Notify and call
+ * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+ * after unlinking and releasing all locks. See scx_claim_exit().
+ */
+ wake_up_all(&scx_unlink_waitq);
+
+ if (parent->ops.sub_detach && sch->sub_attached) {
+ struct scx_sub_detach_args sub_detach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+ SCX_CALL_OP(parent, sub_detach, NULL,
+ &sub_detach_args);
+ }
+
+ scx_log_sched_disable(sch);
+
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
+ if (sch->sub_kset)
+ kobject_del(&sch->sub_kset->kobj);
+ kobject_del(&sch->kobj);
+}
+
+/* verify that a scheduler can be attached to @cgrp and return the parent */
+static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
+{
+ struct scx_sched *parent = cgrp->scx_sched;
+ struct scx_sched *pos;
+
+ lockdep_assert_held(&scx_sched_lock);
+
+ /* can't attach twice to the same cgroup */
+ if (parent->cgrp == cgrp)
+ return ERR_PTR(-EBUSY);
+
+ /* does $parent allow sub-scheds? */
+ if (!parent->ops.sub_attach)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /* can't insert between $parent and its exiting children */
+ list_for_each_entry(pos, &parent->children, sibling)
+ if (cgroup_is_descendant(pos->cgrp, cgrp))
+ return ERR_PTR(-EBUSY);
+
+ return parent;
+}
+
+/*
+ * Return %true if @p's SCX task state is %SCX_TASK_READY or %SCX_TASK_ENABLED.
+ * Any other state triggers a one-time warning naming the task and returns
+ * %false.
+ */
+static bool assert_task_ready_or_enabled(struct task_struct *p)
+{
+ u32 state = scx_get_task_state(p);
+
+ switch (state) {
+ case SCX_TASK_READY:
+ case SCX_TASK_ENABLED:
+ return true;
+ default:
+ /* @state is a u32, so it is printed with %u rather than %d */
+ WARN_ONCE(true, "sched_ext: Invalid task state %u for %s[%d] during enabling sub sched",
+ state, p->comm, p->pid);
+ return false;
+ }
+}
+
+void scx_sub_enable_workfn(struct kthread_work *work)
+{
+ struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
+ struct sched_ext_ops *ops = cmd->ops;
+ struct cgroup *cgrp;
+ struct scx_sched *parent, *sch;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ s32 i, ret;
+
+ mutex_lock(&scx_enable_mutex);
+
+ if (!scx_enabled()) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ /* See scx_root_enable_workfn() for the @ops->priv check. */
+ if (rcu_access_pointer(ops->priv)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
+ if (IS_ERR(cgrp)) {
+ ret = PTR_ERR(cgrp);
+ goto out_unlock;
+ }
+
+ raw_spin_lock_irq(&scx_sched_lock);
+ parent = find_parent_sched(cgrp);
+ if (IS_ERR(parent)) {
+ raw_spin_unlock_irq(&scx_sched_lock);
+ ret = PTR_ERR(parent);
+ goto out_put_cgrp;
+ }
+ kobject_get(&parent->kobj);
+ raw_spin_unlock_irq(&scx_sched_lock);
+
+ /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */
+ sch = scx_alloc_and_add_sched(cmd, cgrp, parent);
+ kobject_put(&parent->kobj);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
+ goto out_unlock;
+ }
+
+ ret = scx_link_sched(sch);
+ if (ret)
+ goto err_disable;
+
+ if (sch->level >= SCX_SUB_MAX_DEPTH) {
+ scx_error(sch, "max nesting depth %d violated",
+ SCX_SUB_MAX_DEPTH);
+ goto err_disable;
+ }
+
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, init, NULL);
+ if (ret) {
+ ret = scx_ops_sanitize_err(sch, "init", ret);
+ scx_error(sch, "ops.init() failed (%d)", ret);
+ goto err_disable;
+ }
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
+ }
+
+ ret = scx_arena_pool_init(sch);
+ if (ret)
+ goto err_disable;
+
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret)
+ goto err_disable;
+
+ if (scx_validate_ops(sch, ops))
+ goto err_disable;
+
+ struct scx_sub_attach_args sub_attach_args = {
+ .ops = &sch->ops,
+ .cgroup_path = sch->cgrp_path,
+ };
+
+ ret = SCX_CALL_OP_RET(parent, sub_attach, NULL,
+ &sub_attach_args);
+ if (ret) {
+ ret = scx_ops_sanitize_err(sch, "sub_attach", ret);
+ scx_error(sch, "parent rejected (%d)", ret);
+ goto err_disable;
+ }
+ sch->sub_attached = true;
+
+ scx_bypass(sch, true);
+
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+ if (((void (**)(void))ops)[i])
+ set_bit(i, sch->has_op);
+
+ percpu_down_write(&scx_fork_rwsem);
+ scx_cgroup_lock();
+
+ /*
+ * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see
+ * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down.
+ */
+ set_cgroup_sched(sch_cgroup(sch), sch);
+ if (!(cgrp->self.flags & CSS_ONLINE)) {
+ scx_error(sch, "cgroup is not online");
+ goto err_unlock_and_disable;
+ }
+
+ /*
+ * Initialize tasks for the new child $sch without exiting them for
+ * $parent so that the tasks can always be reverted back to $parent
+ * sched on child init failure.
+ */
+ WARN_ON_ONCE(scx_enabling_sub_sched);
+ scx_enabling_sub_sched = sch;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /*
+ * Task iteration may visit the same task twice when racing
+ * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which
+ * finished __scx_init_task() and skip if set.
+ *
+ * A task may exit and get freed between __scx_init_task()
+ * completion and scx_enable_task(). In such cases,
+ * scx_disable_and_exit_task() must exit the task for both the
+ * parent and child scheds.
+ */
+ if (p->scx.flags & SCX_TASK_SUB_INIT)
+ continue;
+
+ /* @p is pinned by the iter; see scx_sub_disable() */
+ get_task_struct(p);
+
+ if (!assert_task_ready_or_enabled(p)) {
+ ret = -EINVAL;
+ goto abort;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * As $p is still on $parent, it can't be transitioned to INIT.
+ * Let's worry about task state later. Use __scx_init_task().
+ */
+ ret = __scx_init_task(sch, p, false);
+ if (ret)
+ goto abort;
+
+ rq = task_rq_lock(p, &rf);
+
+ if (scx_get_task_state(p) == SCX_TASK_DEAD) {
+ /*
+ * sched_ext_dead() raced us between __scx_init_task()
+ * and this rq lock and ran exit_task() on $parent (the
+ * sched @p was on at that point), not on @sch. @sch's
+ * just-completed init is owed an exit_task() and we
+ * issue it here.
+ */
+ scx_sub_init_cancel_task(sch, p);
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ continue;
+ }
+
+ p->scx.flags |= SCX_TASK_SUB_INIT;
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ /*
+ * All tasks are prepped. Disable/exit tasks for $parent and enable for
+ * the new @sch.
+ */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * Use clearing of %SCX_TASK_SUB_INIT to detect and skip
+ * duplicate iterations.
+ */
+ if (!(p->scx.flags & SCX_TASK_SUB_INIT))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p must be either READY or ENABLED. If ENABLED,
+ * __scx_disable_and_exit_task() first disables and
+ * makes it READY. However, after exiting $p, it will
+ * leave $p as READY.
+ */
+ assert_task_ready_or_enabled(p);
+ __scx_disable_and_exit_task(parent, p);
+
+ /*
+ * $p is now only initialized for @sch and READY, which
+ * is what we want. Assign it to @sch and enable.
+ */
+ scx_set_task_sched(p, sch);
+ scx_enable_task(sch, p);
+
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_enabling_sub_sched = NULL;
+
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ scx_bypass(sch, false);
+
+ pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ ret = 0;
+ goto out_unlock;
+
+out_put_cgrp:
+ cgroup_put(cgrp);
+out_unlock:
+ mutex_unlock(&scx_enable_mutex);
+ cmd->ret = ret;
+ return;
+
+abort:
+ put_task_struct(p);
+ scx_task_iter_stop(&sti);
+
+ /*
+ * Undo __scx_init_task() for tasks we marked. scx_enable_task() never
+ * ran for @sch on them, so calling scx_disable_task() here would invoke
+ * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched
+ * must stay set until SUB_INIT is cleared from every marked task -
+ * scx_disable_and_exit_task() reads it when a task exits concurrently.
+ */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ scx_sub_init_cancel_task(sch, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
+err_unlock_and_disable:
+ /* we'll soon enter disable path, keep bypass on */
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+err_disable:
+ mutex_unlock(&scx_enable_mutex);
+ scx_flush_disable_work(sch);
+ cmd->ret = 0;
+}
+
+static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct cgroup *cgrp = data;
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (!cgroup_on_dfl(cgrp))
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CGROUP_LIFETIME_ONLINE:
+ /* inherit ->scx_sched from $parent */
+ if (parent)
+ rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ /* if there is a sched attached, shoot it down */
+ if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp)
+ scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_RSN_CGROUP_OFFLINE,
+ "cgroup %llu going offline", cgroup_id(cgrp));
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block scx_cgroup_lifetime_nb = {
+ .notifier_call = scx_cgroup_lifetime_notify,
+};
+
+static s32 __init scx_cgroup_lifetime_notifier_init(void)
+{
+ return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &scx_cgroup_lifetime_nb);
+}
+core_initcall(scx_cgroup_lifetime_notifier_init);
+
+void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = scx_prog_sched(prog->aux);
+ if (unlikely(!sch))
+ return;
+
+ scx_error(sch, "dispatch recursion detected");
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
+ * @cgroup_id: cgroup ID of the child scheduler to dispatch
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Allows a parent scheduler to trigger dispatching on one of its direct
+ * child schedulers. The child scheduler runs its dispatch operation to
+ * move tasks from dispatch queues to the local runqueue.
+ *
+ * Returns: true on success, false if cgroup_id is invalid, not a direct
+ * child, or caller lacks dispatch permission.
+ */
+__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_sched *parent, *child;
+
+ guard(rcu)();
+ parent = scx_prog_sched(aux);
+ if (unlikely(!parent))
+ return false;
+
+ child = scx_find_sub_sched(cgroup_id);
+
+ if (unlikely(!child))
+ return false;
+
+ if (unlikely(scx_parent(child) != parent)) {
+ scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
+ cgroup_id);
+ return false;
+ }
+
+ return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
+ true);
+}
+
+__bpf_kfunc_end_defs();
+
+#endif /* CONFIG_EXT_SUB_SCHED */
diff --git a/kernel/sched/ext/sub.h b/kernel/sched/ext/sub.h
new file mode 100644
index 000000000000..460a9fd196dc
--- /dev/null
+++ b/kernel/sched/ext/sub.h
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Sub-scheduler hierarchy support.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_SUB_H
+#define _KERNEL_SCHED_EXT_SUB_H
+
+#include "internal.h"
+#include "cid.h"
+
+#ifdef CONFIG_EXT_SUB_SCHED
+
+struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root);
+void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch);
+struct cgroup *sch_cgroup(struct scx_sched *sch);
+void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch);
+void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog);
+void drain_descendants(struct scx_sched *sch);
+void scx_sub_disable(struct scx_sched *sch);
+void scx_sub_enable_workfn(struct kthread_work *work);
+bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux);
+
+#else /* CONFIG_EXT_SUB_SCHED */
+
+static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
+static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
+static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
+static inline void drain_descendants(struct scx_sched *sch) { }
+static inline void scx_sub_disable(struct scx_sched *sch) { }
+
+#endif /* CONFIG_EXT_SUB_SCHED */
+
+/**
+ * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
+ * @pos: iteration cursor
+ * @root: sched to walk the descendants of
+ *
+ * Walk @root's descendants. @root is included in the iteration and the first
+ * node to be visited. Must be called with either scx_enable_mutex or
+ * scx_sched_lock held.
+ */
+#define scx_for_each_descendant_pre(pos, root) \
+ for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \
+ (pos) = scx_next_descendant_pre((pos), (root)))
+
+/*
+ * One user of this function is scx_bpf_sub_dispatch() which can be called
+ * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
+ * from the call frame.
+ */
+static __always_inline bool
+scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *prev, bool nested)
+{
+ struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
+ int nr_loops = SCX_DSP_MAX_LOOPS;
+ s32 cpu = cpu_of(rq);
+ bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
+ scx_task_on_sched(sch, prev);
+
+ if (scx_consume_global_dsq(sch, rq))
+ return true;
+
+ if (scx_bypass_dsp_enabled(sch)) {
+ /* if @sch is bypassing, only the bypass DSQs are active */
+ if (scx_bypassing(sch, cpu))
+ return scx_consume_dispatch_q(sch, rq, scx_bypass_dsq(sch, cpu), 0);
+
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * If @sch isn't bypassing but its children are, @sch is
+ * responsible for making forward progress for both its own
+ * tasks that aren't bypassing and the bypassing descendants'
+ * tasks. The following implements a simple built-in behavior -
+ * let each CPU try to run the bypass DSQ every Nth time.
+ *
+ * Later, if necessary, we can add an ops flag to suppress the
+ * auto-consumption and a kfunc to consume the bypass DSQ so
+ * that the BPF scheduler can fully control scheduling of
+ * bypassed tasks.
+ */
+ struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+ if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
+ scx_consume_dispatch_q(sch, rq, scx_bypass_dsq(sch, cpu), 0)) {
+ __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
+ return true;
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+ }
+
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
+ return false;
+
+ dspc->rq = rq;
+
+ /*
+ * The dispatch loop. Because scx_flush_dispatch_buf() may drop the rq
+ * lock, the local DSQ might still end up empty after a successful
+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
+ * produced some tasks, retry. The BPF scheduler may depend on this
+ * looping behavior to simplify its implementation.
+ */
+ do {
+ dspc->nr_tasks = 0;
+
+ if (nested) {
+ SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
+ prev_on_sch ? prev : NULL);
+ } else {
+ /* stash @prev so that nested invocations can access it */
+ rq->scx.sub_dispatch_prev = prev;
+ SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu),
+ prev_on_sch ? prev : NULL);
+ rq->scx.sub_dispatch_prev = NULL;
+ }
+
+ scx_flush_dispatch_buf(sch, rq);
+
+ if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ return true;
+ }
+ if (rq->scx.local_dsq.nr)
+ return true;
+ if (scx_consume_global_dsq(sch, rq))
+ return true;
+
+ /*
+ * ops.dispatch() can trap us in this loop by repeatedly
+ * dispatching ineligible tasks. Break out once in a while to
+ * allow the watchdog to run. As IRQ can't be enabled in
+ * balance(), we want to complete this scheduling cycle and then
+ * start a new one. IOW, we want to call resched_curr() on the
+ * next, most likely idle, task, not the current one. Use
+ * scx_kick_cpu() for deferred kicking.
+ */
+ if (unlikely(!--nr_loops)) {
+ scx_kick_cpu(sch, cpu, 0);
+ break;
+ }
+ } while (dspc->nr_tasks);
+
+ /*
+ * Prevent the CPU from going idle while bypassed descendants have tasks
+ * queued. Without this fallback, bypassed tasks could stall if the host
+ * scheduler's ops.dispatch() doesn't yield any tasks.
+ */
+ if (scx_bypass_dsp_enabled(sch))
+ return scx_consume_dispatch_q(sch, rq, scx_bypass_dsq(sch, cpu), 0);
+
+ return false;
+}
+
+#endif /* _KERNEL_SCHED_EXT_SUB_H */
--
2.54.0
next prev parent reply other threads:[~2026-07-01 18:10 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-07-01 18:10 [PATCHSET v2 sched_ext/for-7.3] sched_ext: Split sub-scheduler implementation into sub.c Tejun Heo
2026-07-01 18:10 ` [PATCH v2 sched_ext/for-7.3 1/4] sched_ext: Prefix file-local ext.c helpers exposed by the sub.c split Tejun Heo
2026-07-01 18:10 ` [PATCH v2 sched_ext/for-7.3 2/4] sched_ext: Expose the ext.c internals used " Tejun Heo
2026-07-01 18:10 ` [PATCH v2 sched_ext/for-7.3 3/4] sched_ext: Inline small ext.c helpers shared across " Tejun Heo
2026-07-01 18:10 ` Tejun Heo [this message]
2026-07-01 19:43 ` [PATCHSET v2 sched_ext/for-7.3] sched_ext: Split sub-scheduler implementation into sub.c Andrea Righi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260701181046.2490390-5-tj@kernel.org \
--to=tj@kernel.org \
--cc=arighi@nvidia.com \
--cc=changwoo@igalia.com \
--cc=emil@etsalapatis.com \
--cc=linux-kernel@vger.kernel.org \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox