From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com,
emil@etsalapatis.com, hannes@cmpxchg.org, mkoutny@suse.com,
cgroups@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling
Date: Tue, 24 Feb 2026 19:01:47 -1000 [thread overview]
Message-ID: <20260225050152.1070601-30-tj@kernel.org> (raw)
In-Reply-To: <20260225050152.1070601-1-tj@kernel.org>
The preceding changes implemented the framework to support cgroup
sub-scheds and updated scheduling paths and kfuncs so that they have
minimal but working support for sub-scheds. However, actual sub-sched
enabling/disabling hasn't been implemented yet and all tasks stayed on
scx_root.
Implement cgroup sub-sched enabling and disabling to actually activate
sub-scheds:
- Both enable and disable operations bypass only the tasks in the subtree
of the child being enabled or disabled to limit disruptions.
- When enabling, all candidate tasks are first initialized for the child
sched. Once that succeeds, the tasks are exited for the parent and then
switched over to the child. This adds a bit of complication but
guarantees that child scheduler failures are always contained.
- Disabling works the same way in the other direction. However, the
parent may fail to initialize a task, in which case the disabling is
propagated up to the parent. While this means that a parent sched may
fail due to a child sched
event, the failure can only originate from the parent itself (its
ops.init_task()). The only effect a malfunctioning child can have on the
parent is attempting to move the tasks back to the parent.
After this change, although not all the necessary mechanisms are in place
yet, sub-scheds can take control of their tasks and schedule them.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/sched/ext.h | 1 +
kernel/sched/ext.c | 278 +++++++++++++++++++++++++++++++++++++-
2 files changed, 273 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 3213e31c7979..f354d7d34306 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -88,6 +88,7 @@ enum scx_ent_flags {
SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */
SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
+ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */
SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */
SCX_TASK_STATE_BITS = 2,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ea29e77abb46..d26a92bc6be9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -51,6 +51,15 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+#ifdef CONFIG_EXT_SUB_SCHED
+/*
+ * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
+ * tasks for the sub-sched being enabled. Use a global variable instead of a
+ * per-task field as all enables are serialized.
+ */
+static struct scx_sched *scx_enabling_sub_sched;
+#endif /* CONFIG_EXT_SUB_SCHED */
+
/*
* A monotonically increasing sequence number that is incremented every time a
* scheduler is enabled. This can be used to check if any custom sched_ext
@@ -3337,6 +3346,17 @@ static void scx_disable_and_exit_task(struct scx_sched *sch,
{
__scx_disable_and_exit_task(sch, p);
+ /*
+ * If set, @p exited between __scx_init_task() and scx_enable_task() in
+ * scx_sub_enable() and is initialized for both the associated sched and
+ * its parent. Disable and exit for the child too.
+ */
+ if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
+ !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
+ __scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+
scx_set_task_sched(p, NULL);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3372,9 +3392,11 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
percpu_rwsem_assert_held(&scx_fork_rwsem);
if (scx_init_task_enabled) {
- ret = scx_init_task(scx_root, p, true);
+ struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
+
+ ret = scx_init_task(sch, p, true);
if (!ret)
- scx_set_task_sched(p, scx_root);
+ scx_set_task_sched(p, sch);
return ret;
}
@@ -4624,9 +4646,9 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
struct rq *rq = cpu_rq(cpu);
struct task_struct *p, *n;
+ raw_spin_lock(&scx_sched_lock);
raw_spin_rq_lock(rq);
- raw_spin_lock(&scx_sched_lock);
scx_for_each_descendant_pre(pos, sch) {
struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
@@ -4635,6 +4657,7 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
else
pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
}
+
raw_spin_unlock(&scx_sched_lock);
/*
@@ -4779,23 +4802,139 @@ static void drain_descendants(struct scx_sched *sch)
wait_event(scx_unlink_waitq, list_empty(&sch->children));
}
+static void scx_fail_parent(struct scx_sched *sch,
+ struct task_struct *failed, s32 fail_code)
+{
+ struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+
+ scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+ fail_code, failed->comm, failed->pid);
+
+ /*
+ * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+ * it. This may cause downstream failures on the BPF side but $parent is
+ * dying anyway.
+ */
+ scx_bypass(parent, true);
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ scx_disable_and_exit_task(sch, p);
+ rcu_assign_pointer(p->scx.sched, parent);
+ }
+ }
+ scx_task_iter_stop(&sti);
+}
+
static void scx_sub_disable(struct scx_sched *sch)
{
struct scx_sched *parent = scx_parent(sch);
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int ret;
+ /*
+ * Guarantee forward progress and wait for descendants to be disabled.
+ * To limit
+ * disruptions, $parent is not bypassed. Tasks are fully prepped and
+ * then inserted back into $parent.
+ */
+ scx_bypass(sch, true);
drain_descendants(sch);
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
mutex_lock(&scx_enable_mutex);
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
set_cgroup_sched(sch->cgrp, parent);
- /* TODO - perform actual disabling here */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /* filter out duplicate visits */
+ if (scx_task_on_sched(parent, p))
+ continue;
+
+ /*
+ * By the time control reaches here, all descendant schedulers
+ * should already have been disabled.
+ */
+ WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+ /*
+ * If $p is about to be freed, nothing prevents $sch from
+ * unloading before $p reaches sched_ext_free(). Disable and
+ * exit $p right away.
+ */
+ if (!tryget_task_struct(p)) {
+ scx_disable_and_exit_task(sch, p);
+ continue;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * $p is READY or ENABLED on @sch. Initialize for $parent,
+ * disable and exit from @sch, and then switch over to $parent.
+ *
+ * If a task fails to initialize for $parent, the only available
+ * action is disabling $parent too. While this allows disabling
+ * of a child sched to cause the parent scheduler to fail, the
+ * failure can only originate from ops.init_task() of the
+ * parent. A child can't directly affect the parent through its
+ * own failures.
+ */
+ ret = __scx_init_task(parent, p, false);
+ if (ret) {
+ scx_fail_parent(sch, p, ret);
+ put_task_struct(p);
+ break;
+ }
+
+ rq = task_rq_lock(p, &rf);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p is initialized for $parent and still attached to
+ * @sch. Disable and exit for @sch, switch over to
+ * $parent, override the state to READY to account for
+ * $p having already been initialized, and then enable.
+ */
+ scx_disable_and_exit_task(sch, p);
+ scx_set_task_state(p, SCX_TASK_INIT);
+ rcu_assign_pointer(p->scx.sched, parent);
+ scx_set_task_state(p, SCX_TASK_READY);
+ scx_enable_task(parent, p);
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
+ /*
+ * All tasks are moved off of @sch but there may still be on-going
+ * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+ * the expedited version as ancestors may be waiting in bypass mode.
+ * Also, tell the parent that there is no need to keep running bypass
+ * DSQs for us.
+ */
+ synchronize_rcu_expedited();
disable_bypass_dsp(sch);
raw_spin_lock_irq(&scx_sched_lock);
@@ -5903,11 +6042,28 @@ static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
return parent;
}
+static bool assert_task_ready_or_enabled(struct task_struct *p)
+{
+ enum scx_task_state state = scx_get_task_state(p);
+
+ switch (state) {
+ case SCX_TASK_READY:
+ case SCX_TASK_ENABLED:
+ return true;
+ default:
+ WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched",
+ state, p->comm, p->pid);
+ return false;
+ }
+}
+
static s32 scx_sub_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
struct cgroup *cgrp;
struct scx_sched *parent, *sch;
- s32 ret;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ s32 i, ret;
mutex_lock(&scx_enable_mutex);
@@ -5979,6 +6135,12 @@ static s32 scx_sub_enable(struct sched_ext_ops *ops, struct bpf_link *link)
}
sch->sub_attached = true;
+ scx_bypass(sch, true);
+
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+ if (((void (**)(void))ops)[i])
+ set_bit(i, sch->has_op);
+
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
@@ -5992,16 +6154,119 @@ static s32 scx_sub_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock_and_disable;
}
- /* TODO - perform actual enabling here */
+ /*
+ * Initialize tasks for the new child $sch without exiting them for
+ * $parent so that the tasks can always be reverted back to $parent
+ * sched on child init failure.
+ */
+ WARN_ON_ONCE(scx_enabling_sub_sched);
+ scx_enabling_sub_sched = sch;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ /*
+ * Task iteration may visit the same task twice when racing
+ * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which
+ * finished __scx_init_task() and skip if set.
+ *
+ * A task may exit and get freed between __scx_init_task()
+ * completion and scx_enable_task(). In such cases,
+ * scx_disable_and_exit_task() must exit the task for both the
+ * parent and child scheds.
+ */
+ if (p->scx.flags & SCX_TASK_SUB_INIT)
+ continue;
+
+ /* see scx_root_enable() */
+ if (!tryget_task_struct(p))
+ continue;
+
+ if (!assert_task_ready_or_enabled(p)) {
+ ret = -EINVAL;
+ goto abort;
+ }
+
+ scx_task_iter_unlock(&sti);
+
+ /*
+ * As $p is still on $parent, it can't be transitioned to INIT.
+ * Let's worry about task state later. Use __scx_init_task().
+ */
+ ret = __scx_init_task(sch, p, false);
+ if (ret)
+ goto abort;
+
+ rq = task_rq_lock(p, &rf);
+ p->scx.flags |= SCX_TASK_SUB_INIT;
+ task_rq_unlock(rq, p, &rf);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+
+ /*
+ * All tasks are prepped. Disable/exit tasks for $parent and enable for
+ * the new @sch.
+ */
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * Use clearing of %SCX_TASK_SUB_INIT to detect and skip
+ * duplicate iterations.
+ */
+ if (!(p->scx.flags & SCX_TASK_SUB_INIT))
+ continue;
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /*
+ * $p must be either READY or ENABLED. If ENABLED,
+ * __scx_disable_and_exit_task() first disables and
+ * makes it READY. However, after exiting $p, it will
+ * leave $p as READY.
+ */
+ assert_task_ready_or_enabled(p);
+ __scx_disable_and_exit_task(parent, p);
+
+ /*
+ * $p is now only initialized for @sch and READY, which
+ * is what we want. Assign it to @sch and enable.
+ */
+ rcu_assign_pointer(p->scx.sched, sch);
+ scx_enable_task(sch, p);
+
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
+
+ scx_enabling_sub_sched = NULL;
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
+ scx_bypass(sch, false);
+
pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
kobject_uevent(&sch->kobj, KOBJ_ADD);
ret = 0;
goto out_unlock;
+abort:
+ put_task_struct(p);
+ scx_task_iter_stop(&sti);
+ scx_enabling_sub_sched = NULL;
+
+ scx_task_iter_start(&sti, sch->cgrp);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ if (p->scx.flags & SCX_TASK_SUB_INIT) {
+ __scx_disable_and_exit_task(sch, p);
+ p->scx.flags &= ~SCX_TASK_SUB_INIT;
+ }
+ }
+ scx_task_iter_stop(&sti);
out_put_cgrp:
cgroup_put(cgrp);
out_unlock:
@@ -6009,6 +6274,7 @@ static s32 scx_sub_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return ret;
err_unlock_and_disable:
+ /* we'll soon enter disable path, keep bypass on */
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
err_disable:
--
2.53.0
next prev parent reply other threads:[~2026-02-25 5:02 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-25 5:01 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25 5:01 ` [PATCH 01/34] sched_ext: Implement cgroup subtree iteration for scx_task_iter Tejun Heo
2026-02-25 5:01 ` [PATCH 02/34] sched_ext: Add @kargs to scx_fork() Tejun Heo
2026-02-25 5:01 ` [PATCH 03/34] sched/core: Swap the order between sched_post_fork() and cgroup_post_fork() Tejun Heo
2026-02-25 5:01 ` [PATCH 04/34] cgroup: Expose some cgroup helpers Tejun Heo
2026-02-25 5:01 ` [PATCH 05/34] sched_ext: Update p->scx.disallow warning in scx_init_task() Tejun Heo
2026-02-25 5:01 ` [PATCH 06/34] sched_ext: Reorganize enable/disable path for multi-scheduler support Tejun Heo
2026-02-25 5:01 ` [PATCH 07/34] sched_ext: Introduce cgroup sub-sched support Tejun Heo
2026-02-25 5:01 ` [PATCH 08/34] sched_ext: Introduce scx_task_sched[_rcu]() Tejun Heo
2026-02-25 5:01 ` [PATCH 09/34] sched_ext: Introduce scx_prog_sched() Tejun Heo
2026-02-25 5:01 ` [PATCH 10/34] sched_ext: Enforce scheduling authority in dispatch and select_cpu operations Tejun Heo
2026-02-25 5:01 ` [PATCH 11/34] sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime Tejun Heo
2026-02-25 5:01 ` [PATCH 12/34] sched_ext: scx_dsq_move() should validate the task belongs to the right scheduler Tejun Heo
2026-02-25 5:01 ` [PATCH 13/34] sched_ext: Refactor task init/exit helpers Tejun Heo
2026-02-25 5:01 ` [PATCH 14/34] sched_ext: Make scx_prio_less() handle multiple schedulers Tejun Heo
2026-02-25 5:01 ` [PATCH 15/34] sched_ext: Move default slice to per-scheduler field Tejun Heo
2026-02-25 5:01 ` [PATCH 16/34] sched_ext: Move aborting flag " Tejun Heo
2026-02-25 5:01 ` [PATCH 17/34] sched_ext: Move bypass_dsq into scx_sched_pcpu Tejun Heo
2026-02-25 5:01 ` [PATCH 18/34] sched_ext: Move bypass state into scx_sched Tejun Heo
2026-02-25 5:01 ` [PATCH 19/34] sched_ext: Prepare bypass mode for hierarchical operation Tejun Heo
2026-02-25 5:01 ` [PATCH 20/34] sched_ext: Factor out scx_dispatch_sched() Tejun Heo
2026-02-25 5:01 ` [PATCH 21/34] sched_ext: When calling ops.dispatch() @prev must be on the same scx_sched Tejun Heo
2026-02-25 5:01 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
2026-02-25 5:01 ` [PATCH 23/34] sched_ext: Implement hierarchical bypass mode Tejun Heo
2026-02-25 5:01 ` [PATCH 24/34] sched_ext: Dispatch from all scx_sched instances Tejun Heo
2026-02-25 5:01 ` [PATCH 25/34] sched_ext: Move scx_dsp_ctx and scx_dsp_max_batch into scx_sched Tejun Heo
2026-02-25 5:01 ` [PATCH 26/34] sched_ext: Make watchdog sub-sched aware Tejun Heo
2026-02-25 5:01 ` [PATCH 27/34] sched_ext: Convert scx_dump_state() spinlock to raw spinlock Tejun Heo
2026-02-25 5:01 ` [PATCH 28/34] sched_ext: Support dumping multiple schedulers and add scheduler identification Tejun Heo
2026-02-25 5:01 ` Tejun Heo [this message]
2026-02-25 5:01 ` [PATCH 30/34] sched_ext: Add scx_sched back pointer to scx_sched_pcpu Tejun Heo
2026-02-25 5:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo
2026-02-25 5:01 ` [PATCH 32/34] sched_ext: Factor out scx_link_sched() and scx_unlink_sched() Tejun Heo
2026-02-25 5:01 ` [PATCH 33/34] sched_ext: Add rhashtable lookup for sub-schedulers Tejun Heo
2026-02-25 5:01 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
2026-02-25 5:18 ` [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
-- strict thread matches above, loose matches on Subject: below --
2026-03-04 22:00 [PATCHSET v3 " Tejun Heo
2026-03-04 22:01 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
2026-03-06 9:41 ` Cheng-Yang Chou
2026-02-25 5:00 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25 5:01 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
2026-01-21 23:11 [PATCHSET v1 sched_ext/for-6.20] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-01-21 23:11 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260225050152.1070601-30-tj@kernel.org \
--to=tj@kernel.org \
--cc=arighi@nvidia.com \
--cc=cgroups@vger.kernel.org \
--cc=changwoo@igalia.com \
--cc=emil@etsalapatis.com \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mkoutny@suse.com \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox