From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com,
emil@etsalapatis.com, hannes@cmpxchg.org, mkoutny@suse.com,
cgroups@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching
Date: Tue, 24 Feb 2026 19:01:52 -1000 [thread overview]
Message-ID: <20260225050152.1070601-35-tj@kernel.org> (raw)
In-Reply-To: <20260225050152.1070601-1-tj@kernel.org>
This is an early-stage partial implementation that demonstrates the core
building blocks for nested sub-scheduler dispatching. While significant
work remains in the enqueue path and other areas, this patch establishes
the fundamental mechanisms needed for hierarchical scheduler operation.
The key building blocks introduced include:
- Private stack support for ops.dispatch() to prevent stack overflow when
walking down nested schedulers during dispatch operations
- scx_bpf_sub_dispatch() kfunc that allows parent schedulers to trigger
dispatch operations on their direct child schedulers
- Proper parent-child relationship validation to ensure dispatch requests
are only made to legitimate child schedulers
- Updated scx_dispatch_sched() to handle both nested and non-nested
invocations with appropriate kf_mask handling
The qmap scheduler is updated to demonstrate the functionality by calling
scx_bpf_sub_dispatch() on registered child schedulers when it has no
tasks in its own queues.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 120 ++++++++++++++++++++---
kernel/sched/sched.h | 3 +
tools/sched_ext/include/scx/common.bpf.h | 1 +
tools/sched_ext/scx_qmap.bpf.c | 37 ++++++-
4 files changed, 145 insertions(+), 16 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index eccb67a78e90..7255eab3acc3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2438,8 +2438,14 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
}
-static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
- struct task_struct *prev)
+/*
+ * One user of this function is scx_bpf_dispatch() which can be called
+ * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
+ * from the call frame.
+ */
+static __always_inline bool
+scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *prev, bool nested)
{
struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
int nr_loops = SCX_DSP_MAX_LOOPS;
@@ -2491,8 +2497,23 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
- prev_on_sch ? prev : NULL);
+ if (nested) {
+ /*
+ * If nested, don't update kf_mask as the originating
+ * invocation would already have set it up.
+ */
+ SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
+ prev_on_sch ? prev : NULL);
+ } else {
+ /*
+ * If not nested, stash @prev so that nested invocations
+ * can access it.
+ */
+ rq->scx.sub_dispatch_prev = prev;
+ SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
+ prev_on_sch ? prev : NULL);
+ rq->scx.sub_dispatch_prev = NULL;
+ }
flush_dispatch_buf(sch, rq);
@@ -2533,7 +2554,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
static int balance_one(struct rq *rq, struct task_struct *prev)
{
- struct scx_sched *sch = scx_root, *pos;
+ struct scx_sched *sch = scx_root;
s32 cpu = cpu_of(rq);
lockdep_assert_rq_held(rq);
@@ -2577,13 +2598,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- /*
- * TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
- * hierarchical operation.
- */
- list_for_each_entry_rcu(pos, &scx_sched_all, all)
- if (scx_dispatch_sched(pos, rq, prev))
- goto has_tasks;
+ if (scx_dispatch_sched(sch, rq, prev, false))
+ goto has_tasks;
/*
* Didn't find another task to run. Keep running @prev unless
@@ -4918,9 +4934,8 @@ static void scx_sub_disable(struct scx_sched *sch)
/*
* Guarantee forward progress and wait for descendants to be disabled.
- * To limit
- * disruptions, $parent is not bypassed. Tasks are fully prepped and
- * then inserted back into $parent.
+ * To limit disruptions, $parent is not bypassed. Tasks are fully
+ * prepped and then inserted back into $parent.
*/
scx_bypass(sch, true);
drain_descendants(sch);
@@ -6500,6 +6515,20 @@ static int bpf_scx_init_member(const struct btf_type *t,
return 0;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = scx_prog_sched(prog->aux);
+ if (unlikely(!sch))
+ return;
+
+ scx_error(sch, "dispatch recursion detected");
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
static int bpf_scx_check_member(const struct btf_type *t,
const struct btf_member *member,
const struct bpf_prog *prog)
@@ -6525,6 +6554,22 @@ static int bpf_scx_check_member(const struct btf_type *t,
return -EINVAL;
}
+#ifdef CONFIG_EXT_SUB_SCHED
+ /*
+ * Enable private stack for operations that can nest along the
+ * hierarchy.
+ *
+ * XXX - Ideally, we should only do this for scheds that allow
+ * sub-scheds and sub-scheds themselves but I don't know how to access
+ * struct_ops from here.
+ */
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, dispatch):
+ prog->aux->priv_stack_requested = true;
+ prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
+ }
+#endif /* CONFIG_EXT_SUB_SCHED */
+
return 0;
}
@@ -7509,6 +7554,48 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
+ * @cgroup_id: cgroup ID of the child scheduler to dispatch
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Allows a parent scheduler to trigger dispatching on one of its direct
+ * child schedulers. The child scheduler runs its dispatch operation to
+ * move tasks from dispatch queues to the local runqueue.
+ *
+ * Returns: true on success, false if cgroup_id is invalid, not a direct
+ * child, or caller lacks dispatch permission.
+ */
+__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_sched *parent, *child;
+
+ guard(rcu)();
+ parent = scx_prog_sched(aux);
+ if (unlikely(!parent))
+ return false;
+
+ if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
+ return false;
+
+ child = scx_find_sub_sched(cgroup_id);
+
+ if (unlikely(!child))
+ return false;
+
+ if (unlikely(scx_parent(child) != parent)) {
+ scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
+ cgroup_id);
+ return false;
+ }
+
+ return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
+ true);
+}
+#endif /* CONFIG_EXT_SUB_SCHED */
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
@@ -7519,6 +7606,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+#ifdef CONFIG_EXT_SUB_SCHED
+BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS)
+#endif
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6934dbd1f96e..7324e4a8c99b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,6 +805,9 @@ struct scx_rq {
cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait;
unsigned long kick_sync;
+
+ struct task_struct *sub_dispatch_prev;
+
struct llist_head deferred_reenq_locals;
struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 821d5791bd42..eba4d87345e0 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -101,6 +101,7 @@ struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
+bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;
/*
* Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index ff6ff34177ab..91b8eac83f52 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -48,6 +48,9 @@ const volatile bool suppress_dump;
u64 nr_highpri_queued;
u32 test_error_cnt;
+#define MAX_SUB_SCHEDS 8
+u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+
UEI_DEFINE(uei);
struct qmap {
@@ -451,6 +454,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 0;
}
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+ if (sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ return;
+ }
+
/*
* No other tasks. @prev will keep running. Update its core_sched_seq as
* if the task were enqueued and dispatched immediately.
@@ -895,7 +904,32 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
{
- return 0;
+ s32 i;
+
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+ if (!sub_sched_cgroup_ids[i]) {
+ sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+ bpf_printk("attaching sub-sched[%d] on %s",
+ i, args->cgroup_path);
+ return 0;
+ }
+ }
+
+ return -ENOSPC;
+}
+
+void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
+{
+ s32 i;
+
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+ if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ sub_sched_cgroup_ids[i] = 0;
+ bpf_printk("detaching sub-sched[%d] on %s",
+ i, args->cgroup_path);
+ break;
+ }
+ }
}
SCX_OPS_DEFINE(qmap_ops,
@@ -914,6 +948,7 @@ SCX_OPS_DEFINE(qmap_ops,
.cgroup_set_weight = (void *)qmap_cgroup_set_weight,
.cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
.sub_attach = (void *)qmap_sub_attach,
+ .sub_detach = (void *)qmap_sub_detach,
.cpu_online = (void *)qmap_cpu_online,
.cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,
--
2.53.0
next prev parent reply other threads:[~2026-02-25 5:02 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-25 5:01 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25 5:01 ` [PATCH 01/34] sched_ext: Implement cgroup subtree iteration for scx_task_iter Tejun Heo
2026-02-25 5:01 ` [PATCH 02/34] sched_ext: Add @kargs to scx_fork() Tejun Heo
2026-02-25 5:01 ` [PATCH 03/34] sched/core: Swap the order between sched_post_fork() and cgroup_post_fork() Tejun Heo
2026-02-25 5:01 ` [PATCH 04/34] cgroup: Expose some cgroup helpers Tejun Heo
2026-02-25 5:01 ` [PATCH 05/34] sched_ext: Update p->scx.disallow warning in scx_init_task() Tejun Heo
2026-02-25 5:01 ` [PATCH 06/34] sched_ext: Reorganize enable/disable path for multi-scheduler support Tejun Heo
2026-02-25 5:01 ` [PATCH 07/34] sched_ext: Introduce cgroup sub-sched support Tejun Heo
2026-02-25 5:01 ` [PATCH 08/34] sched_ext: Introduce scx_task_sched[_rcu]() Tejun Heo
2026-02-25 5:01 ` [PATCH 09/34] sched_ext: Introduce scx_prog_sched() Tejun Heo
2026-02-25 5:01 ` [PATCH 10/34] sched_ext: Enforce scheduling authority in dispatch and select_cpu operations Tejun Heo
2026-02-25 5:01 ` [PATCH 11/34] sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime Tejun Heo
2026-02-25 5:01 ` [PATCH 12/34] sched_ext: scx_dsq_move() should validate the task belongs to the right scheduler Tejun Heo
2026-02-25 5:01 ` [PATCH 13/34] sched_ext: Refactor task init/exit helpers Tejun Heo
2026-02-25 5:01 ` [PATCH 14/34] sched_ext: Make scx_prio_less() handle multiple schedulers Tejun Heo
2026-02-25 5:01 ` [PATCH 15/34] sched_ext: Move default slice to per-scheduler field Tejun Heo
2026-02-25 5:01 ` [PATCH 16/34] sched_ext: Move aborting flag " Tejun Heo
2026-02-25 5:01 ` [PATCH 17/34] sched_ext: Move bypass_dsq into scx_sched_pcpu Tejun Heo
2026-02-25 5:01 ` [PATCH 18/34] sched_ext: Move bypass state into scx_sched Tejun Heo
2026-02-25 5:01 ` [PATCH 19/34] sched_ext: Prepare bypass mode for hierarchical operation Tejun Heo
2026-02-25 5:01 ` [PATCH 20/34] sched_ext: Factor out scx_dispatch_sched() Tejun Heo
2026-02-25 5:01 ` [PATCH 21/34] sched_ext: When calling ops.dispatch() @prev must be on the same scx_sched Tejun Heo
2026-02-25 5:01 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
2026-02-25 5:01 ` [PATCH 23/34] sched_ext: Implement hierarchical bypass mode Tejun Heo
2026-02-25 5:01 ` [PATCH 24/34] sched_ext: Dispatch from all scx_sched instances Tejun Heo
2026-02-25 5:01 ` [PATCH 25/34] sched_ext: Move scx_dsp_ctx and scx_dsp_max_batch into scx_sched Tejun Heo
2026-02-25 5:01 ` [PATCH 26/34] sched_ext: Make watchdog sub-sched aware Tejun Heo
2026-02-25 5:01 ` [PATCH 27/34] sched_ext: Convert scx_dump_state() spinlock to raw spinlock Tejun Heo
2026-02-25 5:01 ` [PATCH 28/34] sched_ext: Support dumping multiple schedulers and add scheduler identification Tejun Heo
2026-02-25 5:01 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
2026-02-25 5:01 ` [PATCH 30/34] sched_ext: Add scx_sched back pointer to scx_sched_pcpu Tejun Heo
2026-02-25 5:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo
2026-02-25 5:01 ` [PATCH 32/34] sched_ext: Factor out scx_link_sched() and scx_unlink_sched() Tejun Heo
2026-02-25 5:01 ` [PATCH 33/34] sched_ext: Add rhashtable lookup for sub-schedulers Tejun Heo
2026-02-25 5:01 ` Tejun Heo [this message]
2026-02-25 5:18 ` [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
-- strict thread matches above, loose matches on Subject: below --
2026-03-04 22:00 [PATCHSET v3 " Tejun Heo
2026-03-04 22:01 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
2026-02-25 5:00 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25 5:01 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
2026-01-21 23:11 [PATCHSET v1 sched_ext/for-6.20] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-01-21 23:11 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260225050152.1070601-35-tj@kernel.org \
--to=tj@kernel.org \
--cc=arighi@nvidia.com \
--cc=cgroups@vger.kernel.org \
--cc=changwoo@igalia.com \
--cc=emil@etsalapatis.com \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mkoutny@suse.com \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.