public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com,
	emil@etsalapatis.com, hannes@cmpxchg.org, mkoutny@suse.com,
	cgroups@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching
Date: Tue, 24 Feb 2026 19:01:09 -1000	[thread overview]
Message-ID: <20260225050109.1070059-35-tj@kernel.org> (raw)
In-Reply-To: <20260225050109.1070059-1-tj@kernel.org>

This is an early-stage partial implementation that demonstrates the core
building blocks for nested sub-scheduler dispatching. While significant
work remains in the enqueue path and other areas, this patch establishes
the fundamental mechanisms needed for hierarchical scheduler operation.

The key building blocks introduced include:

- Private stack support for ops.dispatch() to prevent stack overflow when
  walking down nested schedulers during dispatch operations

- scx_bpf_sub_dispatch() kfunc that allows parent schedulers to trigger
  dispatch operations on their direct child schedulers

- Proper parent-child relationship validation to ensure dispatch requests
  are only made to legitimate child schedulers

- Updated scx_dispatch_sched() to handle both nested and non-nested
  invocations with appropriate kf_mask handling

The qmap scheduler is updated to demonstrate the functionality by calling
scx_bpf_sub_dispatch() on registered child schedulers when it has no
tasks in its own queues.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c                       | 120 ++++++++++++++++++++---
 kernel/sched/sched.h                     |   3 +
 tools/sched_ext/include/scx/common.bpf.h |   1 +
 tools/sched_ext/scx_qmap.bpf.c           |  37 ++++++-
 4 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index eccb67a78e90..7255eab3acc3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2438,8 +2438,14 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
 	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
 }
 
-static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
-			       struct task_struct *prev)
+/*
+ * One user of this function is scx_bpf_sub_dispatch() which can be called
+ * recursively as sub-sched dispatches nest. Always inline to reduce stack
+ * usage from the call frame.
+ */
+static __always_inline bool
+scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+		   struct task_struct *prev, bool nested)
 {
 	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
@@ -2491,8 +2497,23 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 	do {
 		dspc->nr_tasks = 0;
 
-		SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
-			    prev_on_sch ? prev : NULL);
+		if (nested) {
+			/*
+			 * If nested, don't update kf_mask as the originating
+			 * invocation would already have set it up.
+			 */
+			SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
+				    prev_on_sch ? prev : NULL);
+		} else {
+			/*
+			 * If not nested, stash @prev so that nested invocations
+			 * can access it.
+			 */
+			rq->scx.sub_dispatch_prev = prev;
+			SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
+				    prev_on_sch ? prev : NULL);
+			rq->scx.sub_dispatch_prev = NULL;
+		}
 
 		flush_dispatch_buf(sch, rq);
 
@@ -2533,7 +2554,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
-	struct scx_sched *sch = scx_root, *pos;
+	struct scx_sched *sch = scx_root;
 	s32 cpu = cpu_of(rq);
 
 	lockdep_assert_rq_held(rq);
@@ -2577,13 +2598,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (rq->scx.local_dsq.nr)
 		goto has_tasks;
 
-	/*
-	 * TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
-	 * hierarchical operation.
-	 */
-	list_for_each_entry_rcu(pos, &scx_sched_all, all)
-		if (scx_dispatch_sched(pos, rq, prev))
-			goto has_tasks;
+	if (scx_dispatch_sched(sch, rq, prev, false))
+		goto has_tasks;
 
 	/*
 	 * Didn't find another task to run. Keep running @prev unless
@@ -4918,9 +4934,8 @@ static void scx_sub_disable(struct scx_sched *sch)
 
 	/*
 	 * Guarantee forward progress and wait for descendants to be disabled.
-	 * To limit
-	 * disruptions, $parent is not bypassed. Tasks are fully prepped and
-	 * then inserted back into $parent.
+	 * To limit disruptions, $parent is not bypassed. Tasks are fully
+	 * prepped and then inserted back into $parent.
 	 */
 	scx_bypass(sch, true);
 	drain_descendants(sch);
@@ -6500,6 +6515,20 @@ static int bpf_scx_init_member(const struct btf_type *t,
 	return 0;
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+	sch = scx_prog_sched(prog->aux);
+	if (unlikely(!sch))
+		return;
+
+	scx_error(sch, "dispatch recursion detected");
+}
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 static int bpf_scx_check_member(const struct btf_type *t,
 				const struct btf_member *member,
 				const struct bpf_prog *prog)
@@ -6525,6 +6554,22 @@ static int bpf_scx_check_member(const struct btf_type *t,
 			return -EINVAL;
 	}
 
+#ifdef CONFIG_EXT_SUB_SCHED
+	/*
+	 * Enable private stack for operations that can nest along the
+	 * hierarchy.
+	 *
+	 * XXX - Ideally, we should only do this for scheds that allow
+	 * sub-scheds and sub-scheds themselves but I don't know how to access
+	 * struct_ops from here.
+	 */
+	switch (moff) {
+	case offsetof(struct sched_ext_ops, dispatch):
+		prog->aux->priv_stack_requested = true;
+		prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
+	}
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 	return 0;
 }
 
@@ -7509,6 +7554,48 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
 			    p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
+ * @cgroup_id: cgroup ID of the child scheduler to dispatch
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Allows a parent scheduler to trigger dispatching on one of its direct
+ * child schedulers. The child scheduler runs its dispatch operation to
+ * move tasks from dispatch queues to the local runqueue.
+ *
+ * Returns: true on success, false if @cgroup_id is invalid, not a direct
+ * child, or caller lacks dispatch permission.
+ */
+__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
+{
+	struct rq *this_rq = this_rq();
+	struct scx_sched *parent, *child;
+
+	guard(rcu)();
+	parent = scx_prog_sched(aux);
+	if (unlikely(!parent))
+		return false;
+
+	if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
+		return false;
+
+	child = scx_find_sub_sched(cgroup_id);
+
+	if (unlikely(!child))
+		return false;
+
+	if (unlikely(scx_parent(child) != parent)) {
+		scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
+			  cgroup_id);
+		return false;
+	}
+
+	return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
+				  true);
+}
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
@@ -7519,6 +7606,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+#ifdef CONFIG_EXT_SUB_SCHED
+BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS)
+#endif
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6934dbd1f96e..7324e4a8c99b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,6 +805,9 @@ struct scx_rq {
 	cpumask_var_t		cpus_to_preempt;
 	cpumask_var_t		cpus_to_wait;
 	unsigned long		kick_sync;
+
+	struct task_struct	*sub_dispatch_prev;
+
 	struct llist_head	deferred_reenq_locals;
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 821d5791bd42..eba4d87345e0 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -101,6 +101,7 @@ struct rq *scx_bpf_locked_rq(void) __ksym;
 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
+bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;
 
 /*
  * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index ff6ff34177ab..91b8eac83f52 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -48,6 +48,9 @@ const volatile bool suppress_dump;
 u64 nr_highpri_queued;
 u32 test_error_cnt;
 
+#define MAX_SUB_SCHEDS		8
+u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+
 UEI_DEFINE(uei);
 
 struct qmap {
@@ -451,6 +454,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 		cpuc->dsp_cnt = 0;
 	}
 
+	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+		if (sub_sched_cgroup_ids[i] &&
+		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+			return;
+	}
+
 	/*
 	 * No other tasks. @prev will keep running. Update its core_sched_seq as
 	 * if the task were enqueued and dispatched immediately.
@@ -895,7 +904,32 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 
 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
 {
-	return 0;
+	s32 i;
+
+	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+		if (!sub_sched_cgroup_ids[i]) {
+			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+			bpf_printk("attaching sub-sched[%d] on %s",
+				   i, args->cgroup_path);
+			return 0;
+		}
+	}
+
+	return -ENOSPC;
+}
+
+void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
+{
+	s32 i;
+
+	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+			sub_sched_cgroup_ids[i] = 0;
+			bpf_printk("detaching sub-sched[%d] on %s",
+				   i, args->cgroup_path);
+			break;
+		}
+	}
 }
 
 SCX_OPS_DEFINE(qmap_ops,
@@ -914,6 +948,7 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
 	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
 	       .sub_attach		= (void *)qmap_sub_attach,
+	       .sub_detach		= (void *)qmap_sub_detach,
 	       .cpu_online		= (void *)qmap_cpu_online,
 	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,
-- 
2.53.0


  parent reply	other threads:[~2026-02-25  5:01 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-25  5:00 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25  5:00 ` [PATCH 01/34] sched_ext: Implement cgroup subtree iteration for scx_task_iter Tejun Heo
2026-02-25  5:00 ` [PATCH 02/34] sched_ext: Add @kargs to scx_fork() Tejun Heo
2026-02-25  5:00 ` [PATCH 03/34] sched/core: Swap the order between sched_post_fork() and cgroup_post_fork() Tejun Heo
2026-02-25  5:00 ` [PATCH 04/34] cgroup: Expose some cgroup helpers Tejun Heo
2026-02-25  5:00 ` [PATCH 05/34] sched_ext: Update p->scx.disallow warning in scx_init_task() Tejun Heo
2026-02-25  5:00 ` [PATCH 06/34] sched_ext: Reorganize enable/disable path for multi-scheduler support Tejun Heo
2026-02-25  5:00 ` [PATCH 07/34] sched_ext: Introduce cgroup sub-sched support Tejun Heo
2026-02-26 15:37   ` Andrea Righi
2026-02-27 20:14     ` Tejun Heo
2026-02-26 15:52   ` Andrea Righi
2026-02-27 19:51     ` Tejun Heo
2026-02-27 20:04       ` Tejun Heo
2026-02-25  5:00 ` [PATCH 08/34] sched_ext: Introduce scx_task_sched[_rcu]() Tejun Heo
2026-02-25  5:00 ` [PATCH 09/34] sched_ext: Introduce scx_prog_sched() Tejun Heo
2026-02-25  5:00 ` [PATCH 10/34] sched_ext: Enforce scheduling authority in dispatch and select_cpu operations Tejun Heo
2026-02-25  5:00 ` [PATCH 11/34] sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime Tejun Heo
2026-02-26 15:13   ` Andrea Righi
2026-02-27 22:25     ` Tejun Heo
2026-02-27 23:50       ` Andrea Righi
2026-02-25  5:00 ` [PATCH 12/34] sched_ext: scx_dsq_move() should validate the task belongs to the right scheduler Tejun Heo
2026-02-25  5:00 ` [PATCH 13/34] sched_ext: Refactor task init/exit helpers Tejun Heo
2026-02-27  6:55   ` Andrea Righi
2026-02-27 19:50     ` Tejun Heo
2026-02-25  5:00 ` [PATCH 14/34] sched_ext: Make scx_prio_less() handle multiple schedulers Tejun Heo
2026-02-25  5:00 ` [PATCH 15/34] sched_ext: Move default slice to per-scheduler field Tejun Heo
2026-02-25  5:00 ` [PATCH 16/34] sched_ext: Move aborting flag " Tejun Heo
2026-02-25  5:00 ` [PATCH 17/34] sched_ext: Move bypass_dsq into scx_sched_pcpu Tejun Heo
2026-02-25  5:00 ` [PATCH 18/34] sched_ext: Move bypass state into scx_sched Tejun Heo
2026-02-25  5:00 ` [PATCH 19/34] sched_ext: Prepare bypass mode for hierarchical operation Tejun Heo
2026-02-25  5:00 ` [PATCH 20/34] sched_ext: Factor out scx_dispatch_sched() Tejun Heo
2026-02-25  5:00 ` [PATCH 21/34] sched_ext: When calling ops.dispatch() @prev must be on the same scx_sched Tejun Heo
2026-02-25  5:00 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
2026-02-25  5:00 ` [PATCH 23/34] sched_ext: Implement hierarchical bypass mode Tejun Heo
2026-02-25  5:00 ` [PATCH 24/34] sched_ext: Dispatch from all scx_sched instances Tejun Heo
2026-02-25  5:01 ` [PATCH 25/34] sched_ext: Move scx_dsp_ctx and scx_dsp_max_batch into scx_sched Tejun Heo
2026-02-25  5:01 ` [PATCH 26/34] sched_ext: Make watchdog sub-sched aware Tejun Heo
2026-02-25  5:01 ` [PATCH 27/34] sched_ext: Convert scx_dump_state() spinlock to raw spinlock Tejun Heo
2026-02-25  5:01 ` [PATCH 28/34] sched_ext: Support dumping multiple schedulers and add scheduler identification Tejun Heo
2026-02-25  5:01 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
2026-02-25  5:01 ` [PATCH 30/34] sched_ext: Add scx_sched back pointer to scx_sched_pcpu Tejun Heo
2026-02-25  5:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo
2026-02-25  5:01 ` [PATCH 32/34] sched_ext: Factor out scx_link_sched() and scx_unlink_sched() Tejun Heo
2026-02-25  5:01 ` [PATCH 33/34] sched_ext: Add rhashtable lookup for sub-schedulers Tejun Heo
2026-02-25  5:01 ` Tejun Heo [this message]
2026-02-25  5:14 ` [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2026-03-04 22:00 [PATCHSET v3 " Tejun Heo
2026-03-04 22:01 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
2026-02-25  5:01 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25  5:01 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
2026-01-21 23:11 [PATCHSET v1 sched_ext/for-6.20] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-01-21 23:11 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260225050109.1070059-35-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=arighi@nvidia.com \
    --cc=cgroups@vger.kernel.org \
    --cc=changwoo@igalia.com \
    --cc=emil@etsalapatis.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mkoutny@suse.com \
    --cc=sched-ext@lists.linux.dev \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox