[PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, andrea.righi@linux.dev, changwoo@igalia.com,
	emil@etsalapatis.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware
Date: Wed, 21 Jan 2026 13:11:37 -1000	[thread overview]
Message-ID: <20260121231140.832332-32-tj@kernel.org> (raw)
In-Reply-To: <20260121231140.832332-1-tj@kernel.org>

scx_bpf_reenqueue_local() currently re-enqueues all tasks on the local DSQ
regardless of which sub-scheduler owns them. With multiple sub-schedulers,
each should only re-enqueue tasks owned by itself or by its descendants.

Replace the per-rq boolean flag with a lock-free linked list to track
per-scheduler reenqueue requests. Filter tasks in reenq_local() using
hierarchical ownership checks, and refuse new deferrals while a scheduler is
bypassing so that no reenqueue can be queued against a dead scheduler.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c          | 73 ++++++++++++++++++++++++++++++-------
 kernel/sched/ext_internal.h |  1 +
 kernel/sched/sched.h        |  2 +-
 3 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d21a3f805704..469fcbe4611c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -182,7 +182,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
 #include <trace/events/sched_ext.h>
 
 static void process_ddsp_deferred_locals(struct rq *rq);
-static u32 reenq_local(struct rq *rq);
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -987,9 +987,16 @@ static void run_deferred(struct rq *rq)
 {
 	process_ddsp_deferred_locals(rq);
 
-	if (local_read(&rq->scx.reenq_local_deferred)) {
-		local_set(&rq->scx.reenq_local_deferred, 0);
-		reenq_local(rq);
+	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
+		struct llist_node *llist =
+			llist_del_all(&rq->scx.deferred_reenq_locals);
+		struct scx_sched_pcpu *pos, *next;
+
+		llist_for_each_entry_safe(pos, next, llist,
+					  deferred_reenq_locals_node) {
+			init_llist_node(&pos->deferred_reenq_locals_node);
+			reenq_local(pos->sch, rq);
+		}
 	}
 }
 
@@ -3884,7 +3891,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
 	struct rhashtable_iter rht_iter;
 	struct scx_dispatch_q *dsq;
-	int node;
+	int cpu, node;
 
 	irq_work_sync(&sch->error_irq_work);
 	kthread_destroy_worker(sch->helper);
@@ -3896,6 +3903,17 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 		cgroup_put(sch->cgrp);
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
+	/*
+	 * $sch would have entered bypass mode before the RCU grace period. As
+	 * that blocks new deferrals, all deferred_reenq_locals_node's must be
+	 * off-list by now.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+		WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+	}
+
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -5443,8 +5461,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	for_each_possible_cpu(cpu)
 		init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(sch->pcpu, cpu)->sch = sch;
+	for_each_possible_cpu(cpu) {
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+		pcpu->sch = sch;
+		init_llist_node(&pcpu->deferred_reenq_locals_node);
+	}
 
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
 	if (IS_ERR(sch->helper)) {
@@ -6695,6 +6717,7 @@ void __init init_sched_ext_class(void)
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+		init_llist_head(&rq->scx.deferred_reenq_locals);
 		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
 		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
@@ -7266,7 +7289,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
 	.set			= &scx_kfunc_ids_dispatch,
 };
 
-static u32 reenq_local(struct rq *rq)
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
 {
 	LIST_HEAD(tasks);
 	u32 nr_enqueued = 0;
@@ -7281,6 +7304,8 @@ static u32 reenq_local(struct rq *rq)
 	 */
 	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
 				 scx.dsq_list.node) {
+		struct scx_sched *task_sch = scx_task_sched(p);
+
 		/*
 		 * If @p is being migrated, @p's current CPU may not agree with
 		 * its allowed CPUs and the migration_cpu_stop is about to
@@ -7295,6 +7320,9 @@ static u32 reenq_local(struct rq *rq)
 		if (p->migration_pending)
 			continue;
 
+		if (!scx_is_descendant(task_sch, sch))
+			continue;
+
 		dispatch_dequeue(rq, p);
 		list_add_tail(&p->scx.dsq_list.node, &tasks);
 	}
@@ -7337,7 +7365,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
 
-	return reenq_local(rq);
+	return reenq_local(sch, rq);
 }
 
 __bpf_kfunc_end_defs();
@@ -7908,20 +7936,39 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
 
 /**
  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Iterate over all of the tasks currently enqueued on the local DSQ of the
  * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
  * anywhere.
  */
-__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
 {
+	unsigned long flags;
+	struct scx_sched *sch;
 	struct rq *rq;
+	struct llist_node *lnode;
 
-	guard(preempt)();
+	raw_local_irq_save(flags);
+
+	sch = scx_prog_sched(aux);
+	if (unlikely(!sch))
+		goto out_irq_restore;
+
+	/*
+	 * Allowing reenqueue-locals doesn't make sense while bypassing. This
+	 * also blocks new reenqueues from being scheduled on dead scheds.
+	 */
+	if (unlikely(sch->bypass_depth))
+		goto out_irq_restore;
 
 	rq = this_rq();
-	local_set(&rq->scx.reenq_local_deferred, 1);
+	lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
+	if (!llist_on_list(lnode))
+		llist_add(lnode, &rq->scx.deferred_reenq_locals);
 	schedule_deferred(rq);
+out_irq_restore:
+	raw_local_irq_restore(flags);
 }
 
 /**
@@ -8346,7 +8393,7 @@ BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index cbe10672f6a4..688be11ab9eb 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -965,6 +965,7 @@ struct scx_sched_pcpu {
 	 */
 	struct scx_event_stats	event_stats;
 
+	struct llist_node	deferred_reenq_locals_node;
 	struct scx_dispatch_q	bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
 	u32			bypass_host_seq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 07aaa09df7ed..f5bfe1029ee3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -803,7 +803,7 @@ struct scx_rq {
 	cpumask_var_t		cpus_to_preempt;
 	cpumask_var_t		cpus_to_wait;
 	unsigned long		kick_sync;
-	local_t			reenq_local_deferred;
+	struct llist_head	deferred_reenq_locals;
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
-- 
2.52.0

next prev parent reply	other threads:[~2026-01-21 23:12 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-21 23:11 [PATCHSET v1 sched_ext/for-6.20] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-01-21 23:11 ` [PATCH 01/34] sched_ext: Implement cgroup subtree iteration for scx_task_iter Tejun Heo
2026-01-21 23:11 ` [PATCH 02/34] sched_ext: Add @kargs to scx_fork() Tejun Heo
2026-01-21 23:11 ` [PATCH 03/34] sched/core: Swap the order between sched_post_fork() and cgroup_post_fork() Tejun Heo
2026-01-21 23:11 ` [PATCH 04/34] cgroup: Expose some cgroup helpers Tejun Heo
2026-01-21 23:11 ` [PATCH 05/34] sched_ext: Update p->scx.disallow warning in scx_init_task() Tejun Heo
2026-01-21 23:11 ` [PATCH 06/34] sched_ext: Reorganize enable/disable path for multi-scheduler support Tejun Heo
2026-01-21 23:11 ` [PATCH 07/34] sched_ext: Introduce cgroup sub-sched support Tejun Heo
2026-01-21 23:11 ` [PATCH 08/34] sched_ext: Introduce scx_task_sched[_rcu]() Tejun Heo
2026-01-21 23:11 ` [PATCH 09/34] sched_ext: Introduce scx_prog_sched() Tejun Heo
2026-01-21 23:11 ` [PATCH 10/34] sched_ext: Enforce scheduling authority in dispatch and select_cpu operations Tejun Heo
2026-01-21 23:11 ` [PATCH 11/34] sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime Tejun Heo
2026-01-21 23:11 ` [PATCH 12/34] sched_ext: scx_dsq_move() should validate the task belongs to the right scheduler Tejun Heo
2026-01-21 23:11 ` [PATCH 13/34] sched_ext: Refactor task init/exit helpers Tejun Heo
2026-01-21 23:11 ` [PATCH 14/34] sched_ext: Make scx_prio_less() handle multiple schedulers Tejun Heo
2026-01-21 23:11 ` [PATCH 15/34] sched_ext: Move default slice to per-scheduler field Tejun Heo
2026-01-21 23:11 ` [PATCH 16/34] sched_ext: Move aborting flag " Tejun Heo
2026-01-21 23:11 ` [PATCH 17/34] sched_ext: Move bypass_dsq into scx_sched_pcpu Tejun Heo
2026-01-21 23:11 ` [PATCH 18/34] sched_ext: Move bypass state into scx_sched Tejun Heo
2026-01-21 23:11 ` [PATCH 19/34] sched_ext: Prepare bypass mode for hierarchical operation Tejun Heo
2026-01-21 23:11 ` [PATCH 20/34] sched_ext: Factor out scx_dispatch_sched() Tejun Heo
2026-01-21 23:11 ` [PATCH 21/34] sched_ext: When calling ops.dispatch() @prev must be on the same scx_sched Tejun Heo
2026-01-21 23:11 ` [PATCH 22/34] sched_ext: Separate bypass dispatch enabling from bypass depth tracking Tejun Heo
2026-01-21 23:11 ` [PATCH 23/34] sched_ext: Implement hierarchical bypass mode Tejun Heo
2026-01-21 23:11 ` [PATCH 24/34] sched_ext: Dispatch from all scx_sched instances Tejun Heo
2026-01-21 23:11 ` [PATCH 25/34] sched_ext: Move scx_dsp_ctx and scx_dsp_max_batch into scx_sched Tejun Heo
2026-01-21 23:11 ` [PATCH 26/34] sched_ext: Make watchdog sub-sched aware Tejun Heo
2026-01-21 23:11 ` [PATCH 27/34] sched_ext: Convert scx_dump_state() spinlock to raw spinlock Tejun Heo
2026-01-21 23:11 ` [PATCH 28/34] sched_ext: Support dumping multiple schedulers and add scheduler identification Tejun Heo
2026-01-21 23:11 ` [PATCH 29/34] sched_ext: Implement cgroup sub-sched enabling and disabling Tejun Heo
2026-01-21 23:11 ` [PATCH 30/34] sched_ext: Add scx_sched back pointer to scx_sched_pcpu Tejun Heo
2026-01-21 23:11 ` Tejun Heo [this message]
2026-01-21 23:11 ` [PATCH 32/34] sched_ext: Factor out scx_link_sched() and scx_unlink_sched() Tejun Heo
2026-01-21 23:11 ` [PATCH 33/34] sched_ext: Add rhashtable lookup for sub-schedulers Tejun Heo
2026-01-21 23:11 ` [PATCH 34/34] sched_ext: Add basic building blocks for nested sub-scheduler dispatching Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2026-02-25  5:00 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25  5:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo
2026-02-25  5:01 [PATCHSET v2 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-02-25  5:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo
2026-03-04 22:00 [PATCHSET v3 sched_ext/for-7.1] sched_ext: Implement cgroup sub-scheduler support Tejun Heo
2026-03-04 22:01 ` [PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware Tejun Heo

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:d21a3f80570 dfblob:469fcbe4611 dfblob:cbe10672f6a
dfblob:688be11ab9e dfblob:07aaa09df7e dfblob:f5bfe1029ee )
 OR (
bs:"[PATCH 31/34] sched_ext: Make scx_bpf_reenqueue_local() sub-sched aware" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260121231140.832332-32-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=andrea.righi@linux.dev \
    --cc=changwoo@igalia.com \
    --cc=emil@etsalapatis.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox