From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com,
emil@etsalapatis.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 06/15] sched_ext: Convert deferred_reenq_locals from llist to regular list
Date: Fri, 6 Mar 2026 09:06:14 -1000 [thread overview]
Message-ID: <20260306190623.1076074-7-tj@kernel.org> (raw)
In-Reply-To: <20260306190623.1076074-1-tj@kernel.org>
The deferred reenqueue local mechanism uses an llist (lockless list) for
collecting schedulers that need their local DSQs re-enqueued. Convert to a
regular list protected by a raw_spinlock.
The llist was used for its lockless properties, but the upcoming changes to
support remote reenqueue require more complex list operations that are
difficult to implement correctly with lockless data structures. A
spinlock-protected regular list provides the necessary flexibility.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 57 ++++++++++++++++++++++++-------------
kernel/sched/ext_internal.h | 2 +-
kernel/sched/sched.h | 3 +-
3 files changed, 41 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1b6cd1e4f8b9..ffccaf04e34d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3640,23 +3640,37 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
return nr_enqueued;
}
-static void run_deferred(struct rq *rq)
+static void process_deferred_reenq_locals(struct rq *rq)
{
- process_ddsp_deferred_locals(rq);
-
- if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
- struct llist_node *llist =
- llist_del_all(&rq->scx.deferred_reenq_locals);
- struct scx_sched_pcpu *pos, *next;
+ lockdep_assert_rq_held(rq);
- llist_for_each_entry_safe(pos, next, llist,
- deferred_reenq_locals_node) {
- init_llist_node(&pos->deferred_reenq_locals_node);
- reenq_local(pos->sch, rq);
+ while (true) {
+ struct scx_sched *sch;
+
+ scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ struct scx_sched_pcpu *sch_pcpu =
+ list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+ struct scx_sched_pcpu,
+ deferred_reenq_local_node);
+ if (!sch_pcpu)
+ return;
+
+ sch = sch_pcpu->sch;
+ list_del_init(&sch_pcpu->deferred_reenq_local_node);
}
+
+ reenq_local(sch, rq);
}
}
+static void run_deferred(struct rq *rq)
+{
+ process_ddsp_deferred_locals(rq);
+
+ if (!list_empty(&rq->scx.deferred_reenq_locals))
+ process_deferred_reenq_locals(rq);
+}
+
#ifdef CONFIG_NO_HZ_FULL
bool scx_can_stop_tick(struct rq *rq)
{
@@ -4180,13 +4194,13 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
/*
* $sch would have entered bypass mode before the RCU grace period. As
- * that blocks new deferrals, all deferred_reenq_locals_node's must be
+ * that blocks new deferrals, all deferred_reenq_local_node's must be
* off-list by now.
*/
for_each_possible_cpu(cpu) {
struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
- WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+ WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
}
free_percpu(sch->pcpu);
@@ -5799,7 +5813,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
pcpu->sch = sch;
- init_llist_node(&pcpu->deferred_reenq_locals_node);
+ INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
}
sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -7126,7 +7140,8 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
- init_llist_head(&rq->scx.deferred_reenq_locals);
+ raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
+ INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
@@ -8358,7 +8373,6 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
unsigned long flags;
struct scx_sched *sch;
struct rq *rq;
- struct llist_node *lnode;
raw_local_irq_save(flags);
@@ -8374,9 +8388,14 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
goto out_irq_restore;
rq = this_rq();
- lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
- if (!llist_on_list(lnode))
- llist_add(lnode, &rq->scx.deferred_reenq_locals);
+ scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+ struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
+
+ if (list_empty(&pcpu->deferred_reenq_local_node))
+ list_move_tail(&pcpu->deferred_reenq_local_node,
+ &rq->scx.deferred_reenq_locals);
+ }
+
schedule_deferred(rq);
out_irq_restore:
raw_local_irq_restore(flags);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 9e5ebd00ea0c..80d40a9c5ad9 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -965,7 +965,7 @@ struct scx_sched_pcpu {
*/
struct scx_event_stats event_stats;
- struct llist_node deferred_reenq_locals_node;
+ struct list_head deferred_reenq_local_node;
struct scx_dispatch_q bypass_dsq;
#ifdef CONFIG_EXT_SUB_SCHED
u32 bypass_host_seq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ebe971d12cb8..0794852524e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,7 +808,8 @@ struct scx_rq {
struct task_struct *sub_dispatch_prev;
- struct llist_head deferred_reenq_locals;
+ raw_spinlock_t deferred_reenq_lock;
+ struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */
struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work;
--
2.53.0
next prev parent reply other threads:[~2026-03-06 19:06 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-06 19:06 [PATCHSET sched_ext/for-7.1] sched_ext: Overhaul DSQ reenqueue infrastructure Tejun Heo
2026-03-06 19:06 ` [PATCH 01/15] sched_ext: Relocate scx_bpf_task_cgroup() and its BTF_ID to the end of kfunc section Tejun Heo
2026-03-06 20:45 ` Emil Tsalapatis
2026-03-06 23:20 ` Daniel Jordan
2026-03-06 19:06 ` [PATCH 02/15] sched_ext: Wrap global DSQs in per-node structure Tejun Heo
2026-03-06 20:52 ` Emil Tsalapatis
2026-03-06 23:20 ` Daniel Jordan
2026-03-06 19:06 ` [PATCH 03/15] sched_ext: Factor out pnode allocation and deallocation into helpers Tejun Heo
2026-03-06 20:54 ` Emil Tsalapatis
2026-03-06 23:21 ` Daniel Jordan
2026-03-06 19:06 ` [PATCH 04/15] sched_ext: Change find_global_dsq() to take CPU number instead of task Tejun Heo
2026-03-06 21:06 ` Emil Tsalapatis
2026-03-06 22:33 ` [PATCH v2 " Tejun Heo
2026-03-06 23:21 ` [PATCH " Daniel Jordan
2026-03-06 19:06 ` [PATCH 05/15] sched_ext: Relocate reenq_local() and run_deferred() Tejun Heo
2026-03-06 21:09 ` Emil Tsalapatis
2026-03-06 23:34 ` Daniel Jordan
2026-03-07 0:12 ` [PATCH v2 05/15] sched_ext: Relocate run_deferred() and its callees Tejun Heo
2026-03-06 19:06 ` Tejun Heo [this message]
2026-03-09 17:12 ` [PATCH 06/15] sched_ext: Convert deferred_reenq_locals from llist to regular list Emil Tsalapatis
2026-03-09 17:16 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 07/15] sched_ext: Wrap deferred_reenq_local_node into a struct Tejun Heo
2026-03-09 17:16 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 08/15] sched_ext: Introduce scx_bpf_dsq_reenq() for remote local DSQ reenqueue Tejun Heo
2026-03-09 17:33 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 09/15] sched_ext: Add reenq_flags plumbing to scx_bpf_dsq_reenq() Tejun Heo
2026-03-09 17:47 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 10/15] sched_ext: Add per-CPU data to DSQs Tejun Heo
2026-03-06 22:54 ` Andrea Righi
2026-03-06 22:56 ` Andrea Righi
2026-03-06 23:09 ` [PATCH v2 " Tejun Heo
2026-03-06 19:06 ` [PATCH 11/15] sched_ext: Factor out nldsq_cursor_next_task() and nldsq_cursor_lost_task() Tejun Heo
2026-03-06 19:06 ` [PATCH 12/15] sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs Tejun Heo
2026-03-06 19:06 ` [PATCH 13/15] sched_ext: Optimize schedule_dsq_reenq() with lockless fast path Tejun Heo
2026-03-06 19:06 ` [PATCH 14/15] sched_ext: Simplify task state handling Tejun Heo
2026-03-06 19:06 ` [PATCH 15/15] sched_ext: Add SCX_TASK_REENQ_REASON flags Tejun Heo
2026-03-06 23:14 ` [PATCHSET sched_ext/for-7.1] sched_ext: Overhaul DSQ reenqueue infrastructure Andrea Righi
2026-03-07 15:38 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260306190623.1076074-7-tj@kernel.org \
--to=tj@kernel.org \
--cc=arighi@nvidia.com \
--cc=changwoo@igalia.com \
--cc=emil@etsalapatis.com \
--cc=linux-kernel@vger.kernel.org \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox