From: Tejun Heo <tj@kernel.org>
To: linux-kernel@vger.kernel.org, sched-ext@lists.linux.dev
Cc: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com,
emil@etsalapatis.com
Subject: [PATCH v2 10/15] sched_ext: Add per-CPU data to DSQs
Date: Fri, 06 Mar 2026 13:09:30 -1000 [thread overview]
Message-ID: <af18f4873ed5d2ef606c55923acc1014@kernel.org> (raw)
In-Reply-To: <20260306190623.1076074-11-tj@kernel.org>
Add a per-CPU data structure to dispatch queues. Each DSQ now has a percpu
scx_dsq_pcpu which contains a back-pointer to the DSQ. This will be used by
future changes to implement per-CPU reenqueue tracking for user DSQs.
init_dsq() now allocates the percpu data and can fail, so it returns an
error code. All callers are updated to handle failures. exit_dsq() is added
to free the percpu data and is called from all DSQ cleanup paths.
In scx_bpf_create_dsq(), init_dsq() is called before rcu_read_lock() since
alloc_percpu() allocates with GFP_KERNEL and may sleep, which is not allowed
inside an RCU read-side critical section. dsq->sched is set afterwards.
v2: Fix err_free_pcpu to only exit_dsq() initialized bypass DSQs (Andrea
Righi).
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <arighi@nvidia.com>
---
include/linux/sched/ext.h | 5 ++
kernel/sched/ext.c | 87 ++++++++++++++++++++++++++++++++++++++--------
2 files changed, 77 insertions(+), 15 deletions(-)
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -62,6 +62,10 @@ enum scx_dsq_id_flags {
SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
};
+struct scx_dsq_pcpu {
+ struct scx_dispatch_q *dsq;
+};
+
/*
* A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
* queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
@@ -79,6 +83,7 @@ struct scx_dispatch_q {
struct rhash_head hash_node;
struct llist_node free_node;
struct scx_sched *sched;
+ struct scx_dsq_pcpu __percpu *pcpu;
struct rcu_head rcu;
};
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4021,15 +4021,42 @@ DEFINE_SCHED_CLASS(ext) = {
#endif
};
-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
- struct scx_sched *sch)
+static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
+ struct scx_sched *sch)
{
+ s32 cpu;
+
memset(dsq, 0, sizeof(*dsq));
raw_spin_lock_init(&dsq->lock);
INIT_LIST_HEAD(&dsq->list);
dsq->id = dsq_id;
dsq->sched = sch;
+
+ dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu);
+ if (!dsq->pcpu)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
+
+ pcpu->dsq = dsq;
+ }
+
+ return 0;
+}
+
+static void exit_dsq(struct scx_dispatch_q *dsq)
+{
+ free_percpu(dsq->pcpu);
+}
+
+static void free_dsq_rcufn(struct rcu_head *rcu)
+{
+ struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu);
+
+ exit_dsq(dsq);
+ kfree(dsq);
}
static void free_dsq_irq_workfn(struct irq_work *irq_work)
@@ -4038,7 +4065,7 @@ static void free_dsq_irq_workfn(struct i
struct scx_dispatch_q *dsq, *tmp_dsq;
llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
- kfree_rcu(dsq, rcu);
+ call_rcu(&dsq->rcu, free_dsq_rcufn);
}
static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
@@ -4235,15 +4262,17 @@ static void scx_sched_free_rcu_work(stru
cgroup_put(sch_cgroup(sch));
#endif /* CONFIG_EXT_SUB_SCHED */
- /*
- * $sch would have entered bypass mode before the RCU grace period. As
- * that blocks new deferrals, all deferred_reenq_local_node's must be
- * off-list by now.
- */
for_each_possible_cpu(cpu) {
struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+ /*
+ * $sch would have entered bypass mode before the RCU grace
+ * period. As that blocks new deferrals, all
+ * deferred_reenq_local_node's must be off-list by now.
+ */
WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
+
+ exit_dsq(bypass_dsq(sch, cpu));
}
free_percpu(sch->pcpu);
@@ -5788,6 +5817,9 @@ static int alloc_kick_syncs(void)
static void free_pnode(struct scx_sched_pnode *pnode)
{
+ if (!pnode)
+ return;
+ exit_dsq(&pnode->global_dsq);
kfree(pnode);
}
@@ -5799,7 +5831,10 @@ static struct scx_sched_pnode *alloc_pno
if (!pnode)
return NULL;
- init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch);
+ if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
+ kfree(pnode);
+ return NULL;
+ }
return pnode;
}
@@ -5810,7 +5845,7 @@ static struct scx_sched *scx_alloc_and_a
{
struct scx_sched *sch;
s32 level = parent ? parent->level + 1 : 0;
- s32 node, cpu, ret;
+ s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;
sch = kzalloc_flex(*sch, ancestors, level);
if (!sch)
@@ -5849,8 +5884,13 @@ static struct scx_sched *scx_alloc_and_a
goto err_free_pnode;
}
- for_each_possible_cpu(cpu)
- init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
+ for_each_possible_cpu(cpu) {
+ ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
+ if (ret) {
+ bypass_fail_cpu = cpu;
+ goto err_free_pcpu;
+ }
+ }
for_each_possible_cpu(cpu) {
struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
@@ -5932,6 +5972,11 @@ static struct scx_sched *scx_alloc_and_a
err_stop_helper:
kthread_destroy_worker(sch->helper);
err_free_pcpu:
+ for_each_possible_cpu(cpu) {
+ if (cpu == bypass_fail_cpu)
+ break;
+ exit_dsq(bypass_dsq(sch, cpu));
+ }
free_percpu(sch->pcpu);
err_free_pnode:
for_each_node_state(node, N_POSSIBLE)
@@ -7174,7 +7219,7 @@ void __init init_sched_ext_class(void)
int n = cpu_to_node(cpu);
/* local_dsq's sch will be set during scx_root_enable() */
- init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL);
+ BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL));
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
@@ -7873,11 +7918,21 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 d
if (!dsq)
return -ENOMEM;
+ /*
+ * init_dsq() must be called in GFP_KERNEL context. Init it with NULL
+ * @sch and update afterwards.
+ */
+ ret = init_dsq(dsq, dsq_id, NULL);
+ if (ret) {
+ kfree(dsq);
+ return ret;
+ }
+
rcu_read_lock();
sch = scx_prog_sched(aux);
if (sch) {
- init_dsq(dsq, dsq_id, sch);
+ dsq->sched = sch;
ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
dsq_hash_params);
} else {
@@ -7885,8 +7940,10 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 d
}
rcu_read_unlock();
- if (ret)
+ if (ret) {
+ exit_dsq(dsq);
kfree(dsq);
+ }
return ret;
}
next prev parent reply other threads:[~2026-03-06 23:09 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-06 19:06 [PATCHSET sched_ext/for-7.1] sched_ext: Overhaul DSQ reenqueue infrastructure Tejun Heo
2026-03-06 19:06 ` [PATCH 01/15] sched_ext: Relocate scx_bpf_task_cgroup() and its BTF_ID to the end of kfunc section Tejun Heo
2026-03-06 20:45 ` Emil Tsalapatis
2026-03-06 23:20 ` Daniel Jordan
2026-03-06 19:06 ` [PATCH 02/15] sched_ext: Wrap global DSQs in per-node structure Tejun Heo
2026-03-06 20:52 ` Emil Tsalapatis
2026-03-06 23:20 ` Daniel Jordan
2026-03-06 19:06 ` [PATCH 03/15] sched_ext: Factor out pnode allocation and deallocation into helpers Tejun Heo
2026-03-06 20:54 ` Emil Tsalapatis
2026-03-06 23:21 ` Daniel Jordan
2026-03-06 19:06 ` [PATCH 04/15] sched_ext: Change find_global_dsq() to take CPU number instead of task Tejun Heo
2026-03-06 21:06 ` Emil Tsalapatis
2026-03-06 22:33 ` [PATCH v2 " Tejun Heo
2026-03-06 23:21 ` [PATCH " Daniel Jordan
2026-03-06 19:06 ` [PATCH 05/15] sched_ext: Relocate reenq_local() and run_deferred() Tejun Heo
2026-03-06 21:09 ` Emil Tsalapatis
2026-03-06 23:34 ` Daniel Jordan
2026-03-07 0:12 ` [PATCH v2 05/15] sched_ext: Relocate run_deferred() and its callees Tejun Heo
2026-03-06 19:06 ` [PATCH 06/15] sched_ext: Convert deferred_reenq_locals from llist to regular list Tejun Heo
2026-03-09 17:12 ` Emil Tsalapatis
2026-03-09 17:16 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 07/15] sched_ext: Wrap deferred_reenq_local_node into a struct Tejun Heo
2026-03-09 17:16 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 08/15] sched_ext: Introduce scx_bpf_dsq_reenq() for remote local DSQ reenqueue Tejun Heo
2026-03-09 17:33 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 09/15] sched_ext: Add reenq_flags plumbing to scx_bpf_dsq_reenq() Tejun Heo
2026-03-09 17:47 ` Emil Tsalapatis
2026-03-06 19:06 ` [PATCH 10/15] sched_ext: Add per-CPU data to DSQs Tejun Heo
2026-03-06 22:54 ` Andrea Righi
2026-03-06 22:56 ` Andrea Righi
2026-03-06 23:09 ` Tejun Heo [this message]
2026-03-06 19:06 ` [PATCH 11/15] sched_ext: Factor out nldsq_cursor_next_task() and nldsq_cursor_lost_task() Tejun Heo
2026-03-06 19:06 ` [PATCH 12/15] sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs Tejun Heo
2026-03-06 19:06 ` [PATCH 13/15] sched_ext: Optimize schedule_dsq_reenq() with lockless fast path Tejun Heo
2026-03-06 19:06 ` [PATCH 14/15] sched_ext: Simplify task state handling Tejun Heo
2026-03-06 19:06 ` [PATCH 15/15] sched_ext: Add SCX_TASK_REENQ_REASON flags Tejun Heo
2026-03-06 23:14 ` [PATCHSET sched_ext/for-7.1] sched_ext: Overhaul DSQ reenqueue infrastructure Andrea Righi
2026-03-07 15:38 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=af18f4873ed5d2ef606c55923acc1014@kernel.org \
--to=tj@kernel.org \
--cc=arighi@nvidia.com \
--cc=changwoo@igalia.com \
--cc=emil@etsalapatis.com \
--cc=linux-kernel@vger.kernel.org \
--cc=sched-ext@lists.linux.dev \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.