* [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 [PATCHSET " Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
0 siblings, 2 replies; 9+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
head/tail pointers and its own per-queue bpf_res_spin_lock.
qmap_dequeue() now properly removes tasks from the queue instead of
leaving stale entries for dispatch to skip.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
tools/sched_ext/scx_qmap.h | 9 ++
2 files changed, 148 insertions(+), 82 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index e071969c8f32..c26997ff7863 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -71,31 +71,24 @@ struct {
struct qmap_arena __arena qa;
-struct qmap {
- __uint(type, BPF_MAP_TYPE_QUEUE);
- __uint(max_entries, 4096);
- __type(value, u32);
-} queue0 SEC(".maps"),
- queue1 SEC(".maps"),
- queue2 SEC(".maps"),
- queue3 SEC(".maps"),
- queue4 SEC(".maps"),
- dump_store SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
- __uint(max_entries, 5);
- __type(key, int);
- __array(values, struct qmap);
-} queue_arr SEC(".maps") = {
- .values = {
- [0] = &queue0,
- [1] = &queue1,
- [2] = &queue2,
- [3] = &queue3,
- [4] = &queue4,
- },
-};
+/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
+__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
+__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
+__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
+__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
+__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
+
+static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
+{
+ switch (qid) {
+ case 0: return &qa_q_lock0;
+ case 1: return &qa_q_lock1;
+ case 2: return &qa_q_lock2;
+ case 3: return &qa_q_lock3;
+ case 4: return &qa_q_lock4;
+ default: return NULL;
+ }
+}
/*
* If enabled, CPU performance target is set according to the queue index
@@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
* arena. While the task is alive the entry is referenced from task_ctx_stor;
* while it's free the entry sits on the free list singly-linked through
* @next_free.
+ *
+ * When the task is queued on one of the five priority FIFOs, @fifo points to
+ * the queue and @q_next/@q_prev link it in the queue's doubly-linked list.
+ * @fifo is NULL when the task isn't on any queue.
*/
struct task_ctx {
struct task_ctx __arena *next_free; /* only valid on free list */
+ struct task_ctx __arena *q_next; /* queue link, NULL if tail */
+ struct task_ctx __arena *q_prev; /* queue link, NULL if head */
+ struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
+ s32 pid;
bool force_local; /* Dispatch directly to local_dsq */
bool highpri;
u64 core_sched_seq;
@@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
return v->taskc;
}
+/* Append @taskc to the tail of @fifo. Must not already be queued. */
+static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
+ struct task_ctx __arena *taskc)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+
+ if (!lock || qmap_spin_lock(lock))
+ return;
+ taskc->fifo = fifo;
+ taskc->q_next = NULL;
+ taskc->q_prev = fifo->tail;
+ if (fifo->tail)
+ fifo->tail->q_next = taskc;
+ else
+ fifo->head = taskc;
+ fifo->tail = taskc;
+ bpf_res_spin_unlock(lock);
+}
+
+/* Pop the head of @fifo. Returns NULL if empty. */
+static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+ struct task_ctx __arena *taskc;
+
+ if (!lock || qmap_spin_lock(lock))
+ return NULL;
+ taskc = fifo->head;
+ if (taskc) {
+ fifo->head = taskc->q_next;
+ if (taskc->q_next)
+ taskc->q_next->q_prev = NULL;
+ else
+ fifo->tail = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ }
+ bpf_res_spin_unlock(lock);
+ return taskc;
+}
+
+/* Remove @taskc from its fifo. No-op if not queued. */
+static void qmap_fifo_remove(struct task_ctx __arena *taskc)
+{
+ struct qmap_fifo __arena *fifo = taskc->fifo;
+ struct bpf_res_spin_lock *lock;
+
+ if (!fifo)
+ return;
+
+ lock = qa_q_lock(fifo->idx);
+ if (!lock || qmap_spin_lock(lock))
+ return;
+
+ /* Re-check under lock — a concurrent pop may have cleared fifo. */
+ if (taskc->fifo != fifo) {
+ bpf_res_spin_unlock(lock);
+ return;
+ }
+
+ if (taskc->q_next)
+ taskc->q_next->q_prev = taskc->q_prev;
+ else
+ fifo->tail = taskc->q_prev;
+ if (taskc->q_prev)
+ taskc->q_prev->q_next = taskc->q_next;
+ else
+ fifo->head = taskc->q_next;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ bpf_res_spin_unlock(lock);
+}
+
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
struct task_ctx __arena *taskc;
- u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
- void *ring;
s32 cpu;
if (enq_flags & SCX_ENQ_REENQ) {
@@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- ring = bpf_map_lookup_elem(&queue_arr, &idx);
- if (!ring) {
- scx_bpf_error("failed to find ring %d", idx);
- return;
- }
-
- /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
- if (bpf_map_push_elem(ring, &pid, 0)) {
- scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
- return;
- }
+ /* Queue on the selected FIFO. */
+ qmap_fifo_enqueue(&qa.fifos[idx], taskc);
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
taskc->highpri = true;
@@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
__sync_fetch_and_add(&qa.nr_enqueued, 1);
}
-/*
- * The BPF queue map doesn't support removal and sched_ext can handle spurious
- * dispatches. qmap_dequeue() is only used to collect statistics.
- */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
+ struct task_ctx __arena *taskc;
+
__sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
__sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
+
+ taskc = lookup_task_ctx(p);
+ if (taskc && taskc->fifo) {
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
+ qmap_fifo_remove(taskc);
+ }
}
static void update_core_sched_head_seq(struct task_struct *p)
@@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
struct cpu_ctx __arena *cpuc;
struct task_ctx __arena *taskc;
u32 batch = dsp_batch ?: 1;
- void *fifo;
- s32 i, pid;
+ s32 i;
if (dispatch_highpri(false))
return;
@@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- u64 dsp_idx = cpuc->dsp_idx;
-
- fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
- if (!fifo) {
- scx_bpf_error("failed to find ring %llu", dsp_idx);
- return;
- }
-
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
struct task_ctx __arena *taskc;
- if (bpf_map_pop_elem(fifo, &pid))
+ taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
+ if (!taskc)
break;
- p = bpf_task_from_pid(pid);
+ p = bpf_task_from_pid(taskc->pid);
if (!p)
continue;
- if (!(taskc = lookup_task_ctx(p))) {
- bpf_task_release(p);
- return;
- }
-
if (taskc->highpri)
__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
@@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
}
taskc->next_free = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ taskc->pid = p->pid;
taskc->force_local = false;
taskc->highpri = false;
taskc->core_sched_seq = 0;
@@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
- s32 i, pid;
+ struct task_ctx __arena *taskc;
+ s32 i;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
+ /*
+ * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
+ * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
+ * a lock and dump. Best-effort; racing may print stale pids but the
+ * walk is bounded by bpf_repeat() so it always terminates.
+ */
bpf_for(i, 0, 5) {
- void *fifo;
-
- if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
- return;
-
scx_bpf_dump("QMAP FIFO[%d]:", i);
-
- /*
- * Dump can be invoked anytime and there is no way to iterate in
- * a non-destructive way. Pop and store in dump_store and then
- * restore afterwards. If racing against new enqueues, ordering
- * can get mixed up.
- */
- bpf_repeat(4096) {
- if (bpf_map_pop_elem(fifo, &pid))
- break;
- bpf_map_push_elem(&dump_store, &pid, 0);
- scx_bpf_dump(" %d", pid);
- }
-
+ taskc = qa.fifos[i].head;
bpf_repeat(4096) {
- if (bpf_map_pop_elem(&dump_store, &pid))
+ if (!taskc)
break;
- bpf_map_push_elem(fifo, &pid, 0);
+ scx_bpf_dump(" %d", taskc->pid);
+ taskc = taskc->q_next;
}
-
scx_bpf_dump("\n");
}
}
@@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
QMAP_TOUCH_ARENA();
+ QMAP_TOUCH_ARENA();
+
if (suppress_dump)
return;
v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
@@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
}
qa.task_ctxs = slab;
+ bpf_for(i, 0, 5)
+ qa.fifos[i].idx = i;
+
bpf_for(i, 0, max_tasks)
slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
qa.task_free_head = &slab[0];
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index c183d82632b3..9c0da5a301cb 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -37,6 +37,12 @@ struct cpu_ctx {
/* Opaque to userspace; defined in scx_qmap.bpf.c. */
struct task_ctx;
+struct qmap_fifo {
+ struct task_ctx __arena *head;
+ struct task_ctx __arena *tail;
+ __s32 idx;
+};
+
struct qmap_arena {
/* userspace-visible stats */
__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -59,6 +65,9 @@ struct qmap_arena {
/* task_ctx slab; allocated and threaded by qmap_init() */
struct task_ctx __arena *task_ctxs;
struct task_ctx __arena *task_free_head;
+
+ /* five priority FIFOs, each a doubly-linked list through task_ctx */
+ struct qmap_fifo fifos[5];
};
#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
1 sibling, 0 replies; 9+ messages in thread
From: Andrea Righi @ 2026-04-16 10:01 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Changwoo Min, Emil Tsalapatis, sched-ext,
linux-kernel
Hi Tejun,
On Wed, Apr 15, 2026 at 10:16:26PM -1000, Tejun Heo wrote:
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.
We should probably update the description at the beginning of the files as well,
mentioning the arena-backed lists.
>
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
...
> @@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>
> QMAP_TOUCH_ARENA();
>
> + QMAP_TOUCH_ARENA();
> +
Copy/paste noise?
> if (suppress_dump)
> return;
> v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> }
> qa.task_ctxs = slab;
>
> + bpf_for(i, 0, 5)
> + qa.fifos[i].idx = i;
> +
> bpf_for(i, 0, max_tasks)
> slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> qa.task_free_head = &slab[0];
Thanks,
-Andrea
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:01 ` Andrea Righi
@ 2026-04-16 15:45 ` Emil Tsalapatis
1 sibling, 0 replies; 9+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:45 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.
>
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
> tools/sched_ext/scx_qmap.h | 9 ++
> 2 files changed, 148 insertions(+), 82 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index e071969c8f32..c26997ff7863 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -71,31 +71,24 @@ struct {
>
> struct qmap_arena __arena qa;
>
> -struct qmap {
> - __uint(type, BPF_MAP_TYPE_QUEUE);
> - __uint(max_entries, 4096);
> - __type(value, u32);
> -} queue0 SEC(".maps"),
> - queue1 SEC(".maps"),
> - queue2 SEC(".maps"),
> - queue3 SEC(".maps"),
> - queue4 SEC(".maps"),
> - dump_store SEC(".maps");
> -
> -struct {
> - __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
> - __uint(max_entries, 5);
> - __type(key, int);
> - __array(values, struct qmap);
> -} queue_arr SEC(".maps") = {
> - .values = {
> - [0] = &queue0,
> - [1] = &queue1,
> - [2] = &queue2,
> - [3] = &queue3,
> - [4] = &queue4,
> - },
> -};
> +/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
> +__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
> +__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
> +__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
> +__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
> +__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
> +
> +static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
> +{
> + switch (qid) {
> + case 0: return &qa_q_lock0;
> + case 1: return &qa_q_lock1;
> + case 2: return &qa_q_lock2;
> + case 3: return &qa_q_lock3;
> + case 4: return &qa_q_lock4;
> + default: return NULL;
> + }
> +}
>
> /*
> * If enabled, CPU performance target is set according to the queue index
> @@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
> * arena. While the task is alive the entry is referenced from task_ctx_stor;
> * while it's free the entry sits on the free list singly-linked through
> * @next_free.
> + *
> + * When the task is queued on one of the five priority FIFOs, @q_idx is the
> + * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
> + * @q_idx is -1 when the task isn't on any queue.
> */
> struct task_ctx {
> struct task_ctx __arena *next_free; /* only valid on free list */
> + struct task_ctx __arena *q_next; /* queue link, NULL if tail */
> + struct task_ctx __arena *q_prev; /* queue link, NULL if head */
> + struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
> + s32 pid;
> bool force_local; /* Dispatch directly to local_dsq */
> bool highpri;
> u64 core_sched_seq;
> @@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
> return v->taskc;
> }
>
> +/* Append @taskc to the tail of @fifo. Must not already be queued. */
> +static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
> + struct task_ctx __arena *taskc)
> +{
> + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> +
> + if (!lock || qmap_spin_lock(lock))
> + return;
> + taskc->fifo = fifo;
> + taskc->q_next = NULL;
> + taskc->q_prev = fifo->tail;
> + if (fifo->tail)
> + fifo->tail->q_next = taskc;
> + else
> + fifo->head = taskc;
> + fifo->tail = taskc;
> + bpf_res_spin_unlock(lock);
> +}
> +
> +/* Pop the head of @fifo. Returns NULL if empty. */
> +static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
> +{
> + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> + struct task_ctx __arena *taskc;
> +
> + if (!lock || qmap_spin_lock(lock))
> + return NULL;
> + taskc = fifo->head;
> + if (taskc) {
> + fifo->head = taskc->q_next;
> + if (taskc->q_next)
> + taskc->q_next->q_prev = NULL;
> + else
> + fifo->tail = NULL;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + }
> + bpf_res_spin_unlock(lock);
> + return taskc;
> +}
> +
> +/* Remove @taskc from its fifo. No-op if not queued. */
> +static void qmap_fifo_remove(struct task_ctx __arena *taskc)
> +{
> + struct qmap_fifo __arena *fifo = taskc->fifo;
> + struct bpf_res_spin_lock *lock;
> +
> + if (!fifo)
> + return;
> +
> + lock = qa_q_lock(fifo->idx);
> + if (!lock || qmap_spin_lock(lock))
> + return;
> +
> + /* Re-check under lock — a concurrent pop may have cleared fifo. */
> + if (taskc->fifo != fifo) {
> + bpf_res_spin_unlock(lock);
> + return;
> + }
> +
> + if (taskc->q_next)
> + taskc->q_next->q_prev = taskc->q_prev;
> + else
> + fifo->tail = taskc->q_prev;
> + if (taskc->q_prev)
> + taskc->q_prev->q_next = taskc->q_next;
> + else
> + fifo->head = taskc->q_next;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + bpf_res_spin_unlock(lock);
> +}
> +
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> @@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> {
> static u32 user_cnt, kernel_cnt;
> struct task_ctx __arena *taskc;
> - u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> - void *ring;
> s32 cpu;
>
> if (enq_flags & SCX_ENQ_REENQ) {
> @@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> return;
> }
>
> - ring = bpf_map_lookup_elem(&queue_arr, &idx);
> - if (!ring) {
> - scx_bpf_error("failed to find ring %d", idx);
> - return;
> - }
> -
> - /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
> - if (bpf_map_push_elem(ring, &pid, 0)) {
> - scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
> - return;
> - }
> + /* Queue on the selected FIFO. */
> + qmap_fifo_enqueue(&qa.fifos[idx], taskc);
>
> if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> taskc->highpri = true;
> @@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> __sync_fetch_and_add(&qa.nr_enqueued, 1);
> }
>
> -/*
> - * The BPF queue map doesn't support removal and sched_ext can handle spurious
> - * dispatches. qmap_dequeue() is only used to collect statistics.
> - */
> void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> {
> + struct task_ctx __arena *taskc;
> +
> __sync_fetch_and_add(&qa.nr_dequeued, 1);
> if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
> __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
> +
> + taskc = lookup_task_ctx(p);
> + if (taskc && taskc->fifo) {
> + if (taskc->highpri)
> + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
> + qmap_fifo_remove(taskc);
> + }
> }
>
> static void update_core_sched_head_seq(struct task_struct *p)
> @@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> struct cpu_ctx __arena *cpuc;
> struct task_ctx __arena *taskc;
> u32 batch = dsp_batch ?: 1;
> - void *fifo;
> - s32 i, pid;
> + s32 i;
>
> if (dispatch_highpri(false))
> return;
> @@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
> }
>
> - u64 dsp_idx = cpuc->dsp_idx;
> -
> - fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
> - if (!fifo) {
> - scx_bpf_error("failed to find ring %llu", dsp_idx);
> - return;
> - }
> -
> /* Dispatch or advance. */
> bpf_repeat(BPF_MAX_LOOPS) {
> struct task_ctx __arena *taskc;
>
> - if (bpf_map_pop_elem(fifo, &pid))
> + taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
> + if (!taskc)
> break;
>
> - p = bpf_task_from_pid(pid);
> + p = bpf_task_from_pid(taskc->pid);
> if (!p)
> continue;
>
> - if (!(taskc = lookup_task_ctx(p))) {
> - bpf_task_release(p);
> - return;
> - }
> -
> if (taskc->highpri)
> __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
>
> @@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
> }
>
> taskc->next_free = NULL;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + taskc->pid = p->pid;
> taskc->force_local = false;
> taskc->highpri = false;
> taskc->core_sched_seq = 0;
> @@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
>
> void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
> {
> - s32 i, pid;
> + struct task_ctx __arena *taskc;
> + s32 i;
> +
> + QMAP_TOUCH_ARENA();
>
> if (suppress_dump)
> return;
>
> + /*
> + * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
> + * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
> + * a lock and dump. Best-effort; racing may print stale pids but the
> + * walk is bounded by bpf_repeat() so it always terminates.
> + */
> bpf_for(i, 0, 5) {
> - void *fifo;
> -
> - if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
> - return;
> -
> scx_bpf_dump("QMAP FIFO[%d]:", i);
> -
> - /*
> - * Dump can be invoked anytime and there is no way to iterate in
> - * a non-destructive way. Pop and store in dump_store and then
> - * restore afterwards. If racing against new enqueues, ordering
> - * can get mixed up.
> - */
> - bpf_repeat(4096) {
> - if (bpf_map_pop_elem(fifo, &pid))
> - break;
> - bpf_map_push_elem(&dump_store, &pid, 0);
> - scx_bpf_dump(" %d", pid);
> - }
> -
> + taskc = qa.fifos[i].head;
> bpf_repeat(4096) {
> - if (bpf_map_pop_elem(&dump_store, &pid))
> + if (!taskc)
> break;
> - bpf_map_push_elem(fifo, &pid, 0);
> + scx_bpf_dump(" %d", taskc->pid);
> + taskc = taskc->q_next;
> }
> -
> scx_bpf_dump("\n");
> }
> }
> @@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>
> QMAP_TOUCH_ARENA();
>
> + QMAP_TOUCH_ARENA();
> +
> if (suppress_dump)
> return;
> v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> }
> qa.task_ctxs = slab;
>
> + bpf_for(i, 0, 5)
> + qa.fifos[i].idx = i;
> +
> bpf_for(i, 0, max_tasks)
> slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> qa.task_free_head = &slab[0];
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> index c183d82632b3..9c0da5a301cb 100644
> --- a/tools/sched_ext/scx_qmap.h
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -37,6 +37,12 @@ struct cpu_ctx {
> /* Opaque to userspace; defined in scx_qmap.bpf.c. */
> struct task_ctx;
>
> +struct qmap_fifo {
> + struct task_ctx __arena *head;
> + struct task_ctx __arena *tail;
> + __s32 idx;
> +};
> +
> struct qmap_arena {
> /* userspace-visible stats */
> __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> @@ -59,6 +65,9 @@ struct qmap_arena {
> /* task_ctx slab; allocated and threaded by qmap_init() */
> struct task_ctx __arena *task_ctxs;
> struct task_ctx __arena *task_free_head;
> +
> + /* five priority FIFOs, each a doubly-linked list through task_ctx */
> + struct qmap_fifo fifos[5];
> };
>
> #endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena
@ 2026-04-16 17:20 Tejun Heo
2026-04-16 17:20 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
` (4 more replies)
0 siblings, 5 replies; 9+ messages in thread
From: Tejun Heo @ 2026-04-16 17:20 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Hello,
Arena simplifies verification and allows more natural programming. This
patchset converts scx_qmap to use BPF arena for all mutable state, as
preparation for further sub-sched work.
0001 Rename tctx to taskc for consistency.
0002 Move globals and cpu_ctx into arena.
0003 Move task_ctx into an arena slab with bpf_res_spin_lock.
0004 Replace FIFO queue maps with arena-backed doubly-linked lists.
v2:
- Drop "mutable" from comments (#2).
- Add task_ctx_t typedef for struct task_ctx __arena (#3, Emil).
- Remove duplicate QMAP_TOUCH_ARENA() in qmap_dump_task (#4, Andrea).
- Update file-level description for arena-backed lists (#4, Andrea).
v1: https://lore.kernel.org/r/20260416081626.1285617-1-tj@kernel.org
Based on linus/master (1d51b370a0f8).
tools/sched_ext/include/scx/common.bpf.h | 4 +
tools/sched_ext/scx_qmap.bpf.c | 572 +++++++++++++++++++------------
tools/sched_ext/scx_qmap.c | 54 +--
tools/sched_ext/scx_qmap.h | 73 ++++
4 files changed, 465 insertions(+), 238 deletions(-)
Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git qmap-arena-v2
--
tejun
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
2026-04-16 17:20 [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
@ 2026-04-16 17:20 ` Tejun Heo
2026-04-16 17:20 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
` (3 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2026-04-16 17:20 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Rename the per-task context local variable from tctx to taskc for
consistency.
No functional change.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
---
tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
1 file changed, 30 insertions(+), 30 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index b68abb9e760b..a18234f3c27a 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
scx_bpf_error("task_ctx lookup failed");
return NULL;
}
- return tctx;
+ return taskc;
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s32 cpu;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return -ESRCH;
if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
@@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
cpu = pick_direct_dispatch_cpu(p, prev_cpu);
if (cpu >= 0) {
- tctx->force_local = true;
+ taskc->force_local = true;
return cpu;
} else {
return prev_cpu;
@@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
@@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return;
/*
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
static u32 immed_stress_cnt;
if (!(++immed_stress_cnt % immed_stress_nth)) {
- tctx->force_local = false;
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
slice_ns, enq_flags);
return;
@@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* If qmap_select_cpu() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
- if (tctx->force_local) {
- tctx->force_local = false;
+ if (taskc->force_local) {
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
@@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
}
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
- tctx->highpri = true;
+ taskc->highpri = true;
__sync_fetch_and_add(&nr_highpri_queued, 1);
}
__sync_fetch_and_add(&nr_enqueued, 1);
@@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if ((tctx = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = tctx->core_sched_seq;
+ if ((taskc = lookup_task_ctx(p)))
+ core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return false;
- if (tctx->highpri) {
+ if (taskc->highpri) {
/* exercise the set_*() and vtime interface too */
scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
@@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx *cpuc;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
@@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
if (bpf_map_pop_elem(fifo, &pid))
break;
@@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (!p)
continue;
- if (!(tctx = lookup_task_ctx(p))) {
+ if (!(taskc = lookup_task_ctx(p))) {
bpf_task_release(p);
return;
}
- if (tctx->highpri)
+ if (taskc->highpri)
__sync_fetch_and_sub(&nr_highpri_queued, 1);
update_core_sched_head_seq(p);
@@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
}
- tctx->core_sched_seq =
+ taskc->core_sched_seq =
core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
@@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s64 qdist;
- tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return 0;
}
- qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
2026-04-16 17:20 [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 17:20 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
@ 2026-04-16 17:20 ` Tejun Heo
2026-04-16 17:20 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
` (2 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2026-04-16 17:20 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Move scheduler state from BSS globals and a percpu array map
into a single BPF arena map. A shared struct qmap_arena is declared as
an __arena global so BPF accesses it directly and userspace reaches it
through skel->arena->qa.
Scheduling logic unchanged; only memory backing changes.
v2: Drop "mutable" from comments.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
---
tools/sched_ext/scx_qmap.bpf.c | 152 ++++++++++++++-------------------
tools/sched_ext/scx_qmap.c | 45 +++++-----
tools/sched_ext/scx_qmap.h | 57 +++++++++++++
3 files changed, 147 insertions(+), 107 deletions(-)
create mode 100644 tools/sched_ext/scx_qmap.h
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index a18234f3c27a..184a3a729d21 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -22,6 +22,8 @@
*/
#include <scx/common.bpf.h>
+#include "scx_qmap.h"
+
enum consts {
ONE_SEC_IN_NS = 1000000000,
ONE_MSEC_IN_NS = 1000000,
@@ -48,14 +50,26 @@ const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
-u64 nr_highpri_queued;
-u32 test_error_cnt;
-
-#define MAX_SUB_SCHEDS 8
-u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
-
UEI_DEFINE(uei);
+/*
+ * All scheduler state - per-cpu context, stats counters, core-sched sequence
+ * numbers, sub-sched cgroup ids - lives in this single BPF arena map. Userspace
+ * reaches it via skel->arena->qa.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 1 << 16); /* upper bound in pages */
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+ __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
+#else
+ __ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+struct qmap_arena __arena qa;
+
struct qmap {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(max_entries, 4096);
@@ -102,8 +116,6 @@ static const u32 qidx_to_cpuperf_target[] = {
* task's seq and the associated queue's head seq is called the queue distance
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-static u64 core_sched_head_seqs[5];
-static u64 core_sched_tail_seqs[5];
/* Per-task scheduling context */
struct task_ctx {
@@ -119,27 +131,6 @@ struct {
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
-struct cpu_ctx {
- u64 dsp_idx; /* dispatch index */
- u64 dsp_cnt; /* remaining count */
- u32 avg_weight;
- u32 cpuperf_target;
-};
-
-struct {
- __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
- __uint(max_entries, 1);
- __type(key, u32);
- __type(value, struct cpu_ctx);
-} cpu_ctx_stor SEC(".maps");
-
-/* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
-u64 nr_core_sched_execed;
-u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
-u32 cpuperf_min, cpuperf_avg, cpuperf_max;
-u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
-
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
s32 cpu;
@@ -215,9 +206,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
s32 cpu;
if (enq_flags & SCX_ENQ_REENQ) {
- __sync_fetch_and_add(&nr_reenqueued, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued, 1);
if (scx_bpf_task_cpu(p) == 0)
- __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued_cpu0, 1);
}
if (p->flags & PF_KTHREAD) {
@@ -228,7 +219,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- if (test_error_cnt && !--test_error_cnt)
+ if (qa.test_error_cnt && !--qa.test_error_cnt)
scx_bpf_error("test triggering error");
if (!(taskc = lookup_task_ctx(p)))
@@ -238,7 +229,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -276,7 +267,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
/* if select_cpu() wasn't called, try direct dispatch */
if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
- __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
+ __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
return;
}
@@ -311,9 +302,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
taskc->highpri = true;
- __sync_fetch_and_add(&nr_highpri_queued, 1);
+ __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
}
- __sync_fetch_and_add(&nr_enqueued, 1);
+ __sync_fetch_and_add(&qa.nr_enqueued, 1);
}
/*
@@ -322,9 +313,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
*/
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
- __sync_fetch_and_add(&nr_dequeued, 1);
+ __sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
- __sync_fetch_and_add(&nr_core_sched_execed, 1);
+ __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
}
static void update_core_sched_head_seq(struct task_struct *p)
@@ -333,7 +324,7 @@ static void update_core_sched_head_seq(struct task_struct *p)
struct task_ctx *taskc;
if ((taskc = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = taskc->core_sched_seq;
+ qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -384,14 +375,14 @@ static bool dispatch_highpri(bool from_timer)
SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) {
dispatched = true;
- __sync_fetch_and_add(&nr_expedited_local, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_local, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_remote, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
}
if (from_timer)
- __sync_fetch_and_add(&nr_expedited_from_timer, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_lost, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
}
if (dispatched)
@@ -404,19 +395,19 @@ static bool dispatch_highpri(bool from_timer)
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc;
struct task_ctx *taskc;
- u32 zero = 0, batch = dsp_batch ?: 1;
+ u32 batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
if (dispatch_highpri(false))
return;
- if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
+ if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
return;
- if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+ if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
/*
* PID 2 should be kthreadd which should mostly be idle and off
* the scheduler. Let's keep dispatching it to force the kernel
@@ -430,10 +421,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
}
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
+ cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
@@ -442,9 +430,11 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
+ u64 dsp_idx = cpuc->dsp_idx;
+
+ fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
if (!fifo) {
- scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
+ scx_bpf_error("failed to find ring %llu", dsp_idx);
return;
}
@@ -465,10 +455,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
if (taskc->highpri)
- __sync_fetch_and_sub(&nr_highpri_queued, 1);
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
update_core_sched_head_seq(p);
- __sync_fetch_and_add(&nr_dispatched, 1);
+ __sync_fetch_and_add(&qa.nr_dispatched, 1);
scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
@@ -529,8 +519,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] &&
- scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ if (qa.sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
return;
}
@@ -546,21 +536,15 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
taskc->core_sched_seq =
- core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
- struct cpu_ctx *cpuc;
- u32 zero = 0;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
int idx;
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
-
/*
* Use the running avg of weights to select the target cpuperf level.
* This is a demonstration of the cpuperf feature rather than a
@@ -589,7 +573,7 @@ static s64 task_qdist(struct task_struct *p)
return 0;
}
- qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
@@ -679,13 +663,10 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
- u32 zero = 0;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cpu];
if (suppress_dump || idle)
return;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
- return;
scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
@@ -802,7 +783,7 @@ struct {
*/
static void monitor_cpuperf(void)
{
- u32 zero = 0, nr_cpu_ids;
+ u32 nr_cpu_ids;
u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
const struct cpumask *online;
@@ -812,7 +793,7 @@ static void monitor_cpuperf(void)
online = scx_bpf_get_online_cpumask();
bpf_for(i, 0, nr_cpu_ids) {
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[i];
u32 cap, cur;
if (!bpf_cpumask_test_cpu(i, online))
@@ -834,11 +815,6 @@ static void monitor_cpuperf(void)
cur_sum += cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
- scx_bpf_error("failed to look up cpu_ctx");
- goto out;
- }
-
/* collect target */
cur = cpuc->cpuperf_target;
target_sum += cur;
@@ -846,14 +822,14 @@ static void monitor_cpuperf(void)
target_max = cur > target_max ? cur : target_max;
}
- cpuperf_min = cur_min;
- cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
- cpuperf_max = cur_max;
+ qa.cpuperf_min = cur_min;
+ qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+ qa.cpuperf_max = cur_max;
+
+ qa.cpuperf_target_min = target_min;
+ qa.cpuperf_target_avg = target_sum / nr_online_cpus;
+ qa.cpuperf_target_max = target_max;
- cpuperf_target_min = target_min;
- cpuperf_target_avg = target_sum / nr_online_cpus;
- cpuperf_target_max = target_max;
-out:
scx_bpf_put_cpumask(online);
}
@@ -996,8 +972,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (!sub_sched_cgroup_ids[i]) {
- sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+ if (!qa.sub_sched_cgroup_ids[i]) {
+ qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
bpf_printk("attaching sub-sched[%d] on %s",
i, args->cgroup_path);
return 0;
@@ -1012,8 +988,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
- sub_sched_cgroup_ids[i] = 0;
+ if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ qa.sub_sched_cgroup_ids[i] = 0;
bpf_printk("detaching sub-sched[%d] on %s",
i, args->cgroup_path);
break;
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index e7c89a2bc3d8..8844499c14c4 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -10,9 +10,11 @@
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
+#include <sys/mman.h>
#include <sys/stat.h>
#include <bpf/bpf.h>
#include <scx/common.h>
+#include "scx_qmap.h"
#include "scx_qmap.bpf.skel.h"
const char help_fmt[] =
@@ -60,6 +62,8 @@ int main(int argc, char **argv)
{
struct scx_qmap *skel;
struct bpf_link *link;
+ struct qmap_arena *qa;
+ __u32 test_error_cnt = 0;
int opt;
libbpf_set_print(libbpf_print_fn);
@@ -76,7 +80,7 @@ int main(int argc, char **argv)
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'e':
- skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+ test_error_cnt = strtoul(optarg, NULL, 0);
break;
case 't':
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
@@ -142,29 +146,32 @@ int main(int argc, char **argv)
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
+ qa = &skel->arena->qa;
+ qa->test_error_cnt = test_error_cnt;
+
while (!exit_req && !UEI_EXITED(skel, uei)) {
- long nr_enqueued = skel->bss->nr_enqueued;
- long nr_dispatched = skel->bss->nr_dispatched;
+ long nr_enqueued = qa->nr_enqueued;
+ long nr_dispatched = qa->nr_dispatched;
- printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
- skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
- skel->bss->nr_dequeued,
- skel->bss->nr_core_sched_execed,
- skel->bss->nr_ddsp_from_enq);
- printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
- skel->bss->nr_expedited_local,
- skel->bss->nr_expedited_remote,
- skel->bss->nr_expedited_from_timer,
- skel->bss->nr_expedited_lost);
+ qa->nr_reenqueued, qa->nr_reenqueued_cpu0,
+ qa->nr_dequeued,
+ qa->nr_core_sched_execed,
+ qa->nr_ddsp_from_enq);
+ printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
+ qa->nr_expedited_local,
+ qa->nr_expedited_remote,
+ qa->nr_expedited_from_timer,
+ qa->nr_expedited_lost);
if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
- skel->bss->cpuperf_min,
- skel->bss->cpuperf_avg,
- skel->bss->cpuperf_max,
- skel->bss->cpuperf_target_min,
- skel->bss->cpuperf_target_avg,
- skel->bss->cpuperf_target_max);
+ qa->cpuperf_min,
+ qa->cpuperf_avg,
+ qa->cpuperf_max,
+ qa->cpuperf_target_min,
+ qa->cpuperf_target_avg,
+ qa->cpuperf_target_max);
fflush(stdout);
sleep(1);
}
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
new file mode 100644
index 000000000000..e0e19af6dcb3
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
+ *
+ * The scheduler keeps all state in a single BPF arena map. struct
+ * qmap_arena is the one object that lives at the base of the arena and is
+ * mmap'd into userspace so the loader can read counters directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_QMAP_H
+#define __SCX_QMAP_H
+
+#ifdef __BPF__
+#include <scx/bpf_arena_common.bpf.h>
+#else
+#include <linux/types.h>
+#include <scx/bpf_arena_common.h>
+#endif
+
+#define MAX_SUB_SCHEDS 8
+
+/*
+ * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
+ * userspace. Keep this in sync with NR_CPUS used by the BPF side.
+ */
+#define SCX_QMAP_MAX_CPUS 1024
+
+struct cpu_ctx {
+ __u64 dsp_idx; /* dispatch index */
+ __u64 dsp_cnt; /* remaining count */
+ __u32 avg_weight;
+ __u32 cpuperf_target;
+};
+
+struct qmap_arena {
+ /* userspace-visible stats */
+ __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
+ __u64 nr_dequeued, nr_ddsp_from_enq;
+ __u64 nr_core_sched_execed;
+ __u64 nr_expedited_local, nr_expedited_remote;
+ __u64 nr_expedited_lost, nr_expedited_from_timer;
+ __u64 nr_highpri_queued;
+ __u32 test_error_cnt;
+ __u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+ __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
+ /* kernel-side runtime state */
+ __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+ __u64 core_sched_head_seqs[5];
+ __u64 core_sched_tail_seqs[5];
+
+ struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+};
+
+#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab
2026-04-16 17:20 [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 17:20 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
2026-04-16 17:20 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
@ 2026-04-16 17:20 ` Tejun Heo
2026-04-16 17:20 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 17:48 ` [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
4 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2026-04-16 17:20 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Allocate per-task context from an arena slab instead of storing it
directly in task_storage. task_ctx_stor now holds an arena pointer to
the task's slab entry. Free entries form a singly-linked list protected
by bpf_res_spin_lock; slab exhaustion triggers scx_bpf_error().
The slab size is configurable via the new -N option (default 16384).
Also add bpf_res_spin_lock/unlock declarations to common.bpf.h.
Scheduling logic unchanged.
v2: Add task_ctx_t typedef for struct task_ctx __arena (Emil).
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
---
tools/sched_ext/include/scx/common.bpf.h | 4 +
tools/sched_ext/scx_qmap.bpf.c | 181 ++++++++++++++++++-----
tools/sched_ext/scx_qmap.c | 9 +-
tools/sched_ext/scx_qmap.h | 7 +
4 files changed, 162 insertions(+), 39 deletions(-)
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 19459dedde41..35fc62556241 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -526,6 +526,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;
+/* resilient qspinlock */
+int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
+void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
+
/*
* Time helpers, most of which are from jiffies.h.
*/
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 184a3a729d21..57ce95a306cc 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -49,6 +49,7 @@ const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
+const volatile u32 max_tasks;
UEI_DEFINE(uei);
@@ -117,20 +118,46 @@ static const u32 qidx_to_cpuperf_target[] = {
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-/* Per-task scheduling context */
+/*
+ * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
+ * arena. While the task is alive the entry is referenced from task_ctx_stor;
+ * while it's free the entry sits on the free list singly-linked through
+ * @next_free.
+ */
struct task_ctx {
- bool force_local; /* Dispatch directly to local_dsq */
- bool highpri;
- u64 core_sched_seq;
+ struct task_ctx __arena *next_free; /* only valid on free list */
+ bool force_local; /* Dispatch directly to local_dsq */
+ bool highpri;
+ u64 core_sched_seq;
+};
+
+/* All task_ctx pointers are arena pointers. */
+typedef struct task_ctx __arena task_ctx_t;
+
+/* Holds an arena pointer to the task's slab entry. */
+struct task_ctx_stor_val {
+ task_ctx_t *taskc;
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
- __type(value, struct task_ctx);
+ __type(value, struct task_ctx_stor_val);
} task_ctx_stor SEC(".maps");
+/* Protects the task_ctx slab free list. */
+__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
+
+static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ if (bpf_res_spin_lock(lock)) {
+ scx_bpf_error("res_spin_lock failed");
+ return -EBUSY;
+ }
+ return 0;
+}
+
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
s32 cpu;
@@ -148,21 +175,34 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
return -1;
}
-static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+/*
+ * Force a reference to the arena map. The verifier associates an arena with
+ * a program by finding an LD_IMM64 instruction that loads the arena's BPF
+ * map; programs that only use arena pointers returned from task-local
+ * storage (like qmap_select_cpu) never reference @arena directly. Without
+ * this, the verifier rejects addr_space_cast with "addr_space_cast insn
+ * can only be used in a program that has an associated arena".
+ */
+#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
+
+static task_ctx_t *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+
+ QMAP_TOUCH_ARENA();
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!v || !v->taskc) {
scx_bpf_error("task_ctx lookup failed");
return NULL;
}
- return taskc;
+ return v->taskc;
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
- struct task_ctx *taskc;
+ task_ctx_t *taskc;
s32 cpu;
if (!(taskc = lookup_task_ctx(p)))
@@ -199,7 +239,7 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *taskc;
+ task_ctx_t *taskc;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
@@ -321,7 +361,7 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *taskc;
+ task_ctx_t *taskc;
if ((taskc = lookup_task_ctx(p)))
qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
@@ -345,7 +385,7 @@ static bool dispatch_highpri(bool from_timer)
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *taskc;
+ task_ctx_t *taskc;
if (!(taskc = lookup_task_ctx(p)))
return false;
@@ -396,7 +436,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx __arena *cpuc;
- struct task_ctx *taskc;
+ task_ctx_t *taskc;
u32 batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
@@ -440,7 +480,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *taskc;
+ task_ctx_t *taskc;
if (bpf_map_pop_elem(fifo, &pid))
break;
@@ -529,11 +569,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!taskc) {
- scx_bpf_error("task_ctx lookup failed");
+ taskc = lookup_task_ctx(prev);
+ if (!taskc)
return;
- }
taskc->core_sched_seq =
qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
@@ -564,14 +602,12 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *taskc;
+ task_ctx_t *taskc;
s64 qdist;
- taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!taskc) {
- scx_bpf_error("task_ctx lookup failed");
+ taskc = lookup_task_ctx(p);
+ if (!taskc)
return 0;
- }
qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
@@ -606,21 +642,64 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
* tasks when a higher-priority scheduling class takes the CPU.
*/
-s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
- struct scx_init_task_args *args)
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
{
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
+
if (p->tgid == disallow_tgid)
p->scx.disallow = true;
- /*
- * @p is new. Let's ensure that its task_ctx is available. We can sleep
- * in this function and the following will automatically use GFP_KERNEL.
- */
- if (bpf_task_storage_get(&task_ctx_stor, p, 0,
- BPF_LOCAL_STORAGE_GET_F_CREATE))
- return 0;
- else
+ /* pop a slab entry off the free list */
+ if (qmap_spin_lock(&qa_task_lock))
+ return -EBUSY;
+ taskc = qa.task_free_head;
+ if (taskc)
+ qa.task_free_head = taskc->next_free;
+ bpf_res_spin_unlock(&qa_task_lock);
+ if (!taskc) {
+ scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
+ return -ENOMEM;
+ }
+
+ taskc->next_free = NULL;
+ taskc->force_local = false;
+ taskc->highpri = false;
+ taskc->core_sched_seq = 0;
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!v) {
+ /* push back to the free list */
+ if (!qmap_spin_lock(&qa_task_lock)) {
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
+ }
return -ENOMEM;
+ }
+ v->taskc = taskc;
+ return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
+ struct scx_exit_task_args *args)
+{
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
+ return;
+ taskc = v->taskc;
+ v->taskc = NULL;
+
+ if (qmap_spin_lock(&qa_task_lock))
+ return;
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
}
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
@@ -675,12 +754,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+ task_ctx_t *taskc;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
return;
+ taskc = v->taskc;
scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
taskc->force_local, taskc->core_sched_seq);
@@ -915,10 +999,32 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
- u32 key = 0;
+ task_ctx_t *slab;
+ u32 nr_pages, key = 0, i;
struct bpf_timer *timer;
s32 ret;
+ /*
+ * Allocate the task_ctx slab in arena and thread the entire slab onto
+ * the free list. max_tasks is set by userspace before load.
+ */
+ if (!max_tasks) {
+ scx_bpf_error("max_tasks must be > 0");
+ return -EINVAL;
+ }
+
+ nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
+ slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
+ if (!slab) {
+ scx_bpf_error("failed to allocate task_ctx slab");
+ return -ENOMEM;
+ }
+ qa.task_ctxs = slab;
+
+ bpf_for(i, 0, max_tasks)
+ slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
+ qa.task_free_head = &slab[0];
+
if (print_msgs && !sub_cgroup_id)
print_cpus();
@@ -1005,6 +1111,7 @@ SCX_OPS_DEFINE(qmap_ops,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
.init_task = (void *)qmap_init_task,
+ .exit_task = (void *)qmap_exit_task,
.dump = (void *)qmap_dump,
.dump_cpu = (void *)qmap_dump_cpu,
.dump_task = (void *)qmap_dump_task,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 8844499c14c4..4bdcc4bc5fbd 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -23,12 +23,13 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
+" [-N COUNT] [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
+" -N COUNT Size of the task_ctx arena slab (default 16384)\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -P Print out DSQ content and event counters to trace_pipe every second\n"
@@ -73,8 +74,9 @@ int main(int argc, char **argv)
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+ skel->rodata->max_tasks = 16384;
- while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:vh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -94,6 +96,9 @@ int main(int argc, char **argv)
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
+ case 'N':
+ skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
+ break;
case 'P':
skel->rodata->print_dsqs_and_events = true;
break;
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index e0e19af6dcb3..5beaec82a5db 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -34,6 +34,9 @@ struct cpu_ctx {
__u32 cpuperf_target;
};
+/* Opaque to userspace; defined in scx_qmap.bpf.c. */
+struct task_ctx;
+
struct qmap_arena {
/* userspace-visible stats */
__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -52,6 +55,10 @@ struct qmap_arena {
__u64 core_sched_tail_seqs[5];
struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+
+ /* task_ctx slab; allocated and threaded by qmap_init() */
+ struct task_ctx __arena *task_ctxs;
+ struct task_ctx __arena *task_free_head;
};
#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 17:20 [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
` (2 preceding siblings ...)
2026-04-16 17:20 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
@ 2026-04-16 17:20 ` Tejun Heo
2026-04-16 17:48 ` [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
4 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2026-04-16 17:20 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
head/tail pointers and its own per-queue bpf_res_spin_lock.
qmap_dequeue() now properly removes tasks from the queue instead of
leaving stale entries for dispatch to skip.
v2:
- Remove duplicate QMAP_TOUCH_ARENA() in qmap_dump_task (Andrea).
- Update file-level description for arena-backed lists (Andrea).
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
---
tools/sched_ext/scx_qmap.bpf.c | 227 ++++++++++++++++++++-------------
tools/sched_ext/scx_qmap.h | 9 ++
2 files changed, 150 insertions(+), 86 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 57ce95a306cc..480ae934a526 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -2,15 +2,16 @@
/*
* A simple five-level FIFO queue scheduler.
*
- * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
- * assigned to one depending on its compound weight. Each CPU round robins
- * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
- * queue0, 2 from queue1, 4 from queue2 and so on.
+ * There are five FIFOs implemented as arena-backed doubly-linked lists
+ * threaded through per-task context. A task gets assigned to one depending on
+ * its compound weight. Each CPU round robins through the FIFOs and dispatches
+ * more from FIFOs with higher indices - 1 from queue0, 2 from queue1, 4 from
+ * queue2 and so on.
*
* This scheduler demonstrates:
*
* - BPF-side queueing using PIDs.
- * - Sleepable per-task storage allocation using ops.prep_enable().
+ * - BPF arena for scheduler state.
* - Core-sched support.
*
* This scheduler is primarily for demonstration and testing of sched_ext
@@ -71,31 +72,24 @@ struct {
struct qmap_arena __arena qa;
-struct qmap {
- __uint(type, BPF_MAP_TYPE_QUEUE);
- __uint(max_entries, 4096);
- __type(value, u32);
-} queue0 SEC(".maps"),
- queue1 SEC(".maps"),
- queue2 SEC(".maps"),
- queue3 SEC(".maps"),
- queue4 SEC(".maps"),
- dump_store SEC(".maps");
+/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
+__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
+__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
+__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
+__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
+__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
- __uint(max_entries, 5);
- __type(key, int);
- __array(values, struct qmap);
-} queue_arr SEC(".maps") = {
- .values = {
- [0] = &queue0,
- [1] = &queue1,
- [2] = &queue2,
- [3] = &queue3,
- [4] = &queue4,
- },
-};
+static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
+{
+ switch (qid) {
+ case 0: return &qa_q_lock0;
+ case 1: return &qa_q_lock1;
+ case 2: return &qa_q_lock2;
+ case 3: return &qa_q_lock3;
+ case 4: return &qa_q_lock4;
+ default: return NULL;
+ }
+}
/*
* If enabled, CPU performance target is set according to the queue index
@@ -123,9 +117,17 @@ static const u32 qidx_to_cpuperf_target[] = {
* arena. While the task is alive the entry is referenced from task_ctx_stor;
* while it's free the entry sits on the free list singly-linked through
* @next_free.
+ *
+ * When the task is queued on one of the five priority FIFOs, @q_idx is the
+ * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
+ * @q_idx is -1 when the task isn't on any queue.
*/
struct task_ctx {
struct task_ctx __arena *next_free; /* only valid on free list */
+ struct task_ctx __arena *q_next; /* queue link, NULL if tail */
+ struct task_ctx __arena *q_prev; /* queue link, NULL if head */
+ struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
+ s32 pid;
bool force_local; /* Dispatch directly to local_dsq */
bool highpri;
u64 core_sched_seq;
@@ -199,6 +201,80 @@ static task_ctx_t *lookup_task_ctx(struct task_struct *p)
return v->taskc;
}
+/* Append @taskc to the tail of @fifo. Must not already be queued. */
+static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo, task_ctx_t *taskc)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+
+ if (!lock || qmap_spin_lock(lock))
+ return;
+ taskc->fifo = fifo;
+ taskc->q_next = NULL;
+ taskc->q_prev = fifo->tail;
+ if (fifo->tail)
+ fifo->tail->q_next = taskc;
+ else
+ fifo->head = taskc;
+ fifo->tail = taskc;
+ bpf_res_spin_unlock(lock);
+}
+
+/* Pop the head of @fifo. Returns NULL if empty. */
+static task_ctx_t *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+ task_ctx_t *taskc;
+
+ if (!lock || qmap_spin_lock(lock))
+ return NULL;
+ taskc = fifo->head;
+ if (taskc) {
+ fifo->head = taskc->q_next;
+ if (taskc->q_next)
+ taskc->q_next->q_prev = NULL;
+ else
+ fifo->tail = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ }
+ bpf_res_spin_unlock(lock);
+ return taskc;
+}
+
+/* Remove @taskc from its fifo. No-op if not queued. */
+static void qmap_fifo_remove(task_ctx_t *taskc)
+{
+ struct qmap_fifo __arena *fifo = taskc->fifo;
+ struct bpf_res_spin_lock *lock;
+
+ if (!fifo)
+ return;
+
+ lock = qa_q_lock(fifo->idx);
+ if (!lock || qmap_spin_lock(lock))
+ return;
+
+ /* Re-check under lock — a concurrent pop may have cleared taskc->fifo or moved the task. */
+ if (taskc->fifo != fifo) {
+ bpf_res_spin_unlock(lock);
+ return;
+ }
+
+ if (taskc->q_next)
+ taskc->q_next->q_prev = taskc->q_prev;
+ else
+ fifo->tail = taskc->q_prev;
+ if (taskc->q_prev)
+ taskc->q_prev->q_next = taskc->q_next;
+ else
+ fifo->head = taskc->q_next;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ bpf_res_spin_unlock(lock);
+}
+
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@@ -240,9 +316,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
task_ctx_t *taskc;
- u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
- void *ring;
s32 cpu;
if (enq_flags & SCX_ENQ_REENQ) {
@@ -328,17 +402,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- ring = bpf_map_lookup_elem(&queue_arr, &idx);
- if (!ring) {
- scx_bpf_error("failed to find ring %d", idx);
- return;
- }
-
- /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
- if (bpf_map_push_elem(ring, &pid, 0)) {
- scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
- return;
- }
+ /* Queue on the selected FIFO. */
+ qmap_fifo_enqueue(&qa.fifos[idx], taskc);
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
taskc->highpri = true;
@@ -347,15 +412,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
__sync_fetch_and_add(&qa.nr_enqueued, 1);
}
-/*
- * The BPF queue map doesn't support removal and sched_ext can handle spurious
- * dispatches. qmap_dequeue() is only used to collect statistics.
- */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
+ task_ctx_t *taskc;
+
__sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
__sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
+
+ taskc = lookup_task_ctx(p);
+ if (taskc && taskc->fifo) {
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
+ qmap_fifo_remove(taskc);
+ }
}
static void update_core_sched_head_seq(struct task_struct *p)
@@ -438,8 +508,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
struct cpu_ctx __arena *cpuc;
task_ctx_t *taskc;
u32 batch = dsp_batch ?: 1;
- void *fifo;
- s32 i, pid;
+ s32 i;
if (dispatch_highpri(false))
return;
@@ -470,30 +539,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- u64 dsp_idx = cpuc->dsp_idx;
-
- fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
- if (!fifo) {
- scx_bpf_error("failed to find ring %llu", dsp_idx);
- return;
- }
-
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
task_ctx_t *taskc;
- if (bpf_map_pop_elem(fifo, &pid))
+ taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
+ if (!taskc)
break;
- p = bpf_task_from_pid(pid);
+ p = bpf_task_from_pid(taskc->pid);
if (!p)
continue;
- if (!(taskc = lookup_task_ctx(p))) {
- bpf_task_release(p);
- return;
- }
-
if (taskc->highpri)
__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
@@ -664,6 +721,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
}
taskc->next_free = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ taskc->pid = p->pid;
taskc->force_local = false;
taskc->highpri = false;
taskc->core_sched_seq = 0;
@@ -704,38 +765,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
- s32 i, pid;
+ task_ctx_t *taskc;
+ s32 i;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
+ /*
+ * Walk the queue lists without locking — kfunc calls (scx_bpf_dump)
+ * aren't in the verifier's kfunc_spin_allowed() list, so we can't
+ * hold a lock while dumping. Best-effort: racing may print stale
+ * PIDs, but the walk is bounded by bpf_repeat() so it terminates.
+ */
bpf_for(i, 0, 5) {
- void *fifo;
-
- if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
- return;
-
scx_bpf_dump("QMAP FIFO[%d]:", i);
-
- /*
- * Dump can be invoked anytime and there is no way to iterate in
- * a non-destructive way. Pop and store in dump_store and then
- * restore afterwards. If racing against new enqueues, ordering
- * can get mixed up.
- */
+ taskc = qa.fifos[i].head;
bpf_repeat(4096) {
- if (bpf_map_pop_elem(fifo, &pid))
+ if (!taskc)
break;
- bpf_map_push_elem(&dump_store, &pid, 0);
- scx_bpf_dump(" %d", pid);
+ scx_bpf_dump(" %d", taskc->pid);
+ taskc = taskc->q_next;
}
-
- bpf_repeat(4096) {
- if (bpf_map_pop_elem(&dump_store, &pid))
- break;
- bpf_map_push_elem(fifo, &pid, 0);
- }
-
scx_bpf_dump("\n");
}
}
@@ -1021,6 +1073,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
}
qa.task_ctxs = slab;
+ bpf_for(i, 0, 5)
+ qa.fifos[i].idx = i;
+
bpf_for(i, 0, max_tasks)
slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
qa.task_free_head = &slab[0];
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index 5beaec82a5db..9d9af2ad90c6 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -37,6 +37,12 @@ struct cpu_ctx {
/* Opaque to userspace; defined in scx_qmap.bpf.c. */
struct task_ctx;
+struct qmap_fifo {
+ struct task_ctx __arena *head;
+ struct task_ctx __arena *tail;
+ __s32 idx;
+};
+
struct qmap_arena {
/* userspace-visible stats */
__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -59,6 +65,9 @@ struct qmap_arena {
/* task_ctx slab; allocated and threaded by qmap_init() */
struct task_ctx __arena *task_ctxs;
struct task_ctx __arena *task_free_head;
+
+ /* five priority FIFOs, each a doubly-linked list through task_ctx */
+ struct qmap_fifo fifos[5];
};
#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena
2026-04-16 17:20 [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
` (3 preceding siblings ...)
2026-04-16 17:20 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 17:48 ` Tejun Heo
4 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2026-04-16 17:48 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Hello,
> Tejun Heo (4):
> sched_ext: scx_qmap: rename tctx to taskc
> sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
> sched_ext: scx_qmap: move task_ctx into a BPF arena slab
> sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
Applied 1-4 to sched_ext/for-7.2.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2026-04-16 17:48 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-16 17:20 [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 17:20 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
2026-04-16 17:20 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
2026-04-16 17:20 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
2026-04-16 17:20 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 17:48 ` [PATCHSET v2 sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
-- strict thread matches above, loose matches on Subject: below --
2026-04-16 8:16 [PATCHSET " Tejun Heo
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox