* [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 14:56 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
` (3 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Rename the per-task context local variable from tctx to taskc for
consistency.
No functional change.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
1 file changed, 30 insertions(+), 30 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index b68abb9e760b..a18234f3c27a 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
scx_bpf_error("task_ctx lookup failed");
return NULL;
}
- return tctx;
+ return taskc;
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s32 cpu;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return -ESRCH;
if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
@@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
cpu = pick_direct_dispatch_cpu(p, prev_cpu);
if (cpu >= 0) {
- tctx->force_local = true;
+ taskc->force_local = true;
return cpu;
} else {
return prev_cpu;
@@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
@@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return;
/*
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
static u32 immed_stress_cnt;
if (!(++immed_stress_cnt % immed_stress_nth)) {
- tctx->force_local = false;
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
slice_ns, enq_flags);
return;
@@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* If qmap_select_cpu() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
- if (tctx->force_local) {
- tctx->force_local = false;
+ if (taskc->force_local) {
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
@@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
}
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
- tctx->highpri = true;
+ taskc->highpri = true;
__sync_fetch_and_add(&nr_highpri_queued, 1);
}
__sync_fetch_and_add(&nr_enqueued, 1);
@@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if ((tctx = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = tctx->core_sched_seq;
+ if ((taskc = lookup_task_ctx(p)))
+ core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return false;
- if (tctx->highpri) {
+ if (taskc->highpri) {
/* exercise the set_*() and vtime interface too */
scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
@@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx *cpuc;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
@@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
if (bpf_map_pop_elem(fifo, &pid))
break;
@@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (!p)
continue;
- if (!(tctx = lookup_task_ctx(p))) {
+ if (!(taskc = lookup_task_ctx(p))) {
bpf_task_release(p);
return;
}
- if (tctx->highpri)
+ if (taskc->highpri)
__sync_fetch_and_sub(&nr_highpri_queued, 1);
update_core_sched_head_seq(p);
@@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
}
- tctx->core_sched_seq =
+ taskc->core_sched_seq =
core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
@@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s64 qdist;
- tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return 0;
}
- qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
@ 2026-04-16 14:56 ` Emil Tsalapatis
0 siblings, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 14:56 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:16 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Rename the per-task context local variable from tctx to taskc for
> consistency.
>
> No functional change.
>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
> 1 file changed, 30 insertions(+), 30 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index b68abb9e760b..a18234f3c27a 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
>
> static struct task_ctx *lookup_task_ctx(struct task_struct *p)
> {
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> - if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> + if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> scx_bpf_error("task_ctx lookup failed");
> return NULL;
> }
> - return tctx;
> + return taskc;
> }
>
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> s32 cpu;
>
> - if (!(tctx = lookup_task_ctx(p)))
> + if (!(taskc = lookup_task_ctx(p)))
> return -ESRCH;
>
> if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
> @@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> cpu = pick_direct_dispatch_cpu(p, prev_cpu);
>
> if (cpu >= 0) {
> - tctx->force_local = true;
> + taskc->force_local = true;
> return cpu;
> } else {
> return prev_cpu;
> @@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
> void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> {
> static u32 user_cnt, kernel_cnt;
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> void *ring;
> @@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> if (test_error_cnt && !--test_error_cnt)
> scx_bpf_error("test triggering error");
>
> - if (!(tctx = lookup_task_ctx(p)))
> + if (!(taskc = lookup_task_ctx(p)))
> return;
>
> /*
> * All enqueued tasks must have their core_sched_seq updated for correct
> * core-sched ordering. Also, take a look at the end of qmap_dispatch().
> */
> - tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
> + taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
>
> /*
> * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
> @@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> static u32 immed_stress_cnt;
>
> if (!(++immed_stress_cnt % immed_stress_nth)) {
> - tctx->force_local = false;
> + taskc->force_local = false;
> scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
> slice_ns, enq_flags);
> return;
> @@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> * If qmap_select_cpu() is telling us to or this is the last runnable
> * task on the CPU, enqueue locally.
> */
> - if (tctx->force_local) {
> - tctx->force_local = false;
> + if (taskc->force_local) {
> + taskc->force_local = false;
> scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
> return;
> }
> @@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> }
>
> if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> - tctx->highpri = true;
> + taskc->highpri = true;
> __sync_fetch_and_add(&nr_highpri_queued, 1);
> }
> __sync_fetch_and_add(&nr_enqueued, 1);
> @@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> static void update_core_sched_head_seq(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> - if ((tctx = lookup_task_ctx(p)))
> - core_sched_head_seqs[idx] = tctx->core_sched_seq;
> + if ((taskc = lookup_task_ctx(p)))
> + core_sched_head_seqs[idx] = taskc->core_sched_seq;
> }
>
> /*
> @@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
> /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
> bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
> static u64 highpri_seq;
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> - if (!(tctx = lookup_task_ctx(p)))
> + if (!(taskc = lookup_task_ctx(p)))
> return false;
>
> - if (tctx->highpri) {
> + if (taskc->highpri) {
> /* exercise the set_*() and vtime interface too */
> scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
> scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
> @@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> {
> struct task_struct *p;
> struct cpu_ctx *cpuc;
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> u32 zero = 0, batch = dsp_batch ?: 1;
> void *fifo;
> s32 i, pid;
> @@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>
> /* Dispatch or advance. */
> bpf_repeat(BPF_MAX_LOOPS) {
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> if (bpf_map_pop_elem(fifo, &pid))
> break;
> @@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> if (!p)
> continue;
>
> - if (!(tctx = lookup_task_ctx(p))) {
> + if (!(taskc = lookup_task_ctx(p))) {
> bpf_task_release(p);
> return;
> }
>
> - if (tctx->highpri)
> + if (taskc->highpri)
> __sync_fetch_and_sub(&nr_highpri_queued, 1);
>
> update_core_sched_head_seq(p);
> @@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> * if the task were enqueued and dispatched immediately.
> */
> if (prev) {
> - tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> - if (!tctx) {
> + taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> + if (!taskc) {
> scx_bpf_error("task_ctx lookup failed");
> return;
> }
>
> - tctx->core_sched_seq =
> + taskc->core_sched_seq =
> core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> }
> }
> @@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
> static s64 task_qdist(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> s64 qdist;
>
> - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> - if (!tctx) {
> + taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> + if (!taskc) {
> scx_bpf_error("task_ctx lookup failed");
> return 0;
> }
>
> - qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
> + qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
>
> /*
> * As queue index increments, the priority doubles. The queue w/ index 3
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 15:28 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
` (2 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Move mutable scheduler state from BSS globals and a percpu array map
into a single BPF arena map. A shared struct qmap_arena is declared as
an __arena global so BPF accesses it directly and userspace reaches it
through skel->arena->qa.
Scheduling logic unchanged; only memory backing changes.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/scx_qmap.bpf.c | 152 ++++++++++++++-------------------
tools/sched_ext/scx_qmap.c | 45 +++++-----
tools/sched_ext/scx_qmap.h | 57 +++++++++++++
3 files changed, 147 insertions(+), 107 deletions(-)
create mode 100644 tools/sched_ext/scx_qmap.h
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index a18234f3c27a..0f8fbb6d0bc2 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -22,6 +22,8 @@
*/
#include <scx/common.bpf.h>
+#include "scx_qmap.h"
+
enum consts {
ONE_SEC_IN_NS = 1000000000,
ONE_MSEC_IN_NS = 1000000,
@@ -48,14 +50,26 @@ const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
-u64 nr_highpri_queued;
-u32 test_error_cnt;
-
-#define MAX_SUB_SCHEDS 8
-u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
-
UEI_DEFINE(uei);
+/*
+ * All mutable scheduler state - per-cpu context, stats counters, core-sched
+ * sequence numbers, sub-sched cgroup ids - lives in this single BPF arena map.
+ * Userspace reaches it via skel->arena->qa.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 1 << 16); /* upper bound in pages */
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+ __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
+#else
+ __ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+struct qmap_arena __arena qa;
+
struct qmap {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(max_entries, 4096);
@@ -102,8 +116,6 @@ static const u32 qidx_to_cpuperf_target[] = {
* task's seq and the associated queue's head seq is called the queue distance
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-static u64 core_sched_head_seqs[5];
-static u64 core_sched_tail_seqs[5];
/* Per-task scheduling context */
struct task_ctx {
@@ -119,27 +131,6 @@ struct {
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
-struct cpu_ctx {
- u64 dsp_idx; /* dispatch index */
- u64 dsp_cnt; /* remaining count */
- u32 avg_weight;
- u32 cpuperf_target;
-};
-
-struct {
- __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
- __uint(max_entries, 1);
- __type(key, u32);
- __type(value, struct cpu_ctx);
-} cpu_ctx_stor SEC(".maps");
-
-/* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
-u64 nr_core_sched_execed;
-u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
-u32 cpuperf_min, cpuperf_avg, cpuperf_max;
-u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
-
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
s32 cpu;
@@ -215,9 +206,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
s32 cpu;
if (enq_flags & SCX_ENQ_REENQ) {
- __sync_fetch_and_add(&nr_reenqueued, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued, 1);
if (scx_bpf_task_cpu(p) == 0)
- __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued_cpu0, 1);
}
if (p->flags & PF_KTHREAD) {
@@ -228,7 +219,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- if (test_error_cnt && !--test_error_cnt)
+ if (qa.test_error_cnt && !--qa.test_error_cnt)
scx_bpf_error("test triggering error");
if (!(taskc = lookup_task_ctx(p)))
@@ -238,7 +229,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -276,7 +267,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
/* if select_cpu() wasn't called, try direct dispatch */
if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
- __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
+ __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
return;
}
@@ -311,9 +302,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
taskc->highpri = true;
- __sync_fetch_and_add(&nr_highpri_queued, 1);
+ __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
}
- __sync_fetch_and_add(&nr_enqueued, 1);
+ __sync_fetch_and_add(&qa.nr_enqueued, 1);
}
/*
@@ -322,9 +313,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
*/
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
- __sync_fetch_and_add(&nr_dequeued, 1);
+ __sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
- __sync_fetch_and_add(&nr_core_sched_execed, 1);
+ __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
}
static void update_core_sched_head_seq(struct task_struct *p)
@@ -333,7 +324,7 @@ static void update_core_sched_head_seq(struct task_struct *p)
struct task_ctx *taskc;
if ((taskc = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = taskc->core_sched_seq;
+ qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -384,14 +375,14 @@ static bool dispatch_highpri(bool from_timer)
SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) {
dispatched = true;
- __sync_fetch_and_add(&nr_expedited_local, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_local, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_remote, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
}
if (from_timer)
- __sync_fetch_and_add(&nr_expedited_from_timer, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_lost, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
}
if (dispatched)
@@ -404,19 +395,19 @@ static bool dispatch_highpri(bool from_timer)
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc;
struct task_ctx *taskc;
- u32 zero = 0, batch = dsp_batch ?: 1;
+ u32 batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
if (dispatch_highpri(false))
return;
- if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
+ if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
return;
- if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+ if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
/*
* PID 2 should be kthreadd which should mostly be idle and off
* the scheduler. Let's keep dispatching it to force the kernel
@@ -430,10 +421,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
}
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
+ cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
@@ -442,9 +430,11 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
+ u64 dsp_idx = cpuc->dsp_idx;
+
+ fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
if (!fifo) {
- scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
+ scx_bpf_error("failed to find ring %llu", dsp_idx);
return;
}
@@ -465,10 +455,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
if (taskc->highpri)
- __sync_fetch_and_sub(&nr_highpri_queued, 1);
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
update_core_sched_head_seq(p);
- __sync_fetch_and_add(&nr_dispatched, 1);
+ __sync_fetch_and_add(&qa.nr_dispatched, 1);
scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
@@ -529,8 +519,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] &&
- scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ if (qa.sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
return;
}
@@ -546,21 +536,15 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
taskc->core_sched_seq =
- core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
- struct cpu_ctx *cpuc;
- u32 zero = 0;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
int idx;
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
-
/*
* Use the running avg of weights to select the target cpuperf level.
* This is a demonstration of the cpuperf feature rather than a
@@ -589,7 +573,7 @@ static s64 task_qdist(struct task_struct *p)
return 0;
}
- qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
@@ -679,13 +663,10 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
- u32 zero = 0;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cpu];
if (suppress_dump || idle)
return;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
- return;
scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
@@ -802,7 +783,7 @@ struct {
*/
static void monitor_cpuperf(void)
{
- u32 zero = 0, nr_cpu_ids;
+ u32 nr_cpu_ids;
u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
const struct cpumask *online;
@@ -812,7 +793,7 @@ static void monitor_cpuperf(void)
online = scx_bpf_get_online_cpumask();
bpf_for(i, 0, nr_cpu_ids) {
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[i];
u32 cap, cur;
if (!bpf_cpumask_test_cpu(i, online))
@@ -834,11 +815,6 @@ static void monitor_cpuperf(void)
cur_sum += cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
- scx_bpf_error("failed to look up cpu_ctx");
- goto out;
- }
-
/* collect target */
cur = cpuc->cpuperf_target;
target_sum += cur;
@@ -846,14 +822,14 @@ static void monitor_cpuperf(void)
target_max = cur > target_max ? cur : target_max;
}
- cpuperf_min = cur_min;
- cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
- cpuperf_max = cur_max;
+ qa.cpuperf_min = cur_min;
+ qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+ qa.cpuperf_max = cur_max;
+
+ qa.cpuperf_target_min = target_min;
+ qa.cpuperf_target_avg = target_sum / nr_online_cpus;
+ qa.cpuperf_target_max = target_max;
- cpuperf_target_min = target_min;
- cpuperf_target_avg = target_sum / nr_online_cpus;
- cpuperf_target_max = target_max;
-out:
scx_bpf_put_cpumask(online);
}
@@ -996,8 +972,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (!sub_sched_cgroup_ids[i]) {
- sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+ if (!qa.sub_sched_cgroup_ids[i]) {
+ qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
bpf_printk("attaching sub-sched[%d] on %s",
i, args->cgroup_path);
return 0;
@@ -1012,8 +988,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
- sub_sched_cgroup_ids[i] = 0;
+ if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ qa.sub_sched_cgroup_ids[i] = 0;
bpf_printk("detaching sub-sched[%d] on %s",
i, args->cgroup_path);
break;
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index e7c89a2bc3d8..8844499c14c4 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -10,9 +10,11 @@
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
+#include <sys/mman.h>
#include <sys/stat.h>
#include <bpf/bpf.h>
#include <scx/common.h>
+#include "scx_qmap.h"
#include "scx_qmap.bpf.skel.h"
const char help_fmt[] =
@@ -60,6 +62,8 @@ int main(int argc, char **argv)
{
struct scx_qmap *skel;
struct bpf_link *link;
+ struct qmap_arena *qa;
+ __u32 test_error_cnt = 0;
int opt;
libbpf_set_print(libbpf_print_fn);
@@ -76,7 +80,7 @@ int main(int argc, char **argv)
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'e':
- skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+ test_error_cnt = strtoul(optarg, NULL, 0);
break;
case 't':
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
@@ -142,29 +146,32 @@ int main(int argc, char **argv)
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
+ qa = &skel->arena->qa;
+ qa->test_error_cnt = test_error_cnt;
+
while (!exit_req && !UEI_EXITED(skel, uei)) {
- long nr_enqueued = skel->bss->nr_enqueued;
- long nr_dispatched = skel->bss->nr_dispatched;
+ long nr_enqueued = qa->nr_enqueued;
+ long nr_dispatched = qa->nr_dispatched;
- printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
- skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
- skel->bss->nr_dequeued,
- skel->bss->nr_core_sched_execed,
- skel->bss->nr_ddsp_from_enq);
- printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
- skel->bss->nr_expedited_local,
- skel->bss->nr_expedited_remote,
- skel->bss->nr_expedited_from_timer,
- skel->bss->nr_expedited_lost);
+ qa->nr_reenqueued, qa->nr_reenqueued_cpu0,
+ qa->nr_dequeued,
+ qa->nr_core_sched_execed,
+ qa->nr_ddsp_from_enq);
+ printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
+ qa->nr_expedited_local,
+ qa->nr_expedited_remote,
+ qa->nr_expedited_from_timer,
+ qa->nr_expedited_lost);
if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
- skel->bss->cpuperf_min,
- skel->bss->cpuperf_avg,
- skel->bss->cpuperf_max,
- skel->bss->cpuperf_target_min,
- skel->bss->cpuperf_target_avg,
- skel->bss->cpuperf_target_max);
+ qa->cpuperf_min,
+ qa->cpuperf_avg,
+ qa->cpuperf_max,
+ qa->cpuperf_target_min,
+ qa->cpuperf_target_avg,
+ qa->cpuperf_target_max);
fflush(stdout);
sleep(1);
}
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
new file mode 100644
index 000000000000..52153230bfce
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
+ *
+ * The scheduler keeps all mutable state in a single BPF arena map. struct
+ * qmap_arena is the one object that lives at the base of the arena and is
+ * mmap'd into userspace so the loader can read counters directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_QMAP_H
+#define __SCX_QMAP_H
+
+#ifdef __BPF__
+#include <scx/bpf_arena_common.bpf.h>
+#else
+#include <linux/types.h>
+#include <scx/bpf_arena_common.h>
+#endif
+
+#define MAX_SUB_SCHEDS 8
+
+/*
+ * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
+ * userspace. Keep this in sync with NR_CPUS used by the BPF side.
+ */
+#define SCX_QMAP_MAX_CPUS 1024
+
+struct cpu_ctx {
+ __u64 dsp_idx; /* dispatch index */
+ __u64 dsp_cnt; /* remaining count */
+ __u32 avg_weight;
+ __u32 cpuperf_target;
+};
+
+struct qmap_arena {
+ /* userspace-visible stats */
+ __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
+ __u64 nr_dequeued, nr_ddsp_from_enq;
+ __u64 nr_core_sched_execed;
+ __u64 nr_expedited_local, nr_expedited_remote;
+ __u64 nr_expedited_lost, nr_expedited_from_timer;
+ __u64 nr_highpri_queued;
+ __u32 test_error_cnt;
+ __u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+ __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
+ /* kernel-side runtime state */
+ __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+ __u64 core_sched_head_seqs[5];
+ __u64 core_sched_tail_seqs[5];
+
+ struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+};
+
+#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
2026-04-16 8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
@ 2026-04-16 15:28 ` Emil Tsalapatis
0 siblings, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:28 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Move mutable scheduler state from BSS globals and a percpu array map
> into a single BPF arena map. A shared struct qmap_arena is declared as
> an __arena global so BPF accesses it directly and userspace reaches it
> through skel->arena->qa.
>
> Scheduling logic unchanged; only memory backing changes.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> tools/sched_ext/scx_qmap.bpf.c | 152 ++++++++++++++-------------------
> tools/sched_ext/scx_qmap.c | 45 +++++-----
> tools/sched_ext/scx_qmap.h | 57 +++++++++++++
> 3 files changed, 147 insertions(+), 107 deletions(-)
> create mode 100644 tools/sched_ext/scx_qmap.h
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index a18234f3c27a..0f8fbb6d0bc2 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -22,6 +22,8 @@
> */
> #include <scx/common.bpf.h>
>
> +#include "scx_qmap.h"
> +
> enum consts {
> ONE_SEC_IN_NS = 1000000000,
> ONE_MSEC_IN_NS = 1000000,
> @@ -48,14 +50,26 @@ const volatile bool suppress_dump;
> const volatile bool always_enq_immed;
> const volatile u32 immed_stress_nth;
>
> -u64 nr_highpri_queued;
> -u32 test_error_cnt;
> -
> -#define MAX_SUB_SCHEDS 8
> -u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
> -
> UEI_DEFINE(uei);
>
> +/*
> + * All mutable scheduler state - per-cpu context, stats counters, core-sched
> + * sequence numbers, sub-sched cgroup ids - lives in this single BPF arena map.
> + * Userspace reaches it via skel->arena->qa.
> + */
> +struct {
> + __uint(type, BPF_MAP_TYPE_ARENA);
> + __uint(map_flags, BPF_F_MMAPABLE);
> + __uint(max_entries, 1 << 16); /* upper bound in pages */
I assume this is picked to handle 64K pages on ARM.
> +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
> + __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
> +#else
> + __ulong(map_extra, 0x1ull << 44);
> +#endif
> +} arena SEC(".maps");
> +
> +struct qmap_arena __arena qa;
> +
> struct qmap {
> __uint(type, BPF_MAP_TYPE_QUEUE);
> __uint(max_entries, 4096);
> @@ -102,8 +116,6 @@ static const u32 qidx_to_cpuperf_target[] = {
> * task's seq and the associated queue's head seq is called the queue distance
> * and used when comparing two tasks for ordering. See qmap_core_sched_before().
> */
> -static u64 core_sched_head_seqs[5];
> -static u64 core_sched_tail_seqs[5];
>
> /* Per-task scheduling context */
> struct task_ctx {
> @@ -119,27 +131,6 @@ struct {
> __type(value, struct task_ctx);
> } task_ctx_stor SEC(".maps");
>
> -struct cpu_ctx {
> - u64 dsp_idx; /* dispatch index */
> - u64 dsp_cnt; /* remaining count */
> - u32 avg_weight;
> - u32 cpuperf_target;
> -};
> -
> -struct {
> - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> - __uint(max_entries, 1);
> - __type(key, u32);
> - __type(value, struct cpu_ctx);
> -} cpu_ctx_stor SEC(".maps");
> -
> -/* Statistics */
> -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
> -u64 nr_core_sched_execed;
> -u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
> -u32 cpuperf_min, cpuperf_avg, cpuperf_max;
> -u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
> -
> static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
> {
> s32 cpu;
> @@ -215,9 +206,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> s32 cpu;
>
> if (enq_flags & SCX_ENQ_REENQ) {
> - __sync_fetch_and_add(&nr_reenqueued, 1);
> + __sync_fetch_and_add(&qa.nr_reenqueued, 1);
> if (scx_bpf_task_cpu(p) == 0)
> - __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
> + __sync_fetch_and_add(&qa.nr_reenqueued_cpu0, 1);
> }
>
> if (p->flags & PF_KTHREAD) {
> @@ -228,7 +219,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> return;
> }
>
> - if (test_error_cnt && !--test_error_cnt)
> + if (qa.test_error_cnt && !--qa.test_error_cnt)
> scx_bpf_error("test triggering error");
>
> if (!(taskc = lookup_task_ctx(p)))
> @@ -238,7 +229,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> * All enqueued tasks must have their core_sched_seq updated for correct
> * core-sched ordering. Also, take a look at the end of qmap_dispatch().
> */
> - taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
> + taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
>
> /*
> * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
> @@ -276,7 +267,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> /* if select_cpu() wasn't called, try direct dispatch */
> if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
> (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
> - __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
> + __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
> scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
> return;
> }
> @@ -311,9 +302,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>
> if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> taskc->highpri = true;
> - __sync_fetch_and_add(&nr_highpri_queued, 1);
> + __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
> }
> - __sync_fetch_and_add(&nr_enqueued, 1);
> + __sync_fetch_and_add(&qa.nr_enqueued, 1);
> }
>
> /*
> @@ -322,9 +313,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> */
> void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> {
> - __sync_fetch_and_add(&nr_dequeued, 1);
> + __sync_fetch_and_add(&qa.nr_dequeued, 1);
> if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
> - __sync_fetch_and_add(&nr_core_sched_execed, 1);
> + __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
> }
>
> static void update_core_sched_head_seq(struct task_struct *p)
> @@ -333,7 +324,7 @@ static void update_core_sched_head_seq(struct task_struct *p)
> struct task_ctx *taskc;
>
> if ((taskc = lookup_task_ctx(p)))
> - core_sched_head_seqs[idx] = taskc->core_sched_seq;
> + qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
> }
>
> /*
> @@ -384,14 +375,14 @@ static bool dispatch_highpri(bool from_timer)
> SCX_ENQ_PREEMPT)) {
> if (cpu == this_cpu) {
> dispatched = true;
> - __sync_fetch_and_add(&nr_expedited_local, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_local, 1);
> } else {
> - __sync_fetch_and_add(&nr_expedited_remote, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
> }
> if (from_timer)
> - __sync_fetch_and_add(&nr_expedited_from_timer, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
> } else {
> - __sync_fetch_and_add(&nr_expedited_lost, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
> }
>
> if (dispatched)
> @@ -404,19 +395,19 @@ static bool dispatch_highpri(bool from_timer)
> void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> {
> struct task_struct *p;
> - struct cpu_ctx *cpuc;
> + struct cpu_ctx __arena *cpuc;
> struct task_ctx *taskc;
> - u32 zero = 0, batch = dsp_batch ?: 1;
> + u32 batch = dsp_batch ?: 1;
> void *fifo;
> s32 i, pid;
>
> if (dispatch_highpri(false))
> return;
>
> - if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
> + if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
> return;
>
> - if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
> + if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
> /*
> * PID 2 should be kthreadd which should mostly be idle and off
> * the scheduler. Let's keep dispatching it to force the kernel
> @@ -430,10 +421,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
> }
>
> - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
> - scx_bpf_error("failed to look up cpu_ctx");
> - return;
> - }
> + cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
>
> for (i = 0; i < 5; i++) {
> /* Advance the dispatch cursor and pick the fifo. */
> @@ -442,9 +430,11 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
> }
>
> - fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
> + u64 dsp_idx = cpuc->dsp_idx;
> +
> + fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
> if (!fifo) {
> - scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
> + scx_bpf_error("failed to find ring %llu", dsp_idx);
> return;
> }
>
> @@ -465,10 +455,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
>
> if (taskc->highpri)
> - __sync_fetch_and_sub(&nr_highpri_queued, 1);
> + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
>
> update_core_sched_head_seq(p);
> - __sync_fetch_and_add(&nr_dispatched, 1);
> + __sync_fetch_and_add(&qa.nr_dispatched, 1);
>
> scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
>
> @@ -529,8 +519,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
>
> for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> - if (sub_sched_cgroup_ids[i] &&
> - scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
> + if (qa.sub_sched_cgroup_ids[i] &&
> + scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
> return;
> }
>
> @@ -546,21 +536,15 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
>
> taskc->core_sched_seq =
> - core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> + qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> }
> }
>
> void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
> {
> - struct cpu_ctx *cpuc;
> - u32 zero = 0;
> + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
> int idx;
>
> - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
> - scx_bpf_error("failed to look up cpu_ctx");
> - return;
> - }
> -
> /*
> * Use the running avg of weights to select the target cpuperf level.
> * This is a demonstration of the cpuperf feature rather than a
> @@ -589,7 +573,7 @@ static s64 task_qdist(struct task_struct *p)
> return 0;
> }
>
> - qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
> + qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
>
> /*
> * As queue index increments, the priority doubles. The queue w/ index 3
> @@ -679,13 +663,10 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
>
> void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
> {
> - u32 zero = 0;
> - struct cpu_ctx *cpuc;
> + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cpu];
>
> if (suppress_dump || idle)
> return;
> - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
> - return;
>
> scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
> cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
> @@ -802,7 +783,7 @@ struct {
> */
> static void monitor_cpuperf(void)
> {
> - u32 zero = 0, nr_cpu_ids;
> + u32 nr_cpu_ids;
> u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
> u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
> const struct cpumask *online;
> @@ -812,7 +793,7 @@ static void monitor_cpuperf(void)
> online = scx_bpf_get_online_cpumask();
>
> bpf_for(i, 0, nr_cpu_ids) {
> - struct cpu_ctx *cpuc;
> + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[i];
> u32 cap, cur;
>
> if (!bpf_cpumask_test_cpu(i, online))
> @@ -834,11 +815,6 @@ static void monitor_cpuperf(void)
> cur_sum += cur * cap / SCX_CPUPERF_ONE;
> cap_sum += cap;
>
> - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
> - scx_bpf_error("failed to look up cpu_ctx");
> - goto out;
> - }
> -
> /* collect target */
> cur = cpuc->cpuperf_target;
> target_sum += cur;
> @@ -846,14 +822,14 @@ static void monitor_cpuperf(void)
> target_max = cur > target_max ? cur : target_max;
> }
>
> - cpuperf_min = cur_min;
> - cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
> - cpuperf_max = cur_max;
> + qa.cpuperf_min = cur_min;
> + qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
> + qa.cpuperf_max = cur_max;
> +
> + qa.cpuperf_target_min = target_min;
> + qa.cpuperf_target_avg = target_sum / nr_online_cpus;
> + qa.cpuperf_target_max = target_max;
>
> - cpuperf_target_min = target_min;
> - cpuperf_target_avg = target_sum / nr_online_cpus;
> - cpuperf_target_max = target_max;
> -out:
> scx_bpf_put_cpumask(online);
> }
>
> @@ -996,8 +972,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
> s32 i;
>
> for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> - if (!sub_sched_cgroup_ids[i]) {
> - sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
> + if (!qa.sub_sched_cgroup_ids[i]) {
> + qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
> bpf_printk("attaching sub-sched[%d] on %s",
> i, args->cgroup_path);
> return 0;
> @@ -1012,8 +988,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
> s32 i;
>
> for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> - if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
> - sub_sched_cgroup_ids[i] = 0;
> + if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
> + qa.sub_sched_cgroup_ids[i] = 0;
> bpf_printk("detaching sub-sched[%d] on %s",
> i, args->cgroup_path);
> break;
> diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
> index e7c89a2bc3d8..8844499c14c4 100644
> --- a/tools/sched_ext/scx_qmap.c
> +++ b/tools/sched_ext/scx_qmap.c
> @@ -10,9 +10,11 @@
> #include <inttypes.h>
> #include <signal.h>
> #include <libgen.h>
> +#include <sys/mman.h>
> #include <sys/stat.h>
> #include <bpf/bpf.h>
> #include <scx/common.h>
> +#include "scx_qmap.h"
> #include "scx_qmap.bpf.skel.h"
>
> const char help_fmt[] =
> @@ -60,6 +62,8 @@ int main(int argc, char **argv)
> {
> struct scx_qmap *skel;
> struct bpf_link *link;
> + struct qmap_arena *qa;
> + __u32 test_error_cnt = 0;
> int opt;
>
> libbpf_set_print(libbpf_print_fn);
> @@ -76,7 +80,7 @@ int main(int argc, char **argv)
> skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
> break;
> case 'e':
> - skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
> + test_error_cnt = strtoul(optarg, NULL, 0);
> break;
> case 't':
> skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
> @@ -142,29 +146,32 @@ int main(int argc, char **argv)
> SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
> link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
>
> + qa = &skel->arena->qa;
> + qa->test_error_cnt = test_error_cnt;
> +
> while (!exit_req && !UEI_EXITED(skel, uei)) {
> - long nr_enqueued = skel->bss->nr_enqueued;
> - long nr_dispatched = skel->bss->nr_dispatched;
> + long nr_enqueued = qa->nr_enqueued;
> + long nr_dispatched = qa->nr_dispatched;
>
> - printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
> + printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
> nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
> - skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
> - skel->bss->nr_dequeued,
> - skel->bss->nr_core_sched_execed,
> - skel->bss->nr_ddsp_from_enq);
> - printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
> - skel->bss->nr_expedited_local,
> - skel->bss->nr_expedited_remote,
> - skel->bss->nr_expedited_from_timer,
> - skel->bss->nr_expedited_lost);
> + qa->nr_reenqueued, qa->nr_reenqueued_cpu0,
> + qa->nr_dequeued,
> + qa->nr_core_sched_execed,
> + qa->nr_ddsp_from_enq);
> + printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
> + qa->nr_expedited_local,
> + qa->nr_expedited_remote,
> + qa->nr_expedited_from_timer,
> + qa->nr_expedited_lost);
> if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
> printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
> - skel->bss->cpuperf_min,
> - skel->bss->cpuperf_avg,
> - skel->bss->cpuperf_max,
> - skel->bss->cpuperf_target_min,
> - skel->bss->cpuperf_target_avg,
> - skel->bss->cpuperf_target_max);
> + qa->cpuperf_min,
> + qa->cpuperf_avg,
> + qa->cpuperf_max,
> + qa->cpuperf_target_min,
> + qa->cpuperf_target_avg,
> + qa->cpuperf_target_max);
> fflush(stdout);
> sleep(1);
> }
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> new file mode 100644
> index 000000000000..52153230bfce
> --- /dev/null
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -0,0 +1,57 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
> + *
> + * The scheduler keeps all mutable state in a single BPF arena map. struct
> + * qmap_arena is the one object that lives at the base of the arena and is
> + * mmap'd into userspace so the loader can read counters directly.
> + *
> + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
> + * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
> + */
> +#ifndef __SCX_QMAP_H
> +#define __SCX_QMAP_H
> +
> +#ifdef __BPF__
> +#include <scx/bpf_arena_common.bpf.h>
> +#else
> +#include <linux/types.h>
> +#include <scx/bpf_arena_common.h>
> +#endif
> +
> +#define MAX_SUB_SCHEDS 8
> +
> +/*
> + * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
> + * userspace. Keep this in sync with NR_CPUS used by the BPF side.
> + */
> +#define SCX_QMAP_MAX_CPUS 1024
> +
> +struct cpu_ctx {
> + __u64 dsp_idx; /* dispatch index */
> + __u64 dsp_cnt; /* remaining count */
> + __u32 avg_weight;
> + __u32 cpuperf_target;
> +};
> +
> +struct qmap_arena {
> + /* userspace-visible stats */
> + __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> + __u64 nr_dequeued, nr_ddsp_from_enq;
> + __u64 nr_core_sched_execed;
> + __u64 nr_expedited_local, nr_expedited_remote;
> + __u64 nr_expedited_lost, nr_expedited_from_timer;
> + __u64 nr_highpri_queued;
> + __u32 test_error_cnt;
> + __u32 cpuperf_min, cpuperf_avg, cpuperf_max;
> + __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
> +
> + /* kernel-side runtime state */
> + __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
> + __u64 core_sched_head_seqs[5];
> + __u64 core_sched_tail_seqs[5];
> +
> + struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
> +};
> +
> +#endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
2026-04-16 8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 15:31 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi
4 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Allocate per-task context from an arena slab instead of storing it
directly in task_storage. task_ctx_stor now holds an arena pointer to
the task's slab entry. Free entries form a singly-linked list protected
by bpf_res_spin_lock; slab exhaustion triggers scx_bpf_error().
The slab size is configurable via the new -N option (default 16384).
Also add bpf_res_spin_lock/unlock declarations to common.bpf.h.
Scheduling logic unchanged.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/include/scx/common.bpf.h | 4 +
tools/sched_ext/scx_qmap.bpf.c | 178 ++++++++++++++++++-----
tools/sched_ext/scx_qmap.c | 9 +-
tools/sched_ext/scx_qmap.h | 7 +
4 files changed, 159 insertions(+), 39 deletions(-)
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 19459dedde41..35fc62556241 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -526,6 +526,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;
+/* resilient qspinlock */
+int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
+void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
+
/*
* Time helpers, most of which are from jiffies.h.
*/
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 0f8fbb6d0bc2..e071969c8f32 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -49,6 +49,7 @@ const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
+const volatile u32 max_tasks;
UEI_DEFINE(uei);
@@ -117,20 +118,43 @@ static const u32 qidx_to_cpuperf_target[] = {
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-/* Per-task scheduling context */
+/*
+ * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
+ * arena. While the task is alive the entry is referenced from task_ctx_stor;
+ * while it's free the entry sits on the free list singly-linked through
+ * @next_free.
+ */
struct task_ctx {
- bool force_local; /* Dispatch directly to local_dsq */
- bool highpri;
- u64 core_sched_seq;
+ struct task_ctx __arena *next_free; /* only valid on free list */
+ bool force_local; /* Dispatch directly to local_dsq */
+ bool highpri;
+ u64 core_sched_seq;
+};
+
+/* Holds an arena pointer to the task's slab entry. */
+struct task_ctx_stor_val {
+ struct task_ctx __arena *taskc;
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
- __type(value, struct task_ctx);
+ __type(value, struct task_ctx_stor_val);
} task_ctx_stor SEC(".maps");
+/* Protects the task_ctx slab free list. */
+__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
+
+static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ if (bpf_res_spin_lock(lock)) {
+ scx_bpf_error("res_spin_lock failed");
+ return -EBUSY;
+ }
+ return 0;
+}
+
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
s32 cpu;
@@ -148,21 +172,34 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
return -1;
}
-static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+/*
+ * Force a reference to the arena map. The verifier associates an arena with
+ * a program by finding an LD_IMM64 instruction that loads the arena's BPF
+ * map; programs that only use arena pointers returned from task-local
+ * storage (like qmap_select_cpu) never reference @arena directly. Without
+ * this, the verifier rejects addr_space_cast with "addr_space_cast insn
+ * can only be used in a program that has an associated arena".
+ */
+#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
+
+static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+
+ QMAP_TOUCH_ARENA();
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!v || !v->taskc) {
scx_bpf_error("task_ctx lookup failed");
return NULL;
}
- return taskc;
+ return v->taskc;
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
s32 cpu;
if (!(taskc = lookup_task_ctx(p)))
@@ -199,7 +236,7 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
@@ -321,7 +358,7 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
if ((taskc = lookup_task_ctx(p)))
qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
@@ -345,7 +382,7 @@ static bool dispatch_highpri(bool from_timer)
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
if (!(taskc = lookup_task_ctx(p)))
return false;
@@ -396,7 +433,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx __arena *cpuc;
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
u32 batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
@@ -440,7 +477,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
if (bpf_map_pop_elem(fifo, &pid))
break;
@@ -529,11 +566,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!taskc) {
- scx_bpf_error("task_ctx lookup failed");
+ taskc = lookup_task_ctx(prev);
+ if (!taskc)
return;
- }
taskc->core_sched_seq =
qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
@@ -564,14 +599,12 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
s64 qdist;
- taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!taskc) {
- scx_bpf_error("task_ctx lookup failed");
+ taskc = lookup_task_ctx(p);
+ if (!taskc)
return 0;
- }
qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
@@ -606,21 +639,64 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
* tasks when a higher-priority scheduling class takes the CPU.
*/
-s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
- struct scx_init_task_args *args)
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
{
+ struct task_ctx_stor_val *v;
+ struct task_ctx __arena *taskc;
+
if (p->tgid == disallow_tgid)
p->scx.disallow = true;
- /*
- * @p is new. Let's ensure that its task_ctx is available. We can sleep
- * in this function and the following will automatically use GFP_KERNEL.
- */
- if (bpf_task_storage_get(&task_ctx_stor, p, 0,
- BPF_LOCAL_STORAGE_GET_F_CREATE))
- return 0;
- else
+ /* pop a slab entry off the free list */
+ if (qmap_spin_lock(&qa_task_lock))
+ return -EBUSY;
+ taskc = qa.task_free_head;
+ if (taskc)
+ qa.task_free_head = taskc->next_free;
+ bpf_res_spin_unlock(&qa_task_lock);
+ if (!taskc) {
+ scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
+ return -ENOMEM;
+ }
+
+ taskc->next_free = NULL;
+ taskc->force_local = false;
+ taskc->highpri = false;
+ taskc->core_sched_seq = 0;
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!v) {
+ /* push back to the free list */
+ if (!qmap_spin_lock(&qa_task_lock)) {
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
+ }
return -ENOMEM;
+ }
+ v->taskc = taskc;
+ return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
+ struct scx_exit_task_args *args)
+{
+ struct task_ctx_stor_val *v;
+ struct task_ctx __arena *taskc;
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
+ return;
+ taskc = v->taskc;
+ v->taskc = NULL;
+
+ if (qmap_spin_lock(&qa_task_lock))
+ return;
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
}
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
@@ -675,12 +751,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+ struct task_ctx __arena *taskc;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
return;
+ taskc = v->taskc;
scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
taskc->force_local, taskc->core_sched_seq);
@@ -915,10 +996,32 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
- u32 key = 0;
+ struct task_ctx __arena *slab;
+ u32 nr_pages, key = 0, i;
struct bpf_timer *timer;
s32 ret;
+ /*
+ * Allocate the task_ctx slab in arena and thread the entire slab onto
+ * the free list. max_tasks is set by userspace before load.
+ */
+ if (!max_tasks) {
+ scx_bpf_error("max_tasks must be > 0");
+ return -EINVAL;
+ }
+
+ nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
+ slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
+ if (!slab) {
+ scx_bpf_error("failed to allocate task_ctx slab");
+ return -ENOMEM;
+ }
+ qa.task_ctxs = slab;
+
+ bpf_for(i, 0, max_tasks)
+ slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
+ qa.task_free_head = &slab[0];
+
if (print_msgs && !sub_cgroup_id)
print_cpus();
@@ -1005,6 +1108,7 @@ SCX_OPS_DEFINE(qmap_ops,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
.init_task = (void *)qmap_init_task,
+ .exit_task = (void *)qmap_exit_task,
.dump = (void *)qmap_dump,
.dump_cpu = (void *)qmap_dump_cpu,
.dump_task = (void *)qmap_dump_task,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 8844499c14c4..4bdcc4bc5fbd 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -23,12 +23,13 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
+" [-N COUNT] [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
+" -N COUNT Size of the task_ctx arena slab (default 16384)\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -P Print out DSQ content and event counters to trace_pipe every second\n"
@@ -73,8 +74,9 @@ int main(int argc, char **argv)
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+ skel->rodata->max_tasks = 16384;
- while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:vh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -94,6 +96,9 @@ int main(int argc, char **argv)
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
+ case 'N':
+ skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
+ break;
case 'P':
skel->rodata->print_dsqs_and_events = true;
break;
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index 52153230bfce..c183d82632b3 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -34,6 +34,9 @@ struct cpu_ctx {
__u32 cpuperf_target;
};
+/* Opaque to userspace; defined in scx_qmap.bpf.c. */
+struct task_ctx;
+
struct qmap_arena {
/* userspace-visible stats */
__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -52,6 +55,10 @@ struct qmap_arena {
__u64 core_sched_tail_seqs[5];
struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+
+ /* task_ctx slab; allocated and threaded by qmap_init() */
+ struct task_ctx __arena *task_ctxs;
+ struct task_ctx __arena *task_free_head;
};
#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab
2026-04-16 8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
@ 2026-04-16 15:31 ` Emil Tsalapatis
0 siblings, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:31 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Allocate per-task context from an arena slab instead of storing it
> directly in task_storage. task_ctx_stor now holds an arena pointer to
> the task's slab entry. Free entries form a singly-linked list protected
> by bpf_res_spin_lock; slab exhaustion triggers scx_bpf_error().
>
> The slab size is configurable via the new -N option (default 16384).
>
> Also add bpf_res_spin_lock/unlock declarations to common.bpf.h.
>
> Scheduling logic unchanged.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
One nit, since we never have non-arena task_ctxs we can do
typedef struct task_ctx __arena task_ctx;
to avoid annotating every instance.
> tools/sched_ext/include/scx/common.bpf.h | 4 +
> tools/sched_ext/scx_qmap.bpf.c | 178 ++++++++++++++++++-----
> tools/sched_ext/scx_qmap.c | 9 +-
> tools/sched_ext/scx_qmap.h | 7 +
> 4 files changed, 159 insertions(+), 39 deletions(-)
>
> diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
> index 19459dedde41..35fc62556241 100644
> --- a/tools/sched_ext/include/scx/common.bpf.h
> +++ b/tools/sched_ext/include/scx/common.bpf.h
> @@ -526,6 +526,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
> void bpf_rcu_read_lock(void) __ksym;
> void bpf_rcu_read_unlock(void) __ksym;
>
> +/* resilient qspinlock */
> +int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
> +void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
> +
> /*
> * Time helpers, most of which are from jiffies.h.
> */
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 0f8fbb6d0bc2..e071969c8f32 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -49,6 +49,7 @@ const volatile s32 disallow_tgid;
> const volatile bool suppress_dump;
> const volatile bool always_enq_immed;
> const volatile u32 immed_stress_nth;
> +const volatile u32 max_tasks;
>
> UEI_DEFINE(uei);
>
> @@ -117,20 +118,43 @@ static const u32 qidx_to_cpuperf_target[] = {
> * and used when comparing two tasks for ordering. See qmap_core_sched_before().
> */
>
> -/* Per-task scheduling context */
> +/*
> + * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
> + * arena. While the task is alive the entry is referenced from task_ctx_stor;
> + * while it's free the entry sits on the free list singly-linked through
> + * @next_free.
> + */
> struct task_ctx {
> - bool force_local; /* Dispatch directly to local_dsq */
> - bool highpri;
> - u64 core_sched_seq;
> + struct task_ctx __arena *next_free; /* only valid on free list */
> + bool force_local; /* Dispatch directly to local_dsq */
> + bool highpri;
> + u64 core_sched_seq;
> +};
> +
> +/* Holds an arena pointer to the task's slab entry. */
> +struct task_ctx_stor_val {
> + struct task_ctx __arena *taskc;
> };
>
> struct {
> __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> __uint(map_flags, BPF_F_NO_PREALLOC);
> __type(key, int);
> - __type(value, struct task_ctx);
> + __type(value, struct task_ctx_stor_val);
> } task_ctx_stor SEC(".maps");
>
> +/* Protects the task_ctx slab free list. */
> +__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
> +
> +static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
> +{
> + if (bpf_res_spin_lock(lock)) {
> + scx_bpf_error("res_spin_lock failed");
> + return -EBUSY;
> + }
> + return 0;
> +}
> +
> static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
> {
> s32 cpu;
> @@ -148,21 +172,34 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
> return -1;
> }
>
> -static struct task_ctx *lookup_task_ctx(struct task_struct *p)
> +/*
> + * Force a reference to the arena map. The verifier associates an arena with
> + * a program by finding an LD_IMM64 instruction that loads the arena's BPF
> + * map; programs that only use arena pointers returned from task-local
> + * storage (like qmap_select_cpu) never reference @arena directly. Without
> + * this, the verifier rejects addr_space_cast with "addr_space_cast insn
> + * can only be used in a program that has an associated arena".
> + */
> +#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
> +
Really nice that this works when placed as a macro.
> +static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
> {
> - struct task_ctx *taskc;
> + struct task_ctx_stor_val *v;
> +
> + QMAP_TOUCH_ARENA();
>
> - if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> + v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> + if (!v || !v->taskc) {
> scx_bpf_error("task_ctx lookup failed");
> return NULL;
> }
> - return taskc;
> + return v->taskc;
> }
>
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> s32 cpu;
>
> if (!(taskc = lookup_task_ctx(p)))
> @@ -199,7 +236,7 @@ static int weight_to_idx(u32 weight)
> void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> {
> static u32 user_cnt, kernel_cnt;
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> void *ring;
> @@ -321,7 +358,7 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> static void update_core_sched_head_seq(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
>
> if ((taskc = lookup_task_ctx(p)))
> qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
> @@ -345,7 +382,7 @@ static bool dispatch_highpri(bool from_timer)
> /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
> bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
> static u64 highpri_seq;
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
>
> if (!(taskc = lookup_task_ctx(p)))
> return false;
> @@ -396,7 +433,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> {
> struct task_struct *p;
> struct cpu_ctx __arena *cpuc;
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> u32 batch = dsp_batch ?: 1;
> void *fifo;
> s32 i, pid;
> @@ -440,7 +477,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>
> /* Dispatch or advance. */
> bpf_repeat(BPF_MAX_LOOPS) {
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
>
> if (bpf_map_pop_elem(fifo, &pid))
> break;
> @@ -529,11 +566,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> * if the task were enqueued and dispatched immediately.
> */
> if (prev) {
> - taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> - if (!taskc) {
> - scx_bpf_error("task_ctx lookup failed");
> + taskc = lookup_task_ctx(prev);
> + if (!taskc)
> return;
> - }
>
> taskc->core_sched_seq =
> qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> @@ -564,14 +599,12 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
> static s64 task_qdist(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> s64 qdist;
>
> - taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> - if (!taskc) {
> - scx_bpf_error("task_ctx lookup failed");
> + taskc = lookup_task_ctx(p);
> + if (!taskc)
> return 0;
> - }
>
> qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
>
> @@ -606,21 +639,64 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
> * tasks when a higher-priority scheduling class takes the CPU.
> */
>
> -s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
> - struct scx_init_task_args *args)
> +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
> + struct scx_init_task_args *args)
> {
> + struct task_ctx_stor_val *v;
> + struct task_ctx __arena *taskc;
> +
> if (p->tgid == disallow_tgid)
> p->scx.disallow = true;
>
> - /*
> - * @p is new. Let's ensure that its task_ctx is available. We can sleep
> - * in this function and the following will automatically use GFP_KERNEL.
> - */
> - if (bpf_task_storage_get(&task_ctx_stor, p, 0,
> - BPF_LOCAL_STORAGE_GET_F_CREATE))
> - return 0;
> - else
> + /* pop a slab entry off the free list */
> + if (qmap_spin_lock(&qa_task_lock))
> + return -EBUSY;
> + taskc = qa.task_free_head;
> + if (taskc)
> + qa.task_free_head = taskc->next_free;
> + bpf_res_spin_unlock(&qa_task_lock);
> + if (!taskc) {
> + scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
> + return -ENOMEM;
> + }
> +
> + taskc->next_free = NULL;
> + taskc->force_local = false;
> + taskc->highpri = false;
> + taskc->core_sched_seq = 0;
> +
> + v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
> + BPF_LOCAL_STORAGE_GET_F_CREATE);
> + if (!v) {
> + /* push back to the free list */
> + if (!qmap_spin_lock(&qa_task_lock)) {
> + taskc->next_free = qa.task_free_head;
> + qa.task_free_head = taskc;
> + bpf_res_spin_unlock(&qa_task_lock);
> + }
> return -ENOMEM;
> + }
> + v->taskc = taskc;
> + return 0;
> +}
> +
> +void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
> + struct scx_exit_task_args *args)
> +{
> + struct task_ctx_stor_val *v;
> + struct task_ctx __arena *taskc;
> +
> + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> + if (!v || !v->taskc)
> + return;
> + taskc = v->taskc;
> + v->taskc = NULL;
> +
> + if (qmap_spin_lock(&qa_task_lock))
> + return;
> + taskc->next_free = qa.task_free_head;
> + qa.task_free_head = taskc;
> + bpf_res_spin_unlock(&qa_task_lock);
> }
>
> void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
> @@ -675,12 +751,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
>
> void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
> {
> - struct task_ctx *taskc;
> + struct task_ctx_stor_val *v;
> + struct task_ctx __arena *taskc;
> +
> + QMAP_TOUCH_ARENA();
>
> if (suppress_dump)
> return;
> - if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
> + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> + if (!v || !v->taskc)
> return;
> + taskc = v->taskc;
>
> scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
> taskc->force_local, taskc->core_sched_seq);
> @@ -915,10 +996,32 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
>
> s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> {
> - u32 key = 0;
> + struct task_ctx __arena *slab;
> + u32 nr_pages, key = 0, i;
> struct bpf_timer *timer;
> s32 ret;
>
> + /*
> + * Allocate the task_ctx slab in arena and thread the entire slab onto
> + * the free list. max_tasks is set by userspace before load.
> + */
> + if (!max_tasks) {
> + scx_bpf_error("max_tasks must be > 0");
> + return -EINVAL;
> + }
> +
> + nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
> + slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
> + if (!slab) {
> + scx_bpf_error("failed to allocate task_ctx slab");
> + return -ENOMEM;
> + }
> + qa.task_ctxs = slab;
> +
> + bpf_for(i, 0, max_tasks)
> + slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> + qa.task_free_head = &slab[0];
> +
> if (print_msgs && !sub_cgroup_id)
> print_cpus();
>
> @@ -1005,6 +1108,7 @@ SCX_OPS_DEFINE(qmap_ops,
> .tick = (void *)qmap_tick,
> .core_sched_before = (void *)qmap_core_sched_before,
> .init_task = (void *)qmap_init_task,
> + .exit_task = (void *)qmap_exit_task,
> .dump = (void *)qmap_dump,
> .dump_cpu = (void *)qmap_dump_cpu,
> .dump_task = (void *)qmap_dump_task,
> diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
> index 8844499c14c4..4bdcc4bc5fbd 100644
> --- a/tools/sched_ext/scx_qmap.c
> +++ b/tools/sched_ext/scx_qmap.c
> @@ -23,12 +23,13 @@ const char help_fmt[] =
> "See the top-level comment in .bpf.c for more details.\n"
> "\n"
> "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
> -" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
> +" [-N COUNT] [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
> "\n"
> " -s SLICE_US Override slice duration\n"
> " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
> " -t COUNT Stall every COUNT'th user thread\n"
> " -T COUNT Stall every COUNT'th kernel thread\n"
> +" -N COUNT Size of the task_ctx arena slab (default 16384)\n"
> " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
> " -b COUNT Dispatch upto COUNT tasks together\n"
> " -P Print out DSQ content and event counters to trace_pipe every second\n"
> @@ -73,8 +74,9 @@ int main(int argc, char **argv)
> skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
>
> skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
> + skel->rodata->max_tasks = 16384;
>
> - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
> + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:vh")) != -1) {
> switch (opt) {
> case 's':
> skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
> @@ -94,6 +96,9 @@ int main(int argc, char **argv)
> case 'b':
> skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
> break;
> + case 'N':
> + skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
> + break;
> case 'P':
> skel->rodata->print_dsqs_and_events = true;
> break;
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> index 52153230bfce..c183d82632b3 100644
> --- a/tools/sched_ext/scx_qmap.h
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -34,6 +34,9 @@ struct cpu_ctx {
> __u32 cpuperf_target;
> };
>
> +/* Opaque to userspace; defined in scx_qmap.bpf.c. */
> +struct task_ctx;
> +
> struct qmap_arena {
> /* userspace-visible stats */
> __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> @@ -52,6 +55,10 @@ struct qmap_arena {
> __u64 core_sched_tail_seqs[5];
>
> struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
> +
> + /* task_ctx slab; allocated and threaded by qmap_init() */
> + struct task_ctx __arena *task_ctxs;
> + struct task_ctx __arena *task_free_head;
> };
>
> #endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
` (2 preceding siblings ...)
2026-04-16 8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi
4 siblings, 2 replies; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
head/tail pointers and its own per-queue bpf_res_spin_lock.
qmap_dequeue() now properly removes tasks from the queue instead of
leaving stale entries for dispatch to skip.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
tools/sched_ext/scx_qmap.h | 9 ++
2 files changed, 148 insertions(+), 82 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index e071969c8f32..c26997ff7863 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -71,31 +71,24 @@ struct {
struct qmap_arena __arena qa;
-struct qmap {
- __uint(type, BPF_MAP_TYPE_QUEUE);
- __uint(max_entries, 4096);
- __type(value, u32);
-} queue0 SEC(".maps"),
- queue1 SEC(".maps"),
- queue2 SEC(".maps"),
- queue3 SEC(".maps"),
- queue4 SEC(".maps"),
- dump_store SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
- __uint(max_entries, 5);
- __type(key, int);
- __array(values, struct qmap);
-} queue_arr SEC(".maps") = {
- .values = {
- [0] = &queue0,
- [1] = &queue1,
- [2] = &queue2,
- [3] = &queue3,
- [4] = &queue4,
- },
-};
+/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
+__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
+__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
+__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
+__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
+__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
+
+static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
+{
+ switch (qid) {
+ case 0: return &qa_q_lock0;
+ case 1: return &qa_q_lock1;
+ case 2: return &qa_q_lock2;
+ case 3: return &qa_q_lock3;
+ case 4: return &qa_q_lock4;
+ default: return NULL;
+ }
+}
/*
* If enabled, CPU performance target is set according to the queue index
@@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
* arena. While the task is alive the entry is referenced from task_ctx_stor;
* while it's free the entry sits on the free list singly-linked through
* @next_free.
+ *
+ * When the task is queued on one of the five priority FIFOs, @q_idx is the
+ * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
+ * @q_idx is -1 when the task isn't on any queue.
*/
struct task_ctx {
struct task_ctx __arena *next_free; /* only valid on free list */
+ struct task_ctx __arena *q_next; /* queue link, NULL if tail */
+ struct task_ctx __arena *q_prev; /* queue link, NULL if head */
+ struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
+ s32 pid;
bool force_local; /* Dispatch directly to local_dsq */
bool highpri;
u64 core_sched_seq;
@@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
return v->taskc;
}
+/* Append @taskc to the tail of @fifo. Must not already be queued. */
+static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
+ struct task_ctx __arena *taskc)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+
+ if (!lock || qmap_spin_lock(lock))
+ return;
+ taskc->fifo = fifo;
+ taskc->q_next = NULL;
+ taskc->q_prev = fifo->tail;
+ if (fifo->tail)
+ fifo->tail->q_next = taskc;
+ else
+ fifo->head = taskc;
+ fifo->tail = taskc;
+ bpf_res_spin_unlock(lock);
+}
+
+/* Pop the head of @fifo. Returns NULL if empty. */
+static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+ struct task_ctx __arena *taskc;
+
+ if (!lock || qmap_spin_lock(lock))
+ return NULL;
+ taskc = fifo->head;
+ if (taskc) {
+ fifo->head = taskc->q_next;
+ if (taskc->q_next)
+ taskc->q_next->q_prev = NULL;
+ else
+ fifo->tail = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ }
+ bpf_res_spin_unlock(lock);
+ return taskc;
+}
+
+/* Remove @taskc from its fifo. No-op if not queued. */
+static void qmap_fifo_remove(struct task_ctx __arena *taskc)
+{
+ struct qmap_fifo __arena *fifo = taskc->fifo;
+ struct bpf_res_spin_lock *lock;
+
+ if (!fifo)
+ return;
+
+ lock = qa_q_lock(fifo->idx);
+ if (!lock || qmap_spin_lock(lock))
+ return;
+
+ /* Re-check under lock — a concurrent pop may have cleared fifo. */
+ if (taskc->fifo != fifo) {
+ bpf_res_spin_unlock(lock);
+ return;
+ }
+
+ if (taskc->q_next)
+ taskc->q_next->q_prev = taskc->q_prev;
+ else
+ fifo->tail = taskc->q_prev;
+ if (taskc->q_prev)
+ taskc->q_prev->q_next = taskc->q_next;
+ else
+ fifo->head = taskc->q_next;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ bpf_res_spin_unlock(lock);
+}
+
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
struct task_ctx __arena *taskc;
- u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
- void *ring;
s32 cpu;
if (enq_flags & SCX_ENQ_REENQ) {
@@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- ring = bpf_map_lookup_elem(&queue_arr, &idx);
- if (!ring) {
- scx_bpf_error("failed to find ring %d", idx);
- return;
- }
-
- /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
- if (bpf_map_push_elem(ring, &pid, 0)) {
- scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
- return;
- }
+ /* Queue on the selected FIFO. */
+ qmap_fifo_enqueue(&qa.fifos[idx], taskc);
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
taskc->highpri = true;
@@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
__sync_fetch_and_add(&qa.nr_enqueued, 1);
}
-/*
- * The BPF queue map doesn't support removal and sched_ext can handle spurious
- * dispatches. qmap_dequeue() is only used to collect statistics.
- */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
+ struct task_ctx __arena *taskc;
+
__sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
__sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
+
+ taskc = lookup_task_ctx(p);
+ if (taskc && taskc->fifo) {
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
+ qmap_fifo_remove(taskc);
+ }
}
static void update_core_sched_head_seq(struct task_struct *p)
@@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
struct cpu_ctx __arena *cpuc;
struct task_ctx __arena *taskc;
u32 batch = dsp_batch ?: 1;
- void *fifo;
- s32 i, pid;
+ s32 i;
if (dispatch_highpri(false))
return;
@@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- u64 dsp_idx = cpuc->dsp_idx;
-
- fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
- if (!fifo) {
- scx_bpf_error("failed to find ring %llu", dsp_idx);
- return;
- }
-
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
struct task_ctx __arena *taskc;
- if (bpf_map_pop_elem(fifo, &pid))
+ taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
+ if (!taskc)
break;
- p = bpf_task_from_pid(pid);
+ p = bpf_task_from_pid(taskc->pid);
if (!p)
continue;
- if (!(taskc = lookup_task_ctx(p))) {
- bpf_task_release(p);
- return;
- }
-
if (taskc->highpri)
__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
@@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
}
taskc->next_free = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ taskc->pid = p->pid;
taskc->force_local = false;
taskc->highpri = false;
taskc->core_sched_seq = 0;
@@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
- s32 i, pid;
+ struct task_ctx __arena *taskc;
+ s32 i;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
+ /*
+ * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
+ * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
+ * a lock and dump. Best-effort; racing may print stale pids but the
+ * walk is bounded by bpf_repeat() so it always terminates.
+ */
bpf_for(i, 0, 5) {
- void *fifo;
-
- if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
- return;
-
scx_bpf_dump("QMAP FIFO[%d]:", i);
-
- /*
- * Dump can be invoked anytime and there is no way to iterate in
- * a non-destructive way. Pop and store in dump_store and then
- * restore afterwards. If racing against new enqueues, ordering
- * can get mixed up.
- */
- bpf_repeat(4096) {
- if (bpf_map_pop_elem(fifo, &pid))
- break;
- bpf_map_push_elem(&dump_store, &pid, 0);
- scx_bpf_dump(" %d", pid);
- }
-
+ taskc = qa.fifos[i].head;
bpf_repeat(4096) {
- if (bpf_map_pop_elem(&dump_store, &pid))
+ if (!taskc)
break;
- bpf_map_push_elem(fifo, &pid, 0);
+ scx_bpf_dump(" %d", taskc->pid);
+ taskc = taskc->q_next;
}
-
scx_bpf_dump("\n");
}
}
@@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
QMAP_TOUCH_ARENA();
+ QMAP_TOUCH_ARENA();
+
if (suppress_dump)
return;
v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
@@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
}
qa.task_ctxs = slab;
+ bpf_for(i, 0, 5)
+ qa.fifos[i].idx = i;
+
bpf_for(i, 0, max_tasks)
slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
qa.task_free_head = &slab[0];
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index c183d82632b3..9c0da5a301cb 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -37,6 +37,12 @@ struct cpu_ctx {
/* Opaque to userspace; defined in scx_qmap.bpf.c. */
struct task_ctx;
+struct qmap_fifo {
+ struct task_ctx __arena *head;
+ struct task_ctx __arena *tail;
+ __s32 idx;
+};
+
struct qmap_arena {
/* userspace-visible stats */
__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -59,6 +65,9 @@ struct qmap_arena {
/* task_ctx slab; allocated and threaded by qmap_init() */
struct task_ctx __arena *task_ctxs;
struct task_ctx __arena *task_free_head;
+
+ /* five priority FIFOs, each a doubly-linked list through task_ctx */
+ struct qmap_fifo fifos[5];
};
#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
1 sibling, 0 replies; 12+ messages in thread
From: Andrea Righi @ 2026-04-16 10:01 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Changwoo Min, Emil Tsalapatis, sched-ext,
linux-kernel
Hi Tejun,
On Wed, Apr 15, 2026 at 10:16:26PM -1000, Tejun Heo wrote:
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.
We should probably update the description at the beginning of the files as well,
mentioning the arena-backed lists.
>
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
...
> @@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>
> QMAP_TOUCH_ARENA();
>
> + QMAP_TOUCH_ARENA();
> +
Copy/paste noise?
> if (suppress_dump)
> return;
> v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> }
> qa.task_ctxs = slab;
>
> + bpf_for(i, 0, 5)
> + qa.fifos[i].idx = i;
> +
> bpf_for(i, 0, max_tasks)
> slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> qa.task_free_head = &slab[0];
Thanks,
-Andrea
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:01 ` Andrea Righi
@ 2026-04-16 15:45 ` Emil Tsalapatis
1 sibling, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:45 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.
>
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
> tools/sched_ext/scx_qmap.h | 9 ++
> 2 files changed, 148 insertions(+), 82 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index e071969c8f32..c26997ff7863 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -71,31 +71,24 @@ struct {
>
> struct qmap_arena __arena qa;
>
> -struct qmap {
> - __uint(type, BPF_MAP_TYPE_QUEUE);
> - __uint(max_entries, 4096);
> - __type(value, u32);
> -} queue0 SEC(".maps"),
> - queue1 SEC(".maps"),
> - queue2 SEC(".maps"),
> - queue3 SEC(".maps"),
> - queue4 SEC(".maps"),
> - dump_store SEC(".maps");
> -
> -struct {
> - __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
> - __uint(max_entries, 5);
> - __type(key, int);
> - __array(values, struct qmap);
> -} queue_arr SEC(".maps") = {
> - .values = {
> - [0] = &queue0,
> - [1] = &queue1,
> - [2] = &queue2,
> - [3] = &queue3,
> - [4] = &queue4,
> - },
> -};
> +/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
> +__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
> +__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
> +__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
> +__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
> +__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
> +
> +static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
> +{
> + switch (qid) {
> + case 0: return &qa_q_lock0;
> + case 1: return &qa_q_lock1;
> + case 2: return &qa_q_lock2;
> + case 3: return &qa_q_lock3;
> + case 4: return &qa_q_lock4;
> + default: return NULL;
> + }
> +}
>
> /*
> * If enabled, CPU performance target is set according to the queue index
> @@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
> * arena. While the task is alive the entry is referenced from task_ctx_stor;
> * while it's free the entry sits on the free list singly-linked through
> * @next_free.
> + *
> + * When the task is queued on one of the five priority FIFOs, @q_idx is the
> + * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
> + * @q_idx is -1 when the task isn't on any queue.
> */
> struct task_ctx {
> struct task_ctx __arena *next_free; /* only valid on free list */
> + struct task_ctx __arena *q_next; /* queue link, NULL if tail */
> + struct task_ctx __arena *q_prev; /* queue link, NULL if head */
> + struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
> + s32 pid;
> bool force_local; /* Dispatch directly to local_dsq */
> bool highpri;
> u64 core_sched_seq;
> @@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
> return v->taskc;
> }
>
> +/* Append @taskc to the tail of @fifo. Must not already be queued. */
> +static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
> + struct task_ctx __arena *taskc)
> +{
> + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> +
> + if (!lock || qmap_spin_lock(lock))
> + return;
> + taskc->fifo = fifo;
> + taskc->q_next = NULL;
> + taskc->q_prev = fifo->tail;
> + if (fifo->tail)
> + fifo->tail->q_next = taskc;
> + else
> + fifo->head = taskc;
> + fifo->tail = taskc;
> + bpf_res_spin_unlock(lock);
> +}
> +
> +/* Pop the head of @fifo. Returns NULL if empty. */
> +static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
> +{
> + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> + struct task_ctx __arena *taskc;
> +
> + if (!lock || qmap_spin_lock(lock))
> + return NULL;
> + taskc = fifo->head;
> + if (taskc) {
> + fifo->head = taskc->q_next;
> + if (taskc->q_next)
> + taskc->q_next->q_prev = NULL;
> + else
> + fifo->tail = NULL;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + }
> + bpf_res_spin_unlock(lock);
> + return taskc;
> +}
> +
> +/* Remove @taskc from its fifo. No-op if not queued. */
> +static void qmap_fifo_remove(struct task_ctx __arena *taskc)
> +{
> + struct qmap_fifo __arena *fifo = taskc->fifo;
> + struct bpf_res_spin_lock *lock;
> +
> + if (!fifo)
> + return;
> +
> + lock = qa_q_lock(fifo->idx);
> + if (!lock || qmap_spin_lock(lock))
> + return;
> +
> + /* Re-check under lock — a concurrent pop may have cleared fifo. */
> + if (taskc->fifo != fifo) {
> + bpf_res_spin_unlock(lock);
> + return;
> + }
> +
> + if (taskc->q_next)
> + taskc->q_next->q_prev = taskc->q_prev;
> + else
> + fifo->tail = taskc->q_prev;
> + if (taskc->q_prev)
> + taskc->q_prev->q_next = taskc->q_next;
> + else
> + fifo->head = taskc->q_next;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + bpf_res_spin_unlock(lock);
> +}
> +
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> @@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> {
> static u32 user_cnt, kernel_cnt;
> struct task_ctx __arena *taskc;
> - u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> - void *ring;
> s32 cpu;
>
> if (enq_flags & SCX_ENQ_REENQ) {
> @@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> return;
> }
>
> - ring = bpf_map_lookup_elem(&queue_arr, &idx);
> - if (!ring) {
> - scx_bpf_error("failed to find ring %d", idx);
> - return;
> - }
> -
> - /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
> - if (bpf_map_push_elem(ring, &pid, 0)) {
> - scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
> - return;
> - }
> + /* Queue on the selected FIFO. */
> + qmap_fifo_enqueue(&qa.fifos[idx], taskc);
>
> if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> taskc->highpri = true;
> @@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> __sync_fetch_and_add(&qa.nr_enqueued, 1);
> }
>
> -/*
> - * The BPF queue map doesn't support removal and sched_ext can handle spurious
> - * dispatches. qmap_dequeue() is only used to collect statistics.
> - */
> void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> {
> + struct task_ctx __arena *taskc;
> +
> __sync_fetch_and_add(&qa.nr_dequeued, 1);
> if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
> __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
> +
> + taskc = lookup_task_ctx(p);
> + if (taskc && taskc->fifo) {
> + if (taskc->highpri)
> + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
> + qmap_fifo_remove(taskc);
> + }
> }
>
> static void update_core_sched_head_seq(struct task_struct *p)
> @@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> struct cpu_ctx __arena *cpuc;
> struct task_ctx __arena *taskc;
> u32 batch = dsp_batch ?: 1;
> - void *fifo;
> - s32 i, pid;
> + s32 i;
>
> if (dispatch_highpri(false))
> return;
> @@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
> }
>
> - u64 dsp_idx = cpuc->dsp_idx;
> -
> - fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
> - if (!fifo) {
> - scx_bpf_error("failed to find ring %llu", dsp_idx);
> - return;
> - }
> -
> /* Dispatch or advance. */
> bpf_repeat(BPF_MAX_LOOPS) {
> struct task_ctx __arena *taskc;
>
> - if (bpf_map_pop_elem(fifo, &pid))
> + taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
> + if (!taskc)
> break;
>
> - p = bpf_task_from_pid(pid);
> + p = bpf_task_from_pid(taskc->pid);
> if (!p)
> continue;
>
> - if (!(taskc = lookup_task_ctx(p))) {
> - bpf_task_release(p);
> - return;
> - }
> -
> if (taskc->highpri)
> __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
>
> @@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
> }
>
> taskc->next_free = NULL;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + taskc->pid = p->pid;
> taskc->force_local = false;
> taskc->highpri = false;
> taskc->core_sched_seq = 0;
> @@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
>
> void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
> {
> - s32 i, pid;
> + struct task_ctx __arena *taskc;
> + s32 i;
> +
> + QMAP_TOUCH_ARENA();
>
> if (suppress_dump)
> return;
>
> + /*
> + * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
> + * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
> + * a lock and dump. Best-effort; racing may print stale pids but the
> + * walk is bounded by bpf_repeat() so it always terminates.
> + */
> bpf_for(i, 0, 5) {
> - void *fifo;
> -
> - if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
> - return;
> -
> scx_bpf_dump("QMAP FIFO[%d]:", i);
> -
> - /*
> - * Dump can be invoked anytime and there is no way to iterate in
> - * a non-destructive way. Pop and store in dump_store and then
> - * restore afterwards. If racing against new enqueues, ordering
> - * can get mixed up.
> - */
> - bpf_repeat(4096) {
> - if (bpf_map_pop_elem(fifo, &pid))
> - break;
> - bpf_map_push_elem(&dump_store, &pid, 0);
> - scx_bpf_dump(" %d", pid);
> - }
> -
> + taskc = qa.fifos[i].head;
> bpf_repeat(4096) {
> - if (bpf_map_pop_elem(&dump_store, &pid))
> + if (!taskc)
> break;
> - bpf_map_push_elem(fifo, &pid, 0);
> + scx_bpf_dump(" %d", taskc->pid);
> + taskc = taskc->q_next;
> }
> -
> scx_bpf_dump("\n");
> }
> }
> @@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>
> QMAP_TOUCH_ARENA();
>
> + QMAP_TOUCH_ARENA();
> +
> if (suppress_dump)
> return;
> v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> }
> qa.task_ctxs = slab;
>
> + bpf_for(i, 0, 5)
> + qa.fifos[i].idx = i;
> +
> bpf_for(i, 0, max_tasks)
> slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> qa.task_free_head = &slab[0];
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> index c183d82632b3..9c0da5a301cb 100644
> --- a/tools/sched_ext/scx_qmap.h
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -37,6 +37,12 @@ struct cpu_ctx {
> /* Opaque to userspace; defined in scx_qmap.bpf.c. */
> struct task_ctx;
>
> +struct qmap_fifo {
> + struct task_ctx __arena *head;
> + struct task_ctx __arena *tail;
> + __s32 idx;
> +};
> +
> struct qmap_arena {
> /* userspace-visible stats */
> __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> @@ -59,6 +65,9 @@ struct qmap_arena {
> /* task_ctx slab; allocated and threaded by qmap_init() */
> struct task_ctx __arena *task_ctxs;
> struct task_ctx __arena *task_free_head;
> +
> + /* five priority FIFOs, each a doubly-linked list through task_ctx */
> + struct qmap_fifo fifos[5];
> };
>
> #endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
` (3 preceding siblings ...)
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 10:05 ` Andrea Righi
4 siblings, 0 replies; 12+ messages in thread
From: Andrea Righi @ 2026-04-16 10:05 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Changwoo Min, Emil Tsalapatis, sched-ext,
linux-kernel
Hi Tejun,
On Wed, Apr 15, 2026 at 10:16:22PM -1000, Tejun Heo wrote:
> Hello,
>
> Arena simplifies verification and allows more natural programming. This
> patchset converts scx_qmap to use BPF arena for all mutable state, as
> preparation for further sub-sched work.
>
> 0001 Rename tctx to taskc for consistency.
> 0002 Move globals and cpu_ctx into arena.
> 0003 Move task_ctx into an arena slab with bpf_res_spin_lock.
> 0004 Replace FIFO queue maps with arena-backed doubly-linked lists.
>
> Based on linus/master (1d51b370a0f8).
Sent a couple of comments about patch 4, everything else looks good to me.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Thanks,
-Andrea
>
> tools/sched_ext/include/scx/common.bpf.h | 4 +
> tools/sched_ext/scx_qmap.bpf.c | 561 ++++++++++++++---------
> tools/sched_ext/scx_qmap.c | 54 +--
> tools/sched_ext/scx_qmap.h | 73 +++
> 4 files changed, 459 insertions(+), 233 deletions(-)
>
> Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git qmap-arena
>
> --
> tejun
^ permalink raw reply [flat|nested] 12+ messages in thread