* [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena
@ 2026-04-16 8:16 Tejun Heo
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
` (4 more replies)
0 siblings, 5 replies; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Hello,
Arena simplifies verification and allows more natural programming. This
patchset converts scx_qmap to use BPF arena for all mutable state, as
preparation for further sub-sched work.
0001 Rename tctx to taskc for consistency.
0002 Move globals and cpu_ctx into arena.
0003 Move task_ctx into an arena slab with bpf_res_spin_lock.
0004 Replace FIFO queue maps with arena-backed doubly-linked lists.
Based on linus/master (1d51b370a0f8).
tools/sched_ext/include/scx/common.bpf.h | 4 +
tools/sched_ext/scx_qmap.bpf.c | 561 ++++++++++++++---------
tools/sched_ext/scx_qmap.c | 54 +--
tools/sched_ext/scx_qmap.h | 73 +++
4 files changed, 459 insertions(+), 233 deletions(-)
Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git qmap-arena
--
tejun
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 14:56 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
` (3 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Rename the per-task context local variable from tctx to taskc for
consistency.
No functional change.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
1 file changed, 30 insertions(+), 30 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index b68abb9e760b..a18234f3c27a 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
scx_bpf_error("task_ctx lookup failed");
return NULL;
}
- return tctx;
+ return taskc;
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s32 cpu;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return -ESRCH;
if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
@@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
cpu = pick_direct_dispatch_cpu(p, prev_cpu);
if (cpu >= 0) {
- tctx->force_local = true;
+ taskc->force_local = true;
return cpu;
} else {
return prev_cpu;
@@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
@@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return;
/*
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
static u32 immed_stress_cnt;
if (!(++immed_stress_cnt % immed_stress_nth)) {
- tctx->force_local = false;
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
slice_ns, enq_flags);
return;
@@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* If qmap_select_cpu() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
- if (tctx->force_local) {
- tctx->force_local = false;
+ if (taskc->force_local) {
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
@@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
}
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
- tctx->highpri = true;
+ taskc->highpri = true;
__sync_fetch_and_add(&nr_highpri_queued, 1);
}
__sync_fetch_and_add(&nr_enqueued, 1);
@@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if ((tctx = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = tctx->core_sched_seq;
+ if ((taskc = lookup_task_ctx(p)))
+ core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return false;
- if (tctx->highpri) {
+ if (taskc->highpri) {
/* exercise the set_*() and vtime interface too */
scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
@@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx *cpuc;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
@@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
if (bpf_map_pop_elem(fifo, &pid))
break;
@@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (!p)
continue;
- if (!(tctx = lookup_task_ctx(p))) {
+ if (!(taskc = lookup_task_ctx(p))) {
bpf_task_release(p);
return;
}
- if (tctx->highpri)
+ if (taskc->highpri)
__sync_fetch_and_sub(&nr_highpri_queued, 1);
update_core_sched_head_seq(p);
@@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
}
- tctx->core_sched_seq =
+ taskc->core_sched_seq =
core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
@@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s64 qdist;
- tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return 0;
}
- qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 15:28 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
` (2 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Move mutable scheduler state from BSS globals and a percpu array map
into a single BPF arena map. A shared struct qmap_arena is declared as
an __arena global so BPF accesses it directly and userspace reaches it
through skel->arena->qa.
Scheduling logic unchanged; only memory backing changes.
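For illustration, the access pattern on both sides ends up roughly as
below (a minimal sketch; the actual map sizing and mmap base are in the
diff):

  /* BPF side: plain loads/stores through the __arena global */
  struct qmap_arena __arena qa;

  __sync_fetch_and_add(&qa.nr_enqueued, 1);

  /* userspace: libbpf mmaps the arena into the skeleton */
  struct qmap_arena *qa = &skel->arena->qa;
  printf("enq=%llu\n", qa->nr_enqueued);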
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/scx_qmap.bpf.c | 152 ++++++++++++++-------------------
tools/sched_ext/scx_qmap.c | 45 +++++-----
tools/sched_ext/scx_qmap.h | 57 +++++++++++++
3 files changed, 147 insertions(+), 107 deletions(-)
create mode 100644 tools/sched_ext/scx_qmap.h
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index a18234f3c27a..0f8fbb6d0bc2 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -22,6 +22,8 @@
*/
#include <scx/common.bpf.h>
+#include "scx_qmap.h"
+
enum consts {
ONE_SEC_IN_NS = 1000000000,
ONE_MSEC_IN_NS = 1000000,
@@ -48,14 +50,26 @@ const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
-u64 nr_highpri_queued;
-u32 test_error_cnt;
-
-#define MAX_SUB_SCHEDS 8
-u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
-
UEI_DEFINE(uei);
+/*
+ * All mutable scheduler state - per-cpu context, stats counters, core-sched
+ * sequence numbers, sub-sched cgroup ids - lives in this single BPF arena map.
+ * Userspace reaches it via skel->arena->qa.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 1 << 16); /* upper bound in pages */
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+ __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
+#else
+ __ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+struct qmap_arena __arena qa;
+
struct qmap {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(max_entries, 4096);
@@ -102,8 +116,6 @@ static const u32 qidx_to_cpuperf_target[] = {
* task's seq and the associated queue's head seq is called the queue distance
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-static u64 core_sched_head_seqs[5];
-static u64 core_sched_tail_seqs[5];
/* Per-task scheduling context */
struct task_ctx {
@@ -119,27 +131,6 @@ struct {
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
-struct cpu_ctx {
- u64 dsp_idx; /* dispatch index */
- u64 dsp_cnt; /* remaining count */
- u32 avg_weight;
- u32 cpuperf_target;
-};
-
-struct {
- __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
- __uint(max_entries, 1);
- __type(key, u32);
- __type(value, struct cpu_ctx);
-} cpu_ctx_stor SEC(".maps");
-
-/* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
-u64 nr_core_sched_execed;
-u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
-u32 cpuperf_min, cpuperf_avg, cpuperf_max;
-u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
-
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
s32 cpu;
@@ -215,9 +206,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
s32 cpu;
if (enq_flags & SCX_ENQ_REENQ) {
- __sync_fetch_and_add(&nr_reenqueued, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued, 1);
if (scx_bpf_task_cpu(p) == 0)
- __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+ __sync_fetch_and_add(&qa.nr_reenqueued_cpu0, 1);
}
if (p->flags & PF_KTHREAD) {
@@ -228,7 +219,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- if (test_error_cnt && !--test_error_cnt)
+ if (qa.test_error_cnt && !--qa.test_error_cnt)
scx_bpf_error("test triggering error");
if (!(taskc = lookup_task_ctx(p)))
@@ -238,7 +229,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -276,7 +267,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
/* if select_cpu() wasn't called, try direct dispatch */
if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
- __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
+ __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
return;
}
@@ -311,9 +302,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
taskc->highpri = true;
- __sync_fetch_and_add(&nr_highpri_queued, 1);
+ __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
}
- __sync_fetch_and_add(&nr_enqueued, 1);
+ __sync_fetch_and_add(&qa.nr_enqueued, 1);
}
/*
@@ -322,9 +313,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
*/
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
- __sync_fetch_and_add(&nr_dequeued, 1);
+ __sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
- __sync_fetch_and_add(&nr_core_sched_execed, 1);
+ __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
}
static void update_core_sched_head_seq(struct task_struct *p)
@@ -333,7 +324,7 @@ static void update_core_sched_head_seq(struct task_struct *p)
struct task_ctx *taskc;
if ((taskc = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = taskc->core_sched_seq;
+ qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -384,14 +375,14 @@ static bool dispatch_highpri(bool from_timer)
SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) {
dispatched = true;
- __sync_fetch_and_add(&nr_expedited_local, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_local, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_remote, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
}
if (from_timer)
- __sync_fetch_and_add(&nr_expedited_from_timer, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
} else {
- __sync_fetch_and_add(&nr_expedited_lost, 1);
+ __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
}
if (dispatched)
@@ -404,19 +395,19 @@ static bool dispatch_highpri(bool from_timer)
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc;
struct task_ctx *taskc;
- u32 zero = 0, batch = dsp_batch ?: 1;
+ u32 batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
if (dispatch_highpri(false))
return;
- if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
+ if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
return;
- if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+ if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
/*
* PID 2 should be kthreadd which should mostly be idle and off
* the scheduler. Let's keep dispatching it to force the kernel
@@ -430,10 +421,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
}
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
+ cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
@@ -442,9 +430,11 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
+ u64 dsp_idx = cpuc->dsp_idx;
+
+ fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
if (!fifo) {
- scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
+ scx_bpf_error("failed to find ring %llu", dsp_idx);
return;
}
@@ -465,10 +455,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
if (taskc->highpri)
- __sync_fetch_and_sub(&nr_highpri_queued, 1);
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
update_core_sched_head_seq(p);
- __sync_fetch_and_add(&nr_dispatched, 1);
+ __sync_fetch_and_add(&qa.nr_dispatched, 1);
scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
@@ -529,8 +519,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] &&
- scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ if (qa.sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
return;
}
@@ -546,21 +536,15 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
}
taskc->core_sched_seq =
- core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
{
- struct cpu_ctx *cpuc;
- u32 zero = 0;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
int idx;
- if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
- scx_bpf_error("failed to look up cpu_ctx");
- return;
- }
-
/*
* Use the running avg of weights to select the target cpuperf level.
* This is a demonstration of the cpuperf feature rather than a
@@ -589,7 +573,7 @@ static s64 task_qdist(struct task_struct *p)
return 0;
}
- qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
@@ -679,13 +663,10 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
{
- u32 zero = 0;
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cpu];
if (suppress_dump || idle)
return;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
- return;
scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
@@ -802,7 +783,7 @@ struct {
*/
static void monitor_cpuperf(void)
{
- u32 zero = 0, nr_cpu_ids;
+ u32 nr_cpu_ids;
u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
const struct cpumask *online;
@@ -812,7 +793,7 @@ static void monitor_cpuperf(void)
online = scx_bpf_get_online_cpumask();
bpf_for(i, 0, nr_cpu_ids) {
- struct cpu_ctx *cpuc;
+ struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[i];
u32 cap, cur;
if (!bpf_cpumask_test_cpu(i, online))
@@ -834,11 +815,6 @@ static void monitor_cpuperf(void)
cur_sum += cur * cap / SCX_CPUPERF_ONE;
cap_sum += cap;
- if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
- scx_bpf_error("failed to look up cpu_ctx");
- goto out;
- }
-
/* collect target */
cur = cpuc->cpuperf_target;
target_sum += cur;
@@ -846,14 +822,14 @@ static void monitor_cpuperf(void)
target_max = cur > target_max ? cur : target_max;
}
- cpuperf_min = cur_min;
- cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
- cpuperf_max = cur_max;
+ qa.cpuperf_min = cur_min;
+ qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+ qa.cpuperf_max = cur_max;
+
+ qa.cpuperf_target_min = target_min;
+ qa.cpuperf_target_avg = target_sum / nr_online_cpus;
+ qa.cpuperf_target_max = target_max;
- cpuperf_target_min = target_min;
- cpuperf_target_avg = target_sum / nr_online_cpus;
- cpuperf_target_max = target_max;
-out:
scx_bpf_put_cpumask(online);
}
@@ -996,8 +972,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (!sub_sched_cgroup_ids[i]) {
- sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+ if (!qa.sub_sched_cgroup_ids[i]) {
+ qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
bpf_printk("attaching sub-sched[%d] on %s",
i, args->cgroup_path);
return 0;
@@ -1012,8 +988,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
s32 i;
for (i = 0; i < MAX_SUB_SCHEDS; i++) {
- if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
- sub_sched_cgroup_ids[i] = 0;
+ if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ qa.sub_sched_cgroup_ids[i] = 0;
bpf_printk("detaching sub-sched[%d] on %s",
i, args->cgroup_path);
break;
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index e7c89a2bc3d8..8844499c14c4 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -10,9 +10,11 @@
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
+#include <sys/mman.h>
#include <sys/stat.h>
#include <bpf/bpf.h>
#include <scx/common.h>
+#include "scx_qmap.h"
#include "scx_qmap.bpf.skel.h"
const char help_fmt[] =
@@ -60,6 +62,8 @@ int main(int argc, char **argv)
{
struct scx_qmap *skel;
struct bpf_link *link;
+ struct qmap_arena *qa;
+ __u32 test_error_cnt = 0;
int opt;
libbpf_set_print(libbpf_print_fn);
@@ -76,7 +80,7 @@ int main(int argc, char **argv)
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'e':
- skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+ test_error_cnt = strtoul(optarg, NULL, 0);
break;
case 't':
skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
@@ -142,29 +146,32 @@ int main(int argc, char **argv)
SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
+ qa = &skel->arena->qa;
+ qa->test_error_cnt = test_error_cnt;
+
while (!exit_req && !UEI_EXITED(skel, uei)) {
- long nr_enqueued = skel->bss->nr_enqueued;
- long nr_dispatched = skel->bss->nr_dispatched;
+ long nr_enqueued = qa->nr_enqueued;
+ long nr_dispatched = qa->nr_dispatched;
- printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
- skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
- skel->bss->nr_dequeued,
- skel->bss->nr_core_sched_execed,
- skel->bss->nr_ddsp_from_enq);
- printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
- skel->bss->nr_expedited_local,
- skel->bss->nr_expedited_remote,
- skel->bss->nr_expedited_from_timer,
- skel->bss->nr_expedited_lost);
+ qa->nr_reenqueued, qa->nr_reenqueued_cpu0,
+ qa->nr_dequeued,
+ qa->nr_core_sched_execed,
+ qa->nr_ddsp_from_enq);
+ printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
+ qa->nr_expedited_local,
+ qa->nr_expedited_remote,
+ qa->nr_expedited_from_timer,
+ qa->nr_expedited_lost);
if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
- skel->bss->cpuperf_min,
- skel->bss->cpuperf_avg,
- skel->bss->cpuperf_max,
- skel->bss->cpuperf_target_min,
- skel->bss->cpuperf_target_avg,
- skel->bss->cpuperf_target_max);
+ qa->cpuperf_min,
+ qa->cpuperf_avg,
+ qa->cpuperf_max,
+ qa->cpuperf_target_min,
+ qa->cpuperf_target_avg,
+ qa->cpuperf_target_max);
fflush(stdout);
sleep(1);
}
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
new file mode 100644
index 000000000000..52153230bfce
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
+ *
+ * The scheduler keeps all mutable state in a single BPF arena map. struct
+ * qmap_arena is the one object that lives at the base of the arena and is
+ * mmap'd into userspace so the loader can read counters directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_QMAP_H
+#define __SCX_QMAP_H
+
+#ifdef __BPF__
+#include <scx/bpf_arena_common.bpf.h>
+#else
+#include <linux/types.h>
+#include <scx/bpf_arena_common.h>
+#endif
+
+#define MAX_SUB_SCHEDS 8
+
+/*
+ * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
+ * userspace. Keep this in sync with NR_CPUS used by the BPF side.
+ */
+#define SCX_QMAP_MAX_CPUS 1024
+
+struct cpu_ctx {
+ __u64 dsp_idx; /* dispatch index */
+ __u64 dsp_cnt; /* remaining count */
+ __u32 avg_weight;
+ __u32 cpuperf_target;
+};
+
+struct qmap_arena {
+ /* userspace-visible stats */
+ __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
+ __u64 nr_dequeued, nr_ddsp_from_enq;
+ __u64 nr_core_sched_execed;
+ __u64 nr_expedited_local, nr_expedited_remote;
+ __u64 nr_expedited_lost, nr_expedited_from_timer;
+ __u64 nr_highpri_queued;
+ __u32 test_error_cnt;
+ __u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+ __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
+ /* kernel-side runtime state */
+ __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+ __u64 core_sched_head_seqs[5];
+ __u64 core_sched_tail_seqs[5];
+
+ struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+};
+
+#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
2026-04-16 8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 15:31 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi
4 siblings, 1 reply; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Allocate per-task context from an arena slab instead of storing it
directly in task_storage. task_ctx_stor now holds an arena pointer to
the task's slab entry. Free entries form a singly-linked list protected
by bpf_res_spin_lock; slab exhaustion triggers scx_bpf_error().
The slab size is configurable via the new -N option (default 16384).
Also add bpf_res_spin_lock/unlock declarations to common.bpf.h.
Scheduling logic unchanged.
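The allocation path in qmap_init_task() boils down to a lock-protected
list pop; roughly (a sketch, error reporting trimmed):

  struct task_ctx __arena *taskc = NULL;

  if (bpf_res_spin_lock(&qa_task_lock))
          return -EBUSY;
  taskc = qa.task_free_head;
  if (taskc)
          qa.task_free_head = taskc->next_free;
  bpf_res_spin_unlock(&qa_task_lock);

  if (!taskc)
          return -ENOMEM;     /* slab exhausted */

qmap_exit_task() pushes the entry back onto the free list the same way.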
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/include/scx/common.bpf.h | 4 +
tools/sched_ext/scx_qmap.bpf.c | 178 ++++++++++++++++++-----
tools/sched_ext/scx_qmap.c | 9 +-
tools/sched_ext/scx_qmap.h | 7 +
4 files changed, 159 insertions(+), 39 deletions(-)
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 19459dedde41..35fc62556241 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -526,6 +526,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;
+/* resilient qspinlock */
+int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
+void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
+
/*
* Time helpers, most of which are from jiffies.h.
*/
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 0f8fbb6d0bc2..e071969c8f32 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -49,6 +49,7 @@ const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
const volatile bool always_enq_immed;
const volatile u32 immed_stress_nth;
+const volatile u32 max_tasks;
UEI_DEFINE(uei);
@@ -117,20 +118,43 @@ static const u32 qidx_to_cpuperf_target[] = {
* and used when comparing two tasks for ordering. See qmap_core_sched_before().
*/
-/* Per-task scheduling context */
+/*
+ * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
+ * arena. While the task is alive the entry is referenced from task_ctx_stor;
+ * while it's free the entry sits on the free list singly-linked through
+ * @next_free.
+ */
struct task_ctx {
- bool force_local; /* Dispatch directly to local_dsq */
- bool highpri;
- u64 core_sched_seq;
+ struct task_ctx __arena *next_free; /* only valid on free list */
+ bool force_local; /* Dispatch directly to local_dsq */
+ bool highpri;
+ u64 core_sched_seq;
+};
+
+/* Holds an arena pointer to the task's slab entry. */
+struct task_ctx_stor_val {
+ struct task_ctx __arena *taskc;
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
- __type(value, struct task_ctx);
+ __type(value, struct task_ctx_stor_val);
} task_ctx_stor SEC(".maps");
+/* Protects the task_ctx slab free list. */
+__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
+
+static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ if (bpf_res_spin_lock(lock)) {
+ scx_bpf_error("res_spin_lock failed");
+ return -EBUSY;
+ }
+ return 0;
+}
+
static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
s32 cpu;
@@ -148,21 +172,34 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
return -1;
}
-static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+/*
+ * Force a reference to the arena map. The verifier associates an arena with
+ * a program by finding an LD_IMM64 instruction that loads the arena's BPF
+ * map; programs that only use arena pointers returned from task-local
+ * storage (like qmap_select_cpu) never reference @arena directly. Without
+ * this, the verifier rejects addr_space_cast with "addr_space_cast insn
+ * can only be used in a program that has an associated arena".
+ */
+#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
+
+static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+
+ QMAP_TOUCH_ARENA();
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!v || !v->taskc) {
scx_bpf_error("task_ctx lookup failed");
return NULL;
}
- return taskc;
+ return v->taskc;
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
s32 cpu;
if (!(taskc = lookup_task_ctx(p)))
@@ -199,7 +236,7 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
@@ -321,7 +358,7 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
if ((taskc = lookup_task_ctx(p)))
qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
@@ -345,7 +382,7 @@ static bool dispatch_highpri(bool from_timer)
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
if (!(taskc = lookup_task_ctx(p)))
return false;
@@ -396,7 +433,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx __arena *cpuc;
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
u32 batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
@@ -440,7 +477,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
if (bpf_map_pop_elem(fifo, &pid))
break;
@@ -529,11 +566,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!taskc) {
- scx_bpf_error("task_ctx lookup failed");
+ taskc = lookup_task_ctx(prev);
+ if (!taskc)
return;
- }
taskc->core_sched_seq =
qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
@@ -564,14 +599,12 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *taskc;
+ struct task_ctx __arena *taskc;
s64 qdist;
- taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!taskc) {
- scx_bpf_error("task_ctx lookup failed");
+ taskc = lookup_task_ctx(p);
+ if (!taskc)
return 0;
- }
qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
@@ -606,21 +639,64 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
* tasks when a higher-priority scheduling class takes the CPU.
*/
-s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
- struct scx_init_task_args *args)
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
{
+ struct task_ctx_stor_val *v;
+ struct task_ctx __arena *taskc;
+
if (p->tgid == disallow_tgid)
p->scx.disallow = true;
- /*
- * @p is new. Let's ensure that its task_ctx is available. We can sleep
- * in this function and the following will automatically use GFP_KERNEL.
- */
- if (bpf_task_storage_get(&task_ctx_stor, p, 0,
- BPF_LOCAL_STORAGE_GET_F_CREATE))
- return 0;
- else
+ /* pop a slab entry off the free list */
+ if (qmap_spin_lock(&qa_task_lock))
+ return -EBUSY;
+ taskc = qa.task_free_head;
+ if (taskc)
+ qa.task_free_head = taskc->next_free;
+ bpf_res_spin_unlock(&qa_task_lock);
+ if (!taskc) {
+ scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
+ return -ENOMEM;
+ }
+
+ taskc->next_free = NULL;
+ taskc->force_local = false;
+ taskc->highpri = false;
+ taskc->core_sched_seq = 0;
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!v) {
+ /* push back to the free list */
+ if (!qmap_spin_lock(&qa_task_lock)) {
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
+ }
return -ENOMEM;
+ }
+ v->taskc = taskc;
+ return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
+ struct scx_exit_task_args *args)
+{
+ struct task_ctx_stor_val *v;
+ struct task_ctx __arena *taskc;
+
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
+ return;
+ taskc = v->taskc;
+ v->taskc = NULL;
+
+ if (qmap_spin_lock(&qa_task_lock))
+ return;
+ taskc->next_free = qa.task_free_head;
+ qa.task_free_head = taskc;
+ bpf_res_spin_unlock(&qa_task_lock);
}
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
@@ -675,12 +751,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
{
- struct task_ctx *taskc;
+ struct task_ctx_stor_val *v;
+ struct task_ctx __arena *taskc;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
- if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+ if (!v || !v->taskc)
return;
+ taskc = v->taskc;
scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
taskc->force_local, taskc->core_sched_seq);
@@ -915,10 +996,32 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
- u32 key = 0;
+ struct task_ctx __arena *slab;
+ u32 nr_pages, key = 0, i;
struct bpf_timer *timer;
s32 ret;
+ /*
+ * Allocate the task_ctx slab in arena and thread the entire slab onto
+ * the free list. max_tasks is set by userspace before load.
+ */
+ if (!max_tasks) {
+ scx_bpf_error("max_tasks must be > 0");
+ return -EINVAL;
+ }
+
+ nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
+ slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
+ if (!slab) {
+ scx_bpf_error("failed to allocate task_ctx slab");
+ return -ENOMEM;
+ }
+ qa.task_ctxs = slab;
+
+ bpf_for(i, 0, max_tasks)
+ slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
+ qa.task_free_head = &slab[0];
+
if (print_msgs && !sub_cgroup_id)
print_cpus();
@@ -1005,6 +1108,7 @@ SCX_OPS_DEFINE(qmap_ops,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
.init_task = (void *)qmap_init_task,
+ .exit_task = (void *)qmap_exit_task,
.dump = (void *)qmap_dump,
.dump_cpu = (void *)qmap_dump_cpu,
.dump_task = (void *)qmap_dump_task,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 8844499c14c4..4bdcc4bc5fbd 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -23,12 +23,13 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
+" [-N COUNT] [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
" -t COUNT Stall every COUNT'th user thread\n"
" -T COUNT Stall every COUNT'th kernel thread\n"
+" -N COUNT Size of the task_ctx arena slab (default 16384)\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -P Print out DSQ content and event counters to trace_pipe every second\n"
@@ -73,8 +74,9 @@ int main(int argc, char **argv)
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+ skel->rodata->max_tasks = 16384;
- while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:vh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -94,6 +96,9 @@ int main(int argc, char **argv)
case 'b':
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
+ case 'N':
+ skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
+ break;
case 'P':
skel->rodata->print_dsqs_and_events = true;
break;
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index 52153230bfce..c183d82632b3 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -34,6 +34,9 @@ struct cpu_ctx {
__u32 cpuperf_target;
};
+/* Opaque to userspace; defined in scx_qmap.bpf.c. */
+struct task_ctx;
+
struct qmap_arena {
/* userspace-visible stats */
__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -52,6 +55,10 @@ struct qmap_arena {
__u64 core_sched_tail_seqs[5];
struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+
+ /* task_ctx slab; allocated and threaded by qmap_init() */
+ struct task_ctx __arena *task_ctxs;
+ struct task_ctx __arena *task_free_head;
};
#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
` (2 preceding siblings ...)
2026-04-16 8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
@ 2026-04-16 8:16 ` Tejun Heo
2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi
4 siblings, 2 replies; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 8:16 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo
Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.
Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
head/tail pointers and its own per-queue bpf_res_spin_lock.
qmap_dequeue() now properly removes tasks from the queue instead of
leaving stale entries for dispatch to skip.
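The list manipulation is the usual doubly-linked bookkeeping done under
the queue's lock; the enqueue path, roughly (a sketch, see
qmap_fifo_enqueue() in the diff):

  taskc->fifo = fifo;
  taskc->q_next = NULL;
  taskc->q_prev = fifo->tail;
  if (fifo->tail)
          fifo->tail->q_next = taskc;
  else
          fifo->head = taskc;
  fifo->tail = taskc;

qmap_fifo_pop() and qmap_fifo_remove() undo this from the head and from
the middle respectively.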
Signed-off-by: Tejun Heo <tj@kernel.org>
---
tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
tools/sched_ext/scx_qmap.h | 9 ++
2 files changed, 148 insertions(+), 82 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index e071969c8f32..c26997ff7863 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -71,31 +71,24 @@ struct {
struct qmap_arena __arena qa;
-struct qmap {
- __uint(type, BPF_MAP_TYPE_QUEUE);
- __uint(max_entries, 4096);
- __type(value, u32);
-} queue0 SEC(".maps"),
- queue1 SEC(".maps"),
- queue2 SEC(".maps"),
- queue3 SEC(".maps"),
- queue4 SEC(".maps"),
- dump_store SEC(".maps");
-
-struct {
- __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
- __uint(max_entries, 5);
- __type(key, int);
- __array(values, struct qmap);
-} queue_arr SEC(".maps") = {
- .values = {
- [0] = &queue0,
- [1] = &queue1,
- [2] = &queue2,
- [3] = &queue3,
- [4] = &queue4,
- },
-};
+/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
+__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
+__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
+__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
+__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
+__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
+
+static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
+{
+ switch (qid) {
+ case 0: return &qa_q_lock0;
+ case 1: return &qa_q_lock1;
+ case 2: return &qa_q_lock2;
+ case 3: return &qa_q_lock3;
+ case 4: return &qa_q_lock4;
+ default: return NULL;
+ }
+}
/*
* If enabled, CPU performance target is set according to the queue index
@@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
* arena. While the task is alive the entry is referenced from task_ctx_stor;
* while it's free the entry sits on the free list singly-linked through
* @next_free.
+ *
+ * When the task is queued on one of the five priority FIFOs, @fifo points to
+ * that queue and @q_next/@q_prev link it in the queue's doubly-linked list.
+ * @fifo is NULL when the task isn't on any queue.
*/
struct task_ctx {
struct task_ctx __arena *next_free; /* only valid on free list */
+ struct task_ctx __arena *q_next; /* queue link, NULL if tail */
+ struct task_ctx __arena *q_prev; /* queue link, NULL if head */
+ struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
+ s32 pid;
bool force_local; /* Dispatch directly to local_dsq */
bool highpri;
u64 core_sched_seq;
@@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
return v->taskc;
}
+/* Append @taskc to the tail of @fifo. Must not already be queued. */
+static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
+ struct task_ctx __arena *taskc)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+
+ if (!lock || qmap_spin_lock(lock))
+ return;
+ taskc->fifo = fifo;
+ taskc->q_next = NULL;
+ taskc->q_prev = fifo->tail;
+ if (fifo->tail)
+ fifo->tail->q_next = taskc;
+ else
+ fifo->head = taskc;
+ fifo->tail = taskc;
+ bpf_res_spin_unlock(lock);
+}
+
+/* Pop the head of @fifo. Returns NULL if empty. */
+static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
+{
+ struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+ struct task_ctx __arena *taskc;
+
+ if (!lock || qmap_spin_lock(lock))
+ return NULL;
+ taskc = fifo->head;
+ if (taskc) {
+ fifo->head = taskc->q_next;
+ if (taskc->q_next)
+ taskc->q_next->q_prev = NULL;
+ else
+ fifo->tail = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ }
+ bpf_res_spin_unlock(lock);
+ return taskc;
+}
+
+/* Remove @taskc from its fifo. No-op if not queued. */
+static void qmap_fifo_remove(struct task_ctx __arena *taskc)
+{
+ struct qmap_fifo __arena *fifo = taskc->fifo;
+ struct bpf_res_spin_lock *lock;
+
+ if (!fifo)
+ return;
+
+ lock = qa_q_lock(fifo->idx);
+ if (!lock || qmap_spin_lock(lock))
+ return;
+
+ /* Re-check under lock — a concurrent pop may have cleared fifo. */
+ if (taskc->fifo != fifo) {
+ bpf_res_spin_unlock(lock);
+ return;
+ }
+
+ if (taskc->q_next)
+ taskc->q_next->q_prev = taskc->q_prev;
+ else
+ fifo->tail = taskc->q_prev;
+ if (taskc->q_prev)
+ taskc->q_prev->q_next = taskc->q_next;
+ else
+ fifo->head = taskc->q_next;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ bpf_res_spin_unlock(lock);
+}
+
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
struct task_ctx __arena *taskc;
- u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
- void *ring;
s32 cpu;
if (enq_flags & SCX_ENQ_REENQ) {
@@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
- ring = bpf_map_lookup_elem(&queue_arr, &idx);
- if (!ring) {
- scx_bpf_error("failed to find ring %d", idx);
- return;
- }
-
- /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
- if (bpf_map_push_elem(ring, &pid, 0)) {
- scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
- return;
- }
+ /* Queue on the selected FIFO. */
+ qmap_fifo_enqueue(&qa.fifos[idx], taskc);
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
taskc->highpri = true;
@@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
__sync_fetch_and_add(&qa.nr_enqueued, 1);
}
-/*
- * The BPF queue map doesn't support removal and sched_ext can handle spurious
- * dispatches. qmap_dequeue() is only used to collect statistics.
- */
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
+ struct task_ctx __arena *taskc;
+
__sync_fetch_and_add(&qa.nr_dequeued, 1);
if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
__sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
+
+ taskc = lookup_task_ctx(p);
+ if (taskc && taskc->fifo) {
+ if (taskc->highpri)
+ __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
+ qmap_fifo_remove(taskc);
+ }
}
static void update_core_sched_head_seq(struct task_struct *p)
@@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
struct cpu_ctx __arena *cpuc;
struct task_ctx __arena *taskc;
u32 batch = dsp_batch ?: 1;
- void *fifo;
- s32 i, pid;
+ s32 i;
if (dispatch_highpri(false))
return;
@@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
- u64 dsp_idx = cpuc->dsp_idx;
-
- fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
- if (!fifo) {
- scx_bpf_error("failed to find ring %llu", dsp_idx);
- return;
- }
-
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
struct task_ctx __arena *taskc;
- if (bpf_map_pop_elem(fifo, &pid))
+ taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
+ if (!taskc)
break;
- p = bpf_task_from_pid(pid);
+ p = bpf_task_from_pid(taskc->pid);
if (!p)
continue;
- if (!(taskc = lookup_task_ctx(p))) {
- bpf_task_release(p);
- return;
- }
-
if (taskc->highpri)
__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
@@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
}
taskc->next_free = NULL;
+ taskc->q_next = NULL;
+ taskc->q_prev = NULL;
+ taskc->fifo = NULL;
+ taskc->pid = p->pid;
taskc->force_local = false;
taskc->highpri = false;
taskc->core_sched_seq = 0;
@@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
{
- s32 i, pid;
+ struct task_ctx __arena *taskc;
+ s32 i;
+
+ QMAP_TOUCH_ARENA();
if (suppress_dump)
return;
+ /*
+ * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
+ * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
+ * a lock and dump. Best-effort; racing may print stale pids but the
+ * walk is bounded by bpf_repeat() so it always terminates.
+ */
bpf_for(i, 0, 5) {
- void *fifo;
-
- if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
- return;
-
scx_bpf_dump("QMAP FIFO[%d]:", i);
-
- /*
- * Dump can be invoked anytime and there is no way to iterate in
- * a non-destructive way. Pop and store in dump_store and then
- * restore afterwards. If racing against new enqueues, ordering
- * can get mixed up.
- */
- bpf_repeat(4096) {
- if (bpf_map_pop_elem(fifo, &pid))
- break;
- bpf_map_push_elem(&dump_store, &pid, 0);
- scx_bpf_dump(" %d", pid);
- }
-
+ taskc = qa.fifos[i].head;
bpf_repeat(4096) {
- if (bpf_map_pop_elem(&dump_store, &pid))
+ if (!taskc)
break;
- bpf_map_push_elem(fifo, &pid, 0);
+ scx_bpf_dump(" %d", taskc->pid);
+ taskc = taskc->q_next;
}
-
scx_bpf_dump("\n");
}
}
@@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
QMAP_TOUCH_ARENA();
+ QMAP_TOUCH_ARENA();
+
if (suppress_dump)
return;
v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
@@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
}
qa.task_ctxs = slab;
+ bpf_for(i, 0, 5)
+ qa.fifos[i].idx = i;
+
bpf_for(i, 0, max_tasks)
slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
qa.task_free_head = &slab[0];
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index c183d82632b3..9c0da5a301cb 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -37,6 +37,12 @@ struct cpu_ctx {
/* Opaque to userspace; defined in scx_qmap.bpf.c. */
struct task_ctx;
+struct qmap_fifo {
+ struct task_ctx __arena *head;
+ struct task_ctx __arena *tail;
+ __s32 idx;
+};
+
struct qmap_arena {
/* userspace-visible stats */
__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -59,6 +65,9 @@ struct qmap_arena {
/* task_ctx slab; allocated and threaded by qmap_init() */
struct task_ctx __arena *task_ctxs;
struct task_ctx __arena *task_free_head;
+
+ /* five priority FIFOs, each a doubly-linked list through task_ctx */
+ struct qmap_fifo fifos[5];
};
#endif /* __SCX_QMAP_H */
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
1 sibling, 0 replies; 12+ messages in thread
From: Andrea Righi @ 2026-04-16 10:01 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Changwoo Min, Emil Tsalapatis, sched-ext,
linux-kernel
Hi Tejun,
On Wed, Apr 15, 2026 at 10:16:26PM -1000, Tejun Heo wrote:
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.
We should probably update the description at the beginning of the files as well,
mentioning the arena-backed lists.
>
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
...
> @@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>
> QMAP_TOUCH_ARENA();
>
> + QMAP_TOUCH_ARENA();
> +
Copy/paste noise?
> if (suppress_dump)
> return;
> v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> }
> qa.task_ctxs = slab;
>
> + bpf_for(i, 0, 5)
> + qa.fifos[i].idx = i;
> +
> bpf_for(i, 0, max_tasks)
> slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> qa.task_free_head = &slab[0];
Thanks,
-Andrea
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
` (3 preceding siblings ...)
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 10:05 ` Andrea Righi
4 siblings, 0 replies; 12+ messages in thread
From: Andrea Righi @ 2026-04-16 10:05 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Changwoo Min, Emil Tsalapatis, sched-ext,
linux-kernel
Hi Tejun,
On Wed, Apr 15, 2026 at 10:16:22PM -1000, Tejun Heo wrote:
> Hello,
>
> Arena simplifies verification and allows more natural programming. This
> patchset converts scx_qmap to use BPF arena for all mutable state, as
> preparation for further sub-sched work.
>
> 0001 Rename tctx to taskc for consistency.
> 0002 Move globals and cpu_ctx into arena.
> 0003 Move task_ctx into an arena slab with bpf_res_spin_lock.
> 0004 Replace FIFO queue maps with arena-backed doubly-linked lists.
>
> Based on linus/master (1d51b370a0f8).
Sent a couple of comments about patch 4; everything else looks good to me.
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Thanks,
-Andrea
>
> tools/sched_ext/include/scx/common.bpf.h | 4 +
> tools/sched_ext/scx_qmap.bpf.c | 561 ++++++++++++++---------
> tools/sched_ext/scx_qmap.c | 54 +--
> tools/sched_ext/scx_qmap.h | 73 +++
> 4 files changed, 459 insertions(+), 233 deletions(-)
>
> Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git qmap-arena
>
> --
> tejun
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
@ 2026-04-16 14:56 ` Emil Tsalapatis
0 siblings, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 14:56 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:16 AM Tejun Heo <tj@kernel.org> wrote:
>
> Rename the per-task context local variable from tctx to taskc for
> consistency.
>
> No functional change.
>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
> tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
> 1 file changed, 30 insertions(+), 30 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index b68abb9e760b..a18234f3c27a 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
>
> static struct task_ctx *lookup_task_ctx(struct task_struct *p)
> {
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> - if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> + if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> scx_bpf_error("task_ctx lookup failed");
> return NULL;
> }
> - return tctx;
> + return taskc;
> }
>
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> s32 cpu;
>
> - if (!(tctx = lookup_task_ctx(p)))
> + if (!(taskc = lookup_task_ctx(p)))
> return -ESRCH;
>
> if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
> @@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> cpu = pick_direct_dispatch_cpu(p, prev_cpu);
>
> if (cpu >= 0) {
> - tctx->force_local = true;
> + taskc->force_local = true;
> return cpu;
> } else {
> return prev_cpu;
> @@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
> void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> {
> static u32 user_cnt, kernel_cnt;
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> void *ring;
> @@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> if (test_error_cnt && !--test_error_cnt)
> scx_bpf_error("test triggering error");
>
> - if (!(tctx = lookup_task_ctx(p)))
> + if (!(taskc = lookup_task_ctx(p)))
> return;
>
> /*
> * All enqueued tasks must have their core_sched_seq updated for correct
> * core-sched ordering. Also, take a look at the end of qmap_dispatch().
> */
> - tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
> + taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
>
> /*
> * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
> @@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> static u32 immed_stress_cnt;
>
> if (!(++immed_stress_cnt % immed_stress_nth)) {
> - tctx->force_local = false;
> + taskc->force_local = false;
> scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
> slice_ns, enq_flags);
> return;
> @@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> * If qmap_select_cpu() is telling us to or this is the last runnable
> * task on the CPU, enqueue locally.
> */
> - if (tctx->force_local) {
> - tctx->force_local = false;
> + if (taskc->force_local) {
> + taskc->force_local = false;
> scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
> return;
> }
> @@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> }
>
> if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> - tctx->highpri = true;
> + taskc->highpri = true;
> __sync_fetch_and_add(&nr_highpri_queued, 1);
> }
> __sync_fetch_and_add(&nr_enqueued, 1);
> @@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> static void update_core_sched_head_seq(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> - if ((tctx = lookup_task_ctx(p)))
> - core_sched_head_seqs[idx] = tctx->core_sched_seq;
> + if ((taskc = lookup_task_ctx(p)))
> + core_sched_head_seqs[idx] = taskc->core_sched_seq;
> }
>
> /*
> @@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
> /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
> bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
> static u64 highpri_seq;
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> - if (!(tctx = lookup_task_ctx(p)))
> + if (!(taskc = lookup_task_ctx(p)))
> return false;
>
> - if (tctx->highpri) {
> + if (taskc->highpri) {
> /* exercise the set_*() and vtime interface too */
> scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
> scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
> @@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> {
> struct task_struct *p;
> struct cpu_ctx *cpuc;
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> u32 zero = 0, batch = dsp_batch ?: 1;
> void *fifo;
> s32 i, pid;
> @@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>
> /* Dispatch or advance. */
> bpf_repeat(BPF_MAX_LOOPS) {
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
>
> if (bpf_map_pop_elem(fifo, &pid))
> break;
> @@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> if (!p)
> continue;
>
> - if (!(tctx = lookup_task_ctx(p))) {
> + if (!(taskc = lookup_task_ctx(p))) {
> bpf_task_release(p);
> return;
> }
>
> - if (tctx->highpri)
> + if (taskc->highpri)
> __sync_fetch_and_sub(&nr_highpri_queued, 1);
>
> update_core_sched_head_seq(p);
> @@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> * if the task were enqueued and dispatched immediately.
> */
> if (prev) {
> - tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> - if (!tctx) {
> + taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> + if (!taskc) {
> scx_bpf_error("task_ctx lookup failed");
> return;
> }
>
> - tctx->core_sched_seq =
> + taskc->core_sched_seq =
> core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> }
> }
> @@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
> static s64 task_qdist(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *tctx;
> + struct task_ctx *taskc;
> s64 qdist;
>
> - tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> - if (!tctx) {
> + taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> + if (!taskc) {
> scx_bpf_error("task_ctx lookup failed");
> return 0;
> }
>
> - qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
> + qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
>
> /*
> * As queue index increments, the priority doubles. The queue w/ index 3
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
2026-04-16 8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
@ 2026-04-16 15:28 ` Emil Tsalapatis
0 siblings, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:28 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Move mutable scheduler state from BSS globals and a percpu array map
> into a single BPF arena map. A shared struct qmap_arena is declared as
> an __arena global so BPF accesses it directly and userspace reaches it
> through skel->arena->qa.
>
> Scheduling logic unchanged; only memory backing changes.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> tools/sched_ext/scx_qmap.bpf.c | 152 ++++++++++++++-------------------
> tools/sched_ext/scx_qmap.c | 45 +++++-----
> tools/sched_ext/scx_qmap.h | 57 +++++++++++++
> 3 files changed, 147 insertions(+), 107 deletions(-)
> create mode 100644 tools/sched_ext/scx_qmap.h
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index a18234f3c27a..0f8fbb6d0bc2 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -22,6 +22,8 @@
> */
> #include <scx/common.bpf.h>
>
> +#include "scx_qmap.h"
> +
> enum consts {
> ONE_SEC_IN_NS = 1000000000,
> ONE_MSEC_IN_NS = 1000000,
> @@ -48,14 +50,26 @@ const volatile bool suppress_dump;
> const volatile bool always_enq_immed;
> const volatile u32 immed_stress_nth;
>
> -u64 nr_highpri_queued;
> -u32 test_error_cnt;
> -
> -#define MAX_SUB_SCHEDS 8
> -u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
> -
> UEI_DEFINE(uei);
>
> +/*
> + * All mutable scheduler state - per-cpu context, stats counters, core-sched
> + * sequence numbers, sub-sched cgroup ids - lives in this single BPF arena map.
> + * Userspace reaches it via skel->arena->qa.
> + */
> +struct {
> + __uint(type, BPF_MAP_TYPE_ARENA);
> + __uint(map_flags, BPF_F_MMAPABLE);
> + __uint(max_entries, 1 << 16); /* upper bound in pages */
I assume this is picked to handle 64K pages on ARM.
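If I'm doing the math right, 64K entries caps the arena at 64K * 4K = 256MB
with 4K pages and 64K * 64K = 4GB with 64K pages (max_entries counts pages
for arena maps, per the comment above), so there's plenty of headroom for
struct qmap_arena either way.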
> +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
> + __ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
> +#else
> + __ulong(map_extra, 0x1ull << 44);
> +#endif
> +} arena SEC(".maps");
> +
> +struct qmap_arena __arena qa;
> +
> struct qmap {
> __uint(type, BPF_MAP_TYPE_QUEUE);
> __uint(max_entries, 4096);
> @@ -102,8 +116,6 @@ static const u32 qidx_to_cpuperf_target[] = {
> * task's seq and the associated queue's head seq is called the queue distance
> * and used when comparing two tasks for ordering. See qmap_core_sched_before().
> */
> -static u64 core_sched_head_seqs[5];
> -static u64 core_sched_tail_seqs[5];
>
> /* Per-task scheduling context */
> struct task_ctx {
> @@ -119,27 +131,6 @@ struct {
> __type(value, struct task_ctx);
> } task_ctx_stor SEC(".maps");
>
> -struct cpu_ctx {
> - u64 dsp_idx; /* dispatch index */
> - u64 dsp_cnt; /* remaining count */
> - u32 avg_weight;
> - u32 cpuperf_target;
> -};
> -
> -struct {
> - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> - __uint(max_entries, 1);
> - __type(key, u32);
> - __type(value, struct cpu_ctx);
> -} cpu_ctx_stor SEC(".maps");
> -
> -/* Statistics */
> -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
> -u64 nr_core_sched_execed;
> -u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
> -u32 cpuperf_min, cpuperf_avg, cpuperf_max;
> -u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
> -
> static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
> {
> s32 cpu;
> @@ -215,9 +206,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> s32 cpu;
>
> if (enq_flags & SCX_ENQ_REENQ) {
> - __sync_fetch_and_add(&nr_reenqueued, 1);
> + __sync_fetch_and_add(&qa.nr_reenqueued, 1);
> if (scx_bpf_task_cpu(p) == 0)
> - __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
> + __sync_fetch_and_add(&qa.nr_reenqueued_cpu0, 1);
> }
>
> if (p->flags & PF_KTHREAD) {
> @@ -228,7 +219,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> return;
> }
>
> - if (test_error_cnt && !--test_error_cnt)
> + if (qa.test_error_cnt && !--qa.test_error_cnt)
> scx_bpf_error("test triggering error");
>
> if (!(taskc = lookup_task_ctx(p)))
> @@ -238,7 +229,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> * All enqueued tasks must have their core_sched_seq updated for correct
> * core-sched ordering. Also, take a look at the end of qmap_dispatch().
> */
> - taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
> + taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
>
> /*
> * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
> @@ -276,7 +267,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> /* if select_cpu() wasn't called, try direct dispatch */
> if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
> (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
> - __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
> + __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
> scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
> return;
> }
> @@ -311,9 +302,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>
> if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> taskc->highpri = true;
> - __sync_fetch_and_add(&nr_highpri_queued, 1);
> + __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
> }
> - __sync_fetch_and_add(&nr_enqueued, 1);
> + __sync_fetch_and_add(&qa.nr_enqueued, 1);
> }
>
> /*
> @@ -322,9 +313,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> */
> void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> {
> - __sync_fetch_and_add(&nr_dequeued, 1);
> + __sync_fetch_and_add(&qa.nr_dequeued, 1);
> if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
> - __sync_fetch_and_add(&nr_core_sched_execed, 1);
> + __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
> }
>
> static void update_core_sched_head_seq(struct task_struct *p)
> @@ -333,7 +324,7 @@ static void update_core_sched_head_seq(struct task_struct *p)
> struct task_ctx *taskc;
>
> if ((taskc = lookup_task_ctx(p)))
> - core_sched_head_seqs[idx] = taskc->core_sched_seq;
> + qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
> }
>
> /*
> @@ -384,14 +375,14 @@ static bool dispatch_highpri(bool from_timer)
> SCX_ENQ_PREEMPT)) {
> if (cpu == this_cpu) {
> dispatched = true;
> - __sync_fetch_and_add(&nr_expedited_local, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_local, 1);
> } else {
> - __sync_fetch_and_add(&nr_expedited_remote, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
> }
> if (from_timer)
> - __sync_fetch_and_add(&nr_expedited_from_timer, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
> } else {
> - __sync_fetch_and_add(&nr_expedited_lost, 1);
> + __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
> }
>
> if (dispatched)
> @@ -404,19 +395,19 @@ static bool dispatch_highpri(bool from_timer)
> void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> {
> struct task_struct *p;
> - struct cpu_ctx *cpuc;
> + struct cpu_ctx __arena *cpuc;
> struct task_ctx *taskc;
> - u32 zero = 0, batch = dsp_batch ?: 1;
> + u32 batch = dsp_batch ?: 1;
> void *fifo;
> s32 i, pid;
>
> if (dispatch_highpri(false))
> return;
>
> - if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
> + if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
> return;
>
> - if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
> + if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
> /*
> * PID 2 should be kthreadd which should mostly be idle and off
> * the scheduler. Let's keep dispatching it to force the kernel
> @@ -430,10 +421,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
> }
>
> - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
> - scx_bpf_error("failed to look up cpu_ctx");
> - return;
> - }
> + cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
>
> for (i = 0; i < 5; i++) {
> /* Advance the dispatch cursor and pick the fifo. */
> @@ -442,9 +430,11 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
> }
>
> - fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
> + u64 dsp_idx = cpuc->dsp_idx;
> +
> + fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
> if (!fifo) {
> - scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
> + scx_bpf_error("failed to find ring %llu", dsp_idx);
> return;
> }
>
> @@ -465,10 +455,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
>
> if (taskc->highpri)
> - __sync_fetch_and_sub(&nr_highpri_queued, 1);
> + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
>
> update_core_sched_head_seq(p);
> - __sync_fetch_and_add(&nr_dispatched, 1);
> + __sync_fetch_and_add(&qa.nr_dispatched, 1);
>
> scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
>
> @@ -529,8 +519,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
>
> for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> - if (sub_sched_cgroup_ids[i] &&
> - scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
> + if (qa.sub_sched_cgroup_ids[i] &&
> + scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
> return;
> }
>
> @@ -546,21 +536,15 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> }
>
> taskc->core_sched_seq =
> - core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> + qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> }
> }
>
> void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
> {
> - struct cpu_ctx *cpuc;
> - u32 zero = 0;
> + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
> int idx;
>
> - if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
> - scx_bpf_error("failed to look up cpu_ctx");
> - return;
> - }
> -
> /*
> * Use the running avg of weights to select the target cpuperf level.
> * This is a demonstration of the cpuperf feature rather than a
> @@ -589,7 +573,7 @@ static s64 task_qdist(struct task_struct *p)
> return 0;
> }
>
> - qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
> + qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
>
> /*
> * As queue index increments, the priority doubles. The queue w/ index 3
> @@ -679,13 +663,10 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
>
> void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
> {
> - u32 zero = 0;
> - struct cpu_ctx *cpuc;
> + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cpu];
>
> if (suppress_dump || idle)
> return;
> - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
> - return;
>
> scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
> cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
> @@ -802,7 +783,7 @@ struct {
> */
> static void monitor_cpuperf(void)
> {
> - u32 zero = 0, nr_cpu_ids;
> + u32 nr_cpu_ids;
> u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
> u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
> const struct cpumask *online;
> @@ -812,7 +793,7 @@ static void monitor_cpuperf(void)
> online = scx_bpf_get_online_cpumask();
>
> bpf_for(i, 0, nr_cpu_ids) {
> - struct cpu_ctx *cpuc;
> + struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[i];
> u32 cap, cur;
>
> if (!bpf_cpumask_test_cpu(i, online))
> @@ -834,11 +815,6 @@ static void monitor_cpuperf(void)
> cur_sum += cur * cap / SCX_CPUPERF_ONE;
> cap_sum += cap;
>
> - if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
> - scx_bpf_error("failed to look up cpu_ctx");
> - goto out;
> - }
> -
> /* collect target */
> cur = cpuc->cpuperf_target;
> target_sum += cur;
> @@ -846,14 +822,14 @@ static void monitor_cpuperf(void)
> target_max = cur > target_max ? cur : target_max;
> }
>
> - cpuperf_min = cur_min;
> - cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
> - cpuperf_max = cur_max;
> + qa.cpuperf_min = cur_min;
> + qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
> + qa.cpuperf_max = cur_max;
> +
> + qa.cpuperf_target_min = target_min;
> + qa.cpuperf_target_avg = target_sum / nr_online_cpus;
> + qa.cpuperf_target_max = target_max;
>
> - cpuperf_target_min = target_min;
> - cpuperf_target_avg = target_sum / nr_online_cpus;
> - cpuperf_target_max = target_max;
> -out:
> scx_bpf_put_cpumask(online);
> }
>
> @@ -996,8 +972,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
> s32 i;
>
> for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> - if (!sub_sched_cgroup_ids[i]) {
> - sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
> + if (!qa.sub_sched_cgroup_ids[i]) {
> + qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
> bpf_printk("attaching sub-sched[%d] on %s",
> i, args->cgroup_path);
> return 0;
> @@ -1012,8 +988,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
> s32 i;
>
> for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> - if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
> - sub_sched_cgroup_ids[i] = 0;
> + if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
> + qa.sub_sched_cgroup_ids[i] = 0;
> bpf_printk("detaching sub-sched[%d] on %s",
> i, args->cgroup_path);
> break;
> diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
> index e7c89a2bc3d8..8844499c14c4 100644
> --- a/tools/sched_ext/scx_qmap.c
> +++ b/tools/sched_ext/scx_qmap.c
> @@ -10,9 +10,11 @@
> #include <inttypes.h>
> #include <signal.h>
> #include <libgen.h>
> +#include <sys/mman.h>
> #include <sys/stat.h>
> #include <bpf/bpf.h>
> #include <scx/common.h>
> +#include "scx_qmap.h"
> #include "scx_qmap.bpf.skel.h"
>
> const char help_fmt[] =
> @@ -60,6 +62,8 @@ int main(int argc, char **argv)
> {
> struct scx_qmap *skel;
> struct bpf_link *link;
> + struct qmap_arena *qa;
> + __u32 test_error_cnt = 0;
> int opt;
>
> libbpf_set_print(libbpf_print_fn);
> @@ -76,7 +80,7 @@ int main(int argc, char **argv)
> skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
> break;
> case 'e':
> - skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
> + test_error_cnt = strtoul(optarg, NULL, 0);
> break;
> case 't':
> skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
> @@ -142,29 +146,32 @@ int main(int argc, char **argv)
> SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
> link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
>
> + qa = &skel->arena->qa;
> + qa->test_error_cnt = test_error_cnt;
> +
> while (!exit_req && !UEI_EXITED(skel, uei)) {
> - long nr_enqueued = skel->bss->nr_enqueued;
> - long nr_dispatched = skel->bss->nr_dispatched;
> + long nr_enqueued = qa->nr_enqueued;
> + long nr_dispatched = qa->nr_dispatched;
>
> - printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
> + printf("stats : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
> nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
> - skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
> - skel->bss->nr_dequeued,
> - skel->bss->nr_core_sched_execed,
> - skel->bss->nr_ddsp_from_enq);
> - printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
> - skel->bss->nr_expedited_local,
> - skel->bss->nr_expedited_remote,
> - skel->bss->nr_expedited_from_timer,
> - skel->bss->nr_expedited_lost);
> + qa->nr_reenqueued, qa->nr_reenqueued_cpu0,
> + qa->nr_dequeued,
> + qa->nr_core_sched_execed,
> + qa->nr_ddsp_from_enq);
> + printf(" exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
> + qa->nr_expedited_local,
> + qa->nr_expedited_remote,
> + qa->nr_expedited_from_timer,
> + qa->nr_expedited_lost);
> if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
> printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
> - skel->bss->cpuperf_min,
> - skel->bss->cpuperf_avg,
> - skel->bss->cpuperf_max,
> - skel->bss->cpuperf_target_min,
> - skel->bss->cpuperf_target_avg,
> - skel->bss->cpuperf_target_max);
> + qa->cpuperf_min,
> + qa->cpuperf_avg,
> + qa->cpuperf_max,
> + qa->cpuperf_target_min,
> + qa->cpuperf_target_avg,
> + qa->cpuperf_target_max);
> fflush(stdout);
> sleep(1);
> }
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> new file mode 100644
> index 000000000000..52153230bfce
> --- /dev/null
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -0,0 +1,57 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
> + *
> + * The scheduler keeps all mutable state in a single BPF arena map. struct
> + * qmap_arena is the one object that lives at the base of the arena and is
> + * mmap'd into userspace so the loader can read counters directly.
> + *
> + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
> + * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
> + */
> +#ifndef __SCX_QMAP_H
> +#define __SCX_QMAP_H
> +
> +#ifdef __BPF__
> +#include <scx/bpf_arena_common.bpf.h>
> +#else
> +#include <linux/types.h>
> +#include <scx/bpf_arena_common.h>
> +#endif
> +
> +#define MAX_SUB_SCHEDS 8
> +
> +/*
> + * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
> + * userspace. Keep this in sync with NR_CPUS used by the BPF side.
> + */
> +#define SCX_QMAP_MAX_CPUS 1024
> +
> +struct cpu_ctx {
> + __u64 dsp_idx; /* dispatch index */
> + __u64 dsp_cnt; /* remaining count */
> + __u32 avg_weight;
> + __u32 cpuperf_target;
> +};
> +
> +struct qmap_arena {
> + /* userspace-visible stats */
> + __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> + __u64 nr_dequeued, nr_ddsp_from_enq;
> + __u64 nr_core_sched_execed;
> + __u64 nr_expedited_local, nr_expedited_remote;
> + __u64 nr_expedited_lost, nr_expedited_from_timer;
> + __u64 nr_highpri_queued;
> + __u32 test_error_cnt;
> + __u32 cpuperf_min, cpuperf_avg, cpuperf_max;
> + __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
> +
> + /* kernel-side runtime state */
> + __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
> + __u64 core_sched_head_seqs[5];
> + __u64 core_sched_tail_seqs[5];
> +
> + struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
> +};
> +
> +#endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab
2026-04-16 8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
@ 2026-04-16 15:31 ` Emil Tsalapatis
0 siblings, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:31 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Allocate per-task context from an arena slab instead of storing it
> directly in task_storage. task_ctx_stor now holds an arena pointer to
> the task's slab entry. Free entries form a singly-linked list protected
> by bpf_res_spin_lock; slab exhaustion triggers scx_bpf_error().
>
> The slab size is configurable via the new -N option (default 16384).
>
> Also add bpf_res_spin_lock/unlock declarations to common.bpf.h.
>
> Scheduling logic unchanged.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
One nit: since we never have non-arena task_ctxs, we can do
typedef struct task_ctx __arena task_ctx;
to avoid annotating every instance.
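Roughly, just to illustrate what that buys us (untested sketch):

  typedef struct task_ctx __arena task_ctx;

  /* declarations and locals then drop the per-use __arena annotation, e.g.: */
  static task_ctx *lookup_task_ctx(struct task_struct *p);
  task_ctx *taskc;

No functional difference; it just trims the noise from the signatures.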
> tools/sched_ext/include/scx/common.bpf.h | 4 +
> tools/sched_ext/scx_qmap.bpf.c | 178 ++++++++++++++++++-----
> tools/sched_ext/scx_qmap.c | 9 +-
> tools/sched_ext/scx_qmap.h | 7 +
> 4 files changed, 159 insertions(+), 39 deletions(-)
>
> diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
> index 19459dedde41..35fc62556241 100644
> --- a/tools/sched_ext/include/scx/common.bpf.h
> +++ b/tools/sched_ext/include/scx/common.bpf.h
> @@ -526,6 +526,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
> void bpf_rcu_read_lock(void) __ksym;
> void bpf_rcu_read_unlock(void) __ksym;
>
> +/* resilient qspinlock */
> +int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
> +void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
> +
> /*
> * Time helpers, most of which are from jiffies.h.
> */
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 0f8fbb6d0bc2..e071969c8f32 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -49,6 +49,7 @@ const volatile s32 disallow_tgid;
> const volatile bool suppress_dump;
> const volatile bool always_enq_immed;
> const volatile u32 immed_stress_nth;
> +const volatile u32 max_tasks;
>
> UEI_DEFINE(uei);
>
> @@ -117,20 +118,43 @@ static const u32 qidx_to_cpuperf_target[] = {
> * and used when comparing two tasks for ordering. See qmap_core_sched_before().
> */
>
> -/* Per-task scheduling context */
> +/*
> + * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
> + * arena. While the task is alive the entry is referenced from task_ctx_stor;
> + * while it's free the entry sits on the free list singly-linked through
> + * @next_free.
> + */
> struct task_ctx {
> - bool force_local; /* Dispatch directly to local_dsq */
> - bool highpri;
> - u64 core_sched_seq;
> + struct task_ctx __arena *next_free; /* only valid on free list */
> + bool force_local; /* Dispatch directly to local_dsq */
> + bool highpri;
> + u64 core_sched_seq;
> +};
> +
> +/* Holds an arena pointer to the task's slab entry. */
> +struct task_ctx_stor_val {
> + struct task_ctx __arena *taskc;
> };
>
> struct {
> __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
> __uint(map_flags, BPF_F_NO_PREALLOC);
> __type(key, int);
> - __type(value, struct task_ctx);
> + __type(value, struct task_ctx_stor_val);
> } task_ctx_stor SEC(".maps");
>
> +/* Protects the task_ctx slab free list. */
> +__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
> +
> +static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
> +{
> + if (bpf_res_spin_lock(lock)) {
> + scx_bpf_error("res_spin_lock failed");
> + return -EBUSY;
> + }
> + return 0;
> +}
> +
> static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
> {
> s32 cpu;
> @@ -148,21 +172,34 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
> return -1;
> }
>
> -static struct task_ctx *lookup_task_ctx(struct task_struct *p)
> +/*
> + * Force a reference to the arena map. The verifier associates an arena with
> + * a program by finding an LD_IMM64 instruction that loads the arena's BPF
> + * map; programs that only use arena pointers returned from task-local
> + * storage (like qmap_select_cpu) never reference @arena directly. Without
> + * this, the verifier rejects addr_space_cast with "addr_space_cast insn
> + * can only be used in a program that has an associated arena".
> + */
> +#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
> +
Really nice that this works when placed as a macro.
> +static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
> {
> - struct task_ctx *taskc;
> + struct task_ctx_stor_val *v;
> +
> + QMAP_TOUCH_ARENA();
>
> - if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> + v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> + if (!v || !v->taskc) {
> scx_bpf_error("task_ctx lookup failed");
> return NULL;
> }
> - return taskc;
> + return v->taskc;
> }
>
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> s32 cpu;
>
> if (!(taskc = lookup_task_ctx(p)))
> @@ -199,7 +236,7 @@ static int weight_to_idx(u32 weight)
> void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> {
> static u32 user_cnt, kernel_cnt;
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> void *ring;
> @@ -321,7 +358,7 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> static void update_core_sched_head_seq(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
>
> if ((taskc = lookup_task_ctx(p)))
> qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
> @@ -345,7 +382,7 @@ static bool dispatch_highpri(bool from_timer)
> /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
> bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
> static u64 highpri_seq;
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
>
> if (!(taskc = lookup_task_ctx(p)))
> return false;
> @@ -396,7 +433,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> {
> struct task_struct *p;
> struct cpu_ctx __arena *cpuc;
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> u32 batch = dsp_batch ?: 1;
> void *fifo;
> s32 i, pid;
> @@ -440,7 +477,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>
> /* Dispatch or advance. */
> bpf_repeat(BPF_MAX_LOOPS) {
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
>
> if (bpf_map_pop_elem(fifo, &pid))
> break;
> @@ -529,11 +566,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> * if the task were enqueued and dispatched immediately.
> */
> if (prev) {
> - taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> - if (!taskc) {
> - scx_bpf_error("task_ctx lookup failed");
> + taskc = lookup_task_ctx(prev);
> + if (!taskc)
> return;
> - }
>
> taskc->core_sched_seq =
> qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> @@ -564,14 +599,12 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
> static s64 task_qdist(struct task_struct *p)
> {
> int idx = weight_to_idx(p->scx.weight);
> - struct task_ctx *taskc;
> + struct task_ctx __arena *taskc;
> s64 qdist;
>
> - taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> - if (!taskc) {
> - scx_bpf_error("task_ctx lookup failed");
> + taskc = lookup_task_ctx(p);
> + if (!taskc)
> return 0;
> - }
>
> qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
>
> @@ -606,21 +639,64 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
> * tasks when a higher-priority scheduling class takes the CPU.
> */
>
> -s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
> - struct scx_init_task_args *args)
> +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
> + struct scx_init_task_args *args)
> {
> + struct task_ctx_stor_val *v;
> + struct task_ctx __arena *taskc;
> +
> if (p->tgid == disallow_tgid)
> p->scx.disallow = true;
>
> - /*
> - * @p is new. Let's ensure that its task_ctx is available. We can sleep
> - * in this function and the following will automatically use GFP_KERNEL.
> - */
> - if (bpf_task_storage_get(&task_ctx_stor, p, 0,
> - BPF_LOCAL_STORAGE_GET_F_CREATE))
> - return 0;
> - else
> + /* pop a slab entry off the free list */
> + if (qmap_spin_lock(&qa_task_lock))
> + return -EBUSY;
> + taskc = qa.task_free_head;
> + if (taskc)
> + qa.task_free_head = taskc->next_free;
> + bpf_res_spin_unlock(&qa_task_lock);
> + if (!taskc) {
> + scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
> + return -ENOMEM;
> + }
> +
> + taskc->next_free = NULL;
> + taskc->force_local = false;
> + taskc->highpri = false;
> + taskc->core_sched_seq = 0;
> +
> + v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
> + BPF_LOCAL_STORAGE_GET_F_CREATE);
> + if (!v) {
> + /* push back to the free list */
> + if (!qmap_spin_lock(&qa_task_lock)) {
> + taskc->next_free = qa.task_free_head;
> + qa.task_free_head = taskc;
> + bpf_res_spin_unlock(&qa_task_lock);
> + }
> return -ENOMEM;
> + }
> + v->taskc = taskc;
> + return 0;
> +}
> +
> +void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
> + struct scx_exit_task_args *args)
> +{
> + struct task_ctx_stor_val *v;
> + struct task_ctx __arena *taskc;
> +
> + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> + if (!v || !v->taskc)
> + return;
> + taskc = v->taskc;
> + v->taskc = NULL;
> +
> + if (qmap_spin_lock(&qa_task_lock))
> + return;
> + taskc->next_free = qa.task_free_head;
> + qa.task_free_head = taskc;
> + bpf_res_spin_unlock(&qa_task_lock);
> }
>
> void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
> @@ -675,12 +751,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
>
> void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
> {
> - struct task_ctx *taskc;
> + struct task_ctx_stor_val *v;
> + struct task_ctx __arena *taskc;
> +
> + QMAP_TOUCH_ARENA();
>
> if (suppress_dump)
> return;
> - if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
> + v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> + if (!v || !v->taskc)
> return;
> + taskc = v->taskc;
>
> scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
> taskc->force_local, taskc->core_sched_seq);
> @@ -915,10 +996,32 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
>
> s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> {
> - u32 key = 0;
> + struct task_ctx __arena *slab;
> + u32 nr_pages, key = 0, i;
> struct bpf_timer *timer;
> s32 ret;
>
> + /*
> + * Allocate the task_ctx slab in arena and thread the entire slab onto
> + * the free list. max_tasks is set by userspace before load.
> + */
> + if (!max_tasks) {
> + scx_bpf_error("max_tasks must be > 0");
> + return -EINVAL;
> + }
> +
> + nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
> + slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
> + if (!slab) {
> + scx_bpf_error("failed to allocate task_ctx slab");
> + return -ENOMEM;
> + }
> + qa.task_ctxs = slab;
> +
> + bpf_for(i, 0, max_tasks)
> + slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> + qa.task_free_head = &slab[0];
> +
> if (print_msgs && !sub_cgroup_id)
> print_cpus();
>
> @@ -1005,6 +1108,7 @@ SCX_OPS_DEFINE(qmap_ops,
> .tick = (void *)qmap_tick,
> .core_sched_before = (void *)qmap_core_sched_before,
> .init_task = (void *)qmap_init_task,
> + .exit_task = (void *)qmap_exit_task,
> .dump = (void *)qmap_dump,
> .dump_cpu = (void *)qmap_dump_cpu,
> .dump_task = (void *)qmap_dump_task,
> diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
> index 8844499c14c4..4bdcc4bc5fbd 100644
> --- a/tools/sched_ext/scx_qmap.c
> +++ b/tools/sched_ext/scx_qmap.c
> @@ -23,12 +23,13 @@ const char help_fmt[] =
> "See the top-level comment in .bpf.c for more details.\n"
> "\n"
> "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
> -" [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
> +" [-N COUNT] [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
> "\n"
> " -s SLICE_US Override slice duration\n"
> " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
> " -t COUNT Stall every COUNT'th user thread\n"
> " -T COUNT Stall every COUNT'th kernel thread\n"
> +" -N COUNT Size of the task_ctx arena slab (default 16384)\n"
> " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
> " -b COUNT Dispatch upto COUNT tasks together\n"
> " -P Print out DSQ content and event counters to trace_pipe every second\n"
> @@ -73,8 +74,9 @@ int main(int argc, char **argv)
> skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
>
> skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
> + skel->rodata->max_tasks = 16384;
>
> - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
> + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:vh")) != -1) {
> switch (opt) {
> case 's':
> skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
> @@ -94,6 +96,9 @@ int main(int argc, char **argv)
> case 'b':
> skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
> break;
> + case 'N':
> + skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
> + break;
> case 'P':
> skel->rodata->print_dsqs_and_events = true;
> break;
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> index 52153230bfce..c183d82632b3 100644
> --- a/tools/sched_ext/scx_qmap.h
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -34,6 +34,9 @@ struct cpu_ctx {
> __u32 cpuperf_target;
> };
>
> +/* Opaque to userspace; defined in scx_qmap.bpf.c. */
> +struct task_ctx;
> +
> struct qmap_arena {
> /* userspace-visible stats */
> __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> @@ -52,6 +55,10 @@ struct qmap_arena {
> __u64 core_sched_tail_seqs[5];
>
> struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
> +
> + /* task_ctx slab; allocated and threaded by qmap_init() */
> + struct task_ctx __arena *task_ctxs;
> + struct task_ctx __arena *task_free_head;
> };
>
> #endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:01 ` Andrea Righi
@ 2026-04-16 15:45 ` Emil Tsalapatis
1 sibling, 0 replies; 12+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:45 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
sched-ext, linux-kernel
On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.
>
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
> tools/sched_ext/scx_qmap.h | 9 ++
> 2 files changed, 148 insertions(+), 82 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index e071969c8f32..c26997ff7863 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -71,31 +71,24 @@ struct {
>
> struct qmap_arena __arena qa;
>
> -struct qmap {
> - __uint(type, BPF_MAP_TYPE_QUEUE);
> - __uint(max_entries, 4096);
> - __type(value, u32);
> -} queue0 SEC(".maps"),
> - queue1 SEC(".maps"),
> - queue2 SEC(".maps"),
> - queue3 SEC(".maps"),
> - queue4 SEC(".maps"),
> - dump_store SEC(".maps");
> -
> -struct {
> - __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
> - __uint(max_entries, 5);
> - __type(key, int);
> - __array(values, struct qmap);
> -} queue_arr SEC(".maps") = {
> - .values = {
> - [0] = &queue0,
> - [1] = &queue1,
> - [2] = &queue2,
> - [3] = &queue3,
> - [4] = &queue4,
> - },
> -};
> +/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
> +__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
> +__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
> +__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
> +__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
> +__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
> +
> +static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
> +{
> + switch (qid) {
> + case 0: return &qa_q_lock0;
> + case 1: return &qa_q_lock1;
> + case 2: return &qa_q_lock2;
> + case 3: return &qa_q_lock3;
> + case 4: return &qa_q_lock4;
> + default: return NULL;
> + }
> +}
>
> /*
> * If enabled, CPU performance target is set according to the queue index
> @@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
> * arena. While the task is alive the entry is referenced from task_ctx_stor;
> * while it's free the entry sits on the free list singly-linked through
> * @next_free.
> + *
> + * When the task is queued on one of the five priority FIFOs, @q_idx is the
> + * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
> + * @q_idx is -1 when the task isn't on any queue.
> */
> struct task_ctx {
> struct task_ctx __arena *next_free; /* only valid on free list */
> + struct task_ctx __arena *q_next; /* queue link, NULL if tail */
> + struct task_ctx __arena *q_prev; /* queue link, NULL if head */
> + struct qmap_fifo __arena *fifo; /* queue we're on, NULL if not queued */
> + s32 pid;
> bool force_local; /* Dispatch directly to local_dsq */
> bool highpri;
> u64 core_sched_seq;
> @@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
> return v->taskc;
> }
>
> +/* Append @taskc to the tail of @fifo. Must not already be queued. */
> +static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
> + struct task_ctx __arena *taskc)
> +{
> + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> +
> + if (!lock || qmap_spin_lock(lock))
> + return;
> + taskc->fifo = fifo;
> + taskc->q_next = NULL;
> + taskc->q_prev = fifo->tail;
> + if (fifo->tail)
> + fifo->tail->q_next = taskc;
> + else
> + fifo->head = taskc;
> + fifo->tail = taskc;
> + bpf_res_spin_unlock(lock);
> +}
> +
> +/* Pop the head of @fifo. Returns NULL if empty. */
> +static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
> +{
> + struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> + struct task_ctx __arena *taskc;
> +
> + if (!lock || qmap_spin_lock(lock))
> + return NULL;
> + taskc = fifo->head;
> + if (taskc) {
> + fifo->head = taskc->q_next;
> + if (taskc->q_next)
> + taskc->q_next->q_prev = NULL;
> + else
> + fifo->tail = NULL;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + }
> + bpf_res_spin_unlock(lock);
> + return taskc;
> +}
> +
> +/* Remove @taskc from its fifo. No-op if not queued. */
> +static void qmap_fifo_remove(struct task_ctx __arena *taskc)
> +{
> + struct qmap_fifo __arena *fifo = taskc->fifo;
> + struct bpf_res_spin_lock *lock;
> +
> + if (!fifo)
> + return;
> +
> + lock = qa_q_lock(fifo->idx);
> + if (!lock || qmap_spin_lock(lock))
> + return;
> +
> + /* Re-check under lock — a concurrent pop may have cleared fifo. */
> + if (taskc->fifo != fifo) {
> + bpf_res_spin_unlock(lock);
> + return;
> + }
> +
> + if (taskc->q_next)
> + taskc->q_next->q_prev = taskc->q_prev;
> + else
> + fifo->tail = taskc->q_prev;
> + if (taskc->q_prev)
> + taskc->q_prev->q_next = taskc->q_next;
> + else
> + fifo->head = taskc->q_next;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + bpf_res_spin_unlock(lock);
> +}
> +
> s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
> s32 prev_cpu, u64 wake_flags)
> {
> @@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> {
> static u32 user_cnt, kernel_cnt;
> struct task_ctx __arena *taskc;
> - u32 pid = p->pid;
> int idx = weight_to_idx(p->scx.weight);
> - void *ring;
> s32 cpu;
>
> if (enq_flags & SCX_ENQ_REENQ) {
> @@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> return;
> }
>
> - ring = bpf_map_lookup_elem(&queue_arr, &idx);
> - if (!ring) {
> - scx_bpf_error("failed to find ring %d", idx);
> - return;
> - }
> -
> - /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
> - if (bpf_map_push_elem(ring, &pid, 0)) {
> - scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
> - return;
> - }
> + /* Queue on the selected FIFO. */
> + qmap_fifo_enqueue(&qa.fifos[idx], taskc);
>
> if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> taskc->highpri = true;
> @@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
> __sync_fetch_and_add(&qa.nr_enqueued, 1);
> }
>
> -/*
> - * The BPF queue map doesn't support removal and sched_ext can handle spurious
> - * dispatches. qmap_dequeue() is only used to collect statistics.
> - */
> void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
> {
> + struct task_ctx __arena *taskc;
> +
> __sync_fetch_and_add(&qa.nr_dequeued, 1);
> if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
> __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
> +
> + taskc = lookup_task_ctx(p);
> + if (taskc && taskc->fifo) {
> + if (taskc->highpri)
> + __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
> + qmap_fifo_remove(taskc);
> + }
> }
>
> static void update_core_sched_head_seq(struct task_struct *p)
> @@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> struct cpu_ctx __arena *cpuc;
> struct task_ctx __arena *taskc;
> u32 batch = dsp_batch ?: 1;
> - void *fifo;
> - s32 i, pid;
> + s32 i;
>
> if (dispatch_highpri(false))
> return;
> @@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
> cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
> }
>
> - u64 dsp_idx = cpuc->dsp_idx;
> -
> - fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
> - if (!fifo) {
> - scx_bpf_error("failed to find ring %llu", dsp_idx);
> - return;
> - }
> -
> /* Dispatch or advance. */
> bpf_repeat(BPF_MAX_LOOPS) {
> struct task_ctx __arena *taskc;
>
> - if (bpf_map_pop_elem(fifo, &pid))
> + taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
> + if (!taskc)
> break;
>
> - p = bpf_task_from_pid(pid);
> + p = bpf_task_from_pid(taskc->pid);
> if (!p)
> continue;
>
> - if (!(taskc = lookup_task_ctx(p))) {
> - bpf_task_release(p);
> - return;
> - }
> -
> if (taskc->highpri)
> __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
>
> @@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
> }
>
> taskc->next_free = NULL;
> + taskc->q_next = NULL;
> + taskc->q_prev = NULL;
> + taskc->fifo = NULL;
> + taskc->pid = p->pid;
> taskc->force_local = false;
> taskc->highpri = false;
> taskc->core_sched_seq = 0;
> @@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
>
> void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
> {
> - s32 i, pid;
> + struct task_ctx __arena *taskc;
> + s32 i;
> +
> + QMAP_TOUCH_ARENA();
>
> if (suppress_dump)
> return;
>
> + /*
> + * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
> + * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
> + * a lock and dump. Best-effort; racing may print stale pids but the
> + * walk is bounded by bpf_repeat() so it always terminates.
> + */
> bpf_for(i, 0, 5) {
> - void *fifo;
> -
> - if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
> - return;
> -
> scx_bpf_dump("QMAP FIFO[%d]:", i);
> -
> - /*
> - * Dump can be invoked anytime and there is no way to iterate in
> - * a non-destructive way. Pop and store in dump_store and then
> - * restore afterwards. If racing against new enqueues, ordering
> - * can get mixed up.
> - */
> - bpf_repeat(4096) {
> - if (bpf_map_pop_elem(fifo, &pid))
> - break;
> - bpf_map_push_elem(&dump_store, &pid, 0);
> - scx_bpf_dump(" %d", pid);
> - }
> -
> + taskc = qa.fifos[i].head;
> bpf_repeat(4096) {
> - if (bpf_map_pop_elem(&dump_store, &pid))
> + if (!taskc)
> break;
> - bpf_map_push_elem(fifo, &pid, 0);
> + scx_bpf_dump(" %d", taskc->pid);
> + taskc = taskc->q_next;
> }
> -
> scx_bpf_dump("\n");
> }
> }
> @@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>
> QMAP_TOUCH_ARENA();
>
> + QMAP_TOUCH_ARENA();
> +
> if (suppress_dump)
> return;
> v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
> }
> qa.task_ctxs = slab;
>
> + bpf_for(i, 0, 5)
> + qa.fifos[i].idx = i;
> +
> bpf_for(i, 0, max_tasks)
> slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> qa.task_free_head = &slab[0];
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> index c183d82632b3..9c0da5a301cb 100644
> --- a/tools/sched_ext/scx_qmap.h
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -37,6 +37,12 @@ struct cpu_ctx {
> /* Opaque to userspace; defined in scx_qmap.bpf.c. */
> struct task_ctx;
>
> +struct qmap_fifo {
> + struct task_ctx __arena *head;
> + struct task_ctx __arena *tail;
> + __s32 idx;
> +};
> +
> struct qmap_arena {
> /* userspace-visible stats */
> __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> @@ -59,6 +65,9 @@ struct qmap_arena {
> /* task_ctx slab; allocated and threaded by qmap_init() */
> struct task_ctx __arena *task_ctxs;
> struct task_ctx __arena *task_free_head;
> +
> + /* five priority FIFOs, each a doubly-linked list through task_ctx */
> + struct qmap_fifo fifos[5];
> };
>
> #endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
2026-04-16 17:20 [PATCHSET v2 " Tejun Heo
@ 2026-04-16 17:20 ` Tejun Heo
0 siblings, 0 replies; 12+ messages in thread
From: Tejun Heo @ 2026-04-16 17:20 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min
Cc: Emil Tsalapatis, sched-ext, linux-kernel
Rename the per-task context local variable from tctx to taskc for
consistency.
No functional change.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
---
tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
1 file changed, 30 insertions(+), 30 deletions(-)
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index b68abb9e760b..a18234f3c27a 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
scx_bpf_error("task_ctx lookup failed");
return NULL;
}
- return tctx;
+ return taskc;
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s32 cpu;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return -ESRCH;
if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
@@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
cpu = pick_direct_dispatch_cpu(p, prev_cpu);
if (cpu >= 0) {
- tctx->force_local = true;
+ taskc->force_local = true;
return cpu;
} else {
return prev_cpu;
@@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
static u32 user_cnt, kernel_cnt;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
@@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return;
/*
* All enqueued tasks must have their core_sched_seq updated for correct
* core-sched ordering. Also, take a look at the end of qmap_dispatch().
*/
- tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+ taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
/*
* IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
static u32 immed_stress_cnt;
if (!(++immed_stress_cnt % immed_stress_nth)) {
- tctx->force_local = false;
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
slice_ns, enq_flags);
return;
@@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
* If qmap_select_cpu() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
- if (tctx->force_local) {
- tctx->force_local = false;
+ if (taskc->force_local) {
+ taskc->force_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
@@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
}
if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
- tctx->highpri = true;
+ taskc->highpri = true;
__sync_fetch_and_add(&nr_highpri_queued, 1);
}
__sync_fetch_and_add(&nr_enqueued, 1);
@@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
static void update_core_sched_head_seq(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if ((tctx = lookup_task_ctx(p)))
- core_sched_head_seqs[idx] = tctx->core_sched_seq;
+ if ((taskc = lookup_task_ctx(p)))
+ core_sched_head_seqs[idx] = taskc->core_sched_seq;
}
/*
@@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
static u64 highpri_seq;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
- if (!(tctx = lookup_task_ctx(p)))
+ if (!(taskc = lookup_task_ctx(p)))
return false;
- if (tctx->highpri) {
+ if (taskc->highpri) {
/* exercise the set_*() and vtime interface too */
scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
@@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx *cpuc;
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
@@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
if (bpf_map_pop_elem(fifo, &pid))
break;
@@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (!p)
continue;
- if (!(tctx = lookup_task_ctx(p))) {
+ if (!(taskc = lookup_task_ctx(p))) {
bpf_task_release(p);
return;
}
- if (tctx->highpri)
+ if (taskc->highpri)
__sync_fetch_and_sub(&nr_highpri_queued, 1);
update_core_sched_head_seq(p);
@@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
- tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
}
- tctx->core_sched_seq =
+ taskc->core_sched_seq =
core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
@@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
static s64 task_qdist(struct task_struct *p)
{
int idx = weight_to_idx(p->scx.weight);
- struct task_ctx *tctx;
+ struct task_ctx *taskc;
s64 qdist;
- tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!tctx) {
+ taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return 0;
}
- qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+ qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
/*
* As queue index increments, the priority doubles. The queue w/ index 3
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread
end of thread, other threads:[~2026-04-16 17:20 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-16 8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16 8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
2026-04-16 14:56 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
2026-04-16 15:28 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
2026-04-16 15:31 ` Emil Tsalapatis
2026-04-16 8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:01 ` Andrea Righi
2026-04-16 15:45 ` Emil Tsalapatis
2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi
-- strict thread matches above, loose matches on Subject: below --
2026-04-16 17:20 [PATCHSET v2 " Tejun Heo
2026-04-16 17:20 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo