public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena
@ 2026-04-16  8:16 Tejun Heo
  2026-04-16  8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Tejun Heo @ 2026-04-16  8:16 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: Emil Tsalapatis, sched-ext, linux-kernel

Hello,

Arena simplifies verification and allows more natural programming. This
patchset converts scx_qmap to use BPF arena for all mutable state, as
preparation for further sub-sched work.

 0001 Rename tctx to taskc for consistency.
 0002 Move globals and cpu_ctx into arena.
 0003 Move task_ctx into an arena slab with bpf_res_spin_lock.
 0004 Replace FIFO queue maps with arena-backed doubly-linked lists.

Based on linus/master (1d51b370a0f8).

 tools/sched_ext/include/scx/common.bpf.h |   4 +
 tools/sched_ext/scx_qmap.bpf.c           | 561 ++++++++++++++---------
 tools/sched_ext/scx_qmap.c               |  54 +--
 tools/sched_ext/scx_qmap.h               |  73 +++
 4 files changed, 459 insertions(+), 233 deletions(-)

Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git qmap-arena

--
tejun

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
  2026-04-16  8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
@ 2026-04-16  8:16 ` Tejun Heo
  2026-04-16 14:56   ` Emil Tsalapatis
  2026-04-16  8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2026-04-16  8:16 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo

Rename the per-task context local variable from tctx to taskc for
consistency.

No functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index b68abb9e760b..a18234f3c27a 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 
 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
 {
-	struct task_ctx *tctx;
+	struct task_ctx *taskc;
 
-	if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
 		scx_bpf_error("task_ctx lookup failed");
 		return NULL;
 	}
-	return tctx;
+	return taskc;
 }
 
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
-	struct task_ctx *tctx;
+	struct task_ctx *taskc;
 	s32 cpu;
 
-	if (!(tctx = lookup_task_ctx(p)))
+	if (!(taskc = lookup_task_ctx(p)))
 		return -ESRCH;
 
 	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
@@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 	cpu = pick_direct_dispatch_cpu(p, prev_cpu);
 
 	if (cpu >= 0) {
-		tctx->force_local = true;
+		taskc->force_local = true;
 		return cpu;
 	} else {
 		return prev_cpu;
@@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	static u32 user_cnt, kernel_cnt;
-	struct task_ctx *tctx;
+	struct task_ctx *taskc;
 	u32 pid = p->pid;
 	int idx = weight_to_idx(p->scx.weight);
 	void *ring;
@@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	if (test_error_cnt && !--test_error_cnt)
 		scx_bpf_error("test triggering error");
 
-	if (!(tctx = lookup_task_ctx(p)))
+	if (!(taskc = lookup_task_ctx(p)))
 		return;
 
 	/*
 	 * All enqueued tasks must have their core_sched_seq updated for correct
 	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
 	 */
-	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+	taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
 
 	/*
 	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 		static u32 immed_stress_cnt;
 
 		if (!(++immed_stress_cnt % immed_stress_nth)) {
-			tctx->force_local = false;
+			taskc->force_local = false;
 			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
 					   slice_ns, enq_flags);
 			return;
@@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	 * If qmap_select_cpu() is telling us to or this is the last runnable
 	 * task on the CPU, enqueue locally.
 	 */
-	if (tctx->force_local) {
-		tctx->force_local = false;
+	if (taskc->force_local) {
+		taskc->force_local = false;
 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
 		return;
 	}
@@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	}
 
 	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
-		tctx->highpri = true;
+		taskc->highpri = true;
 		__sync_fetch_and_add(&nr_highpri_queued, 1);
 	}
 	__sync_fetch_and_add(&nr_enqueued, 1);
@@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
 static void update_core_sched_head_seq(struct task_struct *p)
 {
 	int idx = weight_to_idx(p->scx.weight);
-	struct task_ctx *tctx;
+	struct task_ctx *taskc;
 
-	if ((tctx = lookup_task_ctx(p)))
-		core_sched_head_seqs[idx] = tctx->core_sched_seq;
+	if ((taskc = lookup_task_ctx(p)))
+		core_sched_head_seqs[idx] = taskc->core_sched_seq;
 }
 
 /*
@@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
 	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
 	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
 		static u64 highpri_seq;
-		struct task_ctx *tctx;
+		struct task_ctx *taskc;
 
-		if (!(tctx = lookup_task_ctx(p)))
+		if (!(taskc = lookup_task_ctx(p)))
 			return false;
 
-		if (tctx->highpri) {
+		if (taskc->highpri) {
 			/* exercise the set_*() and vtime interface too */
 			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
 			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
@@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cpu_ctx *cpuc;
-	struct task_ctx *tctx;
+	struct task_ctx *taskc;
 	u32 zero = 0, batch = dsp_batch ?: 1;
 	void *fifo;
 	s32 i, pid;
@@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 
 		/* Dispatch or advance. */
 		bpf_repeat(BPF_MAX_LOOPS) {
-			struct task_ctx *tctx;
+			struct task_ctx *taskc;
 
 			if (bpf_map_pop_elem(fifo, &pid))
 				break;
@@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 			if (!p)
 				continue;
 
-			if (!(tctx = lookup_task_ctx(p))) {
+			if (!(taskc = lookup_task_ctx(p))) {
 				bpf_task_release(p);
 				return;
 			}
 
-			if (tctx->highpri)
+			if (taskc->highpri)
 				__sync_fetch_and_sub(&nr_highpri_queued, 1);
 
 			update_core_sched_head_seq(p);
@@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 	 * if the task were enqueued and dispatched immediately.
 	 */
 	if (prev) {
-		tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
-		if (!tctx) {
+		taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+		if (!taskc) {
 			scx_bpf_error("task_ctx lookup failed");
 			return;
 		}
 
-		tctx->core_sched_seq =
+		taskc->core_sched_seq =
 			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
 	}
 }
@@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
 static s64 task_qdist(struct task_struct *p)
 {
 	int idx = weight_to_idx(p->scx.weight);
-	struct task_ctx *tctx;
+	struct task_ctx *taskc;
 	s64 qdist;
 
-	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-	if (!tctx) {
+	taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	if (!taskc) {
 		scx_bpf_error("task_ctx lookup failed");
 		return 0;
 	}
 
-	qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+	qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
 
 	/*
 	 * As queue index increments, the priority doubles. The queue w/ index 3
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
  2026-04-16  8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
  2026-04-16  8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
@ 2026-04-16  8:16 ` Tejun Heo
  2026-04-16 15:28   ` Emil Tsalapatis
  2026-04-16  8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2026-04-16  8:16 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo

Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.

Move mutable scheduler state from BSS globals and a percpu array map
into a single BPF arena map. A shared struct qmap_arena is declared as
an __arena global so BPF accesses it directly and userspace reaches it
through skel->arena->qa.

Scheduling logic unchanged; only memory backing changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_qmap.bpf.c | 152 ++++++++++++++-------------------
 tools/sched_ext/scx_qmap.c     |  45 +++++-----
 tools/sched_ext/scx_qmap.h     |  57 +++++++++++++
 3 files changed, 147 insertions(+), 107 deletions(-)
 create mode 100644 tools/sched_ext/scx_qmap.h

diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index a18234f3c27a..0f8fbb6d0bc2 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -22,6 +22,8 @@
  */
 #include <scx/common.bpf.h>
 
+#include "scx_qmap.h"
+
 enum consts {
 	ONE_SEC_IN_NS		= 1000000000,
 	ONE_MSEC_IN_NS		= 1000000,
@@ -48,14 +50,26 @@ const volatile bool suppress_dump;
 const volatile bool always_enq_immed;
 const volatile u32 immed_stress_nth;
 
-u64 nr_highpri_queued;
-u32 test_error_cnt;
-
-#define MAX_SUB_SCHEDS		8
-u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
-
 UEI_DEFINE(uei);
 
+/*
+ * All mutable scheduler state - per-cpu context, stats counters, core-sched
+ * sequence numbers, sub-sched cgroup ids - lives in this single BPF arena map.
+ * Userspace reaches it via skel->arena->qa.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, 1 << 16);		/* upper bound in pages */
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+	__ulong(map_extra, 0x1ull << 32);	/* user/BPF mmap base */
+#else
+	__ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+struct qmap_arena __arena qa;
+
 struct qmap {
 	__uint(type, BPF_MAP_TYPE_QUEUE);
 	__uint(max_entries, 4096);
@@ -102,8 +116,6 @@ static const u32 qidx_to_cpuperf_target[] = {
  * task's seq and the associated queue's head seq is called the queue distance
  * and used when comparing two tasks for ordering. See qmap_core_sched_before().
  */
-static u64 core_sched_head_seqs[5];
-static u64 core_sched_tail_seqs[5];
 
 /* Per-task scheduling context */
 struct task_ctx {
@@ -119,27 +131,6 @@ struct {
 	__type(value, struct task_ctx);
 } task_ctx_stor SEC(".maps");
 
-struct cpu_ctx {
-	u64	dsp_idx;	/* dispatch index */
-	u64	dsp_cnt;	/* remaining count */
-	u32	avg_weight;
-	u32	cpuperf_target;
-};
-
-struct {
-	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
-	__uint(max_entries, 1);
-	__type(key, u32);
-	__type(value, struct cpu_ctx);
-} cpu_ctx_stor SEC(".maps");
-
-/* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
-u64 nr_core_sched_execed;
-u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
-u32 cpuperf_min, cpuperf_avg, cpuperf_max;
-u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
-
 static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 {
 	s32 cpu;
@@ -215,9 +206,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	s32 cpu;
 
 	if (enq_flags & SCX_ENQ_REENQ) {
-		__sync_fetch_and_add(&nr_reenqueued, 1);
+		__sync_fetch_and_add(&qa.nr_reenqueued, 1);
 		if (scx_bpf_task_cpu(p) == 0)
-			__sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+			__sync_fetch_and_add(&qa.nr_reenqueued_cpu0, 1);
 	}
 
 	if (p->flags & PF_KTHREAD) {
@@ -228,7 +219,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 			return;
 	}
 
-	if (test_error_cnt && !--test_error_cnt)
+	if (qa.test_error_cnt && !--qa.test_error_cnt)
 		scx_bpf_error("test triggering error");
 
 	if (!(taskc = lookup_task_ctx(p)))
@@ -238,7 +229,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	 * All enqueued tasks must have their core_sched_seq updated for correct
 	 * core-sched ordering. Also, take a look at the end of qmap_dispatch().
 	 */
-	taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
+	taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
 
 	/*
 	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
@@ -276,7 +267,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	/* if select_cpu() wasn't called, try direct dispatch */
 	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
-		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
+		__sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
 		return;
 	}
@@ -311,9 +302,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 
 	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
 		taskc->highpri = true;
-		__sync_fetch_and_add(&nr_highpri_queued, 1);
+		__sync_fetch_and_add(&qa.nr_highpri_queued, 1);
 	}
-	__sync_fetch_and_add(&nr_enqueued, 1);
+	__sync_fetch_and_add(&qa.nr_enqueued, 1);
 }
 
 /*
@@ -322,9 +313,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
  */
 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
 {
-	__sync_fetch_and_add(&nr_dequeued, 1);
+	__sync_fetch_and_add(&qa.nr_dequeued, 1);
 	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
-		__sync_fetch_and_add(&nr_core_sched_execed, 1);
+		__sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
 }
 
 static void update_core_sched_head_seq(struct task_struct *p)
@@ -333,7 +324,7 @@ static void update_core_sched_head_seq(struct task_struct *p)
 	struct task_ctx *taskc;
 
 	if ((taskc = lookup_task_ctx(p)))
-		core_sched_head_seqs[idx] = taskc->core_sched_seq;
+		qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
 }
 
 /*
@@ -384,14 +375,14 @@ static bool dispatch_highpri(bool from_timer)
 				     SCX_ENQ_PREEMPT)) {
 			if (cpu == this_cpu) {
 				dispatched = true;
-				__sync_fetch_and_add(&nr_expedited_local, 1);
+				__sync_fetch_and_add(&qa.nr_expedited_local, 1);
 			} else {
-				__sync_fetch_and_add(&nr_expedited_remote, 1);
+				__sync_fetch_and_add(&qa.nr_expedited_remote, 1);
 			}
 			if (from_timer)
-				__sync_fetch_and_add(&nr_expedited_from_timer, 1);
+				__sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
 		} else {
-			__sync_fetch_and_add(&nr_expedited_lost, 1);
+			__sync_fetch_and_add(&qa.nr_expedited_lost, 1);
 		}
 
 		if (dispatched)
@@ -404,19 +395,19 @@ static bool dispatch_highpri(bool from_timer)
 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct task_struct *p;
-	struct cpu_ctx *cpuc;
+	struct cpu_ctx __arena *cpuc;
 	struct task_ctx *taskc;
-	u32 zero = 0, batch = dsp_batch ?: 1;
+	u32 batch = dsp_batch ?: 1;
 	void *fifo;
 	s32 i, pid;
 
 	if (dispatch_highpri(false))
 		return;
 
-	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
+	if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
 		return;
 
-	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+	if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
 		/*
 		 * PID 2 should be kthreadd which should mostly be idle and off
 		 * the scheduler. Let's keep dispatching it to force the kernel
@@ -430,10 +421,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 		}
 	}
 
-	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
-		scx_bpf_error("failed to look up cpu_ctx");
-		return;
-	}
+	cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
 
 	for (i = 0; i < 5; i++) {
 		/* Advance the dispatch cursor and pick the fifo. */
@@ -442,9 +430,11 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
 		}
 
-		fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
+		u64 dsp_idx = cpuc->dsp_idx;
+
+		fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
 		if (!fifo) {
-			scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
+			scx_bpf_error("failed to find ring %llu", dsp_idx);
 			return;
 		}
 
@@ -465,10 +455,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 			}
 
 			if (taskc->highpri)
-				__sync_fetch_and_sub(&nr_highpri_queued, 1);
+				__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
 
 			update_core_sched_head_seq(p);
-			__sync_fetch_and_add(&nr_dispatched, 1);
+			__sync_fetch_and_add(&qa.nr_dispatched, 1);
 
 			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
 
@@ -529,8 +519,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 	}
 
 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
-		if (sub_sched_cgroup_ids[i] &&
-		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+		if (qa.sub_sched_cgroup_ids[i] &&
+		    scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
 			return;
 	}
 
@@ -546,21 +536,15 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 		}
 
 		taskc->core_sched_seq =
-			core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+			qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
 	}
 }
 
 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
 {
-	struct cpu_ctx *cpuc;
-	u32 zero = 0;
+	struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
 	int idx;
 
-	if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
-		scx_bpf_error("failed to look up cpu_ctx");
-		return;
-	}
-
 	/*
 	 * Use the running avg of weights to select the target cpuperf level.
 	 * This is a demonstration of the cpuperf feature rather than a
@@ -589,7 +573,7 @@ static s64 task_qdist(struct task_struct *p)
 		return 0;
 	}
 
-	qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
+	qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
 
 	/*
 	 * As queue index increments, the priority doubles. The queue w/ index 3
@@ -679,13 +663,10 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
 
 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
 {
-	u32 zero = 0;
-	struct cpu_ctx *cpuc;
+	struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cpu];
 
 	if (suppress_dump || idle)
 		return;
-	if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
-		return;
 
 	scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
 		     cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
@@ -802,7 +783,7 @@ struct {
  */
 static void monitor_cpuperf(void)
 {
-	u32 zero = 0, nr_cpu_ids;
+	u32 nr_cpu_ids;
 	u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
 	u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
 	const struct cpumask *online;
@@ -812,7 +793,7 @@ static void monitor_cpuperf(void)
 	online = scx_bpf_get_online_cpumask();
 
 	bpf_for(i, 0, nr_cpu_ids) {
-		struct cpu_ctx *cpuc;
+		struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[i];
 		u32 cap, cur;
 
 		if (!bpf_cpumask_test_cpu(i, online))
@@ -834,11 +815,6 @@ static void monitor_cpuperf(void)
 		cur_sum += cur * cap / SCX_CPUPERF_ONE;
 		cap_sum += cap;
 
-		if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
-			scx_bpf_error("failed to look up cpu_ctx");
-			goto out;
-		}
-
 		/* collect target */
 		cur = cpuc->cpuperf_target;
 		target_sum += cur;
@@ -846,14 +822,14 @@ static void monitor_cpuperf(void)
 		target_max = cur > target_max ? cur : target_max;
 	}
 
-	cpuperf_min = cur_min;
-	cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
-	cpuperf_max = cur_max;
+	qa.cpuperf_min = cur_min;
+	qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+	qa.cpuperf_max = cur_max;
+
+	qa.cpuperf_target_min = target_min;
+	qa.cpuperf_target_avg = target_sum / nr_online_cpus;
+	qa.cpuperf_target_max = target_max;
 
-	cpuperf_target_min = target_min;
-	cpuperf_target_avg = target_sum / nr_online_cpus;
-	cpuperf_target_max = target_max;
-out:
 	scx_bpf_put_cpumask(online);
 }
 
@@ -996,8 +972,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
 	s32 i;
 
 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
-		if (!sub_sched_cgroup_ids[i]) {
-			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+		if (!qa.sub_sched_cgroup_ids[i]) {
+			qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
 			bpf_printk("attaching sub-sched[%d] on %s",
 				   i, args->cgroup_path);
 			return 0;
@@ -1012,8 +988,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
 	s32 i;
 
 	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
-		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
-			sub_sched_cgroup_ids[i] = 0;
+		if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+			qa.sub_sched_cgroup_ids[i] = 0;
 			bpf_printk("detaching sub-sched[%d] on %s",
 				   i, args->cgroup_path);
 			break;
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index e7c89a2bc3d8..8844499c14c4 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -10,9 +10,11 @@
 #include <inttypes.h>
 #include <signal.h>
 #include <libgen.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
 #include <bpf/bpf.h>
 #include <scx/common.h>
+#include "scx_qmap.h"
 #include "scx_qmap.bpf.skel.h"
 
 const char help_fmt[] =
@@ -60,6 +62,8 @@ int main(int argc, char **argv)
 {
 	struct scx_qmap *skel;
 	struct bpf_link *link;
+	struct qmap_arena *qa;
+	__u32 test_error_cnt = 0;
 	int opt;
 
 	libbpf_set_print(libbpf_print_fn);
@@ -76,7 +80,7 @@ int main(int argc, char **argv)
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
 			break;
 		case 'e':
-			skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+			test_error_cnt = strtoul(optarg, NULL, 0);
 			break;
 		case 't':
 			skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
@@ -142,29 +146,32 @@ int main(int argc, char **argv)
 	SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
 	link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
 
+	qa = &skel->arena->qa;
+	qa->test_error_cnt = test_error_cnt;
+
 	while (!exit_req && !UEI_EXITED(skel, uei)) {
-		long nr_enqueued = skel->bss->nr_enqueued;
-		long nr_dispatched = skel->bss->nr_dispatched;
+		long nr_enqueued = qa->nr_enqueued;
+		long nr_dispatched = qa->nr_dispatched;
 
-		printf("stats  : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+		printf("stats  : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
 		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
-		       skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
-		       skel->bss->nr_dequeued,
-		       skel->bss->nr_core_sched_execed,
-		       skel->bss->nr_ddsp_from_enq);
-		printf("         exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
-		       skel->bss->nr_expedited_local,
-		       skel->bss->nr_expedited_remote,
-		       skel->bss->nr_expedited_from_timer,
-		       skel->bss->nr_expedited_lost);
+		       qa->nr_reenqueued, qa->nr_reenqueued_cpu0,
+		       qa->nr_dequeued,
+		       qa->nr_core_sched_execed,
+		       qa->nr_ddsp_from_enq);
+		printf("         exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
+		       qa->nr_expedited_local,
+		       qa->nr_expedited_remote,
+		       qa->nr_expedited_from_timer,
+		       qa->nr_expedited_lost);
 		if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
 			printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
-			       skel->bss->cpuperf_min,
-			       skel->bss->cpuperf_avg,
-			       skel->bss->cpuperf_max,
-			       skel->bss->cpuperf_target_min,
-			       skel->bss->cpuperf_target_avg,
-			       skel->bss->cpuperf_target_max);
+			       qa->cpuperf_min,
+			       qa->cpuperf_avg,
+			       qa->cpuperf_max,
+			       qa->cpuperf_target_min,
+			       qa->cpuperf_target_avg,
+			       qa->cpuperf_target_max);
 		fflush(stdout);
 		sleep(1);
 	}
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
new file mode 100644
index 000000000000..52153230bfce
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
+ *
+ * The scheduler keeps all mutable state in a single BPF arena map. struct
+ * qmap_arena is the one object that lives at the base of the arena and is
+ * mmap'd into userspace so the loader can read counters directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_QMAP_H
+#define __SCX_QMAP_H
+
+#ifdef __BPF__
+#include <scx/bpf_arena_common.bpf.h>
+#else
+#include <linux/types.h>
+#include <scx/bpf_arena_common.h>
+#endif
+
+#define MAX_SUB_SCHEDS		8
+
+/*
+ * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
+ * userspace. Keep this in sync with NR_CPUS used by the BPF side.
+ */
+#define SCX_QMAP_MAX_CPUS	1024
+
+struct cpu_ctx {
+	__u64 dsp_idx;		/* dispatch index */
+	__u64 dsp_cnt;		/* remaining count */
+	__u32 avg_weight;
+	__u32 cpuperf_target;
+};
+
+struct qmap_arena {
+	/* userspace-visible stats */
+	__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
+	__u64 nr_dequeued, nr_ddsp_from_enq;
+	__u64 nr_core_sched_execed;
+	__u64 nr_expedited_local, nr_expedited_remote;
+	__u64 nr_expedited_lost, nr_expedited_from_timer;
+	__u64 nr_highpri_queued;
+	__u32 test_error_cnt;
+	__u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+	__u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
+	/* kernel-side runtime state */
+	__u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+	__u64 core_sched_head_seqs[5];
+	__u64 core_sched_tail_seqs[5];
+
+	struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+};
+
+#endif /* __SCX_QMAP_H */
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab
  2026-04-16  8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
  2026-04-16  8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
  2026-04-16  8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
@ 2026-04-16  8:16 ` Tejun Heo
  2026-04-16 15:31   ` Emil Tsalapatis
  2026-04-16  8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
  2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi
  4 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2026-04-16  8:16 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo

Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.

Allocate per-task context from an arena slab instead of storing it
directly in task_storage. task_ctx_stor now holds an arena pointer to
the task's slab entry. Free entries form a singly-linked list protected
by bpf_res_spin_lock; slab exhaustion triggers scx_bpf_error().

The slab size is configurable via the new -N option (default 16384).

Also add bpf_res_spin_lock/unlock declarations to common.bpf.h.

Scheduling logic unchanged.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/common.bpf.h |   4 +
 tools/sched_ext/scx_qmap.bpf.c           | 178 ++++++++++++++++++-----
 tools/sched_ext/scx_qmap.c               |   9 +-
 tools/sched_ext/scx_qmap.h               |   7 +
 4 files changed, 159 insertions(+), 39 deletions(-)

diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 19459dedde41..35fc62556241 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -526,6 +526,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
 void bpf_rcu_read_lock(void) __ksym;
 void bpf_rcu_read_unlock(void) __ksym;
 
+/* resilient qspinlock */
+int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
+void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
+
 /*
  * Time helpers, most of which are from jiffies.h.
  */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 0f8fbb6d0bc2..e071969c8f32 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -49,6 +49,7 @@ const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;
 const volatile bool always_enq_immed;
 const volatile u32 immed_stress_nth;
+const volatile u32 max_tasks;
 
 UEI_DEFINE(uei);
 
@@ -117,20 +118,43 @@ static const u32 qidx_to_cpuperf_target[] = {
  * and used when comparing two tasks for ordering. See qmap_core_sched_before().
  */
 
-/* Per-task scheduling context */
+/*
+ * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
+ * arena. While the task is alive the entry is referenced from task_ctx_stor;
+ * while it's free the entry sits on the free list singly-linked through
+ * @next_free.
+ */
 struct task_ctx {
-	bool	force_local;	/* Dispatch directly to local_dsq */
-	bool	highpri;
-	u64	core_sched_seq;
+	struct task_ctx __arena	*next_free;	/* only valid on free list */
+	bool			force_local;	/* Dispatch directly to local_dsq */
+	bool			highpri;
+	u64			core_sched_seq;
+};
+
+/* Holds an arena pointer to the task's slab entry. */
+struct task_ctx_stor_val {
+	struct task_ctx __arena	*taskc;
 };
 
 struct {
 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 	__uint(map_flags, BPF_F_NO_PREALLOC);
 	__type(key, int);
-	__type(value, struct task_ctx);
+	__type(value, struct task_ctx_stor_val);
 } task_ctx_stor SEC(".maps");
 
+/* Protects the task_ctx slab free list. */
+__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
+
+static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
+{
+	if (bpf_res_spin_lock(lock)) {
+		scx_bpf_error("res_spin_lock failed");
+		return -EBUSY;
+	}
+	return 0;
+}
+
 static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 {
 	s32 cpu;
@@ -148,21 +172,34 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 	return -1;
 }
 
-static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+/*
+ * Force a reference to the arena map. The verifier associates an arena with
+ * a program by finding an LD_IMM64 instruction that loads the arena's BPF
+ * map; programs that only use arena pointers returned from task-local
+ * storage (like qmap_select_cpu) never reference @arena directly. Without
+ * this, the verifier rejects addr_space_cast with "addr_space_cast insn
+ * can only be used in a program that has an associated arena".
+ */
+#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
+
+static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
 {
-	struct task_ctx *taskc;
+	struct task_ctx_stor_val *v;
+
+	QMAP_TOUCH_ARENA();
 
-	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+	v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+	if (!v || !v->taskc) {
 		scx_bpf_error("task_ctx lookup failed");
 		return NULL;
 	}
-	return taskc;
+	return v->taskc;
 }
 
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
-	struct task_ctx *taskc;
+	struct task_ctx __arena *taskc;
 	s32 cpu;
 
 	if (!(taskc = lookup_task_ctx(p)))
@@ -199,7 +236,7 @@ static int weight_to_idx(u32 weight)
 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	static u32 user_cnt, kernel_cnt;
-	struct task_ctx *taskc;
+	struct task_ctx __arena *taskc;
 	u32 pid = p->pid;
 	int idx = weight_to_idx(p->scx.weight);
 	void *ring;
@@ -321,7 +358,7 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
 static void update_core_sched_head_seq(struct task_struct *p)
 {
 	int idx = weight_to_idx(p->scx.weight);
-	struct task_ctx *taskc;
+	struct task_ctx __arena *taskc;
 
 	if ((taskc = lookup_task_ctx(p)))
 		qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
@@ -345,7 +382,7 @@ static bool dispatch_highpri(bool from_timer)
 	/* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
 	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
 		static u64 highpri_seq;
-		struct task_ctx *taskc;
+		struct task_ctx __arena *taskc;
 
 		if (!(taskc = lookup_task_ctx(p)))
 			return false;
@@ -396,7 +433,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cpu_ctx __arena *cpuc;
-	struct task_ctx *taskc;
+	struct task_ctx __arena *taskc;
 	u32 batch = dsp_batch ?: 1;
 	void *fifo;
 	s32 i, pid;
@@ -440,7 +477,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 
 		/* Dispatch or advance. */
 		bpf_repeat(BPF_MAX_LOOPS) {
-			struct task_ctx *taskc;
+			struct task_ctx __arena *taskc;
 
 			if (bpf_map_pop_elem(fifo, &pid))
 				break;
@@ -529,11 +566,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 	 * if the task were enqueued and dispatched immediately.
 	 */
 	if (prev) {
-		taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
-		if (!taskc) {
-			scx_bpf_error("task_ctx lookup failed");
+		taskc = lookup_task_ctx(prev);
+		if (!taskc)
 			return;
-		}
 
 		taskc->core_sched_seq =
 			qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
@@ -564,14 +599,12 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
 static s64 task_qdist(struct task_struct *p)
 {
 	int idx = weight_to_idx(p->scx.weight);
-	struct task_ctx *taskc;
+	struct task_ctx __arena *taskc;
 	s64 qdist;
 
-	taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
-	if (!taskc) {
-		scx_bpf_error("task_ctx lookup failed");
+	taskc = lookup_task_ctx(p);
+	if (!taskc)
 		return 0;
-	}
 
 	qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
 
@@ -606,21 +639,64 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
  * tasks when a higher-priority scheduling class takes the CPU.
  */
 
-s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
-		   struct scx_init_task_args *args)
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
+			     struct scx_init_task_args *args)
 {
+	struct task_ctx_stor_val *v;
+	struct task_ctx __arena *taskc;
+
 	if (p->tgid == disallow_tgid)
 		p->scx.disallow = true;
 
-	/*
-	 * @p is new. Let's ensure that its task_ctx is available. We can sleep
-	 * in this function and the following will automatically use GFP_KERNEL.
-	 */
-	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
-				 BPF_LOCAL_STORAGE_GET_F_CREATE))
-		return 0;
-	else
+	/* pop a slab entry off the free list */
+	if (qmap_spin_lock(&qa_task_lock))
+		return -EBUSY;
+	taskc = qa.task_free_head;
+	if (taskc)
+		qa.task_free_head = taskc->next_free;
+	bpf_res_spin_unlock(&qa_task_lock);
+	if (!taskc) {
+		scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
+		return -ENOMEM;
+	}
+
+	taskc->next_free = NULL;
+	taskc->force_local = false;
+	taskc->highpri = false;
+	taskc->core_sched_seq = 0;
+
+	v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!v) {
+		/* push back to the free list */
+		if (!qmap_spin_lock(&qa_task_lock)) {
+			taskc->next_free = qa.task_free_head;
+			qa.task_free_head = taskc;
+			bpf_res_spin_unlock(&qa_task_lock);
+		}
 		return -ENOMEM;
+	}
+	v->taskc = taskc;
+	return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
+		    struct scx_exit_task_args *args)
+{
+	struct task_ctx_stor_val *v;
+	struct task_ctx __arena *taskc;
+
+	v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+	if (!v || !v->taskc)
+		return;
+	taskc = v->taskc;
+	v->taskc = NULL;
+
+	if (qmap_spin_lock(&qa_task_lock))
+		return;
+	taskc->next_free = qa.task_free_head;
+	qa.task_free_head = taskc;
+	bpf_res_spin_unlock(&qa_task_lock);
 }
 
 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
@@ -675,12 +751,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
 
 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
 {
-	struct task_ctx *taskc;
+	struct task_ctx_stor_val *v;
+	struct task_ctx __arena *taskc;
+
+	QMAP_TOUCH_ARENA();
 
 	if (suppress_dump)
 		return;
-	if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+	v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
+	if (!v || !v->taskc)
 		return;
+	taskc = v->taskc;
 
 	scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
 		     taskc->force_local, taskc->core_sched_seq);
@@ -915,10 +996,32 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 {
-	u32 key = 0;
+	struct task_ctx __arena *slab;
+	u32 nr_pages, key = 0, i;
 	struct bpf_timer *timer;
 	s32 ret;
 
+	/*
+	 * Allocate the task_ctx slab in arena and thread the entire slab onto
+	 * the free list. max_tasks is set by userspace before load.
+	 */
+	if (!max_tasks) {
+		scx_bpf_error("max_tasks must be > 0");
+		return -EINVAL;
+	}
+
+	nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
+	slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
+	if (!slab) {
+		scx_bpf_error("failed to allocate task_ctx slab");
+		return -ENOMEM;
+	}
+	qa.task_ctxs = slab;
+
+	bpf_for(i, 0, max_tasks)
+		slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
+	qa.task_free_head = &slab[0];
+
 	if (print_msgs && !sub_cgroup_id)
 		print_cpus();
 
@@ -1005,6 +1108,7 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .tick			= (void *)qmap_tick,
 	       .core_sched_before	= (void *)qmap_core_sched_before,
 	       .init_task		= (void *)qmap_init_task,
+	       .exit_task		= (void *)qmap_exit_task,
 	       .dump			= (void *)qmap_dump,
 	       .dump_cpu		= (void *)qmap_dump_cpu,
 	       .dump_task		= (void *)qmap_dump_task,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 8844499c14c4..4bdcc4bc5fbd 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -23,12 +23,13 @@ const char help_fmt[] =
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
 "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-"       [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
+"       [-N COUNT] [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
 "  -t COUNT      Stall every COUNT'th user thread\n"
 "  -T COUNT      Stall every COUNT'th kernel thread\n"
+"  -N COUNT      Size of the task_ctx arena slab (default 16384)\n"
 "  -l COUNT      Trigger dispatch infinite looping after COUNT dispatches\n"
 "  -b COUNT      Dispatch upto COUNT tasks together\n"
 "  -P            Print out DSQ content and event counters to trace_pipe every second\n"
@@ -73,8 +74,9 @@ int main(int argc, char **argv)
 	skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
 
 	skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+	skel->rodata->max_tasks = 16384;
 
-	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:vh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -94,6 +96,9 @@ int main(int argc, char **argv)
 		case 'b':
 			skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
 			break;
+		case 'N':
+			skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
+			break;
 		case 'P':
 			skel->rodata->print_dsqs_and_events = true;
 			break;
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index 52153230bfce..c183d82632b3 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -34,6 +34,9 @@ struct cpu_ctx {
 	__u32 cpuperf_target;
 };
 
+/* Opaque to userspace; defined in scx_qmap.bpf.c. */
+struct task_ctx;
+
 struct qmap_arena {
 	/* userspace-visible stats */
 	__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -52,6 +55,10 @@ struct qmap_arena {
 	__u64 core_sched_tail_seqs[5];
 
 	struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
+
+	/* task_ctx slab; allocated and threaded by qmap_init() */
+	struct task_ctx __arena *task_ctxs;
+	struct task_ctx __arena *task_free_head;
 };
 
 #endif /* __SCX_QMAP_H */
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
  2026-04-16  8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
                   ` (2 preceding siblings ...)
  2026-04-16  8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
@ 2026-04-16  8:16 ` Tejun Heo
  2026-04-16 10:01   ` Andrea Righi
  2026-04-16 15:45   ` Emil Tsalapatis
  2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi
  4 siblings, 2 replies; 11+ messages in thread
From: Tejun Heo @ 2026-04-16  8:16 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: Emil Tsalapatis, sched-ext, linux-kernel, Tejun Heo

Arena simplifies verification and allows more natural programming.
Convert scx_qmap to arena as preparation for further sub-sched work.

Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
head/tail pointers and its own per-queue bpf_res_spin_lock.

qmap_dequeue() now properly removes tasks from the queue instead of
leaving stale entries for dispatch to skip.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
 tools/sched_ext/scx_qmap.h     |   9 ++
 2 files changed, 148 insertions(+), 82 deletions(-)

diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index e071969c8f32..c26997ff7863 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -71,31 +71,24 @@ struct {
 
 struct qmap_arena __arena qa;
 
-struct qmap {
-	__uint(type, BPF_MAP_TYPE_QUEUE);
-	__uint(max_entries, 4096);
-	__type(value, u32);
-} queue0 SEC(".maps"),
-  queue1 SEC(".maps"),
-  queue2 SEC(".maps"),
-  queue3 SEC(".maps"),
-  queue4 SEC(".maps"),
-  dump_store SEC(".maps");
-
-struct {
-	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
-	__uint(max_entries, 5);
-	__type(key, int);
-	__array(values, struct qmap);
-} queue_arr SEC(".maps") = {
-	.values = {
-		[0] = &queue0,
-		[1] = &queue1,
-		[2] = &queue2,
-		[3] = &queue3,
-		[4] = &queue4,
-	},
-};
+/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
+__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
+__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
+__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
+__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
+__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
+
+static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
+{
+	switch (qid) {
+	case 0:	return &qa_q_lock0;
+	case 1:	return &qa_q_lock1;
+	case 2:	return &qa_q_lock2;
+	case 3:	return &qa_q_lock3;
+	case 4:	return &qa_q_lock4;
+	default: return NULL;
+	}
+}
 
 /*
  * If enabled, CPU performance target is set according to the queue index
@@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
  * arena. While the task is alive the entry is referenced from task_ctx_stor;
  * while it's free the entry sits on the free list singly-linked through
  * @next_free.
+ *
+ * When the task is queued on one of the five priority FIFOs, @q_idx is the
+ * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
+ * @q_idx is -1 when the task isn't on any queue.
  */
 struct task_ctx {
 	struct task_ctx __arena	*next_free;	/* only valid on free list */
+	struct task_ctx __arena	*q_next;	/* queue link, NULL if tail */
+	struct task_ctx __arena	*q_prev;	/* queue link, NULL if head */
+	struct qmap_fifo __arena *fifo;		/* queue we're on, NULL if not queued */
+	s32			pid;
 	bool			force_local;	/* Dispatch directly to local_dsq */
 	bool			highpri;
 	u64			core_sched_seq;
@@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
 	return v->taskc;
 }
 
+/* Append @taskc to the tail of @fifo. Must not already be queued. */
+static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
+			      struct task_ctx __arena *taskc)
+{
+	struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+
+	if (!lock || qmap_spin_lock(lock))
+		return;
+	taskc->fifo = fifo;
+	taskc->q_next = NULL;
+	taskc->q_prev = fifo->tail;
+	if (fifo->tail)
+		fifo->tail->q_next = taskc;
+	else
+		fifo->head = taskc;
+	fifo->tail = taskc;
+	bpf_res_spin_unlock(lock);
+}
+
+/* Pop the head of @fifo. Returns NULL if empty. */
+static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
+{
+	struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
+	struct task_ctx __arena *taskc;
+
+	if (!lock || qmap_spin_lock(lock))
+		return NULL;
+	taskc = fifo->head;
+	if (taskc) {
+		fifo->head = taskc->q_next;
+		if (taskc->q_next)
+			taskc->q_next->q_prev = NULL;
+		else
+			fifo->tail = NULL;
+		taskc->q_next = NULL;
+		taskc->q_prev = NULL;
+		taskc->fifo = NULL;
+	}
+	bpf_res_spin_unlock(lock);
+	return taskc;
+}
+
+/* Remove @taskc from its fifo. No-op if not queued. */
+static void qmap_fifo_remove(struct task_ctx __arena *taskc)
+{
+	struct qmap_fifo __arena *fifo = taskc->fifo;
+	struct bpf_res_spin_lock *lock;
+
+	if (!fifo)
+		return;
+
+	lock = qa_q_lock(fifo->idx);
+	if (!lock || qmap_spin_lock(lock))
+		return;
+
+	/* Re-check under lock — a concurrent pop may have cleared fifo. */
+	if (taskc->fifo != fifo) {
+		bpf_res_spin_unlock(lock);
+		return;
+	}
+
+	if (taskc->q_next)
+		taskc->q_next->q_prev = taskc->q_prev;
+	else
+		fifo->tail = taskc->q_prev;
+	if (taskc->q_prev)
+		taskc->q_prev->q_next = taskc->q_next;
+	else
+		fifo->head = taskc->q_next;
+	taskc->q_next = NULL;
+	taskc->q_prev = NULL;
+	taskc->fifo = NULL;
+	bpf_res_spin_unlock(lock);
+}
+
 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 		   s32 prev_cpu, u64 wake_flags)
 {
@@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 {
 	static u32 user_cnt, kernel_cnt;
 	struct task_ctx __arena *taskc;
-	u32 pid = p->pid;
 	int idx = weight_to_idx(p->scx.weight);
-	void *ring;
 	s32 cpu;
 
 	if (enq_flags & SCX_ENQ_REENQ) {
@@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}
 
-	ring = bpf_map_lookup_elem(&queue_arr, &idx);
-	if (!ring) {
-		scx_bpf_error("failed to find ring %d", idx);
-		return;
-	}
-
-	/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
-	if (bpf_map_push_elem(ring, &pid, 0)) {
-		scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
-		return;
-	}
+	/* Queue on the selected FIFO. */
+	qmap_fifo_enqueue(&qa.fifos[idx], taskc);
 
 	if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
 		taskc->highpri = true;
@@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	__sync_fetch_and_add(&qa.nr_enqueued, 1);
 }
 
-/*
- * The BPF queue map doesn't support removal and sched_ext can handle spurious
- * dispatches. qmap_dequeue() is only used to collect statistics.
- */
 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
 {
+	struct task_ctx __arena *taskc;
+
 	__sync_fetch_and_add(&qa.nr_dequeued, 1);
 	if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
 		__sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
+
+	taskc = lookup_task_ctx(p);
+	if (taskc && taskc->fifo) {
+		if (taskc->highpri)
+			__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
+		qmap_fifo_remove(taskc);
+	}
 }
 
 static void update_core_sched_head_seq(struct task_struct *p)
@@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 	struct cpu_ctx __arena *cpuc;
 	struct task_ctx __arena *taskc;
 	u32 batch = dsp_batch ?: 1;
-	void *fifo;
-	s32 i, pid;
+	s32 i;
 
 	if (dispatch_highpri(false))
 		return;
@@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 			cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
 		}
 
-		u64 dsp_idx = cpuc->dsp_idx;
-
-		fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
-		if (!fifo) {
-			scx_bpf_error("failed to find ring %llu", dsp_idx);
-			return;
-		}
-
 		/* Dispatch or advance. */
 		bpf_repeat(BPF_MAX_LOOPS) {
 			struct task_ctx __arena *taskc;
 
-			if (bpf_map_pop_elem(fifo, &pid))
+			taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
+			if (!taskc)
 				break;
 
-			p = bpf_task_from_pid(pid);
+			p = bpf_task_from_pid(taskc->pid);
 			if (!p)
 				continue;
 
-			if (!(taskc = lookup_task_ctx(p))) {
-				bpf_task_release(p);
-				return;
-			}
-
 			if (taskc->highpri)
 				__sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
 
@@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
 	}
 
 	taskc->next_free = NULL;
+	taskc->q_next = NULL;
+	taskc->q_prev = NULL;
+	taskc->fifo = NULL;
+	taskc->pid = p->pid;
 	taskc->force_local = false;
 	taskc->highpri = false;
 	taskc->core_sched_seq = 0;
@@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
 
 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
 {
-	s32 i, pid;
+	struct task_ctx __arena *taskc;
+	s32 i;
+
+	QMAP_TOUCH_ARENA();
 
 	if (suppress_dump)
 		return;
 
+	/*
+	 * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
+	 * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
+	 * a lock and dump. Best-effort; racing may print stale pids but the
+	 * walk is bounded by bpf_repeat() so it always terminates.
+	 */
 	bpf_for(i, 0, 5) {
-		void *fifo;
-
-		if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
-			return;
-
 		scx_bpf_dump("QMAP FIFO[%d]:", i);
-
-		/*
-		 * Dump can be invoked anytime and there is no way to iterate in
-		 * a non-destructive way. Pop and store in dump_store and then
-		 * restore afterwards. If racing against new enqueues, ordering
-		 * can get mixed up.
-		 */
-		bpf_repeat(4096) {
-			if (bpf_map_pop_elem(fifo, &pid))
-				break;
-			bpf_map_push_elem(&dump_store, &pid, 0);
-			scx_bpf_dump(" %d", pid);
-		}
-
+		taskc = qa.fifos[i].head;
 		bpf_repeat(4096) {
-			if (bpf_map_pop_elem(&dump_store, &pid))
+			if (!taskc)
 				break;
-			bpf_map_push_elem(fifo, &pid, 0);
+			scx_bpf_dump(" %d", taskc->pid);
+			taskc = taskc->q_next;
 		}
-
 		scx_bpf_dump("\n");
 	}
 }
@@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
 
 	QMAP_TOUCH_ARENA();
 
+	QMAP_TOUCH_ARENA();
+
 	if (suppress_dump)
 		return;
 	v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
@@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 	}
 	qa.task_ctxs = slab;
 
+	bpf_for(i, 0, 5)
+		qa.fifos[i].idx = i;
+
 	bpf_for(i, 0, max_tasks)
 		slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
 	qa.task_free_head = &slab[0];
diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
index c183d82632b3..9c0da5a301cb 100644
--- a/tools/sched_ext/scx_qmap.h
+++ b/tools/sched_ext/scx_qmap.h
@@ -37,6 +37,12 @@ struct cpu_ctx {
 /* Opaque to userspace; defined in scx_qmap.bpf.c. */
 struct task_ctx;
 
+struct qmap_fifo {
+	struct task_ctx __arena *head;
+	struct task_ctx __arena *tail;
+	__s32 idx;
+};
+
 struct qmap_arena {
 	/* userspace-visible stats */
 	__u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
@@ -59,6 +65,9 @@ struct qmap_arena {
 	/* task_ctx slab; allocated and threaded by qmap_init() */
 	struct task_ctx __arena *task_ctxs;
 	struct task_ctx __arena *task_free_head;
+
+	/* five priority FIFOs, each a doubly-linked list through task_ctx */
+	struct qmap_fifo fifos[5];
 };
 
 #endif /* __SCX_QMAP_H */
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
  2026-04-16  8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 10:01   ` Andrea Righi
  2026-04-16 15:45   ` Emil Tsalapatis
  1 sibling, 0 replies; 11+ messages in thread
From: Andrea Righi @ 2026-04-16 10:01 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Changwoo Min, Emil Tsalapatis, sched-ext,
	linux-kernel

Hi Tejun,

On Wed, Apr 15, 2026 at 10:16:26PM -1000, Tejun Heo wrote:
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
> 
> Replace the five BPF_MAP_TYPE_QUEUE maps with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.

We should probably update the description at the beginning of the files as well,
mentioning the arena-backed lists.

> 
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
...

> @@ -756,6 +808,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>  
>  	QMAP_TOUCH_ARENA();
>  
> +	QMAP_TOUCH_ARENA();
> +

Copy/paste noise?

>  	if (suppress_dump)
>  		return;
>  	v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
>  	}
>  	qa.task_ctxs = slab;
>  
> +	bpf_for(i, 0, 5)
> +		qa.fifos[i].idx = i;
> +
>  	bpf_for(i, 0, max_tasks)
>  		slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
>  	qa.task_free_head = &slab[0];

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena
  2026-04-16  8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
                   ` (3 preceding siblings ...)
  2026-04-16  8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
@ 2026-04-16 10:05 ` Andrea Righi
  4 siblings, 0 replies; 11+ messages in thread
From: Andrea Righi @ 2026-04-16 10:05 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Changwoo Min, Emil Tsalapatis, sched-ext,
	linux-kernel

Hi Tejun,

On Wed, Apr 15, 2026 at 10:16:22PM -1000, Tejun Heo wrote:
> Hello,
> 
> Arena simplifies verification and allows more natural programming. This
> patchset converts scx_qmap to use BPF arena for all mutable state, as
> preparation for further sub-sched work.
> 
>  0001 Rename tctx to taskc for consistency.
>  0002 Move globals and cpu_ctx into arena.
>  0003 Move task_ctx into an arena slab with bpf_res_spin_lock.
>  0004 Replace FIFO queue maps with arena-backed doubly-linked lists.
> 
> Based on linus/master (1d51b370a0f8).

Sent a couple of comments about patch 4, everything else looks good to me.

Reviewed-by: Andrea Righi <arighi@nvidia.com>

Thanks,
-Andrea

> 
>  tools/sched_ext/include/scx/common.bpf.h |   4 +
>  tools/sched_ext/scx_qmap.bpf.c           | 561 ++++++++++++++---------
>  tools/sched_ext/scx_qmap.c               |  54 +--
>  tools/sched_ext/scx_qmap.h               |  73 +++
>  4 files changed, 459 insertions(+), 233 deletions(-)
> 
> Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git qmap-arena
> 
> --
> tejun

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc
  2026-04-16  8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
@ 2026-04-16 14:56   ` Emil Tsalapatis
  0 siblings, 0 replies; 11+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 14:56 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
	sched-ext, linux-kernel

On Thu, Apr 16, 2026 at 1:16 AM Tejun Heo <tj@kernel.org> wrote:
>
> Rename the per-task context local variable from tctx to taskc for
> consistency.
>
> No functional change.
>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  tools/sched_ext/scx_qmap.bpf.c | 60 +++++++++++++++++-----------------
>  1 file changed, 30 insertions(+), 30 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index b68abb9e760b..a18234f3c27a 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -159,22 +159,22 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
>
>  static struct task_ctx *lookup_task_ctx(struct task_struct *p)
>  {
> -       struct task_ctx *tctx;
> +       struct task_ctx *taskc;
>
> -       if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> +       if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
>                 scx_bpf_error("task_ctx lookup failed");
>                 return NULL;
>         }
> -       return tctx;
> +       return taskc;
>  }
>
>  s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
>                    s32 prev_cpu, u64 wake_flags)
>  {
> -       struct task_ctx *tctx;
> +       struct task_ctx *taskc;
>         s32 cpu;
>
> -       if (!(tctx = lookup_task_ctx(p)))
> +       if (!(taskc = lookup_task_ctx(p)))
>                 return -ESRCH;
>
>         if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
> @@ -183,7 +183,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
>         cpu = pick_direct_dispatch_cpu(p, prev_cpu);
>
>         if (cpu >= 0) {
> -               tctx->force_local = true;
> +               taskc->force_local = true;
>                 return cpu;
>         } else {
>                 return prev_cpu;
> @@ -208,7 +208,7 @@ static int weight_to_idx(u32 weight)
>  void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>  {
>         static u32 user_cnt, kernel_cnt;
> -       struct task_ctx *tctx;
> +       struct task_ctx *taskc;
>         u32 pid = p->pid;
>         int idx = weight_to_idx(p->scx.weight);
>         void *ring;
> @@ -231,14 +231,14 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>         if (test_error_cnt && !--test_error_cnt)
>                 scx_bpf_error("test triggering error");
>
> -       if (!(tctx = lookup_task_ctx(p)))
> +       if (!(taskc = lookup_task_ctx(p)))
>                 return;
>
>         /*
>          * All enqueued tasks must have their core_sched_seq updated for correct
>          * core-sched ordering. Also, take a look at the end of qmap_dispatch().
>          */
> -       tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
> +       taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
>
>         /*
>          * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
> @@ -249,7 +249,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>                 static u32 immed_stress_cnt;
>
>                 if (!(++immed_stress_cnt % immed_stress_nth)) {
> -                       tctx->force_local = false;
> +                       taskc->force_local = false;
>                         scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
>                                            slice_ns, enq_flags);
>                         return;
> @@ -260,8 +260,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>          * If qmap_select_cpu() is telling us to or this is the last runnable
>          * task on the CPU, enqueue locally.
>          */
> -       if (tctx->force_local) {
> -               tctx->force_local = false;
> +       if (taskc->force_local) {
> +               taskc->force_local = false;
>                 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
>                 return;
>         }
> @@ -310,7 +310,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>         }
>
>         if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
> -               tctx->highpri = true;
> +               taskc->highpri = true;
>                 __sync_fetch_and_add(&nr_highpri_queued, 1);
>         }
>         __sync_fetch_and_add(&nr_enqueued, 1);
> @@ -330,10 +330,10 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
>  static void update_core_sched_head_seq(struct task_struct *p)
>  {
>         int idx = weight_to_idx(p->scx.weight);
> -       struct task_ctx *tctx;
> +       struct task_ctx *taskc;
>
> -       if ((tctx = lookup_task_ctx(p)))
> -               core_sched_head_seqs[idx] = tctx->core_sched_seq;
> +       if ((taskc = lookup_task_ctx(p)))
> +               core_sched_head_seqs[idx] = taskc->core_sched_seq;
>  }
>
>  /*
> @@ -354,12 +354,12 @@ static bool dispatch_highpri(bool from_timer)
>         /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
>         bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
>                 static u64 highpri_seq;
> -               struct task_ctx *tctx;
> +               struct task_ctx *taskc;
>
> -               if (!(tctx = lookup_task_ctx(p)))
> +               if (!(taskc = lookup_task_ctx(p)))
>                         return false;
>
> -               if (tctx->highpri) {
> +               if (taskc->highpri) {
>                         /* exercise the set_*() and vtime interface too */
>                         scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
>                         scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
> @@ -405,7 +405,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>  {
>         struct task_struct *p;
>         struct cpu_ctx *cpuc;
> -       struct task_ctx *tctx;
> +       struct task_ctx *taskc;
>         u32 zero = 0, batch = dsp_batch ?: 1;
>         void *fifo;
>         s32 i, pid;
> @@ -450,7 +450,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>
>                 /* Dispatch or advance. */
>                 bpf_repeat(BPF_MAX_LOOPS) {
> -                       struct task_ctx *tctx;
> +                       struct task_ctx *taskc;
>
>                         if (bpf_map_pop_elem(fifo, &pid))
>                                 break;
> @@ -459,12 +459,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>                         if (!p)
>                                 continue;
>
> -                       if (!(tctx = lookup_task_ctx(p))) {
> +                       if (!(taskc = lookup_task_ctx(p))) {
>                                 bpf_task_release(p);
>                                 return;
>                         }
>
> -                       if (tctx->highpri)
> +                       if (taskc->highpri)
>                                 __sync_fetch_and_sub(&nr_highpri_queued, 1);
>
>                         update_core_sched_head_seq(p);
> @@ -539,13 +539,13 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>          * if the task were enqueued and dispatched immediately.
>          */
>         if (prev) {
> -               tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> -               if (!tctx) {
> +               taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> +               if (!taskc) {
>                         scx_bpf_error("task_ctx lookup failed");
>                         return;
>                 }
>
> -               tctx->core_sched_seq =
> +               taskc->core_sched_seq =
>                         core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
>         }
>  }
> @@ -580,16 +580,16 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
>  static s64 task_qdist(struct task_struct *p)
>  {
>         int idx = weight_to_idx(p->scx.weight);
> -       struct task_ctx *tctx;
> +       struct task_ctx *taskc;
>         s64 qdist;
>
> -       tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> -       if (!tctx) {
> +       taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> +       if (!taskc) {
>                 scx_bpf_error("task_ctx lookup failed");
>                 return 0;
>         }
>
> -       qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
> +       qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
>
>         /*
>          * As queue index increments, the priority doubles. The queue w/ index 3
> --
> 2.53.0
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map
  2026-04-16  8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
@ 2026-04-16 15:28   ` Emil Tsalapatis
  0 siblings, 0 replies; 11+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
	sched-ext, linux-kernel

On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Move mutable scheduler state from BSS globals and a percpu array map
> into a single BPF arena map. A shared struct qmap_arena is declared as
> an __arena global so BPF accesses it directly and userspace reaches it
> through skel->arena->qa.
>
> Scheduling logic unchanged; only memory backing changes.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

>  tools/sched_ext/scx_qmap.bpf.c | 152 ++++++++++++++-------------------
>  tools/sched_ext/scx_qmap.c     |  45 +++++-----
>  tools/sched_ext/scx_qmap.h     |  57 +++++++++++++
>  3 files changed, 147 insertions(+), 107 deletions(-)
>  create mode 100644 tools/sched_ext/scx_qmap.h
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index a18234f3c27a..0f8fbb6d0bc2 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -22,6 +22,8 @@
>   */
>  #include <scx/common.bpf.h>
>
> +#include "scx_qmap.h"
> +
>  enum consts {
>         ONE_SEC_IN_NS           = 1000000000,
>         ONE_MSEC_IN_NS          = 1000000,
> @@ -48,14 +50,26 @@ const volatile bool suppress_dump;
>  const volatile bool always_enq_immed;
>  const volatile u32 immed_stress_nth;
>
> -u64 nr_highpri_queued;
> -u32 test_error_cnt;
> -
> -#define MAX_SUB_SCHEDS         8
> -u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
> -
>  UEI_DEFINE(uei);
>
> +/*
> + * All mutable scheduler state - per-cpu context, stats counters, core-sched
> + * sequence numbers, sub-sched cgroup ids - lives in this single BPF arena map.
> + * Userspace reaches it via skel->arena->qa.
> + */
> +struct {
> +       __uint(type, BPF_MAP_TYPE_ARENA);
> +       __uint(map_flags, BPF_F_MMAPABLE);
> +       __uint(max_entries, 1 << 16);           /* upper bound in pages */

I assume this is picked to handle 64K pages on ARM.

> +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
> +       __ulong(map_extra, 0x1ull << 32);       /* user/BPF mmap base */
> +#else
> +       __ulong(map_extra, 0x1ull << 44);
> +#endif
> +} arena SEC(".maps");
> +
> +struct qmap_arena __arena qa;
> +
>  struct qmap {
>         __uint(type, BPF_MAP_TYPE_QUEUE);
>         __uint(max_entries, 4096);
> @@ -102,8 +116,6 @@ static const u32 qidx_to_cpuperf_target[] = {
>   * task's seq and the associated queue's head seq is called the queue distance
>   * and used when comparing two tasks for ordering. See qmap_core_sched_before().
>   */
> -static u64 core_sched_head_seqs[5];
> -static u64 core_sched_tail_seqs[5];
>
>  /* Per-task scheduling context */
>  struct task_ctx {
> @@ -119,27 +131,6 @@ struct {
>         __type(value, struct task_ctx);
>  } task_ctx_stor SEC(".maps");
>
> -struct cpu_ctx {
> -       u64     dsp_idx;        /* dispatch index */
> -       u64     dsp_cnt;        /* remaining count */
> -       u32     avg_weight;
> -       u32     cpuperf_target;
> -};
> -
> -struct {
> -       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> -       __uint(max_entries, 1);
> -       __type(key, u32);
> -       __type(value, struct cpu_ctx);
> -} cpu_ctx_stor SEC(".maps");
> -
> -/* Statistics */
> -u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
> -u64 nr_core_sched_execed;
> -u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
> -u32 cpuperf_min, cpuperf_avg, cpuperf_max;
> -u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
> -
>  static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
>  {
>         s32 cpu;
> @@ -215,9 +206,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>         s32 cpu;
>
>         if (enq_flags & SCX_ENQ_REENQ) {
> -               __sync_fetch_and_add(&nr_reenqueued, 1);
> +               __sync_fetch_and_add(&qa.nr_reenqueued, 1);
>                 if (scx_bpf_task_cpu(p) == 0)
> -                       __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
> +                       __sync_fetch_and_add(&qa.nr_reenqueued_cpu0, 1);
>         }
>
>         if (p->flags & PF_KTHREAD) {
> @@ -228,7 +219,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>                         return;
>         }
>
> -       if (test_error_cnt && !--test_error_cnt)
> +       if (qa.test_error_cnt && !--qa.test_error_cnt)
>                 scx_bpf_error("test triggering error");
>
>         if (!(taskc = lookup_task_ctx(p)))
> @@ -238,7 +229,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>          * All enqueued tasks must have their core_sched_seq updated for correct
>          * core-sched ordering. Also, take a look at the end of qmap_dispatch().
>          */
> -       taskc->core_sched_seq = core_sched_tail_seqs[idx]++;
> +       taskc->core_sched_seq = qa.core_sched_tail_seqs[idx]++;
>
>         /*
>          * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
> @@ -276,7 +267,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>         /* if select_cpu() wasn't called, try direct dispatch */
>         if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
>             (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
> -               __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
> +               __sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
>                 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
>                 return;
>         }
> @@ -311,9 +302,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>
>         if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
>                 taskc->highpri = true;
> -               __sync_fetch_and_add(&nr_highpri_queued, 1);
> +               __sync_fetch_and_add(&qa.nr_highpri_queued, 1);
>         }
> -       __sync_fetch_and_add(&nr_enqueued, 1);
> +       __sync_fetch_and_add(&qa.nr_enqueued, 1);
>  }
>
>  /*
> @@ -322,9 +313,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>   */
>  void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
>  {
> -       __sync_fetch_and_add(&nr_dequeued, 1);
> +       __sync_fetch_and_add(&qa.nr_dequeued, 1);
>         if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
> -               __sync_fetch_and_add(&nr_core_sched_execed, 1);
> +               __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
>  }
>
>  static void update_core_sched_head_seq(struct task_struct *p)
> @@ -333,7 +324,7 @@ static void update_core_sched_head_seq(struct task_struct *p)
>         struct task_ctx *taskc;
>
>         if ((taskc = lookup_task_ctx(p)))
> -               core_sched_head_seqs[idx] = taskc->core_sched_seq;
> +               qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
>  }
>
>  /*
> @@ -384,14 +375,14 @@ static bool dispatch_highpri(bool from_timer)
>                                      SCX_ENQ_PREEMPT)) {
>                         if (cpu == this_cpu) {
>                                 dispatched = true;
> -                               __sync_fetch_and_add(&nr_expedited_local, 1);
> +                               __sync_fetch_and_add(&qa.nr_expedited_local, 1);
>                         } else {
> -                               __sync_fetch_and_add(&nr_expedited_remote, 1);
> +                               __sync_fetch_and_add(&qa.nr_expedited_remote, 1);
>                         }
>                         if (from_timer)
> -                               __sync_fetch_and_add(&nr_expedited_from_timer, 1);
> +                               __sync_fetch_and_add(&qa.nr_expedited_from_timer, 1);
>                 } else {
> -                       __sync_fetch_and_add(&nr_expedited_lost, 1);
> +                       __sync_fetch_and_add(&qa.nr_expedited_lost, 1);
>                 }
>
>                 if (dispatched)
> @@ -404,19 +395,19 @@ static bool dispatch_highpri(bool from_timer)
>  void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>  {
>         struct task_struct *p;
> -       struct cpu_ctx *cpuc;
> +       struct cpu_ctx __arena *cpuc;
>         struct task_ctx *taskc;
> -       u32 zero = 0, batch = dsp_batch ?: 1;
> +       u32 batch = dsp_batch ?: 1;
>         void *fifo;
>         s32 i, pid;
>
>         if (dispatch_highpri(false))
>                 return;
>
> -       if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
> +       if (!qa.nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
>                 return;
>
> -       if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
> +       if (dsp_inf_loop_after && qa.nr_dispatched > dsp_inf_loop_after) {
>                 /*
>                  * PID 2 should be kthreadd which should mostly be idle and off
>                  * the scheduler. Let's keep dispatching it to force the kernel
> @@ -430,10 +421,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>                 }
>         }
>
> -       if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
> -               scx_bpf_error("failed to look up cpu_ctx");
> -               return;
> -       }
> +       cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
>
>         for (i = 0; i < 5; i++) {
>                 /* Advance the dispatch cursor and pick the fifo. */
> @@ -442,9 +430,11 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>                         cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
>                 }
>
> -               fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
> +               u64 dsp_idx = cpuc->dsp_idx;
> +
> +               fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
>                 if (!fifo) {
> -                       scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
> +                       scx_bpf_error("failed to find ring %llu", dsp_idx);
>                         return;
>                 }
>
> @@ -465,10 +455,10 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>                         }
>
>                         if (taskc->highpri)
> -                               __sync_fetch_and_sub(&nr_highpri_queued, 1);
> +                               __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
>
>                         update_core_sched_head_seq(p);
> -                       __sync_fetch_and_add(&nr_dispatched, 1);
> +                       __sync_fetch_and_add(&qa.nr_dispatched, 1);
>
>                         scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
>
> @@ -529,8 +519,8 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>         }
>
>         for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> -               if (sub_sched_cgroup_ids[i] &&
> -                   scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
> +               if (qa.sub_sched_cgroup_ids[i] &&
> +                   scx_bpf_sub_dispatch(qa.sub_sched_cgroup_ids[i]))
>                         return;
>         }
>
> @@ -546,21 +536,15 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>                 }
>
>                 taskc->core_sched_seq =
> -                       core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> +                       qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
>         }
>  }
>
>  void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
>  {
> -       struct cpu_ctx *cpuc;
> -       u32 zero = 0;
> +       struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[bpf_get_smp_processor_id()];
>         int idx;
>
> -       if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
> -               scx_bpf_error("failed to look up cpu_ctx");
> -               return;
> -       }
> -
>         /*
>          * Use the running avg of weights to select the target cpuperf level.
>          * This is a demonstration of the cpuperf feature rather than a
> @@ -589,7 +573,7 @@ static s64 task_qdist(struct task_struct *p)
>                 return 0;
>         }
>
> -       qdist = taskc->core_sched_seq - core_sched_head_seqs[idx];
> +       qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
>
>         /*
>          * As queue index increments, the priority doubles. The queue w/ index 3
> @@ -679,13 +663,10 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
>
>  void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
>  {
> -       u32 zero = 0;
> -       struct cpu_ctx *cpuc;
> +       struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[cpu];
>
>         if (suppress_dump || idle)
>                 return;
> -       if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
> -               return;
>
>         scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
>                      cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
> @@ -802,7 +783,7 @@ struct {
>   */
>  static void monitor_cpuperf(void)
>  {
> -       u32 zero = 0, nr_cpu_ids;
> +       u32 nr_cpu_ids;
>         u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
>         u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
>         const struct cpumask *online;
> @@ -812,7 +793,7 @@ static void monitor_cpuperf(void)
>         online = scx_bpf_get_online_cpumask();
>
>         bpf_for(i, 0, nr_cpu_ids) {
> -               struct cpu_ctx *cpuc;
> +               struct cpu_ctx __arena *cpuc = &qa.cpu_ctxs[i];
>                 u32 cap, cur;
>
>                 if (!bpf_cpumask_test_cpu(i, online))
> @@ -834,11 +815,6 @@ static void monitor_cpuperf(void)
>                 cur_sum += cur * cap / SCX_CPUPERF_ONE;
>                 cap_sum += cap;
>
> -               if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
> -                       scx_bpf_error("failed to look up cpu_ctx");
> -                       goto out;
> -               }
> -
>                 /* collect target */
>                 cur = cpuc->cpuperf_target;
>                 target_sum += cur;
> @@ -846,14 +822,14 @@ static void monitor_cpuperf(void)
>                 target_max = cur > target_max ? cur : target_max;
>         }
>
> -       cpuperf_min = cur_min;
> -       cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
> -       cpuperf_max = cur_max;
> +       qa.cpuperf_min = cur_min;
> +       qa.cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
> +       qa.cpuperf_max = cur_max;
> +
> +       qa.cpuperf_target_min = target_min;
> +       qa.cpuperf_target_avg = target_sum / nr_online_cpus;
> +       qa.cpuperf_target_max = target_max;
>
> -       cpuperf_target_min = target_min;
> -       cpuperf_target_avg = target_sum / nr_online_cpus;
> -       cpuperf_target_max = target_max;
> -out:
>         scx_bpf_put_cpumask(online);
>  }
>
> @@ -996,8 +972,8 @@ s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
>         s32 i;
>
>         for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> -               if (!sub_sched_cgroup_ids[i]) {
> -                       sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
> +               if (!qa.sub_sched_cgroup_ids[i]) {
> +                       qa.sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
>                         bpf_printk("attaching sub-sched[%d] on %s",
>                                    i, args->cgroup_path);
>                         return 0;
> @@ -1012,8 +988,8 @@ void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
>         s32 i;
>
>         for (i = 0; i < MAX_SUB_SCHEDS; i++) {
> -               if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
> -                       sub_sched_cgroup_ids[i] = 0;
> +               if (qa.sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
> +                       qa.sub_sched_cgroup_ids[i] = 0;
>                         bpf_printk("detaching sub-sched[%d] on %s",
>                                    i, args->cgroup_path);
>                         break;
> diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
> index e7c89a2bc3d8..8844499c14c4 100644
> --- a/tools/sched_ext/scx_qmap.c
> +++ b/tools/sched_ext/scx_qmap.c
> @@ -10,9 +10,11 @@
>  #include <inttypes.h>
>  #include <signal.h>
>  #include <libgen.h>
> +#include <sys/mman.h>
>  #include <sys/stat.h>
>  #include <bpf/bpf.h>
>  #include <scx/common.h>
> +#include "scx_qmap.h"
>  #include "scx_qmap.bpf.skel.h"
>
>  const char help_fmt[] =
> @@ -60,6 +62,8 @@ int main(int argc, char **argv)
>  {
>         struct scx_qmap *skel;
>         struct bpf_link *link;
> +       struct qmap_arena *qa;
> +       __u32 test_error_cnt = 0;
>         int opt;
>
>         libbpf_set_print(libbpf_print_fn);
> @@ -76,7 +80,7 @@ int main(int argc, char **argv)
>                         skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
>                         break;
>                 case 'e':
> -                       skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
> +                       test_error_cnt = strtoul(optarg, NULL, 0);
>                         break;
>                 case 't':
>                         skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
> @@ -142,29 +146,32 @@ int main(int argc, char **argv)
>         SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
>         link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
>
> +       qa = &skel->arena->qa;
> +       qa->test_error_cnt = test_error_cnt;
> +
>         while (!exit_req && !UEI_EXITED(skel, uei)) {
> -               long nr_enqueued = skel->bss->nr_enqueued;
> -               long nr_dispatched = skel->bss->nr_dispatched;
> +               long nr_enqueued = qa->nr_enqueued;
> +               long nr_dispatched = qa->nr_dispatched;
>
> -               printf("stats  : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
> +               printf("stats  : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%llu/%llu deq=%llu core=%llu enq_ddsp=%llu\n",
>                        nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
> -                      skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
> -                      skel->bss->nr_dequeued,
> -                      skel->bss->nr_core_sched_execed,
> -                      skel->bss->nr_ddsp_from_enq);
> -               printf("         exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
> -                      skel->bss->nr_expedited_local,
> -                      skel->bss->nr_expedited_remote,
> -                      skel->bss->nr_expedited_from_timer,
> -                      skel->bss->nr_expedited_lost);
> +                      qa->nr_reenqueued, qa->nr_reenqueued_cpu0,
> +                      qa->nr_dequeued,
> +                      qa->nr_core_sched_execed,
> +                      qa->nr_ddsp_from_enq);
> +               printf("         exp_local=%llu exp_remote=%llu exp_timer=%llu exp_lost=%llu\n",
> +                      qa->nr_expedited_local,
> +                      qa->nr_expedited_remote,
> +                      qa->nr_expedited_from_timer,
> +                      qa->nr_expedited_lost);
>                 if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
>                         printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
> -                              skel->bss->cpuperf_min,
> -                              skel->bss->cpuperf_avg,
> -                              skel->bss->cpuperf_max,
> -                              skel->bss->cpuperf_target_min,
> -                              skel->bss->cpuperf_target_avg,
> -                              skel->bss->cpuperf_target_max);
> +                              qa->cpuperf_min,
> +                              qa->cpuperf_avg,
> +                              qa->cpuperf_max,
> +                              qa->cpuperf_target_min,
> +                              qa->cpuperf_target_avg,
> +                              qa->cpuperf_target_max);
>                 fflush(stdout);
>                 sleep(1);
>         }
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> new file mode 100644
> index 000000000000..52153230bfce
> --- /dev/null
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -0,0 +1,57 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Shared definitions between scx_qmap.bpf.c and scx_qmap.c.
> + *
> + * The scheduler keeps all mutable state in a single BPF arena map. struct
> + * qmap_arena is the one object that lives at the base of the arena and is
> + * mmap'd into userspace so the loader can read counters directly.
> + *
> + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
> + * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
> + */
> +#ifndef __SCX_QMAP_H
> +#define __SCX_QMAP_H
> +
> +#ifdef __BPF__
> +#include <scx/bpf_arena_common.bpf.h>
> +#else
> +#include <linux/types.h>
> +#include <scx/bpf_arena_common.h>
> +#endif
> +
> +#define MAX_SUB_SCHEDS         8
> +
> +/*
> + * cpu_ctxs[] is sized to a fixed cap so the layout is shared between BPF and
> + * userspace. Keep this in sync with NR_CPUS used by the BPF side.
> + */
> +#define SCX_QMAP_MAX_CPUS      1024
> +
> +struct cpu_ctx {
> +       __u64 dsp_idx;          /* dispatch index */
> +       __u64 dsp_cnt;          /* remaining count */
> +       __u32 avg_weight;
> +       __u32 cpuperf_target;
> +};
> +
> +struct qmap_arena {
> +       /* userspace-visible stats */
> +       __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> +       __u64 nr_dequeued, nr_ddsp_from_enq;
> +       __u64 nr_core_sched_execed;
> +       __u64 nr_expedited_local, nr_expedited_remote;
> +       __u64 nr_expedited_lost, nr_expedited_from_timer;
> +       __u64 nr_highpri_queued;
> +       __u32 test_error_cnt;
> +       __u32 cpuperf_min, cpuperf_avg, cpuperf_max;
> +       __u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
> +
> +       /* kernel-side runtime state */
> +       __u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
> +       __u64 core_sched_head_seqs[5];
> +       __u64 core_sched_tail_seqs[5];
> +
> +       struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
> +};
> +
> +#endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab
  2026-04-16  8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
@ 2026-04-16 15:31   ` Emil Tsalapatis
  0 siblings, 0 replies; 11+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
	sched-ext, linux-kernel

On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Allocate per-task context from an arena slab instead of storing it
> directly in task_storage. task_ctx_stor now holds an arena pointer to
> the task's slab entry. Free entries form a singly-linked list protected
> by bpf_res_spin_lock; slab exhaustion triggers scx_bpf_error().
>
> The slab size is configurable via the new -N option (default 16384).
>
> Also add bpf_res_spin_lock/unlock declarations to common.bpf.h.
>
> Scheduling logic unchanged.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

One nit: since we never have non-arena task_ctxs, we can do

typedef struct task_ctx __arena task_ctx;

to avoid annotating every instance.

>  tools/sched_ext/include/scx/common.bpf.h |   4 +
>  tools/sched_ext/scx_qmap.bpf.c           | 178 ++++++++++++++++++-----
>  tools/sched_ext/scx_qmap.c               |   9 +-
>  tools/sched_ext/scx_qmap.h               |   7 +
>  4 files changed, 159 insertions(+), 39 deletions(-)
>
> diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
> index 19459dedde41..35fc62556241 100644
> --- a/tools/sched_ext/include/scx/common.bpf.h
> +++ b/tools/sched_ext/include/scx/common.bpf.h
> @@ -526,6 +526,10 @@ static inline bool is_migration_disabled(const struct task_struct *p)
>  void bpf_rcu_read_lock(void) __ksym;
>  void bpf_rcu_read_unlock(void) __ksym;
>
> +/* resilient qspinlock */
> +int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym __weak;
> +void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym __weak;
> +
>  /*
>   * Time helpers, most of which are from jiffies.h.
>   */
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 0f8fbb6d0bc2..e071969c8f32 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -49,6 +49,7 @@ const volatile s32 disallow_tgid;
>  const volatile bool suppress_dump;
>  const volatile bool always_enq_immed;
>  const volatile u32 immed_stress_nth;
> +const volatile u32 max_tasks;
>
>  UEI_DEFINE(uei);
>
> @@ -117,20 +118,43 @@ static const u32 qidx_to_cpuperf_target[] = {
>   * and used when comparing two tasks for ordering. See qmap_core_sched_before().
>   */
>
> -/* Per-task scheduling context */
> +/*
> + * Per-task scheduling context. Allocated from the qa.task_ctxs[] slab in
> + * arena. While the task is alive the entry is referenced from task_ctx_stor;
> + * while it's free the entry sits on the free list singly-linked through
> + * @next_free.
> + */
>  struct task_ctx {
> -       bool    force_local;    /* Dispatch directly to local_dsq */
> -       bool    highpri;
> -       u64     core_sched_seq;
> +       struct task_ctx __arena *next_free;     /* only valid on free list */
> +       bool                    force_local;    /* Dispatch directly to local_dsq */
> +       bool                    highpri;
> +       u64                     core_sched_seq;
> +};
> +
> +/* Holds an arena pointer to the task's slab entry. */
> +struct task_ctx_stor_val {
> +       struct task_ctx __arena *taskc;
>  };
>
>  struct {
>         __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
>         __uint(map_flags, BPF_F_NO_PREALLOC);
>         __type(key, int);
> -       __type(value, struct task_ctx);
> +       __type(value, struct task_ctx_stor_val);
>  } task_ctx_stor SEC(".maps");
>
> +/* Protects the task_ctx slab free list. */
> +__hidden struct bpf_res_spin_lock qa_task_lock SEC(".data.qa_task_lock");
> +
> +static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
> +{
> +       if (bpf_res_spin_lock(lock)) {
> +               scx_bpf_error("res_spin_lock failed");
> +               return -EBUSY;
> +       }
> +       return 0;
> +}
> +
>  static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
>  {
>         s32 cpu;
> @@ -148,21 +172,34 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
>         return -1;
>  }
>
> -static struct task_ctx *lookup_task_ctx(struct task_struct *p)
> +/*
> + * Force a reference to the arena map. The verifier associates an arena with
> + * a program by finding an LD_IMM64 instruction that loads the arena's BPF
> + * map; programs that only use arena pointers returned from task-local
> + * storage (like qmap_select_cpu) never reference @arena directly. Without
> + * this, the verifier rejects addr_space_cast with "addr_space_cast insn
> + * can only be used in a program that has an associated arena".
> + */
> +#define QMAP_TOUCH_ARENA() do { asm volatile("" :: "r"(&arena)); } while (0)
> +

Really nice that this works when placed as a macro.

> +static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
>  {
> -       struct task_ctx *taskc;
> +       struct task_ctx_stor_val *v;
> +
> +       QMAP_TOUCH_ARENA();
>
> -       if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
> +       v = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> +       if (!v || !v->taskc) {
>                 scx_bpf_error("task_ctx lookup failed");
>                 return NULL;
>         }
> -       return taskc;
> +       return v->taskc;
>  }
>
>  s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
>                    s32 prev_cpu, u64 wake_flags)
>  {
> -       struct task_ctx *taskc;
> +       struct task_ctx __arena *taskc;
>         s32 cpu;
>
>         if (!(taskc = lookup_task_ctx(p)))
> @@ -199,7 +236,7 @@ static int weight_to_idx(u32 weight)
>  void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>  {
>         static u32 user_cnt, kernel_cnt;
> -       struct task_ctx *taskc;
> +       struct task_ctx __arena *taskc;
>         u32 pid = p->pid;
>         int idx = weight_to_idx(p->scx.weight);
>         void *ring;
> @@ -321,7 +358,7 @@ void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
>  static void update_core_sched_head_seq(struct task_struct *p)
>  {
>         int idx = weight_to_idx(p->scx.weight);
> -       struct task_ctx *taskc;
> +       struct task_ctx __arena *taskc;
>
>         if ((taskc = lookup_task_ctx(p)))
>                 qa.core_sched_head_seqs[idx] = taskc->core_sched_seq;
> @@ -345,7 +382,7 @@ static bool dispatch_highpri(bool from_timer)
>         /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
>         bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
>                 static u64 highpri_seq;
> -               struct task_ctx *taskc;
> +               struct task_ctx __arena *taskc;
>
>                 if (!(taskc = lookup_task_ctx(p)))
>                         return false;
> @@ -396,7 +433,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>  {
>         struct task_struct *p;
>         struct cpu_ctx __arena *cpuc;
> -       struct task_ctx *taskc;
> +       struct task_ctx __arena *taskc;
>         u32 batch = dsp_batch ?: 1;
>         void *fifo;
>         s32 i, pid;
> @@ -440,7 +477,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>
>                 /* Dispatch or advance. */
>                 bpf_repeat(BPF_MAX_LOOPS) {
> -                       struct task_ctx *taskc;
> +                       struct task_ctx __arena *taskc;
>
>                         if (bpf_map_pop_elem(fifo, &pid))
>                                 break;
> @@ -529,11 +566,9 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>          * if the task were enqueued and dispatched immediately.
>          */
>         if (prev) {
> -               taskc = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
> -               if (!taskc) {
> -                       scx_bpf_error("task_ctx lookup failed");
> +               taskc = lookup_task_ctx(prev);
> +               if (!taskc)
>                         return;
> -               }
>
>                 taskc->core_sched_seq =
>                         qa.core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
> @@ -564,14 +599,12 @@ void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
>  static s64 task_qdist(struct task_struct *p)
>  {
>         int idx = weight_to_idx(p->scx.weight);
> -       struct task_ctx *taskc;
> +       struct task_ctx __arena *taskc;
>         s64 qdist;
>
> -       taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
> -       if (!taskc) {
> -               scx_bpf_error("task_ctx lookup failed");
> +       taskc = lookup_task_ctx(p);
> +       if (!taskc)
>                 return 0;
> -       }
>
>         qdist = taskc->core_sched_seq - qa.core_sched_head_seqs[idx];
>
> @@ -606,21 +639,64 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
>   * tasks when a higher-priority scheduling class takes the CPU.
>   */
>
> -s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
> -                  struct scx_init_task_args *args)
> +s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
> +                            struct scx_init_task_args *args)
>  {
> +       struct task_ctx_stor_val *v;
> +       struct task_ctx __arena *taskc;
> +
>         if (p->tgid == disallow_tgid)
>                 p->scx.disallow = true;
>
> -       /*
> -        * @p is new. Let's ensure that its task_ctx is available. We can sleep
> -        * in this function and the following will automatically use GFP_KERNEL.
> -        */
> -       if (bpf_task_storage_get(&task_ctx_stor, p, 0,
> -                                BPF_LOCAL_STORAGE_GET_F_CREATE))
> -               return 0;
> -       else
> +       /* pop a slab entry off the free list */
> +       if (qmap_spin_lock(&qa_task_lock))
> +               return -EBUSY;
> +       taskc = qa.task_free_head;
> +       if (taskc)
> +               qa.task_free_head = taskc->next_free;
> +       bpf_res_spin_unlock(&qa_task_lock);
> +       if (!taskc) {
> +               scx_bpf_error("task_ctx slab exhausted (max_tasks=%u)", max_tasks);
> +               return -ENOMEM;
> +       }
> +
> +       taskc->next_free = NULL;
> +       taskc->force_local = false;
> +       taskc->highpri = false;
> +       taskc->core_sched_seq = 0;
> +
> +       v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
> +                                BPF_LOCAL_STORAGE_GET_F_CREATE);
> +       if (!v) {
> +               /* push back to the free list */
> +               if (!qmap_spin_lock(&qa_task_lock)) {
> +                       taskc->next_free = qa.task_free_head;
> +                       qa.task_free_head = taskc;
> +                       bpf_res_spin_unlock(&qa_task_lock);
> +               }
>                 return -ENOMEM;
> +       }
> +       v->taskc = taskc;
> +       return 0;
> +}
> +
> +void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
> +                   struct scx_exit_task_args *args)
> +{
> +       struct task_ctx_stor_val *v;
> +       struct task_ctx __arena *taskc;
> +
> +       v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> +       if (!v || !v->taskc)
> +               return;
> +       taskc = v->taskc;
> +       v->taskc = NULL;
> +
> +       if (qmap_spin_lock(&qa_task_lock))
> +               return;
> +       taskc->next_free = qa.task_free_head;
> +       qa.task_free_head = taskc;
> +       bpf_res_spin_unlock(&qa_task_lock);
>  }
>
>  void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
> @@ -675,12 +751,17 @@ void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle
>
>  void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
>  {
> -       struct task_ctx *taskc;
> +       struct task_ctx_stor_val *v;
> +       struct task_ctx __arena *taskc;
> +
> +       QMAP_TOUCH_ARENA();
>
>         if (suppress_dump)
>                 return;
> -       if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
> +       v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> +       if (!v || !v->taskc)
>                 return;
> +       taskc = v->taskc;
>
>         scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
>                      taskc->force_local, taskc->core_sched_seq);
> @@ -915,10 +996,32 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
>
>  s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
>  {
> -       u32 key = 0;
> +       struct task_ctx __arena *slab;
> +       u32 nr_pages, key = 0, i;
>         struct bpf_timer *timer;
>         s32 ret;
>
> +       /*
> +        * Allocate the task_ctx slab in arena and thread the entire slab onto
> +        * the free list. max_tasks is set by userspace before load.
> +        */
> +       if (!max_tasks) {
> +               scx_bpf_error("max_tasks must be > 0");
> +               return -EINVAL;
> +       }
> +
> +       nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
> +       slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
> +       if (!slab) {
> +               scx_bpf_error("failed to allocate task_ctx slab");
> +               return -ENOMEM;
> +       }
> +       qa.task_ctxs = slab;
> +
> +       bpf_for(i, 0, max_tasks)
> +               slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
> +       qa.task_free_head = &slab[0];
> +
>         if (print_msgs && !sub_cgroup_id)
>                 print_cpus();
>
> @@ -1005,6 +1108,7 @@ SCX_OPS_DEFINE(qmap_ops,
>                .tick                    = (void *)qmap_tick,
>                .core_sched_before       = (void *)qmap_core_sched_before,
>                .init_task               = (void *)qmap_init_task,
> +              .exit_task               = (void *)qmap_exit_task,
>                .dump                    = (void *)qmap_dump,
>                .dump_cpu                = (void *)qmap_dump_cpu,
>                .dump_task               = (void *)qmap_dump_task,
> diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
> index 8844499c14c4..4bdcc4bc5fbd 100644
> --- a/tools/sched_ext/scx_qmap.c
> +++ b/tools/sched_ext/scx_qmap.c
> @@ -23,12 +23,13 @@ const char help_fmt[] =
>  "See the top-level comment in .bpf.c for more details.\n"
>  "\n"
>  "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
> -"       [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
> +"       [-N COUNT] [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
>  "\n"
>  "  -s SLICE_US   Override slice duration\n"
>  "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
>  "  -t COUNT      Stall every COUNT'th user thread\n"
>  "  -T COUNT      Stall every COUNT'th kernel thread\n"
> +"  -N COUNT      Size of the task_ctx arena slab (default 16384)\n"
>  "  -l COUNT      Trigger dispatch infinite looping after COUNT dispatches\n"
>  "  -b COUNT      Dispatch upto COUNT tasks together\n"
>  "  -P            Print out DSQ content and event counters to trace_pipe every second\n"
> @@ -73,8 +74,9 @@ int main(int argc, char **argv)
>         skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
>
>         skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
> +       skel->rodata->max_tasks = 16384;
>
> -       while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
> +       while ((opt = getopt(argc, argv, "s:e:t:T:l:b:N:PMHc:d:D:SpIF:vh")) != -1) {
>                 switch (opt) {
>                 case 's':
>                         skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
> @@ -94,6 +96,9 @@ int main(int argc, char **argv)
>                 case 'b':
>                         skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
>                         break;
> +               case 'N':
> +                       skel->rodata->max_tasks = strtoul(optarg, NULL, 0);
> +                       break;
>                 case 'P':
>                         skel->rodata->print_dsqs_and_events = true;
>                         break;
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> index 52153230bfce..c183d82632b3 100644
> --- a/tools/sched_ext/scx_qmap.h
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -34,6 +34,9 @@ struct cpu_ctx {
>         __u32 cpuperf_target;
>  };
>
> +/* Opaque to userspace; defined in scx_qmap.bpf.c. */
> +struct task_ctx;
> +
>  struct qmap_arena {
>         /* userspace-visible stats */
>         __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> @@ -52,6 +55,10 @@ struct qmap_arena {
>         __u64 core_sched_tail_seqs[5];
>
>         struct cpu_ctx cpu_ctxs[SCX_QMAP_MAX_CPUS];
> +
> +       /* task_ctx slab; allocated and threaded by qmap_init() */
> +       struct task_ctx __arena *task_ctxs;
> +       struct task_ctx __arena *task_free_head;
>  };
>
>  #endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists
  2026-04-16  8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
  2026-04-16 10:01   ` Andrea Righi
@ 2026-04-16 15:45   ` Emil Tsalapatis
  1 sibling, 0 replies; 11+ messages in thread
From: Emil Tsalapatis @ 2026-04-16 15:45 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Andrea Righi, Changwoo Min, Emil Tsalapatis,
	sched-ext, linux-kernel

On Thu, Apr 16, 2026 at 1:20 AM Tejun Heo <tj@kernel.org> wrote:
>
> >
> Arena simplifies verification and allows more natural programming.
> Convert scx_qmap to arena as preparation for further sub-sched work.
>
> Replace the five priority BPF_MAP_TYPE_QUEUE maps (along with the
> dump_store scratch queue, which is no longer needed now that qmap_dump()
> can walk the lists non-destructively) with doubly-linked lists in
> arena, threaded through task_ctx. Each queue is a struct qmap_fifo with
> head/tail pointers and its own per-queue bpf_res_spin_lock.
>
> qmap_dequeue() now properly removes tasks from the queue instead of
> leaving stale entries for dispatch to skip.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

>  tools/sched_ext/scx_qmap.bpf.c | 221 +++++++++++++++++++++------------
>  tools/sched_ext/scx_qmap.h     |   9 ++
>  2 files changed, 148 insertions(+), 82 deletions(-)
>
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index e071969c8f32..c26997ff7863 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -71,31 +71,24 @@ struct {
>
>  struct qmap_arena __arena qa;
>
> -struct qmap {
> -       __uint(type, BPF_MAP_TYPE_QUEUE);
> -       __uint(max_entries, 4096);
> -       __type(value, u32);
> -} queue0 SEC(".maps"),
> -  queue1 SEC(".maps"),
> -  queue2 SEC(".maps"),
> -  queue3 SEC(".maps"),
> -  queue4 SEC(".maps"),
> -  dump_store SEC(".maps");
> -
> -struct {
> -       __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
> -       __uint(max_entries, 5);
> -       __type(key, int);
> -       __array(values, struct qmap);
> -} queue_arr SEC(".maps") = {
> -       .values = {
> -               [0] = &queue0,
> -               [1] = &queue1,
> -               [2] = &queue2,
> -               [3] = &queue3,
> -               [4] = &queue4,
> -       },
> -};
> +/* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
> +__hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
> +__hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
> +__hidden struct bpf_res_spin_lock qa_q_lock2 SEC(".data.qa_q_lock2");
> +__hidden struct bpf_res_spin_lock qa_q_lock3 SEC(".data.qa_q_lock3");
> +__hidden struct bpf_res_spin_lock qa_q_lock4 SEC(".data.qa_q_lock4");
> +
> +static struct bpf_res_spin_lock *qa_q_lock(s32 qid)
> +{
> +       switch (qid) {
> +       case 0: return &qa_q_lock0;
> +       case 1: return &qa_q_lock1;
> +       case 2: return &qa_q_lock2;
> +       case 3: return &qa_q_lock3;
> +       case 4: return &qa_q_lock4;
> +       default: return NULL;
> +       }
> +}
>
>  /*
>   * If enabled, CPU performance target is set according to the queue index
> @@ -123,9 +116,17 @@ static const u32 qidx_to_cpuperf_target[] = {
>   * arena. While the task is alive the entry is referenced from task_ctx_stor;
>   * while it's free the entry sits on the free list singly-linked through
>   * @next_free.
> + *
> + * When the task is queued on one of the five priority FIFOs, @q_idx is the
> + * queue index and @q_next/@q_prev link it in the queue's doubly-linked list.
> + * @q_idx is -1 when the task isn't on any queue.
>   */
>  struct task_ctx {
>         struct task_ctx __arena *next_free;     /* only valid on free list */
> +       struct task_ctx __arena *q_next;        /* queue link, NULL if tail */
> +       struct task_ctx __arena *q_prev;        /* queue link, NULL if head */
> +       struct qmap_fifo __arena *fifo;         /* queue we're on, NULL if not queued */
> +       s32                     pid;
>         bool                    force_local;    /* Dispatch directly to local_dsq */
>         bool                    highpri;
>         u64                     core_sched_seq;
> @@ -196,6 +197,81 @@ static struct task_ctx __arena *lookup_task_ctx(struct task_struct *p)
>         return v->taskc;
>  }
>
> +/* Append @taskc to the tail of @fifo. Must not already be queued. */
> +static void qmap_fifo_enqueue(struct qmap_fifo __arena *fifo,
> +                             struct task_ctx __arena *taskc)
> +{
> +       struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> +
> +       if (!lock || qmap_spin_lock(lock))
> +               return;
> +       taskc->fifo = fifo;
> +       taskc->q_next = NULL;
> +       taskc->q_prev = fifo->tail;
> +       if (fifo->tail)
> +               fifo->tail->q_next = taskc;
> +       else
> +               fifo->head = taskc;
> +       fifo->tail = taskc;
> +       bpf_res_spin_unlock(lock);
> +}
> +
> +/* Pop the head of @fifo. Returns NULL if empty. */
> +static struct task_ctx __arena *qmap_fifo_pop(struct qmap_fifo __arena *fifo)
> +{
> +       struct bpf_res_spin_lock *lock = qa_q_lock(fifo->idx);
> +       struct task_ctx __arena *taskc;
> +
> +       if (!lock || qmap_spin_lock(lock))
> +               return NULL;
> +       taskc = fifo->head;
> +       if (taskc) {
> +               fifo->head = taskc->q_next;
> +               if (taskc->q_next)
> +                       taskc->q_next->q_prev = NULL;
> +               else
> +                       fifo->tail = NULL;
> +               taskc->q_next = NULL;
> +               taskc->q_prev = NULL;
> +               taskc->fifo = NULL;
> +       }
> +       bpf_res_spin_unlock(lock);
> +       return taskc;
> +}
> +
> +/* Remove @taskc from its fifo. No-op if not queued. */
> +static void qmap_fifo_remove(struct task_ctx __arena *taskc)
> +{
> +       struct qmap_fifo __arena *fifo = taskc->fifo;
> +       struct bpf_res_spin_lock *lock;
> +
> +       if (!fifo)
> +               return;
> +
> +       lock = qa_q_lock(fifo->idx);
> +       if (!lock || qmap_spin_lock(lock))
> +               return;
> +
> +       /* Re-check under lock — a concurrent pop may have cleared fifo. */
> +       if (taskc->fifo != fifo) {
> +               bpf_res_spin_unlock(lock);
> +               return;
> +       }
> +
> +       if (taskc->q_next)
> +               taskc->q_next->q_prev = taskc->q_prev;
> +       else
> +               fifo->tail = taskc->q_prev;
> +       if (taskc->q_prev)
> +               taskc->q_prev->q_next = taskc->q_next;
> +       else
> +               fifo->head = taskc->q_next;
> +       taskc->q_next = NULL;
> +       taskc->q_prev = NULL;
> +       taskc->fifo = NULL;
> +       bpf_res_spin_unlock(lock);
> +}
> +
>  s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
>                    s32 prev_cpu, u64 wake_flags)
>  {
> @@ -237,9 +313,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>  {
>         static u32 user_cnt, kernel_cnt;
>         struct task_ctx __arena *taskc;
> -       u32 pid = p->pid;
>         int idx = weight_to_idx(p->scx.weight);
> -       void *ring;
>         s32 cpu;
>
>         if (enq_flags & SCX_ENQ_REENQ) {
> @@ -325,17 +399,8 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>                 return;
>         }
>
> -       ring = bpf_map_lookup_elem(&queue_arr, &idx);
> -       if (!ring) {
> -               scx_bpf_error("failed to find ring %d", idx);
> -               return;
> -       }
> -
> -       /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
> -       if (bpf_map_push_elem(ring, &pid, 0)) {
> -               scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
> -               return;
> -       }
> +       /* Queue on the selected FIFO. */
> +       qmap_fifo_enqueue(&qa.fifos[idx], taskc);
>
>         if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
>                 taskc->highpri = true;
> @@ -344,15 +409,20 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
>         __sync_fetch_and_add(&qa.nr_enqueued, 1);
>  }
>
> -/*
> - * The BPF queue map doesn't support removal and sched_ext can handle spurious
> - * dispatches. qmap_dequeue() is only used to collect statistics.
> - */
>  void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
>  {
> +       struct task_ctx __arena *taskc;
> +
>         __sync_fetch_and_add(&qa.nr_dequeued, 1);
>         if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
>                 __sync_fetch_and_add(&qa.nr_core_sched_execed, 1);
> +
> +       taskc = lookup_task_ctx(p);
> +       if (taskc && taskc->fifo) {
> +               if (taskc->highpri)
> +                       __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
> +               qmap_fifo_remove(taskc);
> +       }
>  }
>
>  static void update_core_sched_head_seq(struct task_struct *p)
> @@ -435,8 +505,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>         struct cpu_ctx __arena *cpuc;
>         struct task_ctx __arena *taskc;
>         u32 batch = dsp_batch ?: 1;
> -       void *fifo;
> -       s32 i, pid;
> +       s32 i;
>
>         if (dispatch_highpri(false))
>                 return;
> @@ -467,30 +536,18 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
>                         cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
>                 }
>
> -               u64 dsp_idx = cpuc->dsp_idx;
> -
> -               fifo = bpf_map_lookup_elem(&queue_arr, &dsp_idx);
> -               if (!fifo) {
> -                       scx_bpf_error("failed to find ring %llu", dsp_idx);
> -                       return;
> -               }
> -
>                 /* Dispatch or advance. */
>                 bpf_repeat(BPF_MAX_LOOPS) {
>                         struct task_ctx __arena *taskc;
>
> -                       if (bpf_map_pop_elem(fifo, &pid))
> +                       taskc = qmap_fifo_pop(&qa.fifos[cpuc->dsp_idx]);
> +                       if (!taskc)
>                                 break;
>
> -                       p = bpf_task_from_pid(pid);
> +                       p = bpf_task_from_pid(taskc->pid);
>                         if (!p)
>                                 continue;
>
> -                       if (!(taskc = lookup_task_ctx(p))) {
> -                               bpf_task_release(p);
> -                               return;
> -                       }
> -
>                         if (taskc->highpri)
>                                 __sync_fetch_and_sub(&qa.nr_highpri_queued, 1);
>
> @@ -661,6 +718,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
>         }
>
>         taskc->next_free = NULL;
> +       taskc->q_next = NULL;
> +       taskc->q_prev = NULL;
> +       taskc->fifo = NULL;
> +       taskc->pid = p->pid;
>         taskc->force_local = false;
>         taskc->highpri = false;
>         taskc->core_sched_seq = 0;
> @@ -701,38 +762,29 @@ void BPF_STRUCT_OPS(qmap_exit_task, struct task_struct *p,
>
>  void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
>  {
> -       s32 i, pid;
> +       struct task_ctx __arena *taskc;
> +       s32 i;
> +
> +       QMAP_TOUCH_ARENA();
>
>         if (suppress_dump)
>                 return;
>
> +       /*
> +        * Walk the queue lists without locking - kfunc calls (scx_bpf_dump)
> +        * aren't in the verifier's kfunc_spin_allowed() list so we can't hold
> +        * a lock and dump. Best-effort; racing may print stale pids but the
> +        * walk is bounded by bpf_repeat() so it always terminates.
> +        */
>         bpf_for(i, 0, 5) {
> -               void *fifo;
> -
> -               if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
> -                       return;
> -
>                 scx_bpf_dump("QMAP FIFO[%d]:", i);
> -
> -               /*
> -                * Dump can be invoked anytime and there is no way to iterate in
> -                * a non-destructive way. Pop and store in dump_store and then
> -                * restore afterwards. If racing against new enqueues, ordering
> -                * can get mixed up.
> -                */
> -               bpf_repeat(4096) {
> -                       if (bpf_map_pop_elem(fifo, &pid))
> -                               break;
> -                       bpf_map_push_elem(&dump_store, &pid, 0);
> -                       scx_bpf_dump(" %d", pid);
> -               }
> -
> +               taskc = qa.fifos[i].head;
>                 bpf_repeat(4096) {
> -                       if (bpf_map_pop_elem(&dump_store, &pid))
> +                       if (!taskc)
>                                 break;
> -                       bpf_map_push_elem(fifo, &pid, 0);
> +                       scx_bpf_dump(" %d", taskc->pid);
> +                       taskc = taskc->q_next;
>                 }
> -
>                 scx_bpf_dump("\n");
>         }
>  }
> @@ -756,6 +808,6 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
>
>         QMAP_TOUCH_ARENA();
>
>         if (suppress_dump)
>                 return;
>         v = bpf_task_storage_get(&task_ctx_stor, p, NULL, 0);
> @@ -1018,6 +1072,9 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
>         }
>         qa.task_ctxs = slab;
>
> +       bpf_for(i, 0, 5)
> +               qa.fifos[i].idx = i;
> +
>         bpf_for(i, 0, max_tasks)
>                 slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
>         qa.task_free_head = &slab[0];
> diff --git a/tools/sched_ext/scx_qmap.h b/tools/sched_ext/scx_qmap.h
> index c183d82632b3..9c0da5a301cb 100644
> --- a/tools/sched_ext/scx_qmap.h
> +++ b/tools/sched_ext/scx_qmap.h
> @@ -37,6 +37,12 @@ struct cpu_ctx {
>  /* Opaque to userspace; defined in scx_qmap.bpf.c. */
>  struct task_ctx;
>
> +struct qmap_fifo {
> +       struct task_ctx __arena *head;
> +       struct task_ctx __arena *tail;
> +       __s32 idx;
> +};
> +
>  struct qmap_arena {
>         /* userspace-visible stats */
>         __u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0;
> @@ -59,6 +65,9 @@ struct qmap_arena {
>         /* task_ctx slab; allocated and threaded by qmap_init() */
>         struct task_ctx __arena *task_ctxs;
>         struct task_ctx __arena *task_free_head;
> +
> +       /* five priority FIFOs, each a doubly-linked list through task_ctx */
> +       struct qmap_fifo fifos[5];
>  };
>
>  #endif /* __SCX_QMAP_H */
> --
> 2.53.0
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2026-04-16 15:45 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-16  8:16 [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Tejun Heo
2026-04-16  8:16 ` [PATCH 1/4] sched_ext: scx_qmap: rename tctx to taskc Tejun Heo
2026-04-16 14:56   ` Emil Tsalapatis
2026-04-16  8:16 ` [PATCH 2/4] sched_ext: scx_qmap: move globals and cpu_ctx into a BPF arena map Tejun Heo
2026-04-16 15:28   ` Emil Tsalapatis
2026-04-16  8:16 ` [PATCH 3/4] sched_ext: scx_qmap: move task_ctx into a BPF arena slab Tejun Heo
2026-04-16 15:31   ` Emil Tsalapatis
2026-04-16  8:16 ` [PATCH 4/4] sched_ext: scx_qmap: replace FIFO queue maps with arena-backed lists Tejun Heo
2026-04-16 10:01   ` Andrea Righi
2026-04-16 15:45   ` Emil Tsalapatis
2026-04-16 10:05 ` [PATCHSET sched_ext/for-7.2] sched_ext: scx_qmap: Convert to BPF arena Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox