public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: void@manifault.com, arighi@nvidia.com, changwoo@igalia.com
Cc: sched-ext@lists.linux.dev, emil@etsalapatis.com,
	linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 14/16] tools/sched_ext: scx_qmap: Add cmask-based idle tracking and cid-based idle pick
Date: Mon, 20 Apr 2026 21:19:43 -1000	[thread overview]
Message-ID: <20260421071945.3110084-15-tj@kernel.org> (raw)
In-Reply-To: <20260421071945.3110084-1-tj@kernel.org>

Switch qmap's idle-cpu picker from scx_bpf_pick_idle_cpu() to a
BPF-side bitmap scan, still under cpu-form struct_ops. qa_idle_cids
tracks idle cids (updated in update_idle / cpu_offline) and each
task's taskc->cpus_allowed tracks its allowed cids (built in
set_cpumask / init_task); select_cpu / enqueue scan the intersection
for an idle cid. Callbacks translate cpu <-> cid on entry;
cid-qmap-port drops those translations.

The scan is bare-bones - no core preference or other topology-aware
picks like the in-kernel picker - but qmap is a demo and this is
enough to exercise the plumbing.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_qmap.bpf.c | 131 +++++++++++++++++++++++++++++----
 1 file changed, 115 insertions(+), 16 deletions(-)

diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 35a2dc6dd757..d30ec914a118 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -72,6 +72,13 @@ struct {
 
 struct qmap_arena __arena qa;
 
+/*
+ * Global idle-cid tracking, maintained via update_idle / cpu_offline and
+ * scanned by the direct-dispatch path. Allocated in qmap_init() from one
+ * arena page, sized to the full cid space.
+ */
+struct scx_cmask __arena *qa_idle_cids;
+
 /* Per-queue locks. Each in its own .data section as bpf_res_spin_lock requires. */
 __hidden struct bpf_res_spin_lock qa_q_lock0 SEC(".data.qa_q_lock0");
 __hidden struct bpf_res_spin_lock qa_q_lock1 SEC(".data.qa_q_lock1");
@@ -132,8 +139,18 @@ struct task_ctx {
 	bool			force_local;	/* Dispatch directly to local_dsq */
 	bool			highpri;
 	u64			core_sched_seq;
+	struct scx_cmask	cpus_allowed;	/* per-task affinity in cid space */
 };
 
+/*
+ * Slab stride for task_ctx. cpus_allowed's flex array bits[] overlaps the
+ * tail bytes appended per entry; struct_size() gives the actual per-entry
+ * footprint.
+ */
+#define TASK_CTX_STRIDE							\
+	struct_size_t(struct task_ctx, cpus_allowed.bits,		\
+		      CMASK_NR_WORDS(SCX_QMAP_MAX_CPUS))
+
 /* All task_ctx pointers are arena pointers. */
 typedef struct task_ctx __arena task_ctx_t;
 
@@ -161,20 +178,37 @@ static int qmap_spin_lock(struct bpf_res_spin_lock *lock)
 	return 0;
 }
 
-static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
+/*
+ * Try prev_cpu's cid, then scan taskc->cpus_allowed AND qa_idle_cids
+ * round-robin from prev_cid + 1. Atomic claim retries on race; bounded
+ * by IDLE_PICK_RETRIES to keep the verifier's insn budget in check.
+ */
+#define IDLE_PICK_RETRIES	16
+
+static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu,
+				    task_ctx_t *taskc)
 {
-	s32 cpu;
+	u32 nr_cids = scx_bpf_nr_cids();
+	s32 prev_cid, cid;
+	u32 i;
 
 	if (!always_enq_immed && p->nr_cpus_allowed == 1)
 		return prev_cpu;
 
-	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+	prev_cid = scx_bpf_cpu_to_cid(prev_cpu);
+	if (cmask_test_and_clear(qa_idle_cids, prev_cid))
 		return prev_cpu;
 
-	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
-	if (cpu >= 0)
-		return cpu;
-
+	cid = prev_cid;
+	bpf_for(i, 0, IDLE_PICK_RETRIES) {
+		cid = cmask_next_and_set_wrap(&taskc->cpus_allowed,
+					      qa_idle_cids, cid + 1);
+		barrier_var(cid);
+		if (cid >= nr_cids)
+			return -1;
+		if (cmask_test_and_clear(qa_idle_cids, cid))
+			return scx_bpf_cid_to_cpu(cid);
+	}
 	return -1;
 }
 
@@ -286,7 +320,7 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
 		return prev_cpu;
 
-	cpu = pick_direct_dispatch_cpu(p, prev_cpu);
+	cpu = pick_direct_dispatch_cpu(p, prev_cpu, taskc);
 
 	if (cpu >= 0) {
 		taskc->force_local = true;
@@ -379,7 +413,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 
 	/* if select_cpu() wasn't called, try direct dispatch */
 	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
-	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
+	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p), taskc)) >= 0) {
 		__sync_fetch_and_add(&qa.nr_ddsp_from_enq, 1);
 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
 		return;
@@ -724,6 +758,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init_task, struct task_struct *p,
 	taskc->force_local = false;
 	taskc->highpri = false;
 	taskc->core_sched_seq = 0;
+	cmask_init(&taskc->cpus_allowed, 0, scx_bpf_nr_cids());
+	bpf_rcu_read_lock();
+	cmask_from_cpumask(&taskc->cpus_allowed, p->cpus_ptr);
+	bpf_rcu_read_unlock();
 
 	v = bpf_task_storage_get(&task_ctx_stor, p, NULL,
 				 BPF_LOCAL_STORAGE_GET_F_CREATE);
@@ -841,6 +879,48 @@ void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
 			   cgrp->kn->id, period_us, quota_us, burst_us);
 }
 
+void BPF_STRUCT_OPS(qmap_update_idle, s32 cpu, bool idle)
+{
+	s32 cid = scx_bpf_cpu_to_cid(cpu);
+
+	QMAP_TOUCH_ARENA();
+	if (cid < 0)
+		return;
+	if (idle)
+		cmask_set(qa_idle_cids, cid);
+	else
+		cmask_clear(qa_idle_cids, cid);
+}
+
+/*
+ * The cpumask received here is kernel-address memory; walk it bit by bit
+ * (bpf_cpumask_test_cpu handles the access), convert each set cpu to its
+ * cid, and populate the arena-resident taskc cmask.
+ */
+void BPF_STRUCT_OPS(qmap_set_cpumask, struct task_struct *p,
+		    const struct cpumask *cpumask)
+{
+	task_ctx_t *taskc;
+	u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
+	s32 cpu;
+
+	taskc = lookup_task_ctx(p);
+	if (!taskc)
+		return;
+
+	cmask_zero(&taskc->cpus_allowed);
+
+	bpf_for(cpu, 0, nr_cpu_ids) {
+		s32 cid;
+
+		if (!bpf_cpumask_test_cpu(cpu, cpumask))
+			continue;
+		cid = scx_bpf_cpu_to_cid(cpu);
+		if (cid >= 0)
+			__cmask_set(&taskc->cpus_allowed, cid);
+	}
+}
+
 struct monitor_timer {
 	struct bpf_timer timer;
 };
@@ -990,34 +1070,51 @@ static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 {
-	task_ctx_t *slab;
+	u8 __arena *slab;
 	u32 nr_pages, key = 0, i;
 	struct bpf_timer *timer;
 	s32 ret;
 
 	/*
 	 * Allocate the task_ctx slab in arena and thread the entire slab onto
-	 * the free list. max_tasks is set by userspace before load.
+	 * the free list. max_tasks is set by userspace before load. Each entry
+	 * is TASK_CTX_STRIDE bytes - task_ctx's trailing cpus_allowed flex
+	 * array extends into the stride tail.
 	 */
 	if (!max_tasks) {
 		scx_bpf_error("max_tasks must be > 0");
 		return -EINVAL;
 	}
 
-	nr_pages = (max_tasks * sizeof(struct task_ctx) + PAGE_SIZE - 1) / PAGE_SIZE;
+	nr_pages = (max_tasks * TASK_CTX_STRIDE + PAGE_SIZE - 1) / PAGE_SIZE;
 	slab = bpf_arena_alloc_pages(&arena, NULL, nr_pages, NUMA_NO_NODE, 0);
 	if (!slab) {
 		scx_bpf_error("failed to allocate task_ctx slab");
 		return -ENOMEM;
 	}
-	qa.task_ctxs = slab;
+	qa.task_ctxs = (task_ctx_t *)slab;
 
 	bpf_for(i, 0, 5)
 		qa.fifos[i].idx = i;
 
-	bpf_for(i, 0, max_tasks)
-		slab[i].next_free = (i + 1 < max_tasks) ? &slab[i + 1] : NULL;
-	qa.task_free_head = &slab[0];
+	bpf_for(i, 0, max_tasks) {
+		task_ctx_t *cur = (task_ctx_t *)(slab + i * TASK_CTX_STRIDE);
+		task_ctx_t *next = (i + 1 < max_tasks) ?
+			(task_ctx_t *)(slab + (i + 1) * TASK_CTX_STRIDE) : NULL;
+		cur->next_free = next;
+	}
+	qa.task_free_head = (task_ctx_t *)slab;
+
+	/*
+	 * Allocate and initialize the idle cmask. Starts empty - update_idle
+	 * fills it as cpus enter idle.
+	 */
+	qa_idle_cids = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!qa_idle_cids) {
+		scx_bpf_error("failed to allocate idle cmask");
+		return -ENOMEM;
+	}
+	cmask_init(qa_idle_cids, 0, scx_bpf_nr_cids());
 
 	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
 	if (ret) {
@@ -1102,6 +1199,8 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .dispatch		= (void *)qmap_dispatch,
 	       .tick			= (void *)qmap_tick,
 	       .core_sched_before	= (void *)qmap_core_sched_before,
+	       .set_cpumask		= (void *)qmap_set_cpumask,
+	       .update_idle		= (void *)qmap_update_idle,
 	       .init_task		= (void *)qmap_init_task,
 	       .exit_task		= (void *)qmap_exit_task,
 	       .dump			= (void *)qmap_dump,
-- 
2.53.0


  parent reply	other threads:[~2026-04-21  7:20 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-21  7:19 [PATCHSET sched_ext/for-7.2] sched_ext: Topological CPU IDs and cid-form struct_ops Tejun Heo
2026-04-21  7:19 ` [PATCH 01/16] sched_ext: Rename ops_cpu_valid() to scx_cpu_valid() and expose it Tejun Heo
2026-04-21 13:31   ` Cheng-Yang Chou
2026-04-21  7:19 ` [PATCH 02/16] sched_ext: Move scx_exit(), scx_error() and friends to ext_internal.h Tejun Heo
2026-04-21 13:36   ` Cheng-Yang Chou
2026-04-21  7:19 ` [PATCH 03/16] sched_ext: Shift scx_kick_cpu() validity check to scx_bpf_kick_cpu() Tejun Heo
2026-04-21 13:49   ` Cheng-Yang Chou
2026-04-21  7:19 ` [PATCH 04/16] sched_ext: Relocate cpu_acquire/cpu_release to end of struct sched_ext_ops Tejun Heo
2026-04-21 13:58   ` Cheng-Yang Chou
2026-04-21  7:19 ` [PATCH 05/16] sched_ext: Make scx_enable() take scx_enable_cmd Tejun Heo
2026-04-21 14:25   ` Cheng-Yang Chou
2026-04-21  7:19 ` [PATCH 06/16] sched_ext: Add topological CPU IDs (cids) Tejun Heo
2026-04-21 17:15   ` [PATCH v2 sched_ext/for-7.2] " Tejun Heo
2026-04-21  7:19 ` [PATCH 07/16] sched_ext: Add scx_bpf_cid_override() kfunc Tejun Heo
2026-04-21  7:19 ` [PATCH 08/16] tools/sched_ext: Add struct_size() helpers to common.bpf.h Tejun Heo
2026-04-21  7:19 ` [PATCH 09/16] sched_ext: Add cmask, a base-windowed bitmap over cid space Tejun Heo
2026-04-21 17:30   ` Cheng-Yang Chou
2026-04-21 23:21   ` [PATCH v2] " Tejun Heo
2026-04-21  7:19 ` [PATCH 10/16] sched_ext: Add cid-form kfunc wrappers alongside cpu-form Tejun Heo
2026-04-21  7:19 ` [PATCH 11/16] sched_ext: Add bpf_sched_ext_ops_cid struct_ops type Tejun Heo
2026-04-21  7:19 ` [PATCH 12/16] sched_ext: Forbid cpu-form kfuncs from cid-form schedulers Tejun Heo
2026-04-21  7:19 ` [PATCH 13/16] tools/sched_ext: scx_qmap: Restart on hotplug instead of cpu_online/offline Tejun Heo
2026-04-21  7:19 ` Tejun Heo [this message]
2026-04-21  7:19 ` [PATCH 15/16] tools/sched_ext: scx_qmap: Port to cid-form struct_ops Tejun Heo
2026-04-21  7:19 ` [PATCH 16/16] sched_ext: Require cid-form struct_ops for sub-sched support Tejun Heo
2026-04-21 18:18 ` [PATCHSET sched_ext/for-7.2] sched_ext: Topological CPU IDs and cid-form struct_ops Cheng-Yang Chou
2026-04-21 18:33   ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260421071945.3110084-15-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=emil@etsalapatis.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox