Sched_ext development
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: David Vernet <void@manifault.com>,
	Andrea Righi <arighi@nvidia.com>,
	Changwoo Min <changwoo@igalia.com>
Cc: sched-ext@lists.linux.dev, Emil Tsalapatis <emil@etsalapatis.com>,
	linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH sched_ext/for-7.3 18/32] sched_ext: Maintain per-cpu effective cap copies for single-read checks
Date: Thu,  2 Jul 2026 22:01:45 -1000	[thread overview]
Message-ID: <20260703080159.2314350-19-tj@kernel.org> (raw)
In-Reply-To: <20260703080159.2314350-1-tj@kernel.org>

Checking a sched's caps on a cid would need to test several cap bits against
caps[] to account for implied caps. Also, caps[] modifications aren't
synchronized against scheduling operations on each cpu, which can lead to
awkward race conditions.

Collect them per cpu instead. caps[] under pshard->lock stays the target
configuration. Add scx_sched_pcpu->ecaps, the transposed effective copy: the
set of cap bits the sched holds on that cpu, accessible with a single read.
It is stable under the rq lock and can also be read locklessly with
READ_ONCE().

Grant and revoke only mutate caps[]. They queue a sync request on the target
cpu's rq->scx.ecaps_to_sync and kick it, and the cpu recomputes the queued
scheds' ecaps from caps[] in balance_one() under its own rq lock. A dying
sched runs the sync directly to retire its queued request before freeing. As
held references can defer the freeing past the enclosing root scheduler's
lifetime, root enable discards leftover sync requests before going live.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext/ext.c      |  10 +++
 kernel/sched/ext/internal.h |  21 +++++-
 kernel/sched/ext/sub.c      | 133 +++++++++++++++++++++++++++++++++++-
 kernel/sched/ext/sub.h      |  16 +++++
 kernel/sched/sched.h        |   3 +
 5 files changed, 180 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext/ext.c b/kernel/sched/ext/ext.c
index 4701346765cd..a1b994da9514 100644
--- a/kernel/sched/ext/ext.c
+++ b/kernel/sched/ext/ext.c
@@ -2600,6 +2600,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
 	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
 
+	scx_process_sync_ecaps(rq);
+
 	if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
 	    unlikely(rq->scx.cpu_released)) {
 		/*
@@ -4632,6 +4634,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 		 */
 		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
 
+		/* flush the queued ecaps syncs */
+		scx_discard_ecaps_to_sync(cpu, pcpu);
+
 		/*
 		 * Bypass blocks new kicks. Flush the kick irq_work so this
 		 * pcpu's to_kick_node is off the list before it is freed.
@@ -6376,6 +6381,9 @@ struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
 		node = cpu_to_node(cpu);
 		pcpu->sch = sch;
 		INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
+#ifdef CONFIG_EXT_SUB_SCHED
+		init_llist_node(&pcpu->ecaps_to_sync_node);
+#endif
 		INIT_LIST_HEAD(&pcpu->to_kick_node);
 		if (!zalloc_cpumask_var_node(&pcpu->cpus_to_kick, GFP_KERNEL, node) ||
 		    !zalloc_cpumask_var_node(&pcpu->cpus_to_kick_if_idle, GFP_KERNEL, node) ||
@@ -6720,6 +6728,8 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
 	}
 
+	scx_discard_stale_ecaps_syncs();
+
 	/*
 	 * Keep CPUs stable during enable so that the BPF scheduler can track
 	 * online CPUs by watching ->on/offline_cpu() after ->init().
diff --git a/kernel/sched/ext/internal.h b/kernel/sched/ext/internal.h
index fd75005fcc10..ed56ac5e458d 100644
--- a/kernel/sched/ext/internal.h
+++ b/kernel/sched/ext/internal.h
@@ -1182,6 +1182,24 @@ struct scx_sched_pcpu {
 	cpumask_var_t		cpus_to_wait;
 	struct list_head	to_kick_node;
 
+#ifdef CONFIG_EXT_SUB_SCHED
+	/*
+	 * pshard->caps[cap_bit] is the set of cids the sched holds that one
+	 * cap on. ecaps is its transpose: the set of SCX_CAP_* bits the sched
+	 * holds on this cpu, collected so that the hot-path check is a single
+	 * read.
+	 *
+	 * While pshard->caps[] under pshard->lock is the target configuration,
+	 * ecaps is the effective copy owned by the cpu. It is written under the
+	 * rq lock while processing rq->scx.ecaps_to_sync. It can also be
+	 * read with READ_ONCE() outside the rq lock.
+	 *
+	 * See queue_sync_ecaps() and scx_process_sync_ecaps().
+	 */
+	u64			ecaps;
+	struct llist_node	ecaps_to_sync_node;
+#endif
+
 	/*
 	 * The event counters are in a per-CPU variable to minimize the
 	 * accounting overhead. A system-wide view on the event counter is
@@ -1291,7 +1309,8 @@ struct scx_pshard {
 
 	/*
 	 * Per-cap cmask, inline via TRAILING_OVERLAP so cmask.bits[] overlaps
-	 * the trailing _bits[] storage. Access as &caps[i].cmask.
+	 * the trailing _bits[] storage. Access as &caps[i].cmask. See
+	 * scx_sched_pcpu->ecaps.
 	 */
 	TRAILING_OVERLAP(struct scx_cmask, cmask, bits,
 			 u64 _bits[SCX_CMASK_NR_WORDS(SCX_CID_SHARD_MAX_CPUS)];
diff --git a/kernel/sched/ext/sub.c b/kernel/sched/ext/sub.c
index c821d604ac9d..08d9367cf218 100644
--- a/kernel/sched/ext/sub.c
+++ b/kernel/sched/ext/sub.c
@@ -283,6 +283,125 @@ static void scx_sub_seed_caps(struct scx_sched *sch)
 	caps_updated_deliver(&to_deliver);
 }
 
+static u64 calc_effective_caps(struct scx_pshard *ps, s32 cid)
+{
+	u64 ecaps = 0;
+	u32 cap_bit;
+
+	for (cap_bit = 0; cap_bit < __SCX_NR_CAPS; cap_bit++)
+		if (scx_cmask_test(cid, &ps->caps[cap_bit].cmask))
+			ecaps |= BIT_U64(cap_bit) | scx_caps_implied(BIT_U64(cap_bit));
+	return ecaps;
+}
+
+/**
+ * queue_sync_ecaps - Queue ecaps update for a (sch, cid) pair
+ * @sch: sched to update
+ * @cid: cid to update
+ *
+ * Queue an ecaps update for @sch's @cid and kick the cpu so that it syncs in
+ * balance_one().
+ */
+static void queue_sync_ecaps(struct scx_sched *sch, s32 cid)
+{
+	s32 cpu = __scx_cid_to_cpu(cid);
+	struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+	/*
+	 * Pairs with smp_mb() in scx_process_sync_ecaps(). Either the check
+	 * below sees the node off the list and queues it, or the in-flight sync
+	 * sees the caps[] update made before this call.
+	 */
+	smp_mb();
+
+	/* @cid's pshard->lock excludes concurrent queueing attempts */
+	if (llist_on_list(&pcpu->ecaps_to_sync_node))
+		return;
+	if (llist_add(&pcpu->ecaps_to_sync_node, &cpu_rq(cpu)->scx.ecaps_to_sync))
+		scx_kick_cpu(scx_root, cpu, 0);
+}
+
+/* discard @rq's queued ecaps syncs */
+static void discard_queued_syncs(struct rq *rq)
+{
+	struct llist_node *pos, *tmp;
+
+	lockdep_assert_rq_held(rq);
+
+	llist_for_each_safe(pos, tmp, llist_del_all(&rq->scx.ecaps_to_sync))
+		init_llist_node(pos);
+}
+
+/**
+ * scx_process_sync_ecaps - Sync this cpu's ecaps from pshard->caps[]
+ * @rq: the cid's cpu rq
+ *
+ * pshard->caps[] is the target configuration. pcpu->ecaps is the effective
+ * transposed copy owned by the cid's cpu and written only here under @rq's
+ * lock.
+ */
+void scx_process_sync_ecaps(struct rq *rq)
+{
+	s32 cid = __scx_cpu_to_cid(cpu_of(rq));
+	s32 shard = scx_cid_to_shard[cid];
+	struct llist_node *batch, *pos, *tmp;
+
+	lockdep_assert_rq_held(rq);
+
+	if (likely(llist_empty(&rq->scx.ecaps_to_sync)))
+		return;
+
+	batch = llist_del_all(&rq->scx.ecaps_to_sync);
+	llist_for_each_safe(pos, tmp, batch) {
+		struct scx_sched_pcpu *pcpu =
+			container_of(pos, struct scx_sched_pcpu, ecaps_to_sync_node);
+		struct scx_pshard *ps = pcpu->sch->pshard[shard];
+
+		init_llist_node(pos);
+
+		/* pairs with smp_mb() in queue_sync_ecaps(), see there */
+		smp_mb();
+
+		WRITE_ONCE(pcpu->ecaps, calc_effective_caps(ps, cid));
+	}
+}
+
+/*
+ * @pcpu's sched was unhashed before the grace period, so nothing new queues.
+ * Flush its pending sync so the pcpu can be freed. scx_process_sync_ecaps()
+ * takes nodes off the list before syncing, and acquiring the rq lock waits for
+ * any in-flight walk.
+ */
+void scx_discard_ecaps_to_sync(s32 cpu, struct scx_sched_pcpu *pcpu)
+{
+	scoped_guard (rq_lock_irqsave, cpu_rq(cpu))
+		scx_process_sync_ecaps(cpu_rq(cpu));
+
+	WARN_ON_ONCE(llist_on_list(&pcpu->ecaps_to_sync_node));
+}
+
+/**
+ * scx_discard_stale_ecaps_syncs - Discard ecaps syncs from earlier schedulers
+ *
+ * To be called during root enable before the scheduler goes live. An earlier
+ * root's sub-sched may not have gone through its RCU free path yet (e.g. a
+ * still-open link fd defers it) and can leave queued ecaps syncs behind.
+ * Processing them would decode the dead sched's pshards with the current cid
+ * layout. Discard them instead. The backing scx_sched_pcpu's are still
+ * allocated as the free path drains ecaps_to_sync_node before freeing.
+ */
+void scx_discard_stale_ecaps_syncs(void)
+{
+	s32 cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		guard(rq_lock_irqsave)(rq);
+		discard_queued_syncs(rq);
+	}
+}
+
 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
 
 void drain_descendants(struct scx_sched *sch)
@@ -1021,9 +1140,14 @@ __bpf_kfunc s32 scx_bpf_sub_grant(u64 cgroup_id, u64 caps,
 				granted_caps |= BIT_U64(cap_bit);
 			}
 
-			if (granted_caps)
+			if (granted_caps) {
+				s32 cid;
+
 				caps_updated_record(cps, changed_cids, granted_caps,
 						    &to_deliver);
+				scx_cmask_for_each_cid(cid, changed_cids)
+					queue_sync_ecaps(child, cid);
+			}
 		}
 
 		/* record cids that didn't make it through into @denied_out */
@@ -1116,9 +1240,14 @@ __bpf_kfunc void scx_bpf_sub_revoke(u64 cgroup_id, u64 caps,
 					revoked_caps |= BIT_U64(cap_bit);
 				}
 
-				if (revoked_caps)
+				if (revoked_caps) {
+					s32 cid;
+
 					caps_updated_record(ps, changed_cids, revoked_caps,
 							    &to_deliver);
+					scx_cmask_for_each_cid(cid, changed_cids)
+						queue_sync_ecaps(pos, cid);
+				}
 			}
 
 			if (revoked_caps)
diff --git a/kernel/sched/ext/sub.h b/kernel/sched/ext/sub.h
index 3a913cc56422..85cadb62ad93 100644
--- a/kernel/sched/ext/sub.h
+++ b/kernel/sched/ext/sub.h
@@ -28,6 +28,9 @@ bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux);
 void scx_free_pshards(struct scx_sched *sch);
 s32 scx_alloc_pshards(struct scx_sched *sch);
 void scx_init_root_caps(struct scx_sched *sch);
+void scx_process_sync_ecaps(struct rq *rq);
+void scx_discard_ecaps_to_sync(s32 cpu, struct scx_sched_pcpu *pcpu);
+void scx_discard_stale_ecaps_syncs(void);
 
 #else	/* CONFIG_EXT_SUB_SCHED */
 
@@ -41,6 +44,9 @@ static inline void scx_sub_disable(struct scx_sched *sch) { }
 static inline void scx_free_pshards(struct scx_sched *sch) {}
 static inline s32 scx_alloc_pshards(struct scx_sched *sch) { return 0; }
 static inline void scx_init_root_caps(struct scx_sched *sch) {}
+static inline void scx_process_sync_ecaps(struct rq *rq) {}
+static inline void scx_discard_ecaps_to_sync(s32 cpu, struct scx_sched_pcpu *pcpu) {}
+static inline void scx_discard_stale_ecaps_syncs(void) {}
 
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
@@ -57,6 +63,16 @@ static inline void scx_init_root_caps(struct scx_sched *sch) {}
 	for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos);		\
 	     (pos) = scx_next_descendant_pre((pos), (root)))
 
+#ifdef CONFIG_EXT_SUB_SCHED
+
+/* caps implied by holding @cap */
+static inline u64 scx_caps_implied(u64 cap)
+{
+	return 0;
+}
+
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 /*
  * One user of this function is scx_bpf_dispatch() which can be called
  * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7da25f918382..e05dcdff3ace 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,6 +805,9 @@ struct scx_rq {
 	u32			flags;
 	u32			nr_immed;		/* ENQ_IMMED tasks on local_dsq */
 	u64			clock;			/* current per-rq clock -- see scx_bpf_now() */
+#ifdef CONFIG_EXT_SUB_SCHED
+	struct llist_head	ecaps_to_sync;		/* pending ecaps syncs */
+#endif
 	cpumask_var_t		cpus_to_sync;
 	bool			kick_sync_pending;
 	unsigned long		kick_sync;
-- 
2.54.0


  parent reply	other threads:[~2026-07-03  8:02 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-07-03  8:01 [PATCHSET sched_ext/for-7.3] sched_ext: Capability-based CPU delegation for sub-schedulers Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 01/32] sched_ext: Fix premature ops->priv publication in scx_alloc_and_add_sched() Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 02/32] tools/sched_ext: scx - Fix cmask_subset(), cmask_equal() and cmask_weight() Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 03/32] sched_ext: Use READ_ONCE/WRITE_ONCE in cmask word ops and drop _RACY variants Tejun Heo
2026-07-03  8:33   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 04/32] tools/sched_ext: scx_qmap - Use bare u64/u32/s32 integer types Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 05/32] sched_ext: Reject direct slice and dsq_vtime writes for cid-form schedulers Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 06/32] sched_ext: Make scx_bpf_kick_cid() return void Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 07/32] sched_ext: Make the kick machinery per-sched Tejun Heo
2026-07-03  9:02   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 08/32] sched_ext: Add ops.init_cids() to finalize the cid layout before init Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 09/32] sched_ext: Add CID sharding Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 10/32] sched_ext: Add shard boundaries to scx_bpf_cid_override() Tejun Heo
2026-07-03  9:51   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 11/32] sched_ext: Defer scx_sched kobj sysfs add into the enable workfns Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 12/32] sched_ext: Add per-shard scx_sched storage scaffolding Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 13/32] sched_ext: Add scx_cmask_ref for validated arena cmask access Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 14/32] sched_ext: RCU-protect the sub-sched tree's children/sibling lists Tejun Heo
2026-07-03 10:49   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 15/32] sched_ext: Add scx_skip_subtree_pre() Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 16/32] sched_ext: Add per-shard cap delegation for sub-schedulers Tejun Heo
2026-07-03 11:17   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 17/32] sched_ext: Add coalescing sub_caps_updated() notifier " Tejun Heo
2026-07-03  8:01 ` Tejun Heo [this message]
2026-07-03 12:05   ` [PATCH sched_ext/for-7.3 18/32] sched_ext: Maintain per-cpu effective cap copies for single-read checks sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 19/32] sched_ext: Add sub_ecaps_updated() effective-cap change notifier Tejun Heo
2026-07-03 12:25   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 20/32] sched_ext: Generalize local-DSQ handling to rq-owned DSQs Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 21/32] sched_ext: Add reject DSQ for cap-rejected dispatches Tejun Heo
2026-07-03 12:57   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 22/32] sched_ext: Add the SCX_CAP_ENQ_IMMED cap Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 23/32] sched_ext: Assign a unique id to each scheduler instance Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 24/32] sched_ext: Route task slice writes through set_task_slice() Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 25/32] sched_ext: Tie cpu occupancy to SCX_CAP_BASE through the task slice Tejun Heo
2026-07-03 13:34   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 26/32] sched_ext: Add the SCX_CAP_ENQ cap Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 27/32] sched_ext: Gate kicks on SCX_CAP_BASE and preemption on SCX_CAP_PREEMPT Tejun Heo
2026-07-03 14:01   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 28/32] sched_ext: Route ops.update_idle() to sub-schedulers and re-notify owed scheds Tejun Heo
2026-07-03 14:14   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 29/32] sched_ext: Replay ecaps notifications suppressed by bypass Tejun Heo
2026-07-03 14:28   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 30/32] sched_ext: Add scx_bpf_sub_kill() to evict a child sub-scheduler Tejun Heo
2026-07-03 14:45   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 31/32] tools/sched_ext: scx_qmap - Expand hierarchical sub-scheduling Tejun Heo
2026-07-03 14:57   ` sashiko-bot
2026-07-04  0:54     ` Tejun Heo
2026-07-03  8:01 ` [PATCH sched_ext/for-7.3 32/32] tools/sched_ext: scx_qmap - Add sub-sched cap fault injection Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260703080159.2314350-19-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=arighi@nvidia.com \
    --cc=changwoo@igalia.com \
    --cc=emil@etsalapatis.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox